1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
7 #include <boost/algorithm/string.hpp>
9 #include <boost/format.hpp>
10 #include <boost/optional.hpp>
11 #include <boost/utility/in_place_factory.hpp>
13 #include "common/ceph_json.h"
14 #include "common/utf8.h"
16 #include "common/errno.h"
17 #include "common/Formatter.h"
18 #include "common/Throttle.h"
19 #include "common/Finisher.h"
21 #include "rgw_rados.h"
22 #include "rgw_cache.h"
24 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
25 #include "rgw_metadata.h"
26 #include "rgw_bucket.h"
27 #include "rgw_rest_conn.h"
28 #include "rgw_cr_rados.h"
29 #include "rgw_cr_rest.h"
31 #include "cls/rgw/cls_rgw_ops.h"
32 #include "cls/rgw/cls_rgw_types.h"
33 #include "cls/rgw/cls_rgw_client.h"
34 #include "cls/rgw/cls_rgw_const.h"
35 #include "cls/refcount/cls_refcount_client.h"
36 #include "cls/version/cls_version_client.h"
37 #include "cls/log/cls_log_client.h"
38 #include "cls/statelog/cls_statelog_client.h"
39 #include "cls/timeindex/cls_timeindex_client.h"
40 #include "cls/lock/cls_lock_client.h"
41 #include "cls/user/cls_user_client.h"
43 #include "rgw_tools.h"
44 #include "rgw_coroutine.h"
45 #include "rgw_compression.h"
47 #include "rgw_boost_asio_yield.h"
48 #undef fork // fails to compile RGWPeriod::fork() below
50 #include "common/Clock.h"
52 #include "include/rados/librados.hpp"
53 using namespace librados
;
61 #include "auth/Crypto.h" // get_random_bytes()
68 #include "rgw_object_expirer_core.h"
70 #include "rgw_data_sync.h"
71 #include "rgw_realm_watcher.h"
73 #include "compressor/Compressor.h"
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_rgw
82 static string notify_oid_prefix
= "notify";
83 static string
*notify_oids
= NULL
;
84 static string shadow_ns
= "shadow";
85 static string dir_oid_prefix
= ".dir.";
86 static string default_storage_pool_suffix
= "rgw.buckets.data";
87 static string default_bucket_index_pool_suffix
= "rgw.buckets.index";
88 static string default_storage_extra_pool_suffix
= "rgw.buckets.non-ec";
89 static string avail_pools
= ".pools.avail";
91 static string zone_info_oid_prefix
= "zone_info.";
92 static string zone_names_oid_prefix
= "zone_names.";
93 static string region_info_oid_prefix
= "region_info.";
94 static string zone_group_info_oid_prefix
= "zonegroup_info.";
95 static string realm_names_oid_prefix
= "realms_names.";
96 static string realm_info_oid_prefix
= "realms.";
97 static string default_region_info_oid
= "default.region";
98 static string default_zone_group_info_oid
= "default.zonegroup";
99 static string period_info_oid_prefix
= "periods.";
100 static string period_latest_epoch_info_oid
= ".latest_epoch";
101 static string region_map_oid
= "region_map";
102 static string zonegroup_map_oid
= "zonegroup_map";
103 static string log_lock_name
= "rgw_log_lock";
104 static string default_realm_info_oid
= "default.realm";
105 const string default_zonegroup_name
= "default";
106 const string default_zone_name
= "default";
107 static string zonegroup_names_oid_prefix
= "zonegroups_names.";
108 static RGWObjCategory main_category
= RGW_OBJ_CATEGORY_MAIN
;
109 #define RGW_USAGE_OBJ_PREFIX "usage."
110 #define FIRST_EPOCH 1
111 static string RGW_DEFAULT_ZONE_ROOT_POOL
= "rgw.root";
112 static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL
= "rgw.root";
113 static string RGW_DEFAULT_REALM_ROOT_POOL
= "rgw.root";
114 static string RGW_DEFAULT_PERIOD_ROOT_POOL
= "rgw.root";
116 #define RGW_STATELOG_OBJ_PREFIX "statelog."
118 #define dout_subsys ceph_subsys_rgw
121 static bool rgw_get_obj_data_pool(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
122 const string
& placement_id
, const rgw_obj
& obj
, rgw_pool
*pool
)
124 if (!zone_params
.get_head_data_pool(placement_id
, obj
, pool
)) {
125 RGWZonePlacementInfo placement
;
126 if (!zone_params
.get_placement(zonegroup
.default_placement
, &placement
)) {
130 if (!obj
.in_extra_data
) {
131 *pool
= placement
.data_pool
;
133 *pool
= placement
.data_extra_pool
;
140 static bool rgw_obj_to_raw(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
141 const string
& placement_id
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
143 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
145 return rgw_get_obj_data_pool(zonegroup
, zone_params
, placement_id
, obj
, &raw_obj
->pool
);
148 rgw_raw_obj
rgw_obj_select::get_raw_obj(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
) const
152 rgw_obj_to_raw(zonegroup
, zone_params
, placement_rule
, obj
, &r
);
158 rgw_raw_obj
rgw_obj_select::get_raw_obj(RGWRados
*store
) const
162 store
->obj_to_raw(placement_rule
, obj
, &r
);
168 int rgw_init_ioctx(librados::Rados
*rados
, const rgw_pool
& pool
, IoCtx
& ioctx
, bool create
)
170 int r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
171 if (r
== -ENOENT
&& create
) {
172 r
= rados
->pool_create(pool
.name
.c_str());
173 if (r
< 0 && r
!= -EEXIST
) {
177 r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
182 if (!pool
.ns
.empty()) {
183 ioctx
.set_namespace(pool
.ns
);
189 void RGWObjectCtxImpl
<rgw_obj
, RGWObjState
>::invalidate(rgw_obj
& obj
) {
190 RWLock::WLocker
wl(lock
);
191 auto iter
= objs_state
.find(obj
);
192 if (iter
== objs_state
.end()) {
195 bool is_atomic
= iter
->second
.is_atomic
;
196 bool prefetch_data
= iter
->second
.prefetch_data
;
198 objs_state
.erase(iter
);
200 if (is_atomic
|| prefetch_data
) {
201 auto& s
= objs_state
[obj
];
202 s
.is_atomic
= is_atomic
;
203 s
.prefetch_data
= prefetch_data
;
208 void RGWObjectCtxImpl
<rgw_raw_obj
, RGWRawObjState
>::invalidate(rgw_raw_obj
& obj
) {
209 RWLock::WLocker
wl(lock
);
210 auto iter
= objs_state
.find(obj
);
211 if (iter
== objs_state
.end()) {
215 objs_state
.erase(iter
);
218 void RGWDefaultZoneGroupInfo::dump(Formatter
*f
) const {
219 encode_json("default_zonegroup", default_zonegroup
, f
);
222 void RGWDefaultZoneGroupInfo::decode_json(JSONObj
*obj
) {
224 JSONDecoder::decode_json("default_zonegroup", default_zonegroup
, obj
);
225 /* backward compatability with region */
226 if (default_zonegroup
.empty()) {
227 JSONDecoder::decode_json("default_region", default_zonegroup
, obj
);
231 rgw_pool
RGWZoneGroup::get_pool(CephContext
*cct_
)
233 if (cct_
->_conf
->rgw_zonegroup_root_pool
.empty()) {
234 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL
);
237 return rgw_pool(cct_
->_conf
->rgw_zonegroup_root_pool
);
240 int RGWZoneGroup::create_default(bool old_format
)
242 name
= default_zonegroup_name
;
245 RGWZoneGroupPlacementTarget placement_target
;
246 placement_target
.name
= "default-placement";
247 placement_targets
[placement_target
.name
] = placement_target
;
248 default_placement
= "default-placement";
250 RGWZoneParams
zone_params(default_zone_name
);
252 int r
= zone_params
.init(cct
, store
, false);
254 ldout(cct
, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r
) << dendl
;
258 r
= zone_params
.create_default();
259 if (r
< 0 && r
!= -EEXIST
) {
260 ldout(cct
, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r
) << dendl
;
262 } else if (r
== -EEXIST
) {
263 ldout(cct
, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl
;
264 zone_params
.clear_id();
265 r
= zone_params
.init(cct
, store
);
267 ldout(cct
, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r
) << dendl
;
270 ldout(cct
, 20) << "zone_params::create_default() " << zone_params
.get_name() << " id " << zone_params
.get_id()
274 RGWZone
& default_zone
= zones
[zone_params
.get_id()];
275 default_zone
.name
= zone_params
.get_name();
276 default_zone
.id
= zone_params
.get_id();
277 master_zone
= default_zone
.id
;
280 if (r
< 0 && r
!= -EEXIST
) {
281 ldout(cct
, 0) << "error storing zone group info: " << cpp_strerror(-r
) << dendl
;
286 ldout(cct
, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl
;
288 r
= init(cct
, store
);
298 post_process_params();
303 const string
RGWZoneGroup::get_default_oid(bool old_region_format
)
305 if (old_region_format
) {
306 if (cct
->_conf
->rgw_default_region_info_oid
.empty()) {
307 return default_region_info_oid
;
309 return cct
->_conf
->rgw_default_region_info_oid
;
312 string default_oid
= cct
->_conf
->rgw_default_zonegroup_info_oid
;
314 if (cct
->_conf
->rgw_default_zonegroup_info_oid
.empty()) {
315 default_oid
= default_zone_group_info_oid
;
318 default_oid
+= "." + realm_id
;
323 const string
& RGWZoneGroup::get_info_oid_prefix(bool old_region_format
)
325 if (old_region_format
) {
326 return region_info_oid_prefix
;
328 return zone_group_info_oid_prefix
;
331 const string
& RGWZoneGroup::get_names_oid_prefix()
333 return zonegroup_names_oid_prefix
;
336 const string
& RGWZoneGroup::get_predefined_name(CephContext
*cct
) {
337 return cct
->_conf
->rgw_zonegroup
;
340 int RGWZoneGroup::equals(const string
& other_zonegroup
) const
342 if (is_master
&& other_zonegroup
.empty())
345 return (id
== other_zonegroup
);
348 int RGWZoneGroup::add_zone(const RGWZoneParams
& zone_params
, bool *is_master
, bool *read_only
,
349 const list
<string
>& endpoints
, const string
*ptier_type
,
350 bool *psync_from_all
, list
<string
>& sync_from
, list
<string
>& sync_from_rm
)
352 auto& zone_id
= zone_params
.get_id();
353 auto& zone_name
= zone_params
.get_name();
355 // check for duplicate zone name on insert
356 if (!zones
.count(zone_id
)) {
357 for (const auto& zone
: zones
) {
358 if (zone
.second
.name
== zone_name
) {
359 ldout(cct
, 0) << "ERROR: found existing zone name " << zone_name
360 << " (" << zone
.first
<< ") in zonegroup " << get_name() << dendl
;
368 if (!master_zone
.empty() && master_zone
!= zone_params
.get_id()) {
369 ldout(cct
, 0) << "NOTICE: overriding master zone: " << master_zone
<< dendl
;
371 master_zone
= zone_params
.get_id();
372 } else if (master_zone
== zone_params
.get_id()) {
377 RGWZone
& zone
= zones
[zone_params
.get_id()];
378 zone
.name
= zone_params
.get_name();
379 zone
.id
= zone_params
.get_id();
380 if (!endpoints
.empty()) {
381 zone
.endpoints
= endpoints
;
384 zone
.read_only
= *read_only
;
387 zone
.tier_type
= *ptier_type
;
390 if (psync_from_all
) {
391 zone
.sync_from_all
= *psync_from_all
;
394 for (auto add
: sync_from
) {
395 zone
.sync_from
.insert(add
);
398 for (auto rm
: sync_from_rm
) {
399 zone
.sync_from
.erase(rm
);
402 post_process_params();
408 int RGWZoneGroup::rename_zone(const RGWZoneParams
& zone_params
)
410 RGWZone
& zone
= zones
[zone_params
.get_id()];
411 zone
.name
= zone_params
.get_name();
416 void RGWZoneGroup::post_process_params()
418 bool log_data
= zones
.size() > 1;
420 if (master_zone
.empty()) {
421 map
<string
, RGWZone
>::iterator iter
= zones
.begin();
422 if (iter
!= zones
.end()) {
423 master_zone
= iter
->first
;
427 for (map
<string
, RGWZone
>::iterator iter
= zones
.begin(); iter
!= zones
.end(); ++iter
) {
428 RGWZone
& zone
= iter
->second
;
429 zone
.log_data
= log_data
;
430 zone
.log_meta
= (is_master
&& zone
.id
== master_zone
);
432 RGWZoneParams
zone_params(zone
.id
, zone
.name
);
433 int ret
= zone_params
.init(cct
, store
);
435 ldout(cct
, 0) << "WARNING: could not read zone params for zone id=" << zone
.id
<< " name=" << zone
.name
<< dendl
;
439 for (map
<string
, RGWZonePlacementInfo
>::iterator iter
= zone_params
.placement_pools
.begin();
440 iter
!= zone_params
.placement_pools
.end(); ++iter
) {
441 const string
& placement_name
= iter
->first
;
442 if (placement_targets
.find(placement_name
) == placement_targets
.end()) {
443 RGWZoneGroupPlacementTarget placement_target
;
444 placement_target
.name
= placement_name
;
445 placement_targets
[placement_name
] = placement_target
;
450 if (default_placement
.empty() && !placement_targets
.empty()) {
451 default_placement
= placement_targets
.begin()->first
;
455 int RGWZoneGroup::remove_zone(const std::string
& zone_id
)
457 map
<string
, RGWZone
>::iterator iter
= zones
.find(zone_id
);
458 if (iter
== zones
.end()) {
459 ldout(cct
, 0) << "zone id " << zone_id
<< " is not a part of zonegroup "
466 post_process_params();
471 int RGWZoneGroup::read_default_id(string
& default_id
, bool old_format
)
473 if (realm_id
.empty()) {
474 /* try using default realm */
476 int ret
= realm
.init(cct
, store
);
478 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
481 realm_id
= realm
.get_id();
484 return RGWSystemMetaObj::read_default_id(default_id
, old_format
);
487 int RGWZoneGroup::set_as_default(bool exclusive
)
489 if (realm_id
.empty()) {
490 /* try using default realm */
492 int ret
= realm
.init(cct
, store
);
494 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
497 realm_id
= realm
.get_id();
500 return RGWSystemMetaObj::set_as_default(exclusive
);
503 int RGWSystemMetaObj::init(CephContext
*_cct
, RGWRados
*_store
, bool setup_obj
, bool old_format
)
511 if (old_format
&& id
.empty()) {
518 name
= get_predefined_name(cct
);
521 r
= use_default(old_format
);
525 } else if (!old_format
) {
526 r
= read_id(name
, id
);
529 ldout(cct
, 0) << "error in read_id for object name: " << name
<< " : " << cpp_strerror(-r
) << dendl
;
536 return read_info(id
, old_format
);
539 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo
& default_info
, const string
& oid
)
541 auto pool
= get_pool(cct
);
543 RGWObjectCtx
obj_ctx(store
);
544 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
549 bufferlist::iterator iter
= bl
.begin();
550 ::decode(default_info
, iter
);
551 } catch (buffer::error
& err
) {
552 ldout(cct
, 0) << "error decoding data from " << pool
<< ":" << oid
<< dendl
;
559 int RGWSystemMetaObj::read_default_id(string
& default_id
, bool old_format
)
561 RGWDefaultSystemMetaObjInfo default_info
;
563 int ret
= read_default(default_info
, get_default_oid(old_format
));
568 default_id
= default_info
.default_id
;
573 int RGWSystemMetaObj::use_default(bool old_format
)
575 return read_default_id(id
, old_format
);
578 int RGWSystemMetaObj::set_as_default(bool exclusive
)
580 string oid
= get_default_oid();
582 rgw_pool
pool(get_pool(cct
));
585 RGWDefaultSystemMetaObjInfo default_info
;
586 default_info
.default_id
= id
;
588 ::encode(default_info
, bl
);
590 int ret
= rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
591 exclusive
, NULL
, real_time(), NULL
);
598 int RGWSystemMetaObj::read_id(const string
& obj_name
, string
& object_id
)
600 rgw_pool
pool(get_pool(cct
));
603 string oid
= get_names_oid_prefix() + obj_name
;
605 RGWObjectCtx
obj_ctx(store
);
606 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
611 RGWNameToId nameToId
;
613 bufferlist::iterator iter
= bl
.begin();
614 ::decode(nameToId
, iter
);
615 } catch (buffer::error
& err
) {
616 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << oid
<< dendl
;
619 object_id
= nameToId
.obj_id
;
623 int RGWSystemMetaObj::delete_obj(bool old_format
)
625 rgw_pool
pool(get_pool(cct
));
627 /* check to see if obj is the default */
628 RGWDefaultSystemMetaObjInfo default_info
;
629 int ret
= read_default(default_info
, get_default_oid(old_format
));
630 if (ret
< 0 && ret
!= -ENOENT
)
632 if (default_info
.default_id
== id
|| (old_format
&& default_info
.default_id
== name
)) {
633 string oid
= get_default_oid(old_format
);
634 rgw_raw_obj
default_named_obj(pool
, oid
);
635 ret
= store
->delete_system_obj(default_named_obj
);
637 ldout(cct
, 0) << "Error delete default obj name " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
642 string oid
= get_names_oid_prefix() + name
;
643 rgw_raw_obj
object_name(pool
, oid
);
644 ret
= store
->delete_system_obj(object_name
);
646 ldout(cct
, 0) << "Error delete obj name " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
651 string oid
= get_info_oid_prefix(old_format
);
658 rgw_raw_obj
object_id(pool
, oid
);
659 ret
= store
->delete_system_obj(object_id
);
661 ldout(cct
, 0) << "Error delete object id " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
667 int RGWSystemMetaObj::store_name(bool exclusive
)
669 rgw_pool
pool(get_pool(cct
));
670 string oid
= get_names_oid_prefix() + name
;
672 RGWNameToId nameToId
;
673 nameToId
.obj_id
= id
;
676 ::encode(nameToId
, bl
);
677 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
680 int RGWSystemMetaObj::rename(const string
& new_name
)
683 int ret
= read_id(new_name
, new_id
);
687 if (ret
< 0 && ret
!= -ENOENT
) {
688 ldout(cct
, 0) << "Error read_id " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
691 string old_name
= name
;
695 ldout(cct
, 0) << "Error storing new obj info " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
698 ret
= store_name(true);
700 ldout(cct
, 0) << "Error storing new name " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
703 /* delete old name */
704 rgw_pool
pool(get_pool(cct
));
705 string oid
= get_names_oid_prefix() + old_name
;
706 rgw_raw_obj
old_name_obj(pool
, oid
);
707 ret
= store
->delete_system_obj(old_name_obj
);
709 ldout(cct
, 0) << "Error delete old obj name " << old_name
<< ": " << cpp_strerror(-ret
) << dendl
;
716 int RGWSystemMetaObj::read_info(const string
& obj_id
, bool old_format
)
718 rgw_pool
pool(get_pool(cct
));
722 string oid
= get_info_oid_prefix(old_format
) + obj_id
;
724 RGWObjectCtx
obj_ctx(store
);
725 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
727 ldout(cct
, 0) << "failed reading obj info from " << pool
<< ":" << oid
<< ": " << cpp_strerror(-ret
) << dendl
;
732 bufferlist::iterator iter
= bl
.begin();
733 ::decode(*this, iter
);
734 } catch (buffer::error
& err
) {
735 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << oid
<< dendl
;
742 int RGWSystemMetaObj::read()
744 int ret
= read_id(name
, id
);
749 return read_info(id
);
752 int RGWSystemMetaObj::create(bool exclusive
)
756 /* check to see the name is not used */
757 ret
= read_id(name
, id
);
758 if (exclusive
&& ret
== 0) {
759 ldout(cct
, 10) << "ERROR: name " << name
<< " already in use for obj id " << id
<< dendl
;
761 } else if ( ret
< 0 && ret
!= -ENOENT
) {
762 ldout(cct
, 0) << "failed reading obj id " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
767 /* create unique id */
770 new_uuid
.generate_random();
771 new_uuid
.print(uuid_str
);
775 ret
= store_info(exclusive
);
777 ldout(cct
, 0) << "ERROR: storing info for " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
781 return store_name(exclusive
);
784 int RGWSystemMetaObj::store_info(bool exclusive
)
786 rgw_pool
pool(get_pool(cct
));
788 string oid
= get_info_oid_prefix() + id
;
792 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
795 int RGWSystemMetaObj::write(bool exclusive
)
797 int ret
= store_info(exclusive
);
799 ldout(cct
, 20) << __func__
<< "(): store_info() returned ret=" << ret
<< dendl
;
802 ret
= store_name(exclusive
);
804 ldout(cct
, 20) << __func__
<< "(): store_name() returned ret=" << ret
<< dendl
;
811 const string
& RGWRealm::get_predefined_name(CephContext
*cct
) {
812 return cct
->_conf
->rgw_realm
;
815 int RGWRealm::create(bool exclusive
)
817 int ret
= RGWSystemMetaObj::create(exclusive
);
819 ldout(cct
, 0) << "ERROR creating new realm object " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
822 // create the control object for watch/notify
823 ret
= create_control(exclusive
);
825 ldout(cct
, 0) << "ERROR creating control for new realm " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
829 if (current_period
.empty()) {
830 /* create new period for the realm */
831 ret
= period
.init(cct
, store
, id
, name
, false);
835 ret
= period
.create(true);
837 ldout(cct
, 0) << "ERROR: creating new period for realm " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
841 period
= RGWPeriod(current_period
, 0);
842 int ret
= period
.init(cct
, store
, id
, name
);
844 ldout(cct
, 0) << "ERROR: failed to init period " << current_period
<< dendl
;
848 ret
= set_current_period(period
);
850 ldout(cct
, 0) << "ERROR: failed set current period " << current_period
<< dendl
;
853 // try to set as default. may race with another create, so pass exclusive=true
854 // so we don't override an existing default
855 ret
= set_as_default(true);
856 if (ret
< 0 && ret
!= -EEXIST
) {
857 ldout(cct
, 0) << "WARNING: failed to set realm as default realm, ret=" << ret
<< dendl
;
863 int RGWRealm::delete_obj()
865 int ret
= RGWSystemMetaObj::delete_obj();
869 return delete_control();
872 int RGWRealm::create_control(bool exclusive
)
874 auto pool
= rgw_pool
{get_pool(cct
)};
875 auto oid
= get_control_oid();
876 return rgw_put_system_obj(store
, pool
, oid
, nullptr, 0, exclusive
,
877 nullptr, real_time(), nullptr);
880 int RGWRealm::delete_control()
882 auto pool
= rgw_pool
{get_pool(cct
)};
883 auto obj
= rgw_raw_obj
{pool
, get_control_oid()};
884 return store
->delete_system_obj(obj
);
887 rgw_pool
RGWRealm::get_pool(CephContext
*cct
)
889 if (cct
->_conf
->rgw_realm_root_pool
.empty()) {
890 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL
);
892 return rgw_pool(cct
->_conf
->rgw_realm_root_pool
);
895 const string
RGWRealm::get_default_oid(bool old_format
)
897 if (cct
->_conf
->rgw_default_realm_info_oid
.empty()) {
898 return default_realm_info_oid
;
900 return cct
->_conf
->rgw_default_realm_info_oid
;
903 const string
& RGWRealm::get_names_oid_prefix()
905 return realm_names_oid_prefix
;
908 const string
& RGWRealm::get_info_oid_prefix(bool old_format
)
910 return realm_info_oid_prefix
;
913 int RGWRealm::set_current_period(RGWPeriod
& period
)
915 // update realm epoch to match the period's
916 if (epoch
> period
.get_realm_epoch()) {
917 ldout(cct
, 0) << "ERROR: set_current_period with old realm epoch "
918 << period
.get_realm_epoch() << ", current epoch=" << epoch
<< dendl
;
921 if (epoch
== period
.get_realm_epoch() && current_period
!= period
.get_id()) {
922 ldout(cct
, 0) << "ERROR: set_current_period with same realm epoch "
923 << period
.get_realm_epoch() << ", but different period id "
924 << period
.get_id() << " != " << current_period
<< dendl
;
928 epoch
= period
.get_realm_epoch();
929 current_period
= period
.get_id();
933 ldout(cct
, 0) << "ERROR: period update: " << cpp_strerror(-ret
) << dendl
;
937 ret
= period
.reflect();
939 ldout(cct
, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret
) << dendl
;
946 string
RGWRealm::get_control_oid()
948 return get_info_oid_prefix() + id
+ ".control";
951 int RGWRealm::notify_zone(bufferlist
& bl
)
953 // open a context on the realm's pool
954 rgw_pool pool
{get_pool(cct
)};
956 int r
= rgw_init_ioctx(store
->get_rados_handle(), pool
, ctx
);
958 ldout(cct
, 0) << "Failed to open pool " << pool
<< dendl
;
961 // send a notify on the realm object
962 r
= ctx
.notify2(get_control_oid(), bl
, 0, nullptr);
964 ldout(cct
, 0) << "Realm notify failed with " << r
<< dendl
;
970 int RGWRealm::notify_new_period(const RGWPeriod
& period
)
973 // push the period to dependent zonegroups/zones
974 ::encode(RGWRealmNotify::ZonesNeedPeriod
, bl
);
975 ::encode(period
, bl
);
976 // reload the gateway with the new period
977 ::encode(RGWRealmNotify::Reload
, bl
);
979 return notify_zone(bl
);
982 std::string
RGWPeriodConfig::get_oid(const std::string
& realm_id
)
984 if (realm_id
.empty()) {
985 return "period_config.default";
987 return "period_config." + realm_id
;
990 rgw_pool
RGWPeriodConfig::get_pool(CephContext
*cct
)
992 const auto& pool_name
= cct
->_conf
->rgw_period_root_pool
;
993 if (pool_name
.empty()) {
994 return {RGW_DEFAULT_PERIOD_ROOT_POOL
};
999 int RGWPeriodConfig::read(RGWRados
*store
, const std::string
& realm_id
)
1001 RGWObjectCtx
obj_ctx(store
);
1002 const auto& pool
= get_pool(store
->ctx());
1003 const auto& oid
= get_oid(realm_id
);
1006 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, nullptr, nullptr);
1011 bufferlist::iterator iter
= bl
.begin();
1012 ::decode(*this, iter
);
1013 } catch (buffer::error
& err
) {
1019 int RGWPeriodConfig::write(RGWRados
*store
, const std::string
& realm_id
)
1021 const auto& pool
= get_pool(store
->ctx());
1022 const auto& oid
= get_oid(realm_id
);
1024 ::encode(*this, bl
);
1025 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
1026 false, nullptr, real_time(), nullptr);
1029 int RGWPeriod::init(CephContext
*_cct
, RGWRados
*_store
, const string
& period_realm_id
,
1030 const string
& period_realm_name
, bool setup_obj
)
1034 realm_id
= period_realm_id
;
1035 realm_name
= period_realm_name
;
1040 return init(_cct
, _store
, setup_obj
);
1044 int RGWPeriod::init(CephContext
*_cct
, RGWRados
*_store
, bool setup_obj
)
1053 RGWRealm
realm(realm_id
, realm_name
);
1054 int ret
= realm
.init(cct
, store
);
1056 ldout(cct
, 0) << "RGWPeriod::init failed to init realm " << realm_name
<< " id " << realm_id
<< " : " <<
1057 cpp_strerror(-ret
) << dendl
;
1060 id
= realm
.get_current_period();
1061 realm_id
= realm
.get_id();
1065 int ret
= use_latest_epoch();
1067 ldout(cct
, 0) << "failed to use_latest_epoch period id " << id
<< " realm " << realm_name
<< " id " << realm_id
1068 << " : " << cpp_strerror(-ret
) << dendl
;
1077 int RGWPeriod::get_zonegroup(RGWZoneGroup
& zonegroup
, const string
& zonegroup_id
) {
1078 map
<string
, RGWZoneGroup
>::const_iterator iter
;
1079 if (!zonegroup_id
.empty()) {
1080 iter
= period_map
.zonegroups
.find(zonegroup_id
);
1082 iter
= period_map
.zonegroups
.find("default");
1084 if (iter
!= period_map
.zonegroups
.end()) {
1085 zonegroup
= iter
->second
;
1092 bool RGWPeriod::is_single_zonegroup(CephContext
*cct
, RGWRados
*store
)
1094 return (period_map
.zonegroups
.size() == 1);
1097 const string
& RGWPeriod::get_latest_epoch_oid()
1099 if (cct
->_conf
->rgw_period_latest_epoch_info_oid
.empty()) {
1100 return period_latest_epoch_info_oid
;
1102 return cct
->_conf
->rgw_period_latest_epoch_info_oid
;
1105 const string
& RGWPeriod::get_info_oid_prefix()
1107 return period_info_oid_prefix
;
1110 const string
RGWPeriod::get_period_oid_prefix()
1112 return get_info_oid_prefix() + id
;
1115 const string
RGWPeriod::get_period_oid()
1117 std::ostringstream oss
;
1118 oss
<< get_period_oid_prefix();
1119 // skip the epoch for the staging period
1120 if (id
!= get_staging_id(realm_id
))
1121 oss
<< "." << epoch
;
1125 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo
& info
)
1127 string oid
= get_period_oid_prefix() + get_latest_epoch_oid();
1129 rgw_pool
pool(get_pool(cct
));
1131 RGWObjectCtx
obj_ctx(store
);
1132 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
1134 ldout(cct
, 1) << "error read_lastest_epoch " << pool
<< ":" << oid
<< dendl
;
1138 bufferlist::iterator iter
= bl
.begin();
1139 ::decode(info
, iter
);
1140 } catch (buffer::error
& err
) {
1141 ldout(cct
, 0) << "error decoding data from " << pool
<< ":" << oid
<< dendl
;
1148 int RGWPeriod::get_latest_epoch(epoch_t
& latest_epoch
)
1150 RGWPeriodLatestEpochInfo info
;
1152 int ret
= read_latest_epoch(info
);
1157 latest_epoch
= info
.epoch
;
1162 int RGWPeriod::use_latest_epoch()
1164 RGWPeriodLatestEpochInfo info
;
1165 int ret
= read_latest_epoch(info
);
1175 int RGWPeriod::set_latest_epoch(epoch_t epoch
, bool exclusive
)
1177 string oid
= get_period_oid_prefix() + get_latest_epoch_oid();
1179 rgw_pool
pool(get_pool(cct
));
1182 RGWPeriodLatestEpochInfo info
;
1187 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
1188 exclusive
, NULL
, real_time(), NULL
);
1191 int RGWPeriod::delete_obj()
1193 rgw_pool
pool(get_pool(cct
));
1195 // delete the object for each period epoch
1196 for (epoch_t e
= 1; e
<= epoch
; e
++) {
1197 RGWPeriod p
{get_id(), e
};
1198 rgw_raw_obj oid
{pool
, p
.get_period_oid()};
1199 int ret
= store
->delete_system_obj(oid
);
1201 ldout(cct
, 0) << "WARNING: failed to delete period object " << oid
1202 << ": " << cpp_strerror(-ret
) << dendl
;
1206 // delete the .latest_epoch object
1207 rgw_raw_obj oid
{pool
, get_period_oid_prefix() + get_latest_epoch_oid()};
1208 int ret
= store
->delete_system_obj(oid
);
1210 ldout(cct
, 0) << "WARNING: failed to delete period object " << oid
1211 << ": " << cpp_strerror(-ret
) << dendl
;
1216 int RGWPeriod::read_info()
1218 rgw_pool
pool(get_pool(cct
));
1222 RGWObjectCtx
obj_ctx(store
);
1223 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, get_period_oid(), bl
, NULL
, NULL
);
1225 ldout(cct
, 0) << "failed reading obj info from " << pool
<< ":" << get_period_oid() << ": " << cpp_strerror(-ret
) << dendl
;
1230 bufferlist::iterator iter
= bl
.begin();
1231 ::decode(*this, iter
);
1232 } catch (buffer::error
& err
) {
1233 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << get_period_oid() << dendl
;
1240 int RGWPeriod::create(bool exclusive
)
1244 /* create unique id */
1247 new_uuid
.generate_random();
1248 new_uuid
.print(uuid_str
);
1251 epoch
= FIRST_EPOCH
;
1255 ret
= store_info(exclusive
);
1257 ldout(cct
, 0) << "ERROR: storing info for " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
1260 ret
= set_latest_epoch(epoch
);
1262 ldout(cct
, 0) << "ERROR: setting latest epoch " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
1268 int RGWPeriod::store_info(bool exclusive
)
1270 epoch_t latest_epoch
= FIRST_EPOCH
- 1;
1271 int ret
= get_latest_epoch(latest_epoch
);
1272 if (ret
< 0 && ret
!= -ENOENT
) {
1273 ldout(cct
, 0) << "ERROR: RGWPeriod::get_latest_epoch() returned " << cpp_strerror(-ret
) << dendl
;
1277 rgw_pool
pool(get_pool(cct
));
1279 string oid
= get_period_oid();
1281 ::encode(*this, bl
);
1282 ret
= rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
1284 ldout(cct
, 0) << "ERROR: rgw_put_system_obj(" << pool
<< ":" << oid
<< "): " << cpp_strerror(-ret
) << dendl
;
1287 if (latest_epoch
< epoch
) {
1288 ret
= set_latest_epoch(epoch
);
1290 ldout(cct
, 0) << "ERROR: RGWPeriod::set_latest_epoch() returned " << cpp_strerror(-ret
) << dendl
;
1297 rgw_pool
RGWPeriod::get_pool(CephContext
*cct
)
1299 if (cct
->_conf
->rgw_period_root_pool
.empty()) {
1300 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL
);
1302 return rgw_pool(cct
->_conf
->rgw_period_root_pool
);
1305 int RGWPeriod::use_next_epoch()
1307 epoch_t latest_epoch
;
1308 int ret
= get_latest_epoch(latest_epoch
);
1312 epoch
= latest_epoch
+ 1;
1314 if (ret
< 0 && ret
!= -ENOENT
) {
1317 if (ret
== -ENOENT
) {
1320 ldout(cct
, 0) << "Error creating new epoch " << epoch
<< dendl
;
1327 int RGWPeriod::add_zonegroup(const RGWZoneGroup
& zonegroup
)
1329 if (zonegroup
.realm_id
!= realm_id
) {
1332 int ret
= period_map
.update(zonegroup
, cct
);
1334 ldout(cct
, 0) << "ERROR: updating period map: " << cpp_strerror(-ret
) << dendl
;
1338 return store_info(false);
1341 int RGWPeriod::update()
1343 ldout(cct
, 20) << __func__
<< " realm " << realm_id
<< " period " << get_id() << dendl
;
1344 list
<string
> zonegroups
;
1345 int ret
= store
->list_zonegroups(zonegroups
);
1347 ldout(cct
, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret
) << dendl
;
1351 // clear zone short ids of removed zones. period_map.update() will add the
1352 // remaining zones back
1353 period_map
.short_zone_ids
.clear();
1355 for (auto& iter
: zonegroups
) {
1356 RGWZoneGroup
zg(string(), iter
);
1357 ret
= zg
.init(cct
, store
);
1359 ldout(cct
, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret
) << dendl
;
1363 if (zg
.realm_id
!= realm_id
) {
1364 ldout(cct
, 20) << "skipping zonegroup " << zg
.get_name() << " zone realm id " << zg
.realm_id
<< ", not on our realm " << realm_id
<< dendl
;
1368 if (zg
.master_zone
.empty()) {
1369 ldout(cct
, 0) << "ERROR: zonegroup " << zg
.get_name() << " should have a master zone " << dendl
;
1373 if (zg
.is_master_zonegroup()) {
1374 master_zonegroup
= zg
.get_id();
1375 master_zone
= zg
.master_zone
;
1378 int ret
= period_map
.update(zg
, cct
);
1384 ret
= period_config
.read(store
, realm_id
);
1385 if (ret
< 0 && ret
!= -ENOENT
) {
1386 ldout(cct
, 0) << "ERROR: failed to read period config: "
1387 << cpp_strerror(ret
) << dendl
;
1393 int RGWPeriod::reflect()
1395 for (auto& iter
: period_map
.zonegroups
) {
1396 RGWZoneGroup
& zg
= iter
.second
;
1397 zg
.reinit_instance(cct
, store
);
1398 int r
= zg
.write(false);
1400 ldout(cct
, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter
.first
<< ": " << cpp_strerror(-r
) << dendl
;
1403 if (zg
.is_master_zonegroup()) {
1404 // set master as default if no default exists
1405 r
= zg
.set_as_default(true);
1407 ldout(cct
, 1) << "Set the period's master zonegroup " << zg
.get_id()
1408 << " as the default" << dendl
;
1413 int r
= period_config
.write(store
, realm_id
);
1415 ldout(cct
, 0) << "ERROR: failed to store period config: "
1416 << cpp_strerror(-r
) << dendl
;
1422 void RGWPeriod::fork()
1424 ldout(cct
, 20) << __func__
<< " realm " << realm_id
<< " period " << id
<< dendl
;
1425 predecessor_uuid
= id
;
1426 id
= get_staging_id(realm_id
);
1431 static int read_sync_status(RGWRados
*store
, rgw_meta_sync_status
*sync_status
)
1433 // initialize a sync status manager to read the status
1434 RGWMetaSyncStatusManager
mgr(store
, store
->get_async_rados());
1439 r
= mgr
.read_sync_status(sync_status
);
1444 int RGWPeriod::update_sync_status(const RGWPeriod
¤t_period
,
1445 std::ostream
& error_stream
,
1446 bool force_if_stale
)
1448 rgw_meta_sync_status status
;
1449 int r
= read_sync_status(store
, &status
);
1451 ldout(cct
, 0) << "period failed to read sync status: "
1452 << cpp_strerror(-r
) << dendl
;
1456 std::vector
<std::string
> markers
;
1458 const auto current_epoch
= current_period
.get_realm_epoch();
1459 if (current_epoch
!= status
.sync_info
.realm_epoch
) {
1460 // no sync status markers for the current period
1461 assert(current_epoch
> status
.sync_info
.realm_epoch
);
1462 const int behind
= current_epoch
- status
.sync_info
.realm_epoch
;
1463 if (!force_if_stale
&& current_epoch
> 1) {
1464 error_stream
<< "ERROR: This zone is " << behind
<< " period(s) behind "
1465 "the current master zone in metadata sync. If this zone is promoted "
1466 "to master, any metadata changes during that time are likely to "
1468 "Waiting for this zone to catch up on metadata sync (see "
1469 "'radosgw-admin sync status') is recommended.\n"
1470 "To promote this zone to master anyway, add the flag "
1471 "--yes-i-really-mean-it." << std::endl
;
1474 // empty sync status markers - other zones will skip this period during
1475 // incremental metadata sync
1476 markers
.resize(status
.sync_info
.num_shards
);
1478 markers
.reserve(status
.sync_info
.num_shards
);
1479 for (auto& i
: status
.sync_markers
) {
1480 auto& marker
= i
.second
;
1481 // filter out markers from other periods
1482 if (marker
.realm_epoch
!= current_epoch
) {
1483 marker
.marker
.clear();
1485 markers
.emplace_back(std::move(marker
.marker
));
1489 std::swap(sync_status
, markers
);
1493 int RGWPeriod::commit(RGWRealm
& realm
, const RGWPeriod
& current_period
,
1494 std::ostream
& error_stream
, bool force_if_stale
)
1496 ldout(cct
, 20) << __func__
<< " realm " << realm
.get_id() << " period " << current_period
.get_id() << dendl
;
1497 // gateway must be in the master zone to commit
1498 if (master_zone
!= store
->get_zone_params().get_id()) {
1499 error_stream
<< "Cannot commit period on zone "
1500 << store
->get_zone_params().get_id() << ", it must be sent to "
1501 "the period's master zone " << master_zone
<< '.' << std::endl
;
1504 // period predecessor must match current period
1505 if (predecessor_uuid
!= current_period
.get_id()) {
1506 error_stream
<< "Period predecessor " << predecessor_uuid
1507 << " does not match current period " << current_period
.get_id()
1508 << ". Use 'period pull' to get the latest period from the master, "
1509 "reapply your changes, and try again." << std::endl
;
1512 // realm epoch must be 1 greater than current period
1513 if (realm_epoch
!= current_period
.get_realm_epoch() + 1) {
1514 error_stream
<< "Period's realm epoch " << realm_epoch
1515 << " does not come directly after current realm epoch "
1516 << current_period
.get_realm_epoch() << ". Use 'realm pull' to get the "
1517 "latest realm and period from the master zone, reapply your changes, "
1518 "and try again." << std::endl
;
1521 // did the master zone change?
1522 if (master_zone
!= current_period
.get_master_zone()) {
1523 // store the current metadata sync status in the period
1524 int r
= update_sync_status(current_period
, error_stream
, force_if_stale
);
1526 ldout(cct
, 0) << "failed to update metadata sync status: "
1527 << cpp_strerror(-r
) << dendl
;
1530 // create an object with a new period id
1533 ldout(cct
, 0) << "failed to create new period: " << cpp_strerror(-r
) << dendl
;
1536 // set as current period
1537 r
= realm
.set_current_period(*this);
1539 ldout(cct
, 0) << "failed to update realm's current period: "
1540 << cpp_strerror(-r
) << dendl
;
1543 ldout(cct
, 4) << "Promoted to master zone and committed new period "
1545 realm
.notify_new_period(*this);
1548 // period must be based on current epoch
1549 if (epoch
!= current_period
.get_epoch()) {
1550 error_stream
<< "Period epoch " << epoch
<< " does not match "
1551 "predecessor epoch " << current_period
.get_epoch()
1552 << ". Use 'period pull' to get the latest epoch from the master zone, "
1553 "reapply your changes, and try again." << std::endl
;
1556 // set period as next epoch
1557 set_id(current_period
.get_id());
1558 set_epoch(current_period
.get_epoch() + 1);
1559 set_predecessor(current_period
.get_predecessor());
1560 realm_epoch
= current_period
.get_realm_epoch();
1561 // write the period to rados
1562 int r
= store_info(false);
1564 ldout(cct
, 0) << "failed to store period: " << cpp_strerror(-r
) << dendl
;
1567 // set as latest epoch
1568 r
= set_latest_epoch(epoch
);
1570 ldout(cct
, 0) << "failed to set latest epoch: " << cpp_strerror(-r
) << dendl
;
1575 ldout(cct
, 0) << "failed to update local objects: " << cpp_strerror(-r
) << dendl
;
1578 ldout(cct
, 4) << "Committed new epoch " << epoch
1579 << " for period " << id
<< dendl
;
1580 realm
.notify_new_period(*this);
1584 int RGWZoneParams::create_default(bool old_format
)
1586 name
= default_zone_name
;
1601 int get_zones_pool_set(CephContext
* cct
,
1603 const list
<string
>& zones
,
1604 const string
& my_zone_id
,
1605 set
<rgw_pool
>& pool_names
)
1607 for(auto const& iter
: zones
) {
1608 RGWZoneParams
zone(iter
);
1609 int r
= zone
.init(cct
, store
);
1611 ldout(cct
, 0) << "Error: init zone " << iter
<< ":" << cpp_strerror(-r
) << dendl
;
1614 if (zone
.get_id() != my_zone_id
) {
1615 pool_names
.insert(zone
.domain_root
);
1616 pool_names
.insert(zone
.metadata_heap
);
1617 pool_names
.insert(zone
.control_pool
);
1618 pool_names
.insert(zone
.gc_pool
);
1619 pool_names
.insert(zone
.log_pool
);
1620 pool_names
.insert(zone
.intent_log_pool
);
1621 pool_names
.insert(zone
.usage_log_pool
);
1622 pool_names
.insert(zone
.user_keys_pool
);
1623 pool_names
.insert(zone
.user_email_pool
);
1624 pool_names
.insert(zone
.user_swift_pool
);
1625 pool_names
.insert(zone
.user_uid_pool
);
1626 pool_names
.insert(zone
.roles_pool
);
1627 for(auto& iter
: zone
.placement_pools
) {
1628 pool_names
.insert(iter
.second
.index_pool
);
1629 pool_names
.insert(iter
.second
.data_pool
);
1630 pool_names
.insert(iter
.second
.data_extra_pool
);
1637 rgw_pool
fix_zone_pool_dup(set
<rgw_pool
> pools
,
1638 const string
& default_prefix
,
1639 const string
& default_suffix
,
1640 const rgw_pool
& suggested_pool
)
1642 string suggested_name
= suggested_pool
.to_str();
1644 string prefix
= default_prefix
;
1645 string suffix
= default_suffix
;
1647 if (!suggested_pool
.empty()) {
1648 prefix
= suggested_name
.substr(0, suggested_name
.find("."));
1649 suffix
= suggested_name
.substr(prefix
.length());
1652 rgw_pool
pool(prefix
+ suffix
);
1654 if (pools
.find(pool
) == pools
.end()) {
1658 pool
= prefix
+ "_" + std::to_string(std::rand()) + suffix
;
1659 if (pools
.find(pool
) == pools
.end()) {
1666 int RGWZoneParams::fix_pool_names()
1670 int r
= store
->list_zones(zones
);
1672 ldout(cct
, 10) << "WARNING: store->list_zones() returned r=" << r
<< dendl
;
1675 set
<rgw_pool
> pools
;
1676 r
= get_zones_pool_set(cct
, store
, zones
, id
, pools
);
1678 ldout(cct
, 0) << "Error: get_zones_pool_names" << r
<< dendl
;
1682 domain_root
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:root", domain_root
);
1683 if (!metadata_heap
.name
.empty()) {
1684 metadata_heap
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:heap", metadata_heap
);
1686 control_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.control", control_pool
);
1687 gc_pool
= fix_zone_pool_dup(pools
, name
,".rgw.log:gc", gc_pool
);
1688 lc_pool
= fix_zone_pool_dup(pools
, name
,".rgw.log:lc", lc_pool
);
1689 log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log", log_pool
);
1690 intent_log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:intent", intent_log_pool
);
1691 usage_log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:usage", usage_log_pool
);
1692 user_keys_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.keys", user_keys_pool
);
1693 user_email_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.email", user_email_pool
);
1694 user_swift_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.swift", user_swift_pool
);
1695 user_uid_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.uid", user_uid_pool
);
1696 roles_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:roles", roles_pool
);
1698 for(auto& iter
: placement_pools
) {
1699 iter
.second
.index_pool
= fix_zone_pool_dup(pools
, name
, "." + default_bucket_index_pool_suffix
,
1700 iter
.second
.index_pool
);
1701 iter
.second
.data_pool
= fix_zone_pool_dup(pools
, name
, "." + default_storage_pool_suffix
,
1702 iter
.second
.data_pool
);
1703 iter
.second
.data_extra_pool
= fix_zone_pool_dup(pools
, name
, "." + default_storage_extra_pool_suffix
,
1704 iter
.second
.data_extra_pool
);
1710 int RGWZoneParams::create(bool exclusive
)
1712 /* check for old pools config */
1713 rgw_raw_obj
obj(domain_root
, avail_pools
);
1714 int r
= store
->raw_obj_stat(obj
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
);
1716 ldout(store
->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl
;
1717 /* a new system, let's set new placement info */
1718 RGWZonePlacementInfo default_placement
;
1719 default_placement
.index_pool
= name
+ "." + default_bucket_index_pool_suffix
;
1720 default_placement
.data_pool
= name
+ "." + default_storage_pool_suffix
;
1721 default_placement
.data_extra_pool
= name
+ "." + default_storage_extra_pool_suffix
;
1722 placement_pools
["default-placement"] = default_placement
;
1725 r
= fix_pool_names();
1727 ldout(cct
, 0) << "ERROR: fix_pool_names returned r=" << r
<< dendl
;
1731 r
= RGWSystemMetaObj::create(exclusive
);
1736 // try to set as default. may race with another create, so pass exclusive=true
1737 // so we don't override an existing default
1738 r
= set_as_default(true);
1739 if (r
< 0 && r
!= -EEXIST
) {
1740 ldout(cct
, 10) << "WARNING: failed to set zone as default, r=" << r
<< dendl
;
1746 rgw_pool
RGWZoneParams::get_pool(CephContext
*cct
)
1748 if (cct
->_conf
->rgw_zone_root_pool
.empty()) {
1749 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL
);
1752 return rgw_pool(cct
->_conf
->rgw_zone_root_pool
);
1755 const string
RGWZoneParams::get_default_oid(bool old_format
)
1758 return cct
->_conf
->rgw_default_zone_info_oid
;
1761 return cct
->_conf
->rgw_default_zone_info_oid
+ "." + realm_id
;
1764 const string
& RGWZoneParams::get_names_oid_prefix()
1766 return zone_names_oid_prefix
;
1769 const string
& RGWZoneParams::get_info_oid_prefix(bool old_format
)
1771 return zone_info_oid_prefix
;
1774 const string
& RGWZoneParams::get_predefined_name(CephContext
*cct
) {
1775 return cct
->_conf
->rgw_zone
;
1778 int RGWZoneParams::init(CephContext
*cct
, RGWRados
*store
, bool setup_obj
, bool old_format
)
1781 name
= cct
->_conf
->rgw_zone
;
1784 return RGWSystemMetaObj::init(cct
, store
, setup_obj
, old_format
);
1787 int RGWZoneParams::read_default_id(string
& default_id
, bool old_format
)
1789 if (realm_id
.empty()) {
1790 /* try using default realm */
1792 int ret
= realm
.init(cct
, store
);
1794 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
1797 realm_id
= realm
.get_id();
1800 return RGWSystemMetaObj::read_default_id(default_id
, old_format
);
1804 int RGWZoneParams::set_as_default(bool exclusive
)
1806 if (realm_id
.empty()) {
1807 /* try using default realm */
1809 int ret
= realm
.init(cct
, store
);
1811 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
1814 realm_id
= realm
.get_id();
1817 return RGWSystemMetaObj::set_as_default(exclusive
);
1820 const string
& RGWZoneParams::get_compression_type(const string
& placement_rule
) const
1822 static const std::string NONE
{"none"};
1823 auto p
= placement_pools
.find(placement_rule
);
1824 if (p
== placement_pools
.end()) {
1827 const auto& type
= p
->second
.compression_type
;
1828 return !type
.empty() ? type
: NONE
;
1831 void RGWPeriodMap::encode(bufferlist
& bl
) const {
1832 ENCODE_START(2, 1, bl
);
1834 ::encode(zonegroups
, bl
);
1835 ::encode(master_zonegroup
, bl
);
1836 ::encode(short_zone_ids
, bl
);
1840 void RGWPeriodMap::decode(bufferlist::iterator
& bl
) {
1841 DECODE_START(2, bl
);
1843 ::decode(zonegroups
, bl
);
1844 ::decode(master_zonegroup
, bl
);
1845 if (struct_v
>= 2) {
1846 ::decode(short_zone_ids
, bl
);
1850 zonegroups_by_api
.clear();
1851 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.begin();
1852 iter
!= zonegroups
.end(); ++iter
) {
1853 RGWZoneGroup
& zonegroup
= iter
->second
;
1854 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1855 if (zonegroup
.is_master
) {
1856 master_zonegroup
= zonegroup
.get_id();
1861 // run an MD5 hash on the zone_id and return the first 32 bits
1862 static uint32_t gen_short_zone_id(const std::string zone_id
)
1864 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
1866 hash
.Update((const byte
*)zone_id
.c_str(), zone_id
.size());
1870 memcpy((char *)&short_id
, md5
, sizeof(short_id
));
1871 return std::max(short_id
, 1u);
1874 int RGWPeriodMap::update(const RGWZoneGroup
& zonegroup
, CephContext
*cct
)
1876 if (zonegroup
.is_master
&& (!master_zonegroup
.empty() && zonegroup
.get_id() != master_zonegroup
)) {
1877 ldout(cct
,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl
;
1878 ldout(cct
,0) << "master zonegroup: " << master_zonegroup
<< " and " << zonegroup
.get_id() <<dendl
;
1881 map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.find(zonegroup
.get_id());
1882 if (iter
!= zonegroups
.end()) {
1883 RGWZoneGroup
& old_zonegroup
= iter
->second
;
1884 if (!old_zonegroup
.api_name
.empty()) {
1885 zonegroups_by_api
.erase(old_zonegroup
.api_name
);
1888 zonegroups
[zonegroup
.get_id()] = zonegroup
;
1890 if (!zonegroup
.api_name
.empty()) {
1891 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1894 if (zonegroup
.is_master
) {
1895 master_zonegroup
= zonegroup
.get_id();
1896 } else if (master_zonegroup
== zonegroup
.get_id()) {
1897 master_zonegroup
= "";
1900 for (auto& i
: zonegroup
.zones
) {
1901 auto& zone
= i
.second
;
1902 if (short_zone_ids
.find(zone
.id
) != short_zone_ids
.end()) {
1905 // calculate the zone's short id
1906 uint32_t short_id
= gen_short_zone_id(zone
.id
);
1908 // search for an existing zone with the same short id
1909 for (auto& s
: short_zone_ids
) {
1910 if (s
.second
== short_id
) {
1911 ldout(cct
, 0) << "New zone '" << zone
.name
<< "' (" << zone
.id
1912 << ") generates the same short_zone_id " << short_id
1913 << " as existing zone id " << s
.first
<< dendl
;
1918 short_zone_ids
[zone
.id
] = short_id
;
1924 uint32_t RGWPeriodMap::get_zone_short_id(const string
& zone_id
) const
1926 auto i
= short_zone_ids
.find(zone_id
);
1927 if (i
== short_zone_ids
.end()) {
1933 int RGWZoneGroupMap::read(CephContext
*cct
, RGWRados
*store
)
1937 int ret
= period
.init(cct
, store
);
1939 cerr
<< "failed to read current period info: " << cpp_strerror(ret
);
1943 bucket_quota
= period
.get_config().bucket_quota
;
1944 user_quota
= period
.get_config().user_quota
;
1945 zonegroups
= period
.get_map().zonegroups
;
1946 zonegroups_by_api
= period
.get_map().zonegroups_by_api
;
1947 master_zonegroup
= period
.get_map().master_zonegroup
;
1952 void RGWRegionMap::encode(bufferlist
& bl
) const {
1953 ENCODE_START( 3, 1, bl
);
1954 ::encode(regions
, bl
);
1955 ::encode(master_region
, bl
);
1956 ::encode(bucket_quota
, bl
);
1957 ::encode(user_quota
, bl
);
1961 void RGWRegionMap::decode(bufferlist::iterator
& bl
) {
1962 DECODE_START(3, bl
);
1963 ::decode(regions
, bl
);
1964 ::decode(master_region
, bl
);
1966 ::decode(bucket_quota
, bl
);
1968 ::decode(user_quota
, bl
);
1972 void RGWZoneGroupMap::encode(bufferlist
& bl
) const {
1973 ENCODE_START( 3, 1, bl
);
1974 ::encode(zonegroups
, bl
);
1975 ::encode(master_zonegroup
, bl
);
1976 ::encode(bucket_quota
, bl
);
1977 ::encode(user_quota
, bl
);
1981 void RGWZoneGroupMap::decode(bufferlist::iterator
& bl
) {
1982 DECODE_START(3, bl
);
1983 ::decode(zonegroups
, bl
);
1984 ::decode(master_zonegroup
, bl
);
1986 ::decode(bucket_quota
, bl
);
1988 ::decode(user_quota
, bl
);
1991 zonegroups_by_api
.clear();
1992 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.begin();
1993 iter
!= zonegroups
.end(); ++iter
) {
1994 RGWZoneGroup
& zonegroup
= iter
->second
;
1995 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1996 if (zonegroup
.is_master
) {
1997 master_zonegroup
= zonegroup
.get_name();
2002 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation
*op
)
2004 obj_version
*check_objv
= version_for_check();
2007 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
2010 cls_version_read(*op
, &read_version
);
2013 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation
*op
)
2015 obj_version
*check_objv
= version_for_check();
2016 obj_version
*modify_version
= version_for_write();
2019 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
2022 if (modify_version
) {
2023 cls_version_set(*op
, *modify_version
);
2025 cls_version_inc(*op
);
2029 void RGWObjManifest::obj_iterator::operator++()
2031 if (manifest
->explicit_objs
) {
2034 if (explicit_iter
== manifest
->objs
.end()) {
2035 ofs
= manifest
->obj_size
;
2039 update_explicit_pos();
2045 uint64_t obj_size
= manifest
->get_obj_size();
2046 uint64_t head_size
= manifest
->get_head_size();
2048 if (ofs
== obj_size
) {
2052 if (manifest
->rules
.empty()) {
2056 /* are we still pointing at the head? */
2057 if (ofs
< head_size
) {
2058 rule_iter
= manifest
->rules
.begin();
2059 RGWObjManifestRule
*rule
= &rule_iter
->second
;
2060 ofs
= MIN(head_size
, obj_size
);
2063 stripe_size
= MIN(obj_size
- ofs
, rule
->stripe_max_size
);
2064 if (rule
->part_size
> 0) {
2065 stripe_size
= MIN(stripe_size
, rule
->part_size
);
2071 RGWObjManifestRule
*rule
= &rule_iter
->second
;
2073 stripe_ofs
+= rule
->stripe_max_size
;
2075 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule
->part_size
<< " rules.size()=" << manifest
->rules
.size() << dendl
;
2077 if (rule
->part_size
> 0) {
2078 /* multi part, multi stripes object */
2080 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
2082 if (stripe_ofs
>= part_ofs
+ rule
->part_size
) {
2083 /* moved to the next part */
2085 part_ofs
+= rule
->part_size
;
2086 stripe_ofs
= part_ofs
;
2088 bool last_rule
= (next_rule_iter
== manifest
->rules
.end());
2089 /* move to the next rule? */
2090 if (!last_rule
&& stripe_ofs
>= next_rule_iter
->second
.start_ofs
) {
2091 rule_iter
= next_rule_iter
;
2092 last_rule
= (next_rule_iter
== manifest
->rules
.end());
2096 cur_part_id
= rule_iter
->second
.start_part_num
;
2101 rule
= &rule_iter
->second
;
2104 stripe_size
= MIN(rule
->part_size
- (stripe_ofs
- part_ofs
), rule
->stripe_max_size
);
2107 cur_override_prefix
= rule
->override_prefix
;
2110 if (ofs
> obj_size
) {
2116 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs
<< " stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
2120 int RGWObjManifest::generator::create_begin(CephContext
*cct
, RGWObjManifest
*_m
, const string
& placement_rule
, rgw_bucket
& _b
, rgw_obj
& _obj
)
2124 manifest
->set_tail_placement(placement_rule
, _b
);
2125 manifest
->set_head(placement_rule
, _obj
, 0);
2128 if (manifest
->get_prefix().empty()) {
2130 gen_rand_alphanumeric(cct
, buf
, sizeof(buf
) - 1);
2132 string oid_prefix
= ".";
2133 oid_prefix
.append(buf
);
2134 oid_prefix
.append("_");
2136 manifest
->set_prefix(oid_prefix
);
2139 bool found
= manifest
->get_rule(0, &rule
);
2141 derr
<< "ERROR: manifest->get_rule() could not find rule" << dendl
;
2145 uint64_t head_size
= manifest
->get_head_size();
2147 if (head_size
> 0) {
2148 cur_stripe_size
= head_size
;
2150 cur_stripe_size
= rule
.stripe_max_size
;
2153 cur_part_id
= rule
.start_part_num
;
2155 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, 0, NULL
, &cur_obj
);
2157 // Normal object which not generated through copy operation
2158 manifest
->set_tail_instance(_obj
.key
.instance
);
2160 manifest
->update_iterators();
2165 int RGWObjManifest::generator::create_next(uint64_t ofs
)
2167 if (ofs
< last_ofs
) /* only going forward */
2170 uint64_t max_head_size
= manifest
->get_max_head_size();
2172 if (ofs
< max_head_size
) {
2173 manifest
->set_head_size(ofs
);
2176 if (ofs
>= max_head_size
) {
2177 manifest
->set_head_size(max_head_size
);
2178 cur_stripe
= (ofs
- max_head_size
) / rule
.stripe_max_size
;
2179 cur_stripe_size
= rule
.stripe_max_size
;
2181 if (cur_part_id
== 0 && max_head_size
> 0) {
2187 manifest
->set_obj_size(ofs
);
2189 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, ofs
, NULL
, &cur_obj
);
2191 manifest
->update_iterators();
2196 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_begin()
2201 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_end()
2206 RGWObjManifest::obj_iterator
RGWObjManifest::obj_find(uint64_t ofs
)
2208 if (ofs
> obj_size
) {
2211 RGWObjManifest::obj_iterator
iter(this);
2216 int RGWObjManifest::append(RGWObjManifest
& m
, RGWZoneGroup
& zonegroup
, RGWZoneParams
& zone_params
)
2218 if (explicit_objs
|| m
.explicit_objs
) {
2219 return append_explicit(m
, zonegroup
, zone_params
);
2222 if (rules
.empty()) {
2227 string override_prefix
;
2229 if (prefix
.empty()) {
2233 if (prefix
!= m
.prefix
) {
2234 override_prefix
= m
.prefix
;
2237 map
<uint64_t, RGWObjManifestRule
>::iterator miter
= m
.rules
.begin();
2238 if (miter
== m
.rules
.end()) {
2239 return append_explicit(m
, zonegroup
, zone_params
);
2242 for (; miter
!= m
.rules
.end(); ++miter
) {
2243 map
<uint64_t, RGWObjManifestRule
>::reverse_iterator last_rule
= rules
.rbegin();
2245 RGWObjManifestRule
& rule
= last_rule
->second
;
2247 if (rule
.part_size
== 0) {
2248 rule
.part_size
= obj_size
- rule
.start_ofs
;
2251 RGWObjManifestRule
& next_rule
= miter
->second
;
2252 if (!next_rule
.part_size
) {
2253 next_rule
.part_size
= m
.obj_size
- next_rule
.start_ofs
;
2256 string rule_prefix
= prefix
;
2257 if (!rule
.override_prefix
.empty()) {
2258 rule_prefix
= rule
.override_prefix
;
2261 string next_rule_prefix
= m
.prefix
;
2262 if (!next_rule
.override_prefix
.empty()) {
2263 next_rule_prefix
= next_rule
.override_prefix
;
2266 if (rule
.part_size
!= next_rule
.part_size
||
2267 rule
.stripe_max_size
!= next_rule
.stripe_max_size
||
2268 rule_prefix
!= next_rule_prefix
) {
2269 if (next_rule_prefix
!= prefix
) {
2270 append_rules(m
, miter
, &next_rule_prefix
);
2272 append_rules(m
, miter
, NULL
);
2277 uint64_t expected_part_num
= rule
.start_part_num
+ 1;
2278 if (rule
.part_size
> 0) {
2279 expected_part_num
= rule
.start_part_num
+ (obj_size
+ next_rule
.start_ofs
- rule
.start_ofs
) / rule
.part_size
;
2282 if (expected_part_num
!= next_rule
.start_part_num
) {
2283 append_rules(m
, miter
, NULL
);
2288 set_obj_size(obj_size
+ m
.obj_size
);
2293 int RGWObjManifest::append(RGWObjManifest
& m
, RGWRados
*store
)
2295 return append(m
, store
->get_zonegroup(), store
->get_zone_params());
2298 void RGWObjManifest::append_rules(RGWObjManifest
& m
, map
<uint64_t, RGWObjManifestRule
>::iterator
& miter
,
2299 string
*override_prefix
)
2301 for (; miter
!= m
.rules
.end(); ++miter
) {
2302 RGWObjManifestRule rule
= miter
->second
;
2303 rule
.start_ofs
+= obj_size
;
2304 if (override_prefix
)
2305 rule
.override_prefix
= *override_prefix
;
2306 rules
[rule
.start_ofs
] = rule
;
2310 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
2312 if (explicit_objs
) {
2315 obj_iterator iter
= obj_begin();
2317 while (iter
!= obj_end()) {
2318 RGWObjManifestPart
& part
= objs
[iter
.get_stripe_ofs()];
2319 const rgw_obj_select
& os
= iter
.get_location();
2320 const rgw_raw_obj
& raw_loc
= os
.get_raw_obj(zonegroup
, zone_params
);
2323 uint64_t ofs
= iter
.get_stripe_ofs();
2328 rgw_raw_obj_to_obj(tail_placement
.bucket
, raw_loc
, &part
.loc
);
2331 uint64_t next_ofs
= iter
.get_stripe_ofs();
2333 part
.size
= next_ofs
- ofs
;
2336 explicit_objs
= true;
2341 int RGWObjManifest::append_explicit(RGWObjManifest
& m
, const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
2343 if (!explicit_objs
) {
2344 convert_to_explicit(zonegroup
, zone_params
);
2346 if (!m
.explicit_objs
) {
2347 m
.convert_to_explicit(zonegroup
, zone_params
);
2349 map
<uint64_t, RGWObjManifestPart
>::iterator iter
;
2350 uint64_t base
= obj_size
;
2351 for (iter
= m
.objs
.begin(); iter
!= m
.objs
.end(); ++iter
) {
2352 RGWObjManifestPart
& part
= iter
->second
;
2353 objs
[base
+ iter
->first
] = part
;
2355 obj_size
+= m
.obj_size
;
2360 bool RGWObjManifest::get_rule(uint64_t ofs
, RGWObjManifestRule
*rule
)
2362 if (rules
.empty()) {
2366 map
<uint64_t, RGWObjManifestRule
>::iterator iter
= rules
.upper_bound(ofs
);
2367 if (iter
!= rules
.begin()) {
2371 *rule
= iter
->second
;
2376 void RGWObjVersionTracker::generate_new_write_ver(CephContext
*cct
)
2378 write_version
.ver
= 1;
2381 write_version
.tag
.clear();
2382 append_rand_alpha(cct
, write_version
.tag
, write_version
.tag
, TAG_LEN
);
2385 int RGWPutObjProcessor::complete(size_t accounted_size
, const string
& etag
,
2386 real_time
*mtime
, real_time set_mtime
,
2387 map
<string
, bufferlist
>& attrs
, real_time delete_at
,
2388 const char *if_match
, const char *if_nomatch
, const string
*user_data
)
2390 int r
= do_complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
, if_match
, if_nomatch
, user_data
);
2394 is_complete
= !canceled
;
2398 CephContext
*RGWPutObjProcessor::ctx()
2400 return store
->ctx();
2403 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2410 set
<rgw_raw_obj
>::iterator iter
;
2411 bool need_to_remove_head
= false;
2412 rgw_raw_obj raw_head
;
2414 if (!head_obj
.empty()) {
2415 store
->obj_to_raw(bucket_info
.placement_rule
, head_obj
, &raw_head
);
2419 * We should delete the object in the "multipart" namespace to avoid race condition.
2420 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2421 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2422 * written by the second upload may be deleted by the first upload.
2423 * details is describled on #11749
2425 * The above comment still stands, but instead of searching for a specific object in the multipart
2426 * namespace, we just make sure that we remove the object that is marked as the head object after
2427 * we remove all the other raw objects. Note that we use different call to remove the head object,
2428 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2430 for (iter
= written_objs
.begin(); iter
!= written_objs
.end(); ++iter
) {
2431 const rgw_raw_obj
& obj
= *iter
;
2432 if (!head_obj
.empty() && obj
== raw_head
) {
2433 ldout(store
->ctx(), 5) << "NOTE: we should not process the head object (" << obj
<< ") here" << dendl
;
2434 need_to_remove_head
= true;
2438 int r
= store
->delete_raw_obj(obj
);
2439 if (r
< 0 && r
!= -ENOENT
) {
2440 ldout(store
->ctx(), 5) << "WARNING: failed to remove obj (" << obj
<< "), leaked" << dendl
;
2444 if (need_to_remove_head
) {
2445 ldout(store
->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head
<< ")" << dendl
;
2446 int r
= store
->delete_obj(obj_ctx
, bucket_info
, head_obj
, 0, 0);
2447 if (r
< 0 && r
!= -ENOENT
) {
2448 ldout(store
->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head
<< "), leaked" << dendl
;
2453 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj
& obj
, bufferlist
& bl
, off_t ofs
, off_t abs_ofs
, void **phandle
, bool exclusive
)
2455 if ((uint64_t)abs_ofs
+ bl
.length() > obj_len
)
2456 obj_len
= abs_ofs
+ bl
.length();
2458 if (!(obj
== last_written_obj
)) {
2459 last_written_obj
= obj
;
2462 // For the first call pass -1 as the offset to
2464 return store
->aio_put_obj_data(NULL
, obj
, bl
, ((ofs
!= 0) ? ofs
: -1), exclusive
, phandle
);
2467 struct put_obj_aio_info
RGWPutObjProcessor_Aio::pop_pending()
2469 struct put_obj_aio_info info
;
2470 info
= pending
.front();
2471 pending
.pop_front();
2472 pending_size
-= info
.size
;
2476 int RGWPutObjProcessor_Aio::wait_pending_front()
2478 if (pending
.empty()) {
2481 struct put_obj_aio_info info
= pop_pending();
2482 int ret
= store
->aio_wait(info
.handle
);
2485 add_written_obj(info
.obj
);
2491 bool RGWPutObjProcessor_Aio::pending_has_completed()
2493 if (pending
.empty())
2496 struct put_obj_aio_info
& info
= pending
.front();
2497 return store
->aio_completed(info
.handle
);
2500 int RGWPutObjProcessor_Aio::drain_pending()
2503 while (!pending
.empty()) {
2504 int r
= wait_pending_front();
2511 int RGWPutObjProcessor_Aio::throttle_data(void *handle
, const rgw_raw_obj
& obj
, uint64_t size
, bool need_to_wait
)
2513 bool _wait
= need_to_wait
;
2516 struct put_obj_aio_info info
;
2517 info
.handle
= handle
;
2520 pending_size
+= size
;
2521 pending
.push_back(info
);
2523 size_t orig_size
= pending_size
;
2525 /* first drain complete IOs */
2526 while (pending_has_completed()) {
2527 int r
= wait_pending_front();
2534 /* resize window in case messages are draining too fast */
2535 if (orig_size
- pending_size
>= window_size
) {
2536 window_size
+= store
->ctx()->_conf
->rgw_max_chunk_size
;
2537 uint64_t max_window_size
= store
->ctx()->_conf
->rgw_put_obj_max_window_size
;
2538 if (window_size
> max_window_size
) {
2539 window_size
= max_window_size
;
2543 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2544 if (pending_size
> window_size
|| _wait
) {
2545 int r
= wait_pending_front();
2552 int RGWPutObjProcessor_Atomic::write_data(bufferlist
& bl
, off_t ofs
, void **phandle
, rgw_raw_obj
*pobj
, bool exclusive
)
2554 if (ofs
>= next_part_ofs
) {
2555 int r
= prepare_next_part(ofs
);
2566 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj
, bl
, ofs
- cur_part_ofs
, ofs
, phandle
, exclusive
);
2569 int RGWPutObjProcessor_Aio::prepare(RGWRados
*store
, string
*oid_rand
)
2571 RGWPutObjProcessor::prepare(store
, oid_rand
);
2573 window_size
= store
->ctx()->_conf
->rgw_put_obj_min_window_size
;
2578 int RGWPutObjProcessor_Atomic::handle_data(bufferlist
& bl
, off_t ofs
, void **phandle
, rgw_raw_obj
*pobj
, bool *again
)
2581 uint64_t max_write_size
= MIN(max_chunk_size
, (uint64_t)next_part_ofs
- data_ofs
);
2583 pending_data_bl
.claim_append(bl
);
2584 if (pending_data_bl
.length() < max_write_size
) {
2589 pending_data_bl
.splice(0, max_write_size
, &bl
);
2591 /* do we have enough data pending accumulated that needs to be written? */
2592 *again
= (pending_data_bl
.length() >= max_chunk_size
);
2594 if (!data_ofs
&& !immutable_head()) {
2595 first_chunk
.claim(bl
);
2596 obj_len
= (uint64_t)first_chunk
.length();
2597 int r
= prepare_next_part(obj_len
);
2604 off_t write_ofs
= data_ofs
;
2605 data_ofs
= write_ofs
+ bl
.length();
2606 bool exclusive
= (!write_ofs
&& immutable_head()); /* immutable head object, need to verify nothing exists there
2607 we could be racing with another upload, to the same
2608 object and cleanup can be messy */
2609 int ret
= write_data(bl
, write_ofs
, phandle
, pobj
, exclusive
);
2610 if (ret
>= 0) { /* we might return, need to clear bl as it was already sent */
2617 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados
*store
, string
*oid_rand
)
2619 RGWPutObjProcessor_Aio::prepare(store
, oid_rand
);
2621 int r
= store
->get_max_chunk_size(bucket_info
.placement_rule
, head_obj
, &max_chunk_size
);
2629 int RGWPutObjProcessor_Atomic::prepare(RGWRados
*store
, string
*oid_rand
)
2631 head_obj
.init(bucket
, obj_str
);
2633 int r
= prepare_init(store
, oid_rand
);
2638 if (!version_id
.empty()) {
2639 head_obj
.key
.set_instance(version_id
);
2640 } else if (versioned_object
) {
2641 store
->gen_rand_obj_instance_name(&head_obj
);
2644 manifest
.set_trivial_rule(max_chunk_size
, store
->ctx()->_conf
->rgw_obj_stripe_size
);
2646 r
= manifest_gen
.create_begin(store
->ctx(), &manifest
, bucket_info
.placement_rule
, head_obj
.bucket
, head_obj
);
2654 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs
) {
2656 int ret
= manifest_gen
.create_next(ofs
);
2658 lderr(store
->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret
<< dendl
;
2662 next_part_ofs
= ofs
+ manifest_gen
.cur_stripe_max_size();
2663 cur_obj
= manifest_gen
.get_cur_obj(store
);
2668 int RGWPutObjProcessor_Atomic::complete_parts()
2670 if (obj_len
> (uint64_t)cur_part_ofs
) {
2671 return prepare_next_part(obj_len
);
2676 int RGWPutObjProcessor_Atomic::complete_writing_data()
2678 if (!data_ofs
&& !immutable_head()) {
2679 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2680 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2681 * clobber first_chunk
2683 if (pending_data_bl
.length() > 0) {
2684 first_chunk
.claim(pending_data_bl
);
2686 obj_len
= (uint64_t)first_chunk
.length();
2688 while (pending_data_bl
.length()) {
2691 uint64_t max_write_size
= MIN(max_chunk_size
, (uint64_t)next_part_ofs
- data_ofs
);
2692 if (max_write_size
> pending_data_bl
.length()) {
2693 max_write_size
= pending_data_bl
.length();
2696 pending_data_bl
.splice(0, max_write_size
, &bl
);
2697 uint64_t write_len
= bl
.length();
2698 int r
= write_data(bl
, data_ofs
, &handle
, &obj
, false);
2700 ldout(store
->ctx(), 0) << "ERROR: write_data() returned " << r
<< dendl
;
2703 data_ofs
+= write_len
;
2704 r
= throttle_data(handle
, obj
, write_len
, false);
2706 ldout(store
->ctx(), 0) << "ERROR: throttle_data() returned " << r
<< dendl
;
2710 if (data_ofs
>= next_part_ofs
) {
2711 r
= prepare_next_part(data_ofs
);
2713 ldout(store
->ctx(), 0) << "ERROR: prepare_next_part() returned " << r
<< dendl
;
2718 int r
= complete_parts();
2723 r
= drain_pending();
2730 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size
, const string
& etag
,
2731 real_time
*mtime
, real_time set_mtime
,
2732 map
<string
, bufferlist
>& attrs
,
2733 real_time delete_at
,
2734 const char *if_match
,
2735 const char *if_nomatch
, const string
*user_data
) {
2736 int r
= complete_writing_data();
2740 obj_ctx
.obj
.set_atomic(head_obj
);
2742 RGWRados::Object
op_target(store
, bucket_info
, obj_ctx
, head_obj
);
2744 /* some object types shouldn't be versioned, e.g., multipart parts */
2745 op_target
.set_versioning_disabled(!versioned_object
);
2747 RGWRados::Object::Write
obj_op(&op_target
);
2749 obj_op
.meta
.data
= &first_chunk
;
2750 obj_op
.meta
.manifest
= &manifest
;
2751 obj_op
.meta
.ptag
= &unique_tag
; /* use req_id as operation tag */
2752 obj_op
.meta
.if_match
= if_match
;
2753 obj_op
.meta
.if_nomatch
= if_nomatch
;
2754 obj_op
.meta
.mtime
= mtime
;
2755 obj_op
.meta
.set_mtime
= set_mtime
;
2756 obj_op
.meta
.owner
= bucket_info
.owner
;
2757 obj_op
.meta
.flags
= PUT_OBJ_CREATE
;
2758 obj_op
.meta
.olh_epoch
= olh_epoch
;
2759 obj_op
.meta
.delete_at
= delete_at
;
2760 obj_op
.meta
.user_data
= user_data
;
2762 r
= obj_op
.write_meta(obj_len
, accounted_size
, attrs
);
2767 canceled
= obj_op
.meta
.canceled
;
2772 int RGWRados::watch(const string
& oid
, uint64_t *watch_handle
, librados::WatchCtx2
*ctx
) {
2773 int r
= control_pool_ctx
.watch2(oid
, watch_handle
, ctx
);
2779 int RGWRados::unwatch(uint64_t watch_handle
)
2781 int r
= control_pool_ctx
.unwatch2(watch_handle
);
2783 ldout(cct
, 0) << "ERROR: rados->unwatch2() returned r=" << r
<< dendl
;
2786 r
= rados
[0].watch_flush();
2788 ldout(cct
, 0) << "ERROR: rados->watch_flush() returned r=" << r
<< dendl
;
2794 void RGWRados::add_watcher(int i
)
2796 ldout(cct
, 20) << "add_watcher() i=" << i
<< dendl
;
2797 Mutex::Locker
l(watchers_lock
);
2798 watchers_set
.insert(i
);
2799 if (watchers_set
.size() == (size_t)num_watchers
) {
2800 ldout(cct
, 2) << "all " << num_watchers
<< " watchers are set, enabling cache" << dendl
;
2801 set_cache_enabled(true);
2805 void RGWRados::remove_watcher(int i
)
2807 ldout(cct
, 20) << "remove_watcher() i=" << i
<< dendl
;
2808 Mutex::Locker
l(watchers_lock
);
2809 size_t orig_size
= watchers_set
.size();
2810 watchers_set
.erase(i
);
2811 if (orig_size
== (size_t)num_watchers
&&
2812 watchers_set
.size() < orig_size
) { /* actually removed */
2813 ldout(cct
, 2) << "removed watcher, disabling cache" << dendl
;
2814 set_cache_enabled(false);
2818 class RGWWatcher
: public librados::WatchCtx2
{
2822 uint64_t watch_handle
;
2824 class C_ReinitWatch
: public Context
{
2825 RGWWatcher
*watcher
;
2827 explicit C_ReinitWatch(RGWWatcher
*_watcher
) : watcher(_watcher
) {}
2828 void finish(int r
) override
{
2833 RGWWatcher(RGWRados
*r
, int i
, const string
& o
) : rados(r
), index(i
), oid(o
), watch_handle(0) {}
2834 void handle_notify(uint64_t notify_id
,
2836 uint64_t notifier_id
,
2837 bufferlist
& bl
) override
{
2838 ldout(rados
->ctx(), 10) << "RGWWatcher::handle_notify() "
2839 << " notify_id " << notify_id
2840 << " cookie " << cookie
2841 << " notifier " << notifier_id
2842 << " bl.length()=" << bl
.length() << dendl
;
2843 rados
->watch_cb(notify_id
, cookie
, notifier_id
, bl
);
2845 bufferlist reply_bl
; // empty reply payload
2846 rados
->control_pool_ctx
.notify_ack(oid
, notify_id
, cookie
, reply_bl
);
2848 void handle_error(uint64_t cookie
, int err
) override
{
2849 lderr(rados
->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2850 << " err " << cpp_strerror(err
) << dendl
;
2851 rados
->remove_watcher(index
);
2852 rados
->schedule_context(new C_ReinitWatch(this));
2856 int ret
= unregister_watch();
2858 ldout(rados
->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret
<< dendl
;
2861 ret
= register_watch();
2863 ldout(rados
->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret
<< dendl
;
2868 int unregister_watch() {
2869 int r
= rados
->unwatch(watch_handle
);
2873 rados
->remove_watcher(index
);
2877 int register_watch() {
2878 int r
= rados
->watch(oid
, &watch_handle
, this);
2882 rados
->add_watcher(index
);
2887 class RGWMetaNotifierManager
: public RGWCoroutinesManager
{
2889 RGWHTTPManager http_manager
;
2892 RGWMetaNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
2893 http_manager(store
->ctx(), completion_mgr
) {
2894 http_manager
.set_threaded();
2897 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, set
<int>& shards
) {
2898 rgw_http_param_pair pairs
[] = { { "type", "metadata" },
2902 list
<RGWCoroutinesStack
*> stacks
;
2903 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
2904 RGWRESTConn
*conn
= iter
->second
;
2905 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
2906 stack
->call(new RGWPostRESTResourceCR
<set
<int>, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
2908 stacks
.push_back(stack
);
2914 class RGWDataNotifierManager
: public RGWCoroutinesManager
{
2916 RGWHTTPManager http_manager
;
2919 RGWDataNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
2920 http_manager(store
->ctx(), completion_mgr
) {
2921 http_manager
.set_threaded();
2924 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, map
<int, set
<string
> >& shards
) {
2925 rgw_http_param_pair pairs
[] = { { "type", "data" },
2927 { "source-zone", store
->get_zone_params().get_id().c_str() },
2930 list
<RGWCoroutinesStack
*> stacks
;
2931 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
2932 RGWRESTConn
*conn
= iter
->second
;
2933 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
2934 stack
->call(new RGWPostRESTResourceCR
<map
<int, set
<string
> >, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
2936 stacks
.push_back(stack
);
2942 class RGWRadosThread
{
2943 class Worker
: public Thread
{
2945 RGWRadosThread
*processor
;
2950 Worker(CephContext
*_cct
, RGWRadosThread
*_p
) : cct(_cct
), processor(_p
), lock("RGWRadosThread::Worker") {}
2951 void *entry() override
;
2953 Mutex::Locker
l(lock
);
2964 std::atomic
<bool> down_flag
= { false };
2968 virtual uint64_t interval_msec() = 0;
2969 virtual void stop_process() {}
2971 RGWRadosThread(RGWRados
*_store
, const string
& thread_name
= "radosgw")
2972 : worker(NULL
), cct(_store
->ctx()), store(_store
), thread_name(thread_name
) {}
2973 virtual ~RGWRadosThread() {
2977 virtual int init() { return 0; }
2978 virtual int process() = 0;
2980 bool going_down() { return down_flag
; }
2986 void RGWRadosThread::start()
2988 worker
= new Worker(cct
, this);
2989 worker
->create(thread_name
.c_str());
2992 void RGWRadosThread::stop()
3004 void *RGWRadosThread::Worker::entry() {
3005 uint64_t msec
= processor
->interval_msec();
3006 utime_t interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
3009 utime_t start
= ceph_clock_now();
3010 int r
= processor
->process();
3012 dout(0) << "ERROR: processor->process() returned error r=" << r
<< dendl
;
3015 if (processor
->going_down())
3018 utime_t end
= ceph_clock_now();
3021 uint64_t cur_msec
= processor
->interval_msec();
3022 if (cur_msec
!= msec
) { /* was it reconfigured? */
3024 interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
3028 if (interval
<= end
)
3029 continue; // next round
3031 utime_t wait_time
= interval
;
3035 cond
.WaitInterval(lock
, wait_time
);
3042 } while (!processor
->going_down());
3047 class RGWMetaNotifier
: public RGWRadosThread
{
3048 RGWMetaNotifierManager notify_mgr
;
3049 RGWMetadataLog
*const log
;
3051 uint64_t interval_msec() override
{
3052 return cct
->_conf
->rgw_md_notify_interval_msec
;
3055 RGWMetaNotifier(RGWRados
*_store
, RGWMetadataLog
* log
)
3056 : RGWRadosThread(_store
, "meta-notifier"), notify_mgr(_store
), log(log
) {}
3058 int process() override
;
3061 int RGWMetaNotifier::process()
3065 log
->read_clear_modified(shards
);
3067 if (shards
.empty()) {
3071 for (set
<int>::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
3072 ldout(cct
, 20) << __func__
<< "(): notifying mdlog change, shard_id=" << *iter
<< dendl
;
3075 notify_mgr
.notify_all(store
->zone_conn_map
, shards
);
3080 class RGWDataNotifier
: public RGWRadosThread
{
3081 RGWDataNotifierManager notify_mgr
;
3083 uint64_t interval_msec() override
{
3084 return cct
->_conf
->rgw_md_notify_interval_msec
;
3087 RGWDataNotifier(RGWRados
*_store
) : RGWRadosThread(_store
, "data-notifier"), notify_mgr(_store
) {}
3089 int process() override
;
3092 int RGWDataNotifier::process()
3094 if (!store
->data_log
) {
3098 map
<int, set
<string
> > shards
;
3100 store
->data_log
->read_clear_modified(shards
);
3102 if (shards
.empty()) {
3106 for (map
<int, set
<string
> >::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
3107 ldout(cct
, 20) << __func__
<< "(): notifying datalog change, shard_id=" << iter
->first
<< ": " << iter
->second
<< dendl
;
3110 notify_mgr
.notify_all(store
->zone_data_notify_to_map
, shards
);
3115 class RGWSyncProcessorThread
: public RGWRadosThread
{
3117 RGWSyncProcessorThread(RGWRados
*_store
, const string
& thread_name
= "radosgw") : RGWRadosThread(_store
, thread_name
) {}
3118 RGWSyncProcessorThread(RGWRados
*_store
) : RGWRadosThread(_store
) {}
3119 ~RGWSyncProcessorThread() override
{}
3120 int init() override
= 0 ;
3121 int process() override
= 0;
3124 class RGWMetaSyncProcessorThread
: public RGWSyncProcessorThread
3126 RGWMetaSyncStatusManager sync
;
3128 uint64_t interval_msec() override
{
3129 return 0; /* no interval associated, it'll run once until stopped */
3131 void stop_process() override
{
3135 RGWMetaSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
)
3136 : RGWSyncProcessorThread(_store
, "meta-sync"), sync(_store
, async_rados
) {}
3138 void wakeup_sync_shards(set
<int>& shard_ids
) {
3139 for (set
<int>::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
3143 RGWMetaSyncStatusManager
* get_manager() { return &sync
; }
3145 int init() override
{
3146 int ret
= sync
.init();
3148 ldout(store
->ctx(), 0) << "ERROR: sync.init() returned " << ret
<< dendl
;
3154 int process() override
{
3160 class RGWDataSyncProcessorThread
: public RGWSyncProcessorThread
3162 RGWDataSyncStatusManager sync
;
3165 uint64_t interval_msec() override
{
3167 return 0; /* no interval associated, it'll run once until stopped */
3169 #define DATA_SYNC_INIT_WAIT_SEC 20
3170 return DATA_SYNC_INIT_WAIT_SEC
* 1000;
3173 void stop_process() override
{
3177 RGWDataSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
,
3178 const string
& _source_zone
)
3179 : RGWSyncProcessorThread(_store
, "data-sync"), sync(_store
, async_rados
, _source_zone
),
3180 initialized(false) {}
3182 void wakeup_sync_shards(map
<int, set
<string
> >& shard_ids
) {
3183 for (map
<int, set
<string
> >::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
3184 sync
.wakeup(iter
->first
, iter
->second
);
3187 RGWDataSyncStatusManager
* get_manager() { return &sync
; }
3189 int init() override
{
3193 int process() override
{
3194 while (!initialized
) {
3198 int ret
= sync
.init();
3203 /* we'll be back! */
3211 class RGWSyncLogTrimThread
: public RGWSyncProcessorThread
3213 RGWCoroutinesManager crs
;
3215 RGWHTTPManager http
;
3216 const utime_t trim_interval
;
3218 uint64_t interval_msec() override
{ return 0; }
3219 void stop_process() override
{ crs
.stop(); }
3221 RGWSyncLogTrimThread(RGWRados
*store
, int interval
)
3222 : RGWSyncProcessorThread(store
, "sync-log-trim"),
3223 crs(store
->ctx(), store
->get_cr_registry()), store(store
),
3224 http(store
->ctx(), crs
.get_completion_mgr()),
3225 trim_interval(interval
, 0)
3228 int init() override
{
3229 return http
.set_threaded();
3231 int process() override
{
3232 list
<RGWCoroutinesStack
*> stacks
;
3233 auto meta
= new RGWCoroutinesStack(store
->ctx(), &crs
);
3234 meta
->call(create_meta_log_trim_cr(store
, &http
,
3235 cct
->_conf
->rgw_md_log_max_shards
,
3237 stacks
.push_back(meta
);
3239 auto data
= new RGWCoroutinesStack(store
->ctx(), &crs
);
3240 data
->call(create_data_log_trim_cr(store
, &http
,
3241 cct
->_conf
->rgw_data_log_num_shards
,
3243 stacks
.push_back(data
);
3250 void RGWRados::wakeup_meta_sync_shards(set
<int>& shard_ids
)
3252 Mutex::Locker
l(meta_sync_thread_lock
);
3253 if (meta_sync_processor_thread
) {
3254 meta_sync_processor_thread
->wakeup_sync_shards(shard_ids
);
3258 void RGWRados::wakeup_data_sync_shards(const string
& source_zone
, map
<int, set
<string
> >& shard_ids
)
3260 ldout(ctx(), 20) << __func__
<< ": source_zone=" << source_zone
<< ", shard_ids=" << shard_ids
<< dendl
;
3261 Mutex::Locker
l(data_sync_thread_lock
);
3262 map
<string
, RGWDataSyncProcessorThread
*>::iterator iter
= data_sync_processor_threads
.find(source_zone
);
3263 if (iter
== data_sync_processor_threads
.end()) {
3264 ldout(ctx(), 10) << __func__
<< ": couldn't find sync thread for zone " << source_zone
<< ", skipping async data sync processing" << dendl
;
3268 RGWDataSyncProcessorThread
*thread
= iter
->second
;
3270 thread
->wakeup_sync_shards(shard_ids
);
3273 RGWMetaSyncStatusManager
* RGWRados::get_meta_sync_manager()
3275 Mutex::Locker
l(meta_sync_thread_lock
);
3276 if (meta_sync_processor_thread
) {
3277 return meta_sync_processor_thread
->get_manager();
3282 RGWDataSyncStatusManager
* RGWRados::get_data_sync_manager(const std::string
& source_zone
)
3284 Mutex::Locker
l(data_sync_thread_lock
);
3285 auto thread
= data_sync_processor_threads
.find(source_zone
);
3286 if (thread
== data_sync_processor_threads
.end()) {
3289 return thread
->second
->get_manager();
3292 int RGWRados::get_required_alignment(const rgw_pool
& pool
, uint64_t *alignment
)
3295 int r
= open_pool_ctx(pool
, ioctx
);
3297 ldout(cct
, 0) << "ERROR: open_pool_ctx() returned " << r
<< dendl
;
3302 r
= ioctx
.pool_requires_alignment2(&requires
);
3304 ldout(cct
, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3315 r
= ioctx
.pool_required_alignment2(&align
);
3317 ldout(cct
, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3322 ldout(cct
, 20) << "required alignment=" << align
<< dendl
;
3328 int RGWRados::get_max_chunk_size(const rgw_pool
& pool
, uint64_t *max_chunk_size
)
3331 int r
= get_required_alignment(pool
, &alignment
);
3336 uint64_t config_chunk_size
= cct
->_conf
->rgw_max_chunk_size
;
3338 if (alignment
== 0) {
3339 *max_chunk_size
= config_chunk_size
;
3343 if (config_chunk_size
<= alignment
) {
3344 *max_chunk_size
= alignment
;
3348 *max_chunk_size
= config_chunk_size
- (config_chunk_size
% alignment
);
3350 ldout(cct
, 20) << "max_chunk_size=" << *max_chunk_size
<< dendl
;
3355 int RGWRados::get_max_chunk_size(const string
& placement_rule
, const rgw_obj
& obj
, uint64_t *max_chunk_size
)
3358 if (!get_obj_data_pool(placement_rule
, obj
, &pool
)) {
3359 ldout(cct
, 0) << "ERROR: failed to get data pool for object " << obj
<< dendl
;
3362 return get_max_chunk_size(pool
, max_chunk_size
);
3365 void RGWRados::finalize()
3367 if (run_sync_thread
) {
3368 Mutex::Locker
l(meta_sync_thread_lock
);
3369 meta_sync_processor_thread
->stop();
3371 Mutex::Locker
dl(data_sync_thread_lock
);
3372 for (auto iter
: data_sync_processor_threads
) {
3373 RGWDataSyncProcessorThread
*thread
= iter
.second
;
3376 if (sync_log_trimmer
) {
3377 sync_log_trimmer
->stop();
3381 async_rados
->stop();
3383 if (run_sync_thread
) {
3384 delete meta_sync_processor_thread
;
3385 meta_sync_processor_thread
= NULL
;
3386 Mutex::Locker
dl(data_sync_thread_lock
);
3387 for (auto iter
: data_sync_processor_threads
) {
3388 RGWDataSyncProcessorThread
*thread
= iter
.second
;
3391 data_sync_processor_threads
.clear();
3392 delete sync_log_trimmer
;
3393 sync_log_trimmer
= nullptr;
3398 if (need_watch_notify()) {
3402 /* delete finisher only after cleaning up watches, as watch error path might call
3403 * into finisher. We stop finisher before finalizing watch to make sure we don't
3404 * actually handle any racing work
3408 if (meta_notifier
) {
3409 meta_notifier
->stop();
3410 delete meta_notifier
;
3412 if (data_notifier
) {
3413 data_notifier
->stop();
3414 delete data_notifier
;
3420 if (use_gc_thread
) {
3421 gc
->stop_processor();
3422 obj_expirer
->stop_processor();
3427 if (use_lc_thread
) {
3428 lc
->stop_processor();
3436 delete rest_master_conn
;
3438 map
<string
, RGWRESTConn
*>::iterator iter
;
3439 for (iter
= zone_conn_map
.begin(); iter
!= zone_conn_map
.end(); ++iter
) {
3440 RGWRESTConn
*conn
= iter
->second
;
3444 for (iter
= zonegroup_conn_map
.begin(); iter
!= zonegroup_conn_map
.end(); ++iter
) {
3445 RGWRESTConn
*conn
= iter
->second
;
3448 RGWQuotaHandler::free_handler(quota_handler
);
3454 delete obj_tombstone_cache
;
3455 delete sync_modules_manager
;
3459 * Initialize the RADOS instance and prepare to do other ops
3460 * Returns 0 on success, -ERR# on failure.
3462 int RGWRados::init_rados()
3465 auto handles
= std::vector
<librados::Rados
>{cct
->_conf
->rgw_num_rados_handles
};
3467 for (auto& r
: handles
) {
3468 ret
= r
.init_with_context(cct
);
3479 sync_modules_manager
= new RGWSyncModulesManager();
3481 rgw_register_sync_modules(sync_modules_manager
);
3483 auto crs
= std::unique_ptr
<RGWCoroutinesManagerRegistry
>{
3484 new RGWCoroutinesManagerRegistry(cct
)};
3485 ret
= crs
->hook_to_admin_command("cr dump");
3490 meta_mgr
= new RGWMetadataManager(cct
, this);
3491 data_log
= new RGWDataChangesLog(cct
, this);
3492 cr_registry
= crs
.release();
3494 std::swap(handles
, rados
);
3499 * Add new connection to connections map
3500 * @param zonegroup_conn_map map which new connection will be added to
3501 * @param zonegroup zonegroup which new connection will connect to
3502 * @param new_connection pointer to new connection instance
3504 static void add_new_connection_to_map(map
<string
, RGWRESTConn
*> &zonegroup_conn_map
,
3505 const RGWZoneGroup
&zonegroup
, RGWRESTConn
*new_connection
)
3507 // Delete if connection is already exists
3508 map
<string
, RGWRESTConn
*>::iterator iterZoneGroup
= zonegroup_conn_map
.find(zonegroup
.get_id());
3509 if (iterZoneGroup
!= zonegroup_conn_map
.end()) {
3510 delete iterZoneGroup
->second
;
3513 // Add new connection to connections map
3514 zonegroup_conn_map
[zonegroup
.get_id()] = new_connection
;
3517 int RGWRados::convert_regionmap()
3519 RGWZoneGroupMap zonegroupmap
;
3521 string pool_name
= cct
->_conf
->rgw_zone_root_pool
;
3522 if (pool_name
.empty()) {
3523 pool_name
= RGW_DEFAULT_ZONE_ROOT_POOL
;
3525 string oid
= region_map_oid
;
3527 rgw_pool
pool(pool_name
);
3529 RGWObjectCtx
obj_ctx(this);
3530 int ret
= rgw_get_system_obj(this, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
3531 if (ret
< 0 && ret
!= -ENOENT
) {
3533 } else if (ret
== -ENOENT
) {
3538 bufferlist::iterator iter
= bl
.begin();
3539 ::decode(zonegroupmap
, iter
);
3540 } catch (buffer::error
& err
) {
3541 ldout(cct
, 0) << "error decoding regionmap from " << pool
<< ":" << oid
<< dendl
;
3545 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroupmap
.zonegroups
.begin();
3546 iter
!= zonegroupmap
.zonegroups
.end(); ++iter
) {
3547 RGWZoneGroup
& zonegroup
= iter
->second
;
3548 ret
= zonegroup
.init(cct
, this, false);
3549 ret
= zonegroup
.update();
3550 if (ret
< 0 && ret
!= -ENOENT
) {
3551 ldout(cct
, 0) << "Error could not update zonegroup " << zonegroup
.get_name() << ": " <<
3552 cpp_strerror(-ret
) << dendl
;
3554 } else if (ret
== -ENOENT
) {
3555 ret
= zonegroup
.create();
3557 ldout(cct
, 0) << "Error could not create " << zonegroup
.get_name() << ": " <<
3558 cpp_strerror(-ret
) << dendl
;
3564 current_period
.set_user_quota(zonegroupmap
.user_quota
);
3565 current_period
.set_bucket_quota(zonegroupmap
.bucket_quota
);
3567 // remove the region_map so we don't try to convert again
3568 rgw_raw_obj
obj(pool
, oid
);
3569 ret
= delete_system_obj(obj
);
3571 ldout(cct
, 0) << "Error could not remove " << obj
3572 << " after upgrading to zonegroup map: " << cpp_strerror(ret
) << dendl
;
3580 * Replace all region configuration with zonegroup for
3581 * backward compatability
3582 * Returns 0 on success, -ERR# on failure.
3584 int RGWRados::replace_region_with_zonegroup()
3586 /* copy default region */
3587 /* convert default region to default zonegroup */
3588 string default_oid
= cct
->_conf
->rgw_default_region_info_oid
;
3589 if (default_oid
.empty()) {
3590 default_oid
= default_region_info_oid
;
3594 RGWZoneGroup default_zonegroup
;
3595 rgw_pool pool
{default_zonegroup
.get_pool(cct
)};
3596 string oid
= "converted";
3598 RGWObjectCtx
obj_ctx(this);
3600 int ret
= rgw_get_system_obj(this, obj_ctx
, pool
,oid
, bl
, NULL
, NULL
);
3601 if (ret
< 0 && ret
!= -ENOENT
) {
3602 ldout(cct
, 0) << __func__
<< " failed to read converted: ret "<< ret
<< " " << cpp_strerror(-ret
)
3605 } else if (ret
!= -ENOENT
) {
3606 ldout(cct
, 20) << "System already converted " << dendl
;
3610 string default_region
;
3611 ret
= default_zonegroup
.init(cct
, this, false, true);
3613 ldout(cct
, 0) << __func__
<< " failed init default region: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3616 ret
= default_zonegroup
.read_default_id(default_region
, true);
3617 if (ret
< 0 && ret
!= -ENOENT
) {
3618 ldout(cct
, 0) << __func__
<< " failed reading old default region: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3622 /* convert regions to zonegroups */
3623 list
<string
> regions
;
3624 ret
= list_regions(regions
);
3625 if (ret
< 0 && ret
!= -ENOENT
) {
3626 ldout(cct
, 0) << __func__
<< " failed to list regions: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3628 } else if (ret
== -ENOENT
|| regions
.empty()) {
3629 RGWZoneParams
zoneparams(default_zone_name
);
3630 int ret
= zoneparams
.init(cct
, this);
3631 if (ret
< 0 && ret
!= -ENOENT
) {
3632 ldout(cct
, 0) << __func__
<< ": error initializing default zone params: " << cpp_strerror(-ret
) << dendl
;
3635 /* update master zone */
3636 RGWZoneGroup
default_zg(default_zonegroup_name
);
3637 ret
= default_zg
.init(cct
, this);
3638 if (ret
< 0 && ret
!= -ENOENT
) {
3639 ldout(cct
, 0) << __func__
<< ": error in initializing default zonegroup: " << cpp_strerror(-ret
) << dendl
;
3642 if (ret
!= -ENOENT
&& default_zg
.master_zone
.empty()) {
3643 default_zg
.master_zone
= zoneparams
.get_id();
3644 return default_zg
.update();
3649 string master_region
, master_zone
;
3650 for (list
<string
>::iterator iter
= regions
.begin(); iter
!= regions
.end(); ++iter
) {
3651 if (*iter
!= default_zonegroup_name
){
3652 RGWZoneGroup
region(*iter
);
3653 int ret
= region
.init(cct
, this, true, true);
3655 ldout(cct
, 0) << __func__
<< " failed init region "<< *iter
<< ": " << cpp_strerror(-ret
) << dendl
;
3658 if (region
.is_master
) {
3659 master_region
= region
.get_id();
3660 master_zone
= region
.master_zone
;
3665 /* create realm if there is none.
3666 The realm name will be the region and zone concatenated
3667 realm id will be mds of its name */
3668 if (realm
.get_id().empty() && !master_region
.empty() && !master_zone
.empty()) {
3669 string new_realm_name
= master_region
+ "." + master_zone
;
3670 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
3671 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
3673 hash
.Update((const byte
*)new_realm_name
.c_str(), new_realm_name
.length());
3675 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
3676 string
new_realm_id(md5_str
);
3677 RGWRealm
new_realm(new_realm_id
,new_realm_name
);
3678 ret
= new_realm
.init(cct
, this, false);
3680 ldout(cct
, 0) << __func__
<< " Error initing new realm: " << cpp_strerror(-ret
) << dendl
;
3683 ret
= new_realm
.create();
3684 if (ret
< 0 && ret
!= -EEXIST
) {
3685 ldout(cct
, 0) << __func__
<< " Error creating new realm: " << cpp_strerror(-ret
) << dendl
;
3688 ret
= new_realm
.set_as_default();
3690 ldout(cct
, 0) << __func__
<< " Error setting realm as default: " << cpp_strerror(-ret
) << dendl
;
3693 ret
= realm
.init(cct
, this);
3695 ldout(cct
, 0) << __func__
<< " Error initing realm: " << cpp_strerror(-ret
) << dendl
;
3698 ret
= current_period
.init(cct
, this, realm
.get_id(), realm
.get_name());
3700 ldout(cct
, 0) << __func__
<< " Error initing current period: " << cpp_strerror(-ret
) << dendl
;
3705 list
<string
>::iterator iter
;
3706 /* create zonegroups */
3707 for (iter
= regions
.begin(); iter
!= regions
.end(); ++iter
)
3709 ldout(cct
, 0) << __func__
<< "Converting " << *iter
<< dendl
;
3710 /* check to see if we don't have already a zonegroup with this name */
3711 RGWZoneGroup
new_zonegroup(*iter
);
3712 ret
= new_zonegroup
.init(cct
, this);
3713 if (ret
== 0 && new_zonegroup
.get_id() != *iter
) {
3714 ldout(cct
, 0) << __func__
<< " zonegroup "<< *iter
<< " already exists id " << new_zonegroup
.get_id () <<
3715 " skipping conversion " << dendl
;
3718 RGWZoneGroup
zonegroup(*iter
);
3719 zonegroup
.set_id(*iter
);
3720 int ret
= zonegroup
.init(cct
, this, true, true);
3722 ldout(cct
, 0) << __func__
<< " failed init zonegroup: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3725 zonegroup
.realm_id
= realm
.get_id();
3726 /* fix default region master zone */
3727 if (*iter
== default_zonegroup_name
&& zonegroup
.master_zone
.empty()) {
3728 ldout(cct
, 0) << __func__
<< " Setting default zone as master for default region" << dendl
;
3729 zonegroup
.master_zone
= default_zone_name
;
3731 ret
= zonegroup
.update();
3732 if (ret
< 0 && ret
!= -EEXIST
) {
3733 ldout(cct
, 0) << __func__
<< " failed to update zonegroup " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
3737 ret
= zonegroup
.update_name();
3738 if (ret
< 0 && ret
!= -EEXIST
) {
3739 ldout(cct
, 0) << __func__
<< " failed to update_name for zonegroup " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
3743 if (zonegroup
.get_name() == default_region
) {
3744 ret
= zonegroup
.set_as_default();
3746 ldout(cct
, 0) << __func__
<< " failed to set_as_default " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
3751 for (map
<string
, RGWZone
>::const_iterator iter
= zonegroup
.zones
.begin(); iter
!= zonegroup
.zones
.end();
3753 ldout(cct
, 0) << __func__
<< " Converting zone" << iter
->first
<< dendl
;
3754 RGWZoneParams
zoneparams(iter
->first
, iter
->first
);
3755 zoneparams
.set_id(iter
->first
);
3756 zoneparams
.realm_id
= realm
.get_id();
3757 ret
= zoneparams
.init(cct
, this);
3758 if (ret
< 0 && ret
!= -ENOENT
) {
3759 ldout(cct
, 0) << __func__
<< " failed to init zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
3761 } else if (ret
== -ENOENT
) {
3762 ldout(cct
, 0) << __func__
<< " zone is part of another cluster " << iter
->first
<< " skipping " << dendl
;
3765 zonegroup
.realm_id
= realm
.get_id();
3766 ret
= zoneparams
.update();
3767 if (ret
< 0 && ret
!= -EEXIST
) {
3768 ldout(cct
, 0) << __func__
<< " failed to update zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
3771 ret
= zoneparams
.update_name();
3772 if (ret
< 0 && ret
!= -EEXIST
) {
3773 ldout(cct
, 0) << __func__
<< " failed to init zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
3778 if (!current_period
.get_id().empty()) {
3779 ret
= current_period
.add_zonegroup(zonegroup
);
3781 ldout(cct
, 0) << __func__
<< " failed to add zonegroup to current_period: " << cpp_strerror(-ret
) << dendl
;
3787 if (!current_period
.get_id().empty()) {
3788 ret
= current_period
.update();
3790 ldout(cct
, 0) << __func__
<< " failed to update new period: " << cpp_strerror(-ret
) << dendl
;
3793 ret
= current_period
.store_info(false);
3795 ldout(cct
, 0) << __func__
<< " failed to store new period: " << cpp_strerror(-ret
) << dendl
;
3798 ret
= current_period
.reflect();
3800 ldout(cct
, 0) << __func__
<< " failed to update local objects: " << cpp_strerror(-ret
) << dendl
;
3805 for (auto const& iter
: regions
) {
3806 RGWZoneGroup
zonegroup(iter
);
3807 int ret
= zonegroup
.init(cct
, this, true, true);
3809 ldout(cct
, 0) << __func__
<< " failed init zonegroup" << iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3812 ret
= zonegroup
.delete_obj(true);
3813 if (ret
< 0 && ret
!= -ENOENT
) {
3814 ldout(cct
, 0) << __func__
<< " failed to delete region " << iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
3820 /* mark as converted */
3821 ret
= rgw_put_system_obj(this, pool
, oid
, bl
.c_str(), bl
.length(),
3822 true, NULL
, real_time(), NULL
);
3824 ldout(cct
, 0) << __func__
<< " failed to mark cluster as converted: ret "<< ret
<< " " << cpp_strerror(-ret
)
3832 int RGWRados::init_zg_from_period(bool *initialized
)
3834 *initialized
= false;
3836 if (current_period
.get_id().empty()) {
3840 int ret
= zonegroup
.init(cct
, this);
3841 ldout(cct
, 20) << "period zonegroup init ret " << ret
<< dendl
;
3842 if (ret
== -ENOENT
) {
3846 ldout(cct
, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret
) << dendl
;
3849 ldout(cct
, 20) << "period zonegroup name " << zonegroup
.get_name() << dendl
;
3851 map
<string
, RGWZoneGroup
>::const_iterator iter
=
3852 current_period
.get_map().zonegroups
.find(zonegroup
.get_id());
3854 if (iter
!= current_period
.get_map().zonegroups
.end()) {
3855 ldout(cct
, 20) << "using current period zonegroup " << zonegroup
.get_name() << dendl
;
3856 zonegroup
= iter
->second
;
3857 ret
= zonegroup
.init(cct
, this, false);
3859 ldout(cct
, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret
) << dendl
;
3862 ret
= zone_params
.init(cct
, this);
3863 if (ret
< 0 && ret
!= -ENOENT
) {
3864 ldout(cct
, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret
) << dendl
;
3866 } if (ret
==-ENOENT
&& zonegroup
.get_name() == default_zonegroup_name
) {
3867 ldout(cct
, 10) << " Using default name "<< default_zone_name
<< dendl
;
3868 zone_params
.set_name(default_zone_name
);
3869 ret
= zone_params
.init(cct
, this);
3870 if (ret
< 0 && ret
!= -ENOENT
) {
3871 ldout(cct
, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret
) << dendl
;
3876 for (iter
= current_period
.get_map().zonegroups
.begin();
3877 iter
!= current_period
.get_map().zonegroups
.end(); ++iter
){
3878 const RGWZoneGroup
& zg
= iter
->second
;
3879 // use endpoints from the zonegroup's master zone
3880 auto master
= zg
.zones
.find(zg
.master_zone
);
3881 if (master
== zg
.zones
.end()) {
3882 // fix missing master zone for a single zone zonegroup
3883 if (zg
.master_zone
.empty() && zg
.zones
.size() == 1) {
3884 master
= zg
.zones
.begin();
3885 ldout(cct
, 0) << "zonegroup " << zg
.get_name() << " missing master_zone, setting zone " <<
3886 master
->second
.name
<< " id:" << master
->second
.id
<< " as master" << dendl
;
3887 if (zonegroup
.get_id() == zg
.get_id()) {
3888 zonegroup
.master_zone
= master
->second
.id
;
3889 ret
= zonegroup
.update();
3891 ldout(cct
, 0) << "error updating zonegroup : " << cpp_strerror(-ret
) << dendl
;
3895 RGWZoneGroup
fixed_zg(zg
.get_id(),zg
.get_name());
3896 ret
= fixed_zg
.init(cct
, this);
3898 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
3901 fixed_zg
.master_zone
= master
->second
.id
;
3902 ret
= fixed_zg
.update();
3904 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
3909 ldout(cct
, 0) << "zonegroup " << zg
.get_name() << " missing zone for master_zone=" <<
3910 zg
.master_zone
<< dendl
;
3914 const auto& endpoints
= master
->second
.endpoints
;
3915 add_new_connection_to_map(zonegroup_conn_map
, zg
, new RGWRESTConn(cct
, this, zg
.get_id(), endpoints
));
3916 if (!current_period
.get_master_zonegroup().empty() &&
3917 zg
.get_id() == current_period
.get_master_zonegroup()) {
3918 rest_master_conn
= new RGWRESTConn(cct
, this, zg
.get_id(), endpoints
);
3922 *initialized
= true;
3927 int RGWRados::init_zg_from_local(bool *creating_defaults
)
3929 int ret
= zonegroup
.init(cct
, this);
3930 if ( (ret
< 0 && ret
!= -ENOENT
) || (ret
== -ENOENT
&& !cct
->_conf
->rgw_zonegroup
.empty())) {
3931 ldout(cct
, 0) << "failed reading zonegroup info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3933 } else if (ret
== -ENOENT
) {
3934 *creating_defaults
= true;
3935 ldout(cct
, 10) << "Creating default zonegroup " << dendl
;
3936 ret
= zonegroup
.create_default();
3938 ldout(cct
, 0) << "failure in zonegroup create_default: ret "<< ret
<< " " << cpp_strerror(-ret
)
3942 ret
= zonegroup
.init(cct
, this);
3944 ldout(cct
, 0) << "failure in zonegroup create_default: ret "<< ret
<< " " << cpp_strerror(-ret
)
3949 ldout(cct
, 20) << "zonegroup " << zonegroup
.get_name() << dendl
;
3950 if (zonegroup
.is_master
) {
3951 // use endpoints from the zonegroup's master zone
3952 auto master
= zonegroup
.zones
.find(zonegroup
.master_zone
);
3953 if (master
== zonegroup
.zones
.end()) {
3954 // fix missing master zone for a single zone zonegroup
3955 if (zonegroup
.master_zone
.empty() && zonegroup
.zones
.size() == 1) {
3956 master
= zonegroup
.zones
.begin();
3957 ldout(cct
, 0) << "zonegroup " << zonegroup
.get_name() << " missing master_zone, setting zone " <<
3958 master
->second
.name
<< " id:" << master
->second
.id
<< " as master" << dendl
;
3959 zonegroup
.master_zone
= master
->second
.id
;
3960 ret
= zonegroup
.update();
3962 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
3966 ldout(cct
, 0) << "zonegroup " << zonegroup
.get_name() << " missing zone for "
3967 "master_zone=" << zonegroup
.master_zone
<< dendl
;
3971 const auto& endpoints
= master
->second
.endpoints
;
3972 rest_master_conn
= new RGWRESTConn(cct
, this, zonegroup
.get_id(), endpoints
);
3979 bool RGWRados::zone_syncs_from(RGWZone
& target_zone
, RGWZone
& source_zone
)
3981 return target_zone
.syncs_from(source_zone
.name
) &&
3982 sync_modules_manager
->supports_data_export(source_zone
.tier_type
);
3986 * Initialize the RADOS instance and prepare to do other ops
3987 * Returns 0 on success, -ERR# on failure.
3989 int RGWRados::init_complete()
3991 int ret
= realm
.init(cct
, this);
3992 if (ret
< 0 && ret
!= -ENOENT
) {
3993 ldout(cct
, 0) << "failed reading realm info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
3995 } else if (ret
!= -ENOENT
) {
3996 ldout(cct
, 20) << "realm " << realm
.get_name() << " " << realm
.get_id() << dendl
;
3997 ret
= current_period
.init(cct
, this, realm
.get_id(), realm
.get_name());
3998 if (ret
< 0 && ret
!= -ENOENT
) {
3999 ldout(cct
, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret
) << dendl
;
4002 ldout(cct
, 20) << "current period " << current_period
.get_id() << dendl
;
4005 ret
= replace_region_with_zonegroup();
4007 lderr(cct
) << "failed converting region to zonegroup : ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4011 ret
= convert_regionmap();
4013 lderr(cct
) << "failed converting regionmap: " << cpp_strerror(-ret
) << dendl
;
4017 bool zg_initialized
= false;
4019 if (!current_period
.get_id().empty()) {
4020 ret
= init_zg_from_period(&zg_initialized
);
4026 bool creating_defaults
= false;
4027 bool using_local
= (!zg_initialized
);
4029 ldout(cct
, 10) << " cannot find current period zonegroup using local zonegroup" << dendl
;
4030 ret
= init_zg_from_local(&creating_defaults
);
4034 // read period_config into current_period
4035 auto& period_config
= current_period
.get_config();
4036 ret
= period_config
.read(this, zonegroup
.realm_id
);
4037 if (ret
< 0 && ret
!= -ENOENT
) {
4038 ldout(cct
, 0) << "ERROR: failed to read period config: "
4039 << cpp_strerror(ret
) << dendl
;
4044 ldout(cct
, 10) << "Cannot find current period zone using local zone" << dendl
;
4045 if (creating_defaults
&& cct
->_conf
->rgw_zone
.empty()) {
4046 ldout(cct
, 10) << " Using default name "<< default_zone_name
<< dendl
;
4047 zone_params
.set_name(default_zone_name
);
4050 ret
= zone_params
.init(cct
, this);
4051 if (ret
< 0 && ret
!= -ENOENT
) {
4052 lderr(cct
) << "failed reading zone info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4055 map
<string
, RGWZone
>::iterator zone_iter
= get_zonegroup().zones
.find(zone_params
.get_id());
4056 if (zone_iter
== get_zonegroup().zones
.end()) {
4058 lderr(cct
) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << ")" << dendl
;
4061 ldout(cct
, 1) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << "), switching to local zonegroup configuration" << dendl
;
4062 ret
= init_zg_from_local(&creating_defaults
);
4066 zone_iter
= get_zonegroup().zones
.find(zone_params
.get_id());
4068 if (zone_iter
!= get_zonegroup().zones
.end()) {
4069 zone_public_config
= zone_iter
->second
;
4070 ldout(cct
, 20) << "zone " << zone_params
.get_name() << dendl
;
4072 lderr(cct
) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << ")" << dendl
;
4076 zone_short_id
= current_period
.get_map().get_zone_short_id(zone_params
.get_id());
4078 ret
= sync_modules_manager
->create_instance(cct
, zone_public_config
.tier_type
, zone_params
.tier_config
, &sync_module
);
4080 lderr(cct
) << "ERROR: failed to init sync module instance, ret=" << ret
<< dendl
;
4084 writeable_zone
= (zone_public_config
.tier_type
.empty() || zone_public_config
.tier_type
== "rgw");
4086 init_unique_trans_id_deps();
4088 finisher
= new Finisher(cct
);
4091 period_puller
.reset(new RGWPeriodPuller(this));
4092 period_history
.reset(new RGWPeriodHistory(cct
, period_puller
.get(),
4095 if (need_watch_notify()) {
4098 lderr(cct
) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret
) << dendl
;
4103 /* first build all zones index */
4104 for (auto ziter
: get_zonegroup().zones
) {
4105 const string
& id
= ziter
.first
;
4106 RGWZone
& z
= ziter
.second
;
4107 zone_id_by_name
[z
.name
] = id
;
4111 if (zone_by_id
.find(zone_id()) == zone_by_id
.end()) {
4112 ldout(cct
, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl
;
4114 zone_public_config
= zone_by_id
[zone_id()];
4115 for (auto ziter
: get_zonegroup().zones
) {
4116 const string
& id
= ziter
.first
;
4117 RGWZone
& z
= ziter
.second
;
4118 if (id
== zone_id()) {
4121 if (z
.endpoints
.empty()) {
4122 ldout(cct
, 0) << "WARNING: can't generate connection for zone " << z
.id
<< " id " << z
.name
<< ": no endpoints defined" << dendl
;
4125 ldout(cct
, 20) << "generating connection object for zone " << z
.name
<< " id " << z
.id
<< dendl
;
4126 RGWRESTConn
*conn
= new RGWRESTConn(cct
, this, z
.id
, z
.endpoints
);
4127 zone_conn_map
[id
] = conn
;
4128 if (zone_syncs_from(zone_public_config
, z
) ||
4129 zone_syncs_from(z
, zone_public_config
)) {
4130 if (zone_syncs_from(zone_public_config
, z
)) {
4131 zone_data_sync_from_map
[id
] = conn
;
4133 if (zone_syncs_from(z
, zone_public_config
)) {
4134 zone_data_notify_to_map
[id
] = conn
;
4137 ldout(cct
, 20) << "NOTICE: not syncing to/from zone " << z
.name
<< " id " << z
.id
<< dendl
;
4141 ret
= open_root_pool_ctx();
4145 ret
= open_gc_pool_ctx();
4149 ret
= open_lc_pool_ctx();
4153 ret
= open_objexp_pool_ctx();
4157 pools_initialized
= true;
4160 gc
->initialize(cct
, this);
4162 obj_expirer
= new RGWObjectExpirer(this);
4164 if (use_gc_thread
) {
4165 gc
->start_processor();
4166 obj_expirer
->start_processor();
4169 if (run_sync_thread
) {
4170 // initialize the log period history. we want to do this any time we're not
4171 // running under radosgw-admin, so we check run_sync_thread here before
4172 // disabling it based on the zone/zonegroup setup
4173 meta_mgr
->init_oldest_log_period();
4176 /* no point of running sync thread if we don't have a master zone configured
4177 or there is no rest_master_conn */
4178 if (get_zonegroup().master_zone
.empty() || !rest_master_conn
4179 || current_period
.get_id().empty()) {
4180 run_sync_thread
= false;
4183 async_rados
= new RGWAsyncRadosProcessor(this, cct
->_conf
->rgw_num_async_rados_threads
);
4184 async_rados
->start();
4186 ret
= meta_mgr
->init(current_period
.get_id());
4188 lderr(cct
) << "ERROR: failed to initialize metadata log: "
4189 << cpp_strerror(-ret
) << dendl
;
4193 if (is_meta_master()) {
4194 auto md_log
= meta_mgr
->get_log(current_period
.get_id());
4195 meta_notifier
= new RGWMetaNotifier(this, md_log
);
4196 meta_notifier
->start();
4199 if (run_sync_thread
) {
4200 Mutex::Locker
l(meta_sync_thread_lock
);
4201 meta_sync_processor_thread
= new RGWMetaSyncProcessorThread(this, async_rados
);
4202 ret
= meta_sync_processor_thread
->init();
4204 ldout(cct
, 0) << "ERROR: failed to initialize meta sync thread" << dendl
;
4207 meta_sync_processor_thread
->start();
4209 Mutex::Locker
dl(data_sync_thread_lock
);
4210 for (auto iter
: zone_data_sync_from_map
) {
4211 ldout(cct
, 5) << "starting data sync thread for zone " << iter
.first
<< dendl
;
4212 RGWDataSyncProcessorThread
*thread
= new RGWDataSyncProcessorThread(this, async_rados
, iter
.first
);
4213 ret
= thread
->init();
4215 ldout(cct
, 0) << "ERROR: failed to initialize data sync thread" << dendl
;
4219 data_sync_processor_threads
[iter
.first
] = thread
;
4221 auto interval
= cct
->_conf
->rgw_sync_log_trim_interval
;
4223 sync_log_trimmer
= new RGWSyncLogTrimThread(this, interval
);
4224 ret
= sync_log_trimmer
->init();
4226 ldout(cct
, 0) << "ERROR: failed to initialize sync log trim thread" << dendl
;
4229 sync_log_trimmer
->start();
4232 data_notifier
= new RGWDataNotifier(this);
4233 data_notifier
->start();
4236 lc
->initialize(cct
, this);
4239 lc
->start_processor();
4241 quota_handler
= RGWQuotaHandler::generate_handler(this, quota_threads
);
4243 bucket_index_max_shards
= (cct
->_conf
->rgw_override_bucket_index_max_shards
? cct
->_conf
->rgw_override_bucket_index_max_shards
:
4244 get_zone().bucket_index_max_shards
);
4245 if (bucket_index_max_shards
> MAX_BUCKET_INDEX_SHARDS_PRIME
) {
4246 bucket_index_max_shards
= MAX_BUCKET_INDEX_SHARDS_PRIME
;
4247 ldout(cct
, 1) << __func__
<< " bucket index max shards is too large, reset to value: "
4248 << MAX_BUCKET_INDEX_SHARDS_PRIME
<< dendl
;
4250 ldout(cct
, 20) << __func__
<< " bucket index max shards: " << bucket_index_max_shards
<< dendl
;
4252 binfo_cache
= new RGWChainedCacheImpl
<bucket_info_entry
>;
4253 binfo_cache
->init(this);
4255 bool need_tombstone_cache
= !zone_data_notify_to_map
.empty(); /* have zones syncing from us */
4257 if (need_tombstone_cache
) {
4258 obj_tombstone_cache
= new tombstone_cache_t(cct
->_conf
->rgw_obj_tombstone_cache_size
);
4265 * Initialize the RADOS instance and prepare to do other ops
4266 * Returns 0 on success, -ERR# on failure.
4268 int RGWRados::initialize()
4276 return init_complete();
4279 void RGWRados::finalize_watch()
4281 for (int i
= 0; i
< num_watchers
; i
++) {
4282 RGWWatcher
*watcher
= watchers
[i
];
4283 watcher
->unregister_watch();
4287 delete[] notify_oids
;
4291 void RGWRados::schedule_context(Context
*c
) {
4295 int RGWRados::list_raw_prefixed_objs(const rgw_pool
& pool
, const string
& prefix
, list
<string
>& result
)
4298 RGWListRawObjsCtx ctx
;
4301 int r
= list_raw_objects(pool
, prefix
, 1000,
4302 ctx
, oids
, &is_truncated
);
4306 list
<string
>::iterator iter
;
4307 for (iter
= oids
.begin(); iter
!= oids
.end(); ++iter
) {
4308 string
& val
= *iter
;
4309 if (val
.size() > prefix
.size())
4310 result
.push_back(val
.substr(prefix
.size()));
4312 } while (is_truncated
);
4317 int RGWRados::list_regions(list
<string
>& regions
)
4319 RGWZoneGroup zonegroup
;
4321 return list_raw_prefixed_objs(zonegroup
.get_pool(cct
), region_info_oid_prefix
, regions
);
4324 int RGWRados::list_zonegroups(list
<string
>& zonegroups
)
4326 RGWZoneGroup zonegroup
;
4328 return list_raw_prefixed_objs(zonegroup
.get_pool(cct
), zonegroup_names_oid_prefix
, zonegroups
);
4331 int RGWRados::list_zones(list
<string
>& zones
)
4333 RGWZoneParams zoneparams
;
4335 return list_raw_prefixed_objs(zoneparams
.get_pool(cct
), zone_names_oid_prefix
, zones
);
4338 int RGWRados::list_realms(list
<string
>& realms
)
4340 RGWRealm
realm(cct
, this);
4341 return list_raw_prefixed_objs(realm
.get_pool(cct
), realm_names_oid_prefix
, realms
);
4344 int RGWRados::list_periods(list
<string
>& periods
)
4347 list
<string
> raw_periods
;
4348 int ret
= list_raw_prefixed_objs(period
.get_pool(cct
), period
.get_info_oid_prefix(), raw_periods
);
4352 for (const auto& oid
: raw_periods
) {
4353 size_t pos
= oid
.find(".");
4354 if (pos
!= std::string::npos
) {
4355 periods
.push_back(oid
.substr(0, pos
));
4357 periods
.push_back(oid
);
4360 periods
.sort(); // unique() only detects duplicates if they're adjacent
4366 int RGWRados::list_periods(const string
& current_period
, list
<string
>& periods
)
4369 string period_id
= current_period
;
4370 while(!period_id
.empty()) {
4371 RGWPeriod
period(period_id
);
4372 ret
= period
.init(cct
, this);
4376 periods
.push_back(period
.get_id());
4377 period_id
= period
.get_predecessor();
4384 * Open the pool used as root for this gateway
4385 * Returns: 0 on success, -ERR# otherwise.
4387 int RGWRados::open_root_pool_ctx()
4389 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root
, root_pool_ctx
, true);
4392 int RGWRados::open_gc_pool_ctx()
4394 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool
, gc_pool_ctx
, true);
4397 int RGWRados::open_lc_pool_ctx()
4399 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool
, lc_pool_ctx
, true);
4402 int RGWRados::open_objexp_pool_ctx()
4404 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, objexp_pool_ctx
, true);
4407 int RGWRados::init_watch()
4409 int r
= rgw_init_ioctx(&rados
[0], get_zone_params().control_pool
, control_pool_ctx
, true);
4414 num_watchers
= cct
->_conf
->rgw_num_control_oids
;
4416 bool compat_oid
= (num_watchers
== 0);
4418 if (num_watchers
<= 0)
4421 notify_oids
= new string
[num_watchers
];
4422 watchers
= new RGWWatcher
*[num_watchers
];
4424 for (int i
=0; i
< num_watchers
; i
++) {
4425 string
& notify_oid
= notify_oids
[i
];
4426 notify_oid
= notify_oid_prefix
;
4429 snprintf(buf
, sizeof(buf
), ".%d", i
);
4430 notify_oid
.append(buf
);
4432 r
= control_pool_ctx
.create(notify_oid
, false);
4433 if (r
< 0 && r
!= -EEXIST
)
4436 RGWWatcher
*watcher
= new RGWWatcher(this, i
, notify_oid
);
4437 watchers
[i
] = watcher
;
4439 r
= watcher
->register_watch();
4444 watch_initialized
= true;
4446 set_cache_enabled(true);
4451 void RGWRados::pick_control_oid(const string
& key
, string
& notify_oid
)
4453 uint32_t r
= ceph_str_hash_linux(key
.c_str(), key
.size());
4455 int i
= r
% num_watchers
;
4457 snprintf(buf
, sizeof(buf
), ".%d", i
);
4459 notify_oid
= notify_oid_prefix
;
4460 notify_oid
.append(buf
);
4463 int RGWRados::open_pool_ctx(const rgw_pool
& pool
, librados::IoCtx
& io_ctx
)
4465 librados::Rados
*rad
= get_rados_handle();
4466 int r
= rgw_init_ioctx(rad
, pool
, io_ctx
);
4470 if (!pools_initialized
)
4473 r
= rad
->pool_create(pool
.name
.c_str());
4474 if (r
< 0 && r
!= -EEXIST
)
4477 return rgw_init_ioctx(rad
, pool
, io_ctx
);
4480 void RGWRados::build_bucket_index_marker(const string
& shard_id_str
, const string
& shard_marker
,
4483 *marker
= shard_id_str
;
4484 marker
->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR
);
4485 marker
->append(shard_marker
);
4489 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
)
4491 const string
*rule
= &bucket_info
.placement_rule
;
4492 if (rule
->empty()) {
4493 rule
= &zonegroup
.default_placement
;
4495 auto iter
= zone_params
.placement_pools
.find(*rule
);
4496 if (iter
== zone_params
.placement_pools
.end()) {
4497 ldout(cct
, 0) << "could not find placement rule " << *rule
<< " within zonegroup " << dendl
;
4501 int r
= open_pool_ctx(iter
->second
.index_pool
, index_ctx
);
4509 * set up a bucket listing.
4510 * handle is filled in.
4511 * Returns 0 on success, -ERR# otherwise.
4513 int RGWRados::list_buckets_init(RGWAccessHandle
*handle
)
4515 librados::NObjectIterator
*state
= new librados::NObjectIterator(root_pool_ctx
.nobjects_begin());
4516 *handle
= (RGWAccessHandle
)state
;
4521 * get the next bucket in the listing.
4523 * handle is updated.
4524 * returns 0 on success, -ERR# otherwise.
4526 int RGWRados::list_buckets_next(rgw_bucket_dir_entry
& obj
, RGWAccessHandle
*handle
)
4528 librados::NObjectIterator
*state
= (librados::NObjectIterator
*)*handle
;
4531 if (*state
== root_pool_ctx
.nobjects_end()) {
4536 obj
.key
.name
= (*state
)->get_oid();
4537 if (obj
.key
.name
[0] == '_') {
4538 obj
.key
.name
= obj
.key
.name
.substr(1);
4542 } while (obj
.key
.name
[0] == '.'); /* skip all entries starting with '.' */
4550 struct log_list_state
{
4552 librados::IoCtx io_ctx
;
4553 librados::NObjectIterator obit
;
4556 int RGWRados::log_list_init(const string
& prefix
, RGWAccessHandle
*handle
)
4558 log_list_state
*state
= new log_list_state
;
4559 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, state
->io_ctx
);
4564 state
->prefix
= prefix
;
4565 state
->obit
= state
->io_ctx
.nobjects_begin();
4566 *handle
= (RGWAccessHandle
)state
;
4570 int RGWRados::log_list_next(RGWAccessHandle handle
, string
*name
)
4572 log_list_state
*state
= static_cast<log_list_state
*>(handle
);
4574 if (state
->obit
== state
->io_ctx
.nobjects_end()) {
4578 if (state
->prefix
.length() &&
4579 state
->obit
->get_oid().find(state
->prefix
) != 0) {
4583 *name
= state
->obit
->get_oid();
4590 int RGWRados::log_remove(const string
& name
)
4592 librados::IoCtx io_ctx
;
4593 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
4596 return io_ctx
.remove(name
);
4599 struct log_show_state
{
4600 librados::IoCtx io_ctx
;
4602 bufferlist::iterator p
;
4606 log_show_state() : pos(0), eof(false) {}
4609 int RGWRados::log_show_init(const string
& name
, RGWAccessHandle
*handle
)
4611 log_show_state
*state
= new log_show_state
;
4612 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, state
->io_ctx
);
4618 *handle
= (RGWAccessHandle
)state
;
4622 int RGWRados::log_show_next(RGWAccessHandle handle
, rgw_log_entry
*entry
)
4624 log_show_state
*state
= static_cast<log_show_state
*>(handle
);
4625 off_t off
= state
->p
.get_off();
4627 ldout(cct
, 10) << "log_show_next pos " << state
->pos
<< " bl " << state
->bl
.length()
4629 << " eof " << (int)state
->eof
4632 unsigned chunk
= 1024*1024;
4633 if ((state
->bl
.length() - off
) < chunk
/2 && !state
->eof
) {
4635 int r
= state
->io_ctx
.read(state
->name
, more
, chunk
, state
->pos
);
4641 old
.substr_of(state
->bl
, off
, state
->bl
.length() - off
);
4642 } catch (buffer::error
& err
) {
4646 state
->bl
.claim(old
);
4647 state
->bl
.claim_append(more
);
4648 state
->p
= state
->bl
.begin();
4649 if ((unsigned)r
< chunk
)
4651 ldout(cct
, 10) << " read " << r
<< dendl
;
4655 return 0; // end of file
4657 ::decode(*entry
, state
->p
);
4659 catch (const buffer::error
&e
) {
4666 * usage_log_hash: get usage log key hash, based on name and index
4668 * Get the usage object name. Since a user may have more than 1
4669 * object holding that info (multiple shards), we use index to
4670 * specify that shard number. Once index exceeds max shards it
4672 * If name is not being set, results for all users will be returned
4673 * and index will wrap only after total shards number.
4675 * @param cct [in] ceph context
4676 * @param name [in] user name
4677 * @param hash [out] hash value
4678 * @param index [in] shard index number
4680 static void usage_log_hash(CephContext
*cct
, const string
& name
, string
& hash
, uint32_t index
)
4682 uint32_t val
= index
;
4684 if (!name
.empty()) {
4685 int max_user_shards
= max(cct
->_conf
->rgw_usage_max_user_shards
, 1);
4686 val
%= max_user_shards
;
4687 val
+= ceph_str_hash_linux(name
.c_str(), name
.size());
4690 int max_shards
= max(cct
->_conf
->rgw_usage_max_shards
, 1);
4691 snprintf(buf
, sizeof(buf
), RGW_USAGE_OBJ_PREFIX
"%u", (unsigned)(val
% max_shards
));
4695 int RGWRados::log_usage(map
<rgw_user_bucket
, RGWUsageBatch
>& usage_info
)
4699 map
<string
, rgw_usage_log_info
> log_objs
;
4704 /* restructure usage map, zone by object hash */
4705 map
<rgw_user_bucket
, RGWUsageBatch
>::iterator iter
;
4706 for (iter
= usage_info
.begin(); iter
!= usage_info
.end(); ++iter
) {
4707 const rgw_user_bucket
& ub
= iter
->first
;
4708 RGWUsageBatch
& info
= iter
->second
;
4710 if (ub
.user
.empty()) {
4711 ldout(cct
, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub
.bucket
<< "), skipping" << dendl
;
4715 if (ub
.user
!= last_user
) {
4716 /* index *should* be random, but why waste extra cycles
4717 in most cases max user shards is not going to exceed 1,
4718 so just incrementing it */
4719 usage_log_hash(cct
, ub
.user
, hash
, index
++);
4721 last_user
= ub
.user
;
4722 vector
<rgw_usage_log_entry
>& v
= log_objs
[hash
].entries
;
4724 for (auto miter
= info
.m
.begin(); miter
!= info
.m
.end(); ++miter
) {
4725 v
.push_back(miter
->second
);
4729 map
<string
, rgw_usage_log_info
>::iterator liter
;
4731 for (liter
= log_objs
.begin(); liter
!= log_objs
.end(); ++liter
) {
4732 int r
= cls_obj_usage_log_add(liter
->first
, liter
->second
);
4739 int RGWRados::read_usage(const rgw_user
& user
, uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
4740 bool *is_truncated
, RGWUsageIter
& usage_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
)
4742 uint32_t num
= max_entries
;
4743 string hash
, first_hash
;
4744 string user_str
= user
.to_str();
4745 usage_log_hash(cct
, user_str
, first_hash
, 0);
4747 if (usage_iter
.index
) {
4748 usage_log_hash(cct
, user_str
, hash
, usage_iter
.index
);
4756 map
<rgw_user_bucket
, rgw_usage_log_entry
> ret_usage
;
4757 map
<rgw_user_bucket
, rgw_usage_log_entry
>::iterator iter
;
4759 int ret
= cls_obj_usage_log_read(hash
, user_str
, start_epoch
, end_epoch
, num
,
4760 usage_iter
.read_iter
, ret_usage
, is_truncated
);
4767 num
-= ret_usage
.size();
4769 for (iter
= ret_usage
.begin(); iter
!= ret_usage
.end(); ++iter
) {
4770 usage
[iter
->first
].aggregate(iter
->second
);
4774 if (!*is_truncated
) {
4775 usage_iter
.read_iter
.clear();
4776 usage_log_hash(cct
, user_str
, hash
, ++usage_iter
.index
);
4778 } while (num
&& !*is_truncated
&& hash
!= first_hash
);
4782 int RGWRados::trim_usage(rgw_user
& user
, uint64_t start_epoch
, uint64_t end_epoch
)
4785 string hash
, first_hash
;
4786 string user_str
= user
.to_str();
4787 usage_log_hash(cct
, user_str
, first_hash
, index
);
4792 int ret
= cls_obj_usage_log_trim(hash
, user_str
, start_epoch
, end_epoch
);
4800 usage_log_hash(cct
, user_str
, hash
, ++index
);
4801 } while (hash
!= first_hash
);
4806 #define MAX_SHARDS_PRIME 7877
4808 int RGWRados::key_to_shard_id(const string
& key
, int max_shards
)
4810 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size()) % MAX_SHARDS_PRIME
;
4811 return val
% max_shards
;
4814 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& key
, string
& name
, int *shard_id
)
4816 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
4819 *shard_id
= val
% max_shards
;
4821 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
4822 name
= prefix
+ buf
;
4825 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& section
, const string
& key
, string
& name
)
4827 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
4828 val
^= ceph_str_hash_linux(section
.c_str(), section
.size());
4830 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
4831 name
= prefix
+ buf
;
4834 void RGWRados::shard_name(const string
& prefix
, unsigned shard_id
, string
& name
)
4837 snprintf(buf
, sizeof(buf
), "%u", shard_id
);
4838 name
= prefix
+ buf
;
4842 void RGWRados::time_log_prepare_entry(cls_log_entry
& entry
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
4844 cls_log_add_prepare_entry(entry
, utime_t(ut
), section
, key
, bl
);
4847 int RGWRados::time_log_add_init(librados::IoCtx
& io_ctx
)
4849 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
, true);
4853 int RGWRados::time_log_add(const string
& oid
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
4855 librados::IoCtx io_ctx
;
4857 int r
= time_log_add_init(io_ctx
);
4862 ObjectWriteOperation op
;
4864 cls_log_add(op
, t
, section
, key
, bl
);
4866 return io_ctx
.operate(oid
, &op
);
4869 int RGWRados::time_log_add(const string
& oid
, list
<cls_log_entry
>& entries
,
4870 librados::AioCompletion
*completion
, bool monotonic_inc
)
4872 librados::IoCtx io_ctx
;
4874 int r
= time_log_add_init(io_ctx
);
4879 ObjectWriteOperation op
;
4880 cls_log_add(op
, entries
, monotonic_inc
);
4883 r
= io_ctx
.operate(oid
, &op
);
4885 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
4890 int RGWRados::time_log_list(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
4891 int max_entries
, list
<cls_log_entry
>& entries
,
4892 const string
& marker
,
4896 librados::IoCtx io_ctx
;
4898 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
4901 librados::ObjectReadOperation op
;
4903 utime_t
st(start_time
);
4904 utime_t
et(end_time
);
4906 cls_log_list(op
, st
, et
, marker
, max_entries
, entries
,
4907 out_marker
, truncated
);
4911 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
4918 int RGWRados::time_log_info(const string
& oid
, cls_log_header
*header
)
4920 librados::IoCtx io_ctx
;
4922 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
4925 librados::ObjectReadOperation op
;
4927 cls_log_info(op
, header
);
4931 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
4938 int RGWRados::time_log_info_async(librados::IoCtx
& io_ctx
, const string
& oid
, cls_log_header
*header
, librados::AioCompletion
*completion
)
4940 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
4944 librados::ObjectReadOperation op
;
4946 cls_log_info(op
, header
);
4948 int ret
= io_ctx
.aio_operate(oid
, completion
, &op
, NULL
);
4955 int RGWRados::time_log_trim(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
4956 const string
& from_marker
, const string
& to_marker
,
4957 librados::AioCompletion
*completion
)
4959 librados::IoCtx io_ctx
;
4961 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
4965 utime_t
st(start_time
);
4966 utime_t
et(end_time
);
4968 ObjectWriteOperation op
;
4969 cls_log_trim(op
, st
, et
, from_marker
, to_marker
);
4972 r
= io_ctx
.operate(oid
, &op
);
4974 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
4979 string
RGWRados::objexp_hint_get_shardname(int shard_num
)
4982 snprintf(buf
, sizeof(buf
), "%010u", (unsigned)shard_num
);
4984 string
objname("obj_delete_at_hint.");
4985 return objname
+ buf
;
4988 #define MAX_OBJEXP_SHARDS_PRIME 7877
4990 int RGWRados::objexp_key_shard(const rgw_obj_index_key
& key
)
4992 string obj_key
= key
.name
+ key
.instance
;
4993 int num_shards
= cct
->_conf
->rgw_objexp_hints_num_shards
;
4994 uint32_t sid
= ceph_str_hash_linux(obj_key
.c_str(), obj_key
.size());
4995 uint32_t sid2
= sid
^ ((sid
& 0xFF) << 24);
4996 sid
= sid2
% MAX_OBJEXP_SHARDS_PRIME
% num_shards
;
4997 return sid
% num_shards
;
5000 static string
objexp_hint_get_keyext(const string
& tenant_name
,
5001 const string
& bucket_name
,
5002 const string
& bucket_id
,
5003 const rgw_obj_key
& obj_key
)
5005 return tenant_name
+ (tenant_name
.empty() ? "" : ":") + bucket_name
+ ":" + bucket_id
+
5006 ":" + obj_key
.name
+ ":" + obj_key
.instance
;
5009 int RGWRados::objexp_hint_add(const ceph::real_time
& delete_at
,
5010 const string
& tenant_name
,
5011 const string
& bucket_name
,
5012 const string
& bucket_id
,
5013 const rgw_obj_index_key
& obj_key
)
5015 const string keyext
= objexp_hint_get_keyext(tenant_name
, bucket_name
,
5016 bucket_id
, obj_key
);
5017 objexp_hint_entry he
= {
5018 .tenant
= tenant_name
,
5019 .bucket_name
= bucket_name
,
5020 .bucket_id
= bucket_id
,
5022 .exp_time
= delete_at
};
5025 ObjectWriteOperation op
;
5026 cls_timeindex_add(op
, utime_t(delete_at
), keyext
, hebl
);
5028 string shard_name
= objexp_hint_get_shardname(objexp_key_shard(obj_key
));
5029 return objexp_pool_ctx
.operate(shard_name
, &op
);
5032 void RGWRados::objexp_get_shard(int shard_num
,
5033 string
& shard
) /* out */
5035 shard
= objexp_hint_get_shardname(shard_num
);
5038 int RGWRados::objexp_hint_list(const string
& oid
,
5039 const ceph::real_time
& start_time
,
5040 const ceph::real_time
& end_time
,
5041 const int max_entries
,
5042 const string
& marker
,
5043 list
<cls_timeindex_entry
>& entries
, /* out */
5044 string
*out_marker
, /* out */
5045 bool *truncated
) /* out */
5047 librados::ObjectReadOperation op
;
5048 cls_timeindex_list(op
, utime_t(start_time
), utime_t(end_time
), marker
, max_entries
, entries
,
5049 out_marker
, truncated
);
5052 int ret
= objexp_pool_ctx
.operate(oid
, &op
, &obl
);
5054 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
5058 if ((ret
== -ENOENT
) && truncated
) {
5065 int RGWRados::objexp_hint_parse(cls_timeindex_entry
&ti_entry
, /* in */
5066 objexp_hint_entry
& hint_entry
) /* out */
5069 bufferlist::iterator iter
= ti_entry
.value
.begin();
5070 ::decode(hint_entry
, iter
);
5071 } catch (buffer::error
& err
) {
5072 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
5078 int RGWRados::objexp_hint_trim(const string
& oid
,
5079 const ceph::real_time
& start_time
,
5080 const ceph::real_time
& end_time
,
5081 const string
& from_marker
,
5082 const string
& to_marker
)
5084 int ret
= cls_timeindex_trim(objexp_pool_ctx
, oid
, utime_t(start_time
), utime_t(end_time
),
5085 from_marker
, to_marker
);
5086 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
5093 int RGWRados::lock_exclusive(rgw_pool
& pool
, const string
& oid
, timespan
& duration
,
5094 string
& zone_id
, string
& owner_id
) {
5095 librados::IoCtx io_ctx
;
5097 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
5101 uint64_t msec
= std::chrono::duration_cast
<std::chrono::milliseconds
>(duration
).count();
5102 utime_t
ut(msec
/ 1000, msec
% 1000);
5104 rados::cls::lock::Lock
l(log_lock_name
);
5106 l
.set_cookie(owner_id
);
5110 return l
.lock_exclusive(&io_ctx
, oid
);
5113 int RGWRados::unlock(rgw_pool
& pool
, const string
& oid
, string
& zone_id
, string
& owner_id
) {
5114 librados::IoCtx io_ctx
;
5116 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
5121 rados::cls::lock::Lock
l(log_lock_name
);
5123 l
.set_cookie(owner_id
);
5125 return l
.unlock(&io_ctx
, oid
);
5128 int RGWRados::decode_policy(bufferlist
& bl
, ACLOwner
*owner
)
5130 bufferlist::iterator i
= bl
.begin();
5131 RGWAccessControlPolicy
policy(cct
);
5133 policy
.decode_owner(i
);
5134 } catch (buffer::error
& err
) {
5135 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
5138 *owner
= policy
.get_owner();
5142 int rgw_policy_from_attrset(CephContext
*cct
, map
<string
, bufferlist
>& attrset
, RGWAccessControlPolicy
*policy
)
5144 map
<string
, bufferlist
>::iterator aiter
= attrset
.find(RGW_ATTR_ACL
);
5145 if (aiter
== attrset
.end())
5148 bufferlist
& bl
= aiter
->second
;
5149 bufferlist::iterator iter
= bl
.begin();
5151 policy
->decode(iter
);
5152 } catch (buffer::error
& err
) {
5153 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
5156 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 15)) {
5157 RGWAccessControlPolicy_S3
*s3policy
= static_cast<RGWAccessControlPolicy_S3
*>(policy
);
5158 ldout(cct
, 15) << __func__
<< " Read AccessControlPolicy";
5159 s3policy
->to_xml(*_dout
);
5167 * get listing of the objects in a bucket.
5169 * max: maximum number of results to return
5170 * bucket: bucket to list contents of
5171 * prefix: only return results that match this prefix
5172 * delim: do not include results that match this string.
5173 * Any skipped results will have the matching portion of their name
5174 * inserted in common_prefixes with a "true" mark.
5175 * marker: if filled in, begin the listing with this object.
5176 * end_marker: if filled in, end the listing with this object.
5177 * result: the objects are put in here.
5178 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5179 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5181 int RGWRados::Bucket::List::list_objects(int max
, vector
<rgw_bucket_dir_entry
> *result
,
5182 map
<string
, bool> *common_prefixes
,
5185 RGWRados
*store
= target
->get_store();
5186 CephContext
*cct
= store
->ctx();
5187 int shard_id
= target
->get_shard_id();
5190 bool truncated
= true;
5191 int read_ahead
= std::max(cct
->_conf
->rgw_list_bucket_min_readahead
,max
);
5195 rgw_obj_key
marker_obj(params
.marker
.name
, params
.marker
.instance
, params
.ns
);
5197 rgw_obj_key end_marker_obj
;
5198 rgw_obj_index_key cur_end_marker
;
5199 if (!params
.ns
.empty()) {
5200 end_marker_obj
= rgw_obj_key(params
.end_marker
.name
, params
.end_marker
.instance
, params
.ns
);
5201 end_marker_obj
.ns
= params
.ns
;
5202 end_marker_obj
.get_index_key(&cur_end_marker
);
5204 rgw_obj_index_key cur_marker
;
5205 marker_obj
.get_index_key(&cur_marker
);
5207 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
5209 rgw_obj_key
prefix_obj(params
.prefix
);
5210 prefix_obj
.ns
= params
.ns
;
5211 string cur_prefix
= prefix_obj
.get_index_key_name();
5213 string bigger_than_delim
;
5215 if (!params
.delim
.empty()) {
5216 unsigned long val
= decode_utf8((unsigned char *)params
.delim
.c_str(), params
.delim
.size());
5217 char buf
[params
.delim
.size() + 16];
5218 int r
= encode_utf8(val
+ 1, (unsigned char *)buf
);
5220 ldout(cct
,0) << "ERROR: encode_utf8() failed" << dendl
;
5225 bigger_than_delim
= buf
;
5227 /* if marker points at a common prefix, fast forward it into its upperbound string */
5228 int delim_pos
= cur_marker
.name
.find(params
.delim
, params
.prefix
.size());
5229 if (delim_pos
>= 0) {
5230 string s
= cur_marker
.name
.substr(0, delim_pos
);
5231 s
.append(bigger_than_delim
);
5236 string skip_after_delim
;
5237 while (truncated
&& count
<= max
) {
5238 if (skip_after_delim
> cur_marker
.name
) {
5239 cur_marker
= skip_after_delim
;
5240 ldout(cct
, 20) << "setting cur_marker=" << cur_marker
.name
<< "[" << cur_marker
.instance
<< "]" << dendl
;
5242 std::map
<string
, rgw_bucket_dir_entry
> ent_map
;
5243 int r
= store
->cls_bucket_list(target
->get_bucket_info(), shard_id
, cur_marker
, cur_prefix
,
5244 read_ahead
+ 1 - count
, params
.list_versions
, ent_map
,
5245 &truncated
, &cur_marker
);
5249 std::map
<string
, rgw_bucket_dir_entry
>::iterator eiter
;
5250 for (eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
5251 rgw_bucket_dir_entry
& entry
= eiter
->second
;
5252 rgw_obj_index_key index_key
= entry
.key
;
5254 rgw_obj_key
obj(index_key
);
5256 /* note that parse_raw_oid() here will not set the correct object's instance, as
5257 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5258 * not needed for the checks here and we end up using the raw entry for the return vector
5260 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
5262 ldout(cct
, 0) << "ERROR: could not parse object name: " << obj
.name
<< dendl
;
5265 bool check_ns
= (obj
.ns
== params
.ns
);
5266 if (!params
.list_versions
&& !entry
.is_visible()) {
5270 if (params
.enforce_ns
&& !check_ns
) {
5271 if (!params
.ns
.empty()) {
5272 /* we've iterated past the namespace we're searching -- done now */
5277 /* we're not looking at the namespace this object is in, next! */
5281 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
5287 params
.marker
= index_key
;
5288 next_marker
= index_key
;
5291 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
5294 if (params
.prefix
.size() && (obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
) != 0))
5297 if (!params
.delim
.empty()) {
5298 int delim_pos
= obj
.name
.find(params
.delim
, params
.prefix
.size());
5300 if (delim_pos
>= 0) {
5301 string prefix_key
= obj
.name
.substr(0, delim_pos
+ 1);
5303 if (common_prefixes
&&
5304 common_prefixes
->find(prefix_key
) == common_prefixes
->end()) {
5309 next_marker
= prefix_key
;
5310 (*common_prefixes
)[prefix_key
] = true;
5312 skip_after_delim
= obj
.name
.substr(0, delim_pos
);
5313 skip_after_delim
.append(bigger_than_delim
);
5315 ldout(cct
, 20) << "skip_after_delim=" << skip_after_delim
<< dendl
;
5329 result
->emplace_back(std::move(entry
));
5333 // Either the back-end told us the listing was truncated, or we did not
5334 // consume all of the items returned, up to the amount the caller requested
5335 truncated
= (truncated
|| eiter
!= ent_map
.end());
5340 *is_truncated
= truncated
;
5346 * create a rados pool, associated meta info
5347 * returns 0 on success, -ERR# otherwise.
5349 int RGWRados::create_pool(const rgw_pool
& pool
)
5353 librados::Rados
*rad
= get_rados_handle();
5354 ret
= rad
->pool_create(pool
.name
.c_str(), 0);
5357 else if (ret
== -ERANGE
) {
5360 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret
)
5361 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5370 int RGWRados::init_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
5372 librados::IoCtx index_ctx
; // context for new bucket
5374 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5378 string dir_oid
= dir_oid_prefix
;
5379 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
5381 map
<int, string
> bucket_objs
;
5382 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
5384 return CLSRGWIssueBucketIndexInit(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
5387 void RGWRados::create_bucket_id(string
*bucket_id
)
5389 uint64_t iid
= instance_id();
5390 uint64_t bid
= next_bucket_id();
5391 char buf
[get_zone_params().get_id().size() + 48];
5392 snprintf(buf
, sizeof(buf
), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid
, (long long)bid
);
5397 * create a bucket with name bucket and the given list of attrs
5398 * returns 0 on success, -ERR# otherwise.
5400 int RGWRados::create_bucket(RGWUserInfo
& owner
, rgw_bucket
& bucket
,
5401 const string
& zonegroup_id
,
5402 const string
& placement_rule
,
5403 const string
& swift_ver_location
,
5404 const RGWQuotaInfo
* pquota_info
,
5405 map
<std::string
, bufferlist
>& attrs
,
5406 RGWBucketInfo
& info
,
5408 obj_version
*pep_objv
,
5409 real_time creation_time
,
5410 rgw_bucket
*pmaster_bucket
,
5411 uint32_t *pmaster_num_shards
,
5414 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5415 string selected_placement_rule_name
;
5416 RGWZonePlacementInfo rule_info
;
5418 for (int i
= 0; i
< MAX_CREATE_RETRIES
; i
++) {
5420 ret
= select_bucket_placement(owner
, zonegroup_id
, placement_rule
,
5421 &selected_placement_rule_name
, &rule_info
);
5425 if (!pmaster_bucket
) {
5426 create_bucket_id(&bucket
.marker
);
5427 bucket
.bucket_id
= bucket
.marker
;
5429 bucket
.marker
= pmaster_bucket
->marker
;
5430 bucket
.bucket_id
= pmaster_bucket
->bucket_id
;
5433 RGWObjVersionTracker
& objv_tracker
= info
.objv_tracker
;
5436 objv_tracker
.write_version
= *pobjv
;
5438 objv_tracker
.generate_new_write_ver(cct
);
5441 info
.bucket
= bucket
;
5442 info
.owner
= owner
.user_id
;
5443 info
.zonegroup
= zonegroup_id
;
5444 info
.placement_rule
= selected_placement_rule_name
;
5445 info
.index_type
= rule_info
.index_type
;
5446 info
.swift_ver_location
= swift_ver_location
;
5447 info
.swift_versioning
= (!swift_ver_location
.empty());
5448 if (pmaster_num_shards
) {
5449 info
.num_shards
= *pmaster_num_shards
;
5451 info
.num_shards
= bucket_index_max_shards
;
5453 info
.bucket_index_shard_hash_type
= RGWBucketInfo::MOD
;
5454 info
.requester_pays
= false;
5455 if (real_clock::is_zero(creation_time
)) {
5456 info
.creation_time
= ceph::real_clock::now();
5458 info
.creation_time
= creation_time
;
5461 info
.quota
= *pquota_info
;
5464 int r
= init_bucket_index(info
, info
.num_shards
);
5469 ret
= put_linked_bucket_info(info
, exclusive
, ceph::real_time(), pep_objv
, &attrs
, true);
5470 if (ret
== -EEXIST
) {
5471 librados::IoCtx index_ctx
;
5472 map
<int, string
> bucket_objs
;
5473 int r
= open_bucket_index(info
, index_ctx
, bucket_objs
);
5477 /* we need to reread the info and return it, caller will have a use for it */
5478 RGWObjVersionTracker instance_ver
= info
.objv_tracker
;
5479 info
.objv_tracker
.clear();
5480 RGWObjectCtx
obj_ctx(this);
5481 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, NULL
);
5486 ldout(cct
, 0) << "get_bucket_info returned " << r
<< dendl
;
5490 /* only remove it if it's a different bucket instance */
5491 if (info
.bucket
.bucket_id
!= bucket
.bucket_id
) {
5492 /* remove bucket meta instance */
5493 string entry
= bucket
.get_key();
5494 r
= rgw_bucket_instance_remove_entry(this, entry
, &instance_ver
);
5498 map
<int, string
>::const_iterator biter
;
5499 for (biter
= bucket_objs
.begin(); biter
!= bucket_objs
.end(); ++biter
) {
5500 // Do best effort removal
5501 index_ctx
.remove(biter
->second
);
5504 /* ret == -ENOENT here */
5509 /* this is highly unlikely */
5510 ldout(cct
, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl
;
5514 int RGWRados::select_new_bucket_location(RGWUserInfo
& user_info
, const string
& zonegroup_id
, const string
& request_rule
,
5515 string
*pselected_rule_name
, RGWZonePlacementInfo
*rule_info
)
5518 /* first check that rule exists within the specific zonegroup */
5519 RGWZoneGroup zonegroup
;
5520 int ret
= get_zonegroup(zonegroup_id
, zonegroup
);
5522 ldout(cct
, 0) << "could not find zonegroup " << zonegroup_id
<< " in current period" << dendl
;
5526 /* now check that tag exists within zonegroup */
5527 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5528 string rule
= request_rule
;
5530 rule
= user_info
.default_placement
;
5532 rule
= zonegroup
.default_placement
;
5536 ldout(cct
, 0) << "misconfiguration, should not have an empty placement rule name" << dendl
;
5540 map
<string
, RGWZoneGroupPlacementTarget
>::iterator titer
= zonegroup
.placement_targets
.find(rule
);
5541 if (titer
== zonegroup
.placement_targets
.end()) {
5542 ldout(cct
, 0) << "could not find placement rule " << rule
<< " within zonegroup " << dendl
;
5546 /* now check tag for the rule, whether user is permitted to use rule */
5547 RGWZoneGroupPlacementTarget
& target_rule
= titer
->second
;
5548 if (!target_rule
.user_permitted(user_info
.placement_tags
)) {
5549 ldout(cct
, 0) << "user not permitted to use placement rule" << dendl
;
5553 if (pselected_rule_name
)
5554 *pselected_rule_name
= rule
;
5556 return select_bucket_location_by_rule(rule
, rule_info
);
5559 int RGWRados::select_bucket_location_by_rule(const string
& location_rule
, RGWZonePlacementInfo
*rule_info
)
5561 if (location_rule
.empty()) {
5562 /* we can only reach here if we're trying to set a bucket location from a bucket
5563 * created on a different zone, using a legacy / default pool configuration
5565 return select_legacy_bucket_placement(rule_info
);
5569 * make sure that zone has this rule configured. We're
5570 * checking it for the local zone, because that's where this bucket object is going to
5573 map
<string
, RGWZonePlacementInfo
>::iterator piter
= get_zone_params().placement_pools
.find(location_rule
);
5574 if (piter
== get_zone_params().placement_pools
.end()) {
5575 /* couldn't find, means we cannot really place data for this bucket in this zone */
5576 if (get_zonegroup().equals(zonegroup_id
)) {
5577 /* that's a configuration error, zone should have that rule, as we're within the requested
5581 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5586 RGWZonePlacementInfo
& placement_info
= piter
->second
;
5589 *rule_info
= placement_info
;
5595 int RGWRados::select_bucket_placement(RGWUserInfo
& user_info
, const string
& zonegroup_id
, const string
& placement_rule
,
5596 string
*pselected_rule_name
, RGWZonePlacementInfo
*rule_info
)
5598 if (!get_zone_params().placement_pools
.empty()) {
5599 return select_new_bucket_location(user_info
, zonegroup_id
, placement_rule
,
5600 pselected_rule_name
, rule_info
);
5603 if (pselected_rule_name
) {
5604 pselected_rule_name
->clear();
5607 return select_legacy_bucket_placement(rule_info
);
5610 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo
*rule_info
)
5613 map
<string
, bufferlist
> m
;
5615 bool write_map
= false;
5617 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
5619 RGWObjectCtx
obj_ctx(this);
5620 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
, avail_pools
, map_bl
, NULL
, NULL
);
5626 bufferlist::iterator iter
= map_bl
.begin();
5628 } catch (buffer::error
& err
) {
5629 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
5635 ret
= omap_get_all(obj
, header
, m
);
5640 if (ret
< 0 || m
.empty()) {
5641 vector
<rgw_pool
> pools
;
5642 string s
= string("default.") + default_storage_pool_suffix
;
5643 pools
.push_back(rgw_pool(s
));
5644 vector
<int> retcodes
;
5646 ret
= create_pools(pools
, retcodes
);
5649 ret
= omap_set(obj
, s
, bl
);
5657 ::encode(m
, new_bl
);
5658 ret
= put_system_obj_data(NULL
, obj
, new_bl
, -1, false);
5660 ldout(cct
, 0) << "WARNING: could not save avail pools map info ret=" << ret
<< dendl
;
5664 map
<string
, bufferlist
>::iterator miter
;
5667 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
5668 v
.push_back(miter
->first
);
5672 ret
= get_random_bytes((char *)&r
, sizeof(r
));
5676 int i
= r
% v
.size();
5680 pool_name
= miter
->first
;
5683 rule_info
->data_pool
= pool_name
;
5684 rule_info
->data_extra_pool
= pool_name
;
5685 rule_info
->index_pool
= pool_name
;
5686 rule_info
->index_type
= RGWBIType_Normal
;
5691 bool RGWRados::get_obj_data_pool(const string
& placement_rule
, const rgw_obj
& obj
, rgw_pool
*pool
)
5693 return rgw_get_obj_data_pool(zonegroup
, zone_params
, placement_rule
, obj
, pool
);
5696 bool RGWRados::obj_to_raw(const string
& placement_rule
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
5698 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
5700 return get_obj_data_pool(placement_rule
, obj
, &raw_obj
->pool
);
5703 int RGWRados::update_placement_map()
5706 map
<string
, bufferlist
> m
;
5707 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
5708 int ret
= omap_get_all(obj
, header
, m
);
5713 ::encode(m
, new_bl
);
5714 ret
= put_system_obj_data(NULL
, obj
, new_bl
, -1, false);
5716 ldout(cct
, 0) << "WARNING: could not save avail pools map info ret=" << ret
<< dendl
;
5722 int RGWRados::add_bucket_placement(const rgw_pool
& new_pool
)
5724 librados::Rados
*rad
= get_rados_handle();
5725 int ret
= rad
->pool_lookup(new_pool
.name
.c_str());
5726 if (ret
< 0) // DNE, or something
5729 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
5730 bufferlist empty_bl
;
5731 ret
= omap_set(obj
, new_pool
.to_str(), empty_bl
);
5733 // don't care about return value
5734 update_placement_map();
5739 int RGWRados::remove_bucket_placement(const rgw_pool
& old_pool
)
5741 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
5742 int ret
= omap_del(obj
, old_pool
.to_str());
5744 // don't care about return value
5745 update_placement_map();
5750 int RGWRados::list_placement_set(set
<rgw_pool
>& names
)
5753 map
<string
, bufferlist
> m
;
5755 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
5756 int ret
= omap_get_all(obj
, header
, m
);
5761 map
<string
, bufferlist
>::iterator miter
;
5762 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
5763 names
.insert(rgw_pool(miter
->first
));
5766 return names
.size();
5769 int RGWRados::create_pools(vector
<rgw_pool
>& pools
, vector
<int>& retcodes
)
5771 vector
<librados::PoolAsyncCompletion
*> completions
;
5774 librados::Rados
*rad
= get_rados_handle();
5775 for (auto iter
= pools
.begin(); iter
!= pools
.end(); ++iter
) {
5776 librados::PoolAsyncCompletion
*c
= librados::Rados::pool_async_create_completion();
5777 completions
.push_back(c
);
5778 rgw_pool
& pool
= *iter
;
5779 int ret
= rad
->pool_create_async(pool
.name
.c_str(), c
);
5780 rets
.push_back(ret
);
5783 vector
<int>::iterator riter
;
5784 vector
<librados::PoolAsyncCompletion
*>::iterator citer
;
5786 assert(rets
.size() == completions
.size());
5787 for (riter
= rets
.begin(), citer
= completions
.begin(); riter
!= rets
.end(); ++riter
, ++citer
) {
5789 PoolAsyncCompletion
*c
= *citer
;
5792 r
= c
->get_return_value();
5794 ldout(cct
, 0) << "WARNING: async pool_create returned " << r
<< dendl
;
5798 retcodes
.push_back(r
);
5803 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, librados::IoCtx
*ioctx
)
5806 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
5809 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
5810 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
5814 int r
= open_pool_ctx(pool
, *ioctx
);
5819 ioctx
->locator_set_key(key
);
5824 int RGWRados::get_obj_head_ref(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_rados_ref
*ref
)
5826 get_obj_bucket_and_oid_loc(obj
, ref
->oid
, ref
->key
);
5829 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
5830 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
5834 int r
= open_pool_ctx(pool
, ref
->ioctx
);
5839 ref
->ioctx
.locator_set_key(ref
->key
);
5844 int RGWRados::get_raw_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
, rgw_pool
*pool
)
5851 if (ref
->oid
.empty()) {
5852 ref
->oid
= obj
.pool
.to_str();
5853 ref
->pool
= get_zone_params().domain_root
;
5855 ref
->pool
= obj
.pool
;
5860 r
= open_pool_ctx(ref
->pool
, ref
->ioctx
);
5864 ref
->ioctx
.locator_set_key(ref
->key
);
5869 int RGWRados::get_system_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
, rgw_pool
*pool
)
5871 return get_raw_obj_ref(obj
, ref
, pool
);
5875 * fixes an issue where head objects were supposed to have a locator created, but ended
5878 int RGWRados::fix_head_obj_locator(const RGWBucketInfo
& bucket_info
, bool copy_obj
, bool remove_bad
, rgw_obj_key
& key
)
5880 const rgw_bucket
& bucket
= bucket_info
.bucket
;
5884 rgw_obj
obj(bucket
, key
);
5886 get_obj_bucket_and_oid_loc(obj
, oid
, locator
);
5888 if (locator
.empty()) {
5889 ldout(cct
, 20) << "object does not have a locator, nothing to fix" << dendl
;
5893 librados::IoCtx ioctx
;
5895 int ret
= get_obj_head_ioctx(bucket_info
, obj
, &ioctx
);
5897 cerr
<< "ERROR: get_obj_head_ioctx() returned ret=" << ret
<< std::endl
;
5900 ioctx
.locator_set_key(string()); /* override locator for this object, use empty locator */
5905 struct timespec mtime_ts
;
5906 map
<string
, bufferlist
> attrs
;
5907 librados::ObjectReadOperation op
;
5908 op
.getxattrs(&attrs
, NULL
);
5909 op
.stat2(&size
, &mtime_ts
, NULL
);
5910 #define HEAD_SIZE 512 * 1024
5911 op
.read(0, HEAD_SIZE
, &data
, NULL
);
5913 ret
= ioctx
.operate(oid
, &op
, NULL
);
5915 lderr(cct
) << "ERROR: ioctx.operate(oid=" << oid
<< ") returned ret=" << ret
<< dendl
;
5919 if (size
> HEAD_SIZE
) {
5920 lderr(cct
) << "ERROR: returned object size (" << size
<< ") > HEAD_SIZE (" << HEAD_SIZE
<< ")" << dendl
;
5924 if (size
!= data
.length()) {
5925 lderr(cct
) << "ERROR: returned object size (" << size
<< ") != data.length() (" << data
.length() << ")" << dendl
;
5930 librados::ObjectWriteOperation wop
;
5932 wop
.mtime2(&mtime_ts
);
5934 map
<string
, bufferlist
>::iterator iter
;
5935 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
5936 wop
.setxattr(iter
->first
.c_str(), iter
->second
);
5941 ioctx
.locator_set_key(locator
);
5942 ioctx
.operate(oid
, &wop
);
5946 ioctx
.locator_set_key(string());
5948 ret
= ioctx
.remove(oid
);
5950 lderr(cct
) << "ERROR: failed to remove original bad object" << dendl
;
5958 int RGWRados::move_rados_obj(librados::IoCtx
& src_ioctx
,
5959 const string
& src_oid
, const string
& src_locator
,
5960 librados::IoCtx
& dst_ioctx
,
5961 const string
& dst_oid
, const string
& dst_locator
)
5964 #define COPY_BUF_SIZE (4 * 1024 * 1024)
5966 uint64_t chunk_size
= COPY_BUF_SIZE
;
5970 struct timespec mtime_ts
;
5973 if (src_oid
== dst_oid
&& src_locator
== dst_locator
) {
5977 src_ioctx
.locator_set_key(src_locator
);
5978 dst_ioctx
.locator_set_key(dst_locator
);
5982 ObjectReadOperation rop
;
5983 ObjectWriteOperation wop
;
5986 rop
.stat2(&size
, &mtime_ts
, NULL
);
5987 mtime
= real_clock::from_timespec(mtime_ts
);
5989 rop
.read(ofs
, chunk_size
, &data
, NULL
);
5990 ret
= src_ioctx
.operate(src_oid
, &rop
, NULL
);
5995 if (data
.length() == 0) {
6000 wop
.create(true); /* make it exclusive */
6001 wop
.mtime2(&mtime_ts
);
6002 mtime
= real_clock::from_timespec(mtime_ts
);
6004 wop
.write(ofs
, data
);
6005 ret
= dst_ioctx
.operate(dst_oid
, &wop
);
6006 ofs
+= data
.length();
6007 done
= data
.length() != chunk_size
;
6011 lderr(cct
) << "ERROR: " << __func__
<< ": copying " << src_oid
<< " -> " << dst_oid
6012 << ": expected " << size
<< " bytes to copy, ended up with " << ofs
<< dendl
;
6017 src_ioctx
.remove(src_oid
);
6022 lderr(cct
) << "ERROR: failed to copy " << src_oid
<< " -> " << dst_oid
<< dendl
;
6027 * fixes an issue where head objects were supposed to have a locator created, but ended
6030 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo
& bucket_info
, rgw_obj_key
& key
, bool fix
, bool *need_fix
)
6032 const rgw_bucket
& bucket
= bucket_info
.bucket
;
6033 rgw_obj
obj(bucket
, key
);
6040 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6045 RGWObjState
*astate
= NULL
;
6046 RGWObjectCtx
rctx(this);
6047 r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
6051 if (astate
->has_manifest
) {
6052 RGWObjManifest::obj_iterator miter
;
6053 RGWObjManifest
& manifest
= astate
->manifest
;
6054 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
6055 rgw_raw_obj raw_loc
= miter
.get_location().get_raw_obj(this);
6060 rgw_raw_obj_to_obj(manifest
.get_tail_placement().bucket
, raw_loc
, &loc
);
6062 if (loc
.key
.ns
.empty()) {
6063 /* continue, we're only interested in tail objects */
6067 get_obj_bucket_and_oid_loc(loc
, oid
, locator
);
6068 ref
.ioctx
.locator_set_key(locator
);
6070 ldout(cct
, 20) << __func__
<< ": key=" << key
<< " oid=" << oid
<< " locator=" << locator
<< dendl
;
6072 r
= ref
.ioctx
.stat(oid
, NULL
, NULL
);
6078 prepend_bucket_marker(bucket
, loc
.key
.name
, bad_loc
);
6080 /* create a new ioctx with the bad locator */
6081 librados::IoCtx src_ioctx
;
6082 src_ioctx
.dup(ref
.ioctx
);
6083 src_ioctx
.locator_set_key(bad_loc
);
6085 r
= src_ioctx
.stat(oid
, NULL
, NULL
);
6087 /* cannot find a broken part */
6090 ldout(cct
, 20) << __func__
<< ": found bad object part: " << loc
<< dendl
;
6095 r
= move_rados_obj(src_ioctx
, oid
, bad_loc
, ref
.ioctx
, oid
, locator
);
6097 lderr(cct
) << "ERROR: copy_rados_obj() on oid=" << oid
<< " returned r=" << r
<< dendl
;
6106 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
, const rgw_obj
& obj
)
6110 RGWObjectCtx
obj_ctx(store
);
6112 RGWBucketInfo bucket_info
;
6113 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
6118 ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
, obj
.get_hash_object(), &bucket_obj
, &shard_id
);
6120 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
6123 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
6128 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
, int sid
)
6133 RGWObjectCtx
obj_ctx(store
);
6135 RGWBucketInfo bucket_info
;
6136 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
6141 ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
, shard_id
, &bucket_obj
);
6143 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
6146 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
6152 /* Execute @handler on last item in bucket listing for bucket specified
6153 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6154 * to objects matching these criteria. */
6155 int RGWRados::on_last_entry_in_listing(RGWBucketInfo
& bucket_info
,
6156 const std::string
& obj_prefix
,
6157 const std::string
& obj_delim
,
6158 std::function
<int(const rgw_bucket_dir_entry
&)> handler
)
6160 RGWRados::Bucket
target(this, bucket_info
);
6161 RGWRados::Bucket::List
list_op(&target
);
6163 list_op
.params
.prefix
= obj_prefix
;
6164 list_op
.params
.delim
= obj_delim
;
6166 ldout(cct
, 20) << "iterating listing for bucket=" << bucket_info
.bucket
.name
6167 << ", obj_prefix=" << obj_prefix
6168 << ", obj_delim=" << obj_delim
6171 bool is_truncated
= false;
6173 boost::optional
<rgw_bucket_dir_entry
> last_entry
;
6174 /* We need to rewind to the last object in a listing. */
6176 /* List bucket entries in chunks. */
6177 static constexpr int MAX_LIST_OBJS
= 100;
6178 std::vector
<rgw_bucket_dir_entry
> entries(MAX_LIST_OBJS
);
6180 int ret
= list_op
.list_objects(MAX_LIST_OBJS
, &entries
, nullptr,
6184 } else if (!entries
.empty()) {
6185 last_entry
= entries
.back();
6187 } while (is_truncated
);
6190 return handler(*last_entry
);
6193 /* Empty listing - no items we can run handler on. */
6198 int RGWRados::swift_versioning_copy(RGWObjectCtx
& obj_ctx
,
6199 const rgw_user
& user
,
6200 RGWBucketInfo
& bucket_info
,
6203 if (! swift_versioning_enabled(bucket_info
)) {
6207 obj_ctx
.obj
.set_atomic(obj
);
6209 RGWObjState
* state
= nullptr;
6210 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &state
, false);
6215 if (!state
->exists
) {
6222 const string
& src_name
= obj
.get_oid();
6223 char buf
[src_name
.size() + 32];
6224 struct timespec ts
= ceph::real_clock::to_timespec(state
->mtime
);
6225 snprintf(buf
, sizeof(buf
), "%03x%s/%lld.%06ld", (int)src_name
.size(),
6226 src_name
.c_str(), (long long)ts
.tv_sec
, ts
.tv_nsec
/ 1000);
6228 RGWBucketInfo dest_bucket_info
;
6230 r
= get_bucket_info(obj_ctx
, bucket_info
.bucket
.tenant
, bucket_info
.swift_ver_location
, dest_bucket_info
, NULL
, NULL
);
6232 ldout(cct
, 10) << "failed to read dest bucket info: r=" << r
<< dendl
;
6234 return -ERR_PRECONDITION_FAILED
;
6239 if (dest_bucket_info
.owner
!= bucket_info
.owner
) {
6240 return -ERR_PRECONDITION_FAILED
;
6243 rgw_obj
dest_obj(dest_bucket_info
.bucket
, buf
);
6244 obj_ctx
.obj
.set_atomic(dest_obj
);
6248 r
= copy_obj(obj_ctx
,
6252 NULL
, /* req_info *info */
6258 NULL
, /* time_t *src_mtime */
6259 NULL
, /* time_t *mtime */
6260 NULL
, /* const time_t *mod_ptr */
6261 NULL
, /* const time_t *unmod_ptr */
6262 false, /* bool high_precision_time */
6263 NULL
, /* const char *if_match */
6264 NULL
, /* const char *if_nomatch */
6265 RGWRados::ATTRSMOD_NONE
,
6266 true, /* bool copy_if_newer */
6268 RGW_OBJ_CATEGORY_MAIN
,
6269 0, /* uint64_t olh_epoch */
6270 real_time(), /* time_t delete_at */
6271 NULL
, /* string *version_id */
6272 NULL
, /* string *ptag */
6273 NULL
, /* string *petag */
6274 NULL
, /* struct rgw_err *err */
6275 NULL
, /* void (*progress_cb)(off_t, void *) */
6276 NULL
); /* void *progress_data */
6277 if (r
== -ECANCELED
|| r
== -ENOENT
) {
6278 /* Has already been overwritten, meaning another rgw process already
6286 int RGWRados::swift_versioning_restore(RGWObjectCtx
& obj_ctx
,
6287 const rgw_user
& user
,
6288 RGWBucketInfo
& bucket_info
,
6290 bool& restored
) /* out */
6292 if (! swift_versioning_enabled(bucket_info
)) {
6296 /* Bucket info of the bucket that stores previous versions of our object. */
6297 RGWBucketInfo archive_binfo
;
6299 int ret
= get_bucket_info(obj_ctx
, bucket_info
.bucket
.tenant
,
6300 bucket_info
.swift_ver_location
, archive_binfo
,
6306 /* Abort the operation if the bucket storing our archive belongs to someone
6307 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6308 * into consideration. For we can live with that.
6310 * TODO: delegate this check to un upper layer and compare with ACLs. */
6311 if (bucket_info
.owner
!= archive_binfo
.owner
) {
6315 /* This code will be executed on latest version of the object. */
6316 const auto handler
= [&](const rgw_bucket_dir_entry
& entry
) -> int {
6317 std::string no_client_id
;
6318 std::string no_op_id
;
6319 std::string no_zone
;
6321 /* We don't support object versioning of Swift API on those buckets that
6322 * are already versioned using the S3 mechanism. This affects also bucket
6323 * storing archived objects. Otherwise the delete operation would create
6324 * a deletion marker. */
6325 if (archive_binfo
.versioned()) {
6327 return -ERR_PRECONDITION_FAILED
;
6330 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6331 * irrelevant and may be safely skipped. */
6332 std::map
<std::string
, ceph::bufferlist
> no_attrs
;
6334 rgw_obj
archive_obj(archive_binfo
.bucket
, entry
.key
);
6335 obj_ctx
.obj
.set_atomic(archive_obj
);
6336 obj_ctx
.obj
.set_atomic(obj
);
6338 int ret
= copy_obj(obj_ctx
,
6342 nullptr, /* req_info *info */
6345 archive_obj
, /* src obj */
6346 bucket_info
, /* dest bucket info */
6347 archive_binfo
, /* src bucket info */
6348 nullptr, /* time_t *src_mtime */
6349 nullptr, /* time_t *mtime */
6350 nullptr, /* const time_t *mod_ptr */
6351 nullptr, /* const time_t *unmod_ptr */
6352 false, /* bool high_precision_time */
6353 nullptr, /* const char *if_match */
6354 nullptr, /* const char *if_nomatch */
6355 RGWRados::ATTRSMOD_NONE
,
6356 true, /* bool copy_if_newer */
6358 RGW_OBJ_CATEGORY_MAIN
,
6359 0, /* uint64_t olh_epoch */
6360 real_time(), /* time_t delete_at */
6361 nullptr, /* string *version_id */
6362 nullptr, /* string *ptag */
6363 nullptr, /* string *petag */
6364 nullptr, /* struct rgw_err *err */
6365 nullptr, /* void (*progress_cb)(off_t, void *) */
6366 nullptr); /* void *progress_data */
6367 if (ret
== -ECANCELED
|| ret
== -ENOENT
) {
6368 /* Has already been overwritten, meaning another rgw process already
6371 } else if (ret
< 0) {
6377 /* Need to remove the archived copy. */
6378 ret
= delete_obj(obj_ctx
, archive_binfo
, archive_obj
,
6379 archive_binfo
.versioning_status());
6384 const std::string
& obj_name
= obj
.get_oid();
6385 const auto prefix
= boost::str(boost::format("%03x%s") % obj_name
.size()
6388 return on_last_entry_in_listing(archive_binfo
, prefix
, std::string(),
6393 * Write/overwrite an object to the bucket storage.
6394 * bucket: the bucket to store the object in
6395 * obj: the object name/key
6396 * data: the object contents/value
6397 * size: the amount of data to write (data must be this long)
6398 * accounted_size: original size of data before compression, encryption
6399 * mtime: if non-NULL, writes the given mtime to the bucket storage
6400 * attrs: all the given attrs are written to bucket storage for the given object
6401 * exclusive: create object exclusively
6402 * Returns: 0 on success, -ERR# otherwise.
6404 int RGWRados::Object::Write::_do_write_meta(uint64_t size
, uint64_t accounted_size
,
6405 map
<string
, bufferlist
>& attrs
, bool assume_noent
,
6408 RGWRados::Bucket::UpdateIndex
*index_op
= static_cast<RGWRados::Bucket::UpdateIndex
*>(_index_op
);
6411 RGWRados
*store
= target
->get_store();
6413 ObjectWriteOperation op
;
6416 int r
= target
->get_state(&state
, false, assume_noent
);
6420 rgw_obj
& obj
= target
->get_obj();
6422 if (obj
.get_oid().empty()) {
6423 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< "(): cannot write object with empty name" << dendl
;
6427 r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
6431 bool is_olh
= state
->is_olh
;
6433 bool reset_obj
= (meta
.flags
& PUT_OBJ_CREATE
) != 0;
6435 const string
*ptag
= meta
.ptag
;
6436 if (!ptag
&& !index_op
->get_optag()->empty()) {
6437 ptag
= index_op
->get_optag();
6439 r
= target
->prepare_atomic_modification(op
, reset_obj
, ptag
, meta
.if_match
, meta
.if_nomatch
, false);
6443 if (real_clock::is_zero(meta
.set_mtime
)) {
6444 meta
.set_mtime
= real_clock::now();
6447 if (state
->is_olh
) {
6448 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, state
->olh_tag
);
6451 struct timespec mtime_ts
= real_clock::to_timespec(meta
.set_mtime
);
6452 op
.mtime2(&mtime_ts
);
6455 /* if we want to overwrite the data, we also want to overwrite the
6456 xattrs, so just remove the object */
6457 op
.write_full(*meta
.data
);
6461 string content_type
;
6464 map
<string
, bufferlist
>::iterator iter
;
6466 for (iter
= meta
.rmattrs
->begin(); iter
!= meta
.rmattrs
->end(); ++iter
) {
6467 const string
& name
= iter
->first
;
6468 op
.rmxattr(name
.c_str());
6472 if (meta
.manifest
) {
6473 /* remove existing manifest attr */
6474 iter
= attrs
.find(RGW_ATTR_MANIFEST
);
6475 if (iter
!= attrs
.end())
6479 ::encode(*meta
.manifest
, bl
);
6480 op
.setxattr(RGW_ATTR_MANIFEST
, bl
);
6483 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6484 const string
& name
= iter
->first
;
6485 bufferlist
& bl
= iter
->second
;
6490 op
.setxattr(name
.c_str(), bl
);
6492 if (name
.compare(RGW_ATTR_ETAG
) == 0) {
6494 } else if (name
.compare(RGW_ATTR_CONTENT_TYPE
) == 0) {
6495 content_type
= bl
.c_str();
6496 } else if (name
.compare(RGW_ATTR_ACL
) == 0) {
6500 if (attrs
.find(RGW_ATTR_PG_VER
) == attrs
.end()) {
6501 cls_rgw_obj_store_pg_ver(op
, RGW_ATTR_PG_VER
);
6504 if (attrs
.find(RGW_ATTR_SOURCE_ZONE
) == attrs
.end()) {
6506 ::encode(store
->get_zone_short_id(), bl
);
6507 op
.setxattr(RGW_ATTR_SOURCE_ZONE
, bl
);
6516 bool orig_exists
= state
->exists
;
6517 uint64_t orig_size
= state
->accounted_size
;
6519 bool versioned_target
= (meta
.olh_epoch
> 0 || !obj
.key
.instance
.empty());
6521 bool versioned_op
= (target
->versioning_enabled() || is_olh
|| versioned_target
);
6524 index_op
->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP
);
6527 if (!index_op
->is_prepared()) {
6528 r
= index_op
->prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
6533 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
6534 if (r
< 0) { /* we can expect to get -ECANCELED if object was replaced under,
6535 or -ENOENT if was removed, or -EEXIST if it did not exist
6536 before and now it does */
6537 if (r
== -EEXIST
&& assume_noent
) {
6538 target
->invalidate_state();
6544 epoch
= ref
.ioctx
.get_last_version();
6545 poolid
= ref
.ioctx
.get_id();
6547 r
= target
->complete_atomic_modification();
6549 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r
<< dendl
;
6552 r
= index_op
->complete(poolid
, epoch
, size
, accounted_size
,
6553 meta
.set_mtime
, etag
, content_type
, &acl_bl
,
6554 meta
.category
, meta
.remove_objs
, meta
.user_data
);
6559 *meta
.mtime
= meta
.set_mtime
;
6562 /* note that index_op was using state so we couldn't invalidate it earlier */
6563 target
->invalidate_state();
6567 r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), obj
, false, NULL
, meta
.olh_epoch
, real_time(), false);
6573 if (!real_clock::is_zero(meta
.delete_at
)) {
6574 rgw_obj_index_key obj_key
;
6575 obj
.key
.get_index_key(&obj_key
);
6577 r
= store
->objexp_hint_add(meta
.delete_at
,
6578 obj
.bucket
.tenant
, obj
.bucket
.name
, obj
.bucket
.bucket_id
, obj_key
);
6580 ldout(store
->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r
<< ", object will not get removed" << dendl
;
6581 /* ignoring error, nothing we can do at this point */
6584 meta
.canceled
= false;
6586 /* update quota cache */
6587 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
6588 accounted_size
, orig_size
);
6592 int ret
= index_op
->cancel();
6594 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret
<< dendl
;
6597 meta
.canceled
= true;
6599 /* we lost in a race. There are a few options:
6600 * - existing object was rewritten (ECANCELED)
6601 * - non existing object was created (EEXIST)
6602 * - object was removed (ENOENT)
6603 * should treat it as a success
6605 if (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
) {
6606 if (r
== -ECANCELED
|| r
== -ENOENT
|| r
== -EEXIST
) {
6610 if (meta
.if_match
!= NULL
) {
6611 // only overwrite existing object
6612 if (strcmp(meta
.if_match
, "*") == 0) {
6614 r
= -ERR_PRECONDITION_FAILED
;
6615 } else if (r
== -ECANCELED
) {
6621 if (meta
.if_nomatch
!= NULL
) {
6622 // only create a new object
6623 if (strcmp(meta
.if_nomatch
, "*") == 0) {
6625 r
= -ERR_PRECONDITION_FAILED
;
6626 } else if (r
== -ENOENT
) {
6636 int RGWRados::Object::Write::write_meta(uint64_t size
, uint64_t accounted_size
,
6637 map
<string
, bufferlist
>& attrs
)
6639 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
6641 RGWRados::Bucket
bop(target
->get_store(), bucket_info
);
6642 RGWRados::Bucket::UpdateIndex
index_op(&bop
, target
->get_obj());
6644 bool assume_noent
= (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
);
6647 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, (void *)&index_op
);
6649 assume_noent
= false;
6652 if (!assume_noent
) {
6653 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, (void *)&index_op
);
6658 /** Write/overwrite a system object. */
6659 int RGWRados::put_system_obj_impl(rgw_raw_obj
& obj
, uint64_t size
, real_time
*mtime
,
6660 map
<std::string
, bufferlist
>& attrs
, int flags
,
6662 RGWObjVersionTracker
*objv_tracker
,
6663 real_time set_mtime
/* 0 for don't set */)
6667 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
6671 ObjectWriteOperation op
;
6673 if (flags
& PUT_OBJ_EXCL
) {
6674 if (!(flags
& PUT_OBJ_CREATE
))
6676 op
.create(true); // exclusive create
6679 op
.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK
);
6684 objv_tracker
->prepare_op_for_write(&op
);
6687 if (real_clock::is_zero(set_mtime
)) {
6688 set_mtime
= real_clock::now();
6691 struct timespec mtime_ts
= real_clock::to_timespec(set_mtime
);
6692 op
.mtime2(&mtime_ts
);
6693 op
.write_full(data
);
6697 for (map
<string
, bufferlist
>::iterator iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6698 const string
& name
= iter
->first
;
6699 bufferlist
& bl
= iter
->second
;
6704 op
.setxattr(name
.c_str(), bl
);
6707 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
6713 objv_tracker
->apply_write();
6723 int RGWRados::put_system_obj_data(void *ctx
, rgw_raw_obj
& obj
, bufferlist
& bl
,
6724 off_t ofs
, bool exclusive
,
6725 RGWObjVersionTracker
*objv_tracker
)
6729 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
6734 ObjectWriteOperation op
;
6740 objv_tracker
->prepare_op_for_write(&op
);
6747 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
6752 objv_tracker
->apply_write();
6758 * Write/overwrite an object to the bucket storage.
6759 * bucket: the bucket to store the object in
6760 * obj: the object name/key
6761 * data: the object contents/value
6762 * offset: the offet to write to in the object
6763 * If this is -1, we will overwrite the whole object.
6764 * size: the amount of data to write (data must be this long)
6765 * attrs: all the given attrs are written to bucket storage for the given object
6766 * Returns: 0 on success, -ERR# otherwise.
6769 int RGWRados::aio_put_obj_data(void *ctx
, rgw_raw_obj
& obj
, bufferlist
& bl
,
6770 off_t ofs
, bool exclusive
,
6774 int r
= get_raw_obj_ref(obj
, &ref
);
6779 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
6782 ObjectWriteOperation op
;
6792 r
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
6799 int RGWRados::aio_wait(void *handle
)
6801 AioCompletion
*c
= (AioCompletion
*)handle
;
6803 int ret
= c
->get_return_value();
6808 bool RGWRados::aio_completed(void *handle
)
6810 AioCompletion
*c
= (AioCompletion
*)handle
;
6811 return c
->is_safe();
6814 class RGWRadosPutObj
: public RGWGetDataCB
6818 RGWPutObjDataProcessor
*filter
;
6819 boost::optional
<RGWPutObj_Compress
>& compressor
;
6820 CompressorRef
& plugin
;
6821 RGWPutObjProcessor_Atomic
*processor
;
6822 RGWOpStateSingleOp
*opstate
;
6823 void (*progress_cb
)(off_t
, void *);
6824 void *progress_data
;
6825 bufferlist extra_data_bl
;
6826 uint64_t extra_data_len
;
6828 map
<string
, bufferlist
> src_attrs
;
6830 RGWRadosPutObj(CephContext
* cct
,
6831 CompressorRef
& plugin
,
6832 boost::optional
<RGWPutObj_Compress
>& compressor
,
6833 RGWPutObjProcessor_Atomic
*p
,
6834 RGWOpStateSingleOp
*_ops
,
6835 void (*_progress_cb
)(off_t
, void *),
6836 void *_progress_data
) :
6839 compressor(compressor
),
6843 progress_cb(_progress_cb
),
6844 progress_data(_progress_data
),
6848 int process_attrs(void) {
6849 if (extra_data_bl
.length()) {
6851 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
6852 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
6856 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
6858 src_attrs
.erase(RGW_ATTR_COMPRESSION
);
6859 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
6862 if (plugin
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
6863 //do not compress if object is encrypted
6864 compressor
= boost::in_place(cct
, plugin
, filter
);
6865 filter
= &*compressor
;
6870 int handle_data(bufferlist
& bl
, off_t ofs
, off_t len
) override
{
6872 progress_cb(ofs
, progress_data
);
6874 if (extra_data_len
) {
6875 size_t extra_len
= bl
.length();
6876 if (extra_len
> extra_data_len
)
6877 extra_len
= extra_data_len
;
6880 bl
.splice(0, extra_len
, &extra
);
6881 extra_data_bl
.append(extra
);
6883 extra_data_len
-= extra_len
;
6884 if (extra_data_len
== 0) {
6885 int res
= process_attrs();
6889 if (bl
.length() == 0) {
6893 data_len
+= bl
.length();
6896 bool need_opstate
= true;
6899 void *handle
= NULL
;
6901 uint64_t size
= bl
.length();
6902 int ret
= filter
->handle_data(bl
, ofs
, &handle
, &obj
, &again
);
6906 if (need_opstate
&& opstate
) {
6907 /* need to update opstate repository with new state. This is ratelimited, so we're not
6908 * really doing it every time
6910 ret
= opstate
->renew_state();
6912 ldout(cct
, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret
<< dendl
;
6913 int r
= filter
->throttle_data(handle
, obj
, size
, false);
6915 ldout(cct
, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r
<< dendl
;
6917 /* could not renew state! might have been marked as cancelled */
6920 need_opstate
= false;
6923 ret
= filter
->throttle_data(handle
, obj
, size
, false);
6931 bufferlist
& get_extra_data() { return extra_data_bl
; }
6933 map
<string
, bufferlist
>& get_attrs() { return src_attrs
; }
6935 void set_extra_data_len(uint64_t len
) override
{
6936 extra_data_len
= len
;
6939 uint64_t get_data_len() {
6943 int complete(const string
& etag
, real_time
*mtime
, real_time set_mtime
,
6944 map
<string
, bufferlist
>& attrs
, real_time delete_at
) {
6945 return processor
->complete(data_len
, etag
, mtime
, set_mtime
, attrs
, delete_at
);
6948 bool is_canceled() {
6949 return processor
->is_canceled();
6954 * prepare attrset depending on attrs_mod.
6956 static void set_copy_attrs(map
<string
, bufferlist
>& src_attrs
,
6957 map
<string
, bufferlist
>& attrs
,
6958 RGWRados::AttrsMod attrs_mod
)
6960 switch (attrs_mod
) {
6961 case RGWRados::ATTRSMOD_NONE
:
6964 case RGWRados::ATTRSMOD_REPLACE
:
6965 if (!attrs
[RGW_ATTR_ETAG
].length()) {
6966 attrs
[RGW_ATTR_ETAG
] = src_attrs
[RGW_ATTR_ETAG
];
6969 case RGWRados::ATTRSMOD_MERGE
:
6970 for (map
<string
, bufferlist
>::iterator it
= src_attrs
.begin(); it
!= src_attrs
.end(); ++it
) {
6971 if (attrs
.find(it
->first
) == attrs
.end()) {
6972 attrs
[it
->first
] = it
->second
;
6979 int RGWRados::rewrite_obj(RGWBucketInfo
& dest_bucket_info
, rgw_obj
& obj
)
6981 map
<string
, bufferlist
> attrset
;
6985 RGWObjectCtx
rctx(this);
6987 RGWRados::Object
op_target(this, dest_bucket_info
, rctx
, obj
);
6988 RGWRados::Object::Read
read_op(&op_target
);
6990 read_op
.params
.attrs
= &attrset
;
6991 read_op
.params
.lastmod
= &mtime
;
6992 read_op
.params
.obj_size
= &obj_size
;
6994 int ret
= read_op
.prepare();
6998 attrset
.erase(RGW_ATTR_ID_TAG
);
7000 uint64_t max_chunk_size
;
7002 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, obj
, &max_chunk_size
);
7004 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj
.bucket
<< dendl
;
7008 return copy_obj_data(rctx
, dest_bucket_info
, read_op
, obj_size
- 1, obj
, obj
, max_chunk_size
, NULL
, mtime
, attrset
,
7009 RGW_OBJ_CATEGORY_MAIN
, 0, real_time(), NULL
, NULL
, NULL
, NULL
);
7012 struct obj_time_weight
{
7014 uint32_t zone_short_id
;
7016 bool high_precision
;
7018 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7020 bool compare_low_precision(const obj_time_weight
& rhs
) {
7021 struct timespec l
= ceph::real_clock::to_timespec(mtime
);
7022 struct timespec r
= ceph::real_clock::to_timespec(rhs
.mtime
);
7031 if (zone_short_id
!= rhs
.zone_short_id
) {
7032 return (zone_short_id
< rhs
.zone_short_id
);
7034 return (pg_ver
< rhs
.pg_ver
);
7038 bool operator<(const obj_time_weight
& rhs
) {
7039 if (!high_precision
|| !rhs
.high_precision
) {
7040 return compare_low_precision(rhs
);
7042 if (mtime
> rhs
.mtime
) {
7045 if (mtime
< rhs
.mtime
) {
7048 if (zone_short_id
!= rhs
.zone_short_id
) {
7049 return (zone_short_id
< rhs
.zone_short_id
);
7051 return (pg_ver
< rhs
.pg_ver
);
7054 void init(const real_time
& _mtime
, uint32_t _short_id
, uint64_t _pg_ver
) {
7056 zone_short_id
= _short_id
;
7060 void init(RGWObjState
*state
) {
7061 mtime
= state
->mtime
;
7062 zone_short_id
= state
->zone_short_id
;
7063 pg_ver
= state
->pg_ver
;
7067 inline ostream
& operator<<(ostream
& out
, const obj_time_weight
&o
) {
7070 if (o
.zone_short_id
!= 0 || o
.pg_ver
!= 0) {
7071 out
<< "[zid=" << o
.zone_short_id
<< ", pgv=" << o
.pg_ver
<< "]";
7077 class RGWGetExtraDataCB
: public RGWGetDataCB
{
7078 bufferlist extra_data
;
7080 RGWGetExtraDataCB() {}
7081 int handle_data(bufferlist
& bl
, off_t bl_ofs
, off_t bl_len
) override
{
7082 if (extra_data
.length() < extra_data_len
) {
7083 off_t max
= extra_data_len
- extra_data
.length();
7087 bl
.splice(0, max
, &extra_data
);
7092 bufferlist
& get_extra_data() {
7097 int RGWRados::stat_remote_obj(RGWObjectCtx
& obj_ctx
,
7098 const rgw_user
& user_id
,
7099 const string
& client_id
,
7101 const string
& source_zone
,
7103 RGWBucketInfo
& src_bucket_info
,
7104 real_time
*src_mtime
,
7106 const real_time
*mod_ptr
,
7107 const real_time
*unmod_ptr
,
7108 bool high_precision_time
,
7109 const char *if_match
,
7110 const char *if_nomatch
,
7111 map
<string
, bufferlist
> *pattrs
,
7116 /* source is in a different zonegroup, copy from there */
7118 RGWRESTStreamRWRequest
*in_stream_req
;
7120 map
<string
, bufferlist
> src_attrs
;
7121 append_rand_alpha(cct
, tag
, tag
, 32);
7122 obj_time_weight set_mtime_weight
;
7123 set_mtime_weight
.high_precision
= high_precision_time
;
7126 if (source_zone
.empty()) {
7127 if (src_bucket_info
.zonegroup
.empty()) {
7128 /* source is in the master zonegroup */
7129 conn
= rest_master_conn
;
7131 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
7132 if (iter
== zonegroup_conn_map
.end()) {
7133 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
7136 conn
= iter
->second
;
7139 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
7140 if (iter
== zone_conn_map
.end()) {
7141 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
7144 conn
= iter
->second
;
7147 RGWGetExtraDataCB cb
;
7149 map
<string
, string
> req_headers
;
7150 real_time set_mtime
;
7152 const real_time
*pmod
= mod_ptr
;
7154 obj_time_weight dest_mtime_weight
;
7156 int ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
7157 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
7158 true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
7159 true /* sync manifest */, &cb
, &in_stream_req
);
7164 ret
= conn
->complete_request(in_stream_req
, etag
, &set_mtime
, psize
, req_headers
);
7169 bufferlist
& extra_data_bl
= cb
.get_extra_data();
7170 if (extra_data_bl
.length()) {
7172 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
7173 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
7177 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
7179 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
7183 *src_mtime
= set_mtime
;
7187 map
<string
, bufferlist
>::iterator iter
= src_attrs
.find(RGW_ATTR_ETAG
);
7188 if (iter
!= src_attrs
.end()) {
7189 bufferlist
& etagbl
= iter
->second
;
7190 *petag
= etagbl
.to_str();
7195 *pattrs
= src_attrs
;
7201 int RGWRados::fetch_remote_obj(RGWObjectCtx
& obj_ctx
,
7202 const rgw_user
& user_id
,
7203 const string
& client_id
,
7204 const string
& op_id
,
7205 bool record_op_state
,
7207 const string
& source_zone
,
7210 RGWBucketInfo
& dest_bucket_info
,
7211 RGWBucketInfo
& src_bucket_info
,
7212 real_time
*src_mtime
,
7214 const real_time
*mod_ptr
,
7215 const real_time
*unmod_ptr
,
7216 bool high_precision_time
,
7217 const char *if_match
,
7218 const char *if_nomatch
,
7221 map
<string
, bufferlist
>& attrs
,
7222 RGWObjCategory category
,
7224 real_time delete_at
,
7227 ceph::buffer::list
*petag
,
7228 struct rgw_err
*err
,
7229 void (*progress_cb
)(off_t
, void *),
7230 void *progress_data
)
7232 /* source is in a different zonegroup, copy from there */
7234 RGWRESTStreamRWRequest
*in_stream_req
;
7237 append_rand_alpha(cct
, tag
, tag
, 32);
7238 obj_time_weight set_mtime_weight
;
7239 set_mtime_weight
.high_precision
= high_precision_time
;
7241 RGWPutObjProcessor_Atomic
processor(obj_ctx
,
7242 dest_bucket_info
, dest_obj
.bucket
, dest_obj
.key
.name
,
7243 cct
->_conf
->rgw_obj_stripe_size
, tag
, dest_bucket_info
.versioning_enabled());
7244 if (version_id
&& *version_id
!= "null") {
7245 processor
.set_version_id(*version_id
);
7247 processor
.set_olh_epoch(olh_epoch
);
7248 int ret
= processor
.prepare(this, NULL
);
7254 if (source_zone
.empty()) {
7255 if (dest_bucket_info
.zonegroup
.empty()) {
7256 /* source is in the master zonegroup */
7257 conn
= rest_master_conn
;
7259 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
7260 if (iter
== zonegroup_conn_map
.end()) {
7261 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
7264 conn
= iter
->second
;
7267 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
7268 if (iter
== zone_conn_map
.end()) {
7269 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
7272 conn
= iter
->second
;
7275 string obj_name
= dest_obj
.bucket
.name
+ "/" + dest_obj
.get_oid();
7277 RGWOpStateSingleOp
*opstate
= NULL
;
7279 if (record_op_state
) {
7280 opstate
= new RGWOpStateSingleOp(this, client_id
, op_id
, obj_name
);
7282 ret
= opstate
->set_state(RGWOpState::OPSTATE_IN_PROGRESS
);
7284 ldout(cct
, 0) << "ERROR: failed to set opstate ret=" << ret
<< dendl
;
7290 boost::optional
<RGWPutObj_Compress
> compressor
;
7291 CompressorRef plugin
;
7293 const auto& compression_type
= zone_params
.get_compression_type(
7294 dest_bucket_info
.placement_rule
);
7295 if (compression_type
!= "none") {
7296 plugin
= Compressor::create(cct
, compression_type
);
7298 ldout(cct
, 1) << "Cannot load plugin for compression type "
7299 << compression_type
<< dendl
;
7303 RGWRadosPutObj
cb(cct
, plugin
, compressor
, &processor
, opstate
, progress_cb
, progress_data
);
7306 map
<string
, string
> req_headers
;
7307 real_time set_mtime
;
7309 RGWObjState
*dest_state
= NULL
;
7311 const real_time
*pmod
= mod_ptr
;
7313 obj_time_weight dest_mtime_weight
;
7315 if (copy_if_newer
) {
7316 /* need to get mtime for destination */
7317 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
7321 if (!real_clock::is_zero(dest_state
->mtime
)) {
7322 dest_mtime_weight
.init(dest_state
);
7323 pmod
= &dest_mtime_weight
.mtime
;
7327 ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
7328 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
7329 true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
7330 true /* sync manifest */, &cb
, &in_stream_req
);
7335 ret
= conn
->complete_request(in_stream_req
, etag
, &set_mtime
, nullptr, req_headers
);
7339 if (compressor
&& compressor
->is_compressed()) {
7341 RGWCompressionInfo cs_info
;
7342 cs_info
.compression_type
= plugin
->get_type_name();
7343 cs_info
.orig_size
= cb
.get_data_len();
7344 cs_info
.blocks
= move(compressor
->get_compression_blocks());
7345 ::encode(cs_info
, tmp
);
7346 cb
.get_attrs()[RGW_ATTR_COMPRESSION
] = tmp
;
7349 if (source_zone
.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7350 cb
.get_attrs().erase(RGW_ATTR_DELETE_AT
);
7352 map
<string
, bufferlist
>::iterator iter
= cb
.get_attrs().find(RGW_ATTR_DELETE_AT
);
7353 if (iter
!= cb
.get_attrs().end()) {
7355 ::decode(delete_at
, iter
->second
);
7356 } catch (buffer::error
& err
) {
7357 ldout(cct
, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl
;
7363 *src_mtime
= set_mtime
;
7367 const auto iter
= cb
.get_attrs().find(RGW_ATTR_ETAG
);
7368 if (iter
!= cb
.get_attrs().end()) {
7369 *petag
= iter
->second
;
7373 if (source_zone
.empty()) {
7374 set_copy_attrs(cb
.get_attrs(), attrs
, attrs_mod
);
7376 attrs
= cb
.get_attrs();
7379 if (copy_if_newer
) {
7380 uint64_t pg_ver
= 0;
7381 auto i
= attrs
.find(RGW_ATTR_PG_VER
);
7382 if (i
!= attrs
.end() && i
->second
.length() > 0) {
7383 bufferlist::iterator iter
= i
->second
.begin();
7385 ::decode(pg_ver
, iter
);
7386 } catch (buffer::error
& err
) {
7387 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl
;
7388 /* non critical error */
7391 set_mtime_weight
.init(set_mtime
, get_zone_short_id(), pg_ver
);
7394 #define MAX_COMPLETE_RETRY 100
7395 for (i
= 0; i
< MAX_COMPLETE_RETRY
; i
++) {
7396 ret
= cb
.complete(etag
, mtime
, set_mtime
, attrs
, delete_at
);
7400 if (copy_if_newer
&& cb
.is_canceled()) {
7401 ldout(cct
, 20) << "raced with another write of obj: " << dest_obj
<< dendl
;
7402 obj_ctx
.obj
.invalidate(dest_obj
); /* object was overwritten */
7403 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
7405 ldout(cct
, 0) << "ERROR: " << __func__
<< ": get_err_state() returned ret=" << ret
<< dendl
;
7408 dest_mtime_weight
.init(dest_state
);
7409 dest_mtime_weight
.high_precision
= high_precision_time
;
7410 if (!dest_state
->exists
||
7411 dest_mtime_weight
< set_mtime_weight
) {
7412 ldout(cct
, 20) << "retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
7415 ldout(cct
, 20) << "not retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
7421 if (i
== MAX_COMPLETE_RETRY
) {
7422 ldout(cct
, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl
;
7428 ret
= opstate
->set_state(RGWOpState::OPSTATE_COMPLETE
);
7430 ldout(cct
, 0) << "ERROR: failed to set opstate ret=" << ret
<< dendl
;
7437 if (copy_if_newer
&& ret
== -ERR_NOT_MODIFIED
) {
7441 RGWOpState::OpState state
;
7443 state
= RGWOpState::OPSTATE_ERROR
;
7445 state
= RGWOpState::OPSTATE_COMPLETE
;
7447 int r
= opstate
->set_state(state
);
7449 ldout(cct
, 0) << "ERROR: failed to set opstate r=" << ret
<< dendl
;
7457 int RGWRados::copy_obj_to_remote_dest(RGWObjState
*astate
,
7458 map
<string
, bufferlist
>& src_attrs
,
7459 RGWRados::Object::Read
& read_op
,
7460 const rgw_user
& user_id
,
7466 RGWRESTStreamWriteRequest
*out_stream_req
;
7468 int ret
= rest_master_conn
->put_obj_init(user_id
, dest_obj
, astate
->size
, src_attrs
, &out_stream_req
);
7470 delete out_stream_req
;
7474 ret
= read_op
.iterate(0, astate
->size
- 1, out_stream_req
->get_out_cb());
7478 ret
= rest_master_conn
->complete_request(out_stream_req
, etag
, mtime
);
7487 * dest_obj: the object to copy into
7488 * src_obj: the object to copy from
7489 * attrs: usage depends on attrs_mod parameter
7490 * attrs_mod: the modification mode of the attrs, may have the following values:
7491 * ATTRSMOD_NONE - the attributes of the source object will be
7492 * copied without modifications, attrs parameter is ignored;
7493 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
7494 * parameter, source object attributes are not copied;
7495 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
7496 * are overwritten by values contained in attrs parameter.
7497 * err: stores any errors resulting from the get of the original object
7498 * Returns: 0 on success, -ERR# otherwise.
7500 int RGWRados::copy_obj(RGWObjectCtx
& obj_ctx
,
7501 const rgw_user
& user_id
,
7502 const string
& client_id
,
7503 const string
& op_id
,
7505 const string
& source_zone
,
7508 RGWBucketInfo
& dest_bucket_info
,
7509 RGWBucketInfo
& src_bucket_info
,
7510 real_time
*src_mtime
,
7512 const real_time
*mod_ptr
,
7513 const real_time
*unmod_ptr
,
7514 bool high_precision_time
,
7515 const char *if_match
,
7516 const char *if_nomatch
,
7519 map
<string
, bufferlist
>& attrs
,
7520 RGWObjCategory category
,
7522 real_time delete_at
,
7525 ceph::buffer::list
*petag
,
7526 struct rgw_err
*err
,
7527 void (*progress_cb
)(off_t
, void *),
7528 void *progress_data
)
7532 rgw_obj shadow_obj
= dest_obj
;
7538 append_rand_alpha(cct
, dest_obj
.get_oid(), shadow_oid
, 32);
7539 shadow_obj
.init_ns(dest_obj
.bucket
, shadow_oid
, shadow_ns
);
7541 remote_dest
= !get_zonegroup().equals(dest_bucket_info
.zonegroup
);
7542 remote_src
= !get_zonegroup().equals(src_bucket_info
.zonegroup
);
7544 if (remote_src
&& remote_dest
) {
7545 ldout(cct
, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl
;
7549 ldout(cct
, 5) << "Copy object " << src_obj
.bucket
<< ":" << src_obj
.get_oid() << " => " << dest_obj
.bucket
<< ":" << dest_obj
.get_oid() << dendl
;
7551 if (remote_src
|| !source_zone
.empty()) {
7552 return fetch_remote_obj(obj_ctx
, user_id
, client_id
, op_id
, true, info
, source_zone
,
7553 dest_obj
, src_obj
, dest_bucket_info
, src_bucket_info
, src_mtime
, mtime
, mod_ptr
,
7554 unmod_ptr
, high_precision_time
,
7555 if_match
, if_nomatch
, attrs_mod
, copy_if_newer
, attrs
, category
,
7556 olh_epoch
, delete_at
, version_id
, ptag
, petag
, err
, progress_cb
, progress_data
);
7559 map
<string
, bufferlist
> src_attrs
;
7560 RGWRados::Object
src_op_target(this, src_bucket_info
, obj_ctx
, src_obj
);
7561 RGWRados::Object::Read
read_op(&src_op_target
);
7563 read_op
.conds
.mod_ptr
= mod_ptr
;
7564 read_op
.conds
.unmod_ptr
= unmod_ptr
;
7565 read_op
.conds
.high_precision_time
= high_precision_time
;
7566 read_op
.conds
.if_match
= if_match
;
7567 read_op
.conds
.if_nomatch
= if_nomatch
;
7568 read_op
.params
.attrs
= &src_attrs
;
7569 read_op
.params
.lastmod
= src_mtime
;
7570 read_op
.params
.obj_size
= &obj_size
;
7571 read_op
.params
.perr
= err
;
7573 ret
= read_op
.prepare();
7578 src_attrs
[RGW_ATTR_ACL
] = attrs
[RGW_ATTR_ACL
];
7579 src_attrs
.erase(RGW_ATTR_DELETE_AT
);
7581 set_copy_attrs(src_attrs
, attrs
, attrs_mod
);
7582 attrs
.erase(RGW_ATTR_ID_TAG
);
7583 attrs
.erase(RGW_ATTR_PG_VER
);
7584 attrs
.erase(RGW_ATTR_SOURCE_ZONE
);
7585 map
<string
, bufferlist
>::iterator cmp
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
7586 if (cmp
!= src_attrs
.end())
7587 attrs
[RGW_ATTR_COMPRESSION
] = cmp
->second
;
7589 RGWObjManifest manifest
;
7590 RGWObjState
*astate
= NULL
;
7592 ret
= get_obj_state(&obj_ctx
, src_bucket_info
, src_obj
, &astate
);
7597 vector
<rgw_raw_obj
> ref_objs
;
7600 /* dest is in a different zonegroup, copy it there */
7601 return copy_obj_to_remote_dest(astate
, attrs
, read_op
, user_id
, dest_obj
, mtime
);
7603 uint64_t max_chunk_size
;
7605 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, dest_obj
, &max_chunk_size
);
7607 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj
.bucket
<< dendl
;
7613 if (!get_obj_data_pool(src_bucket_info
.placement_rule
, src_obj
, &src_pool
)) {
7614 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << src_obj
<< dendl
;
7617 if (!get_obj_data_pool(dest_bucket_info
.placement_rule
, dest_obj
, &dest_pool
)) {
7618 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << dest_obj
<< dendl
;
7623 bool copy_data
= !astate
->has_manifest
|| (src_pool
!= dest_pool
);
7624 bool copy_first
= false;
7625 if (astate
->has_manifest
) {
7626 if (!astate
->manifest
.has_tail()) {
7629 uint64_t head_size
= astate
->manifest
.get_head_size();
7631 if (head_size
> 0) {
7632 if (head_size
> max_chunk_size
) {
7642 const auto iter
= attrs
.find(RGW_ATTR_ETAG
);
7643 if (iter
!= attrs
.end()) {
7644 *petag
= iter
->second
;
7648 if (copy_data
) { /* refcounting tail wouldn't work here, just copy the data */
7649 return copy_obj_data(obj_ctx
, dest_bucket_info
, read_op
, obj_size
- 1, dest_obj
, src_obj
,
7650 max_chunk_size
, mtime
, real_time(), attrs
, category
, olh_epoch
, delete_at
,
7651 version_id
, ptag
, petag
, err
);
7654 RGWObjManifest::obj_iterator miter
= astate
->manifest
.obj_begin();
7656 if (copy_first
) { // we need to copy first chunk, not increase refcount
7661 ret
= get_raw_obj_ref(miter
.get_location().get_raw_obj(this), &ref
);
7666 bool versioned_dest
= dest_bucket_info
.versioning_enabled();
7668 if (version_id
&& !version_id
->empty()) {
7669 versioned_dest
= true;
7670 dest_obj
.key
.set_instance(*version_id
);
7671 } else if (versioned_dest
) {
7672 gen_rand_obj_instance_name(&dest_obj
);
7675 bufferlist first_chunk
;
7677 bool copy_itself
= (dest_obj
== src_obj
);
7678 RGWObjManifest
*pmanifest
;
7679 ldout(cct
, 0) << "dest_obj=" << dest_obj
<< " src_obj=" << src_obj
<< " copy_itself=" << (int)copy_itself
<< dendl
;
7681 RGWRados::Object
dest_op_target(this, dest_bucket_info
, obj_ctx
, dest_obj
);
7682 RGWRados::Object::Write
write_op(&dest_op_target
);
7691 append_rand_alpha(cct
, tag
, tag
, 32);
7695 manifest
= astate
->manifest
;
7696 const rgw_bucket_placement
& tail_placement
= manifest
.get_tail_placement();
7697 if (tail_placement
.bucket
.name
.empty()) {
7698 manifest
.set_tail_placement(tail_placement
.placement_rule
, src_obj
.bucket
);
7701 for (; miter
!= astate
->manifest
.obj_end(); ++miter
) {
7702 ObjectWriteOperation op
;
7703 cls_refcount_get(op
, tag
, true);
7704 const rgw_raw_obj
& loc
= miter
.get_location().get_raw_obj(this);
7705 ref
.ioctx
.locator_set_key(loc
.loc
);
7707 ret
= ref
.ioctx
.operate(loc
.oid
, &op
);
7712 ref_objs
.push_back(loc
);
7715 pmanifest
= &manifest
;
7717 pmanifest
= &astate
->manifest
;
7718 /* don't send the object's tail for garbage collection */
7719 astate
->keep_tail
= true;
7723 ret
= read_op
.read(0, max_chunk_size
, first_chunk
);
7728 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, first_chunk
.length());
7730 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, 0);
7733 write_op
.meta
.data
= &first_chunk
;
7734 write_op
.meta
.manifest
= pmanifest
;
7735 write_op
.meta
.ptag
= &tag
;
7736 write_op
.meta
.owner
= dest_bucket_info
.owner
;
7737 write_op
.meta
.mtime
= mtime
;
7738 write_op
.meta
.flags
= PUT_OBJ_CREATE
;
7739 write_op
.meta
.category
= category
;
7740 write_op
.meta
.olh_epoch
= olh_epoch
;
7741 write_op
.meta
.delete_at
= delete_at
;
7743 ret
= write_op
.write_meta(obj_size
, astate
->accounted_size
, attrs
);
7752 vector
<rgw_raw_obj
>::iterator riter
;
7756 /* rollback reference */
7757 for (riter
= ref_objs
.begin(); riter
!= ref_objs
.end(); ++riter
) {
7758 ObjectWriteOperation op
;
7759 cls_refcount_put(op
, tag
, true);
7761 ref
.ioctx
.locator_set_key(riter
->loc
);
7763 int r
= ref
.ioctx
.operate(riter
->oid
, &op
);
7765 ldout(cct
, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter
<< dendl
;
7773 int RGWRados::copy_obj_data(RGWObjectCtx
& obj_ctx
,
7774 RGWBucketInfo
& dest_bucket_info
,
7775 RGWRados::Object::Read
& read_op
, off_t end
,
7778 uint64_t max_chunk_size
,
7780 real_time set_mtime
,
7781 map
<string
, bufferlist
>& attrs
,
7782 RGWObjCategory category
,
7784 real_time delete_at
,
7787 ceph::buffer::list
*petag
,
7788 struct rgw_err
*err
)
7790 bufferlist first_chunk
;
7791 RGWObjManifest manifest
;
7794 append_rand_alpha(cct
, tag
, tag
, 32);
7796 RGWPutObjProcessor_Atomic
processor(obj_ctx
,
7797 dest_bucket_info
, dest_obj
.bucket
, dest_obj
.get_oid(),
7798 cct
->_conf
->rgw_obj_stripe_size
, tag
, dest_bucket_info
.versioning_enabled());
7800 processor
.set_version_id(*version_id
);
7802 processor
.set_olh_epoch(olh_epoch
);
7803 int ret
= processor
.prepare(this, NULL
);
7811 ret
= read_op
.read(ofs
, end
, bl
);
7813 uint64_t read_len
= ret
;
7820 ret
= processor
.handle_data(bl
, ofs
, &handle
, &obj
, &again
);
7824 ret
= processor
.throttle_data(handle
, obj
, read_len
, false);
7830 } while (ofs
<= end
);
7833 auto iter
= attrs
.find(RGW_ATTR_ETAG
);
7834 if (iter
!= attrs
.end()) {
7835 bufferlist
& bl
= iter
->second
;
7836 etag
= string(bl
.c_str(), bl
.length());
7842 uint64_t accounted_size
;
7844 bool compressed
{false};
7845 RGWCompressionInfo cs_info
;
7846 ret
= rgw_compression_info_from_attrset(attrs
, compressed
, cs_info
);
7848 ldout(cct
, 0) << "ERROR: failed to read compression info" << dendl
;
7851 // pass original size if compressed
7852 accounted_size
= compressed
? cs_info
.orig_size
: ofs
;
7855 return processor
.complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
);
7858 bool RGWRados::is_meta_master()
7860 if (!get_zonegroup().is_master
) {
7864 return (get_zonegroup().master_zone
== zone_public_config
.id
);
7868 * Check to see if the bucket metadata could be synced
7869 * bucket: the bucket to check
7870 * Returns false if the bucket is not synced
7872 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket
& bucket
)
7875 /* no current period */
7876 if (current_period
.get_id().empty()) {
7880 /* zonegroup is not master zonegroup */
7881 if (!get_zonegroup().is_master
) {
7885 /* single zonegroup and a single zone */
7886 if (current_period
.is_single_zonegroup(cct
, this) && get_zonegroup().zones
.size() == 1) {
7890 /* zone is not master */
7891 if (get_zonegroup().master_zone
.compare(zone_public_config
.id
) != 0) {
7898 int RGWRados::check_bucket_empty(RGWBucketInfo
& bucket_info
)
7900 std::map
<string
, rgw_bucket_dir_entry
> ent_map
;
7901 rgw_obj_index_key marker
;
7906 #define NUM_ENTRIES 1000
7907 int r
= cls_bucket_list(bucket_info
, RGW_NO_SHARD
, marker
, prefix
, NUM_ENTRIES
, true, ent_map
,
7908 &is_truncated
, &marker
);
7913 std::map
<string
, rgw_bucket_dir_entry
>::iterator eiter
;
7914 for (eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
7917 if (rgw_obj_key::oid_to_key_in_ns(eiter
->second
.key
.name
, &obj
, ns
))
7920 } while (is_truncated
);
7926 * bucket: the name of the bucket to delete
7927 * Returns 0 on success, -ERR# otherwise.
7929 int RGWRados::delete_bucket(RGWBucketInfo
& bucket_info
, RGWObjVersionTracker
& objv_tracker
, bool check_empty
)
7931 const rgw_bucket
& bucket
= bucket_info
.bucket
;
7932 librados::IoCtx index_ctx
;
7933 map
<int, string
> bucket_objs
;
7934 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
7939 r
= check_bucket_empty(bucket_info
);
7945 r
= rgw_bucket_delete_bucket_obj(this, bucket
.tenant
, bucket
.name
, objv_tracker
);
7949 /* if the bucket is not synced we can remove the meta file */
7950 if (!is_syncing_bucket_meta(bucket
)) {
7951 RGWObjVersionTracker objv_tracker
;
7952 string entry
= bucket
.get_key();
7953 r
= rgw_bucket_instance_remove_entry(this, entry
, &objv_tracker
);
7957 /* remove bucket index objects*/
7958 map
<int, string
>::const_iterator biter
;
7959 for (biter
= bucket_objs
.begin(); biter
!= bucket_objs
.end(); ++biter
) {
7960 index_ctx
.remove(biter
->second
);
7966 int RGWRados::set_bucket_owner(rgw_bucket
& bucket
, ACLOwner
& owner
)
7969 map
<string
, bufferlist
> attrs
;
7970 RGWObjectCtx
obj_ctx(this);
7971 int r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
7973 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
7977 info
.owner
= owner
.get_id();
7979 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
7981 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
7989 int RGWRados::set_buckets_enabled(vector
<rgw_bucket
>& buckets
, bool enabled
)
7993 vector
<rgw_bucket
>::iterator iter
;
7995 for (iter
= buckets
.begin(); iter
!= buckets
.end(); ++iter
) {
7996 rgw_bucket
& bucket
= *iter
;
7998 ldout(cct
, 20) << "enabling bucket name=" << bucket
.name
<< dendl
;
8000 ldout(cct
, 20) << "disabling bucket name=" << bucket
.name
<< dendl
;
8003 map
<string
, bufferlist
> attrs
;
8004 RGWObjectCtx
obj_ctx(this);
8005 int r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
8007 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
8012 info
.flags
&= ~BUCKET_SUSPENDED
;
8014 info
.flags
|= BUCKET_SUSPENDED
;
8017 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
8019 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
8027 int RGWRados::bucket_suspended(rgw_bucket
& bucket
, bool *suspended
)
8029 RGWBucketInfo bucket_info
;
8030 RGWObjectCtx
obj_ctx(this);
8031 int ret
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, bucket_info
, NULL
);
8036 *suspended
= ((bucket_info
.flags
& BUCKET_SUSPENDED
) != 0);
8040 int RGWRados::Object::complete_atomic_modification()
8042 if (!state
->has_manifest
|| state
->keep_tail
)
8045 cls_rgw_obj_chain chain
;
8046 store
->update_gc_chain(obj
, state
->manifest
, &chain
);
8048 if (chain
.empty()) {
8052 string tag
= state
->obj_tag
.to_str();
8053 return store
->gc
->send_chain(chain
, tag
, false); // do it async
8056 void RGWRados::update_gc_chain(rgw_obj
& head_obj
, RGWObjManifest
& manifest
, cls_rgw_obj_chain
*chain
)
8058 RGWObjManifest::obj_iterator iter
;
8059 rgw_raw_obj raw_head
;
8060 obj_to_raw(manifest
.get_head_placement_rule(), head_obj
, &raw_head
);
8061 for (iter
= manifest
.obj_begin(); iter
!= manifest
.obj_end(); ++iter
) {
8062 const rgw_raw_obj
& mobj
= iter
.get_location().get_raw_obj(this);
8063 if (mobj
== raw_head
)
8065 cls_rgw_obj_key
key(mobj
.oid
);
8066 chain
->push_obj(mobj
.pool
.to_str(), key
, mobj
.loc
);
8070 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain
& chain
, const string
& tag
, bool sync
)
8072 return gc
->send_chain(chain
, tag
, sync
);
8075 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
, string
& bucket_oid
)
8077 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8078 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
8082 if (bucket
.bucket_id
.empty()) {
8083 ldout(cct
, 0) << "ERROR: empty bucket id for bucket operation" << dendl
;
8087 bucket_oid
= dir_oid_prefix
;
8088 bucket_oid
.append(bucket
.bucket_id
);
8093 int RGWRados::open_bucket_index_base(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8094 string
& bucket_oid_base
) {
8095 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8096 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
8100 if (bucket
.bucket_id
.empty()) {
8101 ldout(cct
, 0) << "ERROR: empty bucket_id for bucket operation" << dendl
;
8105 bucket_oid_base
= dir_oid_prefix
;
8106 bucket_oid_base
.append(bucket
.bucket_id
);
8112 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8113 map
<int, string
>& bucket_objs
, int shard_id
, map
<int, string
> *bucket_instance_ids
) {
8114 string bucket_oid_base
;
8115 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8120 get_bucket_index_objects(bucket_oid_base
, bucket_info
.num_shards
, bucket_objs
, shard_id
);
8121 if (bucket_instance_ids
) {
8122 get_bucket_instance_ids(bucket_info
, shard_id
, bucket_instance_ids
);
8127 template<typename T
>
8128 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8129 map
<int, string
>& oids
, map
<int, T
>& bucket_objs
,
8130 int shard_id
, map
<int, string
> *bucket_instance_ids
)
8132 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, bucket_instance_ids
);
8136 map
<int, string
>::const_iterator iter
= oids
.begin();
8137 for (; iter
!= oids
.end(); ++iter
) {
8138 bucket_objs
[iter
->first
] = T();
8143 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8144 const string
& obj_key
, string
*bucket_obj
, int *shard_id
)
8146 string bucket_oid_base
;
8147 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8151 RGWObjectCtx
obj_ctx(this);
8153 ret
= get_bucket_index_object(bucket_oid_base
, obj_key
, bucket_info
.num_shards
,
8154 (RGWBucketInfo::BIShardsHashType
)bucket_info
.bucket_index_shard_hash_type
, bucket_obj
, shard_id
);
8156 ldout(cct
, 10) << "get_bucket_index_object() returned ret=" << ret
<< dendl
;
8162 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8163 int shard_id
, string
*bucket_obj
)
8165 string bucket_oid_base
;
8166 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8170 RGWObjectCtx
obj_ctx(this);
8172 get_bucket_index_object(bucket_oid_base
, bucket_info
.num_shards
,
8173 shard_id
, bucket_obj
);
8177 static void accumulate_raw_stats(const rgw_bucket_dir_header
& header
,
8178 map
<RGWObjCategory
, RGWStorageStats
>& stats
)
8180 for (const auto& pair
: header
.stats
) {
8181 const RGWObjCategory category
= static_cast<RGWObjCategory
>(pair
.first
);
8182 const rgw_bucket_category_stats
& header_stats
= pair
.second
;
8184 RGWStorageStats
& s
= stats
[category
];
8186 s
.category
= category
;
8187 s
.size
+= header_stats
.total_size
;
8188 s
.size_rounded
+= header_stats
.total_size_rounded
;
8189 s
.size_utilized
+= header_stats
.actual_size
;
8190 s
.num_objects
+= header_stats
.num_entries
;
8194 int RGWRados::bucket_check_index(RGWBucketInfo
& bucket_info
,
8195 map
<RGWObjCategory
, RGWStorageStats
> *existing_stats
,
8196 map
<RGWObjCategory
, RGWStorageStats
> *calculated_stats
)
8198 librados::IoCtx index_ctx
;
8199 // key - bucket index object id
8200 // value - bucket index check OP returned result with the given bucket index object (shard)
8201 map
<int, string
> oids
;
8202 map
<int, struct rgw_cls_check_index_ret
> bucket_objs_ret
;
8203 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, bucket_objs_ret
);
8207 ret
= CLSRGWIssueBucketCheck(index_ctx
, oids
, bucket_objs_ret
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8211 // Aggregate results (from different shards if there is any)
8212 map
<int, struct rgw_cls_check_index_ret
>::iterator iter
;
8213 for (iter
= bucket_objs_ret
.begin(); iter
!= bucket_objs_ret
.end(); ++iter
) {
8214 accumulate_raw_stats(iter
->second
.existing_header
, *existing_stats
);
8215 accumulate_raw_stats(iter
->second
.calculated_header
, *calculated_stats
);
8221 int RGWRados::bucket_rebuild_index(RGWBucketInfo
& bucket_info
)
8223 librados::IoCtx index_ctx
;
8224 map
<int, string
> bucket_objs
;
8225 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
8229 return CLSRGWIssueBucketRebuild(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8233 int RGWRados::defer_gc(void *ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
)
8235 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
8236 std::string oid
, key
;
8237 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
8241 RGWObjState
*state
= NULL
;
8243 int r
= get_obj_state(rctx
, bucket_info
, obj
, &state
, false);
8247 if (!state
->is_atomic
) {
8248 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not deferring gc operation" << dendl
;
8252 if (state
->obj_tag
.length() == 0) {// check for backward compatibility
8253 ldout(cct
, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl
;
8257 string tag
= state
->obj_tag
.c_str();
8259 ldout(cct
, 0) << "defer chain tag=" << tag
<< dendl
;
8261 return gc
->defer_chain(tag
, false);
8264 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation
& op
)
8266 list
<string
> prefixes
;
8267 prefixes
.push_back(RGW_ATTR_OLH_PREFIX
);
8268 cls_rgw_remove_obj(op
, prefixes
);
8271 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation
& op
, const string
& prefix
, bool fail_if_exist
)
8273 cls_rgw_obj_check_attrs_prefix(op
, prefix
, fail_if_exist
);
8276 void RGWRados::cls_obj_check_mtime(ObjectOperation
& op
, const real_time
& mtime
, bool high_precision_time
, RGWCheckMTimeType type
)
8278 cls_rgw_obj_check_mtime(op
, mtime
, high_precision_time
, type
);
8284 * bucket: name of the bucket storing the object
8285 * obj: name of the object to delete
8286 * Returns: 0 on success, -ERR# otherwise.
8288 int RGWRados::Object::Delete::delete_obj()
8290 RGWRados
*store
= target
->get_store();
8291 rgw_obj
& src_obj
= target
->get_obj();
8292 const string
& instance
= src_obj
.key
.instance
;
8293 rgw_obj obj
= src_obj
;
8295 if (instance
== "null") {
8296 obj
.key
.instance
.clear();
8299 bool explicit_marker_version
= (!params
.marker_version_id
.empty());
8301 if (params
.versioning_status
& BUCKET_VERSIONED
|| explicit_marker_version
) {
8302 if (instance
.empty() || explicit_marker_version
) {
8303 rgw_obj marker
= obj
;
8305 if (!params
.marker_version_id
.empty()) {
8306 if (params
.marker_version_id
!= "null") {
8307 marker
.key
.set_instance(params
.marker_version_id
);
8309 } else if ((params
.versioning_status
& BUCKET_VERSIONS_SUSPENDED
) == 0) {
8310 store
->gen_rand_obj_instance_name(&marker
);
8313 result
.version_id
= marker
.key
.instance
;
8314 result
.delete_marker
= true;
8316 struct rgw_bucket_dir_entry_meta meta
;
8318 meta
.owner
= params
.obj_owner
.get_id().to_str();
8319 meta
.owner_display_name
= params
.obj_owner
.get_display_name();
8321 if (real_clock::is_zero(params
.mtime
)) {
8322 meta
.mtime
= real_clock::now();
8324 meta
.mtime
= params
.mtime
;
8327 int r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), marker
, true, &meta
, params
.olh_epoch
, params
.unmod_since
, params
.high_precision_time
);
8332 rgw_bucket_dir_entry dirent
;
8334 int r
= store
->bi_get_instance(target
->get_bucket_info(), obj
, &dirent
);
8338 result
.delete_marker
= dirent
.is_delete_marker();
8339 r
= store
->unlink_obj_instance(target
->get_ctx(), target
->get_bucket_info(), obj
, params
.olh_epoch
);
8343 result
.version_id
= instance
;
8347 int r
= target
->get_bucket_shard(&bs
);
8349 ldout(store
->ctx(), 5) << "failed to get BucketShard object: r=" << r
<< dendl
;
8353 r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
8355 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
8363 int r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
8369 r
= target
->get_state(&state
, false);
8373 ObjectWriteOperation op
;
8375 if (!real_clock::is_zero(params
.unmod_since
)) {
8376 struct timespec ctime
= ceph::real_clock::to_timespec(state
->mtime
);
8377 struct timespec unmod
= ceph::real_clock::to_timespec(params
.unmod_since
);
8378 if (!params
.high_precision_time
) {
8383 ldout(store
->ctx(), 10) << "If-UnModified-Since: " << params
.unmod_since
<< " Last-Modified: " << ctime
<< dendl
;
8384 if (ctime
> unmod
) {
8385 return -ERR_PRECONDITION_FAILED
;
8388 /* only delete object if mtime is less than or equal to params.unmod_since */
8389 store
->cls_obj_check_mtime(op
, params
.unmod_since
, params
.high_precision_time
, CLS_RGW_CHECK_TIME_MTIME_LE
);
8391 uint64_t obj_size
= state
->size
;
8393 if (!real_clock::is_zero(params
.expiration_time
)) {
8395 real_time delete_at
;
8397 if (state
->get_attr(RGW_ATTR_DELETE_AT
, bl
)) {
8399 bufferlist::iterator iter
= bl
.begin();
8400 ::decode(delete_at
, iter
);
8401 } catch (buffer::error
& err
) {
8402 ldout(store
->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl
;
8406 if (params
.expiration_time
!= delete_at
) {
8407 return -ERR_PRECONDITION_FAILED
;
8410 return -ERR_PRECONDITION_FAILED
;
8414 if (!state
->exists
) {
8415 target
->invalidate_state();
8419 r
= target
->prepare_atomic_modification(op
, false, NULL
, NULL
, NULL
, true);
8423 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
8425 RGWRados::Bucket
bop(store
, bucket_info
);
8426 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
8428 index_op
.set_bilog_flags(params
.bilog_flags
);
8431 r
= index_op
.prepare(CLS_RGW_OP_DEL
, &state
->write_tag
);
8435 store
->remove_rgw_head_obj(op
);
8436 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
8437 bool need_invalidate
= false;
8438 if (r
== -ECANCELED
) {
8439 /* raced with another operation, we can regard it as removed */
8440 need_invalidate
= true;
8443 bool removed
= (r
>= 0);
8445 int64_t poolid
= ref
.ioctx
.get_id();
8447 tombstone_cache_t
*obj_tombstone_cache
= store
->get_tombstone_cache();
8448 if (obj_tombstone_cache
) {
8449 tombstone_entry entry
{*state
};
8450 obj_tombstone_cache
->add(obj
, entry
);
8452 r
= index_op
.complete_del(poolid
, ref
.ioctx
.get_last_version(), state
->mtime
, params
.remove_objs
);
8454 int ret
= index_op
.cancel();
8456 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret
<< dendl
;
8460 int ret
= target
->complete_atomic_modification();
8462 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret
<< dendl
;
8464 /* other than that, no need to propagate error */
8467 if (need_invalidate
) {
8468 target
->invalidate_state();
8474 /* update quota cache */
8475 store
->quota_handler
->update_stats(params
.bucket_owner
, obj
.bucket
, -1, 0, obj_size
);
8480 int RGWRados::delete_obj(RGWObjectCtx
& obj_ctx
,
8481 const RGWBucketInfo
& bucket_info
,
8483 int versioning_status
,
8484 uint16_t bilog_flags
,
8485 const real_time
& expiration_time
)
8487 RGWRados::Object
del_target(this, bucket_info
, obj_ctx
, obj
);
8488 RGWRados::Object::Delete
del_op(&del_target
);
8490 del_op
.params
.bucket_owner
= bucket_info
.owner
;
8491 del_op
.params
.versioning_status
= versioning_status
;
8492 del_op
.params
.bilog_flags
= bilog_flags
;
8493 del_op
.params
.expiration_time
= expiration_time
;
8495 return del_op
.delete_obj();
8498 int RGWRados::delete_raw_obj(const rgw_raw_obj
& obj
)
8502 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
8507 ObjectWriteOperation op
;
8510 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
8517 int RGWRados::delete_system_obj(rgw_raw_obj
& obj
, RGWObjVersionTracker
*objv_tracker
)
8520 ldout(cct
, 1) << "delete_system_obj got empty object name "
8521 << obj
<< ", returning EINVAL" << dendl
;
8526 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
8531 ObjectWriteOperation op
;
8534 objv_tracker
->prepare_op_for_write(&op
);
8538 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
8545 int RGWRados::delete_obj_index(const rgw_obj
& obj
)
8547 std::string oid
, key
;
8548 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
8550 RGWObjectCtx
obj_ctx(this);
8552 RGWBucketInfo bucket_info
;
8553 int ret
= get_bucket_instance_info(obj_ctx
, obj
.bucket
, bucket_info
, NULL
, NULL
);
8555 ldout(cct
, 0) << "ERROR: " << __func__
<< "() get_bucket_instance_info(bucket=" << obj
.bucket
<< ") returned ret=" << ret
<< dendl
;
8559 RGWRados::Bucket
bop(this, bucket_info
);
8560 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
8562 real_time removed_mtime
;
8563 int r
= index_op
.complete_del(-1 /* pool */, 0, removed_mtime
, NULL
);
8568 static void generate_fake_tag(RGWRados
*store
, map
<string
, bufferlist
>& attrset
, RGWObjManifest
& manifest
, bufferlist
& manifest_bl
, bufferlist
& tag_bl
)
8572 RGWObjManifest::obj_iterator mi
= manifest
.obj_begin();
8573 if (mi
!= manifest
.obj_end()) {
8574 if (manifest
.has_tail()) // first object usually points at the head, let's skip to a more unique part
8576 tag
= mi
.get_location().get_raw_obj(store
).oid
;
8580 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
8581 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
8583 hash
.Update((const byte
*)manifest_bl
.c_str(), manifest_bl
.length());
8585 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_ETAG
);
8586 if (iter
!= attrset
.end()) {
8587 bufferlist
& bl
= iter
->second
;
8588 hash
.Update((const byte
*)bl
.c_str(), bl
.length());
8592 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
8593 tag
.append(md5_str
);
8595 ldout(store
->ctx(), 10) << "generate_fake_tag new tag=" << tag
<< dendl
;
8597 tag_bl
.append(tag
.c_str(), tag
.size() + 1);
8600 static bool is_olh(map
<string
, bufferlist
>& attrs
)
8602 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_INFO
);
8603 return (iter
!= attrs
.end());
8606 static bool has_olh_tag(map
<string
, bufferlist
>& attrs
)
8608 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_ID_TAG
);
8609 return (iter
!= attrs
.end());
8612 int RGWRados::get_olh_target_state(RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8613 RGWObjState
*olh_state
, RGWObjState
**target_state
)
8615 assert(olh_state
->is_olh
);
8618 int r
= RGWRados::follow_olh(bucket_info
, obj_ctx
, olh_state
, obj
, &target
); /* might return -EAGAIN */
8622 r
= get_obj_state(&obj_ctx
, bucket_info
, target
, target_state
, false);
8630 int RGWRados::get_system_obj_state_impl(RGWObjectCtx
*rctx
, rgw_raw_obj
& obj
, RGWRawObjState
**state
, RGWObjVersionTracker
*objv_tracker
)
8636 RGWRawObjState
*s
= rctx
->raw
.get_state(obj
);
8637 ldout(cct
, 20) << "get_system_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
8645 int r
= raw_obj_stat(obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), objv_tracker
);
8648 s
->has_attrs
= true;
8649 s
->mtime
= real_time();
8656 s
->has_attrs
= true;
8657 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
8659 if (s
->obj_tag
.length())
8660 ldout(cct
, 20) << "get_system_obj_state: setting s->obj_tag to "
8661 << string(s
->obj_tag
.c_str(), s
->obj_tag
.length()) << dendl
;
8663 ldout(cct
, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl
;
8668 int RGWRados::get_system_obj_state(RGWObjectCtx
*rctx
, rgw_raw_obj
& obj
, RGWRawObjState
**state
, RGWObjVersionTracker
*objv_tracker
)
8673 ret
= get_system_obj_state_impl(rctx
, obj
, state
, objv_tracker
);
8674 } while (ret
== -EAGAIN
);
8679 int RGWRados::get_obj_state_impl(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8680 RGWObjState
**state
, bool follow_olh
, bool assume_noent
)
8686 bool need_follow_olh
= follow_olh
&& obj
.key
.instance
.empty();
8688 RGWObjState
*s
= rctx
->obj
.get_state(obj
);
8689 ldout(cct
, 20) << "get_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
8692 if (s
->is_olh
&& need_follow_olh
) {
8693 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
8700 rgw_raw_obj raw_obj
;
8701 obj_to_raw(bucket_info
.placement_rule
, obj
, &raw_obj
);
8705 if (!assume_noent
) {
8706 r
= RGWRados::raw_obj_stat(raw_obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), NULL
);
8711 s
->has_attrs
= true;
8712 tombstone_entry entry
;
8713 if (obj_tombstone_cache
&& obj_tombstone_cache
->find(obj
, entry
)) {
8714 s
->mtime
= entry
.mtime
;
8715 s
->zone_short_id
= entry
.zone_short_id
;
8716 s
->pg_ver
= entry
.pg_ver
;
8717 ldout(cct
, 20) << __func__
<< "(): found obj in tombstone cache: obj=" << obj
8718 << " mtime=" << s
->mtime
<< " pgv=" << s
->pg_ver
<< dendl
;
8720 s
->mtime
= real_time();
8728 s
->has_attrs
= true;
8729 s
->accounted_size
= s
->size
;
8731 auto iter
= s
->attrset
.find(RGW_ATTR_COMPRESSION
);
8732 if (iter
!= s
->attrset
.end()) {
8733 // use uncompressed size for accounted_size
8735 RGWCompressionInfo info
;
8736 auto p
= iter
->second
.begin();
8738 s
->accounted_size
= info
.orig_size
;
8739 } catch (buffer::error
&) {
8740 dout(0) << "ERROR: could not decode compression info for object: " << obj
<< dendl
;
8745 iter
= s
->attrset
.find(RGW_ATTR_SHADOW_OBJ
);
8746 if (iter
!= s
->attrset
.end()) {
8747 bufferlist bl
= iter
->second
;
8748 bufferlist::iterator it
= bl
.begin();
8749 it
.copy(bl
.length(), s
->shadow_obj
);
8750 s
->shadow_obj
[bl
.length()] = '\0';
8752 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
8754 bufferlist manifest_bl
= s
->attrset
[RGW_ATTR_MANIFEST
];
8755 if (manifest_bl
.length()) {
8756 bufferlist::iterator miter
= manifest_bl
.begin();
8758 ::decode(s
->manifest
, miter
);
8759 s
->has_manifest
= true;
8760 s
->manifest
.set_head(bucket_info
.placement_rule
, obj
, s
->size
); /* patch manifest to reflect the head we just read, some manifests might be
8761 broken due to old bugs */
8762 s
->size
= s
->manifest
.get_obj_size();
8763 } catch (buffer::error
& err
) {
8764 ldout(cct
, 0) << "ERROR: couldn't decode manifest" << dendl
;
8767 ldout(cct
, 10) << "manifest: total_size = " << s
->manifest
.get_obj_size() << dendl
;
8768 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20) && s
->manifest
.has_explicit_objs()) {
8769 RGWObjManifest::obj_iterator mi
;
8770 for (mi
= s
->manifest
.obj_begin(); mi
!= s
->manifest
.obj_end(); ++mi
) {
8771 ldout(cct
, 20) << "manifest: ofs=" << mi
.get_ofs() << " loc=" << mi
.get_location().get_raw_obj(this) << dendl
;
8775 if (!s
->obj_tag
.length()) {
8777 * Uh oh, something's wrong, object with manifest should have tag. Let's
8778 * create one out of the manifest, would be unique
8780 generate_fake_tag(this, s
->attrset
, s
->manifest
, manifest_bl
, s
->obj_tag
);
8784 map
<string
, bufferlist
>::iterator aiter
= s
->attrset
.find(RGW_ATTR_PG_VER
);
8785 if (aiter
!= s
->attrset
.end()) {
8786 bufferlist
& pg_ver_bl
= aiter
->second
;
8787 if (pg_ver_bl
.length()) {
8788 bufferlist::iterator pgbl
= pg_ver_bl
.begin();
8790 ::decode(s
->pg_ver
, pgbl
);
8791 } catch (buffer::error
& err
) {
8792 ldout(cct
, 0) << "ERROR: couldn't decode pg ver attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
8796 aiter
= s
->attrset
.find(RGW_ATTR_SOURCE_ZONE
);
8797 if (aiter
!= s
->attrset
.end()) {
8798 bufferlist
& zone_short_id_bl
= aiter
->second
;
8799 if (zone_short_id_bl
.length()) {
8800 bufferlist::iterator zbl
= zone_short_id_bl
.begin();
8802 ::decode(s
->zone_short_id
, zbl
);
8803 } catch (buffer::error
& err
) {
8804 ldout(cct
, 0) << "ERROR: couldn't decode zone short id attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
8808 if (s
->obj_tag
.length())
8809 ldout(cct
, 20) << "get_obj_state: setting s->obj_tag to " << string(s
->obj_tag
.c_str(), s
->obj_tag
.length()) << dendl
;
8811 ldout(cct
, 20) << "get_obj_state: s->obj_tag was set empty" << dendl
;
8813 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
8814 * it exist, and not only if is_olh() returns true
8816 iter
= s
->attrset
.find(RGW_ATTR_OLH_ID_TAG
);
8817 if (iter
!= s
->attrset
.end()) {
8818 s
->olh_tag
= iter
->second
;
8821 if (is_olh(s
->attrset
)) {
8824 ldout(cct
, 20) << __func__
<< ": setting s->olh_tag to " << string(s
->olh_tag
.c_str(), s
->olh_tag
.length()) << dendl
;
8826 if (need_follow_olh
) {
8827 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
8834 int RGWRados::get_obj_state(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWObjState
**state
,
8835 bool follow_olh
, bool assume_noent
)
8840 ret
= get_obj_state_impl(rctx
, bucket_info
, obj
, state
, follow_olh
, assume_noent
);
8841 } while (ret
== -EAGAIN
);
8846 int RGWRados::Object::get_manifest(RGWObjManifest
**pmanifest
)
8848 RGWObjState
*astate
;
8849 int r
= get_state(&astate
, true);
8854 *pmanifest
= &astate
->manifest
;
8859 int RGWRados::Object::Read::get_attr(const char *name
, bufferlist
& dest
)
8862 int r
= source
->get_state(&state
, true);
8867 if (!state
->get_attr(name
, dest
))
8874 int RGWRados::Object::Stat::stat_async()
8876 RGWObjectCtx
& ctx
= source
->get_ctx();
8877 rgw_obj
& obj
= source
->get_obj();
8878 RGWRados
*store
= source
->get_store();
8880 RGWObjState
*s
= ctx
.obj
.get_state(obj
); /* calling this one directly because otherwise a sync request will be sent */
8884 result
.size
= s
->size
;
8885 result
.mtime
= ceph::real_clock::to_timespec(s
->mtime
);
8886 result
.attrs
= s
->attrset
;
8887 result
.has_manifest
= s
->has_manifest
;
8888 result
.manifest
= s
->manifest
;
8894 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
8896 int r
= store
->get_obj_head_ioctx(source
->get_bucket_info(), obj
, &state
.io_ctx
);
8901 librados::ObjectReadOperation op
;
8902 op
.stat2(&result
.size
, &result
.mtime
, NULL
);
8903 op
.getxattrs(&result
.attrs
, NULL
);
8904 state
.completion
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
8905 state
.io_ctx
.locator_set_key(loc
);
8906 r
= state
.io_ctx
.aio_operate(oid
, state
.completion
, &op
, NULL
);
8908 ldout(store
->ctx(), 5) << __func__
8909 << ": ERROR: aio_operate() returned ret=" << r
8918 int RGWRados::Object::Stat::wait()
8920 if (!state
.completion
) {
8924 state
.completion
->wait_for_safe();
8925 state
.ret
= state
.completion
->get_return_value();
8926 state
.completion
->release();
8928 if (state
.ret
!= 0) {
8935 int RGWRados::Object::Stat::finish()
8937 map
<string
, bufferlist
>::iterator iter
= result
.attrs
.find(RGW_ATTR_MANIFEST
);
8938 if (iter
!= result
.attrs
.end()) {
8939 bufferlist
& bl
= iter
->second
;
8940 bufferlist::iterator biter
= bl
.begin();
8942 ::decode(result
.manifest
, biter
);
8943 } catch (buffer::error
& err
) {
8944 RGWRados
*store
= source
->get_store();
8945 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< ": failed to decode manifest" << dendl
;
8948 result
.has_manifest
= true;
8955 * Get the attributes for an object.
8956 * bucket: name of the bucket holding the object.
8957 * obj: name of the object
8958 * name: name of the attr to retrieve
8959 * dest: bufferlist to store the result in
8960 * Returns: 0 on success, -ERR# otherwise.
8962 int RGWRados::system_obj_get_attr(rgw_raw_obj
& obj
, const char *name
, bufferlist
& dest
)
8966 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
8971 ObjectReadOperation op
;
8974 op
.getxattr(name
, &dest
, &rval
);
8976 r
= ref
.ioctx
.operate(ref
.oid
, &op
, NULL
);
8983 int RGWRados::append_atomic_test(RGWObjectCtx
*rctx
,
8984 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
8985 ObjectOperation
& op
, RGWObjState
**pstate
)
8990 int r
= get_obj_state(rctx
, bucket_info
, obj
, pstate
, false);
8994 RGWObjState
*state
= *pstate
;
8996 if (!state
->is_atomic
) {
8997 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not appending atomic test" << dendl
;
9001 if (state
->obj_tag
.length() > 0 && !state
->fake_tag
) {// check for backward compatibility
9002 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
9004 ldout(cct
, 20) << "state->obj_tag is empty, not appending atomic test" << dendl
;
9009 int RGWRados::Object::get_state(RGWObjState
**pstate
, bool follow_olh
, bool assume_noent
)
9011 return store
->get_obj_state(&ctx
, bucket_info
, obj
, pstate
, follow_olh
, assume_noent
);
9014 void RGWRados::Object::invalidate_state()
9016 ctx
.obj
.invalidate(obj
);
9019 void RGWRados::SystemObject::invalidate_state()
9021 ctx
.raw
.invalidate(obj
);
9024 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation
& op
, bool reset_obj
, const string
*ptag
,
9025 const char *if_match
, const char *if_nomatch
, bool removal_op
)
9027 int r
= get_state(&state
, false);
9031 bool need_guard
= (state
->has_manifest
|| (state
->obj_tag
.length() != 0) ||
9032 if_match
!= NULL
|| if_nomatch
!= NULL
) &&
9035 if (!state
->is_atomic
) {
9036 ldout(store
->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state
<< dendl
;
9040 store
->remove_rgw_head_obj(op
); // we're not dropping reference here, actually removing object
9047 /* first verify that the object wasn't replaced under */
9048 if (if_nomatch
== NULL
|| strcmp(if_nomatch
, "*") != 0) {
9049 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
9050 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9054 if (strcmp(if_match
, "*") == 0) {
9055 // test the object is existing
9056 if (!state
->exists
) {
9057 return -ERR_PRECONDITION_FAILED
;
9061 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
9062 strncmp(if_match
, bl
.c_str(), bl
.length()) != 0) {
9063 return -ERR_PRECONDITION_FAILED
;
9069 if (strcmp(if_nomatch
, "*") == 0) {
9070 // test the object is NOT existing
9071 if (state
->exists
) {
9072 return -ERR_PRECONDITION_FAILED
;
9076 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
9077 strncmp(if_nomatch
, bl
.c_str(), bl
.length()) == 0) {
9078 return -ERR_PRECONDITION_FAILED
;
9085 if (state
->exists
) {
9087 store
->remove_rgw_head_obj(op
);
9094 /* the object is being removed, no need to update its tag */
9099 state
->write_tag
= *ptag
;
9101 append_rand_alpha(store
->ctx(), state
->write_tag
, state
->write_tag
, 32);
9104 bl
.append(state
->write_tag
.c_str(), state
->write_tag
.size() + 1);
9106 ldout(store
->ctx(), 10) << "setting object write_tag=" << state
->write_tag
<< dendl
;
9108 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
9113 int RGWRados::system_obj_set_attr(void *ctx
, rgw_raw_obj
& obj
, const char *name
, bufferlist
& bl
,
9114 RGWObjVersionTracker
*objv_tracker
)
9116 map
<string
, bufferlist
> attrs
;
9118 return system_obj_set_attrs(ctx
, obj
, attrs
, NULL
, objv_tracker
);
9121 int RGWRados::system_obj_set_attrs(void *ctx
, rgw_raw_obj
& obj
,
9122 map
<string
, bufferlist
>& attrs
,
9123 map
<string
, bufferlist
>* rmattrs
,
9124 RGWObjVersionTracker
*objv_tracker
)
9128 int r
= get_system_obj_ref(obj
, &ref
, &pool
);
9132 ObjectWriteOperation op
;
9135 objv_tracker
->prepare_op_for_write(&op
);
9138 map
<string
, bufferlist
>::iterator iter
;
9140 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
9141 const string
& name
= iter
->first
;
9142 op
.rmxattr(name
.c_str());
9146 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
9147 const string
& name
= iter
->first
;
9148 bufferlist
& bl
= iter
->second
;
9153 op
.setxattr(name
.c_str(), bl
);
9161 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9169 * Set an attr on an object.
9170 * bucket: name of the bucket holding the object
9171 * obj: name of the object to set the attr on
9172 * name: the attr to set
9173 * bl: the contents of the attr
9174 * Returns: 0 on success, -ERR# otherwise.
9176 int RGWRados::set_attr(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, const char *name
, bufferlist
& bl
)
9178 map
<string
, bufferlist
> attrs
;
9180 return set_attrs(ctx
, bucket_info
, obj
, attrs
, NULL
);
9183 int RGWRados::set_attrs(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
,
9184 map
<string
, bufferlist
>& attrs
,
9185 map
<string
, bufferlist
>* rmattrs
)
9188 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
9192 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
9194 ObjectWriteOperation op
;
9195 RGWObjState
*state
= NULL
;
9197 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &state
);
9201 map
<string
, bufferlist
>::iterator iter
;
9203 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
9204 const string
& name
= iter
->first
;
9205 op
.rmxattr(name
.c_str());
9209 const rgw_bucket
& bucket
= obj
.bucket
;
9211 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
9212 const string
& name
= iter
->first
;
9213 bufferlist
& bl
= iter
->second
;
9218 op
.setxattr(name
.c_str(), bl
);
9220 if (name
.compare(RGW_ATTR_DELETE_AT
) == 0) {
9225 rgw_obj_index_key obj_key
;
9226 obj
.key
.get_index_key(&obj_key
);
9228 objexp_hint_add(ts
, bucket
.tenant
, bucket
.name
, bucket
.bucket_id
, obj_key
);
9229 } catch (buffer::error
& err
) {
9230 ldout(cct
, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT
<< " attr" << dendl
;
9238 RGWObjectCtx
obj_ctx(this);
9241 RGWRados::Bucket
bop(this, bucket_info
);
9242 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
9246 append_rand_alpha(cct
, tag
, tag
, 32);
9247 state
->write_tag
= tag
;
9248 r
= index_op
.prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
9253 bl
.append(tag
.c_str(), tag
.size() + 1);
9255 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
9258 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9261 bufferlist acl_bl
= attrs
[RGW_ATTR_ACL
];
9262 bufferlist etag_bl
= attrs
[RGW_ATTR_ETAG
];
9263 bufferlist content_type_bl
= attrs
[RGW_ATTR_CONTENT_TYPE
];
9264 string
etag(etag_bl
.c_str(), etag_bl
.length());
9265 string
content_type(content_type_bl
.c_str(), content_type_bl
.length());
9266 uint64_t epoch
= ref
.ioctx
.get_last_version();
9267 int64_t poolid
= ref
.ioctx
.get_id();
9268 real_time mtime
= real_clock::now();
9269 r
= index_op
.complete(poolid
, epoch
, state
->size
, state
->accounted_size
,
9270 mtime
, etag
, content_type
, &acl_bl
,
9271 RGW_OBJ_CATEGORY_MAIN
, NULL
);
9273 int ret
= index_op
.cancel();
9275 ldout(cct
, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret
<< dendl
;
9283 state
->obj_tag
.swap(bl
);
9285 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
9286 state
->attrset
.erase(iter
->first
);
9289 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
9290 state
->attrset
[iter
->first
] = iter
->second
;
9298 * Get data about an object out of RADOS and into memory.
9299 * bucket: name of the bucket the object is in.
9300 * obj: name/key of the object to read
9301 * data: if get_data==true, this pointer will be set
9302 * to an address containing the object's data/value
9303 * attrs: if non-NULL, the pointed-to map will contain
9304 * all the attrs of the object when this function returns
9305 * mod_ptr: if non-NULL, compares the object's mtime to *mod_ptr,
9306 * and if mtime is smaller it fails.
9307 * unmod_ptr: if non-NULL, compares the object's mtime to *unmod_ptr,
9308 * and if mtime is >= it fails.
9309 * if_match/nomatch: if non-NULL, compares the object's etag attr
9310 * to the string and, if it doesn't/does match, fails out.
9311 * get_data: if true, the object's data/value will be read out, otherwise not
9312 * err: Many errors will result in this structure being filled
9313 with extra information on the error.
9314 * Returns: -ERR# on failure, otherwise
9315 * (if get_data==true) length of read data,
9316 * (if get_data==false) length of the object
9318 // P3 XXX get_data is not seen used anywhere.
9319 int RGWRados::Object::Read::prepare()
9321 RGWRados
*store
= source
->get_store();
9322 CephContext
*cct
= store
->ctx();
9326 map
<string
, bufferlist
>::iterator iter
;
9328 RGWObjState
*astate
;
9329 int r
= source
->get_state(&astate
, true);
9333 if (!astate
->exists
) {
9337 const RGWBucketInfo
& bucket_info
= source
->get_bucket_info();
9339 state
.obj
= astate
->obj
;
9340 store
->obj_to_raw(bucket_info
.placement_rule
, state
.obj
, &state
.head_obj
);
9342 r
= store
->get_obj_head_ioctx(bucket_info
, state
.obj
, &state
.io_ctx
);
9347 *params
.attrs
= astate
->attrset
;
9348 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20)) {
9349 for (iter
= params
.attrs
->begin(); iter
!= params
.attrs
->end(); ++iter
) {
9350 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
9355 /* Convert all times to GMT to make them compatible */
9356 if (conds
.mod_ptr
|| conds
.unmod_ptr
) {
9357 obj_time_weight src_weight
;
9358 src_weight
.init(astate
);
9359 src_weight
.high_precision
= conds
.high_precision_time
;
9361 obj_time_weight dest_weight
;
9362 dest_weight
.high_precision
= conds
.high_precision_time
;
9364 if (conds
.mod_ptr
) {
9365 dest_weight
.init(*conds
.mod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
9366 ldout(cct
, 10) << "If-Modified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
9367 if (!(dest_weight
< src_weight
)) {
9368 return -ERR_NOT_MODIFIED
;
9372 if (conds
.unmod_ptr
) {
9373 dest_weight
.init(*conds
.unmod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
9374 ldout(cct
, 10) << "If-UnModified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
9375 if (dest_weight
< src_weight
) {
9376 return -ERR_PRECONDITION_FAILED
;
9380 if (conds
.if_match
|| conds
.if_nomatch
) {
9381 r
= get_attr(RGW_ATTR_ETAG
, etag
);
9385 if (conds
.if_match
) {
9386 string if_match_str
= rgw_string_unquote(conds
.if_match
);
9387 ldout(cct
, 10) << "ETag: " << etag
.c_str() << " " << " If-Match: " << if_match_str
<< dendl
;
9388 if (if_match_str
.compare(etag
.c_str()) != 0) {
9389 return -ERR_PRECONDITION_FAILED
;
9393 if (conds
.if_nomatch
) {
9394 string if_nomatch_str
= rgw_string_unquote(conds
.if_nomatch
);
9395 ldout(cct
, 10) << "ETag: " << etag
.c_str() << " " << " If-NoMatch: " << if_nomatch_str
<< dendl
;
9396 if (if_nomatch_str
.compare(etag
.c_str()) == 0) {
9397 return -ERR_NOT_MODIFIED
;
9402 if (params
.obj_size
)
9403 *params
.obj_size
= astate
->size
;
9405 *params
.lastmod
= astate
->mtime
;
9410 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size
, int64_t &ofs
, int64_t &end
)
9417 } else if (end
< 0) {
9422 if (ofs
>= (off_t
)obj_size
) {
9425 if (end
>= (off_t
)obj_size
) {
9432 int RGWRados::SystemObject::get_state(RGWRawObjState
**pstate
, RGWObjVersionTracker
*objv_tracker
)
9434 return store
->get_system_obj_state(&ctx
, obj
, pstate
, objv_tracker
);
9437 int RGWRados::stat_system_obj(RGWObjectCtx
& obj_ctx
,
9438 RGWRados::SystemObject::Read::GetObjState
& state
,
9440 map
<string
, bufferlist
> *attrs
,
9443 RGWObjVersionTracker
*objv_tracker
)
9445 RGWRawObjState
*astate
= NULL
;
9447 int r
= get_system_obj_state(&obj_ctx
, obj
, &astate
, objv_tracker
);
9451 if (!astate
->exists
) {
9456 *attrs
= astate
->attrset
;
9457 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20)) {
9458 map
<string
, bufferlist
>::iterator iter
;
9459 for (iter
= attrs
->begin(); iter
!= attrs
->end(); ++iter
) {
9460 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
9466 *obj_size
= astate
->size
;
9468 *lastmod
= astate
->mtime
;
9473 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker
*objv_tracker
)
9475 RGWRados
*store
= source
->get_store();
9476 rgw_raw_obj
& obj
= source
->get_obj();
9478 return store
->stat_system_obj(source
->get_ctx(), state
, obj
, stat_params
.attrs
,
9479 stat_params
.lastmod
, stat_params
.obj_size
, objv_tracker
);
9482 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op
, const string
*write_tag
)
9487 RGWRados
*store
= target
->get_store();
9489 int ret
= get_bucket_shard(&bs
);
9491 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
9495 if (write_tag
&& write_tag
->length()) {
9496 optag
= string(write_tag
->c_str(), write_tag
->length());
9498 if (optag
.empty()) {
9499 append_rand_alpha(store
->ctx(), optag
, optag
, 32);
9503 int r
= store
->cls_obj_prepare_op(*bs
, op
, optag
, obj
, bilog_flags
);
9511 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid
, uint64_t epoch
,
9512 uint64_t size
, uint64_t accounted_size
,
9513 ceph::real_time
& ut
, const string
& etag
,
9514 const string
& content_type
,
9516 RGWObjCategory category
,
9517 list
<rgw_obj_index_key
> *remove_objs
, const string
*user_data
)
9522 RGWRados
*store
= target
->get_store();
9524 int ret
= get_bucket_shard(&bs
);
9526 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
9530 rgw_bucket_dir_entry ent
;
9531 obj
.key
.get_index_key(&ent
.key
);
9532 ent
.meta
.size
= size
;
9533 ent
.meta
.accounted_size
= accounted_size
;
9534 ent
.meta
.mtime
= ut
;
9535 ent
.meta
.etag
= etag
;
9537 ent
.meta
.user_data
= *user_data
;
9540 if (acl_bl
&& acl_bl
->length()) {
9541 int ret
= store
->decode_policy(*acl_bl
, &owner
);
9543 ldout(store
->ctx(), 0) << "WARNING: could not decode policy ret=" << ret
<< dendl
;
9546 ent
.meta
.owner
= owner
.get_id().to_str();
9547 ent
.meta
.owner_display_name
= owner
.get_display_name();
9548 ent
.meta
.content_type
= content_type
;
9550 ret
= store
->cls_obj_complete_add(*bs
, optag
, poolid
, epoch
, ent
, category
, remove_objs
, bilog_flags
);
9552 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
9554 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
9560 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid
, uint64_t epoch
,
9561 real_time
& removed_mtime
,
9562 list
<rgw_obj_index_key
> *remove_objs
)
9567 RGWRados
*store
= target
->get_store();
9569 int ret
= get_bucket_shard(&bs
);
9571 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
9575 ret
= store
->cls_obj_complete_del(*bs
, optag
, poolid
, epoch
, obj
, removed_mtime
, remove_objs
, bilog_flags
);
9577 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
9579 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
9586 int RGWRados::Bucket::UpdateIndex::cancel()
9591 RGWRados
*store
= target
->get_store();
9593 int ret
= get_bucket_shard(&bs
);
9595 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
9599 ret
= store
->cls_obj_complete_cancel(*bs
, optag
, obj
, bilog_flags
);
9602 * need to update data log anyhow, so that whoever follows needs to update its internal markers
9603 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
9604 * have no way to tell that they're all caught up
9606 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
9608 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
9614 int RGWRados::Object::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
)
9616 RGWRados
*store
= source
->get_store();
9617 CephContext
*cct
= store
->ctx();
9619 std::string oid
, key
;
9620 rgw_raw_obj read_obj
;
9621 uint64_t read_ofs
= ofs
;
9622 uint64_t len
, read_len
;
9623 bool reading_from_head
= true;
9624 ObjectReadOperation op
;
9626 bool merge_bl
= false;
9627 bufferlist
*pbl
= &bl
;
9629 uint64_t max_chunk_size
;
9631 RGWObjState
*astate
;
9632 int r
= source
->get_state(&astate
, true);
9639 len
= end
- ofs
+ 1;
9641 if (astate
->has_manifest
&& astate
->manifest
.has_tail()) {
9642 /* now get the relevant object part */
9643 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
9645 uint64_t stripe_ofs
= iter
.get_stripe_ofs();
9646 read_obj
= iter
.get_location().get_raw_obj(store
);
9647 len
= min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
9648 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
9649 reading_from_head
= (read_obj
== state
.head_obj
);
9651 read_obj
= state
.head_obj
;
9654 r
= store
->get_max_chunk_size(read_obj
.pool
, &max_chunk_size
);
9656 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj
.pool
<< dendl
;
9660 if (len
> max_chunk_size
)
9661 len
= max_chunk_size
;
9664 state
.io_ctx
.locator_set_key(read_obj
.loc
);
9668 if (reading_from_head
) {
9669 /* only when reading from the head object do we need to do the atomic test */
9670 r
= store
->append_atomic_test(&source
->get_ctx(), source
->get_bucket_info(), state
.obj
, op
, &astate
);
9674 if (astate
&& astate
->prefetch_data
) {
9675 if (!ofs
&& astate
->data
.length() >= len
) {
9680 if (ofs
< astate
->data
.length()) {
9681 unsigned copy_len
= min((uint64_t)astate
->data
.length() - ofs
, len
);
9682 astate
->data
.copy(ofs
, copy_len
, bl
);
9683 read_len
-= copy_len
;
9684 read_ofs
+= copy_len
;
9694 ldout(cct
, 20) << "rados->read obj-ofs=" << ofs
<< " read_ofs=" << read_ofs
<< " read_len=" << read_len
<< dendl
;
9695 op
.read(read_ofs
, read_len
, pbl
, NULL
);
9697 r
= state
.io_ctx
.operate(read_obj
.oid
, &op
, NULL
);
9698 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
9711 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados
*store
, rgw_raw_obj
& obj
, rgw_rados_ref
**pref
)
9715 int r
= store
->get_raw_obj_ref(obj
, &ref
, &pool
);
9726 int RGWRados::get_system_obj(RGWObjectCtx
& obj_ctx
, RGWRados::SystemObject::Read::GetObjState
& read_state
,
9727 RGWObjVersionTracker
*objv_tracker
, rgw_raw_obj
& obj
,
9728 bufferlist
& bl
, off_t ofs
, off_t end
,
9729 map
<string
, bufferlist
> *attrs
,
9730 rgw_cache_entry_info
*cache_info
)
9733 ObjectReadOperation op
;
9738 len
= end
- ofs
+ 1;
9741 objv_tracker
->prepare_op_for_read(&op
);
9744 ldout(cct
, 20) << "rados->read ofs=" << ofs
<< " len=" << len
<< dendl
;
9745 op
.read(ofs
, len
, &bl
, NULL
);
9748 op
.getxattrs(attrs
, NULL
);
9752 int r
= read_state
.get_ref(this, obj
, &ref
);
9754 ldout(cct
, 20) << "read_state.get_ref() on obj=" << obj
<< " returned " << r
<< dendl
;
9757 r
= ref
->ioctx
.operate(ref
->oid
, &op
, NULL
);
9759 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
9762 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
9764 uint64_t op_ver
= ref
->ioctx
.get_last_version();
9766 if (read_state
.last_ver
> 0 &&
9767 read_state
.last_ver
!= op_ver
) {
9768 ldout(cct
, 5) << "raced with an object write, abort" << dendl
;
9772 read_state
.last_ver
= op_ver
;
9777 int RGWRados::SystemObject::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
, RGWObjVersionTracker
*objv_tracker
)
9779 RGWRados
*store
= source
->get_store();
9780 rgw_raw_obj
& obj
= source
->get_obj();
9782 return store
->get_system_obj(source
->get_ctx(), state
, objv_tracker
, obj
, bl
, ofs
, end
, read_params
.attrs
, read_params
.cache_info
);
9785 int RGWRados::SystemObject::Read::get_attr(const char *name
, bufferlist
& dest
)
9787 RGWRados
*store
= source
->get_store();
9788 rgw_raw_obj
& obj
= source
->get_obj();
9790 return store
->system_obj_get_attr(obj
, name
, dest
);
9793 struct get_obj_data
;
9795 struct get_obj_aio_data
{
9796 struct get_obj_data
*op_data
;
9806 static void _get_obj_aio_completion_cb(completion_t cb
, void *arg
);
9808 struct get_obj_data
: public RefCountedObject
{
9813 map
<off_t
, get_obj_io
> io_map
;
9814 map
<off_t
, librados::AioCompletion
*> completion_map
;
9815 uint64_t total_read
;
9818 list
<get_obj_aio_data
> aio_data
;
9819 RGWGetDataCB
*client_cb
;
9820 std::atomic
<bool> cancelled
= { false };
9821 std::atomic
<int64_t> err_code
= { 0 };
9823 list
<bufferlist
> read_list
;
9825 explicit get_obj_data(CephContext
*_cct
)
9827 rados(NULL
), ctx(NULL
),
9828 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
9830 throttle(cct
, "get_obj_data", cct
->_conf
->rgw_get_obj_window_size
, false) {}
9831 ~get_obj_data() override
{ }
9832 void set_cancelled(int r
) {
9837 bool is_cancelled() {
9841 int get_err_code() {
9845 int wait_next_io(bool *done
) {
9847 map
<off_t
, librados::AioCompletion
*>::iterator iter
= completion_map
.begin();
9848 if (iter
== completion_map
.end()) {
9853 off_t cur_ofs
= iter
->first
;
9854 librados::AioCompletion
*c
= iter
->second
;
9857 c
->wait_for_safe_and_cb();
9858 int r
= c
->get_return_value();
9861 completion_map
.erase(cur_ofs
);
9863 if (completion_map
.empty()) {
9873 void add_io(off_t ofs
, off_t len
, bufferlist
**pbl
, AioCompletion
**pc
) {
9874 Mutex::Locker
l(lock
);
9876 const auto& io_iter
= io_map
.insert(
9877 map
<off_t
, get_obj_io
>::value_type(ofs
, get_obj_io()));
9879 assert(io_iter
.second
); // assert new insertion
9881 get_obj_io
& io
= (io_iter
.first
)->second
;
9884 struct get_obj_aio_data aio
;
9889 aio_data
.push_back(aio
);
9891 struct get_obj_aio_data
*paio_data
= &aio_data
.back(); /* last element */
9893 librados::AioCompletion
*c
= librados::Rados::aio_create_completion((void *)paio_data
, NULL
, _get_obj_aio_completion_cb
);
9894 completion_map
[ofs
] = c
;
9898 /* we have a reference per IO, plus one reference for the calling function.
9899 * reference is dropped for each callback, plus when we're done iterating
9904 void cancel_io(off_t ofs
) {
9905 ldout(cct
, 20) << "get_obj_data::cancel_io() ofs=" << ofs
<< dendl
;
9907 map
<off_t
, AioCompletion
*>::iterator iter
= completion_map
.find(ofs
);
9908 if (iter
!= completion_map
.end()) {
9909 AioCompletion
*c
= iter
->second
;
9911 completion_map
.erase(ofs
);
9916 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
9917 * need IoCtx to live, as io callback may still be called
9921 void cancel_all_io() {
9922 ldout(cct
, 20) << "get_obj_data::cancel_all_io()" << dendl
;
9923 Mutex::Locker
l(lock
);
9924 for (map
<off_t
, librados::AioCompletion
*>::iterator iter
= completion_map
.begin();
9925 iter
!= completion_map
.end(); ++iter
) {
9926 librados::AioCompletion
*c
= iter
->second
;
9931 int get_complete_ios(off_t ofs
, list
<bufferlist
>& bl_list
) {
9932 Mutex::Locker
l(lock
);
9934 map
<off_t
, get_obj_io
>::iterator liter
= io_map
.begin();
9936 if (liter
== io_map
.end() ||
9937 liter
->first
!= ofs
) {
9941 map
<off_t
, librados::AioCompletion
*>::iterator aiter
;
9942 aiter
= completion_map
.find(ofs
);
9943 if (aiter
== completion_map
.end()) {
9944 /* completion map does not hold this io, it was cancelled */
9948 AioCompletion
*completion
= aiter
->second
;
9949 int r
= completion
->get_return_value();
9953 for (; aiter
!= completion_map
.end(); ++aiter
) {
9954 completion
= aiter
->second
;
9955 if (!completion
->is_safe()) {
9956 /* reached a request that is not yet complete, stop */
9960 r
= completion
->get_return_value();
9962 set_cancelled(r
); /* mark it as cancelled, so that we don't continue processing next operations */
9968 map
<off_t
, get_obj_io
>::iterator old_liter
= liter
++;
9969 bl_list
.push_back(old_liter
->second
.bl
);
9970 io_map
.erase(old_liter
);
9977 static int _get_obj_iterate_cb(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, const rgw_raw_obj
& read_obj
, off_t obj_ofs
, off_t read_ofs
, off_t len
, bool is_head_obj
, RGWObjState
*astate
, void *arg
)
9979 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
9981 return d
->rados
->get_obj_iterate_cb(d
->ctx
, astate
, bucket_info
, obj
, read_obj
, obj_ofs
, read_ofs
, len
, is_head_obj
, arg
);
9984 static void _get_obj_aio_completion_cb(completion_t cb
, void *arg
)
9986 struct get_obj_aio_data
*aio_data
= (struct get_obj_aio_data
*)arg
;
9987 struct get_obj_data
*d
= aio_data
->op_data
;
9989 d
->rados
->get_obj_aio_completion_cb(cb
, arg
);
9993 void RGWRados::get_obj_aio_completion_cb(completion_t c
, void *arg
)
9995 struct get_obj_aio_data
*aio_data
= (struct get_obj_aio_data
*)arg
;
9996 struct get_obj_data
*d
= aio_data
->op_data
;
9997 off_t ofs
= aio_data
->ofs
;
9998 off_t len
= aio_data
->len
;
10000 list
<bufferlist
> bl_list
;
10001 list
<bufferlist
>::iterator iter
;
10004 ldout(cct
, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs
<< " len=" << len
<< dendl
;
10005 d
->throttle
.put(len
);
10007 r
= rados_aio_get_return_value(c
);
10009 ldout(cct
, 0) << "ERROR: got unexpected error when trying to read object: " << r
<< dendl
;
10010 d
->set_cancelled(r
);
10014 if (d
->is_cancelled()) {
10018 d
->data_lock
.Lock();
10020 r
= d
->get_complete_ios(ofs
, bl_list
);
10025 d
->read_list
.splice(d
->read_list
.end(), bl_list
);
10028 d
->data_lock
.Unlock();
10034 int RGWRados::flush_read_list(struct get_obj_data
*d
)
10036 d
->data_lock
.Lock();
10037 list
<bufferlist
> l
;
10038 l
.swap(d
->read_list
);
10040 d
->read_list
.clear();
10042 d
->data_lock
.Unlock();
10046 list
<bufferlist
>::iterator iter
;
10047 for (iter
= l
.begin(); iter
!= l
.end(); ++iter
) {
10048 bufferlist
& bl
= *iter
;
10049 r
= d
->client_cb
->handle_data(bl
, 0, bl
.length());
10051 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r
<< dendl
;
10056 d
->data_lock
.Lock();
10059 d
->set_cancelled(r
);
10061 d
->data_lock
.Unlock();
10065 int RGWRados::get_obj_iterate_cb(RGWObjectCtx
*ctx
, RGWObjState
*astate
,
10066 const RGWBucketInfo
& bucket_info
,
10067 const rgw_obj
& obj
,
10068 const rgw_raw_obj
& read_obj
,
10070 off_t read_ofs
, off_t len
,
10071 bool is_head_obj
, void *arg
)
10073 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
10074 ObjectReadOperation op
;
10075 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
10083 /* only when reading from the head object do we need to do the atomic test */
10084 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &astate
);
10089 obj_ofs
< astate
->data
.length()) {
10090 unsigned chunk_len
= min((uint64_t)astate
->data
.length() - obj_ofs
, (uint64_t)len
);
10092 d
->data_lock
.Lock();
10093 r
= d
->client_cb
->handle_data(astate
->data
, obj_ofs
, chunk_len
);
10094 d
->data_lock
.Unlock();
10099 d
->total_read
+= chunk_len
;
10103 read_ofs
+= chunk_len
;
10104 obj_ofs
+= chunk_len
;
10110 d
->throttle
.get(len
);
10111 if (d
->is_cancelled()) {
10112 return d
->get_err_code();
10115 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10118 d
->add_io(obj_ofs
, len
, &pbl
, &c
);
10120 ldout(cct
, 20) << "rados->get_obj_iterate_cb oid=" << read_obj
.oid
<< " obj-ofs=" << obj_ofs
<< " read_ofs=" << read_ofs
<< " len=" << len
<< dendl
;
10121 op
.read(read_ofs
, len
, pbl
, NULL
);
10123 librados::IoCtx
io_ctx(d
->io_ctx
);
10124 io_ctx
.locator_set_key(read_obj
.loc
);
10126 r
= io_ctx
.aio_operate(read_obj
.oid
, c
, &op
, NULL
);
10128 ldout(cct
, 0) << "rados->aio_operate r=" << r
<< dendl
;
10132 // Flush data to client if there is any
10133 r
= flush_read_list(d
);
10140 ldout(cct
, 20) << "cancelling io r=" << r
<< " obj_ofs=" << obj_ofs
<< dendl
;
10141 d
->set_cancelled(r
);
10142 d
->cancel_io(obj_ofs
);
10147 int RGWRados::Object::Read::iterate(int64_t ofs
, int64_t end
, RGWGetDataCB
*cb
)
10149 RGWRados
*store
= source
->get_store();
10150 CephContext
*cct
= store
->ctx();
10152 struct get_obj_data
*data
= new get_obj_data(cct
);
10155 RGWObjectCtx
& obj_ctx
= source
->get_ctx();
10157 data
->rados
= store
;
10158 data
->io_ctx
.dup(state
.io_ctx
);
10159 data
->client_cb
= cb
;
10161 int r
= store
->iterate_obj(obj_ctx
, source
->get_bucket_info(), state
.obj
, ofs
, end
, cct
->_conf
->rgw_get_obj_max_req_size
, _get_obj_iterate_cb
, (void *)data
);
10163 data
->cancel_all_io();
10168 r
= data
->wait_next_io(&done
);
10170 dout(10) << "get_obj_iterate() r=" << r
<< ", canceling all io" << dendl
;
10171 data
->cancel_all_io();
10174 r
= store
->flush_read_list(data
);
10176 dout(10) << "get_obj_iterate() r=" << r
<< ", canceling all io" << dendl
;
10177 data
->cancel_all_io();
10187 int RGWRados::iterate_obj(RGWObjectCtx
& obj_ctx
,
10188 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
10189 off_t ofs
, off_t end
,
10190 uint64_t max_chunk_size
,
10191 int (*iterate_obj_cb
)(const RGWBucketInfo
&, const rgw_obj
& obj
,
10192 const rgw_raw_obj
&, off_t
, off_t
, off_t
, bool,
10193 RGWObjState
*, void *),
10196 rgw_raw_obj head_obj
;
10197 rgw_raw_obj read_obj
;
10198 uint64_t read_ofs
= ofs
;
10200 bool reading_from_head
= true;
10201 RGWObjState
*astate
= NULL
;
10203 obj_to_raw(bucket_info
.placement_rule
, obj
, &head_obj
);
10205 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &astate
, false);
10213 len
= end
- ofs
+ 1;
10215 if (astate
->has_manifest
) {
10216 /* now get the relevant object stripe */
10217 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
10219 RGWObjManifest::obj_iterator obj_end
= astate
->manifest
.obj_end();
10221 for (; iter
!= obj_end
&& ofs
<= end
; ++iter
) {
10222 off_t stripe_ofs
= iter
.get_stripe_ofs();
10223 off_t next_stripe_ofs
= stripe_ofs
+ iter
.get_stripe_size();
10225 while (ofs
< next_stripe_ofs
&& ofs
<= end
) {
10226 read_obj
= iter
.get_location().get_raw_obj(this);
10227 uint64_t read_len
= min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
10228 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
10230 if (read_len
> max_chunk_size
) {
10231 read_len
= max_chunk_size
;
10234 reading_from_head
= (read_obj
== head_obj
);
10235 r
= iterate_obj_cb(bucket_info
, obj
, read_obj
, ofs
, read_ofs
, read_len
, reading_from_head
, astate
, arg
);
10245 while (ofs
<= end
) {
10246 read_obj
= head_obj
;
10247 uint64_t read_len
= min(len
, max_chunk_size
);
10249 r
= iterate_obj_cb(bucket_info
, obj
, read_obj
, ofs
, ofs
, read_len
, reading_from_head
, astate
, arg
);
10262 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectWriteOperation
*op
)
10265 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
10270 return ref
.ioctx
.operate(ref
.oid
, op
);
10273 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectReadOperation
*op
)
10276 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
10283 return ref
.ioctx
.operate(ref
.oid
, op
, &outbl
);
10286 int RGWRados::olh_init_modification_impl(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, string
*op_tag
)
10288 ObjectWriteOperation op
;
10290 assert(olh_obj
.key
.instance
.empty());
10292 bool has_tag
= (state
.exists
&& has_olh_tag(state
.attrset
));
10294 if (!state
.exists
) {
10297 op
.assert_exists();
10301 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10302 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10303 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10304 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10305 * log will reflect that.
10307 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10308 * is used for object data instance, olh_tag for olh instance.
10311 /* guard against racing writes */
10312 bucket_index_guard_olh_op(state
, op
);
10318 int ret
= gen_rand_alphanumeric_lower(cct
, &obj_tag
, 32);
10320 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
10324 bl
.append(obj_tag
.c_str(), obj_tag
.size());
10325 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
10327 state
.attrset
[RGW_ATTR_ID_TAG
] = bl
;
10328 state
.obj_tag
= bl
;
10332 ret
= gen_rand_alphanumeric_lower(cct
, &olh_tag
, 32);
10334 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
10338 olh_bl
.append(olh_tag
.c_str(), olh_tag
.size());
10339 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, olh_bl
);
10341 state
.attrset
[RGW_ATTR_OLH_ID_TAG
] = olh_bl
;
10342 state
.olh_tag
= olh_bl
;
10343 state
.is_olh
= true;
10346 op
.setxattr(RGW_ATTR_OLH_VER
, verbl
);
10350 RGWOLHPendingInfo pending_info
;
10351 pending_info
.time
= real_clock::now();
10352 ::encode(pending_info
, bl
);
10354 #define OLH_PENDING_TAG_LEN 32
10355 /* tag will start with current time epoch, this so that entries are sorted by time */
10357 utime_t
ut(pending_info
.time
);
10358 snprintf(buf
, sizeof(buf
), "%016llx", (unsigned long long)ut
.sec());
10362 int ret
= gen_rand_alphanumeric_lower(cct
, &s
, OLH_PENDING_TAG_LEN
- op_tag
->size());
10364 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
10369 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
10370 attr_name
.append(*op_tag
);
10372 op
.setxattr(attr_name
.c_str(), bl
);
10374 ret
= obj_operate(bucket_info
, olh_obj
, &op
);
10379 state
.exists
= true;
10380 state
.attrset
[attr_name
] = bl
;
10385 int RGWRados::olh_init_modification(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj
, string
*op_tag
)
10389 ret
= olh_init_modification_impl(bucket_info
, state
, obj
, op_tag
);
10390 if (ret
== -EEXIST
) {
10397 int RGWRados::bucket_index_link_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& olh_state
, const rgw_obj
& obj_instance
,
10398 bool delete_marker
,
10399 const string
& op_tag
,
10400 struct rgw_bucket_dir_entry_meta
*meta
,
10401 uint64_t olh_epoch
,
10402 real_time unmod_since
, bool high_precision_time
)
10405 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10410 BucketShard
bs(this);
10411 int ret
= bs
.init(obj_instance
.bucket
, obj_instance
);
10413 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
10417 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
10418 ret
= cls_rgw_bucket_link_olh(bs
.index_ctx
, bs
.bucket_obj
, key
, olh_state
.olh_tag
, delete_marker
, op_tag
, meta
, olh_epoch
,
10419 unmod_since
, high_precision_time
,
10420 get_zone().log_data
);
10428 void RGWRados::bucket_index_guard_olh_op(RGWObjState
& olh_state
, ObjectOperation
& op
)
10430 ldout(cct
, 20) << __func__
<< "(): olh_state.olh_tag=" << string(olh_state
.olh_tag
.c_str(), olh_state
.olh_tag
.length()) << dendl
;
10431 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_state
.olh_tag
);
10434 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj_instance
,
10435 const string
& op_tag
, const string
& olh_tag
, uint64_t olh_epoch
)
10438 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10443 BucketShard
bs(this);
10444 int ret
= bs
.init(obj_instance
.bucket
, obj_instance
);
10446 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
10450 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
10451 ret
= cls_rgw_bucket_unlink_instance(bs
.index_ctx
, bs
.bucket_obj
, key
, op_tag
, olh_tag
, olh_epoch
, get_zone().log_data
);
10459 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
,
10460 const rgw_obj
& obj_instance
, uint64_t ver_marker
,
10461 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > *log
,
10462 bool *is_truncated
)
10465 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10470 BucketShard
bs(this);
10471 int ret
= bs
.init(obj_instance
.bucket
, obj_instance
);
10473 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
10477 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
10479 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
10481 ObjectReadOperation op
;
10483 ret
= cls_rgw_get_olh_log(bs
.index_ctx
, bs
.bucket_obj
, op
, key
, ver_marker
, olh_tag
, log
, is_truncated
);
10490 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
, uint64_t ver
)
10493 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10498 BucketShard
bs(this);
10499 int ret
= bs
.init(obj_instance
.bucket
, obj_instance
);
10501 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
10505 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
10507 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
10509 ObjectWriteOperation op
;
10511 cls_rgw_trim_olh_log(op
, key
, ver
, olh_tag
);
10513 ret
= bs
.index_ctx
.operate(bs
.bucket_obj
, &op
);
10520 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
)
10523 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
10528 BucketShard
bs(this);
10529 int ret
= bs
.init(obj_instance
.bucket
, obj_instance
);
10531 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
10535 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
10537 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
10539 ret
= cls_rgw_clear_olh(bs
.index_ctx
, bs
.bucket_obj
, key
, olh_tag
);
10541 ldout(cct
, 5) << "cls_rgw_clear_olh() returned ret=" << ret
<< dendl
;
10548 int RGWRados::apply_olh_log(RGWObjectCtx
& obj_ctx
, RGWObjState
& state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
10549 bufferlist
& olh_tag
, map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >& log
,
10550 uint64_t *plast_ver
)
10556 librados::ObjectWriteOperation op
;
10558 uint64_t last_ver
= log
.rbegin()->first
;
10559 *plast_ver
= last_ver
;
10561 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >::iterator iter
= log
.begin();
10563 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
10564 op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GT
, last_ver
);
10566 bool need_to_link
= false;
10567 cls_rgw_obj_key key
;
10568 bool delete_marker
= false;
10569 list
<cls_rgw_obj_key
> remove_instances
;
10570 bool need_to_remove
= false;
10572 for (iter
= log
.begin(); iter
!= log
.end(); ++iter
) {
10573 vector
<rgw_bucket_olh_log_entry
>::iterator viter
= iter
->second
.begin();
10574 for (; viter
!= iter
->second
.end(); ++viter
) {
10575 rgw_bucket_olh_log_entry
& entry
= *viter
;
10577 ldout(cct
, 20) << "olh_log_entry: op=" << (int)entry
.op
10578 << " key=" << entry
.key
.name
<< "[" << entry
.key
.instance
<< "] "
10579 << (entry
.delete_marker
? "(delete)" : "") << dendl
;
10580 switch (entry
.op
) {
10581 case CLS_RGW_OLH_OP_REMOVE_INSTANCE
:
10582 remove_instances
.push_back(entry
.key
);
10584 case CLS_RGW_OLH_OP_LINK_OLH
:
10585 need_to_link
= true;
10586 need_to_remove
= false;
10588 delete_marker
= entry
.delete_marker
;
10590 case CLS_RGW_OLH_OP_UNLINK_OLH
:
10591 need_to_remove
= true;
10592 need_to_link
= false;
10595 ldout(cct
, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry
.op
<< dendl
;
10598 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
10599 attr_name
.append(entry
.op_tag
);
10600 op
.rmxattr(attr_name
.c_str());
10605 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
10610 const rgw_bucket
& bucket
= obj
.bucket
;
10612 if (need_to_link
) {
10613 rgw_obj
target(bucket
, key
);
10615 info
.target
= target
;
10616 info
.removed
= delete_marker
;
10618 ::encode(info
, bl
);
10619 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
10622 /* first remove object instances */
10623 for (list
<cls_rgw_obj_key
>::iterator liter
= remove_instances
.begin();
10624 liter
!= remove_instances
.end(); ++liter
) {
10625 cls_rgw_obj_key
& key
= *liter
;
10626 rgw_obj
obj_instance(bucket
, key
);
10627 int ret
= delete_obj(obj_ctx
, bucket_info
, obj_instance
, 0, RGW_BILOG_FLAG_VERSIONED_OP
);
10628 if (ret
< 0 && ret
!= -ENOENT
) {
10629 ldout(cct
, 0) << "ERROR: delete_obj() returned " << ret
<< " obj_instance=" << obj_instance
<< dendl
;
10634 /* update olh object */
10635 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
10636 if (r
== -ECANCELED
) {
10640 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
10644 r
= bucket_index_trim_olh_log(bucket_info
, state
, obj
, last_ver
);
10646 ldout(cct
, 0) << "ERROR: could not trim olh log, r=" << r
<< dendl
;
10650 if (need_to_remove
) {
10651 ObjectWriteOperation rm_op
;
10653 rm_op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
10654 rm_op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GT
, last_ver
);
10655 cls_obj_check_prefix_exist(rm_op
, RGW_ATTR_OLH_PENDING_PREFIX
, true); /* fail if found one of these, pending modification */
10658 r
= ref
.ioctx
.operate(ref
.oid
, &rm_op
);
10659 if (r
== -ECANCELED
) {
10660 return 0; /* someone else won this race */
10663 * only clear if was successful, otherwise we might clobber pending operations on this object
10665 r
= bucket_index_clear_olh(bucket_info
, state
, obj
);
10667 ldout(cct
, 0) << "ERROR: could not clear bucket index olh entries r=" << r
<< dendl
;
10677 * read olh log and apply it
10679 int RGWRados::update_olh(RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
)
10681 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > log
;
10683 uint64_t ver_marker
= 0;
10686 int ret
= bucket_index_read_olh_log(bucket_info
, *state
, obj
, ver_marker
, &log
, &is_truncated
);
10690 ret
= apply_olh_log(obj_ctx
, *state
, bucket_info
, obj
, state
->olh_tag
, log
, &ver_marker
);
10694 } while (is_truncated
);
10699 int RGWRados::set_olh(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
, bool delete_marker
, rgw_bucket_dir_entry_meta
*meta
,
10700 uint64_t olh_epoch
, real_time unmod_since
, bool high_precision_time
)
10704 rgw_obj olh_obj
= target_obj
;
10705 olh_obj
.key
.instance
.clear();
10707 RGWObjState
*state
= NULL
;
10712 #define MAX_ECANCELED_RETRY 100
10713 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
10714 if (ret
== -ECANCELED
) {
10715 obj_ctx
.obj
.invalidate(olh_obj
);
10718 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
10723 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
10725 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
10726 if (ret
== -ECANCELED
) {
10731 ret
= bucket_index_link_olh(bucket_info
, *state
, target_obj
, delete_marker
, op_tag
, meta
, olh_epoch
, unmod_since
, high_precision_time
);
10733 ldout(cct
, 20) << "bucket_index_link_olh() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
10734 if (ret
== -ECANCELED
) {
10742 if (i
== MAX_ECANCELED_RETRY
) {
10743 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
10747 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
10748 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
10752 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
10759 int RGWRados::unlink_obj_instance(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
,
10760 uint64_t olh_epoch
)
10764 rgw_obj olh_obj
= target_obj
;
10765 olh_obj
.key
.instance
.clear();
10767 RGWObjState
*state
= NULL
;
10772 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
10773 if (ret
== -ECANCELED
) {
10774 obj_ctx
.obj
.invalidate(olh_obj
);
10777 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
10781 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
10783 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
10784 if (ret
== -ECANCELED
) {
10790 string
olh_tag(state
->olh_tag
.c_str(), state
->olh_tag
.length());
10792 ret
= bucket_index_unlink_instance(bucket_info
, target_obj
, op_tag
, olh_tag
, olh_epoch
);
10794 ldout(cct
, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
10795 if (ret
== -ECANCELED
) {
10803 if (i
== MAX_ECANCELED_RETRY
) {
10804 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
10808 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
10809 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
10813 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
10820 void RGWRados::gen_rand_obj_instance_name(rgw_obj
*target_obj
)
10822 #define OBJ_INSTANCE_LEN 32
10823 char buf
[OBJ_INSTANCE_LEN
+ 1];
10825 gen_rand_alphanumeric_no_underscore(cct
, buf
, OBJ_INSTANCE_LEN
); /* don't want it to get url escaped,
10826 no underscore for instance name due to the way we encode the raw keys */
10828 target_obj
->key
.set_instance(buf
);
10831 static void filter_attrset(map
<string
, bufferlist
>& unfiltered_attrset
, const string
& check_prefix
,
10832 map
<string
, bufferlist
> *attrset
)
10835 map
<string
, bufferlist
>::iterator iter
;
10836 for (iter
= unfiltered_attrset
.lower_bound(check_prefix
);
10837 iter
!= unfiltered_attrset
.end(); ++iter
) {
10838 if (!boost::algorithm::starts_with(iter
->first
, check_prefix
))
10840 (*attrset
)[iter
->first
] = iter
->second
;
10844 int RGWRados::get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWOLHInfo
*olh
)
10846 map
<string
, bufferlist
> unfiltered_attrset
;
10848 ObjectReadOperation op
;
10849 op
.getxattrs(&unfiltered_attrset
, NULL
);
10852 int r
= obj_operate(bucket_info
, obj
, &op
);
10857 map
<string
, bufferlist
> attrset
;
10859 filter_attrset(unfiltered_attrset
, RGW_ATTR_OLH_PREFIX
, &attrset
);
10861 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_OLH_INFO
);
10862 if (iter
== attrset
.end()) { /* not an olh */
10867 bufferlist::iterator biter
= iter
->second
.begin();
10868 ::decode(*olh
, biter
);
10869 } catch (buffer::error
& err
) {
10870 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
10877 void RGWRados::check_pending_olh_entries(map
<string
, bufferlist
>& pending_entries
,
10878 map
<string
, bufferlist
> *rm_pending_entries
)
10880 map
<string
, bufferlist
>::iterator iter
= pending_entries
.begin();
10882 real_time now
= real_clock::now();
10884 while (iter
!= pending_entries
.end()) {
10885 bufferlist::iterator biter
= iter
->second
.begin();
10886 RGWOLHPendingInfo pending_info
;
10888 ::decode(pending_info
, biter
);
10889 } catch (buffer::error
& err
) {
10890 /* skipping bad entry, we could remove it but it might hide a bug */
10891 ldout(cct
, 0) << "ERROR: failed to decode pending entry " << iter
->first
<< dendl
;
10896 map
<string
, bufferlist
>::iterator cur_iter
= iter
;
10898 if (now
- pending_info
.time
>= make_timespan(cct
->_conf
->rgw_olh_pending_timeout_sec
)) {
10899 (*rm_pending_entries
)[cur_iter
->first
] = cur_iter
->second
;
10900 pending_entries
.erase(cur_iter
);
10902 /* entries names are sorted by time (rounded to a second) */
10908 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, map
<string
, bufferlist
>& pending_attrs
)
10910 ObjectWriteOperation op
;
10912 bucket_index_guard_olh_op(state
, op
);
10914 for (map
<string
, bufferlist
>::iterator iter
= pending_attrs
.begin(); iter
!= pending_attrs
.end(); ++iter
) {
10915 op
.rmxattr(iter
->first
.c_str());
10919 int r
= get_obj_head_ref(bucket_info
, olh_obj
, &ref
);
10924 /* update olh object */
10925 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
10926 if (r
== -ENOENT
|| r
== -ECANCELED
) {
10927 /* raced with some other change, shouldn't sweat about it */
10931 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
10938 int RGWRados::follow_olh(const RGWBucketInfo
& bucket_info
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const rgw_obj
& olh_obj
, rgw_obj
*target
)
10940 map
<string
, bufferlist
> pending_entries
;
10941 filter_attrset(state
->attrset
, RGW_ATTR_OLH_PENDING_PREFIX
, &pending_entries
);
10943 map
<string
, bufferlist
> rm_pending_entries
;
10944 check_pending_olh_entries(pending_entries
, &rm_pending_entries
);
10946 if (!rm_pending_entries
.empty()) {
10947 int ret
= remove_olh_pending_entries(bucket_info
, *state
, olh_obj
, rm_pending_entries
);
10949 ldout(cct
, 20) << "ERROR: rm_pending_entries returned ret=" << ret
<< dendl
;
10953 if (!pending_entries
.empty()) {
10954 ldout(cct
, 20) << __func__
<< "(): found pending entries, need to update_olh() on bucket=" << olh_obj
.bucket
<< dendl
;
10956 int ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
10962 map
<string
, bufferlist
>::iterator iter
= state
->attrset
.find(RGW_ATTR_OLH_INFO
);
10963 assert(iter
!= state
->attrset
.end());
10966 bufferlist::iterator biter
= iter
->second
.begin();
10967 ::decode(olh
, biter
);
10968 } catch (buffer::error
& err
) {
10969 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
10977 *target
= olh
.target
;
10982 int RGWRados::raw_obj_stat(rgw_raw_obj
& obj
, uint64_t *psize
, real_time
*pmtime
, uint64_t *epoch
,
10983 map
<string
, bufferlist
> *attrs
, bufferlist
*first_chunk
,
10984 RGWObjVersionTracker
*objv_tracker
)
10987 int r
= get_raw_obj_ref(obj
, &ref
);
10992 map
<string
, bufferlist
> unfiltered_attrset
;
10994 struct timespec mtime_ts
;
10996 ObjectReadOperation op
;
10997 if (objv_tracker
) {
10998 objv_tracker
->prepare_op_for_read(&op
);
11001 op
.getxattrs(&unfiltered_attrset
, NULL
);
11003 if (psize
|| pmtime
) {
11004 op
.stat2(&size
, &mtime_ts
, NULL
);
11007 op
.read(0, cct
->_conf
->rgw_max_chunk_size
, first_chunk
, NULL
);
11010 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &outbl
);
11013 *epoch
= ref
.ioctx
.get_last_version();
11022 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);
11024 filter_attrset(unfiltered_attrset
, RGW_ATTR_PREFIX
, attrs
);
11030 int RGWRados::get_bucket_stats(RGWBucketInfo
& bucket_info
, int shard_id
, string
*bucket_ver
, string
*master_ver
,
11031 map
<RGWObjCategory
, RGWStorageStats
>& stats
, string
*max_marker
)
11033 map
<string
, rgw_bucket_dir_header
> headers
;
11034 map
<int, string
> bucket_instance_ids
;
11035 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
11040 assert(headers
.size() == bucket_instance_ids
.size());
11042 map
<string
, rgw_bucket_dir_header
>::iterator iter
= headers
.begin();
11043 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
11044 BucketIndexShardsManager ver_mgr
;
11045 BucketIndexShardsManager master_ver_mgr
;
11046 BucketIndexShardsManager marker_mgr
;
11047 string shard_marker
;
11049 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
11050 accumulate_raw_stats(iter
->second
, stats
);
11051 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->second
.ver
);
11052 ver_mgr
.add(viter
->first
, string(buf
));
11053 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->second
.master_ver
);
11054 master_ver_mgr
.add(viter
->first
, string(buf
));
11055 if (shard_id
>= 0) {
11056 *max_marker
= iter
->second
.max_marker
;
11058 marker_mgr
.add(viter
->first
, iter
->second
.max_marker
);
11061 ver_mgr
.to_string(bucket_ver
);
11062 master_ver_mgr
.to_string(master_ver
);
11063 if (shard_id
< 0) {
11064 marker_mgr
.to_string(max_marker
);
11069 int RGWRados::get_bi_log_status(RGWBucketInfo
& bucket_info
, int shard_id
,
11070 map
<int, string
>& markers
)
11072 map
<string
, rgw_bucket_dir_header
> headers
;
11073 map
<int, string
> bucket_instance_ids
;
11074 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
11078 assert(headers
.size() == bucket_instance_ids
.size());
11080 map
<string
, rgw_bucket_dir_header
>::iterator iter
= headers
.begin();
11081 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
11083 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
11084 if (shard_id
>= 0) {
11085 markers
[shard_id
] = iter
->second
.max_marker
;
11087 markers
[viter
->first
] = iter
->second
.max_marker
;
11093 class RGWGetBucketStatsContext
: public RGWGetDirHeader_CB
{
11094 RGWGetBucketStats_CB
*cb
;
11096 map
<RGWObjCategory
, RGWStorageStats
> stats
;
11102 RGWGetBucketStatsContext(RGWGetBucketStats_CB
*_cb
, uint32_t _pendings
)
11103 : cb(_cb
), pendings(_pendings
), stats(), ret_code(0), should_cb(true),
11104 lock("RGWGetBucketStatsContext") {}
11106 void handle_response(int r
, rgw_bucket_dir_header
& header
) override
{
11107 Mutex::Locker
l(lock
);
11110 accumulate_raw_stats(header
, stats
);
11115 // Are we all done?
11116 if (--pendings
== 0) {
11118 cb
->set_response(&stats
);
11120 cb
->handle_response(ret_code
);
11127 Mutex::Locker
l(lock
);
11132 int RGWRados::get_bucket_stats_async(RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetBucketStats_CB
*ctx
)
11135 RGWGetBucketStatsContext
*get_ctx
= new RGWGetBucketStatsContext(ctx
, bucket_info
.num_shards
);
11137 int r
= cls_bucket_head_async(bucket_info
, shard_id
, get_ctx
, &num_aio
);
11142 get_ctx
->unset_cb();
11148 class RGWGetUserStatsContext
: public RGWGetUserHeader_CB
{
11149 RGWGetUserStats_CB
*cb
;
11152 explicit RGWGetUserStatsContext(RGWGetUserStats_CB
* const cb
)
11155 void handle_response(int r
, cls_user_header
& header
) override
{
11156 const cls_user_stats
& hs
= header
.stats
;
11158 RGWStorageStats stats
;
11160 stats
.size
= hs
.total_bytes
;
11161 stats
.size_rounded
= hs
.total_bytes_rounded
;
11162 stats
.num_objects
= hs
.total_entries
;
11164 cb
->set_response(stats
);
11167 cb
->handle_response(r
);
11173 int RGWRados::get_user_stats(const rgw_user
& user
, RGWStorageStats
& stats
)
11175 string user_str
= user
.to_str();
11177 cls_user_header header
;
11178 int r
= cls_user_get_header(user_str
, &header
);
11182 const cls_user_stats
& hs
= header
.stats
;
11184 stats
.size
= hs
.total_bytes
;
11185 stats
.size_rounded
= hs
.total_bytes_rounded
;
11186 stats
.num_objects
= hs
.total_entries
;
11191 int RGWRados::get_user_stats_async(const rgw_user
& user
, RGWGetUserStats_CB
*ctx
)
11193 string user_str
= user
.to_str();
11195 RGWGetUserStatsContext
*get_ctx
= new RGWGetUserStatsContext(ctx
);
11196 int r
= cls_user_get_header_async(user_str
, get_ctx
);
11206 void RGWRados::get_bucket_meta_oid(const rgw_bucket
& bucket
, string
& oid
)
11208 oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ bucket
.get_key(':');
11211 void RGWRados::get_bucket_instance_obj(const rgw_bucket
& bucket
, rgw_raw_obj
& obj
)
11213 if (!bucket
.oid
.empty()) {
11214 obj
.init(get_zone_params().domain_root
, bucket
.oid
);
11217 get_bucket_meta_oid(bucket
, oid
);
11218 obj
.init(get_zone_params().domain_root
, oid
);
11222 int RGWRados::get_bucket_instance_info(RGWObjectCtx
& obj_ctx
, const string
& meta_key
, RGWBucketInfo
& info
,
11223 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
11225 size_t pos
= meta_key
.find(':');
11226 if (pos
== string::npos
) {
11229 string oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ meta_key
;
11230 rgw_bucket_instance_key_to_oid(oid
);
11232 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
11235 int RGWRados::get_bucket_instance_info(RGWObjectCtx
& obj_ctx
, const rgw_bucket
& bucket
, RGWBucketInfo
& info
,
11236 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
11239 if (bucket
.oid
.empty()) {
11240 get_bucket_meta_oid(bucket
, oid
);
11245 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
11248 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx
& obj_ctx
, string
& oid
, RGWBucketInfo
& info
,
11249 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
,
11250 rgw_cache_entry_info
*cache_info
)
11252 ldout(cct
, 20) << "reading from " << get_zone_params().domain_root
<< ":" << oid
<< dendl
;
11256 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
, oid
, epbl
, &info
.objv_tracker
, pmtime
, pattrs
, cache_info
);
11261 bufferlist::iterator iter
= epbl
.begin();
11263 ::decode(info
, iter
);
11264 } catch (buffer::error
& err
) {
11265 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
11268 info
.bucket
.oid
= oid
;
11272 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx
& obj_ctx
,
11273 const string
& tenant_name
,
11274 const string
& bucket_name
,
11275 RGWBucketEntryPoint
& entry_point
,
11276 RGWObjVersionTracker
*objv_tracker
,
11278 map
<string
, bufferlist
> *pattrs
,
11279 rgw_cache_entry_info
*cache_info
)
11282 string bucket_entry
;
11284 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
11285 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
, bucket_entry
, bl
, objv_tracker
, pmtime
, pattrs
, cache_info
);
11290 bufferlist::iterator iter
= bl
.begin();
11292 ::decode(entry_point
, iter
);
11293 } catch (buffer::error
& err
) {
11294 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
11300 int RGWRados::convert_old_bucket_info(RGWObjectCtx
& obj_ctx
,
11301 const string
& tenant_name
,
11302 const string
& bucket_name
)
11304 RGWBucketEntryPoint entry_point
;
11305 real_time ep_mtime
;
11306 RGWObjVersionTracker ot
;
11307 map
<string
, bufferlist
> attrs
;
11308 RGWBucketInfo info
;
11310 ldout(cct
, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name
<< dendl
;
11312 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant_name
, bucket_name
, entry_point
, &ot
, &ep_mtime
, &attrs
);
11314 ldout(cct
, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret
<< " bucket=" << bucket_name
<< dendl
;
11318 if (!entry_point
.has_bucket_info
) {
11319 /* already converted! */
11323 info
= entry_point
.old_bucket_info
;
11324 info
.bucket
.oid
= bucket_name
;
11325 info
.ep_objv
= ot
.read_version
;
11327 ot
.generate_new_write_ver(cct
);
11329 ret
= put_linked_bucket_info(info
, false, ep_mtime
, &ot
.write_version
, &attrs
, true);
11331 ldout(cct
, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret
<< dendl
;
11338 int RGWRados::get_bucket_info(RGWObjectCtx
& obj_ctx
,
11339 const string
& tenant
, const string
& bucket_name
, RGWBucketInfo
& info
,
11340 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
11342 bucket_info_entry e
;
11343 string bucket_entry
;
11344 rgw_make_bucket_entry_name(tenant
, bucket_name
, bucket_entry
);
11346 if (binfo_cache
->find(bucket_entry
, &e
)) {
11355 RGWBucketEntryPoint entry_point
;
11356 real_time ep_mtime
;
11357 RGWObjVersionTracker ot
;
11358 rgw_cache_entry_info entry_cache_info
;
11359 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant
, bucket_name
, entry_point
, &ot
, &ep_mtime
, pattrs
, &entry_cache_info
);
11361 /* only init these fields */
11362 info
.bucket
.tenant
= tenant
;
11363 info
.bucket
.name
= bucket_name
;
11367 if (entry_point
.has_bucket_info
) {
11368 info
= entry_point
.old_bucket_info
;
11369 info
.bucket
.oid
= bucket_name
;
11370 info
.bucket
.tenant
= tenant
;
11371 info
.ep_objv
= ot
.read_version
;
11372 ldout(cct
, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info
.bucket
<< " owner " << info
.owner
<< dendl
;
11376 /* data is in the bucket instance object, we need to get attributes from there, clear everything
11383 ldout(cct
, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point
.bucket
<< dendl
;
11386 /* read bucket instance info */
11389 get_bucket_meta_oid(entry_point
.bucket
, oid
);
11391 rgw_cache_entry_info cache_info
;
11393 ret
= get_bucket_instance_from_oid(obj_ctx
, oid
, e
.info
, &e
.mtime
, &e
.attrs
, &cache_info
);
11394 e
.info
.ep_objv
= ot
.read_version
;
11397 info
.bucket
.tenant
= tenant
;
11398 info
.bucket
.name
= bucket_name
;
11399 // XXX and why return anything in case of an error anyway?
11408 list
<rgw_cache_entry_info
*> cache_info_entries
;
11409 cache_info_entries
.push_back(&entry_cache_info
);
11410 cache_info_entries
.push_back(&cache_info
);
11413 /* chain to both bucket entry point and bucket instance */
11414 if (!binfo_cache
->put(this, bucket_entry
, &e
, cache_info_entries
)) {
11415 ldout(cct
, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl
;
11421 int RGWRados::put_bucket_entrypoint_info(const string
& tenant_name
, const string
& bucket_name
, RGWBucketEntryPoint
& entry_point
,
11422 bool exclusive
, RGWObjVersionTracker
& objv_tracker
, real_time mtime
,
11423 map
<string
, bufferlist
> *pattrs
)
11426 ::encode(entry_point
, epbl
);
11427 string bucket_entry
;
11428 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
11429 return rgw_bucket_store_info(this, bucket_entry
, epbl
, exclusive
, pattrs
, &objv_tracker
, mtime
);
11432 int RGWRados::put_bucket_instance_info(RGWBucketInfo
& info
, bool exclusive
,
11433 real_time mtime
, map
<string
, bufferlist
> *pattrs
)
11435 info
.has_instance_obj
= true;
11438 ::encode(info
, bl
);
11440 string key
= info
.bucket
.get_key(); /* when we go through meta api, we don't use oid directly */
11441 int ret
= rgw_bucket_instance_store_info(this, key
, bl
, exclusive
, pattrs
, &info
.objv_tracker
, mtime
);
11442 if (ret
== -EEXIST
) {
11443 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
11444 * bucket operation on this specific bucket (e.g., being synced from the master), but
11445 * since bucket instace meta object is unique for this specific bucket instace, we don't
11446 * need to return an error.
11447 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
11448 * master, creating a bucket, sending bucket creation to the master, we create the bucket
11449 * locally, while in the sync thread we sync the new bucket.
11456 int RGWRados::put_linked_bucket_info(RGWBucketInfo
& info
, bool exclusive
, real_time mtime
, obj_version
*pep_objv
,
11457 map
<string
, bufferlist
> *pattrs
, bool create_entry_point
)
11459 bool create_head
= !info
.has_instance_obj
|| create_entry_point
;
11461 int ret
= put_bucket_instance_info(info
, exclusive
, mtime
, pattrs
);
11467 return 0; /* done! */
11469 RGWBucketEntryPoint entry_point
;
11470 entry_point
.bucket
= info
.bucket
;
11471 entry_point
.owner
= info
.owner
;
11472 entry_point
.creation_time
= info
.creation_time
;
11473 entry_point
.linked
= true;
11474 RGWObjVersionTracker ot
;
11475 if (pep_objv
&& !pep_objv
->tag
.empty()) {
11476 ot
.write_version
= *pep_objv
;
11478 ot
.generate_new_write_ver(cct
);
11480 *pep_objv
= ot
.write_version
;
11483 ret
= put_bucket_entrypoint_info(info
.bucket
.tenant
, info
.bucket
.name
, entry_point
, exclusive
, ot
, mtime
, NULL
);
11490 int RGWRados::omap_get_vals(rgw_raw_obj
& obj
, bufferlist
& header
, const string
& marker
, uint64_t count
, std::map
<string
, bufferlist
>& m
)
11493 int r
= get_raw_obj_ref(obj
, &ref
);
11498 r
= ref
.ioctx
.omap_get_vals(ref
.oid
, marker
, count
, &m
);
11506 int RGWRados::omap_get_all(rgw_raw_obj
& obj
, bufferlist
& header
,
11507 std::map
<string
, bufferlist
>& m
)
11510 int r
= get_raw_obj_ref(obj
, &ref
);
11515 #define MAX_OMAP_GET_ENTRIES 1024
11516 const int count
= MAX_OMAP_GET_ENTRIES
;
11517 string start_after
;
11520 std::map
<string
, bufferlist
> t
;
11521 r
= ref
.ioctx
.omap_get_vals(ref
.oid
, start_after
, count
, &t
);
11528 start_after
= t
.rbegin()->first
;
11529 m
.insert(t
.begin(), t
.end());
11534 int RGWRados::omap_set(rgw_raw_obj
& obj
, const std::string
& key
, bufferlist
& bl
)
11537 int r
= get_raw_obj_ref(obj
, &ref
);
11541 ldout(cct
, 15) << "omap_set obj=" << obj
<< " key=" << key
<< dendl
;
11543 map
<string
, bufferlist
> m
;
11546 r
= ref
.ioctx
.omap_set(ref
.oid
, m
);
11551 int RGWRados::omap_set(rgw_raw_obj
& obj
, std::map
<std::string
, bufferlist
>& m
)
11554 int r
= get_raw_obj_ref(obj
, &ref
);
11559 r
= ref
.ioctx
.omap_set(ref
.oid
, m
);
11564 int RGWRados::omap_del(rgw_raw_obj
& obj
, const std::string
& key
)
11567 int r
= get_raw_obj_ref(obj
, &ref
);
11575 r
= ref
.ioctx
.omap_rm_keys(ref
.oid
, k
);
11579 int RGWRados::update_containers_stats(map
<string
, RGWBucketEnt
>& m
)
11581 RGWObjectCtx
obj_ctx(this);
11583 map
<string
, RGWBucketEnt
>::iterator iter
;
11584 for (iter
= m
.begin(); iter
!= m
.end(); ++iter
) {
11585 RGWBucketEnt
& ent
= iter
->second
;
11586 rgw_bucket
& bucket
= ent
.bucket
;
11589 ent
.size_rounded
= 0;
11591 map
<string
, rgw_bucket_dir_header
> headers
;
11593 RGWBucketInfo bucket_info
;
11594 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
11599 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
11603 map
<string
, rgw_bucket_dir_header
>::iterator hiter
= headers
.begin();
11604 for (; hiter
!= headers
.end(); ++hiter
) {
11605 RGWObjCategory category
= main_category
;
11606 map
<uint8_t, struct rgw_bucket_category_stats
>::iterator iter
= (hiter
->second
.stats
).find((uint8_t)category
);
11607 if (iter
!= hiter
->second
.stats
.end()) {
11608 struct rgw_bucket_category_stats
& stats
= iter
->second
;
11609 ent
.count
+= stats
.num_entries
;
11610 ent
.size
+= stats
.total_size
;
11611 ent
.size_rounded
+= stats
.total_size_rounded
;
11619 int RGWRados::append_async(rgw_raw_obj
& obj
, size_t size
, bufferlist
& bl
)
11622 int r
= get_raw_obj_ref(obj
, &ref
);
11626 librados::Rados
*rad
= get_rados_handle();
11627 librados::AioCompletion
*completion
= rad
->aio_create_completion(NULL
, NULL
, NULL
);
11629 r
= ref
.ioctx
.aio_append(ref
.oid
, completion
, bl
, size
);
11630 completion
->release();
11634 int RGWRados::distribute(const string
& key
, bufferlist
& bl
)
11637 * we were called before watch was initialized. This can only happen if we're updating some system
11638 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
11639 * objects, they're currently only read on startup anyway.
11641 if (!watch_initialized
)
11645 pick_control_oid(key
, notify_oid
);
11647 ldout(cct
, 10) << "distributing notification oid=" << notify_oid
<< " bl.length()=" << bl
.length() << dendl
;
11648 return control_pool_ctx
.notify2(notify_oid
, bl
, 0, NULL
);
11651 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, RGWPoolIterCtx
& ctx
)
11653 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
11654 librados::NObjectIterator
& iter
= ctx
.iter
;
11656 int r
= open_pool_ctx(pool
, io_ctx
);
11660 iter
= io_ctx
.nobjects_begin();
11665 int RGWRados::pool_iterate(RGWPoolIterCtx
& ctx
, uint32_t num
, vector
<rgw_bucket_dir_entry
>& objs
,
11666 bool *is_truncated
, RGWAccessListFilter
*filter
)
11668 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
11669 librados::NObjectIterator
& iter
= ctx
.iter
;
11671 if (iter
== io_ctx
.nobjects_end())
11676 for (i
= 0; i
< num
&& iter
!= io_ctx
.nobjects_end(); ++i
, ++iter
) {
11677 rgw_bucket_dir_entry e
;
11679 string oid
= iter
->get_oid();
11680 ldout(cct
, 20) << "RGWRados::pool_iterate: got " << oid
<< dendl
;
11682 // fill it in with initial values; we may correct later
11683 if (filter
&& !filter
->filter(oid
, oid
))
11691 *is_truncated
= (iter
!= io_ctx
.nobjects_end());
11693 return objs
.size();
11695 struct RGWAccessListFilterPrefix
: public RGWAccessListFilter
{
11698 explicit RGWAccessListFilterPrefix(const string
& _prefix
) : prefix(_prefix
) {}
11699 bool filter(string
& name
, string
& key
) override
{
11700 return (prefix
.compare(key
.substr(0, prefix
.size())) == 0);
11704 int RGWRados::list_raw_objects(const rgw_pool
& pool
, const string
& prefix_filter
,
11705 int max
, RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
11706 bool *is_truncated
)
11708 RGWAccessListFilterPrefix
filter(prefix_filter
);
11710 if (!ctx
.initialized
) {
11711 int r
= pool_iterate_begin(pool
, ctx
.iter_ctx
);
11713 ldout(cct
, 10) << "failed to list objects pool_iterate_begin() returned r=" << r
<< dendl
;
11716 ctx
.initialized
= true;
11719 vector
<rgw_bucket_dir_entry
> objs
;
11720 int r
= pool_iterate(ctx
.iter_ctx
, max
, objs
, is_truncated
, &filter
);
11723 ldout(cct
, 10) << "failed to list objects pool_iterate returned r=" << r
<< dendl
;
11727 vector
<rgw_bucket_dir_entry
>::iterator iter
;
11728 for (iter
= objs
.begin(); iter
!= objs
.end(); ++iter
) {
11729 oids
.push_back(iter
->key
.name
);
11732 return oids
.size();
11735 int RGWRados::list_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& marker
, uint32_t max
,
11736 std::list
<rgw_bi_log_entry
>& result
, bool *truncated
)
11738 ldout(cct
, 20) << __func__
<< ": " << bucket_info
.bucket
<< " marker " << marker
<< " shard_id=" << shard_id
<< " max " << max
<< dendl
;
11741 librados::IoCtx index_ctx
;
11742 map
<int, string
> oids
;
11743 map
<int, cls_rgw_bi_log_list_ret
> bi_log_lists
;
11744 map
<int, string
> bucket_instance_ids
;
11745 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, &bucket_instance_ids
);
11749 BucketIndexShardsManager marker_mgr
;
11750 bool has_shards
= (oids
.size() > 1 || shard_id
>= 0);
11751 // If there are multiple shards for the bucket index object, the marker
11752 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
11753 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
11754 // only contain one record, and the key is the bucket instance id.
11755 r
= marker_mgr
.from_string(marker
, shard_id
);
11759 r
= CLSRGWIssueBILogList(index_ctx
, marker_mgr
, max
, oids
, bi_log_lists
, cct
->_conf
->rgw_bucket_index_max_aio
)();
11763 map
<int, list
<rgw_bi_log_entry
>::iterator
> vcurrents
;
11764 map
<int, list
<rgw_bi_log_entry
>::iterator
> vends
;
11766 *truncated
= false;
11768 map
<int, cls_rgw_bi_log_list_ret
>::iterator miter
= bi_log_lists
.begin();
11769 for (; miter
!= bi_log_lists
.end(); ++miter
) {
11770 int shard_id
= miter
->first
;
11771 vcurrents
[shard_id
] = miter
->second
.entries
.begin();
11772 vends
[shard_id
] = miter
->second
.entries
.end();
11774 *truncated
= (*truncated
|| miter
->second
.truncated
);
11779 bool has_more
= true;
11780 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator viter
;
11781 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator eiter
;
11782 while (total
< max
&& has_more
) {
11785 viter
= vcurrents
.begin();
11786 eiter
= vends
.begin();
11788 for (; total
< max
&& viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
11789 assert (eiter
!= vends
.end());
11791 int shard_id
= viter
->first
;
11792 list
<rgw_bi_log_entry
>::iterator
& liter
= viter
->second
;
11794 if (liter
== eiter
->second
){
11797 rgw_bi_log_entry
& entry
= *(liter
);
11800 snprintf(buf
, sizeof(buf
), "%d", shard_id
);
11802 build_bucket_index_marker(buf
, entry
.id
, &tmp_id
);
11803 entry
.id
.swap(tmp_id
);
11805 marker_mgr
.add(shard_id
, entry
.id
);
11806 result
.push_back(entry
);
11814 for (viter
= vcurrents
.begin(), eiter
= vends
.begin(); viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
11815 assert (eiter
!= vends
.end());
11816 *truncated
= (*truncated
|| (viter
->second
!= eiter
->second
));
11820 // Refresh marker, if there are multiple shards, the output will look like
11821 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
11822 // if there is no sharding, the simply marker (without oid) is returned
11824 marker_mgr
.to_string(&marker
);
11826 if (!result
.empty()) {
11827 marker
= result
.rbegin()->id
;
11834 int RGWRados::trim_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& start_marker
, string
& end_marker
)
11836 librados::IoCtx index_ctx
;
11837 map
<int, string
> bucket_objs
;
11838 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
11842 BucketIndexShardsManager start_marker_mgr
;
11843 r
= start_marker_mgr
.from_string(start_marker
, shard_id
);
11846 BucketIndexShardsManager end_marker_mgr
;
11847 r
= end_marker_mgr
.from_string(end_marker
, shard_id
);
11851 return CLSRGWIssueBILogTrim(index_ctx
, start_marker_mgr
, end_marker_mgr
, bucket_objs
,
11852 cct
->_conf
->rgw_bucket_index_max_aio
)();
11855 int RGWRados::bi_get_instance(const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, rgw_bucket_dir_entry
*dirent
)
11858 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
11863 rgw_cls_bi_entry bi_entry
;
11864 r
= bi_get(obj
.bucket
, obj
, InstanceIdx
, &bi_entry
);
11865 if (r
< 0 && r
!= -ENOENT
) {
11866 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
11871 bufferlist::iterator iter
= bi_entry
.data
.begin();
11873 ::decode(*dirent
, iter
);
11874 } catch (buffer::error
& err
) {
11875 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
11882 int RGWRados::bi_get(rgw_bucket
& bucket
, rgw_obj
& obj
, BIIndexType index_type
, rgw_cls_bi_entry
*entry
)
11884 BucketShard
bs(this);
11885 int ret
= bs
.init(bucket
, obj
);
11887 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
11891 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
11893 ret
= cls_rgw_bi_get(bs
.index_ctx
, bs
.bucket_obj
, index_type
, key
, entry
);
11900 void RGWRados::bi_put(ObjectWriteOperation
& op
, BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
11902 cls_rgw_bi_put(op
, bs
.bucket_obj
, entry
);
11905 int RGWRados::bi_put(BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
11907 int ret
= cls_rgw_bi_put(bs
.index_ctx
, bs
.bucket_obj
, entry
);
11914 int RGWRados::bi_put(rgw_bucket
& bucket
, rgw_obj
& obj
, rgw_cls_bi_entry
& entry
)
11916 BucketShard
bs(this);
11917 int ret
= bs
.init(bucket
, obj
);
11919 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
11923 return bi_put(bs
, entry
);
11926 int RGWRados::bi_list(rgw_bucket
& bucket
, const string
& obj_name
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
11928 rgw_obj
obj(bucket
, obj_name
);
11929 BucketShard
bs(this);
11930 int ret
= bs
.init(bucket
, obj
);
11932 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
11936 ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, obj_name
, marker
, max
, entries
, is_truncated
);
11943 int RGWRados::bi_list(BucketShard
& bs
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
11945 int ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, filter_obj
, marker
, max
, entries
, is_truncated
);
11952 int RGWRados::bi_remove(BucketShard
& bs
)
11954 int ret
= bs
.index_ctx
.remove(bs
.bucket_obj
);
11955 if (ret
== -ENOENT
) {
11959 ldout(cct
, 5) << "bs.index_ctx.remove(" << bs
.bucket_obj
<< ") returned ret=" << ret
<< dendl
;
11966 int RGWRados::bi_list(rgw_bucket
& bucket
, int shard_id
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
11968 BucketShard
bs(this);
11969 int ret
= bs
.init(bucket
, shard_id
);
11971 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
11975 return bi_list(bs
, filter_obj
, marker
, max
, entries
, is_truncated
);
11978 int RGWRados::gc_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
11980 return gc_pool_ctx
.operate(oid
, op
);
11983 int RGWRados::gc_aio_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
11985 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
11986 int r
= gc_pool_ctx
.aio_operate(oid
, c
, op
);
11991 int RGWRados::gc_operate(string
& oid
, librados::ObjectReadOperation
*op
, bufferlist
*pbl
)
11993 return gc_pool_ctx
.operate(oid
, op
, pbl
);
11996 int RGWRados::list_gc_objs(int *index
, string
& marker
, uint32_t max
, bool expired_only
, std::list
<cls_rgw_gc_obj_info
>& result
, bool *truncated
)
11998 return gc
->list(index
, marker
, max
, expired_only
, result
, truncated
);
12001 int RGWRados::process_gc()
12003 return gc
->process();
12006 int RGWRados::list_lc_progress(const string
& marker
, uint32_t max_entries
, map
<string
, int> *progress_map
)
12008 return lc
->list_lc_progress(marker
, max_entries
, progress_map
);
12011 int RGWRados::process_lc()
12013 return lc
->process();
12016 int RGWRados::process_expire_objects()
12018 obj_expirer
->inspect_all_shards(utime_t(), ceph_clock_now());
12022 int RGWRados::cls_rgw_init_index(librados::IoCtx
& index_ctx
, librados::ObjectWriteOperation
& op
, string
& oid
)
12025 cls_rgw_bucket_init(op
);
12026 return index_ctx
.operate(oid
, &op
);
12029 int RGWRados::cls_obj_prepare_op(BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
12030 rgw_obj
& obj
, uint16_t bilog_flags
)
12032 ObjectWriteOperation o
;
12033 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
12034 cls_rgw_bucket_prepare_op(o
, op
, tag
, key
, obj
.key
.get_loc(), get_zone().log_data
, bilog_flags
);
12035 return bs
.index_ctx
.operate(bs
.bucket_obj
, &o
);
12038 int RGWRados::cls_obj_complete_op(BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
12039 int64_t pool
, uint64_t epoch
,
12040 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
12041 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
)
12043 list
<cls_rgw_obj_key
> *pro
= NULL
;
12044 list
<cls_rgw_obj_key
> ro
;
12047 for (auto iter
= remove_objs
->begin(); iter
!= remove_objs
->end(); ++iter
) {
12048 ro
.push_back(*iter
);
12053 ObjectWriteOperation o
;
12054 rgw_bucket_dir_entry_meta dir_meta
;
12055 dir_meta
= ent
.meta
;
12056 dir_meta
.category
= category
;
12058 rgw_bucket_entry_ver ver
;
12061 cls_rgw_obj_key
key(ent
.key
.name
, ent
.key
.instance
);
12062 cls_rgw_bucket_complete_op(o
, op
, tag
, ver
, key
, dir_meta
, pro
,
12063 get_zone().log_data
, bilog_flags
);
12065 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
12066 int ret
= bs
.index_ctx
.aio_operate(bs
.bucket_obj
, c
, &o
);
12071 int RGWRados::cls_obj_complete_add(BucketShard
& bs
, string
& tag
,
12072 int64_t pool
, uint64_t epoch
,
12073 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
12074 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
)
12076 return cls_obj_complete_op(bs
, CLS_RGW_OP_ADD
, tag
, pool
, epoch
, ent
, category
, remove_objs
, bilog_flags
);
12079 int RGWRados::cls_obj_complete_del(BucketShard
& bs
, string
& tag
,
12080 int64_t pool
, uint64_t epoch
,
12082 real_time
& removed_mtime
,
12083 list
<rgw_obj_index_key
> *remove_objs
,
12084 uint16_t bilog_flags
)
12086 rgw_bucket_dir_entry ent
;
12087 ent
.meta
.mtime
= removed_mtime
;
12088 obj
.key
.get_index_key(&ent
.key
);
12089 return cls_obj_complete_op(bs
, CLS_RGW_OP_DEL
, tag
, pool
, epoch
, ent
, RGW_OBJ_CATEGORY_NONE
, remove_objs
, bilog_flags
);
12092 int RGWRados::cls_obj_complete_cancel(BucketShard
& bs
, string
& tag
, rgw_obj
& obj
, uint16_t bilog_flags
)
12094 rgw_bucket_dir_entry ent
;
12095 obj
.key
.get_index_key(&ent
.key
);
12096 return cls_obj_complete_op(bs
, CLS_RGW_OP_CANCEL
, tag
, -1 /* pool id */, 0, ent
, RGW_OBJ_CATEGORY_NONE
, NULL
, bilog_flags
);
12099 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo
& bucket_info
, uint64_t timeout
)
12101 librados::IoCtx index_ctx
;
12102 map
<int, string
> bucket_objs
;
12103 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
12107 return CLSRGWIssueSetTagTimeout(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
, timeout
)();
12110 int RGWRados::cls_bucket_list(RGWBucketInfo
& bucket_info
, int shard_id
, rgw_obj_index_key
& start
, const string
& prefix
,
12111 uint32_t num_entries
, bool list_versions
, map
<string
, rgw_bucket_dir_entry
>& m
,
12112 bool *is_truncated
, rgw_obj_index_key
*last_entry
,
12113 bool (*force_check_filter
)(const string
& name
))
12115 ldout(cct
, 10) << "cls_bucket_list " << bucket_info
.bucket
<< " start " << start
.name
<< "[" << start
.instance
<< "] num_entries " << num_entries
<< dendl
;
12117 librados::IoCtx index_ctx
;
12118 // key - oid (for different shards if there is any)
12119 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12120 map
<int, string
> oids
;
12121 map
<int, struct rgw_cls_list_ret
> list_results
;
12122 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
12126 cls_rgw_obj_key
start_key(start
.name
, start
.instance
);
12127 r
= CLSRGWIssueBucketList(index_ctx
, start_key
, prefix
, num_entries
, list_versions
,
12128 oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12132 // Create a list of iterators that are used to iterate each shard
12133 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vcurrents(list_results
.size());
12134 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vends(list_results
.size());
12135 vector
<string
> vnames(list_results
.size());
12136 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
12137 *is_truncated
= false;
12138 for (; iter
!= list_results
.end(); ++iter
) {
12139 vcurrents
.push_back(iter
->second
.dir
.m
.begin());
12140 vends
.push_back(iter
->second
.dir
.m
.end());
12141 vnames
.push_back(oids
[iter
->first
]);
12142 *is_truncated
= (*is_truncated
|| iter
->second
.is_truncated
);
12145 // Create a map to track the next candidate entry from each shard, if the entry
12146 // from a specified shard is selected/erased, the next entry from that shard will
12147 // be inserted for next round selection
12148 map
<string
, size_t> candidates
;
12149 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
12150 if (vcurrents
[i
] != vends
[i
]) {
12151 candidates
[vcurrents
[i
]->first
] = i
;
12155 map
<string
, bufferlist
> updates
;
12156 uint32_t count
= 0;
12157 while (count
< num_entries
&& !candidates
.empty()) {
12159 // Select the next one
12160 int pos
= candidates
.begin()->second
;
12161 const string
& name
= vcurrents
[pos
]->first
;
12162 struct rgw_bucket_dir_entry
& dirent
= vcurrents
[pos
]->second
;
12164 bool force_check
= force_check_filter
&& force_check_filter(dirent
.key
.name
);
12165 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) || !dirent
.pending_map
.empty() || force_check
) {
12166 /* there are uncommitted ops. We need to check the current state,
12167 * and if the tags are old we need to do cleanup as well. */
12168 librados::IoCtx sub_ctx
;
12169 sub_ctx
.dup(index_ctx
);
12170 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
, updates
[vnames
[pos
]]);
12171 if (r
< 0 && r
!= -ENOENT
) {
12176 ldout(cct
, 10) << "RGWRados::cls_bucket_list: got " << dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
12177 m
[name
] = std::move(dirent
);
12181 // Refresh the candidates map
12182 candidates
.erase(candidates
.begin());
12184 if (vcurrents
[pos
] != vends
[pos
]) {
12185 candidates
[vcurrents
[pos
]->first
] = pos
;
12189 // Suggest updates if there is any
12190 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
12191 for (; miter
!= updates
.end(); ++miter
) {
12192 if (miter
->second
.length()) {
12193 ObjectWriteOperation o
;
12194 cls_rgw_suggest_changes(o
, miter
->second
);
12195 // we don't care if we lose suggested updates, send them off blindly
12196 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
12197 index_ctx
.aio_operate(miter
->first
, c
, &o
);
12202 // Check if all the returned entries are consumed or not
12203 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
12204 if (vcurrents
[i
] != vends
[i
])
12205 *is_truncated
= true;
12208 *last_entry
= m
.rbegin()->first
;
12213 int RGWRados::cls_obj_usage_log_add(const string
& oid
, rgw_usage_log_info
& info
)
12215 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
12219 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12224 ObjectWriteOperation op
;
12225 cls_rgw_usage_log_add(op
, info
);
12227 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
12231 int RGWRados::cls_obj_usage_log_read(string
& oid
, string
& user
, uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
12232 string
& read_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
, bool *is_truncated
)
12234 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
12238 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12243 *is_truncated
= false;
12245 r
= cls_rgw_usage_log_read(ref
.ioctx
, ref
.oid
, user
, start_epoch
, end_epoch
,
12246 max_entries
, read_iter
, usage
, is_truncated
);
12251 int RGWRados::cls_obj_usage_log_trim(string
& oid
, string
& user
, uint64_t start_epoch
, uint64_t end_epoch
)
12253 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
12257 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12262 ObjectWriteOperation op
;
12263 cls_rgw_usage_log_trim(op
, user
, start_epoch
, end_epoch
);
12265 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
12269 int RGWRados::remove_objs_from_index(RGWBucketInfo
& bucket_info
, list
<rgw_obj_index_key
>& oid_list
)
12271 librados::IoCtx index_ctx
;
12274 uint8_t suggest_flag
= (get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
12276 int r
= open_bucket_index(bucket_info
, index_ctx
, dir_oid
);
12280 bufferlist updates
;
12282 for (auto iter
= oid_list
.begin(); iter
!= oid_list
.end(); ++iter
) {
12283 rgw_bucket_dir_entry entry
;
12285 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info
.bucket
<< " obj=" << entry
.key
.name
<< ":" << entry
.key
.instance
<< dendl
;
12286 entry
.ver
.epoch
= (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
12287 updates
.append(CEPH_RGW_REMOVE
| suggest_flag
);
12288 ::encode(entry
, updates
);
12293 r
= index_ctx
.exec(dir_oid
, RGW_CLASS
, RGW_DIR_SUGGEST_CHANGES
, updates
, out
);
12298 int RGWRados::check_disk_state(librados::IoCtx io_ctx
,
12299 const RGWBucketInfo
& bucket_info
,
12300 rgw_bucket_dir_entry
& list_state
,
12301 rgw_bucket_dir_entry
& object
,
12302 bufferlist
& suggested_updates
)
12304 const rgw_bucket
& bucket
= bucket_info
.bucket
;
12305 uint8_t suggest_flag
= (get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
12309 rgw_obj
obj(bucket
, list_state
.key
);
12312 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
12314 if (loc
!= list_state
.locator
) {
12315 ldout(cct
, 0) << "WARNING: generated locator (" << loc
<< ") is different from listed locator (" << list_state
.locator
<< ")" << dendl
;
12318 io_ctx
.locator_set_key(list_state
.locator
);
12320 RGWObjState
*astate
= NULL
;
12321 RGWObjectCtx
rctx(this);
12322 int r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
12326 list_state
.pending_map
.clear(); // we don't need this and it inflates size
12327 if (!astate
->exists
) {
12328 /* object doesn't exist right now -- hopefully because it's
12329 * marked as !exists and got deleted */
12330 if (list_state
.exists
) {
12331 /* FIXME: what should happen now? Work out if there are any
12332 * non-bad ways this could happen (there probably are, but annoying
12335 // encode a suggested removal of that key
12336 list_state
.ver
.epoch
= io_ctx
.get_last_version();
12337 list_state
.ver
.pool
= io_ctx
.get_id();
12338 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE
, list_state
, suggested_updates
);
12343 string content_type
;
12346 object
.meta
.size
= astate
->size
;
12347 object
.meta
.accounted_size
= astate
->accounted_size
;
12348 object
.meta
.mtime
= astate
->mtime
;
12350 map
<string
, bufferlist
>::iterator iter
= astate
->attrset
.find(RGW_ATTR_ETAG
);
12351 if (iter
!= astate
->attrset
.end()) {
12352 etag
= iter
->second
.c_str();
12354 iter
= astate
->attrset
.find(RGW_ATTR_CONTENT_TYPE
);
12355 if (iter
!= astate
->attrset
.end()) {
12356 content_type
= iter
->second
.c_str();
12358 iter
= astate
->attrset
.find(RGW_ATTR_ACL
);
12359 if (iter
!= astate
->attrset
.end()) {
12360 r
= decode_policy(iter
->second
, &owner
);
12362 dout(0) << "WARNING: could not decode policy for object: " << obj
<< dendl
;
12366 if (astate
->has_manifest
) {
12367 RGWObjManifest::obj_iterator miter
;
12368 RGWObjManifest
& manifest
= astate
->manifest
;
12369 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
12370 const rgw_raw_obj
& raw_loc
= miter
.get_location().get_raw_obj(this);
12372 rgw_raw_obj_to_obj(manifest
.get_obj().bucket
, raw_loc
, &loc
);
12374 if (loc
.key
.ns
== RGW_OBJ_NS_MULTIPART
) {
12375 dout(10) << "check_disk_state(): removing manifest part from index: " << loc
<< dendl
;
12376 r
= delete_obj_index(loc
);
12378 dout(0) << "WARNING: delete_obj_index() returned r=" << r
<< dendl
;
12384 object
.meta
.etag
= etag
;
12385 object
.meta
.content_type
= content_type
;
12386 object
.meta
.owner
= owner
.get_id().to_str();
12387 object
.meta
.owner_display_name
= owner
.get_display_name();
12389 // encode suggested updates
12390 list_state
.ver
.pool
= io_ctx
.get_id();
12391 list_state
.ver
.epoch
= astate
->epoch
;
12392 list_state
.meta
.size
= object
.meta
.size
;
12393 list_state
.meta
.accounted_size
= object
.meta
.accounted_size
;
12394 list_state
.meta
.mtime
= object
.meta
.mtime
;
12395 list_state
.meta
.category
= main_category
;
12396 list_state
.meta
.etag
= etag
;
12397 list_state
.meta
.content_type
= content_type
;
12398 if (astate
->obj_tag
.length() > 0)
12399 list_state
.tag
= astate
->obj_tag
.c_str();
12400 list_state
.meta
.owner
= owner
.get_id().to_str();
12401 list_state
.meta
.owner_display_name
= owner
.get_display_name();
12403 list_state
.exists
= true;
12404 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE
| suggest_flag
, list_state
, suggested_updates
);
12408 int RGWRados::cls_bucket_head(const RGWBucketInfo
& bucket_info
, int shard_id
, map
<string
, struct rgw_bucket_dir_header
>& headers
, map
<int, string
> *bucket_instance_ids
)
12410 librados::IoCtx index_ctx
;
12411 map
<int, string
> oids
;
12412 map
<int, struct rgw_cls_list_ret
> list_results
;
12413 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, list_results
, shard_id
, bucket_instance_ids
);
12417 r
= CLSRGWIssueGetDirHeader(index_ctx
, oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12421 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
12422 for(; iter
!= list_results
.end(); ++iter
) {
12423 headers
[oids
[iter
->first
]] = iter
->second
.dir
.header
;
12428 int RGWRados::cls_bucket_head_async(const RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetDirHeader_CB
*ctx
, int *num_aio
)
12430 librados::IoCtx index_ctx
;
12431 map
<int, string
> bucket_objs
;
12432 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
12436 map
<int, string
>::iterator iter
= bucket_objs
.begin();
12437 for (; iter
!= bucket_objs
.end(); ++iter
) {
12438 r
= cls_rgw_get_dir_header_async(index_ctx
, iter
->second
, static_cast<RGWGetDirHeader_CB
*>(ctx
->get()));
12449 int RGWRados::cls_user_get_header(const string
& user_id
, cls_user_header
*header
)
12451 string buckets_obj_id
;
12452 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
12453 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
12457 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12462 librados::ObjectReadOperation op
;
12464 ::cls_user_get_header(op
, header
, &rc
);
12466 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &ibl
);
// Async variant of cls_user_get_header: same object resolution, but the
// read is dispatched through ::cls_user_get_header_async with a callback
// object that is invoked on completion.
// NOTE(review): mangled excerpt — ref/pool declarations, braces and error
// checks are in lines missing here.
12475 int RGWRados::cls_user_get_header_async(const string
& user_id
, RGWGetUserHeader_CB
*ctx
)
12477 string buckets_obj_id
;
12478 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
12479 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
12483 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12488 r
= ::cls_user_get_header_async(ref
.ioctx
, ref
.oid
, ctx
);
// Recompute one bucket's usage stats from its index shard headers and push
// the aggregate into the user's cls_user buckets object. Sums size,
// size_rounded and entry count over every category of every shard header,
// then writes a single cls_user_bucket_entry via cls_user_update_buckets
// with add=false (replace, not increment).
// NOTE(review): mangled excerpt — braces, r<0 checks after both calls and
// the final return are in lines missing here.
12495 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj
& user_obj
, const RGWBucketInfo
& bucket_info
)
12497 map
<string
, struct rgw_bucket_dir_header
> headers
;
// RGW_NO_SHARD => collect headers for all shards of the bucket.
12498 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
12500 ldout(cct
, 20) << "cls_bucket_header() returned " << r
<< dendl
;
12504 cls_user_bucket_entry entry
;
12506 bucket_info
.bucket
.convert(&entry
.bucket
);
// Aggregate per-category stats across all shard headers into `entry`.
12508 map
<string
, struct rgw_bucket_dir_header
>::iterator hiter
= headers
.begin();
12509 for (; hiter
!= headers
.end(); ++hiter
) {
12510 map
<uint8_t, struct rgw_bucket_category_stats
>::iterator iter
= hiter
->second
.stats
.begin();
12511 for (; iter
!= hiter
->second
.stats
.end(); ++iter
) {
12512 struct rgw_bucket_category_stats
& header_stats
= iter
->second
;
12513 entry
.size
+= header_stats
.total_size
;
12514 entry
.size_rounded
+= header_stats
.total_size_rounded
;
12515 entry
.count
+= header_stats
.num_entries
;
12519 list
<cls_user_bucket_entry
> entries
;
12520 entries
.push_back(entry
);
// add=false: overwrite the stored stats for this bucket entry.
12522 r
= cls_user_update_buckets(user_obj
, entries
, false);
12524 ldout(cct
, 20) << "cls_user_update_buckets() returned " << r
<< dendl
;
// Paginated listing of a user's bucket entries from the cls_user object:
// lists from in_marker up to end_marker / max_entries, returning the
// entries plus an out_marker and truncated flag for continuation.
// NOTE(review): mangled excerpt — ref/pool/rc/ibl declarations, braces and
// error checks are in lines missing here.
12531 int RGWRados::cls_user_list_buckets(rgw_raw_obj
& obj
,
12532 const string
& in_marker
,
12533 const string
& end_marker
,
12534 const int max_entries
,
12535 list
<cls_user_bucket_entry
>& entries
,
12536 string
* const out_marker
,
12537 bool * const truncated
)
12541 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12546 librados::ObjectReadOperation op
;
// cls_user objclass listing call; `rc` receives the per-op result code.
12549 cls_user_bucket_list(op
, in_marker
, end_marker
, max_entries
, entries
, out_marker
, truncated
, &rc
);
12551 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &ibl
);
// Write a batch of bucket entries into a user's cls_user object.
// `add` selects the objclass semantics: accumulate into existing entries
// vs. set/replace them (see cls_user_set_buckets).
// NOTE(review): mangled excerpt — ref/pool declarations, braces and error
// checks are in lines missing here.
12560 int RGWRados::cls_user_update_buckets(rgw_raw_obj
& obj
, list
<cls_user_bucket_entry
>& entries
, bool add
)
12564 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12569 librados::ObjectWriteOperation op
;
12570 cls_user_set_buckets(op
, entries
, add
);
12571 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
// Convenience wrapper: resolve the user's buckets object in the user_uid
// pool and mark its stats sync as complete via cls_user_complete_stats_sync.
// NOTE(review): mangled excerpt — braces are in lines missing here.
12578 int RGWRados::complete_sync_user_stats(const rgw_user
& user_id
)
12580 string buckets_obj_id
;
12581 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
12582 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
12583 return cls_user_complete_stats_sync(obj
);
// Issue the cls_user "complete stats sync" write op against the user's
// buckets object (marks the header's stats as synced server-side).
// NOTE(review): mangled excerpt — ref/pool declarations, braces and error
// checks are in lines missing here.
12586 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj
& obj
)
12590 int r
= get_raw_obj_ref(obj
, &ref
, &pool
);
12595 librados::ObjectWriteOperation op
;
12596 ::cls_user_complete_stats_sync(op
);
12597 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
// Add one bucket entry to the user's cls_user object: wraps the entry in a
// single-element list and delegates to cls_user_update_buckets with
// add=true.
// NOTE(review): mangled excerpt — braces are in lines missing here.
12604 int RGWRados::cls_user_add_bucket(rgw_raw_obj
& obj
, const cls_user_bucket_entry
& entry
)
12606 list
<cls_user_bucket_entry
> l
;
12607 l
.push_back(entry
);
12609 return cls_user_update_buckets(obj
, l
, true);
// Remove one bucket entry from the user's cls_user object via the
// ::cls_user_remove_bucket objclass write op.
// NOTE(review): uses get_system_obj_ref here (vs get_raw_obj_ref in the
// sibling cls_user_* methods) — confirm this asymmetry is intentional.
// Mangled excerpt — ref/p declarations, braces and error checks are in
// lines missing here.
12612 int RGWRados::cls_user_remove_bucket(rgw_raw_obj
& obj
, const cls_user_bucket
& bucket
)
12616 int r
= get_system_obj_ref(obj
, &ref
, &p
);
12621 librados::ObjectWriteOperation op
;
12622 ::cls_user_remove_bucket(op
, bucket
);
12623 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
// Quota gate for a single incoming object: delegates to the quota handler
// with num_objs=1 and the object's size.
// NOTE(review): mangled excerpt — braces are in lines missing here.
12630 int RGWRados::check_quota(const rgw_user
& bucket_owner
, rgw_bucket
& bucket
,
12631 RGWQuotaInfo
& user_quota
, RGWQuotaInfo
& bucket_quota
, uint64_t obj_size
)
12633 return quota_handler
->check_quota(bucket_owner
, bucket
, user_quota
, bucket_quota
, 1, obj_size
);
// Compute the index object name(s) for a bucket. With no sharding the base
// oid itself is used (slot 0); otherwise names are "<base>.<shard>".
// shard_id < 0 means "all shards"; a specific shard_id yields just that one.
// NOTE(review): the bounds check uses `> num_shards`, so
// shard_id == num_shards slips through — confirm against upstream whether
// `>=` was intended. Mangled excerpt — braces, the unsharded early path
// and the error return are in lines missing here.
12636 void RGWRados::get_bucket_index_objects(const string
& bucket_oid_base
,
12637 uint32_t num_shards
, map
<int, string
>& bucket_objects
, int shard_id
)
12640 bucket_objects
[0] = bucket_oid_base
;
// VLA sized base + 32 leaves room for ".<decimal shard>".
12642 char buf
[bucket_oid_base
.size() + 32];
12643 if (shard_id
< 0) {
12644 for (uint32_t i
= 0; i
< num_shards
; ++i
) {
12645 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), i
);
12646 bucket_objects
[i
] = buf
;
12649 if ((uint32_t)shard_id
> num_shards
) {
12652 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
12653 bucket_objects
[shard_id
] = buf
;
// Build per-shard bucket instance ids "<name>:<bucket_id>[:<shard>]".
// Unsharded buckets get the plain id in slot 0; shard_id < 0 enumerates all
// shards; otherwise only the requested shard is emitted.
// NOTE(review): same `> num_shards` vs `>=` question as
// get_bucket_index_objects — keep the two consistent. Mangled excerpt —
// `buf` declaration, braces and early returns are in lines missing here.
12658 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo
& bucket_info
, int shard_id
, map
<int, string
> *result
)
12660 const rgw_bucket
& bucket
= bucket_info
.bucket
;
12661 string plain_id
= bucket
.name
+ ":" + bucket
.bucket_id
;
12662 if (!bucket_info
.num_shards
) {
12663 (*result
)[0] = plain_id
;
12666 if (shard_id
< 0) {
12667 for (uint32_t i
= 0; i
< bucket_info
.num_shards
; ++i
) {
12668 snprintf(buf
, sizeof(buf
), ":%d", i
);
12669 (*result
)[i
] = plain_id
+ buf
;
12672 if ((uint32_t)shard_id
> bucket_info
.num_shards
) {
12675 snprintf(buf
, sizeof(buf
), ":%d", shard_id
);
12676 (*result
)[shard_id
] = plain_id
+ buf
;
// Map an object key to its bucket-index shard. For the MOD hash type:
// Linux string hash, mixed by folding the low byte into the top byte, then
// reduced modulo a prime and modulo num_shards. Unsharded buckets are
// handled by the (missing-here) early branch.
// NOTE(review): mangled excerpt — the out-parameter declaration in the
// signature (orig. line 12682), braces, default case and returns are in
// lines missing here.
12681 int RGWRados::get_target_shard_id(const RGWBucketInfo
& bucket_info
, const string
& obj_key
,
12685 switch (bucket_info
.bucket_index_shard_hash_type
) {
12686 case RGWBucketInfo::MOD
:
12687 if (!bucket_info
.num_shards
) {
12692 uint32_t sid
= ceph_str_hash_linux(obj_key
.c_str(), obj_key
.size());
// Fold the low byte into bits 24..31 to spread similar keys.
12693 uint32_t sid2
= sid
^ ((sid
& 0xFF) << 24);
12694 sid
= sid2
% MAX_BUCKET_INDEX_SHARDS_PRIME
% bucket_info
.num_shards
;
12696 *shard_id
= (int)sid
;
// Resolve one bucket-index object name from an explicit shard id: the bare
// base oid when unsharded, otherwise "<base>.<shard_id>". The branch that
// selects between the two assignments is in lines missing from this
// excerpt.
// NOTE(review): mangled excerpt — braces and the num_shards branch are
// missing here.
12706 void RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, uint32_t num_shards
,
12707 int shard_id
, string
*bucket_obj
)
12710 // By default with no sharding, we use the bucket oid as itself
12711 (*bucket_obj
) = bucket_oid_base
;
12713 char buf
[bucket_oid_base
.size() + 32];
12714 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
12715 (*bucket_obj
) = buf
;
// Resolve the bucket-index object name for an object key by hashing:
// same MOD hash pipeline as get_target_shard_id (linux hash -> low-byte
// fold -> prime mod -> num_shards mod), producing both the shard oid
// "<base>.<sid>" and the shard id out-parameter.
// NOTE(review): mangled excerpt — braces, the unsharded branch's body,
// default case and returns are in lines missing here. Keep this hash
// pipeline byte-for-byte in sync with get_target_shard_id, or existing
// buckets' keys will map to wrong shards.
12719 int RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, const string
& obj_key
,
12720 uint32_t num_shards
, RGWBucketInfo::BIShardsHashType hash_type
, string
*bucket_obj
, int *shard_id
)
12723 switch (hash_type
) {
12724 case RGWBucketInfo::MOD
:
12726 // By default with no sharding, we use the bucket oid as itself
12727 (*bucket_obj
) = bucket_oid_base
;
12732 uint32_t sid
= ceph_str_hash_linux(obj_key
.c_str(), obj_key
.size());
12733 uint32_t sid2
= sid
^ ((sid
& 0xFF) << 24);
12734 sid
= sid2
% MAX_BUCKET_INDEX_SHARDS_PRIME
% num_shards
;
12735 char buf
[bucket_oid_base
.size() + 32];
12736 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), sid
);
12737 (*bucket_obj
) = buf
;
12739 *shard_id
= (int)sid
;
// Build the statelog object name for a shard:
// RGW_STATELOG_OBJ_PREFIX + module_name + "." + <shard number>.
// NOTE(review): mangled excerpt — the `buf` declaration and the append of
// buf onto `oid` are in lines missing here.
12749 void RGWStateLog::oid_str(int shard
, string
& oid
) {
12750 oid
= RGW_STATELOG_OBJ_PREFIX
+ module_name
+ ".";
12752 snprintf(buf
, sizeof(buf
), "%d", shard
);
// Hash an object name to a statelog shard: Linux string hash modulo
// num_shards.
12756 int RGWStateLog::get_shard_num(const string
& object
) {
12757 uint32_t val
= ceph_str_hash_linux(object
.c_str(), object
.length());
12758 return val
% num_shards
;
// Convenience: object name -> shard number -> shard oid string.
// NOTE(review): mangled excerpt — the local `oid` declaration and the
// return are in lines missing here.
12761 string
RGWStateLog::get_oid(const string
& object
) {
12762 int shard
= get_shard_num(object
);
12764 oid_str(shard
, oid
);
// Open an IoCtx on the store's log pool; logs an error if the pool cannot
// be opened.
// NOTE(review): mangled excerpt — the `pool` declaration, the r<0 branch
// structure and the returns are in lines missing here.
12768 int RGWStateLog::open_ioctx(librados::IoCtx
& ioctx
) {
12770 store
->get_log_pool(pool
);
12771 int r
= rgw_init_ioctx(store
->get_rados_handle(), pool
, ioctx
);
12773 lderr(store
->ctx()) << "ERROR: could not open rados pool" << dendl
;
// Record a statelog entry (client_id/op_id/object -> state) in the object's
// shard. If check_state is non-null the write op first asserts the current
// stored state via cls_statelog_check_state, making the update conditional.
// Rejects empty identifiers up front.
// NOTE(review): mangled excerpt — the rest of the empty-check condition
// (orig. lines 12783-12784), the `nobl` declaration, braces and error
// returns are in lines missing here.
12779 int RGWStateLog::store_entry(const string
& client_id
, const string
& op_id
, const string
& object
,
12780 uint32_t state
, bufferlist
*bl
, uint32_t *check_state
)
12782 if (client_id
.empty() ||
12785 ldout(store
->ctx(), 0) << "client_id / op_id / object is empty" << dendl
;
12788 librados::IoCtx ioctx
;
12789 int r
= open_ioctx(ioctx
);
12793 string oid
= get_oid(object
);
12795 librados::ObjectWriteOperation op
;
// Conditional update: fail the op if the stored state differs.
12797 cls_statelog_check_state(op
, client_id
, op_id
, object
, *check_state
);
12799 utime_t ts
= ceph_clock_now();
// `nobl` (empty bufferlist, declared in a missing line) stands in when the
// caller passes no payload.
12801 cls_statelog_add(op
, client_id
, op_id
, object
, ts
, state
, (bl
? *bl
: nobl
));
12802 r
= ioctx
.operate(oid
, &op
);
// Remove a statelog entry by (object, op_id) from the object's shard.
// Rejects empty identifiers up front, mirroring store_entry.
// NOTE(review): mangled excerpt — the rest of the empty-check condition,
// braces and error returns are in lines missing here.
12810 int RGWStateLog::remove_entry(const string
& client_id
, const string
& op_id
, const string
& object
)
12812 if (client_id
.empty() ||
12815 ldout(store
->ctx(), 0) << "client_id / op_id / object is empty" << dendl
;
12818 librados::IoCtx ioctx
;
12819 int r
= open_ioctx(ioctx
);
12823 string oid
= get_oid(object
);
12825 librados::ObjectWriteOperation op
;
12826 cls_statelog_remove_by_object(op
, object
, op_id
);
12827 r
= ioctx
.operate(oid
, &op
);
// Begin a statelog listing: allocates a list_state cursor holding the
// filter (client_id/op_id/object) and the shard range to walk — all shards
// when no object filter is given, otherwise just the object's shard — and
// hands it back through *handle. Ownership of the allocation passes to the
// caller; it is presumably released by finish_list_entries.
// NOTE(review): mangled excerpt — the `handle` parameter in the signature
// (orig. lines 12836-12837), braces and the else keyword are in lines
// missing here.
12835 void RGWStateLog::init_list_entries(const string
& client_id
, const string
& op_id
, const string
& object
,
12838 list_state
*state
= new list_state
;
12839 state
->client_id
= client_id
;
12840 state
->op_id
= op_id
;
12841 state
->object
= object
;
12842 if (object
.empty()) {
// No object filter: walk every shard.
12843 state
->cur_shard
= 0;
12844 state
->max_shard
= num_shards
- 1;
12846 state
->cur_shard
= state
->max_shard
= get_shard_num(object
);
12848 *handle
= (void *)state
;
// Continue a statelog listing started by init_list_entries: walks shards
// from cur_shard to max_shard, issuing cls_statelog_list per shard with the
// stored filter and marker, splicing results into `entries` until
// max_entries is exhausted. ENOENT on a shard object is tolerated (shard
// simply has no entries yet); the marker is cleared when moving to the next
// shard. *done reports whether the shard range was fully consumed.
// NOTE(review): mangled excerpt — the `done` out-parameter in the
// signature, `oid`/`truncated`/`ibl` declarations, braces and the
// truncated-handling branch are in lines missing here.
12851 int RGWStateLog::list_entries(void *handle
, int max_entries
,
12852 list
<cls_statelog_entry
>& entries
,
12855 list_state
*state
= static_cast<list_state
*>(handle
);
12857 librados::IoCtx ioctx
;
12858 int r
= open_ioctx(ioctx
);
12864 for (; state
->cur_shard
<= state
->max_shard
&& max_entries
> 0; ++state
->cur_shard
) {
12866 oid_str(state
->cur_shard
, oid
);
12868 librados::ObjectReadOperation op
;
12869 list
<cls_statelog_entry
> ents
;
12871 cls_statelog_list(op
, state
->client_id
, state
->op_id
, state
->object
, state
->marker
,
12872 max_entries
, ents
, &state
->marker
, &truncated
);
12874 r
= ioctx
.operate(oid
, &op
, &ibl
);
// Missing shard object just means no entries for that shard.
12875 if (r
== -ENOENT
) {
12880 ldout(store
->ctx(), 0) << "cls_statelog_list returned " << r
<< dendl
;
// Reset pagination marker before advancing to the next shard.
12885 state
->marker
.clear();
12888 max_entries
-= ents
.size();
// splice is O(1): moves nodes, no copies.
12890 entries
.splice(entries
.end(), ents
);
12896 *done
= (state
->cur_shard
> state
->max_shard
);
// End a statelog listing: recovers the cursor allocated by
// init_list_entries; the `delete state` is in a line missing from this
// excerpt — confirm the cursor is actually freed upstream.
12901 void RGWStateLog::finish_list_entries(void *handle
)
12903 list_state
*state
= static_cast<list_state
*>(handle
);
// Dump one statelog entry to a Formatter as a "statelog_entry" object.
// Subclasses may render the state field themselves via dump_entry_internal;
// when that returns false the raw numeric state is emitted instead.
// NOTE(review): mangled excerpt — braces are in lines missing here.
12907 void RGWStateLog::dump_entry(const cls_statelog_entry
& entry
, Formatter
*f
)
12909 f
->open_object_section("statelog_entry");
12910 f
->dump_string("client_id", entry
.client_id
);
12911 f
->dump_string("op_id", entry
.op_id
);
12912 f
->dump_string("object", entry
.object
);
12913 entry
.timestamp
.gmtime_nsec(f
->dump_stream("timestamp"));
12914 if (!dump_entry_internal(entry
, f
)) {
// Fallback: subclass didn't render the state; dump it numerically.
12915 f
->dump_int("state", entry
.state
);
12917 f
->close_section();
// RGWOpState is a statelog specialization for object operation state,
// sharded per rgw_num_zone_opstate_shards under the "obj_opstate" module
// name.
12920 RGWOpState::RGWOpState(RGWRados
*_store
) : RGWStateLog(_store
, _store
->ctx()->_conf
->rgw_num_zone_opstate_shards
, string("obj_opstate"))
// Render the numeric op state as a human-readable string for dump_entry.
// NOTE(review): mangled excerpt — the `s` string assignments inside each
// case, the default case and the return value are in lines missing here;
// keep the string spellings consistent with state_from_str below.
12924 bool RGWOpState::dump_entry_internal(const cls_statelog_entry
& entry
, Formatter
*f
)
12927 switch ((OpState
)entry
.state
) {
12928 case OPSTATE_UNKNOWN
:
12931 case OPSTATE_IN_PROGRESS
:
12934 case OPSTATE_COMPLETE
:
12937 case OPSTATE_ERROR
:
12940 case OPSTATE_ABORT
:
12943 case OPSTATE_CANCELLED
:
12949 f
->dump_string("state", s
);
// Parse a state name ("unknown", "in-progress", "complete", "error",
// "abort", "cancelled") into the OpState enum. The unmatched-string error
// return is in lines missing from this excerpt.
// NOTE(review): these spellings are the inverse of dump_entry_internal's
// output — keep the two in sync.
12953 int RGWOpState::state_from_str(const string
& s
, OpState
*state
)
12955 if (s
== "unknown") {
12956 *state
= OPSTATE_UNKNOWN
;
12957 } else if (s
== "in-progress") {
12958 *state
= OPSTATE_IN_PROGRESS
;
12959 } else if (s
== "complete") {
12960 *state
= OPSTATE_COMPLETE
;
12961 } else if (s
== "error") {
12962 *state
= OPSTATE_ERROR
;
12963 } else if (s
== "abort") {
12964 *state
= OPSTATE_ABORT
;
12965 } else if (s
== "cancelled") {
12966 *state
= OPSTATE_CANCELLED
;
// Unconditionally store an op's state (no check_state precondition —
// contrast with renew_state).
// NOTE(review): mangled excerpt — braces are in lines missing here.
12974 int RGWOpState::set_state(const string
& client_id
, const string
& op_id
, const string
& object
, OpState state
)
12976 uint32_t s
= (uint32_t)state
;
12977 return store_entry(client_id
, op_id
, object
, s
, NULL
, NULL
);
// Refresh an op's state conditionally: passes `s` both as the new state and
// as check_state, so the write only succeeds if the stored state still
// equals it (a compare-and-renew).
// NOTE(review): mangled excerpt — braces are in lines missing here.
12980 int RGWOpState::renew_state(const string
& client_id
, const string
& op_id
, const string
& object
, OpState state
)
12982 uint32_t s
= (uint32_t)state
;
12983 return store_entry(client_id
, op_id
, object
, s
, NULL
, &s
);
// Tracker for a single operation's state: binds an RGWOpState to one
// (client_id, op_id, object) triple and starts in OPSTATE_UNKNOWN.
12986 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados
*store
, const string
& cid
, const string
& oid
,
12987 const string
& obj
) : os(store
), client_id(cid
), op_id(oid
), object(obj
)
12989 cct
= store
->ctx();
12990 cur_state
= RGWOpState::OPSTATE_UNKNOWN
;
// Set this op's state and stamp last_update (used by renew_state's rate
// limiter). The cur_state bookkeeping, if any, is in lines missing from
// this excerpt.
12993 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state
) {
12994 last_update
= real_clock::now();
12996 return os
.set_state(client_id
, op_id
, object
, state
);
// Rate-limited renewal of the current state: skips the RADOS round trip
// (body of the if, in missing lines — presumably an early return) when less
// than rgw_opstate_ratelimit_sec has elapsed since last_update.
12999 int RGWOpStateSingleOp::renew_state() {
13000 real_time now
= real_clock::now();
13002 int rate_limit_sec
= cct
->_conf
->rgw_opstate_ratelimit_sec
;
// rate_limit_sec == 0 disables rate limiting entirely.
13004 if (rate_limit_sec
&& now
- last_update
< make_timespan(rate_limit_sec
)) {
13009 return os
.renew_state(client_id
, op_id
, object
, cur_state
);
// Expose the librados client instance id for this store's rados handle.
13013 uint64_t RGWRados::instance_id()
13015 return get_rados_handle()->get_instance_id();
// Allocate the next monotonically increasing bucket id under
// bucket_id_lock (pre-increment so the first id handed out is
// max_bucket_id + 1).
13018 uint64_t RGWRados::next_bucket_id()
13020 Mutex::Locker
l(bucket_id_lock
);
13021 return ++max_bucket_id
;
// Construct and initialize the storage backend: a plain RGWRados, or the
// caching RGWCache<RGWRados> wrapper when rgw_cache_enabled is set (the
// selecting if/else is in lines missing here), then run full
// initialization with the requested background threads.
// NOTE(review): mangled excerpt — the cache-selection branch, failure
// cleanup and returns are in lines missing here.
13024 RGWRados
*RGWStoreManager::init_storage_provider(CephContext
*cct
, bool use_gc_thread
, bool use_lc_thread
, bool quota_threads
, bool run_sync_thread
)
13026 int use_cache
= cct
->_conf
->rgw_cache_enabled
;
13027 RGWRados
*store
= NULL
;
13029 store
= new RGWRados
;
13031 store
= new RGWCache
<RGWRados
>;
13034 if (store
->initialize(cct
, use_gc_thread
, use_lc_thread
, quota_threads
, run_sync_thread
) < 0) {
// Minimal ("raw") backend bring-up: plain RGWRados with only the rados
// connection initialized — no caching wrapper, no background threads.
// NOTE(review): mangled excerpt — failure cleanup and returns are in lines
// missing here.
13042 RGWRados
*RGWStoreManager::init_raw_storage_provider(CephContext
*cct
)
13044 RGWRados
*store
= NULL
;
13045 store
= new RGWRados
;
13047 store
->set_context(cct
);
13049 if (store
->init_rados() < 0) {
// Tear down a store created by the init_* providers; the body (finalize /
// delete, per the original line-number gap 13058-13064) is entirely in
// lines missing from this excerpt.
13057 void RGWStoreManager::close_storage(RGWRados
*store
)
// Return a librados handle for the calling thread. With a single handle
// (rados.size() == 1) the missing branch presumably returns it directly;
// otherwise threads are assigned handles round-robin via rados_map,
// guarded by handle_lock.
// NOTE(review): the read lock is dropped before the write lock is taken
// (13080/13081) and the map is not re-checked under the write lock in the
// visible lines — two first-time calls from the same thread racing here
// would be harmless only because each thread writes its own key; confirm
// against upstream. Mangled excerpt — braces and the single-handle branch
// body are in lines missing here.
13067 librados::Rados
* RGWRados::get_rados_handle()
13069 if (rados
.size() == 1) {
13072 handle_lock
.get_read();
13073 pthread_t id
= pthread_self();
13074 std::map
<pthread_t
, int>:: iterator it
= rados_map
.find(id
);
13076 if (it
!= rados_map
.end()) {
13077 handle_lock
.put_read();
13078 return &rados
[it
->second
];
13080 handle_lock
.put_read();
13081 handle_lock
.get_write();
// Round-robin assignment of the next handle slot to this thread.
13082 const uint32_t handle
= next_rados_handle
;
13083 rados_map
[id
] = handle
;
13084 if (++next_rados_handle
== rados
.size()) {
13085 next_rados_handle
= 0;
13087 handle_lock
.put_write();
13088 return &rados
[handle
];
// Asynchronously delete a raw RADOS object via cls_rgw_remove_obj (empty
// prefix list): queues the aio_operate and appends the completion to
// `handles` for the caller to wait on and release.
// NOTE(review): mangled excerpt — `ref` declaration, braces, the r<0
// branches' returns and the completion release on failure are in lines
// missing here; as visible, `c` would leak if aio_operate fails.
13093 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj
& obj
, list
<librados::AioCompletion
*>& handles
)
13096 int ret
= get_raw_obj_ref(obj
, &ref
);
13098 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
13102 ObjectWriteOperation op
;
13103 list
<string
> prefixes
;
13104 cls_rgw_remove_obj(op
, prefixes
);
13106 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
13107 ret
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
13109 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
// Caller owns the completion: must wait and release it.
13114 handles
.push_back(c
);
// Asynchronously delete an object's head, optionally keeping the bucket
// index consistent: when keep_index_consistent is set, a CLS_RGW_OP_DEL
// index op is prepared before the delete and delete_obj_index is called
// after queuing it. The delete itself is a cls_rgw_remove_obj aio whose
// completion is appended to `handles` for the caller.
// NOTE(review): mangled excerpt — `ref` declaration, braces, r<0 returns
// and (presumably) the index-op complete step are in lines missing here;
// as visible, `c` would leak if aio_operate fails.
13119 int RGWRados::delete_obj_aio(const rgw_obj
& obj
,
13120 RGWBucketInfo
& bucket_info
, RGWObjState
*astate
,
13121 list
<librados::AioCompletion
*>& handles
, bool keep_index_consistent
)
13124 int ret
= get_obj_head_ref(bucket_info
, obj
, &ref
);
13126 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
13130 if (keep_index_consistent
) {
// Stage the index delete so the bucket listing stays consistent with the
// data delete below.
13131 RGWRados::Bucket
bop(this, bucket_info
);
13132 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
13134 ret
= index_op
.prepare(CLS_RGW_OP_DEL
, &astate
->write_tag
);
13136 lderr(cct
) << "ERROR: failed to prepare index op with ret=" << ret
<< dendl
;
13141 ObjectWriteOperation op
;
13142 list
<string
> prefixes
;
13143 cls_rgw_remove_obj(op
, prefixes
);
13145 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
13146 ret
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
13148 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
// Caller owns the completion: must wait and release it.
13153 handles
.push_back(c
);
13155 if (keep_index_consistent
) {
13156 ret
= delete_obj_index(obj
);
13158 lderr(cct
) << "ERROR: failed to delete obj index with ret=" << ret
<< dendl
;
13165 int rgw_compression_info_from_attrset(map
<string
, bufferlist
>& attrs
, bool& need_decompress
, RGWCompressionInfo
& cs_info
) {
13166 map
<string
, bufferlist
>::iterator value
= attrs
.find(RGW_ATTR_COMPRESSION
);
13167 if (value
!= attrs
.end()) {
13168 bufferlist::iterator bliter
= value
->second
.begin();
13170 ::decode(cs_info
, bliter
);
13171 } catch (buffer::error
& err
) {
13174 if (cs_info
.blocks
.size() == 0) {
13177 if (cs_info
.compression_type
!= "none")
13178 need_decompress
= true;
13180 need_decompress
= false;
13183 need_decompress
= false;