1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "include/compat.h"
8 #include <boost/algorithm/string.hpp>
10 #include <boost/format.hpp>
11 #include <boost/optional.hpp>
12 #include <boost/utility/in_place_factory.hpp>
14 #include "common/ceph_json.h"
15 #include "common/utf8.h"
17 #include "common/errno.h"
18 #include "common/Formatter.h"
19 #include "common/Throttle.h"
20 #include "common/Finisher.h"
22 #include "rgw_rados.h"
23 #include "rgw_cache.h"
25 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26 #include "rgw_metadata.h"
27 #include "rgw_bucket.h"
28 #include "rgw_rest_conn.h"
29 #include "rgw_cr_rados.h"
30 #include "rgw_cr_rest.h"
32 #include "cls/rgw/cls_rgw_ops.h"
33 #include "cls/rgw/cls_rgw_types.h"
34 #include "cls/rgw/cls_rgw_client.h"
35 #include "cls/rgw/cls_rgw_const.h"
36 #include "cls/refcount/cls_refcount_client.h"
37 #include "cls/version/cls_version_client.h"
38 #include "cls/log/cls_log_client.h"
39 #include "cls/statelog/cls_statelog_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "osd/osd_types.h"
45 #include "rgw_tools.h"
46 #include "rgw_coroutine.h"
47 #include "rgw_compression.h"
49 #undef fork // fails to compile RGWPeriod::fork() below
51 #include "common/Clock.h"
53 #include "include/rados/librados.hpp"
54 using namespace librados
;
62 #include "auth/Crypto.h" // get_random_bytes()
69 #include "rgw_object_expirer_core.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
75 #include "compressor/Compressor.h"
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_rgw
/* File-scope constants: object-name prefixes/suffixes and default pool
 * names used when composing rados object names for RGW system metadata. */
static string notify_oid_prefix = "notify";
// Array of per-shard notify object names; allocation site not in view — TODO confirm
static string *notify_oids = NULL;
static string shadow_ns = "shadow";
static string dir_oid_prefix = ".dir.";
static string default_storage_pool_suffix = "rgw.buckets.data";
static string default_bucket_index_pool_suffix = "rgw.buckets.index";
static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
static string avail_pools = ".pools.avail";

// Prefixes for the system objects that store zone/zonegroup/realm/period
// metadata and their name->id mappings.
static string zone_info_oid_prefix = "zone_info.";
static string zone_names_oid_prefix = "zone_names.";
static string region_info_oid_prefix = "region_info.";
static string zone_group_info_oid_prefix = "zonegroup_info.";
static string realm_names_oid_prefix = "realms_names.";
static string realm_info_oid_prefix = "realms.";
static string default_region_info_oid = "default.region";
static string default_zone_group_info_oid = "default.zonegroup";
static string period_info_oid_prefix = "periods.";
static string period_latest_epoch_info_oid = ".latest_epoch";
static string region_map_oid = "region_map";
static string zonegroup_map_oid = "zonegroup_map";
static string log_lock_name = "rgw_log_lock";
static string default_realm_info_oid = "default.realm";
const string default_zonegroup_name = "default";
const string default_zone_name = "default";
static string zonegroup_names_oid_prefix = "zonegroups_names.";
static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
#define RGW_USAGE_OBJ_PREFIX "usage."
// Epoch assigned to a freshly created period (see RGWPeriod::create).
#define FIRST_EPOCH 1
// All multisite configuration roots default to the same "rgw.root" pool.
static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";

#define RGW_STATELOG_OBJ_PREFIX "statelog."

// NOTE(review): identical redefinition of dout_subsys (also defined above);
// legal per the preprocessor rules, but redundant.
#define dout_subsys ceph_subsys_rgw
121 static bool rgw_get_obj_data_pool(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
122 const string
& placement_id
, const rgw_obj
& obj
, rgw_pool
*pool
)
124 if (!zone_params
.get_head_data_pool(placement_id
, obj
, pool
)) {
125 RGWZonePlacementInfo placement
;
126 if (!zone_params
.get_placement(zonegroup
.default_placement
, &placement
)) {
130 if (!obj
.in_extra_data
) {
131 *pool
= placement
.data_pool
;
133 *pool
= placement
.get_data_extra_pool();
140 static bool rgw_obj_to_raw(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
,
141 const string
& placement_id
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
143 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
145 return rgw_get_obj_data_pool(zonegroup
, zone_params
, placement_id
, obj
, &raw_obj
->pool
);
148 rgw_raw_obj
rgw_obj_select::get_raw_obj(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
) const
152 rgw_obj_to_raw(zonegroup
, zone_params
, placement_rule
, obj
, &r
);
158 rgw_raw_obj
rgw_obj_select::get_raw_obj(RGWRados
*store
) const
162 store
->obj_to_raw(placement_rule
, obj
, &r
);
168 int rgw_init_ioctx(librados::Rados
*rados
, const rgw_pool
& pool
, IoCtx
& ioctx
, bool create
)
170 int r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
171 if (r
== -ENOENT
&& create
) {
172 r
= rados
->pool_create(pool
.name
.c_str());
176 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r
)
177 << " (this can be due to a pool or placement group misconfiguration, e.g."
178 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
181 if (r
< 0 && r
!= -EEXIST
) {
185 r
= rados
->ioctx_create(pool
.name
.c_str(), ioctx
);
190 r
= ioctx
.application_enable(pg_pool_t::APPLICATION_NAME_RGW
, false);
191 if (r
< 0 && r
!= -EOPNOTSUPP
) {
197 if (!pool
.ns
.empty()) {
198 ioctx
.set_namespace(pool
.ns
);
204 void RGWObjectCtxImpl
<rgw_obj
, RGWObjState
>::invalidate(rgw_obj
& obj
) {
205 RWLock::WLocker
wl(lock
);
206 auto iter
= objs_state
.find(obj
);
207 if (iter
== objs_state
.end()) {
210 bool is_atomic
= iter
->second
.is_atomic
;
211 bool prefetch_data
= iter
->second
.prefetch_data
;
213 objs_state
.erase(iter
);
215 if (is_atomic
|| prefetch_data
) {
216 auto& s
= objs_state
[obj
];
217 s
.is_atomic
= is_atomic
;
218 s
.prefetch_data
= prefetch_data
;
223 void RGWObjectCtxImpl
<rgw_raw_obj
, RGWRawObjState
>::invalidate(rgw_raw_obj
& obj
) {
224 RWLock::WLocker
wl(lock
);
225 auto iter
= objs_state
.find(obj
);
226 if (iter
== objs_state
.end()) {
230 objs_state
.erase(iter
);
233 void RGWDefaultZoneGroupInfo::dump(Formatter
*f
) const {
234 encode_json("default_zonegroup", default_zonegroup
, f
);
237 void RGWDefaultZoneGroupInfo::decode_json(JSONObj
*obj
) {
239 JSONDecoder::decode_json("default_zonegroup", default_zonegroup
, obj
);
240 /* backward compatability with region */
241 if (default_zonegroup
.empty()) {
242 JSONDecoder::decode_json("default_region", default_zonegroup
, obj
);
246 rgw_pool
RGWZoneGroup::get_pool(CephContext
*cct_
)
248 if (cct_
->_conf
->rgw_zonegroup_root_pool
.empty()) {
249 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL
);
252 return rgw_pool(cct_
->_conf
->rgw_zonegroup_root_pool
);
255 int RGWZoneGroup::create_default(bool old_format
)
257 name
= default_zonegroup_name
;
260 RGWZoneGroupPlacementTarget placement_target
;
261 placement_target
.name
= "default-placement";
262 placement_targets
[placement_target
.name
] = placement_target
;
263 default_placement
= "default-placement";
265 RGWZoneParams
zone_params(default_zone_name
);
267 int r
= zone_params
.init(cct
, store
, false);
269 ldout(cct
, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r
) << dendl
;
273 r
= zone_params
.create_default();
274 if (r
< 0 && r
!= -EEXIST
) {
275 ldout(cct
, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r
) << dendl
;
277 } else if (r
== -EEXIST
) {
278 ldout(cct
, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl
;
279 zone_params
.clear_id();
280 r
= zone_params
.init(cct
, store
);
282 ldout(cct
, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r
) << dendl
;
285 ldout(cct
, 20) << "zone_params::create_default() " << zone_params
.get_name() << " id " << zone_params
.get_id()
289 RGWZone
& default_zone
= zones
[zone_params
.get_id()];
290 default_zone
.name
= zone_params
.get_name();
291 default_zone
.id
= zone_params
.get_id();
292 master_zone
= default_zone
.id
;
295 if (r
< 0 && r
!= -EEXIST
) {
296 ldout(cct
, 0) << "error storing zone group info: " << cpp_strerror(-r
) << dendl
;
301 ldout(cct
, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl
;
303 r
= init(cct
, store
);
313 post_process_params();
318 const string
RGWZoneGroup::get_default_oid(bool old_region_format
)
320 if (old_region_format
) {
321 if (cct
->_conf
->rgw_default_region_info_oid
.empty()) {
322 return default_region_info_oid
;
324 return cct
->_conf
->rgw_default_region_info_oid
;
327 string default_oid
= cct
->_conf
->rgw_default_zonegroup_info_oid
;
329 if (cct
->_conf
->rgw_default_zonegroup_info_oid
.empty()) {
330 default_oid
= default_zone_group_info_oid
;
333 default_oid
+= "." + realm_id
;
338 const string
& RGWZoneGroup::get_info_oid_prefix(bool old_region_format
)
340 if (old_region_format
) {
341 return region_info_oid_prefix
;
343 return zone_group_info_oid_prefix
;
346 const string
& RGWZoneGroup::get_names_oid_prefix()
348 return zonegroup_names_oid_prefix
;
351 const string
& RGWZoneGroup::get_predefined_name(CephContext
*cct
) {
352 return cct
->_conf
->rgw_zonegroup
;
355 int RGWZoneGroup::equals(const string
& other_zonegroup
) const
357 if (is_master
&& other_zonegroup
.empty())
360 return (id
== other_zonegroup
);
363 int RGWZoneGroup::add_zone(const RGWZoneParams
& zone_params
, bool *is_master
, bool *read_only
,
364 const list
<string
>& endpoints
, const string
*ptier_type
,
365 bool *psync_from_all
, list
<string
>& sync_from
, list
<string
>& sync_from_rm
)
367 auto& zone_id
= zone_params
.get_id();
368 auto& zone_name
= zone_params
.get_name();
370 // check for duplicate zone name on insert
371 if (!zones
.count(zone_id
)) {
372 for (const auto& zone
: zones
) {
373 if (zone
.second
.name
== zone_name
) {
374 ldout(cct
, 0) << "ERROR: found existing zone name " << zone_name
375 << " (" << zone
.first
<< ") in zonegroup " << get_name() << dendl
;
383 if (!master_zone
.empty() && master_zone
!= zone_params
.get_id()) {
384 ldout(cct
, 0) << "NOTICE: overriding master zone: " << master_zone
<< dendl
;
386 master_zone
= zone_params
.get_id();
387 } else if (master_zone
== zone_params
.get_id()) {
392 RGWZone
& zone
= zones
[zone_params
.get_id()];
393 zone
.name
= zone_params
.get_name();
394 zone
.id
= zone_params
.get_id();
395 if (!endpoints
.empty()) {
396 zone
.endpoints
= endpoints
;
399 zone
.read_only
= *read_only
;
402 zone
.tier_type
= *ptier_type
;
405 if (psync_from_all
) {
406 zone
.sync_from_all
= *psync_from_all
;
409 for (auto add
: sync_from
) {
410 zone
.sync_from
.insert(add
);
413 for (auto rm
: sync_from_rm
) {
414 zone
.sync_from
.erase(rm
);
417 post_process_params();
423 int RGWZoneGroup::rename_zone(const RGWZoneParams
& zone_params
)
425 RGWZone
& zone
= zones
[zone_params
.get_id()];
426 zone
.name
= zone_params
.get_name();
431 void RGWZoneGroup::post_process_params()
433 bool log_data
= zones
.size() > 1;
435 if (master_zone
.empty()) {
436 map
<string
, RGWZone
>::iterator iter
= zones
.begin();
437 if (iter
!= zones
.end()) {
438 master_zone
= iter
->first
;
442 for (map
<string
, RGWZone
>::iterator iter
= zones
.begin(); iter
!= zones
.end(); ++iter
) {
443 RGWZone
& zone
= iter
->second
;
444 zone
.log_data
= log_data
;
446 RGWZoneParams
zone_params(zone
.id
, zone
.name
);
447 int ret
= zone_params
.init(cct
, store
);
449 ldout(cct
, 0) << "WARNING: could not read zone params for zone id=" << zone
.id
<< " name=" << zone
.name
<< dendl
;
453 for (map
<string
, RGWZonePlacementInfo
>::iterator iter
= zone_params
.placement_pools
.begin();
454 iter
!= zone_params
.placement_pools
.end(); ++iter
) {
455 const string
& placement_name
= iter
->first
;
456 if (placement_targets
.find(placement_name
) == placement_targets
.end()) {
457 RGWZoneGroupPlacementTarget placement_target
;
458 placement_target
.name
= placement_name
;
459 placement_targets
[placement_name
] = placement_target
;
464 if (default_placement
.empty() && !placement_targets
.empty()) {
465 default_placement
= placement_targets
.begin()->first
;
469 int RGWZoneGroup::remove_zone(const std::string
& zone_id
)
471 map
<string
, RGWZone
>::iterator iter
= zones
.find(zone_id
);
472 if (iter
== zones
.end()) {
473 ldout(cct
, 0) << "zone id " << zone_id
<< " is not a part of zonegroup "
480 post_process_params();
485 int RGWZoneGroup::read_default_id(string
& default_id
, bool old_format
)
487 if (realm_id
.empty()) {
488 /* try using default realm */
490 int ret
= realm
.init(cct
, store
);
491 // no default realm exist
493 return read_id(default_zonegroup_name
, default_id
);
495 realm_id
= realm
.get_id();
498 return RGWSystemMetaObj::read_default_id(default_id
, old_format
);
501 int RGWZoneGroup::set_as_default(bool exclusive
)
503 if (realm_id
.empty()) {
504 /* try using default realm */
506 int ret
= realm
.init(cct
, store
);
508 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
511 realm_id
= realm
.get_id();
514 return RGWSystemMetaObj::set_as_default(exclusive
);
517 int RGWSystemMetaObj::init(CephContext
*_cct
, RGWRados
*_store
, bool setup_obj
, bool old_format
)
525 if (old_format
&& id
.empty()) {
532 name
= get_predefined_name(cct
);
535 r
= use_default(old_format
);
539 } else if (!old_format
) {
540 r
= read_id(name
, id
);
543 ldout(cct
, 0) << "error in read_id for object name: " << name
<< " : " << cpp_strerror(-r
) << dendl
;
550 return read_info(id
, old_format
);
553 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo
& default_info
, const string
& oid
)
555 auto pool
= get_pool(cct
);
557 RGWObjectCtx
obj_ctx(store
);
558 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
563 bufferlist::iterator iter
= bl
.begin();
564 ::decode(default_info
, iter
);
565 } catch (buffer::error
& err
) {
566 ldout(cct
, 0) << "error decoding data from " << pool
<< ":" << oid
<< dendl
;
573 int RGWSystemMetaObj::read_default_id(string
& default_id
, bool old_format
)
575 RGWDefaultSystemMetaObjInfo default_info
;
577 int ret
= read_default(default_info
, get_default_oid(old_format
));
582 default_id
= default_info
.default_id
;
587 int RGWSystemMetaObj::use_default(bool old_format
)
589 return read_default_id(id
, old_format
);
592 int RGWSystemMetaObj::set_as_default(bool exclusive
)
594 string oid
= get_default_oid();
596 rgw_pool
pool(get_pool(cct
));
599 RGWDefaultSystemMetaObjInfo default_info
;
600 default_info
.default_id
= id
;
602 ::encode(default_info
, bl
);
604 int ret
= rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
605 exclusive
, NULL
, real_time(), NULL
);
612 int RGWSystemMetaObj::read_id(const string
& obj_name
, string
& object_id
)
614 rgw_pool
pool(get_pool(cct
));
617 string oid
= get_names_oid_prefix() + obj_name
;
619 RGWObjectCtx
obj_ctx(store
);
620 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
625 RGWNameToId nameToId
;
627 bufferlist::iterator iter
= bl
.begin();
628 ::decode(nameToId
, iter
);
629 } catch (buffer::error
& err
) {
630 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << oid
<< dendl
;
633 object_id
= nameToId
.obj_id
;
637 int RGWSystemMetaObj::delete_obj(bool old_format
)
639 rgw_pool
pool(get_pool(cct
));
641 /* check to see if obj is the default */
642 RGWDefaultSystemMetaObjInfo default_info
;
643 int ret
= read_default(default_info
, get_default_oid(old_format
));
644 if (ret
< 0 && ret
!= -ENOENT
)
646 if (default_info
.default_id
== id
|| (old_format
&& default_info
.default_id
== name
)) {
647 string oid
= get_default_oid(old_format
);
648 rgw_raw_obj
default_named_obj(pool
, oid
);
649 ret
= store
->delete_system_obj(default_named_obj
);
651 ldout(cct
, 0) << "Error delete default obj name " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
656 string oid
= get_names_oid_prefix() + name
;
657 rgw_raw_obj
object_name(pool
, oid
);
658 ret
= store
->delete_system_obj(object_name
);
660 ldout(cct
, 0) << "Error delete obj name " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
665 string oid
= get_info_oid_prefix(old_format
);
672 rgw_raw_obj
object_id(pool
, oid
);
673 ret
= store
->delete_system_obj(object_id
);
675 ldout(cct
, 0) << "Error delete object id " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
681 int RGWSystemMetaObj::store_name(bool exclusive
)
683 rgw_pool
pool(get_pool(cct
));
684 string oid
= get_names_oid_prefix() + name
;
686 RGWNameToId nameToId
;
687 nameToId
.obj_id
= id
;
690 ::encode(nameToId
, bl
);
691 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
694 int RGWSystemMetaObj::rename(const string
& new_name
)
697 int ret
= read_id(new_name
, new_id
);
701 if (ret
< 0 && ret
!= -ENOENT
) {
702 ldout(cct
, 0) << "Error read_id " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
705 string old_name
= name
;
709 ldout(cct
, 0) << "Error storing new obj info " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
712 ret
= store_name(true);
714 ldout(cct
, 0) << "Error storing new name " << new_name
<< ": " << cpp_strerror(-ret
) << dendl
;
717 /* delete old name */
718 rgw_pool
pool(get_pool(cct
));
719 string oid
= get_names_oid_prefix() + old_name
;
720 rgw_raw_obj
old_name_obj(pool
, oid
);
721 ret
= store
->delete_system_obj(old_name_obj
);
723 ldout(cct
, 0) << "Error delete old obj name " << old_name
<< ": " << cpp_strerror(-ret
) << dendl
;
730 int RGWSystemMetaObj::read_info(const string
& obj_id
, bool old_format
)
732 rgw_pool
pool(get_pool(cct
));
736 string oid
= get_info_oid_prefix(old_format
) + obj_id
;
738 RGWObjectCtx
obj_ctx(store
);
739 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
741 ldout(cct
, 0) << "failed reading obj info from " << pool
<< ":" << oid
<< ": " << cpp_strerror(-ret
) << dendl
;
746 bufferlist::iterator iter
= bl
.begin();
747 ::decode(*this, iter
);
748 } catch (buffer::error
& err
) {
749 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << oid
<< dendl
;
756 int RGWSystemMetaObj::read()
758 int ret
= read_id(name
, id
);
763 return read_info(id
);
766 int RGWSystemMetaObj::create(bool exclusive
)
770 /* check to see the name is not used */
771 ret
= read_id(name
, id
);
772 if (exclusive
&& ret
== 0) {
773 ldout(cct
, 10) << "ERROR: name " << name
<< " already in use for obj id " << id
<< dendl
;
775 } else if ( ret
< 0 && ret
!= -ENOENT
) {
776 ldout(cct
, 0) << "failed reading obj id " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
781 /* create unique id */
784 new_uuid
.generate_random();
785 new_uuid
.print(uuid_str
);
789 ret
= store_info(exclusive
);
791 ldout(cct
, 0) << "ERROR: storing info for " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
795 return store_name(exclusive
);
798 int RGWSystemMetaObj::store_info(bool exclusive
)
800 rgw_pool
pool(get_pool(cct
));
802 string oid
= get_info_oid_prefix() + id
;
806 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(), exclusive
, NULL
, real_time(), NULL
);
809 int RGWSystemMetaObj::write(bool exclusive
)
811 int ret
= store_info(exclusive
);
813 ldout(cct
, 20) << __func__
<< "(): store_info() returned ret=" << ret
<< dendl
;
816 ret
= store_name(exclusive
);
818 ldout(cct
, 20) << __func__
<< "(): store_name() returned ret=" << ret
<< dendl
;
825 const string
& RGWRealm::get_predefined_name(CephContext
*cct
) {
826 return cct
->_conf
->rgw_realm
;
829 int RGWRealm::create(bool exclusive
)
831 int ret
= RGWSystemMetaObj::create(exclusive
);
833 ldout(cct
, 0) << "ERROR creating new realm object " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
836 // create the control object for watch/notify
837 ret
= create_control(exclusive
);
839 ldout(cct
, 0) << "ERROR creating control for new realm " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
843 if (current_period
.empty()) {
844 /* create new period for the realm */
845 ret
= period
.init(cct
, store
, id
, name
, false);
849 ret
= period
.create(true);
851 ldout(cct
, 0) << "ERROR: creating new period for realm " << name
<< ": " << cpp_strerror(-ret
) << dendl
;
855 period
= RGWPeriod(current_period
, 0);
856 int ret
= period
.init(cct
, store
, id
, name
);
858 ldout(cct
, 0) << "ERROR: failed to init period " << current_period
<< dendl
;
862 ret
= set_current_period(period
);
864 ldout(cct
, 0) << "ERROR: failed set current period " << current_period
<< dendl
;
867 // try to set as default. may race with another create, so pass exclusive=true
868 // so we don't override an existing default
869 ret
= set_as_default(true);
870 if (ret
< 0 && ret
!= -EEXIST
) {
871 ldout(cct
, 0) << "WARNING: failed to set realm as default realm, ret=" << ret
<< dendl
;
877 int RGWRealm::delete_obj()
879 int ret
= RGWSystemMetaObj::delete_obj();
883 return delete_control();
886 int RGWRealm::create_control(bool exclusive
)
888 auto pool
= rgw_pool
{get_pool(cct
)};
889 auto oid
= get_control_oid();
890 return rgw_put_system_obj(store
, pool
, oid
, nullptr, 0, exclusive
,
891 nullptr, real_time(), nullptr);
894 int RGWRealm::delete_control()
896 auto pool
= rgw_pool
{get_pool(cct
)};
897 auto obj
= rgw_raw_obj
{pool
, get_control_oid()};
898 return store
->delete_system_obj(obj
);
901 rgw_pool
RGWRealm::get_pool(CephContext
*cct
)
903 if (cct
->_conf
->rgw_realm_root_pool
.empty()) {
904 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL
);
906 return rgw_pool(cct
->_conf
->rgw_realm_root_pool
);
909 const string
RGWRealm::get_default_oid(bool old_format
)
911 if (cct
->_conf
->rgw_default_realm_info_oid
.empty()) {
912 return default_realm_info_oid
;
914 return cct
->_conf
->rgw_default_realm_info_oid
;
917 const string
& RGWRealm::get_names_oid_prefix()
919 return realm_names_oid_prefix
;
922 const string
& RGWRealm::get_info_oid_prefix(bool old_format
)
924 return realm_info_oid_prefix
;
927 int RGWRealm::set_current_period(RGWPeriod
& period
)
929 // update realm epoch to match the period's
930 if (epoch
> period
.get_realm_epoch()) {
931 ldout(cct
, 0) << "ERROR: set_current_period with old realm epoch "
932 << period
.get_realm_epoch() << ", current epoch=" << epoch
<< dendl
;
935 if (epoch
== period
.get_realm_epoch() && current_period
!= period
.get_id()) {
936 ldout(cct
, 0) << "ERROR: set_current_period with same realm epoch "
937 << period
.get_realm_epoch() << ", but different period id "
938 << period
.get_id() << " != " << current_period
<< dendl
;
942 epoch
= period
.get_realm_epoch();
943 current_period
= period
.get_id();
947 ldout(cct
, 0) << "ERROR: period update: " << cpp_strerror(-ret
) << dendl
;
951 ret
= period
.reflect();
953 ldout(cct
, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret
) << dendl
;
960 string
RGWRealm::get_control_oid()
962 return get_info_oid_prefix() + id
+ ".control";
965 int RGWRealm::notify_zone(bufferlist
& bl
)
967 // open a context on the realm's pool
968 rgw_pool pool
{get_pool(cct
)};
970 int r
= rgw_init_ioctx(store
->get_rados_handle(), pool
, ctx
);
972 ldout(cct
, 0) << "Failed to open pool " << pool
<< dendl
;
975 // send a notify on the realm object
976 r
= ctx
.notify2(get_control_oid(), bl
, 0, nullptr);
978 ldout(cct
, 0) << "Realm notify failed with " << r
<< dendl
;
984 int RGWRealm::notify_new_period(const RGWPeriod
& period
)
987 // push the period to dependent zonegroups/zones
988 ::encode(RGWRealmNotify::ZonesNeedPeriod
, bl
);
989 ::encode(period
, bl
);
990 // reload the gateway with the new period
991 ::encode(RGWRealmNotify::Reload
, bl
);
993 return notify_zone(bl
);
996 std::string
RGWPeriodConfig::get_oid(const std::string
& realm_id
)
998 if (realm_id
.empty()) {
999 return "period_config.default";
1001 return "period_config." + realm_id
;
1004 rgw_pool
RGWPeriodConfig::get_pool(CephContext
*cct
)
1006 const auto& pool_name
= cct
->_conf
->rgw_period_root_pool
;
1007 if (pool_name
.empty()) {
1008 return {RGW_DEFAULT_PERIOD_ROOT_POOL
};
1013 int RGWPeriodConfig::read(RGWRados
*store
, const std::string
& realm_id
)
1015 RGWObjectCtx
obj_ctx(store
);
1016 const auto& pool
= get_pool(store
->ctx());
1017 const auto& oid
= get_oid(realm_id
);
1020 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, nullptr, nullptr);
1025 bufferlist::iterator iter
= bl
.begin();
1026 ::decode(*this, iter
);
1027 } catch (buffer::error
& err
) {
1033 int RGWPeriodConfig::write(RGWRados
*store
, const std::string
& realm_id
)
1035 const auto& pool
= get_pool(store
->ctx());
1036 const auto& oid
= get_oid(realm_id
);
1038 ::encode(*this, bl
);
1039 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
1040 false, nullptr, real_time(), nullptr);
1043 int RGWPeriod::init(CephContext
*_cct
, RGWRados
*_store
, const string
& period_realm_id
,
1044 const string
& period_realm_name
, bool setup_obj
)
1048 realm_id
= period_realm_id
;
1049 realm_name
= period_realm_name
;
1054 return init(_cct
, _store
, setup_obj
);
1058 int RGWPeriod::init(CephContext
*_cct
, RGWRados
*_store
, bool setup_obj
)
1067 RGWRealm
realm(realm_id
, realm_name
);
1068 int ret
= realm
.init(cct
, store
);
1070 ldout(cct
, 0) << "RGWPeriod::init failed to init realm " << realm_name
<< " id " << realm_id
<< " : " <<
1071 cpp_strerror(-ret
) << dendl
;
1074 id
= realm
.get_current_period();
1075 realm_id
= realm
.get_id();
1079 int ret
= use_latest_epoch();
1081 ldout(cct
, 0) << "failed to use_latest_epoch period id " << id
<< " realm " << realm_name
<< " id " << realm_id
1082 << " : " << cpp_strerror(-ret
) << dendl
;
1091 int RGWPeriod::get_zonegroup(RGWZoneGroup
& zonegroup
, const string
& zonegroup_id
) {
1092 map
<string
, RGWZoneGroup
>::const_iterator iter
;
1093 if (!zonegroup_id
.empty()) {
1094 iter
= period_map
.zonegroups
.find(zonegroup_id
);
1096 iter
= period_map
.zonegroups
.find("default");
1098 if (iter
!= period_map
.zonegroups
.end()) {
1099 zonegroup
= iter
->second
;
1106 const string
& RGWPeriod::get_latest_epoch_oid()
1108 if (cct
->_conf
->rgw_period_latest_epoch_info_oid
.empty()) {
1109 return period_latest_epoch_info_oid
;
1111 return cct
->_conf
->rgw_period_latest_epoch_info_oid
;
1114 const string
& RGWPeriod::get_info_oid_prefix()
1116 return period_info_oid_prefix
;
1119 const string
RGWPeriod::get_period_oid_prefix()
1121 return get_info_oid_prefix() + id
;
1124 const string
RGWPeriod::get_period_oid()
1126 std::ostringstream oss
;
1127 oss
<< get_period_oid_prefix();
1128 // skip the epoch for the staging period
1129 if (id
!= get_staging_id(realm_id
))
1130 oss
<< "." << epoch
;
1134 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo
& info
,
1135 RGWObjVersionTracker
*objv
)
1137 string oid
= get_period_oid_prefix() + get_latest_epoch_oid();
1139 rgw_pool
pool(get_pool(cct
));
1141 RGWObjectCtx
obj_ctx(store
);
1142 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, oid
, bl
, objv
, nullptr);
1144 ldout(cct
, 1) << "error read_lastest_epoch " << pool
<< ":" << oid
<< dendl
;
1148 bufferlist::iterator iter
= bl
.begin();
1149 ::decode(info
, iter
);
1150 } catch (buffer::error
& err
) {
1151 ldout(cct
, 0) << "error decoding data from " << pool
<< ":" << oid
<< dendl
;
1158 int RGWPeriod::get_latest_epoch(epoch_t
& latest_epoch
)
1160 RGWPeriodLatestEpochInfo info
;
1162 int ret
= read_latest_epoch(info
);
1167 latest_epoch
= info
.epoch
;
1172 int RGWPeriod::use_latest_epoch()
1174 RGWPeriodLatestEpochInfo info
;
1175 int ret
= read_latest_epoch(info
);
1185 int RGWPeriod::set_latest_epoch(epoch_t epoch
, bool exclusive
,
1186 RGWObjVersionTracker
*objv
)
1188 string oid
= get_period_oid_prefix() + get_latest_epoch_oid();
1190 rgw_pool
pool(get_pool(cct
));
1193 RGWPeriodLatestEpochInfo info
;
1198 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
1199 exclusive
, objv
, real_time(), nullptr);
1202 int RGWPeriod::update_latest_epoch(epoch_t epoch
)
1204 static constexpr int MAX_RETRIES
= 20;
1206 for (int i
= 0; i
< MAX_RETRIES
; i
++) {
1207 RGWPeriodLatestEpochInfo info
;
1208 RGWObjVersionTracker objv
;
1209 bool exclusive
= false;
1211 // read existing epoch
1212 int r
= read_latest_epoch(info
, &objv
);
1214 // use an exclusive create to set the epoch atomically
1216 ldout(cct
, 20) << "creating initial latest_epoch=" << epoch
1217 << " for period=" << id
<< dendl
;
1219 ldout(cct
, 0) << "ERROR: failed to read latest_epoch" << dendl
;
1221 } else if (epoch
<= info
.epoch
) {
1222 r
= -EEXIST
; // fail with EEXIST if epoch is not newer
1223 ldout(cct
, 1) << "found existing latest_epoch " << info
.epoch
1224 << " >= given epoch " << epoch
<< ", returning r=" << r
<< dendl
;
1227 ldout(cct
, 20) << "updating latest_epoch from " << info
.epoch
1228 << " -> " << epoch
<< " on period=" << id
<< dendl
;
1231 r
= set_latest_epoch(epoch
, exclusive
, &objv
);
1233 continue; // exclusive create raced with another update, retry
1234 } else if (r
== -ECANCELED
) {
1235 continue; // write raced with a conflicting version, retry
1238 ldout(cct
, 0) << "ERROR: failed to write latest_epoch" << dendl
;
1241 return 0; // return success
1244 return -ECANCELED
; // fail after max retries
1247 int RGWPeriod::delete_obj()
1249 rgw_pool
pool(get_pool(cct
));
1251 // delete the object for each period epoch
1252 for (epoch_t e
= 1; e
<= epoch
; e
++) {
1253 RGWPeriod p
{get_id(), e
};
1254 rgw_raw_obj oid
{pool
, p
.get_period_oid()};
1255 int ret
= store
->delete_system_obj(oid
);
1257 ldout(cct
, 0) << "WARNING: failed to delete period object " << oid
1258 << ": " << cpp_strerror(-ret
) << dendl
;
1262 // delete the .latest_epoch object
1263 rgw_raw_obj oid
{pool
, get_period_oid_prefix() + get_latest_epoch_oid()};
1264 int ret
= store
->delete_system_obj(oid
);
1266 ldout(cct
, 0) << "WARNING: failed to delete period object " << oid
1267 << ": " << cpp_strerror(-ret
) << dendl
;
1272 int RGWPeriod::read_info()
1274 rgw_pool
pool(get_pool(cct
));
1278 RGWObjectCtx
obj_ctx(store
);
1279 int ret
= rgw_get_system_obj(store
, obj_ctx
, pool
, get_period_oid(), bl
, NULL
, NULL
);
1281 ldout(cct
, 0) << "failed reading obj info from " << pool
<< ":" << get_period_oid() << ": " << cpp_strerror(-ret
) << dendl
;
1286 bufferlist::iterator iter
= bl
.begin();
1287 ::decode(*this, iter
);
1288 } catch (buffer::error
& err
) {
1289 ldout(cct
, 0) << "ERROR: failed to decode obj from " << pool
<< ":" << get_period_oid() << dendl
;
1296 int RGWPeriod::create(bool exclusive
)
1300 /* create unique id */
1303 new_uuid
.generate_random();
1304 new_uuid
.print(uuid_str
);
1307 epoch
= FIRST_EPOCH
;
1311 ret
= store_info(exclusive
);
1313 ldout(cct
, 0) << "ERROR: storing info for " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
1317 ret
= set_latest_epoch(epoch
);
1319 ldout(cct
, 0) << "ERROR: setting latest epoch " << id
<< ": " << cpp_strerror(-ret
) << dendl
;
1325 int RGWPeriod::store_info(bool exclusive
)
1327 rgw_pool
pool(get_pool(cct
));
1329 string oid
= get_period_oid();
1331 ::encode(*this, bl
);
1333 return rgw_put_system_obj(store
, pool
, oid
, bl
.c_str(), bl
.length(),
1334 exclusive
, NULL
, real_time(), NULL
);
1337 rgw_pool
RGWPeriod::get_pool(CephContext
*cct
)
1339 if (cct
->_conf
->rgw_period_root_pool
.empty()) {
1340 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL
);
1342 return rgw_pool(cct
->_conf
->rgw_period_root_pool
);
1345 int RGWPeriod::add_zonegroup(const RGWZoneGroup
& zonegroup
)
1347 if (zonegroup
.realm_id
!= realm_id
) {
1350 int ret
= period_map
.update(zonegroup
, cct
);
1352 ldout(cct
, 0) << "ERROR: updating period map: " << cpp_strerror(-ret
) << dendl
;
1356 return store_info(false);
1359 int RGWPeriod::update()
1361 ldout(cct
, 20) << __func__
<< " realm " << realm_id
<< " period " << get_id() << dendl
;
1362 list
<string
> zonegroups
;
1363 int ret
= store
->list_zonegroups(zonegroups
);
1365 ldout(cct
, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret
) << dendl
;
1369 // clear zone short ids of removed zones. period_map.update() will add the
1370 // remaining zones back
1371 period_map
.short_zone_ids
.clear();
1373 for (auto& iter
: zonegroups
) {
1374 RGWZoneGroup
zg(string(), iter
);
1375 ret
= zg
.init(cct
, store
);
1377 ldout(cct
, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret
) << dendl
;
1381 if (zg
.realm_id
!= realm_id
) {
1382 ldout(cct
, 20) << "skipping zonegroup " << zg
.get_name() << " zone realm id " << zg
.realm_id
<< ", not on our realm " << realm_id
<< dendl
;
1386 if (zg
.master_zone
.empty()) {
1387 ldout(cct
, 0) << "ERROR: zonegroup " << zg
.get_name() << " should have a master zone " << dendl
;
1391 if (zg
.zones
.find(zg
.master_zone
) == zg
.zones
.end()) {
1392 ldout(cct
,0) << "ERROR: zonegroup " << zg
.get_name()
1393 << " has a non existent master zone "<< dendl
;
1397 if (zg
.is_master_zonegroup()) {
1398 master_zonegroup
= zg
.get_id();
1399 master_zone
= zg
.master_zone
;
1402 int ret
= period_map
.update(zg
, cct
);
1408 ret
= period_config
.read(store
, realm_id
);
1409 if (ret
< 0 && ret
!= -ENOENT
) {
1410 ldout(cct
, 0) << "ERROR: failed to read period config: "
1411 << cpp_strerror(ret
) << dendl
;
1417 int RGWPeriod::reflect()
1419 for (auto& iter
: period_map
.zonegroups
) {
1420 RGWZoneGroup
& zg
= iter
.second
;
1421 zg
.reinit_instance(cct
, store
);
1422 int r
= zg
.write(false);
1424 ldout(cct
, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter
.first
<< ": " << cpp_strerror(-r
) << dendl
;
1427 if (zg
.is_master_zonegroup()) {
1428 // set master as default if no default exists
1429 r
= zg
.set_as_default(true);
1431 ldout(cct
, 1) << "Set the period's master zonegroup " << zg
.get_id()
1432 << " as the default" << dendl
;
1437 int r
= period_config
.write(store
, realm_id
);
1439 ldout(cct
, 0) << "ERROR: failed to store period config: "
1440 << cpp_strerror(-r
) << dendl
;
1446 void RGWPeriod::fork()
1448 ldout(cct
, 20) << __func__
<< " realm " << realm_id
<< " period " << id
<< dendl
;
1449 predecessor_uuid
= id
;
1450 id
= get_staging_id(realm_id
);
1455 static int read_sync_status(RGWRados
*store
, rgw_meta_sync_status
*sync_status
)
1457 // initialize a sync status manager to read the status
1458 RGWMetaSyncStatusManager
mgr(store
, store
->get_async_rados());
1463 r
= mgr
.read_sync_status(sync_status
);
1468 int RGWPeriod::update_sync_status(const RGWPeriod
¤t_period
,
1469 std::ostream
& error_stream
,
1470 bool force_if_stale
)
1472 rgw_meta_sync_status status
;
1473 int r
= read_sync_status(store
, &status
);
1475 ldout(cct
, 0) << "period failed to read sync status: "
1476 << cpp_strerror(-r
) << dendl
;
1480 std::vector
<std::string
> markers
;
1482 const auto current_epoch
= current_period
.get_realm_epoch();
1483 if (current_epoch
!= status
.sync_info
.realm_epoch
) {
1484 // no sync status markers for the current period
1485 assert(current_epoch
> status
.sync_info
.realm_epoch
);
1486 const int behind
= current_epoch
- status
.sync_info
.realm_epoch
;
1487 if (!force_if_stale
&& current_epoch
> 1) {
1488 error_stream
<< "ERROR: This zone is " << behind
<< " period(s) behind "
1489 "the current master zone in metadata sync. If this zone is promoted "
1490 "to master, any metadata changes during that time are likely to "
1492 "Waiting for this zone to catch up on metadata sync (see "
1493 "'radosgw-admin sync status') is recommended.\n"
1494 "To promote this zone to master anyway, add the flag "
1495 "--yes-i-really-mean-it." << std::endl
;
1498 // empty sync status markers - other zones will skip this period during
1499 // incremental metadata sync
1500 markers
.resize(status
.sync_info
.num_shards
);
1502 markers
.reserve(status
.sync_info
.num_shards
);
1503 for (auto& i
: status
.sync_markers
) {
1504 auto& marker
= i
.second
;
1505 // filter out markers from other periods
1506 if (marker
.realm_epoch
!= current_epoch
) {
1507 marker
.marker
.clear();
1509 markers
.emplace_back(std::move(marker
.marker
));
1513 std::swap(sync_status
, markers
);
1517 int RGWPeriod::commit(RGWRealm
& realm
, const RGWPeriod
& current_period
,
1518 std::ostream
& error_stream
, bool force_if_stale
)
1520 ldout(cct
, 20) << __func__
<< " realm " << realm
.get_id() << " period " << current_period
.get_id() << dendl
;
1521 // gateway must be in the master zone to commit
1522 if (master_zone
!= store
->get_zone_params().get_id()) {
1523 error_stream
<< "Cannot commit period on zone "
1524 << store
->get_zone_params().get_id() << ", it must be sent to "
1525 "the period's master zone " << master_zone
<< '.' << std::endl
;
1528 // period predecessor must match current period
1529 if (predecessor_uuid
!= current_period
.get_id()) {
1530 error_stream
<< "Period predecessor " << predecessor_uuid
1531 << " does not match current period " << current_period
.get_id()
1532 << ". Use 'period pull' to get the latest period from the master, "
1533 "reapply your changes, and try again." << std::endl
;
1536 // realm epoch must be 1 greater than current period
1537 if (realm_epoch
!= current_period
.get_realm_epoch() + 1) {
1538 error_stream
<< "Period's realm epoch " << realm_epoch
1539 << " does not come directly after current realm epoch "
1540 << current_period
.get_realm_epoch() << ". Use 'realm pull' to get the "
1541 "latest realm and period from the master zone, reapply your changes, "
1542 "and try again." << std::endl
;
1545 // did the master zone change?
1546 if (master_zone
!= current_period
.get_master_zone()) {
1547 // store the current metadata sync status in the period
1548 int r
= update_sync_status(current_period
, error_stream
, force_if_stale
);
1550 ldout(cct
, 0) << "failed to update metadata sync status: "
1551 << cpp_strerror(-r
) << dendl
;
1554 // create an object with a new period id
1557 ldout(cct
, 0) << "failed to create new period: " << cpp_strerror(-r
) << dendl
;
1560 // set as current period
1561 r
= realm
.set_current_period(*this);
1563 ldout(cct
, 0) << "failed to update realm's current period: "
1564 << cpp_strerror(-r
) << dendl
;
1567 ldout(cct
, 4) << "Promoted to master zone and committed new period "
1569 realm
.notify_new_period(*this);
1572 // period must be based on current epoch
1573 if (epoch
!= current_period
.get_epoch()) {
1574 error_stream
<< "Period epoch " << epoch
<< " does not match "
1575 "predecessor epoch " << current_period
.get_epoch()
1576 << ". Use 'period pull' to get the latest epoch from the master zone, "
1577 "reapply your changes, and try again." << std::endl
;
1580 // set period as next epoch
1581 set_id(current_period
.get_id());
1582 set_epoch(current_period
.get_epoch() + 1);
1583 set_predecessor(current_period
.get_predecessor());
1584 realm_epoch
= current_period
.get_realm_epoch();
1585 // write the period to rados
1586 int r
= store_info(false);
1588 ldout(cct
, 0) << "failed to store period: " << cpp_strerror(-r
) << dendl
;
1591 // set as latest epoch
1592 r
= update_latest_epoch(epoch
);
1594 // already have this epoch (or a more recent one)
1598 ldout(cct
, 0) << "failed to set latest epoch: " << cpp_strerror(-r
) << dendl
;
1603 ldout(cct
, 0) << "failed to update local objects: " << cpp_strerror(-r
) << dendl
;
1606 ldout(cct
, 4) << "Committed new epoch " << epoch
1607 << " for period " << id
<< dendl
;
1608 realm
.notify_new_period(*this);
1612 int RGWZoneParams::create_default(bool old_format
)
1614 name
= default_zone_name
;
1629 int get_zones_pool_set(CephContext
* cct
,
1631 const list
<string
>& zones
,
1632 const string
& my_zone_id
,
1633 set
<rgw_pool
>& pool_names
)
1635 for(auto const& iter
: zones
) {
1636 RGWZoneParams
zone(iter
);
1637 int r
= zone
.init(cct
, store
);
1639 ldout(cct
, 0) << "Error: init zone " << iter
<< ":" << cpp_strerror(-r
) << dendl
;
1642 if (zone
.get_id() != my_zone_id
) {
1643 pool_names
.insert(zone
.domain_root
);
1644 pool_names
.insert(zone
.metadata_heap
);
1645 pool_names
.insert(zone
.control_pool
);
1646 pool_names
.insert(zone
.gc_pool
);
1647 pool_names
.insert(zone
.log_pool
);
1648 pool_names
.insert(zone
.intent_log_pool
);
1649 pool_names
.insert(zone
.usage_log_pool
);
1650 pool_names
.insert(zone
.user_keys_pool
);
1651 pool_names
.insert(zone
.user_email_pool
);
1652 pool_names
.insert(zone
.user_swift_pool
);
1653 pool_names
.insert(zone
.user_uid_pool
);
1654 pool_names
.insert(zone
.roles_pool
);
1655 pool_names
.insert(zone
.reshard_pool
);
1656 for(auto& iter
: zone
.placement_pools
) {
1657 pool_names
.insert(iter
.second
.index_pool
);
1658 pool_names
.insert(iter
.second
.data_pool
);
1659 pool_names
.insert(iter
.second
.data_extra_pool
);
1666 rgw_pool
fix_zone_pool_dup(set
<rgw_pool
> pools
,
1667 const string
& default_prefix
,
1668 const string
& default_suffix
,
1669 const rgw_pool
& suggested_pool
)
1671 string suggested_name
= suggested_pool
.to_str();
1673 string prefix
= default_prefix
;
1674 string suffix
= default_suffix
;
1676 if (!suggested_pool
.empty()) {
1677 prefix
= suggested_name
.substr(0, suggested_name
.find("."));
1678 suffix
= suggested_name
.substr(prefix
.length());
1681 rgw_pool
pool(prefix
+ suffix
);
1683 if (pools
.find(pool
) == pools
.end()) {
1687 pool
= prefix
+ "_" + std::to_string(std::rand()) + suffix
;
1688 if (pools
.find(pool
) == pools
.end()) {
1695 int RGWZoneParams::fix_pool_names()
1699 int r
= store
->list_zones(zones
);
1701 ldout(cct
, 10) << "WARNING: store->list_zones() returned r=" << r
<< dendl
;
1704 set
<rgw_pool
> pools
;
1705 r
= get_zones_pool_set(cct
, store
, zones
, id
, pools
);
1707 ldout(cct
, 0) << "Error: get_zones_pool_names" << r
<< dendl
;
1711 domain_root
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:root", domain_root
);
1712 if (!metadata_heap
.name
.empty()) {
1713 metadata_heap
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:heap", metadata_heap
);
1715 control_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.control", control_pool
);
1716 gc_pool
= fix_zone_pool_dup(pools
, name
,".rgw.log:gc", gc_pool
);
1717 lc_pool
= fix_zone_pool_dup(pools
, name
,".rgw.log:lc", lc_pool
);
1718 log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log", log_pool
);
1719 intent_log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:intent", intent_log_pool
);
1720 usage_log_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:usage", usage_log_pool
);
1721 user_keys_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.keys", user_keys_pool
);
1722 user_email_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.email", user_email_pool
);
1723 user_swift_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.swift", user_swift_pool
);
1724 user_uid_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:users.uid", user_uid_pool
);
1725 roles_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.meta:roles", roles_pool
);
1726 reshard_pool
= fix_zone_pool_dup(pools
, name
, ".rgw.log:reshard", reshard_pool
);
1728 for(auto& iter
: placement_pools
) {
1729 iter
.second
.index_pool
= fix_zone_pool_dup(pools
, name
, "." + default_bucket_index_pool_suffix
,
1730 iter
.second
.index_pool
);
1731 iter
.second
.data_pool
= fix_zone_pool_dup(pools
, name
, "." + default_storage_pool_suffix
,
1732 iter
.second
.data_pool
);
1733 iter
.second
.data_extra_pool
= fix_zone_pool_dup(pools
, name
, "." + default_storage_extra_pool_suffix
,
1734 iter
.second
.data_extra_pool
);
1740 int RGWZoneParams::create(bool exclusive
)
1742 /* check for old pools config */
1743 rgw_raw_obj
obj(domain_root
, avail_pools
);
1744 int r
= store
->raw_obj_stat(obj
, NULL
, NULL
, NULL
, NULL
, NULL
, NULL
);
1746 ldout(store
->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl
;
1747 /* a new system, let's set new placement info */
1748 RGWZonePlacementInfo default_placement
;
1749 default_placement
.index_pool
= name
+ "." + default_bucket_index_pool_suffix
;
1750 default_placement
.data_pool
= name
+ "." + default_storage_pool_suffix
;
1751 default_placement
.data_extra_pool
= name
+ "." + default_storage_extra_pool_suffix
;
1752 placement_pools
["default-placement"] = default_placement
;
1755 r
= fix_pool_names();
1757 ldout(cct
, 0) << "ERROR: fix_pool_names returned r=" << r
<< dendl
;
1761 r
= RGWSystemMetaObj::create(exclusive
);
1766 // try to set as default. may race with another create, so pass exclusive=true
1767 // so we don't override an existing default
1768 r
= set_as_default(true);
1769 if (r
< 0 && r
!= -EEXIST
) {
1770 ldout(cct
, 10) << "WARNING: failed to set zone as default, r=" << r
<< dendl
;
1776 rgw_pool
RGWZoneParams::get_pool(CephContext
*cct
)
1778 if (cct
->_conf
->rgw_zone_root_pool
.empty()) {
1779 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL
);
1782 return rgw_pool(cct
->_conf
->rgw_zone_root_pool
);
1785 const string
RGWZoneParams::get_default_oid(bool old_format
)
1788 return cct
->_conf
->rgw_default_zone_info_oid
;
1791 return cct
->_conf
->rgw_default_zone_info_oid
+ "." + realm_id
;
1794 const string
& RGWZoneParams::get_names_oid_prefix()
1796 return zone_names_oid_prefix
;
1799 const string
& RGWZoneParams::get_info_oid_prefix(bool old_format
)
1801 return zone_info_oid_prefix
;
1804 const string
& RGWZoneParams::get_predefined_name(CephContext
*cct
) {
1805 return cct
->_conf
->rgw_zone
;
1808 int RGWZoneParams::init(CephContext
*cct
, RGWRados
*store
, bool setup_obj
, bool old_format
)
1811 name
= cct
->_conf
->rgw_zone
;
1814 return RGWSystemMetaObj::init(cct
, store
, setup_obj
, old_format
);
1817 int RGWZoneParams::read_default_id(string
& default_id
, bool old_format
)
1819 if (realm_id
.empty()) {
1820 /* try using default realm */
1822 int ret
= realm
.init(cct
, store
);
1823 //no default realm exist
1825 return read_id(default_zone_name
, default_id
);
1827 realm_id
= realm
.get_id();
1830 return RGWSystemMetaObj::read_default_id(default_id
, old_format
);
1834 int RGWZoneParams::set_as_default(bool exclusive
)
1836 if (realm_id
.empty()) {
1837 /* try using default realm */
1839 int ret
= realm
.init(cct
, store
);
1841 ldout(cct
, 10) << "could not read realm id: " << cpp_strerror(-ret
) << dendl
;
1844 realm_id
= realm
.get_id();
1847 return RGWSystemMetaObj::set_as_default(exclusive
);
1850 const string
& RGWZoneParams::get_compression_type(const string
& placement_rule
) const
1852 static const std::string NONE
{"none"};
1853 auto p
= placement_pools
.find(placement_rule
);
1854 if (p
== placement_pools
.end()) {
1857 const auto& type
= p
->second
.compression_type
;
1858 return !type
.empty() ? type
: NONE
;
1861 void RGWPeriodMap::encode(bufferlist
& bl
) const {
1862 ENCODE_START(2, 1, bl
);
1864 ::encode(zonegroups
, bl
);
1865 ::encode(master_zonegroup
, bl
);
1866 ::encode(short_zone_ids
, bl
);
1870 void RGWPeriodMap::decode(bufferlist::iterator
& bl
) {
1871 DECODE_START(2, bl
);
1873 ::decode(zonegroups
, bl
);
1874 ::decode(master_zonegroup
, bl
);
1875 if (struct_v
>= 2) {
1876 ::decode(short_zone_ids
, bl
);
1880 zonegroups_by_api
.clear();
1881 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.begin();
1882 iter
!= zonegroups
.end(); ++iter
) {
1883 RGWZoneGroup
& zonegroup
= iter
->second
;
1884 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1885 if (zonegroup
.is_master_zonegroup()) {
1886 master_zonegroup
= zonegroup
.get_id();
1891 // run an MD5 hash on the zone_id and return the first 32 bits
1892 static uint32_t gen_short_zone_id(const std::string zone_id
)
1894 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
1896 hash
.Update((const byte
*)zone_id
.c_str(), zone_id
.size());
1900 memcpy((char *)&short_id
, md5
, sizeof(short_id
));
1901 return std::max(short_id
, 1u);
1904 int RGWPeriodMap::update(const RGWZoneGroup
& zonegroup
, CephContext
*cct
)
1906 if (zonegroup
.is_master_zonegroup() && (!master_zonegroup
.empty() && zonegroup
.get_id() != master_zonegroup
)) {
1907 ldout(cct
,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl
;
1908 ldout(cct
,0) << "master zonegroup: " << master_zonegroup
<< " and " << zonegroup
.get_id() <<dendl
;
1911 map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.find(zonegroup
.get_id());
1912 if (iter
!= zonegroups
.end()) {
1913 RGWZoneGroup
& old_zonegroup
= iter
->second
;
1914 if (!old_zonegroup
.api_name
.empty()) {
1915 zonegroups_by_api
.erase(old_zonegroup
.api_name
);
1918 zonegroups
[zonegroup
.get_id()] = zonegroup
;
1920 if (!zonegroup
.api_name
.empty()) {
1921 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
1924 if (zonegroup
.is_master_zonegroup()) {
1925 master_zonegroup
= zonegroup
.get_id();
1926 } else if (master_zonegroup
== zonegroup
.get_id()) {
1927 master_zonegroup
= "";
1930 for (auto& i
: zonegroup
.zones
) {
1931 auto& zone
= i
.second
;
1932 if (short_zone_ids
.find(zone
.id
) != short_zone_ids
.end()) {
1935 // calculate the zone's short id
1936 uint32_t short_id
= gen_short_zone_id(zone
.id
);
1938 // search for an existing zone with the same short id
1939 for (auto& s
: short_zone_ids
) {
1940 if (s
.second
== short_id
) {
1941 ldout(cct
, 0) << "New zone '" << zone
.name
<< "' (" << zone
.id
1942 << ") generates the same short_zone_id " << short_id
1943 << " as existing zone id " << s
.first
<< dendl
;
1948 short_zone_ids
[zone
.id
] = short_id
;
1954 uint32_t RGWPeriodMap::get_zone_short_id(const string
& zone_id
) const
1956 auto i
= short_zone_ids
.find(zone_id
);
1957 if (i
== short_zone_ids
.end()) {
1963 int RGWZoneGroupMap::read(CephContext
*cct
, RGWRados
*store
)
1967 int ret
= period
.init(cct
, store
);
1969 cerr
<< "failed to read current period info: " << cpp_strerror(ret
);
1973 bucket_quota
= period
.get_config().bucket_quota
;
1974 user_quota
= period
.get_config().user_quota
;
1975 zonegroups
= period
.get_map().zonegroups
;
1976 zonegroups_by_api
= period
.get_map().zonegroups_by_api
;
1977 master_zonegroup
= period
.get_map().master_zonegroup
;
1982 void RGWRegionMap::encode(bufferlist
& bl
) const {
1983 ENCODE_START( 3, 1, bl
);
1984 ::encode(regions
, bl
);
1985 ::encode(master_region
, bl
);
1986 ::encode(bucket_quota
, bl
);
1987 ::encode(user_quota
, bl
);
1991 void RGWRegionMap::decode(bufferlist::iterator
& bl
) {
1992 DECODE_START(3, bl
);
1993 ::decode(regions
, bl
);
1994 ::decode(master_region
, bl
);
1996 ::decode(bucket_quota
, bl
);
1998 ::decode(user_quota
, bl
);
2002 void RGWZoneGroupMap::encode(bufferlist
& bl
) const {
2003 ENCODE_START( 3, 1, bl
);
2004 ::encode(zonegroups
, bl
);
2005 ::encode(master_zonegroup
, bl
);
2006 ::encode(bucket_quota
, bl
);
2007 ::encode(user_quota
, bl
);
2011 void RGWZoneGroupMap::decode(bufferlist::iterator
& bl
) {
2012 DECODE_START(3, bl
);
2013 ::decode(zonegroups
, bl
);
2014 ::decode(master_zonegroup
, bl
);
2016 ::decode(bucket_quota
, bl
);
2018 ::decode(user_quota
, bl
);
2021 zonegroups_by_api
.clear();
2022 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroups
.begin();
2023 iter
!= zonegroups
.end(); ++iter
) {
2024 RGWZoneGroup
& zonegroup
= iter
->second
;
2025 zonegroups_by_api
[zonegroup
.api_name
] = zonegroup
;
2026 if (zonegroup
.is_master_zonegroup()) {
2027 master_zonegroup
= zonegroup
.get_name();
2032 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation
*op
)
2034 obj_version
*check_objv
= version_for_check();
2037 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
2040 cls_version_read(*op
, &read_version
);
2043 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation
*op
)
2045 obj_version
*check_objv
= version_for_check();
2046 obj_version
*modify_version
= version_for_write();
2049 cls_version_check(*op
, *check_objv
, VER_COND_EQ
);
2052 if (modify_version
) {
2053 cls_version_set(*op
, *modify_version
);
2055 cls_version_inc(*op
);
2059 void RGWObjManifest::obj_iterator::operator++()
2061 if (manifest
->explicit_objs
) {
2064 if (explicit_iter
== manifest
->objs
.end()) {
2065 ofs
= manifest
->obj_size
;
2069 update_explicit_pos();
2075 uint64_t obj_size
= manifest
->get_obj_size();
2076 uint64_t head_size
= manifest
->get_head_size();
2078 if (ofs
== obj_size
) {
2082 if (manifest
->rules
.empty()) {
2086 /* are we still pointing at the head? */
2087 if (ofs
< head_size
) {
2088 rule_iter
= manifest
->rules
.begin();
2089 RGWObjManifestRule
*rule
= &rule_iter
->second
;
2090 ofs
= MIN(head_size
, obj_size
);
2093 stripe_size
= MIN(obj_size
- ofs
, rule
->stripe_max_size
);
2094 if (rule
->part_size
> 0) {
2095 stripe_size
= MIN(stripe_size
, rule
->part_size
);
2101 RGWObjManifestRule
*rule
= &rule_iter
->second
;
2103 stripe_ofs
+= rule
->stripe_max_size
;
2105 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule
->part_size
<< " rules.size()=" << manifest
->rules
.size() << dendl
;
2107 if (rule
->part_size
> 0) {
2108 /* multi part, multi stripes object */
2110 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
2112 if (stripe_ofs
>= part_ofs
+ rule
->part_size
) {
2113 /* moved to the next part */
2115 part_ofs
+= rule
->part_size
;
2116 stripe_ofs
= part_ofs
;
2118 bool last_rule
= (next_rule_iter
== manifest
->rules
.end());
2119 /* move to the next rule? */
2120 if (!last_rule
&& stripe_ofs
>= next_rule_iter
->second
.start_ofs
) {
2121 rule_iter
= next_rule_iter
;
2122 last_rule
= (next_rule_iter
== manifest
->rules
.end());
2126 cur_part_id
= rule_iter
->second
.start_part_num
;
2131 rule
= &rule_iter
->second
;
2134 stripe_size
= MIN(rule
->part_size
- (stripe_ofs
- part_ofs
), rule
->stripe_max_size
);
2137 cur_override_prefix
= rule
->override_prefix
;
2140 if (ofs
> obj_size
) {
2146 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs
<< " stripe_ofs=" << stripe_ofs
<< " part_ofs=" << part_ofs
<< " rule->part_size=" << rule
->part_size
<< dendl
;
2150 int RGWObjManifest::generator::create_begin(CephContext
*cct
, RGWObjManifest
*_m
, const string
& placement_rule
, rgw_bucket
& _b
, rgw_obj
& _obj
)
2154 manifest
->set_tail_placement(placement_rule
, _b
);
2155 manifest
->set_head(placement_rule
, _obj
, 0);
2158 if (manifest
->get_prefix().empty()) {
2160 gen_rand_alphanumeric(cct
, buf
, sizeof(buf
) - 1);
2162 string oid_prefix
= ".";
2163 oid_prefix
.append(buf
);
2164 oid_prefix
.append("_");
2166 manifest
->set_prefix(oid_prefix
);
2169 bool found
= manifest
->get_rule(0, &rule
);
2171 derr
<< "ERROR: manifest->get_rule() could not find rule" << dendl
;
2175 uint64_t head_size
= manifest
->get_head_size();
2177 if (head_size
> 0) {
2178 cur_stripe_size
= head_size
;
2180 cur_stripe_size
= rule
.stripe_max_size
;
2183 cur_part_id
= rule
.start_part_num
;
2185 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, 0, NULL
, &cur_obj
);
2187 // Normal object which not generated through copy operation
2188 manifest
->set_tail_instance(_obj
.key
.instance
);
2190 manifest
->update_iterators();
2195 int RGWObjManifest::generator::create_next(uint64_t ofs
)
2197 if (ofs
< last_ofs
) /* only going forward */
2200 uint64_t max_head_size
= manifest
->get_max_head_size();
2202 if (ofs
< max_head_size
) {
2203 manifest
->set_head_size(ofs
);
2206 if (ofs
>= max_head_size
) {
2207 manifest
->set_head_size(max_head_size
);
2208 cur_stripe
= (ofs
- max_head_size
) / rule
.stripe_max_size
;
2209 cur_stripe_size
= rule
.stripe_max_size
;
2211 if (cur_part_id
== 0 && max_head_size
> 0) {
2217 manifest
->set_obj_size(ofs
);
2219 manifest
->get_implicit_location(cur_part_id
, cur_stripe
, ofs
, NULL
, &cur_obj
);
2221 manifest
->update_iterators();
2226 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_begin()
2231 const RGWObjManifest::obj_iterator
& RGWObjManifest::obj_end()
2236 RGWObjManifest::obj_iterator
RGWObjManifest::obj_find(uint64_t ofs
)
2238 if (ofs
> obj_size
) {
2241 RGWObjManifest::obj_iterator
iter(this);
2246 int RGWObjManifest::append(RGWObjManifest
& m
, RGWZoneGroup
& zonegroup
, RGWZoneParams
& zone_params
)
2248 if (explicit_objs
|| m
.explicit_objs
) {
2249 return append_explicit(m
, zonegroup
, zone_params
);
2252 if (rules
.empty()) {
2257 string override_prefix
;
2259 if (prefix
.empty()) {
2263 if (prefix
!= m
.prefix
) {
2264 override_prefix
= m
.prefix
;
2267 map
<uint64_t, RGWObjManifestRule
>::iterator miter
= m
.rules
.begin();
2268 if (miter
== m
.rules
.end()) {
2269 return append_explicit(m
, zonegroup
, zone_params
);
2272 for (; miter
!= m
.rules
.end(); ++miter
) {
2273 map
<uint64_t, RGWObjManifestRule
>::reverse_iterator last_rule
= rules
.rbegin();
2275 RGWObjManifestRule
& rule
= last_rule
->second
;
2277 if (rule
.part_size
== 0) {
2278 rule
.part_size
= obj_size
- rule
.start_ofs
;
2281 RGWObjManifestRule
& next_rule
= miter
->second
;
2282 if (!next_rule
.part_size
) {
2283 next_rule
.part_size
= m
.obj_size
- next_rule
.start_ofs
;
2286 string rule_prefix
= prefix
;
2287 if (!rule
.override_prefix
.empty()) {
2288 rule_prefix
= rule
.override_prefix
;
2291 string next_rule_prefix
= m
.prefix
;
2292 if (!next_rule
.override_prefix
.empty()) {
2293 next_rule_prefix
= next_rule
.override_prefix
;
2296 if (rule
.part_size
!= next_rule
.part_size
||
2297 rule
.stripe_max_size
!= next_rule
.stripe_max_size
||
2298 rule_prefix
!= next_rule_prefix
) {
2299 if (next_rule_prefix
!= prefix
) {
2300 append_rules(m
, miter
, &next_rule_prefix
);
2302 append_rules(m
, miter
, NULL
);
2307 uint64_t expected_part_num
= rule
.start_part_num
+ 1;
2308 if (rule
.part_size
> 0) {
2309 expected_part_num
= rule
.start_part_num
+ (obj_size
+ next_rule
.start_ofs
- rule
.start_ofs
) / rule
.part_size
;
2312 if (expected_part_num
!= next_rule
.start_part_num
) {
2313 append_rules(m
, miter
, NULL
);
2318 set_obj_size(obj_size
+ m
.obj_size
);
2323 int RGWObjManifest::append(RGWObjManifest
& m
, RGWRados
*store
)
2325 return append(m
, store
->get_zonegroup(), store
->get_zone_params());
2328 void RGWObjManifest::append_rules(RGWObjManifest
& m
, map
<uint64_t, RGWObjManifestRule
>::iterator
& miter
,
2329 string
*override_prefix
)
2331 for (; miter
!= m
.rules
.end(); ++miter
) {
2332 RGWObjManifestRule rule
= miter
->second
;
2333 rule
.start_ofs
+= obj_size
;
2334 if (override_prefix
)
2335 rule
.override_prefix
= *override_prefix
;
2336 rules
[rule
.start_ofs
] = rule
;
2340 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
2342 if (explicit_objs
) {
2345 obj_iterator iter
= obj_begin();
2347 while (iter
!= obj_end()) {
2348 RGWObjManifestPart
& part
= objs
[iter
.get_stripe_ofs()];
2349 const rgw_obj_select
& os
= iter
.get_location();
2350 const rgw_raw_obj
& raw_loc
= os
.get_raw_obj(zonegroup
, zone_params
);
2353 uint64_t ofs
= iter
.get_stripe_ofs();
2358 rgw_raw_obj_to_obj(tail_placement
.bucket
, raw_loc
, &part
.loc
);
2361 uint64_t next_ofs
= iter
.get_stripe_ofs();
2363 part
.size
= next_ofs
- ofs
;
2366 explicit_objs
= true;
2371 int RGWObjManifest::append_explicit(RGWObjManifest
& m
, const RGWZoneGroup
& zonegroup
, const RGWZoneParams
& zone_params
)
2373 if (!explicit_objs
) {
2374 convert_to_explicit(zonegroup
, zone_params
);
2376 if (!m
.explicit_objs
) {
2377 m
.convert_to_explicit(zonegroup
, zone_params
);
2379 map
<uint64_t, RGWObjManifestPart
>::iterator iter
;
2380 uint64_t base
= obj_size
;
2381 for (iter
= m
.objs
.begin(); iter
!= m
.objs
.end(); ++iter
) {
2382 RGWObjManifestPart
& part
= iter
->second
;
2383 objs
[base
+ iter
->first
] = part
;
2385 obj_size
+= m
.obj_size
;
2390 bool RGWObjManifest::get_rule(uint64_t ofs
, RGWObjManifestRule
*rule
)
2392 if (rules
.empty()) {
2396 map
<uint64_t, RGWObjManifestRule
>::iterator iter
= rules
.upper_bound(ofs
);
2397 if (iter
!= rules
.begin()) {
2401 *rule
= iter
->second
;
2406 void RGWObjVersionTracker::generate_new_write_ver(CephContext
*cct
)
2408 write_version
.ver
= 1;
2411 write_version
.tag
.clear();
2412 append_rand_alpha(cct
, write_version
.tag
, write_version
.tag
, TAG_LEN
);
2415 int RGWPutObjProcessor::complete(size_t accounted_size
, const string
& etag
,
2416 real_time
*mtime
, real_time set_mtime
,
2417 map
<string
, bufferlist
>& attrs
, real_time delete_at
,
2418 const char *if_match
, const char *if_nomatch
, const string
*user_data
,
2419 rgw_zone_set
*zones_trace
)
2421 int r
= do_complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
, if_match
, if_nomatch
, user_data
, zones_trace
);
2425 is_complete
= !canceled
;
2429 CephContext
*RGWPutObjProcessor::ctx()
2431 return store
->ctx();
2434 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2441 set
<rgw_raw_obj
>::iterator iter
;
2442 bool need_to_remove_head
= false;
2443 rgw_raw_obj raw_head
;
2445 if (!head_obj
.empty()) {
2446 store
->obj_to_raw(bucket_info
.placement_rule
, head_obj
, &raw_head
);
2450 * We should delete the object in the "multipart" namespace to avoid race condition.
2451 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2452 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2453 * written by the second upload may be deleted by the first upload.
2454 * details is describled on #11749
2456 * The above comment still stands, but instead of searching for a specific object in the multipart
2457 * namespace, we just make sure that we remove the object that is marked as the head object after
2458 * we remove all the other raw objects. Note that we use different call to remove the head object,
2459 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2461 for (iter
= written_objs
.begin(); iter
!= written_objs
.end(); ++iter
) {
2462 const rgw_raw_obj
& obj
= *iter
;
2463 if (!head_obj
.empty() && obj
== raw_head
) {
2464 ldout(store
->ctx(), 5) << "NOTE: we should not process the head object (" << obj
<< ") here" << dendl
;
2465 need_to_remove_head
= true;
2469 int r
= store
->delete_raw_obj(obj
);
2470 if (r
< 0 && r
!= -ENOENT
) {
2471 ldout(store
->ctx(), 5) << "WARNING: failed to remove obj (" << obj
<< "), leaked" << dendl
;
2475 if (need_to_remove_head
) {
2476 ldout(store
->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head
<< ")" << dendl
;
2477 int r
= store
->delete_obj(obj_ctx
, bucket_info
, head_obj
, 0, 0);
2478 if (r
< 0 && r
!= -ENOENT
) {
2479 ldout(store
->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head
<< "), leaked" << dendl
;
2484 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj
& obj
, bufferlist
& bl
, off_t ofs
, off_t abs_ofs
, void **phandle
, bool exclusive
)
2486 if ((uint64_t)abs_ofs
+ bl
.length() > obj_len
)
2487 obj_len
= abs_ofs
+ bl
.length();
2489 if (!(obj
== last_written_obj
)) {
2490 last_written_obj
= obj
;
2493 // For the first call pass -1 as the offset to
2495 return store
->aio_put_obj_data(NULL
, obj
, bl
, ((ofs
!= 0) ? ofs
: -1), exclusive
, phandle
);
2498 struct put_obj_aio_info
RGWPutObjProcessor_Aio::pop_pending()
2500 struct put_obj_aio_info info
;
2501 info
= pending
.front();
2502 pending
.pop_front();
2503 pending_size
-= info
.size
;
2507 int RGWPutObjProcessor_Aio::wait_pending_front()
2509 if (pending
.empty()) {
2512 struct put_obj_aio_info info
= pop_pending();
2513 int ret
= store
->aio_wait(info
.handle
);
2516 add_written_obj(info
.obj
);
2522 bool RGWPutObjProcessor_Aio::pending_has_completed()
2524 if (pending
.empty())
2527 struct put_obj_aio_info
& info
= pending
.front();
2528 return store
->aio_completed(info
.handle
);
2531 int RGWPutObjProcessor_Aio::drain_pending()
2534 while (!pending
.empty()) {
2535 int r
= wait_pending_front();
2542 int RGWPutObjProcessor_Aio::throttle_data(void *handle
, const rgw_raw_obj
& obj
, uint64_t size
, bool need_to_wait
)
2544 bool _wait
= need_to_wait
;
2547 struct put_obj_aio_info info
;
2548 info
.handle
= handle
;
2551 pending_size
+= size
;
2552 pending
.push_back(info
);
2554 size_t orig_size
= pending_size
;
2556 /* first drain complete IOs */
2557 while (pending_has_completed()) {
2558 int r
= wait_pending_front();
2565 /* resize window in case messages are draining too fast */
2566 if (orig_size
- pending_size
>= window_size
) {
2567 window_size
+= store
->ctx()->_conf
->rgw_max_chunk_size
;
2568 uint64_t max_window_size
= store
->ctx()->_conf
->rgw_put_obj_max_window_size
;
2569 if (window_size
> max_window_size
) {
2570 window_size
= max_window_size
;
2574 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2575 if (pending_size
> window_size
|| _wait
) {
2576 int r
= wait_pending_front();
2583 int RGWPutObjProcessor_Atomic::write_data(bufferlist
& bl
, off_t ofs
, void **phandle
, rgw_raw_obj
*pobj
, bool exclusive
)
2585 if (ofs
>= next_part_ofs
) {
2586 int r
= prepare_next_part(ofs
);
2599 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj
, bl
, ofs
- cur_part_ofs
, ofs
, phandle
, exclusive
);
2602 int RGWPutObjProcessor_Aio::prepare(RGWRados
*store
, string
*oid_rand
)
2604 RGWPutObjProcessor::prepare(store
, oid_rand
);
2606 window_size
= store
->ctx()->_conf
->rgw_put_obj_min_window_size
;
2611 int RGWPutObjProcessor_Atomic::handle_data(bufferlist
& bl
, off_t ofs
, void **phandle
, rgw_raw_obj
*pobj
, bool *again
)
2614 uint64_t max_write_size
= MIN(max_chunk_size
, (uint64_t)next_part_ofs
- data_ofs
);
2616 pending_data_bl
.claim_append(bl
);
2617 if (pending_data_bl
.length() < max_write_size
) {
2622 pending_data_bl
.splice(0, max_write_size
, &bl
);
2624 /* do we have enough data pending accumulated that needs to be written? */
2625 *again
= (pending_data_bl
.length() >= max_chunk_size
);
2627 if (!data_ofs
&& !immutable_head()) {
2628 first_chunk
.claim(bl
);
2629 obj_len
= (uint64_t)first_chunk
.length();
2630 int r
= prepare_next_part(obj_len
);
2637 off_t write_ofs
= data_ofs
;
2638 data_ofs
= write_ofs
+ bl
.length();
2639 bool exclusive
= (!write_ofs
&& immutable_head()); /* immutable head object, need to verify nothing exists there
2640 we could be racing with another upload, to the same
2641 object and cleanup can be messy */
2642 int ret
= write_data(bl
, write_ofs
, phandle
, pobj
, exclusive
);
2643 if (ret
>= 0) { /* we might return, need to clear bl as it was already sent */
2650 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados
*store
, string
*oid_rand
)
2652 RGWPutObjProcessor_Aio::prepare(store
, oid_rand
);
2654 int r
= store
->get_max_chunk_size(bucket_info
.placement_rule
, head_obj
, &max_chunk_size
);
2662 int RGWPutObjProcessor_Atomic::prepare(RGWRados
*store
, string
*oid_rand
)
2664 head_obj
.init(bucket
, obj_str
);
2666 int r
= prepare_init(store
, oid_rand
);
2671 if (!version_id
.empty()) {
2672 head_obj
.key
.set_instance(version_id
);
2673 } else if (versioned_object
) {
2674 store
->gen_rand_obj_instance_name(&head_obj
);
2677 manifest
.set_trivial_rule(max_chunk_size
, store
->ctx()->_conf
->rgw_obj_stripe_size
);
2679 r
= manifest_gen
.create_begin(store
->ctx(), &manifest
, bucket_info
.placement_rule
, head_obj
.bucket
, head_obj
);
2687 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs
) {
2689 int ret
= manifest_gen
.create_next(ofs
);
2691 lderr(store
->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret
<< dendl
;
2695 next_part_ofs
= ofs
+ manifest_gen
.cur_stripe_max_size();
2696 cur_obj
= manifest_gen
.get_cur_obj(store
);
2701 int RGWPutObjProcessor_Atomic::complete_parts()
2703 if (obj_len
> (uint64_t)cur_part_ofs
) {
2704 return prepare_next_part(obj_len
);
2709 int RGWPutObjProcessor_Atomic::complete_writing_data()
2711 if (!data_ofs
&& !immutable_head()) {
2712 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2713 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2714 * clobber first_chunk
2716 if (pending_data_bl
.length() > 0) {
2717 first_chunk
.claim(pending_data_bl
);
2719 obj_len
= (uint64_t)first_chunk
.length();
2721 while (pending_data_bl
.length()) {
2722 void *handle
= nullptr;
2724 uint64_t max_write_size
= MIN(max_chunk_size
, (uint64_t)next_part_ofs
- data_ofs
);
2725 if (max_write_size
> pending_data_bl
.length()) {
2726 max_write_size
= pending_data_bl
.length();
2729 pending_data_bl
.splice(0, max_write_size
, &bl
);
2730 uint64_t write_len
= bl
.length();
2731 int r
= write_data(bl
, data_ofs
, &handle
, &obj
, false);
2733 ldout(store
->ctx(), 0) << "ERROR: write_data() returned " << r
<< dendl
;
2736 data_ofs
+= write_len
;
2737 r
= throttle_data(handle
, obj
, write_len
, false);
2739 ldout(store
->ctx(), 0) << "ERROR: throttle_data() returned " << r
<< dendl
;
2743 if (data_ofs
>= next_part_ofs
) {
2744 r
= prepare_next_part(data_ofs
);
2746 ldout(store
->ctx(), 0) << "ERROR: prepare_next_part() returned " << r
<< dendl
;
2751 int r
= complete_parts();
2756 r
= drain_pending();
2763 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size
, const string
& etag
,
2764 real_time
*mtime
, real_time set_mtime
,
2765 map
<string
, bufferlist
>& attrs
,
2766 real_time delete_at
,
2767 const char *if_match
,
2768 const char *if_nomatch
, const string
*user_data
,
2769 rgw_zone_set
*zones_trace
) {
2770 int r
= complete_writing_data();
2774 obj_ctx
.obj
.set_atomic(head_obj
);
2776 RGWRados::Object
op_target(store
, bucket_info
, obj_ctx
, head_obj
);
2778 /* some object types shouldn't be versioned, e.g., multipart parts */
2779 op_target
.set_versioning_disabled(!versioned_object
);
2781 RGWRados::Object::Write
obj_op(&op_target
);
2783 obj_op
.meta
.data
= &first_chunk
;
2784 obj_op
.meta
.manifest
= &manifest
;
2785 obj_op
.meta
.ptag
= &unique_tag
; /* use req_id as operation tag */
2786 obj_op
.meta
.if_match
= if_match
;
2787 obj_op
.meta
.if_nomatch
= if_nomatch
;
2788 obj_op
.meta
.mtime
= mtime
;
2789 obj_op
.meta
.set_mtime
= set_mtime
;
2790 obj_op
.meta
.owner
= bucket_info
.owner
;
2791 obj_op
.meta
.flags
= PUT_OBJ_CREATE
;
2792 obj_op
.meta
.olh_epoch
= olh_epoch
;
2793 obj_op
.meta
.delete_at
= delete_at
;
2794 obj_op
.meta
.user_data
= user_data
;
2795 obj_op
.meta
.zones_trace
= zones_trace
;
2796 obj_op
.meta
.modify_tail
= true;
2798 r
= obj_op
.write_meta(obj_len
, accounted_size
, attrs
);
2803 canceled
= obj_op
.meta
.canceled
;
2808 const char* RGWRados::admin_commands
[4][3] = {
2810 "cache list name=filter,type=CephString,req=false",
2811 "cache list [filter_str]: list object cache, possibly matching substrings" },
2813 "cache inspect name=target,type=CephString,req=true",
2814 "cache inspect target: print cache element" },
2816 "cache erase name=target,type=CephString,req=true",
2817 "cache erase target: erase element from cache" },
2820 "cache zap: erase all elements from cache" }
2824 int RGWRados::watch(const string
& oid
, uint64_t *watch_handle
, librados::WatchCtx2
*ctx
) {
2825 int r
= control_pool_ctx
.watch2(oid
, watch_handle
, ctx
);
2831 int RGWRados::unwatch(uint64_t watch_handle
)
2833 int r
= control_pool_ctx
.unwatch2(watch_handle
);
2835 ldout(cct
, 0) << "ERROR: rados->unwatch2() returned r=" << r
<< dendl
;
2838 r
= rados
[0].watch_flush();
2840 ldout(cct
, 0) << "ERROR: rados->watch_flush() returned r=" << r
<< dendl
;
2846 void RGWRados::add_watcher(int i
)
2848 ldout(cct
, 20) << "add_watcher() i=" << i
<< dendl
;
2849 Mutex::Locker
l(watchers_lock
);
2850 watchers_set
.insert(i
);
2851 if (watchers_set
.size() == (size_t)num_watchers
) {
2852 ldout(cct
, 2) << "all " << num_watchers
<< " watchers are set, enabling cache" << dendl
;
2853 set_cache_enabled(true);
2857 void RGWRados::remove_watcher(int i
)
2859 ldout(cct
, 20) << "remove_watcher() i=" << i
<< dendl
;
2860 Mutex::Locker
l(watchers_lock
);
2861 size_t orig_size
= watchers_set
.size();
2862 watchers_set
.erase(i
);
2863 if (orig_size
== (size_t)num_watchers
&&
2864 watchers_set
.size() < orig_size
) { /* actually removed */
2865 ldout(cct
, 2) << "removed watcher, disabling cache" << dendl
;
2866 set_cache_enabled(false);
2870 class RGWWatcher
: public librados::WatchCtx2
{
2874 uint64_t watch_handle
;
2876 class C_ReinitWatch
: public Context
{
2877 RGWWatcher
*watcher
;
2879 explicit C_ReinitWatch(RGWWatcher
*_watcher
) : watcher(_watcher
) {}
2880 void finish(int r
) override
{
2885 RGWWatcher(RGWRados
*r
, int i
, const string
& o
) : rados(r
), index(i
), oid(o
), watch_handle(0) {}
2886 void handle_notify(uint64_t notify_id
,
2888 uint64_t notifier_id
,
2889 bufferlist
& bl
) override
{
2890 ldout(rados
->ctx(), 10) << "RGWWatcher::handle_notify() "
2891 << " notify_id " << notify_id
2892 << " cookie " << cookie
2893 << " notifier " << notifier_id
2894 << " bl.length()=" << bl
.length() << dendl
;
2895 rados
->watch_cb(notify_id
, cookie
, notifier_id
, bl
);
2897 bufferlist reply_bl
; // empty reply payload
2898 rados
->control_pool_ctx
.notify_ack(oid
, notify_id
, cookie
, reply_bl
);
2900 void handle_error(uint64_t cookie
, int err
) override
{
2901 lderr(rados
->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2902 << " err " << cpp_strerror(err
) << dendl
;
2903 rados
->remove_watcher(index
);
2904 rados
->schedule_context(new C_ReinitWatch(this));
2908 int ret
= unregister_watch();
2910 ldout(rados
->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret
<< dendl
;
2913 ret
= register_watch();
2915 ldout(rados
->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret
<< dendl
;
2920 int unregister_watch() {
2921 int r
= rados
->unwatch(watch_handle
);
2925 rados
->remove_watcher(index
);
2929 int register_watch() {
2930 int r
= rados
->watch(oid
, &watch_handle
, this);
2934 rados
->add_watcher(index
);
2939 class RGWMetaNotifierManager
: public RGWCoroutinesManager
{
2941 RGWHTTPManager http_manager
;
2944 RGWMetaNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
2945 http_manager(store
->ctx(), completion_mgr
) {
2946 http_manager
.set_threaded();
2949 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, set
<int>& shards
) {
2950 rgw_http_param_pair pairs
[] = { { "type", "metadata" },
2954 list
<RGWCoroutinesStack
*> stacks
;
2955 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
2956 RGWRESTConn
*conn
= iter
->second
;
2957 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
2958 stack
->call(new RGWPostRESTResourceCR
<set
<int>, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
2960 stacks
.push_back(stack
);
2966 class RGWDataNotifierManager
: public RGWCoroutinesManager
{
2968 RGWHTTPManager http_manager
;
2971 RGWDataNotifierManager(RGWRados
*_store
) : RGWCoroutinesManager(_store
->ctx(), _store
->get_cr_registry()), store(_store
),
2972 http_manager(store
->ctx(), completion_mgr
) {
2973 http_manager
.set_threaded();
2976 int notify_all(map
<string
, RGWRESTConn
*>& conn_map
, map
<int, set
<string
> >& shards
) {
2977 rgw_http_param_pair pairs
[] = { { "type", "data" },
2979 { "source-zone", store
->get_zone_params().get_id().c_str() },
2982 list
<RGWCoroutinesStack
*> stacks
;
2983 for (map
<string
, RGWRESTConn
*>::iterator iter
= conn_map
.begin(); iter
!= conn_map
.end(); ++iter
) {
2984 RGWRESTConn
*conn
= iter
->second
;
2985 RGWCoroutinesStack
*stack
= new RGWCoroutinesStack(store
->ctx(), this);
2986 stack
->call(new RGWPostRESTResourceCR
<map
<int, set
<string
> >, int>(store
->ctx(), conn
, &http_manager
, "/admin/log", pairs
, shards
, NULL
));
2988 stacks
.push_back(stack
);
2994 class RGWRadosThread
{
2995 class Worker
: public Thread
{
2997 RGWRadosThread
*processor
;
3002 Mutex::Locker
l(lock
);
3006 void wait_interval(const utime_t
& wait_time
) {
3007 Mutex::Locker
l(lock
);
3008 cond
.WaitInterval(lock
, wait_time
);
3012 Worker(CephContext
*_cct
, RGWRadosThread
*_p
) : cct(_cct
), processor(_p
), lock("RGWRadosThread::Worker") {}
3013 void *entry() override
;
3015 Mutex::Locker
l(lock
);
3026 std::atomic
<bool> down_flag
= { false };
3030 virtual uint64_t interval_msec() = 0;
3031 virtual void stop_process() {}
3033 RGWRadosThread(RGWRados
*_store
, const string
& thread_name
= "radosgw")
3034 : worker(NULL
), cct(_store
->ctx()), store(_store
), thread_name(thread_name
) {}
3035 virtual ~RGWRadosThread() {
3039 virtual int init() { return 0; }
3040 virtual int process() = 0;
3042 bool going_down() { return down_flag
; }
3054 void RGWRadosThread::start()
3056 worker
= new Worker(cct
, this);
3057 worker
->create(thread_name
.c_str());
3060 void RGWRadosThread::stop()
3072 void *RGWRadosThread::Worker::entry() {
3073 uint64_t msec
= processor
->interval_msec();
3074 utime_t interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
3077 utime_t start
= ceph_clock_now();
3078 int r
= processor
->process();
3080 dout(0) << "ERROR: processor->process() returned error r=" << r
<< dendl
;
3083 if (processor
->going_down())
3086 utime_t end
= ceph_clock_now();
3089 uint64_t cur_msec
= processor
->interval_msec();
3090 if (cur_msec
!= msec
) { /* was it reconfigured? */
3092 interval
= utime_t(msec
/ 1000, (msec
% 1000) * 1000000);
3096 if (interval
<= end
)
3097 continue; // next round
3099 utime_t wait_time
= interval
;
3102 wait_interval(wait_time
);
3106 } while (!processor
->going_down());
3111 class RGWMetaNotifier
: public RGWRadosThread
{
3112 RGWMetaNotifierManager notify_mgr
;
3113 RGWMetadataLog
*const log
;
3115 uint64_t interval_msec() override
{
3116 return cct
->_conf
->rgw_md_notify_interval_msec
;
3118 void stop_process() override
{
3122 RGWMetaNotifier(RGWRados
*_store
, RGWMetadataLog
* log
)
3123 : RGWRadosThread(_store
, "meta-notifier"), notify_mgr(_store
), log(log
) {}
3125 int process() override
;
3128 int RGWMetaNotifier::process()
3132 log
->read_clear_modified(shards
);
3134 if (shards
.empty()) {
3138 for (set
<int>::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
3139 ldout(cct
, 20) << __func__
<< "(): notifying mdlog change, shard_id=" << *iter
<< dendl
;
3142 notify_mgr
.notify_all(store
->zone_conn_map
, shards
);
3147 class RGWDataNotifier
: public RGWRadosThread
{
3148 RGWDataNotifierManager notify_mgr
;
3150 uint64_t interval_msec() override
{
3151 return cct
->_conf
->get_val
<int64_t>("rgw_data_notify_interval_msec");
3153 void stop_process() override
{
3157 RGWDataNotifier(RGWRados
*_store
) : RGWRadosThread(_store
, "data-notifier"), notify_mgr(_store
) {}
3159 int process() override
;
3162 int RGWDataNotifier::process()
3164 if (!store
->data_log
) {
3168 map
<int, set
<string
> > shards
;
3170 store
->data_log
->read_clear_modified(shards
);
3172 if (shards
.empty()) {
3176 for (map
<int, set
<string
> >::iterator iter
= shards
.begin(); iter
!= shards
.end(); ++iter
) {
3177 ldout(cct
, 20) << __func__
<< "(): notifying datalog change, shard_id=" << iter
->first
<< ": " << iter
->second
<< dendl
;
3180 notify_mgr
.notify_all(store
->zone_data_notify_to_map
, shards
);
3185 class RGWSyncProcessorThread
: public RGWRadosThread
{
3187 RGWSyncProcessorThread(RGWRados
*_store
, const string
& thread_name
= "radosgw") : RGWRadosThread(_store
, thread_name
) {}
3188 RGWSyncProcessorThread(RGWRados
*_store
) : RGWRadosThread(_store
) {}
3189 ~RGWSyncProcessorThread() override
{}
3190 int init() override
= 0 ;
3191 int process() override
= 0;
3194 class RGWMetaSyncProcessorThread
: public RGWSyncProcessorThread
3196 RGWMetaSyncStatusManager sync
;
3198 uint64_t interval_msec() override
{
3199 return 0; /* no interval associated, it'll run once until stopped */
3201 void stop_process() override
{
3205 RGWMetaSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
)
3206 : RGWSyncProcessorThread(_store
, "meta-sync"), sync(_store
, async_rados
) {}
3208 void wakeup_sync_shards(set
<int>& shard_ids
) {
3209 for (set
<int>::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
3213 RGWMetaSyncStatusManager
* get_manager() { return &sync
; }
3215 int init() override
{
3216 int ret
= sync
.init();
3218 ldout(store
->ctx(), 0) << "ERROR: sync.init() returned " << ret
<< dendl
;
3224 int process() override
{
3230 class RGWDataSyncProcessorThread
: public RGWSyncProcessorThread
3232 RGWDataSyncStatusManager sync
;
3235 uint64_t interval_msec() override
{
3237 return 0; /* no interval associated, it'll run once until stopped */
3239 #define DATA_SYNC_INIT_WAIT_SEC 20
3240 return DATA_SYNC_INIT_WAIT_SEC
* 1000;
3243 void stop_process() override
{
3247 RGWDataSyncProcessorThread(RGWRados
*_store
, RGWAsyncRadosProcessor
*async_rados
,
3248 const string
& _source_zone
)
3249 : RGWSyncProcessorThread(_store
, "data-sync"),
3250 sync(_store
, async_rados
, _source_zone
),
3251 initialized(false) {}
3253 void wakeup_sync_shards(map
<int, set
<string
> >& shard_ids
) {
3254 for (map
<int, set
<string
> >::iterator iter
= shard_ids
.begin(); iter
!= shard_ids
.end(); ++iter
) {
3255 sync
.wakeup(iter
->first
, iter
->second
);
3258 RGWDataSyncStatusManager
* get_manager() { return &sync
; }
3260 int init() override
{
3264 int process() override
{
3265 while (!initialized
) {
3269 int ret
= sync
.init();
3274 /* we'll be back! */
3282 class RGWSyncLogTrimThread
: public RGWSyncProcessorThread
3284 RGWCoroutinesManager crs
;
3286 rgw::BucketTrimManager
*bucket_trim
;
3287 RGWHTTPManager http
;
3288 const utime_t trim_interval
;
3290 uint64_t interval_msec() override
{ return 0; }
3291 void stop_process() override
{ crs
.stop(); }
3293 RGWSyncLogTrimThread(RGWRados
*store
, rgw::BucketTrimManager
*bucket_trim
,
3295 : RGWSyncProcessorThread(store
, "sync-log-trim"),
3296 crs(store
->ctx(), store
->get_cr_registry()), store(store
),
3297 bucket_trim(bucket_trim
),
3298 http(store
->ctx(), crs
.get_completion_mgr()),
3299 trim_interval(interval
, 0)
3302 int init() override
{
3303 return http
.set_threaded();
3305 int process() override
{
3306 list
<RGWCoroutinesStack
*> stacks
;
3307 auto meta
= new RGWCoroutinesStack(store
->ctx(), &crs
);
3308 meta
->call(create_meta_log_trim_cr(store
, &http
,
3309 cct
->_conf
->rgw_md_log_max_shards
,
3311 stacks
.push_back(meta
);
3313 auto data
= new RGWCoroutinesStack(store
->ctx(), &crs
);
3314 data
->call(create_data_log_trim_cr(store
, &http
,
3315 cct
->_conf
->rgw_data_log_num_shards
,
3317 stacks
.push_back(data
);
3319 auto bucket
= new RGWCoroutinesStack(store
->ctx(), &crs
);
3320 bucket
->call(bucket_trim
->create_bucket_trim_cr(&http
));
3321 stacks
.push_back(bucket
);
3328 void RGWRados::wakeup_meta_sync_shards(set
<int>& shard_ids
)
3330 Mutex::Locker
l(meta_sync_thread_lock
);
3331 if (meta_sync_processor_thread
) {
3332 meta_sync_processor_thread
->wakeup_sync_shards(shard_ids
);
3336 void RGWRados::wakeup_data_sync_shards(const string
& source_zone
, map
<int, set
<string
> >& shard_ids
)
3338 ldout(ctx(), 20) << __func__
<< ": source_zone=" << source_zone
<< ", shard_ids=" << shard_ids
<< dendl
;
3339 Mutex::Locker
l(data_sync_thread_lock
);
3340 map
<string
, RGWDataSyncProcessorThread
*>::iterator iter
= data_sync_processor_threads
.find(source_zone
);
3341 if (iter
== data_sync_processor_threads
.end()) {
3342 ldout(ctx(), 10) << __func__
<< ": couldn't find sync thread for zone " << source_zone
<< ", skipping async data sync processing" << dendl
;
3346 RGWDataSyncProcessorThread
*thread
= iter
->second
;
3348 thread
->wakeup_sync_shards(shard_ids
);
3351 RGWMetaSyncStatusManager
* RGWRados::get_meta_sync_manager()
3353 Mutex::Locker
l(meta_sync_thread_lock
);
3354 if (meta_sync_processor_thread
) {
3355 return meta_sync_processor_thread
->get_manager();
3360 RGWDataSyncStatusManager
* RGWRados::get_data_sync_manager(const std::string
& source_zone
)
3362 Mutex::Locker
l(data_sync_thread_lock
);
3363 auto thread
= data_sync_processor_threads
.find(source_zone
);
3364 if (thread
== data_sync_processor_threads
.end()) {
3367 return thread
->second
->get_manager();
3370 int RGWRados::get_required_alignment(const rgw_pool
& pool
, uint64_t *alignment
)
3373 int r
= open_pool_ctx(pool
, ioctx
);
3375 ldout(cct
, 0) << "ERROR: open_pool_ctx() returned " << r
<< dendl
;
3380 r
= ioctx
.pool_requires_alignment2(&requires
);
3382 ldout(cct
, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3393 r
= ioctx
.pool_required_alignment2(&align
);
3395 ldout(cct
, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3400 ldout(cct
, 20) << "required alignment=" << align
<< dendl
;
3406 int RGWRados::get_max_chunk_size(const rgw_pool
& pool
, uint64_t *max_chunk_size
)
3408 uint64_t alignment
= 0;
3409 int r
= get_required_alignment(pool
, &alignment
);
3414 uint64_t config_chunk_size
= cct
->_conf
->rgw_max_chunk_size
;
3416 if (alignment
== 0) {
3417 *max_chunk_size
= config_chunk_size
;
3421 if (config_chunk_size
<= alignment
) {
3422 *max_chunk_size
= alignment
;
3426 *max_chunk_size
= config_chunk_size
- (config_chunk_size
% alignment
);
3428 ldout(cct
, 20) << "max_chunk_size=" << *max_chunk_size
<< dendl
;
3433 int RGWRados::get_max_chunk_size(const string
& placement_rule
, const rgw_obj
& obj
, uint64_t *max_chunk_size
)
3436 if (!get_obj_data_pool(placement_rule
, obj
, &pool
)) {
3437 ldout(cct
, 0) << "ERROR: failed to get data pool for object " << obj
<< dendl
;
3440 return get_max_chunk_size(pool
, max_chunk_size
);
3443 class RGWIndexCompletionManager
;
3445 struct complete_op_data
{
3446 Mutex lock
{"complete_op_data"};
3447 AioCompletion
*rados_completion
{nullptr};
3448 int manager_shard_id
{-1};
3449 RGWIndexCompletionManager
*manager
{nullptr};
3453 rgw_bucket_entry_ver ver
;
3454 cls_rgw_obj_key key
;
3455 rgw_bucket_dir_entry_meta dir_meta
;
3456 list
<cls_rgw_obj_key
> remove_objs
;
3459 rgw_zone_set zones_trace
;
3461 bool stopped
{false};
3464 Mutex::Locker
l(lock
);
3469 class RGWIndexCompletionThread
: public RGWRadosThread
{
3472 uint64_t interval_msec() override
{
3476 list
<complete_op_data
*> completions
;
3478 Mutex completions_lock
;
3480 RGWIndexCompletionThread(RGWRados
*_store
)
3481 : RGWRadosThread(_store
, "index-complete"), store(_store
), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3483 int process() override
;
3485 void add_completion(complete_op_data
*completion
) {
3487 Mutex::Locker
l(completions_lock
);
3488 completions
.push_back(completion
);
3495 int RGWIndexCompletionThread::process()
3497 list
<complete_op_data
*> comps
;
3500 Mutex::Locker
l(completions_lock
);
3501 completions
.swap(comps
);
3504 for (auto c
: comps
) {
3505 std::unique_ptr
<complete_op_data
> up
{c
};
3510 ldout(store
->ctx(), 20) << __func__
<< "(): handling completion for key=" << c
->key
<< dendl
;
3512 RGWRados::BucketShard
bs(store
);
3513 RGWBucketInfo bucket_info
;
3515 int r
= bs
.init(c
->obj
.bucket
, c
->obj
, &bucket_info
);
3517 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): failed to initialize BucketShard, obj=" << c
->obj
<< " r=" << r
<< dendl
;
3518 /* not much to do */
3522 r
= store
->guard_reshard(&bs
, c
->obj
, bucket_info
,
3523 [&](RGWRados::BucketShard
*bs
) -> int {
3524 librados::ObjectWriteOperation o
;
3525 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
3526 cls_rgw_bucket_complete_op(o
, c
->op
, c
->tag
, c
->ver
, c
->key
, c
->dir_meta
, &c
->remove_objs
,
3527 c
->log_op
, c
->bilog_op
, &c
->zones_trace
);
3528 return bs
->index_ctx
.operate(bs
->bucket_obj
, &o
);
3531 ldout(cct
, 0) << "ERROR: " << __func__
<< "(): bucket index completion failed, obj=" << c
->obj
<< " r=" << r
<< dendl
;
3532 /* ignoring error, can't do anything about it */
3535 r
= store
->data_log
->add_entry(bs
.bucket
, bs
.shard_id
);
3537 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
3544 class RGWIndexCompletionManager
{
3545 RGWRados
*store
{nullptr};
3546 vector
<Mutex
*> locks
;
3547 vector
<set
<complete_op_data
*> > completions
;
3549 RGWIndexCompletionThread
*completion_thread
{nullptr};
3553 std::atomic
<int> cur_shard
{0};
3557 RGWIndexCompletionManager(RGWRados
*_store
) : store(_store
) {
3558 num_shards
= store
->ctx()->_conf
->rgw_thread_pool_size
;
3560 for (int i
= 0; i
< num_shards
; i
++) {
3562 snprintf(buf
, sizeof(buf
), "RGWIndexCompletionManager::lock::%d", i
);
3563 locks
.push_back(new Mutex(buf
));
3566 completions
.resize(num_shards
);
3568 ~RGWIndexCompletionManager() {
3571 for (auto l
: locks
) {
3577 int result
= cur_shard
% num_shards
;
3582 void create_completion(const rgw_obj
& obj
,
3583 RGWModifyOp op
, string
& tag
,
3584 rgw_bucket_entry_ver
& ver
,
3585 const cls_rgw_obj_key
& key
,
3586 rgw_bucket_dir_entry_meta
& dir_meta
,
3587 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
3589 rgw_zone_set
*zones_trace
,
3590 complete_op_data
**result
);
3591 bool handle_completion(completion_t cb
, complete_op_data
*arg
);
3594 completion_thread
= new RGWIndexCompletionThread(store
);
3595 int ret
= completion_thread
->init();
3599 completion_thread
->start();
3603 if (completion_thread
) {
3604 completion_thread
->stop();
3605 delete completion_thread
;
3608 for (int i
= 0; i
< num_shards
; ++i
) {
3609 Mutex::Locker
l(*locks
[i
]);
3610 for (auto c
: completions
[i
]) {
3614 completions
.clear();
3618 static void obj_complete_cb(completion_t cb
, void *arg
)
3620 complete_op_data
*completion
= (complete_op_data
*)arg
;
3621 completion
->lock
.Lock();
3622 if (completion
->stopped
) {
3623 completion
->lock
.Unlock(); /* can drop lock, no one else is referencing us */
3627 bool need_delete
= completion
->manager
->handle_completion(cb
, completion
);
3628 completion
->lock
.Unlock();
3635 void RGWIndexCompletionManager::create_completion(const rgw_obj
& obj
,
3636 RGWModifyOp op
, string
& tag
,
3637 rgw_bucket_entry_ver
& ver
,
3638 const cls_rgw_obj_key
& key
,
3639 rgw_bucket_dir_entry_meta
& dir_meta
,
3640 list
<cls_rgw_obj_key
> *remove_objs
, bool log_op
,
3642 rgw_zone_set
*zones_trace
,
3643 complete_op_data
**result
)
3645 complete_op_data
*entry
= new complete_op_data
;
3647 int shard_id
= next_shard();
3649 entry
->manager_shard_id
= shard_id
;
3650 entry
->manager
= this;
3656 entry
->dir_meta
= dir_meta
;
3657 entry
->log_op
= log_op
;
3658 entry
->bilog_op
= bilog_op
;
3661 for (auto iter
= remove_objs
->begin(); iter
!= remove_objs
->end(); ++iter
) {
3662 entry
->remove_objs
.push_back(*iter
);
3667 entry
->zones_trace
= *zones_trace
;
3669 entry
->zones_trace
.insert(store
->get_zone().id
);
3674 entry
->rados_completion
= librados::Rados::aio_create_completion(entry
, NULL
, obj_complete_cb
);
3676 Mutex::Locker
l(*locks
[shard_id
]);
3677 completions
[shard_id
].insert(entry
);
3680 bool RGWIndexCompletionManager::handle_completion(completion_t cb
, complete_op_data
*arg
)
3682 int shard_id
= arg
->manager_shard_id
;
3684 Mutex::Locker
l(*locks
[shard_id
]);
3686 auto& comps
= completions
[shard_id
];
3688 auto iter
= comps
.find(arg
);
3689 if (iter
== comps
.end()) {
3696 int r
= rados_aio_get_return_value(cb
);
3697 if (r
!= -ERR_BUSY_RESHARDING
) {
3700 completion_thread
->add_completion(arg
);
3704 void RGWRados::finalize()
3706 auto admin_socket
= cct
->get_admin_socket();
3707 for (auto cmd
: admin_commands
) {
3708 int r
= admin_socket
->unregister_command(cmd
[0]);
3710 lderr(cct
) << "ERROR: fail to unregister admin socket command (r=" << r
3715 if (run_sync_thread
) {
3716 Mutex::Locker
l(meta_sync_thread_lock
);
3717 meta_sync_processor_thread
->stop();
3719 Mutex::Locker
dl(data_sync_thread_lock
);
3720 for (auto iter
: data_sync_processor_threads
) {
3721 RGWDataSyncProcessorThread
*thread
= iter
.second
;
3724 if (sync_log_trimmer
) {
3725 sync_log_trimmer
->stop();
3729 async_rados
->stop();
3731 if (run_sync_thread
) {
3732 delete meta_sync_processor_thread
;
3733 meta_sync_processor_thread
= NULL
;
3734 Mutex::Locker
dl(data_sync_thread_lock
);
3735 for (auto iter
: data_sync_processor_threads
) {
3736 RGWDataSyncProcessorThread
*thread
= iter
.second
;
3739 data_sync_processor_threads
.clear();
3740 delete sync_log_trimmer
;
3741 sync_log_trimmer
= nullptr;
3742 bucket_trim
= boost::none
;
3747 if (need_watch_notify()) {
3751 /* delete finisher only after cleaning up watches, as watch error path might call
3752 * into finisher. We stop finisher before finalizing watch to make sure we don't
3753 * actually handle any racing work
3757 if (meta_notifier
) {
3758 meta_notifier
->stop();
3759 delete meta_notifier
;
3761 if (data_notifier
) {
3762 data_notifier
->stop();
3763 delete data_notifier
;
3779 delete rest_master_conn
;
3781 map
<string
, RGWRESTConn
*>::iterator iter
;
3782 for (iter
= zone_conn_map
.begin(); iter
!= zone_conn_map
.end(); ++iter
) {
3783 RGWRESTConn
*conn
= iter
->second
;
3787 for (iter
= zonegroup_conn_map
.begin(); iter
!= zonegroup_conn_map
.end(); ++iter
) {
3788 RGWRESTConn
*conn
= iter
->second
;
3791 RGWQuotaHandler::free_handler(quota_handler
);
3797 delete obj_tombstone_cache
;
3798 delete sync_modules_manager
;
3800 if (reshard_wait
.get()) {
3801 reshard_wait
->stop();
3802 reshard_wait
.reset();
3805 if (run_reshard_thread
) {
3806 reshard
->stop_processor();
3809 delete index_completion_manager
;
3813 * Initialize the RADOS instance and prepare to do other ops
3814 * Returns 0 on success, -ERR# on failure.
3816 int RGWRados::init_rados()
3819 auto admin_socket
= cct
->get_admin_socket();
3820 for (auto cmd
: admin_commands
) {
3821 int r
= admin_socket
->register_command(cmd
[0], cmd
[1], this,
3824 lderr(cct
) << "ERROR: fail to register admin socket command (r=" << r
3830 auto handles
= std::vector
<librados::Rados
>{cct
->_conf
->rgw_num_rados_handles
};
3832 for (auto& r
: handles
) {
3833 ret
= r
.init_with_context(cct
);
3843 sync_modules_manager
= new RGWSyncModulesManager();
3845 rgw_register_sync_modules(sync_modules_manager
);
3847 auto crs
= std::unique_ptr
<RGWCoroutinesManagerRegistry
>{
3848 new RGWCoroutinesManagerRegistry(cct
)};
3849 ret
= crs
->hook_to_admin_command("cr dump");
3854 meta_mgr
= new RGWMetadataManager(cct
, this);
3855 data_log
= new RGWDataChangesLog(cct
, this);
3856 cr_registry
= crs
.release();
3858 std::swap(handles
, rados
);
3863 int RGWRados::register_to_service_map(const string
& daemon_type
, const map
<string
, string
>& meta
)
3865 map
<string
,string
> metadata
= meta
;
3866 metadata
["num_handles"] = stringify(rados
.size());
3867 metadata
["zonegroup_id"] = zonegroup
.get_id();
3868 metadata
["zonegroup_name"] = zonegroup
.get_name();
3869 metadata
["zone_name"] = zone_name();
3870 metadata
["zone_id"] = zone_id();;
3871 string name
= cct
->_conf
->name
.get_id();
3872 if (name
.find("rgw.") == 0) {
3873 name
= name
.substr(4);
3875 int ret
= rados
[0].service_daemon_register(daemon_type
, name
, metadata
);
3877 ldout(cct
, 0) << "ERROR: service_daemon_register() returned ret=" << ret
<< ": " << cpp_strerror(-ret
) << dendl
;
3885 * Add new connection to connections map
3886 * @param zonegroup_conn_map map which new connection will be added to
3887 * @param zonegroup zonegroup which new connection will connect to
3888 * @param new_connection pointer to new connection instance
3890 static void add_new_connection_to_map(map
<string
, RGWRESTConn
*> &zonegroup_conn_map
,
3891 const RGWZoneGroup
&zonegroup
, RGWRESTConn
*new_connection
)
3893 // Delete if connection is already exists
3894 map
<string
, RGWRESTConn
*>::iterator iterZoneGroup
= zonegroup_conn_map
.find(zonegroup
.get_id());
3895 if (iterZoneGroup
!= zonegroup_conn_map
.end()) {
3896 delete iterZoneGroup
->second
;
3899 // Add new connection to connections map
3900 zonegroup_conn_map
[zonegroup
.get_id()] = new_connection
;
3903 int RGWRados::convert_regionmap()
3905 RGWZoneGroupMap zonegroupmap
;
3907 string pool_name
= cct
->_conf
->rgw_zone_root_pool
;
3908 if (pool_name
.empty()) {
3909 pool_name
= RGW_DEFAULT_ZONE_ROOT_POOL
;
3911 string oid
= region_map_oid
;
3913 rgw_pool
pool(pool_name
);
3915 RGWObjectCtx
obj_ctx(this);
3916 int ret
= rgw_get_system_obj(this, obj_ctx
, pool
, oid
, bl
, NULL
, NULL
);
3917 if (ret
< 0 && ret
!= -ENOENT
) {
3919 } else if (ret
== -ENOENT
) {
3924 bufferlist::iterator iter
= bl
.begin();
3925 ::decode(zonegroupmap
, iter
);
3926 } catch (buffer::error
& err
) {
3927 ldout(cct
, 0) << "error decoding regionmap from " << pool
<< ":" << oid
<< dendl
;
3931 for (map
<string
, RGWZoneGroup
>::iterator iter
= zonegroupmap
.zonegroups
.begin();
3932 iter
!= zonegroupmap
.zonegroups
.end(); ++iter
) {
3933 RGWZoneGroup
& zonegroup
= iter
->second
;
3934 ret
= zonegroup
.init(cct
, this, false);
3935 ret
= zonegroup
.update();
3936 if (ret
< 0 && ret
!= -ENOENT
) {
3937 ldout(cct
, 0) << "Error could not update zonegroup " << zonegroup
.get_name() << ": " <<
3938 cpp_strerror(-ret
) << dendl
;
3940 } else if (ret
== -ENOENT
) {
3941 ret
= zonegroup
.create();
3943 ldout(cct
, 0) << "Error could not create " << zonegroup
.get_name() << ": " <<
3944 cpp_strerror(-ret
) << dendl
;
3950 current_period
.set_user_quota(zonegroupmap
.user_quota
);
3951 current_period
.set_bucket_quota(zonegroupmap
.bucket_quota
);
3953 // remove the region_map so we don't try to convert again
3954 rgw_raw_obj
obj(pool
, oid
);
3955 ret
= delete_system_obj(obj
);
3957 ldout(cct
, 0) << "Error could not remove " << obj
3958 << " after upgrading to zonegroup map: " << cpp_strerror(ret
) << dendl
;
3966 * Replace all region configuration with zonegroup for
3967 * backward compatability
3968 * Returns 0 on success, -ERR# on failure.
3970 int RGWRados::replace_region_with_zonegroup()
3972 /* copy default region */
3973 /* convert default region to default zonegroup */
3974 string default_oid
= cct
->_conf
->rgw_default_region_info_oid
;
3975 if (default_oid
.empty()) {
3976 default_oid
= default_region_info_oid
;
3980 RGWZoneGroup default_zonegroup
;
3981 rgw_pool pool
{default_zonegroup
.get_pool(cct
)};
3982 string oid
= "converted";
3984 RGWObjectCtx
obj_ctx(this);
3986 int ret
= rgw_get_system_obj(this, obj_ctx
, pool
,oid
, bl
, NULL
, NULL
);
3987 if (ret
< 0 && ret
!= -ENOENT
) {
3988 ldout(cct
, 0) << __func__
<< " failed to read converted: ret "<< ret
<< " " << cpp_strerror(-ret
)
3991 } else if (ret
!= -ENOENT
) {
3992 ldout(cct
, 20) << "System already converted " << dendl
;
3996 string default_region
;
3997 ret
= default_zonegroup
.init(cct
, this, false, true);
3999 ldout(cct
, 0) << __func__
<< " failed init default region: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4002 ret
= default_zonegroup
.read_default_id(default_region
, true);
4003 if (ret
< 0 && ret
!= -ENOENT
) {
4004 ldout(cct
, 0) << __func__
<< " failed reading old default region: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4008 /* convert regions to zonegroups */
4009 list
<string
> regions
;
4010 ret
= list_regions(regions
);
4011 if (ret
< 0 && ret
!= -ENOENT
) {
4012 ldout(cct
, 0) << __func__
<< " failed to list regions: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4014 } else if (ret
== -ENOENT
|| regions
.empty()) {
4015 RGWZoneParams
zoneparams(default_zone_name
);
4016 int ret
= zoneparams
.init(cct
, this);
4017 if (ret
< 0 && ret
!= -ENOENT
) {
4018 ldout(cct
, 0) << __func__
<< ": error initializing default zone params: " << cpp_strerror(-ret
) << dendl
;
4021 /* update master zone */
4022 RGWZoneGroup
default_zg(default_zonegroup_name
);
4023 ret
= default_zg
.init(cct
, this);
4024 if (ret
< 0 && ret
!= -ENOENT
) {
4025 ldout(cct
, 0) << __func__
<< ": error in initializing default zonegroup: " << cpp_strerror(-ret
) << dendl
;
4028 if (ret
!= -ENOENT
&& default_zg
.master_zone
.empty()) {
4029 default_zg
.master_zone
= zoneparams
.get_id();
4030 return default_zg
.update();
4035 string master_region
, master_zone
;
4036 for (list
<string
>::iterator iter
= regions
.begin(); iter
!= regions
.end(); ++iter
) {
4037 if (*iter
!= default_zonegroup_name
){
4038 RGWZoneGroup
region(*iter
);
4039 int ret
= region
.init(cct
, this, true, true);
4041 ldout(cct
, 0) << __func__
<< " failed init region "<< *iter
<< ": " << cpp_strerror(-ret
) << dendl
;
4044 if (region
.is_master_zonegroup()) {
4045 master_region
= region
.get_id();
4046 master_zone
= region
.master_zone
;
4051 /* create realm if there is none.
4052 The realm name will be the region and zone concatenated
4053 realm id will be mds of its name */
4054 if (realm
.get_id().empty() && !master_region
.empty() && !master_zone
.empty()) {
4055 string new_realm_name
= master_region
+ "." + master_zone
;
4056 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
4057 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
4059 hash
.Update((const byte
*)new_realm_name
.c_str(), new_realm_name
.length());
4061 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
4062 string
new_realm_id(md5_str
);
4063 RGWRealm
new_realm(new_realm_id
,new_realm_name
);
4064 ret
= new_realm
.init(cct
, this, false);
4066 ldout(cct
, 0) << __func__
<< " Error initing new realm: " << cpp_strerror(-ret
) << dendl
;
4069 ret
= new_realm
.create();
4070 if (ret
< 0 && ret
!= -EEXIST
) {
4071 ldout(cct
, 0) << __func__
<< " Error creating new realm: " << cpp_strerror(-ret
) << dendl
;
4074 ret
= new_realm
.set_as_default();
4076 ldout(cct
, 0) << __func__
<< " Error setting realm as default: " << cpp_strerror(-ret
) << dendl
;
4079 ret
= realm
.init(cct
, this);
4081 ldout(cct
, 0) << __func__
<< " Error initing realm: " << cpp_strerror(-ret
) << dendl
;
4084 ret
= current_period
.init(cct
, this, realm
.get_id(), realm
.get_name());
4086 ldout(cct
, 0) << __func__
<< " Error initing current period: " << cpp_strerror(-ret
) << dendl
;
4091 list
<string
>::iterator iter
;
4092 /* create zonegroups */
4093 for (iter
= regions
.begin(); iter
!= regions
.end(); ++iter
)
4095 ldout(cct
, 0) << __func__
<< "Converting " << *iter
<< dendl
;
4096 /* check to see if we don't have already a zonegroup with this name */
4097 RGWZoneGroup
new_zonegroup(*iter
);
4098 ret
= new_zonegroup
.init(cct
, this);
4099 if (ret
== 0 && new_zonegroup
.get_id() != *iter
) {
4100 ldout(cct
, 0) << __func__
<< " zonegroup "<< *iter
<< " already exists id " << new_zonegroup
.get_id () <<
4101 " skipping conversion " << dendl
;
4104 RGWZoneGroup
zonegroup(*iter
);
4105 zonegroup
.set_id(*iter
);
4106 int ret
= zonegroup
.init(cct
, this, true, true);
4108 ldout(cct
, 0) << __func__
<< " failed init zonegroup: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4111 zonegroup
.realm_id
= realm
.get_id();
4112 /* fix default region master zone */
4113 if (*iter
== default_zonegroup_name
&& zonegroup
.master_zone
.empty()) {
4114 ldout(cct
, 0) << __func__
<< " Setting default zone as master for default region" << dendl
;
4115 zonegroup
.master_zone
= default_zone_name
;
4117 ret
= zonegroup
.update();
4118 if (ret
< 0 && ret
!= -EEXIST
) {
4119 ldout(cct
, 0) << __func__
<< " failed to update zonegroup " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4123 ret
= zonegroup
.update_name();
4124 if (ret
< 0 && ret
!= -EEXIST
) {
4125 ldout(cct
, 0) << __func__
<< " failed to update_name for zonegroup " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4129 if (zonegroup
.get_name() == default_region
) {
4130 ret
= zonegroup
.set_as_default();
4132 ldout(cct
, 0) << __func__
<< " failed to set_as_default " << *iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4137 for (map
<string
, RGWZone
>::const_iterator iter
= zonegroup
.zones
.begin(); iter
!= zonegroup
.zones
.end();
4139 ldout(cct
, 0) << __func__
<< " Converting zone" << iter
->first
<< dendl
;
4140 RGWZoneParams
zoneparams(iter
->first
, iter
->first
);
4141 zoneparams
.set_id(iter
->first
);
4142 zoneparams
.realm_id
= realm
.get_id();
4143 ret
= zoneparams
.init(cct
, this);
4144 if (ret
< 0 && ret
!= -ENOENT
) {
4145 ldout(cct
, 0) << __func__
<< " failed to init zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
4147 } else if (ret
== -ENOENT
) {
4148 ldout(cct
, 0) << __func__
<< " zone is part of another cluster " << iter
->first
<< " skipping " << dendl
;
4151 zonegroup
.realm_id
= realm
.get_id();
4152 ret
= zoneparams
.update();
4153 if (ret
< 0 && ret
!= -EEXIST
) {
4154 ldout(cct
, 0) << __func__
<< " failed to update zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
4157 ret
= zoneparams
.update_name();
4158 if (ret
< 0 && ret
!= -EEXIST
) {
4159 ldout(cct
, 0) << __func__
<< " failed to init zoneparams " << iter
->first
<< ": " << cpp_strerror(-ret
) << dendl
;
4164 if (!current_period
.get_id().empty()) {
4165 ret
= current_period
.add_zonegroup(zonegroup
);
4167 ldout(cct
, 0) << __func__
<< " failed to add zonegroup to current_period: " << cpp_strerror(-ret
) << dendl
;
4173 if (!current_period
.get_id().empty()) {
4174 ret
= current_period
.update();
4176 ldout(cct
, 0) << __func__
<< " failed to update new period: " << cpp_strerror(-ret
) << dendl
;
4179 ret
= current_period
.store_info(false);
4181 ldout(cct
, 0) << __func__
<< " failed to store new period: " << cpp_strerror(-ret
) << dendl
;
4184 ret
= current_period
.reflect();
4186 ldout(cct
, 0) << __func__
<< " failed to update local objects: " << cpp_strerror(-ret
) << dendl
;
4191 for (auto const& iter
: regions
) {
4192 RGWZoneGroup
zonegroup(iter
);
4193 int ret
= zonegroup
.init(cct
, this, true, true);
4195 ldout(cct
, 0) << __func__
<< " failed init zonegroup" << iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4198 ret
= zonegroup
.delete_obj(true);
4199 if (ret
< 0 && ret
!= -ENOENT
) {
4200 ldout(cct
, 0) << __func__
<< " failed to delete region " << iter
<< ": ret "<< ret
<< " " << cpp_strerror(-ret
)
4206 /* mark as converted */
4207 ret
= rgw_put_system_obj(this, pool
, oid
, bl
.c_str(), bl
.length(),
4208 true, NULL
, real_time(), NULL
);
4210 ldout(cct
, 0) << __func__
<< " failed to mark cluster as converted: ret "<< ret
<< " " << cpp_strerror(-ret
)
4218 int RGWRados::init_zg_from_period(bool *initialized
)
4220 *initialized
= false;
4222 if (current_period
.get_id().empty()) {
4226 int ret
= zonegroup
.init(cct
, this);
4227 ldout(cct
, 20) << "period zonegroup init ret " << ret
<< dendl
;
4228 if (ret
== -ENOENT
) {
4232 ldout(cct
, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret
) << dendl
;
4235 ldout(cct
, 20) << "period zonegroup name " << zonegroup
.get_name() << dendl
;
4237 map
<string
, RGWZoneGroup
>::const_iterator iter
=
4238 current_period
.get_map().zonegroups
.find(zonegroup
.get_id());
4240 if (iter
!= current_period
.get_map().zonegroups
.end()) {
4241 ldout(cct
, 20) << "using current period zonegroup " << zonegroup
.get_name() << dendl
;
4242 zonegroup
= iter
->second
;
4243 ret
= zonegroup
.init(cct
, this, false);
4245 ldout(cct
, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret
) << dendl
;
4248 ret
= zone_params
.init(cct
, this);
4249 if (ret
< 0 && ret
!= -ENOENT
) {
4250 ldout(cct
, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret
) << dendl
;
4252 } if (ret
==-ENOENT
&& zonegroup
.get_name() == default_zonegroup_name
) {
4253 ldout(cct
, 10) << " Using default name "<< default_zone_name
<< dendl
;
4254 zone_params
.set_name(default_zone_name
);
4255 ret
= zone_params
.init(cct
, this);
4256 if (ret
< 0 && ret
!= -ENOENT
) {
4257 ldout(cct
, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret
) << dendl
;
4262 for (iter
= current_period
.get_map().zonegroups
.begin();
4263 iter
!= current_period
.get_map().zonegroups
.end(); ++iter
){
4264 const RGWZoneGroup
& zg
= iter
->second
;
4265 // use endpoints from the zonegroup's master zone
4266 auto master
= zg
.zones
.find(zg
.master_zone
);
4267 if (master
== zg
.zones
.end()) {
4268 // Check for empty zonegroup which can happen if zone was deleted before removal
4269 if (zg
.zones
.size() == 0)
4271 // fix missing master zone for a single zone zonegroup
4272 if (zg
.master_zone
.empty() && zg
.zones
.size() == 1) {
4273 master
= zg
.zones
.begin();
4274 ldout(cct
, 0) << "zonegroup " << zg
.get_name() << " missing master_zone, setting zone " <<
4275 master
->second
.name
<< " id:" << master
->second
.id
<< " as master" << dendl
;
4276 if (zonegroup
.get_id() == zg
.get_id()) {
4277 zonegroup
.master_zone
= master
->second
.id
;
4278 ret
= zonegroup
.update();
4280 ldout(cct
, 0) << "error updating zonegroup : " << cpp_strerror(-ret
) << dendl
;
4284 RGWZoneGroup
fixed_zg(zg
.get_id(),zg
.get_name());
4285 ret
= fixed_zg
.init(cct
, this);
4287 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
4290 fixed_zg
.master_zone
= master
->second
.id
;
4291 ret
= fixed_zg
.update();
4293 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
4298 ldout(cct
, 0) << "zonegroup " << zg
.get_name() << " missing zone for master_zone=" <<
4299 zg
.master_zone
<< dendl
;
4303 const auto& endpoints
= master
->second
.endpoints
;
4304 add_new_connection_to_map(zonegroup_conn_map
, zg
, new RGWRESTConn(cct
, this, zg
.get_id(), endpoints
));
4305 if (!current_period
.get_master_zonegroup().empty() &&
4306 zg
.get_id() == current_period
.get_master_zonegroup()) {
4307 rest_master_conn
= new RGWRESTConn(cct
, this, zg
.get_id(), endpoints
);
4311 *initialized
= true;
4316 int RGWRados::init_zg_from_local(bool *creating_defaults
)
4318 int ret
= zonegroup
.init(cct
, this);
4319 if ( (ret
< 0 && ret
!= -ENOENT
) || (ret
== -ENOENT
&& !cct
->_conf
->rgw_zonegroup
.empty())) {
4320 ldout(cct
, 0) << "failed reading zonegroup info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4322 } else if (ret
== -ENOENT
) {
4323 *creating_defaults
= true;
4324 ldout(cct
, 10) << "Creating default zonegroup " << dendl
;
4325 ret
= zonegroup
.create_default();
4327 ldout(cct
, 0) << "failure in zonegroup create_default: ret "<< ret
<< " " << cpp_strerror(-ret
)
4331 ret
= zonegroup
.init(cct
, this);
4333 ldout(cct
, 0) << "failure in zonegroup create_default: ret "<< ret
<< " " << cpp_strerror(-ret
)
4338 ldout(cct
, 20) << "zonegroup " << zonegroup
.get_name() << dendl
;
4339 if (zonegroup
.is_master_zonegroup()) {
4340 // use endpoints from the zonegroup's master zone
4341 auto master
= zonegroup
.zones
.find(zonegroup
.master_zone
);
4342 if (master
== zonegroup
.zones
.end()) {
4343 // fix missing master zone for a single zone zonegroup
4344 if (zonegroup
.master_zone
.empty() && zonegroup
.zones
.size() == 1) {
4345 master
= zonegroup
.zones
.begin();
4346 ldout(cct
, 0) << "zonegroup " << zonegroup
.get_name() << " missing master_zone, setting zone " <<
4347 master
->second
.name
<< " id:" << master
->second
.id
<< " as master" << dendl
;
4348 zonegroup
.master_zone
= master
->second
.id
;
4349 ret
= zonegroup
.update();
4351 ldout(cct
, 0) << "error initializing zonegroup : " << cpp_strerror(-ret
) << dendl
;
4355 ldout(cct
, 0) << "zonegroup " << zonegroup
.get_name() << " missing zone for "
4356 "master_zone=" << zonegroup
.master_zone
<< dendl
;
4360 const auto& endpoints
= master
->second
.endpoints
;
4361 rest_master_conn
= new RGWRESTConn(cct
, this, zonegroup
.get_id(), endpoints
);
4368 bool RGWRados::zone_syncs_from(RGWZone
& target_zone
, RGWZone
& source_zone
)
4370 return target_zone
.syncs_from(source_zone
.name
) &&
4371 sync_modules_manager
->supports_data_export(source_zone
.tier_type
);
4375 * Initialize the RADOS instance and prepare to do other ops
4376 * Returns 0 on success, -ERR# on failure.
4378 int RGWRados::init_complete()
4380 int ret
= realm
.init(cct
, this);
4381 if (ret
< 0 && ret
!= -ENOENT
) {
4382 ldout(cct
, 0) << "failed reading realm info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4384 } else if (ret
!= -ENOENT
) {
4385 ldout(cct
, 20) << "realm " << realm
.get_name() << " " << realm
.get_id() << dendl
;
4386 ret
= current_period
.init(cct
, this, realm
.get_id(), realm
.get_name());
4387 if (ret
< 0 && ret
!= -ENOENT
) {
4388 ldout(cct
, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret
) << dendl
;
4391 ldout(cct
, 20) << "current period " << current_period
.get_id() << dendl
;
4394 ret
= replace_region_with_zonegroup();
4396 lderr(cct
) << "failed converting region to zonegroup : ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4400 ret
= convert_regionmap();
4402 lderr(cct
) << "failed converting regionmap: " << cpp_strerror(-ret
) << dendl
;
4406 bool zg_initialized
= false;
4408 if (!current_period
.get_id().empty()) {
4409 ret
= init_zg_from_period(&zg_initialized
);
4415 bool creating_defaults
= false;
4416 bool using_local
= (!zg_initialized
);
4418 ldout(cct
, 10) << " cannot find current period zonegroup using local zonegroup" << dendl
;
4419 ret
= init_zg_from_local(&creating_defaults
);
4423 // read period_config into current_period
4424 auto& period_config
= current_period
.get_config();
4425 ret
= period_config
.read(this, zonegroup
.realm_id
);
4426 if (ret
< 0 && ret
!= -ENOENT
) {
4427 ldout(cct
, 0) << "ERROR: failed to read period config: "
4428 << cpp_strerror(ret
) << dendl
;
4433 ldout(cct
, 10) << "Cannot find current period zone using local zone" << dendl
;
4434 if (creating_defaults
&& cct
->_conf
->rgw_zone
.empty()) {
4435 ldout(cct
, 10) << " Using default name "<< default_zone_name
<< dendl
;
4436 zone_params
.set_name(default_zone_name
);
4439 ret
= zone_params
.init(cct
, this);
4440 if (ret
< 0 && ret
!= -ENOENT
) {
4441 lderr(cct
) << "failed reading zone info: ret "<< ret
<< " " << cpp_strerror(-ret
) << dendl
;
4444 map
<string
, RGWZone
>::iterator zone_iter
= get_zonegroup().zones
.find(zone_params
.get_id());
4445 if (zone_iter
== get_zonegroup().zones
.end()) {
4447 lderr(cct
) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << ")" << dendl
;
4450 ldout(cct
, 1) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << "), switching to local zonegroup configuration" << dendl
;
4451 ret
= init_zg_from_local(&creating_defaults
);
4455 zone_iter
= get_zonegroup().zones
.find(zone_params
.get_id());
4457 if (zone_iter
!= get_zonegroup().zones
.end()) {
4458 zone_public_config
= zone_iter
->second
;
4459 ldout(cct
, 20) << "zone " << zone_params
.get_name() << dendl
;
4461 lderr(cct
) << "Cannot find zone id=" << zone_params
.get_id() << " (name=" << zone_params
.get_name() << ")" << dendl
;
4465 zone_short_id
= current_period
.get_map().get_zone_short_id(zone_params
.get_id());
4467 if (run_sync_thread
) {
4468 ret
= sync_modules_manager
->create_instance(cct
, zone_public_config
.tier_type
, zone_params
.tier_config
, &sync_module
);
4470 lderr(cct
) << "ERROR: failed to init sync module instance, ret=" << ret
<< dendl
;
4475 writeable_zone
= (zone_public_config
.tier_type
.empty() || zone_public_config
.tier_type
== "rgw");
4477 init_unique_trans_id_deps();
4479 finisher
= new Finisher(cct
);
4482 period_puller
.reset(new RGWPeriodPuller(this));
4483 period_history
.reset(new RGWPeriodHistory(cct
, period_puller
.get(),
4486 if (need_watch_notify()) {
4489 lderr(cct
) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret
) << dendl
;
4494 /* first build all zones index */
4495 for (auto ziter
: get_zonegroup().zones
) {
4496 const string
& id
= ziter
.first
;
4497 RGWZone
& z
= ziter
.second
;
4498 zone_id_by_name
[z
.name
] = id
;
4502 if (zone_by_id
.find(zone_id()) == zone_by_id
.end()) {
4503 ldout(cct
, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl
;
4505 zone_public_config
= zone_by_id
[zone_id()];
4506 for (auto ziter
: get_zonegroup().zones
) {
4507 const string
& id
= ziter
.first
;
4508 RGWZone
& z
= ziter
.second
;
4509 if (id
== zone_id()) {
4512 if (z
.endpoints
.empty()) {
4513 ldout(cct
, 0) << "WARNING: can't generate connection for zone " << z
.id
<< " id " << z
.name
<< ": no endpoints defined" << dendl
;
4516 ldout(cct
, 20) << "generating connection object for zone " << z
.name
<< " id " << z
.id
<< dendl
;
4517 RGWRESTConn
*conn
= new RGWRESTConn(cct
, this, z
.id
, z
.endpoints
);
4518 zone_conn_map
[id
] = conn
;
4519 if (zone_syncs_from(zone_public_config
, z
) ||
4520 zone_syncs_from(z
, zone_public_config
)) {
4521 if (zone_syncs_from(zone_public_config
, z
)) {
4522 zone_data_sync_from_map
[id
] = conn
;
4524 if (zone_syncs_from(z
, zone_public_config
)) {
4525 zone_data_notify_to_map
[id
] = conn
;
4528 ldout(cct
, 20) << "NOTICE: not syncing to/from zone " << z
.name
<< " id " << z
.id
<< dendl
;
4532 ret
= open_root_pool_ctx();
4536 ret
= open_gc_pool_ctx();
4540 ret
= open_lc_pool_ctx();
4544 ret
= open_objexp_pool_ctx();
4548 ret
= open_reshard_pool_ctx();
4552 pools_initialized
= true;
4555 gc
->initialize(cct
, this);
4557 obj_expirer
= new RGWObjectExpirer(this);
4559 if (use_gc_thread
) {
4560 gc
->start_processor();
4561 obj_expirer
->start_processor();
4564 /* no point of running sync thread if we don't have a master zone configured
4565 or there is no rest_master_conn */
4566 if (get_zonegroup().master_zone
.empty() || !rest_master_conn
4567 || current_period
.get_id().empty()) {
4568 run_sync_thread
= false;
4571 if (run_sync_thread
) {
4572 // initialize the log period history
4573 meta_mgr
->init_oldest_log_period();
4576 async_rados
= new RGWAsyncRadosProcessor(this, cct
->_conf
->rgw_num_async_rados_threads
);
4577 async_rados
->start();
4579 ret
= meta_mgr
->init(current_period
.get_id());
4581 lderr(cct
) << "ERROR: failed to initialize metadata log: "
4582 << cpp_strerror(-ret
) << dendl
;
4586 if (is_meta_master()) {
4587 auto md_log
= meta_mgr
->get_log(current_period
.get_id());
4588 meta_notifier
= new RGWMetaNotifier(this, md_log
);
4589 meta_notifier
->start();
4592 if (run_sync_thread
) {
4593 Mutex::Locker
l(meta_sync_thread_lock
);
4594 meta_sync_processor_thread
= new RGWMetaSyncProcessorThread(this, async_rados
);
4595 ret
= meta_sync_processor_thread
->init();
4597 ldout(cct
, 0) << "ERROR: failed to initialize meta sync thread" << dendl
;
4600 meta_sync_processor_thread
->start();
4602 // configure the bucket trim manager
4603 rgw::BucketTrimConfig config
;
4604 rgw::configure_bucket_trim(cct
, config
);
4606 bucket_trim
.emplace(this, config
);
4607 ret
= bucket_trim
->init();
4609 ldout(cct
, 0) << "ERROR: failed to start bucket trim manager" << dendl
;
4612 data_log
->set_observer(&*bucket_trim
);
4614 Mutex::Locker
dl(data_sync_thread_lock
);
4615 for (auto iter
: zone_data_sync_from_map
) {
4616 ldout(cct
, 5) << "starting data sync thread for zone " << iter
.first
<< dendl
;
4617 auto *thread
= new RGWDataSyncProcessorThread(this, async_rados
, iter
.first
);
4618 ret
= thread
->init();
4620 ldout(cct
, 0) << "ERROR: failed to initialize data sync thread" << dendl
;
4624 data_sync_processor_threads
[iter
.first
] = thread
;
4626 auto interval
= cct
->_conf
->rgw_sync_log_trim_interval
;
4628 sync_log_trimmer
= new RGWSyncLogTrimThread(this, &*bucket_trim
, interval
);
4629 ret
= sync_log_trimmer
->init();
4631 ldout(cct
, 0) << "ERROR: failed to initialize sync log trim thread" << dendl
;
4634 sync_log_trimmer
->start();
4637 data_notifier
= new RGWDataNotifier(this);
4638 data_notifier
->start();
4641 lc
->initialize(cct
, this);
4644 lc
->start_processor();
4646 quota_handler
= RGWQuotaHandler::generate_handler(this, quota_threads
);
4648 bucket_index_max_shards
= (cct
->_conf
->rgw_override_bucket_index_max_shards
? cct
->_conf
->rgw_override_bucket_index_max_shards
:
4649 get_zone().bucket_index_max_shards
);
4650 if (bucket_index_max_shards
> get_max_bucket_shards()) {
4651 bucket_index_max_shards
= get_max_bucket_shards();
4652 ldout(cct
, 1) << __func__
<< " bucket index max shards is too large, reset to value: "
4653 << get_max_bucket_shards() << dendl
;
4655 ldout(cct
, 20) << __func__
<< " bucket index max shards: " << bucket_index_max_shards
<< dendl
;
4657 binfo_cache
= new RGWChainedCacheImpl
<bucket_info_entry
>;
4658 binfo_cache
->init(this);
4660 bool need_tombstone_cache
= !zone_data_notify_to_map
.empty(); /* have zones syncing from us */
4662 if (need_tombstone_cache
) {
4663 obj_tombstone_cache
= new tombstone_cache_t(cct
->_conf
->rgw_obj_tombstone_cache_size
);
4666 reshard_wait
= std::make_shared
<RGWReshardWait
>(this);
4668 reshard
= new RGWReshard(this);
4670 /* only the master zone in the zonegroup reshards buckets */
4671 run_reshard_thread
= run_reshard_thread
&& (get_zonegroup().master_zone
== zone_public_config
.id
);
4672 if (run_reshard_thread
) {
4673 reshard
->start_processor();
4676 index_completion_manager
= new RGWIndexCompletionManager(this);
4677 ret
= index_completion_manager
->start();
// NOTE(review): extraction artifact — the numeric prefixes below are part of
// this chunk's mangling, and the statements between the signature and the
// final call (per the gap in the embedded numbering, 4687-4693) are missing
// from this view; restore them from the repository before building.
4683 * Initialize the RADOS instance and prepare to do other ops
4684 * Returns 0 on success, -ERR# on failure.
4686 int RGWRados::initialize()
// presumably sets up the rados handles (init_rados) and returns early on
// failure before completing initialization — TODO confirm against upstream
4694 return init_complete();
4697 void RGWRados::finalize_watch()
4699 for (int i
= 0; i
< num_watchers
; i
++) {
4700 RGWWatcher
*watcher
= watchers
[i
];
4701 watcher
->unregister_watch();
4705 delete[] notify_oids
;
// Hand a Context off for deferred/asynchronous execution.
// NOTE(review): the method body was lost in extraction (embedded numbering
// jumps past it); upstream queues `c` on the finisher thread — confirm
// against the repository before relying on this.
4709 void RGWRados::schedule_context(Context
*c
) {
4713 int RGWRados::list_raw_prefixed_objs(const rgw_pool
& pool
, const string
& prefix
, list
<string
>& result
)
4716 RGWListRawObjsCtx ctx
;
4719 int r
= list_raw_objects(pool
, prefix
, 1000,
4720 ctx
, oids
, &is_truncated
);
4724 list
<string
>::iterator iter
;
4725 for (iter
= oids
.begin(); iter
!= oids
.end(); ++iter
) {
4726 string
& val
= *iter
;
4727 if (val
.size() > prefix
.size())
4728 result
.push_back(val
.substr(prefix
.size()));
4730 } while (is_truncated
);
4735 int RGWRados::list_regions(list
<string
>& regions
)
4737 RGWZoneGroup zonegroup
;
4739 return list_raw_prefixed_objs(zonegroup
.get_pool(cct
), region_info_oid_prefix
, regions
);
4742 int RGWRados::list_zonegroups(list
<string
>& zonegroups
)
4744 RGWZoneGroup zonegroup
;
4746 return list_raw_prefixed_objs(zonegroup
.get_pool(cct
), zonegroup_names_oid_prefix
, zonegroups
);
4749 int RGWRados::list_zones(list
<string
>& zones
)
4751 RGWZoneParams zoneparams
;
4753 return list_raw_prefixed_objs(zoneparams
.get_pool(cct
), zone_names_oid_prefix
, zones
);
4756 int RGWRados::list_realms(list
<string
>& realms
)
4758 RGWRealm
realm(cct
, this);
4759 return list_raw_prefixed_objs(realm
.get_pool(cct
), realm_names_oid_prefix
, realms
);
4762 int RGWRados::list_periods(list
<string
>& periods
)
4765 list
<string
> raw_periods
;
4766 int ret
= list_raw_prefixed_objs(period
.get_pool(cct
), period
.get_info_oid_prefix(), raw_periods
);
4770 for (const auto& oid
: raw_periods
) {
4771 size_t pos
= oid
.find(".");
4772 if (pos
!= std::string::npos
) {
4773 periods
.push_back(oid
.substr(0, pos
));
4775 periods
.push_back(oid
);
4778 periods
.sort(); // unique() only detects duplicates if they're adjacent
4784 int RGWRados::list_periods(const string
& current_period
, list
<string
>& periods
)
4787 string period_id
= current_period
;
4788 while(!period_id
.empty()) {
4789 RGWPeriod
period(period_id
);
4790 ret
= period
.init(cct
, this);
4794 periods
.push_back(period
.get_id());
4795 period_id
= period
.get_predecessor();
4802 * Open the pool used as root for this gateway
4803 * Returns: 0 on success, -ERR# otherwise.
4805 int RGWRados::open_root_pool_ctx()
4807 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root
, root_pool_ctx
, true);
4810 int RGWRados::open_gc_pool_ctx()
4812 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool
, gc_pool_ctx
, true);
4815 int RGWRados::open_lc_pool_ctx()
4817 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool
, lc_pool_ctx
, true);
4820 int RGWRados::open_objexp_pool_ctx()
4822 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, objexp_pool_ctx
, true);
4825 int RGWRados::open_reshard_pool_ctx()
4827 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool
, reshard_pool_ctx
, true);
4830 int RGWRados::init_watch()
4832 int r
= rgw_init_ioctx(&rados
[0], get_zone_params().control_pool
, control_pool_ctx
, true);
4837 num_watchers
= cct
->_conf
->rgw_num_control_oids
;
4839 bool compat_oid
= (num_watchers
== 0);
4841 if (num_watchers
<= 0)
4844 notify_oids
= new string
[num_watchers
];
4845 watchers
= new RGWWatcher
*[num_watchers
];
4847 for (int i
=0; i
< num_watchers
; i
++) {
4848 string
& notify_oid
= notify_oids
[i
];
4849 notify_oid
= notify_oid_prefix
;
4852 snprintf(buf
, sizeof(buf
), ".%d", i
);
4853 notify_oid
.append(buf
);
4855 r
= control_pool_ctx
.create(notify_oid
, false);
4856 if (r
< 0 && r
!= -EEXIST
)
4859 RGWWatcher
*watcher
= new RGWWatcher(this, i
, notify_oid
);
4860 watchers
[i
] = watcher
;
4862 r
= watcher
->register_watch();
4867 watch_initialized
= true;
4869 set_cache_enabled(true);
4874 void RGWRados::pick_control_oid(const string
& key
, string
& notify_oid
)
4876 uint32_t r
= ceph_str_hash_linux(key
.c_str(), key
.size());
4878 int i
= r
% num_watchers
;
4880 snprintf(buf
, sizeof(buf
), ".%d", i
);
4882 notify_oid
= notify_oid_prefix
;
4883 notify_oid
.append(buf
);
4886 int RGWRados::open_pool_ctx(const rgw_pool
& pool
, librados::IoCtx
& io_ctx
)
4888 constexpr bool create
= true; // create the pool if it doesn't exist
4889 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
);
4892 void RGWRados::build_bucket_index_marker(const string
& shard_id_str
, const string
& shard_marker
,
4895 *marker
= shard_id_str
;
4896 marker
->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR
);
4897 marker
->append(shard_marker
);
4901 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
)
4903 const rgw_pool
& explicit_pool
= bucket_info
.bucket
.explicit_placement
.index_pool
;
4905 if (!explicit_pool
.empty()) {
4906 return open_pool_ctx(explicit_pool
, index_ctx
);
4909 const string
*rule
= &bucket_info
.placement_rule
;
4910 if (rule
->empty()) {
4911 rule
= &zonegroup
.default_placement
;
4913 auto iter
= zone_params
.placement_pools
.find(*rule
);
4914 if (iter
== zone_params
.placement_pools
.end()) {
4915 ldout(cct
, 0) << "could not find placement rule " << *rule
<< " within zonegroup " << dendl
;
4919 int r
= open_pool_ctx(iter
->second
.index_pool
, index_ctx
);
4927 * set up a bucket listing.
4928 * handle is filled in.
4929 * Returns 0 on success, -ERR# otherwise.
4931 int RGWRados::list_buckets_init(RGWAccessHandle
*handle
)
4934 auto iter
= root_pool_ctx
.nobjects_begin();
4935 librados::NObjectIterator
*state
= new librados::NObjectIterator(iter
);
4936 *handle
= (RGWAccessHandle
)state
;
4938 } catch (const std::system_error
& e
) {
4939 int r
= -e
.code().value();
4940 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
4941 << ", returning " << r
<< dendl
;
4943 } catch (const std::exception
& e
) {
4944 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
4945 << ", returning -5" << dendl
;
4951 * get the next bucket in the listing.
4953 * handle is updated.
4954 * returns 0 on success, -ERR# otherwise.
4956 int RGWRados::list_buckets_next(rgw_bucket_dir_entry
& obj
, RGWAccessHandle
*handle
)
4958 librados::NObjectIterator
*state
= (librados::NObjectIterator
*)*handle
;
4961 if (*state
== root_pool_ctx
.nobjects_end()) {
4966 obj
.key
.name
= (*state
)->get_oid();
4967 if (obj
.key
.name
[0] == '_') {
4968 obj
.key
.name
= obj
.key
.name
.substr(1);
4972 } catch (const std::system_error
& e
) {
4973 int r
= -e
.code().value();
4974 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
4975 << ", returning " << r
<< dendl
;
4977 } catch (const std::exception
& e
) {
4978 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
4979 << ", returning -5" << dendl
;
4982 } while (obj
.key
.name
[0] == '.'); /* skip all entries starting with '.' */
4990 struct log_list_state
{
4992 librados::IoCtx io_ctx
;
4993 librados::NObjectIterator obit
;
4996 int RGWRados::log_list_init(const string
& prefix
, RGWAccessHandle
*handle
)
4998 log_list_state
*state
= new log_list_state
;
4999 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, state
->io_ctx
);
5004 state
->prefix
= prefix
;
5005 state
->obit
= state
->io_ctx
.nobjects_begin();
5006 *handle
= (RGWAccessHandle
)state
;
5010 int RGWRados::log_list_next(RGWAccessHandle handle
, string
*name
)
5012 log_list_state
*state
= static_cast<log_list_state
*>(handle
);
5014 if (state
->obit
== state
->io_ctx
.nobjects_end()) {
5018 if (state
->prefix
.length() &&
5019 state
->obit
->get_oid().find(state
->prefix
) != 0) {
5023 *name
= state
->obit
->get_oid();
5030 int RGWRados::log_remove(const string
& name
)
5032 librados::IoCtx io_ctx
;
5033 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5036 return io_ctx
.remove(name
);
5039 struct log_show_state
{
5040 librados::IoCtx io_ctx
;
5042 bufferlist::iterator p
;
5046 log_show_state() : pos(0), eof(false) {}
5049 int RGWRados::log_show_init(const string
& name
, RGWAccessHandle
*handle
)
5051 log_show_state
*state
= new log_show_state
;
5052 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, state
->io_ctx
);
5058 *handle
= (RGWAccessHandle
)state
;
5062 int RGWRados::log_show_next(RGWAccessHandle handle
, rgw_log_entry
*entry
)
5064 log_show_state
*state
= static_cast<log_show_state
*>(handle
);
5065 off_t off
= state
->p
.get_off();
5067 ldout(cct
, 10) << "log_show_next pos " << state
->pos
<< " bl " << state
->bl
.length()
5069 << " eof " << (int)state
->eof
5072 unsigned chunk
= 1024*1024;
5073 if ((state
->bl
.length() - off
) < chunk
/2 && !state
->eof
) {
5075 int r
= state
->io_ctx
.read(state
->name
, more
, chunk
, state
->pos
);
5081 old
.substr_of(state
->bl
, off
, state
->bl
.length() - off
);
5082 } catch (buffer::error
& err
) {
5086 state
->bl
.claim(old
);
5087 state
->bl
.claim_append(more
);
5088 state
->p
= state
->bl
.begin();
5089 if ((unsigned)r
< chunk
)
5091 ldout(cct
, 10) << " read " << r
<< dendl
;
5095 return 0; // end of file
5097 ::decode(*entry
, state
->p
);
5099 catch (const buffer::error
&e
) {
5106 * usage_log_hash: get usage log key hash, based on name and index
5108 * Get the usage object name. Since a user may have more than 1
5109 * object holding that info (multiple shards), we use index to
5110 * specify that shard number. Once index exceeds max shards it
5112 * If name is not being set, results for all users will be returned
5113 * and index will wrap only after total shards number.
5115 * @param cct [in] ceph context
5116 * @param name [in] user name
5117 * @param hash [out] hash value
5118 * @param index [in] shard index number
5120 static void usage_log_hash(CephContext
*cct
, const string
& name
, string
& hash
, uint32_t index
)
5122 uint32_t val
= index
;
5124 if (!name
.empty()) {
5125 int max_user_shards
= cct
->_conf
->rgw_usage_max_user_shards
;
5126 val
%= max_user_shards
;
5127 val
+= ceph_str_hash_linux(name
.c_str(), name
.size());
5130 int max_shards
= cct
->_conf
->rgw_usage_max_shards
;
5131 snprintf(buf
, sizeof(buf
), RGW_USAGE_OBJ_PREFIX
"%u", (unsigned)(val
% max_shards
));
5135 int RGWRados::log_usage(map
<rgw_user_bucket
, RGWUsageBatch
>& usage_info
)
5139 map
<string
, rgw_usage_log_info
> log_objs
;
5144 /* restructure usage map, zone by object hash */
5145 map
<rgw_user_bucket
, RGWUsageBatch
>::iterator iter
;
5146 for (iter
= usage_info
.begin(); iter
!= usage_info
.end(); ++iter
) {
5147 const rgw_user_bucket
& ub
= iter
->first
;
5148 RGWUsageBatch
& info
= iter
->second
;
5150 if (ub
.user
.empty()) {
5151 ldout(cct
, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub
.bucket
<< "), skipping" << dendl
;
5155 if (ub
.user
!= last_user
) {
5156 /* index *should* be random, but why waste extra cycles
5157 in most cases max user shards is not going to exceed 1,
5158 so just incrementing it */
5159 usage_log_hash(cct
, ub
.user
, hash
, index
++);
5161 last_user
= ub
.user
;
5162 vector
<rgw_usage_log_entry
>& v
= log_objs
[hash
].entries
;
5164 for (auto miter
= info
.m
.begin(); miter
!= info
.m
.end(); ++miter
) {
5165 v
.push_back(miter
->second
);
5169 map
<string
, rgw_usage_log_info
>::iterator liter
;
5171 for (liter
= log_objs
.begin(); liter
!= log_objs
.end(); ++liter
) {
5172 int r
= cls_obj_usage_log_add(liter
->first
, liter
->second
);
5179 int RGWRados::read_usage(const rgw_user
& user
, uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
5180 bool *is_truncated
, RGWUsageIter
& usage_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
)
5182 uint32_t num
= max_entries
;
5183 string hash
, first_hash
;
5184 string user_str
= user
.to_str();
5185 usage_log_hash(cct
, user_str
, first_hash
, 0);
5187 if (usage_iter
.index
) {
5188 usage_log_hash(cct
, user_str
, hash
, usage_iter
.index
);
5196 map
<rgw_user_bucket
, rgw_usage_log_entry
> ret_usage
;
5197 map
<rgw_user_bucket
, rgw_usage_log_entry
>::iterator iter
;
5199 int ret
= cls_obj_usage_log_read(hash
, user_str
, start_epoch
, end_epoch
, num
,
5200 usage_iter
.read_iter
, ret_usage
, is_truncated
);
5207 num
-= ret_usage
.size();
5209 for (iter
= ret_usage
.begin(); iter
!= ret_usage
.end(); ++iter
) {
5210 usage
[iter
->first
].aggregate(iter
->second
);
5214 if (!*is_truncated
) {
5215 usage_iter
.read_iter
.clear();
5216 usage_log_hash(cct
, user_str
, hash
, ++usage_iter
.index
);
5218 } while (num
&& !*is_truncated
&& hash
!= first_hash
);
5222 int RGWRados::trim_usage(rgw_user
& user
, uint64_t start_epoch
, uint64_t end_epoch
)
5225 string hash
, first_hash
;
5226 string user_str
= user
.to_str();
5227 usage_log_hash(cct
, user_str
, first_hash
, index
);
5231 int ret
= cls_obj_usage_log_trim(hash
, user_str
, start_epoch
, end_epoch
);
5233 if (ret
< 0 && ret
!= -ENOENT
)
5236 usage_log_hash(cct
, user_str
, hash
, ++index
);
5237 } while (hash
!= first_hash
);
5242 int RGWRados::key_to_shard_id(const string
& key
, int max_shards
)
5244 return rgw_shard_id(key
, max_shards
);
5247 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& key
, string
& name
, int *shard_id
)
5249 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
5252 *shard_id
= val
% max_shards
;
5254 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
5255 name
= prefix
+ buf
;
5258 void RGWRados::shard_name(const string
& prefix
, unsigned max_shards
, const string
& section
, const string
& key
, string
& name
)
5260 uint32_t val
= ceph_str_hash_linux(key
.c_str(), key
.size());
5261 val
^= ceph_str_hash_linux(section
.c_str(), section
.size());
5263 snprintf(buf
, sizeof(buf
), "%u", (unsigned)(val
% max_shards
));
5264 name
= prefix
+ buf
;
5267 void RGWRados::shard_name(const string
& prefix
, unsigned shard_id
, string
& name
)
5270 snprintf(buf
, sizeof(buf
), "%u", shard_id
);
5271 name
= prefix
+ buf
;
5275 void RGWRados::time_log_prepare_entry(cls_log_entry
& entry
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
5277 cls_log_add_prepare_entry(entry
, utime_t(ut
), section
, key
, bl
);
5280 int RGWRados::time_log_add_init(librados::IoCtx
& io_ctx
)
5282 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
, true);
5286 int RGWRados::time_log_add(const string
& oid
, const real_time
& ut
, const string
& section
, const string
& key
, bufferlist
& bl
)
5288 librados::IoCtx io_ctx
;
5290 int r
= time_log_add_init(io_ctx
);
5295 ObjectWriteOperation op
;
5297 cls_log_add(op
, t
, section
, key
, bl
);
5299 return io_ctx
.operate(oid
, &op
);
5302 int RGWRados::time_log_add(const string
& oid
, list
<cls_log_entry
>& entries
,
5303 librados::AioCompletion
*completion
, bool monotonic_inc
)
5305 librados::IoCtx io_ctx
;
5307 int r
= time_log_add_init(io_ctx
);
5312 ObjectWriteOperation op
;
5313 cls_log_add(op
, entries
, monotonic_inc
);
5316 r
= io_ctx
.operate(oid
, &op
);
5318 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
5323 int RGWRados::time_log_list(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
5324 int max_entries
, list
<cls_log_entry
>& entries
,
5325 const string
& marker
,
5329 librados::IoCtx io_ctx
;
5331 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5334 librados::ObjectReadOperation op
;
5336 utime_t
st(start_time
);
5337 utime_t
et(end_time
);
5339 cls_log_list(op
, st
, et
, marker
, max_entries
, entries
,
5340 out_marker
, truncated
);
5344 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
5351 int RGWRados::time_log_info(const string
& oid
, cls_log_header
*header
)
5353 librados::IoCtx io_ctx
;
5355 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5358 librados::ObjectReadOperation op
;
5360 cls_log_info(op
, header
);
5364 int ret
= io_ctx
.operate(oid
, &op
, &obl
);
5371 int RGWRados::time_log_info_async(librados::IoCtx
& io_ctx
, const string
& oid
, cls_log_header
*header
, librados::AioCompletion
*completion
)
5373 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5377 librados::ObjectReadOperation op
;
5379 cls_log_info(op
, header
);
5381 int ret
= io_ctx
.aio_operate(oid
, completion
, &op
, NULL
);
5388 int RGWRados::time_log_trim(const string
& oid
, const real_time
& start_time
, const real_time
& end_time
,
5389 const string
& from_marker
, const string
& to_marker
,
5390 librados::AioCompletion
*completion
)
5392 librados::IoCtx io_ctx
;
5394 int r
= rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool
, io_ctx
);
5398 utime_t
st(start_time
);
5399 utime_t
et(end_time
);
5401 ObjectWriteOperation op
;
5402 cls_log_trim(op
, st
, et
, from_marker
, to_marker
);
5405 r
= io_ctx
.operate(oid
, &op
);
5407 r
= io_ctx
.aio_operate(oid
, completion
, &op
);
5412 string
RGWRados::objexp_hint_get_shardname(int shard_num
)
5415 snprintf(buf
, sizeof(buf
), "%010u", (unsigned)shard_num
);
5417 string
objname("obj_delete_at_hint.");
5418 return objname
+ buf
;
5421 int RGWRados::objexp_key_shard(const rgw_obj_index_key
& key
)
5423 string obj_key
= key
.name
+ key
.instance
;
5424 int num_shards
= cct
->_conf
->rgw_objexp_hints_num_shards
;
5425 return rgw_bucket_shard_index(obj_key
, num_shards
);
5428 static string
objexp_hint_get_keyext(const string
& tenant_name
,
5429 const string
& bucket_name
,
5430 const string
& bucket_id
,
5431 const rgw_obj_key
& obj_key
)
5433 return tenant_name
+ (tenant_name
.empty() ? "" : ":") + bucket_name
+ ":" + bucket_id
+
5434 ":" + obj_key
.name
+ ":" + obj_key
.instance
;
5437 int RGWRados::objexp_hint_add(const ceph::real_time
& delete_at
,
5438 const string
& tenant_name
,
5439 const string
& bucket_name
,
5440 const string
& bucket_id
,
5441 const rgw_obj_index_key
& obj_key
)
5443 const string keyext
= objexp_hint_get_keyext(tenant_name
, bucket_name
,
5444 bucket_id
, obj_key
);
5445 objexp_hint_entry he
= {
5446 .tenant
= tenant_name
,
5447 .bucket_name
= bucket_name
,
5448 .bucket_id
= bucket_id
,
5450 .exp_time
= delete_at
};
5453 ObjectWriteOperation op
;
5454 cls_timeindex_add(op
, utime_t(delete_at
), keyext
, hebl
);
5456 string shard_name
= objexp_hint_get_shardname(objexp_key_shard(obj_key
));
5457 return objexp_pool_ctx
.operate(shard_name
, &op
);
5460 void RGWRados::objexp_get_shard(int shard_num
,
5461 string
& shard
) /* out */
5463 shard
= objexp_hint_get_shardname(shard_num
);
5466 int RGWRados::objexp_hint_list(const string
& oid
,
5467 const ceph::real_time
& start_time
,
5468 const ceph::real_time
& end_time
,
5469 const int max_entries
,
5470 const string
& marker
,
5471 list
<cls_timeindex_entry
>& entries
, /* out */
5472 string
*out_marker
, /* out */
5473 bool *truncated
) /* out */
5475 librados::ObjectReadOperation op
;
5476 cls_timeindex_list(op
, utime_t(start_time
), utime_t(end_time
), marker
, max_entries
, entries
,
5477 out_marker
, truncated
);
5480 int ret
= objexp_pool_ctx
.operate(oid
, &op
, &obl
);
5482 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
5486 if ((ret
== -ENOENT
) && truncated
) {
5493 int RGWRados::objexp_hint_parse(cls_timeindex_entry
&ti_entry
, /* in */
5494 objexp_hint_entry
& hint_entry
) /* out */
5497 bufferlist::iterator iter
= ti_entry
.value
.begin();
5498 ::decode(hint_entry
, iter
);
5499 } catch (buffer::error
& err
) {
5500 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
5506 int RGWRados::objexp_hint_trim(const string
& oid
,
5507 const ceph::real_time
& start_time
,
5508 const ceph::real_time
& end_time
,
5509 const string
& from_marker
,
5510 const string
& to_marker
)
5512 int ret
= cls_timeindex_trim(objexp_pool_ctx
, oid
, utime_t(start_time
), utime_t(end_time
),
5513 from_marker
, to_marker
);
5514 if ((ret
< 0 ) && (ret
!= -ENOENT
)) {
5521 int RGWRados::lock_exclusive(rgw_pool
& pool
, const string
& oid
, timespan
& duration
,
5522 string
& zone_id
, string
& owner_id
) {
5523 librados::IoCtx io_ctx
;
5525 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
5529 uint64_t msec
= std::chrono::duration_cast
<std::chrono::milliseconds
>(duration
).count();
5530 utime_t
ut(msec
/ 1000, msec
% 1000);
5532 rados::cls::lock::Lock
l(log_lock_name
);
5534 l
.set_cookie(owner_id
);
5536 l
.set_may_renew(true);
5538 return l
.lock_exclusive(&io_ctx
, oid
);
5541 int RGWRados::unlock(rgw_pool
& pool
, const string
& oid
, string
& zone_id
, string
& owner_id
) {
5542 librados::IoCtx io_ctx
;
5544 int r
= rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
);
5549 rados::cls::lock::Lock
l(log_lock_name
);
5551 l
.set_cookie(owner_id
);
5553 return l
.unlock(&io_ctx
, oid
);
5556 int RGWRados::decode_policy(bufferlist
& bl
, ACLOwner
*owner
)
5558 bufferlist::iterator i
= bl
.begin();
5559 RGWAccessControlPolicy
policy(cct
);
5561 policy
.decode_owner(i
);
5562 } catch (buffer::error
& err
) {
5563 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
5566 *owner
= policy
.get_owner();
5570 int rgw_policy_from_attrset(CephContext
*cct
, map
<string
, bufferlist
>& attrset
, RGWAccessControlPolicy
*policy
)
5572 map
<string
, bufferlist
>::iterator aiter
= attrset
.find(RGW_ATTR_ACL
);
5573 if (aiter
== attrset
.end())
5576 bufferlist
& bl
= aiter
->second
;
5577 bufferlist::iterator iter
= bl
.begin();
5579 policy
->decode(iter
);
5580 } catch (buffer::error
& err
) {
5581 ldout(cct
, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl
;
5584 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 15)) {
5585 RGWAccessControlPolicy_S3
*s3policy
= static_cast<RGWAccessControlPolicy_S3
*>(policy
);
5586 ldout(cct
, 15) << __func__
<< " Read AccessControlPolicy";
5587 s3policy
->to_xml(*_dout
);
5594 int RGWRados::Bucket::update_bucket_id(const string
& new_bucket_id
)
5596 rgw_bucket bucket
= bucket_info
.bucket
;
5597 bucket
.update_bucket_id(new_bucket_id
);
5599 RGWObjectCtx
obj_ctx(store
);
5601 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, nullptr, nullptr);
5611 * Get ordered listing of the objects in a bucket.
5613 * max: maximum number of results to return
5614 * bucket: bucket to list contents of
5615 * prefix: only return results that match this prefix
5616 * delim: do not include results that match this string.
5617 * Any skipped results will have the matching portion of their name
5618 * inserted in common_prefixes with a "true" mark.
5619 * marker: if filled in, begin the listing with this object.
5620 * end_marker: if filled in, end the listing with this object.
5621 * result: the objects are put in here.
5622 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5623 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5625 int RGWRados::Bucket::List::list_objects_ordered(int64_t max
,
5626 vector
<rgw_bucket_dir_entry
> *result
,
5627 map
<string
, bool> *common_prefixes
,
5630 RGWRados
*store
= target
->get_store();
5631 CephContext
*cct
= store
->ctx();
5632 int shard_id
= target
->get_shard_id();
5635 bool truncated
= true;
5636 int read_ahead
= std::max(cct
->_conf
->rgw_list_bucket_min_readahead
,max
);
5640 rgw_obj_key
marker_obj(params
.marker
.name
, params
.marker
.instance
, params
.ns
);
5641 rgw_obj_index_key cur_marker
;
5642 marker_obj
.get_index_key(&cur_marker
);
5644 rgw_obj_key
end_marker_obj(params
.end_marker
.name
, params
.end_marker
.instance
,
5646 rgw_obj_index_key cur_end_marker
;
5647 end_marker_obj
.get_index_key(&cur_end_marker
);
5648 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
5650 rgw_obj_key
prefix_obj(params
.prefix
);
5651 prefix_obj
.ns
= params
.ns
;
5652 string cur_prefix
= prefix_obj
.get_index_key_name();
5654 string bigger_than_delim
;
5656 if (!params
.delim
.empty()) {
5657 unsigned long val
= decode_utf8((unsigned char *)params
.delim
.c_str(),
5658 params
.delim
.size());
5659 char buf
[params
.delim
.size() + 16];
5660 int r
= encode_utf8(val
+ 1, (unsigned char *)buf
);
5662 ldout(cct
,0) << "ERROR: encode_utf8() failed" << dendl
;
5667 bigger_than_delim
= buf
;
5669 /* if marker points at a common prefix, fast forward it into its upperbound string */
5670 int delim_pos
= cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
5671 if (delim_pos
>= 0) {
5672 string s
= cur_marker
.name
.substr(0, delim_pos
);
5673 s
.append(bigger_than_delim
);
5678 string skip_after_delim
;
5679 while (truncated
&& count
<= max
) {
5680 if (skip_after_delim
> cur_marker
.name
) {
5681 cur_marker
= skip_after_delim
;
5682 ldout(cct
, 20) << "setting cur_marker=" << cur_marker
.name
<< "[" << cur_marker
.instance
<< "]" << dendl
;
5684 std::map
<string
, rgw_bucket_dir_entry
> ent_map
;
5685 int r
= store
->cls_bucket_list_ordered(target
->get_bucket_info(),
5689 read_ahead
+ 1 - count
,
5690 params
.list_versions
,
5697 for (auto eiter
= ent_map
.begin(); eiter
!= ent_map
.end(); ++eiter
) {
5698 rgw_bucket_dir_entry
& entry
= eiter
->second
;
5699 rgw_obj_index_key index_key
= entry
.key
;
5701 rgw_obj_key
obj(index_key
);
5703 /* note that parse_raw_oid() here will not set the correct
5704 * object's instance, as rgw_obj_index_key encodes that
5705 * separately. We don't need to set the instance because it's
5706 * not needed for the checks here and we end up using the raw
5707 * entry for the return vector
5709 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
5711 ldout(cct
, 0) << "ERROR: could not parse object name: " << obj
.name
<< dendl
;
5714 bool check_ns
= (obj
.ns
== params
.ns
);
5715 if (!params
.list_versions
&& !entry
.is_visible()) {
5719 if (params
.enforce_ns
&& !check_ns
) {
5720 if (!params
.ns
.empty()) {
5721 /* we've iterated past the namespace we're searching -- done now */
5726 /* we're not looking at the namespace this object is in, next! */
5730 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
5736 params
.marker
= index_key
;
5737 next_marker
= index_key
;
5740 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
5743 if (params
.prefix
.size() &&
5744 (obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
) != 0))
5747 if (!params
.delim
.empty()) {
5748 int delim_pos
= obj
.name
.find(params
.delim
, params
.prefix
.size());
5750 if (delim_pos
>= 0) {
5751 string prefix_key
= obj
.name
.substr(0, delim_pos
+ 1);
5753 if (common_prefixes
&&
5754 common_prefixes
->find(prefix_key
) == common_prefixes
->end()) {
5759 next_marker
= prefix_key
;
5760 (*common_prefixes
)[prefix_key
] = true;
5762 int marker_delim_pos
= cur_marker
.name
.find(params
.delim
, cur_prefix
.size());
5764 skip_after_delim
= cur_marker
.name
.substr(0, marker_delim_pos
);
5765 skip_after_delim
.append(bigger_than_delim
);
5767 ldout(cct
, 20) << "skip_after_delim=" << skip_after_delim
<< dendl
;
5781 result
->emplace_back(std::move(entry
));
5788 *is_truncated
= truncated
;
5791 } // list_objects_ordered
5795 * Get listing of the objects in a bucket and allow the results to be out
5798 * Even though there are key differences with the ordered counterpart,
5799 * the parameters are the same to maintain some compatability.
5801 * max: maximum number of results to return
5802 * bucket: bucket to list contents of
5803 * prefix: only return results that match this prefix
5804 * delim: should not be set; if it is we should have indicated an error
5805 * marker: if filled in, begin the listing with this object.
5806 * end_marker: if filled in, end the listing with this object.
5807 * result: the objects are put in here.
5808 * common_prefixes: this is never filled with an unordered list; the param
5809 * is maintained for compatibility
5810 * is_truncated: if number of objects in the bucket is bigger than max, then
5813 int RGWRados::Bucket::List::list_objects_unordered(int64_t max
,
5814 vector
<rgw_bucket_dir_entry
> *result
,
5815 map
<string
, bool> *common_prefixes
,
5818 RGWRados
*store
= target
->get_store();
5819 CephContext
*cct
= store
->ctx();
5820 int shard_id
= target
->get_shard_id();
5823 bool truncated
= true;
5825 // read a few extra in each call to cls_bucket_list_unordered in
5826 // case some are filtered out due to namespace matching, versioning,
5828 const int64_t max_read_ahead
= 100;
5829 const uint32_t read_ahead
= uint32_t(max
+ std::min(max
, max_read_ahead
));
5833 rgw_obj_key
marker_obj(params
.marker
.name
, params
.marker
.instance
, params
.ns
);
5834 rgw_obj_index_key cur_marker
;
5835 marker_obj
.get_index_key(&cur_marker
);
5837 rgw_obj_key
end_marker_obj(params
.end_marker
.name
, params
.end_marker
.instance
,
5839 rgw_obj_index_key cur_end_marker
;
5840 end_marker_obj
.get_index_key(&cur_end_marker
);
5841 const bool cur_end_marker_valid
= !params
.end_marker
.empty();
5843 rgw_obj_key
prefix_obj(params
.prefix
);
5844 prefix_obj
.ns
= params
.ns
;
5845 string cur_prefix
= prefix_obj
.get_index_key_name();
5847 while (truncated
&& count
<= max
) {
5848 std::vector
<rgw_bucket_dir_entry
> ent_list
;
5849 int r
= store
->cls_bucket_list_unordered(target
->get_bucket_info(),
5854 params
.list_versions
,
5861 // NB: while regions of ent_list will be sorted, we have no
5862 // guarantee that all items will be sorted since they can cross
5865 for (auto& entry
: ent_list
) {
5866 rgw_obj_index_key index_key
= entry
.key
;
5867 rgw_obj_key
obj(index_key
);
5869 /* note that parse_raw_oid() here will not set the correct
5870 * object's instance, as rgw_obj_index_key encodes that
5871 * separately. We don't need to set the instance because it's
5872 * not needed for the checks here and we end up using the raw
5873 * entry for the return vector
5875 bool valid
= rgw_obj_key::parse_raw_oid(index_key
.name
, &obj
);
5877 ldout(cct
, 0) << "ERROR: could not parse object name: " <<
5882 if (!params
.list_versions
&& !entry
.is_visible()) {
5886 if (params
.enforce_ns
&& obj
.ns
!= params
.ns
) {
5890 if (cur_end_marker_valid
&& cur_end_marker
<= index_key
) {
5891 // we're not guaranteed items will come in order, so we have
5892 // to loop through all
5897 params
.marker
= index_key
;
5898 next_marker
= index_key
;
5901 if (params
.filter
&& !params
.filter
->filter(obj
.name
, index_key
.name
))
5904 if (params
.prefix
.size() &&
5905 (0 != obj
.name
.compare(0, params
.prefix
.size(), params
.prefix
)))
5913 result
->emplace_back(std::move(entry
));
5915 } // for (auto& entry : ent_list)
5916 } // while (truncated && count <= max)
5920 *is_truncated
= truncated
;
5923 } // list_objects_unordered
5927 * create a rados pool, associated meta info
5928 * returns 0 on success, -ERR# otherwise.
5930 int RGWRados::create_pool(const rgw_pool
& pool
)
5932 librados::IoCtx io_ctx
;
5933 constexpr bool create
= true;
5934 return rgw_init_ioctx(get_rados_handle(), pool
, io_ctx
, create
);
5937 int RGWRados::init_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
5939 librados::IoCtx index_ctx
;
5941 string dir_oid
= dir_oid_prefix
;
5942 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5947 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
5949 map
<int, string
> bucket_objs
;
5950 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
5952 return CLSRGWIssueBucketIndexInit(index_ctx
,
5954 cct
->_conf
->rgw_bucket_index_max_aio
)();
5957 int RGWRados::clean_bucket_index(RGWBucketInfo
& bucket_info
, int num_shards
)
5959 librados::IoCtx index_ctx
;
5961 std::string dir_oid
= dir_oid_prefix
;
5962 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
5967 dir_oid
.append(bucket_info
.bucket
.bucket_id
);
5969 std::map
<int, std::string
> bucket_objs
;
5970 get_bucket_index_objects(dir_oid
, num_shards
, bucket_objs
);
5972 return CLSRGWIssueBucketIndexClean(index_ctx
,
5974 cct
->_conf
->rgw_bucket_index_max_aio
)();
5977 void RGWRados::create_bucket_id(string
*bucket_id
)
5979 uint64_t iid
= instance_id();
5980 uint64_t bid
= next_bucket_id();
5981 char buf
[get_zone_params().get_id().size() + 48];
5982 snprintf(buf
, sizeof(buf
), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid
, (long long)bid
);
5986 int RGWRados::create_bucket(RGWUserInfo
& owner
, rgw_bucket
& bucket
,
5987 const string
& zonegroup_id
,
5988 const string
& placement_rule
,
5989 const string
& swift_ver_location
,
5990 const RGWQuotaInfo
* pquota_info
,
5991 map
<std::string
, bufferlist
>& attrs
,
5992 RGWBucketInfo
& info
,
5994 obj_version
*pep_objv
,
5995 real_time creation_time
,
5996 rgw_bucket
*pmaster_bucket
,
5997 uint32_t *pmaster_num_shards
,
6000 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
6001 string selected_placement_rule_name
;
6002 RGWZonePlacementInfo rule_info
;
6004 for (int i
= 0; i
< MAX_CREATE_RETRIES
; i
++) {
6006 ret
= select_bucket_placement(owner
, zonegroup_id
, placement_rule
,
6007 &selected_placement_rule_name
, &rule_info
);
6011 if (!pmaster_bucket
) {
6012 create_bucket_id(&bucket
.marker
);
6013 bucket
.bucket_id
= bucket
.marker
;
6015 bucket
.marker
= pmaster_bucket
->marker
;
6016 bucket
.bucket_id
= pmaster_bucket
->bucket_id
;
6019 RGWObjVersionTracker
& objv_tracker
= info
.objv_tracker
;
6022 objv_tracker
.write_version
= *pobjv
;
6024 objv_tracker
.generate_new_write_ver(cct
);
6027 info
.bucket
= bucket
;
6028 info
.owner
= owner
.user_id
;
6029 info
.zonegroup
= zonegroup_id
;
6030 info
.placement_rule
= selected_placement_rule_name
;
6031 info
.index_type
= rule_info
.index_type
;
6032 info
.swift_ver_location
= swift_ver_location
;
6033 info
.swift_versioning
= (!swift_ver_location
.empty());
6034 if (pmaster_num_shards
) {
6035 info
.num_shards
= *pmaster_num_shards
;
6037 info
.num_shards
= bucket_index_max_shards
;
6039 info
.bucket_index_shard_hash_type
= RGWBucketInfo::MOD
;
6040 info
.requester_pays
= false;
6041 if (real_clock::is_zero(creation_time
)) {
6042 info
.creation_time
= ceph::real_clock::now();
6044 info
.creation_time
= creation_time
;
6047 info
.quota
= *pquota_info
;
6050 int r
= init_bucket_index(info
, info
.num_shards
);
6055 ret
= put_linked_bucket_info(info
, exclusive
, ceph::real_time(), pep_objv
, &attrs
, true);
6056 if (ret
== -EEXIST
) {
6057 librados::IoCtx index_ctx
;
6058 map
<int, string
> bucket_objs
;
6059 int r
= open_bucket_index(info
, index_ctx
, bucket_objs
);
6063 /* we need to reread the info and return it, caller will have a use for it */
6064 RGWObjVersionTracker instance_ver
= info
.objv_tracker
;
6065 info
.objv_tracker
.clear();
6066 RGWObjectCtx
obj_ctx(this);
6067 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, NULL
);
6072 ldout(cct
, 0) << "get_bucket_info returned " << r
<< dendl
;
6076 /* only remove it if it's a different bucket instance */
6077 if (info
.bucket
.bucket_id
!= bucket
.bucket_id
) {
6078 /* remove bucket meta instance */
6079 r
= rgw_bucket_instance_remove_entry(this,
6085 /* remove bucket index objects asynchronously by best effort */
6086 (void) CLSRGWIssueBucketIndexClean(index_ctx
,
6088 cct
->_conf
->rgw_bucket_index_max_aio
)();
6090 /* ret == -ENOENT here */
6095 /* this is highly unlikely */
6096 ldout(cct
, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl
;
6100 int RGWRados::select_new_bucket_location(RGWUserInfo
& user_info
, const string
& zonegroup_id
, const string
& request_rule
,
6101 string
*pselected_rule_name
, RGWZonePlacementInfo
*rule_info
)
6104 /* first check that zonegroup exists within current period. */
6105 RGWZoneGroup zonegroup
;
6106 int ret
= get_zonegroup(zonegroup_id
, zonegroup
);
6108 ldout(cct
, 0) << "could not find zonegroup " << zonegroup_id
<< " in current period" << dendl
;
6112 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
6113 std::map
<std::string
, RGWZoneGroupPlacementTarget
>::const_iterator titer
;
6115 if (!request_rule
.empty()) {
6116 titer
= zonegroup
.placement_targets
.find(request_rule
);
6117 if (titer
== zonegroup
.placement_targets
.end()) {
6118 ldout(cct
, 0) << "could not find requested placement id " << request_rule
6119 << " within zonegroup " << dendl
;
6120 return -ERR_INVALID_LOCATION_CONSTRAINT
;
6122 } else if (!user_info
.default_placement
.empty()) {
6123 titer
= zonegroup
.placement_targets
.find(user_info
.default_placement
);
6124 if (titer
== zonegroup
.placement_targets
.end()) {
6125 ldout(cct
, 0) << "could not find user default placement id " << user_info
.default_placement
6126 << " within zonegroup " << dendl
;
6127 return -ERR_INVALID_LOCATION_CONSTRAINT
;
6130 if (zonegroup
.default_placement
.empty()) { // zonegroup default rule as fallback, it should not be empty.
6131 ldout(cct
, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl
;
6132 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION
;
6134 titer
= zonegroup
.placement_targets
.find(zonegroup
.default_placement
);
6135 if (titer
== zonegroup
.placement_targets
.end()) {
6136 ldout(cct
, 0) << "could not find zonegroup default placement id " << zonegroup
.default_placement
6137 << " within zonegroup " << dendl
;
6138 return -ERR_INVALID_LOCATION_CONSTRAINT
;
6143 /* now check tag for the rule, whether user is permitted to use rule */
6144 const auto& target_rule
= titer
->second
;
6145 if (!target_rule
.user_permitted(user_info
.placement_tags
)) {
6146 ldout(cct
, 0) << "user not permitted to use placement rule " << titer
->first
<< dendl
;
6150 if (pselected_rule_name
)
6151 *pselected_rule_name
= titer
->first
;
6153 return select_bucket_location_by_rule(titer
->first
, rule_info
);
6156 int RGWRados::select_bucket_location_by_rule(const string
& location_rule
, RGWZonePlacementInfo
*rule_info
)
6158 if (location_rule
.empty()) {
6159 /* we can only reach here if we're trying to set a bucket location from a bucket
6160 * created on a different zone, using a legacy / default pool configuration
6162 return select_legacy_bucket_placement(rule_info
);
6166 * make sure that zone has this rule configured. We're
6167 * checking it for the local zone, because that's where this bucket object is going to
6170 map
<string
, RGWZonePlacementInfo
>::iterator piter
= get_zone_params().placement_pools
.find(location_rule
);
6171 if (piter
== get_zone_params().placement_pools
.end()) {
6172 /* couldn't find, means we cannot really place data for this bucket in this zone */
6173 if (get_zonegroup().equals(zonegroup
.get_id())) {
6174 /* that's a configuration error, zone should have that rule, as we're within the requested
6178 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
6183 RGWZonePlacementInfo
& placement_info
= piter
->second
;
6186 *rule_info
= placement_info
;
// Choose a placement for a new bucket. When this zone has explicit
// placement_pools configured, defer to rule-based selection
// (select_new_bucket_location); otherwise clear the caller's selected rule
// name (when requested) and fall back to the legacy single-pool placement.
6192 int RGWRados::select_bucket_placement(RGWUserInfo
& user_info
, const string
& zonegroup_id
, const string
& placement_rule
,
6193 string
*pselected_rule_name
, RGWZonePlacementInfo
*rule_info
)
6195 if (!get_zone_params().placement_pools
.empty()) {
6196 return select_new_bucket_location(user_info
, zonegroup_id
, placement_rule
,
6197 pselected_rule_name
, rule_info
);
// Legacy path: no named rule applies.
6200 if (pselected_rule_name
) {
6201 pselected_rule_name
->clear();
6204 return select_legacy_bucket_placement(rule_info
);
6207 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo
*rule_info
)
6210 map
<string
, bufferlist
> m
;
6212 bool write_map
= false;
6214 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6216 RGWObjectCtx
obj_ctx(this);
6217 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
, avail_pools
, map_bl
, NULL
, NULL
);
6223 bufferlist::iterator iter
= map_bl
.begin();
6225 } catch (buffer::error
& err
) {
6226 ldout(cct
, 0) << "ERROR: couldn't decode avail_pools" << dendl
;
6232 ret
= omap_get_all(obj
, header
, m
);
6237 if (ret
< 0 || m
.empty()) {
6238 vector
<rgw_pool
> pools
;
6239 string s
= string("default.") + default_storage_pool_suffix
;
6240 pools
.push_back(rgw_pool(s
));
6241 vector
<int> retcodes
;
6243 ret
= create_pools(pools
, retcodes
);
6246 ret
= omap_set(obj
, s
, bl
);
6254 ::encode(m
, new_bl
);
6255 ret
= put_system_obj_data(NULL
, obj
, new_bl
, -1, false);
6257 ldout(cct
, 0) << "WARNING: could not save avail pools map info ret=" << ret
<< dendl
;
6261 map
<string
, bufferlist
>::iterator miter
;
6264 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
6265 v
.push_back(miter
->first
);
6269 ret
= get_random_bytes((char *)&r
, sizeof(r
));
6273 int i
= r
% v
.size();
6277 pool_name
= miter
->first
;
6280 rule_info
->data_pool
= pool_name
;
6281 rule_info
->data_extra_pool
= pool_name
;
6282 rule_info
->index_pool
= pool_name
;
6283 rule_info
->index_type
= RGWBIType_Normal
;
// Resolve the rados data pool backing `obj` under `placement_rule`. Thin
// wrapper over the free function rgw_get_obj_data_pool(), passing this
// store's zonegroup and zone_params; returns its boolean result (presumably
// false when no pool can be resolved — confirm in rgw_get_obj_data_pool).
6288 bool RGWRados::get_obj_data_pool(const string
& placement_rule
, const rgw_obj
& obj
, rgw_pool
*pool
)
6290 return rgw_get_obj_data_pool(zonegroup
, zone_params
, placement_rule
, obj
, pool
);
// Translate a logical rgw_obj into a raw rados object reference: fill the
// oid and locator via get_obj_bucket_and_oid_loc(), then resolve the backing
// pool from the placement rule. Returns the boolean result of the pool
// lookup (false = pool could not be resolved).
6293 bool RGWRados::obj_to_raw(const string
& placement_rule
, const rgw_obj
& obj
, rgw_raw_obj
*raw_obj
)
6295 get_obj_bucket_and_oid_loc(obj
, raw_obj
->oid
, raw_obj
->loc
);
6297 return get_obj_data_pool(placement_rule
, obj
, &raw_obj
->pool
);
// Re-serialize the avail_pools omap into a single encoded blob stored as the
// data payload of the same system object (in the zone's domain_root pool):
// read all omap entries, ::encode the map, and rewrite the object. Failures
// to write are only logged as warnings. NOTE(review): declarations of
// `header`/`new_bl` and the early-return error checks fall on lines elided
// by this extraction.
6300 int RGWRados::update_placement_map()
6303 map
<string
, bufferlist
> m
;
6304 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6305 int ret
= omap_get_all(obj
, header
, m
);
6310 ::encode(m
, new_bl
);
6311 ret
= put_system_obj_data(NULL
, obj
, new_bl
, -1, false);
6313 ldout(cct
, 0) << "WARNING: could not save avail pools map info ret=" << ret
<< dendl
;
// Register an existing rados pool as a legacy bucket-placement target.
// Verifies the pool exists via pool_lookup (a negative result aborts —
// return sits on an elided line), records the pool as an omap key with an
// empty value on the avail_pools system object, then refreshes the
// serialized placement map; that refresh's return value is deliberately
// ignored (best effort).
6319 int RGWRados::add_bucket_placement(const rgw_pool
& new_pool
)
6321 librados::Rados
*rad
= get_rados_handle();
6322 int ret
= rad
->pool_lookup(new_pool
.name
.c_str());
6323 if (ret
< 0) // DNE, or something
6326 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6327 bufferlist empty_bl
;
6328 ret
= omap_set(obj
, new_pool
.to_str(), empty_bl
);
6330 // don't care about return value
6331 update_placement_map();
// Unregister a pool from the legacy bucket-placement set: delete its omap
// key from the avail_pools system object, then refresh the serialized
// placement map (best effort — return value deliberately ignored).
6336 int RGWRados::remove_bucket_placement(const rgw_pool
& old_pool
)
6338 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6339 int ret
= omap_del(obj
, old_pool
.to_str());
6341 // don't care about return value
6342 update_placement_map();
// Fill `names` with every pool registered in the avail_pools omap and return
// the number of entries collected. NOTE(review): the `header` declaration
// and the error handling after omap_get_all sit on lines elided by this
// extraction.
6347 int RGWRados::list_placement_set(set
<rgw_pool
>& names
)
6350 map
<string
, bufferlist
> m
;
6352 rgw_raw_obj
obj(get_zone_params().domain_root
, avail_pools
);
6353 int ret
= omap_get_all(obj
, header
, m
);
// Each omap key is a pool name; values are unused here.
6358 map
<string
, bufferlist
>::iterator miter
;
6359 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
6360 names
.insert(rgw_pool(miter
->first
));
6363 return names
.size();
6366 int RGWRados::create_pools(vector
<rgw_pool
>& pools
, vector
<int>& retcodes
)
6368 vector
<librados::PoolAsyncCompletion
*> completions
;
6371 librados::Rados
*rad
= get_rados_handle();
6372 for (auto iter
= pools
.begin(); iter
!= pools
.end(); ++iter
) {
6373 librados::PoolAsyncCompletion
*c
= librados::Rados::pool_async_create_completion();
6374 completions
.push_back(c
);
6375 rgw_pool
& pool
= *iter
;
6376 int ret
= rad
->pool_create_async(pool
.name
.c_str(), c
);
6377 rets
.push_back(ret
);
6380 vector
<int>::iterator riter
;
6381 vector
<librados::PoolAsyncCompletion
*>::iterator citer
;
6384 assert(rets
.size() == completions
.size());
6385 for (riter
= rets
.begin(), citer
= completions
.begin(); riter
!= rets
.end(); ++riter
, ++citer
) {
6387 PoolAsyncCompletion
*c
= *citer
;
6390 r
= c
->get_return_value();
6392 ldout(cct
, 0) << "WARNING: async pool_create returned " << r
<< dendl
;
6397 retcodes
.push_back(r
);
6403 std::vector
<librados::IoCtx
> io_ctxs
;
6405 for (auto pool
: pools
) {
6406 io_ctxs
.emplace_back();
6407 int ret
= rad
->ioctx_create(pool
.name
.c_str(), io_ctxs
.back());
6409 ldout(cct
, 0) << "WARNING: ioctx_create returned " << ret
<< dendl
;
6412 retcodes
.push_back(ret
);
6418 completions
.clear();
6419 for (auto &io_ctx
: io_ctxs
) {
6420 librados::PoolAsyncCompletion
*c
=
6421 librados::Rados::pool_async_create_completion();
6422 completions
.push_back(c
);
6423 int ret
= io_ctx
.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW
,
6429 for (auto c
: completions
) {
6431 int ret
= c
->get_return_value();
6432 if (ret
== -EOPNOTSUPP
) {
6434 } else if (ret
< 0) {
6435 ldout(cct
, 0) << "WARNING: async application_enable returned " << ret
6440 retcodes
.push_back(ret
);
// Open an IoCtx positioned for `obj`'s head object: derive its oid/locator
// key, resolve the data pool from the bucket's placement rule (a failed
// lookup is logged as a misconfiguration; the error return sits on an elided
// line), open the pool context, and set the locator key on it.
6445 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, librados::IoCtx
*ioctx
)
6448 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
6451 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
6452 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
6456 int r
= open_pool_ctx(pool
, *ioctx
);
// All head-object I/O through this ioctx must use the object's locator.
6461 ioctx
->locator_set_key(key
);
// Build a rgw_rados_ref (oid + locator key + open ioctx) for `obj`'s head
// object. Mirrors get_obj_head_ioctx() but fills a ref structure instead of
// a bare IoCtx: resolve oid/key, map the placement rule to a data pool
// (failure logged as misconfiguration; error return on an elided line),
// open the pool context into ref->ioctx, and apply the locator key.
6466 int RGWRados::get_obj_head_ref(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_rados_ref
*ref
)
6468 get_obj_bucket_and_oid_loc(obj
, ref
->oid
, ref
->key
);
6471 if (!get_obj_data_pool(bucket_info
.placement_rule
, obj
, &pool
)) {
6472 ldout(cct
, 0) << "ERROR: cannot get data pool for obj=" << obj
<< ", probably misconfiguration" << dendl
;
6476 int r
= open_pool_ctx(pool
, ref
->ioctx
);
6481 ref
->ioctx
.locator_set_key(ref
->key
);
// Build a rgw_rados_ref for a raw object. An empty oid is the convention for
// "refer to the pool itself": the pool name becomes the oid and the object
// is placed in the zone's domain_root pool; otherwise the object's own pool
// is used. Finally opens the pool context and applies the locator key.
// NOTE(review): the initial oid/key assignment from `obj` (original lines
// 6487-6492) is elided in this extraction.
6486 int RGWRados::get_raw_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
6493 if (ref
->oid
.empty()) {
6494 ref
->oid
= obj
.pool
.to_str();
6495 ref
->pool
= get_zone_params().domain_root
;
6497 ref
->pool
= obj
.pool
;
6499 r
= open_pool_ctx(ref
->pool
, ref
->ioctx
);
6503 ref
->ioctx
.locator_set_key(ref
->key
);
// System objects use the same ref-resolution as raw objects; kept as a
// separate entry point so call sites document their intent.
6508 int RGWRados::get_system_obj_ref(const rgw_raw_obj
& obj
, rgw_rados_ref
*ref
)
6510 return get_raw_obj_ref(obj
, ref
);
6514 * fixes an issue where head objects were supposed to have a locator created, but ended
6517 int RGWRados::fix_head_obj_locator(const RGWBucketInfo
& bucket_info
, bool copy_obj
, bool remove_bad
, rgw_obj_key
& key
)
6519 const rgw_bucket
& bucket
= bucket_info
.bucket
;
6523 rgw_obj
obj(bucket
, key
);
6525 get_obj_bucket_and_oid_loc(obj
, oid
, locator
);
6527 if (locator
.empty()) {
6528 ldout(cct
, 20) << "object does not have a locator, nothing to fix" << dendl
;
6532 librados::IoCtx ioctx
;
6534 int ret
= get_obj_head_ioctx(bucket_info
, obj
, &ioctx
);
6536 cerr
<< "ERROR: get_obj_head_ioctx() returned ret=" << ret
<< std::endl
;
6539 ioctx
.locator_set_key(string()); /* override locator for this object, use empty locator */
6544 struct timespec mtime_ts
;
6545 map
<string
, bufferlist
> attrs
;
6546 librados::ObjectReadOperation op
;
6547 op
.getxattrs(&attrs
, NULL
);
6548 op
.stat2(&size
, &mtime_ts
, NULL
);
6549 #define HEAD_SIZE 512 * 1024
6550 op
.read(0, HEAD_SIZE
, &data
, NULL
);
6552 ret
= ioctx
.operate(oid
, &op
, NULL
);
6554 lderr(cct
) << "ERROR: ioctx.operate(oid=" << oid
<< ") returned ret=" << ret
<< dendl
;
6558 if (size
> HEAD_SIZE
) {
6559 lderr(cct
) << "ERROR: returned object size (" << size
<< ") > HEAD_SIZE (" << HEAD_SIZE
<< ")" << dendl
;
6563 if (size
!= data
.length()) {
6564 lderr(cct
) << "ERROR: returned object size (" << size
<< ") != data.length() (" << data
.length() << ")" << dendl
;
6569 librados::ObjectWriteOperation wop
;
6571 wop
.mtime2(&mtime_ts
);
6573 map
<string
, bufferlist
>::iterator iter
;
6574 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
6575 wop
.setxattr(iter
->first
.c_str(), iter
->second
);
6580 ioctx
.locator_set_key(locator
);
6581 ioctx
.operate(oid
, &wop
);
6585 ioctx
.locator_set_key(string());
6587 ret
= ioctx
.remove(oid
);
6589 lderr(cct
) << "ERROR: failed to remove original bad object" << dendl
;
6597 int RGWRados::move_rados_obj(librados::IoCtx
& src_ioctx
,
6598 const string
& src_oid
, const string
& src_locator
,
6599 librados::IoCtx
& dst_ioctx
,
6600 const string
& dst_oid
, const string
& dst_locator
)
6603 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6605 uint64_t chunk_size
= COPY_BUF_SIZE
;
6609 struct timespec mtime_ts
;
6612 if (src_oid
== dst_oid
&& src_locator
== dst_locator
) {
6616 src_ioctx
.locator_set_key(src_locator
);
6617 dst_ioctx
.locator_set_key(dst_locator
);
6621 ObjectReadOperation rop
;
6622 ObjectWriteOperation wop
;
6625 rop
.stat2(&size
, &mtime_ts
, NULL
);
6626 mtime
= real_clock::from_timespec(mtime_ts
);
6628 rop
.read(ofs
, chunk_size
, &data
, NULL
);
6629 ret
= src_ioctx
.operate(src_oid
, &rop
, NULL
);
6634 if (data
.length() == 0) {
6639 wop
.create(true); /* make it exclusive */
6640 wop
.mtime2(&mtime_ts
);
6641 mtime
= real_clock::from_timespec(mtime_ts
);
6643 wop
.write(ofs
, data
);
6644 ret
= dst_ioctx
.operate(dst_oid
, &wop
);
6645 ofs
+= data
.length();
6646 done
= data
.length() != chunk_size
;
6650 lderr(cct
) << "ERROR: " << __func__
<< ": copying " << src_oid
<< " -> " << dst_oid
6651 << ": expected " << size
<< " bytes to copy, ended up with " << ofs
<< dendl
;
6656 src_ioctx
.remove(src_oid
);
6661 lderr(cct
) << "ERROR: failed to copy " << src_oid
<< " -> " << dst_oid
<< dendl
;
6666 * fixes an issue where head objects were supposed to have a locator created, but ended
6669 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo
& bucket_info
, rgw_obj_key
& key
, bool fix
, bool *need_fix
)
6671 const rgw_bucket
& bucket
= bucket_info
.bucket
;
6672 rgw_obj
obj(bucket
, key
);
6679 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
6684 RGWObjState
*astate
= NULL
;
6685 RGWObjectCtx
rctx(this);
6686 r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
6690 if (astate
->has_manifest
) {
6691 RGWObjManifest::obj_iterator miter
;
6692 RGWObjManifest
& manifest
= astate
->manifest
;
6693 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
6694 rgw_raw_obj raw_loc
= miter
.get_location().get_raw_obj(this);
6699 rgw_raw_obj_to_obj(manifest
.get_tail_placement().bucket
, raw_loc
, &loc
);
6701 if (loc
.key
.ns
.empty()) {
6702 /* continue, we're only interested in tail objects */
6706 get_obj_bucket_and_oid_loc(loc
, oid
, locator
);
6707 ref
.ioctx
.locator_set_key(locator
);
6709 ldout(cct
, 20) << __func__
<< ": key=" << key
<< " oid=" << oid
<< " locator=" << locator
<< dendl
;
6711 r
= ref
.ioctx
.stat(oid
, NULL
, NULL
);
6717 prepend_bucket_marker(bucket
, loc
.key
.name
, bad_loc
);
6719 /* create a new ioctx with the bad locator */
6720 librados::IoCtx src_ioctx
;
6721 src_ioctx
.dup(ref
.ioctx
);
6722 src_ioctx
.locator_set_key(bad_loc
);
6724 r
= src_ioctx
.stat(oid
, NULL
, NULL
);
6726 /* cannot find a broken part */
6729 ldout(cct
, 20) << __func__
<< ": found bad object part: " << loc
<< dendl
;
6734 r
= move_rados_obj(src_ioctx
, oid
, bad_loc
, ref
.ioctx
, oid
, locator
);
6736 lderr(cct
) << "ERROR: copy_rados_obj() on oid=" << oid
<< " returned r=" << r
<< dendl
;
6745 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
6747 RGWBucketInfo
* bucket_info_out
)
6751 RGWObjectCtx
obj_ctx(store
);
6753 RGWBucketInfo bucket_info
;
6754 RGWBucketInfo
* bucket_info_p
=
6755 bucket_info_out
? bucket_info_out
: &bucket_info
;
6757 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
);
6762 ret
= store
->open_bucket_index_shard(*bucket_info_p
, index_ctx
, obj
.get_hash_object(), &bucket_obj
, &shard_id
);
6764 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
6767 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
6772 int RGWRados::BucketShard::init(const rgw_bucket
& _bucket
,
6774 RGWBucketInfo
* bucket_info_out
)
6779 RGWObjectCtx
obj_ctx(store
);
6781 RGWBucketInfo bucket_info
;
6782 RGWBucketInfo
* bucket_info_p
=
6783 bucket_info_out
? bucket_info_out
: &bucket_info
;
6784 int ret
= store
->get_bucket_instance_info(obj_ctx
, bucket
, *bucket_info_p
, NULL
, NULL
);
6789 ret
= store
->open_bucket_index_shard(*bucket_info_p
, index_ctx
, shard_id
, &bucket_obj
);
6791 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
6794 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
// Initialize this BucketShard from an already-loaded RGWBucketInfo and an
// explicit shard id `sid`: record the bucket, open the bucket index shard
// object, and log the resulting bucket index object. NOTE(review): the
// assignment of `shard_id` from `sid` and the error/success returns fall on
// lines elided by this extraction — the open call reads `shard_id`, so
// presumably `shard_id = sid;` precedes it; confirm against the full source.
6799 int RGWRados::BucketShard::init(const RGWBucketInfo
& bucket_info
, int sid
)
6801 bucket
= bucket_info
.bucket
;
6804 int ret
= store
->open_bucket_index_shard(bucket_info
, index_ctx
, shard_id
, &bucket_obj
);
6806 ldout(store
->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret
<< dendl
;
6809 ldout(store
->ctx(), 20) << " bucket index object: " << bucket_obj
<< dendl
;
6815 /* Execute @handler on last item in bucket listing for bucket specified
6816 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6817 * to objects matching these criterias. */
6818 int RGWRados::on_last_entry_in_listing(RGWBucketInfo
& bucket_info
,
6819 const std::string
& obj_prefix
,
6820 const std::string
& obj_delim
,
6821 std::function
<int(const rgw_bucket_dir_entry
&)> handler
)
6823 RGWRados::Bucket
target(this, bucket_info
);
6824 RGWRados::Bucket::List
list_op(&target
);
6826 list_op
.params
.prefix
= obj_prefix
;
6827 list_op
.params
.delim
= obj_delim
;
6829 ldout(cct
, 20) << "iterating listing for bucket=" << bucket_info
.bucket
.name
6830 << ", obj_prefix=" << obj_prefix
6831 << ", obj_delim=" << obj_delim
6834 bool is_truncated
= false;
6836 boost::optional
<rgw_bucket_dir_entry
> last_entry
;
6837 /* We need to rewind to the last object in a listing. */
6839 /* List bucket entries in chunks. */
6840 static constexpr int MAX_LIST_OBJS
= 100;
6841 std::vector
<rgw_bucket_dir_entry
> entries(MAX_LIST_OBJS
);
6843 int ret
= list_op
.list_objects(MAX_LIST_OBJS
, &entries
, nullptr,
6847 } else if (!entries
.empty()) {
6848 last_entry
= entries
.back();
6850 } while (is_truncated
);
6853 return handler(*last_entry
);
6856 /* Empty listing - no items we can run handler on. */
6861 int RGWRados::swift_versioning_copy(RGWObjectCtx
& obj_ctx
,
6862 const rgw_user
& user
,
6863 RGWBucketInfo
& bucket_info
,
6866 if (! swift_versioning_enabled(bucket_info
)) {
6870 obj_ctx
.obj
.set_atomic(obj
);
6872 RGWObjState
* state
= nullptr;
6873 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &state
, false);
6878 if (!state
->exists
) {
6885 const string
& src_name
= obj
.get_oid();
6886 char buf
[src_name
.size() + 32];
6887 struct timespec ts
= ceph::real_clock::to_timespec(state
->mtime
);
6888 snprintf(buf
, sizeof(buf
), "%03x%s/%lld.%06ld", (int)src_name
.size(),
6889 src_name
.c_str(), (long long)ts
.tv_sec
, ts
.tv_nsec
/ 1000);
6891 RGWBucketInfo dest_bucket_info
;
6893 r
= get_bucket_info(obj_ctx
, bucket_info
.bucket
.tenant
, bucket_info
.swift_ver_location
, dest_bucket_info
, NULL
, NULL
);
6895 ldout(cct
, 10) << "failed to read dest bucket info: r=" << r
<< dendl
;
6897 return -ERR_PRECONDITION_FAILED
;
6902 if (dest_bucket_info
.owner
!= bucket_info
.owner
) {
6903 return -ERR_PRECONDITION_FAILED
;
6906 rgw_obj
dest_obj(dest_bucket_info
.bucket
, buf
);
6907 obj_ctx
.obj
.set_atomic(dest_obj
);
6911 r
= copy_obj(obj_ctx
,
6915 NULL
, /* req_info *info */
6921 NULL
, /* time_t *src_mtime */
6922 NULL
, /* time_t *mtime */
6923 NULL
, /* const time_t *mod_ptr */
6924 NULL
, /* const time_t *unmod_ptr */
6925 false, /* bool high_precision_time */
6926 NULL
, /* const char *if_match */
6927 NULL
, /* const char *if_nomatch */
6928 RGWRados::ATTRSMOD_NONE
,
6929 true, /* bool copy_if_newer */
6931 RGW_OBJ_CATEGORY_MAIN
,
6932 0, /* uint64_t olh_epoch */
6933 real_time(), /* time_t delete_at */
6934 NULL
, /* string *version_id */
6935 NULL
, /* string *ptag */
6936 NULL
, /* string *petag */
6937 NULL
, /* void (*progress_cb)(off_t, void *) */
6938 NULL
); /* void *progress_data */
6939 if (r
== -ECANCELED
|| r
== -ENOENT
) {
6940 /* Has already been overwritten, meaning another rgw process already
6948 int RGWRados::swift_versioning_restore(RGWObjectCtx
& obj_ctx
,
6949 const rgw_user
& user
,
6950 RGWBucketInfo
& bucket_info
,
6952 bool& restored
) /* out */
6954 if (! swift_versioning_enabled(bucket_info
)) {
6958 /* Bucket info of the bucket that stores previous versions of our object. */
6959 RGWBucketInfo archive_binfo
;
6961 int ret
= get_bucket_info(obj_ctx
, bucket_info
.bucket
.tenant
,
6962 bucket_info
.swift_ver_location
, archive_binfo
,
6968 /* Abort the operation if the bucket storing our archive belongs to someone
6969 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6970 * into consideration. For we can live with that.
6972 * TODO: delegate this check to un upper layer and compare with ACLs. */
6973 if (bucket_info
.owner
!= archive_binfo
.owner
) {
6977 /* This code will be executed on latest version of the object. */
6978 const auto handler
= [&](const rgw_bucket_dir_entry
& entry
) -> int {
6979 std::string no_client_id
;
6980 std::string no_op_id
;
6981 std::string no_zone
;
6983 /* We don't support object versioning of Swift API on those buckets that
6984 * are already versioned using the S3 mechanism. This affects also bucket
6985 * storing archived objects. Otherwise the delete operation would create
6986 * a deletion marker. */
6987 if (archive_binfo
.versioned()) {
6989 return -ERR_PRECONDITION_FAILED
;
6992 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6993 * irrelevant and may be safely skipped. */
6994 std::map
<std::string
, ceph::bufferlist
> no_attrs
;
6996 rgw_obj
archive_obj(archive_binfo
.bucket
, entry
.key
);
6997 obj_ctx
.obj
.set_atomic(archive_obj
);
6998 obj_ctx
.obj
.set_atomic(obj
);
7000 int ret
= copy_obj(obj_ctx
,
7004 nullptr, /* req_info *info */
7007 archive_obj
, /* src obj */
7008 bucket_info
, /* dest bucket info */
7009 archive_binfo
, /* src bucket info */
7010 nullptr, /* time_t *src_mtime */
7011 nullptr, /* time_t *mtime */
7012 nullptr, /* const time_t *mod_ptr */
7013 nullptr, /* const time_t *unmod_ptr */
7014 false, /* bool high_precision_time */
7015 nullptr, /* const char *if_match */
7016 nullptr, /* const char *if_nomatch */
7017 RGWRados::ATTRSMOD_NONE
,
7018 true, /* bool copy_if_newer */
7020 RGW_OBJ_CATEGORY_MAIN
,
7021 0, /* uint64_t olh_epoch */
7022 real_time(), /* time_t delete_at */
7023 nullptr, /* string *version_id */
7024 nullptr, /* string *ptag */
7025 nullptr, /* string *petag */
7026 nullptr, /* void (*progress_cb)(off_t, void *) */
7027 nullptr); /* void *progress_data */
7028 if (ret
== -ECANCELED
|| ret
== -ENOENT
) {
7029 /* Has already been overwritten, meaning another rgw process already
7032 } else if (ret
< 0) {
7038 /* Need to remove the archived copy. */
7039 ret
= delete_obj(obj_ctx
, archive_binfo
, archive_obj
,
7040 archive_binfo
.versioning_status());
7045 const std::string
& obj_name
= obj
.get_oid();
7046 const auto prefix
= boost::str(boost::format("%03x%s") % obj_name
.size()
7049 return on_last_entry_in_listing(archive_binfo
, prefix
, std::string(),
7054 * Write/overwrite an object to the bucket storage.
7055 * bucket: the bucket to store the object in
7056 * obj: the object name/key
7057 * data: the object contents/value
7058 * size: the amount of data to write (data must be this long)
7059 * accounted_size: original size of data before compression, encryption
7060 * mtime: if non-NULL, writes the given mtime to the bucket storage
7061 * attrs: all the given attrs are written to bucket storage for the given object
7062 * exclusive: create object exclusively
7063 * Returns: 0 on success, -ERR# otherwise.
7065 int RGWRados::Object::Write::_do_write_meta(uint64_t size
, uint64_t accounted_size
,
7066 map
<string
, bufferlist
>& attrs
,
7067 bool assume_noent
, bool modify_tail
,
7070 RGWRados::Bucket::UpdateIndex
*index_op
= static_cast<RGWRados::Bucket::UpdateIndex
*>(_index_op
);
7071 RGWRados
*store
= target
->get_store();
7073 ObjectWriteOperation op
;
7076 int r
= target
->get_state(&state
, false, assume_noent
);
7080 rgw_obj
& obj
= target
->get_obj();
7082 if (obj
.get_oid().empty()) {
7083 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< "(): cannot write object with empty name" << dendl
;
7088 r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
7092 bool is_olh
= state
->is_olh
;
7094 bool reset_obj
= (meta
.flags
& PUT_OBJ_CREATE
) != 0;
7096 const string
*ptag
= meta
.ptag
;
7097 if (!ptag
&& !index_op
->get_optag()->empty()) {
7098 ptag
= index_op
->get_optag();
7100 r
= target
->prepare_atomic_modification(op
, reset_obj
, ptag
, meta
.if_match
, meta
.if_nomatch
, false, modify_tail
);
7104 if (real_clock::is_zero(meta
.set_mtime
)) {
7105 meta
.set_mtime
= real_clock::now();
7108 if (state
->is_olh
) {
7109 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, state
->olh_tag
);
7112 struct timespec mtime_ts
= real_clock::to_timespec(meta
.set_mtime
);
7113 op
.mtime2(&mtime_ts
);
7116 /* if we want to overwrite the data, we also want to overwrite the
7117 xattrs, so just remove the object */
7118 op
.write_full(*meta
.data
);
7122 string content_type
;
7125 map
<string
, bufferlist
>::iterator iter
;
7127 for (iter
= meta
.rmattrs
->begin(); iter
!= meta
.rmattrs
->end(); ++iter
) {
7128 const string
& name
= iter
->first
;
7129 op
.rmxattr(name
.c_str());
7133 if (meta
.manifest
) {
7134 /* remove existing manifest attr */
7135 iter
= attrs
.find(RGW_ATTR_MANIFEST
);
7136 if (iter
!= attrs
.end())
7140 ::encode(*meta
.manifest
, bl
);
7141 op
.setxattr(RGW_ATTR_MANIFEST
, bl
);
7144 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
7145 const string
& name
= iter
->first
;
7146 bufferlist
& bl
= iter
->second
;
7151 op
.setxattr(name
.c_str(), bl
);
7153 if (name
.compare(RGW_ATTR_ETAG
) == 0) {
7155 } else if (name
.compare(RGW_ATTR_CONTENT_TYPE
) == 0) {
7156 content_type
= bl
.c_str();
7157 } else if (name
.compare(RGW_ATTR_ACL
) == 0) {
7161 if (attrs
.find(RGW_ATTR_PG_VER
) == attrs
.end()) {
7162 cls_rgw_obj_store_pg_ver(op
, RGW_ATTR_PG_VER
);
7165 if (attrs
.find(RGW_ATTR_SOURCE_ZONE
) == attrs
.end()) {
7167 ::encode(store
->get_zone_short_id(), bl
);
7168 op
.setxattr(RGW_ATTR_SOURCE_ZONE
, bl
);
7179 if (!reset_obj
) { //Multipart upload, it has immutable head.
7180 orig_exists
= false;
7183 orig_exists
= state
->exists
;
7184 orig_size
= state
->accounted_size
;
7187 bool versioned_target
= (meta
.olh_epoch
&& *meta
.olh_epoch
> 0) ||
7188 !obj
.key
.instance
.empty();
7190 bool versioned_op
= (target
->versioning_enabled() || is_olh
|| versioned_target
);
7193 index_op
->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP
);
7196 if (!index_op
->is_prepared()) {
7197 r
= index_op
->prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
7202 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
7203 if (r
< 0) { /* we can expect to get -ECANCELED if object was replaced under,
7204 or -ENOENT if was removed, or -EEXIST if it did not exist
7205 before and now it does */
7206 if (r
== -EEXIST
&& assume_noent
) {
7207 target
->invalidate_state();
7213 epoch
= ref
.ioctx
.get_last_version();
7214 poolid
= ref
.ioctx
.get_id();
7216 r
= target
->complete_atomic_modification();
7218 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r
<< dendl
;
7221 r
= index_op
->complete(poolid
, epoch
, size
, accounted_size
,
7222 meta
.set_mtime
, etag
, content_type
, &acl_bl
,
7223 meta
.category
, meta
.remove_objs
, meta
.user_data
);
7228 *meta
.mtime
= meta
.set_mtime
;
7231 /* note that index_op was using state so we couldn't invalidate it earlier */
7232 target
->invalidate_state();
7235 if (versioned_op
&& meta
.olh_epoch
) {
7236 r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), obj
, false, NULL
, *meta
.olh_epoch
, real_time(), false, meta
.zones_trace
);
7242 if (!real_clock::is_zero(meta
.delete_at
)) {
7243 rgw_obj_index_key obj_key
;
7244 obj
.key
.get_index_key(&obj_key
);
7246 r
= store
->objexp_hint_add(meta
.delete_at
,
7247 obj
.bucket
.tenant
, obj
.bucket
.name
, obj
.bucket
.bucket_id
, obj_key
);
7249 ldout(store
->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r
<< ", object will not get removed" << dendl
;
7250 /* ignoring error, nothing we can do at this point */
7253 meta
.canceled
= false;
7255 /* update quota cache */
7256 if (meta
.completeMultipart
){
7257 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
7261 store
->quota_handler
->update_stats(meta
.owner
, obj
.bucket
, (orig_exists
? 0 : 1),
7262 accounted_size
, orig_size
);
7267 int ret
= index_op
->cancel();
7269 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret
<< dendl
;
7272 meta
.canceled
= true;
7274 /* we lost in a race. There are a few options:
7275 * - existing object was rewritten (ECANCELED)
7276 * - non existing object was created (EEXIST)
7277 * - object was removed (ENOENT)
7278 * should treat it as a success
7280 if (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
) {
7281 if (r
== -ECANCELED
|| r
== -ENOENT
|| r
== -EEXIST
) {
7285 if (meta
.if_match
!= NULL
) {
7286 // only overwrite existing object
7287 if (strcmp(meta
.if_match
, "*") == 0) {
7289 r
= -ERR_PRECONDITION_FAILED
;
7290 } else if (r
== -ECANCELED
) {
7296 if (meta
.if_nomatch
!= NULL
) {
7297 // only create a new object
7298 if (strcmp(meta
.if_nomatch
, "*") == 0) {
7300 r
= -ERR_PRECONDITION_FAILED
;
7301 } else if (r
== -ENOENT
) {
7311 int RGWRados::Object::Write::write_meta(uint64_t size
, uint64_t accounted_size
,
7312 map
<string
, bufferlist
>& attrs
)
7314 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
7316 RGWRados::Bucket
bop(target
->get_store(), bucket_info
);
7317 RGWRados::Bucket::UpdateIndex
index_op(&bop
, target
->get_obj());
7318 index_op
.set_zones_trace(meta
.zones_trace
);
7320 bool assume_noent
= (meta
.if_match
== NULL
&& meta
.if_nomatch
== NULL
);
7323 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
);
7325 assume_noent
= false;
7328 if (!assume_noent
) {
7329 r
= _do_write_meta(size
, accounted_size
, attrs
, assume_noent
, meta
.modify_tail
, (void *)&index_op
);
7334 /** Write/overwrite a system object. */
7335 int RGWRados::put_system_obj_impl(rgw_raw_obj
& obj
, uint64_t size
, real_time
*mtime
,
7336 map
<std::string
, bufferlist
>& attrs
, int flags
,
7338 RGWObjVersionTracker
*objv_tracker
,
7339 real_time set_mtime
/* 0 for don't set */)
7342 int r
= get_system_obj_ref(obj
, &ref
);
7346 ObjectWriteOperation op
;
7348 if (flags
& PUT_OBJ_EXCL
) {
7349 if (!(flags
& PUT_OBJ_CREATE
))
7351 op
.create(true); // exclusive create
7354 op
.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK
);
7359 objv_tracker
->prepare_op_for_write(&op
);
7362 if (real_clock::is_zero(set_mtime
)) {
7363 set_mtime
= real_clock::now();
7366 struct timespec mtime_ts
= real_clock::to_timespec(set_mtime
);
7367 op
.mtime2(&mtime_ts
);
7368 op
.write_full(data
);
7372 for (map
<string
, bufferlist
>::iterator iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
7373 const string
& name
= iter
->first
;
7374 bufferlist
& bl
= iter
->second
;
7379 op
.setxattr(name
.c_str(), bl
);
7382 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
7388 objv_tracker
->apply_write();
// Write `bl` into a system object. Resolves a system-object ref, stages an
// ObjectWriteOperation (version-tracker preconditions applied when
// `objv_tracker` is provided), executes it synchronously, and on success
// applies the tracker's post-write bookkeeping. NOTE(review): the handling
// of `ofs`/`exclusive` and the error returns sit on lines elided by this
// extraction.
7398 int RGWRados::put_system_obj_data(void *ctx
, rgw_raw_obj
& obj
, bufferlist
& bl
,
7399 off_t ofs
, bool exclusive
,
7400 RGWObjVersionTracker
*objv_tracker
)
7403 int r
= get_system_obj_ref(obj
, &ref
);
7408 ObjectWriteOperation op
;
7414 objv_tracker
->prepare_op_for_write(&op
);
7421 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
7426 objv_tracker
->apply_write();
7432 * Write/overwrite an object to the bucket storage.
7433 * bucket: the bucket to store the object in
7434 * obj: the object name/key
7435 * data: the object contents/value
7436 * offset: the offset to write to in the object
7437 * If this is -1, we will overwrite the whole object.
7438 * size: the amount of data to write (data must be this long)
7439 * attrs: all the given attrs are written to bucket storage for the given object
7440 * Returns: 0 on success, -ERR# otherwise.
// Asynchronous variant: resolves a raw-object ref, creates a librados
// AioCompletion, stages the write op, and submits it via aio_operate.
// NOTE(review): the handle out-parameter and the ofs/exclusive staging lines
// (original 7445-7465) are elided in this extraction; presumably the
// completion pointer is returned to the caller for aio_wait()/aio_completed().
7443 int RGWRados::aio_put_obj_data(void *ctx
, rgw_raw_obj
& obj
, bufferlist
& bl
,
7444 off_t ofs
, bool exclusive
,
7448 int r
= get_raw_obj_ref(obj
, &ref
);
7453 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
7456 ObjectWriteOperation op
;
7466 r
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
// Finish an async write started by aio_put_obj_data(): cast the opaque
// handle back to its AioCompletion and return its result code.
// NOTE(review): the wait and completion-release calls (original lines
// 7474-7480) are elided in this extraction — presumably the completion is
// waited on before get_return_value() and released afterwards; confirm.
7473 int RGWRados::aio_wait(void *handle
)
7475 AioCompletion
*c
= (AioCompletion
*)handle
;
7477 int ret
= c
->get_return_value();
// Non-blocking check of an async write: casts the opaque handle back to an
// AioCompletion and reports is_safe() (librados' "write is durably
// committed" state).
7482 bool RGWRados::aio_completed(void *handle
)
7484 AioCompletion
*c
= (AioCompletion
*)handle
;
7485 return c
->is_safe();
7488 // PutObj filter that buffers data so we don't try to compress tiny blocks.
7489 // libcurl reads in 16k at a time, and we need at least 64k to get a good
7490 // compression ratio
7491 class RGWPutObj_Buffer
: public RGWPutObj_Filter
{
// Accumulation granularity; asserted to be a power of 2 so P2ALIGN applies.
7492 const unsigned buffer_size
;
// NOTE(review): the `buffer` bufferlist member and access-specifier lines
// (original 7493-7494) are elided in this extraction.
7495 RGWPutObj_Buffer(RGWPutObjDataProcessor
* next
, unsigned buffer_size
)
7496 : RGWPutObj_Filter(next
), buffer_size(buffer_size
) {
7497 assert(ISP2(buffer_size
)); // must be power of 2
// Buffer incoming data; only forward whole buffer_size-aligned chunks
// downstream. A re-entry (*again) or an empty bl flushes whatever is held.
7500 int handle_data(bufferlist
& bl
, off_t ofs
, void **phandle
, rgw_raw_obj
*pobj
,
7501 bool *again
) override
{
7502 if (*again
|| !bl
.length()) {
7503 // flush buffered data
7504 return RGWPutObj_Filter::handle_data(buffer
, ofs
, phandle
, pobj
, again
);
7506 // transform offset to the beginning of the buffer
7507 ofs
= ofs
- buffer
.length();
7508 buffer
.claim_append(bl
);
7509 if (buffer
.length() < buffer_size
) {
7510 *again
= false; // don't come back until there's more data
// Move the aligned prefix out of the buffer into bl and pass it along;
// the unaligned remainder stays buffered for the next call.
7513 const auto count
= P2ALIGN(buffer
.length(), buffer_size
);
7514 buffer
.splice(0, count
, &bl
);
7515 return RGWPutObj_Filter::handle_data(bl
, ofs
, phandle
, pobj
, again
);
7519 class RGWRadosPutObj
: public RGWGetDataCB
7523 RGWPutObjDataProcessor
*filter
;
7524 boost::optional
<RGWPutObj_Compress
>& compressor
;
7525 boost::optional
<RGWPutObj_Buffer
> buffering
;
7526 CompressorRef
& plugin
;
7527 RGWPutObjProcessor_Atomic
*processor
;
7528 RGWOpStateSingleOp
*opstate
;
7529 void (*progress_cb
)(off_t
, void *);
7530 void *progress_data
;
7531 bufferlist extra_data_bl
;
7532 uint64_t extra_data_left
;
7534 map
<string
, bufferlist
> src_attrs
;
7536 RGWRadosPutObj(CephContext
* cct
,
7537 CompressorRef
& plugin
,
7538 boost::optional
<RGWPutObj_Compress
>& compressor
,
7539 RGWPutObjProcessor_Atomic
*p
,
7540 RGWOpStateSingleOp
*_ops
,
7541 void (*_progress_cb
)(off_t
, void *),
7542 void *_progress_data
) :
7545 compressor(compressor
),
7549 progress_cb(_progress_cb
),
7550 progress_data(_progress_data
),
7554 int process_attrs(void) {
7555 if (extra_data_bl
.length()) {
7557 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
7558 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
7562 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
7564 src_attrs
.erase(RGW_ATTR_COMPRESSION
);
7565 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
7568 if (plugin
&& src_attrs
.find(RGW_ATTR_CRYPT_MODE
) == src_attrs
.end()) {
7569 //do not compress if object is encrypted
7570 compressor
= boost::in_place(cct
, plugin
, filter
);
7571 constexpr unsigned buffer_size
= 512 * 1024;
7572 buffering
= boost::in_place(&*compressor
, buffer_size
);
7573 filter
= &*buffering
;
7578 int handle_data(bufferlist
& bl
, off_t ofs
, off_t len
) override
{
7580 progress_cb(ofs
, progress_data
);
7582 if (extra_data_left
) {
7583 size_t extra_len
= bl
.length();
7584 if (extra_len
> extra_data_left
)
7585 extra_len
= extra_data_left
;
7588 bl
.splice(0, extra_len
, &extra
);
7589 extra_data_bl
.append(extra
);
7591 extra_data_left
-= extra_len
;
7592 if (extra_data_left
== 0) {
7593 int res
= process_attrs();
7597 if (bl
.length() == 0) {
7602 // adjust ofs based on extra_data_len, so the result is a logical offset
7603 // into the object data
7604 assert(uint64_t(ofs
) >= extra_data_len
);
7605 ofs
-= extra_data_len
;
7607 data_len
+= bl
.length();
7610 bool need_opstate
= true;
7613 void *handle
= NULL
;
7615 uint64_t size
= bl
.length();
7616 int ret
= filter
->handle_data(bl
, ofs
, &handle
, &obj
, &again
);
7620 if (need_opstate
&& opstate
) {
7621 /* need to update opstate repository with new state. This is ratelimited, so we're not
7622 * really doing it every time
7624 ret
= opstate
->renew_state();
7626 ldout(cct
, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret
<< dendl
;
7627 int r
= filter
->throttle_data(handle
, obj
, size
, false);
7629 ldout(cct
, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r
<< dendl
;
7631 /* could not renew state! might have been marked as cancelled */
7634 need_opstate
= false;
7637 ret
= filter
->throttle_data(handle
, obj
, size
, false);
7647 return put_data_and_throttle(filter
, bl
, 0, false);
7650 bufferlist
& get_extra_data() { return extra_data_bl
; }
7652 map
<string
, bufferlist
>& get_attrs() { return src_attrs
; }
7654 void set_extra_data_len(uint64_t len
) override
{
7655 extra_data_left
= len
;
7656 RGWGetDataCB::set_extra_data_len(len
);
7659 uint64_t get_data_len() {
7663 int complete(const string
& etag
, real_time
*mtime
, real_time set_mtime
,
7664 map
<string
, bufferlist
>& attrs
, real_time delete_at
, rgw_zone_set
*zones_trace
) {
7665 return processor
->complete(data_len
, etag
, mtime
, set_mtime
, attrs
, delete_at
, NULL
, NULL
, NULL
, zones_trace
);
7668 bool is_canceled() {
7669 return processor
->is_canceled();
/* prepare attrset depending on attrs_mod. */
7676 static void set_copy_attrs(map
<string
, bufferlist
>& src_attrs
,
7677 map
<string
, bufferlist
>& attrs
,
7678 RGWRados::AttrsMod attrs_mod
)
7680 switch (attrs_mod
) {
7681 case RGWRados::ATTRSMOD_NONE
:
7684 case RGWRados::ATTRSMOD_REPLACE
:
7685 if (!attrs
[RGW_ATTR_ETAG
].length()) {
7686 attrs
[RGW_ATTR_ETAG
] = src_attrs
[RGW_ATTR_ETAG
];
7688 if (!attrs
[RGW_ATTR_TAIL_TAG
].length()) {
7689 auto ttiter
= src_attrs
.find(RGW_ATTR_TAIL_TAG
);
7690 if (ttiter
!= src_attrs
.end()) {
7691 attrs
[RGW_ATTR_TAIL_TAG
] = src_attrs
[RGW_ATTR_TAIL_TAG
];
7695 case RGWRados::ATTRSMOD_MERGE
:
7696 for (map
<string
, bufferlist
>::iterator it
= src_attrs
.begin(); it
!= src_attrs
.end(); ++it
) {
7697 if (attrs
.find(it
->first
) == attrs
.end()) {
7698 attrs
[it
->first
] = it
->second
;
7705 int RGWRados::rewrite_obj(RGWBucketInfo
& dest_bucket_info
, rgw_obj
& obj
)
7707 map
<string
, bufferlist
> attrset
;
7711 RGWObjectCtx
rctx(this);
7713 RGWRados::Object
op_target(this, dest_bucket_info
, rctx
, obj
);
7714 RGWRados::Object::Read
read_op(&op_target
);
7716 read_op
.params
.attrs
= &attrset
;
7717 read_op
.params
.lastmod
= &mtime
;
7718 read_op
.params
.obj_size
= &obj_size
;
7720 int ret
= read_op
.prepare();
7724 attrset
.erase(RGW_ATTR_ID_TAG
);
7725 attrset
.erase(RGW_ATTR_TAIL_TAG
);
7727 uint64_t max_chunk_size
;
7729 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, obj
, &max_chunk_size
);
7731 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj
.bucket
<< dendl
;
7735 return copy_obj_data(rctx
, dest_bucket_info
, read_op
, obj_size
- 1, obj
, obj
,
7736 max_chunk_size
, NULL
, mtime
, attrset
,
7737 RGW_OBJ_CATEGORY_MAIN
, 0, real_time(),
7738 (obj
.key
.instance
.empty() ? NULL
: &(obj
.key
.instance
)),
7742 struct obj_time_weight
{
7744 uint32_t zone_short_id
;
7746 bool high_precision
;
7748 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7750 bool compare_low_precision(const obj_time_weight
& rhs
) {
7751 struct timespec l
= ceph::real_clock::to_timespec(mtime
);
7752 struct timespec r
= ceph::real_clock::to_timespec(rhs
.mtime
);
7761 if (zone_short_id
!= rhs
.zone_short_id
) {
7762 return (zone_short_id
< rhs
.zone_short_id
);
7764 return (pg_ver
< rhs
.pg_ver
);
7768 bool operator<(const obj_time_weight
& rhs
) {
7769 if (!high_precision
|| !rhs
.high_precision
) {
7770 return compare_low_precision(rhs
);
7772 if (mtime
> rhs
.mtime
) {
7775 if (mtime
< rhs
.mtime
) {
7778 if (zone_short_id
!= rhs
.zone_short_id
) {
7779 return (zone_short_id
< rhs
.zone_short_id
);
7781 return (pg_ver
< rhs
.pg_ver
);
7784 void init(const real_time
& _mtime
, uint32_t _short_id
, uint64_t _pg_ver
) {
7786 zone_short_id
= _short_id
;
7790 void init(RGWObjState
*state
) {
7791 mtime
= state
->mtime
;
7792 zone_short_id
= state
->zone_short_id
;
7793 pg_ver
= state
->pg_ver
;
7797 inline ostream
& operator<<(ostream
& out
, const obj_time_weight
&o
) {
7800 if (o
.zone_short_id
!= 0 || o
.pg_ver
!= 0) {
7801 out
<< "[zid=" << o
.zone_short_id
<< ", pgv=" << o
.pg_ver
<< "]";
7807 class RGWGetExtraDataCB
: public RGWGetDataCB
{
7808 bufferlist extra_data
;
7810 RGWGetExtraDataCB() {}
7811 int handle_data(bufferlist
& bl
, off_t bl_ofs
, off_t bl_len
) override
{
7812 if (extra_data
.length() < extra_data_len
) {
7813 off_t max
= extra_data_len
- extra_data
.length();
7817 bl
.splice(0, max
, &extra_data
);
7822 bufferlist
& get_extra_data() {
7827 int RGWRados::stat_remote_obj(RGWObjectCtx
& obj_ctx
,
7828 const rgw_user
& user_id
,
7829 const string
& client_id
,
7831 const string
& source_zone
,
7833 RGWBucketInfo
& src_bucket_info
,
7834 real_time
*src_mtime
,
7836 const real_time
*mod_ptr
,
7837 const real_time
*unmod_ptr
,
7838 bool high_precision_time
,
7839 const char *if_match
,
7840 const char *if_nomatch
,
7841 map
<string
, bufferlist
> *pattrs
,
7846 /* source is in a different zonegroup, copy from there */
7848 RGWRESTStreamRWRequest
*in_stream_req
;
7850 map
<string
, bufferlist
> src_attrs
;
7851 append_rand_alpha(cct
, tag
, tag
, 32);
7852 obj_time_weight set_mtime_weight
;
7853 set_mtime_weight
.high_precision
= high_precision_time
;
7856 if (source_zone
.empty()) {
7857 if (src_bucket_info
.zonegroup
.empty()) {
7858 /* source is in the master zonegroup */
7859 conn
= rest_master_conn
;
7861 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
7862 if (iter
== zonegroup_conn_map
.end()) {
7863 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
7866 conn
= iter
->second
;
7869 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
7870 if (iter
== zone_conn_map
.end()) {
7871 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
7874 conn
= iter
->second
;
7877 RGWGetExtraDataCB cb
;
7879 map
<string
, string
> req_headers
;
7880 real_time set_mtime
;
7882 const real_time
*pmod
= mod_ptr
;
7884 obj_time_weight dest_mtime_weight
;
7886 constexpr bool prepend_meta
= true;
7887 constexpr bool get_op
= true;
7888 constexpr bool rgwx_stat
= true;
7889 constexpr bool sync_manifest
= true;
7890 constexpr bool skip_decrypt
= true;
7891 int ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
7892 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
7893 prepend_meta
, get_op
, rgwx_stat
,
7894 sync_manifest
, skip_decrypt
, &cb
, &in_stream_req
);
7899 ret
= conn
->complete_request(in_stream_req
, etag
, &set_mtime
, psize
, req_headers
);
7904 bufferlist
& extra_data_bl
= cb
.get_extra_data();
7905 if (extra_data_bl
.length()) {
7907 if (!jp
.parse(extra_data_bl
.c_str(), extra_data_bl
.length())) {
7908 ldout(cct
, 0) << "failed to parse response extra data. len=" << extra_data_bl
.length() << " data=" << extra_data_bl
.c_str() << dendl
;
7912 JSONDecoder::decode_json("attrs", src_attrs
, &jp
);
7914 src_attrs
.erase(RGW_ATTR_MANIFEST
); // not interested in original object layout
7918 *src_mtime
= set_mtime
;
7922 map
<string
, bufferlist
>::iterator iter
= src_attrs
.find(RGW_ATTR_ETAG
);
7923 if (iter
!= src_attrs
.end()) {
7924 bufferlist
& etagbl
= iter
->second
;
7925 *petag
= etagbl
.to_str();
7930 *pattrs
= src_attrs
;
7936 int RGWRados::fetch_remote_obj(RGWObjectCtx
& obj_ctx
,
7937 const rgw_user
& user_id
,
7938 const string
& client_id
,
7939 const string
& op_id
,
7940 bool record_op_state
,
7942 const string
& source_zone
,
7945 RGWBucketInfo
& dest_bucket_info
,
7946 RGWBucketInfo
& src_bucket_info
,
7947 real_time
*src_mtime
,
7949 const real_time
*mod_ptr
,
7950 const real_time
*unmod_ptr
,
7951 bool high_precision_time
,
7952 const char *if_match
,
7953 const char *if_nomatch
,
7956 map
<string
, bufferlist
>& attrs
,
7957 RGWObjCategory category
,
7958 boost::optional
<uint64_t> olh_epoch
,
7959 real_time delete_at
,
7962 ceph::buffer::list
*petag
,
7963 void (*progress_cb
)(off_t
, void *),
7964 void *progress_data
,
7965 rgw_zone_set
*zones_trace
)
7967 /* source is in a different zonegroup, copy from there */
7969 RGWRESTStreamRWRequest
*in_stream_req
;
7972 append_rand_alpha(cct
, tag
, tag
, 32);
7973 obj_time_weight set_mtime_weight
;
7974 set_mtime_weight
.high_precision
= high_precision_time
;
7976 RGWPutObjProcessor_Atomic
processor(obj_ctx
,
7977 dest_bucket_info
, dest_obj
.bucket
, dest_obj
.key
.name
,
7978 cct
->_conf
->rgw_obj_stripe_size
, tag
, dest_bucket_info
.versioning_enabled());
7979 if (version_id
&& *version_id
!= "null") {
7980 processor
.set_version_id(*version_id
);
7983 processor
.set_olh_epoch(*olh_epoch
);
7985 int ret
= processor
.prepare(this, NULL
);
7991 if (source_zone
.empty()) {
7992 if (dest_bucket_info
.zonegroup
.empty()) {
7993 /* source is in the master zonegroup */
7994 conn
= rest_master_conn
;
7996 map
<string
, RGWRESTConn
*>::iterator iter
= zonegroup_conn_map
.find(src_bucket_info
.zonegroup
);
7997 if (iter
== zonegroup_conn_map
.end()) {
7998 ldout(cct
, 0) << "could not find zonegroup connection to zonegroup: " << source_zone
<< dendl
;
8001 conn
= iter
->second
;
8004 map
<string
, RGWRESTConn
*>::iterator iter
= zone_conn_map
.find(source_zone
);
8005 if (iter
== zone_conn_map
.end()) {
8006 ldout(cct
, 0) << "could not find zone connection to zone: " << source_zone
<< dendl
;
8009 conn
= iter
->second
;
8012 string obj_name
= dest_obj
.bucket
.name
+ "/" + dest_obj
.get_oid();
8014 RGWOpStateSingleOp
*opstate
= NULL
;
8016 if (record_op_state
) {
8017 opstate
= new RGWOpStateSingleOp(this, client_id
, op_id
, obj_name
);
8019 ret
= opstate
->set_state(RGWOpState::OPSTATE_IN_PROGRESS
);
8021 ldout(cct
, 0) << "ERROR: failed to set opstate ret=" << ret
<< dendl
;
8027 boost::optional
<RGWPutObj_Compress
> compressor
;
8028 CompressorRef plugin
;
8030 const auto& compression_type
= zone_params
.get_compression_type(
8031 dest_bucket_info
.placement_rule
);
8032 if (compression_type
!= "none") {
8033 plugin
= Compressor::create(cct
, compression_type
);
8035 ldout(cct
, 1) << "Cannot load plugin for compression type "
8036 << compression_type
<< dendl
;
8040 RGWRadosPutObj
cb(cct
, plugin
, compressor
, &processor
, opstate
, progress_cb
, progress_data
);
8043 map
<string
, string
> req_headers
;
8044 real_time set_mtime
;
8046 RGWObjState
*dest_state
= NULL
;
8048 const real_time
*pmod
= mod_ptr
;
8050 obj_time_weight dest_mtime_weight
;
8052 if (copy_if_newer
) {
8053 /* need to get mtime for destination */
8054 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
8058 if (!real_clock::is_zero(dest_state
->mtime
)) {
8059 dest_mtime_weight
.init(dest_state
);
8060 pmod
= &dest_mtime_weight
.mtime
;
8064 static constexpr bool prepend_meta
= true;
8065 static constexpr bool get_op
= true;
8066 static constexpr bool rgwx_stat
= false;
8067 static constexpr bool sync_manifest
= true;
8068 static constexpr bool skip_decrypt
= true;
8069 ret
= conn
->get_obj(user_id
, info
, src_obj
, pmod
, unmod_ptr
,
8070 dest_mtime_weight
.zone_short_id
, dest_mtime_weight
.pg_ver
,
8071 prepend_meta
, get_op
, rgwx_stat
,
8072 sync_manifest
, skip_decrypt
, &cb
, &in_stream_req
);
8077 ret
= conn
->complete_request(in_stream_req
, etag
, &set_mtime
, nullptr, req_headers
);
8085 if (compressor
&& compressor
->is_compressed()) {
8087 RGWCompressionInfo cs_info
;
8088 cs_info
.compression_type
= plugin
->get_type_name();
8089 cs_info
.orig_size
= cb
.get_data_len();
8090 cs_info
.blocks
= move(compressor
->get_compression_blocks());
8091 ::encode(cs_info
, tmp
);
8092 cb
.get_attrs()[RGW_ATTR_COMPRESSION
] = tmp
;
8095 if (source_zone
.empty()) { /* need to preserve expiration if copy in the same zonegroup */
8096 cb
.get_attrs().erase(RGW_ATTR_DELETE_AT
);
8098 map
<string
, bufferlist
>::iterator iter
= cb
.get_attrs().find(RGW_ATTR_DELETE_AT
);
8099 if (iter
!= cb
.get_attrs().end()) {
8101 ::decode(delete_at
, iter
->second
);
8102 } catch (buffer::error
& err
) {
8103 ldout(cct
, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl
;
8109 *src_mtime
= set_mtime
;
8113 const auto iter
= cb
.get_attrs().find(RGW_ATTR_ETAG
);
8114 if (iter
!= cb
.get_attrs().end()) {
8115 *petag
= iter
->second
;
8119 if (source_zone
.empty()) {
8120 set_copy_attrs(cb
.get_attrs(), attrs
, attrs_mod
);
8122 attrs
= cb
.get_attrs();
8125 if (copy_if_newer
) {
8126 uint64_t pg_ver
= 0;
8127 auto i
= attrs
.find(RGW_ATTR_PG_VER
);
8128 if (i
!= attrs
.end() && i
->second
.length() > 0) {
8129 bufferlist::iterator iter
= i
->second
.begin();
8131 ::decode(pg_ver
, iter
);
8132 } catch (buffer::error
& err
) {
8133 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl
;
8134 /* non critical error */
8137 set_mtime_weight
.init(set_mtime
, get_zone_short_id(), pg_ver
);
8140 #define MAX_COMPLETE_RETRY 100
8141 for (i
= 0; i
< MAX_COMPLETE_RETRY
; i
++) {
8142 ret
= cb
.complete(etag
, mtime
, set_mtime
, attrs
, delete_at
, zones_trace
);
8146 if (copy_if_newer
&& cb
.is_canceled()) {
8147 ldout(cct
, 20) << "raced with another write of obj: " << dest_obj
<< dendl
;
8148 obj_ctx
.obj
.invalidate(dest_obj
); /* object was overwritten */
8149 ret
= get_obj_state(&obj_ctx
, dest_bucket_info
, dest_obj
, &dest_state
, false);
8151 ldout(cct
, 0) << "ERROR: " << __func__
<< ": get_err_state() returned ret=" << ret
<< dendl
;
8154 dest_mtime_weight
.init(dest_state
);
8155 dest_mtime_weight
.high_precision
= high_precision_time
;
8156 if (!dest_state
->exists
||
8157 dest_mtime_weight
< set_mtime_weight
) {
8158 ldout(cct
, 20) << "retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
8161 ldout(cct
, 20) << "not retrying writing object mtime=" << set_mtime
<< " dest_state->mtime=" << dest_state
->mtime
<< " dest_state->exists=" << dest_state
->exists
<< dendl
;
8167 if (i
== MAX_COMPLETE_RETRY
) {
8168 ldout(cct
, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl
;
8174 ret
= opstate
->set_state(RGWOpState::OPSTATE_COMPLETE
);
8176 ldout(cct
, 0) << "ERROR: failed to set opstate ret=" << ret
<< dendl
;
8183 if (copy_if_newer
&& ret
== -ERR_NOT_MODIFIED
) {
8184 // we may have already fetched during sync of OP_ADD, but were waiting
8185 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
8186 if (olh_epoch
&& *olh_epoch
> 0) {
8187 constexpr bool log_data_change
= true;
8188 ret
= set_olh(obj_ctx
, dest_bucket_info
, dest_obj
, false, nullptr,
8189 *olh_epoch
, real_time(), false, zones_trace
, log_data_change
);
8191 // we already have the latest copy
8196 RGWOpState::OpState state
;
8198 state
= RGWOpState::OPSTATE_ERROR
;
8200 state
= RGWOpState::OPSTATE_COMPLETE
;
8202 int r
= opstate
->set_state(state
);
8204 ldout(cct
, 0) << "ERROR: failed to set opstate r=" << ret
<< dendl
;
8212 int RGWRados::copy_obj_to_remote_dest(RGWObjState
*astate
,
8213 map
<string
, bufferlist
>& src_attrs
,
8214 RGWRados::Object::Read
& read_op
,
8215 const rgw_user
& user_id
,
8221 RGWRESTStreamWriteRequest
*out_stream_req
;
8223 int ret
= rest_master_conn
->put_obj_init(user_id
, dest_obj
, astate
->size
, src_attrs
, &out_stream_req
);
8228 ret
= read_op
.iterate(0, astate
->size
- 1, out_stream_req
->get_out_cb());
8230 delete out_stream_req
;
8234 ret
= rest_master_conn
->complete_request(out_stream_req
, etag
, mtime
);
/**
 * Copy an object.
 * dest_obj: the object to copy into
 * src_obj: the object to copy from
 * attrs: usage depends on attrs_mod parameter
 * attrs_mod: the modification mode of the attrs, may have the following values:
 *     ATTRSMOD_NONE - the attributes of the source object will be
 *                     copied without modifications, attrs parameter is ignored;
 *     ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
 *                        parameter, source object attributes are not copied;
 *     ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
 *                      are overwritten by values contained in attrs parameter.
 * err: stores any errors resulting from the get of the original object
 * Returns: 0 on success, -ERR# otherwise.
 */
8256 int RGWRados::copy_obj(RGWObjectCtx
& obj_ctx
,
8257 const rgw_user
& user_id
,
8258 const string
& client_id
,
8259 const string
& op_id
,
8261 const string
& source_zone
,
8264 RGWBucketInfo
& dest_bucket_info
,
8265 RGWBucketInfo
& src_bucket_info
,
8266 real_time
*src_mtime
,
8268 const real_time
*mod_ptr
,
8269 const real_time
*unmod_ptr
,
8270 bool high_precision_time
,
8271 const char *if_match
,
8272 const char *if_nomatch
,
8275 map
<string
, bufferlist
>& attrs
,
8276 RGWObjCategory category
,
8278 real_time delete_at
,
8281 ceph::buffer::list
*petag
,
8282 void (*progress_cb
)(off_t
, void *),
8283 void *progress_data
)
8287 rgw_obj shadow_obj
= dest_obj
;
8293 append_rand_alpha(cct
, dest_obj
.get_oid(), shadow_oid
, 32);
8294 shadow_obj
.init_ns(dest_obj
.bucket
, shadow_oid
, shadow_ns
);
8296 remote_dest
= !get_zonegroup().equals(dest_bucket_info
.zonegroup
);
8297 remote_src
= !get_zonegroup().equals(src_bucket_info
.zonegroup
);
8299 if (remote_src
&& remote_dest
) {
8300 ldout(cct
, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl
;
8304 ldout(cct
, 5) << "Copy object " << src_obj
.bucket
<< ":" << src_obj
.get_oid() << " => " << dest_obj
.bucket
<< ":" << dest_obj
.get_oid() << dendl
;
8306 if (remote_src
|| !source_zone
.empty()) {
8307 return fetch_remote_obj(obj_ctx
, user_id
, client_id
, op_id
, true, info
, source_zone
,
8308 dest_obj
, src_obj
, dest_bucket_info
, src_bucket_info
, src_mtime
, mtime
, mod_ptr
,
8309 unmod_ptr
, high_precision_time
,
8310 if_match
, if_nomatch
, attrs_mod
, copy_if_newer
, attrs
, category
,
8311 olh_epoch
, delete_at
, version_id
, ptag
, petag
, progress_cb
, progress_data
);
8314 map
<string
, bufferlist
> src_attrs
;
8315 RGWRados::Object
src_op_target(this, src_bucket_info
, obj_ctx
, src_obj
);
8316 RGWRados::Object::Read
read_op(&src_op_target
);
8318 read_op
.conds
.mod_ptr
= mod_ptr
;
8319 read_op
.conds
.unmod_ptr
= unmod_ptr
;
8320 read_op
.conds
.high_precision_time
= high_precision_time
;
8321 read_op
.conds
.if_match
= if_match
;
8322 read_op
.conds
.if_nomatch
= if_nomatch
;
8323 read_op
.params
.attrs
= &src_attrs
;
8324 read_op
.params
.lastmod
= src_mtime
;
8325 read_op
.params
.obj_size
= &obj_size
;
8327 ret
= read_op
.prepare();
8331 if (src_attrs
.count(RGW_ATTR_CRYPT_MODE
)) {
8332 // Current implementation does not follow S3 spec and even
8333 // may result in data corruption silently when copying
8334 // multipart objects acorss pools. So reject COPY operations
8335 //on encrypted objects before it is fully functional.
8336 ldout(cct
, 0) << "ERROR: copy op for encrypted object " << src_obj
8337 << " has not been implemented." << dendl
;
8338 return -ERR_NOT_IMPLEMENTED
;
8341 src_attrs
[RGW_ATTR_ACL
] = attrs
[RGW_ATTR_ACL
];
8342 src_attrs
.erase(RGW_ATTR_DELETE_AT
);
8344 set_copy_attrs(src_attrs
, attrs
, attrs_mod
);
8345 attrs
.erase(RGW_ATTR_ID_TAG
);
8346 attrs
.erase(RGW_ATTR_PG_VER
);
8347 attrs
.erase(RGW_ATTR_SOURCE_ZONE
);
8348 map
<string
, bufferlist
>::iterator cmp
= src_attrs
.find(RGW_ATTR_COMPRESSION
);
8349 if (cmp
!= src_attrs
.end())
8350 attrs
[RGW_ATTR_COMPRESSION
] = cmp
->second
;
8352 RGWObjManifest manifest
;
8353 RGWObjState
*astate
= NULL
;
8355 ret
= get_obj_state(&obj_ctx
, src_bucket_info
, src_obj
, &astate
);
8360 vector
<rgw_raw_obj
> ref_objs
;
8363 /* dest is in a different zonegroup, copy it there */
8364 return copy_obj_to_remote_dest(astate
, attrs
, read_op
, user_id
, dest_obj
, mtime
);
8366 uint64_t max_chunk_size
;
8368 ret
= get_max_chunk_size(dest_bucket_info
.placement_rule
, dest_obj
, &max_chunk_size
);
8370 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj
.bucket
<< dendl
;
8376 if (!get_obj_data_pool(src_bucket_info
.placement_rule
, src_obj
, &src_pool
)) {
8377 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << src_obj
<< dendl
;
8380 if (!get_obj_data_pool(dest_bucket_info
.placement_rule
, dest_obj
, &dest_pool
)) {
8381 ldout(cct
, 0) << "ERROR: failed to locate data pool for " << dest_obj
<< dendl
;
8386 bool copy_data
= !astate
->has_manifest
|| (src_pool
!= dest_pool
);
8387 bool copy_first
= false;
8388 if (astate
->has_manifest
) {
8389 if (!astate
->manifest
.has_tail()) {
8392 uint64_t head_size
= astate
->manifest
.get_head_size();
8394 if (head_size
> 0) {
8395 if (head_size
> max_chunk_size
) {
8405 const auto iter
= attrs
.find(RGW_ATTR_ETAG
);
8406 if (iter
!= attrs
.end()) {
8407 *petag
= iter
->second
;
8411 if (copy_data
) { /* refcounting tail wouldn't work here, just copy the data */
8412 return copy_obj_data(obj_ctx
, dest_bucket_info
, read_op
, obj_size
- 1, dest_obj
, src_obj
,
8413 max_chunk_size
, mtime
, real_time(), attrs
, category
, olh_epoch
, delete_at
,
8414 version_id
, ptag
, petag
);
8417 RGWObjManifest::obj_iterator miter
= astate
->manifest
.obj_begin();
8419 if (copy_first
) { // we need to copy first chunk, not increase refcount
8424 ret
= get_raw_obj_ref(miter
.get_location().get_raw_obj(this), &ref
);
8429 bool versioned_dest
= dest_bucket_info
.versioning_enabled();
8431 if (version_id
&& !version_id
->empty()) {
8432 versioned_dest
= true;
8433 dest_obj
.key
.set_instance(*version_id
);
8434 } else if (versioned_dest
) {
8435 gen_rand_obj_instance_name(&dest_obj
);
8438 bufferlist first_chunk
;
8440 bool copy_itself
= (dest_obj
== src_obj
);
8441 RGWObjManifest
*pmanifest
;
8442 ldout(cct
, 20) << "dest_obj=" << dest_obj
<< " src_obj=" << src_obj
<< " copy_itself=" << (int)copy_itself
<< dendl
;
8444 RGWRados::Object
dest_op_target(this, dest_bucket_info
, obj_ctx
, dest_obj
);
8445 RGWRados::Object::Write
write_op(&dest_op_target
);
8454 append_rand_alpha(cct
, tag
, tag
, 32);
8458 attrs
.erase(RGW_ATTR_TAIL_TAG
);
8459 manifest
= astate
->manifest
;
8460 const rgw_bucket_placement
& tail_placement
= manifest
.get_tail_placement();
8461 if (tail_placement
.bucket
.name
.empty()) {
8462 manifest
.set_tail_placement(tail_placement
.placement_rule
, src_obj
.bucket
);
8465 for (; miter
!= astate
->manifest
.obj_end(); ++miter
) {
8466 ObjectWriteOperation op
;
8467 ref_tag
= tag
+ '\0';
8468 cls_refcount_get(op
, ref_tag
, true);
8469 const rgw_raw_obj
& loc
= miter
.get_location().get_raw_obj(this);
8470 ref
.ioctx
.locator_set_key(loc
.loc
);
8472 ret
= ref
.ioctx
.operate(loc
.oid
, &op
);
8477 ref_objs
.push_back(loc
);
8480 pmanifest
= &manifest
;
8482 pmanifest
= &astate
->manifest
;
8483 /* don't send the object's tail for garbage collection */
8484 astate
->keep_tail
= true;
8488 ret
= read_op
.read(0, max_chunk_size
, first_chunk
);
8493 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, first_chunk
.length());
8495 pmanifest
->set_head(dest_bucket_info
.placement_rule
, dest_obj
, 0);
8498 write_op
.meta
.data
= &first_chunk
;
8499 write_op
.meta
.manifest
= pmanifest
;
8500 write_op
.meta
.ptag
= &tag
;
8501 write_op
.meta
.owner
= dest_bucket_info
.owner
;
8502 write_op
.meta
.mtime
= mtime
;
8503 write_op
.meta
.flags
= PUT_OBJ_CREATE
;
8504 write_op
.meta
.category
= category
;
8505 write_op
.meta
.olh_epoch
= olh_epoch
;
8506 write_op
.meta
.delete_at
= delete_at
;
8507 write_op
.meta
.modify_tail
= !copy_itself
;
8509 ret
= write_op
.write_meta(obj_size
, astate
->accounted_size
, attrs
);
8518 vector
<rgw_raw_obj
>::iterator riter
;
8520 /* rollback reference */
8521 for (riter
= ref_objs
.begin(); riter
!= ref_objs
.end(); ++riter
) {
8522 ObjectWriteOperation op
;
8523 cls_refcount_put(op
, tag
, true);
8525 ref
.ioctx
.locator_set_key(riter
->loc
);
8527 int r
= ref
.ioctx
.operate(riter
->oid
, &op
);
8529 ldout(cct
, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter
<< dendl
;
8537 int RGWRados::copy_obj_data(RGWObjectCtx
& obj_ctx
,
8538 RGWBucketInfo
& dest_bucket_info
,
8539 RGWRados::Object::Read
& read_op
, off_t end
,
8542 uint64_t max_chunk_size
,
8544 real_time set_mtime
,
8545 map
<string
, bufferlist
>& attrs
,
8546 RGWObjCategory category
,
8548 real_time delete_at
,
8551 ceph::buffer::list
*petag
)
8553 bufferlist first_chunk
;
8554 RGWObjManifest manifest
;
8557 append_rand_alpha(cct
, tag
, tag
, 32);
8559 RGWPutObjProcessor_Atomic
processor(obj_ctx
,
8560 dest_bucket_info
, dest_obj
.bucket
, dest_obj
.key
.name
,
8561 cct
->_conf
->rgw_obj_stripe_size
, tag
, dest_bucket_info
.versioning_enabled());
8563 processor
.set_version_id(*version_id
);
8565 processor
.set_olh_epoch(olh_epoch
);
8566 int ret
= processor
.prepare(this, NULL
);
8574 ret
= read_op
.read(ofs
, end
, bl
);
8576 uint64_t read_len
= ret
;
8583 ret
= processor
.handle_data(bl
, ofs
, &handle
, &obj
, &again
);
8587 ret
= processor
.throttle_data(handle
, obj
, read_len
, false);
8593 } while (ofs
<= end
);
8596 auto iter
= attrs
.find(RGW_ATTR_ETAG
);
8597 if (iter
!= attrs
.end()) {
8598 bufferlist
& bl
= iter
->second
;
8599 etag
= string(bl
.c_str(), bl
.length());
8605 uint64_t accounted_size
;
8607 bool compressed
{false};
8608 RGWCompressionInfo cs_info
;
8609 ret
= rgw_compression_info_from_attrset(attrs
, compressed
, cs_info
);
8611 ldout(cct
, 0) << "ERROR: failed to read compression info" << dendl
;
8614 // pass original size if compressed
8615 accounted_size
= compressed
? cs_info
.orig_size
: ofs
;
8618 return processor
.complete(accounted_size
, etag
, mtime
, set_mtime
, attrs
, delete_at
);
8621 bool RGWRados::is_meta_master()
8623 if (!get_zonegroup().is_master_zonegroup()) {
8627 return (get_zonegroup().master_zone
== zone_public_config
.id
);
/**
 * Check to see if the bucket metadata could be synced
 * bucket: the bucket to check
 * Returns false if the bucket is not synced
 */
8635 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket
& bucket
)
8638 /* no current period */
8639 if (current_period
.get_id().empty()) {
8643 /* zonegroup is not master zonegroup */
8644 if (!get_zonegroup().is_master_zonegroup()) {
8648 /* single zonegroup and a single zone */
8649 if (current_period
.is_single_zonegroup() && get_zonegroup().zones
.size() == 1) {
8653 /* zone is not master */
8654 if (get_zonegroup().master_zone
.compare(zone_public_config
.id
) != 0) {
8661 int RGWRados::check_bucket_empty(RGWBucketInfo
& bucket_info
)
8663 std::vector
<rgw_bucket_dir_entry
> ent_list
;
8664 rgw_obj_index_key marker
;
8669 constexpr uint NUM_ENTRIES
= 1000u;
8670 int r
= cls_bucket_list_unordered(bucket_info
,
8683 for (auto const& dirent
: ent_list
) {
8686 if (rgw_obj_key::oid_to_key_in_ns(dirent
.key
.name
, &obj
, ns
))
8689 } while (is_truncated
);
/**
 * Delete a bucket.
 * bucket: the name of the bucket to delete
 * Returns 0 on success, -ERR# otherwise.
 */
8699 int RGWRados::delete_bucket(RGWBucketInfo
& bucket_info
, RGWObjVersionTracker
& objv_tracker
, bool check_empty
)
8701 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8702 librados::IoCtx index_ctx
;
8703 map
<int, string
> bucket_objs
;
8704 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
8709 r
= check_bucket_empty(bucket_info
);
8715 r
= rgw_bucket_delete_bucket_obj(this, bucket
.tenant
, bucket
.name
, objv_tracker
);
8719 /* if the bucket is not synced we can remove the meta file */
8720 if (!is_syncing_bucket_meta(bucket
)) {
8721 RGWObjVersionTracker objv_tracker
;
8722 r
= rgw_bucket_instance_remove_entry(this, bucket
.get_key(), &objv_tracker
);
8727 /* remove bucket index objects asynchronously by best effort */
8728 (void) CLSRGWIssueBucketIndexClean(index_ctx
,
8730 cct
->_conf
->rgw_bucket_index_max_aio
)();
8736 int RGWRados::set_bucket_owner(rgw_bucket
& bucket
, ACLOwner
& owner
)
8739 map
<string
, bufferlist
> attrs
;
8740 RGWObjectCtx
obj_ctx(this);
8742 if (bucket
.bucket_id
.empty()) {
8743 r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
8745 r
= get_bucket_instance_info(obj_ctx
, bucket
, info
, nullptr, &attrs
);
8748 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
8752 info
.owner
= owner
.get_id();
8754 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
8756 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< dendl
;
8764 int RGWRados::set_buckets_enabled(vector
<rgw_bucket
>& buckets
, bool enabled
)
8768 vector
<rgw_bucket
>::iterator iter
;
8770 for (iter
= buckets
.begin(); iter
!= buckets
.end(); ++iter
) {
8771 rgw_bucket
& bucket
= *iter
;
8773 ldout(cct
, 20) << "enabling bucket name=" << bucket
.name
<< dendl
;
8775 ldout(cct
, 20) << "disabling bucket name=" << bucket
.name
<< dendl
;
8778 map
<string
, bufferlist
> attrs
;
8779 RGWObjectCtx
obj_ctx(this);
8780 int r
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, info
, NULL
, &attrs
);
8782 ldout(cct
, 0) << "NOTICE: get_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
8787 info
.flags
&= ~BUCKET_SUSPENDED
;
8789 info
.flags
|= BUCKET_SUSPENDED
;
8792 r
= put_bucket_instance_info(info
, false, real_time(), &attrs
);
8794 ldout(cct
, 0) << "NOTICE: put_bucket_info on bucket=" << bucket
.name
<< " returned err=" << r
<< ", skipping bucket" << dendl
;
8802 int RGWRados::bucket_suspended(rgw_bucket
& bucket
, bool *suspended
)
8804 RGWBucketInfo bucket_info
;
8805 RGWObjectCtx
obj_ctx(this);
8806 int ret
= get_bucket_info(obj_ctx
, bucket
.tenant
, bucket
.name
, bucket_info
, NULL
);
8811 *suspended
= ((bucket_info
.flags
& BUCKET_SUSPENDED
) != 0);
8815 int RGWRados::Object::complete_atomic_modification()
8817 if (!state
->has_manifest
|| state
->keep_tail
)
8820 cls_rgw_obj_chain chain
;
8821 store
->update_gc_chain(obj
, state
->manifest
, &chain
);
8823 if (chain
.empty()) {
8827 string tag
= (state
->tail_tag
.length() > 0 ? state
->tail_tag
.to_str() : state
->obj_tag
.to_str());
8828 return store
->gc
->send_chain(chain
, tag
, false); // do it async
8831 void RGWRados::update_gc_chain(rgw_obj
& head_obj
, RGWObjManifest
& manifest
, cls_rgw_obj_chain
*chain
)
8833 RGWObjManifest::obj_iterator iter
;
8834 rgw_raw_obj raw_head
;
8835 obj_to_raw(manifest
.get_head_placement_rule(), head_obj
, &raw_head
);
8836 for (iter
= manifest
.obj_begin(); iter
!= manifest
.obj_end(); ++iter
) {
8837 const rgw_raw_obj
& mobj
= iter
.get_location().get_raw_obj(this);
8838 if (mobj
== raw_head
)
8840 cls_rgw_obj_key
key(mobj
.oid
);
8841 chain
->push_obj(mobj
.pool
.to_str(), key
, mobj
.loc
);
8845 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain
& chain
, const string
& tag
, bool sync
)
8847 return gc
->send_chain(chain
, tag
, sync
);
8850 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
,
8851 librados::IoCtx
& index_ctx
,
8854 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8855 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
8859 if (bucket
.bucket_id
.empty()) {
8860 ldout(cct
, 0) << "ERROR: empty bucket id for bucket operation" << dendl
;
8864 bucket_oid
= dir_oid_prefix
;
8865 bucket_oid
.append(bucket
.bucket_id
);
8870 int RGWRados::open_bucket_index_base(const RGWBucketInfo
& bucket_info
,
8871 librados::IoCtx
& index_ctx
,
8872 string
& bucket_oid_base
) {
8873 const rgw_bucket
& bucket
= bucket_info
.bucket
;
8874 int r
= open_bucket_index_ctx(bucket_info
, index_ctx
);
8878 if (bucket
.bucket_id
.empty()) {
8879 ldout(cct
, 0) << "ERROR: empty bucket_id for bucket operation" << dendl
;
8883 bucket_oid_base
= dir_oid_prefix
;
8884 bucket_oid_base
.append(bucket
.bucket_id
);
8890 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
,
8891 librados::IoCtx
& index_ctx
,
8892 map
<int, string
>& bucket_objs
,
8894 map
<int, string
> *bucket_instance_ids
) {
8895 string bucket_oid_base
;
8896 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8901 get_bucket_index_objects(bucket_oid_base
, bucket_info
.num_shards
, bucket_objs
, shard_id
);
8902 if (bucket_instance_ids
) {
8903 get_bucket_instance_ids(bucket_info
, shard_id
, bucket_instance_ids
);
8908 template<typename T
>
8909 int RGWRados::open_bucket_index(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8910 map
<int, string
>& oids
, map
<int, T
>& bucket_objs
,
8911 int shard_id
, map
<int, string
> *bucket_instance_ids
)
8913 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, bucket_instance_ids
);
8917 map
<int, string
>::const_iterator iter
= oids
.begin();
8918 for (; iter
!= oids
.end(); ++iter
) {
8919 bucket_objs
[iter
->first
] = T();
8924 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8925 const string
& obj_key
, string
*bucket_obj
, int *shard_id
)
8927 string bucket_oid_base
;
8928 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8932 RGWObjectCtx
obj_ctx(this);
8934 ret
= get_bucket_index_object(bucket_oid_base
, obj_key
, bucket_info
.num_shards
,
8935 (RGWBucketInfo::BIShardsHashType
)bucket_info
.bucket_index_shard_hash_type
, bucket_obj
, shard_id
);
8937 ldout(cct
, 10) << "get_bucket_index_object() returned ret=" << ret
<< dendl
;
8943 int RGWRados::open_bucket_index_shard(const RGWBucketInfo
& bucket_info
, librados::IoCtx
& index_ctx
,
8944 int shard_id
, string
*bucket_obj
)
8946 string bucket_oid_base
;
8947 int ret
= open_bucket_index_base(bucket_info
, index_ctx
, bucket_oid_base
);
8951 RGWObjectCtx
obj_ctx(this);
8953 get_bucket_index_object(bucket_oid_base
, bucket_info
.num_shards
,
8954 shard_id
, bucket_obj
);
8958 static void accumulate_raw_stats(const rgw_bucket_dir_header
& header
,
8959 map
<RGWObjCategory
, RGWStorageStats
>& stats
)
8961 for (const auto& pair
: header
.stats
) {
8962 const RGWObjCategory category
= static_cast<RGWObjCategory
>(pair
.first
);
8963 const rgw_bucket_category_stats
& header_stats
= pair
.second
;
8965 RGWStorageStats
& s
= stats
[category
];
8967 s
.category
= category
;
8968 s
.size
+= header_stats
.total_size
;
8969 s
.size_rounded
+= header_stats
.total_size_rounded
;
8970 s
.size_utilized
+= header_stats
.actual_size
;
8971 s
.num_objects
+= header_stats
.num_entries
;
8975 int RGWRados::bucket_check_index(RGWBucketInfo
& bucket_info
,
8976 map
<RGWObjCategory
, RGWStorageStats
> *existing_stats
,
8977 map
<RGWObjCategory
, RGWStorageStats
> *calculated_stats
)
8979 librados::IoCtx index_ctx
;
8980 // key - bucket index object id
8981 // value - bucket index check OP returned result with the given bucket index object (shard)
8982 map
<int, string
> oids
;
8983 map
<int, struct rgw_cls_check_index_ret
> bucket_objs_ret
;
8985 int ret
= open_bucket_index(bucket_info
, index_ctx
, oids
, bucket_objs_ret
);
8990 ret
= CLSRGWIssueBucketCheck(index_ctx
, oids
, bucket_objs_ret
, cct
->_conf
->rgw_bucket_index_max_aio
)();
8995 // Aggregate results (from different shards if there is any)
8996 map
<int, struct rgw_cls_check_index_ret
>::iterator iter
;
8997 for (iter
= bucket_objs_ret
.begin(); iter
!= bucket_objs_ret
.end(); ++iter
) {
8998 accumulate_raw_stats(iter
->second
.existing_header
, *existing_stats
);
8999 accumulate_raw_stats(iter
->second
.calculated_header
, *calculated_stats
);
9005 int RGWRados::bucket_rebuild_index(RGWBucketInfo
& bucket_info
)
9007 librados::IoCtx index_ctx
;
9008 map
<int, string
> bucket_objs
;
9010 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
9015 return CLSRGWIssueBucketRebuild(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
9018 int RGWRados::bucket_set_reshard(const RGWBucketInfo
& bucket_info
, const cls_rgw_bucket_instance_entry
& entry
)
9020 librados::IoCtx index_ctx
;
9021 map
<int, string
> bucket_objs
;
9023 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
9028 return CLSRGWIssueSetBucketResharding(index_ctx
, bucket_objs
, entry
, cct
->_conf
->rgw_bucket_index_max_aio
)();
9031 int RGWRados::defer_gc(void *ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
)
9033 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
9034 std::string oid
, key
;
9035 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
9039 RGWObjState
*state
= NULL
;
9041 int r
= get_obj_state(rctx
, bucket_info
, obj
, &state
, false);
9045 if (!state
->is_atomic
) {
9046 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not deferring gc operation" << dendl
;
9052 if (state
->tail_tag
.length() > 0) {
9053 tag
= state
->tail_tag
.c_str();
9054 } else if (state
->obj_tag
.length() > 0) {
9055 tag
= state
->obj_tag
.c_str();
9057 ldout(cct
, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl
;
9061 ldout(cct
, 0) << "defer chain tag=" << tag
<< dendl
;
9063 return gc
->defer_chain(tag
, false);
9066 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation
& op
)
9068 list
<string
> prefixes
;
9069 prefixes
.push_back(RGW_ATTR_OLH_PREFIX
);
9070 cls_rgw_remove_obj(op
, prefixes
);
9073 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation
& op
, const string
& prefix
, bool fail_if_exist
)
9075 cls_rgw_obj_check_attrs_prefix(op
, prefix
, fail_if_exist
);
9078 void RGWRados::cls_obj_check_mtime(ObjectOperation
& op
, const real_time
& mtime
, bool high_precision_time
, RGWCheckMTimeType type
)
9080 cls_rgw_obj_check_mtime(op
, mtime
, high_precision_time
, type
);
9086 * bucket: name of the bucket storing the object
9087 * obj: name of the object to delete
9088 * Returns: 0 on success, -ERR# otherwise.
9090 int RGWRados::Object::Delete::delete_obj()
9092 RGWRados
*store
= target
->get_store();
9093 rgw_obj
& src_obj
= target
->get_obj();
9094 const string
& instance
= src_obj
.key
.instance
;
9095 rgw_obj obj
= src_obj
;
9097 if (instance
== "null") {
9098 obj
.key
.instance
.clear();
9101 bool explicit_marker_version
= (!params
.marker_version_id
.empty());
9103 if (params
.versioning_status
& BUCKET_VERSIONED
|| explicit_marker_version
) {
9104 if (instance
.empty() || explicit_marker_version
) {
9105 rgw_obj marker
= obj
;
9107 if (!params
.marker_version_id
.empty()) {
9108 if (params
.marker_version_id
!= "null") {
9109 marker
.key
.set_instance(params
.marker_version_id
);
9111 } else if ((params
.versioning_status
& BUCKET_VERSIONS_SUSPENDED
) == 0) {
9112 store
->gen_rand_obj_instance_name(&marker
);
9115 result
.version_id
= marker
.key
.instance
;
9116 if (result
.version_id
.empty())
9117 result
.version_id
= "null";
9118 result
.delete_marker
= true;
9120 struct rgw_bucket_dir_entry_meta meta
;
9122 meta
.owner
= params
.obj_owner
.get_id().to_str();
9123 meta
.owner_display_name
= params
.obj_owner
.get_display_name();
9125 if (real_clock::is_zero(params
.mtime
)) {
9126 meta
.mtime
= real_clock::now();
9128 meta
.mtime
= params
.mtime
;
9131 int r
= store
->set_olh(target
->get_ctx(), target
->get_bucket_info(), marker
, true, &meta
, params
.olh_epoch
, params
.unmod_since
, params
.high_precision_time
, params
.zones_trace
);
9136 rgw_bucket_dir_entry dirent
;
9138 int r
= store
->bi_get_instance(target
->get_bucket_info(), obj
, &dirent
);
9142 result
.delete_marker
= dirent
.is_delete_marker();
9143 r
= store
->unlink_obj_instance(target
->get_ctx(), target
->get_bucket_info(), obj
, params
.olh_epoch
, params
.zones_trace
);
9147 result
.version_id
= instance
;
9151 int r
= target
->get_bucket_shard(&bs
);
9153 ldout(store
->ctx(), 5) << "failed to get BucketShard object: r=" << r
<< dendl
;
9157 if (target
->bucket_info
.datasync_flag_enabled()) {
9158 r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
9160 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
9169 int r
= store
->get_obj_head_ref(target
->get_bucket_info(), obj
, &ref
);
9175 r
= target
->get_state(&state
, false);
9179 ObjectWriteOperation op
;
9181 if (!real_clock::is_zero(params
.unmod_since
)) {
9182 struct timespec ctime
= ceph::real_clock::to_timespec(state
->mtime
);
9183 struct timespec unmod
= ceph::real_clock::to_timespec(params
.unmod_since
);
9184 if (!params
.high_precision_time
) {
9189 ldout(store
->ctx(), 10) << "If-UnModified-Since: " << params
.unmod_since
<< " Last-Modified: " << ctime
<< dendl
;
9190 if (ctime
> unmod
) {
9191 return -ERR_PRECONDITION_FAILED
;
9194 /* only delete object if mtime is less than or equal to params.unmod_since */
9195 store
->cls_obj_check_mtime(op
, params
.unmod_since
, params
.high_precision_time
, CLS_RGW_CHECK_TIME_MTIME_LE
);
9197 uint64_t obj_size
= state
->size
;
9199 if (!real_clock::is_zero(params
.expiration_time
)) {
9201 real_time delete_at
;
9203 if (state
->get_attr(RGW_ATTR_DELETE_AT
, bl
)) {
9205 bufferlist::iterator iter
= bl
.begin();
9206 ::decode(delete_at
, iter
);
9207 } catch (buffer::error
& err
) {
9208 ldout(store
->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl
;
9212 if (params
.expiration_time
!= delete_at
) {
9213 return -ERR_PRECONDITION_FAILED
;
9216 return -ERR_PRECONDITION_FAILED
;
9220 if (!state
->exists
) {
9221 target
->invalidate_state();
9225 r
= target
->prepare_atomic_modification(op
, false, NULL
, NULL
, NULL
, true, false);
9229 RGWBucketInfo
& bucket_info
= target
->get_bucket_info();
9231 RGWRados::Bucket
bop(store
, bucket_info
);
9232 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
9234 index_op
.set_zones_trace(params
.zones_trace
);
9235 index_op
.set_bilog_flags(params
.bilog_flags
);
9237 r
= index_op
.prepare(CLS_RGW_OP_DEL
, &state
->write_tag
);
9241 store
->remove_rgw_head_obj(op
);
9242 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9244 /* raced with another operation, object state is indeterminate */
9245 const bool need_invalidate
= (r
== -ECANCELED
);
9247 int64_t poolid
= ref
.ioctx
.get_id();
9249 tombstone_cache_t
*obj_tombstone_cache
= store
->get_tombstone_cache();
9250 if (obj_tombstone_cache
) {
9251 tombstone_entry entry
{*state
};
9252 obj_tombstone_cache
->add(obj
, entry
);
9254 r
= index_op
.complete_del(poolid
, ref
.ioctx
.get_last_version(), state
->mtime
, params
.remove_objs
);
9256 int ret
= target
->complete_atomic_modification();
9258 ldout(store
->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret
<< dendl
;
9260 /* other than that, no need to propagate error */
9262 int ret
= index_op
.cancel();
9264 ldout(store
->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret
<< dendl
;
9268 if (need_invalidate
) {
9269 target
->invalidate_state();
9275 /* update quota cache */
9276 store
->quota_handler
->update_stats(params
.bucket_owner
, obj
.bucket
, -1, 0, obj_size
);
9281 int RGWRados::delete_obj(RGWObjectCtx
& obj_ctx
,
9282 const RGWBucketInfo
& bucket_info
,
9284 int versioning_status
,
9285 uint16_t bilog_flags
,
9286 const real_time
& expiration_time
,
9287 rgw_zone_set
*zones_trace
)
9289 RGWRados::Object
del_target(this, bucket_info
, obj_ctx
, obj
);
9290 RGWRados::Object::Delete
del_op(&del_target
);
9292 del_op
.params
.bucket_owner
= bucket_info
.owner
;
9293 del_op
.params
.versioning_status
= versioning_status
;
9294 del_op
.params
.bilog_flags
= bilog_flags
;
9295 del_op
.params
.expiration_time
= expiration_time
;
9296 del_op
.params
.zones_trace
= zones_trace
;
9298 return del_op
.delete_obj();
9301 int RGWRados::delete_raw_obj(const rgw_raw_obj
& obj
)
9304 int r
= get_raw_obj_ref(obj
, &ref
);
9309 ObjectWriteOperation op
;
9312 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9319 int RGWRados::delete_system_obj(rgw_raw_obj
& obj
, RGWObjVersionTracker
*objv_tracker
)
9322 ldout(cct
, 1) << "delete_system_obj got empty object name "
9323 << obj
<< ", returning EINVAL" << dendl
;
9327 int r
= get_raw_obj_ref(obj
, &ref
);
9332 ObjectWriteOperation op
;
9335 objv_tracker
->prepare_op_for_write(&op
);
9339 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9346 int RGWRados::delete_obj_index(const rgw_obj
& obj
)
9348 std::string oid
, key
;
9349 get_obj_bucket_and_oid_loc(obj
, oid
, key
);
9351 RGWObjectCtx
obj_ctx(this);
9353 RGWBucketInfo bucket_info
;
9354 int ret
= get_bucket_instance_info(obj_ctx
, obj
.bucket
, bucket_info
, NULL
, NULL
);
9356 ldout(cct
, 0) << "ERROR: " << __func__
<< "() get_bucket_instance_info(bucket=" << obj
.bucket
<< ") returned ret=" << ret
<< dendl
;
9360 RGWRados::Bucket
bop(this, bucket_info
);
9361 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
9363 real_time removed_mtime
;
9364 int r
= index_op
.complete_del(-1 /* pool */, 0, removed_mtime
, NULL
);
9369 static void generate_fake_tag(RGWRados
*store
, map
<string
, bufferlist
>& attrset
, RGWObjManifest
& manifest
, bufferlist
& manifest_bl
, bufferlist
& tag_bl
)
9373 RGWObjManifest::obj_iterator mi
= manifest
.obj_begin();
9374 if (mi
!= manifest
.obj_end()) {
9375 if (manifest
.has_tail()) // first object usually points at the head, let's skip to a more unique part
9377 tag
= mi
.get_location().get_raw_obj(store
).oid
;
9381 unsigned char md5
[CEPH_CRYPTO_MD5_DIGESTSIZE
];
9382 char md5_str
[CEPH_CRYPTO_MD5_DIGESTSIZE
* 2 + 1];
9384 hash
.Update((const byte
*)manifest_bl
.c_str(), manifest_bl
.length());
9386 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_ETAG
);
9387 if (iter
!= attrset
.end()) {
9388 bufferlist
& bl
= iter
->second
;
9389 hash
.Update((const byte
*)bl
.c_str(), bl
.length());
9393 buf_to_hex(md5
, CEPH_CRYPTO_MD5_DIGESTSIZE
, md5_str
);
9394 tag
.append(md5_str
);
9396 ldout(store
->ctx(), 10) << "generate_fake_tag new tag=" << tag
<< dendl
;
9398 tag_bl
.append(tag
.c_str(), tag
.size() + 1);
9401 static bool is_olh(map
<string
, bufferlist
>& attrs
)
9403 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_INFO
);
9404 return (iter
!= attrs
.end());
9407 static bool has_olh_tag(map
<string
, bufferlist
>& attrs
)
9409 map
<string
, bufferlist
>::iterator iter
= attrs
.find(RGW_ATTR_OLH_ID_TAG
);
9410 return (iter
!= attrs
.end());
9413 int RGWRados::get_olh_target_state(RGWObjectCtx
& obj_ctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
9414 RGWObjState
*olh_state
, RGWObjState
**target_state
)
9416 assert(olh_state
->is_olh
);
9419 int r
= RGWRados::follow_olh(bucket_info
, obj_ctx
, olh_state
, obj
, &target
); /* might return -EAGAIN */
9423 r
= get_obj_state(&obj_ctx
, bucket_info
, target
, target_state
, false);
9431 int RGWRados::get_system_obj_state_impl(RGWObjectCtx
*rctx
, rgw_raw_obj
& obj
, RGWRawObjState
**state
, RGWObjVersionTracker
*objv_tracker
)
9437 RGWRawObjState
*s
= rctx
->raw
.get_state(obj
);
9438 ldout(cct
, 20) << "get_system_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
9446 int r
= raw_obj_stat(obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), objv_tracker
);
9449 s
->has_attrs
= true;
9450 s
->mtime
= real_time();
9457 s
->has_attrs
= true;
9458 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
9460 if (s
->obj_tag
.length())
9461 ldout(cct
, 20) << "get_system_obj_state: setting s->obj_tag to "
9462 << s
->obj_tag
.c_str() << dendl
;
9464 ldout(cct
, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl
;
9469 int RGWRados::get_system_obj_state(RGWObjectCtx
*rctx
, rgw_raw_obj
& obj
, RGWRawObjState
**state
, RGWObjVersionTracker
*objv_tracker
)
9474 ret
= get_system_obj_state_impl(rctx
, obj
, state
, objv_tracker
);
9475 } while (ret
== -EAGAIN
);
9480 int RGWRados::get_obj_state_impl(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
9481 RGWObjState
**state
, bool follow_olh
, bool assume_noent
)
9487 bool need_follow_olh
= follow_olh
&& obj
.key
.instance
.empty();
9489 RGWObjState
*s
= rctx
->obj
.get_state(obj
);
9490 ldout(cct
, 20) << "get_obj_state: rctx=" << (void *)rctx
<< " obj=" << obj
<< " state=" << (void *)s
<< " s->prefetch_data=" << s
->prefetch_data
<< dendl
;
9493 if (s
->is_olh
&& need_follow_olh
) {
9494 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
9501 rgw_raw_obj raw_obj
;
9502 obj_to_raw(bucket_info
.placement_rule
, obj
, &raw_obj
);
9506 if (!assume_noent
) {
9507 r
= RGWRados::raw_obj_stat(raw_obj
, &s
->size
, &s
->mtime
, &s
->epoch
, &s
->attrset
, (s
->prefetch_data
? &s
->data
: NULL
), NULL
);
9512 s
->has_attrs
= true;
9513 tombstone_entry entry
;
9514 if (obj_tombstone_cache
&& obj_tombstone_cache
->find(obj
, entry
)) {
9515 s
->mtime
= entry
.mtime
;
9516 s
->zone_short_id
= entry
.zone_short_id
;
9517 s
->pg_ver
= entry
.pg_ver
;
9518 ldout(cct
, 20) << __func__
<< "(): found obj in tombstone cache: obj=" << obj
9519 << " mtime=" << s
->mtime
<< " pgv=" << s
->pg_ver
<< dendl
;
9521 s
->mtime
= real_time();
9529 s
->has_attrs
= true;
9530 s
->accounted_size
= s
->size
;
9532 auto iter
= s
->attrset
.find(RGW_ATTR_COMPRESSION
);
9533 const bool compressed
= (iter
!= s
->attrset
.end());
9535 // use uncompressed size for accounted_size
9537 RGWCompressionInfo info
;
9538 auto p
= iter
->second
.begin();
9540 s
->accounted_size
= info
.orig_size
;
9541 } catch (buffer::error
&) {
9542 dout(0) << "ERROR: could not decode compression info for object: " << obj
<< dendl
;
9547 iter
= s
->attrset
.find(RGW_ATTR_SHADOW_OBJ
);
9548 if (iter
!= s
->attrset
.end()) {
9549 bufferlist bl
= iter
->second
;
9550 bufferlist::iterator it
= bl
.begin();
9551 it
.copy(bl
.length(), s
->shadow_obj
);
9552 s
->shadow_obj
[bl
.length()] = '\0';
9554 s
->obj_tag
= s
->attrset
[RGW_ATTR_ID_TAG
];
9555 auto ttiter
= s
->attrset
.find(RGW_ATTR_TAIL_TAG
);
9556 if (ttiter
!= s
->attrset
.end()) {
9557 s
->tail_tag
= s
->attrset
[RGW_ATTR_TAIL_TAG
];
9560 bufferlist manifest_bl
= s
->attrset
[RGW_ATTR_MANIFEST
];
9561 if (manifest_bl
.length()) {
9562 bufferlist::iterator miter
= manifest_bl
.begin();
9564 ::decode(s
->manifest
, miter
);
9565 s
->has_manifest
= true;
9566 s
->manifest
.set_head(bucket_info
.placement_rule
, obj
, s
->size
); /* patch manifest to reflect the head we just read, some manifests might be
9567 broken due to old bugs */
9568 s
->size
= s
->manifest
.get_obj_size();
9570 s
->accounted_size
= s
->size
;
9571 } catch (buffer::error
& err
) {
9572 ldout(cct
, 0) << "ERROR: couldn't decode manifest" << dendl
;
9575 ldout(cct
, 10) << "manifest: total_size = " << s
->manifest
.get_obj_size() << dendl
;
9576 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20) && s
->manifest
.has_explicit_objs()) {
9577 RGWObjManifest::obj_iterator mi
;
9578 for (mi
= s
->manifest
.obj_begin(); mi
!= s
->manifest
.obj_end(); ++mi
) {
9579 ldout(cct
, 20) << "manifest: ofs=" << mi
.get_ofs() << " loc=" << mi
.get_location().get_raw_obj(this) << dendl
;
9583 if (!s
->obj_tag
.length()) {
9585 * Uh oh, something's wrong, object with manifest should have tag. Let's
9586 * create one out of the manifest, would be unique
9588 generate_fake_tag(this, s
->attrset
, s
->manifest
, manifest_bl
, s
->obj_tag
);
9592 map
<string
, bufferlist
>::iterator aiter
= s
->attrset
.find(RGW_ATTR_PG_VER
);
9593 if (aiter
!= s
->attrset
.end()) {
9594 bufferlist
& pg_ver_bl
= aiter
->second
;
9595 if (pg_ver_bl
.length()) {
9596 bufferlist::iterator pgbl
= pg_ver_bl
.begin();
9598 ::decode(s
->pg_ver
, pgbl
);
9599 } catch (buffer::error
& err
) {
9600 ldout(cct
, 0) << "ERROR: couldn't decode pg ver attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
9604 aiter
= s
->attrset
.find(RGW_ATTR_SOURCE_ZONE
);
9605 if (aiter
!= s
->attrset
.end()) {
9606 bufferlist
& zone_short_id_bl
= aiter
->second
;
9607 if (zone_short_id_bl
.length()) {
9608 bufferlist::iterator zbl
= zone_short_id_bl
.begin();
9610 ::decode(s
->zone_short_id
, zbl
);
9611 } catch (buffer::error
& err
) {
9612 ldout(cct
, 0) << "ERROR: couldn't decode zone short id attr for object " << s
->obj
<< ", non-critical error, ignoring" << dendl
;
9616 if (s
->obj_tag
.length())
9617 ldout(cct
, 20) << "get_obj_state: setting s->obj_tag to " << s
->obj_tag
.c_str() << dendl
;
9619 ldout(cct
, 20) << "get_obj_state: s->obj_tag was set empty" << dendl
;
9621 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9622 * it exist, and not only if is_olh() returns true
9624 iter
= s
->attrset
.find(RGW_ATTR_OLH_ID_TAG
);
9625 if (iter
!= s
->attrset
.end()) {
9626 s
->olh_tag
= iter
->second
;
9629 if (is_olh(s
->attrset
)) {
9632 ldout(cct
, 20) << __func__
<< ": setting s->olh_tag to " << string(s
->olh_tag
.c_str(), s
->olh_tag
.length()) << dendl
;
9634 if (need_follow_olh
) {
9635 return get_olh_target_state(*rctx
, bucket_info
, obj
, s
, state
);
9642 int RGWRados::get_obj_state(RGWObjectCtx
*rctx
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWObjState
**state
,
9643 bool follow_olh
, bool assume_noent
)
9648 ret
= get_obj_state_impl(rctx
, bucket_info
, obj
, state
, follow_olh
, assume_noent
);
9649 } while (ret
== -EAGAIN
);
9654 int RGWRados::Object::get_manifest(RGWObjManifest
**pmanifest
)
9656 RGWObjState
*astate
;
9657 int r
= get_state(&astate
, true);
9662 *pmanifest
= &astate
->manifest
;
9667 int RGWRados::Object::Read::get_attr(const char *name
, bufferlist
& dest
)
9670 int r
= source
->get_state(&state
, true);
9675 if (!state
->get_attr(name
, dest
))
9682 int RGWRados::Object::Stat::stat_async()
9684 RGWObjectCtx
& ctx
= source
->get_ctx();
9685 rgw_obj
& obj
= source
->get_obj();
9686 RGWRados
*store
= source
->get_store();
9688 RGWObjState
*s
= ctx
.obj
.get_state(obj
); /* calling this one directly because otherwise a sync request will be sent */
9692 result
.size
= s
->size
;
9693 result
.mtime
= ceph::real_clock::to_timespec(s
->mtime
);
9694 result
.attrs
= s
->attrset
;
9695 result
.has_manifest
= s
->has_manifest
;
9696 result
.manifest
= s
->manifest
;
9702 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
9704 int r
= store
->get_obj_head_ioctx(source
->get_bucket_info(), obj
, &state
.io_ctx
);
9709 librados::ObjectReadOperation op
;
9710 op
.stat2(&result
.size
, &result
.mtime
, NULL
);
9711 op
.getxattrs(&result
.attrs
, NULL
);
9712 state
.completion
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
9713 state
.io_ctx
.locator_set_key(loc
);
9714 r
= state
.io_ctx
.aio_operate(oid
, state
.completion
, &op
, NULL
);
9716 ldout(store
->ctx(), 5) << __func__
9717 << ": ERROR: aio_operate() returned ret=" << r
9726 int RGWRados::Object::Stat::wait()
9728 if (!state
.completion
) {
9732 state
.completion
->wait_for_safe();
9733 state
.ret
= state
.completion
->get_return_value();
9734 state
.completion
->release();
9736 if (state
.ret
!= 0) {
9743 int RGWRados::Object::Stat::finish()
9745 map
<string
, bufferlist
>::iterator iter
= result
.attrs
.find(RGW_ATTR_MANIFEST
);
9746 if (iter
!= result
.attrs
.end()) {
9747 bufferlist
& bl
= iter
->second
;
9748 bufferlist::iterator biter
= bl
.begin();
9750 ::decode(result
.manifest
, biter
);
9751 } catch (buffer::error
& err
) {
9752 RGWRados
*store
= source
->get_store();
9753 ldout(store
->ctx(), 0) << "ERROR: " << __func__
<< ": failed to decode manifest" << dendl
;
9756 result
.has_manifest
= true;
9763 * Get an attribute for a system object.
9764 * obj: the object to get attr
9765 * name: name of the attr to retrieve
9766 * dest: bufferlist to store the result in
9767 * Returns: 0 on success, -ERR# otherwise.
9769 int RGWRados::system_obj_get_attr(rgw_raw_obj
& obj
, const char *name
, bufferlist
& dest
)
9772 int r
= get_system_obj_ref(obj
, &ref
);
9777 ObjectReadOperation op
;
9780 op
.getxattr(name
, &dest
, &rval
);
9782 r
= ref
.ioctx
.operate(ref
.oid
, &op
, NULL
);
9789 int RGWRados::append_atomic_test(RGWObjectCtx
*rctx
,
9790 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
9791 ObjectOperation
& op
, RGWObjState
**pstate
)
9796 int r
= get_obj_state(rctx
, bucket_info
, obj
, pstate
, false);
9800 RGWObjState
*state
= *pstate
;
9802 if (!state
->is_atomic
) {
9803 ldout(cct
, 20) << "state for obj=" << obj
<< " is not atomic, not appending atomic test" << dendl
;
9807 if (state
->obj_tag
.length() > 0 && !state
->fake_tag
) {// check for backward compatibility
9808 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
9810 ldout(cct
, 20) << "state->obj_tag is empty, not appending atomic test" << dendl
;
9815 int RGWRados::Object::get_state(RGWObjState
**pstate
, bool follow_olh
, bool assume_noent
)
9817 return store
->get_obj_state(&ctx
, bucket_info
, obj
, pstate
, follow_olh
, assume_noent
);
9820 void RGWRados::Object::invalidate_state()
9822 ctx
.obj
.invalidate(obj
);
9825 void RGWRados::SystemObject::invalidate_state()
9827 ctx
.raw
.invalidate(obj
);
9830 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation
& op
, bool reset_obj
, const string
*ptag
,
9831 const char *if_match
, const char *if_nomatch
, bool removal_op
,
9834 int r
= get_state(&state
, false);
9838 bool need_guard
= (state
->has_manifest
|| (state
->obj_tag
.length() != 0) ||
9839 if_match
!= NULL
|| if_nomatch
!= NULL
) &&
9842 if (!state
->is_atomic
) {
9843 ldout(store
->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state
<< dendl
;
9847 store
->remove_rgw_head_obj(op
); // we're not dropping reference here, actually removing object
9854 /* first verify that the object wasn't replaced under */
9855 if (if_nomatch
== NULL
|| strcmp(if_nomatch
, "*") != 0) {
9856 op
.cmpxattr(RGW_ATTR_ID_TAG
, LIBRADOS_CMPXATTR_OP_EQ
, state
->obj_tag
);
9857 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9861 if (strcmp(if_match
, "*") == 0) {
9862 // test the object is existing
9863 if (!state
->exists
) {
9864 return -ERR_PRECONDITION_FAILED
;
9868 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
9869 strncmp(if_match
, bl
.c_str(), bl
.length()) != 0) {
9870 return -ERR_PRECONDITION_FAILED
;
9876 if (strcmp(if_nomatch
, "*") == 0) {
9877 // test the object is NOT existing
9878 if (state
->exists
) {
9879 return -ERR_PRECONDITION_FAILED
;
9883 if (!state
->get_attr(RGW_ATTR_ETAG
, bl
) ||
9884 strncmp(if_nomatch
, bl
.c_str(), bl
.length()) == 0) {
9885 return -ERR_PRECONDITION_FAILED
;
9892 if (state
->exists
) {
9894 store
->remove_rgw_head_obj(op
);
9901 /* the object is being removed, no need to update its tag */
9906 state
->write_tag
= *ptag
;
9908 append_rand_alpha(store
->ctx(), state
->write_tag
, state
->write_tag
, 32);
9911 bl
.append(state
->write_tag
.c_str(), state
->write_tag
.size() + 1);
9913 ldout(store
->ctx(), 10) << "setting object write_tag=" << state
->write_tag
<< dendl
;
9915 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
9917 op
.setxattr(RGW_ATTR_TAIL_TAG
, bl
);
9923 int RGWRados::system_obj_set_attr(void *ctx
, rgw_raw_obj
& obj
, const char *name
, bufferlist
& bl
,
9924 RGWObjVersionTracker
*objv_tracker
)
9926 map
<string
, bufferlist
> attrs
;
9928 return system_obj_set_attrs(ctx
, obj
, attrs
, NULL
, objv_tracker
);
9931 int RGWRados::system_obj_set_attrs(void *ctx
, rgw_raw_obj
& obj
,
9932 map
<string
, bufferlist
>& attrs
,
9933 map
<string
, bufferlist
>* rmattrs
,
9934 RGWObjVersionTracker
*objv_tracker
)
9937 int r
= get_system_obj_ref(obj
, &ref
);
9941 ObjectWriteOperation op
;
9944 objv_tracker
->prepare_op_for_write(&op
);
9947 map
<string
, bufferlist
>::iterator iter
;
9949 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
9950 const string
& name
= iter
->first
;
9951 op
.rmxattr(name
.c_str());
9955 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
9956 const string
& name
= iter
->first
;
9957 bufferlist
& bl
= iter
->second
;
9962 op
.setxattr(name
.c_str(), bl
);
9970 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
9978 * Set an attr on an object.
9979 * bucket: name of the bucket holding the object
9980 * obj: name of the object to set the attr on
9981 * name: the attr to set
9982 * bl: the contents of the attr
9983 * Returns: 0 on success, -ERR# otherwise.
9985 int RGWRados::set_attr(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, const char *name
, bufferlist
& bl
)
9987 map
<string
, bufferlist
> attrs
;
9989 return set_attrs(ctx
, bucket_info
, obj
, attrs
, NULL
);
9992 int RGWRados::set_attrs(void *ctx
, const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
,
9993 map
<string
, bufferlist
>& attrs
,
9994 map
<string
, bufferlist
>* rmattrs
)
9997 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
10001 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
10003 ObjectWriteOperation op
;
10004 RGWObjState
*state
= NULL
;
10006 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &state
);
10010 map
<string
, bufferlist
>::iterator iter
;
10012 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
10013 const string
& name
= iter
->first
;
10014 op
.rmxattr(name
.c_str());
10018 const rgw_bucket
& bucket
= obj
.bucket
;
10020 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
10021 const string
& name
= iter
->first
;
10022 bufferlist
& bl
= iter
->second
;
10027 op
.setxattr(name
.c_str(), bl
);
10029 if (name
.compare(RGW_ATTR_DELETE_AT
) == 0) {
10034 rgw_obj_index_key obj_key
;
10035 obj
.key
.get_index_key(&obj_key
);
10037 objexp_hint_add(ts
, bucket
.tenant
, bucket
.name
, bucket
.bucket_id
, obj_key
);
10038 } catch (buffer::error
& err
) {
10039 ldout(cct
, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT
<< " attr" << dendl
;
10047 RGWObjectCtx
obj_ctx(this);
10050 RGWRados::Bucket
bop(this, bucket_info
);
10051 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
10055 append_rand_alpha(cct
, tag
, tag
, 32);
10056 state
->write_tag
= tag
;
10057 r
= index_op
.prepare(CLS_RGW_OP_ADD
, &state
->write_tag
);
10062 bl
.append(tag
.c_str(), tag
.size() + 1);
10063 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
10067 real_time mtime
= real_clock::now();
10068 struct timespec mtime_ts
= real_clock::to_timespec(mtime
);
10069 op
.mtime2(&mtime_ts
);
10070 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
10073 bufferlist acl_bl
= attrs
[RGW_ATTR_ACL
];
10074 bufferlist etag_bl
= attrs
[RGW_ATTR_ETAG
];
10075 bufferlist content_type_bl
= attrs
[RGW_ATTR_CONTENT_TYPE
];
10076 string
etag(etag_bl
.c_str(), etag_bl
.length());
10077 string
content_type(content_type_bl
.c_str(), content_type_bl
.length());
10078 uint64_t epoch
= ref
.ioctx
.get_last_version();
10079 int64_t poolid
= ref
.ioctx
.get_id();
10080 r
= index_op
.complete(poolid
, epoch
, state
->size
, state
->accounted_size
,
10081 mtime
, etag
, content_type
, &acl_bl
,
10082 RGW_OBJ_CATEGORY_MAIN
, NULL
);
10084 int ret
= index_op
.cancel();
10086 ldout(cct
, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret
<< dendl
;
10094 state
->obj_tag
.swap(bl
);
10096 for (iter
= rmattrs
->begin(); iter
!= rmattrs
->end(); ++iter
) {
10097 state
->attrset
.erase(iter
->first
);
10100 for (iter
= attrs
.begin(); iter
!= attrs
.end(); ++iter
) {
10101 state
->attrset
[iter
->first
] = iter
->second
;
10108 int RGWRados::Object::Read::prepare()
10110 RGWRados
*store
= source
->get_store();
10111 CephContext
*cct
= store
->ctx();
10115 map
<string
, bufferlist
>::iterator iter
;
10117 RGWObjState
*astate
;
10118 int r
= source
->get_state(&astate
, true);
10122 if (!astate
->exists
) {
10126 const RGWBucketInfo
& bucket_info
= source
->get_bucket_info();
10128 state
.obj
= astate
->obj
;
10129 store
->obj_to_raw(bucket_info
.placement_rule
, state
.obj
, &state
.head_obj
);
10131 r
= store
->get_obj_head_ioctx(bucket_info
, state
.obj
, &state
.io_ctx
);
10135 if (params
.attrs
) {
10136 *params
.attrs
= astate
->attrset
;
10137 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20)) {
10138 for (iter
= params
.attrs
->begin(); iter
!= params
.attrs
->end(); ++iter
) {
10139 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
10144 /* Convert all times go GMT to make them compatible */
10145 if (conds
.mod_ptr
|| conds
.unmod_ptr
) {
10146 obj_time_weight src_weight
;
10147 src_weight
.init(astate
);
10148 src_weight
.high_precision
= conds
.high_precision_time
;
10150 obj_time_weight dest_weight
;
10151 dest_weight
.high_precision
= conds
.high_precision_time
;
10153 if (conds
.mod_ptr
) {
10154 dest_weight
.init(*conds
.mod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
10155 ldout(cct
, 10) << "If-Modified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
10156 if (!(dest_weight
< src_weight
)) {
10157 return -ERR_NOT_MODIFIED
;
10161 if (conds
.unmod_ptr
) {
10162 dest_weight
.init(*conds
.unmod_ptr
, conds
.mod_zone_id
, conds
.mod_pg_ver
);
10163 ldout(cct
, 10) << "If-UnModified-Since: " << dest_weight
<< " Last-Modified: " << src_weight
<< dendl
;
10164 if (dest_weight
< src_weight
) {
10165 return -ERR_PRECONDITION_FAILED
;
10169 if (conds
.if_match
|| conds
.if_nomatch
) {
10170 r
= get_attr(RGW_ATTR_ETAG
, etag
);
10174 if (conds
.if_match
) {
10175 string if_match_str
= rgw_string_unquote(conds
.if_match
);
10176 ldout(cct
, 10) << "ETag: " << etag
.c_str() << " " << " If-Match: " << if_match_str
<< dendl
;
10177 if (if_match_str
.compare(etag
.c_str()) != 0) {
10178 return -ERR_PRECONDITION_FAILED
;
10182 if (conds
.if_nomatch
) {
10183 string if_nomatch_str
= rgw_string_unquote(conds
.if_nomatch
);
10184 ldout(cct
, 10) << "ETag: " << etag
.c_str() << " " << " If-NoMatch: " << if_nomatch_str
<< dendl
;
10185 if (if_nomatch_str
.compare(etag
.c_str()) == 0) {
10186 return -ERR_NOT_MODIFIED
;
10191 if (params
.obj_size
)
10192 *params
.obj_size
= astate
->size
;
10193 if (params
.lastmod
)
10194 *params
.lastmod
= astate
->mtime
;
10199 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size
, int64_t &ofs
, int64_t &end
)
10205 end
= obj_size
- 1;
10206 } else if (end
< 0) {
10207 end
= obj_size
- 1;
10210 if (obj_size
> 0) {
10211 if (ofs
>= (off_t
)obj_size
) {
10214 if (end
>= (off_t
)obj_size
) {
10215 end
= obj_size
- 1;
10221 int RGWRados::SystemObject::get_state(RGWRawObjState
**pstate
, RGWObjVersionTracker
*objv_tracker
)
10223 return store
->get_system_obj_state(&ctx
, obj
, pstate
, objv_tracker
);
10226 int RGWRados::stat_system_obj(RGWObjectCtx
& obj_ctx
,
10227 RGWRados::SystemObject::Read::GetObjState
& state
,
10229 map
<string
, bufferlist
> *attrs
,
10230 real_time
*lastmod
,
10231 uint64_t *obj_size
,
10232 RGWObjVersionTracker
*objv_tracker
)
10234 RGWRawObjState
*astate
= NULL
;
10236 int r
= get_system_obj_state(&obj_ctx
, obj
, &astate
, objv_tracker
);
10240 if (!astate
->exists
) {
10245 *attrs
= astate
->attrset
;
10246 if (cct
->_conf
->subsys
.should_gather(ceph_subsys_rgw
, 20)) {
10247 map
<string
, bufferlist
>::iterator iter
;
10248 for (iter
= attrs
->begin(); iter
!= attrs
->end(); ++iter
) {
10249 ldout(cct
, 20) << "Read xattr: " << iter
->first
<< dendl
;
10255 *obj_size
= astate
->size
;
10257 *lastmod
= astate
->mtime
;
10263 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard
**pbs
, std::function
<int(BucketShard
*)> call
)
10265 RGWRados
*store
= target
->get_store();
10269 #define NUM_RESHARD_RETRIES 10
10270 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
10271 int ret
= get_bucket_shard(&bs
);
10273 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
10277 if (r
!= -ERR_BUSY_RESHARDING
) {
10280 ldout(store
->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
10281 string new_bucket_id
;
10282 r
= store
->block_while_resharding(bs
, &new_bucket_id
, target
->bucket_info
);
10283 if (r
== -ERR_BUSY_RESHARDING
) {
10289 ldout(store
->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
10290 i
= 0; /* resharding is finished, make sure we can retry */
10291 r
= target
->update_bucket_id(new_bucket_id
);
10293 ldout(store
->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id
<< " returned r=" << r
<< dendl
;
10310 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker
*objv_tracker
)
10312 RGWRados
*store
= source
->get_store();
10313 rgw_raw_obj
& obj
= source
->get_obj();
10315 return store
->stat_system_obj(source
->get_ctx(), state
, obj
, stat_params
.attrs
,
10316 stat_params
.lastmod
, stat_params
.obj_size
, objv_tracker
);
10319 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op
, const string
*write_tag
)
10324 RGWRados
*store
= target
->get_store();
10326 if (write_tag
&& write_tag
->length()) {
10327 optag
= string(write_tag
->c_str(), write_tag
->length());
10329 if (optag
.empty()) {
10330 append_rand_alpha(store
->ctx(), optag
, optag
, 32);
10334 int r
= guard_reshard(nullptr, [&](BucketShard
*bs
) -> int {
10335 return store
->cls_obj_prepare_op(*bs
, op
, optag
, obj
, bilog_flags
, zones_trace
);
10346 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid
, uint64_t epoch
,
10347 uint64_t size
, uint64_t accounted_size
,
10348 ceph::real_time
& ut
, const string
& etag
,
10349 const string
& content_type
,
10350 bufferlist
*acl_bl
,
10351 RGWObjCategory category
,
10352 list
<rgw_obj_index_key
> *remove_objs
, const string
*user_data
)
10357 RGWRados
*store
= target
->get_store();
10360 int ret
= get_bucket_shard(&bs
);
10362 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
10366 rgw_bucket_dir_entry ent
;
10367 obj
.key
.get_index_key(&ent
.key
);
10368 ent
.meta
.size
= size
;
10369 ent
.meta
.accounted_size
= accounted_size
;
10370 ent
.meta
.mtime
= ut
;
10371 ent
.meta
.etag
= etag
;
10373 ent
.meta
.user_data
= *user_data
;
10376 if (acl_bl
&& acl_bl
->length()) {
10377 int ret
= store
->decode_policy(*acl_bl
, &owner
);
10379 ldout(store
->ctx(), 0) << "WARNING: could not decode policy ret=" << ret
<< dendl
;
10382 ent
.meta
.owner
= owner
.get_id().to_str();
10383 ent
.meta
.owner_display_name
= owner
.get_display_name();
10384 ent
.meta
.content_type
= content_type
;
10386 ret
= store
->cls_obj_complete_add(*bs
, obj
, optag
, poolid
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
10388 if (target
->bucket_info
.datasync_flag_enabled()) {
10389 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
10391 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
10398 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid
, uint64_t epoch
,
10399 real_time
& removed_mtime
,
10400 list
<rgw_obj_index_key
> *remove_objs
)
10405 RGWRados
*store
= target
->get_store();
10408 int ret
= get_bucket_shard(&bs
);
10410 ldout(store
->ctx(), 5) << "failed to get BucketShard object: ret=" << ret
<< dendl
;
10414 ret
= store
->cls_obj_complete_del(*bs
, optag
, poolid
, epoch
, obj
, removed_mtime
, remove_objs
, bilog_flags
, zones_trace
);
10416 if (target
->bucket_info
.datasync_flag_enabled()) {
10417 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
10419 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
10427 int RGWRados::Bucket::UpdateIndex::cancel()
10432 RGWRados
*store
= target
->get_store();
10435 int ret
= guard_reshard(&bs
, [&](BucketShard
*bs
) -> int {
10436 return store
->cls_obj_complete_cancel(*bs
, optag
, obj
, bilog_flags
, zones_trace
);
10440 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10441 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10442 * have no way to tell that they're all caught up
10444 if (target
->bucket_info
.datasync_flag_enabled()) {
10445 int r
= store
->data_log
->add_entry(bs
->bucket
, bs
->shard_id
);
10447 lderr(store
->ctx()) << "ERROR: failed writing data log" << dendl
;
10454 int RGWRados::Object::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
)
10456 RGWRados
*store
= source
->get_store();
10457 CephContext
*cct
= store
->ctx();
10459 rgw_raw_obj read_obj
;
10460 uint64_t read_ofs
= ofs
;
10461 uint64_t len
, read_len
;
10462 bool reading_from_head
= true;
10463 ObjectReadOperation op
;
10465 bool merge_bl
= false;
10466 bufferlist
*pbl
= &bl
;
10467 bufferlist read_bl
;
10468 uint64_t max_chunk_size
;
10470 RGWObjState
*astate
;
10471 int r
= source
->get_state(&astate
, true);
10478 len
= end
- ofs
+ 1;
10480 if (astate
->has_manifest
&& astate
->manifest
.has_tail()) {
10481 /* now get the relevant object part */
10482 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
10484 uint64_t stripe_ofs
= iter
.get_stripe_ofs();
10485 read_obj
= iter
.get_location().get_raw_obj(store
);
10486 len
= min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
10487 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
10488 reading_from_head
= (read_obj
== state
.head_obj
);
10490 read_obj
= state
.head_obj
;
10493 r
= store
->get_max_chunk_size(read_obj
.pool
, &max_chunk_size
);
10495 ldout(cct
, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj
.pool
<< dendl
;
10499 if (len
> max_chunk_size
)
10500 len
= max_chunk_size
;
10503 state
.io_ctx
.locator_set_key(read_obj
.loc
);
10507 if (reading_from_head
) {
10508 /* only when reading from the head object do we need to do the atomic test */
10509 r
= store
->append_atomic_test(&source
->get_ctx(), source
->get_bucket_info(), state
.obj
, op
, &astate
);
10513 if (astate
&& astate
->prefetch_data
) {
10514 if (!ofs
&& astate
->data
.length() >= len
) {
10516 return bl
.length();
10519 if (ofs
< astate
->data
.length()) {
10520 unsigned copy_len
= min((uint64_t)astate
->data
.length() - ofs
, len
);
10521 astate
->data
.copy(ofs
, copy_len
, bl
);
10522 read_len
-= copy_len
;
10523 read_ofs
+= copy_len
;
10525 return bl
.length();
10533 ldout(cct
, 20) << "rados->read obj-ofs=" << ofs
<< " read_ofs=" << read_ofs
<< " read_len=" << read_len
<< dendl
;
10534 op
.read(read_ofs
, read_len
, pbl
, NULL
);
10536 r
= state
.io_ctx
.operate(read_obj
.oid
, &op
, NULL
);
10537 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
10544 bl
.append(read_bl
);
10547 return bl
.length();
10550 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados
*store
, rgw_raw_obj
& obj
, rgw_rados_ref
**pref
)
10553 int r
= store
->get_raw_obj_ref(obj
, &ref
);
10564 int RGWRados::get_system_obj(RGWObjectCtx
& obj_ctx
, RGWRados::SystemObject::Read::GetObjState
& read_state
,
10565 RGWObjVersionTracker
*objv_tracker
, rgw_raw_obj
& obj
,
10566 bufferlist
& bl
, off_t ofs
, off_t end
,
10567 map
<string
, bufferlist
> *attrs
,
10568 rgw_cache_entry_info
*cache_info
,
10569 boost::optional
<obj_version
>)
10572 ObjectReadOperation op
;
10577 len
= end
- ofs
+ 1;
10579 if (objv_tracker
) {
10580 objv_tracker
->prepare_op_for_read(&op
);
10583 ldout(cct
, 20) << "rados->read ofs=" << ofs
<< " len=" << len
<< dendl
;
10584 op
.read(ofs
, len
, &bl
, NULL
);
10587 op
.getxattrs(attrs
, NULL
);
10590 rgw_rados_ref
*ref
;
10591 int r
= read_state
.get_ref(this, obj
, &ref
);
10593 ldout(cct
, 20) << "read_state.get_ref() on obj=" << obj
<< " returned " << r
<< dendl
;
10596 r
= ref
->ioctx
.operate(ref
->oid
, &op
, NULL
);
10598 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
10601 ldout(cct
, 20) << "rados->read r=" << r
<< " bl.length=" << bl
.length() << dendl
;
10603 uint64_t op_ver
= ref
->ioctx
.get_last_version();
10605 if (read_state
.last_ver
> 0 &&
10606 read_state
.last_ver
!= op_ver
) {
10607 ldout(cct
, 5) << "raced with an object write, abort" << dendl
;
10611 read_state
.last_ver
= op_ver
;
10613 return bl
.length();
10616 int RGWRados::SystemObject::Read::read(int64_t ofs
, int64_t end
, bufferlist
& bl
,
10617 RGWObjVersionTracker
*objv_tracker
,
10618 boost::optional
<obj_version
> refresh_version
)
10620 RGWRados
*store
= source
->get_store();
10621 rgw_raw_obj
& obj
= source
->get_obj();
10623 return store
->get_system_obj(source
->get_ctx(), state
, objv_tracker
, obj
, bl
,
10624 ofs
, end
, read_params
.attrs
,
10625 read_params
.cache_info
, refresh_version
);
10628 int RGWRados::SystemObject::Read::get_attr(const char *name
, bufferlist
& dest
)
10630 RGWRados
*store
= source
->get_store();
10631 rgw_raw_obj
& obj
= source
->get_obj();
10633 return store
->system_obj_get_attr(obj
, name
, dest
);
10636 struct get_obj_data
;
10638 struct get_obj_aio_data
{
10639 struct get_obj_data
*op_data
;
10644 struct get_obj_io
{
10649 static void _get_obj_aio_completion_cb(completion_t cb
, void *arg
);
10651 struct get_obj_data
: public RefCountedObject
{
10656 map
<off_t
, get_obj_io
> io_map
;
10657 map
<off_t
, librados::AioCompletion
*> completion_map
;
10658 uint64_t total_read
;
10661 list
<get_obj_aio_data
> aio_data
;
10662 RGWGetDataCB
*client_cb
;
10663 std::atomic
<bool> cancelled
= { false };
10664 std::atomic
<int64_t> err_code
= { 0 };
10666 list
<bufferlist
> read_list
;
10668 explicit get_obj_data(CephContext
*_cct
)
10670 rados(NULL
), ctx(NULL
),
10671 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10673 throttle(cct
, "get_obj_data", cct
->_conf
->rgw_get_obj_window_size
, false) {}
10674 ~get_obj_data() override
{ }
10675 void set_cancelled(int r
) {
10680 bool is_cancelled() {
10684 int get_err_code() {
10688 int wait_next_io(bool *done
) {
10690 map
<off_t
, librados::AioCompletion
*>::iterator iter
= completion_map
.begin();
10691 if (iter
== completion_map
.end()) {
10696 off_t cur_ofs
= iter
->first
;
10697 librados::AioCompletion
*c
= iter
->second
;
10700 c
->wait_for_safe_and_cb();
10701 int r
= c
->get_return_value();
10704 completion_map
.erase(cur_ofs
);
10706 if (completion_map
.empty()) {
10716 void add_io(off_t ofs
, off_t len
, bufferlist
**pbl
, AioCompletion
**pc
) {
10717 Mutex::Locker
l(lock
);
10719 const auto& io_iter
= io_map
.insert(
10720 map
<off_t
, get_obj_io
>::value_type(ofs
, get_obj_io()));
10722 assert(io_iter
.second
); // assert new insertion
10724 get_obj_io
& io
= (io_iter
.first
)->second
;
10727 struct get_obj_aio_data aio
;
10730 aio
.op_data
= this;
10732 aio_data
.push_back(aio
);
10734 struct get_obj_aio_data
*paio_data
= &aio_data
.back(); /* last element */
10736 librados::AioCompletion
*c
= librados::Rados::aio_create_completion((void *)paio_data
, NULL
, _get_obj_aio_completion_cb
);
10737 completion_map
[ofs
] = c
;
10741 /* we have a reference per IO, plus one reference for the calling function.
10742 * reference is dropped for each callback, plus when we're done iterating
10743 * over the parts */
10747 void cancel_io(off_t ofs
) {
10748 ldout(cct
, 20) << "get_obj_data::cancel_io() ofs=" << ofs
<< dendl
;
10750 map
<off_t
, AioCompletion
*>::iterator iter
= completion_map
.find(ofs
);
10751 if (iter
!= completion_map
.end()) {
10752 AioCompletion
*c
= iter
->second
;
10754 completion_map
.erase(ofs
);
10759 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10760 * need IoCtx to live, as io callback may still be called
10764 void cancel_all_io() {
10765 ldout(cct
, 20) << "get_obj_data::cancel_all_io()" << dendl
;
10766 Mutex::Locker
l(lock
);
10767 for (map
<off_t
, librados::AioCompletion
*>::iterator iter
= completion_map
.begin();
10768 iter
!= completion_map
.end(); ++iter
) {
10769 librados::AioCompletion
*c
= iter
->second
;
10774 int get_complete_ios(off_t ofs
, list
<bufferlist
>& bl_list
) {
10775 Mutex::Locker
l(lock
);
10777 map
<off_t
, get_obj_io
>::iterator liter
= io_map
.begin();
10779 if (liter
== io_map
.end() ||
10780 liter
->first
!= ofs
) {
10784 map
<off_t
, librados::AioCompletion
*>::iterator aiter
;
10785 aiter
= completion_map
.find(ofs
);
10786 if (aiter
== completion_map
.end()) {
10787 /* completion map does not hold this io, it was cancelled */
10791 AioCompletion
*completion
= aiter
->second
;
10792 int r
= completion
->get_return_value();
10796 for (; aiter
!= completion_map
.end(); ++aiter
) {
10797 completion
= aiter
->second
;
10798 if (!completion
->is_safe()) {
10799 /* reached a request that is not yet complete, stop */
10803 r
= completion
->get_return_value();
10805 set_cancelled(r
); /* mark it as cancelled, so that we don't continue processing next operations */
10811 map
<off_t
, get_obj_io
>::iterator old_liter
= liter
++;
10812 bl_list
.push_back(old_liter
->second
.bl
);
10813 io_map
.erase(old_liter
);
10820 static int _get_obj_iterate_cb(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, const rgw_raw_obj
& read_obj
, off_t obj_ofs
, off_t read_ofs
, off_t len
, bool is_head_obj
, RGWObjState
*astate
, void *arg
)
10822 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
10824 return d
->rados
->get_obj_iterate_cb(d
->ctx
, astate
, bucket_info
, obj
, read_obj
, obj_ofs
, read_ofs
, len
, is_head_obj
, arg
);
10827 static void _get_obj_aio_completion_cb(completion_t cb
, void *arg
)
10829 struct get_obj_aio_data
*aio_data
= (struct get_obj_aio_data
*)arg
;
10830 struct get_obj_data
*d
= aio_data
->op_data
;
10832 d
->rados
->get_obj_aio_completion_cb(cb
, arg
);
10836 void RGWRados::get_obj_aio_completion_cb(completion_t c
, void *arg
)
10838 struct get_obj_aio_data
*aio_data
= (struct get_obj_aio_data
*)arg
;
10839 struct get_obj_data
*d
= aio_data
->op_data
;
10840 off_t ofs
= aio_data
->ofs
;
10841 off_t len
= aio_data
->len
;
10843 list
<bufferlist
> bl_list
;
10844 list
<bufferlist
>::iterator iter
;
10847 ldout(cct
, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs
<< " len=" << len
<< dendl
;
10848 d
->throttle
.put(len
);
10850 r
= rados_aio_get_return_value(c
);
10852 ldout(cct
, 0) << "ERROR: got unexpected error when trying to read object: " << r
<< dendl
;
10853 d
->set_cancelled(r
);
10857 if (d
->is_cancelled()) {
10861 d
->data_lock
.Lock();
10863 r
= d
->get_complete_ios(ofs
, bl_list
);
10868 d
->read_list
.splice(d
->read_list
.end(), bl_list
);
10871 d
->data_lock
.Unlock();
10877 int RGWRados::flush_read_list(struct get_obj_data
*d
)
10879 d
->data_lock
.Lock();
10880 list
<bufferlist
> l
;
10881 l
.swap(d
->read_list
);
10883 d
->read_list
.clear();
10885 d
->data_lock
.Unlock();
10889 list
<bufferlist
>::iterator iter
;
10890 for (iter
= l
.begin(); iter
!= l
.end(); ++iter
) {
10891 bufferlist
& bl
= *iter
;
10892 r
= d
->client_cb
->handle_data(bl
, 0, bl
.length());
10894 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r
<< dendl
;
10899 d
->data_lock
.Lock();
10902 d
->set_cancelled(r
);
10904 d
->data_lock
.Unlock();
10908 int RGWRados::get_obj_iterate_cb(RGWObjectCtx
*ctx
, RGWObjState
*astate
,
10909 const RGWBucketInfo
& bucket_info
,
10910 const rgw_obj
& obj
,
10911 const rgw_raw_obj
& read_obj
,
10913 off_t read_ofs
, off_t len
,
10914 bool is_head_obj
, void *arg
)
10916 RGWObjectCtx
*rctx
= static_cast<RGWObjectCtx
*>(ctx
);
10917 ObjectReadOperation op
;
10918 struct get_obj_data
*d
= (struct get_obj_data
*)arg
;
10926 /* only when reading from the head object do we need to do the atomic test */
10927 r
= append_atomic_test(rctx
, bucket_info
, obj
, op
, &astate
);
10932 obj_ofs
< astate
->data
.length()) {
10933 unsigned chunk_len
= min((uint64_t)astate
->data
.length() - obj_ofs
, (uint64_t)len
);
10935 d
->data_lock
.Lock();
10936 r
= d
->client_cb
->handle_data(astate
->data
, obj_ofs
, chunk_len
);
10937 d
->data_lock
.Unlock();
10942 d
->total_read
+= chunk_len
;
10946 read_ofs
+= chunk_len
;
10947 obj_ofs
+= chunk_len
;
10953 d
->throttle
.get(len
);
10954 if (d
->is_cancelled()) {
10955 return d
->get_err_code();
10958 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10961 d
->add_io(obj_ofs
, len
, &pbl
, &c
);
10963 ldout(cct
, 20) << "rados->get_obj_iterate_cb oid=" << read_obj
.oid
<< " obj-ofs=" << obj_ofs
<< " read_ofs=" << read_ofs
<< " len=" << len
<< dendl
;
10964 op
.read(read_ofs
, len
, pbl
, NULL
);
10966 librados::IoCtx
io_ctx(d
->io_ctx
);
10967 io_ctx
.locator_set_key(read_obj
.loc
);
10969 r
= io_ctx
.aio_operate(read_obj
.oid
, c
, &op
, NULL
);
10971 ldout(cct
, 0) << "rados->aio_operate r=" << r
<< dendl
;
10975 // Flush data to client if there is any
10976 r
= flush_read_list(d
);
10983 ldout(cct
, 20) << "cancelling io r=" << r
<< " obj_ofs=" << obj_ofs
<< dendl
;
10984 d
->set_cancelled(r
);
10985 d
->cancel_io(obj_ofs
);
10990 int RGWRados::Object::Read::iterate(int64_t ofs
, int64_t end
, RGWGetDataCB
*cb
)
10992 RGWRados
*store
= source
->get_store();
10993 CephContext
*cct
= store
->ctx();
10995 struct get_obj_data
*data
= new get_obj_data(cct
);
10998 RGWObjectCtx
& obj_ctx
= source
->get_ctx();
11000 data
->rados
= store
;
11001 data
->io_ctx
.dup(state
.io_ctx
);
11002 data
->client_cb
= cb
;
11004 int r
= store
->iterate_obj(obj_ctx
, source
->get_bucket_info(), state
.obj
, ofs
, end
, cct
->_conf
->rgw_get_obj_max_req_size
, _get_obj_iterate_cb
, (void *)data
);
11006 data
->cancel_all_io();
11011 r
= data
->wait_next_io(&done
);
11013 dout(10) << "get_obj_iterate() r=" << r
<< ", canceling all io" << dendl
;
11014 data
->cancel_all_io();
11017 r
= store
->flush_read_list(data
);
11019 dout(10) << "get_obj_iterate() r=" << r
<< ", canceling all io" << dendl
;
11020 data
->cancel_all_io();
11030 int RGWRados::iterate_obj(RGWObjectCtx
& obj_ctx
,
11031 const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
11032 off_t ofs
, off_t end
,
11033 uint64_t max_chunk_size
,
11034 int (*iterate_obj_cb
)(const RGWBucketInfo
&, const rgw_obj
& obj
,
11035 const rgw_raw_obj
&, off_t
, off_t
, off_t
, bool,
11036 RGWObjState
*, void *),
11039 rgw_raw_obj head_obj
;
11040 rgw_raw_obj read_obj
;
11041 uint64_t read_ofs
= ofs
;
11043 bool reading_from_head
= true;
11044 RGWObjState
*astate
= NULL
;
11046 obj_to_raw(bucket_info
.placement_rule
, obj
, &head_obj
);
11048 int r
= get_obj_state(&obj_ctx
, bucket_info
, obj
, &astate
, false);
11056 len
= end
- ofs
+ 1;
11058 if (astate
->has_manifest
) {
11059 /* now get the relevant object stripe */
11060 RGWObjManifest::obj_iterator iter
= astate
->manifest
.obj_find(ofs
);
11062 RGWObjManifest::obj_iterator obj_end
= astate
->manifest
.obj_end();
11064 for (; iter
!= obj_end
&& ofs
<= end
; ++iter
) {
11065 off_t stripe_ofs
= iter
.get_stripe_ofs();
11066 off_t next_stripe_ofs
= stripe_ofs
+ iter
.get_stripe_size();
11068 while (ofs
< next_stripe_ofs
&& ofs
<= end
) {
11069 read_obj
= iter
.get_location().get_raw_obj(this);
11070 uint64_t read_len
= min(len
, iter
.get_stripe_size() - (ofs
- stripe_ofs
));
11071 read_ofs
= iter
.location_ofs() + (ofs
- stripe_ofs
);
11073 if (read_len
> max_chunk_size
) {
11074 read_len
= max_chunk_size
;
11077 reading_from_head
= (read_obj
== head_obj
);
11078 r
= iterate_obj_cb(bucket_info
, obj
, read_obj
, ofs
, read_ofs
, read_len
, reading_from_head
, astate
, arg
);
11088 while (ofs
<= end
) {
11089 read_obj
= head_obj
;
11090 uint64_t read_len
= min(len
, max_chunk_size
);
11092 r
= iterate_obj_cb(bucket_info
, obj
, read_obj
, ofs
, ofs
, read_len
, reading_from_head
, astate
, arg
);
11105 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectWriteOperation
*op
)
11108 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
11113 return ref
.ioctx
.operate(ref
.oid
, op
);
11116 int RGWRados::obj_operate(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, ObjectReadOperation
*op
)
11119 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
11126 return ref
.ioctx
.operate(ref
.oid
, op
, &outbl
);
11129 int RGWRados::olh_init_modification_impl(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, string
*op_tag
)
11131 ObjectWriteOperation op
;
11133 assert(olh_obj
.key
.instance
.empty());
11135 bool has_tag
= (state
.exists
&& has_olh_tag(state
.attrset
));
11137 if (!state
.exists
) {
11140 op
.assert_exists();
11141 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
11142 op
.mtime2(&mtime_ts
);
11146 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
11147 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
11148 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
11149 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
11150 * log will reflect that.
11152 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
11153 * is used for object data instance, olh_tag for olh instance.
11156 /* guard against racing writes */
11157 bucket_index_guard_olh_op(state
, op
);
11163 int ret
= gen_rand_alphanumeric_lower(cct
, &obj_tag
, 32);
11165 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
11169 bl
.append(obj_tag
.c_str(), obj_tag
.size());
11170 op
.setxattr(RGW_ATTR_ID_TAG
, bl
);
11172 state
.attrset
[RGW_ATTR_ID_TAG
] = bl
;
11173 state
.obj_tag
= bl
;
11177 ret
= gen_rand_alphanumeric_lower(cct
, &olh_tag
, 32);
11179 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
11183 olh_bl
.append(olh_tag
.c_str(), olh_tag
.size());
11184 op
.setxattr(RGW_ATTR_OLH_ID_TAG
, olh_bl
);
11186 state
.attrset
[RGW_ATTR_OLH_ID_TAG
] = olh_bl
;
11187 state
.olh_tag
= olh_bl
;
11188 state
.is_olh
= true;
11191 op
.setxattr(RGW_ATTR_OLH_VER
, verbl
);
11195 RGWOLHPendingInfo pending_info
;
11196 pending_info
.time
= real_clock::now();
11197 ::encode(pending_info
, bl
);
11199 #define OLH_PENDING_TAG_LEN 32
11200 /* tag will start with current time epoch, this so that entries are sorted by time */
11202 utime_t
ut(pending_info
.time
);
11203 snprintf(buf
, sizeof(buf
), "%016llx", (unsigned long long)ut
.sec());
11207 int ret
= gen_rand_alphanumeric_lower(cct
, &s
, OLH_PENDING_TAG_LEN
- op_tag
->size());
11209 ldout(cct
, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret
<< dendl
;
11214 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
11215 attr_name
.append(*op_tag
);
11217 op
.setxattr(attr_name
.c_str(), bl
);
11219 ret
= obj_operate(bucket_info
, olh_obj
, &op
);
11224 state
.exists
= true;
11225 state
.attrset
[attr_name
] = bl
;
11230 int RGWRados::olh_init_modification(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj
, string
*op_tag
)
11234 ret
= olh_init_modification_impl(bucket_info
, state
, obj
, op_tag
);
11235 if (ret
== -EEXIST
) {
11242 int RGWRados::guard_reshard(BucketShard
*bs
,
11243 const rgw_obj
& obj_instance
,
11244 const RGWBucketInfo
& bucket_info
,
11245 std::function
<int(BucketShard
*)> call
)
11248 const rgw_obj
*pobj
= &obj_instance
;
11251 for (int i
= 0; i
< NUM_RESHARD_RETRIES
; ++i
) {
11252 r
= bs
->init(pobj
->bucket
, *pobj
, nullptr /* no RGWBucketInfo */);
11254 ldout(cct
, 5) << "bs.init() returned ret=" << r
<< dendl
;
11258 if (r
!= -ERR_BUSY_RESHARDING
) {
11261 ldout(cct
, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl
;
11262 string new_bucket_id
;
11263 r
= block_while_resharding(bs
, &new_bucket_id
, bucket_info
);
11264 if (r
== -ERR_BUSY_RESHARDING
) {
11270 ldout(cct
, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id
<< dendl
;
11271 i
= 0; /* resharding is finished, make sure we can retry */
11274 obj
.bucket
.update_bucket_id(new_bucket_id
);
11285 int RGWRados::block_while_resharding(RGWRados::BucketShard
*bs
,
11286 string
*new_bucket_id
,
11287 const RGWBucketInfo
& bucket_info
)
11289 std::shared_ptr
<RGWReshardWait
> waiter
= reshard_wait
;
11291 return waiter
->block_while_resharding(bs
, new_bucket_id
, bucket_info
);
11294 int RGWRados::bucket_index_link_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& olh_state
, const rgw_obj
& obj_instance
,
11295 bool delete_marker
,
11296 const string
& op_tag
,
11297 struct rgw_bucket_dir_entry_meta
*meta
,
11298 uint64_t olh_epoch
,
11299 real_time unmod_since
, bool high_precision_time
,
11300 rgw_zone_set
*_zones_trace
, bool log_data_change
)
11303 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
11308 rgw_zone_set zones_trace
;
11309 if (_zones_trace
) {
11310 zones_trace
= *_zones_trace
;
11312 zones_trace
.insert(get_zone().id
);
11314 BucketShard
bs(this);
11316 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
11317 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
11318 [&](BucketShard
*bs
) -> int {
11319 librados::ObjectWriteOperation op
;
11320 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
11321 return cls_rgw_bucket_link_olh(bs
->index_ctx
, op
,
11322 bs
->bucket_obj
, key
, olh_state
.olh_tag
, delete_marker
, op_tag
, meta
, olh_epoch
,
11323 unmod_since
, high_precision_time
,
11324 get_zone().log_data
, zones_trace
);
11327 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
11331 if (log_data_change
&& bucket_info
.datasync_flag_enabled()) {
11332 data_log
->add_entry(bs
.bucket
, bs
.shard_id
);
11338 void RGWRados::bucket_index_guard_olh_op(RGWObjState
& olh_state
, ObjectOperation
& op
)
11340 ldout(cct
, 20) << __func__
<< "(): olh_state.olh_tag=" << string(olh_state
.olh_tag
.c_str(), olh_state
.olh_tag
.length()) << dendl
;
11341 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_state
.olh_tag
);
11344 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj_instance
,
11345 const string
& op_tag
, const string
& olh_tag
, uint64_t olh_epoch
, rgw_zone_set
*_zones_trace
)
11348 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
11353 rgw_zone_set zones_trace
;
11354 if (_zones_trace
) {
11355 zones_trace
= *_zones_trace
;
11357 zones_trace
.insert(get_zone().id
);
11359 BucketShard
bs(this);
11361 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), obj_instance
.key
.instance
);
11362 r
= guard_reshard(&bs
, obj_instance
, bucket_info
,
11363 [&](BucketShard
*bs
) -> int {
11364 librados::ObjectWriteOperation op
;
11365 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
11366 return cls_rgw_bucket_unlink_instance(bs
->index_ctx
, op
, bs
->bucket_obj
, key
, op_tag
,
11367 olh_tag
, olh_epoch
, get_zone().log_data
, zones_trace
);
11370 ldout(cct
, 20) << "cls_rgw_bucket_link_olh() returned r=" << r
<< dendl
;
11377 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
,
11378 const rgw_obj
& obj_instance
, uint64_t ver_marker
,
11379 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > *log
,
11380 bool *is_truncated
)
11383 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
11388 BucketShard
bs(this);
11390 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
11392 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
11396 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
11398 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
11400 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
11401 [&](BucketShard
*bs
) -> int {
11402 ObjectReadOperation op
;
11403 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
11404 return cls_rgw_get_olh_log(bs
->index_ctx
, bs
->bucket_obj
, op
,
11405 key
, ver_marker
, olh_tag
, log
, is_truncated
);
11408 ldout(cct
, 20) << "cls_rgw_get_olh_log() returned r=" << r
<< dendl
;
11415 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
, uint64_t ver
)
11418 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
11423 BucketShard
bs(this);
11425 bs
.init(obj_instance
.bucket
, obj_instance
, nullptr /* no RGWBucketInfo */);
11427 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
11431 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
11433 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
11435 ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
11436 [&](BucketShard
*pbs
) -> int {
11437 ObjectWriteOperation op
;
11438 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
11439 cls_rgw_trim_olh_log(op
, key
, ver
, olh_tag
);
11440 return pbs
->index_ctx
.operate(pbs
->bucket_obj
, &op
);
11443 ldout(cct
, 20) << "cls_rgw_trim_olh_log() returned r=" << ret
<< dendl
;
11450 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& obj_instance
)
11453 int r
= get_obj_head_ref(bucket_info
, obj_instance
, &ref
);
11458 BucketShard
bs(this);
11460 string
olh_tag(state
.olh_tag
.c_str(), state
.olh_tag
.length());
11462 cls_rgw_obj_key
key(obj_instance
.key
.get_index_key_name(), string());
11464 int ret
= guard_reshard(&bs
, obj_instance
, bucket_info
,
11465 [&](BucketShard
*pbs
) -> int {
11466 ObjectWriteOperation op
;
11467 cls_rgw_guard_bucket_resharding(op
, -ERR_BUSY_RESHARDING
);
11468 return cls_rgw_clear_olh(pbs
->index_ctx
, op
, pbs
->bucket_obj
, key
, olh_tag
);
11471 ldout(cct
, 5) << "cls_rgw_clear_olh() returned ret=" << ret
<< dendl
;
11478 int RGWRados::apply_olh_log(RGWObjectCtx
& obj_ctx
, RGWObjState
& state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
,
11479 bufferlist
& olh_tag
, map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >& log
,
11480 uint64_t *plast_ver
, rgw_zone_set
* zones_trace
)
11486 librados::ObjectWriteOperation op
;
11488 uint64_t last_ver
= log
.rbegin()->first
;
11489 *plast_ver
= last_ver
;
11491 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> >::iterator iter
= log
.begin();
11493 op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
11494 op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GT
, last_ver
);
11496 struct timespec mtime_ts
= real_clock::to_timespec(state
.mtime
);
11497 op
.mtime2(&mtime_ts
);
11499 bool need_to_link
= false;
11500 cls_rgw_obj_key key
;
11501 bool delete_marker
= false;
11502 list
<cls_rgw_obj_key
> remove_instances
;
11503 bool need_to_remove
= false;
11505 for (iter
= log
.begin(); iter
!= log
.end(); ++iter
) {
11506 vector
<rgw_bucket_olh_log_entry
>::iterator viter
= iter
->second
.begin();
11507 for (; viter
!= iter
->second
.end(); ++viter
) {
11508 rgw_bucket_olh_log_entry
& entry
= *viter
;
11510 ldout(cct
, 20) << "olh_log_entry: op=" << (int)entry
.op
11511 << " key=" << entry
.key
.name
<< "[" << entry
.key
.instance
<< "] "
11512 << (entry
.delete_marker
? "(delete)" : "") << dendl
;
11513 switch (entry
.op
) {
11514 case CLS_RGW_OLH_OP_REMOVE_INSTANCE
:
11515 remove_instances
.push_back(entry
.key
);
11517 case CLS_RGW_OLH_OP_LINK_OLH
:
11518 need_to_link
= true;
11519 need_to_remove
= false;
11521 delete_marker
= entry
.delete_marker
;
11523 case CLS_RGW_OLH_OP_UNLINK_OLH
:
11524 need_to_remove
= true;
11525 need_to_link
= false;
11528 ldout(cct
, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry
.op
<< dendl
;
11531 string attr_name
= RGW_ATTR_OLH_PENDING_PREFIX
;
11532 attr_name
.append(entry
.op_tag
);
11533 op
.rmxattr(attr_name
.c_str());
11538 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
11543 const rgw_bucket
& bucket
= obj
.bucket
;
11545 if (need_to_link
) {
11546 rgw_obj
target(bucket
, key
);
11548 info
.target
= target
;
11549 info
.removed
= delete_marker
;
11551 ::encode(info
, bl
);
11552 op
.setxattr(RGW_ATTR_OLH_INFO
, bl
);
11555 /* first remove object instances */
11556 for (list
<cls_rgw_obj_key
>::iterator liter
= remove_instances
.begin();
11557 liter
!= remove_instances
.end(); ++liter
) {
11558 cls_rgw_obj_key
& key
= *liter
;
11559 rgw_obj
obj_instance(bucket
, key
);
11560 int ret
= delete_obj(obj_ctx
, bucket_info
, obj_instance
, 0, RGW_BILOG_FLAG_VERSIONED_OP
, ceph::real_time(), zones_trace
);
11561 if (ret
< 0 && ret
!= -ENOENT
) {
11562 ldout(cct
, 0) << "ERROR: delete_obj() returned " << ret
<< " obj_instance=" << obj_instance
<< dendl
;
11567 /* update olh object */
11568 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
11569 if (r
== -ECANCELED
) {
11573 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
11577 r
= bucket_index_trim_olh_log(bucket_info
, state
, obj
, last_ver
);
11579 ldout(cct
, 0) << "ERROR: could not trim olh log, r=" << r
<< dendl
;
11583 if (need_to_remove
) {
11584 ObjectWriteOperation rm_op
;
11586 rm_op
.cmpxattr(RGW_ATTR_OLH_ID_TAG
, CEPH_OSD_CMPXATTR_OP_EQ
, olh_tag
);
11587 rm_op
.cmpxattr(RGW_ATTR_OLH_VER
, CEPH_OSD_CMPXATTR_OP_GT
, last_ver
);
11588 cls_obj_check_prefix_exist(rm_op
, RGW_ATTR_OLH_PENDING_PREFIX
, true); /* fail if found one of these, pending modification */
11591 r
= ref
.ioctx
.operate(ref
.oid
, &rm_op
);
11592 if (r
== -ECANCELED
) {
11593 return 0; /* someone else won this race */
11596 * only clear if was successful, otherwise we might clobber pending operations on this object
11598 r
= bucket_index_clear_olh(bucket_info
, state
, obj
);
11600 ldout(cct
, 0) << "ERROR: could not clear bucket index olh entries r=" << r
<< dendl
;
11610 * read olh log and apply it
11612 int RGWRados::update_olh(RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, rgw_zone_set
*zones_trace
)
11614 map
<uint64_t, vector
<rgw_bucket_olh_log_entry
> > log
;
11616 uint64_t ver_marker
= 0;
11619 int ret
= bucket_index_read_olh_log(bucket_info
, *state
, obj
, ver_marker
, &log
, &is_truncated
);
11623 ret
= apply_olh_log(obj_ctx
, *state
, bucket_info
, obj
, state
->olh_tag
, log
, &ver_marker
, zones_trace
);
11627 } while (is_truncated
);
11632 int RGWRados::set_olh(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
, bool delete_marker
, rgw_bucket_dir_entry_meta
*meta
,
11633 uint64_t olh_epoch
, real_time unmod_since
, bool high_precision_time
,
11634 rgw_zone_set
*zones_trace
, bool log_data_change
)
11638 rgw_obj olh_obj
= target_obj
;
11639 olh_obj
.key
.instance
.clear();
11641 RGWObjState
*state
= NULL
;
11646 #define MAX_ECANCELED_RETRY 100
11647 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
11648 if (ret
== -ECANCELED
) {
11649 obj_ctx
.obj
.invalidate(olh_obj
);
11652 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
11657 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
11659 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
11660 if (ret
== -ECANCELED
) {
11665 ret
= bucket_index_link_olh(bucket_info
, *state
, target_obj
, delete_marker
,
11666 op_tag
, meta
, olh_epoch
, unmod_since
, high_precision_time
,
11667 zones_trace
, log_data_change
);
11669 ldout(cct
, 20) << "bucket_index_link_olh() target_obj=" << target_obj
<< " delete_marker=" << (int)delete_marker
<< " returned " << ret
<< dendl
;
11670 if (ret
== -ECANCELED
) {
11678 if (i
== MAX_ECANCELED_RETRY
) {
11679 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
11683 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
11684 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
11688 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11695 int RGWRados::unlink_obj_instance(RGWObjectCtx
& obj_ctx
, RGWBucketInfo
& bucket_info
, const rgw_obj
& target_obj
,
11696 uint64_t olh_epoch
, rgw_zone_set
*zones_trace
)
11700 rgw_obj olh_obj
= target_obj
;
11701 olh_obj
.key
.instance
.clear();
11703 RGWObjState
*state
= NULL
;
11708 for (i
= 0; i
< MAX_ECANCELED_RETRY
; i
++) {
11709 if (ret
== -ECANCELED
) {
11710 obj_ctx
.obj
.invalidate(olh_obj
);
11713 ret
= get_obj_state(&obj_ctx
, bucket_info
, olh_obj
, &state
, false); /* don't follow olh */
11717 ret
= olh_init_modification(bucket_info
, *state
, olh_obj
, &op_tag
);
11719 ldout(cct
, 20) << "olh_init_modification() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11720 if (ret
== -ECANCELED
) {
11726 string
olh_tag(state
->olh_tag
.c_str(), state
->olh_tag
.length());
11728 ret
= bucket_index_unlink_instance(bucket_info
, target_obj
, op_tag
, olh_tag
, olh_epoch
, zones_trace
);
11730 ldout(cct
, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11731 if (ret
== -ECANCELED
) {
11739 if (i
== MAX_ECANCELED_RETRY
) {
11740 ldout(cct
, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl
;
11744 ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
, zones_trace
);
11745 if (ret
== -ECANCELED
) { /* already did what we needed, no need to retry, raced with another user */
11749 ldout(cct
, 20) << "update_olh() target_obj=" << target_obj
<< " returned " << ret
<< dendl
;
11756 void RGWRados::gen_rand_obj_instance_name(rgw_obj
*target_obj
)
11758 #define OBJ_INSTANCE_LEN 32
11759 char buf
[OBJ_INSTANCE_LEN
+ 1];
11761 gen_rand_alphanumeric_no_underscore(cct
, buf
, OBJ_INSTANCE_LEN
); /* don't want it to get url escaped,
11762 no underscore for instance name due to the way we encode the raw keys */
11764 target_obj
->key
.set_instance(buf
);
11767 static void filter_attrset(map
<string
, bufferlist
>& unfiltered_attrset
, const string
& check_prefix
,
11768 map
<string
, bufferlist
> *attrset
)
11771 map
<string
, bufferlist
>::iterator iter
;
11772 for (iter
= unfiltered_attrset
.lower_bound(check_prefix
);
11773 iter
!= unfiltered_attrset
.end(); ++iter
) {
11774 if (!boost::algorithm::starts_with(iter
->first
, check_prefix
))
11776 (*attrset
)[iter
->first
] = iter
->second
;
11780 int RGWRados::get_olh(const RGWBucketInfo
& bucket_info
, const rgw_obj
& obj
, RGWOLHInfo
*olh
)
11782 map
<string
, bufferlist
> unfiltered_attrset
;
11784 ObjectReadOperation op
;
11785 op
.getxattrs(&unfiltered_attrset
, NULL
);
11788 int r
= obj_operate(bucket_info
, obj
, &op
);
11793 map
<string
, bufferlist
> attrset
;
11795 filter_attrset(unfiltered_attrset
, RGW_ATTR_OLH_PREFIX
, &attrset
);
11797 map
<string
, bufferlist
>::iterator iter
= attrset
.find(RGW_ATTR_OLH_INFO
);
11798 if (iter
== attrset
.end()) { /* not an olh */
11803 bufferlist::iterator biter
= iter
->second
.begin();
11804 ::decode(*olh
, biter
);
11805 } catch (buffer::error
& err
) {
11806 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
11813 void RGWRados::check_pending_olh_entries(map
<string
, bufferlist
>& pending_entries
,
11814 map
<string
, bufferlist
> *rm_pending_entries
)
11816 map
<string
, bufferlist
>::iterator iter
= pending_entries
.begin();
11818 real_time now
= real_clock::now();
11820 while (iter
!= pending_entries
.end()) {
11821 bufferlist::iterator biter
= iter
->second
.begin();
11822 RGWOLHPendingInfo pending_info
;
11824 ::decode(pending_info
, biter
);
11825 } catch (buffer::error
& err
) {
11826 /* skipping bad entry, we could remove it but it might hide a bug */
11827 ldout(cct
, 0) << "ERROR: failed to decode pending entry " << iter
->first
<< dendl
;
11832 map
<string
, bufferlist
>::iterator cur_iter
= iter
;
11834 if (now
- pending_info
.time
>= make_timespan(cct
->_conf
->rgw_olh_pending_timeout_sec
)) {
11835 (*rm_pending_entries
)[cur_iter
->first
] = cur_iter
->second
;
11836 pending_entries
.erase(cur_iter
);
11838 /* entries names are sorted by time (rounded to a second) */
11844 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo
& bucket_info
, RGWObjState
& state
, const rgw_obj
& olh_obj
, map
<string
, bufferlist
>& pending_attrs
)
11846 ObjectWriteOperation op
;
11848 bucket_index_guard_olh_op(state
, op
);
11850 for (map
<string
, bufferlist
>::iterator iter
= pending_attrs
.begin(); iter
!= pending_attrs
.end(); ++iter
) {
11851 op
.rmxattr(iter
->first
.c_str());
11855 int r
= get_obj_head_ref(bucket_info
, olh_obj
, &ref
);
11860 /* update olh object */
11861 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
11862 if (r
== -ENOENT
|| r
== -ECANCELED
) {
11863 /* raced with some other change, shouldn't sweat about it */
11867 ldout(cct
, 0) << "ERROR: could not apply olh update, r=" << r
<< dendl
;
11874 int RGWRados::follow_olh(const RGWBucketInfo
& bucket_info
, RGWObjectCtx
& obj_ctx
, RGWObjState
*state
, const rgw_obj
& olh_obj
, rgw_obj
*target
)
11876 map
<string
, bufferlist
> pending_entries
;
11877 filter_attrset(state
->attrset
, RGW_ATTR_OLH_PENDING_PREFIX
, &pending_entries
);
11879 map
<string
, bufferlist
> rm_pending_entries
;
11880 check_pending_olh_entries(pending_entries
, &rm_pending_entries
);
11882 if (!rm_pending_entries
.empty()) {
11883 int ret
= remove_olh_pending_entries(bucket_info
, *state
, olh_obj
, rm_pending_entries
);
11885 ldout(cct
, 20) << "ERROR: rm_pending_entries returned ret=" << ret
<< dendl
;
11889 if (!pending_entries
.empty()) {
11890 ldout(cct
, 20) << __func__
<< "(): found pending entries, need to update_olh() on bucket=" << olh_obj
.bucket
<< dendl
;
11892 int ret
= update_olh(obj_ctx
, state
, bucket_info
, olh_obj
);
11898 map
<string
, bufferlist
>::iterator iter
= state
->attrset
.find(RGW_ATTR_OLH_INFO
);
11899 assert(iter
!= state
->attrset
.end());
11902 bufferlist::iterator biter
= iter
->second
.begin();
11903 ::decode(olh
, biter
);
11904 } catch (buffer::error
& err
) {
11905 ldout(cct
, 0) << "ERROR: failed to decode olh info" << dendl
;
11913 *target
= olh
.target
;
11918 int RGWRados::raw_obj_stat(rgw_raw_obj
& obj
, uint64_t *psize
, real_time
*pmtime
, uint64_t *epoch
,
11919 map
<string
, bufferlist
> *attrs
, bufferlist
*first_chunk
,
11920 RGWObjVersionTracker
*objv_tracker
)
11923 int r
= get_raw_obj_ref(obj
, &ref
);
11928 map
<string
, bufferlist
> unfiltered_attrset
;
11930 struct timespec mtime_ts
;
11932 ObjectReadOperation op
;
11933 if (objv_tracker
) {
11934 objv_tracker
->prepare_op_for_read(&op
);
11937 op
.getxattrs(&unfiltered_attrset
, NULL
);
11939 if (psize
|| pmtime
) {
11940 op
.stat2(&size
, &mtime_ts
, NULL
);
11943 op
.read(0, cct
->_conf
->rgw_max_chunk_size
, first_chunk
, NULL
);
11946 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &outbl
);
11949 *epoch
= ref
.ioctx
.get_last_version();
11958 *pmtime
= ceph::real_clock::from_timespec(mtime_ts
);
11960 filter_attrset(unfiltered_attrset
, RGW_ATTR_PREFIX
, attrs
);
11966 int RGWRados::get_bucket_stats(RGWBucketInfo
& bucket_info
, int shard_id
, string
*bucket_ver
, string
*master_ver
,
11967 map
<RGWObjCategory
, RGWStorageStats
>& stats
, string
*max_marker
, bool *syncstopped
)
11969 map
<string
, rgw_bucket_dir_header
> headers
;
11970 map
<int, string
> bucket_instance_ids
;
11971 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
11976 assert(headers
.size() == bucket_instance_ids
.size());
11978 map
<string
, rgw_bucket_dir_header
>::iterator iter
= headers
.begin();
11979 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
11980 BucketIndexShardsManager ver_mgr
;
11981 BucketIndexShardsManager master_ver_mgr
;
11982 BucketIndexShardsManager marker_mgr
;
11984 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
11985 accumulate_raw_stats(iter
->second
, stats
);
11986 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->second
.ver
);
11987 ver_mgr
.add(viter
->first
, string(buf
));
11988 snprintf(buf
, sizeof(buf
), "%lu", (unsigned long)iter
->second
.master_ver
);
11989 master_ver_mgr
.add(viter
->first
, string(buf
));
11990 if (shard_id
>= 0) {
11991 *max_marker
= iter
->second
.max_marker
;
11993 marker_mgr
.add(viter
->first
, iter
->second
.max_marker
);
11995 if (syncstopped
!= NULL
)
11996 *syncstopped
= iter
->second
.syncstopped
;
11998 ver_mgr
.to_string(bucket_ver
);
11999 master_ver_mgr
.to_string(master_ver
);
12000 if (shard_id
< 0) {
12001 marker_mgr
.to_string(max_marker
);
12006 int RGWRados::get_bi_log_status(RGWBucketInfo
& bucket_info
, int shard_id
,
12007 map
<int, string
>& markers
)
12009 map
<string
, rgw_bucket_dir_header
> headers
;
12010 map
<int, string
> bucket_instance_ids
;
12011 int r
= cls_bucket_head(bucket_info
, shard_id
, headers
, &bucket_instance_ids
);
12015 assert(headers
.size() == bucket_instance_ids
.size());
12017 map
<string
, rgw_bucket_dir_header
>::iterator iter
= headers
.begin();
12018 map
<int, string
>::iterator viter
= bucket_instance_ids
.begin();
12020 for(; iter
!= headers
.end(); ++iter
, ++viter
) {
12021 if (shard_id
>= 0) {
12022 markers
[shard_id
] = iter
->second
.max_marker
;
12024 markers
[viter
->first
] = iter
->second
.max_marker
;
12030 class RGWGetBucketStatsContext
: public RGWGetDirHeader_CB
{
12031 RGWGetBucketStats_CB
*cb
;
12033 map
<RGWObjCategory
, RGWStorageStats
> stats
;
12039 RGWGetBucketStatsContext(RGWGetBucketStats_CB
*_cb
, uint32_t _pendings
)
12040 : cb(_cb
), pendings(_pendings
), stats(), ret_code(0), should_cb(true),
12041 lock("RGWGetBucketStatsContext") {}
12043 void handle_response(int r
, rgw_bucket_dir_header
& header
) override
{
12044 Mutex::Locker
l(lock
);
12047 accumulate_raw_stats(header
, stats
);
12052 // Are we all done?
12053 if (--pendings
== 0) {
12055 cb
->set_response(&stats
);
12057 cb
->handle_response(ret_code
);
12064 Mutex::Locker
l(lock
);
12069 int RGWRados::get_bucket_stats_async(RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetBucketStats_CB
*ctx
)
12072 RGWGetBucketStatsContext
*get_ctx
= new RGWGetBucketStatsContext(ctx
, bucket_info
.num_shards
? : 1);
12074 int r
= cls_bucket_head_async(bucket_info
, shard_id
, get_ctx
, &num_aio
);
12078 get_ctx
->unset_cb();
12085 class RGWGetUserStatsContext
: public RGWGetUserHeader_CB
{
12086 RGWGetUserStats_CB
*cb
;
12089 explicit RGWGetUserStatsContext(RGWGetUserStats_CB
* const cb
)
12092 void handle_response(int r
, cls_user_header
& header
) override
{
12093 const cls_user_stats
& hs
= header
.stats
;
12095 RGWStorageStats stats
;
12097 stats
.size
= hs
.total_bytes
;
12098 stats
.size_rounded
= hs
.total_bytes_rounded
;
12099 stats
.num_objects
= hs
.total_entries
;
12101 cb
->set_response(stats
);
12104 cb
->handle_response(r
);
12110 int RGWRados::get_user_stats(const rgw_user
& user
, RGWStorageStats
& stats
)
12112 string user_str
= user
.to_str();
12114 cls_user_header header
;
12115 int r
= cls_user_get_header(user_str
, &header
);
12119 const cls_user_stats
& hs
= header
.stats
;
12121 stats
.size
= hs
.total_bytes
;
12122 stats
.size_rounded
= hs
.total_bytes_rounded
;
12123 stats
.num_objects
= hs
.total_entries
;
12128 int RGWRados::get_user_stats_async(const rgw_user
& user
, RGWGetUserStats_CB
*ctx
)
12130 string user_str
= user
.to_str();
12132 RGWGetUserStatsContext
*get_ctx
= new RGWGetUserStatsContext(ctx
);
12133 int r
= cls_user_get_header_async(user_str
, get_ctx
);
12143 void RGWRados::get_bucket_meta_oid(const rgw_bucket
& bucket
, string
& oid
)
12145 oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ bucket
.get_key(':');
12148 void RGWRados::get_bucket_instance_obj(const rgw_bucket
& bucket
, rgw_raw_obj
& obj
)
12150 if (!bucket
.oid
.empty()) {
12151 obj
.init(get_zone_params().domain_root
, bucket
.oid
);
12154 get_bucket_meta_oid(bucket
, oid
);
12155 obj
.init(get_zone_params().domain_root
, oid
);
12159 int RGWRados::get_bucket_instance_info(RGWObjectCtx
& obj_ctx
, const string
& meta_key
, RGWBucketInfo
& info
,
12160 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
12162 size_t pos
= meta_key
.find(':');
12163 if (pos
== string::npos
) {
12166 string oid
= RGW_BUCKET_INSTANCE_MD_PREFIX
+ meta_key
;
12167 rgw_bucket_instance_key_to_oid(oid
);
12169 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
12172 int RGWRados::get_bucket_instance_info(RGWObjectCtx
& obj_ctx
, const rgw_bucket
& bucket
, RGWBucketInfo
& info
,
12173 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
12176 if (bucket
.oid
.empty()) {
12177 get_bucket_meta_oid(bucket
, oid
);
12182 return get_bucket_instance_from_oid(obj_ctx
, oid
, info
, pmtime
, pattrs
);
12185 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx
& obj_ctx
, const string
& oid
, RGWBucketInfo
& info
,
12186 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
,
12187 rgw_cache_entry_info
*cache_info
,
12188 boost::optional
<obj_version
> refresh_version
)
12190 ldout(cct
, 20) << "reading from " << get_zone_params().domain_root
<< ":" << oid
<< dendl
;
12194 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
,
12195 oid
, epbl
, &info
.objv_tracker
, pmtime
, pattrs
,
12196 cache_info
, refresh_version
);
12201 bufferlist::iterator iter
= epbl
.begin();
12203 ::decode(info
, iter
);
12204 } catch (buffer::error
& err
) {
12205 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
12208 info
.bucket
.oid
= oid
;
12212 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx
& obj_ctx
,
12213 const string
& tenant_name
,
12214 const string
& bucket_name
,
12215 RGWBucketEntryPoint
& entry_point
,
12216 RGWObjVersionTracker
*objv_tracker
,
12218 map
<string
, bufferlist
> *pattrs
,
12219 rgw_cache_entry_info
*cache_info
,
12220 boost::optional
<obj_version
> refresh_version
)
12223 string bucket_entry
;
12225 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
12226 int ret
= rgw_get_system_obj(this, obj_ctx
, get_zone_params().domain_root
,
12227 bucket_entry
, bl
, objv_tracker
, pmtime
, pattrs
,
12228 cache_info
, refresh_version
);
12233 bufferlist::iterator iter
= bl
.begin();
12235 ::decode(entry_point
, iter
);
12236 } catch (buffer::error
& err
) {
12237 ldout(cct
, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl
;
12243 int RGWRados::convert_old_bucket_info(RGWObjectCtx
& obj_ctx
,
12244 const string
& tenant_name
,
12245 const string
& bucket_name
)
12247 RGWBucketEntryPoint entry_point
;
12248 real_time ep_mtime
;
12249 RGWObjVersionTracker ot
;
12250 map
<string
, bufferlist
> attrs
;
12251 RGWBucketInfo info
;
12253 ldout(cct
, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name
<< dendl
;
12255 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant_name
, bucket_name
, entry_point
, &ot
, &ep_mtime
, &attrs
);
12257 ldout(cct
, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret
<< " bucket=" << bucket_name
<< dendl
;
12261 if (!entry_point
.has_bucket_info
) {
12262 /* already converted! */
12266 info
= entry_point
.old_bucket_info
;
12267 info
.bucket
.oid
= bucket_name
;
12268 info
.ep_objv
= ot
.read_version
;
12270 ot
.generate_new_write_ver(cct
);
12272 ret
= put_linked_bucket_info(info
, false, ep_mtime
, &ot
.write_version
, &attrs
, true);
12274 ldout(cct
, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret
<< dendl
;
12281 int RGWRados::_get_bucket_info(RGWObjectCtx
& obj_ctx
,
12282 const string
& tenant
,
12283 const string
& bucket_name
,
12284 RGWBucketInfo
& info
,
12286 map
<string
, bufferlist
> *pattrs
,
12287 boost::optional
<obj_version
> refresh_version
)
12289 bucket_info_entry e
;
12290 string bucket_entry
;
12291 rgw_make_bucket_entry_name(tenant
, bucket_name
, bucket_entry
);
12294 if (binfo_cache
->find(bucket_entry
, &e
)) {
12295 if (refresh_version
&&
12296 e
.info
.objv_tracker
.read_version
.compare(&(*refresh_version
))) {
12297 lderr(cct
) << "WARNING: The bucket info cache is inconsistent. This is "
12298 << "a failure that should be debugged. I am a nice machine, "
12299 << "so I will try to recover." << dendl
;
12300 binfo_cache
->invalidate(bucket_entry
);
12310 RGWBucketEntryPoint entry_point
;
12311 real_time ep_mtime
;
12312 RGWObjVersionTracker ot
;
12313 rgw_cache_entry_info entry_cache_info
;
12314 int ret
= get_bucket_entrypoint_info(obj_ctx
, tenant
, bucket_name
,
12315 entry_point
, &ot
, &ep_mtime
, pattrs
,
12316 &entry_cache_info
, refresh_version
);
12318 /* only init these fields */
12319 info
.bucket
.tenant
= tenant
;
12320 info
.bucket
.name
= bucket_name
;
12324 if (entry_point
.has_bucket_info
) {
12325 info
= entry_point
.old_bucket_info
;
12326 info
.bucket
.oid
= bucket_name
;
12327 info
.bucket
.tenant
= tenant
;
12328 info
.ep_objv
= ot
.read_version
;
12329 ldout(cct
, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info
.bucket
<< " owner " << info
.owner
<< dendl
;
12333 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12340 ldout(cct
, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point
.bucket
<< dendl
;
12343 /* read bucket instance info */
12346 get_bucket_meta_oid(entry_point
.bucket
, oid
);
12348 rgw_cache_entry_info cache_info
;
12350 ret
= get_bucket_instance_from_oid(obj_ctx
, oid
, e
.info
, &e
.mtime
, &e
.attrs
,
12351 &cache_info
, refresh_version
);
12352 e
.info
.ep_objv
= ot
.read_version
;
12355 lderr(cct
) << "ERROR: get_bucket_instance_from_oid failed: " << ret
<< dendl
;
12356 info
.bucket
.tenant
= tenant
;
12357 info
.bucket
.name
= bucket_name
;
12358 // XXX and why return anything in case of an error anyway?
12367 list
<rgw_cache_entry_info
*> cache_info_entries
;
12368 cache_info_entries
.push_back(&entry_cache_info
);
12369 cache_info_entries
.push_back(&cache_info
);
12372 /* chain to both bucket entry point and bucket instance */
12373 if (!binfo_cache
->put(this, bucket_entry
, &e
, cache_info_entries
)) {
12374 ldout(cct
, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl
;
12377 if (refresh_version
&&
12378 refresh_version
->compare(&info
.objv_tracker
.read_version
)) {
12379 lderr(cct
) << "WARNING: The OSD has the same version I have. Something may "
12380 << "have gone squirrelly. An administrator may have forced a "
12381 << "change; otherwise there is a problem somewhere." << dendl
;
12387 int RGWRados::get_bucket_info(RGWObjectCtx
& obj_ctx
,
12388 const string
& tenant
, const string
& bucket_name
,
12389 RGWBucketInfo
& info
,
12390 real_time
*pmtime
, map
<string
, bufferlist
> *pattrs
)
12392 return _get_bucket_info(obj_ctx
, tenant
, bucket_name
, info
, pmtime
,
12393 pattrs
, boost::none
);
12396 int RGWRados::try_refresh_bucket_info(RGWBucketInfo
& info
,
12397 ceph::real_time
*pmtime
,
12398 map
<string
, bufferlist
> *pattrs
)
12400 RGWObjectCtx
obj_ctx(this);
12402 return _get_bucket_info(obj_ctx
, info
.bucket
.tenant
, info
.bucket
.name
,
12403 info
, pmtime
, pattrs
, info
.objv_tracker
.read_version
);
12406 int RGWRados::put_bucket_entrypoint_info(const string
& tenant_name
, const string
& bucket_name
, RGWBucketEntryPoint
& entry_point
,
12407 bool exclusive
, RGWObjVersionTracker
& objv_tracker
, real_time mtime
,
12408 map
<string
, bufferlist
> *pattrs
)
12411 ::encode(entry_point
, epbl
);
12412 string bucket_entry
;
12413 rgw_make_bucket_entry_name(tenant_name
, bucket_name
, bucket_entry
);
12414 return rgw_bucket_store_info(this, bucket_entry
, epbl
, exclusive
, pattrs
, &objv_tracker
, mtime
);
12417 int RGWRados::put_bucket_instance_info(RGWBucketInfo
& info
, bool exclusive
,
12418 real_time mtime
, map
<string
, bufferlist
> *pattrs
)
12420 info
.has_instance_obj
= true;
12423 ::encode(info
, bl
);
12425 string key
= info
.bucket
.get_key(); /* when we go through meta api, we don't use oid directly */
12426 int ret
= rgw_bucket_instance_store_info(this, key
, bl
, exclusive
, pattrs
, &info
.objv_tracker
, mtime
);
12427 if (ret
== -EEXIST
) {
12428 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12429 * bucket operation on this specific bucket (e.g., being synced from the master), but
12430 * since bucket instace meta object is unique for this specific bucket instace, we don't
12431 * need to return an error.
12432 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12433 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12434 * locally, while in the sync thread we sync the new bucket.
12441 int RGWRados::put_linked_bucket_info(RGWBucketInfo
& info
, bool exclusive
, real_time mtime
, obj_version
*pep_objv
,
12442 map
<string
, bufferlist
> *pattrs
, bool create_entry_point
)
12444 bool create_head
= !info
.has_instance_obj
|| create_entry_point
;
12446 int ret
= put_bucket_instance_info(info
, exclusive
, mtime
, pattrs
);
12452 return 0; /* done! */
12454 RGWBucketEntryPoint entry_point
;
12455 entry_point
.bucket
= info
.bucket
;
12456 entry_point
.owner
= info
.owner
;
12457 entry_point
.creation_time
= info
.creation_time
;
12458 entry_point
.linked
= true;
12459 RGWObjVersionTracker ot
;
12460 if (pep_objv
&& !pep_objv
->tag
.empty()) {
12461 ot
.write_version
= *pep_objv
;
12463 ot
.generate_new_write_ver(cct
);
12465 *pep_objv
= ot
.write_version
;
12468 ret
= put_bucket_entrypoint_info(info
.bucket
.tenant
, info
.bucket
.name
, entry_point
, exclusive
, ot
, mtime
, NULL
);
12475 int RGWRados::omap_get_vals(rgw_raw_obj
& obj
, bufferlist
& header
, const string
& marker
, uint64_t count
, std::map
<string
, bufferlist
>& m
)
12478 int r
= get_raw_obj_ref(obj
, &ref
);
12483 r
= ref
.ioctx
.omap_get_vals(ref
.oid
, marker
, count
, &m
);
12491 int RGWRados::omap_get_all(rgw_raw_obj
& obj
, bufferlist
& header
,
12492 std::map
<string
, bufferlist
>& m
)
12495 int r
= get_raw_obj_ref(obj
, &ref
);
12500 #define MAX_OMAP_GET_ENTRIES 1024
12501 const int count
= MAX_OMAP_GET_ENTRIES
;
12502 string start_after
;
12505 std::map
<string
, bufferlist
> t
;
12506 r
= ref
.ioctx
.omap_get_vals(ref
.oid
, start_after
, count
, &t
);
12513 start_after
= t
.rbegin()->first
;
12514 m
.insert(t
.begin(), t
.end());
12519 int RGWRados::omap_set(rgw_raw_obj
& obj
, const std::string
& key
, bufferlist
& bl
)
12522 int r
= get_raw_obj_ref(obj
, &ref
);
12526 ldout(cct
, 15) << "omap_set obj=" << obj
<< " key=" << key
<< dendl
;
12528 map
<string
, bufferlist
> m
;
12531 r
= ref
.ioctx
.omap_set(ref
.oid
, m
);
12536 int RGWRados::omap_set(rgw_raw_obj
& obj
, std::map
<std::string
, bufferlist
>& m
)
12539 int r
= get_raw_obj_ref(obj
, &ref
);
12544 r
= ref
.ioctx
.omap_set(ref
.oid
, m
);
12549 int RGWRados::omap_del(rgw_raw_obj
& obj
, const std::string
& key
)
12552 int r
= get_raw_obj_ref(obj
, &ref
);
12560 r
= ref
.ioctx
.omap_rm_keys(ref
.oid
, k
);
12564 int RGWRados::update_containers_stats(map
<string
, RGWBucketEnt
>& m
)
12566 RGWObjectCtx
obj_ctx(this);
12568 map
<string
, RGWBucketEnt
>::iterator iter
;
12569 for (iter
= m
.begin(); iter
!= m
.end(); ++iter
) {
12570 RGWBucketEnt
& ent
= iter
->second
;
12571 rgw_bucket
& bucket
= ent
.bucket
;
12574 ent
.size_rounded
= 0;
12576 map
<string
, rgw_bucket_dir_header
> headers
;
12578 RGWBucketInfo bucket_info
;
12579 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
12584 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
12588 map
<string
, rgw_bucket_dir_header
>::iterator hiter
= headers
.begin();
12589 for (; hiter
!= headers
.end(); ++hiter
) {
12590 RGWObjCategory category
= main_category
;
12591 map
<uint8_t, struct rgw_bucket_category_stats
>::iterator iter
= (hiter
->second
.stats
).find((uint8_t)category
);
12592 if (iter
!= hiter
->second
.stats
.end()) {
12593 struct rgw_bucket_category_stats
& stats
= iter
->second
;
12594 ent
.count
+= stats
.num_entries
;
12595 ent
.size
+= stats
.total_size
;
12596 ent
.size_rounded
+= stats
.total_size_rounded
;
12600 // fill in placement_rule from the bucket instance for use in swift's
12601 // per-storage policy statistics
12602 ent
.placement_rule
= std::move(bucket_info
.placement_rule
);
12608 int RGWRados::append_async(rgw_raw_obj
& obj
, size_t size
, bufferlist
& bl
)
12611 int r
= get_raw_obj_ref(obj
, &ref
);
12615 librados::Rados
*rad
= get_rados_handle();
12616 librados::AioCompletion
*completion
= rad
->aio_create_completion(NULL
, NULL
, NULL
);
12618 r
= ref
.ioctx
.aio_append(ref
.oid
, completion
, bl
, size
);
12619 completion
->release();
12623 int RGWRados::distribute(const string
& key
, bufferlist
& bl
)
12626 * we were called before watch was initialized. This can only happen if we're updating some system
12627 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12628 * objects, they're currently only read on startup anyway.
12630 if (!watch_initialized
)
12634 pick_control_oid(key
, notify_oid
);
12636 ldout(cct
, 10) << "distributing notification oid=" << notify_oid
<< " bl.length()=" << bl
.length() << dendl
;
12637 return control_pool_ctx
.notify2(notify_oid
, bl
, 0, NULL
);
12640 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, RGWPoolIterCtx
& ctx
)
12642 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
12643 librados::NObjectIterator
& iter
= ctx
.iter
;
12645 int r
= open_pool_ctx(pool
, io_ctx
);
12649 iter
= io_ctx
.nobjects_begin();
12654 int RGWRados::pool_iterate_begin(const rgw_pool
& pool
, const string
& cursor
, RGWPoolIterCtx
& ctx
)
12656 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
12657 librados::NObjectIterator
& iter
= ctx
.iter
;
12659 int r
= open_pool_ctx(pool
, io_ctx
);
12663 librados::ObjectCursor oc
;
12664 if (!oc
.from_str(cursor
)) {
12665 ldout(cct
, 10) << "failed to parse cursor: " << cursor
<< dendl
;
12670 iter
= io_ctx
.nobjects_begin(oc
);
12672 } catch (const std::system_error
& e
) {
12673 r
= -e
.code().value();
12674 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
12675 << ", returning " << r
<< dendl
;
12677 } catch (const std::exception
& e
) {
12678 ldout(cct
, 10) << "nobjects_begin threw " << e
.what()
12679 << ", returning -5" << dendl
;
12684 string
RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx
& ctx
)
12686 return ctx
.iter
.get_cursor().to_str();
12689 static int do_pool_iterate(CephContext
* cct
, RGWPoolIterCtx
& ctx
, uint32_t num
,
12690 vector
<rgw_bucket_dir_entry
>& objs
,
12691 bool *is_truncated
, RGWAccessListFilter
*filter
)
12693 librados::IoCtx
& io_ctx
= ctx
.io_ctx
;
12694 librados::NObjectIterator
& iter
= ctx
.iter
;
12696 if (iter
== io_ctx
.nobjects_end())
12701 for (i
= 0; i
< num
&& iter
!= io_ctx
.nobjects_end(); ++i
, ++iter
) {
12702 rgw_bucket_dir_entry e
;
12704 string oid
= iter
->get_oid();
12705 ldout(cct
, 20) << "RGWRados::pool_iterate: got " << oid
<< dendl
;
12707 // fill it in with initial values; we may correct later
12708 if (filter
&& !filter
->filter(oid
, oid
))
12716 *is_truncated
= (iter
!= io_ctx
.nobjects_end());
12718 return objs
.size();
12720 struct RGWAccessListFilterPrefix
: public RGWAccessListFilter
{
12723 explicit RGWAccessListFilterPrefix(const string
& _prefix
) : prefix(_prefix
) {}
12724 bool filter(string
& name
, string
& key
) override
{
12725 return (prefix
.compare(key
.substr(0, prefix
.size())) == 0);
12729 int RGWRados::pool_iterate(RGWPoolIterCtx
& ctx
, uint32_t num
, vector
<rgw_bucket_dir_entry
>& objs
,
12730 bool *is_truncated
, RGWAccessListFilter
*filter
)
12732 // catch exceptions from NObjectIterator::operator++()
12734 return do_pool_iterate(cct
, ctx
, num
, objs
, is_truncated
, filter
);
12735 } catch (const std::system_error
& e
) {
12736 int r
= -e
.code().value();
12737 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
12738 << ", returning " << r
<< dendl
;
12740 } catch (const std::exception
& e
) {
12741 ldout(cct
, 10) << "NObjectIterator threw exception " << e
.what()
12742 << ", returning -5" << dendl
;
12747 int RGWRados::list_raw_objects_init(const rgw_pool
& pool
, const string
& marker
, RGWListRawObjsCtx
*ctx
)
12749 if (!ctx
->initialized
) {
12750 int r
= pool_iterate_begin(pool
, marker
, ctx
->iter_ctx
);
12752 ldout(cct
, 10) << "failed to list objects pool_iterate_begin() returned r=" << r
<< dendl
;
12755 ctx
->initialized
= true;
12760 int RGWRados::list_raw_objects_next(const string
& prefix_filter
, int max
,
12761 RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
12762 bool *is_truncated
)
12764 if (!ctx
.initialized
) {
12767 RGWAccessListFilterPrefix
filter(prefix_filter
);
12768 vector
<rgw_bucket_dir_entry
> objs
;
12769 int r
= pool_iterate(ctx
.iter_ctx
, max
, objs
, is_truncated
, &filter
);
12772 ldout(cct
, 10) << "failed to list objects pool_iterate returned r=" << r
<< dendl
;
12776 vector
<rgw_bucket_dir_entry
>::iterator iter
;
12777 for (iter
= objs
.begin(); iter
!= objs
.end(); ++iter
) {
12778 oids
.push_back(iter
->key
.name
);
12781 return oids
.size();
12784 int RGWRados::list_raw_objects(const rgw_pool
& pool
, const string
& prefix_filter
,
12785 int max
, RGWListRawObjsCtx
& ctx
, list
<string
>& oids
,
12786 bool *is_truncated
)
12788 if (!ctx
.initialized
) {
12789 int r
= list_raw_objects_init(pool
, string(), &ctx
);
12795 return list_raw_objects_next(prefix_filter
, max
, ctx
, oids
, is_truncated
);
12798 string
RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx
& ctx
)
12800 return pool_iterate_get_cursor(ctx
.iter_ctx
);
12803 int RGWRados::list_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& marker
, uint32_t max
,
12804 std::list
<rgw_bi_log_entry
>& result
, bool *truncated
)
12806 ldout(cct
, 20) << __func__
<< ": " << bucket_info
.bucket
<< " marker " << marker
<< " shard_id=" << shard_id
<< " max " << max
<< dendl
;
12809 librados::IoCtx index_ctx
;
12810 map
<int, string
> oids
;
12811 map
<int, cls_rgw_bi_log_list_ret
> bi_log_lists
;
12812 map
<int, string
> bucket_instance_ids
;
12813 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
, &bucket_instance_ids
);
12817 BucketIndexShardsManager marker_mgr
;
12818 bool has_shards
= (oids
.size() > 1 || shard_id
>= 0);
12819 // If there are multiple shards for the bucket index object, the marker
12820 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12821 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12822 // only contain one record, and the key is the bucket instance id.
12823 r
= marker_mgr
.from_string(marker
, shard_id
);
12827 r
= CLSRGWIssueBILogList(index_ctx
, marker_mgr
, max
, oids
, bi_log_lists
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12831 map
<int, list
<rgw_bi_log_entry
>::iterator
> vcurrents
;
12832 map
<int, list
<rgw_bi_log_entry
>::iterator
> vends
;
12834 *truncated
= false;
12836 map
<int, cls_rgw_bi_log_list_ret
>::iterator miter
= bi_log_lists
.begin();
12837 for (; miter
!= bi_log_lists
.end(); ++miter
) {
12838 int shard_id
= miter
->first
;
12839 vcurrents
[shard_id
] = miter
->second
.entries
.begin();
12840 vends
[shard_id
] = miter
->second
.entries
.end();
12842 *truncated
= (*truncated
|| miter
->second
.truncated
);
12847 bool has_more
= true;
12848 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator viter
;
12849 map
<int, list
<rgw_bi_log_entry
>::iterator
>::iterator eiter
;
12850 while (total
< max
&& has_more
) {
12853 viter
= vcurrents
.begin();
12854 eiter
= vends
.begin();
12856 for (; total
< max
&& viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
12857 assert (eiter
!= vends
.end());
12859 int shard_id
= viter
->first
;
12860 list
<rgw_bi_log_entry
>::iterator
& liter
= viter
->second
;
12862 if (liter
== eiter
->second
){
12865 rgw_bi_log_entry
& entry
= *(liter
);
12868 snprintf(buf
, sizeof(buf
), "%d", shard_id
);
12870 build_bucket_index_marker(buf
, entry
.id
, &tmp_id
);
12871 entry
.id
.swap(tmp_id
);
12873 marker_mgr
.add(shard_id
, entry
.id
);
12874 result
.push_back(entry
);
12882 for (viter
= vcurrents
.begin(), eiter
= vends
.begin(); viter
!= vcurrents
.end(); ++viter
, ++eiter
) {
12883 assert (eiter
!= vends
.end());
12884 *truncated
= (*truncated
|| (viter
->second
!= eiter
->second
));
12888 // Refresh marker, if there are multiple shards, the output will look like
12889 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12890 // if there is no sharding, the simply marker (without oid) is returned
12892 marker_mgr
.to_string(&marker
);
12894 if (!result
.empty()) {
12895 marker
= result
.rbegin()->id
;
12902 int RGWRados::trim_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
, string
& start_marker
, string
& end_marker
)
12904 librados::IoCtx index_ctx
;
12905 map
<int, string
> bucket_objs
;
12907 BucketIndexShardsManager start_marker_mgr
;
12908 BucketIndexShardsManager end_marker_mgr
;
12910 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
12915 r
= start_marker_mgr
.from_string(start_marker
, shard_id
);
12920 r
= end_marker_mgr
.from_string(end_marker
, shard_id
);
12925 return CLSRGWIssueBILogTrim(index_ctx
, start_marker_mgr
, end_marker_mgr
, bucket_objs
,
12926 cct
->_conf
->rgw_bucket_index_max_aio
)();
12931 int RGWRados::resync_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
)
12933 librados::IoCtx index_ctx
;
12934 map
<int, string
> bucket_objs
;
12935 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
12939 return CLSRGWIssueResyncBucketBILog(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12942 int RGWRados::stop_bi_log_entries(RGWBucketInfo
& bucket_info
, int shard_id
)
12944 librados::IoCtx index_ctx
;
12945 map
<int, string
> bucket_objs
;
12946 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
12950 return CLSRGWIssueBucketBILogStop(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
)();
12953 int RGWRados::bi_get_instance(const RGWBucketInfo
& bucket_info
, rgw_obj
& obj
, rgw_bucket_dir_entry
*dirent
)
12956 int r
= get_obj_head_ref(bucket_info
, obj
, &ref
);
12961 rgw_cls_bi_entry bi_entry
;
12962 r
= bi_get(obj
.bucket
, obj
, InstanceIdx
, &bi_entry
);
12963 if (r
< 0 && r
!= -ENOENT
) {
12964 ldout(cct
, 0) << "ERROR: bi_get() returned r=" << r
<< dendl
;
12969 bufferlist::iterator iter
= bi_entry
.data
.begin();
12971 ::decode(*dirent
, iter
);
12972 } catch (buffer::error
& err
) {
12973 ldout(cct
, 0) << "ERROR: failed to decode bi_entry()" << dendl
;
12980 int RGWRados::bi_get(rgw_bucket
& bucket
, rgw_obj
& obj
, BIIndexType index_type
, rgw_cls_bi_entry
*entry
)
12982 BucketShard
bs(this);
12983 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
12985 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
12989 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
12991 ret
= cls_rgw_bi_get(bs
.index_ctx
, bs
.bucket_obj
, index_type
, key
, entry
);
12998 void RGWRados::bi_put(ObjectWriteOperation
& op
, BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
13000 cls_rgw_bi_put(op
, bs
.bucket_obj
, entry
);
13003 int RGWRados::bi_put(BucketShard
& bs
, rgw_cls_bi_entry
& entry
)
13005 int ret
= cls_rgw_bi_put(bs
.index_ctx
, bs
.bucket_obj
, entry
);
13012 int RGWRados::bi_put(rgw_bucket
& bucket
, rgw_obj
& obj
, rgw_cls_bi_entry
& entry
)
13014 BucketShard
bs(this);
13015 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
13017 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
13021 return bi_put(bs
, entry
);
13024 int RGWRados::bi_list(rgw_bucket
& bucket
, const string
& obj_name
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
13026 rgw_obj
obj(bucket
, obj_name
);
13027 BucketShard
bs(this);
13028 int ret
= bs
.init(bucket
, obj
, nullptr /* no RGWBucketInfo */);
13030 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
13034 ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, obj_name
, marker
, max
, entries
, is_truncated
);
13035 if (ret
== -ENOENT
) {
13036 *is_truncated
= false;
13044 int RGWRados::bi_list(BucketShard
& bs
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
13046 int ret
= cls_rgw_bi_list(bs
.index_ctx
, bs
.bucket_obj
, filter_obj
, marker
, max
, entries
, is_truncated
);
13053 int RGWRados::bi_remove(BucketShard
& bs
)
13055 int ret
= bs
.index_ctx
.remove(bs
.bucket_obj
);
13056 if (ret
== -ENOENT
) {
13060 ldout(cct
, 5) << "bs.index_ctx.remove(" << bs
.bucket_obj
<< ") returned ret=" << ret
<< dendl
;
13067 int RGWRados::bi_list(rgw_bucket
& bucket
, int shard_id
, const string
& filter_obj
, const string
& marker
, uint32_t max
, list
<rgw_cls_bi_entry
> *entries
, bool *is_truncated
)
13069 BucketShard
bs(this);
13070 int ret
= bs
.init(bucket
, shard_id
, nullptr /* no RGWBucketInfo */);
13072 ldout(cct
, 5) << "bs.init() returned ret=" << ret
<< dendl
;
13076 return bi_list(bs
, filter_obj
, marker
, max
, entries
, is_truncated
);
13079 int RGWRados::gc_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
13081 return gc_pool_ctx
.operate(oid
, op
);
13084 int RGWRados::gc_aio_operate(string
& oid
, librados::ObjectWriteOperation
*op
)
13086 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
13087 int r
= gc_pool_ctx
.aio_operate(oid
, c
, op
);
13092 int RGWRados::gc_operate(string
& oid
, librados::ObjectReadOperation
*op
, bufferlist
*pbl
)
13094 return gc_pool_ctx
.operate(oid
, op
, pbl
);
13097 int RGWRados::list_gc_objs(int *index
, string
& marker
, uint32_t max
, bool expired_only
, std::list
<cls_rgw_gc_obj_info
>& result
, bool *truncated
)
13099 return gc
->list(index
, marker
, max
, expired_only
, result
, truncated
);
13102 int RGWRados::process_gc()
13104 return gc
->process();
13107 int RGWRados::list_lc_progress(const string
& marker
, uint32_t max_entries
, map
<string
, int> *progress_map
)
13109 return lc
->list_lc_progress(marker
, max_entries
, progress_map
);
13112 int RGWRados::process_lc()
13114 return lc
->process();
13117 bool RGWRados::process_expire_objects()
13119 return obj_expirer
->inspect_all_shards(utime_t(), ceph_clock_now());
13122 int RGWRados::cls_obj_prepare_op(BucketShard
& bs
, RGWModifyOp op
, string
& tag
,
13123 rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
13125 rgw_zone_set zones_trace
;
13126 if (_zones_trace
) {
13127 zones_trace
= *_zones_trace
;
13129 zones_trace
.insert(get_zone().id
);
13131 ObjectWriteOperation o
;
13132 cls_rgw_obj_key
key(obj
.key
.get_index_key_name(), obj
.key
.instance
);
13133 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
13134 cls_rgw_bucket_prepare_op(o
, op
, tag
, key
, obj
.key
.get_loc(), get_zone().log_data
, bilog_flags
, zones_trace
);
13135 return bs
.index_ctx
.operate(bs
.bucket_obj
, &o
);
13138 int RGWRados::cls_obj_complete_op(BucketShard
& bs
, const rgw_obj
& obj
, RGWModifyOp op
, string
& tag
,
13139 int64_t pool
, uint64_t epoch
,
13140 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
13141 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*_zones_trace
)
13143 ObjectWriteOperation o
;
13144 rgw_bucket_dir_entry_meta dir_meta
;
13145 dir_meta
= ent
.meta
;
13146 dir_meta
.category
= category
;
13148 rgw_zone_set zones_trace
;
13149 if (_zones_trace
) {
13150 zones_trace
= *_zones_trace
;
13152 zones_trace
.insert(get_zone().id
);
13154 rgw_bucket_entry_ver ver
;
13157 cls_rgw_obj_key
key(ent
.key
.name
, ent
.key
.instance
);
13158 cls_rgw_guard_bucket_resharding(o
, -ERR_BUSY_RESHARDING
);
13159 cls_rgw_bucket_complete_op(o
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
13160 get_zone().log_data
, bilog_flags
, &zones_trace
);
13161 complete_op_data
*arg
;
13162 index_completion_manager
->create_completion(obj
, op
, tag
, ver
, key
, dir_meta
, remove_objs
,
13163 get_zone().log_data
, bilog_flags
, &zones_trace
, &arg
);
13164 librados::AioCompletion
*completion
= arg
->rados_completion
;
13165 int ret
= bs
.index_ctx
.aio_operate(bs
.bucket_obj
, arg
->rados_completion
, &o
);
13166 completion
->release(); /* can't reference arg here, as it might have already been released */
13170 int RGWRados::cls_obj_complete_add(BucketShard
& bs
, const rgw_obj
& obj
, string
& tag
,
13171 int64_t pool
, uint64_t epoch
,
13172 rgw_bucket_dir_entry
& ent
, RGWObjCategory category
,
13173 list
<rgw_obj_index_key
> *remove_objs
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
13175 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_ADD
, tag
, pool
, epoch
, ent
, category
, remove_objs
, bilog_flags
, zones_trace
);
13178 int RGWRados::cls_obj_complete_del(BucketShard
& bs
, string
& tag
,
13179 int64_t pool
, uint64_t epoch
,
13181 real_time
& removed_mtime
,
13182 list
<rgw_obj_index_key
> *remove_objs
,
13183 uint16_t bilog_flags
,
13184 rgw_zone_set
*zones_trace
)
13186 rgw_bucket_dir_entry ent
;
13187 ent
.meta
.mtime
= removed_mtime
;
13188 obj
.key
.get_index_key(&ent
.key
);
13189 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_DEL
, tag
, pool
, epoch
, ent
, RGW_OBJ_CATEGORY_NONE
, remove_objs
, bilog_flags
, zones_trace
);
13192 int RGWRados::cls_obj_complete_cancel(BucketShard
& bs
, string
& tag
, rgw_obj
& obj
, uint16_t bilog_flags
, rgw_zone_set
*zones_trace
)
13194 rgw_bucket_dir_entry ent
;
13195 obj
.key
.get_index_key(&ent
.key
);
13196 return cls_obj_complete_op(bs
, obj
, CLS_RGW_OP_CANCEL
, tag
, -1 /* pool id */, 0, ent
, RGW_OBJ_CATEGORY_NONE
, NULL
, bilog_flags
, zones_trace
);
13199 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo
& bucket_info
, uint64_t timeout
)
13201 librados::IoCtx index_ctx
;
13202 map
<int, string
> bucket_objs
;
13203 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
);
13207 return CLSRGWIssueSetTagTimeout(index_ctx
, bucket_objs
, cct
->_conf
->rgw_bucket_index_max_aio
, timeout
)();
13211 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo
& bucket_info
,
13213 rgw_obj_index_key
& start
,
13214 const string
& prefix
,
13215 uint32_t num_entries
,
13216 bool list_versions
,
13217 map
<string
, rgw_bucket_dir_entry
>& m
,
13218 bool *is_truncated
,
13219 rgw_obj_index_key
*last_entry
,
13220 bool (*force_check_filter
)(const string
& name
))
13222 ldout(cct
, 10) << "cls_bucket_list_ordered " << bucket_info
.bucket
<<
13223 " start " << start
.name
<< "[" << start
.instance
<< "] num_entries " <<
13224 num_entries
<< dendl
;
13226 librados::IoCtx index_ctx
;
13227 // key - oid (for different shards if there is any)
13228 // value - list result for the corresponding oid (shard), it is filled by
13229 // the AIO callback
13230 map
<int, string
> oids
;
13231 map
<int, struct rgw_cls_list_ret
> list_results
;
13232 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
13236 cls_rgw_obj_key
start_key(start
.name
, start
.instance
);
13237 r
= CLSRGWIssueBucketList(index_ctx
, start_key
, prefix
, num_entries
,
13238 list_versions
, oids
, list_results
,
13239 cct
->_conf
->rgw_bucket_index_max_aio
)();
13243 // Create a list of iterators that are used to iterate each shard
13244 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vcurrents(list_results
.size());
13245 vector
<map
<string
, struct rgw_bucket_dir_entry
>::iterator
> vends(list_results
.size());
13246 vector
<string
> vnames(list_results
.size());
13247 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
13248 *is_truncated
= false;
13249 for (; iter
!= list_results
.end(); ++iter
) {
13250 vcurrents
.push_back(iter
->second
.dir
.m
.begin());
13251 vends
.push_back(iter
->second
.dir
.m
.end());
13252 vnames
.push_back(oids
[iter
->first
]);
13253 *is_truncated
= (*is_truncated
|| iter
->second
.is_truncated
);
13256 // Create a map to track the next candidate entry from each shard, if the entry
13257 // from a specified shard is selected/erased, the next entry from that shard will
13258 // be inserted for next round selection
13259 map
<string
, size_t> candidates
;
13260 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
13261 if (vcurrents
[i
] != vends
[i
]) {
13262 candidates
[vcurrents
[i
]->first
] = i
;
13266 map
<string
, bufferlist
> updates
;
13267 uint32_t count
= 0;
13268 while (count
< num_entries
&& !candidates
.empty()) {
13270 // Select the next one
13271 int pos
= candidates
.begin()->second
;
13272 const string
& name
= vcurrents
[pos
]->first
;
13273 struct rgw_bucket_dir_entry
& dirent
= vcurrents
[pos
]->second
;
13275 bool force_check
= force_check_filter
&&
13276 force_check_filter(dirent
.key
.name
);
13277 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) ||
13278 !dirent
.pending_map
.empty() ||
13280 /* there are uncommitted ops. We need to check the current state,
13281 * and if the tags are old we need to do cleanup as well. */
13282 librados::IoCtx sub_ctx
;
13283 sub_ctx
.dup(index_ctx
);
13284 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
,
13285 updates
[vnames
[pos
]]);
13286 if (r
< 0 && r
!= -ENOENT
) {
13291 ldout(cct
, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
13292 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
13293 m
[name
] = std::move(dirent
);
13297 // Refresh the candidates map
13298 candidates
.erase(candidates
.begin());
13300 if (vcurrents
[pos
] != vends
[pos
]) {
13301 candidates
[vcurrents
[pos
]->first
] = pos
;
13305 // Suggest updates if there is any
13306 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
13307 for (; miter
!= updates
.end(); ++miter
) {
13308 if (miter
->second
.length()) {
13309 ObjectWriteOperation o
;
13310 cls_rgw_suggest_changes(o
, miter
->second
);
13311 // we don't care if we lose suggested updates, send them off blindly
13312 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
13313 index_ctx
.aio_operate(miter
->first
, c
, &o
);
13318 // Check if all the returned entries are consumed or not
13319 for (size_t i
= 0; i
< vcurrents
.size(); ++i
) {
13320 if (vcurrents
[i
] != vends
[i
]) {
13321 *is_truncated
= true;
13326 *last_entry
= m
.rbegin()->first
;
13332 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo
& bucket_info
,
13334 rgw_obj_index_key
& start
,
13335 const string
& prefix
,
13336 uint32_t num_entries
,
13337 bool list_versions
,
13338 std::vector
<rgw_bucket_dir_entry
>& ent_list
,
13339 bool *is_truncated
,
13340 rgw_obj_index_key
*last_entry
,
13341 bool (*force_check_filter
)(const string
& name
)) {
13342 ldout(cct
, 10) << "cls_bucket_list_unordered " << bucket_info
.bucket
<<
13343 " start " << start
.name
<< "[" << start
.instance
<<
13344 "] num_entries " << num_entries
<< dendl
;
13346 *is_truncated
= false;
13347 librados::IoCtx index_ctx
;
13349 rgw_obj_index_key my_start
= start
;
13351 map
<int, string
> oids
;
13352 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, shard_id
);
13355 const uint32_t num_shards
= oids
.size();
13357 uint32_t current_shard
;
13358 if (shard_id
>= 0) {
13359 current_shard
= shard_id
;
13360 } else if (my_start
.empty()) {
13361 current_shard
= 0u;
13364 rgw_bucket_shard_index(my_start
.name
, num_shards
);
13367 uint32_t count
= 0u;
13368 map
<string
, bufferlist
> updates
;
13369 std::string last_added_entry
;
13370 while (count
<= num_entries
&&
13371 ((shard_id
>= 0 && current_shard
== uint32_t(shard_id
)) ||
13372 current_shard
< num_shards
)) {
13373 // key - oid (for different shards if there is any)
13374 // value - list result for the corresponding oid (shard), it is filled by
13375 // the AIO callback
13376 map
<int, struct rgw_cls_list_ret
> list_results
;
13377 r
= CLSRGWIssueBucketList(index_ctx
, my_start
, prefix
, num_entries
,
13378 list_versions
, oids
, list_results
,
13379 cct
->_conf
->rgw_bucket_index_max_aio
)();
13383 const std::string
& oid
= oids
[current_shard
];
13384 assert(list_results
.find(current_shard
) != list_results
.end());
13385 auto& result
= list_results
[current_shard
];
13386 for (auto& entry
: result
.dir
.m
) {
13387 rgw_bucket_dir_entry
& dirent
= entry
.second
;
13389 bool force_check
= force_check_filter
&&
13390 force_check_filter(dirent
.key
.name
);
13391 if ((!dirent
.exists
&& !dirent
.is_delete_marker()) ||
13392 !dirent
.pending_map
.empty() ||
13394 /* there are uncommitted ops. We need to check the current state,
13395 * and if the tags are old we need to do cleanup as well. */
13396 librados::IoCtx sub_ctx
;
13397 sub_ctx
.dup(index_ctx
);
13398 r
= check_disk_state(sub_ctx
, bucket_info
, dirent
, dirent
, updates
[oid
]);
13399 if (r
< 0 && r
!= -ENOENT
) {
13404 // at this point either r >=0 or r == -ENOENT
13405 if (r
>= 0) { // i.e., if r != -ENOENT
13406 ldout(cct
, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
13407 dirent
.key
.name
<< "[" << dirent
.key
.instance
<< "]" << dendl
;
13409 if (count
< num_entries
) {
13410 last_added_entry
= entry
.first
;
13411 my_start
= dirent
.key
;
13412 ent_list
.emplace_back(std::move(dirent
));
13415 *is_truncated
= true;
13416 goto check_updates
;
13418 } else { // r == -ENOENT
13419 // in the case of -ENOENT, make sure we're advancing marker
13420 // for possible next call to CLSRGWIssueBucketList
13421 my_start
= dirent
.key
;
13423 } // entry for loop
13425 if (!result
.is_truncated
) {
13426 // if we reached the end of the shard read next shard
13428 my_start
= rgw_obj_index_key();
13433 // suggest updates if there is any
13434 map
<string
, bufferlist
>::iterator miter
= updates
.begin();
13435 for (; miter
!= updates
.end(); ++miter
) {
13436 if (miter
->second
.length()) {
13437 ObjectWriteOperation o
;
13438 cls_rgw_suggest_changes(o
, miter
->second
);
13439 // we don't care if we lose suggested updates, send them off blindly
13440 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
13441 index_ctx
.aio_operate(miter
->first
, c
, &o
);
13446 if (last_entry
&& !ent_list
.empty()) {
13447 *last_entry
= last_added_entry
;
13454 int RGWRados::cls_obj_usage_log_add(const string
& oid
,
13455 rgw_usage_log_info
& info
)
13457 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
13460 int r
= get_raw_obj_ref(obj
, &ref
);
13465 ObjectWriteOperation op
;
13466 cls_rgw_usage_log_add(op
, info
);
13468 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
13472 int RGWRados::cls_obj_usage_log_read(string
& oid
, string
& user
, uint64_t start_epoch
, uint64_t end_epoch
, uint32_t max_entries
,
13473 string
& read_iter
, map
<rgw_user_bucket
, rgw_usage_log_entry
>& usage
, bool *is_truncated
)
13475 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
13478 int r
= get_raw_obj_ref(obj
, &ref
);
13483 *is_truncated
= false;
13485 r
= cls_rgw_usage_log_read(ref
.ioctx
, ref
.oid
, user
, start_epoch
, end_epoch
,
13486 max_entries
, read_iter
, usage
, is_truncated
);
13491 int RGWRados::cls_obj_usage_log_trim(string
& oid
, string
& user
, uint64_t start_epoch
, uint64_t end_epoch
)
13493 rgw_raw_obj
obj(get_zone_params().usage_log_pool
, oid
);
13496 int r
= get_raw_obj_ref(obj
, &ref
);
13501 r
= cls_rgw_usage_log_trim(ref
.ioctx
, ref
.oid
, user
, start_epoch
, end_epoch
);
13505 int RGWRados::remove_objs_from_index(RGWBucketInfo
& bucket_info
, list
<rgw_obj_index_key
>& oid_list
)
13507 librados::IoCtx index_ctx
;
13510 uint8_t suggest_flag
= (get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
13512 int r
= open_bucket_index(bucket_info
, index_ctx
, dir_oid
);
13516 bufferlist updates
;
13518 for (auto iter
= oid_list
.begin(); iter
!= oid_list
.end(); ++iter
) {
13519 rgw_bucket_dir_entry entry
;
13521 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info
.bucket
<< " obj=" << entry
.key
.name
<< ":" << entry
.key
.instance
<< dendl
;
13522 entry
.ver
.epoch
= (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13523 updates
.append(CEPH_RGW_REMOVE
| suggest_flag
);
13524 ::encode(entry
, updates
);
13529 r
= index_ctx
.exec(dir_oid
, RGW_CLASS
, RGW_DIR_SUGGEST_CHANGES
, updates
, out
);
13534 int RGWRados::check_disk_state(librados::IoCtx io_ctx
,
13535 const RGWBucketInfo
& bucket_info
,
13536 rgw_bucket_dir_entry
& list_state
,
13537 rgw_bucket_dir_entry
& object
,
13538 bufferlist
& suggested_updates
)
13540 const rgw_bucket
& bucket
= bucket_info
.bucket
;
13541 uint8_t suggest_flag
= (get_zone().log_data
? CEPH_RGW_DIR_SUGGEST_LOG_OP
: 0);
13545 rgw_obj
obj(bucket
, list_state
.key
);
13548 get_obj_bucket_and_oid_loc(obj
, oid
, loc
);
13550 if (loc
!= list_state
.locator
) {
13551 ldout(cct
, 0) << "WARNING: generated locator (" << loc
<< ") is different from listed locator (" << list_state
.locator
<< ")" << dendl
;
13554 io_ctx
.locator_set_key(list_state
.locator
);
13556 RGWObjState
*astate
= NULL
;
13557 RGWObjectCtx
rctx(this);
13558 int r
= get_obj_state(&rctx
, bucket_info
, obj
, &astate
, false);
13562 list_state
.pending_map
.clear(); // we don't need this and it inflates size
13563 if (!astate
->exists
) {
13564 /* object doesn't exist right now -- hopefully because it's
13565 * marked as !exists and got deleted */
13566 if (list_state
.exists
) {
13567 /* FIXME: what should happen now? Work out if there are any
13568 * non-bad ways this could happen (there probably are, but annoying
13571 // encode a suggested removal of that key
13572 list_state
.ver
.epoch
= io_ctx
.get_last_version();
13573 list_state
.ver
.pool
= io_ctx
.get_id();
13574 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE
, list_state
, suggested_updates
);
13579 string content_type
;
13582 object
.meta
.size
= astate
->size
;
13583 object
.meta
.accounted_size
= astate
->accounted_size
;
13584 object
.meta
.mtime
= astate
->mtime
;
13586 map
<string
, bufferlist
>::iterator iter
= astate
->attrset
.find(RGW_ATTR_ETAG
);
13587 if (iter
!= astate
->attrset
.end()) {
13588 etag
= iter
->second
.c_str();
13590 iter
= astate
->attrset
.find(RGW_ATTR_CONTENT_TYPE
);
13591 if (iter
!= astate
->attrset
.end()) {
13592 content_type
= iter
->second
.c_str();
13594 iter
= astate
->attrset
.find(RGW_ATTR_ACL
);
13595 if (iter
!= astate
->attrset
.end()) {
13596 r
= decode_policy(iter
->second
, &owner
);
13598 dout(0) << "WARNING: could not decode policy for object: " << obj
<< dendl
;
13602 if (astate
->has_manifest
) {
13603 RGWObjManifest::obj_iterator miter
;
13604 RGWObjManifest
& manifest
= astate
->manifest
;
13605 for (miter
= manifest
.obj_begin(); miter
!= manifest
.obj_end(); ++miter
) {
13606 const rgw_raw_obj
& raw_loc
= miter
.get_location().get_raw_obj(this);
13608 rgw_raw_obj_to_obj(manifest
.get_obj().bucket
, raw_loc
, &loc
);
13610 if (loc
.key
.ns
== RGW_OBJ_NS_MULTIPART
) {
13611 dout(10) << "check_disk_state(): removing manifest part from index: " << loc
<< dendl
;
13612 r
= delete_obj_index(loc
);
13614 dout(0) << "WARNING: delete_obj_index() returned r=" << r
<< dendl
;
13620 object
.meta
.etag
= etag
;
13621 object
.meta
.content_type
= content_type
;
13622 object
.meta
.owner
= owner
.get_id().to_str();
13623 object
.meta
.owner_display_name
= owner
.get_display_name();
13625 // encode suggested updates
13626 list_state
.ver
.pool
= io_ctx
.get_id();
13627 list_state
.ver
.epoch
= astate
->epoch
;
13628 list_state
.meta
.size
= object
.meta
.size
;
13629 list_state
.meta
.accounted_size
= object
.meta
.accounted_size
;
13630 list_state
.meta
.mtime
= object
.meta
.mtime
;
13631 list_state
.meta
.category
= main_category
;
13632 list_state
.meta
.etag
= etag
;
13633 list_state
.meta
.content_type
= content_type
;
13634 if (astate
->obj_tag
.length() > 0)
13635 list_state
.tag
= astate
->obj_tag
.c_str();
13636 list_state
.meta
.owner
= owner
.get_id().to_str();
13637 list_state
.meta
.owner_display_name
= owner
.get_display_name();
13639 list_state
.exists
= true;
13640 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE
| suggest_flag
, list_state
, suggested_updates
);
13644 int RGWRados::cls_bucket_head(const RGWBucketInfo
& bucket_info
, int shard_id
, map
<string
, struct rgw_bucket_dir_header
>& headers
, map
<int, string
> *bucket_instance_ids
)
13646 librados::IoCtx index_ctx
;
13647 map
<int, string
> oids
;
13648 map
<int, struct rgw_cls_list_ret
> list_results
;
13649 int r
= open_bucket_index(bucket_info
, index_ctx
, oids
, list_results
, shard_id
, bucket_instance_ids
);
13653 r
= CLSRGWIssueGetDirHeader(index_ctx
, oids
, list_results
, cct
->_conf
->rgw_bucket_index_max_aio
)();
13657 map
<int, struct rgw_cls_list_ret
>::iterator iter
= list_results
.begin();
13658 for(; iter
!= list_results
.end(); ++iter
) {
13659 headers
[oids
[iter
->first
]] = iter
->second
.dir
.header
;
13664 int RGWRados::cls_bucket_head_async(const RGWBucketInfo
& bucket_info
, int shard_id
, RGWGetDirHeader_CB
*ctx
, int *num_aio
)
13666 librados::IoCtx index_ctx
;
13667 map
<int, string
> bucket_objs
;
13668 int r
= open_bucket_index(bucket_info
, index_ctx
, bucket_objs
, shard_id
);
13672 map
<int, string
>::iterator iter
= bucket_objs
.begin();
13673 for (; iter
!= bucket_objs
.end(); ++iter
) {
13674 r
= cls_rgw_get_dir_header_async(index_ctx
, iter
->second
, static_cast<RGWGetDirHeader_CB
*>(ctx
->get()));
13685 int RGWRados::cls_user_get_header(const string
& user_id
, cls_user_header
*header
)
13687 string buckets_obj_id
;
13688 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
13689 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
13692 int r
= get_raw_obj_ref(obj
, &ref
);
13697 librados::ObjectReadOperation op
;
13699 ::cls_user_get_header(op
, header
, &rc
);
13701 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &ibl
);
13710 int RGWRados::cls_user_reset_stats(const string
& user_id
)
13712 string buckets_obj_id
;
13713 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
13714 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
13717 int r
= get_raw_obj_ref(obj
, &ref
);
13722 librados::ObjectWriteOperation op
;
13723 ::cls_user_reset_stats(op
);
13724 return ref
.ioctx
.operate(ref
.oid
, &op
);
13727 int RGWRados::cls_user_get_header_async(const string
& user_id
, RGWGetUserHeader_CB
*ctx
)
13729 string buckets_obj_id
;
13730 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
13731 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
13734 int r
= get_raw_obj_ref(obj
, &ref
);
13739 r
= ::cls_user_get_header_async(ref
.ioctx
, ref
.oid
, ctx
);
13746 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj
& user_obj
, const RGWBucketInfo
& bucket_info
)
13748 map
<string
, struct rgw_bucket_dir_header
> headers
;
13749 int r
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
13751 ldout(cct
, 20) << "cls_bucket_header() returned " << r
<< dendl
;
13755 cls_user_bucket_entry entry
;
13757 bucket_info
.bucket
.convert(&entry
.bucket
);
13759 for (const auto& hiter
: headers
) {
13760 for (const auto& iter
: hiter
.second
.stats
) {
13761 const struct rgw_bucket_category_stats
& header_stats
= iter
.second
;
13762 entry
.size
+= header_stats
.total_size
;
13763 entry
.size_rounded
+= header_stats
.total_size_rounded
;
13764 entry
.count
+= header_stats
.num_entries
;
13768 list
<cls_user_bucket_entry
> entries
;
13769 entries
.push_back(entry
);
13771 r
= cls_user_update_buckets(user_obj
, entries
, false);
13773 ldout(cct
, 20) << "cls_user_update_buckets() returned " << r
<< dendl
;
13780 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket
& bucket
, cls_user_bucket_entry
& entry
)
13782 map
<string
, struct rgw_bucket_dir_header
> headers
;
13783 RGWBucketInfo bucket_info
;
13784 RGWObjectCtx
obj_ctx(this);
13785 int ret
= get_bucket_instance_info(obj_ctx
, bucket
, bucket_info
, NULL
, NULL
);
13790 ret
= cls_bucket_head(bucket_info
, RGW_NO_SHARD
, headers
);
13792 ldout(cct
, 20) << "cls_bucket_header() returned " << ret
<< dendl
;
13796 bucket
.convert(&entry
.bucket
);
13798 for (const auto& hiter
: headers
) {
13799 for (const auto& iter
: hiter
.second
.stats
) {
13800 const struct rgw_bucket_category_stats
& header_stats
= iter
.second
;
13801 entry
.size
+= header_stats
.total_size
;
13802 entry
.size_rounded
+= header_stats
.total_size_rounded
;
13803 entry
.count
+= header_stats
.num_entries
;
13810 int RGWRados::cls_user_list_buckets(rgw_raw_obj
& obj
,
13811 const string
& in_marker
,
13812 const string
& end_marker
,
13813 const int max_entries
,
13814 list
<cls_user_bucket_entry
>& entries
,
13815 string
* const out_marker
,
13816 bool * const truncated
)
13819 int r
= get_raw_obj_ref(obj
, &ref
);
13824 librados::ObjectReadOperation op
;
13827 cls_user_bucket_list(op
, in_marker
, end_marker
, max_entries
, entries
, out_marker
, truncated
, &rc
);
13829 r
= ref
.ioctx
.operate(ref
.oid
, &op
, &ibl
);
13838 int RGWRados::cls_user_update_buckets(rgw_raw_obj
& obj
, list
<cls_user_bucket_entry
>& entries
, bool add
)
13841 int r
= get_raw_obj_ref(obj
, &ref
);
13846 librados::ObjectWriteOperation op
;
13847 cls_user_set_buckets(op
, entries
, add
);
13848 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
13855 int RGWRados::complete_sync_user_stats(const rgw_user
& user_id
)
13857 string buckets_obj_id
;
13858 rgw_get_buckets_obj(user_id
, buckets_obj_id
);
13859 rgw_raw_obj
obj(get_zone_params().user_uid_pool
, buckets_obj_id
);
13860 return cls_user_complete_stats_sync(obj
);
13863 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj
& obj
)
13866 int r
= get_raw_obj_ref(obj
, &ref
);
13871 librados::ObjectWriteOperation op
;
13872 ::cls_user_complete_stats_sync(op
);
13873 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
13880 int RGWRados::cls_user_add_bucket(rgw_raw_obj
& obj
, const cls_user_bucket_entry
& entry
)
13882 list
<cls_user_bucket_entry
> l
;
13883 l
.push_back(entry
);
13885 return cls_user_update_buckets(obj
, l
, true);
13888 int RGWRados::cls_user_remove_bucket(rgw_raw_obj
& obj
, const cls_user_bucket
& bucket
)
13891 int r
= get_system_obj_ref(obj
, &ref
);
13896 librados::ObjectWriteOperation op
;
13897 ::cls_user_remove_bucket(op
, bucket
);
13898 r
= ref
.ioctx
.operate(ref
.oid
, &op
);
13905 int RGWRados::check_bucket_shards(const RGWBucketInfo
& bucket_info
, const rgw_bucket
& bucket
,
13906 RGWQuotaInfo
& bucket_quota
)
13908 if (!cct
->_conf
->rgw_dynamic_resharding
) {
13912 bool need_resharding
= false;
13913 int num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
13914 uint32_t suggested_num_shards
;
13916 int ret
= quota_handler
->check_bucket_shards((uint64_t)cct
->_conf
->rgw_max_objs_per_shard
,
13917 num_source_shards
, bucket_info
.owner
, bucket
, bucket_quota
,
13918 1, need_resharding
, &suggested_num_shards
);
13923 if (need_resharding
) {
13924 ldout(cct
, 20) << __func__
<< " bucket " << bucket
.name
<< " need resharding " <<
13925 " old num shards " << bucket_info
.num_shards
<< " new num shards " << suggested_num_shards
<<
13927 return add_bucket_to_reshard(bucket_info
, suggested_num_shards
);
13933 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo
& bucket_info
, uint32_t new_num_shards
)
13935 RGWReshard
reshard(this);
13937 uint32_t num_source_shards
= (bucket_info
.num_shards
> 0 ? bucket_info
.num_shards
: 1);
13939 new_num_shards
= min(new_num_shards
, get_max_bucket_shards());
13940 if (new_num_shards
<= num_source_shards
) {
13941 ldout(cct
, 20) << "not resharding bucket name=" << bucket_info
.bucket
.name
<< ", orig_num=" << num_source_shards
<< ", new_num_shards=" << new_num_shards
<< dendl
;
13945 cls_rgw_reshard_entry entry
;
13946 entry
.time
= real_clock::now();
13947 entry
.tenant
= bucket_info
.owner
.tenant
;
13948 entry
.bucket_name
= bucket_info
.bucket
.name
;
13949 entry
.bucket_id
= bucket_info
.bucket
.bucket_id
;
13950 entry
.old_num_shards
= num_source_shards
;
13951 entry
.new_num_shards
= new_num_shards
;
13953 return reshard
.add(entry
);
13956 int RGWRados::check_quota(const rgw_user
& bucket_owner
, rgw_bucket
& bucket
,
13957 RGWQuotaInfo
& user_quota
, RGWQuotaInfo
& bucket_quota
, uint64_t obj_size
)
13959 return quota_handler
->check_quota(bucket_owner
, bucket
, user_quota
, bucket_quota
, 1, obj_size
);
13962 void RGWRados::get_bucket_index_objects(const string
& bucket_oid_base
,
13963 uint32_t num_shards
,
13964 map
<int, string
>& bucket_objects
,
13967 bucket_objects
[0] = bucket_oid_base
;
13969 char buf
[bucket_oid_base
.size() + 32];
13970 if (shard_id
< 0) {
13971 for (uint32_t i
= 0; i
< num_shards
; ++i
) {
13972 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), i
);
13973 bucket_objects
[i
] = buf
;
13976 if ((uint32_t)shard_id
> num_shards
) {
13979 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
13980 bucket_objects
[shard_id
] = buf
;
13985 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo
& bucket_info
, int shard_id
, map
<int, string
> *result
)
13987 const rgw_bucket
& bucket
= bucket_info
.bucket
;
13988 string plain_id
= bucket
.name
+ ":" + bucket
.bucket_id
;
13989 if (!bucket_info
.num_shards
) {
13990 (*result
)[0] = plain_id
;
13993 if (shard_id
< 0) {
13994 for (uint32_t i
= 0; i
< bucket_info
.num_shards
; ++i
) {
13995 snprintf(buf
, sizeof(buf
), ":%d", i
);
13996 (*result
)[i
] = plain_id
+ buf
;
13999 if ((uint32_t)shard_id
> bucket_info
.num_shards
) {
14002 snprintf(buf
, sizeof(buf
), ":%d", shard_id
);
14003 (*result
)[shard_id
] = plain_id
+ buf
;
14008 int RGWRados::get_target_shard_id(const RGWBucketInfo
& bucket_info
, const string
& obj_key
,
14012 switch (bucket_info
.bucket_index_shard_hash_type
) {
14013 case RGWBucketInfo::MOD
:
14014 if (!bucket_info
.num_shards
) {
14019 uint32_t sid
= rgw_bucket_shard_index(obj_key
, bucket_info
.num_shards
);
14021 *shard_id
= (int)sid
;
14031 void RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, uint32_t num_shards
,
14032 int shard_id
, string
*bucket_obj
)
14035 // By default with no sharding, we use the bucket oid as itself
14036 (*bucket_obj
) = bucket_oid_base
;
14038 char buf
[bucket_oid_base
.size() + 32];
14039 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), shard_id
);
14040 (*bucket_obj
) = buf
;
14044 int RGWRados::get_bucket_index_object(const string
& bucket_oid_base
, const string
& obj_key
,
14045 uint32_t num_shards
, RGWBucketInfo::BIShardsHashType hash_type
, string
*bucket_obj
, int *shard_id
)
14048 switch (hash_type
) {
14049 case RGWBucketInfo::MOD
:
14051 // By default with no sharding, we use the bucket oid as itself
14052 (*bucket_obj
) = bucket_oid_base
;
14057 uint32_t sid
= rgw_bucket_shard_index(obj_key
, num_shards
);
14058 char buf
[bucket_oid_base
.size() + 32];
14059 snprintf(buf
, sizeof(buf
), "%s.%d", bucket_oid_base
.c_str(), sid
);
14060 (*bucket_obj
) = buf
;
14062 *shard_id
= (int)sid
;
14072 void RGWStateLog::oid_str(int shard
, string
& oid
) {
14073 oid
= RGW_STATELOG_OBJ_PREFIX
+ module_name
+ ".";
14075 snprintf(buf
, sizeof(buf
), "%d", shard
);
14079 int RGWStateLog::get_shard_num(const string
& object
) {
14080 uint32_t val
= ceph_str_hash_linux(object
.c_str(), object
.length());
14081 return val
% num_shards
;
14084 string
RGWStateLog::get_oid(const string
& object
) {
14085 int shard
= get_shard_num(object
);
14087 oid_str(shard
, oid
);
14091 int RGWStateLog::open_ioctx(librados::IoCtx
& ioctx
) {
14093 store
->get_log_pool(pool
);
14094 int r
= rgw_init_ioctx(store
->get_rados_handle(), pool
, ioctx
);
14096 lderr(store
->ctx()) << "ERROR: could not open rados pool" << dendl
;
14102 int RGWStateLog::store_entry(const string
& client_id
, const string
& op_id
, const string
& object
,
14103 uint32_t state
, bufferlist
*bl
, uint32_t *check_state
)
14105 if (client_id
.empty() ||
14108 ldout(store
->ctx(), 0) << "client_id / op_id / object is empty" << dendl
;
14111 librados::IoCtx ioctx
;
14112 int r
= open_ioctx(ioctx
);
14116 string oid
= get_oid(object
);
14118 librados::ObjectWriteOperation op
;
14120 cls_statelog_check_state(op
, client_id
, op_id
, object
, *check_state
);
14122 utime_t ts
= ceph_clock_now();
14124 cls_statelog_add(op
, client_id
, op_id
, object
, ts
, state
, (bl
? *bl
: nobl
));
14125 r
= ioctx
.operate(oid
, &op
);
14133 int RGWStateLog::remove_entry(const string
& client_id
, const string
& op_id
, const string
& object
)
14135 if (client_id
.empty() ||
14138 ldout(store
->ctx(), 0) << "client_id / op_id / object is empty" << dendl
;
14141 librados::IoCtx ioctx
;
14142 int r
= open_ioctx(ioctx
);
14146 string oid
= get_oid(object
);
14148 librados::ObjectWriteOperation op
;
14149 cls_statelog_remove_by_object(op
, object
, op_id
);
14150 r
= ioctx
.operate(oid
, &op
);
14158 void RGWStateLog::init_list_entries(const string
& client_id
, const string
& op_id
, const string
& object
,
14161 list_state
*state
= new list_state
;
14162 state
->client_id
= client_id
;
14163 state
->op_id
= op_id
;
14164 state
->object
= object
;
14165 if (object
.empty()) {
14166 state
->cur_shard
= 0;
14167 state
->max_shard
= num_shards
- 1;
14169 state
->cur_shard
= state
->max_shard
= get_shard_num(object
);
14171 *handle
= (void *)state
;
14174 int RGWStateLog::list_entries(void *handle
, int max_entries
,
14175 list
<cls_statelog_entry
>& entries
,
14178 list_state
*state
= static_cast<list_state
*>(handle
);
14180 librados::IoCtx ioctx
;
14181 int r
= open_ioctx(ioctx
);
14187 for (; state
->cur_shard
<= state
->max_shard
&& max_entries
> 0; ++state
->cur_shard
) {
14189 oid_str(state
->cur_shard
, oid
);
14191 librados::ObjectReadOperation op
;
14192 list
<cls_statelog_entry
> ents
;
14194 cls_statelog_list(op
, state
->client_id
, state
->op_id
, state
->object
, state
->marker
,
14195 max_entries
, ents
, &state
->marker
, &truncated
);
14197 r
= ioctx
.operate(oid
, &op
, &ibl
);
14198 if (r
== -ENOENT
) {
14203 ldout(store
->ctx(), 0) << "cls_statelog_list returned " << r
<< dendl
;
14208 state
->marker
.clear();
14211 max_entries
-= ents
.size();
14213 entries
.splice(entries
.end(), ents
);
14219 *done
= (state
->cur_shard
> state
->max_shard
);
14224 void RGWStateLog::finish_list_entries(void *handle
)
14226 list_state
*state
= static_cast<list_state
*>(handle
);
14230 void RGWStateLog::dump_entry(const cls_statelog_entry
& entry
, Formatter
*f
)
14232 f
->open_object_section("statelog_entry");
14233 f
->dump_string("client_id", entry
.client_id
);
14234 f
->dump_string("op_id", entry
.op_id
);
14235 f
->dump_string("object", entry
.object
);
14236 entry
.timestamp
.gmtime_nsec(f
->dump_stream("timestamp"));
14237 if (!dump_entry_internal(entry
, f
)) {
14238 f
->dump_int("state", entry
.state
);
14240 f
->close_section();
14243 RGWOpState::RGWOpState(RGWRados
*_store
) : RGWStateLog(_store
, _store
->ctx()->_conf
->rgw_num_zone_opstate_shards
, string("obj_opstate"))
14247 bool RGWOpState::dump_entry_internal(const cls_statelog_entry
& entry
, Formatter
*f
)
14250 switch ((OpState
)entry
.state
) {
14251 case OPSTATE_UNKNOWN
:
14254 case OPSTATE_IN_PROGRESS
:
14257 case OPSTATE_COMPLETE
:
14260 case OPSTATE_ERROR
:
14263 case OPSTATE_ABORT
:
14266 case OPSTATE_CANCELLED
:
14272 f
->dump_string("state", s
);
14276 int RGWOpState::state_from_str(const string
& s
, OpState
*state
)
14278 if (s
== "unknown") {
14279 *state
= OPSTATE_UNKNOWN
;
14280 } else if (s
== "in-progress") {
14281 *state
= OPSTATE_IN_PROGRESS
;
14282 } else if (s
== "complete") {
14283 *state
= OPSTATE_COMPLETE
;
14284 } else if (s
== "error") {
14285 *state
= OPSTATE_ERROR
;
14286 } else if (s
== "abort") {
14287 *state
= OPSTATE_ABORT
;
14288 } else if (s
== "cancelled") {
14289 *state
= OPSTATE_CANCELLED
;
14297 int RGWOpState::set_state(const string
& client_id
, const string
& op_id
, const string
& object
, OpState state
)
14299 uint32_t s
= (uint32_t)state
;
14300 return store_entry(client_id
, op_id
, object
, s
, NULL
, NULL
);
14303 int RGWOpState::renew_state(const string
& client_id
, const string
& op_id
, const string
& object
, OpState state
)
14305 uint32_t s
= (uint32_t)state
;
14306 return store_entry(client_id
, op_id
, object
, s
, NULL
, &s
);
14309 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados
*store
, const string
& cid
, const string
& oid
,
14310 const string
& obj
) : os(store
), client_id(cid
), op_id(oid
), object(obj
)
14312 cct
= store
->ctx();
14313 cur_state
= RGWOpState::OPSTATE_UNKNOWN
;
14316 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state
) {
14317 last_update
= real_clock::now();
14319 return os
.set_state(client_id
, op_id
, object
, state
);
14322 int RGWOpStateSingleOp::renew_state() {
14323 real_time now
= real_clock::now();
14325 int rate_limit_sec
= cct
->_conf
->rgw_opstate_ratelimit_sec
;
14327 if (rate_limit_sec
&& now
- last_update
< make_timespan(rate_limit_sec
)) {
14332 return os
.renew_state(client_id
, op_id
, object
, cur_state
);
14336 uint64_t RGWRados::instance_id()
14338 return get_rados_handle()->get_instance_id();
14341 uint64_t RGWRados::next_bucket_id()
14343 Mutex::Locker
l(bucket_id_lock
);
14344 return ++max_bucket_id
;
14347 RGWRados
*RGWStoreManager::init_storage_provider(CephContext
*cct
, bool use_gc_thread
, bool use_lc_thread
,
14348 bool quota_threads
, bool run_sync_thread
, bool run_reshard_thread
, bool use_cache
)
14350 RGWRados
*store
= NULL
;
14352 store
= new RGWRados
;
14354 store
= new RGWCache
<RGWRados
>;
14357 if (store
->initialize(cct
, use_gc_thread
, use_lc_thread
, quota_threads
, run_sync_thread
, run_reshard_thread
) < 0) {
14365 RGWRados
*RGWStoreManager::init_raw_storage_provider(CephContext
*cct
)
14367 RGWRados
*store
= NULL
;
14368 store
= new RGWRados
;
14370 store
->set_context(cct
);
14372 if (store
->init_rados() < 0) {
14380 void RGWStoreManager::close_storage(RGWRados
*store
)
14390 librados::Rados
* RGWRados::get_rados_handle()
14392 if (rados
.size() == 1) {
14395 handle_lock
.get_read();
14396 pthread_t id
= pthread_self();
14397 std::map
<pthread_t
, int>:: iterator it
= rados_map
.find(id
);
14399 if (it
!= rados_map
.end()) {
14400 handle_lock
.put_read();
14401 return &rados
[it
->second
];
14403 handle_lock
.put_read();
14404 handle_lock
.get_write();
14405 const uint32_t handle
= next_rados_handle
;
14406 rados_map
[id
] = handle
;
14407 if (++next_rados_handle
== rados
.size()) {
14408 next_rados_handle
= 0;
14410 handle_lock
.put_write();
14411 return &rados
[handle
];
14416 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj
& obj
, list
<librados::AioCompletion
*>& handles
)
14419 int ret
= get_raw_obj_ref(obj
, &ref
);
14421 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
14425 ObjectWriteOperation op
;
14426 list
<string
> prefixes
;
14427 cls_rgw_remove_obj(op
, prefixes
);
14429 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
14430 ret
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
14432 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
14437 handles
.push_back(c
);
14442 int RGWRados::delete_obj_aio(const rgw_obj
& obj
,
14443 RGWBucketInfo
& bucket_info
, RGWObjState
*astate
,
14444 list
<librados::AioCompletion
*>& handles
, bool keep_index_consistent
)
14447 int ret
= get_obj_head_ref(bucket_info
, obj
, &ref
);
14449 lderr(cct
) << "ERROR: failed to get obj ref with ret=" << ret
<< dendl
;
14453 if (keep_index_consistent
) {
14454 RGWRados::Bucket
bop(this, bucket_info
);
14455 RGWRados::Bucket::UpdateIndex
index_op(&bop
, obj
);
14457 ret
= index_op
.prepare(CLS_RGW_OP_DEL
, &astate
->write_tag
);
14459 lderr(cct
) << "ERROR: failed to prepare index op with ret=" << ret
<< dendl
;
14464 ObjectWriteOperation op
;
14465 list
<string
> prefixes
;
14466 cls_rgw_remove_obj(op
, prefixes
);
14468 AioCompletion
*c
= librados::Rados::aio_create_completion(NULL
, NULL
, NULL
);
14469 ret
= ref
.ioctx
.aio_operate(ref
.oid
, c
, &op
);
14471 lderr(cct
) << "ERROR: AioOperate failed with ret=" << ret
<< dendl
;
14476 handles
.push_back(c
);
14478 if (keep_index_consistent
) {
14479 ret
= delete_obj_index(obj
);
14481 lderr(cct
) << "ERROR: failed to delete obj index with ret=" << ret
<< dendl
;
14488 int rgw_compression_info_from_attrset(map
<string
, bufferlist
>& attrs
, bool& need_decompress
, RGWCompressionInfo
& cs_info
) {
14489 map
<string
, bufferlist
>::iterator value
= attrs
.find(RGW_ATTR_COMPRESSION
);
14490 if (value
!= attrs
.end()) {
14491 bufferlist::iterator bliter
= value
->second
.begin();
14493 ::decode(cs_info
, bliter
);
14494 } catch (buffer::error
& err
) {
14497 if (cs_info
.blocks
.size() == 0) {
14500 if (cs_info
.compression_type
!= "none")
14501 need_decompress
= true;
14503 need_decompress
= false;
14506 need_decompress
= false;
14511 bool RGWRados::call(std::string command
, cmdmap_t
& cmdmap
, std::string format
,
14514 if (command
== "cache list") {
14515 boost::optional
<std::string
> filter
;
14516 auto i
= cmdmap
.find("filter");
14517 if (i
!= cmdmap
.cend()) {
14518 filter
= boost::get
<std::string
>(i
->second
);
14520 std::unique_ptr
<Formatter
> f(ceph::Formatter::create(format
, "table"));
14522 f
->open_array_section("cache_entries");
14523 call_list(filter
, f
.get());
14524 f
->close_section();
14528 out
.append("Unable to create Formatter.\n");
14531 } else if (command
== "cache inspect") {
14532 std::unique_ptr
<Formatter
> f(ceph::Formatter::create(format
, "json-pretty"));
14534 const auto& target
= boost::get
<std::string
>(cmdmap
["target"]);
14535 if (call_inspect(target
, f
.get())) {
14539 out
.append(string("Unable to find entry ") + target
+ string(".\n"));
14543 out
.append("Unable to create Formatter.\n");
14546 } else if (command
== "cache erase") {
14547 const auto& target
= boost::get
<std::string
>(cmdmap
["target"]);
14548 if (call_erase(target
)) {
14551 out
.append(string("Unable to find entry ") + target
+ string(".\n"));
14554 } else if (command
== "cache zap") {
14561 void RGWRados::call_list(const boost::optional
<std::string
>&,
14567 bool RGWRados::call_inspect(const std::string
&, Formatter
*)
14572 bool RGWRados::call_erase(const std::string
&) {
14576 void RGWRados::call_zap() {