]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
bump version to 12.2.12-pve1
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
8#include <boost/algorithm/string.hpp>
9
10#include <boost/format.hpp>
11#include <boost/optional.hpp>
12#include <boost/utility/in_place_factory.hpp>
13
14#include "common/ceph_json.h"
15#include "common/utf8.h"
16
17#include "common/errno.h"
18#include "common/Formatter.h"
19#include "common/Throttle.h"
20#include "common/Finisher.h"
21
22#include "rgw_rados.h"
23#include "rgw_cache.h"
24#include "rgw_acl.h"
25#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26#include "rgw_metadata.h"
27#include "rgw_bucket.h"
28#include "rgw_rest_conn.h"
29#include "rgw_cr_rados.h"
30#include "rgw_cr_rest.h"
31
32#include "cls/rgw/cls_rgw_ops.h"
33#include "cls/rgw/cls_rgw_types.h"
34#include "cls/rgw/cls_rgw_client.h"
35#include "cls/rgw/cls_rgw_const.h"
36#include "cls/refcount/cls_refcount_client.h"
37#include "cls/version/cls_version_client.h"
38#include "cls/log/cls_log_client.h"
39#include "cls/statelog/cls_statelog_client.h"
40#include "cls/timeindex/cls_timeindex_client.h"
41#include "cls/lock/cls_lock_client.h"
42#include "cls/user/cls_user_client.h"
c07f9fc5 43#include "osd/osd_types.h"
7c673cae
FG
44
45#include "rgw_tools.h"
46#include "rgw_coroutine.h"
47#include "rgw_compression.h"
48
7c673cae
FG
49#undef fork // fails to compile RGWPeriod::fork() below
50
51#include "common/Clock.h"
52
53#include "include/rados/librados.hpp"
54using namespace librados;
55
56#include <string>
57#include <iostream>
58#include <vector>
59#include <atomic>
60#include <list>
61#include <map>
62#include "auth/Crypto.h" // get_random_bytes()
63
64#include "rgw_log.h"
65
66#include "rgw_gc.h"
67#include "rgw_lc.h"
68
69#include "rgw_object_expirer_core.h"
70#include "rgw_sync.h"
71#include "rgw_data_sync.h"
72#include "rgw_realm_watcher.h"
31f18b77 73#include "rgw_reshard.h"
7c673cae
FG
74
75#include "compressor/Compressor.h"
76
7c673cae
FG
77#define dout_context g_ceph_context
78#define dout_subsys ceph_subsys_rgw
79
80using namespace std;
81
// File-scope string constants and macros. Several of the strings below are
// handed out by the get_*_oid_prefix() / get_default_oid() accessors defined
// later in this file and concatenated with a name or id to form object ids.
82static string notify_oid_prefix = "notify";
83static string *notify_oids = NULL;
84static string shadow_ns = "shadow";
85static string dir_oid_prefix = ".dir.";
86static string default_storage_pool_suffix = "rgw.buckets.data";
87static string default_bucket_index_pool_suffix = "rgw.buckets.index";
88static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
89static string avail_pools = ".pools.avail";
90
91static string zone_info_oid_prefix = "zone_info.";
92static string zone_names_oid_prefix = "zone_names.";
93static string region_info_oid_prefix = "region_info.";
94static string zone_group_info_oid_prefix = "zonegroup_info.";
95static string realm_names_oid_prefix = "realms_names.";
96static string realm_info_oid_prefix = "realms.";
97static string default_region_info_oid = "default.region";
98static string default_zone_group_info_oid = "default.zonegroup";
99static string period_info_oid_prefix = "periods.";
100static string period_latest_epoch_info_oid = ".latest_epoch";
101static string region_map_oid = "region_map";
102static string zonegroup_map_oid = "zonegroup_map";
103static string log_lock_name = "rgw_log_lock";
104static string default_realm_info_oid = "default.realm";
// The next two are not 'static': they have external linkage.
105const string default_zonegroup_name = "default";
106const string default_zone_name = "default";
107static string zonegroup_names_oid_prefix = "zonegroups_names.";
108static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
109#define RGW_USAGE_OBJ_PREFIX "usage."
110#define FIRST_EPOCH 1
// Fallback root pool names. The zonegroup/realm/period variants are used by
// the get_pool() helpers below when the matching rgw_*_root_pool config
// option is empty (the zone variant is not referenced in this part of the file).
111static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
112static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
113static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
114static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116#define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118#define dout_subsys ceph_subsys_rgw
119
120
121static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123{
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
31f18b77 133 *pool = placement.get_data_extra_pool();
7c673cae
FG
134 }
135 }
136
137 return true;
138}
139
140static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142{
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146}
147
148rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149{
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156}
157
158rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159{
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166}
167
168int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169{
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
28e407b8
AA
173 if (r == -ERANGE) {
174 dout(0)
175 << __func__
176 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
177 << " (this can be due to a pool or placement group misconfiguration, e.g."
178 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
179 << dendl;
180 }
7c673cae
FG
181 if (r < 0 && r != -EEXIST) {
182 return r;
183 }
184
185 r = rados->ioctx_create(pool.name.c_str(), ioctx);
c07f9fc5
FG
186 if (r < 0) {
187 return r;
188 }
189
190 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
191 if (r < 0 && r != -EOPNOTSUPP) {
192 return r;
193 }
194 } else if (r < 0) {
7c673cae
FG
195 return r;
196 }
197 if (!pool.ns.empty()) {
198 ioctx.set_namespace(pool.ns);
199 }
200 return 0;
201}
202
203template<>
204void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
205 RWLock::WLocker wl(lock);
206 auto iter = objs_state.find(obj);
207 if (iter == objs_state.end()) {
208 return;
209 }
210 bool is_atomic = iter->second.is_atomic;
211 bool prefetch_data = iter->second.prefetch_data;
212
213 objs_state.erase(iter);
214
215 if (is_atomic || prefetch_data) {
216 auto& s = objs_state[obj];
217 s.is_atomic = is_atomic;
218 s.prefetch_data = prefetch_data;
219 }
220}
221
222template<>
223void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
224 RWLock::WLocker wl(lock);
225 auto iter = objs_state.find(obj);
226 if (iter == objs_state.end()) {
227 return;
228 }
229
230 objs_state.erase(iter);
231}
232
233void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
234 encode_json("default_zonegroup", default_zonegroup, f);
235}
236
237void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
238
239 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
240 /* backward compatability with region */
241 if (default_zonegroup.empty()) {
242 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
243 }
244}
245
246rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
247{
248 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
249 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
250 }
251
252 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
253}
254
255int RGWZoneGroup::create_default(bool old_format)
256{
257 name = default_zonegroup_name;
258 is_master = true;
259
260 RGWZoneGroupPlacementTarget placement_target;
261 placement_target.name = "default-placement";
262 placement_targets[placement_target.name] = placement_target;
263 default_placement = "default-placement";
264
265 RGWZoneParams zone_params(default_zone_name);
266
267 int r = zone_params.init(cct, store, false);
268 if (r < 0) {
269 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
270 return r;
271 }
272
273 r = zone_params.create_default();
274 if (r < 0 && r != -EEXIST) {
275 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 } else if (r == -EEXIST) {
278 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
279 zone_params.clear_id();
280 r = zone_params.init(cct, store);
281 if (r < 0) {
282 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
283 return r;
284 }
285 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
286 << dendl;
287 }
288
289 RGWZone& default_zone = zones[zone_params.get_id()];
290 default_zone.name = zone_params.get_name();
291 default_zone.id = zone_params.get_id();
292 master_zone = default_zone.id;
293
294 r = create();
295 if (r < 0 && r != -EEXIST) {
296 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
297 return r;
298 }
299
300 if (r == -EEXIST) {
301 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
302 id.clear();
303 r = init(cct, store);
304 if (r < 0) {
305 return r;
306 }
307 }
308
309 if (old_format) {
310 name = id;
311 }
312
313 post_process_params();
314
315 return 0;
316}
317
318const string RGWZoneGroup::get_default_oid(bool old_region_format)
319{
320 if (old_region_format) {
321 if (cct->_conf->rgw_default_region_info_oid.empty()) {
322 return default_region_info_oid;
323 }
324 return cct->_conf->rgw_default_region_info_oid;
325 }
326
327 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
328
329 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
330 default_oid = default_zone_group_info_oid;
331 }
332
333 default_oid += "." + realm_id;
334
335 return default_oid;
336}
337
338const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
339{
340 if (old_region_format) {
341 return region_info_oid_prefix;
342 }
343 return zone_group_info_oid_prefix;
344}
345
346const string& RGWZoneGroup::get_names_oid_prefix()
347{
348 return zonegroup_names_oid_prefix;
349}
350
351const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
352 return cct->_conf->rgw_zonegroup;
353}
354
355int RGWZoneGroup::equals(const string& other_zonegroup) const
356{
357 if (is_master && other_zonegroup.empty())
358 return true;
359
360 return (id == other_zonegroup);
361}
362
363int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
364 const list<string>& endpoints, const string *ptier_type,
365 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
366{
367 auto& zone_id = zone_params.get_id();
368 auto& zone_name = zone_params.get_name();
369
370 // check for duplicate zone name on insert
371 if (!zones.count(zone_id)) {
372 for (const auto& zone : zones) {
373 if (zone.second.name == zone_name) {
374 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
375 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
376 return -EEXIST;
377 }
378 }
379 }
380
381 if (is_master) {
382 if (*is_master) {
383 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
384 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
385 }
386 master_zone = zone_params.get_id();
387 } else if (master_zone == zone_params.get_id()) {
388 master_zone.clear();
389 }
390 }
391
392 RGWZone& zone = zones[zone_params.get_id()];
393 zone.name = zone_params.get_name();
394 zone.id = zone_params.get_id();
395 if (!endpoints.empty()) {
396 zone.endpoints = endpoints;
397 }
398 if (read_only) {
399 zone.read_only = *read_only;
400 }
401 if (ptier_type) {
402 zone.tier_type = *ptier_type;
403 }
404
405 if (psync_from_all) {
406 zone.sync_from_all = *psync_from_all;
407 }
408
409 for (auto add : sync_from) {
410 zone.sync_from.insert(add);
411 }
412
413 for (auto rm : sync_from_rm) {
414 zone.sync_from.erase(rm);
415 }
416
417 post_process_params();
418
419 return update();
420}
421
422
423int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
424{
425 RGWZone& zone = zones[zone_params.get_id()];
426 zone.name = zone_params.get_name();
427
428 return update();
429}
430
431void RGWZoneGroup::post_process_params()
432{
433 bool log_data = zones.size() > 1;
434
435 if (master_zone.empty()) {
436 map<string, RGWZone>::iterator iter = zones.begin();
437 if (iter != zones.end()) {
438 master_zone = iter->first;
439 }
440 }
441
442 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
443 RGWZone& zone = iter->second;
444 zone.log_data = log_data;
7c673cae
FG
445
446 RGWZoneParams zone_params(zone.id, zone.name);
447 int ret = zone_params.init(cct, store);
448 if (ret < 0) {
449 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
450 continue;
451 }
452
453 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
454 iter != zone_params.placement_pools.end(); ++iter) {
455 const string& placement_name = iter->first;
456 if (placement_targets.find(placement_name) == placement_targets.end()) {
457 RGWZoneGroupPlacementTarget placement_target;
458 placement_target.name = placement_name;
459 placement_targets[placement_name] = placement_target;
460 }
461 }
462 }
463
464 if (default_placement.empty() && !placement_targets.empty()) {
465 default_placement = placement_targets.begin()->first;
466 }
467}
468
469int RGWZoneGroup::remove_zone(const std::string& zone_id)
470{
471 map<string, RGWZone>::iterator iter = zones.find(zone_id);
472 if (iter == zones.end()) {
473 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
474 << name << dendl;
475 return -ENOENT;
476 }
477
478 zones.erase(iter);
479
480 post_process_params();
481
482 return update();
483}
484
485int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
486{
487 if (realm_id.empty()) {
488 /* try using default realm */
489 RGWRealm realm;
490 int ret = realm.init(cct, store);
b32b8144 491 // no default realm exist
7c673cae 492 if (ret < 0) {
b32b8144 493 return read_id(default_zonegroup_name, default_id);
7c673cae
FG
494 }
495 realm_id = realm.get_id();
496 }
497
498 return RGWSystemMetaObj::read_default_id(default_id, old_format);
499}
500
501int RGWZoneGroup::set_as_default(bool exclusive)
502{
503 if (realm_id.empty()) {
504 /* try using default realm */
505 RGWRealm realm;
506 int ret = realm.init(cct, store);
507 if (ret < 0) {
508 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
509 return -EINVAL;
510 }
511 realm_id = realm.get_id();
512 }
513
514 return RGWSystemMetaObj::set_as_default(exclusive);
515}
516
517int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
518{
519 cct = _cct;
520 store = _store;
521
522 if (!setup_obj)
523 return 0;
524
525 if (old_format && id.empty()) {
526 id = name;
527 }
528
529 if (id.empty()) {
530 int r;
531 if (name.empty()) {
532 name = get_predefined_name(cct);
533 }
534 if (name.empty()) {
535 r = use_default(old_format);
536 if (r < 0) {
537 return r;
538 }
539 } else if (!old_format) {
540 r = read_id(name, id);
541 if (r < 0) {
542 if (r != -ENOENT) {
543 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
544 }
545 return r;
546 }
547 }
548 }
549
550 return read_info(id, old_format);
551}
552
553int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
554{
555 auto pool = get_pool(cct);
556 bufferlist bl;
557 RGWObjectCtx obj_ctx(store);
558 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
559 if (ret < 0)
560 return ret;
561
562 try {
563 bufferlist::iterator iter = bl.begin();
564 ::decode(default_info, iter);
565 } catch (buffer::error& err) {
566 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
567 return -EIO;
568 }
569
570 return 0;
571}
572
573int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
574{
575 RGWDefaultSystemMetaObjInfo default_info;
576
577 int ret = read_default(default_info, get_default_oid(old_format));
578 if (ret < 0) {
579 return ret;
580 }
581
582 default_id = default_info.default_id;
583
584 return 0;
585}
586
587int RGWSystemMetaObj::use_default(bool old_format)
588{
589 return read_default_id(id, old_format);
590}
591
592int RGWSystemMetaObj::set_as_default(bool exclusive)
593{
594 string oid = get_default_oid();
595
596 rgw_pool pool(get_pool(cct));
597 bufferlist bl;
598
599 RGWDefaultSystemMetaObjInfo default_info;
600 default_info.default_id = id;
601
602 ::encode(default_info, bl);
603
604 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
605 exclusive, NULL, real_time(), NULL);
606 if (ret < 0)
607 return ret;
608
609 return 0;
610}
611
612int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
613{
614 rgw_pool pool(get_pool(cct));
615 bufferlist bl;
616
617 string oid = get_names_oid_prefix() + obj_name;
618
619 RGWObjectCtx obj_ctx(store);
620 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
621 if (ret < 0) {
622 return ret;
623 }
624
625 RGWNameToId nameToId;
626 try {
627 bufferlist::iterator iter = bl.begin();
628 ::decode(nameToId, iter);
629 } catch (buffer::error& err) {
630 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
631 return -EIO;
632 }
633 object_id = nameToId.obj_id;
634 return 0;
635}
636
637int RGWSystemMetaObj::delete_obj(bool old_format)
638{
639 rgw_pool pool(get_pool(cct));
640
641 /* check to see if obj is the default */
642 RGWDefaultSystemMetaObjInfo default_info;
643 int ret = read_default(default_info, get_default_oid(old_format));
644 if (ret < 0 && ret != -ENOENT)
645 return ret;
646 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
647 string oid = get_default_oid(old_format);
648 rgw_raw_obj default_named_obj(pool, oid);
649 ret = store->delete_system_obj(default_named_obj);
650 if (ret < 0) {
651 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
652 return ret;
653 }
654 }
655 if (!old_format) {
656 string oid = get_names_oid_prefix() + name;
657 rgw_raw_obj object_name(pool, oid);
658 ret = store->delete_system_obj(object_name);
659 if (ret < 0) {
660 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
661 return ret;
662 }
663 }
664
665 string oid = get_info_oid_prefix(old_format);
666 if (old_format) {
667 oid += name;
668 } else {
669 oid += id;
670 }
671
672 rgw_raw_obj object_id(pool, oid);
673 ret = store->delete_system_obj(object_id);
674 if (ret < 0) {
675 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
676 }
677
678 return ret;
679}
680
681int RGWSystemMetaObj::store_name(bool exclusive)
682{
683 rgw_pool pool(get_pool(cct));
684 string oid = get_names_oid_prefix() + name;
685
686 RGWNameToId nameToId;
687 nameToId.obj_id = id;
688
689 bufferlist bl;
690 ::encode(nameToId, bl);
691 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
692}
693
694int RGWSystemMetaObj::rename(const string& new_name)
695{
696 string new_id;
697 int ret = read_id(new_name, new_id);
698 if (!ret) {
699 return -EEXIST;
700 }
701 if (ret < 0 && ret != -ENOENT) {
702 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 string old_name = name;
706 name = new_name;
707 ret = update();
708 if (ret < 0) {
709 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
710 return ret;
711 }
712 ret = store_name(true);
713 if (ret < 0) {
714 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
715 return ret;
716 }
717 /* delete old name */
718 rgw_pool pool(get_pool(cct));
719 string oid = get_names_oid_prefix() + old_name;
720 rgw_raw_obj old_name_obj(pool, oid);
721 ret = store->delete_system_obj(old_name_obj);
722 if (ret < 0) {
723 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
724 return ret;
725 }
726
727 return ret;
728}
729
730int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
731{
732 rgw_pool pool(get_pool(cct));
733
734 bufferlist bl;
735
736 string oid = get_info_oid_prefix(old_format) + obj_id;
737
738 RGWObjectCtx obj_ctx(store);
739 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
740 if (ret < 0) {
741 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
742 return ret;
743 }
744
745 try {
746 bufferlist::iterator iter = bl.begin();
747 ::decode(*this, iter);
748 } catch (buffer::error& err) {
749 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
750 return -EIO;
751 }
752
753 return 0;
754}
755
756int RGWSystemMetaObj::read()
757{
758 int ret = read_id(name, id);
759 if (ret < 0) {
760 return ret;
761 }
762
763 return read_info(id);
764}
765
766int RGWSystemMetaObj::create(bool exclusive)
767{
768 int ret;
769
770 /* check to see the name is not used */
771 ret = read_id(name, id);
772 if (exclusive && ret == 0) {
773 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
774 return -EEXIST;
775 } else if ( ret < 0 && ret != -ENOENT) {
776 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
777 return ret;
778 }
779
780 if (id.empty()) {
781 /* create unique id */
782 uuid_d new_uuid;
783 char uuid_str[37];
784 new_uuid.generate_random();
785 new_uuid.print(uuid_str);
786 id = uuid_str;
787 }
788
789 ret = store_info(exclusive);
790 if (ret < 0) {
791 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
792 return ret;
793 }
794
795 return store_name(exclusive);
796}
797
798int RGWSystemMetaObj::store_info(bool exclusive)
799{
800 rgw_pool pool(get_pool(cct));
801
802 string oid = get_info_oid_prefix() + id;
803
804 bufferlist bl;
805 ::encode(*this, bl);
806 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
807}
808
809int RGWSystemMetaObj::write(bool exclusive)
810{
811 int ret = store_info(exclusive);
812 if (ret < 0) {
813 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
814 return ret;
815 }
816 ret = store_name(exclusive);
817 if (ret < 0) {
818 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
819 return ret;
820 }
821 return 0;
822}
823
824
825const string& RGWRealm::get_predefined_name(CephContext *cct) {
826 return cct->_conf->rgw_realm;
827}
828
829int RGWRealm::create(bool exclusive)
830{
831 int ret = RGWSystemMetaObj::create(exclusive);
832 if (ret < 0) {
833 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
834 return ret;
835 }
836 // create the control object for watch/notify
837 ret = create_control(exclusive);
838 if (ret < 0) {
839 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
840 return ret;
841 }
842 RGWPeriod period;
843 if (current_period.empty()) {
844 /* create new period for the realm */
845 ret = period.init(cct, store, id, name, false);
846 if (ret < 0 ) {
847 return ret;
848 }
849 ret = period.create(true);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
852 return ret;
853 }
854 } else {
855 period = RGWPeriod(current_period, 0);
856 int ret = period.init(cct, store, id, name);
857 if (ret < 0) {
858 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
859 return ret;
860 }
861 }
862 ret = set_current_period(period);
863 if (ret < 0) {
864 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
865 return ret;
866 }
867 // try to set as default. may race with another create, so pass exclusive=true
868 // so we don't override an existing default
869 ret = set_as_default(true);
870 if (ret < 0 && ret != -EEXIST) {
871 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
872 }
873
874 return 0;
875}
876
877int RGWRealm::delete_obj()
878{
879 int ret = RGWSystemMetaObj::delete_obj();
880 if (ret < 0) {
881 return ret;
882 }
883 return delete_control();
884}
885
886int RGWRealm::create_control(bool exclusive)
887{
888 auto pool = rgw_pool{get_pool(cct)};
889 auto oid = get_control_oid();
890 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
891 nullptr, real_time(), nullptr);
892}
893
894int RGWRealm::delete_control()
895{
896 auto pool = rgw_pool{get_pool(cct)};
897 auto obj = rgw_raw_obj{pool, get_control_oid()};
898 return store->delete_system_obj(obj);
899}
900
901rgw_pool RGWRealm::get_pool(CephContext *cct)
902{
903 if (cct->_conf->rgw_realm_root_pool.empty()) {
904 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
905 }
906 return rgw_pool(cct->_conf->rgw_realm_root_pool);
907}
908
909const string RGWRealm::get_default_oid(bool old_format)
910{
911 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
912 return default_realm_info_oid;
913 }
914 return cct->_conf->rgw_default_realm_info_oid;
915}
916
917const string& RGWRealm::get_names_oid_prefix()
918{
919 return realm_names_oid_prefix;
920}
921
922const string& RGWRealm::get_info_oid_prefix(bool old_format)
923{
924 return realm_info_oid_prefix;
925}
926
927int RGWRealm::set_current_period(RGWPeriod& period)
928{
929 // update realm epoch to match the period's
930 if (epoch > period.get_realm_epoch()) {
931 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
932 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
933 return -EINVAL;
934 }
935 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
936 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
937 << period.get_realm_epoch() << ", but different period id "
938 << period.get_id() << " != " << current_period << dendl;
939 return -EINVAL;
940 }
941
942 epoch = period.get_realm_epoch();
943 current_period = period.get_id();
944
945 int ret = update();
946 if (ret < 0) {
947 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
948 return ret;
949 }
950
951 ret = period.reflect();
952 if (ret < 0) {
953 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
954 return ret;
955 }
956
957 return 0;
958}
959
960string RGWRealm::get_control_oid()
961{
962 return get_info_oid_prefix() + id + ".control";
963}
964
965int RGWRealm::notify_zone(bufferlist& bl)
966{
967 // open a context on the realm's pool
968 rgw_pool pool{get_pool(cct)};
969 librados::IoCtx ctx;
970 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
971 if (r < 0) {
972 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
973 return r;
974 }
975 // send a notify on the realm object
976 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
977 if (r < 0) {
978 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
979 return r;
980 }
981 return 0;
982}
983
984int RGWRealm::notify_new_period(const RGWPeriod& period)
985{
986 bufferlist bl;
987 // push the period to dependent zonegroups/zones
988 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
989 ::encode(period, bl);
990 // reload the gateway with the new period
991 ::encode(RGWRealmNotify::Reload, bl);
992
993 return notify_zone(bl);
994}
995
996std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
997{
998 if (realm_id.empty()) {
999 return "period_config.default";
1000 }
1001 return "period_config." + realm_id;
1002}
1003
1004rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
1005{
1006 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1007 if (pool_name.empty()) {
1008 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1009 }
1010 return {pool_name};
1011}
1012
1013int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1014{
1015 RGWObjectCtx obj_ctx(store);
1016 const auto& pool = get_pool(store->ctx());
1017 const auto& oid = get_oid(realm_id);
1018 bufferlist bl;
1019
1020 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1021 if (ret < 0) {
1022 return ret;
1023 }
1024 try {
1025 bufferlist::iterator iter = bl.begin();
1026 ::decode(*this, iter);
1027 } catch (buffer::error& err) {
1028 return -EIO;
1029 }
1030 return 0;
1031}
1032
1033int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1034{
1035 const auto& pool = get_pool(store->ctx());
1036 const auto& oid = get_oid(realm_id);
1037 bufferlist bl;
1038 ::encode(*this, bl);
1039 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1040 false, nullptr, real_time(), nullptr);
1041}
1042
1043int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1044 const string& period_realm_name, bool setup_obj)
1045{
1046 cct = _cct;
1047 store = _store;
1048 realm_id = period_realm_id;
1049 realm_name = period_realm_name;
1050
1051 if (!setup_obj)
1052 return 0;
1053
1054 return init(_cct, _store, setup_obj);
1055}
1056
1057
1058int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1059{
1060 cct = _cct;
1061 store = _store;
1062
1063 if (!setup_obj)
1064 return 0;
1065
1066 if (id.empty()) {
1067 RGWRealm realm(realm_id, realm_name);
1068 int ret = realm.init(cct, store);
1069 if (ret < 0) {
1070 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1071 cpp_strerror(-ret) << dendl;
1072 return ret;
1073 }
1074 id = realm.get_current_period();
1075 realm_id = realm.get_id();
1076 }
1077
1078 if (!epoch) {
1079 int ret = use_latest_epoch();
1080 if (ret < 0) {
1081 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1082 << " : " << cpp_strerror(-ret) << dendl;
1083 return ret;
1084 }
1085 }
1086
1087 return read_info();
1088}
1089
1090
1091int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1092 map<string, RGWZoneGroup>::const_iterator iter;
1093 if (!zonegroup_id.empty()) {
1094 iter = period_map.zonegroups.find(zonegroup_id);
1095 } else {
1096 iter = period_map.zonegroups.find("default");
1097 }
1098 if (iter != period_map.zonegroups.end()) {
1099 zonegroup = iter->second;
1100 return 0;
1101 }
1102
1103 return -ENOENT;
1104}
1105
7c673cae
FG
1106const string& RGWPeriod::get_latest_epoch_oid()
1107{
1108 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1109 return period_latest_epoch_info_oid;
1110 }
1111 return cct->_conf->rgw_period_latest_epoch_info_oid;
1112}
1113
1114const string& RGWPeriod::get_info_oid_prefix()
1115{
1116 return period_info_oid_prefix;
1117}
1118
1119const string RGWPeriod::get_period_oid_prefix()
1120{
1121 return get_info_oid_prefix() + id;
1122}
1123
1124const string RGWPeriod::get_period_oid()
1125{
1126 std::ostringstream oss;
1127 oss << get_period_oid_prefix();
1128 // skip the epoch for the staging period
1129 if (id != get_staging_id(realm_id))
1130 oss << "." << epoch;
1131 return oss.str();
1132}
1133
224ce89b
WB
1134int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1135 RGWObjVersionTracker *objv)
7c673cae
FG
1136{
1137 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1138
1139 rgw_pool pool(get_pool(cct));
1140 bufferlist bl;
1141 RGWObjectCtx obj_ctx(store);
224ce89b 1142 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
7c673cae
FG
1143 if (ret < 0) {
1144 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1145 return ret;
1146 }
1147 try {
1148 bufferlist::iterator iter = bl.begin();
1149 ::decode(info, iter);
1150 } catch (buffer::error& err) {
1151 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1152 return -EIO;
1153 }
1154
1155 return 0;
1156}
1157
1158int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1159{
1160 RGWPeriodLatestEpochInfo info;
1161
1162 int ret = read_latest_epoch(info);
1163 if (ret < 0) {
1164 return ret;
1165 }
1166
1167 latest_epoch = info.epoch;
1168
1169 return 0;
1170}
1171
1172int RGWPeriod::use_latest_epoch()
1173{
1174 RGWPeriodLatestEpochInfo info;
1175 int ret = read_latest_epoch(info);
1176 if (ret < 0) {
1177 return ret;
1178 }
1179
1180 epoch = info.epoch;
1181
1182 return 0;
1183}
1184
224ce89b
WB
1185int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1186 RGWObjVersionTracker *objv)
7c673cae
FG
1187{
1188 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1189
1190 rgw_pool pool(get_pool(cct));
1191 bufferlist bl;
1192
1193 RGWPeriodLatestEpochInfo info;
1194 info.epoch = epoch;
1195
1196 ::encode(info, bl);
1197
1198 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
224ce89b
WB
1199 exclusive, objv, real_time(), nullptr);
1200}
1201
1202int RGWPeriod::update_latest_epoch(epoch_t epoch)
1203{
1204 static constexpr int MAX_RETRIES = 20;
1205
1206 for (int i = 0; i < MAX_RETRIES; i++) {
1207 RGWPeriodLatestEpochInfo info;
1208 RGWObjVersionTracker objv;
1209 bool exclusive = false;
1210
1211 // read existing epoch
1212 int r = read_latest_epoch(info, &objv);
1213 if (r == -ENOENT) {
1214 // use an exclusive create to set the epoch atomically
1215 exclusive = true;
1216 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1217 << " for period=" << id << dendl;
1218 } else if (r < 0) {
1219 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1220 return r;
1221 } else if (epoch <= info.epoch) {
1222 r = -EEXIST; // fail with EEXIST if epoch is not newer
1223 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1224 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1225 return r;
1226 } else {
1227 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1228 << " -> " << epoch << " on period=" << id << dendl;
1229 }
1230
1231 r = set_latest_epoch(epoch, exclusive, &objv);
1232 if (r == -EEXIST) {
1233 continue; // exclusive create raced with another update, retry
1234 } else if (r == -ECANCELED) {
1235 continue; // write raced with a conflicting version, retry
1236 }
1237 if (r < 0) {
1238 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1239 return r;
1240 }
1241 return 0; // return success
1242 }
1243
1244 return -ECANCELED; // fail after max retries
7c673cae
FG
1245}
1246
1247int RGWPeriod::delete_obj()
1248{
1249 rgw_pool pool(get_pool(cct));
1250
1251 // delete the object for each period epoch
1252 for (epoch_t e = 1; e <= epoch; e++) {
1253 RGWPeriod p{get_id(), e};
1254 rgw_raw_obj oid{pool, p.get_period_oid()};
1255 int ret = store->delete_system_obj(oid);
1256 if (ret < 0) {
1257 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1258 << ": " << cpp_strerror(-ret) << dendl;
1259 }
1260 }
1261
1262 // delete the .latest_epoch object
1263 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1264 int ret = store->delete_system_obj(oid);
1265 if (ret < 0) {
1266 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1267 << ": " << cpp_strerror(-ret) << dendl;
1268 }
1269 return ret;
1270}
1271
1272int RGWPeriod::read_info()
1273{
1274 rgw_pool pool(get_pool(cct));
1275
1276 bufferlist bl;
1277
1278 RGWObjectCtx obj_ctx(store);
1279 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1280 if (ret < 0) {
1281 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1282 return ret;
1283 }
1284
1285 try {
1286 bufferlist::iterator iter = bl.begin();
1287 ::decode(*this, iter);
1288 } catch (buffer::error& err) {
1289 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1290 return -EIO;
1291 }
1292
1293 return 0;
1294}
1295
1296int RGWPeriod::create(bool exclusive)
1297{
1298 int ret;
1299
1300 /* create unique id */
1301 uuid_d new_uuid;
1302 char uuid_str[37];
1303 new_uuid.generate_random();
1304 new_uuid.print(uuid_str);
1305 id = uuid_str;
1306
1307 epoch = FIRST_EPOCH;
1308
1309 period_map.id = id;
1310
1311 ret = store_info(exclusive);
1312 if (ret < 0) {
1313 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
224ce89b 1314 return ret;
7c673cae
FG
1315 }
1316
1317 ret = set_latest_epoch(epoch);
1318 if (ret < 0) {
1319 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1320 }
1321
1322 return ret;
1323}
1324
1325int RGWPeriod::store_info(bool exclusive)
1326{
7c673cae
FG
1327 rgw_pool pool(get_pool(cct));
1328
1329 string oid = get_period_oid();
1330 bufferlist bl;
1331 ::encode(*this, bl);
224ce89b
WB
1332
1333 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1334 exclusive, NULL, real_time(), NULL);
7c673cae
FG
1335}
1336
1337rgw_pool RGWPeriod::get_pool(CephContext *cct)
1338{
1339 if (cct->_conf->rgw_period_root_pool.empty()) {
1340 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1341 }
1342 return rgw_pool(cct->_conf->rgw_period_root_pool);
1343}
1344
7c673cae
FG
1345int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1346{
1347 if (zonegroup.realm_id != realm_id) {
1348 return 0;
1349 }
1350 int ret = period_map.update(zonegroup, cct);
1351 if (ret < 0) {
1352 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1353 return ret;
1354 }
1355
1356 return store_info(false);
1357}
1358
1359int RGWPeriod::update()
1360{
1361 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1362 list<string> zonegroups;
1363 int ret = store->list_zonegroups(zonegroups);
1364 if (ret < 0) {
1365 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1366 return ret;
1367 }
1368
1369 // clear zone short ids of removed zones. period_map.update() will add the
1370 // remaining zones back
1371 period_map.short_zone_ids.clear();
1372
1373 for (auto& iter : zonegroups) {
1374 RGWZoneGroup zg(string(), iter);
1375 ret = zg.init(cct, store);
1376 if (ret < 0) {
1377 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1378 continue;
1379 }
1380
1381 if (zg.realm_id != realm_id) {
1382 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1383 continue;
1384 }
1385
1386 if (zg.master_zone.empty()) {
1387 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1388 return -EINVAL;
1389 }
1390
f64942e4
AA
1391 if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
1392 ldout(cct,0) << "ERROR: zonegroup " << zg.get_name()
1393 << " has a non existent master zone "<< dendl;
1394 return -EINVAL;
1395 }
1396
7c673cae
FG
1397 if (zg.is_master_zonegroup()) {
1398 master_zonegroup = zg.get_id();
1399 master_zone = zg.master_zone;
1400 }
1401
1402 int ret = period_map.update(zg, cct);
1403 if (ret < 0) {
1404 return ret;
1405 }
1406 }
1407
1408 ret = period_config.read(store, realm_id);
1409 if (ret < 0 && ret != -ENOENT) {
1410 ldout(cct, 0) << "ERROR: failed to read period config: "
1411 << cpp_strerror(ret) << dendl;
1412 return ret;
1413 }
1414 return 0;
1415}
1416
1417int RGWPeriod::reflect()
1418{
1419 for (auto& iter : period_map.zonegroups) {
1420 RGWZoneGroup& zg = iter.second;
1421 zg.reinit_instance(cct, store);
1422 int r = zg.write(false);
1423 if (r < 0) {
1424 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1425 return r;
1426 }
1427 if (zg.is_master_zonegroup()) {
1428 // set master as default if no default exists
1429 r = zg.set_as_default(true);
1430 if (r == 0) {
1431 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1432 << " as the default" << dendl;
1433 }
1434 }
1435 }
1436
1437 int r = period_config.write(store, realm_id);
1438 if (r < 0) {
1439 ldout(cct, 0) << "ERROR: failed to store period config: "
1440 << cpp_strerror(-r) << dendl;
1441 return r;
1442 }
1443 return 0;
1444}
1445
1446void RGWPeriod::fork()
1447{
1448 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1449 predecessor_uuid = id;
1450 id = get_staging_id(realm_id);
1451 period_map.reset();
1452 realm_epoch++;
1453}
1454
1455static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1456{
1457 // initialize a sync status manager to read the status
1458 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1459 int r = mgr.init();
1460 if (r < 0) {
1461 return r;
1462 }
1463 r = mgr.read_sync_status(sync_status);
1464 mgr.stop();
1465 return r;
1466}
1467
1468int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1469 std::ostream& error_stream,
1470 bool force_if_stale)
1471{
1472 rgw_meta_sync_status status;
1473 int r = read_sync_status(store, &status);
1474 if (r < 0) {
1475 ldout(cct, 0) << "period failed to read sync status: "
1476 << cpp_strerror(-r) << dendl;
1477 return r;
1478 }
1479
1480 std::vector<std::string> markers;
1481
1482 const auto current_epoch = current_period.get_realm_epoch();
1483 if (current_epoch != status.sync_info.realm_epoch) {
1484 // no sync status markers for the current period
1485 assert(current_epoch > status.sync_info.realm_epoch);
1486 const int behind = current_epoch - status.sync_info.realm_epoch;
1487 if (!force_if_stale && current_epoch > 1) {
1488 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1489 "the current master zone in metadata sync. If this zone is promoted "
1490 "to master, any metadata changes during that time are likely to "
1491 "be lost.\n"
1492 "Waiting for this zone to catch up on metadata sync (see "
1493 "'radosgw-admin sync status') is recommended.\n"
1494 "To promote this zone to master anyway, add the flag "
1495 "--yes-i-really-mean-it." << std::endl;
1496 return -EINVAL;
1497 }
1498 // empty sync status markers - other zones will skip this period during
1499 // incremental metadata sync
1500 markers.resize(status.sync_info.num_shards);
1501 } else {
1502 markers.reserve(status.sync_info.num_shards);
1503 for (auto& i : status.sync_markers) {
1504 auto& marker = i.second;
1505 // filter out markers from other periods
1506 if (marker.realm_epoch != current_epoch) {
1507 marker.marker.clear();
1508 }
1509 markers.emplace_back(std::move(marker.marker));
1510 }
1511 }
1512
1513 std::swap(sync_status, markers);
1514 return 0;
1515}
1516
1517int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1518 std::ostream& error_stream, bool force_if_stale)
1519{
1520 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1521 // gateway must be in the master zone to commit
1522 if (master_zone != store->get_zone_params().get_id()) {
1523 error_stream << "Cannot commit period on zone "
1524 << store->get_zone_params().get_id() << ", it must be sent to "
1525 "the period's master zone " << master_zone << '.' << std::endl;
1526 return -EINVAL;
1527 }
1528 // period predecessor must match current period
1529 if (predecessor_uuid != current_period.get_id()) {
1530 error_stream << "Period predecessor " << predecessor_uuid
1531 << " does not match current period " << current_period.get_id()
1532 << ". Use 'period pull' to get the latest period from the master, "
1533 "reapply your changes, and try again." << std::endl;
1534 return -EINVAL;
1535 }
1536 // realm epoch must be 1 greater than current period
1537 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1538 error_stream << "Period's realm epoch " << realm_epoch
1539 << " does not come directly after current realm epoch "
1540 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1541 "latest realm and period from the master zone, reapply your changes, "
1542 "and try again." << std::endl;
1543 return -EINVAL;
1544 }
1545 // did the master zone change?
1546 if (master_zone != current_period.get_master_zone()) {
1547 // store the current metadata sync status in the period
1548 int r = update_sync_status(current_period, error_stream, force_if_stale);
1549 if (r < 0) {
1550 ldout(cct, 0) << "failed to update metadata sync status: "
1551 << cpp_strerror(-r) << dendl;
1552 return r;
1553 }
1554 // create an object with a new period id
1555 r = create(true);
1556 if (r < 0) {
1557 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1558 return r;
1559 }
1560 // set as current period
1561 r = realm.set_current_period(*this);
1562 if (r < 0) {
1563 ldout(cct, 0) << "failed to update realm's current period: "
1564 << cpp_strerror(-r) << dendl;
1565 return r;
1566 }
1567 ldout(cct, 4) << "Promoted to master zone and committed new period "
1568 << id << dendl;
1569 realm.notify_new_period(*this);
1570 return 0;
1571 }
1572 // period must be based on current epoch
1573 if (epoch != current_period.get_epoch()) {
1574 error_stream << "Period epoch " << epoch << " does not match "
1575 "predecessor epoch " << current_period.get_epoch()
1576 << ". Use 'period pull' to get the latest epoch from the master zone, "
1577 "reapply your changes, and try again." << std::endl;
1578 return -EINVAL;
1579 }
1580 // set period as next epoch
1581 set_id(current_period.get_id());
1582 set_epoch(current_period.get_epoch() + 1);
1583 set_predecessor(current_period.get_predecessor());
1584 realm_epoch = current_period.get_realm_epoch();
1585 // write the period to rados
1586 int r = store_info(false);
1587 if (r < 0) {
1588 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1589 return r;
1590 }
1591 // set as latest epoch
224ce89b
WB
1592 r = update_latest_epoch(epoch);
1593 if (r == -EEXIST) {
1594 // already have this epoch (or a more recent one)
1595 return 0;
1596 }
7c673cae
FG
1597 if (r < 0) {
1598 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1599 return r;
1600 }
1601 r = reflect();
1602 if (r < 0) {
1603 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1604 return r;
1605 }
1606 ldout(cct, 4) << "Committed new epoch " << epoch
1607 << " for period " << id << dendl;
1608 realm.notify_new_period(*this);
1609 return 0;
1610}
1611
1612int RGWZoneParams::create_default(bool old_format)
1613{
1614 name = default_zone_name;
1615
1616 int r = create();
1617 if (r < 0) {
1618 return r;
1619 }
1620
1621 if (old_format) {
1622 name = id;
1623 }
1624
1625 return r;
1626}
1627
1628
1629int get_zones_pool_set(CephContext* cct,
1630 RGWRados* store,
1631 const list<string>& zones,
1632 const string& my_zone_id,
1633 set<rgw_pool>& pool_names)
1634{
1635 for(auto const& iter : zones) {
1636 RGWZoneParams zone(iter);
1637 int r = zone.init(cct, store);
1638 if (r < 0) {
1639 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1640 return r;
1641 }
1642 if (zone.get_id() != my_zone_id) {
1643 pool_names.insert(zone.domain_root);
1644 pool_names.insert(zone.metadata_heap);
1645 pool_names.insert(zone.control_pool);
1646 pool_names.insert(zone.gc_pool);
1647 pool_names.insert(zone.log_pool);
1648 pool_names.insert(zone.intent_log_pool);
1649 pool_names.insert(zone.usage_log_pool);
1650 pool_names.insert(zone.user_keys_pool);
1651 pool_names.insert(zone.user_email_pool);
1652 pool_names.insert(zone.user_swift_pool);
1653 pool_names.insert(zone.user_uid_pool);
1654 pool_names.insert(zone.roles_pool);
31f18b77 1655 pool_names.insert(zone.reshard_pool);
7c673cae
FG
1656 for(auto& iter : zone.placement_pools) {
1657 pool_names.insert(iter.second.index_pool);
1658 pool_names.insert(iter.second.data_pool);
1659 pool_names.insert(iter.second.data_extra_pool);
1660 }
1661 }
1662 }
1663 return 0;
1664}
1665
1666rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1667 const string& default_prefix,
1668 const string& default_suffix,
1669 const rgw_pool& suggested_pool)
1670{
1671 string suggested_name = suggested_pool.to_str();
1672
1673 string prefix = default_prefix;
1674 string suffix = default_suffix;
1675
1676 if (!suggested_pool.empty()) {
1677 prefix = suggested_name.substr(0, suggested_name.find("."));
1678 suffix = suggested_name.substr(prefix.length());
1679 }
1680
1681 rgw_pool pool(prefix + suffix);
1682
1683 if (pools.find(pool) == pools.end()) {
1684 return pool;
1685 } else {
1686 while(true) {
1687 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1688 if (pools.find(pool) == pools.end()) {
1689 return pool;
1690 }
1691 }
1692 }
1693}
1694
1695int RGWZoneParams::fix_pool_names()
1696{
1697
1698 list<string> zones;
1699 int r = store->list_zones(zones);
1700 if (r < 0) {
1701 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1702 }
1703
1704 set<rgw_pool> pools;
1705 r = get_zones_pool_set(cct, store, zones, id, pools);
1706 if (r < 0) {
1707 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1708 return r;
1709 }
1710
1711 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1712 if (!metadata_heap.name.empty()) {
1713 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1714 }
1715 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1716 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1717 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1718 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1719 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1720 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1721 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1722 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1723 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1724 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1725 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
31f18b77 1726 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
7c673cae
FG
1727
1728 for(auto& iter : placement_pools) {
1729 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1730 iter.second.index_pool);
1731 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1732 iter.second.data_pool);
1733 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1734 iter.second.data_extra_pool);
1735 }
1736
1737 return 0;
1738}
1739
1740int RGWZoneParams::create(bool exclusive)
1741{
1742 /* check for old pools config */
1743 rgw_raw_obj obj(domain_root, avail_pools);
1744 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1745 if (r < 0) {
1746 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1747 /* a new system, let's set new placement info */
1748 RGWZonePlacementInfo default_placement;
1749 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1750 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1751 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1752 placement_pools["default-placement"] = default_placement;
1753 }
1754
1755 r = fix_pool_names();
1756 if (r < 0) {
1757 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1758 return r;
1759 }
1760
1761 r = RGWSystemMetaObj::create(exclusive);
1762 if (r < 0) {
1763 return r;
1764 }
1765
1766 // try to set as default. may race with another create, so pass exclusive=true
1767 // so we don't override an existing default
1768 r = set_as_default(true);
1769 if (r < 0 && r != -EEXIST) {
1770 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1771 }
1772
1773 return 0;
1774}
1775
1776rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1777{
1778 if (cct->_conf->rgw_zone_root_pool.empty()) {
1779 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1780 }
1781
1782 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1783}
1784
1785const string RGWZoneParams::get_default_oid(bool old_format)
1786{
1787 if (old_format) {
1788 return cct->_conf->rgw_default_zone_info_oid;
1789 }
1790
1791 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1792}
1793
1794const string& RGWZoneParams::get_names_oid_prefix()
1795{
1796 return zone_names_oid_prefix;
1797}
1798
1799const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1800{
1801 return zone_info_oid_prefix;
1802}
1803
1804const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1805 return cct->_conf->rgw_zone;
1806}
1807
1808int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1809{
1810 if (name.empty()) {
1811 name = cct->_conf->rgw_zone;
1812 }
1813
1814 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1815}
1816
1817int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1818{
1819 if (realm_id.empty()) {
1820 /* try using default realm */
1821 RGWRealm realm;
1822 int ret = realm.init(cct, store);
b32b8144 1823 //no default realm exist
7c673cae 1824 if (ret < 0) {
b32b8144 1825 return read_id(default_zone_name, default_id);
7c673cae
FG
1826 }
1827 realm_id = realm.get_id();
1828 }
1829
1830 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1831}
1832
1833
1834int RGWZoneParams::set_as_default(bool exclusive)
1835{
1836 if (realm_id.empty()) {
1837 /* try using default realm */
1838 RGWRealm realm;
1839 int ret = realm.init(cct, store);
1840 if (ret < 0) {
1841 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1842 return -EINVAL;
1843 }
1844 realm_id = realm.get_id();
1845 }
1846
1847 return RGWSystemMetaObj::set_as_default(exclusive);
1848}
1849
1850const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1851{
1852 static const std::string NONE{"none"};
1853 auto p = placement_pools.find(placement_rule);
1854 if (p == placement_pools.end()) {
1855 return NONE;
1856 }
1857 const auto& type = p->second.compression_type;
1858 return !type.empty() ? type : NONE;
1859}
1860
1861void RGWPeriodMap::encode(bufferlist& bl) const {
1862 ENCODE_START(2, 1, bl);
1863 ::encode(id, bl);
1864 ::encode(zonegroups, bl);
1865 ::encode(master_zonegroup, bl);
1866 ::encode(short_zone_ids, bl);
1867 ENCODE_FINISH(bl);
1868}
1869
1870void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1871 DECODE_START(2, bl);
1872 ::decode(id, bl);
1873 ::decode(zonegroups, bl);
1874 ::decode(master_zonegroup, bl);
1875 if (struct_v >= 2) {
1876 ::decode(short_zone_ids, bl);
1877 }
1878 DECODE_FINISH(bl);
1879
1880 zonegroups_by_api.clear();
1881 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1882 iter != zonegroups.end(); ++iter) {
1883 RGWZoneGroup& zonegroup = iter->second;
1884 zonegroups_by_api[zonegroup.api_name] = zonegroup;
31f18b77 1885 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
1886 master_zonegroup = zonegroup.get_id();
1887 }
1888 }
1889}
1890
1891// run an MD5 hash on the zone_id and return the first 32 bits
1892static uint32_t gen_short_zone_id(const std::string zone_id)
1893{
1894 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1895 MD5 hash;
1896 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1897 hash.Final(md5);
1898
1899 uint32_t short_id;
1900 memcpy((char *)&short_id, md5, sizeof(short_id));
1901 return std::max(short_id, 1u);
1902}
1903
1904int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1905{
31f18b77 1906 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
7c673cae
FG
1907 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1908 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1909 return -EINVAL;
1910 }
1911 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1912 if (iter != zonegroups.end()) {
1913 RGWZoneGroup& old_zonegroup = iter->second;
1914 if (!old_zonegroup.api_name.empty()) {
1915 zonegroups_by_api.erase(old_zonegroup.api_name);
1916 }
1917 }
1918 zonegroups[zonegroup.get_id()] = zonegroup;
1919
1920 if (!zonegroup.api_name.empty()) {
1921 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1922 }
1923
31f18b77 1924 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
1925 master_zonegroup = zonegroup.get_id();
1926 } else if (master_zonegroup == zonegroup.get_id()) {
1927 master_zonegroup = "";
1928 }
1929
1930 for (auto& i : zonegroup.zones) {
1931 auto& zone = i.second;
1932 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1933 continue;
1934 }
1935 // calculate the zone's short id
1936 uint32_t short_id = gen_short_zone_id(zone.id);
1937
1938 // search for an existing zone with the same short id
1939 for (auto& s : short_zone_ids) {
1940 if (s.second == short_id) {
1941 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1942 << ") generates the same short_zone_id " << short_id
1943 << " as existing zone id " << s.first << dendl;
1944 return -EEXIST;
1945 }
1946 }
1947
1948 short_zone_ids[zone.id] = short_id;
1949 }
1950
1951 return 0;
1952}
1953
1954uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1955{
1956 auto i = short_zone_ids.find(zone_id);
1957 if (i == short_zone_ids.end()) {
1958 return 0;
1959 }
1960 return i->second;
1961}
1962
1963int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1964{
1965
1966 RGWPeriod period;
1967 int ret = period.init(cct, store);
1968 if (ret < 0) {
1969 cerr << "failed to read current period info: " << cpp_strerror(ret);
1970 return ret;
1971 }
1972
1973 bucket_quota = period.get_config().bucket_quota;
1974 user_quota = period.get_config().user_quota;
1975 zonegroups = period.get_map().zonegroups;
1976 zonegroups_by_api = period.get_map().zonegroups_by_api;
1977 master_zonegroup = period.get_map().master_zonegroup;
1978
1979 return 0;
1980}
1981
1982void RGWRegionMap::encode(bufferlist& bl) const {
1983 ENCODE_START( 3, 1, bl);
1984 ::encode(regions, bl);
1985 ::encode(master_region, bl);
1986 ::encode(bucket_quota, bl);
1987 ::encode(user_quota, bl);
1988 ENCODE_FINISH(bl);
1989}
1990
1991void RGWRegionMap::decode(bufferlist::iterator& bl) {
1992 DECODE_START(3, bl);
1993 ::decode(regions, bl);
1994 ::decode(master_region, bl);
1995 if (struct_v >= 2)
1996 ::decode(bucket_quota, bl);
1997 if (struct_v >= 3)
1998 ::decode(user_quota, bl);
1999 DECODE_FINISH(bl);
2000}
2001
2002void RGWZoneGroupMap::encode(bufferlist& bl) const {
2003 ENCODE_START( 3, 1, bl);
2004 ::encode(zonegroups, bl);
2005 ::encode(master_zonegroup, bl);
2006 ::encode(bucket_quota, bl);
2007 ::encode(user_quota, bl);
2008 ENCODE_FINISH(bl);
2009}
2010
2011void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
2012 DECODE_START(3, bl);
2013 ::decode(zonegroups, bl);
2014 ::decode(master_zonegroup, bl);
2015 if (struct_v >= 2)
2016 ::decode(bucket_quota, bl);
2017 if (struct_v >= 3)
2018 ::decode(user_quota, bl);
2019 DECODE_FINISH(bl);
2020
2021 zonegroups_by_api.clear();
2022 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2023 iter != zonegroups.end(); ++iter) {
2024 RGWZoneGroup& zonegroup = iter->second;
2025 zonegroups_by_api[zonegroup.api_name] = zonegroup;
31f18b77 2026 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
2027 master_zonegroup = zonegroup.get_name();
2028 }
2029 }
2030}
2031
2032void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2033{
2034 obj_version *check_objv = version_for_check();
2035
2036 if (check_objv) {
2037 cls_version_check(*op, *check_objv, VER_COND_EQ);
2038 }
2039
2040 cls_version_read(*op, &read_version);
2041}
2042
2043void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2044{
2045 obj_version *check_objv = version_for_check();
2046 obj_version *modify_version = version_for_write();
2047
2048 if (check_objv) {
2049 cls_version_check(*op, *check_objv, VER_COND_EQ);
2050 }
2051
2052 if (modify_version) {
2053 cls_version_set(*op, *modify_version);
2054 } else {
2055 cls_version_inc(*op);
2056 }
2057}
2058
2059void RGWObjManifest::obj_iterator::operator++()
2060{
2061 if (manifest->explicit_objs) {
2062 ++explicit_iter;
2063
2064 if (explicit_iter == manifest->objs.end()) {
2065 ofs = manifest->obj_size;
2066 return;
2067 }
2068
2069 update_explicit_pos();
2070
2071 update_location();
2072 return;
2073 }
2074
2075 uint64_t obj_size = manifest->get_obj_size();
2076 uint64_t head_size = manifest->get_head_size();
2077
2078 if (ofs == obj_size) {
2079 return;
2080 }
2081
2082 if (manifest->rules.empty()) {
2083 return;
2084 }
2085
2086 /* are we still pointing at the head? */
2087 if (ofs < head_size) {
2088 rule_iter = manifest->rules.begin();
2089 RGWObjManifestRule *rule = &rule_iter->second;
2090 ofs = MIN(head_size, obj_size);
2091 stripe_ofs = ofs;
2092 cur_stripe = 1;
2093 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2094 if (rule->part_size > 0) {
2095 stripe_size = MIN(stripe_size, rule->part_size);
2096 }
2097 update_location();
2098 return;
2099 }
2100
2101 RGWObjManifestRule *rule = &rule_iter->second;
2102
2103 stripe_ofs += rule->stripe_max_size;
2104 cur_stripe++;
2105 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2106
2107 if (rule->part_size > 0) {
2108 /* multi part, multi stripes object */
2109
2110 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2111
2112 if (stripe_ofs >= part_ofs + rule->part_size) {
2113 /* moved to the next part */
2114 cur_stripe = 0;
2115 part_ofs += rule->part_size;
2116 stripe_ofs = part_ofs;
2117
2118 bool last_rule = (next_rule_iter == manifest->rules.end());
2119 /* move to the next rule? */
2120 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2121 rule_iter = next_rule_iter;
2122 last_rule = (next_rule_iter == manifest->rules.end());
2123 if (!last_rule) {
2124 ++next_rule_iter;
2125 }
2126 cur_part_id = rule_iter->second.start_part_num;
2127 } else {
2128 cur_part_id++;
2129 }
2130
2131 rule = &rule_iter->second;
2132 }
2133
2134 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2135 }
2136
2137 cur_override_prefix = rule->override_prefix;
2138
2139 ofs = stripe_ofs;
2140 if (ofs > obj_size) {
2141 ofs = obj_size;
2142 stripe_ofs = ofs;
2143 stripe_size = 0;
2144 }
2145
2146 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2147 update_location();
2148}
2149
2150int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2151{
2152 manifest = _m;
2153
2154 manifest->set_tail_placement(placement_rule, _b);
2155 manifest->set_head(placement_rule, _obj, 0);
2156 last_ofs = 0;
2157
2158 if (manifest->get_prefix().empty()) {
2159 char buf[33];
2160 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2161
2162 string oid_prefix = ".";
2163 oid_prefix.append(buf);
2164 oid_prefix.append("_");
2165
2166 manifest->set_prefix(oid_prefix);
2167 }
2168
2169 bool found = manifest->get_rule(0, &rule);
2170 if (!found) {
2171 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2172 return -EIO;
2173 }
2174
2175 uint64_t head_size = manifest->get_head_size();
2176
2177 if (head_size > 0) {
2178 cur_stripe_size = head_size;
2179 } else {
2180 cur_stripe_size = rule.stripe_max_size;
2181 }
2182
2183 cur_part_id = rule.start_part_num;
2184
2185 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2186
2187 // Normal object which not generated through copy operation
2188 manifest->set_tail_instance(_obj.key.instance);
2189
2190 manifest->update_iterators();
2191
2192 return 0;
2193}
2194
2195int RGWObjManifest::generator::create_next(uint64_t ofs)
2196{
2197 if (ofs < last_ofs) /* only going forward */
2198 return -EINVAL;
2199
2200 uint64_t max_head_size = manifest->get_max_head_size();
2201
2202 if (ofs < max_head_size) {
2203 manifest->set_head_size(ofs);
2204 }
2205
2206 if (ofs >= max_head_size) {
2207 manifest->set_head_size(max_head_size);
2208 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2209 cur_stripe_size = rule.stripe_max_size;
2210
2211 if (cur_part_id == 0 && max_head_size > 0) {
2212 cur_stripe++;
2213 }
2214 }
2215
2216 last_ofs = ofs;
2217 manifest->set_obj_size(ofs);
2218
2219 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2220
2221 manifest->update_iterators();
2222
2223 return 0;
2224}
2225
2226const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2227{
2228 return begin_iter;
2229}
2230
2231const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2232{
2233 return end_iter;
2234}
2235
2236RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2237{
2238 if (ofs > obj_size) {
2239 ofs = obj_size;
2240 }
2241 RGWObjManifest::obj_iterator iter(this);
2242 iter.seek(ofs);
2243 return iter;
2244}
2245
2246int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2247{
2248 if (explicit_objs || m.explicit_objs) {
2249 return append_explicit(m, zonegroup, zone_params);
2250 }
2251
2252 if (rules.empty()) {
2253 *this = m;
2254 return 0;
2255 }
2256
2257 string override_prefix;
2258
2259 if (prefix.empty()) {
2260 prefix = m.prefix;
2261 }
2262
2263 if (prefix != m.prefix) {
2264 override_prefix = m.prefix;
2265 }
2266
2267 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2268 if (miter == m.rules.end()) {
2269 return append_explicit(m, zonegroup, zone_params);
2270 }
2271
2272 for (; miter != m.rules.end(); ++miter) {
2273 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2274
2275 RGWObjManifestRule& rule = last_rule->second;
2276
2277 if (rule.part_size == 0) {
2278 rule.part_size = obj_size - rule.start_ofs;
2279 }
2280
2281 RGWObjManifestRule& next_rule = miter->second;
2282 if (!next_rule.part_size) {
2283 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2284 }
2285
2286 string rule_prefix = prefix;
2287 if (!rule.override_prefix.empty()) {
2288 rule_prefix = rule.override_prefix;
2289 }
2290
2291 string next_rule_prefix = m.prefix;
2292 if (!next_rule.override_prefix.empty()) {
2293 next_rule_prefix = next_rule.override_prefix;
2294 }
2295
2296 if (rule.part_size != next_rule.part_size ||
2297 rule.stripe_max_size != next_rule.stripe_max_size ||
2298 rule_prefix != next_rule_prefix) {
2299 if (next_rule_prefix != prefix) {
2300 append_rules(m, miter, &next_rule_prefix);
2301 } else {
2302 append_rules(m, miter, NULL);
2303 }
2304 break;
2305 }
2306
2307 uint64_t expected_part_num = rule.start_part_num + 1;
2308 if (rule.part_size > 0) {
2309 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2310 }
2311
2312 if (expected_part_num != next_rule.start_part_num) {
2313 append_rules(m, miter, NULL);
2314 break;
2315 }
2316 }
2317
2318 set_obj_size(obj_size + m.obj_size);
2319
2320 return 0;
2321}
2322
2323int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2324{
2325 return append(m, store->get_zonegroup(), store->get_zone_params());
2326}
2327
2328void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2329 string *override_prefix)
2330{
2331 for (; miter != m.rules.end(); ++miter) {
2332 RGWObjManifestRule rule = miter->second;
2333 rule.start_ofs += obj_size;
2334 if (override_prefix)
2335 rule.override_prefix = *override_prefix;
2336 rules[rule.start_ofs] = rule;
2337 }
2338}
2339
2340void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2341{
2342 if (explicit_objs) {
2343 return;
2344 }
2345 obj_iterator iter = obj_begin();
2346
2347 while (iter != obj_end()) {
2348 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2349 const rgw_obj_select& os = iter.get_location();
2350 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2351 part.loc_ofs = 0;
2352
2353 uint64_t ofs = iter.get_stripe_ofs();
2354
2355 if (ofs == 0) {
2356 part.loc = obj;
2357 } else {
2358 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2359 }
2360 ++iter;
2361 uint64_t next_ofs = iter.get_stripe_ofs();
2362
2363 part.size = next_ofs - ofs;
2364 }
2365
2366 explicit_objs = true;
2367 rules.clear();
2368 prefix.clear();
2369}
2370
2371int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2372{
2373 if (!explicit_objs) {
2374 convert_to_explicit(zonegroup, zone_params);
2375 }
2376 if (!m.explicit_objs) {
2377 m.convert_to_explicit(zonegroup, zone_params);
2378 }
2379 map<uint64_t, RGWObjManifestPart>::iterator iter;
2380 uint64_t base = obj_size;
2381 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2382 RGWObjManifestPart& part = iter->second;
2383 objs[base + iter->first] = part;
2384 }
2385 obj_size += m.obj_size;
2386
2387 return 0;
2388}
2389
2390bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2391{
2392 if (rules.empty()) {
2393 return false;
2394 }
2395
2396 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2397 if (iter != rules.begin()) {
2398 --iter;
2399 }
2400
2401 *rule = iter->second;
2402
2403 return true;
2404}
2405
2406void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2407{
2408 write_version.ver = 1;
2409#define TAG_LEN 24
2410
2411 write_version.tag.clear();
2412 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2413}
2414
2415int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2416 real_time *mtime, real_time set_mtime,
2417 map<string, bufferlist>& attrs, real_time delete_at,
31f18b77
FG
2418 const char *if_match, const char *if_nomatch, const string *user_data,
2419 rgw_zone_set *zones_trace)
7c673cae 2420{
31f18b77 2421 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
7c673cae
FG
2422 if (r < 0)
2423 return r;
2424
2425 is_complete = !canceled;
2426 return 0;
2427}
2428
2429CephContext *RGWPutObjProcessor::ctx()
2430{
2431 return store->ctx();
2432}
2433
2434RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2435{
2436 drain_pending();
2437
2438 if (is_complete)
2439 return;
2440
2441 set<rgw_raw_obj>::iterator iter;
2442 bool need_to_remove_head = false;
2443 rgw_raw_obj raw_head;
2444
2445 if (!head_obj.empty()) {
2446 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2447 }
2448
2449 /**
2450 * We should delete the object in the "multipart" namespace to avoid race condition.
2451 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2452 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2453 * written by the second upload may be deleted by the first upload.
2454 * details is describled on #11749
2455 *
2456 * The above comment still stands, but instead of searching for a specific object in the multipart
2457 * namespace, we just make sure that we remove the object that is marked as the head object after
2458 * we remove all the other raw objects. Note that we use different call to remove the head object,
2459 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2460 */
2461 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2462 const rgw_raw_obj& obj = *iter;
2463 if (!head_obj.empty() && obj == raw_head) {
2464 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2465 need_to_remove_head = true;
2466 continue;
2467 }
2468
2469 int r = store->delete_raw_obj(obj);
2470 if (r < 0 && r != -ENOENT) {
2471 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2472 }
2473 }
2474
2475 if (need_to_remove_head) {
2476 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2477 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2478 if (r < 0 && r != -ENOENT) {
2479 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2480 }
2481 }
2482}
2483
2484int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2485{
2486 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2487 obj_len = abs_ofs + bl.length();
2488
2489 if (!(obj == last_written_obj)) {
2490 last_written_obj = obj;
2491 }
2492
2493 // For the first call pass -1 as the offset to
2494 // do a write_full.
2495 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2496}
2497
2498struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2499{
2500 struct put_obj_aio_info info;
2501 info = pending.front();
2502 pending.pop_front();
2503 pending_size -= info.size;
2504 return info;
2505}
2506
2507int RGWPutObjProcessor_Aio::wait_pending_front()
2508{
2509 if (pending.empty()) {
2510 return 0;
2511 }
2512 struct put_obj_aio_info info = pop_pending();
2513 int ret = store->aio_wait(info.handle);
2514
2515 if (ret >= 0) {
2516 add_written_obj(info.obj);
2517 }
2518
2519 return ret;
2520}
2521
2522bool RGWPutObjProcessor_Aio::pending_has_completed()
2523{
2524 if (pending.empty())
2525 return false;
2526
2527 struct put_obj_aio_info& info = pending.front();
2528 return store->aio_completed(info.handle);
2529}
2530
2531int RGWPutObjProcessor_Aio::drain_pending()
2532{
2533 int ret = 0;
2534 while (!pending.empty()) {
2535 int r = wait_pending_front();
2536 if (r < 0)
2537 ret = r;
2538 }
2539 return ret;
2540}
2541
2542int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2543{
2544 bool _wait = need_to_wait;
2545
2546 if (handle) {
2547 struct put_obj_aio_info info;
2548 info.handle = handle;
2549 info.obj = obj;
2550 info.size = size;
2551 pending_size += size;
2552 pending.push_back(info);
2553 }
2554 size_t orig_size = pending_size;
2555
2556 /* first drain complete IOs */
2557 while (pending_has_completed()) {
2558 int r = wait_pending_front();
2559 if (r < 0)
2560 return r;
2561
2562 _wait = false;
2563 }
2564
2565 /* resize window in case messages are draining too fast */
2566 if (orig_size - pending_size >= window_size) {
2567 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2568 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2569 if (window_size > max_window_size) {
2570 window_size = max_window_size;
2571 }
2572 }
2573
2574 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2575 if (pending_size > window_size || _wait) {
2576 int r = wait_pending_front();
2577 if (r < 0)
2578 return r;
2579 }
2580 return 0;
2581}
2582
2583int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2584{
2585 if (ofs >= next_part_ofs) {
2586 int r = prepare_next_part(ofs);
2587 if (r < 0) {
2588 return r;
2589 }
2590 }
2591
2592 *pobj = cur_obj;
2593
224ce89b
WB
2594 if (!bl.length()) {
2595 *phandle = nullptr;
7c673cae 2596 return 0;
224ce89b 2597 }
7c673cae
FG
2598
2599 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2600}
2601
2602int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2603{
2604 RGWPutObjProcessor::prepare(store, oid_rand);
2605
2606 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2607
2608 return 0;
2609}
2610
2611int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2612{
2613 *phandle = NULL;
2614 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2615
2616 pending_data_bl.claim_append(bl);
2617 if (pending_data_bl.length() < max_write_size) {
2618 *again = false;
2619 return 0;
2620 }
2621
2622 pending_data_bl.splice(0, max_write_size, &bl);
2623
2624 /* do we have enough data pending accumulated that needs to be written? */
2625 *again = (pending_data_bl.length() >= max_chunk_size);
2626
2627 if (!data_ofs && !immutable_head()) {
2628 first_chunk.claim(bl);
2629 obj_len = (uint64_t)first_chunk.length();
2630 int r = prepare_next_part(obj_len);
2631 if (r < 0) {
2632 return r;
2633 }
2634 data_ofs = obj_len;
2635 return 0;
2636 }
2637 off_t write_ofs = data_ofs;
2638 data_ofs = write_ofs + bl.length();
2639 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2640 we could be racing with another upload, to the same
2641 object and cleanup can be messy */
2642 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2643 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2644 bl.clear();
2645 }
2646 return ret;
2647}
2648
2649
2650int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2651{
2652 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2653
2654 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2655 if (r < 0) {
2656 return r;
2657 }
2658
2659 return 0;
2660}
2661
2662int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2663{
2664 head_obj.init(bucket, obj_str);
2665
2666 int r = prepare_init(store, oid_rand);
2667 if (r < 0) {
2668 return r;
2669 }
2670
2671 if (!version_id.empty()) {
2672 head_obj.key.set_instance(version_id);
2673 } else if (versioned_object) {
2674 store->gen_rand_obj_instance_name(&head_obj);
2675 }
2676
2677 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2678
2679 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2680 if (r < 0) {
2681 return r;
2682 }
2683
2684 return 0;
2685}
2686
2687int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2688
2689 int ret = manifest_gen.create_next(ofs);
2690 if (ret < 0) {
2691 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2692 return ret;
2693 }
2694 cur_part_ofs = ofs;
2695 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2696 cur_obj = manifest_gen.get_cur_obj(store);
2697
2698 return 0;
2699}
2700
2701int RGWPutObjProcessor_Atomic::complete_parts()
2702{
2703 if (obj_len > (uint64_t)cur_part_ofs) {
2704 return prepare_next_part(obj_len);
2705 }
2706 return 0;
2707}
2708
2709int RGWPutObjProcessor_Atomic::complete_writing_data()
2710{
2711 if (!data_ofs && !immutable_head()) {
2712 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2713 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2714 * clobber first_chunk
2715 */
2716 if (pending_data_bl.length() > 0) {
2717 first_chunk.claim(pending_data_bl);
2718 }
2719 obj_len = (uint64_t)first_chunk.length();
2720 }
2721 while (pending_data_bl.length()) {
224ce89b 2722 void *handle = nullptr;
7c673cae
FG
2723 rgw_raw_obj obj;
2724 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2725 if (max_write_size > pending_data_bl.length()) {
2726 max_write_size = pending_data_bl.length();
2727 }
2728 bufferlist bl;
2729 pending_data_bl.splice(0, max_write_size, &bl);
2730 uint64_t write_len = bl.length();
2731 int r = write_data(bl, data_ofs, &handle, &obj, false);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2734 return r;
2735 }
2736 data_ofs += write_len;
2737 r = throttle_data(handle, obj, write_len, false);
2738 if (r < 0) {
2739 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2740 return r;
2741 }
2742
2743 if (data_ofs >= next_part_ofs) {
2744 r = prepare_next_part(data_ofs);
2745 if (r < 0) {
2746 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2747 return r;
2748 }
2749 }
2750 }
2751 int r = complete_parts();
2752 if (r < 0) {
2753 return r;
2754 }
2755
2756 r = drain_pending();
2757 if (r < 0)
2758 return r;
2759
2760 return 0;
2761}
2762
2763int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2764 real_time *mtime, real_time set_mtime,
2765 map<string, bufferlist>& attrs,
2766 real_time delete_at,
2767 const char *if_match,
31f18b77
FG
2768 const char *if_nomatch, const string *user_data,
2769 rgw_zone_set *zones_trace) {
7c673cae
FG
2770 int r = complete_writing_data();
2771 if (r < 0)
2772 return r;
2773
2774 obj_ctx.obj.set_atomic(head_obj);
2775
2776 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2777
2778 /* some object types shouldn't be versioned, e.g., multipart parts */
2779 op_target.set_versioning_disabled(!versioned_object);
2780
2781 RGWRados::Object::Write obj_op(&op_target);
2782
2783 obj_op.meta.data = &first_chunk;
2784 obj_op.meta.manifest = &manifest;
2785 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2786 obj_op.meta.if_match = if_match;
2787 obj_op.meta.if_nomatch = if_nomatch;
2788 obj_op.meta.mtime = mtime;
2789 obj_op.meta.set_mtime = set_mtime;
2790 obj_op.meta.owner = bucket_info.owner;
2791 obj_op.meta.flags = PUT_OBJ_CREATE;
2792 obj_op.meta.olh_epoch = olh_epoch;
2793 obj_op.meta.delete_at = delete_at;
2794 obj_op.meta.user_data = user_data;
31f18b77 2795 obj_op.meta.zones_trace = zones_trace;
181888fb 2796 obj_op.meta.modify_tail = true;
7c673cae
FG
2797
2798 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2799 if (r < 0) {
2800 return r;
2801 }
2802
2803 canceled = obj_op.meta.canceled;
2804
2805 return 0;
2806}
2807
3a9019d9
FG
2808const char* RGWRados::admin_commands[4][3] = {
2809 { "cache list",
2810 "cache list name=filter,type=CephString,req=false",
2811 "cache list [filter_str]: list object cache, possibly matching substrings" },
2812 { "cache inspect",
2813 "cache inspect name=target,type=CephString,req=true",
2814 "cache inspect target: print cache element" },
2815 { "cache erase",
2816 "cache erase name=target,type=CephString,req=true",
2817 "cache erase target: erase element from cache" },
2818 { "cache zap",
2819 "cache zap",
2820 "cache zap: erase all elements from cache" }
2821};
2822
2823
7c673cae
FG
2824int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2825 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2826 if (r < 0)
2827 return r;
2828 return 0;
2829}
2830
2831int RGWRados::unwatch(uint64_t watch_handle)
2832{
2833 int r = control_pool_ctx.unwatch2(watch_handle);
2834 if (r < 0) {
2835 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2836 return r;
2837 }
2838 r = rados[0].watch_flush();
2839 if (r < 0) {
2840 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2841 return r;
2842 }
2843 return 0;
2844}
2845
2846void RGWRados::add_watcher(int i)
2847{
2848 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2849 Mutex::Locker l(watchers_lock);
2850 watchers_set.insert(i);
2851 if (watchers_set.size() == (size_t)num_watchers) {
2852 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2853 set_cache_enabled(true);
2854 }
2855}
2856
2857void RGWRados::remove_watcher(int i)
2858{
2859 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2860 Mutex::Locker l(watchers_lock);
2861 size_t orig_size = watchers_set.size();
2862 watchers_set.erase(i);
2863 if (orig_size == (size_t)num_watchers &&
2864 watchers_set.size() < orig_size) { /* actually removed */
2865 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2866 set_cache_enabled(false);
2867 }
2868}
2869
2870class RGWWatcher : public librados::WatchCtx2 {
2871 RGWRados *rados;
2872 int index;
2873 string oid;
2874 uint64_t watch_handle;
2875
2876 class C_ReinitWatch : public Context {
2877 RGWWatcher *watcher;
2878 public:
2879 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2880 void finish(int r) override {
2881 watcher->reinit();
2882 }
2883 };
2884public:
2885 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2886 void handle_notify(uint64_t notify_id,
2887 uint64_t cookie,
2888 uint64_t notifier_id,
2889 bufferlist& bl) override {
2890 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2891 << " notify_id " << notify_id
2892 << " cookie " << cookie
2893 << " notifier " << notifier_id
2894 << " bl.length()=" << bl.length() << dendl;
2895 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2896
2897 bufferlist reply_bl; // empty reply payload
2898 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2899 }
2900 void handle_error(uint64_t cookie, int err) override {
2901 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2902 << " err " << cpp_strerror(err) << dendl;
2903 rados->remove_watcher(index);
2904 rados->schedule_context(new C_ReinitWatch(this));
2905 }
2906
2907 void reinit() {
2908 int ret = unregister_watch();
2909 if (ret < 0) {
2910 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2911 return;
2912 }
2913 ret = register_watch();
2914 if (ret < 0) {
2915 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2916 return;
2917 }
2918 }
2919
2920 int unregister_watch() {
2921 int r = rados->unwatch(watch_handle);
2922 if (r < 0) {
2923 return r;
2924 }
2925 rados->remove_watcher(index);
2926 return 0;
2927 }
2928
2929 int register_watch() {
2930 int r = rados->watch(oid, &watch_handle, this);
2931 if (r < 0) {
2932 return r;
2933 }
2934 rados->add_watcher(index);
2935 return 0;
2936 }
2937};
2938
2939class RGWMetaNotifierManager : public RGWCoroutinesManager {
2940 RGWRados *store;
2941 RGWHTTPManager http_manager;
2942
2943public:
2944 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2945 http_manager(store->ctx(), completion_mgr) {
2946 http_manager.set_threaded();
2947 }
2948
2949 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2950 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2951 { "notify", NULL },
2952 { NULL, NULL } };
2953
2954 list<RGWCoroutinesStack *> stacks;
2955 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2956 RGWRESTConn *conn = iter->second;
2957 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2958 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2959
2960 stacks.push_back(stack);
2961 }
2962 return run(stacks);
2963 }
2964};
2965
2966class RGWDataNotifierManager : public RGWCoroutinesManager {
2967 RGWRados *store;
2968 RGWHTTPManager http_manager;
2969
2970public:
2971 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2972 http_manager(store->ctx(), completion_mgr) {
2973 http_manager.set_threaded();
2974 }
2975
2976 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2977 rgw_http_param_pair pairs[] = { { "type", "data" },
2978 { "notify", NULL },
2979 { "source-zone", store->get_zone_params().get_id().c_str() },
2980 { NULL, NULL } };
2981
2982 list<RGWCoroutinesStack *> stacks;
2983 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2984 RGWRESTConn *conn = iter->second;
2985 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2986 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2987
2988 stacks.push_back(stack);
2989 }
2990 return run(stacks);
2991 }
2992};
2993
2994class RGWRadosThread {
2995 class Worker : public Thread {
2996 CephContext *cct;
2997 RGWRadosThread *processor;
2998 Mutex lock;
2999 Cond cond;
3000
31f18b77
FG
3001 void wait() {
3002 Mutex::Locker l(lock);
3003 cond.Wait(lock);
3004 };
3005
3006 void wait_interval(const utime_t& wait_time) {
3007 Mutex::Locker l(lock);
3008 cond.WaitInterval(lock, wait_time);
3009 }
3010
7c673cae
FG
3011 public:
3012 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
3013 void *entry() override;
31f18b77 3014 void signal() {
7c673cae
FG
3015 Mutex::Locker l(lock);
3016 cond.Signal();
3017 }
3018 };
3019
3020 Worker *worker;
3021
3022protected:
3023 CephContext *cct;
3024 RGWRados *store;
3025
3026 std::atomic<bool> down_flag = { false };
3027
3028 string thread_name;
3029
3030 virtual uint64_t interval_msec() = 0;
3031 virtual void stop_process() {}
3032public:
3033 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3034 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3035 virtual ~RGWRadosThread() {
3036 stop();
3037 }
3038
3039 virtual int init() { return 0; }
3040 virtual int process() = 0;
3041
3042 bool going_down() { return down_flag; }
3043
3044 void start();
3045 void stop();
31f18b77
FG
3046
3047 void signal() {
3048 if (worker) {
3049 worker->signal();
3050 }
3051 }
7c673cae
FG
3052};
3053
3054void RGWRadosThread::start()
3055{
3056 worker = new Worker(cct, this);
3057 worker->create(thread_name.c_str());
3058}
3059
3060void RGWRadosThread::stop()
3061{
3062 down_flag = true;
3063 stop_process();
3064 if (worker) {
31f18b77 3065 worker->signal();
7c673cae
FG
3066 worker->join();
3067 }
3068 delete worker;
3069 worker = NULL;
3070}
3071
3072void *RGWRadosThread::Worker::entry() {
3073 uint64_t msec = processor->interval_msec();
3074 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3075
3076 do {
3077 utime_t start = ceph_clock_now();
3078 int r = processor->process();
3079 if (r < 0) {
3080 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3081 }
3082
3083 if (processor->going_down())
3084 break;
3085
3086 utime_t end = ceph_clock_now();
3087 end -= start;
3088
3089 uint64_t cur_msec = processor->interval_msec();
3090 if (cur_msec != msec) { /* was it reconfigured? */
3091 msec = cur_msec;
3092 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3093 }
3094
3095 if (cur_msec > 0) {
3096 if (interval <= end)
3097 continue; // next round
3098
3099 utime_t wait_time = interval;
3100 wait_time -= end;
3101
31f18b77 3102 wait_interval(wait_time);
7c673cae 3103 } else {
31f18b77 3104 wait();
7c673cae
FG
3105 }
3106 } while (!processor->going_down());
3107
3108 return NULL;
3109}
3110
3111class RGWMetaNotifier : public RGWRadosThread {
3112 RGWMetaNotifierManager notify_mgr;
3113 RGWMetadataLog *const log;
3114
3115 uint64_t interval_msec() override {
3116 return cct->_conf->rgw_md_notify_interval_msec;
3117 }
1adf2230
AA
3118 void stop_process() override {
3119 notify_mgr.stop();
3120 }
7c673cae
FG
3121public:
3122 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3123 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3124
3125 int process() override;
3126};
3127
3128int RGWMetaNotifier::process()
3129{
3130 set<int> shards;
3131
3132 log->read_clear_modified(shards);
3133
3134 if (shards.empty()) {
3135 return 0;
3136 }
3137
3138 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3139 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3140 }
3141
3142 notify_mgr.notify_all(store->zone_conn_map, shards);
3143
3144 return 0;
3145}
3146
3147class RGWDataNotifier : public RGWRadosThread {
3148 RGWDataNotifierManager notify_mgr;
3149
3150 uint64_t interval_msec() override {
d2e6a577 3151 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 3152 }
1adf2230
AA
3153 void stop_process() override {
3154 notify_mgr.stop();
3155 }
7c673cae
FG
3156public:
3157 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3158
3159 int process() override;
3160};
3161
3162int RGWDataNotifier::process()
3163{
3164 if (!store->data_log) {
3165 return 0;
3166 }
3167
3168 map<int, set<string> > shards;
3169
3170 store->data_log->read_clear_modified(shards);
3171
3172 if (shards.empty()) {
3173 return 0;
3174 }
3175
3176 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3177 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3178 }
3179
3180 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3181
3182 return 0;
3183}
3184
3185class RGWSyncProcessorThread : public RGWRadosThread {
3186public:
3187 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3188 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3189 ~RGWSyncProcessorThread() override {}
3190 int init() override = 0 ;
3191 int process() override = 0;
3192};
3193
3194class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3195{
3196 RGWMetaSyncStatusManager sync;
3197
3198 uint64_t interval_msec() override {
3199 return 0; /* no interval associated, it'll run once until stopped */
3200 }
3201 void stop_process() override {
3202 sync.stop();
3203 }
3204public:
3205 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3206 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3207
3208 void wakeup_sync_shards(set<int>& shard_ids) {
3209 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3210 sync.wakeup(*iter);
3211 }
3212 }
3213 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3214
3215 int init() override {
3216 int ret = sync.init();
3217 if (ret < 0) {
3218 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3219 return ret;
3220 }
3221 return 0;
3222 }
3223
3224 int process() override {
3225 sync.run();
3226 return 0;
3227 }
3228};
3229
3230class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3231{
3232 RGWDataSyncStatusManager sync;
3233 bool initialized;
3234
3235 uint64_t interval_msec() override {
3236 if (initialized) {
3237 return 0; /* no interval associated, it'll run once until stopped */
3238 } else {
3239#define DATA_SYNC_INIT_WAIT_SEC 20
3240 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3241 }
3242 }
3243 void stop_process() override {
3244 sync.stop();
3245 }
3246public:
3247 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
91327a77 3248 const string& _source_zone)
b32b8144 3249 : RGWSyncProcessorThread(_store, "data-sync"),
91327a77 3250 sync(_store, async_rados, _source_zone),
7c673cae
FG
3251 initialized(false) {}
3252
3253 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3254 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3255 sync.wakeup(iter->first, iter->second);
3256 }
3257 }
3258 RGWDataSyncStatusManager* get_manager() { return &sync; }
3259
3260 int init() override {
3261 return 0;
3262 }
3263
3264 int process() override {
3265 while (!initialized) {
3266 if (going_down()) {
3267 return 0;
3268 }
3269 int ret = sync.init();
3270 if (ret >= 0) {
3271 initialized = true;
3272 break;
3273 }
3274 /* we'll be back! */
3275 return 0;
3276 }
3277 sync.run();
3278 return 0;
3279 }
3280};
3281
3282class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3283{
3284 RGWCoroutinesManager crs;
3285 RGWRados *store;
b32b8144 3286 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
3287 RGWHTTPManager http;
3288 const utime_t trim_interval;
3289
3290 uint64_t interval_msec() override { return 0; }
3291 void stop_process() override { crs.stop(); }
3292public:
b32b8144
FG
3293 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
3294 int interval)
7c673cae
FG
3295 : RGWSyncProcessorThread(store, "sync-log-trim"),
3296 crs(store->ctx(), store->get_cr_registry()), store(store),
b32b8144 3297 bucket_trim(bucket_trim),
7c673cae
FG
3298 http(store->ctx(), crs.get_completion_mgr()),
3299 trim_interval(interval, 0)
3300 {}
3301
3302 int init() override {
3303 return http.set_threaded();
3304 }
3305 int process() override {
3306 list<RGWCoroutinesStack*> stacks;
3307 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3308 meta->call(create_meta_log_trim_cr(store, &http,
3309 cct->_conf->rgw_md_log_max_shards,
3310 trim_interval));
3311 stacks.push_back(meta);
3312
3313 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3314 data->call(create_data_log_trim_cr(store, &http,
3315 cct->_conf->rgw_data_log_num_shards,
3316 trim_interval));
3317 stacks.push_back(data);
3318
b32b8144
FG
3319 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
3320 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
3321 stacks.push_back(bucket);
3322
7c673cae
FG
3323 crs.run(stacks);
3324 return 0;
3325 }
3326};
3327
3328void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3329{
3330 Mutex::Locker l(meta_sync_thread_lock);
3331 if (meta_sync_processor_thread) {
3332 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3333 }
3334}
3335
3336void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3337{
3338 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3339 Mutex::Locker l(data_sync_thread_lock);
3340 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3341 if (iter == data_sync_processor_threads.end()) {
3342 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3343 return;
3344 }
3345
3346 RGWDataSyncProcessorThread *thread = iter->second;
3347 assert(thread);
3348 thread->wakeup_sync_shards(shard_ids);
3349}
3350
3351RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3352{
3353 Mutex::Locker l(meta_sync_thread_lock);
3354 if (meta_sync_processor_thread) {
3355 return meta_sync_processor_thread->get_manager();
3356 }
3357 return nullptr;
3358}
3359
3360RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3361{
3362 Mutex::Locker l(data_sync_thread_lock);
3363 auto thread = data_sync_processor_threads.find(source_zone);
3364 if (thread == data_sync_processor_threads.end()) {
3365 return nullptr;
3366 }
3367 return thread->second->get_manager();
3368}
3369
3370int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3371{
3372 IoCtx ioctx;
3373 int r = open_pool_ctx(pool, ioctx);
3374 if (r < 0) {
3375 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3376 return r;
3377 }
3378
3379 bool requires;
3380 r = ioctx.pool_requires_alignment2(&requires);
3381 if (r < 0) {
3382 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3383 << r << dendl;
3384 return r;
3385 }
3386
3387 if (!requires) {
3388 *alignment = 0;
3389 return 0;
3390 }
3391
3392 uint64_t align;
3393 r = ioctx.pool_required_alignment2(&align);
3394 if (r < 0) {
3395 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3396 << r << dendl;
3397 return r;
3398 }
3399 if (align != 0) {
3400 ldout(cct, 20) << "required alignment=" << align << dendl;
3401 }
3402 *alignment = align;
3403 return 0;
3404}
3405
3406int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3407{
224ce89b 3408 uint64_t alignment = 0;
7c673cae
FG
3409 int r = get_required_alignment(pool, &alignment);
3410 if (r < 0) {
3411 return r;
3412 }
3413
3414 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3415
3416 if (alignment == 0) {
3417 *max_chunk_size = config_chunk_size;
3418 return 0;
3419 }
3420
3421 if (config_chunk_size <= alignment) {
3422 *max_chunk_size = alignment;
3423 return 0;
3424 }
3425
3426 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3427
3428 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3429
3430 return 0;
3431}
3432
3433int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3434{
3435 rgw_pool pool;
3436 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3437 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3438 return -EIO;
3439 }
3440 return get_max_chunk_size(pool, max_chunk_size);
3441}
3442
31f18b77
FG
3443class RGWIndexCompletionManager;
3444
3445struct complete_op_data {
3446 Mutex lock{"complete_op_data"};
3447 AioCompletion *rados_completion{nullptr};
3448 int manager_shard_id{-1};
3449 RGWIndexCompletionManager *manager{nullptr};
3450 rgw_obj obj;
3451 RGWModifyOp op;
3452 string tag;
3453 rgw_bucket_entry_ver ver;
3454 cls_rgw_obj_key key;
3455 rgw_bucket_dir_entry_meta dir_meta;
3456 list<cls_rgw_obj_key> remove_objs;
3457 bool log_op;
3458 uint16_t bilog_op;
3459 rgw_zone_set zones_trace;
3460
3461 bool stopped{false};
3462
3463 void stop() {
3464 Mutex::Locker l(lock);
3465 stopped = true;
3466 }
3467};
3468
3469class RGWIndexCompletionThread : public RGWRadosThread {
3470 RGWRados *store;
3471
3472 uint64_t interval_msec() override {
3473 return 0;
3474 }
3475
3476 list<complete_op_data *> completions;
3477
3478 Mutex completions_lock;
3479public:
3480 RGWIndexCompletionThread(RGWRados *_store)
3481 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3482
3483 int process() override;
3484
3485 void add_completion(complete_op_data *completion) {
3486 {
3487 Mutex::Locker l(completions_lock);
3488 completions.push_back(completion);
3489 }
3490
3491 signal();
3492 }
3493};
3494
3495int RGWIndexCompletionThread::process()
3496{
3497 list<complete_op_data *> comps;
3498
3499 {
3500 Mutex::Locker l(completions_lock);
3501 completions.swap(comps);
3502 }
3503
3504 for (auto c : comps) {
3505 std::unique_ptr<complete_op_data> up{c};
3506
3507 if (going_down()) {
3508 continue;
3509 }
3510 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3511
3512 RGWRados::BucketShard bs(store);
f64942e4 3513 RGWBucketInfo bucket_info;
31f18b77 3514
f64942e4 3515 int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
31f18b77
FG
3516 if (r < 0) {
3517 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3518 /* not much to do */
3519 continue;
3520 }
3521
f64942e4
AA
3522 r = store->guard_reshard(&bs, c->obj, bucket_info,
3523 [&](RGWRados::BucketShard *bs) -> int {
3524 librados::ObjectWriteOperation o;
3525 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3526 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3527 c->log_op, c->bilog_op, &c->zones_trace);
3528 return bs->index_ctx.operate(bs->bucket_obj, &o);
31f18b77
FG
3529 });
3530 if (r < 0) {
3531 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3532 /* ignoring error, can't do anything about it */
3533 continue;
3534 }
3535 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3536 if (r < 0) {
3537 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3538 }
3539 }
3540
3541 return 0;
3542}
3543
3544class RGWIndexCompletionManager {
3545 RGWRados *store{nullptr};
3546 vector<Mutex *> locks;
3547 vector<set<complete_op_data *> > completions;
3548
3549 RGWIndexCompletionThread *completion_thread{nullptr};
3550
3551 int num_shards;
3552
3553 std::atomic<int> cur_shard {0};
3554
3555
3556public:
3557 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3558 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3559
3560 for (int i = 0; i < num_shards; i++) {
3561 char buf[64];
3562 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3563 locks.push_back(new Mutex(buf));
3564 }
3565
3566 completions.resize(num_shards);
3567 }
3568 ~RGWIndexCompletionManager() {
3569 stop();
3570
3571 for (auto l : locks) {
3572 delete l;
3573 }
3574 }
3575
3576 int next_shard() {
3577 int result = cur_shard % num_shards;
3578 cur_shard++;
3579 return result;
3580 }
3581
3582 void create_completion(const rgw_obj& obj,
3583 RGWModifyOp op, string& tag,
3584 rgw_bucket_entry_ver& ver,
3585 const cls_rgw_obj_key& key,
3586 rgw_bucket_dir_entry_meta& dir_meta,
3587 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3588 uint16_t bilog_op,
3589 rgw_zone_set *zones_trace,
3590 complete_op_data **result);
3591 bool handle_completion(completion_t cb, complete_op_data *arg);
3592
3593 int start() {
3594 completion_thread = new RGWIndexCompletionThread(store);
3595 int ret = completion_thread->init();
3596 if (ret < 0) {
3597 return ret;
3598 }
3599 completion_thread->start();
3600 return 0;
3601 }
3602 void stop() {
3603 if (completion_thread) {
3604 completion_thread->stop();
3605 delete completion_thread;
3606 }
3607
3608 for (int i = 0; i < num_shards; ++i) {
3609 Mutex::Locker l(*locks[i]);
3610 for (auto c : completions[i]) {
31f18b77
FG
3611 c->stop();
3612 }
3613 }
3614 completions.clear();
3615 }
3616};
3617
3618static void obj_complete_cb(completion_t cb, void *arg)
3619{
3620 complete_op_data *completion = (complete_op_data *)arg;
3621 completion->lock.Lock();
3622 if (completion->stopped) {
3623 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3624 delete completion;
3625 return;
3626 }
3627 bool need_delete = completion->manager->handle_completion(cb, completion);
3628 completion->lock.Unlock();
3629 if (need_delete) {
3630 delete completion;
3631 }
3632}
3633
3634
3635void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3636 RGWModifyOp op, string& tag,
3637 rgw_bucket_entry_ver& ver,
3638 const cls_rgw_obj_key& key,
3639 rgw_bucket_dir_entry_meta& dir_meta,
3640 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3641 uint16_t bilog_op,
3642 rgw_zone_set *zones_trace,
3643 complete_op_data **result)
3644{
3645 complete_op_data *entry = new complete_op_data;
3646
3647 int shard_id = next_shard();
3648
3649 entry->manager_shard_id = shard_id;
3650 entry->manager = this;
3651 entry->obj = obj;
3652 entry->op = op;
3653 entry->tag = tag;
3654 entry->ver = ver;
3655 entry->key = key;
3656 entry->dir_meta = dir_meta;
3657 entry->log_op = log_op;
3658 entry->bilog_op = bilog_op;
3659
3660 if (remove_objs) {
3661 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3662 entry->remove_objs.push_back(*iter);
3663 }
3664 }
3665
3666 if (zones_trace) {
3667 entry->zones_trace = *zones_trace;
3668 } else {
3669 entry->zones_trace.insert(store->get_zone().id);
3670 }
3671
3672 *result = entry;
3673
3674 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3675
3676 Mutex::Locker l(*locks[shard_id]);
3677 completions[shard_id].insert(entry);
3678}
3679
3680bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3681{
3682 int shard_id = arg->manager_shard_id;
3683 {
3684 Mutex::Locker l(*locks[shard_id]);
3685
3686 auto& comps = completions[shard_id];
3687
3688 auto iter = comps.find(arg);
3689 if (iter == comps.end()) {
3690 return true;
3691 }
3692
3693 comps.erase(iter);
3694 }
3695
3696 int r = rados_aio_get_return_value(cb);
3697 if (r != -ERR_BUSY_RESHARDING) {
3698 return true;
3699 }
3700 completion_thread->add_completion(arg);
3701 return false;
3702}
3703
7c673cae
FG
3704void RGWRados::finalize()
3705{
3a9019d9
FG
3706 auto admin_socket = cct->get_admin_socket();
3707 for (auto cmd : admin_commands) {
3708 int r = admin_socket->unregister_command(cmd[0]);
3709 if (r < 0) {
3710 lderr(cct) << "ERROR: fail to unregister admin socket command (r=" << r
3711 << ")" << dendl;
3712 }
3713 }
3714
7c673cae
FG
3715 if (run_sync_thread) {
3716 Mutex::Locker l(meta_sync_thread_lock);
3717 meta_sync_processor_thread->stop();
3718
3719 Mutex::Locker dl(data_sync_thread_lock);
3720 for (auto iter : data_sync_processor_threads) {
3721 RGWDataSyncProcessorThread *thread = iter.second;
3722 thread->stop();
3723 }
3724 if (sync_log_trimmer) {
3725 sync_log_trimmer->stop();
3726 }
3727 }
3728 if (async_rados) {
3729 async_rados->stop();
3730 }
3731 if (run_sync_thread) {
3732 delete meta_sync_processor_thread;
3733 meta_sync_processor_thread = NULL;
3734 Mutex::Locker dl(data_sync_thread_lock);
3735 for (auto iter : data_sync_processor_threads) {
3736 RGWDataSyncProcessorThread *thread = iter.second;
3737 delete thread;
3738 }
3739 data_sync_processor_threads.clear();
3740 delete sync_log_trimmer;
3741 sync_log_trimmer = nullptr;
b32b8144 3742 bucket_trim = boost::none;
7c673cae
FG
3743 }
3744 if (finisher) {
3745 finisher->stop();
3746 }
3747 if (need_watch_notify()) {
3748 finalize_watch();
3749 }
3750 if (finisher) {
3751 /* delete finisher only after cleaning up watches, as watch error path might call
3752 * into finisher. We stop finisher before finalizing watch to make sure we don't
3753 * actually handle any racing work
3754 */
3755 delete finisher;
3756 }
3757 if (meta_notifier) {
3758 meta_notifier->stop();
3759 delete meta_notifier;
3760 }
3761 if (data_notifier) {
3762 data_notifier->stop();
3763 delete data_notifier;
3764 }
3765 delete data_log;
3766 if (async_rados) {
3767 delete async_rados;
3768 }
224ce89b 3769
c07f9fc5
FG
3770 delete lc;
3771 lc = NULL;
3772
7c673cae
FG
3773 delete gc;
3774 gc = NULL;
3775
7c673cae
FG
3776 delete obj_expirer;
3777 obj_expirer = NULL;
3778
3779 delete rest_master_conn;
3780
3781 map<string, RGWRESTConn *>::iterator iter;
3782 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3783 RGWRESTConn *conn = iter->second;
3784 delete conn;
3785 }
3786
3787 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3788 RGWRESTConn *conn = iter->second;
3789 delete conn;
3790 }
3791 RGWQuotaHandler::free_handler(quota_handler);
3792 if (cr_registry) {
3793 cr_registry->put();
3794 }
3795 delete meta_mgr;
3796 delete binfo_cache;
3797 delete obj_tombstone_cache;
3798 delete sync_modules_manager;
31f18b77
FG
3799
3800 if (reshard_wait.get()) {
3801 reshard_wait->stop();
3802 reshard_wait.reset();
3803 }
3804
3805 if (run_reshard_thread) {
3806 reshard->stop_processor();
3807 }
3808 delete reshard;
3809 delete index_completion_manager;
7c673cae
FG
3810}
3811
3812/**
3813 * Initialize the RADOS instance and prepare to do other ops
3814 * Returns 0 on success, -ERR# on failure.
3815 */
3816int RGWRados::init_rados()
3817{
3818 int ret = 0;
3a9019d9
FG
3819 auto admin_socket = cct->get_admin_socket();
3820 for (auto cmd : admin_commands) {
3821 int r = admin_socket->register_command(cmd[0], cmd[1], this,
3822 cmd[2]);
3823 if (r < 0) {
3824 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
3825 << ")" << dendl;
3826 return r;
3827 }
3828 }
3829
7c673cae
FG
3830 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3831
3832 for (auto& r : handles) {
3833 ret = r.init_with_context(cct);
3834 if (ret < 0) {
3835 return ret;
3836 }
7c673cae
FG
3837 ret = r.connect();
3838 if (ret < 0) {
3839 return ret;
3840 }
3841 }
3842
3843 sync_modules_manager = new RGWSyncModulesManager();
3844
3845 rgw_register_sync_modules(sync_modules_manager);
3846
3847 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3848 new RGWCoroutinesManagerRegistry(cct)};
3849 ret = crs->hook_to_admin_command("cr dump");
3850 if (ret < 0) {
3851 return ret;
3852 }
3853
3854 meta_mgr = new RGWMetadataManager(cct, this);
3855 data_log = new RGWDataChangesLog(cct, this);
3856 cr_registry = crs.release();
3857
3858 std::swap(handles, rados);
3859 return ret;
3860}
3861
224ce89b
WB
3862
3863int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3864{
3865 map<string,string> metadata = meta;
3866 metadata["num_handles"] = stringify(rados.size());
3867 metadata["zonegroup_id"] = zonegroup.get_id();
3868 metadata["zonegroup_name"] = zonegroup.get_name();
3869 metadata["zone_name"] = zone_name();
3870 metadata["zone_id"] = zone_id();;
3871 string name = cct->_conf->name.get_id();
3872 if (name.find("rgw.") == 0) {
3873 name = name.substr(4);
3874 }
3875 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3876 if (ret < 0) {
3877 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3878 return ret;
3879 }
3880
3881 return 0;
3882}
3883
7c673cae
FG
3884/**
3885 * Add new connection to connections map
3886 * @param zonegroup_conn_map map which new connection will be added to
3887 * @param zonegroup zonegroup which new connection will connect to
3888 * @param new_connection pointer to new connection instance
3889 */
3890static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3891 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3892{
3893 // Delete if connection is already exists
3894 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3895 if (iterZoneGroup != zonegroup_conn_map.end()) {
3896 delete iterZoneGroup->second;
3897 }
3898
3899 // Add new connection to connections map
3900 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3901}
3902
3903int RGWRados::convert_regionmap()
3904{
3905 RGWZoneGroupMap zonegroupmap;
3906
3907 string pool_name = cct->_conf->rgw_zone_root_pool;
3908 if (pool_name.empty()) {
3909 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3910 }
3911 string oid = region_map_oid;
3912
3913 rgw_pool pool(pool_name);
3914 bufferlist bl;
3915 RGWObjectCtx obj_ctx(this);
3916 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3917 if (ret < 0 && ret != -ENOENT) {
3918 return ret;
3919 } else if (ret == -ENOENT) {
3920 return 0;
3921 }
3922
3923 try {
3924 bufferlist::iterator iter = bl.begin();
3925 ::decode(zonegroupmap, iter);
3926 } catch (buffer::error& err) {
3927 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3928 return -EIO;
3929 }
3930
3931 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3932 iter != zonegroupmap.zonegroups.end(); ++iter) {
3933 RGWZoneGroup& zonegroup = iter->second;
3934 ret = zonegroup.init(cct, this, false);
3935 ret = zonegroup.update();
3936 if (ret < 0 && ret != -ENOENT) {
3937 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3938 cpp_strerror(-ret) << dendl;
3939 return ret;
3940 } else if (ret == -ENOENT) {
3941 ret = zonegroup.create();
3942 if (ret < 0) {
3943 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3944 cpp_strerror(-ret) << dendl;
3945 return ret;
3946 }
3947 }
3948 }
3949
3950 current_period.set_user_quota(zonegroupmap.user_quota);
3951 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3952
3953 // remove the region_map so we don't try to convert again
3954 rgw_raw_obj obj(pool, oid);
3955 ret = delete_system_obj(obj);
3956 if (ret < 0) {
3957 ldout(cct, 0) << "Error could not remove " << obj
3958 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3959 return ret;
3960 }
3961
3962 return 0;
3963}
3964
3965/**
3966 * Replace all region configuration with zonegroup for
3967 * backward compatability
3968 * Returns 0 on success, -ERR# on failure.
3969 */
3970int RGWRados::replace_region_with_zonegroup()
3971{
3972 /* copy default region */
3973 /* convert default region to default zonegroup */
3974 string default_oid = cct->_conf->rgw_default_region_info_oid;
3975 if (default_oid.empty()) {
3976 default_oid = default_region_info_oid;
3977 }
3978
3979
3980 RGWZoneGroup default_zonegroup;
3981 rgw_pool pool{default_zonegroup.get_pool(cct)};
3982 string oid = "converted";
3983 bufferlist bl;
3984 RGWObjectCtx obj_ctx(this);
3985
3986 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3987 if (ret < 0 && ret != -ENOENT) {
3988 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3989 << dendl;
3990 return ret;
3991 } else if (ret != -ENOENT) {
3992 ldout(cct, 20) << "System already converted " << dendl;
3993 return 0;
3994 }
3995
3996 string default_region;
3997 ret = default_zonegroup.init(cct, this, false, true);
3998 if (ret < 0) {
3999 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4000 return ret;
4001 }
4002 ret = default_zonegroup.read_default_id(default_region, true);
4003 if (ret < 0 && ret != -ENOENT) {
4004 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4005 return ret;
4006 }
4007
4008 /* convert regions to zonegroups */
4009 list<string> regions;
4010 ret = list_regions(regions);
4011 if (ret < 0 && ret != -ENOENT) {
4012 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4013 return ret;
4014 } else if (ret == -ENOENT || regions.empty()) {
4015 RGWZoneParams zoneparams(default_zone_name);
4016 int ret = zoneparams.init(cct, this);
4017 if (ret < 0 && ret != -ENOENT) {
4018 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
4019 return ret;
4020 }
4021 /* update master zone */
4022 RGWZoneGroup default_zg(default_zonegroup_name);
4023 ret = default_zg.init(cct, this);
4024 if (ret < 0 && ret != -ENOENT) {
4025 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
4026 return ret;
4027 }
4028 if (ret != -ENOENT && default_zg.master_zone.empty()) {
4029 default_zg.master_zone = zoneparams.get_id();
4030 return default_zg.update();
4031 }
4032 return 0;
4033 }
4034
4035 string master_region, master_zone;
4036 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
4037 if (*iter != default_zonegroup_name){
4038 RGWZoneGroup region(*iter);
4039 int ret = region.init(cct, this, true, true);
4040 if (ret < 0) {
4041 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
4042 return ret;
4043 }
31f18b77 4044 if (region.is_master_zonegroup()) {
7c673cae
FG
4045 master_region = region.get_id();
4046 master_zone = region.master_zone;
4047 }
4048 }
4049 }
4050
4051 /* create realm if there is none.
4052 The realm name will be the region and zone concatenated
4053 realm id will be mds of its name */
4054 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
4055 string new_realm_name = master_region + "." + master_zone;
4056 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
4057 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
4058 MD5 hash;
4059 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
4060 hash.Final(md5);
4061 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
4062 string new_realm_id(md5_str);
4063 RGWRealm new_realm(new_realm_id,new_realm_name);
4064 ret = new_realm.init(cct, this, false);
4065 if (ret < 0) {
4066 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4067 return ret;
4068 }
4069 ret = new_realm.create();
4070 if (ret < 0 && ret != -EEXIST) {
4071 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4072 return ret;
4073 }
4074 ret = new_realm.set_as_default();
4075 if (ret < 0) {
4076 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4077 return ret;
4078 }
4079 ret = realm.init(cct, this);
4080 if (ret < 0) {
4081 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4082 return ret;
4083 }
4084 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4085 if (ret < 0) {
4086 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4087 return ret;
4088 }
4089 }
4090
4091 list<string>::iterator iter;
4092 /* create zonegroups */
4093 for (iter = regions.begin(); iter != regions.end(); ++iter)
4094 {
4095 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4096 /* check to see if we don't have already a zonegroup with this name */
4097 RGWZoneGroup new_zonegroup(*iter);
4098 ret = new_zonegroup.init(cct , this);
4099 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4100 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4101 " skipping conversion " << dendl;
4102 continue;
4103 }
4104 RGWZoneGroup zonegroup(*iter);
4105 zonegroup.set_id(*iter);
4106 int ret = zonegroup.init(cct, this, true, true);
4107 if (ret < 0) {
4108 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4109 return ret;
4110 }
4111 zonegroup.realm_id = realm.get_id();
4112 /* fix default region master zone */
4113 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4114 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4115 zonegroup.master_zone = default_zone_name;
4116 }
4117 ret = zonegroup.update();
4118 if (ret < 0 && ret != -EEXIST) {
4119 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4120 << dendl;
4121 return ret;
4122 }
4123 ret = zonegroup.update_name();
4124 if (ret < 0 && ret != -EEXIST) {
4125 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4126 << dendl;
4127 return ret;
4128 }
4129 if (zonegroup.get_name() == default_region) {
4130 ret = zonegroup.set_as_default();
4131 if (ret < 0) {
4132 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4133 << dendl;
4134 return ret;
4135 }
4136 }
4137 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4138 ++iter) {
4139 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4140 RGWZoneParams zoneparams(iter->first, iter->first);
4141 zoneparams.set_id(iter->first);
4142 zoneparams.realm_id = realm.get_id();
4143 ret = zoneparams.init(cct, this);
4144 if (ret < 0 && ret != -ENOENT) {
4145 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4146 return ret;
4147 } else if (ret == -ENOENT) {
4148 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4149 continue;
4150 }
4151 zonegroup.realm_id = realm.get_id();
4152 ret = zoneparams.update();
4153 if (ret < 0 && ret != -EEXIST) {
4154 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4155 return ret;
4156 }
4157 ret = zoneparams.update_name();
4158 if (ret < 0 && ret != -EEXIST) {
4159 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4160 return ret;
4161 }
4162 }
4163
4164 if (!current_period.get_id().empty()) {
4165 ret = current_period.add_zonegroup(zonegroup);
4166 if (ret < 0) {
4167 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4168 return ret;
4169 }
4170 }
4171 }
4172
4173 if (!current_period.get_id().empty()) {
4174 ret = current_period.update();
4175 if (ret < 0) {
4176 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4177 return ret;
4178 }
4179 ret = current_period.store_info(false);
4180 if (ret < 0) {
4181 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4182 return ret;
4183 }
4184 ret = current_period.reflect();
4185 if (ret < 0) {
4186 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4187 return ret;
4188 }
4189 }
4190
4191 for (auto const& iter : regions) {
4192 RGWZoneGroup zonegroup(iter);
4193 int ret = zonegroup.init(cct, this, true, true);
4194 if (ret < 0) {
4195 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4196 return ret;
4197 }
4198 ret = zonegroup.delete_obj(true);
4199 if (ret < 0 && ret != -ENOENT) {
4200 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4201 << dendl;
4202 return ret;
4203 }
4204 }
4205
4206 /* mark as converted */
4207 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4208 true, NULL, real_time(), NULL);
4209 if (ret < 0 ) {
4210 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4211 << dendl;
4212 return ret;
4213 }
4214
4215 return 0;
4216}
4217
4218int RGWRados::init_zg_from_period(bool *initialized)
4219{
4220 *initialized = false;
4221
4222 if (current_period.get_id().empty()) {
4223 return 0;
4224 }
4225
4226 int ret = zonegroup.init(cct, this);
4227 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4228 if (ret == -ENOENT) {
4229 return 0;
4230 }
4231 if (ret < 0) {
4232 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4233 return ret;
4234 }
4235 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4236
4237 map<string, RGWZoneGroup>::const_iterator iter =
4238 current_period.get_map().zonegroups.find(zonegroup.get_id());
4239
4240 if (iter != current_period.get_map().zonegroups.end()) {
4241 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4242 zonegroup = iter->second;
4243 ret = zonegroup.init(cct, this, false);
4244 if (ret < 0) {
4245 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4246 return ret;
4247 }
4248 ret = zone_params.init(cct, this);
4249 if (ret < 0 && ret != -ENOENT) {
4250 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4251 return ret;
4252 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4253 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4254 zone_params.set_name(default_zone_name);
4255 ret = zone_params.init(cct, this);
4256 if (ret < 0 && ret != -ENOENT) {
4257 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4258 return ret;
4259 }
4260 }
4261 }
4262 for (iter = current_period.get_map().zonegroups.begin();
4263 iter != current_period.get_map().zonegroups.end(); ++iter){
4264 const RGWZoneGroup& zg = iter->second;
4265 // use endpoints from the zonegroup's master zone
4266 auto master = zg.zones.find(zg.master_zone);
4267 if (master == zg.zones.end()) {
f64942e4
AA
4268 // Check for empty zonegroup which can happen if zone was deleted before removal
4269 if (zg.zones.size() == 0)
4270 continue;
7c673cae
FG
4271 // fix missing master zone for a single zone zonegroup
4272 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4273 master = zg.zones.begin();
4274 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4275 master->second.name << " id:" << master->second.id << " as master" << dendl;
4276 if (zonegroup.get_id() == zg.get_id()) {
4277 zonegroup.master_zone = master->second.id;
4278 ret = zonegroup.update();
4279 if (ret < 0) {
4280 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4281 return ret;
4282 }
4283 } else {
4284 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4285 ret = fixed_zg.init(cct, this);
4286 if (ret < 0) {
4287 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4288 return ret;
4289 }
4290 fixed_zg.master_zone = master->second.id;
4291 ret = fixed_zg.update();
4292 if (ret < 0) {
4293 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4294 return ret;
4295 }
4296 }
4297 } else {
4298 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4299 zg.master_zone << dendl;
4300 return -EINVAL;
4301 }
4302 }
4303 const auto& endpoints = master->second.endpoints;
4304 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4305 if (!current_period.get_master_zonegroup().empty() &&
4306 zg.get_id() == current_period.get_master_zonegroup()) {
4307 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4308 }
4309 }
4310
4311 *initialized = true;
4312
4313 return 0;
4314}
4315
4316int RGWRados::init_zg_from_local(bool *creating_defaults)
4317{
4318 int ret = zonegroup.init(cct, this);
4319 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4320 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4321 return ret;
4322 } else if (ret == -ENOENT) {
4323 *creating_defaults = true;
4324 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4325 ret = zonegroup.create_default();
4326 if (ret < 0) {
4327 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4328 << dendl;
4329 return ret;
4330 }
4331 ret = zonegroup.init(cct, this);
4332 if (ret < 0) {
4333 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4334 << dendl;
4335 return ret;
4336 }
4337 }
4338 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
31f18b77 4339 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
4340 // use endpoints from the zonegroup's master zone
4341 auto master = zonegroup.zones.find(zonegroup.master_zone);
4342 if (master == zonegroup.zones.end()) {
4343 // fix missing master zone for a single zone zonegroup
4344 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4345 master = zonegroup.zones.begin();
4346 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4347 master->second.name << " id:" << master->second.id << " as master" << dendl;
4348 zonegroup.master_zone = master->second.id;
4349 ret = zonegroup.update();
4350 if (ret < 0) {
4351 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4352 return ret;
4353 }
4354 } else {
4355 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4356 "master_zone=" << zonegroup.master_zone << dendl;
4357 return -EINVAL;
4358 }
4359 }
4360 const auto& endpoints = master->second.endpoints;
4361 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4362 }
4363
4364 return 0;
4365}
4366
4367
4368bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4369{
4370 return target_zone.syncs_from(source_zone.name) &&
4371 sync_modules_manager->supports_data_export(source_zone.tier_type);
4372}
4373
4374/**
4375 * Initialize the RADOS instance and prepare to do other ops
4376 * Returns 0 on success, -ERR# on failure.
4377 */
4378int RGWRados::init_complete()
4379{
4380 int ret = realm.init(cct, this);
4381 if (ret < 0 && ret != -ENOENT) {
4382 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4383 return ret;
4384 } else if (ret != -ENOENT) {
4385 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4386 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4387 if (ret < 0 && ret != -ENOENT) {
4388 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4389 return ret;
4390 }
4391 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4392 }
4393
4394 ret = replace_region_with_zonegroup();
4395 if (ret < 0) {
4396 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4397 return ret;
4398 }
4399
4400 ret = convert_regionmap();
4401 if (ret < 0) {
4402 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4403 return ret;
4404 }
4405
4406 bool zg_initialized = false;
4407
4408 if (!current_period.get_id().empty()) {
4409 ret = init_zg_from_period(&zg_initialized);
4410 if (ret < 0) {
4411 return ret;
4412 }
4413 }
4414
4415 bool creating_defaults = false;
4416 bool using_local = (!zg_initialized);
4417 if (using_local) {
4418 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4419 ret = init_zg_from_local(&creating_defaults);
4420 if (ret < 0) {
4421 return ret;
4422 }
4423 // read period_config into current_period
4424 auto& period_config = current_period.get_config();
4425 ret = period_config.read(this, zonegroup.realm_id);
4426 if (ret < 0 && ret != -ENOENT) {
4427 ldout(cct, 0) << "ERROR: failed to read period config: "
4428 << cpp_strerror(ret) << dendl;
4429 return ret;
4430 }
4431 }
4432
4433 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4434 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4435 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4436 zone_params.set_name(default_zone_name);
4437 }
4438
4439 ret = zone_params.init(cct, this);
4440 if (ret < 0 && ret != -ENOENT) {
4441 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4442 return ret;
4443 }
4444 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4445 if (zone_iter == get_zonegroup().zones.end()) {
4446 if (using_local) {
4447 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4448 return -EINVAL;
4449 }
4450 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4451 ret = init_zg_from_local(&creating_defaults);
4452 if (ret < 0) {
4453 return ret;
4454 }
4455 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4456 }
4457 if (zone_iter != get_zonegroup().zones.end()) {
4458 zone_public_config = zone_iter->second;
4459 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4460 } else {
4461 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4462 return -EINVAL;
4463 }
4464
4465 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4466
31f18b77
FG
4467 if (run_sync_thread) {
4468 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4469 if (ret < 0) {
4470 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4471 return ret;
4472 }
7c673cae
FG
4473 }
4474
4475 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4476
4477 init_unique_trans_id_deps();
4478
4479 finisher = new Finisher(cct);
4480 finisher->start();
4481
4482 period_puller.reset(new RGWPeriodPuller(this));
4483 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4484 current_period));
4485
4486 if (need_watch_notify()) {
4487 ret = init_watch();
4488 if (ret < 0) {
4489 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4490 return ret;
4491 }
4492 }
4493
4494 /* first build all zones index */
4495 for (auto ziter : get_zonegroup().zones) {
4496 const string& id = ziter.first;
4497 RGWZone& z = ziter.second;
4498 zone_id_by_name[z.name] = id;
4499 zone_by_id[id] = z;
4500 }
31f18b77 4501
7c673cae
FG
4502 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4503 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4504 }
4505 zone_public_config = zone_by_id[zone_id()];
4506 for (auto ziter : get_zonegroup().zones) {
4507 const string& id = ziter.first;
4508 RGWZone& z = ziter.second;
4509 if (id == zone_id()) {
4510 continue;
4511 }
4512 if (z.endpoints.empty()) {
4513 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4514 continue;
4515 }
4516 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4517 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4518 zone_conn_map[id] = conn;
4519 if (zone_syncs_from(zone_public_config, z) ||
4520 zone_syncs_from(z, zone_public_config)) {
4521 if (zone_syncs_from(zone_public_config, z)) {
4522 zone_data_sync_from_map[id] = conn;
4523 }
4524 if (zone_syncs_from(z, zone_public_config)) {
4525 zone_data_notify_to_map[id] = conn;
4526 }
4527 } else {
4528 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4529 }
4530 }
4531
4532 ret = open_root_pool_ctx();
4533 if (ret < 0)
4534 return ret;
4535
4536 ret = open_gc_pool_ctx();
4537 if (ret < 0)
4538 return ret;
4539
4540 ret = open_lc_pool_ctx();
4541 if (ret < 0)
4542 return ret;
4543
4544 ret = open_objexp_pool_ctx();
4545 if (ret < 0)
4546 return ret;
4547
31f18b77
FG
4548 ret = open_reshard_pool_ctx();
4549 if (ret < 0)
4550 return ret;
4551
7c673cae
FG
4552 pools_initialized = true;
4553
4554 gc = new RGWGC();
4555 gc->initialize(cct, this);
4556
4557 obj_expirer = new RGWObjectExpirer(this);
4558
4559 if (use_gc_thread) {
4560 gc->start_processor();
4561 obj_expirer->start_processor();
4562 }
4563
7c673cae
FG
4564 /* no point of running sync thread if we don't have a master zone configured
4565 or there is no rest_master_conn */
4566 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4567 || current_period.get_id().empty()) {
4568 run_sync_thread = false;
4569 }
4570
b32b8144
FG
4571 if (run_sync_thread) {
4572 // initialize the log period history
4573 meta_mgr->init_oldest_log_period();
4574 }
4575
7c673cae
FG
4576 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4577 async_rados->start();
4578
4579 ret = meta_mgr->init(current_period.get_id());
4580 if (ret < 0) {
4581 lderr(cct) << "ERROR: failed to initialize metadata log: "
4582 << cpp_strerror(-ret) << dendl;
4583 return ret;
4584 }
4585
4586 if (is_meta_master()) {
4587 auto md_log = meta_mgr->get_log(current_period.get_id());
4588 meta_notifier = new RGWMetaNotifier(this, md_log);
4589 meta_notifier->start();
4590 }
4591
4592 if (run_sync_thread) {
4593 Mutex::Locker l(meta_sync_thread_lock);
4594 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4595 ret = meta_sync_processor_thread->init();
4596 if (ret < 0) {
4597 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4598 return ret;
4599 }
4600 meta_sync_processor_thread->start();
4601
b32b8144
FG
4602 // configure the bucket trim manager
4603 rgw::BucketTrimConfig config;
4604 rgw::configure_bucket_trim(cct, config);
4605
4606 bucket_trim.emplace(this, config);
4607 ret = bucket_trim->init();
4608 if (ret < 0) {
4609 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
4610 return ret;
4611 }
91327a77 4612 data_log->set_observer(&*bucket_trim);
b32b8144 4613
7c673cae
FG
4614 Mutex::Locker dl(data_sync_thread_lock);
4615 for (auto iter : zone_data_sync_from_map) {
4616 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
91327a77 4617 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
7c673cae
FG
4618 ret = thread->init();
4619 if (ret < 0) {
4620 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4621 return ret;
4622 }
4623 thread->start();
4624 data_sync_processor_threads[iter.first] = thread;
4625 }
4626 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4627 if (interval > 0) {
b32b8144 4628 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
7c673cae
FG
4629 ret = sync_log_trimmer->init();
4630 if (ret < 0) {
4631 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4632 return ret;
4633 }
4634 sync_log_trimmer->start();
4635 }
4636 }
4637 data_notifier = new RGWDataNotifier(this);
4638 data_notifier->start();
4639
4640 lc = new RGWLC();
4641 lc->initialize(cct, this);
31f18b77 4642
7c673cae
FG
4643 if (use_lc_thread)
4644 lc->start_processor();
31f18b77 4645
7c673cae
FG
4646 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4647
4648 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4649 get_zone().bucket_index_max_shards);
31f18b77
FG
4650 if (bucket_index_max_shards > get_max_bucket_shards()) {
4651 bucket_index_max_shards = get_max_bucket_shards();
7c673cae 4652 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 4653 << get_max_bucket_shards() << dendl;
7c673cae
FG
4654 }
4655 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4656
4657 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4658 binfo_cache->init(this);
4659
4660 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4661
4662 if (need_tombstone_cache) {
4663 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4664 }
4665
31f18b77
FG
4666 reshard_wait = std::make_shared<RGWReshardWait>(this);
4667
4668 reshard = new RGWReshard(this);
4669
4670 /* only the master zone in the zonegroup reshards buckets */
4671 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4672 if (run_reshard_thread) {
4673 reshard->start_processor();
4674 }
4675
4676 index_completion_manager = new RGWIndexCompletionManager(this);
4677 ret = index_completion_manager->start();
4678
7c673cae
FG
4679 return ret;
4680}
4681
4682/**
4683 * Initialize the RADOS instance and prepare to do other ops
4684 * Returns 0 on success, -ERR# on failure.
4685 */
4686int RGWRados::initialize()
4687{
4688 int ret;
4689
4690 ret = init_rados();
4691 if (ret < 0)
4692 return ret;
4693
4694 return init_complete();
4695}
4696
4697void RGWRados::finalize_watch()
4698{
4699 for (int i = 0; i < num_watchers; i++) {
4700 RGWWatcher *watcher = watchers[i];
4701 watcher->unregister_watch();
4702 delete watcher;
4703 }
4704
4705 delete[] notify_oids;
4706 delete[] watchers;
4707}
4708
4709void RGWRados::schedule_context(Context *c) {
4710 finisher->queue(c);
4711}
4712
4713int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4714{
4715 bool is_truncated;
4716 RGWListRawObjsCtx ctx;
4717 do {
4718 list<string> oids;
4719 int r = list_raw_objects(pool, prefix, 1000,
4720 ctx, oids, &is_truncated);
4721 if (r < 0) {
4722 return r;
4723 }
4724 list<string>::iterator iter;
4725 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4726 string& val = *iter;
4727 if (val.size() > prefix.size())
4728 result.push_back(val.substr(prefix.size()));
4729 }
4730 } while (is_truncated);
4731
4732 return 0;
4733}
4734
4735int RGWRados::list_regions(list<string>& regions)
4736{
4737 RGWZoneGroup zonegroup;
4738
4739 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4740}
4741
4742int RGWRados::list_zonegroups(list<string>& zonegroups)
4743{
4744 RGWZoneGroup zonegroup;
4745
4746 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4747}
4748
4749int RGWRados::list_zones(list<string>& zones)
4750{
4751 RGWZoneParams zoneparams;
4752
4753 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4754}
4755
4756int RGWRados::list_realms(list<string>& realms)
4757{
4758 RGWRealm realm(cct, this);
4759 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4760}
4761
4762int RGWRados::list_periods(list<string>& periods)
4763{
4764 RGWPeriod period;
4765 list<string> raw_periods;
4766 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4767 if (ret < 0) {
4768 return ret;
4769 }
4770 for (const auto& oid : raw_periods) {
4771 size_t pos = oid.find(".");
4772 if (pos != std::string::npos) {
4773 periods.push_back(oid.substr(0, pos));
4774 } else {
4775 periods.push_back(oid);
4776 }
4777 }
4778 periods.sort(); // unique() only detects duplicates if they're adjacent
4779 periods.unique();
4780 return 0;
4781}
4782
4783
4784int RGWRados::list_periods(const string& current_period, list<string>& periods)
4785{
4786 int ret = 0;
4787 string period_id = current_period;
4788 while(!period_id.empty()) {
4789 RGWPeriod period(period_id);
4790 ret = period.init(cct, this);
4791 if (ret < 0) {
4792 return ret;
4793 }
4794 periods.push_back(period.get_id());
4795 period_id = period.get_predecessor();
4796 }
4797
4798 return ret;
4799}
4800
4801/**
4802 * Open the pool used as root for this gateway
4803 * Returns: 0 on success, -ERR# otherwise.
4804 */
4805int RGWRados::open_root_pool_ctx()
4806{
4807 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4808}
4809
4810int RGWRados::open_gc_pool_ctx()
4811{
4812 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4813}
4814
4815int RGWRados::open_lc_pool_ctx()
4816{
4817 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4818}
4819
4820int RGWRados::open_objexp_pool_ctx()
4821{
4822 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4823}
4824
31f18b77
FG
4825int RGWRados::open_reshard_pool_ctx()
4826{
4827 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4828}
4829
7c673cae
FG
4830int RGWRados::init_watch()
4831{
4832 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4833 if (r < 0) {
4834 return r;
4835 }
4836
4837 num_watchers = cct->_conf->rgw_num_control_oids;
4838
4839 bool compat_oid = (num_watchers == 0);
4840
4841 if (num_watchers <= 0)
4842 num_watchers = 1;
4843
4844 notify_oids = new string[num_watchers];
4845 watchers = new RGWWatcher *[num_watchers];
4846
4847 for (int i=0; i < num_watchers; i++) {
4848 string& notify_oid = notify_oids[i];
4849 notify_oid = notify_oid_prefix;
4850 if (!compat_oid) {
4851 char buf[16];
4852 snprintf(buf, sizeof(buf), ".%d", i);
4853 notify_oid.append(buf);
4854 }
4855 r = control_pool_ctx.create(notify_oid, false);
4856 if (r < 0 && r != -EEXIST)
4857 return r;
4858
4859 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4860 watchers[i] = watcher;
4861
4862 r = watcher->register_watch();
4863 if (r < 0)
4864 return r;
4865 }
4866
4867 watch_initialized = true;
4868
4869 set_cache_enabled(true);
4870
4871 return 0;
4872}
4873
4874void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4875{
4876 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4877
4878 int i = r % num_watchers;
4879 char buf[16];
4880 snprintf(buf, sizeof(buf), ".%d", i);
4881
4882 notify_oid = notify_oid_prefix;
4883 notify_oid.append(buf);
4884}
4885
28e407b8 4886int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
7c673cae 4887{
28e407b8
AA
4888 constexpr bool create = true; // create the pool if it doesn't exist
4889 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
4890}
4891
4892void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4893 string *marker) {
4894 if (marker) {
4895 *marker = shard_id_str;
4896 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4897 marker->append(shard_marker);
4898 }
4899}
4900
4901int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4902{
3a9019d9
FG
4903 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
4904
4905 if (!explicit_pool.empty()) {
4906 return open_pool_ctx(explicit_pool, index_ctx);
4907 }
4908
7c673cae
FG
4909 const string *rule = &bucket_info.placement_rule;
4910 if (rule->empty()) {
4911 rule = &zonegroup.default_placement;
4912 }
4913 auto iter = zone_params.placement_pools.find(*rule);
4914 if (iter == zone_params.placement_pools.end()) {
4915 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4916 return -EINVAL;
4917 }
4918
4919 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4920 if (r < 0)
4921 return r;
4922
4923 return 0;
4924}
4925
4926/**
4927 * set up a bucket listing.
4928 * handle is filled in.
4929 * Returns 0 on success, -ERR# otherwise.
4930 */
4931int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4932{
f64942e4
AA
4933 try {
4934 auto iter = root_pool_ctx.nobjects_begin();
4935 librados::NObjectIterator *state = new librados::NObjectIterator(iter);
4936 *handle = (RGWAccessHandle)state;
4937 return 0;
4938 } catch (const std::system_error& e) {
4939 int r = -e.code().value();
4940 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4941 << ", returning " << r << dendl;
4942 return r;
4943 } catch (const std::exception& e) {
4944 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4945 << ", returning -5" << dendl;
4946 return -EIO;
4947 }
7c673cae
FG
4948}
4949
4950/**
4951 * get the next bucket in the listing.
4952 * obj is filled in,
4953 * handle is updated.
4954 * returns 0 on success, -ERR# otherwise.
4955 */
4956int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4957{
4958 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4959
4960 do {
4961 if (*state == root_pool_ctx.nobjects_end()) {
4962 delete state;
4963 return -ENOENT;
4964 }
4965
4966 obj.key.name = (*state)->get_oid();
4967 if (obj.key.name[0] == '_') {
4968 obj.key.name = obj.key.name.substr(1);
4969 }
f64942e4
AA
4970 try {
4971 (*state)++;
4972 } catch (const std::system_error& e) {
4973 int r = -e.code().value();
4974 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4975 << ", returning " << r << dendl;
4976 return r;
4977 } catch (const std::exception& e) {
4978 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4979 << ", returning -5" << dendl;
4980 return -EIO;
4981 }
7c673cae
FG
4982 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4983
4984 return 0;
4985}
4986
4987
4988/**** logs ****/
4989
4990struct log_list_state {
4991 string prefix;
4992 librados::IoCtx io_ctx;
4993 librados::NObjectIterator obit;
4994};
4995
4996int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4997{
4998 log_list_state *state = new log_list_state;
4999 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5000 if (r < 0) {
5001 delete state;
5002 return r;
5003 }
5004 state->prefix = prefix;
5005 state->obit = state->io_ctx.nobjects_begin();
5006 *handle = (RGWAccessHandle)state;
5007 return 0;
5008}
5009
5010int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
5011{
5012 log_list_state *state = static_cast<log_list_state *>(handle);
5013 while (true) {
5014 if (state->obit == state->io_ctx.nobjects_end()) {
5015 delete state;
5016 return -ENOENT;
5017 }
5018 if (state->prefix.length() &&
5019 state->obit->get_oid().find(state->prefix) != 0) {
5020 state->obit++;
5021 continue;
5022 }
5023 *name = state->obit->get_oid();
5024 state->obit++;
5025 break;
5026 }
5027 return 0;
5028}
5029
5030int RGWRados::log_remove(const string& name)
5031{
5032 librados::IoCtx io_ctx;
5033 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5034 if (r < 0)
5035 return r;
5036 return io_ctx.remove(name);
5037}
5038
5039struct log_show_state {
5040 librados::IoCtx io_ctx;
5041 bufferlist bl;
5042 bufferlist::iterator p;
5043 string name;
5044 uint64_t pos;
5045 bool eof;
5046 log_show_state() : pos(0), eof(false) {}
5047};
5048
5049int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
5050{
5051 log_show_state *state = new log_show_state;
5052 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5053 if (r < 0) {
5054 delete state;
5055 return r;
5056 }
5057 state->name = name;
5058 *handle = (RGWAccessHandle)state;
5059 return 0;
5060}
5061
5062int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
5063{
5064 log_show_state *state = static_cast<log_show_state *>(handle);
5065 off_t off = state->p.get_off();
5066
5067 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
5068 << " off " << off
5069 << " eof " << (int)state->eof
5070 << dendl;
5071 // read some?
5072 unsigned chunk = 1024*1024;
5073 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
5074 bufferlist more;
5075 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
5076 if (r < 0)
5077 return r;
5078 state->pos += r;
5079 bufferlist old;
5080 try {
5081 old.substr_of(state->bl, off, state->bl.length() - off);
5082 } catch (buffer::error& err) {
5083 return -EINVAL;
5084 }
5085 state->bl.clear();
5086 state->bl.claim(old);
5087 state->bl.claim_append(more);
5088 state->p = state->bl.begin();
5089 if ((unsigned)r < chunk)
5090 state->eof = true;
5091 ldout(cct, 10) << " read " << r << dendl;
5092 }
5093
5094 if (state->p.end())
5095 return 0; // end of file
5096 try {
5097 ::decode(*entry, state->p);
5098 }
5099 catch (const buffer::error &e) {
5100 return -EINVAL;
5101 }
5102 return 1;
5103}
5104
5105/**
5106 * usage_log_hash: get usage log key hash, based on name and index
5107 *
5108 * Get the usage object name. Since a user may have more than 1
5109 * object holding that info (multiple shards), we use index to
5110 * specify that shard number. Once index exceeds max shards it
5111 * wraps.
5112 * If name is not being set, results for all users will be returned
5113 * and index will wrap only after total shards number.
5114 *
5115 * @param cct [in] ceph context
5116 * @param name [in] user name
5117 * @param hash [out] hash value
5118 * @param index [in] shard index number
5119 */
5120static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5121{
5122 uint32_t val = index;
5123
5124 if (!name.empty()) {
c07f9fc5 5125 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
5126 val %= max_user_shards;
5127 val += ceph_str_hash_linux(name.c_str(), name.size());
5128 }
5129 char buf[17];
c07f9fc5 5130 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
5131 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5132 hash = buf;
5133}
5134
5135int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5136{
5137 uint32_t index = 0;
5138
5139 map<string, rgw_usage_log_info> log_objs;
5140
5141 string hash;
5142 string last_user;
5143
5144 /* restructure usage map, zone by object hash */
5145 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5146 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5147 const rgw_user_bucket& ub = iter->first;
5148 RGWUsageBatch& info = iter->second;
5149
5150 if (ub.user.empty()) {
5151 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5152 continue;
5153 }
5154
5155 if (ub.user != last_user) {
5156 /* index *should* be random, but why waste extra cycles
5157 in most cases max user shards is not going to exceed 1,
5158 so just incrementing it */
5159 usage_log_hash(cct, ub.user, hash, index++);
5160 }
5161 last_user = ub.user;
5162 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5163
5164 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5165 v.push_back(miter->second);
5166 }
5167 }
5168
5169 map<string, rgw_usage_log_info>::iterator liter;
5170
5171 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5172 int r = cls_obj_usage_log_add(liter->first, liter->second);
5173 if (r < 0)
5174 return r;
5175 }
5176 return 0;
5177}
5178
5179int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5180 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5181{
5182 uint32_t num = max_entries;
5183 string hash, first_hash;
5184 string user_str = user.to_str();
5185 usage_log_hash(cct, user_str, first_hash, 0);
5186
5187 if (usage_iter.index) {
5188 usage_log_hash(cct, user_str, hash, usage_iter.index);
5189 } else {
5190 hash = first_hash;
5191 }
5192
5193 usage.clear();
5194
5195 do {
5196 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5197 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5198
5199 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5200 usage_iter.read_iter, ret_usage, is_truncated);
5201 if (ret == -ENOENT)
5202 goto next;
5203
5204 if (ret < 0)
5205 return ret;
5206
5207 num -= ret_usage.size();
5208
5209 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5210 usage[iter->first].aggregate(iter->second);
5211 }
5212
5213next:
5214 if (!*is_truncated) {
5215 usage_iter.read_iter.clear();
5216 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5217 }
5218 } while (num && !*is_truncated && hash != first_hash);
5219 return 0;
5220}
5221
5222int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5223{
5224 uint32_t index = 0;
5225 string hash, first_hash;
5226 string user_str = user.to_str();
5227 usage_log_hash(cct, user_str, first_hash, index);
5228
5229 hash = first_hash;
7c673cae
FG
5230 do {
5231 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
7c673cae 5232
b32b8144 5233 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
5234 return ret;
5235
7c673cae
FG
5236 usage_log_hash(cct, user_str, hash, ++index);
5237 } while (hash != first_hash);
5238
5239 return 0;
5240}
5241
7c673cae
FG
5242int RGWRados::key_to_shard_id(const string& key, int max_shards)
5243{
1adf2230 5244 return rgw_shard_id(key, max_shards);
7c673cae
FG
5245}
5246
5247void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5248{
5249 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5250 char buf[16];
5251 if (shard_id) {
5252 *shard_id = val % max_shards;
5253 }
5254 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5255 name = prefix + buf;
5256}
5257
5258void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5259{
5260 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5261 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5262 char buf[16];
5263 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5264 name = prefix + buf;
5265}
5266
5267void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5268{
5269 char buf[16];
5270 snprintf(buf, sizeof(buf), "%u", shard_id);
5271 name = prefix + buf;
5272
5273}
5274
5275void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5276{
5277 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5278}
5279
5280int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5281{
5282 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5283
5284}
5285
5286int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5287{
5288 librados::IoCtx io_ctx;
5289
5290 int r = time_log_add_init(io_ctx);
5291 if (r < 0) {
5292 return r;
5293 }
5294
5295 ObjectWriteOperation op;
5296 utime_t t(ut);
5297 cls_log_add(op, t, section, key, bl);
5298
5299 return io_ctx.operate(oid, &op);
5300}
5301
5302int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5303 librados::AioCompletion *completion, bool monotonic_inc)
5304{
5305 librados::IoCtx io_ctx;
5306
5307 int r = time_log_add_init(io_ctx);
5308 if (r < 0) {
5309 return r;
5310 }
5311
5312 ObjectWriteOperation op;
5313 cls_log_add(op, entries, monotonic_inc);
5314
5315 if (!completion) {
5316 r = io_ctx.operate(oid, &op);
5317 } else {
5318 r = io_ctx.aio_operate(oid, completion, &op);
5319 }
5320 return r;
5321}
5322
5323int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5324 int max_entries, list<cls_log_entry>& entries,
5325 const string& marker,
5326 string *out_marker,
5327 bool *truncated)
5328{
5329 librados::IoCtx io_ctx;
5330
5331 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5332 if (r < 0)
5333 return r;
5334 librados::ObjectReadOperation op;
5335
5336 utime_t st(start_time);
5337 utime_t et(end_time);
5338
5339 cls_log_list(op, st, et, marker, max_entries, entries,
5340 out_marker, truncated);
5341
5342 bufferlist obl;
5343
5344 int ret = io_ctx.operate(oid, &op, &obl);
5345 if (ret < 0)
5346 return ret;
5347
5348 return 0;
5349}
5350
5351int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5352{
5353 librados::IoCtx io_ctx;
5354
5355 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5356 if (r < 0)
5357 return r;
5358 librados::ObjectReadOperation op;
5359
5360 cls_log_info(op, header);
5361
5362 bufferlist obl;
5363
5364 int ret = io_ctx.operate(oid, &op, &obl);
5365 if (ret < 0)
5366 return ret;
5367
5368 return 0;
5369}
5370
5371int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5372{
5373 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5374 if (r < 0)
5375 return r;
5376
5377 librados::ObjectReadOperation op;
5378
5379 cls_log_info(op, header);
5380
5381 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5382 if (ret < 0)
5383 return ret;
5384
5385 return 0;
5386}
5387
5388int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5389 const string& from_marker, const string& to_marker,
5390 librados::AioCompletion *completion)
5391{
5392 librados::IoCtx io_ctx;
5393
5394 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5395 if (r < 0)
5396 return r;
5397
5398 utime_t st(start_time);
5399 utime_t et(end_time);
5400
5401 ObjectWriteOperation op;
5402 cls_log_trim(op, st, et, from_marker, to_marker);
5403
5404 if (!completion) {
5405 r = io_ctx.operate(oid, &op);
5406 } else {
5407 r = io_ctx.aio_operate(oid, completion, &op);
5408 }
5409 return r;
5410}
5411
5412string RGWRados::objexp_hint_get_shardname(int shard_num)
5413{
5414 char buf[32];
5415 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5416
5417 string objname("obj_delete_at_hint.");
5418 return objname + buf;
5419}
5420
7c673cae
FG
5421int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5422{
5423 string obj_key = key.name + key.instance;
5424 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
1adf2230 5425 return rgw_bucket_shard_index(obj_key, num_shards);
7c673cae
FG
5426}
5427
5428static string objexp_hint_get_keyext(const string& tenant_name,
5429 const string& bucket_name,
5430 const string& bucket_id,
5431 const rgw_obj_key& obj_key)
5432{
5433 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5434 ":" + obj_key.name + ":" + obj_key.instance;
5435}
5436
5437int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5438 const string& tenant_name,
5439 const string& bucket_name,
5440 const string& bucket_id,
5441 const rgw_obj_index_key& obj_key)
5442{
5443 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5444 bucket_id, obj_key);
5445 objexp_hint_entry he = {
5446 .tenant = tenant_name,
5447 .bucket_name = bucket_name,
5448 .bucket_id = bucket_id,
5449 .obj_key = obj_key,
5450 .exp_time = delete_at };
5451 bufferlist hebl;
5452 ::encode(he, hebl);
5453 ObjectWriteOperation op;
5454 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5455
5456 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5457 return objexp_pool_ctx.operate(shard_name, &op);
5458}
5459
5460void RGWRados::objexp_get_shard(int shard_num,
5461 string& shard) /* out */
5462{
5463 shard = objexp_hint_get_shardname(shard_num);
5464}
5465
5466int RGWRados::objexp_hint_list(const string& oid,
5467 const ceph::real_time& start_time,
5468 const ceph::real_time& end_time,
5469 const int max_entries,
5470 const string& marker,
5471 list<cls_timeindex_entry>& entries, /* out */
5472 string *out_marker, /* out */
5473 bool *truncated) /* out */
5474{
5475 librados::ObjectReadOperation op;
5476 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5477 out_marker, truncated);
5478
5479 bufferlist obl;
5480 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5481
5482 if ((ret < 0 ) && (ret != -ENOENT)) {
5483 return ret;
5484 }
5485
5486 if ((ret == -ENOENT) && truncated) {
5487 *truncated = false;
5488 }
5489
5490 return 0;
5491}
5492
5493int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5494 objexp_hint_entry& hint_entry) /* out */
5495{
5496 try {
5497 bufferlist::iterator iter = ti_entry.value.begin();
5498 ::decode(hint_entry, iter);
5499 } catch (buffer::error& err) {
5500 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5501 }
5502
5503 return 0;
5504}
5505
5506int RGWRados::objexp_hint_trim(const string& oid,
5507 const ceph::real_time& start_time,
5508 const ceph::real_time& end_time,
5509 const string& from_marker,
5510 const string& to_marker)
5511{
5512 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5513 from_marker, to_marker);
5514 if ((ret < 0 ) && (ret != -ENOENT)) {
5515 return ret;
5516 }
5517
5518 return 0;
5519}
5520
5521int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5522 string& zone_id, string& owner_id) {
5523 librados::IoCtx io_ctx;
5524
5525 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5526 if (r < 0) {
5527 return r;
5528 }
5529 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5530 utime_t ut(msec / 1000, msec % 1000);
5531
5532 rados::cls::lock::Lock l(log_lock_name);
5533 l.set_duration(ut);
5534 l.set_cookie(owner_id);
5535 l.set_tag(zone_id);
f64942e4 5536 l.set_may_renew(true);
7c673cae
FG
5537
5538 return l.lock_exclusive(&io_ctx, oid);
5539}
5540
5541int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5542 librados::IoCtx io_ctx;
5543
5544 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5545 if (r < 0) {
5546 return r;
5547 }
5548
5549 rados::cls::lock::Lock l(log_lock_name);
5550 l.set_tag(zone_id);
5551 l.set_cookie(owner_id);
5552
5553 return l.unlock(&io_ctx, oid);
5554}
5555
5556int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5557{
5558 bufferlist::iterator i = bl.begin();
5559 RGWAccessControlPolicy policy(cct);
5560 try {
5561 policy.decode_owner(i);
5562 } catch (buffer::error& err) {
5563 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5564 return -EIO;
5565 }
5566 *owner = policy.get_owner();
5567 return 0;
5568}
5569
5570int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5571{
5572 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5573 if (aiter == attrset.end())
5574 return -EIO;
5575
5576 bufferlist& bl = aiter->second;
5577 bufferlist::iterator iter = bl.begin();
5578 try {
5579 policy->decode(iter);
5580 } catch (buffer::error& err) {
5581 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5582 return -EIO;
5583 }
5584 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5585 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5586 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5587 s3policy->to_xml(*_dout);
5588 *_dout << dendl;
5589 }
5590 return 0;
5591}
5592
5593
31f18b77
FG
5594int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5595{
5596 rgw_bucket bucket = bucket_info.bucket;
5597 bucket.update_bucket_id(new_bucket_id);
5598
5599 RGWObjectCtx obj_ctx(store);
5600
5601 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5602 if (ret < 0) {
5603 return ret;
5604 }
5605
5606 return 0;
5607}
5608
1adf2230
AA
5609
5610/**
5611 * Get ordered listing of the objects in a bucket.
7c673cae
FG
5612 *
5613 * max: maximum number of results to return
5614 * bucket: bucket to list contents of
5615 * prefix: only return results that match this prefix
5616 * delim: do not include results that match this string.
5617 * Any skipped results will have the matching portion of their name
5618 * inserted in common_prefixes with a "true" mark.
5619 * marker: if filled in, begin the listing with this object.
5620 * end_marker: if filled in, end the listing with this object.
5621 * result: the objects are put in here.
5622 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5623 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5624 */
1adf2230
AA
5625int RGWRados::Bucket::List::list_objects_ordered(int64_t max,
5626 vector<rgw_bucket_dir_entry> *result,
5627 map<string, bool> *common_prefixes,
5628 bool *is_truncated)
7c673cae
FG
5629{
5630 RGWRados *store = target->get_store();
5631 CephContext *cct = store->ctx();
5632 int shard_id = target->get_shard_id();
5633
5634 int count = 0;
5635 bool truncated = true;
5636 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5637
5638 result->clear();
5639
5640 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
7c673cae
FG
5641 rgw_obj_index_key cur_marker;
5642 marker_obj.get_index_key(&cur_marker);
5643
3efd9988
FG
5644 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5645 params.ns);
5646 rgw_obj_index_key cur_end_marker;
5647 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
5648 const bool cur_end_marker_valid = !params.end_marker.empty();
5649
5650 rgw_obj_key prefix_obj(params.prefix);
5651 prefix_obj.ns = params.ns;
5652 string cur_prefix = prefix_obj.get_index_key_name();
5653
5654 string bigger_than_delim;
5655
5656 if (!params.delim.empty()) {
1adf2230
AA
5657 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(),
5658 params.delim.size());
7c673cae
FG
5659 char buf[params.delim.size() + 16];
5660 int r = encode_utf8(val + 1, (unsigned char *)buf);
5661 if (r < 0) {
5662 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5663 return -EINVAL;
5664 }
5665 buf[r] = '\0';
5666
5667 bigger_than_delim = buf;
5668
5669 /* if marker points at a common prefix, fast forward it into its upperbound string */
224ce89b 5670 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
5671 if (delim_pos >= 0) {
5672 string s = cur_marker.name.substr(0, delim_pos);
5673 s.append(bigger_than_delim);
5674 cur_marker = s;
5675 }
5676 }
1adf2230 5677
7c673cae
FG
5678 string skip_after_delim;
5679 while (truncated && count <= max) {
5680 if (skip_after_delim > cur_marker.name) {
5681 cur_marker = skip_after_delim;
5682 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5683 }
5684 std::map<string, rgw_bucket_dir_entry> ent_map;
1adf2230
AA
5685 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
5686 shard_id,
5687 cur_marker,
5688 cur_prefix,
5689 read_ahead + 1 - count,
5690 params.list_versions,
5691 ent_map,
5692 &truncated,
5693 &cur_marker);
7c673cae
FG
5694 if (r < 0)
5695 return r;
5696
1adf2230 5697 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
5698 rgw_bucket_dir_entry& entry = eiter->second;
5699 rgw_obj_index_key index_key = entry.key;
5700
5701 rgw_obj_key obj(index_key);
5702
1adf2230
AA
5703 /* note that parse_raw_oid() here will not set the correct
5704 * object's instance, as rgw_obj_index_key encodes that
5705 * separately. We don't need to set the instance because it's
5706 * not needed for the checks here and we end up using the raw
5707 * entry for the return vector
7c673cae
FG
5708 */
5709 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5710 if (!valid) {
5711 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5712 continue;
5713 }
5714 bool check_ns = (obj.ns == params.ns);
5715 if (!params.list_versions && !entry.is_visible()) {
5716 continue;
5717 }
5718
5719 if (params.enforce_ns && !check_ns) {
5720 if (!params.ns.empty()) {
5721 /* we've iterated past the namespace we're searching -- done now */
5722 truncated = false;
5723 goto done;
5724 }
5725
5726 /* we're not looking at the namespace this object is in, next! */
5727 continue;
5728 }
5729
5730 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5731 truncated = false;
5732 goto done;
5733 }
5734
5735 if (count < max) {
5736 params.marker = index_key;
5737 next_marker = index_key;
5738 }
5739
5740 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5741 continue;
5742
1adf2230
AA
5743 if (params.prefix.size() &&
5744 (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
7c673cae
FG
5745 continue;
5746
5747 if (!params.delim.empty()) {
5748 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5749
5750 if (delim_pos >= 0) {
5751 string prefix_key = obj.name.substr(0, delim_pos + 1);
5752
5753 if (common_prefixes &&
5754 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5755 if (count >= max) {
5756 truncated = true;
5757 goto done;
5758 }
5759 next_marker = prefix_key;
5760 (*common_prefixes)[prefix_key] = true;
5761
224ce89b
WB
5762 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5763
5764 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
7c673cae
FG
5765 skip_after_delim.append(bigger_than_delim);
5766
5767 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5768
5769 count++;
5770 }
5771
5772 continue;
5773 }
5774 }
5775
5776 if (count >= max) {
5777 truncated = true;
5778 goto done;
5779 }
5780
5781 result->emplace_back(std::move(entry));
5782 count++;
5783 }
7c673cae
FG
5784 }
5785
5786done:
5787 if (is_truncated)
5788 *is_truncated = truncated;
5789
5790 return 0;
1adf2230
AA
5791} // list_objects_ordered
5792
5793
5794/**
5795 * Get listing of the objects in a bucket and allow the results to be out
5796 * of order.
5797 *
5798 * Even though there are key differences with the ordered counterpart,
5799 * the parameters are the same to maintain some compatability.
5800 *
5801 * max: maximum number of results to return
5802 * bucket: bucket to list contents of
5803 * prefix: only return results that match this prefix
5804 * delim: should not be set; if it is we should have indicated an error
5805 * marker: if filled in, begin the listing with this object.
5806 * end_marker: if filled in, end the listing with this object.
5807 * result: the objects are put in here.
5808 * common_prefixes: this is never filled with an unordered list; the param
5809 * is maintained for compatibility
5810 * is_truncated: if number of objects in the bucket is bigger than max, then
5811 * truncated.
5812 */
5813int RGWRados::Bucket::List::list_objects_unordered(int64_t max,
5814 vector<rgw_bucket_dir_entry> *result,
5815 map<string, bool> *common_prefixes,
5816 bool *is_truncated)
5817{
5818 RGWRados *store = target->get_store();
5819 CephContext *cct = store->ctx();
5820 int shard_id = target->get_shard_id();
5821
5822 int count = 0;
5823 bool truncated = true;
5824
5825 // read a few extra in each call to cls_bucket_list_unordered in
5826 // case some are filtered out due to namespace matching, versioning,
5827 // filtering, etc.
5828 const int64_t max_read_ahead = 100;
5829 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
5830
5831 result->clear();
5832
5833 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5834 rgw_obj_index_key cur_marker;
5835 marker_obj.get_index_key(&cur_marker);
5836
5837 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5838 params.ns);
5839 rgw_obj_index_key cur_end_marker;
5840 end_marker_obj.get_index_key(&cur_end_marker);
5841 const bool cur_end_marker_valid = !params.end_marker.empty();
5842
5843 rgw_obj_key prefix_obj(params.prefix);
5844 prefix_obj.ns = params.ns;
5845 string cur_prefix = prefix_obj.get_index_key_name();
5846
5847 while (truncated && count <= max) {
5848 std::vector<rgw_bucket_dir_entry> ent_list;
5849 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
5850 shard_id,
5851 cur_marker,
5852 cur_prefix,
5853 read_ahead,
5854 params.list_versions,
5855 ent_list,
5856 &truncated,
5857 &cur_marker);
5858 if (r < 0)
5859 return r;
5860
5861 // NB: while regions of ent_list will be sorted, we have no
5862 // guarantee that all items will be sorted since they can cross
5863 // shard boundaries
5864
5865 for (auto& entry : ent_list) {
5866 rgw_obj_index_key index_key = entry.key;
5867 rgw_obj_key obj(index_key);
5868
5869 /* note that parse_raw_oid() here will not set the correct
5870 * object's instance, as rgw_obj_index_key encodes that
5871 * separately. We don't need to set the instance because it's
5872 * not needed for the checks here and we end up using the raw
5873 * entry for the return vector
5874 */
5875 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5876 if (!valid) {
5877 ldout(cct, 0) << "ERROR: could not parse object name: " <<
5878 obj.name << dendl;
5879 continue;
5880 }
5881
5882 if (!params.list_versions && !entry.is_visible()) {
5883 continue;
5884 }
5885
5886 if (params.enforce_ns && obj.ns != params.ns) {
5887 continue;
5888 }
5889
5890 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5891 // we're not guaranteed items will come in order, so we have
5892 // to loop through all
5893 continue;
5894 }
5895
5896 if (count < max) {
5897 params.marker = index_key;
5898 next_marker = index_key;
5899 }
5900
5901 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5902 continue;
5903
5904 if (params.prefix.size() &&
5905 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
5906 continue;
5907
5908 if (count >= max) {
5909 truncated = true;
5910 goto done;
5911 }
5912
5913 result->emplace_back(std::move(entry));
5914 count++;
5915 } // for (auto& entry : ent_list)
5916 } // while (truncated && count <= max)
5917
5918done:
5919 if (is_truncated)
5920 *is_truncated = truncated;
5921
5922 return 0;
5923} // list_objects_unordered
5924
7c673cae
FG
5925
5926/**
5927 * create a rados pool, associated meta info
5928 * returns 0 on success, -ERR# otherwise.
5929 */
5930int RGWRados::create_pool(const rgw_pool& pool)
5931{
c07f9fc5 5932 librados::IoCtx io_ctx;
28e407b8
AA
5933 constexpr bool create = true;
5934 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
5935}
5936
5937int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5938{
f64942e4 5939 librados::IoCtx index_ctx;
7c673cae 5940
f64942e4 5941 string dir_oid = dir_oid_prefix;
7c673cae 5942 int r = open_bucket_index_ctx(bucket_info, index_ctx);
31f18b77 5943 if (r < 0) {
7c673cae 5944 return r;
31f18b77 5945 }
7c673cae 5946
7c673cae
FG
5947 dir_oid.append(bucket_info.bucket.bucket_id);
5948
5949 map<int, string> bucket_objs;
5950 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5951
f64942e4
AA
5952 return CLSRGWIssueBucketIndexInit(index_ctx,
5953 bucket_objs,
5954 cct->_conf->rgw_bucket_index_max_aio)();
5955}
5956
5957int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5958{
5959 librados::IoCtx index_ctx;
5960
5961 std::string dir_oid = dir_oid_prefix;
5962 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5963 if (r < 0) {
5964 return r;
5965 }
5966
5967 dir_oid.append(bucket_info.bucket.bucket_id);
5968
5969 std::map<int, std::string> bucket_objs;
5970 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5971
5972 return CLSRGWIssueBucketIndexClean(index_ctx,
5973 bucket_objs,
5974 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
5975}
5976
5977void RGWRados::create_bucket_id(string *bucket_id)
5978{
5979 uint64_t iid = instance_id();
5980 uint64_t bid = next_bucket_id();
5981 char buf[get_zone_params().get_id().size() + 48];
5982 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5983 *bucket_id = buf;
5984}
5985
7c673cae
FG
5986int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5987 const string& zonegroup_id,
5988 const string& placement_rule,
5989 const string& swift_ver_location,
5990 const RGWQuotaInfo * pquota_info,
5991 map<std::string, bufferlist>& attrs,
5992 RGWBucketInfo& info,
5993 obj_version *pobjv,
5994 obj_version *pep_objv,
5995 real_time creation_time,
5996 rgw_bucket *pmaster_bucket,
5997 uint32_t *pmaster_num_shards,
5998 bool exclusive)
5999{
6000#define MAX_CREATE_RETRIES 20 /* need to bound retries */
6001 string selected_placement_rule_name;
6002 RGWZonePlacementInfo rule_info;
6003
6004 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
6005 int ret = 0;
6006 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
6007 &selected_placement_rule_name, &rule_info);
6008 if (ret < 0)
6009 return ret;
6010
6011 if (!pmaster_bucket) {
6012 create_bucket_id(&bucket.marker);
6013 bucket.bucket_id = bucket.marker;
6014 } else {
6015 bucket.marker = pmaster_bucket->marker;
6016 bucket.bucket_id = pmaster_bucket->bucket_id;
6017 }
6018
6019 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
6020
6021 if (pobjv) {
6022 objv_tracker.write_version = *pobjv;
6023 } else {
6024 objv_tracker.generate_new_write_ver(cct);
6025 }
6026
6027 info.bucket = bucket;
6028 info.owner = owner.user_id;
6029 info.zonegroup = zonegroup_id;
6030 info.placement_rule = selected_placement_rule_name;
6031 info.index_type = rule_info.index_type;
6032 info.swift_ver_location = swift_ver_location;
6033 info.swift_versioning = (!swift_ver_location.empty());
6034 if (pmaster_num_shards) {
6035 info.num_shards = *pmaster_num_shards;
6036 } else {
6037 info.num_shards = bucket_index_max_shards;
6038 }
6039 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
6040 info.requester_pays = false;
6041 if (real_clock::is_zero(creation_time)) {
6042 info.creation_time = ceph::real_clock::now();
6043 } else {
6044 info.creation_time = creation_time;
6045 }
6046 if (pquota_info) {
6047 info.quota = *pquota_info;
6048 }
6049
6050 int r = init_bucket_index(info, info.num_shards);
6051 if (r < 0) {
6052 return r;
6053 }
6054
6055 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
6056 if (ret == -EEXIST) {
6057 librados::IoCtx index_ctx;
6058 map<int, string> bucket_objs;
6059 int r = open_bucket_index(info, index_ctx, bucket_objs);
6060 if (r < 0)
6061 return r;
6062
6063 /* we need to reread the info and return it, caller will have a use for it */
6064 RGWObjVersionTracker instance_ver = info.objv_tracker;
6065 info.objv_tracker.clear();
6066 RGWObjectCtx obj_ctx(this);
6067 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
6068 if (r < 0) {
6069 if (r == -ENOENT) {
6070 continue;
6071 }
6072 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
6073 return r;
6074 }
6075
6076 /* only remove it if it's a different bucket instance */
6077 if (info.bucket.bucket_id != bucket.bucket_id) {
6078 /* remove bucket meta instance */
f64942e4
AA
6079 r = rgw_bucket_instance_remove_entry(this,
6080 bucket.get_key(),
6081 &instance_ver);
7c673cae
FG
6082 if (r < 0)
6083 return r;
6084
f64942e4
AA
6085 /* remove bucket index objects asynchronously by best effort */
6086 (void) CLSRGWIssueBucketIndexClean(index_ctx,
6087 bucket_objs,
6088 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
6089 }
6090 /* ret == -ENOENT here */
6091 }
6092 return ret;
6093 }
6094
6095 /* this is highly unlikely */
6096 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
6097 return -ENOENT;
6098}
6099
6100int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
6101 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6102
6103{
c07f9fc5 6104 /* first check that zonegroup exists within current period. */
7c673cae
FG
6105 RGWZoneGroup zonegroup;
6106 int ret = get_zonegroup(zonegroup_id, zonegroup);
6107 if (ret < 0) {
6108 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
6109 return ret;
6110 }
6111
7c673cae 6112 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
c07f9fc5
FG
6113 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
6114
6115 if (!request_rule.empty()) {
6116 titer = zonegroup.placement_targets.find(request_rule);
6117 if (titer == zonegroup.placement_targets.end()) {
6118 ldout(cct, 0) << "could not find requested placement id " << request_rule
6119 << " within zonegroup " << dendl;
6120 return -ERR_INVALID_LOCATION_CONSTRAINT;
6121 }
6122 } else if (!user_info.default_placement.empty()) {
6123 titer = zonegroup.placement_targets.find(user_info.default_placement);
6124 if (titer == zonegroup.placement_targets.end()) {
6125 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
6126 << " within zonegroup " << dendl;
6127 return -ERR_INVALID_LOCATION_CONSTRAINT;
6128 }
6129 } else {
6130 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
6131 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
6132 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
6133 } else {
6134 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
6135 if (titer == zonegroup.placement_targets.end()) {
6136 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
6137 << " within zonegroup " << dendl;
6138 return -ERR_INVALID_LOCATION_CONSTRAINT;
6139 }
6140 }
7c673cae
FG
6141 }
6142
6143 /* now check tag for the rule, whether user is permitted to use rule */
c07f9fc5 6144 const auto& target_rule = titer->second;
7c673cae 6145 if (!target_rule.user_permitted(user_info.placement_tags)) {
c07f9fc5 6146 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
7c673cae
FG
6147 return -EPERM;
6148 }
6149
6150 if (pselected_rule_name)
c07f9fc5 6151 *pselected_rule_name = titer->first;
7c673cae 6152
c07f9fc5 6153 return select_bucket_location_by_rule(titer->first, rule_info);
7c673cae
FG
6154}
6155
6156int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
6157{
6158 if (location_rule.empty()) {
6159 /* we can only reach here if we're trying to set a bucket location from a bucket
6160 * created on a different zone, using a legacy / default pool configuration
6161 */
6162 return select_legacy_bucket_placement(rule_info);
6163 }
6164
6165 /*
6166 * make sure that zone has this rule configured. We're
6167 * checking it for the local zone, because that's where this bucket object is going to
6168 * reside.
6169 */
6170 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
6171 if (piter == get_zone_params().placement_pools.end()) {
6172 /* couldn't find, means we cannot really place data for this bucket in this zone */
224ce89b 6173 if (get_zonegroup().equals(zonegroup.get_id())) {
7c673cae
FG
6174 /* that's a configuration error, zone should have that rule, as we're within the requested
6175 * zonegroup */
6176 return -EINVAL;
6177 } else {
6178 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
6179 return 0;
6180 }
6181 }
6182
6183 RGWZonePlacementInfo& placement_info = piter->second;
6184
6185 if (rule_info) {
6186 *rule_info = placement_info;
6187 }
6188
6189 return 0;
6190}
6191
6192int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
6193 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6194{
6195 if (!get_zone_params().placement_pools.empty()) {
6196 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
6197 pselected_rule_name, rule_info);
6198 }
6199
6200 if (pselected_rule_name) {
6201 pselected_rule_name->clear();
6202 }
6203
6204 return select_legacy_bucket_placement(rule_info);
6205}
6206
6207int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
6208{
6209 bufferlist map_bl;
6210 map<string, bufferlist> m;
6211 string pool_name;
6212 bool write_map = false;
6213
6214 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6215
6216 RGWObjectCtx obj_ctx(this);
6217 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6218 if (ret < 0) {
6219 goto read_omap;
6220 }
6221
6222 try {
6223 bufferlist::iterator iter = map_bl.begin();
6224 ::decode(m, iter);
6225 } catch (buffer::error& err) {
6226 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6227 }
6228
6229read_omap:
6230 if (m.empty()) {
6231 bufferlist header;
6232 ret = omap_get_all(obj, header, m);
6233
6234 write_map = true;
6235 }
6236
6237 if (ret < 0 || m.empty()) {
6238 vector<rgw_pool> pools;
6239 string s = string("default.") + default_storage_pool_suffix;
6240 pools.push_back(rgw_pool(s));
6241 vector<int> retcodes;
6242 bufferlist bl;
6243 ret = create_pools(pools, retcodes);
6244 if (ret < 0)
6245 return ret;
6246 ret = omap_set(obj, s, bl);
6247 if (ret < 0)
6248 return ret;
6249 m[s] = bl;
6250 }
6251
6252 if (write_map) {
6253 bufferlist new_bl;
6254 ::encode(m, new_bl);
6255 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6256 if (ret < 0) {
6257 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6258 }
6259 }
6260
6261 map<string, bufferlist>::iterator miter;
6262 if (m.size() > 1) {
6263 vector<string> v;
6264 for (miter = m.begin(); miter != m.end(); ++miter) {
6265 v.push_back(miter->first);
6266 }
6267
6268 uint32_t r;
6269 ret = get_random_bytes((char *)&r, sizeof(r));
6270 if (ret < 0)
6271 return ret;
6272
6273 int i = r % v.size();
6274 pool_name = v[i];
6275 } else {
6276 miter = m.begin();
6277 pool_name = miter->first;
6278 }
6279
6280 rule_info->data_pool = pool_name;
6281 rule_info->data_extra_pool = pool_name;
6282 rule_info->index_pool = pool_name;
6283 rule_info->index_type = RGWBIType_Normal;
6284
6285 return 0;
6286}
6287
6288bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6289{
6290 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6291}
6292
6293bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6294{
6295 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6296
6297 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6298}
6299
6300int RGWRados::update_placement_map()
6301{
6302 bufferlist header;
6303 map<string, bufferlist> m;
6304 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6305 int ret = omap_get_all(obj, header, m);
6306 if (ret < 0)
6307 return ret;
6308
6309 bufferlist new_bl;
6310 ::encode(m, new_bl);
6311 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6312 if (ret < 0) {
6313 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6314 }
6315
6316 return ret;
6317}
6318
6319int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6320{
6321 librados::Rados *rad = get_rados_handle();
6322 int ret = rad->pool_lookup(new_pool.name.c_str());
6323 if (ret < 0) // DNE, or something
6324 return ret;
6325
6326 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6327 bufferlist empty_bl;
6328 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6329
6330 // don't care about return value
6331 update_placement_map();
6332
6333 return ret;
6334}
6335
6336int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6337{
6338 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6339 int ret = omap_del(obj, old_pool.to_str());
6340
6341 // don't care about return value
6342 update_placement_map();
6343
6344 return ret;
6345}
6346
6347int RGWRados::list_placement_set(set<rgw_pool>& names)
6348{
6349 bufferlist header;
6350 map<string, bufferlist> m;
6351
6352 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6353 int ret = omap_get_all(obj, header, m);
6354 if (ret < 0)
6355 return ret;
6356
6357 names.clear();
6358 map<string, bufferlist>::iterator miter;
6359 for (miter = m.begin(); miter != m.end(); ++miter) {
6360 names.insert(rgw_pool(miter->first));
6361 }
6362
6363 return names.size();
6364}
6365
6366int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6367{
6368 vector<librados::PoolAsyncCompletion *> completions;
6369 vector<int> rets;
6370
6371 librados::Rados *rad = get_rados_handle();
6372 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6373 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6374 completions.push_back(c);
6375 rgw_pool& pool = *iter;
6376 int ret = rad->pool_create_async(pool.name.c_str(), c);
6377 rets.push_back(ret);
6378 }
6379
6380 vector<int>::iterator riter;
6381 vector<librados::PoolAsyncCompletion *>::iterator citer;
6382
c07f9fc5 6383 bool error = false;
7c673cae
FG
6384 assert(rets.size() == completions.size());
6385 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6386 int r = *riter;
6387 PoolAsyncCompletion *c = *citer;
6388 if (r == 0) {
6389 c->wait();
6390 r = c->get_return_value();
6391 if (r < 0) {
6392 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
c07f9fc5 6393 error = true;
7c673cae
FG
6394 }
6395 }
6396 c->release();
6397 retcodes.push_back(r);
6398 }
c07f9fc5
FG
6399 if (error) {
6400 return 0;
6401 }
6402
6403 std::vector<librados::IoCtx> io_ctxs;
6404 retcodes.clear();
6405 for (auto pool : pools) {
6406 io_ctxs.emplace_back();
6407 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6408 if (ret < 0) {
6409 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6410 error = true;
6411 }
6412 retcodes.push_back(ret);
6413 }
6414 if (error) {
6415 return 0;
6416 }
6417
6418 completions.clear();
6419 for (auto &io_ctx : io_ctxs) {
6420 librados::PoolAsyncCompletion *c =
6421 librados::Rados::pool_async_create_completion();
6422 completions.push_back(c);
6423 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6424 false, c);
6425 assert(ret == 0);
6426 }
6427
6428 retcodes.clear();
6429 for (auto c : completions) {
6430 c->wait();
6431 int ret = c->get_return_value();
6432 if (ret == -EOPNOTSUPP) {
6433 ret = 0;
6434 } else if (ret < 0) {
6435 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6436 << dendl;
6437 error = true;
6438 }
6439 c->release();
6440 retcodes.push_back(ret);
6441 }
7c673cae
FG
6442 return 0;
6443}
6444
6445int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6446{
6447 string oid, key;
6448 get_obj_bucket_and_oid_loc(obj, oid, key);
6449
6450 rgw_pool pool;
6451 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6452 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6453 return -EIO;
6454 }
6455
6456 int r = open_pool_ctx(pool, *ioctx);
6457 if (r < 0) {
6458 return r;
6459 }
6460
6461 ioctx->locator_set_key(key);
6462
6463 return 0;
6464}
6465
6466int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6467{
6468 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6469
6470 rgw_pool pool;
6471 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6472 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6473 return -EIO;
6474 }
6475
6476 int r = open_pool_ctx(pool, ref->ioctx);
6477 if (r < 0) {
6478 return r;
6479 }
6480
6481 ref->ioctx.locator_set_key(ref->key);
6482
6483 return 0;
6484}
6485
224ce89b 6486int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae
FG
6487{
6488 ref->oid = obj.oid;
6489 ref->key = obj.loc;
6490
6491 int r;
6492
6493 if (ref->oid.empty()) {
6494 ref->oid = obj.pool.to_str();
6495 ref->pool = get_zone_params().domain_root;
6496 } else {
6497 ref->pool = obj.pool;
6498 }
7c673cae
FG
6499 r = open_pool_ctx(ref->pool, ref->ioctx);
6500 if (r < 0)
6501 return r;
6502
6503 ref->ioctx.locator_set_key(ref->key);
6504
6505 return 0;
6506}
6507
224ce89b 6508int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 6509{
224ce89b 6510 return get_raw_obj_ref(obj, ref);
7c673cae
FG
6511}
6512
6513/*
6514 * fixes an issue where head objects were supposed to have a locator created, but ended
6515 * up without one
6516 */
6517int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6518{
6519 const rgw_bucket& bucket = bucket_info.bucket;
6520 string oid;
6521 string locator;
6522
6523 rgw_obj obj(bucket, key);
6524
6525 get_obj_bucket_and_oid_loc(obj, oid, locator);
6526
6527 if (locator.empty()) {
6528 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6529 return 0;
6530 }
6531
6532 librados::IoCtx ioctx;
6533
6534 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6535 if (ret < 0) {
6536 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6537 return ret;
6538 }
6539 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6540
6541 uint64_t size;
6542 bufferlist data;
6543
6544 struct timespec mtime_ts;
6545 map<string, bufferlist> attrs;
6546 librados::ObjectReadOperation op;
6547 op.getxattrs(&attrs, NULL);
6548 op.stat2(&size, &mtime_ts, NULL);
6549#define HEAD_SIZE 512 * 1024
6550 op.read(0, HEAD_SIZE, &data, NULL);
6551
6552 ret = ioctx.operate(oid, &op, NULL);
6553 if (ret < 0) {
6554 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6555 return ret;
6556 }
6557
6558 if (size > HEAD_SIZE) {
6559 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6560 return -EIO;
6561 }
6562
6563 if (size != data.length()) {
6564 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6565 return -EIO;
6566 }
6567
6568 if (copy_obj) {
6569 librados::ObjectWriteOperation wop;
6570
6571 wop.mtime2(&mtime_ts);
6572
6573 map<string, bufferlist>::iterator iter;
6574 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6575 wop.setxattr(iter->first.c_str(), iter->second);
6576 }
6577
6578 wop.write(0, data);
6579
6580 ioctx.locator_set_key(locator);
6581 ioctx.operate(oid, &wop);
6582 }
6583
6584 if (remove_bad) {
6585 ioctx.locator_set_key(string());
6586
6587 ret = ioctx.remove(oid);
6588 if (ret < 0) {
6589 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6590 return ret;
6591 }
6592 }
6593
6594 return 0;
6595}
6596
6597int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6598 const string& src_oid, const string& src_locator,
6599 librados::IoCtx& dst_ioctx,
6600 const string& dst_oid, const string& dst_locator)
6601{
6602
6603#define COPY_BUF_SIZE (4 * 1024 * 1024)
6604 bool done = false;
6605 uint64_t chunk_size = COPY_BUF_SIZE;
6606 uint64_t ofs = 0;
6607 int ret = 0;
6608 real_time mtime;
6609 struct timespec mtime_ts;
6610 uint64_t size;
6611
6612 if (src_oid == dst_oid && src_locator == dst_locator) {
6613 return 0;
6614 }
6615
6616 src_ioctx.locator_set_key(src_locator);
6617 dst_ioctx.locator_set_key(dst_locator);
6618
6619 do {
6620 bufferlist data;
6621 ObjectReadOperation rop;
6622 ObjectWriteOperation wop;
6623
6624 if (ofs == 0) {
6625 rop.stat2(&size, &mtime_ts, NULL);
6626 mtime = real_clock::from_timespec(mtime_ts);
6627 }
6628 rop.read(ofs, chunk_size, &data, NULL);
6629 ret = src_ioctx.operate(src_oid, &rop, NULL);
6630 if (ret < 0) {
6631 goto done_err;
6632 }
6633
6634 if (data.length() == 0) {
6635 break;
6636 }
6637
6638 if (ofs == 0) {
6639 wop.create(true); /* make it exclusive */
6640 wop.mtime2(&mtime_ts);
6641 mtime = real_clock::from_timespec(mtime_ts);
6642 }
6643 wop.write(ofs, data);
6644 ret = dst_ioctx.operate(dst_oid, &wop);
6645 ofs += data.length();
6646 done = data.length() != chunk_size;
6647 } while (!done);
6648
6649 if (ofs != size) {
6650 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6651 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6652 ret = -EIO;
6653 goto done_err;
6654 }
6655
6656 src_ioctx.remove(src_oid);
6657
6658 return 0;
6659
6660done_err:
6661 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6662 return ret;
6663}
6664
6665/*
6666 * fixes an issue where head objects were supposed to have a locator created, but ended
6667 * up without one
6668 */
6669int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6670{
6671 const rgw_bucket& bucket = bucket_info.bucket;
6672 rgw_obj obj(bucket, key);
6673
6674 if (need_fix) {
6675 *need_fix = false;
6676 }
6677
6678 rgw_rados_ref ref;
6679 int r = get_obj_head_ref(bucket_info, obj, &ref);
6680 if (r < 0) {
6681 return r;
6682 }
6683
6684 RGWObjState *astate = NULL;
6685 RGWObjectCtx rctx(this);
6686 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6687 if (r < 0)
6688 return r;
6689
6690 if (astate->has_manifest) {
6691 RGWObjManifest::obj_iterator miter;
6692 RGWObjManifest& manifest = astate->manifest;
6693 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6694 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6695 rgw_obj loc;
6696 string oid;
6697 string locator;
6698
6699 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6700
6701 if (loc.key.ns.empty()) {
6702 /* continue, we're only interested in tail objects */
6703 continue;
6704 }
6705
6706 get_obj_bucket_and_oid_loc(loc, oid, locator);
6707 ref.ioctx.locator_set_key(locator);
6708
6709 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6710
6711 r = ref.ioctx.stat(oid, NULL, NULL);
6712 if (r != -ENOENT) {
6713 continue;
6714 }
6715
6716 string bad_loc;
6717 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6718
6719 /* create a new ioctx with the bad locator */
6720 librados::IoCtx src_ioctx;
6721 src_ioctx.dup(ref.ioctx);
6722 src_ioctx.locator_set_key(bad_loc);
6723
6724 r = src_ioctx.stat(oid, NULL, NULL);
6725 if (r != 0) {
6726 /* cannot find a broken part */
6727 continue;
6728 }
6729 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6730 if (need_fix) {
6731 *need_fix = true;
6732 }
6733 if (fix) {
6734 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6735 if (r < 0) {
6736 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6737 }
6738 }
6739 }
6740 }
6741
6742 return 0;
6743}
6744
f64942e4
AA
6745int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
6746 const rgw_obj& obj,
6747 RGWBucketInfo* bucket_info_out)
7c673cae
FG
6748{
6749 bucket = _bucket;
6750
6751 RGWObjectCtx obj_ctx(store);
6752
6753 RGWBucketInfo bucket_info;
f64942e4
AA
6754 RGWBucketInfo* bucket_info_p =
6755 bucket_info_out ? bucket_info_out : &bucket_info;
6756
6757 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
7c673cae
FG
6758 if (ret < 0) {
6759 return ret;
6760 }
6761
f64942e4 6762 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae
FG
6763 if (ret < 0) {
6764 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6765 return ret;
6766 }
6767 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6768
6769 return 0;
6770}
6771
f64942e4
AA
6772int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
6773 int sid,
6774 RGWBucketInfo* bucket_info_out)
7c673cae
FG
6775{
6776 bucket = _bucket;
6777 shard_id = sid;
6778
6779 RGWObjectCtx obj_ctx(store);
6780
6781 RGWBucketInfo bucket_info;
f64942e4
AA
6782 RGWBucketInfo* bucket_info_p =
6783 bucket_info_out ? bucket_info_out : &bucket_info;
6784 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
7c673cae
FG
6785 if (ret < 0) {
6786 return ret;
6787 }
6788
f64942e4 6789 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj);
7c673cae
FG
6790 if (ret < 0) {
6791 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6792 return ret;
6793 }
6794 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6795
6796 return 0;
6797}
6798
a8e16298
TL
6799int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info,
6800 const rgw_obj& obj)
6801{
6802 bucket = bucket_info.bucket;
6803
6804 int ret = store->open_bucket_index_shard(bucket_info, index_ctx,
6805 obj.get_hash_object(), &bucket_obj,
6806 &shard_id);
6807 if (ret < 0) {
6808 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6809 return ret;
6810 }
6811 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6812
6813 return 0;
6814}
6815
b32b8144
FG
6816int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
6817{
6818 bucket = bucket_info.bucket;
6819 shard_id = sid;
6820
6821 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6822 if (ret < 0) {
6823 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6824 return ret;
6825 }
6826 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6827
6828 return 0;
6829}
6830
7c673cae
FG
6831
6832/* Execute @handler on last item in bucket listing for bucket specified
6833 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6834 * to objects matching these criterias. */
6835int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6836 const std::string& obj_prefix,
6837 const std::string& obj_delim,
6838 std::function<int(const rgw_bucket_dir_entry&)> handler)
6839{
6840 RGWRados::Bucket target(this, bucket_info);
6841 RGWRados::Bucket::List list_op(&target);
6842
6843 list_op.params.prefix = obj_prefix;
6844 list_op.params.delim = obj_delim;
6845
6846 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6847 << ", obj_prefix=" << obj_prefix
6848 << ", obj_delim=" << obj_delim
6849 << dendl;
6850
6851 bool is_truncated = false;
6852
6853 boost::optional<rgw_bucket_dir_entry> last_entry;
6854 /* We need to rewind to the last object in a listing. */
6855 do {
6856 /* List bucket entries in chunks. */
6857 static constexpr int MAX_LIST_OBJS = 100;
6858 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6859
6860 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6861 &is_truncated);
6862 if (ret < 0) {
6863 return ret;
6864 } else if (!entries.empty()) {
6865 last_entry = entries.back();
6866 }
6867 } while (is_truncated);
6868
6869 if (last_entry) {
6870 return handler(*last_entry);
6871 }
6872
6873 /* Empty listing - no items we can run handler on. */
6874 return 0;
6875}
6876
6877
6878int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6879 const rgw_user& user,
6880 RGWBucketInfo& bucket_info,
6881 rgw_obj& obj)
6882{
6883 if (! swift_versioning_enabled(bucket_info)) {
6884 return 0;
6885 }
6886
6887 obj_ctx.obj.set_atomic(obj);
6888
6889 RGWObjState * state = nullptr;
6890 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6891 if (r < 0) {
6892 return r;
6893 }
6894
6895 if (!state->exists) {
6896 return 0;
6897 }
6898
6899 string client_id;
6900 string op_id;
6901
6902 const string& src_name = obj.get_oid();
6903 char buf[src_name.size() + 32];
6904 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6905 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6906 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6907
6908 RGWBucketInfo dest_bucket_info;
6909
6910 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6911 if (r < 0) {
6912 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6913 if (r == -ENOENT) {
6914 return -ERR_PRECONDITION_FAILED;
6915 }
6916 return r;
6917 }
6918
6919 if (dest_bucket_info.owner != bucket_info.owner) {
6920 return -ERR_PRECONDITION_FAILED;
6921 }
6922
6923 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6924 obj_ctx.obj.set_atomic(dest_obj);
6925
6926 string no_zone;
6927
6928 r = copy_obj(obj_ctx,
6929 user,
6930 client_id,
6931 op_id,
6932 NULL, /* req_info *info */
6933 no_zone,
6934 dest_obj,
6935 obj,
6936 dest_bucket_info,
6937 bucket_info,
6938 NULL, /* time_t *src_mtime */
6939 NULL, /* time_t *mtime */
6940 NULL, /* const time_t *mod_ptr */
6941 NULL, /* const time_t *unmod_ptr */
6942 false, /* bool high_precision_time */
6943 NULL, /* const char *if_match */
6944 NULL, /* const char *if_nomatch */
6945 RGWRados::ATTRSMOD_NONE,
6946 true, /* bool copy_if_newer */
6947 state->attrset,
6948 RGW_OBJ_CATEGORY_MAIN,
6949 0, /* uint64_t olh_epoch */
6950 real_time(), /* time_t delete_at */
6951 NULL, /* string *version_id */
6952 NULL, /* string *ptag */
6953 NULL, /* string *petag */
7c673cae
FG
6954 NULL, /* void (*progress_cb)(off_t, void *) */
6955 NULL); /* void *progress_data */
6956 if (r == -ECANCELED || r == -ENOENT) {
6957 /* Has already been overwritten, meaning another rgw process already
6958 * copied it out */
6959 return 0;
6960 }
6961
6962 return r;
6963}
6964
6965int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6966 const rgw_user& user,
6967 RGWBucketInfo& bucket_info,
6968 rgw_obj& obj,
6969 bool& restored) /* out */
6970{
6971 if (! swift_versioning_enabled(bucket_info)) {
6972 return 0;
6973 }
6974
6975 /* Bucket info of the bucket that stores previous versions of our object. */
6976 RGWBucketInfo archive_binfo;
6977
6978 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6979 bucket_info.swift_ver_location, archive_binfo,
6980 nullptr, nullptr);
6981 if (ret < 0) {
6982 return ret;
6983 }
6984
6985 /* Abort the operation if the bucket storing our archive belongs to someone
6986 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6987 * into consideration. For we can live with that.
6988 *
6989 * TODO: delegate this check to un upper layer and compare with ACLs. */
6990 if (bucket_info.owner != archive_binfo.owner) {
6991 return -EPERM;
6992 }
6993
6994 /* This code will be executed on latest version of the object. */
6995 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6996 std::string no_client_id;
6997 std::string no_op_id;
6998 std::string no_zone;
6999
7000 /* We don't support object versioning of Swift API on those buckets that
7001 * are already versioned using the S3 mechanism. This affects also bucket
7002 * storing archived objects. Otherwise the delete operation would create
7003 * a deletion marker. */
7004 if (archive_binfo.versioned()) {
7005 restored = false;
7006 return -ERR_PRECONDITION_FAILED;
7007 }
7008
7009 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
7010 * irrelevant and may be safely skipped. */
7011 std::map<std::string, ceph::bufferlist> no_attrs;
7012
7013 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
7014 obj_ctx.obj.set_atomic(archive_obj);
7015 obj_ctx.obj.set_atomic(obj);
7016
7017 int ret = copy_obj(obj_ctx,
7018 user,
7019 no_client_id,
7020 no_op_id,
7021 nullptr, /* req_info *info */
7022 no_zone,
7023 obj, /* dest obj */
7024 archive_obj, /* src obj */
7025 bucket_info, /* dest bucket info */
7026 archive_binfo, /* src bucket info */
7027 nullptr, /* time_t *src_mtime */
7028 nullptr, /* time_t *mtime */
7029 nullptr, /* const time_t *mod_ptr */
7030 nullptr, /* const time_t *unmod_ptr */
7031 false, /* bool high_precision_time */
7032 nullptr, /* const char *if_match */
7033 nullptr, /* const char *if_nomatch */
7034 RGWRados::ATTRSMOD_NONE,
7035 true, /* bool copy_if_newer */
7036 no_attrs,
7037 RGW_OBJ_CATEGORY_MAIN,
7038 0, /* uint64_t olh_epoch */
7039 real_time(), /* time_t delete_at */
7040 nullptr, /* string *version_id */
7041 nullptr, /* string *ptag */
7042 nullptr, /* string *petag */
7c673cae
FG
7043 nullptr, /* void (*progress_cb)(off_t, void *) */
7044 nullptr); /* void *progress_data */
7045 if (ret == -ECANCELED || ret == -ENOENT) {
7046 /* Has already been overwritten, meaning another rgw process already
7047 * copied it out */
7048 return 0;
7049 } else if (ret < 0) {
7050 return ret;
7051 } else {
7052 restored = true;
7053 }
7054
7055 /* Need to remove the archived copy. */
7056 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
7057 archive_binfo.versioning_status());
7058
7059 return ret;
7060 };
7061
7062 const std::string& obj_name = obj.get_oid();
7063 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
7064 % obj_name);
7065
7066 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
7067 handler);
7068}
7069
7070/**
7071 * Write/overwrite an object to the bucket storage.
7072 * bucket: the bucket to store the object in
7073 * obj: the object name/key
7074 * data: the object contents/value
7075 * size: the amount of data to write (data must be this long)
7076 * accounted_size: original size of data before compression, encryption
7077 * mtime: if non-NULL, writes the given mtime to the bucket storage
7078 * attrs: all the given attrs are written to bucket storage for the given object
7079 * exclusive: create object exclusively
7080 * Returns: 0 on success, -ERR# otherwise.
7081 */
7082int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
181888fb
FG
7083 map<string, bufferlist>& attrs,
7084 bool assume_noent, bool modify_tail,
7c673cae
FG
7085 void *_index_op)
7086{
7087 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
7088 RGWRados *store = target->get_store();
7089
7090 ObjectWriteOperation op;
7091
7092 RGWObjState *state;
7093 int r = target->get_state(&state, false, assume_noent);
7094 if (r < 0)
7095 return r;
7096
7097 rgw_obj& obj = target->get_obj();
7098
7099 if (obj.get_oid().empty()) {
7100 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
7101 return -EIO;
7102 }
7103
224ce89b 7104 rgw_rados_ref ref;
7c673cae
FG
7105 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
7106 if (r < 0)
7107 return r;
7108
7109 bool is_olh = state->is_olh;
7110
7111 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
7112
7113 const string *ptag = meta.ptag;
7114 if (!ptag && !index_op->get_optag()->empty()) {
7115 ptag = index_op->get_optag();
7116 }
181888fb 7117 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
7c673cae
FG
7118 if (r < 0)
7119 return r;
7120
7121 if (real_clock::is_zero(meta.set_mtime)) {
7122 meta.set_mtime = real_clock::now();
7123 }
7124
7125 if (state->is_olh) {
7126 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
7127 }
7128
7129 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
7130 op.mtime2(&mtime_ts);
7131
7132 if (meta.data) {
7133 /* if we want to overwrite the data, we also want to overwrite the
7134 xattrs, so just remove the object */
7135 op.write_full(*meta.data);
7136 }
7137
7138 string etag;
7139 string content_type;
7140 bufferlist acl_bl;
7141
7142 map<string, bufferlist>::iterator iter;
7143 if (meta.rmattrs) {
7144 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
7145 const string& name = iter->first;
7146 op.rmxattr(name.c_str());
7147 }
7148 }
7149
7150 if (meta.manifest) {
7151 /* remove existing manifest attr */
7152 iter = attrs.find(RGW_ATTR_MANIFEST);
7153 if (iter != attrs.end())
7154 attrs.erase(iter);
7155
7156 bufferlist bl;
7157 ::encode(*meta.manifest, bl);
7158 op.setxattr(RGW_ATTR_MANIFEST, bl);
7159 }
7160
7161 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
7162 const string& name = iter->first;
7163 bufferlist& bl = iter->second;
7164
7165 if (!bl.length())
7166 continue;
7167
7168 op.setxattr(name.c_str(), bl);
7169
7170 if (name.compare(RGW_ATTR_ETAG) == 0) {
7171 etag = bl.c_str();
7172 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
7173 content_type = bl.c_str();
7174 } else if (name.compare(RGW_ATTR_ACL) == 0) {
7175 acl_bl = bl;
7176 }
7177 }
7178 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
7179 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
7180 }
7181
7182 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
7183 bufferlist bl;
7184 ::encode(store->get_zone_short_id(), bl);
7185 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
7186 }
7187
7188 if (!op.size())
7189 return 0;
7190
7191 uint64_t epoch;
7192 int64_t poolid;
224ce89b
WB
7193 bool orig_exists;
7194 uint64_t orig_size;
7195
7196 if (!reset_obj) { //Multipart upload, it has immutable head.
7197 orig_exists = false;
7198 orig_size = 0;
7199 } else {
7200 orig_exists = state->exists;
7201 orig_size = state->accounted_size;
7202 }
7c673cae 7203
91327a77
AA
7204 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
7205 !obj.key.instance.empty();
7c673cae
FG
7206
7207 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
7208
7209 if (versioned_op) {
7210 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
7211 }
7212
7213 if (!index_op->is_prepared()) {
7214 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
7215 if (r < 0)
7216 return r;
7217 }
7218
7219 r = ref.ioctx.operate(ref.oid, &op);
7220 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
7221 or -ENOENT if was removed, or -EEXIST if it did not exist
7222 before and now it does */
7223 if (r == -EEXIST && assume_noent) {
7224 target->invalidate_state();
7225 return r;
7226 }
7227 goto done_cancel;
7228 }
7229
7230 epoch = ref.ioctx.get_last_version();
7231 poolid = ref.ioctx.get_id();
7232
7233 r = target->complete_atomic_modification();
7234 if (r < 0) {
7235 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7236 }
7237
7238 r = index_op->complete(poolid, epoch, size, accounted_size,
7239 meta.set_mtime, etag, content_type, &acl_bl,
7240 meta.category, meta.remove_objs, meta.user_data);
7241 if (r < 0)
7242 goto done_cancel;
7243
7244 if (meta.mtime) {
7245 *meta.mtime = meta.set_mtime;
7246 }
7247
7248 /* note that index_op was using state so we couldn't invalidate it earlier */
7249 target->invalidate_state();
7250 state = NULL;
7251
91327a77
AA
7252 if (versioned_op && meta.olh_epoch) {
7253 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, meta.zones_trace);
7c673cae
FG
7254 if (r < 0) {
7255 return r;
7256 }
7257 }
7258
7259 if (!real_clock::is_zero(meta.delete_at)) {
7260 rgw_obj_index_key obj_key;
7261 obj.key.get_index_key(&obj_key);
7262
7263 r = store->objexp_hint_add(meta.delete_at,
7264 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7265 if (r < 0) {
7266 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7267 /* ignoring error, nothing we can do at this point */
7268 }
7269 }
7270 meta.canceled = false;
7271
7272 /* update quota cache */
3efd9988
FG
7273 if (meta.completeMultipart){
7274 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7275 0, orig_size);
7276 }
7277 else {
7278 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7279 accounted_size, orig_size);
7280 }
7c673cae
FG
7281 return 0;
7282
7283done_cancel:
7284 int ret = index_op->cancel();
7285 if (ret < 0) {
7286 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7287 }
7288
7289 meta.canceled = true;
7290
7291 /* we lost in a race. There are a few options:
7292 * - existing object was rewritten (ECANCELED)
7293 * - non existing object was created (EEXIST)
7294 * - object was removed (ENOENT)
7295 * should treat it as a success
7296 */
7297 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7298 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7299 r = 0;
7300 }
7301 } else {
7302 if (meta.if_match != NULL) {
7303 // only overwrite existing object
7304 if (strcmp(meta.if_match, "*") == 0) {
7305 if (r == -ENOENT) {
7306 r = -ERR_PRECONDITION_FAILED;
7307 } else if (r == -ECANCELED) {
7308 r = 0;
7309 }
7310 }
7311 }
7312
7313 if (meta.if_nomatch != NULL) {
7314 // only create a new object
7315 if (strcmp(meta.if_nomatch, "*") == 0) {
7316 if (r == -EEXIST) {
7317 r = -ERR_PRECONDITION_FAILED;
7318 } else if (r == -ENOENT) {
7319 r = 0;
7320 }
7321 }
7322 }
7323 }
7324
7325 return r;
7326}
7327
7328int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7329 map<string, bufferlist>& attrs)
7330{
7331 RGWBucketInfo& bucket_info = target->get_bucket_info();
7332
7333 RGWRados::Bucket bop(target->get_store(), bucket_info);
7334 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
7335 index_op.set_zones_trace(meta.zones_trace);
7336
7c673cae
FG
7337 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7338 int r;
7339 if (assume_noent) {
181888fb 7340 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
7341 if (r == -EEXIST) {
7342 assume_noent = false;
7343 }
7344 }
7345 if (!assume_noent) {
181888fb 7346 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
7347 }
7348 return r;
7349}
7350
7351/** Write/overwrite a system object. */
7352int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7353 map<std::string, bufferlist>& attrs, int flags,
7354 bufferlist& data,
7355 RGWObjVersionTracker *objv_tracker,
7356 real_time set_mtime /* 0 for don't set */)
7357{
7c673cae 7358 rgw_rados_ref ref;
224ce89b 7359 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
7360 if (r < 0)
7361 return r;
7362
7363 ObjectWriteOperation op;
7364
7365 if (flags & PUT_OBJ_EXCL) {
7366 if (!(flags & PUT_OBJ_CREATE))
7367 return -EINVAL;
7368 op.create(true); // exclusive create
7369 } else {
7370 op.remove();
7371 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7372 op.create(false);
7373 }
7374
7375 if (objv_tracker) {
7376 objv_tracker->prepare_op_for_write(&op);
7377 }
7378
7379 if (real_clock::is_zero(set_mtime)) {
7380 set_mtime = real_clock::now();
7381 }
7382
7383 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7384 op.mtime2(&mtime_ts);
7385 op.write_full(data);
7386
7387 bufferlist acl_bl;
7388
7389 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7390 const string& name = iter->first;
7391 bufferlist& bl = iter->second;
7392
7393 if (!bl.length())
7394 continue;
7395
7396 op.setxattr(name.c_str(), bl);
7397 }
7398
7399 r = ref.ioctx.operate(ref.oid, &op);
7400 if (r < 0) {
7401 return r;
7402 }
7403
7404 if (objv_tracker) {
7405 objv_tracker->apply_write();
7406 }
7407
7408 if (mtime) {
7409 *mtime = set_mtime;
7410 }
7411
7412 return 0;
7413}
7414
7415int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7416 off_t ofs, bool exclusive,
7417 RGWObjVersionTracker *objv_tracker)
7418{
7419 rgw_rados_ref ref;
224ce89b 7420 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
7421 if (r < 0) {
7422 return r;
7423 }
7424
7425 ObjectWriteOperation op;
7426
7427 if (exclusive)
7428 op.create(true);
7429
7430 if (objv_tracker) {
7431 objv_tracker->prepare_op_for_write(&op);
7432 }
7433 if (ofs == -1) {
7434 op.write_full(bl);
7435 } else {
7436 op.write(ofs, bl);
7437 }
7438 r = ref.ioctx.operate(ref.oid, &op);
7439 if (r < 0)
7440 return r;
7441
7442 if (objv_tracker) {
7443 objv_tracker->apply_write();
7444 }
7445 return 0;
7446}
7447
7448/**
7449 * Write/overwrite an object to the bucket storage.
7450 * bucket: the bucket to store the object in
7451 * obj: the object name/key
7452 * data: the object contents/value
7453 * offset: the offet to write to in the object
7454 * If this is -1, we will overwrite the whole object.
7455 * size: the amount of data to write (data must be this long)
7456 * attrs: all the given attrs are written to bucket storage for the given object
7457 * Returns: 0 on success, -ERR# otherwise.
7458 */
7459
7460int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7461 off_t ofs, bool exclusive,
7462 void **handle)
7463{
7464 rgw_rados_ref ref;
7465 int r = get_raw_obj_ref(obj, &ref);
7466 if (r < 0) {
7467 return r;
7468 }
7469
7470 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7471 *handle = c;
7472
7473 ObjectWriteOperation op;
7474
7475 if (exclusive)
7476 op.create(true);
7477
7478 if (ofs == -1) {
7479 op.write_full(bl);
7480 } else {
7481 op.write(ofs, bl);
7482 }
7483 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7484 if (r < 0)
7485 return r;
7486
7487 return 0;
7488}
7489
7490int RGWRados::aio_wait(void *handle)
7491{
7492 AioCompletion *c = (AioCompletion *)handle;
7493 c->wait_for_safe();
7494 int ret = c->get_return_value();
7495 c->release();
7496 return ret;
7497}
7498
7499bool RGWRados::aio_completed(void *handle)
7500{
7501 AioCompletion *c = (AioCompletion *)handle;
7502 return c->is_safe();
7503}
7504
28e407b8
AA
7505// PutObj filter that buffers data so we don't try to compress tiny blocks.
7506// libcurl reads in 16k at a time, and we need at least 64k to get a good
7507// compression ratio
7508class RGWPutObj_Buffer : public RGWPutObj_Filter {
7509 const unsigned buffer_size;
7510 bufferlist buffer;
7511 public:
7512 RGWPutObj_Buffer(RGWPutObjDataProcessor* next, unsigned buffer_size)
7513 : RGWPutObj_Filter(next), buffer_size(buffer_size) {
7514 assert(ISP2(buffer_size)); // must be power of 2
7515 }
7516
7517 int handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj,
7518 bool *again) override {
7519 if (*again || !bl.length()) {
7520 // flush buffered data
7521 return RGWPutObj_Filter::handle_data(buffer, ofs, phandle, pobj, again);
7522 }
7523 // transform offset to the beginning of the buffer
7524 ofs = ofs - buffer.length();
7525 buffer.claim_append(bl);
7526 if (buffer.length() < buffer_size) {
7527 *again = false; // don't come back until there's more data
7528 return 0;
7529 }
7530 const auto count = P2ALIGN(buffer.length(), buffer_size);
7531 buffer.splice(0, count, &bl);
7532 return RGWPutObj_Filter::handle_data(bl, ofs, phandle, pobj, again);
7533 }
7534};
7535
7c673cae
FG
7536class RGWRadosPutObj : public RGWGetDataCB
7537{
7538 CephContext* cct;
7539 rgw_obj obj;
7540 RGWPutObjDataProcessor *filter;
7541 boost::optional<RGWPutObj_Compress>& compressor;
28e407b8 7542 boost::optional<RGWPutObj_Buffer> buffering;
7c673cae
FG
7543 CompressorRef& plugin;
7544 RGWPutObjProcessor_Atomic *processor;
7545 RGWOpStateSingleOp *opstate;
7546 void (*progress_cb)(off_t, void *);
7547 void *progress_data;
7548 bufferlist extra_data_bl;
b32b8144 7549 uint64_t extra_data_left;
7c673cae
FG
7550 uint64_t data_len;
7551 map<string, bufferlist> src_attrs;
7552public:
7553 RGWRadosPutObj(CephContext* cct,
7554 CompressorRef& plugin,
7555 boost::optional<RGWPutObj_Compress>& compressor,
7556 RGWPutObjProcessor_Atomic *p,
7557 RGWOpStateSingleOp *_ops,
7558 void (*_progress_cb)(off_t, void *),
7559 void *_progress_data) :
7560 cct(cct),
7561 filter(p),
7562 compressor(compressor),
7563 plugin(plugin),
7564 processor(p),
7565 opstate(_ops),
7566 progress_cb(_progress_cb),
7567 progress_data(_progress_data),
b32b8144 7568 extra_data_left(0),
7c673cae
FG
7569 data_len(0) {}
7570
7571 int process_attrs(void) {
7572 if (extra_data_bl.length()) {
7573 JSONParser jp;
7574 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7575 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7576 return -EIO;
7577 }
7578
7579 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7580
7581 src_attrs.erase(RGW_ATTR_COMPRESSION);
7582 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
a8e16298
TL
7583
7584 // filter out olh attributes
7585 auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
7586 while (iter != src_attrs.end()) {
7587 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
7588 break;
7589 }
7590 iter = src_attrs.erase(iter);
7591 }
7c673cae
FG
7592 }
7593
7594 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7595 //do not compress if object is encrypted
7596 compressor = boost::in_place(cct, plugin, filter);
28e407b8
AA
7597 constexpr unsigned buffer_size = 512 * 1024;
7598 buffering = boost::in_place(&*compressor, buffer_size);
7599 filter = &*buffering;
7c673cae
FG
7600 }
7601 return 0;
7602 }
7603
7604 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7605 if (progress_cb) {
7606 progress_cb(ofs, progress_data);
7607 }
b32b8144 7608 if (extra_data_left) {
7c673cae 7609 size_t extra_len = bl.length();
b32b8144
FG
7610 if (extra_len > extra_data_left)
7611 extra_len = extra_data_left;
7c673cae
FG
7612
7613 bufferlist extra;
7614 bl.splice(0, extra_len, &extra);
7615 extra_data_bl.append(extra);
7616
b32b8144
FG
7617 extra_data_left -= extra_len;
7618 if (extra_data_left == 0) {
7c673cae
FG
7619 int res = process_attrs();
7620 if (res < 0)
7621 return res;
7622 }
7623 if (bl.length() == 0) {
7624 return 0;
7625 }
b32b8144 7626 ofs += extra_len;
7c673cae 7627 }
b32b8144
FG
7628 // adjust ofs based on extra_data_len, so the result is a logical offset
7629 // into the object data
7630 assert(uint64_t(ofs) >= extra_data_len);
7631 ofs -= extra_data_len;
7632
7c673cae
FG
7633 data_len += bl.length();
7634 bool again = false;
7635
7636 bool need_opstate = true;
7637
7638 do {
7639 void *handle = NULL;
7640 rgw_raw_obj obj;
7641 uint64_t size = bl.length();
7642 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7643 if (ret < 0)
7644 return ret;
7645
7646 if (need_opstate && opstate) {
7647 /* need to update opstate repository with new state. This is ratelimited, so we're not
7648 * really doing it every time
7649 */
7650 ret = opstate->renew_state();
7651 if (ret < 0) {
7652 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7653 int r = filter->throttle_data(handle, obj, size, false);
7654 if (r < 0) {
7655 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7656 }
7657 /* could not renew state! might have been marked as cancelled */
7658 return ret;
7659 }
7660 need_opstate = false;
7661 }
7662
7663 ret = filter->throttle_data(handle, obj, size, false);
7664 if (ret < 0)
7665 return ret;
7666 } while (again);
7667
7668 return 0;
7669 }
7670
28e407b8
AA
7671 int flush() {
7672 bufferlist bl;
7673 return put_data_and_throttle(filter, bl, 0, false);
7674 }
7675
7c673cae
FG
7676 bufferlist& get_extra_data() { return extra_data_bl; }
7677
7678 map<string, bufferlist>& get_attrs() { return src_attrs; }
7679
7680 void set_extra_data_len(uint64_t len) override {
b32b8144
FG
7681 extra_data_left = len;
7682 RGWGetDataCB::set_extra_data_len(len);
7c673cae
FG
7683 }
7684
7685 uint64_t get_data_len() {
7686 return data_len;
7687 }
7688
7689 int complete(const string& etag, real_time *mtime, real_time set_mtime,
31f18b77
FG
7690 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7691 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7c673cae
FG
7692 }
7693
7694 bool is_canceled() {
7695 return processor->is_canceled();
7696 }
7697};
7698
7699/*
7700 * prepare attrset depending on attrs_mod.
7701 */
7702static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7703 map<string, bufferlist>& attrs,
7704 RGWRados::AttrsMod attrs_mod)
7705{
7706 switch (attrs_mod) {
7707 case RGWRados::ATTRSMOD_NONE:
7708 attrs = src_attrs;
7709 break;
7710 case RGWRados::ATTRSMOD_REPLACE:
7711 if (!attrs[RGW_ATTR_ETAG].length()) {
7712 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7713 }
181888fb
FG
7714 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7715 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7716 if (ttiter != src_attrs.end()) {
7717 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7718 }
7719 }
7c673cae
FG
7720 break;
7721 case RGWRados::ATTRSMOD_MERGE:
7722 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7723 if (attrs.find(it->first) == attrs.end()) {
7724 attrs[it->first] = it->second;
7725 }
7726 }
7727 break;
7728 }
7729}
7730
7731int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7732{
7733 map<string, bufferlist> attrset;
7734
7735 real_time mtime;
7736 uint64_t obj_size;
7737 RGWObjectCtx rctx(this);
7738
7739 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7740 RGWRados::Object::Read read_op(&op_target);
7741
7742 read_op.params.attrs = &attrset;
7743 read_op.params.lastmod = &mtime;
7744 read_op.params.obj_size = &obj_size;
7745
7746 int ret = read_op.prepare();
7747 if (ret < 0)
7748 return ret;
7749
7750 attrset.erase(RGW_ATTR_ID_TAG);
181888fb 7751 attrset.erase(RGW_ATTR_TAIL_TAG);
7c673cae
FG
7752
7753 uint64_t max_chunk_size;
7754
7755 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7756 if (ret < 0) {
7757 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7758 return ret;
7759 }
7760
b32b8144
FG
7761 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj,
7762 max_chunk_size, NULL, mtime, attrset,
7763 RGW_OBJ_CATEGORY_MAIN, 0, real_time(),
7764 (obj.key.instance.empty() ? NULL : &(obj.key.instance)),
7765 NULL, NULL);
7c673cae
FG
7766}
7767
7768struct obj_time_weight {
7769 real_time mtime;
7770 uint32_t zone_short_id;
7771 uint64_t pg_ver;
7772 bool high_precision;
7773
7774 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7775
7776 bool compare_low_precision(const obj_time_weight& rhs) {
7777 struct timespec l = ceph::real_clock::to_timespec(mtime);
7778 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7779 l.tv_nsec = 0;
7780 r.tv_nsec = 0;
7781 if (l > r) {
7782 return false;
7783 }
7784 if (l < r) {
7785 return true;
7786 }
7787 if (zone_short_id != rhs.zone_short_id) {
7788 return (zone_short_id < rhs.zone_short_id);
7789 }
7790 return (pg_ver < rhs.pg_ver);
7791
7792 }
7793
7794 bool operator<(const obj_time_weight& rhs) {
7795 if (!high_precision || !rhs.high_precision) {
7796 return compare_low_precision(rhs);
7797 }
7798 if (mtime > rhs.mtime) {
7799 return false;
7800 }
7801 if (mtime < rhs.mtime) {
7802 return true;
7803 }
7804 if (zone_short_id != rhs.zone_short_id) {
7805 return (zone_short_id < rhs.zone_short_id);
7806 }
7807 return (pg_ver < rhs.pg_ver);
7808 }
7809
7810 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7811 mtime = _mtime;
7812 zone_short_id = _short_id;
7813 pg_ver = _pg_ver;
7814 }
7815
7816 void init(RGWObjState *state) {
7817 mtime = state->mtime;
7818 zone_short_id = state->zone_short_id;
7819 pg_ver = state->pg_ver;
7820 }
7821};
7822
7823inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7824 out << o.mtime;
7825
7826 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7827 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7828 }
7829
7830 return out;
7831}
7832
7833class RGWGetExtraDataCB : public RGWGetDataCB {
7834 bufferlist extra_data;
7835public:
7836 RGWGetExtraDataCB() {}
7837 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7838 if (extra_data.length() < extra_data_len) {
7839 off_t max = extra_data_len - extra_data.length();
7840 if (max > bl_len) {
7841 max = bl_len;
7842 }
7843 bl.splice(0, max, &extra_data);
7844 }
7845 return bl_len;
7846 }
7847
7848 bufferlist& get_extra_data() {
7849 return extra_data;
7850 }
7851};
7852
7853int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7854 const rgw_user& user_id,
7855 const string& client_id,
7856 req_info *info,
7857 const string& source_zone,
7858 rgw_obj& src_obj,
7859 RGWBucketInfo& src_bucket_info,
7860 real_time *src_mtime,
7861 uint64_t *psize,
7862 const real_time *mod_ptr,
7863 const real_time *unmod_ptr,
7864 bool high_precision_time,
7865 const char *if_match,
7866 const char *if_nomatch,
7867 map<string, bufferlist> *pattrs,
7868 string *version_id,
7869 string *ptag,
7870 string *petag)
7871{
7872 /* source is in a different zonegroup, copy from there */
7873
7874 RGWRESTStreamRWRequest *in_stream_req;
7875 string tag;
7876 map<string, bufferlist> src_attrs;
7877 append_rand_alpha(cct, tag, tag, 32);
7878 obj_time_weight set_mtime_weight;
7879 set_mtime_weight.high_precision = high_precision_time;
7880
7881 RGWRESTConn *conn;
7882 if (source_zone.empty()) {
7883 if (src_bucket_info.zonegroup.empty()) {
7884 /* source is in the master zonegroup */
7885 conn = rest_master_conn;
7886 } else {
7887 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7888 if (iter == zonegroup_conn_map.end()) {
7889 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7890 return -ENOENT;
7891 }
7892 conn = iter->second;
7893 }
7894 } else {
7895 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7896 if (iter == zone_conn_map.end()) {
7897 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7898 return -ENOENT;
7899 }
7900 conn = iter->second;
7901 }
7902
7903 RGWGetExtraDataCB cb;
7904 string etag;
7905 map<string, string> req_headers;
7906 real_time set_mtime;
7907
7908 const real_time *pmod = mod_ptr;
7909
7910 obj_time_weight dest_mtime_weight;
7911
181888fb
FG
7912 constexpr bool prepend_meta = true;
7913 constexpr bool get_op = true;
7914 constexpr bool rgwx_stat = true;
7915 constexpr bool sync_manifest = true;
7916 constexpr bool skip_decrypt = true;
7c673cae
FG
7917 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7918 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb
FG
7919 prepend_meta, get_op, rgwx_stat,
7920 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7c673cae
FG
7921 if (ret < 0) {
7922 return ret;
7923 }
7924
7925 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7926 if (ret < 0) {
7927 return ret;
7928 }
7929
7930 bufferlist& extra_data_bl = cb.get_extra_data();
7931 if (extra_data_bl.length()) {
7932 JSONParser jp;
7933 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7934 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7935 return -EIO;
7936 }
7937
7938 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7939
7940 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7941 }
7942
7943 if (src_mtime) {
7944 *src_mtime = set_mtime;
7945 }
7946
7947 if (petag) {
7948 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7949 if (iter != src_attrs.end()) {
7950 bufferlist& etagbl = iter->second;
7951 *petag = etagbl.to_str();
7952 }
7953 }
7954
7955 if (pattrs) {
7956 *pattrs = src_attrs;
7957 }
7958
7959 return 0;
7960}
7961
7962int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7963 const rgw_user& user_id,
7964 const string& client_id,
7965 const string& op_id,
7966 bool record_op_state,
7967 req_info *info,
7968 const string& source_zone,
7969 rgw_obj& dest_obj,
7970 rgw_obj& src_obj,
7971 RGWBucketInfo& dest_bucket_info,
7972 RGWBucketInfo& src_bucket_info,
7973 real_time *src_mtime,
7974 real_time *mtime,
7975 const real_time *mod_ptr,
7976 const real_time *unmod_ptr,
7977 bool high_precision_time,
7978 const char *if_match,
7979 const char *if_nomatch,
7980 AttrsMod attrs_mod,
7981 bool copy_if_newer,
7982 map<string, bufferlist>& attrs,
7983 RGWObjCategory category,
91327a77 7984 boost::optional<uint64_t> olh_epoch,
7c673cae
FG
7985 real_time delete_at,
7986 string *version_id,
7987 string *ptag,
7988 ceph::buffer::list *petag,
7c673cae 7989 void (*progress_cb)(off_t, void *),
31f18b77
FG
7990 void *progress_data,
7991 rgw_zone_set *zones_trace)
7c673cae
FG
7992{
7993 /* source is in a different zonegroup, copy from there */
7994
7995 RGWRESTStreamRWRequest *in_stream_req;
7996 string tag;
7997 int i;
7998 append_rand_alpha(cct, tag, tag, 32);
7999 obj_time_weight set_mtime_weight;
8000 set_mtime_weight.high_precision = high_precision_time;
8001
8002 RGWPutObjProcessor_Atomic processor(obj_ctx,
8003 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
8004 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8005 if (version_id && *version_id != "null") {
8006 processor.set_version_id(*version_id);
8007 }
91327a77
AA
8008 if (olh_epoch) {
8009 processor.set_olh_epoch(*olh_epoch);
8010 }
7c673cae
FG
8011 int ret = processor.prepare(this, NULL);
8012 if (ret < 0) {
8013 return ret;
8014 }
8015
8016 RGWRESTConn *conn;
8017 if (source_zone.empty()) {
8018 if (dest_bucket_info.zonegroup.empty()) {
8019 /* source is in the master zonegroup */
8020 conn = rest_master_conn;
8021 } else {
8022 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
8023 if (iter == zonegroup_conn_map.end()) {
8024 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
8025 return -ENOENT;
8026 }
8027 conn = iter->second;
8028 }
8029 } else {
8030 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
8031 if (iter == zone_conn_map.end()) {
8032 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
8033 return -ENOENT;
8034 }
8035 conn = iter->second;
8036 }
8037
8038 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
8039
8040 RGWOpStateSingleOp *opstate = NULL;
8041
8042 if (record_op_state) {
8043 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
8044
8045 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
8046 if (ret < 0) {
8047 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
8048 delete opstate;
8049 return ret;
8050 }
8051 }
8052
8053 boost::optional<RGWPutObj_Compress> compressor;
8054 CompressorRef plugin;
8055
8056 const auto& compression_type = zone_params.get_compression_type(
8057 dest_bucket_info.placement_rule);
8058 if (compression_type != "none") {
8059 plugin = Compressor::create(cct, compression_type);
8060 if (!plugin) {
8061 ldout(cct, 1) << "Cannot load plugin for compression type "
8062 << compression_type << dendl;
8063 }
8064 }
8065
8066 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
8067
8068 string etag;
8069 map<string, string> req_headers;
8070 real_time set_mtime;
8071
8072 RGWObjState *dest_state = NULL;
8073
8074 const real_time *pmod = mod_ptr;
8075
8076 obj_time_weight dest_mtime_weight;
8077
8078 if (copy_if_newer) {
8079 /* need to get mtime for destination */
8080 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
8081 if (ret < 0)
8082 goto set_err_state;
8083
8084 if (!real_clock::is_zero(dest_state->mtime)) {
8085 dest_mtime_weight.init(dest_state);
8086 pmod = &dest_mtime_weight.mtime;
8087 }
8088 }
8089
181888fb
FG
8090 static constexpr bool prepend_meta = true;
8091 static constexpr bool get_op = true;
8092 static constexpr bool rgwx_stat = false;
8093 static constexpr bool sync_manifest = true;
8094 static constexpr bool skip_decrypt = true;
7c673cae
FG
8095 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
8096 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb
FG
8097 prepend_meta, get_op, rgwx_stat,
8098 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7c673cae
FG
8099 if (ret < 0) {
8100 goto set_err_state;
8101 }
8102
8103 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
8104 if (ret < 0) {
8105 goto set_err_state;
8106 }
28e407b8
AA
8107 ret = cb.flush();
8108 if (ret < 0) {
8109 goto set_err_state;
8110 }
7c673cae
FG
8111 if (compressor && compressor->is_compressed()) {
8112 bufferlist tmp;
8113 RGWCompressionInfo cs_info;
8114 cs_info.compression_type = plugin->get_type_name();
8115 cs_info.orig_size = cb.get_data_len();
8116 cs_info.blocks = move(compressor->get_compression_blocks());
8117 ::encode(cs_info, tmp);
8118 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
8119 }
8120
8121 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
8122 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
8123 } else {
8124 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
8125 if (iter != cb.get_attrs().end()) {
8126 try {
8127 ::decode(delete_at, iter->second);
8128 } catch (buffer::error& err) {
8129 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
8130 }
8131 }
8132 }
8133
8134 if (src_mtime) {
8135 *src_mtime = set_mtime;
8136 }
8137
8138 if (petag) {
8139 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
8140 if (iter != cb.get_attrs().end()) {
8141 *petag = iter->second;
8142 }
8143 }
8144
8145 if (source_zone.empty()) {
8146 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
8147 } else {
8148 attrs = cb.get_attrs();
8149 }
8150
8151 if (copy_if_newer) {
8152 uint64_t pg_ver = 0;
8153 auto i = attrs.find(RGW_ATTR_PG_VER);
8154 if (i != attrs.end() && i->second.length() > 0) {
8155 bufferlist::iterator iter = i->second.begin();
8156 try {
8157 ::decode(pg_ver, iter);
8158 } catch (buffer::error& err) {
8159 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
8160 /* non critical error */
8161 }
8162 }
8163 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
8164 }
8165
8166#define MAX_COMPLETE_RETRY 100
8167 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
31f18b77 8168 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7c673cae
FG
8169 if (ret < 0) {
8170 goto set_err_state;
8171 }
8172 if (copy_if_newer && cb.is_canceled()) {
8173 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
8174 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
8175 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
8176 if (ret < 0) {
8177 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
8178 goto set_err_state;
8179 }
8180 dest_mtime_weight.init(dest_state);
8181 dest_mtime_weight.high_precision = high_precision_time;
8182 if (!dest_state->exists ||
8183 dest_mtime_weight < set_mtime_weight) {
8184 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8185 continue;
8186 } else {
8187 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8188 }
8189 }
8190 break;
8191 }
8192
8193 if (i == MAX_COMPLETE_RETRY) {
8194 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
8195 ret = -EIO;
8196 goto set_err_state;
8197 }
8198
8199 if (opstate) {
8200 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
8201 if (ret < 0) {
8202 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
8203 }
8204 delete opstate;
8205 }
8206
8207 return 0;
8208set_err_state:
8209 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
8210 // we may have already fetched during sync of OP_ADD, but were waiting
8211 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
8212 if (olh_epoch && *olh_epoch > 0) {
8213 constexpr bool log_data_change = true;
8214 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
8215 *olh_epoch, real_time(), false, zones_trace, log_data_change);
8216 } else {
8217 // we already have the latest copy
8218 ret = 0;
8219 }
7c673cae
FG
8220 }
8221 if (opstate) {
8222 RGWOpState::OpState state;
8223 if (ret < 0) {
8224 state = RGWOpState::OPSTATE_ERROR;
8225 } else {
8226 state = RGWOpState::OPSTATE_COMPLETE;
8227 }
8228 int r = opstate->set_state(state);
8229 if (r < 0) {
8230 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
8231 }
8232 delete opstate;
8233 }
8234 return ret;
8235}
8236
8237
8238int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
8239 map<string, bufferlist>& src_attrs,
8240 RGWRados::Object::Read& read_op,
8241 const rgw_user& user_id,
8242 rgw_obj& dest_obj,
8243 real_time *mtime)
8244{
8245 string etag;
8246
8247 RGWRESTStreamWriteRequest *out_stream_req;
8248
8249 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
8250 if (ret < 0) {
7c673cae
FG
8251 return ret;
8252 }
8253
8254 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
224ce89b
WB
8255 if (ret < 0) {
8256 delete out_stream_req;
7c673cae 8257 return ret;
224ce89b 8258 }
7c673cae
FG
8259
8260 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
8261 if (ret < 0)
8262 return ret;
8263
8264 return 0;
8265}
8266
8267/**
8268 * Copy an object.
8269 * dest_obj: the object to copy into
8270 * src_obj: the object to copy from
8271 * attrs: usage depends on attrs_mod parameter
8272 * attrs_mod: the modification mode of the attrs, may have the following values:
8273 * ATTRSMOD_NONE - the attributes of the source object will be
8274 * copied without modifications, attrs parameter is ignored;
8275 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
8276 * parameter, source object attributes are not copied;
8277 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
8278 * are overwritten by values contained in attrs parameter.
8279 * err: stores any errors resulting from the get of the original object
8280 * Returns: 0 on success, -ERR# otherwise.
8281 */
8282int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
8283 const rgw_user& user_id,
8284 const string& client_id,
8285 const string& op_id,
8286 req_info *info,
8287 const string& source_zone,
8288 rgw_obj& dest_obj,
8289 rgw_obj& src_obj,
8290 RGWBucketInfo& dest_bucket_info,
8291 RGWBucketInfo& src_bucket_info,
8292 real_time *src_mtime,
8293 real_time *mtime,
8294 const real_time *mod_ptr,
8295 const real_time *unmod_ptr,
8296 bool high_precision_time,
8297 const char *if_match,
8298 const char *if_nomatch,
8299 AttrsMod attrs_mod,
8300 bool copy_if_newer,
8301 map<string, bufferlist>& attrs,
8302 RGWObjCategory category,
8303 uint64_t olh_epoch,
8304 real_time delete_at,
8305 string *version_id,
8306 string *ptag,
8307 ceph::buffer::list *petag,
7c673cae
FG
8308 void (*progress_cb)(off_t, void *),
8309 void *progress_data)
8310{
8311 int ret;
8312 uint64_t obj_size;
8313 rgw_obj shadow_obj = dest_obj;
8314 string shadow_oid;
8315
8316 bool remote_src;
8317 bool remote_dest;
8318
8319 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
8320 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
8321
8322 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
8323 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
8324
8325 if (remote_src && remote_dest) {
8326 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
8327 return -EINVAL;
8328 }
8329
8330 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
8331
8332 if (remote_src || !source_zone.empty()) {
8333 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
8334 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
8335 unmod_ptr, high_precision_time,
8336 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
31f18b77 8337 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
7c673cae
FG
8338 }
8339
8340 map<string, bufferlist> src_attrs;
8341 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8342 RGWRados::Object::Read read_op(&src_op_target);
8343
8344 read_op.conds.mod_ptr = mod_ptr;
8345 read_op.conds.unmod_ptr = unmod_ptr;
8346 read_op.conds.high_precision_time = high_precision_time;
8347 read_op.conds.if_match = if_match;
8348 read_op.conds.if_nomatch = if_nomatch;
8349 read_op.params.attrs = &src_attrs;
8350 read_op.params.lastmod = src_mtime;
8351 read_op.params.obj_size = &obj_size;
7c673cae
FG
8352
8353 ret = read_op.prepare();
8354 if (ret < 0) {
8355 return ret;
8356 }
94b18763
FG
8357 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
8358 // Current implementation does not follow S3 spec and even
8359 // may result in data corruption silently when copying
8360 // multipart objects acorss pools. So reject COPY operations
8361 //on encrypted objects before it is fully functional.
8362 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
8363 << " has not been implemented." << dendl;
8364 return -ERR_NOT_IMPLEMENTED;
8365 }
7c673cae
FG
8366
8367 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8368 src_attrs.erase(RGW_ATTR_DELETE_AT);
8369
8370 set_copy_attrs(src_attrs, attrs, attrs_mod);
8371 attrs.erase(RGW_ATTR_ID_TAG);
8372 attrs.erase(RGW_ATTR_PG_VER);
8373 attrs.erase(RGW_ATTR_SOURCE_ZONE);
8374 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8375 if (cmp != src_attrs.end())
8376 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8377
8378 RGWObjManifest manifest;
8379 RGWObjState *astate = NULL;
8380
8381 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8382 if (ret < 0) {
8383 return ret;
8384 }
8385
8386 vector<rgw_raw_obj> ref_objs;
8387
8388 if (remote_dest) {
8389 /* dest is in a different zonegroup, copy it there */
8390 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8391 }
8392 uint64_t max_chunk_size;
8393
8394 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8395 if (ret < 0) {
8396 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8397 return ret;
8398 }
8399
8400 rgw_pool src_pool;
8401 rgw_pool dest_pool;
8402 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8403 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8404 return -EIO;
8405 }
8406 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8407 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8408 return -EIO;
8409 }
8410
8411
8412 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8413 bool copy_first = false;
8414 if (astate->has_manifest) {
8415 if (!astate->manifest.has_tail()) {
8416 copy_data = true;
8417 } else {
8418 uint64_t head_size = astate->manifest.get_head_size();
8419
8420 if (head_size > 0) {
8421 if (head_size > max_chunk_size) {
8422 copy_data = true;
8423 } else {
8424 copy_first = true;
8425 }
8426 }
8427 }
8428 }
8429
8430 if (petag) {
8431 const auto iter = attrs.find(RGW_ATTR_ETAG);
8432 if (iter != attrs.end()) {
8433 *petag = iter->second;
8434 }
8435 }
8436
8437 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8438 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8439 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
31f18b77 8440 version_id, ptag, petag);
7c673cae
FG
8441 }
8442
8443 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8444
8445 if (copy_first) { // we need to copy first chunk, not increase refcount
8446 ++miter;
8447 }
8448
8449 rgw_rados_ref ref;
8450 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8451 if (ret < 0) {
8452 return ret;
8453 }
8454
8455 bool versioned_dest = dest_bucket_info.versioning_enabled();
8456
8457 if (version_id && !version_id->empty()) {
8458 versioned_dest = true;
8459 dest_obj.key.set_instance(*version_id);
8460 } else if (versioned_dest) {
8461 gen_rand_obj_instance_name(&dest_obj);
8462 }
8463
8464 bufferlist first_chunk;
8465
8466 bool copy_itself = (dest_obj == src_obj);
8467 RGWObjManifest *pmanifest;
31f18b77 8468 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae
FG
8469
8470 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8471 RGWRados::Object::Write write_op(&dest_op_target);
8472
8473 string tag;
8474
8475 if (ptag) {
8476 tag = *ptag;
8477 }
8478
8479 if (tag.empty()) {
8480 append_rand_alpha(cct, tag, tag, 32);
8481 }
8482
8483 if (!copy_itself) {
181888fb 8484 attrs.erase(RGW_ATTR_TAIL_TAG);
7c673cae
FG
8485 manifest = astate->manifest;
8486 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8487 if (tail_placement.bucket.name.empty()) {
8488 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8489 }
3efd9988 8490 string ref_tag;
7c673cae
FG
8491 for (; miter != astate->manifest.obj_end(); ++miter) {
8492 ObjectWriteOperation op;
3efd9988
FG
8493 ref_tag = tag + '\0';
8494 cls_refcount_get(op, ref_tag, true);
7c673cae
FG
8495 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8496 ref.ioctx.locator_set_key(loc.loc);
8497
8498 ret = ref.ioctx.operate(loc.oid, &op);
8499 if (ret < 0) {
8500 goto done_ret;
8501 }
8502
8503 ref_objs.push_back(loc);
8504 }
8505
8506 pmanifest = &manifest;
8507 } else {
8508 pmanifest = &astate->manifest;
8509 /* don't send the object's tail for garbage collection */
8510 astate->keep_tail = true;
8511 }
8512
8513 if (copy_first) {
8514 ret = read_op.read(0, max_chunk_size, first_chunk);
8515 if (ret < 0) {
8516 goto done_ret;
8517 }
8518
8519 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8520 } else {
8521 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8522 }
8523
8524 write_op.meta.data = &first_chunk;
8525 write_op.meta.manifest = pmanifest;
8526 write_op.meta.ptag = &tag;
8527 write_op.meta.owner = dest_bucket_info.owner;
8528 write_op.meta.mtime = mtime;
8529 write_op.meta.flags = PUT_OBJ_CREATE;
8530 write_op.meta.category = category;
8531 write_op.meta.olh_epoch = olh_epoch;
8532 write_op.meta.delete_at = delete_at;
181888fb 8533 write_op.meta.modify_tail = !copy_itself;
7c673cae
FG
8534
8535 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8536 if (ret < 0) {
8537 goto done_ret;
8538 }
8539
8540 return 0;
8541
8542done_ret:
8543 if (!copy_itself) {
8544 vector<rgw_raw_obj>::iterator riter;
8545
7c673cae
FG
8546 /* rollback reference */
8547 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8548 ObjectWriteOperation op;
8549 cls_refcount_put(op, tag, true);
8550
8551 ref.ioctx.locator_set_key(riter->loc);
8552
8553 int r = ref.ioctx.operate(riter->oid, &op);
8554 if (r < 0) {
8555 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8556 }
8557 }
8558 }
8559 return ret;
8560}
8561
8562
8563int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8564 RGWBucketInfo& dest_bucket_info,
8565 RGWRados::Object::Read& read_op, off_t end,
8566 rgw_obj& dest_obj,
8567 rgw_obj& src_obj,
8568 uint64_t max_chunk_size,
8569 real_time *mtime,
8570 real_time set_mtime,
8571 map<string, bufferlist>& attrs,
8572 RGWObjCategory category,
8573 uint64_t olh_epoch,
8574 real_time delete_at,
8575 string *version_id,
8576 string *ptag,
31f18b77 8577 ceph::buffer::list *petag)
7c673cae
FG
8578{
8579 bufferlist first_chunk;
8580 RGWObjManifest manifest;
8581
8582 string tag;
8583 append_rand_alpha(cct, tag, tag, 32);
8584
8585 RGWPutObjProcessor_Atomic processor(obj_ctx,
b32b8144 8586 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7c673cae
FG
8587 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8588 if (version_id) {
8589 processor.set_version_id(*version_id);
8590 }
8591 processor.set_olh_epoch(olh_epoch);
8592 int ret = processor.prepare(this, NULL);
8593 if (ret < 0)
8594 return ret;
8595
8596 off_t ofs = 0;
8597
8598 do {
8599 bufferlist bl;
8600 ret = read_op.read(ofs, end, bl);
8601
8602 uint64_t read_len = ret;
8603 bool again;
8604
8605 do {
8606 void *handle;
8607 rgw_raw_obj obj;
8608
8609 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8610 if (ret < 0) {
8611 return ret;
8612 }
8613 ret = processor.throttle_data(handle, obj, read_len, false);
8614 if (ret < 0)
8615 return ret;
8616 } while (again);
8617
8618 ofs += read_len;
8619 } while (ofs <= end);
8620
8621 string etag;
8622 auto iter = attrs.find(RGW_ATTR_ETAG);
8623 if (iter != attrs.end()) {
8624 bufferlist& bl = iter->second;
8625 etag = string(bl.c_str(), bl.length());
8626 if (petag) {
8627 *petag = bl;
8628 }
8629 }
8630
8631 uint64_t accounted_size;
8632 {
8633 bool compressed{false};
8634 RGWCompressionInfo cs_info;
8635 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8636 if (ret < 0) {
8637 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8638 return ret;
8639 }
8640 // pass original size if compressed
8641 accounted_size = compressed ? cs_info.orig_size : ofs;
8642 }
8643
8644 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8645}
8646
8647bool RGWRados::is_meta_master()
8648{
31f18b77 8649 if (!get_zonegroup().is_master_zonegroup()) {
7c673cae
FG
8650 return false;
8651 }
8652
8653 return (get_zonegroup().master_zone == zone_public_config.id);
8654}
8655
8656/**
8657 * Check to see if the bucket metadata could be synced
8658 * bucket: the bucket to check
8659 * Returns false is the bucket is not synced
8660 */
8661bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8662{
8663
8664 /* no current period */
8665 if (current_period.get_id().empty()) {
8666 return false;
8667 }
8668
8669 /* zonegroup is not master zonegroup */
31f18b77 8670 if (!get_zonegroup().is_master_zonegroup()) {
7c673cae
FG
8671 return false;
8672 }
8673
8674 /* single zonegroup and a single zone */
224ce89b 8675 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
7c673cae
FG
8676 return false;
8677 }
8678
8679 /* zone is not master */
8680 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8681 return false;
8682 }
8683
8684 return true;
8685}
8686
8687int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8688{
1adf2230 8689 std::vector<rgw_bucket_dir_entry> ent_list;
7c673cae
FG
8690 rgw_obj_index_key marker;
8691 string prefix;
8692 bool is_truncated;
8693
8694 do {
1adf2230
AA
8695 constexpr uint NUM_ENTRIES = 1000u;
8696 int r = cls_bucket_list_unordered(bucket_info,
8697 RGW_NO_SHARD,
8698 marker,
8699 prefix,
8700 NUM_ENTRIES,
8701 true,
8702 ent_list,
8703 &is_truncated,
8704 &marker);
7c673cae
FG
8705 if (r < 0)
8706 return r;
8707
8708 string ns;
1adf2230 8709 for (auto const& dirent : ent_list) {
7c673cae
FG
8710 rgw_obj_key obj;
8711
1adf2230 8712 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
7c673cae
FG
8713 return -ENOTEMPTY;
8714 }
8715 } while (is_truncated);
1adf2230 8716
7c673cae
FG
8717 return 0;
8718}
8719
8720/**
8721 * Delete a bucket.
8722 * bucket: the name of the bucket to delete
8723 * Returns 0 on success, -ERR# otherwise.
8724 */
8725int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8726{
8727 const rgw_bucket& bucket = bucket_info.bucket;
8728 librados::IoCtx index_ctx;
8729 map<int, string> bucket_objs;
8730 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8731 if (r < 0)
8732 return r;
8733
8734 if (check_empty) {
8735 r = check_bucket_empty(bucket_info);
8736 if (r < 0) {
8737 return r;
8738 }
8739 }
8740
8741 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8742 if (r < 0)
8743 return r;
8744
8745 /* if the bucket is not synced we can remove the meta file */
8746 if (!is_syncing_bucket_meta(bucket)) {
8747 RGWObjVersionTracker objv_tracker;
f64942e4 8748 r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker);
7c673cae
FG
8749 if (r < 0) {
8750 return r;
8751 }
f64942e4
AA
8752
8753 /* remove bucket index objects asynchronously by best effort */
8754 (void) CLSRGWIssueBucketIndexClean(index_ctx,
8755 bucket_objs,
8756 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 8757 }
f64942e4 8758
7c673cae
FG
8759 return 0;
8760}
8761
8762int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8763{
8764 RGWBucketInfo info;
8765 map<string, bufferlist> attrs;
8766 RGWObjectCtx obj_ctx(this);
31f18b77
FG
8767 int r;
8768 if (bucket.bucket_id.empty()) {
8769 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8770 } else {
8771 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8772 }
7c673cae
FG
8773 if (r < 0) {
8774 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8775 return r;
8776 }
8777
8778 info.owner = owner.get_id();
8779
8780 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8781 if (r < 0) {
8782 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8783 return r;
8784 }
8785
8786 return 0;
8787}
8788
8789
8790int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8791{
8792 int ret = 0;
8793
8794 vector<rgw_bucket>::iterator iter;
8795
8796 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8797 rgw_bucket& bucket = *iter;
8798 if (enabled)
8799 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8800 else
8801 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8802
8803 RGWBucketInfo info;
8804 map<string, bufferlist> attrs;
8805 RGWObjectCtx obj_ctx(this);
8806 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8807 if (r < 0) {
8808 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8809 ret = r;
8810 continue;
8811 }
8812 if (enabled) {
8813 info.flags &= ~BUCKET_SUSPENDED;
8814 } else {
8815 info.flags |= BUCKET_SUSPENDED;
8816 }
8817
8818 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8819 if (r < 0) {
8820 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8821 ret = r;
8822 continue;
8823 }
8824 }
8825 return ret;
8826}
8827
8828int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8829{
8830 RGWBucketInfo bucket_info;
8831 RGWObjectCtx obj_ctx(this);
8832 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8833 if (ret < 0) {
8834 return ret;
8835 }
8836
8837 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8838 return 0;
8839}
8840
8841int RGWRados::Object::complete_atomic_modification()
8842{
8843 if (!state->has_manifest || state->keep_tail)
8844 return 0;
8845
8846 cls_rgw_obj_chain chain;
8847 store->update_gc_chain(obj, state->manifest, &chain);
8848
8849 if (chain.empty()) {
8850 return 0;
8851 }
8852
181888fb 8853 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
7c673cae
FG
8854 return store->gc->send_chain(chain, tag, false); // do it async
8855}
8856
8857void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8858{
8859 RGWObjManifest::obj_iterator iter;
8860 rgw_raw_obj raw_head;
8861 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8862 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8863 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8864 if (mobj == raw_head)
8865 continue;
8866 cls_rgw_obj_key key(mobj.oid);
8867 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8868 }
8869}
8870
8871int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8872{
8873 return gc->send_chain(chain, tag, sync);
8874}
8875
1adf2230
AA
8876int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8877 librados::IoCtx& index_ctx,
8878 string& bucket_oid)
7c673cae
FG
8879{
8880 const rgw_bucket& bucket = bucket_info.bucket;
8881 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8882 if (r < 0)
8883 return r;
8884
8885 if (bucket.bucket_id.empty()) {
8886 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8887 return -EIO;
8888 }
8889
8890 bucket_oid = dir_oid_prefix;
8891 bucket_oid.append(bucket.bucket_id);
8892
8893 return 0;
8894}
8895
1adf2230
AA
8896int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
8897 librados::IoCtx& index_ctx,
8898 string& bucket_oid_base) {
7c673cae
FG
8899 const rgw_bucket& bucket = bucket_info.bucket;
8900 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8901 if (r < 0)
8902 return r;
8903
8904 if (bucket.bucket_id.empty()) {
8905 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8906 return -EIO;
8907 }
8908
8909 bucket_oid_base = dir_oid_prefix;
8910 bucket_oid_base.append(bucket.bucket_id);
8911
8912 return 0;
8913
8914}
8915
1adf2230
AA
8916int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8917 librados::IoCtx& index_ctx,
8918 map<int, string>& bucket_objs,
8919 int shard_id,
8920 map<int, string> *bucket_instance_ids) {
7c673cae
FG
8921 string bucket_oid_base;
8922 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8923 if (ret < 0) {
8924 return ret;
8925 }
8926
8927 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8928 if (bucket_instance_ids) {
8929 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8930 }
8931 return 0;
8932}
8933
8934template<typename T>
8935int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8936 map<int, string>& oids, map<int, T>& bucket_objs,
8937 int shard_id, map<int, string> *bucket_instance_ids)
8938{
8939 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8940 if (ret < 0)
8941 return ret;
8942
8943 map<int, string>::const_iterator iter = oids.begin();
8944 for (; iter != oids.end(); ++iter) {
8945 bucket_objs[iter->first] = T();
8946 }
8947 return 0;
8948}
8949
8950int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8951 const string& obj_key, string *bucket_obj, int *shard_id)
8952{
8953 string bucket_oid_base;
8954 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8955 if (ret < 0)
8956 return ret;
8957
8958 RGWObjectCtx obj_ctx(this);
8959
8960 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8961 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8962 if (ret < 0) {
8963 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8964 return ret;
8965 }
8966 return 0;
8967}
8968
8969int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8970 int shard_id, string *bucket_obj)
8971{
8972 string bucket_oid_base;
8973 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8974 if (ret < 0)
8975 return ret;
8976
8977 RGWObjectCtx obj_ctx(this);
8978
8979 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8980 shard_id, bucket_obj);
8981 return 0;
8982}
8983
8984static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8985 map<RGWObjCategory, RGWStorageStats>& stats)
8986{
8987 for (const auto& pair : header.stats) {
8988 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8989 const rgw_bucket_category_stats& header_stats = pair.second;
8990
8991 RGWStorageStats& s = stats[category];
8992
8993 s.category = category;
8994 s.size += header_stats.total_size;
8995 s.size_rounded += header_stats.total_size_rounded;
8996 s.size_utilized += header_stats.actual_size;
8997 s.num_objects += header_stats.num_entries;
8998 }
8999}
9000
9001int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
9002 map<RGWObjCategory, RGWStorageStats> *existing_stats,
9003 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
9004{
9005 librados::IoCtx index_ctx;
9006 // key - bucket index object id
9007 // value - bucket index check OP returned result with the given bucket index object (shard)
9008 map<int, string> oids;
9009 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
31f18b77 9010
7c673cae 9011 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
31f18b77
FG
9012 if (ret < 0) {
9013 return ret;
9014 }
7c673cae
FG
9015
9016 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77
FG
9017 if (ret < 0) {
9018 return ret;
9019 }
7c673cae
FG
9020
9021 // Aggregate results (from different shards if there is any)
9022 map<int, struct rgw_cls_check_index_ret>::iterator iter;
9023 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
9024 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
9025 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
9026 }
9027
9028 return 0;
9029}
9030
9031int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
9032{
9033 librados::IoCtx index_ctx;
9034 map<int, string> bucket_objs;
31f18b77 9035
7c673cae 9036 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
31f18b77 9037 if (r < 0) {
7c673cae 9038 return r;
31f18b77 9039 }
7c673cae
FG
9040
9041 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
9042}
9043
f64942e4 9044int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77
FG
9045{
9046 librados::IoCtx index_ctx;
9047 map<int, string> bucket_objs;
9048
9049 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
9050 if (r < 0) {
9051 return r;
9052 }
9053
9054 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
9055}
7c673cae
FG
9056
9057int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
9058{
9059 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9060 std::string oid, key;
9061 get_obj_bucket_and_oid_loc(obj, oid, key);
9062 if (!rctx)
9063 return 0;
9064
9065 RGWObjState *state = NULL;
9066
9067 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
9068 if (r < 0)
9069 return r;
9070
9071 if (!state->is_atomic) {
9072 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
9073 return -EINVAL;
9074 }
9075
181888fb
FG
9076 string tag;
9077
9078 if (state->tail_tag.length() > 0) {
9079 tag = state->tail_tag.c_str();
9080 } else if (state->obj_tag.length() > 0) {
9081 tag = state->obj_tag.c_str();
9082 } else {
7c673cae
FG
9083 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
9084 return -EINVAL;
9085 }
9086
7c673cae
FG
9087 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
9088
9089 return gc->defer_chain(tag, false);
9090}
9091
9092void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
9093{
9094 list<string> prefixes;
9095 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
9096 cls_rgw_remove_obj(op, prefixes);
9097}
9098
9099void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
9100{
9101 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
9102}
9103
9104void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
9105{
9106 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
9107}
9108
9109
9110/**
9111 * Delete an object.
9112 * bucket: name of the bucket storing the object
9113 * obj: name of the object to delete
9114 * Returns: 0 on success, -ERR# otherwise.
9115 */
9116int RGWRados::Object::Delete::delete_obj()
9117{
9118 RGWRados *store = target->get_store();
9119 rgw_obj& src_obj = target->get_obj();
9120 const string& instance = src_obj.key.instance;
9121 rgw_obj obj = src_obj;
9122
9123 if (instance == "null") {
9124 obj.key.instance.clear();
9125 }
9126
9127 bool explicit_marker_version = (!params.marker_version_id.empty());
9128
9129 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
9130 if (instance.empty() || explicit_marker_version) {
9131 rgw_obj marker = obj;
9132
9133 if (!params.marker_version_id.empty()) {
9134 if (params.marker_version_id != "null") {
9135 marker.key.set_instance(params.marker_version_id);
9136 }
9137 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
9138 store->gen_rand_obj_instance_name(&marker);
9139 }
9140
9141 result.version_id = marker.key.instance;
91327a77
AA
9142 if (result.version_id.empty())
9143 result.version_id = "null";
7c673cae
FG
9144 result.delete_marker = true;
9145
9146 struct rgw_bucket_dir_entry_meta meta;
9147
9148 meta.owner = params.obj_owner.get_id().to_str();
9149 meta.owner_display_name = params.obj_owner.get_display_name();
9150
9151 if (real_clock::is_zero(params.mtime)) {
9152 meta.mtime = real_clock::now();
9153 } else {
9154 meta.mtime = params.mtime;
9155 }
9156
31f18b77 9157 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
7c673cae
FG
9158 if (r < 0) {
9159 return r;
9160 }
9161 } else {
9162 rgw_bucket_dir_entry dirent;
9163
9164 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
9165 if (r < 0) {
9166 return r;
9167 }
9168 result.delete_marker = dirent.is_delete_marker();
31f18b77 9169 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
7c673cae
FG
9170 if (r < 0) {
9171 return r;
9172 }
9173 result.version_id = instance;
9174 }
9175
9176 BucketShard *bs;
9177 int r = target->get_bucket_shard(&bs);
9178 if (r < 0) {
9179 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
9180 return r;
9181 }
9182
c07f9fc5
FG
9183 if (target->bucket_info.datasync_flag_enabled()) {
9184 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9185 if (r < 0) {
9186 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9187 return r;
9188 }
7c673cae
FG
9189 }
9190
9191 return 0;
9192 }
9193
9194 rgw_rados_ref ref;
9195 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
9196 if (r < 0) {
9197 return r;
9198 }
9199
9200 RGWObjState *state;
9201 r = target->get_state(&state, false);
9202 if (r < 0)
9203 return r;
9204
9205 ObjectWriteOperation op;
9206
9207 if (!real_clock::is_zero(params.unmod_since)) {
9208 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
9209 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
9210 if (!params.high_precision_time) {
9211 ctime.tv_nsec = 0;
9212 unmod.tv_nsec = 0;
9213 }
9214
9215 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
9216 if (ctime > unmod) {
9217 return -ERR_PRECONDITION_FAILED;
9218 }
9219
9220 /* only delete object if mtime is less than or equal to params.unmod_since */
9221 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
9222 }
9223 uint64_t obj_size = state->size;
9224
9225 if (!real_clock::is_zero(params.expiration_time)) {
9226 bufferlist bl;
9227 real_time delete_at;
9228
9229 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
9230 try {
9231 bufferlist::iterator iter = bl.begin();
9232 ::decode(delete_at, iter);
9233 } catch (buffer::error& err) {
9234 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
9235 return -EIO;
9236 }
9237
9238 if (params.expiration_time != delete_at) {
9239 return -ERR_PRECONDITION_FAILED;
9240 }
9241 } else {
9242 return -ERR_PRECONDITION_FAILED;
9243 }
9244 }
9245
9246 if (!state->exists) {
9247 target->invalidate_state();
9248 return -ENOENT;
9249 }
9250
181888fb 9251 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
7c673cae
FG
9252 if (r < 0)
9253 return r;
9254
9255 RGWBucketInfo& bucket_info = target->get_bucket_info();
9256
9257 RGWRados::Bucket bop(store, bucket_info);
9258 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
9259
9260 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
9261 index_op.set_bilog_flags(params.bilog_flags);
9262
7c673cae
FG
9263 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
9264 if (r < 0)
9265 return r;
9266
9267 store->remove_rgw_head_obj(op);
9268 r = ref.ioctx.operate(ref.oid, &op);
94b18763
FG
9269
9270 /* raced with another operation, object state is indeterminate */
9271 const bool need_invalidate = (r == -ECANCELED);
7c673cae
FG
9272
9273 int64_t poolid = ref.ioctx.get_id();
9274 if (r >= 0) {
9275 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
9276 if (obj_tombstone_cache) {
9277 tombstone_entry entry{*state};
9278 obj_tombstone_cache->add(obj, entry);
9279 }
9280 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 9281
7c673cae
FG
9282 int ret = target->complete_atomic_modification();
9283 if (ret < 0) {
9284 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
9285 }
9286 /* other than that, no need to propagate error */
224ce89b
WB
9287 } else {
9288 int ret = index_op.cancel();
9289 if (ret < 0) {
9290 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
9291 }
7c673cae
FG
9292 }
9293
9294 if (need_invalidate) {
9295 target->invalidate_state();
9296 }
9297
9298 if (r < 0)
9299 return r;
9300
9301 /* update quota cache */
9302 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
9303
9304 return 0;
9305}
9306
9307int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
9308 const RGWBucketInfo& bucket_info,
9309 const rgw_obj& obj,
9310 int versioning_status,
9311 uint16_t bilog_flags,
31f18b77
FG
9312 const real_time& expiration_time,
9313 rgw_zone_set *zones_trace)
7c673cae
FG
9314{
9315 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
9316 RGWRados::Object::Delete del_op(&del_target);
9317
9318 del_op.params.bucket_owner = bucket_info.owner;
9319 del_op.params.versioning_status = versioning_status;
9320 del_op.params.bilog_flags = bilog_flags;
9321 del_op.params.expiration_time = expiration_time;
31f18b77 9322 del_op.params.zones_trace = zones_trace;
7c673cae
FG
9323
9324 return del_op.delete_obj();
9325}
9326
9327int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
9328{
9329 rgw_rados_ref ref;
224ce89b 9330 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9331 if (r < 0) {
9332 return r;
9333 }
9334
9335 ObjectWriteOperation op;
9336
9337 op.remove();
9338 r = ref.ioctx.operate(ref.oid, &op);
9339 if (r < 0)
9340 return r;
9341
9342 return 0;
9343}
9344
9345int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
9346{
9347 if (obj.empty()) {
9348 ldout(cct, 1) << "delete_system_obj got empty object name "
9349 << obj << ", returning EINVAL" << dendl;
9350 return -EINVAL;
9351 }
9352 rgw_rados_ref ref;
224ce89b 9353 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9354 if (r < 0) {
9355 return r;
9356 }
9357
9358 ObjectWriteOperation op;
9359
9360 if (objv_tracker) {
9361 objv_tracker->prepare_op_for_write(&op);
9362 }
9363
9364 op.remove();
9365 r = ref.ioctx.operate(ref.oid, &op);
9366 if (r < 0)
9367 return r;
9368
9369 return 0;
9370}
9371
9372int RGWRados::delete_obj_index(const rgw_obj& obj)
9373{
9374 std::string oid, key;
9375 get_obj_bucket_and_oid_loc(obj, oid, key);
9376
9377 RGWObjectCtx obj_ctx(this);
9378
9379 RGWBucketInfo bucket_info;
9380 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9381 if (ret < 0) {
9382 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9383 return ret;
9384 }
9385
9386 RGWRados::Bucket bop(this, bucket_info);
9387 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9388
9389 real_time removed_mtime;
9390 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9391
9392 return r;
9393}
9394
9395static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9396{
9397 string tag;
9398
9399 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9400 if (mi != manifest.obj_end()) {
9401 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9402 ++mi;
9403 tag = mi.get_location().get_raw_obj(store).oid;
9404 tag.append("_");
9405 }
9406
9407 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9408 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9409 MD5 hash;
9410 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9411
9412 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9413 if (iter != attrset.end()) {
9414 bufferlist& bl = iter->second;
9415 hash.Update((const byte *)bl.c_str(), bl.length());
9416 }
9417
9418 hash.Final(md5);
9419 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9420 tag.append(md5_str);
9421
9422 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9423
9424 tag_bl.append(tag.c_str(), tag.size() + 1);
9425}
9426
9427static bool is_olh(map<string, bufferlist>& attrs)
9428{
9429 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9430 return (iter != attrs.end());
9431}
9432
9433static bool has_olh_tag(map<string, bufferlist>& attrs)
9434{
9435 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9436 return (iter != attrs.end());
9437}
9438
9439int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9440 RGWObjState *olh_state, RGWObjState **target_state)
9441{
9442 assert(olh_state->is_olh);
9443
9444 rgw_obj target;
9445 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9446 if (r < 0) {
9447 return r;
9448 }
9449 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9450 if (r < 0) {
9451 return r;
9452 }
9453
9454 return 0;
9455}
9456
9457int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9458{
9459 if (obj.empty()) {
9460 return -EINVAL;
9461 }
9462
9463 RGWRawObjState *s = rctx->raw.get_state(obj);
9464 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9465 *state = s;
9466 if (s->has_attrs) {
9467 return 0;
9468 }
9469
9470 s->obj = obj;
9471
9472 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9473 if (r == -ENOENT) {
9474 s->exists = false;
9475 s->has_attrs = true;
9476 s->mtime = real_time();
9477 return 0;
9478 }
9479 if (r < 0)
9480 return r;
9481
9482 s->exists = true;
9483 s->has_attrs = true;
9484 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9485
9486 if (s->obj_tag.length())
31f18b77
FG
9487 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9488 << s->obj_tag.c_str() << dendl;
7c673cae
FG
9489 else
9490 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9491
9492 return 0;
9493}
9494
9495int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9496{
9497 int ret;
9498
9499 do {
9500 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9501 } while (ret == -EAGAIN);
9502
9503 return ret;
9504}
9505
9506int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9507 RGWObjState **state, bool follow_olh, bool assume_noent)
9508{
9509 if (obj.empty()) {
9510 return -EINVAL;
9511 }
9512
9513 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9514
9515 RGWObjState *s = rctx->obj.get_state(obj);
9516 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9517 *state = s;
9518 if (s->has_attrs) {
9519 if (s->is_olh && need_follow_olh) {
9520 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9521 }
9522 return 0;
9523 }
9524
9525 s->obj = obj;
9526
9527 rgw_raw_obj raw_obj;
9528 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9529
9530 int r = -ENOENT;
9531
9532 if (!assume_noent) {
9533 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9534 }
9535
9536 if (r == -ENOENT) {
9537 s->exists = false;
9538 s->has_attrs = true;
9539 tombstone_entry entry;
9540 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9541 s->mtime = entry.mtime;
9542 s->zone_short_id = entry.zone_short_id;
9543 s->pg_ver = entry.pg_ver;
9544 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9545 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9546 } else {
9547 s->mtime = real_time();
9548 }
9549 return 0;
9550 }
9551 if (r < 0)
9552 return r;
9553
9554 s->exists = true;
9555 s->has_attrs = true;
9556 s->accounted_size = s->size;
9557
9558 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
9559 const bool compressed = (iter != s->attrset.end());
9560 if (compressed) {
7c673cae
FG
9561 // use uncompressed size for accounted_size
9562 try {
9563 RGWCompressionInfo info;
9564 auto p = iter->second.begin();
9565 ::decode(info, p);
31f18b77 9566 s->accounted_size = info.orig_size;
7c673cae
FG
9567 } catch (buffer::error&) {
9568 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9569 return -EIO;
9570 }
9571 }
9572
9573 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9574 if (iter != s->attrset.end()) {
9575 bufferlist bl = iter->second;
9576 bufferlist::iterator it = bl.begin();
9577 it.copy(bl.length(), s->shadow_obj);
9578 s->shadow_obj[bl.length()] = '\0';
9579 }
9580 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
9581 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9582 if (ttiter != s->attrset.end()) {
9583 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9584 }
7c673cae
FG
9585
9586 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9587 if (manifest_bl.length()) {
9588 bufferlist::iterator miter = manifest_bl.begin();
9589 try {
9590 ::decode(s->manifest, miter);
9591 s->has_manifest = true;
9592 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9593 broken due to old bugs */
9594 s->size = s->manifest.get_obj_size();
31f18b77
FG
9595 if (!compressed)
9596 s->accounted_size = s->size;
7c673cae
FG
9597 } catch (buffer::error& err) {
9598 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9599 return -EIO;
9600 }
9601 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9602 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9603 RGWObjManifest::obj_iterator mi;
9604 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9605 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9606 }
9607 }
9608
9609 if (!s->obj_tag.length()) {
9610 /*
9611 * Uh oh, something's wrong, object with manifest should have tag. Let's
9612 * create one out of the manifest, would be unique
9613 */
9614 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9615 s->fake_tag = true;
9616 }
9617 }
9618 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9619 if (aiter != s->attrset.end()) {
9620 bufferlist& pg_ver_bl = aiter->second;
9621 if (pg_ver_bl.length()) {
9622 bufferlist::iterator pgbl = pg_ver_bl.begin();
9623 try {
9624 ::decode(s->pg_ver, pgbl);
9625 } catch (buffer::error& err) {
9626 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9627 }
9628 }
9629 }
9630 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9631 if (aiter != s->attrset.end()) {
9632 bufferlist& zone_short_id_bl = aiter->second;
9633 if (zone_short_id_bl.length()) {
9634 bufferlist::iterator zbl = zone_short_id_bl.begin();
9635 try {
9636 ::decode(s->zone_short_id, zbl);
9637 } catch (buffer::error& err) {
9638 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9639 }
9640 }
9641 }
9642 if (s->obj_tag.length())
31f18b77 9643 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
7c673cae
FG
9644 else
9645 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9646
9647 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9648 * it exist, and not only if is_olh() returns true
9649 */
9650 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9651 if (iter != s->attrset.end()) {
9652 s->olh_tag = iter->second;
9653 }
9654
9655 if (is_olh(s->attrset)) {
9656 s->is_olh = true;
9657
9658 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9659
9660 if (need_follow_olh) {
9661 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9662 }
9663 }
9664
9665 return 0;
9666}
9667
9668int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9669 bool follow_olh, bool assume_noent)
9670{
9671 int ret;
9672
9673 do {
9674 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9675 } while (ret == -EAGAIN);
9676
9677 return ret;
9678}
9679
9680int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9681{
9682 RGWObjState *astate;
9683 int r = get_state(&astate, true);
9684 if (r < 0) {
9685 return r;
9686 }
9687
9688 *pmanifest = &astate->manifest;
9689
9690 return 0;
9691}
9692
9693int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9694{
9695 RGWObjState *state;
9696 int r = source->get_state(&state, true);
9697 if (r < 0)
9698 return r;
9699 if (!state->exists)
9700 return -ENOENT;
9701 if (!state->get_attr(name, dest))
9702 return -ENODATA;
9703
9704 return 0;
9705}
9706
9707
9708int RGWRados::Object::Stat::stat_async()
9709{
9710 RGWObjectCtx& ctx = source->get_ctx();
9711 rgw_obj& obj = source->get_obj();
9712 RGWRados *store = source->get_store();
9713
9714 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9715 result.obj = obj;
9716 if (s->has_attrs) {
9717 state.ret = 0;
9718 result.size = s->size;
9719 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9720 result.attrs = s->attrset;
9721 result.has_manifest = s->has_manifest;
9722 result.manifest = s->manifest;
9723 return 0;
9724 }
9725
9726 string oid;
9727 string loc;
9728 get_obj_bucket_and_oid_loc(obj, oid, loc);
9729
9730 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9731 if (r < 0) {
9732 return r;
9733 }
9734
9735 librados::ObjectReadOperation op;
9736 op.stat2(&result.size, &result.mtime, NULL);
9737 op.getxattrs(&result.attrs, NULL);
9738 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9739 state.io_ctx.locator_set_key(loc);
9740 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9741 if (r < 0) {
9742 ldout(store->ctx(), 5) << __func__
9743 << ": ERROR: aio_operate() returned ret=" << r
9744 << dendl;
9745 return r;
9746 }
9747
9748 return 0;
9749}
9750
9751
9752int RGWRados::Object::Stat::wait()
9753{
9754 if (!state.completion) {
9755 return state.ret;
9756 }
9757
9758 state.completion->wait_for_safe();
9759 state.ret = state.completion->get_return_value();
9760 state.completion->release();
9761
9762 if (state.ret != 0) {
9763 return state.ret;
9764 }
9765
9766 return finish();
9767}
9768
9769int RGWRados::Object::Stat::finish()
9770{
9771 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9772 if (iter != result.attrs.end()) {
9773 bufferlist& bl = iter->second;
9774 bufferlist::iterator biter = bl.begin();
9775 try {
9776 ::decode(result.manifest, biter);
9777 } catch (buffer::error& err) {
9778 RGWRados *store = source->get_store();
9779 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9780 return -EIO;
9781 }
9782 result.has_manifest = true;
9783 }
9784
9785 return 0;
9786}
9787
9788/**
31f18b77
FG
9789 * Get an attribute for a system object.
9790 * obj: the object to get attr
7c673cae
FG
9791 * name: name of the attr to retrieve
9792 * dest: bufferlist to store the result in
9793 * Returns: 0 on success, -ERR# otherwise.
9794 */
9795int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9796{
9797 rgw_rados_ref ref;
224ce89b 9798 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
9799 if (r < 0) {
9800 return r;
9801 }
9802
9803 ObjectReadOperation op;
9804
9805 int rval;
9806 op.getxattr(name, &dest, &rval);
9807
9808 r = ref.ioctx.operate(ref.oid, &op, NULL);
9809 if (r < 0)
9810 return r;
9811
9812 return 0;
9813}
9814
9815int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9816 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9817 ObjectOperation& op, RGWObjState **pstate)
9818{
9819 if (!rctx)
9820 return 0;
9821
9822 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9823 if (r < 0)
9824 return r;
9825
9826 RGWObjState *state = *pstate;
9827
9828 if (!state->is_atomic) {
9829 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9830 return 0;
9831 }
9832
9833 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9834 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9835 } else {
9836 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9837 }
9838 return 0;
9839}
9840
9841int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9842{
9843 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9844}
9845
9846void RGWRados::Object::invalidate_state()
9847{
9848 ctx.obj.invalidate(obj);
9849}
9850
9851void RGWRados::SystemObject::invalidate_state()
9852{
9853 ctx.raw.invalidate(obj);
9854}
9855
9856int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb
FG
9857 const char *if_match, const char *if_nomatch, bool removal_op,
9858 bool modify_tail)
7c673cae
FG
9859{
9860 int r = get_state(&state, false);
9861 if (r < 0)
9862 return r;
9863
9864 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9865 if_match != NULL || if_nomatch != NULL) &&
9866 (!state->fake_tag);
9867
9868 if (!state->is_atomic) {
9869 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9870
9871 if (reset_obj) {
9872 op.create(false);
9873 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9874 }
9875
9876 return 0;
9877 }
9878
9879 if (need_guard) {
9880 /* first verify that the object wasn't replaced under */
9881 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9882 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9883 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9884 }
9885
9886 if (if_match) {
9887 if (strcmp(if_match, "*") == 0) {
9888 // test the object is existing
9889 if (!state->exists) {
9890 return -ERR_PRECONDITION_FAILED;
9891 }
9892 } else {
9893 bufferlist bl;
9894 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9895 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9896 return -ERR_PRECONDITION_FAILED;
9897 }
9898 }
9899 }
9900
9901 if (if_nomatch) {
9902 if (strcmp(if_nomatch, "*") == 0) {
9903 // test the object is NOT existing
9904 if (state->exists) {
9905 return -ERR_PRECONDITION_FAILED;
9906 }
9907 } else {
9908 bufferlist bl;
9909 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9910 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9911 return -ERR_PRECONDITION_FAILED;
9912 }
9913 }
9914 }
9915 }
9916
9917 if (reset_obj) {
9918 if (state->exists) {
9919 op.create(false);
9920 store->remove_rgw_head_obj(op);
9921 } else {
9922 op.create(true);
9923 }
9924 }
9925
9926 if (removal_op) {
9927 /* the object is being removed, no need to update its tag */
9928 return 0;
9929 }
9930
9931 if (ptag) {
9932 state->write_tag = *ptag;
9933 } else {
9934 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9935 }
9936 bufferlist bl;
9937 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9938
9939 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9940
9941 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
9942 if (modify_tail) {
9943 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9944 }
7c673cae
FG
9945
9946 return 0;
9947}
9948
9949int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9950 RGWObjVersionTracker *objv_tracker)
9951{
9952 map<string, bufferlist> attrs;
9953 attrs[name] = bl;
9954 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9955}
9956
9957int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9958 map<string, bufferlist>& attrs,
9959 map<string, bufferlist>* rmattrs,
9960 RGWObjVersionTracker *objv_tracker)
9961{
9962 rgw_rados_ref ref;
224ce89b 9963 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
9964 if (r < 0) {
9965 return r;
9966 }
9967 ObjectWriteOperation op;
9968
9969 if (objv_tracker) {
9970 objv_tracker->prepare_op_for_write(&op);
9971 }
9972
9973 map<string, bufferlist>::iterator iter;
9974 if (rmattrs) {
9975 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9976 const string& name = iter->first;
9977 op.rmxattr(name.c_str());
9978 }
9979 }
9980
9981 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9982 const string& name = iter->first;
9983 bufferlist& bl = iter->second;
9984
9985 if (!bl.length())
9986 continue;
9987
9988 op.setxattr(name.c_str(), bl);
9989 }
9990
9991 if (!op.size())
9992 return 0;
9993
9994 bufferlist bl;
9995
9996 r = ref.ioctx.operate(ref.oid, &op);
9997 if (r < 0)
9998 return r;
9999
10000 return 0;
10001}
10002
10003/**
10004 * Set an attr on an object.
10005 * bucket: name of the bucket holding the object
10006 * obj: name of the object to set the attr on
10007 * name: the attr to set
10008 * bl: the contents of the attr
10009 * Returns: 0 on success, -ERR# otherwise.
10010 */
10011int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
10012{
10013 map<string, bufferlist> attrs;
10014 attrs[name] = bl;
10015 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
10016}
10017
10018int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
10019 map<string, bufferlist>& attrs,
10020 map<string, bufferlist>* rmattrs)
10021{
10022 rgw_rados_ref ref;
10023 int r = get_obj_head_ref(bucket_info, obj, &ref);
10024 if (r < 0) {
10025 return r;
10026 }
10027 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10028
10029 ObjectWriteOperation op;
10030 RGWObjState *state = NULL;
10031
10032 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
10033 if (r < 0)
10034 return r;
10035
10036 map<string, bufferlist>::iterator iter;
10037 if (rmattrs) {
10038 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
10039 const string& name = iter->first;
10040 op.rmxattr(name.c_str());
10041 }
10042 }
10043
10044 const rgw_bucket& bucket = obj.bucket;
10045
10046 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
10047 const string& name = iter->first;
10048 bufferlist& bl = iter->second;
10049
10050 if (!bl.length())
10051 continue;
10052
10053 op.setxattr(name.c_str(), bl);
10054
10055 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
10056 real_time ts;
10057 try {
10058 ::decode(ts, bl);
10059
10060 rgw_obj_index_key obj_key;
10061 obj.key.get_index_key(&obj_key);
10062
10063 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
10064 } catch (buffer::error& err) {
10065 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
10066 }
10067 }
10068 }
10069
10070 if (!op.size())
10071 return 0;
10072
10073 RGWObjectCtx obj_ctx(this);
10074
10075 bufferlist bl;
10076 RGWRados::Bucket bop(this, bucket_info);
10077 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
10078
10079 if (state) {
10080 string tag;
10081 append_rand_alpha(cct, tag, tag, 32);
10082 state->write_tag = tag;
10083 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
10084
10085 if (r < 0)
10086 return r;
10087
10088 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
10089 op.setxattr(RGW_ATTR_ID_TAG, bl);
10090 }
10091
3efd9988
FG
10092
10093 real_time mtime = real_clock::now();
10094 struct timespec mtime_ts = real_clock::to_timespec(mtime);
10095 op.mtime2(&mtime_ts);
7c673cae
FG
10096 r = ref.ioctx.operate(ref.oid, &op);
10097 if (state) {
10098 if (r >= 0) {
10099 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
10100 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
10101 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
10102 string etag(etag_bl.c_str(), etag_bl.length());
10103 string content_type(content_type_bl.c_str(), content_type_bl.length());
10104 uint64_t epoch = ref.ioctx.get_last_version();
10105 int64_t poolid = ref.ioctx.get_id();
7c673cae
FG
10106 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
10107 mtime, etag, content_type, &acl_bl,
10108 RGW_OBJ_CATEGORY_MAIN, NULL);
10109 } else {
10110 int ret = index_op.cancel();
10111 if (ret < 0) {
10112 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
10113 }
10114 }
10115 }
10116 if (r < 0)
10117 return r;
10118
10119 if (state) {
10120 state->obj_tag.swap(bl);
10121 if (rmattrs) {
10122 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
10123 state->attrset.erase(iter->first);
10124 }
10125 }
10126 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
10127 state->attrset[iter->first] = iter->second;
10128 }
10129 }
10130
10131 return 0;
10132}
10133
7c673cae
FG
10134int RGWRados::Object::Read::prepare()
10135{
10136 RGWRados *store = source->get_store();
10137 CephContext *cct = store->ctx();
10138
10139 bufferlist etag;
10140
10141 map<string, bufferlist>::iterator iter;
10142
10143 RGWObjState *astate;
10144 int r = source->get_state(&astate, true);
10145 if (r < 0)
10146 return r;
10147
10148 if (!astate->exists) {
10149 return -ENOENT;
10150 }
10151
10152 const RGWBucketInfo& bucket_info = source->get_bucket_info();
10153
10154 state.obj = astate->obj;
10155 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
10156
10157 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
10158 if (r < 0) {
10159 return r;
10160 }
10161 if (params.attrs) {
10162 *params.attrs = astate->attrset;
10163 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10164 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
10165 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10166 }
10167 }
10168 }
10169
10170 /* Convert all times go GMT to make them compatible */
10171 if (conds.mod_ptr || conds.unmod_ptr) {
10172 obj_time_weight src_weight;
10173 src_weight.init(astate);
10174 src_weight.high_precision = conds.high_precision_time;
10175
10176 obj_time_weight dest_weight;
10177 dest_weight.high_precision = conds.high_precision_time;
10178
10179 if (conds.mod_ptr) {
10180 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10181 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10182 if (!(dest_weight < src_weight)) {
10183 return -ERR_NOT_MODIFIED;
10184 }
10185 }
10186
10187 if (conds.unmod_ptr) {
10188 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10189 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10190 if (dest_weight < src_weight) {
10191 return -ERR_PRECONDITION_FAILED;
10192 }
10193 }
10194 }
10195 if (conds.if_match || conds.if_nomatch) {
10196 r = get_attr(RGW_ATTR_ETAG, etag);
10197 if (r < 0)
10198 return r;
10199
10200 if (conds.if_match) {
10201 string if_match_str = rgw_string_unquote(conds.if_match);
10202 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
10203 if (if_match_str.compare(etag.c_str()) != 0) {
10204 return -ERR_PRECONDITION_FAILED;
10205 }
10206 }
10207
10208 if (conds.if_nomatch) {
10209 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
10210 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
10211 if (if_nomatch_str.compare(etag.c_str()) == 0) {
10212 return -ERR_NOT_MODIFIED;
10213 }
10214 }
10215 }
10216
10217 if (params.obj_size)
10218 *params.obj_size = astate->size;
10219 if (params.lastmod)
10220 *params.lastmod = astate->mtime;
10221
10222 return 0;
10223}
10224
10225int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
10226{
10227 if (ofs < 0) {
10228 ofs += obj_size;
10229 if (ofs < 0)
10230 ofs = 0;
10231 end = obj_size - 1;
10232 } else if (end < 0) {
10233 end = obj_size - 1;
10234 }
10235
10236 if (obj_size > 0) {
10237 if (ofs >= (off_t)obj_size) {
10238 return -ERANGE;
10239 }
10240 if (end >= (off_t)obj_size) {
10241 end = obj_size - 1;
10242 }
10243 }
10244 return 0;
10245}
10246
10247int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
10248{
10249 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
10250}
10251
10252int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
10253 RGWRados::SystemObject::Read::GetObjState& state,
10254 rgw_raw_obj& obj,
10255 map<string, bufferlist> *attrs,
10256 real_time *lastmod,
10257 uint64_t *obj_size,
10258 RGWObjVersionTracker *objv_tracker)
10259{
10260 RGWRawObjState *astate = NULL;
10261
10262 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
10263 if (r < 0)
10264 return r;
10265
10266 if (!astate->exists) {
10267 return -ENOENT;
10268 }
10269
10270 if (attrs) {
10271 *attrs = astate->attrset;
10272 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10273 map<string, bufferlist>::iterator iter;
10274 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
10275 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10276 }
10277 }
10278 }
10279
10280 if (obj_size)
10281 *obj_size = astate->size;
10282 if (lastmod)
10283 *lastmod = astate->mtime;
10284
10285 return 0;
10286}
10287
31f18b77
FG
10288
10289int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
10290{
10291 RGWRados *store = target->get_store();
10292 BucketShard *bs;
10293 int r;
10294
10295#define NUM_RESHARD_RETRIES 10
10296 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10297 int ret = get_bucket_shard(&bs);
10298 if (ret < 0) {
10299 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10300 return ret;
10301 }
10302 r = call(bs);
10303 if (r != -ERR_BUSY_RESHARDING) {
10304 break;
10305 }
10306 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10307 string new_bucket_id;
f64942e4 10308 r = store->block_while_resharding(bs, &new_bucket_id, target->bucket_info);
31f18b77
FG
10309 if (r == -ERR_BUSY_RESHARDING) {
10310 continue;
10311 }
10312 if (r < 0) {
10313 return r;
10314 }
10315 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10316 i = 0; /* resharding is finished, make sure we can retry */
10317 r = target->update_bucket_id(new_bucket_id);
10318 if (r < 0) {
10319 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
10320 return r;
10321 }
10322 invalidate_bs();
10323 }
10324
10325 if (r < 0) {
10326 return r;
10327 }
10328
10329 if (pbs) {
10330 *pbs = bs;
10331 }
10332
10333 return 0;
10334}
10335
7c673cae
FG
10336int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
10337{
10338 RGWRados *store = source->get_store();
10339 rgw_raw_obj& obj = source->get_obj();
10340
10341 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
10342 stat_params.lastmod, stat_params.obj_size, objv_tracker);
10343}
10344
10345int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
10346{
10347 if (blind) {
10348 return 0;
10349 }
10350 RGWRados *store = target->get_store();
7c673cae
FG
10351
10352 if (write_tag && write_tag->length()) {
10353 optag = string(write_tag->c_str(), write_tag->length());
10354 } else {
10355 if (optag.empty()) {
10356 append_rand_alpha(store->ctx(), optag, optag, 32);
10357 }
10358 }
10359
f64942e4
AA
10360 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10361 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10362 });
31f18b77 10363
7c673cae
FG
10364 if (r < 0) {
10365 return r;
10366 }
10367 prepared = true;
31f18b77 10368
7c673cae
FG
10369 return 0;
10370}
10371
10372int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10373 uint64_t size, uint64_t accounted_size,
10374 ceph::real_time& ut, const string& etag,
10375 const string& content_type,
10376 bufferlist *acl_bl,
10377 RGWObjCategory category,
10378 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10379{
10380 if (blind) {
10381 return 0;
10382 }
10383 RGWRados *store = target->get_store();
10384 BucketShard *bs;
31f18b77 10385
7c673cae
FG
10386 int ret = get_bucket_shard(&bs);
10387 if (ret < 0) {
10388 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10389 return ret;
10390 }
10391
10392 rgw_bucket_dir_entry ent;
10393 obj.key.get_index_key(&ent.key);
10394 ent.meta.size = size;
10395 ent.meta.accounted_size = accounted_size;
10396 ent.meta.mtime = ut;
10397 ent.meta.etag = etag;
10398 if (user_data)
10399 ent.meta.user_data = *user_data;
10400
10401 ACLOwner owner;
10402 if (acl_bl && acl_bl->length()) {
10403 int ret = store->decode_policy(*acl_bl, &owner);
10404 if (ret < 0) {
10405 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10406 }
10407 }
10408 ent.meta.owner = owner.get_id().to_str();
10409 ent.meta.owner_display_name = owner.get_display_name();
10410 ent.meta.content_type = content_type;
10411
31f18b77 10412 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 10413
c07f9fc5
FG
10414 if (target->bucket_info.datasync_flag_enabled()) {
10415 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10416 if (r < 0) {
10417 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10418 }
7c673cae
FG
10419 }
10420
10421 return ret;
10422}
10423
10424int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10425 real_time& removed_mtime,
10426 list<rgw_obj_index_key> *remove_objs)
10427{
10428 if (blind) {
10429 return 0;
10430 }
10431 RGWRados *store = target->get_store();
10432 BucketShard *bs;
31f18b77 10433
7c673cae
FG
10434 int ret = get_bucket_shard(&bs);
10435 if (ret < 0) {
10436 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10437 return ret;
10438 }
10439
31f18b77 10440 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 10441
c07f9fc5
FG
10442 if (target->bucket_info.datasync_flag_enabled()) {
10443 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10444 if (r < 0) {
10445 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10446 }
7c673cae
FG
10447 }
10448
10449 return ret;
10450}
10451
10452
10453int RGWRados::Bucket::UpdateIndex::cancel()
10454{
10455 if (blind) {
10456 return 0;
10457 }
10458 RGWRados *store = target->get_store();
10459 BucketShard *bs;
7c673cae 10460
f64942e4
AA
10461 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10462 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10463 });
7c673cae
FG
10464
10465 /*
10466 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10467 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10468 * have no way to tell that they're all caught up
10469 */
c07f9fc5
FG
10470 if (target->bucket_info.datasync_flag_enabled()) {
10471 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10472 if (r < 0) {
10473 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10474 }
7c673cae
FG
10475 }
10476
10477 return ret;
10478}
10479
10480int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10481{
10482 RGWRados *store = source->get_store();
10483 CephContext *cct = store->ctx();
10484
7c673cae
FG
10485 rgw_raw_obj read_obj;
10486 uint64_t read_ofs = ofs;
10487 uint64_t len, read_len;
10488 bool reading_from_head = true;
10489 ObjectReadOperation op;
10490
10491 bool merge_bl = false;
10492 bufferlist *pbl = &bl;
10493 bufferlist read_bl;
10494 uint64_t max_chunk_size;
10495
10496 RGWObjState *astate;
10497 int r = source->get_state(&astate, true);
10498 if (r < 0)
10499 return r;
10500
10501 if (end < 0)
10502 len = 0;
10503 else
10504 len = end - ofs + 1;
10505
10506 if (astate->has_manifest && astate->manifest.has_tail()) {
10507 /* now get the relevant object part */
10508 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10509
10510 uint64_t stripe_ofs = iter.get_stripe_ofs();
10511 read_obj = iter.get_location().get_raw_obj(store);
10512 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10513 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10514 reading_from_head = (read_obj == state.head_obj);
10515 } else {
10516 read_obj = state.head_obj;
10517 }
10518
10519 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10520 if (r < 0) {
10521 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10522 return r;
10523 }
10524
10525 if (len > max_chunk_size)
10526 len = max_chunk_size;
10527
10528
10529 state.io_ctx.locator_set_key(read_obj.loc);
10530
10531 read_len = len;
10532
10533 if (reading_from_head) {
10534 /* only when reading from the head object do we need to do the atomic test */
10535 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10536 if (r < 0)
10537 return r;
10538
10539 if (astate && astate->prefetch_data) {
10540 if (!ofs && astate->data.length() >= len) {
10541 bl = astate->data;
10542 return bl.length();
10543 }
10544
10545 if (ofs < astate->data.length()) {
10546 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10547 astate->data.copy(ofs, copy_len, bl);
10548 read_len -= copy_len;
10549 read_ofs += copy_len;
10550 if (!read_len)
10551 return bl.length();
10552
10553 merge_bl = true;
10554 pbl = &read_bl;
10555 }
10556 }
10557 }
10558
10559 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10560 op.read(read_ofs, read_len, pbl, NULL);
10561
10562 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10563 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10564
10565 if (r < 0) {
10566 return r;
10567 }
10568
10569 if (merge_bl) {
10570 bl.append(read_bl);
10571 }
10572
10573 return bl.length();
10574}
10575
10576int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10577{
10578 if (!has_ref) {
224ce89b 10579 int r = store->get_raw_obj_ref(obj, &ref);
7c673cae
FG
10580 if (r < 0) {
10581 return r;
10582 }
10583 has_ref = true;
10584 }
10585 *pref = &ref;
10586 return 0;
10587
10588}
10589
10590int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10591 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10592 bufferlist& bl, off_t ofs, off_t end,
10593 map<string, bufferlist> *attrs,
b32b8144
FG
10594 rgw_cache_entry_info *cache_info,
10595 boost::optional<obj_version>)
7c673cae
FG
10596{
10597 uint64_t len;
10598 ObjectReadOperation op;
10599
10600 if (end < 0)
10601 len = 0;
10602 else
10603 len = end - ofs + 1;
10604
10605 if (objv_tracker) {
10606 objv_tracker->prepare_op_for_read(&op);
10607 }
10608
10609 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10610 op.read(ofs, len, &bl, NULL);
10611
10612 if (attrs) {
10613 op.getxattrs(attrs, NULL);
10614 }
10615
10616 rgw_rados_ref *ref;
10617 int r = read_state.get_ref(this, obj, &ref);
10618 if (r < 0) {
10619 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10620 return r;
10621 }
10622 r = ref->ioctx.operate(ref->oid, &op, NULL);
10623 if (r < 0) {
10624 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10625 return r;
10626 }
10627 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10628
10629 uint64_t op_ver = ref->ioctx.get_last_version();
10630
10631 if (read_state.last_ver > 0 &&
10632 read_state.last_ver != op_ver) {
10633 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10634 return -ECANCELED;
10635 }
10636
10637 read_state.last_ver = op_ver;
10638
10639 return bl.length();
10640}
10641
b32b8144
FG
10642int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl,
10643 RGWObjVersionTracker *objv_tracker,
10644 boost::optional<obj_version> refresh_version)
7c673cae
FG
10645{
10646 RGWRados *store = source->get_store();
10647 rgw_raw_obj& obj = source->get_obj();
10648
b32b8144
FG
10649 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl,
10650 ofs, end, read_params.attrs,
10651 read_params.cache_info, refresh_version);
7c673cae
FG
10652}
10653
10654int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10655{
10656 RGWRados *store = source->get_store();
10657 rgw_raw_obj& obj = source->get_obj();
10658
10659 return store->system_obj_get_attr(obj, name, dest);
10660}
10661
10662struct get_obj_data;
10663
10664struct get_obj_aio_data {
10665 struct get_obj_data *op_data;
10666 off_t ofs;
10667 off_t len;
10668};
10669
10670struct get_obj_io {
10671 off_t len;
10672 bufferlist bl;
10673};
10674
10675static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10676
10677struct get_obj_data : public RefCountedObject {
10678 CephContext *cct;
10679 RGWRados *rados;
10680 RGWObjectCtx *ctx;
10681 IoCtx io_ctx;
10682 map<off_t, get_obj_io> io_map;
10683 map<off_t, librados::AioCompletion *> completion_map;
10684 uint64_t total_read;
10685 Mutex lock;
10686 Mutex data_lock;
10687 list<get_obj_aio_data> aio_data;
10688 RGWGetDataCB *client_cb;
10689 std::atomic<bool> cancelled = { false };
10690 std::atomic<int64_t> err_code = { 0 };
10691 Throttle throttle;
10692 list<bufferlist> read_list;
10693
10694 explicit get_obj_data(CephContext *_cct)
10695 : cct(_cct),
10696 rados(NULL), ctx(NULL),
10697 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10698 client_cb(NULL),
10699 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10700 ~get_obj_data() override { }
10701 void set_cancelled(int r) {
10702 cancelled = true;
10703 err_code = r;
10704 }
10705
10706 bool is_cancelled() {
10707 return cancelled;
10708 }
10709
10710 int get_err_code() {
10711 return err_code;
10712 }
10713
10714 int wait_next_io(bool *done) {
10715 lock.Lock();
10716 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10717 if (iter == completion_map.end()) {
10718 *done = true;
10719 lock.Unlock();
10720 return 0;
10721 }
10722 off_t cur_ofs = iter->first;
10723 librados::AioCompletion *c = iter->second;
10724 lock.Unlock();
10725
10726 c->wait_for_safe_and_cb();
10727 int r = c->get_return_value();
10728
10729 lock.Lock();
10730 completion_map.erase(cur_ofs);
10731
10732 if (completion_map.empty()) {
10733 *done = true;
10734 }
10735 lock.Unlock();
10736
10737 c->release();
10738
10739 return r;
10740 }
10741
10742 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10743 Mutex::Locker l(lock);
10744
10745 const auto& io_iter = io_map.insert(
10746 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10747
10748 assert(io_iter.second); // assert new insertion
10749
10750 get_obj_io& io = (io_iter.first)->second;
10751 *pbl = &io.bl;
10752
10753 struct get_obj_aio_data aio;
10754 aio.ofs = ofs;
10755 aio.len = len;
10756 aio.op_data = this;
10757
10758 aio_data.push_back(aio);
10759
10760 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10761
10762 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10763 completion_map[ofs] = c;
10764
10765 *pc = c;
10766
10767 /* we have a reference per IO, plus one reference for the calling function.
10768 * reference is dropped for each callback, plus when we're done iterating
10769 * over the parts */
10770 get();
10771 }
10772
10773 void cancel_io(off_t ofs) {
10774 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10775 lock.Lock();
10776 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10777 if (iter != completion_map.end()) {
10778 AioCompletion *c = iter->second;
10779 c->release();
10780 completion_map.erase(ofs);
10781 io_map.erase(ofs);
10782 }
10783 lock.Unlock();
10784
10785 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10786 * need IoCtx to live, as io callback may still be called
10787 */
10788 }
10789
10790 void cancel_all_io() {
10791 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10792 Mutex::Locker l(lock);
10793 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10794 iter != completion_map.end(); ++iter) {
10795 librados::AioCompletion *c = iter->second;
10796 c->release();
10797 }
10798 }
10799
10800 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10801 Mutex::Locker l(lock);
10802
10803 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10804
10805 if (liter == io_map.end() ||
10806 liter->first != ofs) {
10807 return 0;
10808 }
10809
10810 map<off_t, librados::AioCompletion *>::iterator aiter;
10811 aiter = completion_map.find(ofs);
10812 if (aiter == completion_map.end()) {
10813 /* completion map does not hold this io, it was cancelled */
10814 return 0;
10815 }
10816
10817 AioCompletion *completion = aiter->second;
10818 int r = completion->get_return_value();
10819 if (r < 0)
10820 return r;
10821
10822 for (; aiter != completion_map.end(); ++aiter) {
10823 completion = aiter->second;
10824 if (!completion->is_safe()) {
10825 /* reached a request that is not yet complete, stop */
10826 break;
10827 }
10828
10829 r = completion->get_return_value();
10830 if (r < 0) {
10831 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10832 return r;
10833 }
10834
10835 total_read += r;
10836
10837 map<off_t, get_obj_io>::iterator old_liter = liter++;
10838 bl_list.push_back(old_liter->second.bl);
10839 io_map.erase(old_liter);
10840 }
10841
10842 return 0;
10843 }
10844};
10845
10846static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10847{
10848 struct get_obj_data *d = (struct get_obj_data *)arg;
10849
10850 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10851}
10852
10853static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10854{
10855 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10856 struct get_obj_data *d = aio_data->op_data;
10857
10858 d->rados->get_obj_aio_completion_cb(cb, arg);
10859}
10860
10861
10862void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10863{
10864 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10865 struct get_obj_data *d = aio_data->op_data;
10866 off_t ofs = aio_data->ofs;
10867 off_t len = aio_data->len;
10868
10869 list<bufferlist> bl_list;
10870 list<bufferlist>::iterator iter;
10871 int r;
10872
10873 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10874 d->throttle.put(len);
10875
10876 r = rados_aio_get_return_value(c);
10877 if (r < 0) {
10878 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10879 d->set_cancelled(r);
10880 goto done;
10881 }
10882
10883 if (d->is_cancelled()) {
10884 goto done;
10885 }
10886
10887 d->data_lock.Lock();
10888
10889 r = d->get_complete_ios(ofs, bl_list);
10890 if (r < 0) {
10891 goto done_unlock;
10892 }
10893
10894 d->read_list.splice(d->read_list.end(), bl_list);
10895
10896done_unlock:
10897 d->data_lock.Unlock();
10898done:
10899 d->put();
10900 return;
10901}
10902
10903int RGWRados::flush_read_list(struct get_obj_data *d)
10904{
10905 d->data_lock.Lock();
10906 list<bufferlist> l;
10907 l.swap(d->read_list);
10908 d->get();
10909 d->read_list.clear();
10910
10911 d->data_lock.Unlock();
10912
10913 int r = 0;
10914
10915 list<bufferlist>::iterator iter;
10916 for (iter = l.begin(); iter != l.end(); ++iter) {
10917 bufferlist& bl = *iter;
10918 r = d->client_cb->handle_data(bl, 0, bl.length());
10919 if (r < 0) {
10920 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10921 break;
10922 }
10923 }
10924
10925 d->data_lock.Lock();
10926 d->put();
10927 if (r < 0) {
10928 d->set_cancelled(r);
10929 }
10930 d->data_lock.Unlock();
10931 return r;
10932}
10933
10934int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10935 const RGWBucketInfo& bucket_info,
10936 const rgw_obj& obj,
10937 const rgw_raw_obj& read_obj,
10938 off_t obj_ofs,
10939 off_t read_ofs, off_t len,
10940 bool is_head_obj, void *arg)
10941{
10942 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10943 ObjectReadOperation op;
10944 struct get_obj_data *d = (struct get_obj_data *)arg;
10945 string oid, key;
10946 bufferlist *pbl;
10947 AioCompletion *c;
10948
10949 int r;
10950
10951 if (is_head_obj) {
10952 /* only when reading from the head object do we need to do the atomic test */
10953 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10954 if (r < 0)
10955 return r;
10956
10957 if (astate &&
10958 obj_ofs < astate->data.length()) {
10959 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10960
10961 d->data_lock.Lock();
10962 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10963 d->data_lock.Unlock();
10964 if (r < 0)
10965 return r;
10966
10967 d->lock.Lock();
10968 d->total_read += chunk_len;
10969 d->lock.Unlock();
10970
10971 len -= chunk_len;
10972 read_ofs += chunk_len;
10973 obj_ofs += chunk_len;
10974 if (!len)
10975 return 0;
10976 }
10977 }
10978
10979 d->throttle.get(len);
10980 if (d->is_cancelled()) {
10981 return d->get_err_code();
10982 }
10983
10984 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10985 * cleaning up
10986 */
10987 d->add_io(obj_ofs, len, &pbl, &c);
10988
10989 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10990 op.read(read_ofs, len, pbl, NULL);
10991
10992 librados::IoCtx io_ctx(d->io_ctx);
10993 io_ctx.locator_set_key(read_obj.loc);
10994
10995 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10996 if (r < 0) {
10997 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10998 goto done_err;
10999 }
11000
11001 // Flush data to client if there is any
11002 r = flush_read_list(d);
11003 if (r < 0)
11004 return r;
11005
11006 return 0;
11007
11008done_err:
11009 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
11010 d->set_cancelled(r);
11011 d->cancel_io(obj_ofs);
11012
11013 return r;
11014}
11015
11016int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
11017{
11018 RGWRados *store = source->get_store();
11019 CephContext *cct = store->ctx();
11020
11021 struct get_obj_data *data = new get_obj_data(cct);
11022 bool done = false;
11023
11024 RGWObjectCtx& obj_ctx = source->get_ctx();
11025
11026 data->rados = store;
11027 data->io_ctx.dup(state.io_ctx);
11028 data->client_cb = cb;
11029
11030 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
11031 if (r < 0) {
11032 data->cancel_all_io();
11033 goto done;
11034 }
11035
11036 while (!done) {
11037 r = data->wait_next_io(&done);
11038 if (r < 0) {
11039 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
11040 data->cancel_all_io();
11041 break;
11042 }
11043 r = store->flush_read_list(data);
11044 if (r < 0) {
11045 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
11046 data->cancel_all_io();
11047 break;
11048 }
11049 }
11050
11051done:
11052 data->put();
11053 return r;
11054}
11055
11056int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
11057 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11058 off_t ofs, off_t end,
11059 uint64_t max_chunk_size,
11060 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
11061 const rgw_raw_obj&, off_t, off_t, off_t, bool,
11062 RGWObjState *, void *),
11063 void *arg)
11064{
11065 rgw_raw_obj head_obj;
11066 rgw_raw_obj read_obj;
11067 uint64_t read_ofs = ofs;
11068 uint64_t len;
11069 bool reading_from_head = true;
11070 RGWObjState *astate = NULL;
11071
11072 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
11073
11074 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
11075 if (r < 0) {
11076 return r;
11077 }
11078
11079 if (end < 0)
11080 len = 0;
11081 else
11082 len = end - ofs + 1;
11083
11084 if (astate->has_manifest) {
11085 /* now get the relevant object stripe */
11086 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
11087
11088 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
11089
11090 for (; iter != obj_end && ofs <= end; ++iter) {
11091 off_t stripe_ofs = iter.get_stripe_ofs();
11092 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
11093
11094 while (ofs < next_stripe_ofs && ofs <= end) {
11095 read_obj = iter.get_location().get_raw_obj(this);
11096 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
11097 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
11098
11099 if (read_len > max_chunk_size) {
11100 read_len = max_chunk_size;
11101 }
11102
11103 reading_from_head = (read_obj == head_obj);
11104 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
11105 if (r < 0) {
11106 return r;
11107 }
11108
11109 len -= read_len;
11110 ofs += read_len;
11111 }
11112 }
11113 } else {
11114 while (ofs <= end) {
11115 read_obj = head_obj;
11116 uint64_t read_len = min(len, max_chunk_size);
11117
11118 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
11119 if (r < 0) {
11120 return r;
11121 }
11122
11123 len -= read_len;
11124 ofs += read_len;
11125 }
11126 }
11127
11128 return 0;
11129}
11130
11131int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
11132{
11133 rgw_rados_ref ref;
11134 int r = get_obj_head_ref(bucket_info, obj, &ref);
11135 if (r < 0) {
11136 return r;
11137 }
11138
11139 return ref.ioctx.operate(ref.oid, op);
11140}
11141
11142int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
11143{
11144 rgw_rados_ref ref;
11145 int r = get_obj_head_ref(bucket_info, obj, &ref);
11146 if (r < 0) {
11147 return r;
11148 }
11149
11150 bufferlist outbl;
11151
11152 return ref.ioctx.operate(ref.oid, op, &outbl);
11153}
11154
11155int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
11156{
11157 ObjectWriteOperation op;
11158
11159 assert(olh_obj.key.instance.empty());
11160
11161 bool has_tag = (state.exists && has_olh_tag(state.attrset));
11162
11163 if (!state.exists) {
11164 op.create(true);
11165 } else {
11166 op.assert_exists();
b32b8144
FG
11167 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11168 op.mtime2(&mtime_ts);
7c673cae
FG
11169 }
11170
11171 /*
11172 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
11173 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
11174 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
11175 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
11176 * log will reflect that.
11177 *
11178 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
11179 * is used for object data instance, olh_tag for olh instance.
11180 */
11181 if (has_tag) {
11182 /* guard against racing writes */
11183 bucket_index_guard_olh_op(state, op);
11184 }
11185
11186 if (!has_tag) {
11187 /* obj tag */
11188 string obj_tag;
11189 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
11190 if (ret < 0) {
11191 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11192 return ret;
11193 }
11194 bufferlist bl;
11195 bl.append(obj_tag.c_str(), obj_tag.size());
11196 op.setxattr(RGW_ATTR_ID_TAG, bl);
11197
11198 state.attrset[RGW_ATTR_ID_TAG] = bl;
11199 state.obj_tag = bl;
11200
11201 /* olh tag */
11202 string olh_tag;
11203 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
11204 if (ret < 0) {
11205 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11206 return ret;
11207 }
11208 bufferlist olh_bl;
11209 olh_bl.append(olh_tag.c_str(), olh_tag.size());
11210 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
11211
11212 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
11213 state.olh_tag = olh_bl;
11214 state.is_olh = true;
11215
11216 bufferlist verbl;
11217 op.setxattr(RGW_ATTR_OLH_VER, verbl);
11218 }
11219
11220 bufferlist bl;
11221 RGWOLHPendingInfo pending_info;
11222 pending_info.time = real_clock::now();
11223 ::encode(pending_info, bl);
11224
11225#define OLH_PENDING_TAG_LEN 32
11226 /* tag will start with current time epoch, this so that entries are sorted by time */
11227 char buf[32];
11228 utime_t ut(pending_info.time);
11229 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
11230 *op_tag = buf;
11231
11232 string s;
11233 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
11234 if (ret < 0) {
11235 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11236 return ret;
11237 }
11238 op_tag->append(s);
11239
11240 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11241 attr_name.append(*op_tag);
11242
11243 op.setxattr(attr_name.c_str(), bl);
11244
11245 ret = obj_operate(bucket_info, olh_obj, &op);
11246 if (ret < 0) {
11247 return ret;
11248 }
11249
11250 state.exists = true;
11251 state.attrset[attr_name] = bl;
11252
11253 return 0;
11254}
11255
11256int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
11257{
11258 int ret;
11259
11260 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
11261 if (ret == -EEXIST) {
11262 ret = -ECANCELED;
11263 }
11264
11265 return ret;
11266}
11267
f64942e4
AA
11268int RGWRados::guard_reshard(BucketShard *bs,
11269 const rgw_obj& obj_instance,
11270 const RGWBucketInfo& bucket_info,
11271 std::function<int(BucketShard *)> call)
31f18b77
FG
11272{
11273 rgw_obj obj;
11274 const rgw_obj *pobj = &obj_instance;
11275 int r;
11276
11277 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
f64942e4 11278 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
31f18b77
FG
11279 if (r < 0) {
11280 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
11281 return r;
11282 }
11283 r = call(bs);
11284 if (r != -ERR_BUSY_RESHARDING) {
11285 break;
11286 }
11287 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
11288 string new_bucket_id;
f64942e4 11289 r = block_while_resharding(bs, &new_bucket_id, bucket_info);
31f18b77
FG
11290 if (r == -ERR_BUSY_RESHARDING) {
11291 continue;
11292 }
11293 if (r < 0) {
11294 return r;
11295 }
11296 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
11297 i = 0; /* resharding is finished, make sure we can retry */
11298
11299 obj = *pobj;
11300 obj.bucket.update_bucket_id(new_bucket_id);
11301 pobj = &obj;
11302 }
11303
11304 if (r < 0) {
11305 return r;
11306 }
11307
11308 return 0;
11309}
11310
f64942e4
AA
11311int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
11312 string *new_bucket_id,
11313 const RGWBucketInfo& bucket_info)
31f18b77
FG
11314{
11315 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
11316
f64942e4 11317 return waiter->block_while_resharding(bs, new_bucket_id, bucket_info);
31f18b77
FG
11318}
11319
7c673cae
FG
11320int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
11321 bool delete_marker,
11322 const string& op_tag,
11323 struct rgw_bucket_dir_entry_meta *meta,
11324 uint64_t olh_epoch,
91327a77
AA
11325 real_time unmod_since, bool high_precision_time,
11326 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
11327{
11328 rgw_rados_ref ref;
11329 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11330 if (r < 0) {
11331 return r;
11332 }
11333
31f18b77
FG
11334 rgw_zone_set zones_trace;
11335 if (_zones_trace) {
11336 zones_trace = *_zones_trace;
7c673cae 11337 }
1adf2230 11338 zones_trace.insert(get_zone().id);
7c673cae 11339
31f18b77
FG
11340 BucketShard bs(this);
11341
7c673cae 11342 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
11343 r = guard_reshard(&bs, obj_instance, bucket_info,
11344 [&](BucketShard *bs) -> int {
11345 librados::ObjectWriteOperation op;
11346 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11347 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
11348 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
11349 unmod_since, high_precision_time,
11350 get_zone().log_data, zones_trace);
31f18b77
FG
11351 });
11352 if (r < 0) {
11353 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11354 return r;
7c673cae
FG
11355 }
11356
91327a77
AA
11357 if (log_data_change && bucket_info.datasync_flag_enabled()) {
11358 data_log->add_entry(bs.bucket, bs.shard_id);
11359 }
11360
7c673cae
FG
11361 return 0;
11362}
11363
11364void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
11365{
11366 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
11367 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
11368}
11369
11370int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 11371 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
11372{
11373 rgw_rados_ref ref;
11374 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11375 if (r < 0) {
11376 return r;
11377 }
11378
31f18b77
FG
11379 rgw_zone_set zones_trace;
11380 if (_zones_trace) {
11381 zones_trace = *_zones_trace;
7c673cae 11382 }
31f18b77
FG
11383 zones_trace.insert(get_zone().id);
11384
11385 BucketShard bs(this);
7c673cae
FG
11386
11387 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
11388 r = guard_reshard(&bs, obj_instance, bucket_info,
11389 [&](BucketShard *bs) -> int {
11390 librados::ObjectWriteOperation op;
11391 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11392 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11393 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
31f18b77
FG
11394 });
11395 if (r < 0) {
11396 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11397 return r;
7c673cae
FG
11398 }
11399
11400 return 0;
11401}
11402
11403int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11404 const rgw_obj& obj_instance, uint64_t ver_marker,
11405 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11406 bool *is_truncated)
11407{
11408 rgw_rados_ref ref;
11409 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11410 if (r < 0) {
11411 return r;
11412 }
11413
11414 BucketShard bs(this);
f64942e4
AA
11415 int ret =
11416 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
11417 if (ret < 0) {
11418 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11419 return ret;
11420 }
11421
11422 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11423
11424 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11425
f64942e4
AA
11426 ret = guard_reshard(&bs, obj_instance, bucket_info,
11427 [&](BucketShard *bs) -> int {
11428 ObjectReadOperation op;
11429 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11430 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11431 key, ver_marker, olh_tag, log, is_truncated);
11432 });
31f18b77
FG
11433 if (ret < 0) {
11434 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 11435 return ret;
31f18b77 11436 }
7c673cae
FG
11437
11438 return 0;
11439}
11440
a8e16298
TL
11441// a multisite sync bug resulted in the OLH head attributes being overwritten by
11442// the attributes from another zone, causing link_olh() to fail endlessly due to
11443// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
11444// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
11445int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info,
11446 const rgw_obj& obj)
11447{
11448 // fetch the current olh entry from the bucket index
11449 rgw_bucket_olh_entry olh;
11450 int r = bi_get_olh(bucket_info, obj, &olh);
11451 if (r < 0) {
11452 ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
11453 return r;
11454 }
11455 if (olh.tag == state->olh_tag.to_str()) { // mismatch already resolved?
11456 return 0;
11457 }
11458
11459 ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag
11460 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
11461
11462 // rewrite OLH_ID_TAG and OLH_INFO from current olh
11463 ObjectWriteOperation op;
11464 // assert this is the same olh tag we think we're fixing
11465 bucket_index_guard_olh_op(*state, op);
11466 // preserve existing mtime
11467 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
11468 op.mtime2(&mtime_ts);
11469 {
11470 bufferlist bl;
11471 bl.append(olh.tag.c_str(), olh.tag.size());
11472 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
11473 }
11474 {
11475 RGWOLHInfo info;
11476 info.target = rgw_obj(bucket_info.bucket, olh.key);
11477 info.removed = olh.delete_marker;
11478 bufferlist bl;
11479 encode(info, bl);
11480 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11481 }
11482 rgw_rados_ref ref;
11483 r = get_obj_head_ref(bucket_info, obj, &ref);
11484 if (r < 0) {
11485 return r;
11486 }
11487 r = ref.ioctx.operate(ref.oid, &op);
11488 if (r < 0) {
11489 ldout(cct, 0) << "repair_olh failed to write olh attributes with "
11490 << cpp_strerror(r) << dendl;
11491 return r;
11492 }
11493 return 0;
11494}
11495
7c673cae
FG
11496int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11497{
11498 rgw_rados_ref ref;
11499 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11500 if (r < 0) {
11501 return r;
11502 }
11503
11504 BucketShard bs(this);
f64942e4
AA
11505 int ret =
11506 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
11507 if (ret < 0) {
11508 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11509 return ret;
11510 }
11511
11512 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11513
11514 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11515
f64942e4
AA
11516 ret = guard_reshard(&bs, obj_instance, bucket_info,
11517 [&](BucketShard *pbs) -> int {
11518 ObjectWriteOperation op;
11519 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11520 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11521 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
31f18b77
FG
11522 });
11523 if (ret < 0) {
11524 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 11525 return ret;
31f18b77 11526 }
7c673cae
FG
11527
11528 return 0;
11529}
11530
11531int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11532{
11533 rgw_rados_ref ref;
11534 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11535 if (r < 0) {
11536 return r;
11537 }
11538
11539 BucketShard bs(this);
7c673cae
FG
11540
11541 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11542
11543 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11544
f64942e4
AA
11545 int ret = guard_reshard(&bs, obj_instance, bucket_info,
11546 [&](BucketShard *pbs) -> int {
11547 ObjectWriteOperation op;
11548 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11549 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
31f18b77 11550 });
7c673cae
FG
11551 if (ret < 0) {
11552 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11553 return ret;
11554 }
11555
11556 return 0;
11557}
11558
11559int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11560 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
31f18b77 11561 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7c673cae
FG
11562{
11563 if (log.empty()) {
11564 return 0;
11565 }
11566
11567 librados::ObjectWriteOperation op;
11568
11569 uint64_t last_ver = log.rbegin()->first;
11570 *plast_ver = last_ver;
11571
11572 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11573
11574 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11575 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11576
a8e16298
TL
11577 bufferlist ver_bl;
11578 string last_ver_s = to_string(last_ver);
11579 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
11580 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
11581
b32b8144
FG
11582 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11583 op.mtime2(&mtime_ts);
11584
7c673cae
FG
11585 bool need_to_link = false;
11586 cls_rgw_obj_key key;
11587 bool delete_marker = false;
11588 list<cls_rgw_obj_key> remove_instances;
11589 bool need_to_remove = false;
11590
11591 for (iter = log.begin(); iter != log.end(); ++iter) {
11592 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11593 for (; viter != iter->second.end(); ++viter) {
11594 rgw_bucket_olh_log_entry& entry = *viter;
11595
11596 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11597 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11598 << (entry.delete_marker ? "(delete)" : "") << dendl;
11599 switch (entry.op) {
11600 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11601 remove_instances.push_back(entry.key);
11602 break;
11603 case CLS_RGW_OLH_OP_LINK_OLH:
11604 need_to_link = true;
11605 need_to_remove = false;
11606 key = entry.key;
11607 delete_marker = entry.delete_marker;
11608 break;
11609 case CLS_RGW_OLH_OP_UNLINK_OLH:
11610 need_to_remove = true;
11611 need_to_link = false;
11612 break;
11613 default:
11614 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11615 return -EIO;
11616 }
11617 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11618 attr_name.append(entry.op_tag);
11619 op.rmxattr(attr_name.c_str());
11620 }
11621 }
11622
11623 rgw_rados_ref ref;
11624 int r = get_obj_head_ref(bucket_info, obj, &ref);
11625 if (r < 0) {
11626 return r;
11627 }
11628
11629 const rgw_bucket& bucket = obj.bucket;
11630
11631 if (need_to_link) {
11632 rgw_obj target(bucket, key);
11633 RGWOLHInfo info;
11634 info.target = target;
11635 info.removed = delete_marker;
11636 bufferlist bl;
11637 ::encode(info, bl);
11638 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11639 }
11640
11641 /* first remove object instances */
11642 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11643 liter != remove_instances.end(); ++liter) {
11644 cls_rgw_obj_key& key = *liter;
11645 rgw_obj obj_instance(bucket, key);
31f18b77 11646 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae
FG
11647 if (ret < 0 && ret != -ENOENT) {
11648 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11649 return ret;
11650 }
11651 }
11652
11653 /* update olh object */
11654 r = ref.ioctx.operate(ref.oid, &op);
11655 if (r == -ECANCELED) {
11656 r = 0;
11657 }
11658 if (r < 0) {
11659 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11660 return r;
11661 }
11662
11663 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11664 if (r < 0) {
11665 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11666 return r;
11667 }
11668
11669 if (need_to_remove) {
11670 ObjectWriteOperation rm_op;
11671
11672 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
a8e16298 11673 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7c673cae
FG
11674 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11675 rm_op.remove();
11676
11677 r = ref.ioctx.operate(ref.oid, &rm_op);
11678 if (r == -ECANCELED) {
11679 return 0; /* someone else won this race */
11680 } else {
11681 /*
11682 * only clear if was successful, otherwise we might clobber pending operations on this object
11683 */
11684 r = bucket_index_clear_olh(bucket_info, state, obj);
11685 if (r < 0) {
11686 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11687 return r;
11688 }
11689 }
11690 }
11691
11692 return 0;
11693}
11694
11695/*
11696 * read olh log and apply it
11697 */
31f18b77 11698int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
11699{
11700 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11701 bool is_truncated;
11702 uint64_t ver_marker = 0;
11703
11704 do {
11705 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11706 if (ret < 0) {
11707 return ret;
11708 }
31f18b77 11709 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
11710 if (ret < 0) {
11711 return ret;
11712 }
11713 } while (is_truncated);
11714
11715 return 0;
11716}
11717
11718int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77
AA
11719 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
11720 rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
11721{
11722 string op_tag;
11723
11724 rgw_obj olh_obj = target_obj;
11725 olh_obj.key.instance.clear();
11726
11727 RGWObjState *state = NULL;
11728
11729 int ret = 0;
11730 int i;
31f18b77 11731
7c673cae
FG
11732#define MAX_ECANCELED_RETRY 100
11733 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11734 if (ret == -ECANCELED) {
11735 obj_ctx.obj.invalidate(olh_obj);
11736 }
11737
11738 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11739 if (ret < 0) {
11740 return ret;
11741 }
11742
11743 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11744 if (ret < 0) {
11745 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11746 if (ret == -ECANCELED) {
11747 continue;
11748 }
11749 return ret;
11750 }
91327a77
AA
11751 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
11752 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
11753 zones_trace, log_data_change);
7c673cae
FG
11754 if (ret < 0) {
11755 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11756 if (ret == -ECANCELED) {
a8e16298
TL
11757 // the bucket index rejected the link_olh() due to olh tag mismatch;
11758 // attempt to reconstruct olh head attributes based on the bucket index
11759 int r2 = repair_olh(state, bucket_info, olh_obj);
11760 if (r2 < 0 && r2 != -ECANCELED) {
11761 return r2;
11762 }
7c673cae
FG
11763 continue;
11764 }
11765 return ret;
11766 }
11767 break;
11768 }
11769
11770 if (i == MAX_ECANCELED_RETRY) {
11771 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11772 return -EIO;
11773 }
11774
11775 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11776 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11777 ret = 0;
11778 }
11779 if (ret < 0) {
11780 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11781 return ret;
11782 }
11783
11784 return 0;
11785}
11786
11787int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
31f18b77 11788 uint64_t olh_epoch, rgw_zone_set *zones_trace)
7c673cae
FG
11789{
11790 string op_tag;
11791
11792 rgw_obj olh_obj = target_obj;
11793 olh_obj.key.instance.clear();
11794
11795 RGWObjState *state = NULL;
11796
11797 int ret = 0;
11798 int i;
11799
11800 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11801 if (ret == -ECANCELED) {
11802 obj_ctx.obj.invalidate(olh_obj);
11803 }
11804
11805 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11806 if (ret < 0)
11807 return ret;
11808
11809 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11810 if (ret < 0) {
11811 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11812 if (ret == -ECANCELED) {
11813 continue;
11814 }
11815 return ret;
11816 }
11817
11818 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11819
31f18b77 11820 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae
FG
11821 if (ret < 0) {
11822 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11823 if (ret == -ECANCELED) {
11824 continue;
11825 }
11826 return ret;
11827 }
11828 break;
11829 }
11830
11831 if (i == MAX_ECANCELED_RETRY) {
11832 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11833 return -EIO;
11834 }
11835
31f18b77 11836 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
11837 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11838 return 0;
11839 }
11840 if (ret < 0) {
11841 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11842 return ret;
11843 }
11844
11845 return 0;
11846}
11847
11848void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11849{
11850#define OBJ_INSTANCE_LEN 32
11851 char buf[OBJ_INSTANCE_LEN + 1];
11852
11853 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11854 no underscore for instance name due to the way we encode the raw keys */
11855
11856 target_obj->key.set_instance(buf);
11857}
11858
11859static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11860 map<string, bufferlist> *attrset)
11861{
11862 attrset->clear();
11863 map<string, bufferlist>::iterator iter;
11864 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11865 iter != unfiltered_attrset.end(); ++iter) {
11866 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11867 break;
11868 (*attrset)[iter->first] = iter->second;
11869 }
11870}
11871
11872int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11873{
11874 map<string, bufferlist> unfiltered_attrset;
11875
11876 ObjectReadOperation op;
11877 op.getxattrs(&unfiltered_attrset, NULL);
11878
11879 bufferlist outbl;
11880 int r = obj_operate(bucket_info, obj, &op);
11881
11882 if (r < 0) {
11883 return r;
11884 }
11885 map<string, bufferlist> attrset;
11886
11887 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11888
11889 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11890 if (iter == attrset.end()) { /* not an olh */
11891 return -EINVAL;
11892 }
11893
11894 try {
11895 bufferlist::iterator biter = iter->second.begin();
11896 ::decode(*olh, biter);
11897 } catch (buffer::error& err) {
11898 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11899 return -EIO;
11900 }
11901
11902 return 0;
11903}
11904
11905void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11906 map<string, bufferlist> *rm_pending_entries)
11907{
11908 map<string, bufferlist>::iterator iter = pending_entries.begin();
11909
11910 real_time now = real_clock::now();
11911
11912 while (iter != pending_entries.end()) {
11913 bufferlist::iterator biter = iter->second.begin();
11914 RGWOLHPendingInfo pending_info;
11915 try {
11916 ::decode(pending_info, biter);
11917 } catch (buffer::error& err) {
11918 /* skipping bad entry, we could remove it but it might hide a bug */
11919 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11920 ++iter;
11921 continue;
11922 }
11923
11924 map<string, bufferlist>::iterator cur_iter = iter;
11925 ++iter;
11926 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11927 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11928 pending_entries.erase(cur_iter);
11929 } else {
11930 /* entries names are sorted by time (rounded to a second) */
11931 break;
11932 }
11933 }
11934}
11935
11936int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11937{
11938 ObjectWriteOperation op;
11939
11940 bucket_index_guard_olh_op(state, op);
11941
11942 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11943 op.rmxattr(iter->first.c_str());
11944 }
11945
11946 rgw_rados_ref ref;
11947 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11948 if (r < 0) {
11949 return r;
11950 }
11951
11952 /* update olh object */
11953 r = ref.ioctx.operate(ref.oid, &op);
11954 if (r == -ENOENT || r == -ECANCELED) {
11955 /* raced with some other change, shouldn't sweat about it */
11956 r = 0;
11957 }
11958 if (r < 0) {
11959 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11960 return r;
11961 }
11962
11963 return 0;
11964}
11965
11966int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11967{
11968 map<string, bufferlist> pending_entries;
11969 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11970
11971 map<string, bufferlist> rm_pending_entries;
11972 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11973
11974 if (!rm_pending_entries.empty()) {
11975 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11976 if (ret < 0) {
11977 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11978 return ret;
11979 }
11980 }
11981 if (!pending_entries.empty()) {
11982 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11983
11984 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11985 if (ret < 0) {
11986 return ret;
11987 }
11988 }
11989
11990 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11991 assert(iter != state->attrset.end());
11992 RGWOLHInfo olh;
11993 try {
11994 bufferlist::iterator biter = iter->second.begin();
11995 ::decode(olh, biter);
11996 } catch (buffer::error& err) {
11997 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11998 return -EIO;
11999 }
12000
12001 if (olh.removed) {
12002 return -ENOENT;
12003 }
12004
12005 *target = olh.target;
12006
12007 return 0;
12008}
12009
12010int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
12011 map<string, bufferlist> *attrs, bufferlist *first_chunk,
12012 RGWObjVersionTracker *objv_tracker)
12013{
12014 rgw_rados_ref ref;
12015 int r = get_raw_obj_ref(obj, &ref);
12016 if (r < 0) {
12017 return r;
12018 }
12019
12020 map<string, bufferlist> unfiltered_attrset;
12021 uint64_t size = 0;
12022 struct timespec mtime_ts;
12023
12024 ObjectReadOperation op;
12025 if (objv_tracker) {
12026 objv_tracker->prepare_op_for_read(&op);
12027 }
12028 if (attrs) {
12029 op.getxattrs(&unfiltered_attrset, NULL);
12030 }
12031 if (psize || pmtime) {
12032 op.stat2(&size, &mtime_ts, NULL);
12033 }
12034 if (first_chunk) {
12035 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
12036 }
12037 bufferlist outbl;
12038 r = ref.ioctx.operate(ref.oid, &op, &outbl);
12039
12040 if (epoch) {
12041 *epoch = ref.ioctx.get_last_version();
12042 }
12043
12044 if (r < 0)
12045 return r;
12046
12047 if (psize)
12048 *psize = size;
12049 if (pmtime)
12050 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
12051 if (attrs) {
12052 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
12053 }
12054
12055 return 0;
12056}
12057
12058int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 12059 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae 12060{
a8e16298 12061 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
12062 map<int, string> bucket_instance_ids;
12063 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
12064 if (r < 0) {
12065 return r;
12066 }
12067
12068 assert(headers.size() == bucket_instance_ids.size());
12069
a8e16298 12070 auto iter = headers.begin();
7c673cae
FG
12071 map<int, string>::iterator viter = bucket_instance_ids.begin();
12072 BucketIndexShardsManager ver_mgr;
12073 BucketIndexShardsManager master_ver_mgr;
12074 BucketIndexShardsManager marker_mgr;
7c673cae
FG
12075 char buf[64];
12076 for(; iter != headers.end(); ++iter, ++viter) {
a8e16298
TL
12077 accumulate_raw_stats(*iter, stats);
12078 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7c673cae 12079 ver_mgr.add(viter->first, string(buf));
a8e16298 12080 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7c673cae
FG
12081 master_ver_mgr.add(viter->first, string(buf));
12082 if (shard_id >= 0) {
a8e16298 12083 *max_marker = iter->max_marker;
7c673cae 12084 } else {
a8e16298 12085 marker_mgr.add(viter->first, iter->max_marker);
7c673cae 12086 }
c07f9fc5 12087 if (syncstopped != NULL)
a8e16298 12088 *syncstopped = iter->syncstopped;
7c673cae
FG
12089 }
12090 ver_mgr.to_string(bucket_ver);
12091 master_ver_mgr.to_string(master_ver);
12092 if (shard_id < 0) {
12093 marker_mgr.to_string(max_marker);
12094 }
12095 return 0;
12096}
12097
12098int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
12099 map<int, string>& markers)
12100{
a8e16298 12101 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
12102 map<int, string> bucket_instance_ids;
12103 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
12104 if (r < 0)
12105 return r;
12106
12107 assert(headers.size() == bucket_instance_ids.size());
12108
a8e16298 12109 auto iter = headers.begin();
7c673cae
FG
12110 map<int, string>::iterator viter = bucket_instance_ids.begin();
12111
12112 for(; iter != headers.end(); ++iter, ++viter) {
12113 if (shard_id >= 0) {
a8e16298 12114 markers[shard_id] = iter->max_marker;
7c673cae 12115 } else {
a8e16298 12116 markers[viter->first] = iter->max_marker;
7c673cae
FG
12117 }
12118 }
12119 return 0;
12120}
12121
12122class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
12123 RGWGetBucketStats_CB *cb;
12124 uint32_t pendings;
12125 map<RGWObjCategory, RGWStorageStats> stats;
12126 int ret_code;
12127 bool should_cb;
12128 Mutex lock;
12129
12130public:
12131 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
12132 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
12133 lock("RGWGetBucketStatsContext") {}
12134
12135 void handle_response(int r, rgw_bucket_dir_header& header) override {
12136 Mutex::Locker l(lock);
12137 if (should_cb) {
12138 if ( r >= 0) {
12139 accumulate_raw_stats(header, stats);
12140 } else {
12141 ret_code = r;
12142 }
12143
12144 // Are we all done?
12145 if (--pendings == 0) {
12146 if (!ret_code) {
12147 cb->set_response(&stats);
12148 }
12149 cb->handle_response(ret_code);
12150 cb->put();
12151 }
12152 }
12153 }
12154
12155 void unset_cb() {
12156 Mutex::Locker l(lock);
12157 should_cb = false;
12158 }
12159};
12160
12161int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
12162{
12163 int num_aio = 0;
c07f9fc5 12164 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
7c673cae
FG
12165 assert(get_ctx);
12166 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
12167 if (r < 0) {
12168 ctx->put();
12169 if (num_aio) {
12170 get_ctx->unset_cb();
12171 }
12172 }
c07f9fc5 12173 get_ctx->put();
7c673cae
FG
12174 return r;
12175}
12176
12177class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
12178 RGWGetUserStats_CB *cb;
12179
12180public:
12181 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
12182 : cb(cb) {}
12183
12184 void handle_response(int r, cls_user_header& header) override {
12185 const cls_user_stats& hs = header.stats;
12186 if (r >= 0) {
12187 RGWStorageStats stats;
12188
12189 stats.size = hs.total_bytes;
12190 stats.size_rounded = hs.total_bytes_rounded;
12191 stats.num_objects = hs.total_entries;
12192
12193 cb->set_response(stats);
12194 }
12195
12196 cb->handle_response(r);
12197
12198 cb->put();
12199 }
12200};
12201
12202int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
12203{
12204 string user_str = user.to_str();
12205
12206 cls_user_header header;
12207 int r = cls_user_get_header(user_str, &header);
12208 if (r < 0)
12209 return r;
12210
12211 const cls_user_stats& hs = header.stats;
12212
12213 stats.size = hs.total_bytes;
12214 stats.size_rounded = hs.total_bytes_rounded;
12215 stats.num_objects = hs.total_entries;
12216
12217 return 0;
12218}
12219
12220int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
12221{
12222 string user_str = user.to_str();
12223
12224 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
12225 int r = cls_user_get_header_async(user_str, get_ctx);
12226 if (r < 0) {
12227 ctx->put();
12228 delete get_ctx;
12229 return r;
12230 }
12231
12232 return 0;
12233}
12234
12235void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
12236{
12237 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
12238}
12239
12240void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
12241{
12242 if (!bucket.oid.empty()) {
12243 obj.init(get_zone_params().domain_root, bucket.oid);
12244 } else {
12245 string oid;
12246 get_bucket_meta_oid(bucket, oid);
12247 obj.init(get_zone_params().domain_root, oid);
12248 }
12249}
12250
12251int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
12252 real_time *pmtime, map<string, bufferlist> *pattrs)
12253{
12254 size_t pos = meta_key.find(':');
12255 if (pos == string::npos) {
12256 return -EINVAL;
12257 }
12258 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
12259 rgw_bucket_instance_key_to_oid(oid);
12260
12261 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12262}
12263
12264int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
12265 real_time *pmtime, map<string, bufferlist> *pattrs)
12266{
12267 string oid;
12268 if (bucket.oid.empty()) {
12269 get_bucket_meta_oid(bucket, oid);
12270 } else {
12271 oid = bucket.oid;
12272 }
12273
12274 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12275}
12276
31f18b77 12277int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
7c673cae 12278 real_time *pmtime, map<string, bufferlist> *pattrs,
b32b8144
FG
12279 rgw_cache_entry_info *cache_info,
12280 boost::optional<obj_version> refresh_version)
7c673cae
FG
12281{
12282 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
12283
12284 bufferlist epbl;
12285
b32b8144
FG
12286 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12287 oid, epbl, &info.objv_tracker, pmtime, pattrs,
12288 cache_info, refresh_version);
7c673cae
FG
12289 if (ret < 0) {
12290 return ret;
12291 }
12292
12293 bufferlist::iterator iter = epbl.begin();
12294 try {
12295 ::decode(info, iter);
12296 } catch (buffer::error& err) {
12297 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12298 return -EIO;
12299 }
12300 info.bucket.oid = oid;
12301 return 0;
12302}
12303
12304int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
12305 const string& tenant_name,
12306 const string& bucket_name,
12307 RGWBucketEntryPoint& entry_point,
12308 RGWObjVersionTracker *objv_tracker,
12309 real_time *pmtime,
12310 map<string, bufferlist> *pattrs,
b32b8144
FG
12311 rgw_cache_entry_info *cache_info,
12312 boost::optional<obj_version> refresh_version)
7c673cae
FG
12313{
12314 bufferlist bl;
12315 string bucket_entry;
12316
12317 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
b32b8144
FG
12318 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12319 bucket_entry, bl, objv_tracker, pmtime, pattrs,
12320 cache_info, refresh_version);
7c673cae
FG
12321 if (ret < 0) {
12322 return ret;
12323 }
12324
12325 bufferlist::iterator iter = bl.begin();
12326 try {
12327 ::decode(entry_point, iter);
12328 } catch (buffer::error& err) {
12329 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12330 return -EIO;
12331 }
12332 return 0;
12333}
12334
12335int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
12336 const string& tenant_name,
12337 const string& bucket_name)
12338{
12339 RGWBucketEntryPoint entry_point;
12340 real_time ep_mtime;
12341 RGWObjVersionTracker ot;
12342 map<string, bufferlist> attrs;
12343 RGWBucketInfo info;
12344
12345 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
12346
12347 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
12348 if (ret < 0) {
12349 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
12350 return ret;
12351 }
12352
12353 if (!entry_point.has_bucket_info) {
12354 /* already converted! */
12355 return 0;
12356 }
12357
12358 info = entry_point.old_bucket_info;
12359 info.bucket.oid = bucket_name;
12360 info.ep_objv = ot.read_version;
12361
12362 ot.generate_new_write_ver(cct);
12363
12364 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
12365 if (ret < 0) {
12366 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
12367 return ret;
12368 }
12369
12370 return 0;
12371}
12372
b32b8144
FG
12373int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx,
12374 const string& tenant,
12375 const string& bucket_name,
12376 RGWBucketInfo& info,
12377 real_time *pmtime,
12378 map<string, bufferlist> *pattrs,
12379 boost::optional<obj_version> refresh_version)
7c673cae
FG
12380{
12381 bucket_info_entry e;
12382 string bucket_entry;
12383 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
12384
b32b8144 12385
7c673cae 12386 if (binfo_cache->find(bucket_entry, &e)) {
b32b8144
FG
12387 if (refresh_version &&
12388 e.info.objv_tracker.read_version.compare(&(*refresh_version))) {
12389 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
12390 << "a failure that should be debugged. I am a nice machine, "
12391 << "so I will try to recover." << dendl;
12392 binfo_cache->invalidate(bucket_entry);
12393 }
7c673cae
FG
12394 info = e.info;
12395 if (pattrs)
12396 *pattrs = e.attrs;
12397 if (pmtime)
12398 *pmtime = e.mtime;
12399 return 0;
12400 }
12401
12402 RGWBucketEntryPoint entry_point;
12403 real_time ep_mtime;
12404 RGWObjVersionTracker ot;
12405 rgw_cache_entry_info entry_cache_info;
b32b8144
FG
12406 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
12407 entry_point, &ot, &ep_mtime, pattrs,
12408 &entry_cache_info, refresh_version);
7c673cae
FG
12409 if (ret < 0) {
12410 /* only init these fields */
12411 info.bucket.tenant = tenant;
12412 info.bucket.name = bucket_name;
12413 return ret;
12414 }
12415
12416 if (entry_point.has_bucket_info) {
12417 info = entry_point.old_bucket_info;
12418 info.bucket.oid = bucket_name;
12419 info.bucket.tenant = tenant;
12420 info.ep_objv = ot.read_version;
12421 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
12422 return 0;
12423 }
12424
12425 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12426 * that we got
12427 */
12428 if (pattrs) {
12429 pattrs->clear();
12430 }
12431
12432 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
12433
12434
12435 /* read bucket instance info */
12436
12437 string oid;
12438 get_bucket_meta_oid(entry_point.bucket, oid);
12439
12440 rgw_cache_entry_info cache_info;
12441
b32b8144
FG
12442 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
12443 &cache_info, refresh_version);
7c673cae
FG
12444 e.info.ep_objv = ot.read_version;
12445 info = e.info;
12446 if (ret < 0) {
b32b8144 12447 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
7c673cae
FG
12448 info.bucket.tenant = tenant;
12449 info.bucket.name = bucket_name;
12450 // XXX and why return anything in case of an error anyway?
12451 return ret;
12452 }
12453
12454 if (pmtime)
12455 *pmtime = e.mtime;
12456 if (pattrs)
12457 *pattrs = e.attrs;
12458
12459 list<rgw_cache_entry_info *> cache_info_entries;
12460 cache_info_entries.push_back(&entry_cache_info);
12461 cache_info_entries.push_back(&cache_info);
12462
12463
12464 /* chain to both bucket entry point and bucket instance */
12465 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
12466 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
12467 }
12468
b32b8144
FG
12469 if (refresh_version &&
12470 refresh_version->compare(&info.objv_tracker.read_version)) {
12471 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
12472 << "have gone squirrelly. An administrator may have forced a "
12473 << "change; otherwise there is a problem somewhere." << dendl;
12474 }
12475
7c673cae
FG
12476 return 0;
12477}
12478
b32b8144
FG
12479int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
12480 const string& tenant, const string& bucket_name,
12481 RGWBucketInfo& info,
12482 real_time *pmtime, map<string, bufferlist> *pattrs)
12483{
12484 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
12485 pattrs, boost::none);
12486}
12487
12488int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
12489 ceph::real_time *pmtime,
12490 map<string, bufferlist> *pattrs)
12491{
12492 RGWObjectCtx obj_ctx(this);
12493
12494 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
12495 info, pmtime, pattrs, info.objv_tracker.read_version);
12496}
12497
7c673cae
FG
12498int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
12499 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
12500 map<string, bufferlist> *pattrs)
12501{
12502 bufferlist epbl;
12503 ::encode(entry_point, epbl);
12504 string bucket_entry;
12505 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12506 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12507}
12508
12509int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12510 real_time mtime, map<string, bufferlist> *pattrs)
12511{
12512 info.has_instance_obj = true;
12513 bufferlist bl;
12514
12515 ::encode(info, bl);
12516
12517 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12518 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12519 if (ret == -EEXIST) {
12520 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12521 * bucket operation on this specific bucket (e.g., being synced from the master), but
12522 * since bucket instace meta object is unique for this specific bucket instace, we don't
12523 * need to return an error.
12524 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12525 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12526 * locally, while in the sync thread we sync the new bucket.
12527 */
12528 ret = 0;
12529 }
12530 return ret;
12531}
12532
12533int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12534 map<string, bufferlist> *pattrs, bool create_entry_point)
12535{
12536 bool create_head = !info.has_instance_obj || create_entry_point;
12537
12538 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12539 if (ret < 0) {
12540 return ret;
12541 }
12542
12543 if (!create_head)
12544 return 0; /* done! */
12545
12546 RGWBucketEntryPoint entry_point;
12547 entry_point.bucket = info.bucket;
12548 entry_point.owner = info.owner;
12549 entry_point.creation_time = info.creation_time;
12550 entry_point.linked = true;
12551 RGWObjVersionTracker ot;
12552 if (pep_objv && !pep_objv->tag.empty()) {
12553 ot.write_version = *pep_objv;
12554 } else {
12555 ot.generate_new_write_ver(cct);
12556 if (pep_objv) {
12557 *pep_objv = ot.write_version;
12558 }
12559 }
12560 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12561 if (ret < 0)
12562 return ret;
12563
12564 return 0;
12565}
12566
12567int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12568{
12569 rgw_rados_ref ref;
12570 int r = get_raw_obj_ref(obj, &ref);
12571 if (r < 0) {
12572 return r;
12573 }
12574
12575 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12576 if (r < 0)
12577 return r;
12578
12579 return 0;
12580
12581}
12582
12583int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12584 std::map<string, bufferlist>& m)
12585{
12586 rgw_rados_ref ref;
12587 int r = get_raw_obj_ref(obj, &ref);
12588 if (r < 0) {
12589 return r;
12590 }
12591
12592#define MAX_OMAP_GET_ENTRIES 1024
12593 const int count = MAX_OMAP_GET_ENTRIES;
12594 string start_after;
12595
12596 while (true) {
12597 std::map<string, bufferlist> t;
12598 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12599 if (r < 0) {
12600 return r;
12601 }
12602 if (t.empty()) {
12603 break;
12604 }
12605 start_after = t.rbegin()->first;
12606 m.insert(t.begin(), t.end());
12607 }
12608 return 0;
12609}
12610
12611int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12612{
12613 rgw_rados_ref ref;
12614 int r = get_raw_obj_ref(obj, &ref);
12615 if (r < 0) {
12616 return r;
12617 }
12618 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12619
12620 map<string, bufferlist> m;
12621 m[key] = bl;
12622
12623 r = ref.ioctx.omap_set(ref.oid, m);
12624
12625 return r;
12626}
12627
12628int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12629{
12630 rgw_rados_ref ref;
12631 int r = get_raw_obj_ref(obj, &ref);
12632 if (r < 0) {
12633 return r;
12634 }
12635
12636 r = ref.ioctx.omap_set(ref.oid, m);
12637
12638 return r;
12639}
12640
12641int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12642{
12643 rgw_rados_ref ref;
12644 int r = get_raw_obj_ref(obj, &ref);
12645 if (r < 0) {
12646 return r;
12647 }
12648
12649 set<string> k;
12650 k.insert(key);
12651
12652 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12653 return r;
12654}
12655
12656int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12657{
12658 RGWObjectCtx obj_ctx(this);
12659
12660 map<string, RGWBucketEnt>::iterator iter;
12661 for (iter = m.begin(); iter != m.end(); ++iter) {
12662 RGWBucketEnt& ent = iter->second;
12663 rgw_bucket& bucket = ent.bucket;
12664 ent.count = 0;
12665 ent.size = 0;
12666 ent.size_rounded = 0;
12667
a8e16298 12668 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
12669
12670 RGWBucketInfo bucket_info;
12671 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12672 if (ret < 0) {
12673 return ret;
12674 }
12675
12676 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12677 if (r < 0)
12678 return r;
12679
a8e16298 12680 auto hiter = headers.begin();
7c673cae
FG
12681 for (; hiter != headers.end(); ++hiter) {
12682 RGWObjCategory category = main_category;
a8e16298
TL
12683 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->stats).find((uint8_t)category);
12684 if (iter != hiter->stats.end()) {
7c673cae
FG
12685 struct rgw_bucket_category_stats& stats = iter->second;
12686 ent.count += stats.num_entries;
12687 ent.size += stats.total_size;
12688 ent.size_rounded += stats.total_size_rounded;
12689 }
12690 }
3efd9988
FG
12691
12692 // fill in placement_rule from the bucket instance for use in swift's
12693 // per-storage policy statistics
12694 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
12695 }
12696
12697 return m.size();
12698}
12699
12700int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12701{
12702 rgw_rados_ref ref;
12703 int r = get_raw_obj_ref(obj, &ref);
12704 if (r < 0) {
12705 return r;
12706 }
12707 librados::Rados *rad = get_rados_handle();
12708 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12709
12710 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12711 completion->release();
12712 return r;
12713}
12714
12715int RGWRados::distribute(const string& key, bufferlist& bl)
12716{
12717 /*
12718 * we were called before watch was initialized. This can only happen if we're updating some system
12719 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12720 * objects, they're currently only read on startup anyway.
12721 */
12722 if (!watch_initialized)
12723 return 0;
12724
12725 string notify_oid;
12726 pick_control_oid(key, notify_oid);
12727
12728 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12729 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12730}
12731
12732int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12733{
12734 librados::IoCtx& io_ctx = ctx.io_ctx;
12735 librados::NObjectIterator& iter = ctx.iter;
12736
12737 int r = open_pool_ctx(pool, io_ctx);
12738 if (r < 0)
12739 return r;
12740
12741 iter = io_ctx.nobjects_begin();
12742
12743 return 0;
12744}
12745
181888fb
FG
12746int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12747{
12748 librados::IoCtx& io_ctx = ctx.io_ctx;
12749 librados::NObjectIterator& iter = ctx.iter;
12750
12751 int r = open_pool_ctx(pool, io_ctx);
12752 if (r < 0)
12753 return r;
12754
12755 librados::ObjectCursor oc;
12756 if (!oc.from_str(cursor)) {
12757 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12758 return -EINVAL;
12759 }
12760
f64942e4
AA
12761 try {
12762 iter = io_ctx.nobjects_begin(oc);
12763 return 0;
12764 } catch (const std::system_error& e) {
12765 r = -e.code().value();
12766 ldout(cct, 10) << "nobjects_begin threw " << e.what()
12767 << ", returning " << r << dendl;
12768 return r;
12769 } catch (const std::exception& e) {
12770 ldout(cct, 10) << "nobjects_begin threw " << e.what()
12771 << ", returning -5" << dendl;
12772 return -EIO;
12773 }
181888fb
FG
12774}
12775
12776string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12777{
12778 return ctx.iter.get_cursor().to_str();
12779}
12780
f64942e4
AA
12781static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
12782 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
12783 bool *is_truncated, RGWAccessListFilter *filter)
12784{
12785 librados::IoCtx& io_ctx = ctx.io_ctx;
12786 librados::NObjectIterator& iter = ctx.iter;
12787
12788 if (iter == io_ctx.nobjects_end())
12789 return -ENOENT;
12790
12791 uint32_t i;
12792
12793 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12794 rgw_bucket_dir_entry e;
12795
12796 string oid = iter->get_oid();
12797 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12798
12799 // fill it in with initial values; we may correct later
12800 if (filter && !filter->filter(oid, oid))
12801 continue;
12802
12803 e.key = oid;
12804 objs.push_back(e);
12805 }
12806
12807 if (is_truncated)
12808 *is_truncated = (iter != io_ctx.nobjects_end());
12809
12810 return objs.size();
12811}
12812struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12813 string prefix;
12814
12815 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12816 bool filter(string& name, string& key) override {
12817 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12818 }
12819};
12820
f64942e4
AA
12821int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12822 bool *is_truncated, RGWAccessListFilter *filter)
12823{
12824 // catch exceptions from NObjectIterator::operator++()
12825 try {
12826 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
12827 } catch (const std::system_error& e) {
12828 int r = -e.code().value();
12829 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
12830 << ", returning " << r << dendl;
12831 return r;
12832 } catch (const std::exception& e) {
12833 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
12834 << ", returning -5" << dendl;
12835 return -EIO;
12836 }
12837}
12838
181888fb 12839int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 12840{
181888fb
FG
12841 if (!ctx->initialized) {
12842 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
7c673cae
FG
12843 if (r < 0) {
12844 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12845 return r;
12846 }
181888fb 12847 ctx->initialized = true;
7c673cae 12848 }
181888fb
FG
12849 return 0;
12850}
7c673cae 12851
181888fb
FG
12852int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12853 RGWListRawObjsCtx& ctx, list<string>& oids,
12854 bool *is_truncated)
12855{
12856 if (!ctx.initialized) {
12857 return -EINVAL;
12858 }
12859 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae
FG
12860 vector<rgw_bucket_dir_entry> objs;
12861 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12862 if (r < 0) {
12863 if(r != -ENOENT)
12864 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12865 return r;
12866 }
12867
12868 vector<rgw_bucket_dir_entry>::iterator iter;
12869 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12870 oids.push_back(iter->key.name);
12871 }
12872
12873 return oids.size();
12874}
12875
181888fb
FG
12876int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12877 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12878 bool *is_truncated)
12879{
12880 if (!ctx.initialized) {
12881 int r = list_raw_objects_init(pool, string(), &ctx);
12882 if (r < 0) {
12883 return r;
12884 }
12885 }
12886
12887 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12888}
12889
12890string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12891{
12892 return pool_iterate_get_cursor(ctx.iter_ctx);
12893}
12894
7c673cae
FG
12895int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12896 std::list<rgw_bi_log_entry>& result, bool *truncated)
12897{
12898 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12899 result.clear();
12900
12901 librados::IoCtx index_ctx;
12902 map<int, string> oids;
12903 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12904 map<int, string> bucket_instance_ids;
12905 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12906 if (r < 0)
12907 return r;
12908
12909 BucketIndexShardsManager marker_mgr;
12910 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12911 // If there are multiple shards for the bucket index object, the marker
12912 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12913 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12914 // only contain one record, and the key is the bucket instance id.
12915 r = marker_mgr.from_string(marker, shard_id);
12916 if (r < 0)
12917 return r;
12918
12919 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12920 if (r < 0)
12921 return r;
12922
12923 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12924 map<int, list<rgw_bi_log_entry>::iterator> vends;
12925 if (truncated) {
12926 *truncated = false;
12927 }
12928 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12929 for (; miter != bi_log_lists.end(); ++miter) {
12930 int shard_id = miter->first;
12931 vcurrents[shard_id] = miter->second.entries.begin();
12932 vends[shard_id] = miter->second.entries.end();
12933 if (truncated) {
12934 *truncated = (*truncated || miter->second.truncated);
12935 }
12936 }
12937
12938 size_t total = 0;
12939 bool has_more = true;
12940 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12941 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12942 while (total < max && has_more) {
12943 has_more = false;
12944
12945 viter = vcurrents.begin();
12946 eiter = vends.begin();
12947
12948 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12949 assert (eiter != vends.end());
12950
12951 int shard_id = viter->first;
12952 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12953
12954 if (liter == eiter->second){
12955 continue;
12956 }
12957 rgw_bi_log_entry& entry = *(liter);
12958 if (has_shards) {
12959 char buf[16];
12960 snprintf(buf, sizeof(buf), "%d", shard_id);
12961 string tmp_id;
12962 build_bucket_index_marker(buf, entry.id, &tmp_id);
12963 entry.id.swap(tmp_id);
12964 }
12965 marker_mgr.add(shard_id, entry.id);
12966 result.push_back(entry);
12967 total++;
12968 has_more = true;
12969 ++liter;
12970 }
12971 }
12972
12973 if (truncated) {
12974 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12975 assert (eiter != vends.end());
12976 *truncated = (*truncated || (viter->second != eiter->second));
12977 }
12978 }
12979
12980 // Refresh marker, if there are multiple shards, the output will look like
12981 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12982 // if there is no sharding, the simply marker (without oid) is returned
12983 if (has_shards) {
12984 marker_mgr.to_string(&marker);
12985 } else {
12986 if (!result.empty()) {
12987 marker = result.rbegin()->id;
12988 }
12989 }
12990
12991 return 0;
12992}
12993
12994int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12995{
12996 librados::IoCtx index_ctx;
12997 map<int, string> bucket_objs;
31f18b77
FG
12998
12999 BucketIndexShardsManager start_marker_mgr;
13000 BucketIndexShardsManager end_marker_mgr;
13001
7c673cae 13002 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
31f18b77 13003 if (r < 0) {
7c673cae 13004 return r;
31f18b77 13005 }
7c673cae 13006
7c673cae 13007 r = start_marker_mgr.from_string(start_marker, shard_id);
31f18b77 13008 if (r < 0) {
7c673cae 13009 return r;
31f18b77
FG
13010 }
13011
7c673cae 13012 r = end_marker_mgr.from_string(end_marker, shard_id);
31f18b77 13013 if (r < 0) {
7c673cae 13014 return r;
31f18b77 13015 }
7c673cae
FG
13016
13017 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
31f18b77
FG
13018 cct->_conf->rgw_bucket_index_max_aio)();
13019
13020 return r;
7c673cae
FG
13021}
13022
c07f9fc5
FG
13023int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
13024{
13025 librados::IoCtx index_ctx;
13026 map<int, string> bucket_objs;
13027 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13028 if (r < 0)
13029 return r;
13030
13031 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
13032}
13033
13034int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
13035{
13036 librados::IoCtx index_ctx;
13037 map<int, string> bucket_objs;
13038 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13039 if (r < 0)
13040 return r;
13041
13042 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
13043}
13044
a8e16298
TL
13045int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
13046 rgw_bucket_dir_entry *dirent)
7c673cae 13047{
a8e16298
TL
13048 rgw_cls_bi_entry bi_entry;
13049 int r = bi_get(bucket_info, obj, InstanceIdx, &bi_entry);
13050 if (r < 0 && r != -ENOENT) {
13051 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
13052 }
7c673cae
FG
13053 if (r < 0) {
13054 return r;
13055 }
a8e16298
TL
13056 bufferlist::iterator iter = bi_entry.data.begin();
13057 try {
13058 ::decode(*dirent, iter);
13059 } catch (buffer::error& err) {
13060 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
13061 return -EIO;
13062 }
13063
13064 return 0;
13065}
7c673cae 13066
a8e16298
TL
13067int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
13068 rgw_bucket_olh_entry *olh)
13069{
7c673cae 13070 rgw_cls_bi_entry bi_entry;
a8e16298 13071 int r = bi_get(bucket_info, obj, OLHIdx, &bi_entry);
7c673cae
FG
13072 if (r < 0 && r != -ENOENT) {
13073 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
13074 }
13075 if (r < 0) {
13076 return r;
13077 }
a8e16298 13078 auto iter = bi_entry.data.begin();
7c673cae 13079 try {
a8e16298 13080 decode(*olh, iter);
7c673cae
FG
13081 } catch (buffer::error& err) {
13082 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
13083 return -EIO;
13084 }
13085
13086 return 0;
13087}
13088
a8e16298
TL
13089int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
13090 BIIndexType index_type, rgw_cls_bi_entry *entry)
7c673cae
FG
13091{
13092 BucketShard bs(this);
a8e16298 13093 int ret = bs.init(bucket_info, obj);
7c673cae
FG
13094 if (ret < 0) {
13095 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
13096 return ret;
13097 }
13098
13099 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
13100
a8e16298 13101 return cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
7c673cae
FG
13102}
13103
13104void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
13105{
13106 cls_rgw_bi_put(op, bs.bucket_obj, entry);
13107}
13108
13109int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
13110{
13111 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
13112 if (ret < 0)
13113 return ret;
13114
13115 return 0;
13116}
13117
13118int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
13119{
13120 BucketShard bs(this);
f64942e4 13121 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
13122 if (ret < 0) {
13123 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
13124 return ret;
13125 }
13126
13127 return bi_put(bs, entry);
13128}
13129
13130int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
13131{
13132 rgw_obj obj(bucket, obj_name);
13133 BucketShard bs(this);
f64942e4 13134 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
13135 if (ret < 0) {
13136 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
13137 return ret;
13138 }
13139
13140 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
31f18b77
FG
13141 if (ret == -ENOENT) {
13142 *is_truncated = false;
13143 }
7c673cae
FG
13144 if (ret < 0)
13145 return ret;
13146
13147 return 0;
13148}
13149
13150int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
13151{
13152 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
13153 if (ret < 0)
13154 return ret;
13155
13156 return 0;
13157}
13158
13159int RGWRados::bi_remove(BucketShard& bs)
13160{
13161 int ret = bs.index_ctx.remove(bs.bucket_obj);
13162 if (ret == -ENOENT) {
13163 ret = 0;
13164 }
13165 if (ret < 0) {
13166 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
13167 return ret;
13168 }
13169
13170 return 0;
13171}
13172
13173int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
13174{
13175 BucketShard bs(this);
f64942e4 13176 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
7c673cae
FG
13177 if (ret < 0) {
13178 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
13179 return ret;
13180 }
13181
13182 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
13183}
13184
13185int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
13186{
13187 return gc_pool_ctx.operate(oid, op);
13188}
13189
13190int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
13191{
13192 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13193 int r = gc_pool_ctx.aio_operate(oid, c, op);
13194 c->release();
13195 return r;
13196}
13197
13198int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
13199{
13200 return gc_pool_ctx.operate(oid, op, pbl);
13201}
13202
13203int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
13204{
13205 return gc->list(index, marker, max, expired_only, result, truncated);
13206}
13207
13208int RGWRados::process_gc()
13209{
13210 return gc->process();
13211}
13212
13213int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
13214{
13215 return lc->list_lc_progress(marker, max_entries, progress_map);
13216}
13217
13218int RGWRados::process_lc()
13219{
13220 return lc->process();
13221}
13222
1adf2230 13223bool RGWRados::process_expire_objects()
7c673cae 13224{
1adf2230 13225 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
7c673cae
FG
13226}
13227
7c673cae 13228int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
31f18b77 13229 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 13230{
31f18b77
FG
13231 rgw_zone_set zones_trace;
13232 if (_zones_trace) {
13233 zones_trace = *_zones_trace;
13234 }
1adf2230
AA
13235 zones_trace.insert(get_zone().id);
13236
7c673cae
FG
13237 ObjectWriteOperation o;
13238 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77
FG
13239 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13240 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
7c673cae
FG
13241 return bs.index_ctx.operate(bs.bucket_obj, &o);
13242}
13243
31f18b77 13244int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
13245 int64_t pool, uint64_t epoch,
13246 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 13247 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 13248{
7c673cae
FG
13249 ObjectWriteOperation o;
13250 rgw_bucket_dir_entry_meta dir_meta;
13251 dir_meta = ent.meta;
13252 dir_meta.category = category;
13253
1adf2230
AA
13254 rgw_zone_set zones_trace;
13255 if (_zones_trace) {
13256 zones_trace = *_zones_trace;
13257 }
13258 zones_trace.insert(get_zone().id);
13259
7c673cae
FG
13260 rgw_bucket_entry_ver ver;
13261 ver.pool = pool;
13262 ver.epoch = epoch;
13263 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
13264 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13265 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
1adf2230 13266 get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
13267 complete_op_data *arg;
13268 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
1adf2230 13269 get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77
FG
13270 librados::AioCompletion *completion = arg->rados_completion;
13271 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
13272 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
13273 return ret;
13274}
13275
31f18b77 13276int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
13277 int64_t pool, uint64_t epoch,
13278 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 13279 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 13280{
31f18b77 13281 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
13282}
13283
13284int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
13285 int64_t pool, uint64_t epoch,
13286 rgw_obj& obj,
13287 real_time& removed_mtime,
13288 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
13289 uint16_t bilog_flags,
13290 rgw_zone_set *zones_trace)
7c673cae
FG
13291{
13292 rgw_bucket_dir_entry ent;
13293 ent.meta.mtime = removed_mtime;
13294 obj.key.get_index_key(&ent.key);
31f18b77 13295 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
13296}
13297
31f18b77 13298int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
13299{
13300 rgw_bucket_dir_entry ent;
13301 obj.key.get_index_key(&ent.key);
31f18b77 13302 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
7c673cae
FG
13303}
13304
13305int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
13306{
13307 librados::IoCtx index_ctx;
13308 map<int, string> bucket_objs;
13309 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
13310 if (r < 0)
13311 return r;
13312
13313 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
13314}
13315
1adf2230
AA
13316
13317int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
13318 int shard_id,
13319 rgw_obj_index_key& start,
13320 const string& prefix,
13321 uint32_t num_entries,
13322 bool list_versions,
13323 map<string, rgw_bucket_dir_entry>& m,
13324 bool *is_truncated,
13325 rgw_obj_index_key *last_entry,
13326 bool (*force_check_filter)(const string& name))
7c673cae 13327{
1adf2230
AA
13328 ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
13329 " start " << start.name << "[" << start.instance << "] num_entries " <<
13330 num_entries << dendl;
7c673cae
FG
13331
13332 librados::IoCtx index_ctx;
13333 // key - oid (for different shards if there is any)
1adf2230
AA
13334 // value - list result for the corresponding oid (shard), it is filled by
13335 // the AIO callback
7c673cae
FG
13336 map<int, string> oids;
13337 map<int, struct rgw_cls_list_ret> list_results;
13338 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13339 if (r < 0)
13340 return r;
13341
13342 cls_rgw_obj_key start_key(start.name, start.instance);
1adf2230
AA
13343 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
13344 list_versions, oids, list_results,
13345 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
13346 if (r < 0)
13347 return r;
13348
13349 // Create a list of iterators that are used to iterate each shard
13350 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
13351 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
13352 vector<string> vnames(list_results.size());
13353 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13354 *is_truncated = false;
13355 for (; iter != list_results.end(); ++iter) {
13356 vcurrents.push_back(iter->second.dir.m.begin());
13357 vends.push_back(iter->second.dir.m.end());
13358 vnames.push_back(oids[iter->first]);
13359 *is_truncated = (*is_truncated || iter->second.is_truncated);
13360 }
13361
13362 // Create a map to track the next candidate entry from each shard, if the entry
13363 // from a specified shard is selected/erased, the next entry from that shard will
13364 // be inserted for next round selection
13365 map<string, size_t> candidates;
13366 for (size_t i = 0; i < vcurrents.size(); ++i) {
13367 if (vcurrents[i] != vends[i]) {
13368 candidates[vcurrents[i]->first] = i;
13369 }
13370 }
13371
13372 map<string, bufferlist> updates;
13373 uint32_t count = 0;
13374 while (count < num_entries && !candidates.empty()) {
13375 r = 0;
13376 // Select the next one
13377 int pos = candidates.begin()->second;
13378 const string& name = vcurrents[pos]->first;
13379 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
13380
3efd9988
FG
13381 bool force_check = force_check_filter &&
13382 force_check_filter(dirent.key.name);
13383 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13384 !dirent.pending_map.empty() ||
13385 force_check) {
7c673cae
FG
13386 /* there are uncommitted ops. We need to check the current state,
13387 * and if the tags are old we need to do cleanup as well. */
13388 librados::IoCtx sub_ctx;
13389 sub_ctx.dup(index_ctx);
1adf2230
AA
13390 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
13391 updates[vnames[pos]]);
7c673cae
FG
13392 if (r < 0 && r != -ENOENT) {
13393 return r;
13394 }
13395 }
13396 if (r >= 0) {
1adf2230
AA
13397 ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
13398 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
7c673cae
FG
13399 m[name] = std::move(dirent);
13400 ++count;
13401 }
13402
13403 // Refresh the candidates map
13404 candidates.erase(candidates.begin());
13405 ++vcurrents[pos];
13406 if (vcurrents[pos] != vends[pos]) {
13407 candidates[vcurrents[pos]->first] = pos;
13408 }
13409 }
13410
13411 // Suggest updates if there is any
13412 map<string, bufferlist>::iterator miter = updates.begin();
13413 for (; miter != updates.end(); ++miter) {
13414 if (miter->second.length()) {
13415 ObjectWriteOperation o;
13416 cls_rgw_suggest_changes(o, miter->second);
13417 // we don't care if we lose suggested updates, send them off blindly
13418 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13419 index_ctx.aio_operate(miter->first, c, &o);
1adf2230 13420 c->release();
7c673cae
FG
13421 }
13422 }
13423
13424 // Check if all the returned entries are consumed or not
13425 for (size_t i = 0; i < vcurrents.size(); ++i) {
1adf2230 13426 if (vcurrents[i] != vends[i]) {
7c673cae 13427 *is_truncated = true;
1adf2230
AA
13428 break;
13429 }
7c673cae
FG
13430 }
13431 if (!m.empty())
13432 *last_entry = m.rbegin()->first;
13433
13434 return 0;
13435}
13436
1adf2230
AA
13437
13438int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
13439 int shard_id,
13440 rgw_obj_index_key& start,
13441 const string& prefix,
13442 uint32_t num_entries,
13443 bool list_versions,
13444 std::vector<rgw_bucket_dir_entry>& ent_list,
13445 bool *is_truncated,
13446 rgw_obj_index_key *last_entry,
13447 bool (*force_check_filter)(const string& name)) {
13448 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
13449 " start " << start.name << "[" << start.instance <<
13450 "] num_entries " << num_entries << dendl;
13451
13452 *is_truncated = false;
13453 librados::IoCtx index_ctx;
13454
13455 rgw_obj_index_key my_start = start;
13456
13457 map<int, string> oids;
13458 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13459 if (r < 0)
13460 return r;
13461 const uint32_t num_shards = oids.size();
13462
13463 uint32_t current_shard;
13464 if (shard_id >= 0) {
13465 current_shard = shard_id;
13466 } else if (my_start.empty()) {
13467 current_shard = 0u;
13468 } else {
13469 current_shard =
13470 rgw_bucket_shard_index(my_start.name, num_shards);
13471 }
13472
13473 uint32_t count = 0u;
13474 map<string, bufferlist> updates;
13475 std::string last_added_entry;
13476 while (count <= num_entries &&
13477 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
13478 current_shard < num_shards)) {
13479 // key - oid (for different shards if there is any)
13480 // value - list result for the corresponding oid (shard), it is filled by
13481 // the AIO callback
13482 map<int, struct rgw_cls_list_ret> list_results;
13483 r = CLSRGWIssueBucketList(index_ctx, my_start, prefix, num_entries,
13484 list_versions, oids, list_results,
13485 cct->_conf->rgw_bucket_index_max_aio)();
13486 if (r < 0)
13487 return r;
13488
13489 const std::string& oid = oids[current_shard];
13490 assert(list_results.find(current_shard) != list_results.end());
13491 auto& result = list_results[current_shard];
13492 for (auto& entry : result.dir.m) {
13493 rgw_bucket_dir_entry& dirent = entry.second;
13494
13495 bool force_check = force_check_filter &&
13496 force_check_filter(dirent.key.name);
13497 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13498 !dirent.pending_map.empty() ||
13499 force_check) {
13500 /* there are uncommitted ops. We need to check the current state,
13501 * and if the tags are old we need to do cleanup as well. */
13502 librados::IoCtx sub_ctx;
13503 sub_ctx.dup(index_ctx);
13504 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
13505 if (r < 0 && r != -ENOENT) {
13506 return r;
13507 }
13508 }
13509
13510 // at this point either r >=0 or r == -ENOENT
13511 if (r >= 0) { // i.e., if r != -ENOENT
13512 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
13513 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
13514
13515 if (count < num_entries) {
13516 last_added_entry = entry.first;
13517 my_start = dirent.key;
13518 ent_list.emplace_back(std::move(dirent));
13519 ++count;
13520 } else {
13521 *is_truncated = true;
13522 goto check_updates;
13523 }
13524 } else { // r == -ENOENT
13525 // in the case of -ENOENT, make sure we're advancing marker
13526 // for possible next call to CLSRGWIssueBucketList
13527 my_start = dirent.key;
13528 }
13529 } // entry for loop
13530
13531 if (!result.is_truncated) {
13532 // if we reached the end of the shard read next shard
13533 ++current_shard;
13534 my_start = rgw_obj_index_key();
13535 }
13536 } // shard loop
13537
13538check_updates:
13539 // suggest updates if there is any
13540 map<string, bufferlist>::iterator miter = updates.begin();
13541 for (; miter != updates.end(); ++miter) {
13542 if (miter->second.length()) {
13543 ObjectWriteOperation o;
13544 cls_rgw_suggest_changes(o, miter->second);
13545 // we don't care if we lose suggested updates, send them off blindly
13546 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13547 index_ctx.aio_operate(miter->first, c, &o);
13548 c->release();
13549 }
13550 }
13551
13552 if (last_entry && !ent_list.empty()) {
13553 *last_entry = last_added_entry;
13554 }
13555
13556 return 0;
13557}
13558
13559
13560int RGWRados::cls_obj_usage_log_add(const string& oid,
13561 rgw_usage_log_info& info)
7c673cae
FG
13562{
13563 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13564
13565 rgw_rados_ref ref;
224ce89b 13566 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13567 if (r < 0) {
13568 return r;
13569 }
13570
13571 ObjectWriteOperation op;
13572 cls_rgw_usage_log_add(op, info);
13573
13574 r = ref.ioctx.operate(ref.oid, &op);
13575 return r;
13576}
13577
13578int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
13579 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
13580{
13581 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13582
13583 rgw_rados_ref ref;
224ce89b 13584 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13585 if (r < 0) {
13586 return r;
13587 }
13588
13589 *is_truncated = false;
13590
13591 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
13592 max_entries, read_iter, usage, is_truncated);
13593
13594 return r;
13595}
13596
13597int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
13598{
13599 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13600
13601 rgw_rados_ref ref;
224ce89b 13602 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13603 if (r < 0) {
13604 return r;
13605 }
13606
b32b8144 13607 r = cls_rgw_usage_log_trim(ref.ioctx, ref.oid, user, start_epoch, end_epoch);
7c673cae
FG
13608 return r;
13609}
13610
13611int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
13612{
13613 librados::IoCtx index_ctx;
13614 string dir_oid;
13615
13616 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13617
13618 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
13619 if (r < 0)
13620 return r;
13621
13622 bufferlist updates;
13623
13624 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
13625 rgw_bucket_dir_entry entry;
13626 entry.key = *iter;
13627 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
13628 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13629 updates.append(CEPH_RGW_REMOVE | suggest_flag);
13630 ::encode(entry, updates);
13631 }
13632
13633 bufferlist out;
13634
13635 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
13636
13637 return r;
13638}
13639
13640int RGWRados::check_disk_state(librados::IoCtx io_ctx,
13641 const RGWBucketInfo& bucket_info,
13642 rgw_bucket_dir_entry& list_state,
13643 rgw_bucket_dir_entry& object,
13644 bufferlist& suggested_updates)
13645{
13646 const rgw_bucket& bucket = bucket_info.bucket;
13647 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13648
13649 std::string loc;
13650
13651 rgw_obj obj(bucket, list_state.key);
13652
13653 string oid;
13654 get_obj_bucket_and_oid_loc(obj, oid, loc);
13655
13656 if (loc != list_state.locator) {
13657 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
13658 }
13659
13660 io_ctx.locator_set_key(list_state.locator);
13661
13662 RGWObjState *astate = NULL;
13663 RGWObjectCtx rctx(this);
13664 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
13665 if (r < 0)
13666 return r;
13667
13668 list_state.pending_map.clear(); // we don't need this and it inflates size
13669 if (!astate->exists) {
13670 /* object doesn't exist right now -- hopefully because it's
13671 * marked as !exists and got deleted */
13672 if (list_state.exists) {
13673 /* FIXME: what should happen now? Work out if there are any
13674 * non-bad ways this could happen (there probably are, but annoying
13675 * to handle!) */
13676 }
13677 // encode a suggested removal of that key
13678 list_state.ver.epoch = io_ctx.get_last_version();
13679 list_state.ver.pool = io_ctx.get_id();
13680 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
13681 return -ENOENT;
13682 }
13683
13684 string etag;
13685 string content_type;
13686 ACLOwner owner;
13687
13688 object.meta.size = astate->size;
13689 object.meta.accounted_size = astate->accounted_size;
13690 object.meta.mtime = astate->mtime;
13691
13692 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
13693 if (iter != astate->attrset.end()) {
13694 etag = iter->second.c_str();
13695 }
13696 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
13697 if (iter != astate->attrset.end()) {
13698 content_type = iter->second.c_str();
13699 }
13700 iter = astate->attrset.find(RGW_ATTR_ACL);
13701 if (iter != astate->attrset.end()) {
13702 r = decode_policy(iter->second, &owner);
13703 if (r < 0) {
13704 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
13705 }
13706 }
13707
13708 if (astate->has_manifest) {
13709 RGWObjManifest::obj_iterator miter;
13710 RGWObjManifest& manifest = astate->manifest;
13711 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
13712 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
13713 rgw_obj loc;
13714 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
13715
13716 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
13717 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
13718 r = delete_obj_index(loc);
13719 if (r < 0) {
13720 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
13721 }
13722 }
13723 }
13724 }
13725
13726 object.meta.etag = etag;
13727 object.meta.content_type = content_type;
13728 object.meta.owner = owner.get_id().to_str();
13729 object.meta.owner_display_name = owner.get_display_name();
13730
13731 // encode suggested updates
13732 list_state.ver.pool = io_ctx.get_id();
13733 list_state.ver.epoch = astate->epoch;
13734 list_state.meta.size = object.meta.size;
13735 list_state.meta.accounted_size = object.meta.accounted_size;
13736 list_state.meta.mtime = object.meta.mtime;
13737 list_state.meta.category = main_category;
13738 list_state.meta.etag = etag;
13739 list_state.meta.content_type = content_type;
13740 if (astate->obj_tag.length() > 0)
13741 list_state.tag = astate->obj_tag.c_str();
13742 list_state.meta.owner = owner.get_id().to_str();
13743 list_state.meta.owner_display_name = owner.get_display_name();
13744
13745 list_state.exists = true;
13746 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
13747 return 0;
13748}
13749
a8e16298 13750int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
7c673cae
FG
13751{
13752 librados::IoCtx index_ctx;
13753 map<int, string> oids;
13754 map<int, struct rgw_cls_list_ret> list_results;
13755 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13756 if (r < 0)
13757 return r;
13758
13759 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13760 if (r < 0)
13761 return r;
13762
13763 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13764 for(; iter != list_results.end(); ++iter) {
a8e16298 13765 headers.push_back(std::move(iter->second.dir.header));
7c673cae
FG
13766 }
13767 return 0;
13768}
13769
13770int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13771{
13772 librados::IoCtx index_ctx;
13773 map<int, string> bucket_objs;
13774 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13775 if (r < 0)
13776 return r;
13777
13778 map<int, string>::iterator iter = bucket_objs.begin();
13779 for (; iter != bucket_objs.end(); ++iter) {
13780 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13781 if (r < 0) {
13782 ctx->put();
13783 break;
13784 } else {
13785 (*num_aio)++;
13786 }
13787 }
13788 return r;
13789}
13790
13791int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13792{
13793 string buckets_obj_id;
13794 rgw_get_buckets_obj(user_id, buckets_obj_id);
13795 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13796
13797 rgw_rados_ref ref;
224ce89b 13798 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13799 if (r < 0) {
13800 return r;
13801 }
13802
13803 librados::ObjectReadOperation op;
13804 int rc;
13805 ::cls_user_get_header(op, header, &rc);
13806 bufferlist ibl;
13807 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13808 if (r < 0)
13809 return r;
13810 if (rc < 0)
13811 return rc;
13812
13813 return 0;
13814}
13815
94b18763
FG
13816int RGWRados::cls_user_reset_stats(const string& user_id)
13817{
13818 string buckets_obj_id;
13819 rgw_get_buckets_obj(user_id, buckets_obj_id);
13820 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13821
13822 rgw_rados_ref ref;
13823 int r = get_raw_obj_ref(obj, &ref);
13824 if (r < 0) {
13825 return r;
13826 }
13827
13828 librados::ObjectWriteOperation op;
13829 ::cls_user_reset_stats(op);
13830 return ref.ioctx.operate(ref.oid, &op);
13831}
13832
7c673cae
FG
13833int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13834{
13835 string buckets_obj_id;
13836 rgw_get_buckets_obj(user_id, buckets_obj_id);
13837 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13838
13839 rgw_rados_ref ref;
224ce89b 13840 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13841 if (r < 0) {
13842 return r;
13843 }
13844
13845 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13846 if (r < 0)
13847 return r;
13848
13849 return 0;
13850}
13851
13852int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13853{
a8e16298 13854 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
13855 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13856 if (r < 0) {
13857 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13858 return r;
13859 }
13860
13861 cls_user_bucket_entry entry;
13862
13863 bucket_info.bucket.convert(&entry.bucket);
13864
c07f9fc5 13865 for (const auto& hiter : headers) {
a8e16298 13866 for (const auto& iter : hiter.stats) {
c07f9fc5 13867 const struct rgw_bucket_category_stats& header_stats = iter.second;
7c673cae
FG
13868 entry.size += header_stats.total_size;
13869 entry.size_rounded += header_stats.total_size_rounded;
13870 entry.count += header_stats.num_entries;
13871 }
13872 }
13873
13874 list<cls_user_bucket_entry> entries;
13875 entries.push_back(entry);
13876
13877 r = cls_user_update_buckets(user_obj, entries, false);
13878 if (r < 0) {
13879 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13880 return r;
13881 }
13882
13883 return 0;
13884}
13885
c07f9fc5
FG
13886int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13887{
a8e16298 13888 vector<rgw_bucket_dir_header> headers;
c07f9fc5
FG
13889 RGWBucketInfo bucket_info;
13890 RGWObjectCtx obj_ctx(this);
13891 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13892 if (ret < 0) {
13893 return ret;
13894 }
13895
13896 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13897 if (ret < 0) {
13898 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13899 return ret;
13900 }
13901
13902 bucket.convert(&entry.bucket);
13903
13904 for (const auto& hiter : headers) {
a8e16298 13905 for (const auto& iter : hiter.stats) {
c07f9fc5
FG
13906 const struct rgw_bucket_category_stats& header_stats = iter.second;
13907 entry.size += header_stats.total_size;
13908 entry.size_rounded += header_stats.total_size_rounded;
13909 entry.count += header_stats.num_entries;
13910 }
13911 }
13912
13913 return 0;
13914}
13915
7c673cae
FG
13916int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13917 const string& in_marker,
13918 const string& end_marker,
13919 const int max_entries,
13920 list<cls_user_bucket_entry>& entries,
13921 string * const out_marker,
13922 bool * const truncated)
13923{
13924 rgw_rados_ref ref;
224ce89b 13925 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13926 if (r < 0) {
13927 return r;
13928 }
13929
13930 librados::ObjectReadOperation op;
13931 int rc;
13932
13933 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13934 bufferlist ibl;
13935 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13936 if (r < 0)
13937 return r;
13938 if (rc < 0)
13939 return rc;
13940
13941 return 0;
13942}
13943
13944int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13945{
13946 rgw_rados_ref ref;
224ce89b 13947 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13948 if (r < 0) {
13949 return r;
13950 }
13951
13952 librados::ObjectWriteOperation op;
13953 cls_user_set_buckets(op, entries, add);
13954 r = ref.ioctx.operate(ref.oid, &op);
13955 if (r < 0)
13956 return r;
13957
13958 return 0;
13959}
13960
13961int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13962{
13963 string buckets_obj_id;
13964 rgw_get_buckets_obj(user_id, buckets_obj_id);
13965 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13966 return cls_user_complete_stats_sync(obj);
13967}
13968
13969int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13970{
13971 rgw_rados_ref ref;
224ce89b 13972 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13973 if (r < 0) {
13974 return r;
13975 }
13976
13977 librados::ObjectWriteOperation op;
13978 ::cls_user_complete_stats_sync(op);
13979 r = ref.ioctx.operate(ref.oid, &op);
13980 if (r < 0)
13981 return r;
13982
13983 return 0;
13984}
13985
13986int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13987{
13988 list<cls_user_bucket_entry> l;
13989 l.push_back(entry);
13990
13991 return cls_user_update_buckets(obj, l, true);
13992}
13993
13994int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13995{
7c673cae 13996 rgw_rados_ref ref;
224ce89b 13997 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
13998 if (r < 0) {
13999 return r;
14000 }
14001
14002 librados::ObjectWriteOperation op;
14003 ::cls_user_remove_bucket(op, bucket);
14004 r = ref.ioctx.operate(ref.oid, &op);
14005 if (r < 0)
14006 return r;
14007
14008 return 0;
14009}
14010
224ce89b 14011int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
31f18b77
FG
14012 RGWQuotaInfo& bucket_quota)
14013{
14014 if (!cct->_conf->rgw_dynamic_resharding) {
14015 return 0;
14016 }
14017
14018 bool need_resharding = false;
14019 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
14020 uint32_t suggested_num_shards;
14021
14022 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
14023 num_source_shards, bucket_info.owner, bucket, bucket_quota,
14024 1, need_resharding, &suggested_num_shards);
14025 if (ret < 0) {
14026 return ret;
14027 }
14028
14029 if (need_resharding) {
224ce89b
WB
14030 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
14031 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
14032 dendl;
31f18b77
FG
14033 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
14034 }
14035
14036 return ret;
14037}
14038
14039int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
14040{
14041 RGWReshard reshard(this);
14042
14043 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
14044
14045 new_num_shards = min(new_num_shards, get_max_bucket_shards());
14046 if (new_num_shards <= num_source_shards) {
14047 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
14048 return 0;
14049 }
14050
14051 cls_rgw_reshard_entry entry;
14052 entry.time = real_clock::now();
14053 entry.tenant = bucket_info.owner.tenant;
14054 entry.bucket_name = bucket_info.bucket.name;
14055 entry.bucket_id = bucket_info.bucket.bucket_id;
14056 entry.old_num_shards = num_source_shards;
14057 entry.new_num_shards = new_num_shards;
14058
14059 return reshard.add(entry);
14060}
14061
7c673cae
FG
14062int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
14063 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
14064{
14065 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
14066}
14067
14068void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
1adf2230
AA
14069 uint32_t num_shards,
14070 map<int, string>& bucket_objects,
14071 int shard_id) {
7c673cae
FG
14072 if (!num_shards) {
14073 bucket_objects[0] = bucket_oid_base;
14074 } else {
14075 char buf[bucket_oid_base.size() + 32];
14076 if (shard_id < 0) {
14077 for (uint32_t i = 0; i < num_shards; ++i) {
14078 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
14079 bucket_objects[i] = buf;
14080 }
14081 } else {
14082 if ((uint32_t)shard_id > num_shards) {
14083 return;
14084 }
14085 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
14086 bucket_objects[shard_id] = buf;
14087 }
14088 }
14089}
14090
14091void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
14092{
14093 const rgw_bucket& bucket = bucket_info.bucket;
14094 string plain_id = bucket.name + ":" + bucket.bucket_id;
14095 if (!bucket_info.num_shards) {
14096 (*result)[0] = plain_id;
14097 } else {
14098 char buf[16];
14099 if (shard_id < 0) {
14100 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
14101 snprintf(buf, sizeof(buf), ":%d", i);
14102 (*result)[i] = plain_id + buf;
14103 }
14104 } else {
14105 if ((uint32_t)shard_id > bucket_info.num_shards) {
14106 return;
14107 }
14108 snprintf(buf, sizeof(buf), ":%d", shard_id);
14109 (*result)[shard_id] = plain_id + buf;
14110 }
14111 }
14112}
14113
14114int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
14115 int *shard_id)
14116{
14117 int r = 0;
14118 switch (bucket_info.bucket_index_shard_hash_type) {
14119 case RGWBucketInfo::MOD:
14120 if (!bucket_info.num_shards) {
14121 if (shard_id) {
14122 *shard_id = -1;
14123 }
14124 } else {
1adf2230 14125 uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
7c673cae
FG
14126 if (shard_id) {
14127 *shard_id = (int)sid;
14128 }
14129 }
14130 break;
14131 default:
14132 r = -ENOTSUP;
14133 }
14134 return r;
14135}
14136
14137void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
14138 int shard_id, string *bucket_obj)
14139{
14140 if (!num_shards) {
14141 // By default with no sharding, we use the bucket oid as itself
14142 (*bucket_obj) = bucket_oid_base;
14143 } else {
14144 char buf[bucket_oid_base.size() + 32];
14145 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
14146 (*bucket_obj) = buf;
14147 }
14148}
14149
14150int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
14151 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
14152{
14153 int r = 0;
14154 switch (hash_type) {
14155 case RGWBucketInfo::MOD:
14156 if (!num_shards) {
14157 // By default with no sharding, we use the bucket oid as itself
14158 (*bucket_obj) = bucket_oid_base;
14159 if (shard_id) {
14160 *shard_id = -1;
14161 }
14162 } else {
1adf2230 14163 uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
7c673cae
FG
14164 char buf[bucket_oid_base.size() + 32];
14165 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
14166 (*bucket_obj) = buf;
14167 if (shard_id) {
14168 *shard_id = (int)sid;
14169 }
14170 }
14171 break;
14172 default:
14173 r = -ENOTSUP;
14174 }
14175 return r;
14176}
14177
14178void RGWStateLog::oid_str(int shard, string& oid) {
14179 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
14180 char buf[16];
14181 snprintf(buf, sizeof(buf), "%d", shard);
14182 oid += buf;
14183}
14184
14185int RGWStateLog::get_shard_num(const string& object) {
14186 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
14187 return val % num_shards;
14188}
14189
14190string RGWStateLog::get_oid(const string& object) {
14191 int shard = get_shard_num(object);
14192 string oid;
14193 oid_str(shard, oid);
14194 return oid;
14195}
14196
14197int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
14198 rgw_pool pool;
14199 store->get_log_pool(pool);
14200 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
14201 if (r < 0) {
14202 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
14203 return r;
14204 }
14205 return 0;
14206}
14207
14208int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
14209 uint32_t state, bufferlist *bl, uint32_t *check_state)
14210{
14211 if (client_id.empty() ||
14212 op_id.empty() ||
14213 object.empty()) {
14214 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
14215 }
14216
14217 librados::IoCtx ioctx;
14218 int r = open_ioctx(ioctx);
14219 if (r < 0)
14220 return r;
14221
14222 string oid = get_oid(object);
14223
14224 librados::ObjectWriteOperation op;
14225 if (check_state) {
14226 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
14227 }
14228 utime_t ts = ceph_clock_now();
14229 bufferlist nobl;
14230 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
14231 r = ioctx.operate(oid, &op);
14232 if (r < 0) {
14233 return r;
14234 }
14235
14236 return 0;
14237}
14238
14239int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
14240{
14241 if (client_id.empty() ||
14242 op_id.empty() ||
14243 object.empty()) {
14244 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
14245 }
14246
14247 librados::IoCtx ioctx;
14248 int r = open_ioctx(ioctx);
14249 if (r < 0)
14250 return r;
14251
14252 string oid = get_oid(object);
14253
14254 librados::ObjectWriteOperation op;
14255 cls_statelog_remove_by_object(op, object, op_id);
14256 r = ioctx.operate(oid, &op);
14257 if (r < 0) {
14258 return r;
14259 }
14260
14261 return 0;
14262}
14263
14264void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
14265 void **handle)
14266{
14267 list_state *state = new list_state;
14268 state->client_id = client_id;
14269 state->op_id = op_id;
14270 state->object = object;
14271 if (object.empty()) {
14272 state->cur_shard = 0;
14273 state->max_shard = num_shards - 1;
14274 } else {
14275 state->cur_shard = state->max_shard = get_shard_num(object);
14276 }
14277 *handle = (void *)state;
14278}
14279
14280int RGWStateLog::list_entries(void *handle, int max_entries,
14281 list<cls_statelog_entry>& entries,
14282 bool *done)
14283{
14284 list_state *state = static_cast<list_state *>(handle);
14285
14286 librados::IoCtx ioctx;
14287 int r = open_ioctx(ioctx);
14288 if (r < 0)
14289 return r;
14290
14291 entries.clear();
14292
14293 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
14294 string oid;
14295 oid_str(state->cur_shard, oid);
14296
14297 librados::ObjectReadOperation op;
14298 list<cls_statelog_entry> ents;
14299 bool truncated;
14300 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
14301 max_entries, ents, &state->marker, &truncated);
14302 bufferlist ibl;
14303 r = ioctx.operate(oid, &op, &ibl);
14304 if (r == -ENOENT) {
14305 truncated = false;
14306 r = 0;
14307 }
14308 if (r < 0) {
14309 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
14310 return r;
14311 }
14312
14313 if (!truncated) {
14314 state->marker.clear();
14315 }
14316
14317 max_entries -= ents.size();
14318
14319 entries.splice(entries.end(), ents);
14320
14321 if (truncated)
14322 break;
14323 }
14324
14325 *done = (state->cur_shard > state->max_shard);
14326
14327 return 0;
14328}
14329
14330void RGWStateLog::finish_list_entries(void *handle)
14331{
14332 list_state *state = static_cast<list_state *>(handle);
14333 delete state;
14334}
14335
14336void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
14337{
14338 f->open_object_section("statelog_entry");
14339 f->dump_string("client_id", entry.client_id);
14340 f->dump_string("op_id", entry.op_id);
14341 f->dump_string("object", entry.object);
14342 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
14343 if (!dump_entry_internal(entry, f)) {
14344 f->dump_int("state", entry.state);
14345 }
14346 f->close_section();
14347}
14348
14349RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
14350{
14351}
14352
14353bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
14354{
14355 string s;
14356 switch ((OpState)entry.state) {
14357 case OPSTATE_UNKNOWN:
14358 s = "unknown";
14359 break;
14360 case OPSTATE_IN_PROGRESS:
14361 s = "in-progress";
14362 break;
14363 case OPSTATE_COMPLETE:
14364 s = "complete";
14365 break;
14366 case OPSTATE_ERROR:
14367 s = "error";
14368 break;
14369 case OPSTATE_ABORT:
14370 s = "abort";
14371 break;
14372 case OPSTATE_CANCELLED:
14373 s = "cancelled";
14374 break;
14375 default:
14376 s = "invalid";
14377 }
14378 f->dump_string("state", s);
14379 return true;
14380}
14381
14382int RGWOpState::state_from_str(const string& s, OpState *state)
14383{
14384 if (s == "unknown") {
14385 *state = OPSTATE_UNKNOWN;
14386 } else if (s == "in-progress") {
14387 *state = OPSTATE_IN_PROGRESS;
14388 } else if (s == "complete") {
14389 *state = OPSTATE_COMPLETE;
14390 } else if (s == "error") {
14391 *state = OPSTATE_ERROR;
14392 } else if (s == "abort") {
14393 *state = OPSTATE_ABORT;
14394 } else if (s == "cancelled") {
14395 *state = OPSTATE_CANCELLED;
14396 } else {
14397 return -EINVAL;
14398 }
14399
14400 return 0;
14401}
14402
14403int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
14404{
14405 uint32_t s = (uint32_t)state;
14406 return store_entry(client_id, op_id, object, s, NULL, NULL);
14407}
14408
14409int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
14410{
14411 uint32_t s = (uint32_t)state;
14412 return store_entry(client_id, op_id, object, s, NULL, &s);
14413}
14414
14415RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
14416 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
14417{
14418 cct = store->ctx();
14419 cur_state = RGWOpState::OPSTATE_UNKNOWN;
14420}
14421
14422int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
14423 last_update = real_clock::now();
14424 cur_state = state;
14425 return os.set_state(client_id, op_id, object, state);
14426}
14427
14428int RGWOpStateSingleOp::renew_state() {
14429 real_time now = real_clock::now();
14430
14431 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
14432
14433 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
14434 return 0;
14435 }
14436
14437 last_update = now;
14438 return os.renew_state(client_id, op_id, object, cur_state);
14439}
14440
14441
14442uint64_t RGWRados::instance_id()
14443{
14444 return get_rados_handle()->get_instance_id();
14445}
14446
14447uint64_t RGWRados::next_bucket_id()
14448{
14449 Mutex::Locker l(bucket_id_lock);
14450 return ++max_bucket_id;
14451}
14452
28e407b8
AA
14453RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
14454 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
7c673cae 14455{
7c673cae
FG
14456 RGWRados *store = NULL;
14457 if (!use_cache) {
14458 store = new RGWRados;
14459 } else {
28e407b8 14460 store = new RGWCache<RGWRados>;
7c673cae
FG
14461 }
14462
31f18b77 14463 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
7c673cae
FG
14464 delete store;
14465 return NULL;
14466 }
14467
14468 return store;
14469}
14470
14471RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
14472{
14473 RGWRados *store = NULL;
14474 store = new RGWRados;
14475
14476 store->set_context(cct);
14477
14478 if (store->init_rados() < 0) {
14479 delete store;
14480 return NULL;
14481 }
14482
14483 return store;
14484}
14485
14486void RGWStoreManager::close_storage(RGWRados *store)
14487{
14488 if (!store)
14489 return;
14490
14491 store->finalize();
14492
14493 delete store;
14494}
14495
14496librados::Rados* RGWRados::get_rados_handle()
14497{
14498 if (rados.size() == 1) {
14499 return &rados[0];
14500 } else {
14501 handle_lock.get_read();
14502 pthread_t id = pthread_self();
14503 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
14504
14505 if (it != rados_map.end()) {
14506 handle_lock.put_read();
14507 return &rados[it->second];
14508 } else {
14509 handle_lock.put_read();
14510 handle_lock.get_write();
14511 const uint32_t handle = next_rados_handle;
14512 rados_map[id] = handle;
14513 if (++next_rados_handle == rados.size()) {
14514 next_rados_handle = 0;
14515 }
14516 handle_lock.put_write();
14517 return &rados[handle];
14518 }
14519 }
14520}
14521
14522int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
14523{
14524 rgw_rados_ref ref;
14525 int ret = get_raw_obj_ref(obj, &ref);
14526 if (ret < 0) {
14527 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14528 return ret;
14529 }
14530
14531 ObjectWriteOperation op;
14532 list<string> prefixes;
14533 cls_rgw_remove_obj(op, prefixes);
14534
14535 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14536 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14537 if (ret < 0) {
14538 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14539 c->release();
14540 return ret;
14541 }
14542
14543 handles.push_back(c);
14544
14545 return 0;
14546}
14547
14548int RGWRados::delete_obj_aio(const rgw_obj& obj,
14549 RGWBucketInfo& bucket_info, RGWObjState *astate,
14550 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
14551{
14552 rgw_rados_ref ref;
14553 int ret = get_obj_head_ref(bucket_info, obj, &ref);
14554 if (ret < 0) {
14555 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14556 return ret;
14557 }
14558
14559 if (keep_index_consistent) {
14560 RGWRados::Bucket bop(this, bucket_info);
14561 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
14562
14563 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
14564 if (ret < 0) {
14565 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
14566 return ret;
14567 }
14568 }
14569
14570 ObjectWriteOperation op;
14571 list<string> prefixes;
14572 cls_rgw_remove_obj(op, prefixes);
14573
14574 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14575 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14576 if (ret < 0) {
14577 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14578 c->release();
14579 return ret;
14580 }
14581
14582 handles.push_back(c);
14583
14584 if (keep_index_consistent) {
14585 ret = delete_obj_index(obj);
14586 if (ret < 0) {
14587 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
14588 return ret;
14589 }
14590 }
14591 return ret;
14592}
14593
14594int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
14595 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
14596 if (value != attrs.end()) {
14597 bufferlist::iterator bliter = value->second.begin();
14598 try {
14599 ::decode(cs_info, bliter);
14600 } catch (buffer::error& err) {
14601 return -EIO;
14602 }
14603 if (cs_info.blocks.size() == 0) {
14604 return -EIO;
14605 }
14606 if (cs_info.compression_type != "none")
14607 need_decompress = true;
14608 else
14609 need_decompress = false;
14610 return 0;
14611 } else {
14612 need_decompress = false;
14613 return 0;
14614 }
14615}
14616
3a9019d9
FG
14617bool RGWRados::call(std::string command, cmdmap_t& cmdmap, std::string format,
14618 bufferlist& out)
14619{
14620 if (command == "cache list") {
14621 boost::optional<std::string> filter;
14622 auto i = cmdmap.find("filter");
14623 if (i != cmdmap.cend()) {
14624 filter = boost::get<std::string>(i->second);
14625 }
14626 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
14627 if (f) {
14628 f->open_array_section("cache_entries");
14629 call_list(filter, f.get());
14630 f->close_section();
14631 f->flush(out);
14632 return true;
14633 } else {
14634 out.append("Unable to create Formatter.\n");
14635 return false;
14636 }
14637 } else if (command == "cache inspect") {
14638 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
14639 if (f) {
14640 const auto& target = boost::get<std::string>(cmdmap["target"]);
14641 if (call_inspect(target, f.get())) {
14642 f->flush(out);
14643 return true;
14644 } else {
14645 out.append(string("Unable to find entry ") + target + string(".\n"));
14646 return false;
14647 }
14648 } else {
14649 out.append("Unable to create Formatter.\n");
14650 return false;
14651 }
14652 } else if (command == "cache erase") {
14653 const auto& target = boost::get<std::string>(cmdmap["target"]);
14654 if (call_erase(target)) {
14655 return true;
14656 } else {
14657 out.append(string("Unable to find entry ") + target + string(".\n"));
14658 return false;
14659 }
14660 } else if (command == "cache zap") {
14661 call_zap();
14662 return true;
14663 }
14664 return false;
14665}
14666
14667void RGWRados::call_list(const boost::optional<std::string>&,
14668 ceph::Formatter*)
14669{
14670 return;
14671}
14672
14673bool RGWRados::call_inspect(const std::string&, Formatter*)
14674{
14675 return false;
14676}
14677
14678bool RGWRados::call_erase(const std::string&) {
14679 return false;
14680}
14681
14682void RGWRados::call_zap() {
14683 return;
14684}