]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
bump version to 12.2.11-pve1
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
8#include <boost/algorithm/string.hpp>
9
10#include <boost/format.hpp>
11#include <boost/optional.hpp>
12#include <boost/utility/in_place_factory.hpp>
13
14#include "common/ceph_json.h"
15#include "common/utf8.h"
16
17#include "common/errno.h"
18#include "common/Formatter.h"
19#include "common/Throttle.h"
20#include "common/Finisher.h"
21
22#include "rgw_rados.h"
23#include "rgw_cache.h"
24#include "rgw_acl.h"
25#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26#include "rgw_metadata.h"
27#include "rgw_bucket.h"
28#include "rgw_rest_conn.h"
29#include "rgw_cr_rados.h"
30#include "rgw_cr_rest.h"
31
32#include "cls/rgw/cls_rgw_ops.h"
33#include "cls/rgw/cls_rgw_types.h"
34#include "cls/rgw/cls_rgw_client.h"
35#include "cls/rgw/cls_rgw_const.h"
36#include "cls/refcount/cls_refcount_client.h"
37#include "cls/version/cls_version_client.h"
38#include "cls/log/cls_log_client.h"
39#include "cls/statelog/cls_statelog_client.h"
40#include "cls/timeindex/cls_timeindex_client.h"
41#include "cls/lock/cls_lock_client.h"
42#include "cls/user/cls_user_client.h"
c07f9fc5 43#include "osd/osd_types.h"
7c673cae
FG
44
45#include "rgw_tools.h"
46#include "rgw_coroutine.h"
47#include "rgw_compression.h"
48
7c673cae
FG
49#undef fork // fails to compile RGWPeriod::fork() below
50
51#include "common/Clock.h"
52
53#include "include/rados/librados.hpp"
54using namespace librados;
55
56#include <string>
57#include <iostream>
58#include <vector>
59#include <atomic>
60#include <list>
61#include <map>
62#include "auth/Crypto.h" // get_random_bytes()
63
64#include "rgw_log.h"
65
66#include "rgw_gc.h"
67#include "rgw_lc.h"
68
69#include "rgw_object_expirer_core.h"
70#include "rgw_sync.h"
71#include "rgw_data_sync.h"
72#include "rgw_realm_watcher.h"
31f18b77 73#include "rgw_reshard.h"
7c673cae
FG
74
75#include "compressor/Compressor.h"
76
7c673cae
FG
77#define dout_context g_ceph_context
78#define dout_subsys ceph_subsys_rgw
79
80using namespace std;
81
// File-local names of RADOS objects, namespaces and pools used by this file.
// Only some of them are referenced in the portion of the file reviewed here;
// the notes below describe those uses only.
82static string notify_oid_prefix = "notify";
83static string *notify_oids = NULL;
84static string shadow_ns = "shadow";
85static string dir_oid_prefix = ".dir.";
86static string default_storage_pool_suffix = "rgw.buckets.data";
87static string default_bucket_index_pool_suffix = "rgw.buckets.index";
88static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
89static string avail_pools = ".pools.avail";
90
// Object-name prefixes and default object ids for zone / zonegroup / realm /
// period metadata. region_info_oid_prefix and default_region_info_oid are the
// names selected when a caller asks for the old "region" format.
91static string zone_info_oid_prefix = "zone_info.";
92static string zone_names_oid_prefix = "zone_names.";
93static string region_info_oid_prefix = "region_info.";
94static string zone_group_info_oid_prefix = "zonegroup_info.";
95static string realm_names_oid_prefix = "realms_names.";
96static string realm_info_oid_prefix = "realms.";
97static string default_region_info_oid = "default.region";
98static string default_zone_group_info_oid = "default.zonegroup";
99static string period_info_oid_prefix = "periods.";
// Appended to "<period prefix><period id>" to name the latest-epoch object.
100static string period_latest_epoch_info_oid = ".latest_epoch";
101static string region_map_oid = "region_map";
102static string zonegroup_map_oid = "zonegroup_map";
103static string log_lock_name = "rgw_log_lock";
104static string default_realm_info_oid = "default.realm";
// Names given to the zonegroup and zone created by RGWZoneGroup::create_default().
// Unlike their neighbours these two are not declared static.
105const string default_zonegroup_name = "default";
106const string default_zone_name = "default";
107static string zonegroup_names_oid_prefix = "zonegroups_names.";
108static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
109#define RGW_USAGE_OBJ_PREFIX "usage."
110#define FIRST_EPOCH 1
// Fallback root pool names. The zonegroup, realm and period variants are used
// when the matching rgw_*_root_pool config option is empty; the zone variant
// is presumably used the same way (its use is not visible in this chunk).
111static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
112static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
113static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
114static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116#define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118#define dout_subsys ceph_subsys_rgw
119
120
121static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123{
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
31f18b77 133 *pool = placement.get_data_extra_pool();
7c673cae
FG
134 }
135 }
136
137 return true;
138}
139
140static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142{
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146}
147
148rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149{
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156}
157
158rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159{
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166}
167
168int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169{
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
28e407b8
AA
173 if (r == -ERANGE) {
174 dout(0)
175 << __func__
176 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
177 << " (this can be due to a pool or placement group misconfiguration, e.g."
178 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
179 << dendl;
180 }
7c673cae
FG
181 if (r < 0 && r != -EEXIST) {
182 return r;
183 }
184
185 r = rados->ioctx_create(pool.name.c_str(), ioctx);
c07f9fc5
FG
186 if (r < 0) {
187 return r;
188 }
189
190 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
191 if (r < 0 && r != -EOPNOTSUPP) {
192 return r;
193 }
194 } else if (r < 0) {
7c673cae
FG
195 return r;
196 }
197 if (!pool.ns.empty()) {
198 ioctx.set_namespace(pool.ns);
199 }
200 return 0;
201}
202
203template<>
204void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
205 RWLock::WLocker wl(lock);
206 auto iter = objs_state.find(obj);
207 if (iter == objs_state.end()) {
208 return;
209 }
210 bool is_atomic = iter->second.is_atomic;
211 bool prefetch_data = iter->second.prefetch_data;
212
213 objs_state.erase(iter);
214
215 if (is_atomic || prefetch_data) {
216 auto& s = objs_state[obj];
217 s.is_atomic = is_atomic;
218 s.prefetch_data = prefetch_data;
219 }
220}
221
222template<>
223void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
224 RWLock::WLocker wl(lock);
225 auto iter = objs_state.find(obj);
226 if (iter == objs_state.end()) {
227 return;
228 }
229
230 objs_state.erase(iter);
231}
232
233void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
234 encode_json("default_zonegroup", default_zonegroup, f);
235}
236
237void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
238
239 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
240 /* backward compatability with region */
241 if (default_zonegroup.empty()) {
242 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
243 }
244}
245
246rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
247{
248 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
249 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
250 }
251
252 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
253}
254
255int RGWZoneGroup::create_default(bool old_format)
256{
257 name = default_zonegroup_name;
258 is_master = true;
259
260 RGWZoneGroupPlacementTarget placement_target;
261 placement_target.name = "default-placement";
262 placement_targets[placement_target.name] = placement_target;
263 default_placement = "default-placement";
264
265 RGWZoneParams zone_params(default_zone_name);
266
267 int r = zone_params.init(cct, store, false);
268 if (r < 0) {
269 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
270 return r;
271 }
272
273 r = zone_params.create_default();
274 if (r < 0 && r != -EEXIST) {
275 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 } else if (r == -EEXIST) {
278 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
279 zone_params.clear_id();
280 r = zone_params.init(cct, store);
281 if (r < 0) {
282 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
283 return r;
284 }
285 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
286 << dendl;
287 }
288
289 RGWZone& default_zone = zones[zone_params.get_id()];
290 default_zone.name = zone_params.get_name();
291 default_zone.id = zone_params.get_id();
292 master_zone = default_zone.id;
293
294 r = create();
295 if (r < 0 && r != -EEXIST) {
296 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
297 return r;
298 }
299
300 if (r == -EEXIST) {
301 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
302 id.clear();
303 r = init(cct, store);
304 if (r < 0) {
305 return r;
306 }
307 }
308
309 if (old_format) {
310 name = id;
311 }
312
313 post_process_params();
314
315 return 0;
316}
317
318const string RGWZoneGroup::get_default_oid(bool old_region_format)
319{
320 if (old_region_format) {
321 if (cct->_conf->rgw_default_region_info_oid.empty()) {
322 return default_region_info_oid;
323 }
324 return cct->_conf->rgw_default_region_info_oid;
325 }
326
327 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
328
329 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
330 default_oid = default_zone_group_info_oid;
331 }
332
333 default_oid += "." + realm_id;
334
335 return default_oid;
336}
337
338const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
339{
340 if (old_region_format) {
341 return region_info_oid_prefix;
342 }
343 return zone_group_info_oid_prefix;
344}
345
346const string& RGWZoneGroup::get_names_oid_prefix()
347{
348 return zonegroup_names_oid_prefix;
349}
350
351const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
352 return cct->_conf->rgw_zonegroup;
353}
354
355int RGWZoneGroup::equals(const string& other_zonegroup) const
356{
357 if (is_master && other_zonegroup.empty())
358 return true;
359
360 return (id == other_zonegroup);
361}
362
363int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
364 const list<string>& endpoints, const string *ptier_type,
365 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
366{
367 auto& zone_id = zone_params.get_id();
368 auto& zone_name = zone_params.get_name();
369
370 // check for duplicate zone name on insert
371 if (!zones.count(zone_id)) {
372 for (const auto& zone : zones) {
373 if (zone.second.name == zone_name) {
374 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
375 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
376 return -EEXIST;
377 }
378 }
379 }
380
381 if (is_master) {
382 if (*is_master) {
383 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
384 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
385 }
386 master_zone = zone_params.get_id();
387 } else if (master_zone == zone_params.get_id()) {
388 master_zone.clear();
389 }
390 }
391
392 RGWZone& zone = zones[zone_params.get_id()];
393 zone.name = zone_params.get_name();
394 zone.id = zone_params.get_id();
395 if (!endpoints.empty()) {
396 zone.endpoints = endpoints;
397 }
398 if (read_only) {
399 zone.read_only = *read_only;
400 }
401 if (ptier_type) {
402 zone.tier_type = *ptier_type;
403 }
404
405 if (psync_from_all) {
406 zone.sync_from_all = *psync_from_all;
407 }
408
409 for (auto add : sync_from) {
410 zone.sync_from.insert(add);
411 }
412
413 for (auto rm : sync_from_rm) {
414 zone.sync_from.erase(rm);
415 }
416
417 post_process_params();
418
419 return update();
420}
421
422
423int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
424{
425 RGWZone& zone = zones[zone_params.get_id()];
426 zone.name = zone_params.get_name();
427
428 return update();
429}
430
431void RGWZoneGroup::post_process_params()
432{
433 bool log_data = zones.size() > 1;
434
435 if (master_zone.empty()) {
436 map<string, RGWZone>::iterator iter = zones.begin();
437 if (iter != zones.end()) {
438 master_zone = iter->first;
439 }
440 }
441
442 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
443 RGWZone& zone = iter->second;
444 zone.log_data = log_data;
7c673cae
FG
445
446 RGWZoneParams zone_params(zone.id, zone.name);
447 int ret = zone_params.init(cct, store);
448 if (ret < 0) {
449 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
450 continue;
451 }
452
453 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
454 iter != zone_params.placement_pools.end(); ++iter) {
455 const string& placement_name = iter->first;
456 if (placement_targets.find(placement_name) == placement_targets.end()) {
457 RGWZoneGroupPlacementTarget placement_target;
458 placement_target.name = placement_name;
459 placement_targets[placement_name] = placement_target;
460 }
461 }
462 }
463
464 if (default_placement.empty() && !placement_targets.empty()) {
465 default_placement = placement_targets.begin()->first;
466 }
467}
468
469int RGWZoneGroup::remove_zone(const std::string& zone_id)
470{
471 map<string, RGWZone>::iterator iter = zones.find(zone_id);
472 if (iter == zones.end()) {
473 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
474 << name << dendl;
475 return -ENOENT;
476 }
477
478 zones.erase(iter);
479
480 post_process_params();
481
482 return update();
483}
484
485int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
486{
487 if (realm_id.empty()) {
488 /* try using default realm */
489 RGWRealm realm;
490 int ret = realm.init(cct, store);
b32b8144 491 // no default realm exist
7c673cae 492 if (ret < 0) {
b32b8144 493 return read_id(default_zonegroup_name, default_id);
7c673cae
FG
494 }
495 realm_id = realm.get_id();
496 }
497
498 return RGWSystemMetaObj::read_default_id(default_id, old_format);
499}
500
501int RGWZoneGroup::set_as_default(bool exclusive)
502{
503 if (realm_id.empty()) {
504 /* try using default realm */
505 RGWRealm realm;
506 int ret = realm.init(cct, store);
507 if (ret < 0) {
508 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
509 return -EINVAL;
510 }
511 realm_id = realm.get_id();
512 }
513
514 return RGWSystemMetaObj::set_as_default(exclusive);
515}
516
517int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
518{
519 cct = _cct;
520 store = _store;
521
522 if (!setup_obj)
523 return 0;
524
525 if (old_format && id.empty()) {
526 id = name;
527 }
528
529 if (id.empty()) {
530 int r;
531 if (name.empty()) {
532 name = get_predefined_name(cct);
533 }
534 if (name.empty()) {
535 r = use_default(old_format);
536 if (r < 0) {
537 return r;
538 }
539 } else if (!old_format) {
540 r = read_id(name, id);
541 if (r < 0) {
542 if (r != -ENOENT) {
543 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
544 }
545 return r;
546 }
547 }
548 }
549
550 return read_info(id, old_format);
551}
552
553int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
554{
555 auto pool = get_pool(cct);
556 bufferlist bl;
557 RGWObjectCtx obj_ctx(store);
558 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
559 if (ret < 0)
560 return ret;
561
562 try {
563 bufferlist::iterator iter = bl.begin();
564 ::decode(default_info, iter);
565 } catch (buffer::error& err) {
566 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
567 return -EIO;
568 }
569
570 return 0;
571}
572
573int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
574{
575 RGWDefaultSystemMetaObjInfo default_info;
576
577 int ret = read_default(default_info, get_default_oid(old_format));
578 if (ret < 0) {
579 return ret;
580 }
581
582 default_id = default_info.default_id;
583
584 return 0;
585}
586
587int RGWSystemMetaObj::use_default(bool old_format)
588{
589 return read_default_id(id, old_format);
590}
591
592int RGWSystemMetaObj::set_as_default(bool exclusive)
593{
594 string oid = get_default_oid();
595
596 rgw_pool pool(get_pool(cct));
597 bufferlist bl;
598
599 RGWDefaultSystemMetaObjInfo default_info;
600 default_info.default_id = id;
601
602 ::encode(default_info, bl);
603
604 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
605 exclusive, NULL, real_time(), NULL);
606 if (ret < 0)
607 return ret;
608
609 return 0;
610}
611
612int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
613{
614 rgw_pool pool(get_pool(cct));
615 bufferlist bl;
616
617 string oid = get_names_oid_prefix() + obj_name;
618
619 RGWObjectCtx obj_ctx(store);
620 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
621 if (ret < 0) {
622 return ret;
623 }
624
625 RGWNameToId nameToId;
626 try {
627 bufferlist::iterator iter = bl.begin();
628 ::decode(nameToId, iter);
629 } catch (buffer::error& err) {
630 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
631 return -EIO;
632 }
633 object_id = nameToId.obj_id;
634 return 0;
635}
636
637int RGWSystemMetaObj::delete_obj(bool old_format)
638{
639 rgw_pool pool(get_pool(cct));
640
641 /* check to see if obj is the default */
642 RGWDefaultSystemMetaObjInfo default_info;
643 int ret = read_default(default_info, get_default_oid(old_format));
644 if (ret < 0 && ret != -ENOENT)
645 return ret;
646 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
647 string oid = get_default_oid(old_format);
648 rgw_raw_obj default_named_obj(pool, oid);
649 ret = store->delete_system_obj(default_named_obj);
650 if (ret < 0) {
651 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
652 return ret;
653 }
654 }
655 if (!old_format) {
656 string oid = get_names_oid_prefix() + name;
657 rgw_raw_obj object_name(pool, oid);
658 ret = store->delete_system_obj(object_name);
659 if (ret < 0) {
660 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
661 return ret;
662 }
663 }
664
665 string oid = get_info_oid_prefix(old_format);
666 if (old_format) {
667 oid += name;
668 } else {
669 oid += id;
670 }
671
672 rgw_raw_obj object_id(pool, oid);
673 ret = store->delete_system_obj(object_id);
674 if (ret < 0) {
675 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
676 }
677
678 return ret;
679}
680
681int RGWSystemMetaObj::store_name(bool exclusive)
682{
683 rgw_pool pool(get_pool(cct));
684 string oid = get_names_oid_prefix() + name;
685
686 RGWNameToId nameToId;
687 nameToId.obj_id = id;
688
689 bufferlist bl;
690 ::encode(nameToId, bl);
691 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
692}
693
694int RGWSystemMetaObj::rename(const string& new_name)
695{
696 string new_id;
697 int ret = read_id(new_name, new_id);
698 if (!ret) {
699 return -EEXIST;
700 }
701 if (ret < 0 && ret != -ENOENT) {
702 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 string old_name = name;
706 name = new_name;
707 ret = update();
708 if (ret < 0) {
709 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
710 return ret;
711 }
712 ret = store_name(true);
713 if (ret < 0) {
714 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
715 return ret;
716 }
717 /* delete old name */
718 rgw_pool pool(get_pool(cct));
719 string oid = get_names_oid_prefix() + old_name;
720 rgw_raw_obj old_name_obj(pool, oid);
721 ret = store->delete_system_obj(old_name_obj);
722 if (ret < 0) {
723 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
724 return ret;
725 }
726
727 return ret;
728}
729
730int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
731{
732 rgw_pool pool(get_pool(cct));
733
734 bufferlist bl;
735
736 string oid = get_info_oid_prefix(old_format) + obj_id;
737
738 RGWObjectCtx obj_ctx(store);
739 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
740 if (ret < 0) {
741 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
742 return ret;
743 }
744
745 try {
746 bufferlist::iterator iter = bl.begin();
747 ::decode(*this, iter);
748 } catch (buffer::error& err) {
749 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
750 return -EIO;
751 }
752
753 return 0;
754}
755
756int RGWSystemMetaObj::read()
757{
758 int ret = read_id(name, id);
759 if (ret < 0) {
760 return ret;
761 }
762
763 return read_info(id);
764}
765
766int RGWSystemMetaObj::create(bool exclusive)
767{
768 int ret;
769
770 /* check to see the name is not used */
771 ret = read_id(name, id);
772 if (exclusive && ret == 0) {
773 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
774 return -EEXIST;
775 } else if ( ret < 0 && ret != -ENOENT) {
776 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
777 return ret;
778 }
779
780 if (id.empty()) {
781 /* create unique id */
782 uuid_d new_uuid;
783 char uuid_str[37];
784 new_uuid.generate_random();
785 new_uuid.print(uuid_str);
786 id = uuid_str;
787 }
788
789 ret = store_info(exclusive);
790 if (ret < 0) {
791 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
792 return ret;
793 }
794
795 return store_name(exclusive);
796}
797
798int RGWSystemMetaObj::store_info(bool exclusive)
799{
800 rgw_pool pool(get_pool(cct));
801
802 string oid = get_info_oid_prefix() + id;
803
804 bufferlist bl;
805 ::encode(*this, bl);
806 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
807}
808
809int RGWSystemMetaObj::write(bool exclusive)
810{
811 int ret = store_info(exclusive);
812 if (ret < 0) {
813 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
814 return ret;
815 }
816 ret = store_name(exclusive);
817 if (ret < 0) {
818 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
819 return ret;
820 }
821 return 0;
822}
823
824
825const string& RGWRealm::get_predefined_name(CephContext *cct) {
826 return cct->_conf->rgw_realm;
827}
828
829int RGWRealm::create(bool exclusive)
830{
831 int ret = RGWSystemMetaObj::create(exclusive);
832 if (ret < 0) {
833 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
834 return ret;
835 }
836 // create the control object for watch/notify
837 ret = create_control(exclusive);
838 if (ret < 0) {
839 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
840 return ret;
841 }
842 RGWPeriod period;
843 if (current_period.empty()) {
844 /* create new period for the realm */
845 ret = period.init(cct, store, id, name, false);
846 if (ret < 0 ) {
847 return ret;
848 }
849 ret = period.create(true);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
852 return ret;
853 }
854 } else {
855 period = RGWPeriod(current_period, 0);
856 int ret = period.init(cct, store, id, name);
857 if (ret < 0) {
858 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
859 return ret;
860 }
861 }
862 ret = set_current_period(period);
863 if (ret < 0) {
864 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
865 return ret;
866 }
867 // try to set as default. may race with another create, so pass exclusive=true
868 // so we don't override an existing default
869 ret = set_as_default(true);
870 if (ret < 0 && ret != -EEXIST) {
871 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
872 }
873
874 return 0;
875}
876
877int RGWRealm::delete_obj()
878{
879 int ret = RGWSystemMetaObj::delete_obj();
880 if (ret < 0) {
881 return ret;
882 }
883 return delete_control();
884}
885
886int RGWRealm::create_control(bool exclusive)
887{
888 auto pool = rgw_pool{get_pool(cct)};
889 auto oid = get_control_oid();
890 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
891 nullptr, real_time(), nullptr);
892}
893
894int RGWRealm::delete_control()
895{
896 auto pool = rgw_pool{get_pool(cct)};
897 auto obj = rgw_raw_obj{pool, get_control_oid()};
898 return store->delete_system_obj(obj);
899}
900
901rgw_pool RGWRealm::get_pool(CephContext *cct)
902{
903 if (cct->_conf->rgw_realm_root_pool.empty()) {
904 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
905 }
906 return rgw_pool(cct->_conf->rgw_realm_root_pool);
907}
908
909const string RGWRealm::get_default_oid(bool old_format)
910{
911 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
912 return default_realm_info_oid;
913 }
914 return cct->_conf->rgw_default_realm_info_oid;
915}
916
917const string& RGWRealm::get_names_oid_prefix()
918{
919 return realm_names_oid_prefix;
920}
921
922const string& RGWRealm::get_info_oid_prefix(bool old_format)
923{
924 return realm_info_oid_prefix;
925}
926
927int RGWRealm::set_current_period(RGWPeriod& period)
928{
929 // update realm epoch to match the period's
930 if (epoch > period.get_realm_epoch()) {
931 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
932 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
933 return -EINVAL;
934 }
935 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
936 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
937 << period.get_realm_epoch() << ", but different period id "
938 << period.get_id() << " != " << current_period << dendl;
939 return -EINVAL;
940 }
941
942 epoch = period.get_realm_epoch();
943 current_period = period.get_id();
944
945 int ret = update();
946 if (ret < 0) {
947 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
948 return ret;
949 }
950
951 ret = period.reflect();
952 if (ret < 0) {
953 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
954 return ret;
955 }
956
957 return 0;
958}
959
960string RGWRealm::get_control_oid()
961{
962 return get_info_oid_prefix() + id + ".control";
963}
964
965int RGWRealm::notify_zone(bufferlist& bl)
966{
967 // open a context on the realm's pool
968 rgw_pool pool{get_pool(cct)};
969 librados::IoCtx ctx;
970 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
971 if (r < 0) {
972 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
973 return r;
974 }
975 // send a notify on the realm object
976 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
977 if (r < 0) {
978 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
979 return r;
980 }
981 return 0;
982}
983
984int RGWRealm::notify_new_period(const RGWPeriod& period)
985{
986 bufferlist bl;
987 // push the period to dependent zonegroups/zones
988 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
989 ::encode(period, bl);
990 // reload the gateway with the new period
991 ::encode(RGWRealmNotify::Reload, bl);
992
993 return notify_zone(bl);
994}
995
996std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
997{
998 if (realm_id.empty()) {
999 return "period_config.default";
1000 }
1001 return "period_config." + realm_id;
1002}
1003
1004rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
1005{
1006 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1007 if (pool_name.empty()) {
1008 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1009 }
1010 return {pool_name};
1011}
1012
1013int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1014{
1015 RGWObjectCtx obj_ctx(store);
1016 const auto& pool = get_pool(store->ctx());
1017 const auto& oid = get_oid(realm_id);
1018 bufferlist bl;
1019
1020 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1021 if (ret < 0) {
1022 return ret;
1023 }
1024 try {
1025 bufferlist::iterator iter = bl.begin();
1026 ::decode(*this, iter);
1027 } catch (buffer::error& err) {
1028 return -EIO;
1029 }
1030 return 0;
1031}
1032
1033int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1034{
1035 const auto& pool = get_pool(store->ctx());
1036 const auto& oid = get_oid(realm_id);
1037 bufferlist bl;
1038 ::encode(*this, bl);
1039 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1040 false, nullptr, real_time(), nullptr);
1041}
1042
1043int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1044 const string& period_realm_name, bool setup_obj)
1045{
1046 cct = _cct;
1047 store = _store;
1048 realm_id = period_realm_id;
1049 realm_name = period_realm_name;
1050
1051 if (!setup_obj)
1052 return 0;
1053
1054 return init(_cct, _store, setup_obj);
1055}
1056
1057
1058int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1059{
1060 cct = _cct;
1061 store = _store;
1062
1063 if (!setup_obj)
1064 return 0;
1065
1066 if (id.empty()) {
1067 RGWRealm realm(realm_id, realm_name);
1068 int ret = realm.init(cct, store);
1069 if (ret < 0) {
1070 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1071 cpp_strerror(-ret) << dendl;
1072 return ret;
1073 }
1074 id = realm.get_current_period();
1075 realm_id = realm.get_id();
1076 }
1077
1078 if (!epoch) {
1079 int ret = use_latest_epoch();
1080 if (ret < 0) {
1081 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1082 << " : " << cpp_strerror(-ret) << dendl;
1083 return ret;
1084 }
1085 }
1086
1087 return read_info();
1088}
1089
1090
1091int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1092 map<string, RGWZoneGroup>::const_iterator iter;
1093 if (!zonegroup_id.empty()) {
1094 iter = period_map.zonegroups.find(zonegroup_id);
1095 } else {
1096 iter = period_map.zonegroups.find("default");
1097 }
1098 if (iter != period_map.zonegroups.end()) {
1099 zonegroup = iter->second;
1100 return 0;
1101 }
1102
1103 return -ENOENT;
1104}
1105
7c673cae
FG
1106const string& RGWPeriod::get_latest_epoch_oid()
1107{
1108 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1109 return period_latest_epoch_info_oid;
1110 }
1111 return cct->_conf->rgw_period_latest_epoch_info_oid;
1112}
1113
1114const string& RGWPeriod::get_info_oid_prefix()
1115{
1116 return period_info_oid_prefix;
1117}
1118
1119const string RGWPeriod::get_period_oid_prefix()
1120{
1121 return get_info_oid_prefix() + id;
1122}
1123
1124const string RGWPeriod::get_period_oid()
1125{
1126 std::ostringstream oss;
1127 oss << get_period_oid_prefix();
1128 // skip the epoch for the staging period
1129 if (id != get_staging_id(realm_id))
1130 oss << "." << epoch;
1131 return oss.str();
1132}
1133
224ce89b
WB
1134int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1135 RGWObjVersionTracker *objv)
7c673cae
FG
1136{
1137 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1138
1139 rgw_pool pool(get_pool(cct));
1140 bufferlist bl;
1141 RGWObjectCtx obj_ctx(store);
224ce89b 1142 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
7c673cae
FG
1143 if (ret < 0) {
1144 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1145 return ret;
1146 }
1147 try {
1148 bufferlist::iterator iter = bl.begin();
1149 ::decode(info, iter);
1150 } catch (buffer::error& err) {
1151 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1152 return -EIO;
1153 }
1154
1155 return 0;
1156}
1157
1158int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1159{
1160 RGWPeriodLatestEpochInfo info;
1161
1162 int ret = read_latest_epoch(info);
1163 if (ret < 0) {
1164 return ret;
1165 }
1166
1167 latest_epoch = info.epoch;
1168
1169 return 0;
1170}
1171
1172int RGWPeriod::use_latest_epoch()
1173{
1174 RGWPeriodLatestEpochInfo info;
1175 int ret = read_latest_epoch(info);
1176 if (ret < 0) {
1177 return ret;
1178 }
1179
1180 epoch = info.epoch;
1181
1182 return 0;
1183}
1184
224ce89b
WB
1185int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1186 RGWObjVersionTracker *objv)
7c673cae
FG
1187{
1188 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1189
1190 rgw_pool pool(get_pool(cct));
1191 bufferlist bl;
1192
1193 RGWPeriodLatestEpochInfo info;
1194 info.epoch = epoch;
1195
1196 ::encode(info, bl);
1197
1198 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
224ce89b
WB
1199 exclusive, objv, real_time(), nullptr);
1200}
1201
1202int RGWPeriod::update_latest_epoch(epoch_t epoch)
1203{
1204 static constexpr int MAX_RETRIES = 20;
1205
1206 for (int i = 0; i < MAX_RETRIES; i++) {
1207 RGWPeriodLatestEpochInfo info;
1208 RGWObjVersionTracker objv;
1209 bool exclusive = false;
1210
1211 // read existing epoch
1212 int r = read_latest_epoch(info, &objv);
1213 if (r == -ENOENT) {
1214 // use an exclusive create to set the epoch atomically
1215 exclusive = true;
1216 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1217 << " for period=" << id << dendl;
1218 } else if (r < 0) {
1219 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1220 return r;
1221 } else if (epoch <= info.epoch) {
1222 r = -EEXIST; // fail with EEXIST if epoch is not newer
1223 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1224 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1225 return r;
1226 } else {
1227 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1228 << " -> " << epoch << " on period=" << id << dendl;
1229 }
1230
1231 r = set_latest_epoch(epoch, exclusive, &objv);
1232 if (r == -EEXIST) {
1233 continue; // exclusive create raced with another update, retry
1234 } else if (r == -ECANCELED) {
1235 continue; // write raced with a conflicting version, retry
1236 }
1237 if (r < 0) {
1238 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1239 return r;
1240 }
1241 return 0; // return success
1242 }
1243
1244 return -ECANCELED; // fail after max retries
7c673cae
FG
1245}
1246
1247int RGWPeriod::delete_obj()
1248{
1249 rgw_pool pool(get_pool(cct));
1250
1251 // delete the object for each period epoch
1252 for (epoch_t e = 1; e <= epoch; e++) {
1253 RGWPeriod p{get_id(), e};
1254 rgw_raw_obj oid{pool, p.get_period_oid()};
1255 int ret = store->delete_system_obj(oid);
1256 if (ret < 0) {
1257 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1258 << ": " << cpp_strerror(-ret) << dendl;
1259 }
1260 }
1261
1262 // delete the .latest_epoch object
1263 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1264 int ret = store->delete_system_obj(oid);
1265 if (ret < 0) {
1266 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1267 << ": " << cpp_strerror(-ret) << dendl;
1268 }
1269 return ret;
1270}
1271
1272int RGWPeriod::read_info()
1273{
1274 rgw_pool pool(get_pool(cct));
1275
1276 bufferlist bl;
1277
1278 RGWObjectCtx obj_ctx(store);
1279 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1280 if (ret < 0) {
1281 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1282 return ret;
1283 }
1284
1285 try {
1286 bufferlist::iterator iter = bl.begin();
1287 ::decode(*this, iter);
1288 } catch (buffer::error& err) {
1289 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1290 return -EIO;
1291 }
1292
1293 return 0;
1294}
1295
1296int RGWPeriod::create(bool exclusive)
1297{
1298 int ret;
1299
1300 /* create unique id */
1301 uuid_d new_uuid;
1302 char uuid_str[37];
1303 new_uuid.generate_random();
1304 new_uuid.print(uuid_str);
1305 id = uuid_str;
1306
1307 epoch = FIRST_EPOCH;
1308
1309 period_map.id = id;
1310
1311 ret = store_info(exclusive);
1312 if (ret < 0) {
1313 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
224ce89b 1314 return ret;
7c673cae
FG
1315 }
1316
1317 ret = set_latest_epoch(epoch);
1318 if (ret < 0) {
1319 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1320 }
1321
1322 return ret;
1323}
1324
1325int RGWPeriod::store_info(bool exclusive)
1326{
7c673cae
FG
1327 rgw_pool pool(get_pool(cct));
1328
1329 string oid = get_period_oid();
1330 bufferlist bl;
1331 ::encode(*this, bl);
224ce89b
WB
1332
1333 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1334 exclusive, NULL, real_time(), NULL);
7c673cae
FG
1335}
1336
1337rgw_pool RGWPeriod::get_pool(CephContext *cct)
1338{
1339 if (cct->_conf->rgw_period_root_pool.empty()) {
1340 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1341 }
1342 return rgw_pool(cct->_conf->rgw_period_root_pool);
1343}
1344
7c673cae
FG
1345int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1346{
1347 if (zonegroup.realm_id != realm_id) {
1348 return 0;
1349 }
1350 int ret = period_map.update(zonegroup, cct);
1351 if (ret < 0) {
1352 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1353 return ret;
1354 }
1355
1356 return store_info(false);
1357}
1358
1359int RGWPeriod::update()
1360{
1361 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1362 list<string> zonegroups;
1363 int ret = store->list_zonegroups(zonegroups);
1364 if (ret < 0) {
1365 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1366 return ret;
1367 }
1368
1369 // clear zone short ids of removed zones. period_map.update() will add the
1370 // remaining zones back
1371 period_map.short_zone_ids.clear();
1372
1373 for (auto& iter : zonegroups) {
1374 RGWZoneGroup zg(string(), iter);
1375 ret = zg.init(cct, store);
1376 if (ret < 0) {
1377 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1378 continue;
1379 }
1380
1381 if (zg.realm_id != realm_id) {
1382 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1383 continue;
1384 }
1385
1386 if (zg.master_zone.empty()) {
1387 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1388 return -EINVAL;
1389 }
1390
f64942e4
AA
1391 if (zg.zones.find(zg.master_zone) == zg.zones.end()) {
1392 ldout(cct,0) << "ERROR: zonegroup " << zg.get_name()
1393 << " has a non existent master zone "<< dendl;
1394 return -EINVAL;
1395 }
1396
7c673cae
FG
1397 if (zg.is_master_zonegroup()) {
1398 master_zonegroup = zg.get_id();
1399 master_zone = zg.master_zone;
1400 }
1401
1402 int ret = period_map.update(zg, cct);
1403 if (ret < 0) {
1404 return ret;
1405 }
1406 }
1407
1408 ret = period_config.read(store, realm_id);
1409 if (ret < 0 && ret != -ENOENT) {
1410 ldout(cct, 0) << "ERROR: failed to read period config: "
1411 << cpp_strerror(ret) << dendl;
1412 return ret;
1413 }
1414 return 0;
1415}
1416
1417int RGWPeriod::reflect()
1418{
1419 for (auto& iter : period_map.zonegroups) {
1420 RGWZoneGroup& zg = iter.second;
1421 zg.reinit_instance(cct, store);
1422 int r = zg.write(false);
1423 if (r < 0) {
1424 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1425 return r;
1426 }
1427 if (zg.is_master_zonegroup()) {
1428 // set master as default if no default exists
1429 r = zg.set_as_default(true);
1430 if (r == 0) {
1431 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1432 << " as the default" << dendl;
1433 }
1434 }
1435 }
1436
1437 int r = period_config.write(store, realm_id);
1438 if (r < 0) {
1439 ldout(cct, 0) << "ERROR: failed to store period config: "
1440 << cpp_strerror(-r) << dendl;
1441 return r;
1442 }
1443 return 0;
1444}
1445
1446void RGWPeriod::fork()
1447{
1448 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1449 predecessor_uuid = id;
1450 id = get_staging_id(realm_id);
1451 period_map.reset();
1452 realm_epoch++;
1453}
1454
1455static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1456{
1457 // initialize a sync status manager to read the status
1458 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1459 int r = mgr.init();
1460 if (r < 0) {
1461 return r;
1462 }
1463 r = mgr.read_sync_status(sync_status);
1464 mgr.stop();
1465 return r;
1466}
1467
1468int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1469 std::ostream& error_stream,
1470 bool force_if_stale)
1471{
1472 rgw_meta_sync_status status;
1473 int r = read_sync_status(store, &status);
1474 if (r < 0) {
1475 ldout(cct, 0) << "period failed to read sync status: "
1476 << cpp_strerror(-r) << dendl;
1477 return r;
1478 }
1479
1480 std::vector<std::string> markers;
1481
1482 const auto current_epoch = current_period.get_realm_epoch();
1483 if (current_epoch != status.sync_info.realm_epoch) {
1484 // no sync status markers for the current period
1485 assert(current_epoch > status.sync_info.realm_epoch);
1486 const int behind = current_epoch - status.sync_info.realm_epoch;
1487 if (!force_if_stale && current_epoch > 1) {
1488 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1489 "the current master zone in metadata sync. If this zone is promoted "
1490 "to master, any metadata changes during that time are likely to "
1491 "be lost.\n"
1492 "Waiting for this zone to catch up on metadata sync (see "
1493 "'radosgw-admin sync status') is recommended.\n"
1494 "To promote this zone to master anyway, add the flag "
1495 "--yes-i-really-mean-it." << std::endl;
1496 return -EINVAL;
1497 }
1498 // empty sync status markers - other zones will skip this period during
1499 // incremental metadata sync
1500 markers.resize(status.sync_info.num_shards);
1501 } else {
1502 markers.reserve(status.sync_info.num_shards);
1503 for (auto& i : status.sync_markers) {
1504 auto& marker = i.second;
1505 // filter out markers from other periods
1506 if (marker.realm_epoch != current_epoch) {
1507 marker.marker.clear();
1508 }
1509 markers.emplace_back(std::move(marker.marker));
1510 }
1511 }
1512
1513 std::swap(sync_status, markers);
1514 return 0;
1515}
1516
// Commit this (staging) period on top of current_period.
//
// Preconditions checked here, each failing with -EINVAL and a message on
// error_stream:
//   - this gateway's zone is the period's master zone
//   - predecessor_uuid equals current_period's id
//   - realm_epoch is exactly current_period's realm epoch + 1
//
// Two outcomes follow:
//   - master zone changed: the sync status is captured, a new period object
//     is created (new id, exclusive create), it becomes the realm's current
//     period and a new-period notification is sent.
//   - master zone unchanged: epoch must equal current_period's epoch; this
//     period then takes over current_period's id/predecessor/realm_epoch
//     with epoch + 1, is stored, recorded as latest epoch, reflected into the
//     local objects and announced.
//
// Returns 0 on success or a negative error. -EEXIST from
// update_latest_epoch() is treated as success and returns 0 without calling
// reflect() or notify_new_period().
1517int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1518 std::ostream& error_stream, bool force_if_stale)
1519{
1520 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1521 // gateway must be in the master zone to commit
1522 if (master_zone != store->get_zone_params().get_id()) {
1523 error_stream << "Cannot commit period on zone "
1524 << store->get_zone_params().get_id() << ", it must be sent to "
1525 "the period's master zone " << master_zone << '.' << std::endl;
1526 return -EINVAL;
1527 }
1528 // period predecessor must match current period
1529 if (predecessor_uuid != current_period.get_id()) {
1530 error_stream << "Period predecessor " << predecessor_uuid
1531 << " does not match current period " << current_period.get_id()
1532 << ". Use 'period pull' to get the latest period from the master, "
1533 "reapply your changes, and try again." << std::endl;
1534 return -EINVAL;
1535 }
1536 // realm epoch must be 1 greater than current period
1537 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1538 error_stream << "Period's realm epoch " << realm_epoch
1539 << " does not come directly after current realm epoch "
1540 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1541 "latest realm and period from the master zone, reapply your changes, "
1542 "and try again." << std::endl;
1543 return -EINVAL;
1544 }
1545 // did the master zone change?
1546 if (master_zone != current_period.get_master_zone()) {
1547 // store the current metadata sync status in the period
1548 int r = update_sync_status(current_period, error_stream, force_if_stale);
1549 if (r < 0) {
1550 ldout(cct, 0) << "failed to update metadata sync status: "
1551 << cpp_strerror(-r) << dendl;
1552 return r;
1553 }
1554 // create an object with a new period id
1555 r = create(true);
1556 if (r < 0) {
1557 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1558 return r;
1559 }
1560 // set as current period
1561 r = realm.set_current_period(*this);
1562 if (r < 0) {
1563 ldout(cct, 0) << "failed to update realm's current period: "
1564 << cpp_strerror(-r) << dendl;
1565 return r;
1566 }
1567 ldout(cct, 4) << "Promoted to master zone and committed new period "
1568 << id << dendl;
1569 realm.notify_new_period(*this);
1570 return 0;
1571 }
1572 // period must be based on current epoch
1573 if (epoch != current_period.get_epoch()) {
1574 error_stream << "Period epoch " << epoch << " does not match "
1575 "predecessor epoch " << current_period.get_epoch()
1576 << ". Use 'period pull' to get the latest epoch from the master zone, "
1577 "reapply your changes, and try again." << std::endl;
1578 return -EINVAL;
1579 }
1580 // set period as next epoch
1581 set_id(current_period.get_id());
1582 set_epoch(current_period.get_epoch() + 1);
1583 set_predecessor(current_period.get_predecessor());
1584 realm_epoch = current_period.get_realm_epoch();
1585 // write the period to rados
1586 int r = store_info(false);
1587 if (r < 0) {
1588 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1589 return r;
1590 }
1591 // set as latest epoch
224ce89b
WB
1592 r = update_latest_epoch(epoch);
1593 if (r == -EEXIST) {
1594 // already have this epoch (or a more recent one)
1595 return 0;
1596 }
7c673cae
FG
1597 if (r < 0) {
1598 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1599 return r;
1600 }
1601 r = reflect();
1602 if (r < 0) {
1603 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1604 return r;
1605 }
1606 ldout(cct, 4) << "Committed new epoch " << epoch
1607 << " for period " << id << dendl;
1608 realm.notify_new_period(*this);
1609 return 0;
1610}
1611
1612int RGWZoneParams::create_default(bool old_format)
1613{
1614 name = default_zone_name;
1615
1616 int r = create();
1617 if (r < 0) {
1618 return r;
1619 }
1620
1621 if (old_format) {
1622 name = id;
1623 }
1624
1625 return r;
1626}
1627
1628
1629int get_zones_pool_set(CephContext* cct,
1630 RGWRados* store,
1631 const list<string>& zones,
1632 const string& my_zone_id,
1633 set<rgw_pool>& pool_names)
1634{
1635 for(auto const& iter : zones) {
1636 RGWZoneParams zone(iter);
1637 int r = zone.init(cct, store);
1638 if (r < 0) {
1639 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1640 return r;
1641 }
1642 if (zone.get_id() != my_zone_id) {
1643 pool_names.insert(zone.domain_root);
1644 pool_names.insert(zone.metadata_heap);
1645 pool_names.insert(zone.control_pool);
1646 pool_names.insert(zone.gc_pool);
1647 pool_names.insert(zone.log_pool);
1648 pool_names.insert(zone.intent_log_pool);
1649 pool_names.insert(zone.usage_log_pool);
1650 pool_names.insert(zone.user_keys_pool);
1651 pool_names.insert(zone.user_email_pool);
1652 pool_names.insert(zone.user_swift_pool);
1653 pool_names.insert(zone.user_uid_pool);
1654 pool_names.insert(zone.roles_pool);
31f18b77 1655 pool_names.insert(zone.reshard_pool);
7c673cae
FG
1656 for(auto& iter : zone.placement_pools) {
1657 pool_names.insert(iter.second.index_pool);
1658 pool_names.insert(iter.second.data_pool);
1659 pool_names.insert(iter.second.data_extra_pool);
1660 }
1661 }
1662 }
1663 return 0;
1664}
1665
1666rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1667 const string& default_prefix,
1668 const string& default_suffix,
1669 const rgw_pool& suggested_pool)
1670{
1671 string suggested_name = suggested_pool.to_str();
1672
1673 string prefix = default_prefix;
1674 string suffix = default_suffix;
1675
1676 if (!suggested_pool.empty()) {
1677 prefix = suggested_name.substr(0, suggested_name.find("."));
1678 suffix = suggested_name.substr(prefix.length());
1679 }
1680
1681 rgw_pool pool(prefix + suffix);
1682
1683 if (pools.find(pool) == pools.end()) {
1684 return pool;
1685 } else {
1686 while(true) {
1687 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1688 if (pools.find(pool) == pools.end()) {
1689 return pool;
1690 }
1691 }
1692 }
1693}
1694
1695int RGWZoneParams::fix_pool_names()
1696{
1697
1698 list<string> zones;
1699 int r = store->list_zones(zones);
1700 if (r < 0) {
1701 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1702 }
1703
1704 set<rgw_pool> pools;
1705 r = get_zones_pool_set(cct, store, zones, id, pools);
1706 if (r < 0) {
1707 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1708 return r;
1709 }
1710
1711 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1712 if (!metadata_heap.name.empty()) {
1713 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1714 }
1715 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1716 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1717 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1718 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1719 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1720 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1721 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1722 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1723 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1724 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1725 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
31f18b77 1726 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
7c673cae
FG
1727
1728 for(auto& iter : placement_pools) {
1729 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1730 iter.second.index_pool);
1731 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1732 iter.second.data_pool);
1733 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1734 iter.second.data_extra_pool);
1735 }
1736
1737 return 0;
1738}
1739
1740int RGWZoneParams::create(bool exclusive)
1741{
1742 /* check for old pools config */
1743 rgw_raw_obj obj(domain_root, avail_pools);
1744 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1745 if (r < 0) {
1746 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1747 /* a new system, let's set new placement info */
1748 RGWZonePlacementInfo default_placement;
1749 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1750 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1751 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1752 placement_pools["default-placement"] = default_placement;
1753 }
1754
1755 r = fix_pool_names();
1756 if (r < 0) {
1757 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1758 return r;
1759 }
1760
1761 r = RGWSystemMetaObj::create(exclusive);
1762 if (r < 0) {
1763 return r;
1764 }
1765
1766 // try to set as default. may race with another create, so pass exclusive=true
1767 // so we don't override an existing default
1768 r = set_as_default(true);
1769 if (r < 0 && r != -EEXIST) {
1770 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1771 }
1772
1773 return 0;
1774}
1775
1776rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1777{
1778 if (cct->_conf->rgw_zone_root_pool.empty()) {
1779 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1780 }
1781
1782 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1783}
1784
1785const string RGWZoneParams::get_default_oid(bool old_format)
1786{
1787 if (old_format) {
1788 return cct->_conf->rgw_default_zone_info_oid;
1789 }
1790
1791 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1792}
1793
1794const string& RGWZoneParams::get_names_oid_prefix()
1795{
1796 return zone_names_oid_prefix;
1797}
1798
1799const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1800{
1801 return zone_info_oid_prefix;
1802}
1803
1804const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1805 return cct->_conf->rgw_zone;
1806}
1807
1808int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1809{
1810 if (name.empty()) {
1811 name = cct->_conf->rgw_zone;
1812 }
1813
1814 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1815}
1816
1817int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1818{
1819 if (realm_id.empty()) {
1820 /* try using default realm */
1821 RGWRealm realm;
1822 int ret = realm.init(cct, store);
b32b8144 1823 //no default realm exist
7c673cae 1824 if (ret < 0) {
b32b8144 1825 return read_id(default_zone_name, default_id);
7c673cae
FG
1826 }
1827 realm_id = realm.get_id();
1828 }
1829
1830 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1831}
1832
1833
1834int RGWZoneParams::set_as_default(bool exclusive)
1835{
1836 if (realm_id.empty()) {
1837 /* try using default realm */
1838 RGWRealm realm;
1839 int ret = realm.init(cct, store);
1840 if (ret < 0) {
1841 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1842 return -EINVAL;
1843 }
1844 realm_id = realm.get_id();
1845 }
1846
1847 return RGWSystemMetaObj::set_as_default(exclusive);
1848}
1849
1850const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1851{
1852 static const std::string NONE{"none"};
1853 auto p = placement_pools.find(placement_rule);
1854 if (p == placement_pools.end()) {
1855 return NONE;
1856 }
1857 const auto& type = p->second.compression_type;
1858 return !type.empty() ? type : NONE;
1859}
1860
1861void RGWPeriodMap::encode(bufferlist& bl) const {
1862 ENCODE_START(2, 1, bl);
1863 ::encode(id, bl);
1864 ::encode(zonegroups, bl);
1865 ::encode(master_zonegroup, bl);
1866 ::encode(short_zone_ids, bl);
1867 ENCODE_FINISH(bl);
1868}
1869
1870void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1871 DECODE_START(2, bl);
1872 ::decode(id, bl);
1873 ::decode(zonegroups, bl);
1874 ::decode(master_zonegroup, bl);
1875 if (struct_v >= 2) {
1876 ::decode(short_zone_ids, bl);
1877 }
1878 DECODE_FINISH(bl);
1879
1880 zonegroups_by_api.clear();
1881 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1882 iter != zonegroups.end(); ++iter) {
1883 RGWZoneGroup& zonegroup = iter->second;
1884 zonegroups_by_api[zonegroup.api_name] = zonegroup;
31f18b77 1885 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
1886 master_zonegroup = zonegroup.get_id();
1887 }
1888 }
1889}
1890
1891// run an MD5 hash on the zone_id and return the first 32 bits
1892static uint32_t gen_short_zone_id(const std::string zone_id)
1893{
1894 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1895 MD5 hash;
1896 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1897 hash.Final(md5);
1898
1899 uint32_t short_id;
1900 memcpy((char *)&short_id, md5, sizeof(short_id));
1901 return std::max(short_id, 1u);
1902}
1903
1904int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1905{
31f18b77 1906 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
7c673cae
FG
1907 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1908 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1909 return -EINVAL;
1910 }
1911 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1912 if (iter != zonegroups.end()) {
1913 RGWZoneGroup& old_zonegroup = iter->second;
1914 if (!old_zonegroup.api_name.empty()) {
1915 zonegroups_by_api.erase(old_zonegroup.api_name);
1916 }
1917 }
1918 zonegroups[zonegroup.get_id()] = zonegroup;
1919
1920 if (!zonegroup.api_name.empty()) {
1921 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1922 }
1923
31f18b77 1924 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
1925 master_zonegroup = zonegroup.get_id();
1926 } else if (master_zonegroup == zonegroup.get_id()) {
1927 master_zonegroup = "";
1928 }
1929
1930 for (auto& i : zonegroup.zones) {
1931 auto& zone = i.second;
1932 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1933 continue;
1934 }
1935 // calculate the zone's short id
1936 uint32_t short_id = gen_short_zone_id(zone.id);
1937
1938 // search for an existing zone with the same short id
1939 for (auto& s : short_zone_ids) {
1940 if (s.second == short_id) {
1941 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1942 << ") generates the same short_zone_id " << short_id
1943 << " as existing zone id " << s.first << dendl;
1944 return -EEXIST;
1945 }
1946 }
1947
1948 short_zone_ids[zone.id] = short_id;
1949 }
1950
1951 return 0;
1952}
1953
1954uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1955{
1956 auto i = short_zone_ids.find(zone_id);
1957 if (i == short_zone_ids.end()) {
1958 return 0;
1959 }
1960 return i->second;
1961}
1962
1963int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1964{
1965
1966 RGWPeriod period;
1967 int ret = period.init(cct, store);
1968 if (ret < 0) {
1969 cerr << "failed to read current period info: " << cpp_strerror(ret);
1970 return ret;
1971 }
1972
1973 bucket_quota = period.get_config().bucket_quota;
1974 user_quota = period.get_config().user_quota;
1975 zonegroups = period.get_map().zonegroups;
1976 zonegroups_by_api = period.get_map().zonegroups_by_api;
1977 master_zonegroup = period.get_map().master_zonegroup;
1978
1979 return 0;
1980}
1981
1982void RGWRegionMap::encode(bufferlist& bl) const {
1983 ENCODE_START( 3, 1, bl);
1984 ::encode(regions, bl);
1985 ::encode(master_region, bl);
1986 ::encode(bucket_quota, bl);
1987 ::encode(user_quota, bl);
1988 ENCODE_FINISH(bl);
1989}
1990
1991void RGWRegionMap::decode(bufferlist::iterator& bl) {
1992 DECODE_START(3, bl);
1993 ::decode(regions, bl);
1994 ::decode(master_region, bl);
1995 if (struct_v >= 2)
1996 ::decode(bucket_quota, bl);
1997 if (struct_v >= 3)
1998 ::decode(user_quota, bl);
1999 DECODE_FINISH(bl);
2000}
2001
2002void RGWZoneGroupMap::encode(bufferlist& bl) const {
2003 ENCODE_START( 3, 1, bl);
2004 ::encode(zonegroups, bl);
2005 ::encode(master_zonegroup, bl);
2006 ::encode(bucket_quota, bl);
2007 ::encode(user_quota, bl);
2008 ENCODE_FINISH(bl);
2009}
2010
2011void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
2012 DECODE_START(3, bl);
2013 ::decode(zonegroups, bl);
2014 ::decode(master_zonegroup, bl);
2015 if (struct_v >= 2)
2016 ::decode(bucket_quota, bl);
2017 if (struct_v >= 3)
2018 ::decode(user_quota, bl);
2019 DECODE_FINISH(bl);
2020
2021 zonegroups_by_api.clear();
2022 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2023 iter != zonegroups.end(); ++iter) {
2024 RGWZoneGroup& zonegroup = iter->second;
2025 zonegroups_by_api[zonegroup.api_name] = zonegroup;
31f18b77 2026 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
2027 master_zonegroup = zonegroup.get_name();
2028 }
2029 }
2030}
2031
2032void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2033{
2034 obj_version *check_objv = version_for_check();
2035
2036 if (check_objv) {
2037 cls_version_check(*op, *check_objv, VER_COND_EQ);
2038 }
2039
2040 cls_version_read(*op, &read_version);
2041}
2042
2043void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2044{
2045 obj_version *check_objv = version_for_check();
2046 obj_version *modify_version = version_for_write();
2047
2048 if (check_objv) {
2049 cls_version_check(*op, *check_objv, VER_COND_EQ);
2050 }
2051
2052 if (modify_version) {
2053 cls_version_set(*op, *modify_version);
2054 } else {
2055 cls_version_inc(*op);
2056 }
2057}
2058
// Advance the iterator to the next object (stripe) of the manifest.
//
// Explicit-objects manifests: step explicit_iter; at the end ofs is set to
// manifest->obj_size, otherwise position and location are refreshed.
//
// Rule-based manifests:
//   - nothing happens when ofs already equals the object size or when the
//     manifest has no rules;
//   - while still inside the head (ofs < head_size) the iterator moves to the
//     first stripe after the head, using the first rule;
//   - otherwise stripe_ofs advances by the rule's stripe_max_size; for
//     multipart rules (part_size > 0) crossing a part boundary resets
//     cur_stripe, moves part_ofs to the next part and may switch to the next
//     rule;
//   - finally ofs is clamped to the object size (stripe_size becomes 0 there)
//     and the location is refreshed.
2059void RGWObjManifest::obj_iterator::operator++()
2060{
2061 if (manifest->explicit_objs) {
2062 ++explicit_iter;
2063
2064 if (explicit_iter == manifest->objs.end()) {
2065 ofs = manifest->obj_size;
2066 return;
2067 }
2068
2069 update_explicit_pos();
2070
2071 update_location();
2072 return;
2073 }
2074
2075 uint64_t obj_size = manifest->get_obj_size();
2076 uint64_t head_size = manifest->get_head_size();
2077
2078 if (ofs == obj_size) {
2079 return;
2080 }
2081
2082 if (manifest->rules.empty()) {
2083 return;
2084 }
2085
2086 /* are we still pointing at the head? */
2087 if (ofs < head_size) {
2088 rule_iter = manifest->rules.begin();
2089 RGWObjManifestRule *rule = &rule_iter->second;
2090 ofs = MIN(head_size, obj_size);
2091 stripe_ofs = ofs;
2092 cur_stripe = 1;
2093 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2094 if (rule->part_size > 0) {
2095 stripe_size = MIN(stripe_size, rule->part_size);
2096 }
2097 update_location();
2098 return;
2099 }
2100
2101 RGWObjManifestRule *rule = &rule_iter->second;
2102
2103 stripe_ofs += rule->stripe_max_size;
2104 cur_stripe++;
2105 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2106
2107 if (rule->part_size > 0) {
2108 /* multi part, multi stripes object */
2109
2110 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2111
2112 if (stripe_ofs >= part_ofs + rule->part_size) {
2113 /* moved to the next part */
2114 cur_stripe = 0;
2115 part_ofs += rule->part_size;
2116 stripe_ofs = part_ofs;
2117
2118 bool last_rule = (next_rule_iter == manifest->rules.end());
2119 /* move to the next rule? */
2120 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2121 rule_iter = next_rule_iter;
2122 last_rule = (next_rule_iter == manifest->rules.end());
2123 if (!last_rule) {
2124 ++next_rule_iter;
2125 }
2126 cur_part_id = rule_iter->second.start_part_num;
2127 } else {
2128 cur_part_id++;
2129 }
2130
2131 rule = &rule_iter->second;
2132 }
2133
// the stripe ends at whichever comes first: the end of the part or
// stripe_max_size bytes
2134 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2135 }
2136
2137 cur_override_prefix = rule->override_prefix;
2138
2139 ofs = stripe_ofs;
2140 if (ofs > obj_size) {
2141 ofs = obj_size;
2142 stripe_ofs = ofs;
2143 stripe_size = 0;
2144 }
2145
2146 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2147 update_location();
2148}
2149
2150int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2151{
2152 manifest = _m;
2153
2154 manifest->set_tail_placement(placement_rule, _b);
2155 manifest->set_head(placement_rule, _obj, 0);
2156 last_ofs = 0;
2157
2158 if (manifest->get_prefix().empty()) {
2159 char buf[33];
2160 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2161
2162 string oid_prefix = ".";
2163 oid_prefix.append(buf);
2164 oid_prefix.append("_");
2165
2166 manifest->set_prefix(oid_prefix);
2167 }
2168
2169 bool found = manifest->get_rule(0, &rule);
2170 if (!found) {
2171 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2172 return -EIO;
2173 }
2174
2175 uint64_t head_size = manifest->get_head_size();
2176
2177 if (head_size > 0) {
2178 cur_stripe_size = head_size;
2179 } else {
2180 cur_stripe_size = rule.stripe_max_size;
2181 }
2182
2183 cur_part_id = rule.start_part_num;
2184
2185 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2186
2187 // Normal object which not generated through copy operation
2188 manifest->set_tail_instance(_obj.key.instance);
2189
2190 manifest->update_iterators();
2191
2192 return 0;
2193}
2194
2195int RGWObjManifest::generator::create_next(uint64_t ofs)
2196{
2197 if (ofs < last_ofs) /* only going forward */
2198 return -EINVAL;
2199
2200 uint64_t max_head_size = manifest->get_max_head_size();
2201
2202 if (ofs < max_head_size) {
2203 manifest->set_head_size(ofs);
2204 }
2205
2206 if (ofs >= max_head_size) {
2207 manifest->set_head_size(max_head_size);
2208 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2209 cur_stripe_size = rule.stripe_max_size;
2210
2211 if (cur_part_id == 0 && max_head_size > 0) {
2212 cur_stripe++;
2213 }
2214 }
2215
2216 last_ofs = ofs;
2217 manifest->set_obj_size(ofs);
2218
2219 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2220
2221 manifest->update_iterators();
2222
2223 return 0;
2224}
2225
2226const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2227{
2228 return begin_iter;
2229}
2230
2231const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2232{
2233 return end_iter;
2234}
2235
2236RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2237{
2238 if (ofs > obj_size) {
2239 ofs = obj_size;
2240 }
2241 RGWObjManifest::obj_iterator iter(this);
2242 iter.seek(ofs);
2243 return iter;
2244}
2245
2246int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2247{
2248 if (explicit_objs || m.explicit_objs) {
2249 return append_explicit(m, zonegroup, zone_params);
2250 }
2251
2252 if (rules.empty()) {
2253 *this = m;
2254 return 0;
2255 }
2256
2257 string override_prefix;
2258
2259 if (prefix.empty()) {
2260 prefix = m.prefix;
2261 }
2262
2263 if (prefix != m.prefix) {
2264 override_prefix = m.prefix;
2265 }
2266
2267 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2268 if (miter == m.rules.end()) {
2269 return append_explicit(m, zonegroup, zone_params);
2270 }
2271
2272 for (; miter != m.rules.end(); ++miter) {
2273 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2274
2275 RGWObjManifestRule& rule = last_rule->second;
2276
2277 if (rule.part_size == 0) {
2278 rule.part_size = obj_size - rule.start_ofs;
2279 }
2280
2281 RGWObjManifestRule& next_rule = miter->second;
2282 if (!next_rule.part_size) {
2283 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2284 }
2285
2286 string rule_prefix = prefix;
2287 if (!rule.override_prefix.empty()) {
2288 rule_prefix = rule.override_prefix;
2289 }
2290
2291 string next_rule_prefix = m.prefix;
2292 if (!next_rule.override_prefix.empty()) {
2293 next_rule_prefix = next_rule.override_prefix;
2294 }
2295
2296 if (rule.part_size != next_rule.part_size ||
2297 rule.stripe_max_size != next_rule.stripe_max_size ||
2298 rule_prefix != next_rule_prefix) {
2299 if (next_rule_prefix != prefix) {
2300 append_rules(m, miter, &next_rule_prefix);
2301 } else {
2302 append_rules(m, miter, NULL);
2303 }
2304 break;
2305 }
2306
2307 uint64_t expected_part_num = rule.start_part_num + 1;
2308 if (rule.part_size > 0) {
2309 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2310 }
2311
2312 if (expected_part_num != next_rule.start_part_num) {
2313 append_rules(m, miter, NULL);
2314 break;
2315 }
2316 }
2317
2318 set_obj_size(obj_size + m.obj_size);
2319
2320 return 0;
2321}
2322
2323int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2324{
2325 return append(m, store->get_zonegroup(), store->get_zone_params());
2326}
2327
2328void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2329 string *override_prefix)
2330{
2331 for (; miter != m.rules.end(); ++miter) {
2332 RGWObjManifestRule rule = miter->second;
2333 rule.start_ofs += obj_size;
2334 if (override_prefix)
2335 rule.override_prefix = *override_prefix;
2336 rules[rule.start_ofs] = rule;
2337 }
2338}
2339
2340void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2341{
2342 if (explicit_objs) {
2343 return;
2344 }
2345 obj_iterator iter = obj_begin();
2346
2347 while (iter != obj_end()) {
2348 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2349 const rgw_obj_select& os = iter.get_location();
2350 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2351 part.loc_ofs = 0;
2352
2353 uint64_t ofs = iter.get_stripe_ofs();
2354
2355 if (ofs == 0) {
2356 part.loc = obj;
2357 } else {
2358 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2359 }
2360 ++iter;
2361 uint64_t next_ofs = iter.get_stripe_ofs();
2362
2363 part.size = next_ofs - ofs;
2364 }
2365
2366 explicit_objs = true;
2367 rules.clear();
2368 prefix.clear();
2369}
2370
2371int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2372{
2373 if (!explicit_objs) {
2374 convert_to_explicit(zonegroup, zone_params);
2375 }
2376 if (!m.explicit_objs) {
2377 m.convert_to_explicit(zonegroup, zone_params);
2378 }
2379 map<uint64_t, RGWObjManifestPart>::iterator iter;
2380 uint64_t base = obj_size;
2381 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2382 RGWObjManifestPart& part = iter->second;
2383 objs[base + iter->first] = part;
2384 }
2385 obj_size += m.obj_size;
2386
2387 return 0;
2388}
2389
2390bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2391{
2392 if (rules.empty()) {
2393 return false;
2394 }
2395
2396 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2397 if (iter != rules.begin()) {
2398 --iter;
2399 }
2400
2401 *rule = iter->second;
2402
2403 return true;
2404}
2405
2406void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2407{
2408 write_version.ver = 1;
2409#define TAG_LEN 24
2410
2411 write_version.tag.clear();
2412 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2413}
2414
2415int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2416 real_time *mtime, real_time set_mtime,
2417 map<string, bufferlist>& attrs, real_time delete_at,
31f18b77
FG
2418 const char *if_match, const char *if_nomatch, const string *user_data,
2419 rgw_zone_set *zones_trace)
7c673cae 2420{
31f18b77 2421 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
7c673cae
FG
2422 if (r < 0)
2423 return r;
2424
2425 is_complete = !canceled;
2426 return 0;
2427}
2428
2429CephContext *RGWPutObjProcessor::ctx()
2430{
2431 return store->ctx();
2432}
2433
2434RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2435{
2436 drain_pending();
2437
2438 if (is_complete)
2439 return;
2440
2441 set<rgw_raw_obj>::iterator iter;
2442 bool need_to_remove_head = false;
2443 rgw_raw_obj raw_head;
2444
2445 if (!head_obj.empty()) {
2446 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2447 }
2448
2449 /**
2450 * We should delete the object in the "multipart" namespace to avoid race condition.
2451 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2452 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2453 * written by the second upload may be deleted by the first upload.
2454 * details is describled on #11749
2455 *
2456 * The above comment still stands, but instead of searching for a specific object in the multipart
2457 * namespace, we just make sure that we remove the object that is marked as the head object after
2458 * we remove all the other raw objects. Note that we use different call to remove the head object,
2459 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2460 */
2461 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2462 const rgw_raw_obj& obj = *iter;
2463 if (!head_obj.empty() && obj == raw_head) {
2464 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2465 need_to_remove_head = true;
2466 continue;
2467 }
2468
2469 int r = store->delete_raw_obj(obj);
2470 if (r < 0 && r != -ENOENT) {
2471 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2472 }
2473 }
2474
2475 if (need_to_remove_head) {
2476 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2477 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2478 if (r < 0 && r != -ENOENT) {
2479 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2480 }
2481 }
2482}
2483
2484int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2485{
2486 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2487 obj_len = abs_ofs + bl.length();
2488
2489 if (!(obj == last_written_obj)) {
2490 last_written_obj = obj;
2491 }
2492
2493 // For the first call pass -1 as the offset to
2494 // do a write_full.
2495 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2496}
2497
2498struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2499{
2500 struct put_obj_aio_info info;
2501 info = pending.front();
2502 pending.pop_front();
2503 pending_size -= info.size;
2504 return info;
2505}
2506
2507int RGWPutObjProcessor_Aio::wait_pending_front()
2508{
2509 if (pending.empty()) {
2510 return 0;
2511 }
2512 struct put_obj_aio_info info = pop_pending();
2513 int ret = store->aio_wait(info.handle);
2514
2515 if (ret >= 0) {
2516 add_written_obj(info.obj);
2517 }
2518
2519 return ret;
2520}
2521
2522bool RGWPutObjProcessor_Aio::pending_has_completed()
2523{
2524 if (pending.empty())
2525 return false;
2526
2527 struct put_obj_aio_info& info = pending.front();
2528 return store->aio_completed(info.handle);
2529}
2530
2531int RGWPutObjProcessor_Aio::drain_pending()
2532{
2533 int ret = 0;
2534 while (!pending.empty()) {
2535 int r = wait_pending_front();
2536 if (r < 0)
2537 ret = r;
2538 }
2539 return ret;
2540}
2541
2542int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2543{
2544 bool _wait = need_to_wait;
2545
2546 if (handle) {
2547 struct put_obj_aio_info info;
2548 info.handle = handle;
2549 info.obj = obj;
2550 info.size = size;
2551 pending_size += size;
2552 pending.push_back(info);
2553 }
2554 size_t orig_size = pending_size;
2555
2556 /* first drain complete IOs */
2557 while (pending_has_completed()) {
2558 int r = wait_pending_front();
2559 if (r < 0)
2560 return r;
2561
2562 _wait = false;
2563 }
2564
2565 /* resize window in case messages are draining too fast */
2566 if (orig_size - pending_size >= window_size) {
2567 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2568 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2569 if (window_size > max_window_size) {
2570 window_size = max_window_size;
2571 }
2572 }
2573
2574 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2575 if (pending_size > window_size || _wait) {
2576 int r = wait_pending_front();
2577 if (r < 0)
2578 return r;
2579 }
2580 return 0;
2581}
2582
2583int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2584{
2585 if (ofs >= next_part_ofs) {
2586 int r = prepare_next_part(ofs);
2587 if (r < 0) {
2588 return r;
2589 }
2590 }
2591
2592 *pobj = cur_obj;
2593
224ce89b
WB
2594 if (!bl.length()) {
2595 *phandle = nullptr;
7c673cae 2596 return 0;
224ce89b 2597 }
7c673cae
FG
2598
2599 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2600}
2601
2602int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2603{
2604 RGWPutObjProcessor::prepare(store, oid_rand);
2605
2606 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2607
2608 return 0;
2609}
2610
2611int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2612{
2613 *phandle = NULL;
2614 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2615
2616 pending_data_bl.claim_append(bl);
2617 if (pending_data_bl.length() < max_write_size) {
2618 *again = false;
2619 return 0;
2620 }
2621
2622 pending_data_bl.splice(0, max_write_size, &bl);
2623
2624 /* do we have enough data pending accumulated that needs to be written? */
2625 *again = (pending_data_bl.length() >= max_chunk_size);
2626
2627 if (!data_ofs && !immutable_head()) {
2628 first_chunk.claim(bl);
2629 obj_len = (uint64_t)first_chunk.length();
2630 int r = prepare_next_part(obj_len);
2631 if (r < 0) {
2632 return r;
2633 }
2634 data_ofs = obj_len;
2635 return 0;
2636 }
2637 off_t write_ofs = data_ofs;
2638 data_ofs = write_ofs + bl.length();
2639 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2640 we could be racing with another upload, to the same
2641 object and cleanup can be messy */
2642 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2643 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2644 bl.clear();
2645 }
2646 return ret;
2647}
2648
2649
2650int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2651{
2652 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2653
2654 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2655 if (r < 0) {
2656 return r;
2657 }
2658
2659 return 0;
2660}
2661
2662int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2663{
2664 head_obj.init(bucket, obj_str);
2665
2666 int r = prepare_init(store, oid_rand);
2667 if (r < 0) {
2668 return r;
2669 }
2670
2671 if (!version_id.empty()) {
2672 head_obj.key.set_instance(version_id);
2673 } else if (versioned_object) {
2674 store->gen_rand_obj_instance_name(&head_obj);
2675 }
2676
2677 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2678
2679 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2680 if (r < 0) {
2681 return r;
2682 }
2683
2684 return 0;
2685}
2686
2687int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2688
2689 int ret = manifest_gen.create_next(ofs);
2690 if (ret < 0) {
2691 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2692 return ret;
2693 }
2694 cur_part_ofs = ofs;
2695 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2696 cur_obj = manifest_gen.get_cur_obj(store);
2697
2698 return 0;
2699}
2700
2701int RGWPutObjProcessor_Atomic::complete_parts()
2702{
2703 if (obj_len > (uint64_t)cur_part_ofs) {
2704 return prepare_next_part(obj_len);
2705 }
2706 return 0;
2707}
2708
2709int RGWPutObjProcessor_Atomic::complete_writing_data()
2710{
2711 if (!data_ofs && !immutable_head()) {
2712 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2713 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2714 * clobber first_chunk
2715 */
2716 if (pending_data_bl.length() > 0) {
2717 first_chunk.claim(pending_data_bl);
2718 }
2719 obj_len = (uint64_t)first_chunk.length();
2720 }
2721 while (pending_data_bl.length()) {
224ce89b 2722 void *handle = nullptr;
7c673cae
FG
2723 rgw_raw_obj obj;
2724 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2725 if (max_write_size > pending_data_bl.length()) {
2726 max_write_size = pending_data_bl.length();
2727 }
2728 bufferlist bl;
2729 pending_data_bl.splice(0, max_write_size, &bl);
2730 uint64_t write_len = bl.length();
2731 int r = write_data(bl, data_ofs, &handle, &obj, false);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2734 return r;
2735 }
2736 data_ofs += write_len;
2737 r = throttle_data(handle, obj, write_len, false);
2738 if (r < 0) {
2739 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2740 return r;
2741 }
2742
2743 if (data_ofs >= next_part_ofs) {
2744 r = prepare_next_part(data_ofs);
2745 if (r < 0) {
2746 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2747 return r;
2748 }
2749 }
2750 }
2751 int r = complete_parts();
2752 if (r < 0) {
2753 return r;
2754 }
2755
2756 r = drain_pending();
2757 if (r < 0)
2758 return r;
2759
2760 return 0;
2761}
2762
2763int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2764 real_time *mtime, real_time set_mtime,
2765 map<string, bufferlist>& attrs,
2766 real_time delete_at,
2767 const char *if_match,
31f18b77
FG
2768 const char *if_nomatch, const string *user_data,
2769 rgw_zone_set *zones_trace) {
7c673cae
FG
2770 int r = complete_writing_data();
2771 if (r < 0)
2772 return r;
2773
2774 obj_ctx.obj.set_atomic(head_obj);
2775
2776 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2777
2778 /* some object types shouldn't be versioned, e.g., multipart parts */
2779 op_target.set_versioning_disabled(!versioned_object);
2780
2781 RGWRados::Object::Write obj_op(&op_target);
2782
2783 obj_op.meta.data = &first_chunk;
2784 obj_op.meta.manifest = &manifest;
2785 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2786 obj_op.meta.if_match = if_match;
2787 obj_op.meta.if_nomatch = if_nomatch;
2788 obj_op.meta.mtime = mtime;
2789 obj_op.meta.set_mtime = set_mtime;
2790 obj_op.meta.owner = bucket_info.owner;
2791 obj_op.meta.flags = PUT_OBJ_CREATE;
2792 obj_op.meta.olh_epoch = olh_epoch;
2793 obj_op.meta.delete_at = delete_at;
2794 obj_op.meta.user_data = user_data;
31f18b77 2795 obj_op.meta.zones_trace = zones_trace;
181888fb 2796 obj_op.meta.modify_tail = true;
7c673cae
FG
2797
2798 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2799 if (r < 0) {
2800 return r;
2801 }
2802
2803 canceled = obj_op.meta.canceled;
2804
2805 return 0;
2806}
2807
3a9019d9
FG
2808const char* RGWRados::admin_commands[4][3] = {
2809 { "cache list",
2810 "cache list name=filter,type=CephString,req=false",
2811 "cache list [filter_str]: list object cache, possibly matching substrings" },
2812 { "cache inspect",
2813 "cache inspect name=target,type=CephString,req=true",
2814 "cache inspect target: print cache element" },
2815 { "cache erase",
2816 "cache erase name=target,type=CephString,req=true",
2817 "cache erase target: erase element from cache" },
2818 { "cache zap",
2819 "cache zap",
2820 "cache zap: erase all elements from cache" }
2821};
2822
2823
7c673cae
FG
2824int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2825 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2826 if (r < 0)
2827 return r;
2828 return 0;
2829}
2830
2831int RGWRados::unwatch(uint64_t watch_handle)
2832{
2833 int r = control_pool_ctx.unwatch2(watch_handle);
2834 if (r < 0) {
2835 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2836 return r;
2837 }
2838 r = rados[0].watch_flush();
2839 if (r < 0) {
2840 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2841 return r;
2842 }
2843 return 0;
2844}
2845
2846void RGWRados::add_watcher(int i)
2847{
2848 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2849 Mutex::Locker l(watchers_lock);
2850 watchers_set.insert(i);
2851 if (watchers_set.size() == (size_t)num_watchers) {
2852 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2853 set_cache_enabled(true);
2854 }
2855}
2856
2857void RGWRados::remove_watcher(int i)
2858{
2859 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2860 Mutex::Locker l(watchers_lock);
2861 size_t orig_size = watchers_set.size();
2862 watchers_set.erase(i);
2863 if (orig_size == (size_t)num_watchers &&
2864 watchers_set.size() < orig_size) { /* actually removed */
2865 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2866 set_cache_enabled(false);
2867 }
2868}
2869
2870class RGWWatcher : public librados::WatchCtx2 {
2871 RGWRados *rados;
2872 int index;
2873 string oid;
2874 uint64_t watch_handle;
2875
2876 class C_ReinitWatch : public Context {
2877 RGWWatcher *watcher;
2878 public:
2879 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2880 void finish(int r) override {
2881 watcher->reinit();
2882 }
2883 };
2884public:
2885 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2886 void handle_notify(uint64_t notify_id,
2887 uint64_t cookie,
2888 uint64_t notifier_id,
2889 bufferlist& bl) override {
2890 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2891 << " notify_id " << notify_id
2892 << " cookie " << cookie
2893 << " notifier " << notifier_id
2894 << " bl.length()=" << bl.length() << dendl;
2895 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2896
2897 bufferlist reply_bl; // empty reply payload
2898 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2899 }
2900 void handle_error(uint64_t cookie, int err) override {
2901 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2902 << " err " << cpp_strerror(err) << dendl;
2903 rados->remove_watcher(index);
2904 rados->schedule_context(new C_ReinitWatch(this));
2905 }
2906
2907 void reinit() {
2908 int ret = unregister_watch();
2909 if (ret < 0) {
2910 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2911 return;
2912 }
2913 ret = register_watch();
2914 if (ret < 0) {
2915 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2916 return;
2917 }
2918 }
2919
2920 int unregister_watch() {
2921 int r = rados->unwatch(watch_handle);
2922 if (r < 0) {
2923 return r;
2924 }
2925 rados->remove_watcher(index);
2926 return 0;
2927 }
2928
2929 int register_watch() {
2930 int r = rados->watch(oid, &watch_handle, this);
2931 if (r < 0) {
2932 return r;
2933 }
2934 rados->add_watcher(index);
2935 return 0;
2936 }
2937};
2938
2939class RGWMetaNotifierManager : public RGWCoroutinesManager {
2940 RGWRados *store;
2941 RGWHTTPManager http_manager;
2942
2943public:
2944 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2945 http_manager(store->ctx(), completion_mgr) {
2946 http_manager.set_threaded();
2947 }
2948
2949 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2950 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2951 { "notify", NULL },
2952 { NULL, NULL } };
2953
2954 list<RGWCoroutinesStack *> stacks;
2955 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2956 RGWRESTConn *conn = iter->second;
2957 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2958 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2959
2960 stacks.push_back(stack);
2961 }
2962 return run(stacks);
2963 }
2964};
2965
2966class RGWDataNotifierManager : public RGWCoroutinesManager {
2967 RGWRados *store;
2968 RGWHTTPManager http_manager;
2969
2970public:
2971 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2972 http_manager(store->ctx(), completion_mgr) {
2973 http_manager.set_threaded();
2974 }
2975
2976 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2977 rgw_http_param_pair pairs[] = { { "type", "data" },
2978 { "notify", NULL },
2979 { "source-zone", store->get_zone_params().get_id().c_str() },
2980 { NULL, NULL } };
2981
2982 list<RGWCoroutinesStack *> stacks;
2983 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2984 RGWRESTConn *conn = iter->second;
2985 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2986 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2987
2988 stacks.push_back(stack);
2989 }
2990 return run(stacks);
2991 }
2992};
2993
2994class RGWRadosThread {
2995 class Worker : public Thread {
2996 CephContext *cct;
2997 RGWRadosThread *processor;
2998 Mutex lock;
2999 Cond cond;
3000
31f18b77
FG
3001 void wait() {
3002 Mutex::Locker l(lock);
3003 cond.Wait(lock);
3004 };
3005
3006 void wait_interval(const utime_t& wait_time) {
3007 Mutex::Locker l(lock);
3008 cond.WaitInterval(lock, wait_time);
3009 }
3010
7c673cae
FG
3011 public:
3012 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
3013 void *entry() override;
31f18b77 3014 void signal() {
7c673cae
FG
3015 Mutex::Locker l(lock);
3016 cond.Signal();
3017 }
3018 };
3019
3020 Worker *worker;
3021
3022protected:
3023 CephContext *cct;
3024 RGWRados *store;
3025
3026 std::atomic<bool> down_flag = { false };
3027
3028 string thread_name;
3029
3030 virtual uint64_t interval_msec() = 0;
3031 virtual void stop_process() {}
3032public:
3033 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3034 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3035 virtual ~RGWRadosThread() {
3036 stop();
3037 }
3038
3039 virtual int init() { return 0; }
3040 virtual int process() = 0;
3041
3042 bool going_down() { return down_flag; }
3043
3044 void start();
3045 void stop();
31f18b77
FG
3046
3047 void signal() {
3048 if (worker) {
3049 worker->signal();
3050 }
3051 }
7c673cae
FG
3052};
3053
3054void RGWRadosThread::start()
3055{
3056 worker = new Worker(cct, this);
3057 worker->create(thread_name.c_str());
3058}
3059
3060void RGWRadosThread::stop()
3061{
3062 down_flag = true;
3063 stop_process();
3064 if (worker) {
31f18b77 3065 worker->signal();
7c673cae
FG
3066 worker->join();
3067 }
3068 delete worker;
3069 worker = NULL;
3070}
3071
3072void *RGWRadosThread::Worker::entry() {
3073 uint64_t msec = processor->interval_msec();
3074 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3075
3076 do {
3077 utime_t start = ceph_clock_now();
3078 int r = processor->process();
3079 if (r < 0) {
3080 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3081 }
3082
3083 if (processor->going_down())
3084 break;
3085
3086 utime_t end = ceph_clock_now();
3087 end -= start;
3088
3089 uint64_t cur_msec = processor->interval_msec();
3090 if (cur_msec != msec) { /* was it reconfigured? */
3091 msec = cur_msec;
3092 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3093 }
3094
3095 if (cur_msec > 0) {
3096 if (interval <= end)
3097 continue; // next round
3098
3099 utime_t wait_time = interval;
3100 wait_time -= end;
3101
31f18b77 3102 wait_interval(wait_time);
7c673cae 3103 } else {
31f18b77 3104 wait();
7c673cae
FG
3105 }
3106 } while (!processor->going_down());
3107
3108 return NULL;
3109}
3110
3111class RGWMetaNotifier : public RGWRadosThread {
3112 RGWMetaNotifierManager notify_mgr;
3113 RGWMetadataLog *const log;
3114
3115 uint64_t interval_msec() override {
3116 return cct->_conf->rgw_md_notify_interval_msec;
3117 }
1adf2230
AA
3118 void stop_process() override {
3119 notify_mgr.stop();
3120 }
7c673cae
FG
3121public:
3122 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3123 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3124
3125 int process() override;
3126};
3127
3128int RGWMetaNotifier::process()
3129{
3130 set<int> shards;
3131
3132 log->read_clear_modified(shards);
3133
3134 if (shards.empty()) {
3135 return 0;
3136 }
3137
3138 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3139 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3140 }
3141
3142 notify_mgr.notify_all(store->zone_conn_map, shards);
3143
3144 return 0;
3145}
3146
3147class RGWDataNotifier : public RGWRadosThread {
3148 RGWDataNotifierManager notify_mgr;
3149
3150 uint64_t interval_msec() override {
d2e6a577 3151 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 3152 }
1adf2230
AA
3153 void stop_process() override {
3154 notify_mgr.stop();
3155 }
7c673cae
FG
3156public:
3157 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3158
3159 int process() override;
3160};
3161
3162int RGWDataNotifier::process()
3163{
3164 if (!store->data_log) {
3165 return 0;
3166 }
3167
3168 map<int, set<string> > shards;
3169
3170 store->data_log->read_clear_modified(shards);
3171
3172 if (shards.empty()) {
3173 return 0;
3174 }
3175
3176 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3177 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3178 }
3179
3180 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3181
3182 return 0;
3183}
3184
3185class RGWSyncProcessorThread : public RGWRadosThread {
3186public:
3187 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3188 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3189 ~RGWSyncProcessorThread() override {}
3190 int init() override = 0 ;
3191 int process() override = 0;
3192};
3193
3194class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3195{
3196 RGWMetaSyncStatusManager sync;
3197
3198 uint64_t interval_msec() override {
3199 return 0; /* no interval associated, it'll run once until stopped */
3200 }
3201 void stop_process() override {
3202 sync.stop();
3203 }
3204public:
3205 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3206 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3207
3208 void wakeup_sync_shards(set<int>& shard_ids) {
3209 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3210 sync.wakeup(*iter);
3211 }
3212 }
3213 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3214
3215 int init() override {
3216 int ret = sync.init();
3217 if (ret < 0) {
3218 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3219 return ret;
3220 }
3221 return 0;
3222 }
3223
3224 int process() override {
3225 sync.run();
3226 return 0;
3227 }
3228};
3229
3230class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3231{
3232 RGWDataSyncStatusManager sync;
3233 bool initialized;
3234
3235 uint64_t interval_msec() override {
3236 if (initialized) {
3237 return 0; /* no interval associated, it'll run once until stopped */
3238 } else {
3239#define DATA_SYNC_INIT_WAIT_SEC 20
3240 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3241 }
3242 }
3243 void stop_process() override {
3244 sync.stop();
3245 }
3246public:
3247 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
91327a77 3248 const string& _source_zone)
b32b8144 3249 : RGWSyncProcessorThread(_store, "data-sync"),
91327a77 3250 sync(_store, async_rados, _source_zone),
7c673cae
FG
3251 initialized(false) {}
3252
3253 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3254 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3255 sync.wakeup(iter->first, iter->second);
3256 }
3257 }
3258 RGWDataSyncStatusManager* get_manager() { return &sync; }
3259
3260 int init() override {
3261 return 0;
3262 }
3263
3264 int process() override {
3265 while (!initialized) {
3266 if (going_down()) {
3267 return 0;
3268 }
3269 int ret = sync.init();
3270 if (ret >= 0) {
3271 initialized = true;
3272 break;
3273 }
3274 /* we'll be back! */
3275 return 0;
3276 }
3277 sync.run();
3278 return 0;
3279 }
3280};
3281
3282class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3283{
3284 RGWCoroutinesManager crs;
3285 RGWRados *store;
b32b8144 3286 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
3287 RGWHTTPManager http;
3288 const utime_t trim_interval;
3289
3290 uint64_t interval_msec() override { return 0; }
3291 void stop_process() override { crs.stop(); }
3292public:
b32b8144
FG
3293 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
3294 int interval)
7c673cae
FG
3295 : RGWSyncProcessorThread(store, "sync-log-trim"),
3296 crs(store->ctx(), store->get_cr_registry()), store(store),
b32b8144 3297 bucket_trim(bucket_trim),
7c673cae
FG
3298 http(store->ctx(), crs.get_completion_mgr()),
3299 trim_interval(interval, 0)
3300 {}
3301
3302 int init() override {
3303 return http.set_threaded();
3304 }
3305 int process() override {
3306 list<RGWCoroutinesStack*> stacks;
3307 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3308 meta->call(create_meta_log_trim_cr(store, &http,
3309 cct->_conf->rgw_md_log_max_shards,
3310 trim_interval));
3311 stacks.push_back(meta);
3312
3313 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3314 data->call(create_data_log_trim_cr(store, &http,
3315 cct->_conf->rgw_data_log_num_shards,
3316 trim_interval));
3317 stacks.push_back(data);
3318
b32b8144
FG
3319 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
3320 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
3321 stacks.push_back(bucket);
3322
7c673cae
FG
3323 crs.run(stacks);
3324 return 0;
3325 }
3326};
3327
3328void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3329{
3330 Mutex::Locker l(meta_sync_thread_lock);
3331 if (meta_sync_processor_thread) {
3332 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3333 }
3334}
3335
3336void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3337{
3338 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3339 Mutex::Locker l(data_sync_thread_lock);
3340 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3341 if (iter == data_sync_processor_threads.end()) {
3342 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3343 return;
3344 }
3345
3346 RGWDataSyncProcessorThread *thread = iter->second;
3347 assert(thread);
3348 thread->wakeup_sync_shards(shard_ids);
3349}
3350
3351RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3352{
3353 Mutex::Locker l(meta_sync_thread_lock);
3354 if (meta_sync_processor_thread) {
3355 return meta_sync_processor_thread->get_manager();
3356 }
3357 return nullptr;
3358}
3359
3360RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3361{
3362 Mutex::Locker l(data_sync_thread_lock);
3363 auto thread = data_sync_processor_threads.find(source_zone);
3364 if (thread == data_sync_processor_threads.end()) {
3365 return nullptr;
3366 }
3367 return thread->second->get_manager();
3368}
3369
3370int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3371{
3372 IoCtx ioctx;
3373 int r = open_pool_ctx(pool, ioctx);
3374 if (r < 0) {
3375 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3376 return r;
3377 }
3378
3379 bool requires;
3380 r = ioctx.pool_requires_alignment2(&requires);
3381 if (r < 0) {
3382 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3383 << r << dendl;
3384 return r;
3385 }
3386
3387 if (!requires) {
3388 *alignment = 0;
3389 return 0;
3390 }
3391
3392 uint64_t align;
3393 r = ioctx.pool_required_alignment2(&align);
3394 if (r < 0) {
3395 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3396 << r << dendl;
3397 return r;
3398 }
3399 if (align != 0) {
3400 ldout(cct, 20) << "required alignment=" << align << dendl;
3401 }
3402 *alignment = align;
3403 return 0;
3404}
3405
3406int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3407{
224ce89b 3408 uint64_t alignment = 0;
7c673cae
FG
3409 int r = get_required_alignment(pool, &alignment);
3410 if (r < 0) {
3411 return r;
3412 }
3413
3414 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3415
3416 if (alignment == 0) {
3417 *max_chunk_size = config_chunk_size;
3418 return 0;
3419 }
3420
3421 if (config_chunk_size <= alignment) {
3422 *max_chunk_size = alignment;
3423 return 0;
3424 }
3425
3426 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3427
3428 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3429
3430 return 0;
3431}
3432
3433int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3434{
3435 rgw_pool pool;
3436 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3437 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3438 return -EIO;
3439 }
3440 return get_max_chunk_size(pool, max_chunk_size);
3441}
3442
31f18b77
FG
3443class RGWIndexCompletionManager;
3444
3445struct complete_op_data {
3446 Mutex lock{"complete_op_data"};
3447 AioCompletion *rados_completion{nullptr};
3448 int manager_shard_id{-1};
3449 RGWIndexCompletionManager *manager{nullptr};
3450 rgw_obj obj;
3451 RGWModifyOp op;
3452 string tag;
3453 rgw_bucket_entry_ver ver;
3454 cls_rgw_obj_key key;
3455 rgw_bucket_dir_entry_meta dir_meta;
3456 list<cls_rgw_obj_key> remove_objs;
3457 bool log_op;
3458 uint16_t bilog_op;
3459 rgw_zone_set zones_trace;
3460
3461 bool stopped{false};
3462
3463 void stop() {
3464 Mutex::Locker l(lock);
3465 stopped = true;
3466 }
3467};
3468
3469class RGWIndexCompletionThread : public RGWRadosThread {
3470 RGWRados *store;
3471
3472 uint64_t interval_msec() override {
3473 return 0;
3474 }
3475
3476 list<complete_op_data *> completions;
3477
3478 Mutex completions_lock;
3479public:
3480 RGWIndexCompletionThread(RGWRados *_store)
3481 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3482
3483 int process() override;
3484
3485 void add_completion(complete_op_data *completion) {
3486 {
3487 Mutex::Locker l(completions_lock);
3488 completions.push_back(completion);
3489 }
3490
3491 signal();
3492 }
3493};
3494
3495int RGWIndexCompletionThread::process()
3496{
3497 list<complete_op_data *> comps;
3498
3499 {
3500 Mutex::Locker l(completions_lock);
3501 completions.swap(comps);
3502 }
3503
3504 for (auto c : comps) {
3505 std::unique_ptr<complete_op_data> up{c};
3506
3507 if (going_down()) {
3508 continue;
3509 }
3510 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3511
3512 RGWRados::BucketShard bs(store);
f64942e4 3513 RGWBucketInfo bucket_info;
31f18b77 3514
f64942e4 3515 int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
31f18b77
FG
3516 if (r < 0) {
3517 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3518 /* not much to do */
3519 continue;
3520 }
3521
f64942e4
AA
3522 r = store->guard_reshard(&bs, c->obj, bucket_info,
3523 [&](RGWRados::BucketShard *bs) -> int {
3524 librados::ObjectWriteOperation o;
3525 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3526 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3527 c->log_op, c->bilog_op, &c->zones_trace);
3528 return bs->index_ctx.operate(bs->bucket_obj, &o);
31f18b77
FG
3529 });
3530 if (r < 0) {
3531 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3532 /* ignoring error, can't do anything about it */
3533 continue;
3534 }
3535 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3536 if (r < 0) {
3537 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3538 }
3539 }
3540
3541 return 0;
3542}
3543
3544class RGWIndexCompletionManager {
3545 RGWRados *store{nullptr};
3546 vector<Mutex *> locks;
3547 vector<set<complete_op_data *> > completions;
3548
3549 RGWIndexCompletionThread *completion_thread{nullptr};
3550
3551 int num_shards;
3552
3553 std::atomic<int> cur_shard {0};
3554
3555
3556public:
3557 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3558 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3559
3560 for (int i = 0; i < num_shards; i++) {
3561 char buf[64];
3562 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3563 locks.push_back(new Mutex(buf));
3564 }
3565
3566 completions.resize(num_shards);
3567 }
3568 ~RGWIndexCompletionManager() {
3569 stop();
3570
3571 for (auto l : locks) {
3572 delete l;
3573 }
3574 }
3575
3576 int next_shard() {
3577 int result = cur_shard % num_shards;
3578 cur_shard++;
3579 return result;
3580 }
3581
3582 void create_completion(const rgw_obj& obj,
3583 RGWModifyOp op, string& tag,
3584 rgw_bucket_entry_ver& ver,
3585 const cls_rgw_obj_key& key,
3586 rgw_bucket_dir_entry_meta& dir_meta,
3587 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3588 uint16_t bilog_op,
3589 rgw_zone_set *zones_trace,
3590 complete_op_data **result);
3591 bool handle_completion(completion_t cb, complete_op_data *arg);
3592
3593 int start() {
3594 completion_thread = new RGWIndexCompletionThread(store);
3595 int ret = completion_thread->init();
3596 if (ret < 0) {
3597 return ret;
3598 }
3599 completion_thread->start();
3600 return 0;
3601 }
3602 void stop() {
3603 if (completion_thread) {
3604 completion_thread->stop();
3605 delete completion_thread;
3606 }
3607
3608 for (int i = 0; i < num_shards; ++i) {
3609 Mutex::Locker l(*locks[i]);
3610 for (auto c : completions[i]) {
31f18b77
FG
3611 c->stop();
3612 }
3613 }
3614 completions.clear();
3615 }
3616};
3617
3618static void obj_complete_cb(completion_t cb, void *arg)
3619{
3620 complete_op_data *completion = (complete_op_data *)arg;
3621 completion->lock.Lock();
3622 if (completion->stopped) {
3623 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3624 delete completion;
3625 return;
3626 }
3627 bool need_delete = completion->manager->handle_completion(cb, completion);
3628 completion->lock.Unlock();
3629 if (need_delete) {
3630 delete completion;
3631 }
3632}
3633
3634
3635void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3636 RGWModifyOp op, string& tag,
3637 rgw_bucket_entry_ver& ver,
3638 const cls_rgw_obj_key& key,
3639 rgw_bucket_dir_entry_meta& dir_meta,
3640 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3641 uint16_t bilog_op,
3642 rgw_zone_set *zones_trace,
3643 complete_op_data **result)
3644{
3645 complete_op_data *entry = new complete_op_data;
3646
3647 int shard_id = next_shard();
3648
3649 entry->manager_shard_id = shard_id;
3650 entry->manager = this;
3651 entry->obj = obj;
3652 entry->op = op;
3653 entry->tag = tag;
3654 entry->ver = ver;
3655 entry->key = key;
3656 entry->dir_meta = dir_meta;
3657 entry->log_op = log_op;
3658 entry->bilog_op = bilog_op;
3659
3660 if (remove_objs) {
3661 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3662 entry->remove_objs.push_back(*iter);
3663 }
3664 }
3665
3666 if (zones_trace) {
3667 entry->zones_trace = *zones_trace;
3668 } else {
3669 entry->zones_trace.insert(store->get_zone().id);
3670 }
3671
3672 *result = entry;
3673
3674 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3675
3676 Mutex::Locker l(*locks[shard_id]);
3677 completions[shard_id].insert(entry);
3678}
3679
3680bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3681{
3682 int shard_id = arg->manager_shard_id;
3683 {
3684 Mutex::Locker l(*locks[shard_id]);
3685
3686 auto& comps = completions[shard_id];
3687
3688 auto iter = comps.find(arg);
3689 if (iter == comps.end()) {
3690 return true;
3691 }
3692
3693 comps.erase(iter);
3694 }
3695
3696 int r = rados_aio_get_return_value(cb);
3697 if (r != -ERR_BUSY_RESHARDING) {
3698 return true;
3699 }
3700 completion_thread->add_completion(arg);
3701 return false;
3702}
3703
7c673cae
FG
3704void RGWRados::finalize()
3705{
3a9019d9
FG
3706 auto admin_socket = cct->get_admin_socket();
3707 for (auto cmd : admin_commands) {
3708 int r = admin_socket->unregister_command(cmd[0]);
3709 if (r < 0) {
3710 lderr(cct) << "ERROR: fail to unregister admin socket command (r=" << r
3711 << ")" << dendl;
3712 }
3713 }
3714
7c673cae
FG
3715 if (run_sync_thread) {
3716 Mutex::Locker l(meta_sync_thread_lock);
3717 meta_sync_processor_thread->stop();
3718
3719 Mutex::Locker dl(data_sync_thread_lock);
3720 for (auto iter : data_sync_processor_threads) {
3721 RGWDataSyncProcessorThread *thread = iter.second;
3722 thread->stop();
3723 }
3724 if (sync_log_trimmer) {
3725 sync_log_trimmer->stop();
3726 }
3727 }
3728 if (async_rados) {
3729 async_rados->stop();
3730 }
3731 if (run_sync_thread) {
3732 delete meta_sync_processor_thread;
3733 meta_sync_processor_thread = NULL;
3734 Mutex::Locker dl(data_sync_thread_lock);
3735 for (auto iter : data_sync_processor_threads) {
3736 RGWDataSyncProcessorThread *thread = iter.second;
3737 delete thread;
3738 }
3739 data_sync_processor_threads.clear();
3740 delete sync_log_trimmer;
3741 sync_log_trimmer = nullptr;
b32b8144 3742 bucket_trim = boost::none;
7c673cae
FG
3743 }
3744 if (finisher) {
3745 finisher->stop();
3746 }
3747 if (need_watch_notify()) {
3748 finalize_watch();
3749 }
3750 if (finisher) {
3751 /* delete finisher only after cleaning up watches, as watch error path might call
3752 * into finisher. We stop finisher before finalizing watch to make sure we don't
3753 * actually handle any racing work
3754 */
3755 delete finisher;
3756 }
3757 if (meta_notifier) {
3758 meta_notifier->stop();
3759 delete meta_notifier;
3760 }
3761 if (data_notifier) {
3762 data_notifier->stop();
3763 delete data_notifier;
3764 }
3765 delete data_log;
3766 if (async_rados) {
3767 delete async_rados;
3768 }
224ce89b 3769
c07f9fc5
FG
3770 delete lc;
3771 lc = NULL;
3772
7c673cae
FG
3773 delete gc;
3774 gc = NULL;
3775
7c673cae
FG
3776 delete obj_expirer;
3777 obj_expirer = NULL;
3778
3779 delete rest_master_conn;
3780
3781 map<string, RGWRESTConn *>::iterator iter;
3782 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3783 RGWRESTConn *conn = iter->second;
3784 delete conn;
3785 }
3786
3787 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3788 RGWRESTConn *conn = iter->second;
3789 delete conn;
3790 }
3791 RGWQuotaHandler::free_handler(quota_handler);
3792 if (cr_registry) {
3793 cr_registry->put();
3794 }
3795 delete meta_mgr;
3796 delete binfo_cache;
3797 delete obj_tombstone_cache;
3798 delete sync_modules_manager;
31f18b77
FG
3799
3800 if (reshard_wait.get()) {
3801 reshard_wait->stop();
3802 reshard_wait.reset();
3803 }
3804
3805 if (run_reshard_thread) {
3806 reshard->stop_processor();
3807 }
3808 delete reshard;
3809 delete index_completion_manager;
7c673cae
FG
3810}
3811
3812/**
3813 * Initialize the RADOS instance and prepare to do other ops
3814 * Returns 0 on success, -ERR# on failure.
3815 */
3816int RGWRados::init_rados()
3817{
3818 int ret = 0;
3a9019d9
FG
3819 auto admin_socket = cct->get_admin_socket();
3820 for (auto cmd : admin_commands) {
3821 int r = admin_socket->register_command(cmd[0], cmd[1], this,
3822 cmd[2]);
3823 if (r < 0) {
3824 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
3825 << ")" << dendl;
3826 return r;
3827 }
3828 }
3829
7c673cae
FG
3830 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3831
3832 for (auto& r : handles) {
3833 ret = r.init_with_context(cct);
3834 if (ret < 0) {
3835 return ret;
3836 }
7c673cae
FG
3837 ret = r.connect();
3838 if (ret < 0) {
3839 return ret;
3840 }
3841 }
3842
3843 sync_modules_manager = new RGWSyncModulesManager();
3844
3845 rgw_register_sync_modules(sync_modules_manager);
3846
3847 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3848 new RGWCoroutinesManagerRegistry(cct)};
3849 ret = crs->hook_to_admin_command("cr dump");
3850 if (ret < 0) {
3851 return ret;
3852 }
3853
3854 meta_mgr = new RGWMetadataManager(cct, this);
3855 data_log = new RGWDataChangesLog(cct, this);
3856 cr_registry = crs.release();
3857
3858 std::swap(handles, rados);
3859 return ret;
3860}
3861
224ce89b
WB
3862
3863int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3864{
3865 map<string,string> metadata = meta;
3866 metadata["num_handles"] = stringify(rados.size());
3867 metadata["zonegroup_id"] = zonegroup.get_id();
3868 metadata["zonegroup_name"] = zonegroup.get_name();
3869 metadata["zone_name"] = zone_name();
3870 metadata["zone_id"] = zone_id();;
3871 string name = cct->_conf->name.get_id();
3872 if (name.find("rgw.") == 0) {
3873 name = name.substr(4);
3874 }
3875 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3876 if (ret < 0) {
3877 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3878 return ret;
3879 }
3880
3881 return 0;
3882}
3883
7c673cae
FG
3884/**
3885 * Add new connection to connections map
3886 * @param zonegroup_conn_map map which new connection will be added to
3887 * @param zonegroup zonegroup which new connection will connect to
3888 * @param new_connection pointer to new connection instance
3889 */
3890static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3891 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3892{
3893 // Delete if connection is already exists
3894 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3895 if (iterZoneGroup != zonegroup_conn_map.end()) {
3896 delete iterZoneGroup->second;
3897 }
3898
3899 // Add new connection to connections map
3900 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3901}
3902
3903int RGWRados::convert_regionmap()
3904{
3905 RGWZoneGroupMap zonegroupmap;
3906
3907 string pool_name = cct->_conf->rgw_zone_root_pool;
3908 if (pool_name.empty()) {
3909 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3910 }
3911 string oid = region_map_oid;
3912
3913 rgw_pool pool(pool_name);
3914 bufferlist bl;
3915 RGWObjectCtx obj_ctx(this);
3916 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3917 if (ret < 0 && ret != -ENOENT) {
3918 return ret;
3919 } else if (ret == -ENOENT) {
3920 return 0;
3921 }
3922
3923 try {
3924 bufferlist::iterator iter = bl.begin();
3925 ::decode(zonegroupmap, iter);
3926 } catch (buffer::error& err) {
3927 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3928 return -EIO;
3929 }
3930
3931 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3932 iter != zonegroupmap.zonegroups.end(); ++iter) {
3933 RGWZoneGroup& zonegroup = iter->second;
3934 ret = zonegroup.init(cct, this, false);
3935 ret = zonegroup.update();
3936 if (ret < 0 && ret != -ENOENT) {
3937 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3938 cpp_strerror(-ret) << dendl;
3939 return ret;
3940 } else if (ret == -ENOENT) {
3941 ret = zonegroup.create();
3942 if (ret < 0) {
3943 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3944 cpp_strerror(-ret) << dendl;
3945 return ret;
3946 }
3947 }
3948 }
3949
3950 current_period.set_user_quota(zonegroupmap.user_quota);
3951 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3952
3953 // remove the region_map so we don't try to convert again
3954 rgw_raw_obj obj(pool, oid);
3955 ret = delete_system_obj(obj);
3956 if (ret < 0) {
3957 ldout(cct, 0) << "Error could not remove " << obj
3958 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3959 return ret;
3960 }
3961
3962 return 0;
3963}
3964
3965/**
3966 * Replace all region configuration with zonegroup for
3967 * backward compatability
3968 * Returns 0 on success, -ERR# on failure.
3969 */
3970int RGWRados::replace_region_with_zonegroup()
3971{
3972 /* copy default region */
3973 /* convert default region to default zonegroup */
3974 string default_oid = cct->_conf->rgw_default_region_info_oid;
3975 if (default_oid.empty()) {
3976 default_oid = default_region_info_oid;
3977 }
3978
3979
3980 RGWZoneGroup default_zonegroup;
3981 rgw_pool pool{default_zonegroup.get_pool(cct)};
3982 string oid = "converted";
3983 bufferlist bl;
3984 RGWObjectCtx obj_ctx(this);
3985
3986 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3987 if (ret < 0 && ret != -ENOENT) {
3988 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3989 << dendl;
3990 return ret;
3991 } else if (ret != -ENOENT) {
3992 ldout(cct, 20) << "System already converted " << dendl;
3993 return 0;
3994 }
3995
3996 string default_region;
3997 ret = default_zonegroup.init(cct, this, false, true);
3998 if (ret < 0) {
3999 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4000 return ret;
4001 }
4002 ret = default_zonegroup.read_default_id(default_region, true);
4003 if (ret < 0 && ret != -ENOENT) {
4004 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4005 return ret;
4006 }
4007
4008 /* convert regions to zonegroups */
4009 list<string> regions;
4010 ret = list_regions(regions);
4011 if (ret < 0 && ret != -ENOENT) {
4012 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4013 return ret;
4014 } else if (ret == -ENOENT || regions.empty()) {
4015 RGWZoneParams zoneparams(default_zone_name);
4016 int ret = zoneparams.init(cct, this);
4017 if (ret < 0 && ret != -ENOENT) {
4018 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
4019 return ret;
4020 }
4021 /* update master zone */
4022 RGWZoneGroup default_zg(default_zonegroup_name);
4023 ret = default_zg.init(cct, this);
4024 if (ret < 0 && ret != -ENOENT) {
4025 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
4026 return ret;
4027 }
4028 if (ret != -ENOENT && default_zg.master_zone.empty()) {
4029 default_zg.master_zone = zoneparams.get_id();
4030 return default_zg.update();
4031 }
4032 return 0;
4033 }
4034
4035 string master_region, master_zone;
4036 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
4037 if (*iter != default_zonegroup_name){
4038 RGWZoneGroup region(*iter);
4039 int ret = region.init(cct, this, true, true);
4040 if (ret < 0) {
4041 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
4042 return ret;
4043 }
31f18b77 4044 if (region.is_master_zonegroup()) {
7c673cae
FG
4045 master_region = region.get_id();
4046 master_zone = region.master_zone;
4047 }
4048 }
4049 }
4050
4051 /* create realm if there is none.
4052 The realm name will be the region and zone concatenated
4053 realm id will be mds of its name */
4054 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
4055 string new_realm_name = master_region + "." + master_zone;
4056 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
4057 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
4058 MD5 hash;
4059 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
4060 hash.Final(md5);
4061 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
4062 string new_realm_id(md5_str);
4063 RGWRealm new_realm(new_realm_id,new_realm_name);
4064 ret = new_realm.init(cct, this, false);
4065 if (ret < 0) {
4066 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4067 return ret;
4068 }
4069 ret = new_realm.create();
4070 if (ret < 0 && ret != -EEXIST) {
4071 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4072 return ret;
4073 }
4074 ret = new_realm.set_as_default();
4075 if (ret < 0) {
4076 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4077 return ret;
4078 }
4079 ret = realm.init(cct, this);
4080 if (ret < 0) {
4081 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4082 return ret;
4083 }
4084 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4085 if (ret < 0) {
4086 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4087 return ret;
4088 }
4089 }
4090
4091 list<string>::iterator iter;
4092 /* create zonegroups */
4093 for (iter = regions.begin(); iter != regions.end(); ++iter)
4094 {
4095 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4096 /* check to see if we don't have already a zonegroup with this name */
4097 RGWZoneGroup new_zonegroup(*iter);
4098 ret = new_zonegroup.init(cct , this);
4099 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4100 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4101 " skipping conversion " << dendl;
4102 continue;
4103 }
4104 RGWZoneGroup zonegroup(*iter);
4105 zonegroup.set_id(*iter);
4106 int ret = zonegroup.init(cct, this, true, true);
4107 if (ret < 0) {
4108 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4109 return ret;
4110 }
4111 zonegroup.realm_id = realm.get_id();
4112 /* fix default region master zone */
4113 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4114 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4115 zonegroup.master_zone = default_zone_name;
4116 }
4117 ret = zonegroup.update();
4118 if (ret < 0 && ret != -EEXIST) {
4119 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4120 << dendl;
4121 return ret;
4122 }
4123 ret = zonegroup.update_name();
4124 if (ret < 0 && ret != -EEXIST) {
4125 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4126 << dendl;
4127 return ret;
4128 }
4129 if (zonegroup.get_name() == default_region) {
4130 ret = zonegroup.set_as_default();
4131 if (ret < 0) {
4132 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4133 << dendl;
4134 return ret;
4135 }
4136 }
4137 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4138 ++iter) {
4139 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4140 RGWZoneParams zoneparams(iter->first, iter->first);
4141 zoneparams.set_id(iter->first);
4142 zoneparams.realm_id = realm.get_id();
4143 ret = zoneparams.init(cct, this);
4144 if (ret < 0 && ret != -ENOENT) {
4145 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4146 return ret;
4147 } else if (ret == -ENOENT) {
4148 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4149 continue;
4150 }
4151 zonegroup.realm_id = realm.get_id();
4152 ret = zoneparams.update();
4153 if (ret < 0 && ret != -EEXIST) {
4154 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4155 return ret;
4156 }
4157 ret = zoneparams.update_name();
4158 if (ret < 0 && ret != -EEXIST) {
4159 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4160 return ret;
4161 }
4162 }
4163
4164 if (!current_period.get_id().empty()) {
4165 ret = current_period.add_zonegroup(zonegroup);
4166 if (ret < 0) {
4167 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4168 return ret;
4169 }
4170 }
4171 }
4172
4173 if (!current_period.get_id().empty()) {
4174 ret = current_period.update();
4175 if (ret < 0) {
4176 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4177 return ret;
4178 }
4179 ret = current_period.store_info(false);
4180 if (ret < 0) {
4181 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4182 return ret;
4183 }
4184 ret = current_period.reflect();
4185 if (ret < 0) {
4186 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4187 return ret;
4188 }
4189 }
4190
4191 for (auto const& iter : regions) {
4192 RGWZoneGroup zonegroup(iter);
4193 int ret = zonegroup.init(cct, this, true, true);
4194 if (ret < 0) {
4195 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4196 return ret;
4197 }
4198 ret = zonegroup.delete_obj(true);
4199 if (ret < 0 && ret != -ENOENT) {
4200 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4201 << dendl;
4202 return ret;
4203 }
4204 }
4205
4206 /* mark as converted */
4207 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4208 true, NULL, real_time(), NULL);
4209 if (ret < 0 ) {
4210 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4211 << dendl;
4212 return ret;
4213 }
4214
4215 return 0;
4216}
4217
4218int RGWRados::init_zg_from_period(bool *initialized)
4219{
4220 *initialized = false;
4221
4222 if (current_period.get_id().empty()) {
4223 return 0;
4224 }
4225
4226 int ret = zonegroup.init(cct, this);
4227 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4228 if (ret == -ENOENT) {
4229 return 0;
4230 }
4231 if (ret < 0) {
4232 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4233 return ret;
4234 }
4235 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4236
4237 map<string, RGWZoneGroup>::const_iterator iter =
4238 current_period.get_map().zonegroups.find(zonegroup.get_id());
4239
4240 if (iter != current_period.get_map().zonegroups.end()) {
4241 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4242 zonegroup = iter->second;
4243 ret = zonegroup.init(cct, this, false);
4244 if (ret < 0) {
4245 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4246 return ret;
4247 }
4248 ret = zone_params.init(cct, this);
4249 if (ret < 0 && ret != -ENOENT) {
4250 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4251 return ret;
4252 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4253 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4254 zone_params.set_name(default_zone_name);
4255 ret = zone_params.init(cct, this);
4256 if (ret < 0 && ret != -ENOENT) {
4257 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4258 return ret;
4259 }
4260 }
4261 }
4262 for (iter = current_period.get_map().zonegroups.begin();
4263 iter != current_period.get_map().zonegroups.end(); ++iter){
4264 const RGWZoneGroup& zg = iter->second;
4265 // use endpoints from the zonegroup's master zone
4266 auto master = zg.zones.find(zg.master_zone);
4267 if (master == zg.zones.end()) {
f64942e4
AA
4268 // Check for empty zonegroup which can happen if zone was deleted before removal
4269 if (zg.zones.size() == 0)
4270 continue;
7c673cae
FG
4271 // fix missing master zone for a single zone zonegroup
4272 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4273 master = zg.zones.begin();
4274 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4275 master->second.name << " id:" << master->second.id << " as master" << dendl;
4276 if (zonegroup.get_id() == zg.get_id()) {
4277 zonegroup.master_zone = master->second.id;
4278 ret = zonegroup.update();
4279 if (ret < 0) {
4280 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4281 return ret;
4282 }
4283 } else {
4284 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4285 ret = fixed_zg.init(cct, this);
4286 if (ret < 0) {
4287 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4288 return ret;
4289 }
4290 fixed_zg.master_zone = master->second.id;
4291 ret = fixed_zg.update();
4292 if (ret < 0) {
4293 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4294 return ret;
4295 }
4296 }
4297 } else {
4298 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4299 zg.master_zone << dendl;
4300 return -EINVAL;
4301 }
4302 }
4303 const auto& endpoints = master->second.endpoints;
4304 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4305 if (!current_period.get_master_zonegroup().empty() &&
4306 zg.get_id() == current_period.get_master_zonegroup()) {
4307 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4308 }
4309 }
4310
4311 *initialized = true;
4312
4313 return 0;
4314}
4315
4316int RGWRados::init_zg_from_local(bool *creating_defaults)
4317{
4318 int ret = zonegroup.init(cct, this);
4319 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4320 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4321 return ret;
4322 } else if (ret == -ENOENT) {
4323 *creating_defaults = true;
4324 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4325 ret = zonegroup.create_default();
4326 if (ret < 0) {
4327 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4328 << dendl;
4329 return ret;
4330 }
4331 ret = zonegroup.init(cct, this);
4332 if (ret < 0) {
4333 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4334 << dendl;
4335 return ret;
4336 }
4337 }
4338 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
31f18b77 4339 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
4340 // use endpoints from the zonegroup's master zone
4341 auto master = zonegroup.zones.find(zonegroup.master_zone);
4342 if (master == zonegroup.zones.end()) {
4343 // fix missing master zone for a single zone zonegroup
4344 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4345 master = zonegroup.zones.begin();
4346 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4347 master->second.name << " id:" << master->second.id << " as master" << dendl;
4348 zonegroup.master_zone = master->second.id;
4349 ret = zonegroup.update();
4350 if (ret < 0) {
4351 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4352 return ret;
4353 }
4354 } else {
4355 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4356 "master_zone=" << zonegroup.master_zone << dendl;
4357 return -EINVAL;
4358 }
4359 }
4360 const auto& endpoints = master->second.endpoints;
4361 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4362 }
4363
4364 return 0;
4365}
4366
4367
4368bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4369{
4370 return target_zone.syncs_from(source_zone.name) &&
4371 sync_modules_manager->supports_data_export(source_zone.tier_type);
4372}
4373
4374/**
4375 * Initialize the RADOS instance and prepare to do other ops
4376 * Returns 0 on success, -ERR# on failure.
4377 */
4378int RGWRados::init_complete()
4379{
4380 int ret = realm.init(cct, this);
4381 if (ret < 0 && ret != -ENOENT) {
4382 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4383 return ret;
4384 } else if (ret != -ENOENT) {
4385 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4386 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4387 if (ret < 0 && ret != -ENOENT) {
4388 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4389 return ret;
4390 }
4391 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4392 }
4393
4394 ret = replace_region_with_zonegroup();
4395 if (ret < 0) {
4396 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4397 return ret;
4398 }
4399
4400 ret = convert_regionmap();
4401 if (ret < 0) {
4402 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4403 return ret;
4404 }
4405
4406 bool zg_initialized = false;
4407
4408 if (!current_period.get_id().empty()) {
4409 ret = init_zg_from_period(&zg_initialized);
4410 if (ret < 0) {
4411 return ret;
4412 }
4413 }
4414
4415 bool creating_defaults = false;
4416 bool using_local = (!zg_initialized);
4417 if (using_local) {
4418 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4419 ret = init_zg_from_local(&creating_defaults);
4420 if (ret < 0) {
4421 return ret;
4422 }
4423 // read period_config into current_period
4424 auto& period_config = current_period.get_config();
4425 ret = period_config.read(this, zonegroup.realm_id);
4426 if (ret < 0 && ret != -ENOENT) {
4427 ldout(cct, 0) << "ERROR: failed to read period config: "
4428 << cpp_strerror(ret) << dendl;
4429 return ret;
4430 }
4431 }
4432
4433 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4434 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4435 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4436 zone_params.set_name(default_zone_name);
4437 }
4438
4439 ret = zone_params.init(cct, this);
4440 if (ret < 0 && ret != -ENOENT) {
4441 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4442 return ret;
4443 }
4444 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4445 if (zone_iter == get_zonegroup().zones.end()) {
4446 if (using_local) {
4447 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4448 return -EINVAL;
4449 }
4450 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4451 ret = init_zg_from_local(&creating_defaults);
4452 if (ret < 0) {
4453 return ret;
4454 }
4455 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4456 }
4457 if (zone_iter != get_zonegroup().zones.end()) {
4458 zone_public_config = zone_iter->second;
4459 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4460 } else {
4461 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4462 return -EINVAL;
4463 }
4464
4465 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4466
31f18b77
FG
4467 if (run_sync_thread) {
4468 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4469 if (ret < 0) {
4470 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4471 return ret;
4472 }
7c673cae
FG
4473 }
4474
4475 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4476
4477 init_unique_trans_id_deps();
4478
4479 finisher = new Finisher(cct);
4480 finisher->start();
4481
4482 period_puller.reset(new RGWPeriodPuller(this));
4483 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4484 current_period));
4485
4486 if (need_watch_notify()) {
4487 ret = init_watch();
4488 if (ret < 0) {
4489 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4490 return ret;
4491 }
4492 }
4493
4494 /* first build all zones index */
4495 for (auto ziter : get_zonegroup().zones) {
4496 const string& id = ziter.first;
4497 RGWZone& z = ziter.second;
4498 zone_id_by_name[z.name] = id;
4499 zone_by_id[id] = z;
4500 }
31f18b77 4501
7c673cae
FG
4502 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4503 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4504 }
4505 zone_public_config = zone_by_id[zone_id()];
4506 for (auto ziter : get_zonegroup().zones) {
4507 const string& id = ziter.first;
4508 RGWZone& z = ziter.second;
4509 if (id == zone_id()) {
4510 continue;
4511 }
4512 if (z.endpoints.empty()) {
4513 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4514 continue;
4515 }
4516 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4517 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4518 zone_conn_map[id] = conn;
4519 if (zone_syncs_from(zone_public_config, z) ||
4520 zone_syncs_from(z, zone_public_config)) {
4521 if (zone_syncs_from(zone_public_config, z)) {
4522 zone_data_sync_from_map[id] = conn;
4523 }
4524 if (zone_syncs_from(z, zone_public_config)) {
4525 zone_data_notify_to_map[id] = conn;
4526 }
4527 } else {
4528 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4529 }
4530 }
4531
4532 ret = open_root_pool_ctx();
4533 if (ret < 0)
4534 return ret;
4535
4536 ret = open_gc_pool_ctx();
4537 if (ret < 0)
4538 return ret;
4539
4540 ret = open_lc_pool_ctx();
4541 if (ret < 0)
4542 return ret;
4543
4544 ret = open_objexp_pool_ctx();
4545 if (ret < 0)
4546 return ret;
4547
31f18b77
FG
4548 ret = open_reshard_pool_ctx();
4549 if (ret < 0)
4550 return ret;
4551
7c673cae
FG
4552 pools_initialized = true;
4553
4554 gc = new RGWGC();
4555 gc->initialize(cct, this);
4556
4557 obj_expirer = new RGWObjectExpirer(this);
4558
4559 if (use_gc_thread) {
4560 gc->start_processor();
4561 obj_expirer->start_processor();
4562 }
4563
7c673cae
FG
4564 /* no point of running sync thread if we don't have a master zone configured
4565 or there is no rest_master_conn */
4566 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4567 || current_period.get_id().empty()) {
4568 run_sync_thread = false;
4569 }
4570
b32b8144
FG
4571 if (run_sync_thread) {
4572 // initialize the log period history
4573 meta_mgr->init_oldest_log_period();
4574 }
4575
7c673cae
FG
4576 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4577 async_rados->start();
4578
4579 ret = meta_mgr->init(current_period.get_id());
4580 if (ret < 0) {
4581 lderr(cct) << "ERROR: failed to initialize metadata log: "
4582 << cpp_strerror(-ret) << dendl;
4583 return ret;
4584 }
4585
4586 if (is_meta_master()) {
4587 auto md_log = meta_mgr->get_log(current_period.get_id());
4588 meta_notifier = new RGWMetaNotifier(this, md_log);
4589 meta_notifier->start();
4590 }
4591
4592 if (run_sync_thread) {
4593 Mutex::Locker l(meta_sync_thread_lock);
4594 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4595 ret = meta_sync_processor_thread->init();
4596 if (ret < 0) {
4597 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4598 return ret;
4599 }
4600 meta_sync_processor_thread->start();
4601
b32b8144
FG
4602 // configure the bucket trim manager
4603 rgw::BucketTrimConfig config;
4604 rgw::configure_bucket_trim(cct, config);
4605
4606 bucket_trim.emplace(this, config);
4607 ret = bucket_trim->init();
4608 if (ret < 0) {
4609 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
4610 return ret;
4611 }
91327a77 4612 data_log->set_observer(&*bucket_trim);
b32b8144 4613
7c673cae
FG
4614 Mutex::Locker dl(data_sync_thread_lock);
4615 for (auto iter : zone_data_sync_from_map) {
4616 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
91327a77 4617 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
7c673cae
FG
4618 ret = thread->init();
4619 if (ret < 0) {
4620 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4621 return ret;
4622 }
4623 thread->start();
4624 data_sync_processor_threads[iter.first] = thread;
4625 }
4626 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4627 if (interval > 0) {
b32b8144 4628 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
7c673cae
FG
4629 ret = sync_log_trimmer->init();
4630 if (ret < 0) {
4631 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4632 return ret;
4633 }
4634 sync_log_trimmer->start();
4635 }
4636 }
4637 data_notifier = new RGWDataNotifier(this);
4638 data_notifier->start();
4639
4640 lc = new RGWLC();
4641 lc->initialize(cct, this);
31f18b77 4642
7c673cae
FG
4643 if (use_lc_thread)
4644 lc->start_processor();
31f18b77 4645
7c673cae
FG
4646 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4647
4648 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4649 get_zone().bucket_index_max_shards);
31f18b77
FG
4650 if (bucket_index_max_shards > get_max_bucket_shards()) {
4651 bucket_index_max_shards = get_max_bucket_shards();
7c673cae 4652 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 4653 << get_max_bucket_shards() << dendl;
7c673cae
FG
4654 }
4655 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4656
4657 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4658 binfo_cache->init(this);
4659
4660 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4661
4662 if (need_tombstone_cache) {
4663 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4664 }
4665
31f18b77
FG
4666 reshard_wait = std::make_shared<RGWReshardWait>(this);
4667
4668 reshard = new RGWReshard(this);
4669
4670 /* only the master zone in the zonegroup reshards buckets */
4671 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4672 if (run_reshard_thread) {
4673 reshard->start_processor();
4674 }
4675
4676 index_completion_manager = new RGWIndexCompletionManager(this);
4677 ret = index_completion_manager->start();
4678
7c673cae
FG
4679 return ret;
4680}
4681
4682/**
4683 * Initialize the RADOS instance and prepare to do other ops
4684 * Returns 0 on success, -ERR# on failure.
4685 */
4686int RGWRados::initialize()
4687{
4688 int ret;
4689
4690 ret = init_rados();
4691 if (ret < 0)
4692 return ret;
4693
4694 return init_complete();
4695}
4696
4697void RGWRados::finalize_watch()
4698{
4699 for (int i = 0; i < num_watchers; i++) {
4700 RGWWatcher *watcher = watchers[i];
4701 watcher->unregister_watch();
4702 delete watcher;
4703 }
4704
4705 delete[] notify_oids;
4706 delete[] watchers;
4707}
4708
4709void RGWRados::schedule_context(Context *c) {
4710 finisher->queue(c);
4711}
4712
4713int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4714{
4715 bool is_truncated;
4716 RGWListRawObjsCtx ctx;
4717 do {
4718 list<string> oids;
4719 int r = list_raw_objects(pool, prefix, 1000,
4720 ctx, oids, &is_truncated);
4721 if (r < 0) {
4722 return r;
4723 }
4724 list<string>::iterator iter;
4725 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4726 string& val = *iter;
4727 if (val.size() > prefix.size())
4728 result.push_back(val.substr(prefix.size()));
4729 }
4730 } while (is_truncated);
4731
4732 return 0;
4733}
4734
4735int RGWRados::list_regions(list<string>& regions)
4736{
4737 RGWZoneGroup zonegroup;
4738
4739 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4740}
4741
4742int RGWRados::list_zonegroups(list<string>& zonegroups)
4743{
4744 RGWZoneGroup zonegroup;
4745
4746 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4747}
4748
4749int RGWRados::list_zones(list<string>& zones)
4750{
4751 RGWZoneParams zoneparams;
4752
4753 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4754}
4755
4756int RGWRados::list_realms(list<string>& realms)
4757{
4758 RGWRealm realm(cct, this);
4759 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4760}
4761
4762int RGWRados::list_periods(list<string>& periods)
4763{
4764 RGWPeriod period;
4765 list<string> raw_periods;
4766 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4767 if (ret < 0) {
4768 return ret;
4769 }
4770 for (const auto& oid : raw_periods) {
4771 size_t pos = oid.find(".");
4772 if (pos != std::string::npos) {
4773 periods.push_back(oid.substr(0, pos));
4774 } else {
4775 periods.push_back(oid);
4776 }
4777 }
4778 periods.sort(); // unique() only detects duplicates if they're adjacent
4779 periods.unique();
4780 return 0;
4781}
4782
4783
4784int RGWRados::list_periods(const string& current_period, list<string>& periods)
4785{
4786 int ret = 0;
4787 string period_id = current_period;
4788 while(!period_id.empty()) {
4789 RGWPeriod period(period_id);
4790 ret = period.init(cct, this);
4791 if (ret < 0) {
4792 return ret;
4793 }
4794 periods.push_back(period.get_id());
4795 period_id = period.get_predecessor();
4796 }
4797
4798 return ret;
4799}
4800
4801/**
4802 * Open the pool used as root for this gateway
4803 * Returns: 0 on success, -ERR# otherwise.
4804 */
4805int RGWRados::open_root_pool_ctx()
4806{
4807 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4808}
4809
4810int RGWRados::open_gc_pool_ctx()
4811{
4812 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4813}
4814
4815int RGWRados::open_lc_pool_ctx()
4816{
4817 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4818}
4819
4820int RGWRados::open_objexp_pool_ctx()
4821{
4822 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4823}
4824
31f18b77
FG
4825int RGWRados::open_reshard_pool_ctx()
4826{
4827 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4828}
4829
7c673cae
FG
4830int RGWRados::init_watch()
4831{
4832 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4833 if (r < 0) {
4834 return r;
4835 }
4836
4837 num_watchers = cct->_conf->rgw_num_control_oids;
4838
4839 bool compat_oid = (num_watchers == 0);
4840
4841 if (num_watchers <= 0)
4842 num_watchers = 1;
4843
4844 notify_oids = new string[num_watchers];
4845 watchers = new RGWWatcher *[num_watchers];
4846
4847 for (int i=0; i < num_watchers; i++) {
4848 string& notify_oid = notify_oids[i];
4849 notify_oid = notify_oid_prefix;
4850 if (!compat_oid) {
4851 char buf[16];
4852 snprintf(buf, sizeof(buf), ".%d", i);
4853 notify_oid.append(buf);
4854 }
4855 r = control_pool_ctx.create(notify_oid, false);
4856 if (r < 0 && r != -EEXIST)
4857 return r;
4858
4859 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4860 watchers[i] = watcher;
4861
4862 r = watcher->register_watch();
4863 if (r < 0)
4864 return r;
4865 }
4866
4867 watch_initialized = true;
4868
4869 set_cache_enabled(true);
4870
4871 return 0;
4872}
4873
4874void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4875{
4876 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4877
4878 int i = r % num_watchers;
4879 char buf[16];
4880 snprintf(buf, sizeof(buf), ".%d", i);
4881
4882 notify_oid = notify_oid_prefix;
4883 notify_oid.append(buf);
4884}
4885
28e407b8 4886int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
7c673cae 4887{
28e407b8
AA
4888 constexpr bool create = true; // create the pool if it doesn't exist
4889 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
4890}
4891
4892void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4893 string *marker) {
4894 if (marker) {
4895 *marker = shard_id_str;
4896 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4897 marker->append(shard_marker);
4898 }
4899}
4900
4901int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4902{
3a9019d9
FG
4903 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
4904
4905 if (!explicit_pool.empty()) {
4906 return open_pool_ctx(explicit_pool, index_ctx);
4907 }
4908
7c673cae
FG
4909 const string *rule = &bucket_info.placement_rule;
4910 if (rule->empty()) {
4911 rule = &zonegroup.default_placement;
4912 }
4913 auto iter = zone_params.placement_pools.find(*rule);
4914 if (iter == zone_params.placement_pools.end()) {
4915 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4916 return -EINVAL;
4917 }
4918
4919 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4920 if (r < 0)
4921 return r;
4922
4923 return 0;
4924}
4925
4926/**
4927 * set up a bucket listing.
4928 * handle is filled in.
4929 * Returns 0 on success, -ERR# otherwise.
4930 */
4931int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4932{
f64942e4
AA
4933 try {
4934 auto iter = root_pool_ctx.nobjects_begin();
4935 librados::NObjectIterator *state = new librados::NObjectIterator(iter);
4936 *handle = (RGWAccessHandle)state;
4937 return 0;
4938 } catch (const std::system_error& e) {
4939 int r = -e.code().value();
4940 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4941 << ", returning " << r << dendl;
4942 return r;
4943 } catch (const std::exception& e) {
4944 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4945 << ", returning -5" << dendl;
4946 return -EIO;
4947 }
7c673cae
FG
4948}
4949
4950/**
4951 * get the next bucket in the listing.
4952 * obj is filled in,
4953 * handle is updated.
4954 * returns 0 on success, -ERR# otherwise.
4955 */
4956int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4957{
4958 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4959
4960 do {
4961 if (*state == root_pool_ctx.nobjects_end()) {
4962 delete state;
4963 return -ENOENT;
4964 }
4965
4966 obj.key.name = (*state)->get_oid();
4967 if (obj.key.name[0] == '_') {
4968 obj.key.name = obj.key.name.substr(1);
4969 }
f64942e4
AA
4970 try {
4971 (*state)++;
4972 } catch (const std::system_error& e) {
4973 int r = -e.code().value();
4974 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4975 << ", returning " << r << dendl;
4976 return r;
4977 } catch (const std::exception& e) {
4978 ldout(cct, 10) << "nobjects_begin threw " << e.what()
4979 << ", returning -5" << dendl;
4980 return -EIO;
4981 }
7c673cae
FG
4982 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4983
4984 return 0;
4985}
4986
4987
4988/**** logs ****/
4989
4990struct log_list_state {
4991 string prefix;
4992 librados::IoCtx io_ctx;
4993 librados::NObjectIterator obit;
4994};
4995
4996int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4997{
4998 log_list_state *state = new log_list_state;
4999 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5000 if (r < 0) {
5001 delete state;
5002 return r;
5003 }
5004 state->prefix = prefix;
5005 state->obit = state->io_ctx.nobjects_begin();
5006 *handle = (RGWAccessHandle)state;
5007 return 0;
5008}
5009
5010int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
5011{
5012 log_list_state *state = static_cast<log_list_state *>(handle);
5013 while (true) {
5014 if (state->obit == state->io_ctx.nobjects_end()) {
5015 delete state;
5016 return -ENOENT;
5017 }
5018 if (state->prefix.length() &&
5019 state->obit->get_oid().find(state->prefix) != 0) {
5020 state->obit++;
5021 continue;
5022 }
5023 *name = state->obit->get_oid();
5024 state->obit++;
5025 break;
5026 }
5027 return 0;
5028}
5029
5030int RGWRados::log_remove(const string& name)
5031{
5032 librados::IoCtx io_ctx;
5033 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5034 if (r < 0)
5035 return r;
5036 return io_ctx.remove(name);
5037}
5038
5039struct log_show_state {
5040 librados::IoCtx io_ctx;
5041 bufferlist bl;
5042 bufferlist::iterator p;
5043 string name;
5044 uint64_t pos;
5045 bool eof;
5046 log_show_state() : pos(0), eof(false) {}
5047};
5048
5049int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
5050{
5051 log_show_state *state = new log_show_state;
5052 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5053 if (r < 0) {
5054 delete state;
5055 return r;
5056 }
5057 state->name = name;
5058 *handle = (RGWAccessHandle)state;
5059 return 0;
5060}
5061
5062int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
5063{
5064 log_show_state *state = static_cast<log_show_state *>(handle);
5065 off_t off = state->p.get_off();
5066
5067 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
5068 << " off " << off
5069 << " eof " << (int)state->eof
5070 << dendl;
5071 // read some?
5072 unsigned chunk = 1024*1024;
5073 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
5074 bufferlist more;
5075 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
5076 if (r < 0)
5077 return r;
5078 state->pos += r;
5079 bufferlist old;
5080 try {
5081 old.substr_of(state->bl, off, state->bl.length() - off);
5082 } catch (buffer::error& err) {
5083 return -EINVAL;
5084 }
5085 state->bl.clear();
5086 state->bl.claim(old);
5087 state->bl.claim_append(more);
5088 state->p = state->bl.begin();
5089 if ((unsigned)r < chunk)
5090 state->eof = true;
5091 ldout(cct, 10) << " read " << r << dendl;
5092 }
5093
5094 if (state->p.end())
5095 return 0; // end of file
5096 try {
5097 ::decode(*entry, state->p);
5098 }
5099 catch (const buffer::error &e) {
5100 return -EINVAL;
5101 }
5102 return 1;
5103}
5104
5105/**
5106 * usage_log_hash: get usage log key hash, based on name and index
5107 *
5108 * Get the usage object name. Since a user may have more than 1
5109 * object holding that info (multiple shards), we use index to
5110 * specify that shard number. Once index exceeds max shards it
5111 * wraps.
5112 * If name is not being set, results for all users will be returned
5113 * and index will wrap only after total shards number.
5114 *
5115 * @param cct [in] ceph context
5116 * @param name [in] user name
5117 * @param hash [out] hash value
5118 * @param index [in] shard index number
5119 */
5120static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5121{
5122 uint32_t val = index;
5123
5124 if (!name.empty()) {
c07f9fc5 5125 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
5126 val %= max_user_shards;
5127 val += ceph_str_hash_linux(name.c_str(), name.size());
5128 }
5129 char buf[17];
c07f9fc5 5130 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
5131 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5132 hash = buf;
5133}
5134
5135int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5136{
5137 uint32_t index = 0;
5138
5139 map<string, rgw_usage_log_info> log_objs;
5140
5141 string hash;
5142 string last_user;
5143
5144 /* restructure usage map, zone by object hash */
5145 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5146 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5147 const rgw_user_bucket& ub = iter->first;
5148 RGWUsageBatch& info = iter->second;
5149
5150 if (ub.user.empty()) {
5151 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5152 continue;
5153 }
5154
5155 if (ub.user != last_user) {
5156 /* index *should* be random, but why waste extra cycles
5157 in most cases max user shards is not going to exceed 1,
5158 so just incrementing it */
5159 usage_log_hash(cct, ub.user, hash, index++);
5160 }
5161 last_user = ub.user;
5162 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5163
5164 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5165 v.push_back(miter->second);
5166 }
5167 }
5168
5169 map<string, rgw_usage_log_info>::iterator liter;
5170
5171 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5172 int r = cls_obj_usage_log_add(liter->first, liter->second);
5173 if (r < 0)
5174 return r;
5175 }
5176 return 0;
5177}
5178
5179int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5180 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5181{
5182 uint32_t num = max_entries;
5183 string hash, first_hash;
5184 string user_str = user.to_str();
5185 usage_log_hash(cct, user_str, first_hash, 0);
5186
5187 if (usage_iter.index) {
5188 usage_log_hash(cct, user_str, hash, usage_iter.index);
5189 } else {
5190 hash = first_hash;
5191 }
5192
5193 usage.clear();
5194
5195 do {
5196 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5197 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5198
5199 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5200 usage_iter.read_iter, ret_usage, is_truncated);
5201 if (ret == -ENOENT)
5202 goto next;
5203
5204 if (ret < 0)
5205 return ret;
5206
5207 num -= ret_usage.size();
5208
5209 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5210 usage[iter->first].aggregate(iter->second);
5211 }
5212
5213next:
5214 if (!*is_truncated) {
5215 usage_iter.read_iter.clear();
5216 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5217 }
5218 } while (num && !*is_truncated && hash != first_hash);
5219 return 0;
5220}
5221
5222int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5223{
5224 uint32_t index = 0;
5225 string hash, first_hash;
5226 string user_str = user.to_str();
5227 usage_log_hash(cct, user_str, first_hash, index);
5228
5229 hash = first_hash;
7c673cae
FG
5230 do {
5231 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
7c673cae 5232
b32b8144 5233 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
5234 return ret;
5235
7c673cae
FG
5236 usage_log_hash(cct, user_str, hash, ++index);
5237 } while (hash != first_hash);
5238
5239 return 0;
5240}
5241
7c673cae
FG
5242int RGWRados::key_to_shard_id(const string& key, int max_shards)
5243{
1adf2230 5244 return rgw_shard_id(key, max_shards);
7c673cae
FG
5245}
5246
5247void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5248{
5249 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5250 char buf[16];
5251 if (shard_id) {
5252 *shard_id = val % max_shards;
5253 }
5254 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5255 name = prefix + buf;
5256}
5257
5258void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5259{
5260 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5261 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5262 char buf[16];
5263 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5264 name = prefix + buf;
5265}
5266
5267void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5268{
5269 char buf[16];
5270 snprintf(buf, sizeof(buf), "%u", shard_id);
5271 name = prefix + buf;
5272
5273}
5274
5275void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5276{
5277 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5278}
5279
5280int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5281{
5282 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5283
5284}
5285
5286int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5287{
5288 librados::IoCtx io_ctx;
5289
5290 int r = time_log_add_init(io_ctx);
5291 if (r < 0) {
5292 return r;
5293 }
5294
5295 ObjectWriteOperation op;
5296 utime_t t(ut);
5297 cls_log_add(op, t, section, key, bl);
5298
5299 return io_ctx.operate(oid, &op);
5300}
5301
5302int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5303 librados::AioCompletion *completion, bool monotonic_inc)
5304{
5305 librados::IoCtx io_ctx;
5306
5307 int r = time_log_add_init(io_ctx);
5308 if (r < 0) {
5309 return r;
5310 }
5311
5312 ObjectWriteOperation op;
5313 cls_log_add(op, entries, monotonic_inc);
5314
5315 if (!completion) {
5316 r = io_ctx.operate(oid, &op);
5317 } else {
5318 r = io_ctx.aio_operate(oid, completion, &op);
5319 }
5320 return r;
5321}
5322
5323int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5324 int max_entries, list<cls_log_entry>& entries,
5325 const string& marker,
5326 string *out_marker,
5327 bool *truncated)
5328{
5329 librados::IoCtx io_ctx;
5330
5331 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5332 if (r < 0)
5333 return r;
5334 librados::ObjectReadOperation op;
5335
5336 utime_t st(start_time);
5337 utime_t et(end_time);
5338
5339 cls_log_list(op, st, et, marker, max_entries, entries,
5340 out_marker, truncated);
5341
5342 bufferlist obl;
5343
5344 int ret = io_ctx.operate(oid, &op, &obl);
5345 if (ret < 0)
5346 return ret;
5347
5348 return 0;
5349}
5350
5351int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5352{
5353 librados::IoCtx io_ctx;
5354
5355 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5356 if (r < 0)
5357 return r;
5358 librados::ObjectReadOperation op;
5359
5360 cls_log_info(op, header);
5361
5362 bufferlist obl;
5363
5364 int ret = io_ctx.operate(oid, &op, &obl);
5365 if (ret < 0)
5366 return ret;
5367
5368 return 0;
5369}
5370
5371int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5372{
5373 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5374 if (r < 0)
5375 return r;
5376
5377 librados::ObjectReadOperation op;
5378
5379 cls_log_info(op, header);
5380
5381 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5382 if (ret < 0)
5383 return ret;
5384
5385 return 0;
5386}
5387
5388int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5389 const string& from_marker, const string& to_marker,
5390 librados::AioCompletion *completion)
5391{
5392 librados::IoCtx io_ctx;
5393
5394 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5395 if (r < 0)
5396 return r;
5397
5398 utime_t st(start_time);
5399 utime_t et(end_time);
5400
5401 ObjectWriteOperation op;
5402 cls_log_trim(op, st, et, from_marker, to_marker);
5403
5404 if (!completion) {
5405 r = io_ctx.operate(oid, &op);
5406 } else {
5407 r = io_ctx.aio_operate(oid, completion, &op);
5408 }
5409 return r;
5410}
5411
5412string RGWRados::objexp_hint_get_shardname(int shard_num)
5413{
5414 char buf[32];
5415 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5416
5417 string objname("obj_delete_at_hint.");
5418 return objname + buf;
5419}
5420
7c673cae
FG
5421int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5422{
5423 string obj_key = key.name + key.instance;
5424 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
1adf2230 5425 return rgw_bucket_shard_index(obj_key, num_shards);
7c673cae
FG
5426}
5427
5428static string objexp_hint_get_keyext(const string& tenant_name,
5429 const string& bucket_name,
5430 const string& bucket_id,
5431 const rgw_obj_key& obj_key)
5432{
5433 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5434 ":" + obj_key.name + ":" + obj_key.instance;
5435}
5436
5437int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5438 const string& tenant_name,
5439 const string& bucket_name,
5440 const string& bucket_id,
5441 const rgw_obj_index_key& obj_key)
5442{
5443 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5444 bucket_id, obj_key);
5445 objexp_hint_entry he = {
5446 .tenant = tenant_name,
5447 .bucket_name = bucket_name,
5448 .bucket_id = bucket_id,
5449 .obj_key = obj_key,
5450 .exp_time = delete_at };
5451 bufferlist hebl;
5452 ::encode(he, hebl);
5453 ObjectWriteOperation op;
5454 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5455
5456 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5457 return objexp_pool_ctx.operate(shard_name, &op);
5458}
5459
5460void RGWRados::objexp_get_shard(int shard_num,
5461 string& shard) /* out */
5462{
5463 shard = objexp_hint_get_shardname(shard_num);
5464}
5465
5466int RGWRados::objexp_hint_list(const string& oid,
5467 const ceph::real_time& start_time,
5468 const ceph::real_time& end_time,
5469 const int max_entries,
5470 const string& marker,
5471 list<cls_timeindex_entry>& entries, /* out */
5472 string *out_marker, /* out */
5473 bool *truncated) /* out */
5474{
5475 librados::ObjectReadOperation op;
5476 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5477 out_marker, truncated);
5478
5479 bufferlist obl;
5480 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5481
5482 if ((ret < 0 ) && (ret != -ENOENT)) {
5483 return ret;
5484 }
5485
5486 if ((ret == -ENOENT) && truncated) {
5487 *truncated = false;
5488 }
5489
5490 return 0;
5491}
5492
5493int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5494 objexp_hint_entry& hint_entry) /* out */
5495{
5496 try {
5497 bufferlist::iterator iter = ti_entry.value.begin();
5498 ::decode(hint_entry, iter);
5499 } catch (buffer::error& err) {
5500 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5501 }
5502
5503 return 0;
5504}
5505
5506int RGWRados::objexp_hint_trim(const string& oid,
5507 const ceph::real_time& start_time,
5508 const ceph::real_time& end_time,
5509 const string& from_marker,
5510 const string& to_marker)
5511{
5512 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5513 from_marker, to_marker);
5514 if ((ret < 0 ) && (ret != -ENOENT)) {
5515 return ret;
5516 }
5517
5518 return 0;
5519}
5520
5521int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5522 string& zone_id, string& owner_id) {
5523 librados::IoCtx io_ctx;
5524
5525 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5526 if (r < 0) {
5527 return r;
5528 }
5529 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5530 utime_t ut(msec / 1000, msec % 1000);
5531
5532 rados::cls::lock::Lock l(log_lock_name);
5533 l.set_duration(ut);
5534 l.set_cookie(owner_id);
5535 l.set_tag(zone_id);
f64942e4 5536 l.set_may_renew(true);
7c673cae
FG
5537
5538 return l.lock_exclusive(&io_ctx, oid);
5539}
5540
5541int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5542 librados::IoCtx io_ctx;
5543
5544 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5545 if (r < 0) {
5546 return r;
5547 }
5548
5549 rados::cls::lock::Lock l(log_lock_name);
5550 l.set_tag(zone_id);
5551 l.set_cookie(owner_id);
5552
5553 return l.unlock(&io_ctx, oid);
5554}
5555
5556int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5557{
5558 bufferlist::iterator i = bl.begin();
5559 RGWAccessControlPolicy policy(cct);
5560 try {
5561 policy.decode_owner(i);
5562 } catch (buffer::error& err) {
5563 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5564 return -EIO;
5565 }
5566 *owner = policy.get_owner();
5567 return 0;
5568}
5569
5570int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5571{
5572 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5573 if (aiter == attrset.end())
5574 return -EIO;
5575
5576 bufferlist& bl = aiter->second;
5577 bufferlist::iterator iter = bl.begin();
5578 try {
5579 policy->decode(iter);
5580 } catch (buffer::error& err) {
5581 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5582 return -EIO;
5583 }
5584 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5585 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5586 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5587 s3policy->to_xml(*_dout);
5588 *_dout << dendl;
5589 }
5590 return 0;
5591}
5592
5593
31f18b77
FG
5594int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5595{
5596 rgw_bucket bucket = bucket_info.bucket;
5597 bucket.update_bucket_id(new_bucket_id);
5598
5599 RGWObjectCtx obj_ctx(store);
5600
5601 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5602 if (ret < 0) {
5603 return ret;
5604 }
5605
5606 return 0;
5607}
5608
1adf2230
AA
5609
5610/**
5611 * Get ordered listing of the objects in a bucket.
7c673cae
FG
5612 *
5613 * max: maximum number of results to return
5614 * bucket: bucket to list contents of
5615 * prefix: only return results that match this prefix
5616 * delim: do not include results that match this string.
5617 * Any skipped results will have the matching portion of their name
5618 * inserted in common_prefixes with a "true" mark.
5619 * marker: if filled in, begin the listing with this object.
5620 * end_marker: if filled in, end the listing with this object.
5621 * result: the objects are put in here.
5622 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5623 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
 *
 * Note: bucket, prefix, delim, marker and end_marker are not function
 * arguments; they are read from this object's target and params members.
 * result is cleared before listing. params.marker and next_marker are
 * updated as entries are examined.
 *
 * Returns 0 on success, -EINVAL if encode_utf8() fails while building the
 * delimiter upper bound, or the negative result of
 * cls_bucket_list_ordered(). is_truncated may be null.
5624 */
1adf2230
AA
5625int RGWRados::Bucket::List::list_objects_ordered(int64_t max,
5626 vector<rgw_bucket_dir_entry> *result,
5627 map<string, bool> *common_prefixes,
5628 bool *is_truncated)
7c673cae
FG
5629{
5630 RGWRados *store = target->get_store();
5631 CephContext *cct = store->ctx();
5632 int shard_id = target->get_shard_id();
5633
5634 int count = 0;
5635 bool truncated = true;
// request at least rgw_list_bucket_min_readahead entries per index call,
// even when the caller asked for fewer
5636 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5637
5638 result->clear();
5639
// translate the caller's marker / end marker / prefix into index-key form
// within the namespace params.ns
5640 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
7c673cae
FG
5641 rgw_obj_index_key cur_marker;
5642 marker_obj.get_index_key(&cur_marker);
5643
3efd9988
FG
5644 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5645 params.ns);
5646 rgw_obj_index_key cur_end_marker;
5647 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
5648 const bool cur_end_marker_valid = !params.end_marker.empty();
5649
5650 rgw_obj_key prefix_obj(params.prefix);
5651 prefix_obj.ns = params.ns;
5652 string cur_prefix = prefix_obj.get_index_key_name();
5653
5654 string bigger_than_delim;
5655
5656 if (!params.delim.empty()) {
// bigger_than_delim is the value decoded from delim, plus one, re-encoded
// as UTF-8; appended to a common prefix it is used below as the marker to
// skip past that prefix
1adf2230
AA
5657 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(),
5658 params.delim.size());
7c673cae
FG
5659 char buf[params.delim.size() + 16];
5660 int r = encode_utf8(val + 1, (unsigned char *)buf);
5661 if (r < 0) {
5662 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5663 return -EINVAL;
5664 }
5665 buf[r] = '\0';
5666
5667 bigger_than_delim = buf;
5668
5669 /* if marker points at a common prefix, fast forward it into its upperbound string */
224ce89b 5670 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
5671 if (delim_pos >= 0) {
5672 string s = cur_marker.name.substr(0, delim_pos);
5673 s.append(bigger_than_delim);
5674 cur_marker = s;
5675 }
5676 }
1adf2230 5677
7c673cae
FG
5678 string skip_after_delim;
// keep querying the index while it reports more entries and no more than
// max results have been collected
5679 while (truncated && count <= max) {
// jump past the last common prefix that was recorded, if that is further
// along than the current marker
5680 if (skip_after_delim > cur_marker.name) {
5681 cur_marker = skip_after_delim;
5682 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5683 }
5684 std::map<string, rgw_bucket_dir_entry> ent_map;
1adf2230
AA
5685 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
5686 shard_id,
5687 cur_marker,
5688 cur_prefix,
5689 read_ahead + 1 - count,
5690 params.list_versions,
5691 ent_map,
5692 &truncated,
5693 &cur_marker);
7c673cae
FG
5694 if (r < 0)
5695 return r;
5696
1adf2230 5697 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
5698 rgw_bucket_dir_entry& entry = eiter->second;
5699 rgw_obj_index_key index_key = entry.key;
5700
5701 rgw_obj_key obj(index_key);
5702
1adf2230
AA
5703 /* note that parse_raw_oid() here will not set the correct
5704 * object's instance, as rgw_obj_index_key encodes that
5705 * separately. We don't need to set the instance because it's
5706 * not needed for the checks here and we end up using the raw
5707 * entry for the return vector
7c673cae
FG
5708 */
5709 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5710 if (!valid) {
5711 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5712 continue;
5713 }
5714 bool check_ns = (obj.ns == params.ns);
5715 if (!params.list_versions && !entry.is_visible()) {
5716 continue;
5717 }
5718
5719 if (params.enforce_ns && !check_ns) {
5720 if (!params.ns.empty()) {
5721 /* we've iterated past the namespace we're searching -- done now */
5722 truncated = false;
5723 goto done;
5724 }
5725
5726 /* we're not looking at the namespace this object is in, next! */
5727 continue;
5728 }
5729
// reaching the end marker ends the listing; the result is then reported
// as not truncated
5730 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5731 truncated = false;
5732 goto done;
5733 }
5734
// while still under max, remember the last key examined (updated before
// the filter/prefix/delimiter checks below)
5735 if (count < max) {
5736 params.marker = index_key;
5737 next_marker = index_key;
5738 }
5739
5740 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5741 continue;
5742
1adf2230
AA
5743 if (params.prefix.size() &&
5744 (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
7c673cae
FG
5745 continue;
5746
5747 if (!params.delim.empty()) {
5748 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5749
5750 if (delim_pos >= 0) {
// NOTE(review): prefix_key keeps delim_pos + 1 characters, i.e. one
// character of the delimiter; for a multi-character delim the rest of the
// delimiter is cut off -- confirm whether that is intended
5751 string prefix_key = obj.name.substr(0, delim_pos + 1);
5752
// each newly seen common prefix counts as one result
5753 if (common_prefixes &&
5754 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5755 if (count >= max) {
5756 truncated = true;
5757 goto done;
5758 }
5759 next_marker = prefix_key;
5760 (*common_prefixes)[prefix_key] = true;
5761
// NOTE(review): find() is stored in an int; if delim is not found in
// cur_marker.name this is -1 and substr(0, -1) yields the whole name --
// confirm that this case cannot occur or is intended
224ce89b
WB
5762 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5763
5764 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
7c673cae
FG
5765 skip_after_delim.append(bigger_than_delim);
5766
5767 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5768
5769 count++;
5770 }
5771
5772 continue;
5773 }
5774 }
5775
// one more matching entry exists beyond max: report truncation
5776 if (count >= max) {
5777 truncated = true;
5778 goto done;
5779 }
5780
5781 result->emplace_back(std::move(entry));
5782 count++;
5783 }
7c673cae
FG
5784 }
5785
5786done:
5787 if (is_truncated)
5788 *is_truncated = truncated;
5789
5790 return 0;
1adf2230
AA
5791} // list_objects_ordered
5792
5793
5794/**
5795 * Get listing of the objects in a bucket and allow the results to be out
5796 * of order.
5797 *
5798 * Even though there are key differences with the ordered counterpart,
5799 * the parameters are the same to maintain some compatability.
5800 *
5801 * max: maximum number of results to return
5802 * bucket: bucket to list contents of
5803 * prefix: only return results that match this prefix
5804 * delim: should not be set; if it is we should have indicated an error
5805 * marker: if filled in, begin the listing with this object.
5806 * end_marker: if filled in, end the listing with this object.
5807 * result: the objects are put in here.
5808 * common_prefixes: this is never filled with an unordered list; the param
5809 * is maintained for compatibility
5810 * is_truncated: if number of objects in the bucket is bigger than max, then
5811 * truncated.
5812 */
5813int RGWRados::Bucket::List::list_objects_unordered(int64_t max,
5814 vector<rgw_bucket_dir_entry> *result,
5815 map<string, bool> *common_prefixes,
5816 bool *is_truncated)
5817{
5818 RGWRados *store = target->get_store();
5819 CephContext *cct = store->ctx();
5820 int shard_id = target->get_shard_id();
5821
5822 int count = 0;
5823 bool truncated = true;
5824
5825 // read a few extra in each call to cls_bucket_list_unordered in
5826 // case some are filtered out due to namespace matching, versioning,
5827 // filtering, etc.
5828 const int64_t max_read_ahead = 100;
5829 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
5830
5831 result->clear();
5832
5833 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5834 rgw_obj_index_key cur_marker;
5835 marker_obj.get_index_key(&cur_marker);
5836
5837 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5838 params.ns);
5839 rgw_obj_index_key cur_end_marker;
5840 end_marker_obj.get_index_key(&cur_end_marker);
5841 const bool cur_end_marker_valid = !params.end_marker.empty();
5842
5843 rgw_obj_key prefix_obj(params.prefix);
5844 prefix_obj.ns = params.ns;
5845 string cur_prefix = prefix_obj.get_index_key_name();
5846
5847 while (truncated && count <= max) {
5848 std::vector<rgw_bucket_dir_entry> ent_list;
5849 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
5850 shard_id,
5851 cur_marker,
5852 cur_prefix,
5853 read_ahead,
5854 params.list_versions,
5855 ent_list,
5856 &truncated,
5857 &cur_marker);
5858 if (r < 0)
5859 return r;
5860
5861 // NB: while regions of ent_list will be sorted, we have no
5862 // guarantee that all items will be sorted since they can cross
5863 // shard boundaries
5864
5865 for (auto& entry : ent_list) {
5866 rgw_obj_index_key index_key = entry.key;
5867 rgw_obj_key obj(index_key);
5868
5869 /* note that parse_raw_oid() here will not set the correct
5870 * object's instance, as rgw_obj_index_key encodes that
5871 * separately. We don't need to set the instance because it's
5872 * not needed for the checks here and we end up using the raw
5873 * entry for the return vector
5874 */
5875 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5876 if (!valid) {
5877 ldout(cct, 0) << "ERROR: could not parse object name: " <<
5878 obj.name << dendl;
5879 continue;
5880 }
5881
5882 if (!params.list_versions && !entry.is_visible()) {
5883 continue;
5884 }
5885
5886 if (params.enforce_ns && obj.ns != params.ns) {
5887 continue;
5888 }
5889
5890 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5891 // we're not guaranteed items will come in order, so we have
5892 // to loop through all
5893 continue;
5894 }
5895
5896 if (count < max) {
5897 params.marker = index_key;
5898 next_marker = index_key;
5899 }
5900
5901 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5902 continue;
5903
5904 if (params.prefix.size() &&
5905 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
5906 continue;
5907
5908 if (count >= max) {
5909 truncated = true;
5910 goto done;
5911 }
5912
5913 result->emplace_back(std::move(entry));
5914 count++;
5915 } // for (auto& entry : ent_list)
5916 } // while (truncated && count <= max)
5917
5918done:
5919 if (is_truncated)
5920 *is_truncated = truncated;
5921
5922 return 0;
5923} // list_objects_unordered
5924
7c673cae
FG
5925
5926/**
5927 * create a rados pool, associated meta info
5928 * returns 0 on success, -ERR# otherwise.
5929 */
5930int RGWRados::create_pool(const rgw_pool& pool)
5931{
c07f9fc5 5932 librados::IoCtx io_ctx;
28e407b8
AA
5933 constexpr bool create = true;
5934 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
5935}
5936
5937int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5938{
f64942e4 5939 librados::IoCtx index_ctx;
7c673cae 5940
f64942e4 5941 string dir_oid = dir_oid_prefix;
7c673cae 5942 int r = open_bucket_index_ctx(bucket_info, index_ctx);
31f18b77 5943 if (r < 0) {
7c673cae 5944 return r;
31f18b77 5945 }
7c673cae 5946
7c673cae
FG
5947 dir_oid.append(bucket_info.bucket.bucket_id);
5948
5949 map<int, string> bucket_objs;
5950 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5951
f64942e4
AA
5952 return CLSRGWIssueBucketIndexInit(index_ctx,
5953 bucket_objs,
5954 cct->_conf->rgw_bucket_index_max_aio)();
5955}
5956
5957int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5958{
5959 librados::IoCtx index_ctx;
5960
5961 std::string dir_oid = dir_oid_prefix;
5962 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5963 if (r < 0) {
5964 return r;
5965 }
5966
5967 dir_oid.append(bucket_info.bucket.bucket_id);
5968
5969 std::map<int, std::string> bucket_objs;
5970 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5971
5972 return CLSRGWIssueBucketIndexClean(index_ctx,
5973 bucket_objs,
5974 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
5975}
5976
5977void RGWRados::create_bucket_id(string *bucket_id)
5978{
5979 uint64_t iid = instance_id();
5980 uint64_t bid = next_bucket_id();
5981 char buf[get_zone_params().get_id().size() + 48];
5982 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5983 *bucket_id = buf;
5984}
5985
7c673cae
FG
// Create a bucket: select its placement, fill in `info`, initialise the
// bucket index and write the linked bucket info.
//
// pobjv: if non-null, used as the write version; otherwise a new one is
// generated.
// pmaster_bucket: if non-null, its marker and bucket_id are reused instead
// of generating a new bucket id.
// pmaster_num_shards: if non-null, overrides bucket_index_max_shards.
// creation_time: a zero time means "now".
// pquota_info: if non-null, copied into info.quota.
//
// If writing the bucket info reports -EEXIST, the existing bucket info is
// re-read into `info`; if it belongs to a different bucket instance, the
// instance entry and index objects created by this call are removed. The
// function then returns -EEXIST. If the existing bucket vanished in the
// meantime (-ENOENT on re-read), the whole sequence is retried, up to
// MAX_CREATE_RETRIES times.
//
// Returns the result of put_linked_bucket_info(), a negative error from one
// of the intermediate steps, or -ENOENT when the retries are exhausted.
5986int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5987 const string& zonegroup_id,
5988 const string& placement_rule,
5989 const string& swift_ver_location,
5990 const RGWQuotaInfo * pquota_info,
5991 map<std::string, bufferlist>& attrs,
5992 RGWBucketInfo& info,
5993 obj_version *pobjv,
5994 obj_version *pep_objv,
5995 real_time creation_time,
5996 rgw_bucket *pmaster_bucket,
5997 uint32_t *pmaster_num_shards,
5998 bool exclusive)
5999{
6000#define MAX_CREATE_RETRIES 20 /* need to bound retries */
6001 string selected_placement_rule_name;
6002 RGWZonePlacementInfo rule_info;
6003
6004 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
6005 int ret = 0;
6006 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
6007 &selected_placement_rule_name, &rule_info);
6008 if (ret < 0)
6009 return ret;
6010
6011 if (!pmaster_bucket) {
6012 create_bucket_id(&bucket.marker);
6013 bucket.bucket_id = bucket.marker;
6014 } else {
6015 bucket.marker = pmaster_bucket->marker;
6016 bucket.bucket_id = pmaster_bucket->bucket_id;
6017 }
6018
6019 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
6020
6021 if (pobjv) {
6022 objv_tracker.write_version = *pobjv;
6023 } else {
6024 objv_tracker.generate_new_write_ver(cct);
6025 }
6026
6027 info.bucket = bucket;
6028 info.owner = owner.user_id;
6029 info.zonegroup = zonegroup_id;
6030 info.placement_rule = selected_placement_rule_name;
6031 info.index_type = rule_info.index_type;
6032 info.swift_ver_location = swift_ver_location;
6033 info.swift_versioning = (!swift_ver_location.empty());
6034 if (pmaster_num_shards) {
6035 info.num_shards = *pmaster_num_shards;
6036 } else {
6037 info.num_shards = bucket_index_max_shards;
6038 }
6039 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
6040 info.requester_pays = false;
6041 if (real_clock::is_zero(creation_time)) {
6042 info.creation_time = ceph::real_clock::now();
6043 } else {
6044 info.creation_time = creation_time;
6045 }
6046 if (pquota_info) {
6047 info.quota = *pquota_info;
6048 }
6049
6050 int r = init_bucket_index(info, info.num_shards);
6051 if (r < 0) {
6052 return r;
6053 }
6054
6055 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
6056 if (ret == -EEXIST) {
// capture the index context and objects of the instance created above
// before `info` is overwritten by the re-read below
6057 librados::IoCtx index_ctx;
6058 map<int, string> bucket_objs;
// note: this `r` shadows the outer `r`; `ret` stays -EEXIST throughout
// this block
6059 int r = open_bucket_index(info, index_ctx, bucket_objs);
6060 if (r < 0)
6061 return r;
6062
6063 /* we need to reread the info and return it, caller will have a use for it */
6064 RGWObjVersionTracker instance_ver = info.objv_tracker;
6065 info.objv_tracker.clear();
6066 RGWObjectCtx obj_ctx(this);
6067 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
6068 if (r < 0) {
// the existing bucket disappeared between the failed write and the
// re-read: retry the whole creation
6069 if (r == -ENOENT) {
6070 continue;
6071 }
6072 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
6073 return r;
6074 }
6075
6076 /* only remove it if it's a different bucket instance */
6077 if (info.bucket.bucket_id != bucket.bucket_id) {
6078 /* remove bucket meta instance */
f64942e4
AA
6079 r = rgw_bucket_instance_remove_entry(this,
6080 bucket.get_key(),
6081 &instance_ver);
7c673cae
FG
6082 if (r < 0)
6083 return r;
6084
f64942e4
AA
6085 /* remove bucket index objects asynchronously by best effort */
6086 (void) CLSRGWIssueBucketIndexClean(index_ctx,
6087 bucket_objs,
6088 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
6089 }
6090 /* ret == -EEXIST here; it is returned to the caller below */
6091 }
6092 return ret;
6093 }
6094
6095 /* this is highly unlikely */
6096 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
6097 return -ENOENT;
6098}
6099
6100int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
6101 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6102
6103{
c07f9fc5 6104 /* first check that zonegroup exists within current period. */
7c673cae
FG
6105 RGWZoneGroup zonegroup;
6106 int ret = get_zonegroup(zonegroup_id, zonegroup);
6107 if (ret < 0) {
6108 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
6109 return ret;
6110 }
6111
7c673cae 6112 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
c07f9fc5
FG
6113 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
6114
6115 if (!request_rule.empty()) {
6116 titer = zonegroup.placement_targets.find(request_rule);
6117 if (titer == zonegroup.placement_targets.end()) {
6118 ldout(cct, 0) << "could not find requested placement id " << request_rule
6119 << " within zonegroup " << dendl;
6120 return -ERR_INVALID_LOCATION_CONSTRAINT;
6121 }
6122 } else if (!user_info.default_placement.empty()) {
6123 titer = zonegroup.placement_targets.find(user_info.default_placement);
6124 if (titer == zonegroup.placement_targets.end()) {
6125 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
6126 << " within zonegroup " << dendl;
6127 return -ERR_INVALID_LOCATION_CONSTRAINT;
6128 }
6129 } else {
6130 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
6131 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
6132 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
6133 } else {
6134 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
6135 if (titer == zonegroup.placement_targets.end()) {
6136 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
6137 << " within zonegroup " << dendl;
6138 return -ERR_INVALID_LOCATION_CONSTRAINT;
6139 }
6140 }
7c673cae
FG
6141 }
6142
6143 /* now check tag for the rule, whether user is permitted to use rule */
c07f9fc5 6144 const auto& target_rule = titer->second;
7c673cae 6145 if (!target_rule.user_permitted(user_info.placement_tags)) {
c07f9fc5 6146 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
7c673cae
FG
6147 return -EPERM;
6148 }
6149
6150 if (pselected_rule_name)
c07f9fc5 6151 *pselected_rule_name = titer->first;
7c673cae 6152
c07f9fc5 6153 return select_bucket_location_by_rule(titer->first, rule_info);
7c673cae
FG
6154}
6155
6156int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
6157{
6158 if (location_rule.empty()) {
6159 /* we can only reach here if we're trying to set a bucket location from a bucket
6160 * created on a different zone, using a legacy / default pool configuration
6161 */
6162 return select_legacy_bucket_placement(rule_info);
6163 }
6164
6165 /*
6166 * make sure that zone has this rule configured. We're
6167 * checking it for the local zone, because that's where this bucket object is going to
6168 * reside.
6169 */
6170 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
6171 if (piter == get_zone_params().placement_pools.end()) {
6172 /* couldn't find, means we cannot really place data for this bucket in this zone */
224ce89b 6173 if (get_zonegroup().equals(zonegroup.get_id())) {
7c673cae
FG
6174 /* that's a configuration error, zone should have that rule, as we're within the requested
6175 * zonegroup */
6176 return -EINVAL;
6177 } else {
6178 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
6179 return 0;
6180 }
6181 }
6182
6183 RGWZonePlacementInfo& placement_info = piter->second;
6184
6185 if (rule_info) {
6186 *rule_info = placement_info;
6187 }
6188
6189 return 0;
6190}
6191
6192int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
6193 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6194{
6195 if (!get_zone_params().placement_pools.empty()) {
6196 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
6197 pselected_rule_name, rule_info);
6198 }
6199
6200 if (pselected_rule_name) {
6201 pselected_rule_name->clear();
6202 }
6203
6204 return select_legacy_bucket_placement(rule_info);
6205}
6206
6207int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
6208{
6209 bufferlist map_bl;
6210 map<string, bufferlist> m;
6211 string pool_name;
6212 bool write_map = false;
6213
6214 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6215
6216 RGWObjectCtx obj_ctx(this);
6217 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6218 if (ret < 0) {
6219 goto read_omap;
6220 }
6221
6222 try {
6223 bufferlist::iterator iter = map_bl.begin();
6224 ::decode(m, iter);
6225 } catch (buffer::error& err) {
6226 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6227 }
6228
6229read_omap:
6230 if (m.empty()) {
6231 bufferlist header;
6232 ret = omap_get_all(obj, header, m);
6233
6234 write_map = true;
6235 }
6236
6237 if (ret < 0 || m.empty()) {
6238 vector<rgw_pool> pools;
6239 string s = string("default.") + default_storage_pool_suffix;
6240 pools.push_back(rgw_pool(s));
6241 vector<int> retcodes;
6242 bufferlist bl;
6243 ret = create_pools(pools, retcodes);
6244 if (ret < 0)
6245 return ret;
6246 ret = omap_set(obj, s, bl);
6247 if (ret < 0)
6248 return ret;
6249 m[s] = bl;
6250 }
6251
6252 if (write_map) {
6253 bufferlist new_bl;
6254 ::encode(m, new_bl);
6255 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6256 if (ret < 0) {
6257 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6258 }
6259 }
6260
6261 map<string, bufferlist>::iterator miter;
6262 if (m.size() > 1) {
6263 vector<string> v;
6264 for (miter = m.begin(); miter != m.end(); ++miter) {
6265 v.push_back(miter->first);
6266 }
6267
6268 uint32_t r;
6269 ret = get_random_bytes((char *)&r, sizeof(r));
6270 if (ret < 0)
6271 return ret;
6272
6273 int i = r % v.size();
6274 pool_name = v[i];
6275 } else {
6276 miter = m.begin();
6277 pool_name = miter->first;
6278 }
6279
6280 rule_info->data_pool = pool_name;
6281 rule_info->data_extra_pool = pool_name;
6282 rule_info->index_pool = pool_name;
6283 rule_info->index_type = RGWBIType_Normal;
6284
6285 return 0;
6286}
6287
6288bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6289{
6290 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6291}
6292
6293bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6294{
6295 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6296
6297 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6298}
6299
6300int RGWRados::update_placement_map()
6301{
6302 bufferlist header;
6303 map<string, bufferlist> m;
6304 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6305 int ret = omap_get_all(obj, header, m);
6306 if (ret < 0)
6307 return ret;
6308
6309 bufferlist new_bl;
6310 ::encode(m, new_bl);
6311 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6312 if (ret < 0) {
6313 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6314 }
6315
6316 return ret;
6317}
6318
6319int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6320{
6321 librados::Rados *rad = get_rados_handle();
6322 int ret = rad->pool_lookup(new_pool.name.c_str());
6323 if (ret < 0) // DNE, or something
6324 return ret;
6325
6326 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6327 bufferlist empty_bl;
6328 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6329
6330 // don't care about return value
6331 update_placement_map();
6332
6333 return ret;
6334}
6335
6336int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6337{
6338 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6339 int ret = omap_del(obj, old_pool.to_str());
6340
6341 // don't care about return value
6342 update_placement_map();
6343
6344 return ret;
6345}
6346
6347int RGWRados::list_placement_set(set<rgw_pool>& names)
6348{
6349 bufferlist header;
6350 map<string, bufferlist> m;
6351
6352 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6353 int ret = omap_get_all(obj, header, m);
6354 if (ret < 0)
6355 return ret;
6356
6357 names.clear();
6358 map<string, bufferlist>::iterator miter;
6359 for (miter = m.begin(); miter != m.end(); ++miter) {
6360 names.insert(rgw_pool(miter->first));
6361 }
6362
6363 return names.size();
6364}
6365
6366int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6367{
6368 vector<librados::PoolAsyncCompletion *> completions;
6369 vector<int> rets;
6370
6371 librados::Rados *rad = get_rados_handle();
6372 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6373 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6374 completions.push_back(c);
6375 rgw_pool& pool = *iter;
6376 int ret = rad->pool_create_async(pool.name.c_str(), c);
6377 rets.push_back(ret);
6378 }
6379
6380 vector<int>::iterator riter;
6381 vector<librados::PoolAsyncCompletion *>::iterator citer;
6382
c07f9fc5 6383 bool error = false;
7c673cae
FG
6384 assert(rets.size() == completions.size());
6385 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6386 int r = *riter;
6387 PoolAsyncCompletion *c = *citer;
6388 if (r == 0) {
6389 c->wait();
6390 r = c->get_return_value();
6391 if (r < 0) {
6392 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
c07f9fc5 6393 error = true;
7c673cae
FG
6394 }
6395 }
6396 c->release();
6397 retcodes.push_back(r);
6398 }
c07f9fc5
FG
6399 if (error) {
6400 return 0;
6401 }
6402
6403 std::vector<librados::IoCtx> io_ctxs;
6404 retcodes.clear();
6405 for (auto pool : pools) {
6406 io_ctxs.emplace_back();
6407 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6408 if (ret < 0) {
6409 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6410 error = true;
6411 }
6412 retcodes.push_back(ret);
6413 }
6414 if (error) {
6415 return 0;
6416 }
6417
6418 completions.clear();
6419 for (auto &io_ctx : io_ctxs) {
6420 librados::PoolAsyncCompletion *c =
6421 librados::Rados::pool_async_create_completion();
6422 completions.push_back(c);
6423 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6424 false, c);
6425 assert(ret == 0);
6426 }
6427
6428 retcodes.clear();
6429 for (auto c : completions) {
6430 c->wait();
6431 int ret = c->get_return_value();
6432 if (ret == -EOPNOTSUPP) {
6433 ret = 0;
6434 } else if (ret < 0) {
6435 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6436 << dendl;
6437 error = true;
6438 }
6439 c->release();
6440 retcodes.push_back(ret);
6441 }
7c673cae
FG
6442 return 0;
6443}
6444
6445int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6446{
6447 string oid, key;
6448 get_obj_bucket_and_oid_loc(obj, oid, key);
6449
6450 rgw_pool pool;
6451 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6452 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6453 return -EIO;
6454 }
6455
6456 int r = open_pool_ctx(pool, *ioctx);
6457 if (r < 0) {
6458 return r;
6459 }
6460
6461 ioctx->locator_set_key(key);
6462
6463 return 0;
6464}
6465
6466int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6467{
6468 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6469
6470 rgw_pool pool;
6471 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6472 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6473 return -EIO;
6474 }
6475
6476 int r = open_pool_ctx(pool, ref->ioctx);
6477 if (r < 0) {
6478 return r;
6479 }
6480
6481 ref->ioctx.locator_set_key(ref->key);
6482
6483 return 0;
6484}
6485
224ce89b 6486int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae
FG
6487{
6488 ref->oid = obj.oid;
6489 ref->key = obj.loc;
6490
6491 int r;
6492
6493 if (ref->oid.empty()) {
6494 ref->oid = obj.pool.to_str();
6495 ref->pool = get_zone_params().domain_root;
6496 } else {
6497 ref->pool = obj.pool;
6498 }
7c673cae
FG
6499 r = open_pool_ctx(ref->pool, ref->ioctx);
6500 if (r < 0)
6501 return r;
6502
6503 ref->ioctx.locator_set_key(ref->key);
6504
6505 return 0;
6506}
6507
224ce89b 6508int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 6509{
224ce89b 6510 return get_raw_obj_ref(obj, ref);
7c673cae
FG
6511}
6512
6513/*
6514 * fixes an issue where head objects were supposed to have a locator created, but ended
6515 * up without one
6516 */
6517int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6518{
6519 const rgw_bucket& bucket = bucket_info.bucket;
6520 string oid;
6521 string locator;
6522
6523 rgw_obj obj(bucket, key);
6524
6525 get_obj_bucket_and_oid_loc(obj, oid, locator);
6526
6527 if (locator.empty()) {
6528 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6529 return 0;
6530 }
6531
6532 librados::IoCtx ioctx;
6533
6534 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6535 if (ret < 0) {
6536 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6537 return ret;
6538 }
6539 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6540
6541 uint64_t size;
6542 bufferlist data;
6543
6544 struct timespec mtime_ts;
6545 map<string, bufferlist> attrs;
6546 librados::ObjectReadOperation op;
6547 op.getxattrs(&attrs, NULL);
6548 op.stat2(&size, &mtime_ts, NULL);
6549#define HEAD_SIZE 512 * 1024
6550 op.read(0, HEAD_SIZE, &data, NULL);
6551
6552 ret = ioctx.operate(oid, &op, NULL);
6553 if (ret < 0) {
6554 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6555 return ret;
6556 }
6557
6558 if (size > HEAD_SIZE) {
6559 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6560 return -EIO;
6561 }
6562
6563 if (size != data.length()) {
6564 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6565 return -EIO;
6566 }
6567
6568 if (copy_obj) {
6569 librados::ObjectWriteOperation wop;
6570
6571 wop.mtime2(&mtime_ts);
6572
6573 map<string, bufferlist>::iterator iter;
6574 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6575 wop.setxattr(iter->first.c_str(), iter->second);
6576 }
6577
6578 wop.write(0, data);
6579
6580 ioctx.locator_set_key(locator);
6581 ioctx.operate(oid, &wop);
6582 }
6583
6584 if (remove_bad) {
6585 ioctx.locator_set_key(string());
6586
6587 ret = ioctx.remove(oid);
6588 if (ret < 0) {
6589 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6590 return ret;
6591 }
6592 }
6593
6594 return 0;
6595}
6596
6597int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6598 const string& src_oid, const string& src_locator,
6599 librados::IoCtx& dst_ioctx,
6600 const string& dst_oid, const string& dst_locator)
6601{
6602
6603#define COPY_BUF_SIZE (4 * 1024 * 1024)
6604 bool done = false;
6605 uint64_t chunk_size = COPY_BUF_SIZE;
6606 uint64_t ofs = 0;
6607 int ret = 0;
6608 real_time mtime;
6609 struct timespec mtime_ts;
6610 uint64_t size;
6611
6612 if (src_oid == dst_oid && src_locator == dst_locator) {
6613 return 0;
6614 }
6615
6616 src_ioctx.locator_set_key(src_locator);
6617 dst_ioctx.locator_set_key(dst_locator);
6618
6619 do {
6620 bufferlist data;
6621 ObjectReadOperation rop;
6622 ObjectWriteOperation wop;
6623
6624 if (ofs == 0) {
6625 rop.stat2(&size, &mtime_ts, NULL);
6626 mtime = real_clock::from_timespec(mtime_ts);
6627 }
6628 rop.read(ofs, chunk_size, &data, NULL);
6629 ret = src_ioctx.operate(src_oid, &rop, NULL);
6630 if (ret < 0) {
6631 goto done_err;
6632 }
6633
6634 if (data.length() == 0) {
6635 break;
6636 }
6637
6638 if (ofs == 0) {
6639 wop.create(true); /* make it exclusive */
6640 wop.mtime2(&mtime_ts);
6641 mtime = real_clock::from_timespec(mtime_ts);
6642 }
6643 wop.write(ofs, data);
6644 ret = dst_ioctx.operate(dst_oid, &wop);
6645 ofs += data.length();
6646 done = data.length() != chunk_size;
6647 } while (!done);
6648
6649 if (ofs != size) {
6650 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6651 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6652 ret = -EIO;
6653 goto done_err;
6654 }
6655
6656 src_ioctx.remove(src_oid);
6657
6658 return 0;
6659
6660done_err:
6661 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6662 return ret;
6663}
6664
6665/*
6666 * fixes an issue where head objects were supposed to have a locator created, but ended
6667 * up without one
6668 */
6669int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6670{
6671 const rgw_bucket& bucket = bucket_info.bucket;
6672 rgw_obj obj(bucket, key);
6673
6674 if (need_fix) {
6675 *need_fix = false;
6676 }
6677
6678 rgw_rados_ref ref;
6679 int r = get_obj_head_ref(bucket_info, obj, &ref);
6680 if (r < 0) {
6681 return r;
6682 }
6683
6684 RGWObjState *astate = NULL;
6685 RGWObjectCtx rctx(this);
6686 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6687 if (r < 0)
6688 return r;
6689
6690 if (astate->has_manifest) {
6691 RGWObjManifest::obj_iterator miter;
6692 RGWObjManifest& manifest = astate->manifest;
6693 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6694 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6695 rgw_obj loc;
6696 string oid;
6697 string locator;
6698
6699 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6700
6701 if (loc.key.ns.empty()) {
6702 /* continue, we're only interested in tail objects */
6703 continue;
6704 }
6705
6706 get_obj_bucket_and_oid_loc(loc, oid, locator);
6707 ref.ioctx.locator_set_key(locator);
6708
6709 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6710
6711 r = ref.ioctx.stat(oid, NULL, NULL);
6712 if (r != -ENOENT) {
6713 continue;
6714 }
6715
6716 string bad_loc;
6717 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6718
6719 /* create a new ioctx with the bad locator */
6720 librados::IoCtx src_ioctx;
6721 src_ioctx.dup(ref.ioctx);
6722 src_ioctx.locator_set_key(bad_loc);
6723
6724 r = src_ioctx.stat(oid, NULL, NULL);
6725 if (r != 0) {
6726 /* cannot find a broken part */
6727 continue;
6728 }
6729 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6730 if (need_fix) {
6731 *need_fix = true;
6732 }
6733 if (fix) {
6734 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6735 if (r < 0) {
6736 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6737 }
6738 }
6739 }
6740 }
6741
6742 return 0;
6743}
6744
f64942e4
AA
6745int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
6746 const rgw_obj& obj,
6747 RGWBucketInfo* bucket_info_out)
7c673cae
FG
6748{
6749 bucket = _bucket;
6750
6751 RGWObjectCtx obj_ctx(store);
6752
6753 RGWBucketInfo bucket_info;
f64942e4
AA
6754 RGWBucketInfo* bucket_info_p =
6755 bucket_info_out ? bucket_info_out : &bucket_info;
6756
6757 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
7c673cae
FG
6758 if (ret < 0) {
6759 return ret;
6760 }
6761
f64942e4 6762 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae
FG
6763 if (ret < 0) {
6764 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6765 return ret;
6766 }
6767 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6768
6769 return 0;
6770}
6771
f64942e4
AA
6772int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
6773 int sid,
6774 RGWBucketInfo* bucket_info_out)
7c673cae
FG
6775{
6776 bucket = _bucket;
6777 shard_id = sid;
6778
6779 RGWObjectCtx obj_ctx(store);
6780
6781 RGWBucketInfo bucket_info;
f64942e4
AA
6782 RGWBucketInfo* bucket_info_p =
6783 bucket_info_out ? bucket_info_out : &bucket_info;
6784 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
7c673cae
FG
6785 if (ret < 0) {
6786 return ret;
6787 }
6788
f64942e4 6789 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj);
7c673cae
FG
6790 if (ret < 0) {
6791 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6792 return ret;
6793 }
6794 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6795
6796 return 0;
6797}
6798
b32b8144
FG
6799int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
6800{
6801 bucket = bucket_info.bucket;
6802 shard_id = sid;
6803
6804 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6805 if (ret < 0) {
6806 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6807 return ret;
6808 }
6809 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6810
6811 return 0;
6812}
6813
7c673cae
FG
6814
6815/* Execute @handler on last item in bucket listing for bucket specified
6816 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6817 * to objects matching these criterias. */
6818int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6819 const std::string& obj_prefix,
6820 const std::string& obj_delim,
6821 std::function<int(const rgw_bucket_dir_entry&)> handler)
6822{
6823 RGWRados::Bucket target(this, bucket_info);
6824 RGWRados::Bucket::List list_op(&target);
6825
6826 list_op.params.prefix = obj_prefix;
6827 list_op.params.delim = obj_delim;
6828
6829 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6830 << ", obj_prefix=" << obj_prefix
6831 << ", obj_delim=" << obj_delim
6832 << dendl;
6833
6834 bool is_truncated = false;
6835
6836 boost::optional<rgw_bucket_dir_entry> last_entry;
6837 /* We need to rewind to the last object in a listing. */
6838 do {
6839 /* List bucket entries in chunks. */
6840 static constexpr int MAX_LIST_OBJS = 100;
6841 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6842
6843 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6844 &is_truncated);
6845 if (ret < 0) {
6846 return ret;
6847 } else if (!entries.empty()) {
6848 last_entry = entries.back();
6849 }
6850 } while (is_truncated);
6851
6852 if (last_entry) {
6853 return handler(*last_entry);
6854 }
6855
6856 /* Empty listing - no items we can run handler on. */
6857 return 0;
6858}
6859
6860
6861int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6862 const rgw_user& user,
6863 RGWBucketInfo& bucket_info,
6864 rgw_obj& obj)
6865{
6866 if (! swift_versioning_enabled(bucket_info)) {
6867 return 0;
6868 }
6869
6870 obj_ctx.obj.set_atomic(obj);
6871
6872 RGWObjState * state = nullptr;
6873 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6874 if (r < 0) {
6875 return r;
6876 }
6877
6878 if (!state->exists) {
6879 return 0;
6880 }
6881
6882 string client_id;
6883 string op_id;
6884
6885 const string& src_name = obj.get_oid();
6886 char buf[src_name.size() + 32];
6887 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6888 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6889 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6890
6891 RGWBucketInfo dest_bucket_info;
6892
6893 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6894 if (r < 0) {
6895 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6896 if (r == -ENOENT) {
6897 return -ERR_PRECONDITION_FAILED;
6898 }
6899 return r;
6900 }
6901
6902 if (dest_bucket_info.owner != bucket_info.owner) {
6903 return -ERR_PRECONDITION_FAILED;
6904 }
6905
6906 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6907 obj_ctx.obj.set_atomic(dest_obj);
6908
6909 string no_zone;
6910
6911 r = copy_obj(obj_ctx,
6912 user,
6913 client_id,
6914 op_id,
6915 NULL, /* req_info *info */
6916 no_zone,
6917 dest_obj,
6918 obj,
6919 dest_bucket_info,
6920 bucket_info,
6921 NULL, /* time_t *src_mtime */
6922 NULL, /* time_t *mtime */
6923 NULL, /* const time_t *mod_ptr */
6924 NULL, /* const time_t *unmod_ptr */
6925 false, /* bool high_precision_time */
6926 NULL, /* const char *if_match */
6927 NULL, /* const char *if_nomatch */
6928 RGWRados::ATTRSMOD_NONE,
6929 true, /* bool copy_if_newer */
6930 state->attrset,
6931 RGW_OBJ_CATEGORY_MAIN,
6932 0, /* uint64_t olh_epoch */
6933 real_time(), /* time_t delete_at */
6934 NULL, /* string *version_id */
6935 NULL, /* string *ptag */
6936 NULL, /* string *petag */
7c673cae
FG
6937 NULL, /* void (*progress_cb)(off_t, void *) */
6938 NULL); /* void *progress_data */
6939 if (r == -ECANCELED || r == -ENOENT) {
6940 /* Has already been overwritten, meaning another rgw process already
6941 * copied it out */
6942 return 0;
6943 }
6944
6945 return r;
6946}
6947
6948int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6949 const rgw_user& user,
6950 RGWBucketInfo& bucket_info,
6951 rgw_obj& obj,
6952 bool& restored) /* out */
6953{
6954 if (! swift_versioning_enabled(bucket_info)) {
6955 return 0;
6956 }
6957
6958 /* Bucket info of the bucket that stores previous versions of our object. */
6959 RGWBucketInfo archive_binfo;
6960
6961 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6962 bucket_info.swift_ver_location, archive_binfo,
6963 nullptr, nullptr);
6964 if (ret < 0) {
6965 return ret;
6966 }
6967
6968 /* Abort the operation if the bucket storing our archive belongs to someone
6969 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6970 * into consideration. For we can live with that.
6971 *
6972 * TODO: delegate this check to un upper layer and compare with ACLs. */
6973 if (bucket_info.owner != archive_binfo.owner) {
6974 return -EPERM;
6975 }
6976
6977 /* This code will be executed on latest version of the object. */
6978 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6979 std::string no_client_id;
6980 std::string no_op_id;
6981 std::string no_zone;
6982
6983 /* We don't support object versioning of Swift API on those buckets that
6984 * are already versioned using the S3 mechanism. This affects also bucket
6985 * storing archived objects. Otherwise the delete operation would create
6986 * a deletion marker. */
6987 if (archive_binfo.versioned()) {
6988 restored = false;
6989 return -ERR_PRECONDITION_FAILED;
6990 }
6991
6992 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6993 * irrelevant and may be safely skipped. */
6994 std::map<std::string, ceph::bufferlist> no_attrs;
6995
6996 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6997 obj_ctx.obj.set_atomic(archive_obj);
6998 obj_ctx.obj.set_atomic(obj);
6999
7000 int ret = copy_obj(obj_ctx,
7001 user,
7002 no_client_id,
7003 no_op_id,
7004 nullptr, /* req_info *info */
7005 no_zone,
7006 obj, /* dest obj */
7007 archive_obj, /* src obj */
7008 bucket_info, /* dest bucket info */
7009 archive_binfo, /* src bucket info */
7010 nullptr, /* time_t *src_mtime */
7011 nullptr, /* time_t *mtime */
7012 nullptr, /* const time_t *mod_ptr */
7013 nullptr, /* const time_t *unmod_ptr */
7014 false, /* bool high_precision_time */
7015 nullptr, /* const char *if_match */
7016 nullptr, /* const char *if_nomatch */
7017 RGWRados::ATTRSMOD_NONE,
7018 true, /* bool copy_if_newer */
7019 no_attrs,
7020 RGW_OBJ_CATEGORY_MAIN,
7021 0, /* uint64_t olh_epoch */
7022 real_time(), /* time_t delete_at */
7023 nullptr, /* string *version_id */
7024 nullptr, /* string *ptag */
7025 nullptr, /* string *petag */
7c673cae
FG
7026 nullptr, /* void (*progress_cb)(off_t, void *) */
7027 nullptr); /* void *progress_data */
7028 if (ret == -ECANCELED || ret == -ENOENT) {
7029 /* Has already been overwritten, meaning another rgw process already
7030 * copied it out */
7031 return 0;
7032 } else if (ret < 0) {
7033 return ret;
7034 } else {
7035 restored = true;
7036 }
7037
7038 /* Need to remove the archived copy. */
7039 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
7040 archive_binfo.versioning_status());
7041
7042 return ret;
7043 };
7044
7045 const std::string& obj_name = obj.get_oid();
7046 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
7047 % obj_name);
7048
7049 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
7050 handler);
7051}
7052
7053/**
7054 * Write/overwrite an object to the bucket storage.
7055 * bucket: the bucket to store the object in
7056 * obj: the object name/key
7057 * data: the object contents/value
7058 * size: the amount of data to write (data must be this long)
7059 * accounted_size: original size of data before compression, encryption
7060 * mtime: if non-NULL, writes the given mtime to the bucket storage
7061 * attrs: all the given attrs are written to bucket storage for the given object
7062 * exclusive: create object exclusively
7063 * Returns: 0 on success, -ERR# otherwise.
7064 */
7065int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
181888fb
FG
7066 map<string, bufferlist>& attrs,
7067 bool assume_noent, bool modify_tail,
7c673cae
FG
7068 void *_index_op)
7069{
7070 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
7071 RGWRados *store = target->get_store();
7072
7073 ObjectWriteOperation op;
7074
7075 RGWObjState *state;
7076 int r = target->get_state(&state, false, assume_noent);
7077 if (r < 0)
7078 return r;
7079
7080 rgw_obj& obj = target->get_obj();
7081
7082 if (obj.get_oid().empty()) {
7083 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
7084 return -EIO;
7085 }
7086
224ce89b 7087 rgw_rados_ref ref;
7c673cae
FG
7088 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
7089 if (r < 0)
7090 return r;
7091
7092 bool is_olh = state->is_olh;
7093
7094 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
7095
7096 const string *ptag = meta.ptag;
7097 if (!ptag && !index_op->get_optag()->empty()) {
7098 ptag = index_op->get_optag();
7099 }
181888fb 7100 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
7c673cae
FG
7101 if (r < 0)
7102 return r;
7103
7104 if (real_clock::is_zero(meta.set_mtime)) {
7105 meta.set_mtime = real_clock::now();
7106 }
7107
7108 if (state->is_olh) {
7109 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
7110 }
7111
7112 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
7113 op.mtime2(&mtime_ts);
7114
7115 if (meta.data) {
7116 /* if we want to overwrite the data, we also want to overwrite the
7117 xattrs, so just remove the object */
7118 op.write_full(*meta.data);
7119 }
7120
7121 string etag;
7122 string content_type;
7123 bufferlist acl_bl;
7124
7125 map<string, bufferlist>::iterator iter;
7126 if (meta.rmattrs) {
7127 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
7128 const string& name = iter->first;
7129 op.rmxattr(name.c_str());
7130 }
7131 }
7132
7133 if (meta.manifest) {
7134 /* remove existing manifest attr */
7135 iter = attrs.find(RGW_ATTR_MANIFEST);
7136 if (iter != attrs.end())
7137 attrs.erase(iter);
7138
7139 bufferlist bl;
7140 ::encode(*meta.manifest, bl);
7141 op.setxattr(RGW_ATTR_MANIFEST, bl);
7142 }
7143
7144 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
7145 const string& name = iter->first;
7146 bufferlist& bl = iter->second;
7147
7148 if (!bl.length())
7149 continue;
7150
7151 op.setxattr(name.c_str(), bl);
7152
7153 if (name.compare(RGW_ATTR_ETAG) == 0) {
7154 etag = bl.c_str();
7155 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
7156 content_type = bl.c_str();
7157 } else if (name.compare(RGW_ATTR_ACL) == 0) {
7158 acl_bl = bl;
7159 }
7160 }
7161 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
7162 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
7163 }
7164
7165 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
7166 bufferlist bl;
7167 ::encode(store->get_zone_short_id(), bl);
7168 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
7169 }
7170
7171 if (!op.size())
7172 return 0;
7173
7174 uint64_t epoch;
7175 int64_t poolid;
224ce89b
WB
7176 bool orig_exists;
7177 uint64_t orig_size;
7178
7179 if (!reset_obj) { //Multipart upload, it has immutable head.
7180 orig_exists = false;
7181 orig_size = 0;
7182 } else {
7183 orig_exists = state->exists;
7184 orig_size = state->accounted_size;
7185 }
7c673cae 7186
91327a77
AA
7187 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
7188 !obj.key.instance.empty();
7c673cae
FG
7189
7190 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
7191
7192 if (versioned_op) {
7193 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
7194 }
7195
7196 if (!index_op->is_prepared()) {
7197 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
7198 if (r < 0)
7199 return r;
7200 }
7201
7202 r = ref.ioctx.operate(ref.oid, &op);
7203 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
7204 or -ENOENT if was removed, or -EEXIST if it did not exist
7205 before and now it does */
7206 if (r == -EEXIST && assume_noent) {
7207 target->invalidate_state();
7208 return r;
7209 }
7210 goto done_cancel;
7211 }
7212
7213 epoch = ref.ioctx.get_last_version();
7214 poolid = ref.ioctx.get_id();
7215
7216 r = target->complete_atomic_modification();
7217 if (r < 0) {
7218 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7219 }
7220
7221 r = index_op->complete(poolid, epoch, size, accounted_size,
7222 meta.set_mtime, etag, content_type, &acl_bl,
7223 meta.category, meta.remove_objs, meta.user_data);
7224 if (r < 0)
7225 goto done_cancel;
7226
7227 if (meta.mtime) {
7228 *meta.mtime = meta.set_mtime;
7229 }
7230
7231 /* note that index_op was using state so we couldn't invalidate it earlier */
7232 target->invalidate_state();
7233 state = NULL;
7234
91327a77
AA
7235 if (versioned_op && meta.olh_epoch) {
7236 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, meta.zones_trace);
7c673cae
FG
7237 if (r < 0) {
7238 return r;
7239 }
7240 }
7241
7242 if (!real_clock::is_zero(meta.delete_at)) {
7243 rgw_obj_index_key obj_key;
7244 obj.key.get_index_key(&obj_key);
7245
7246 r = store->objexp_hint_add(meta.delete_at,
7247 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7248 if (r < 0) {
7249 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7250 /* ignoring error, nothing we can do at this point */
7251 }
7252 }
7253 meta.canceled = false;
7254
7255 /* update quota cache */
3efd9988
FG
7256 if (meta.completeMultipart){
7257 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7258 0, orig_size);
7259 }
7260 else {
7261 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7262 accounted_size, orig_size);
7263 }
7c673cae
FG
7264 return 0;
7265
7266done_cancel:
7267 int ret = index_op->cancel();
7268 if (ret < 0) {
7269 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7270 }
7271
7272 meta.canceled = true;
7273
7274 /* we lost in a race. There are a few options:
7275 * - existing object was rewritten (ECANCELED)
7276 * - non existing object was created (EEXIST)
7277 * - object was removed (ENOENT)
7278 * should treat it as a success
7279 */
7280 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7281 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7282 r = 0;
7283 }
7284 } else {
7285 if (meta.if_match != NULL) {
7286 // only overwrite existing object
7287 if (strcmp(meta.if_match, "*") == 0) {
7288 if (r == -ENOENT) {
7289 r = -ERR_PRECONDITION_FAILED;
7290 } else if (r == -ECANCELED) {
7291 r = 0;
7292 }
7293 }
7294 }
7295
7296 if (meta.if_nomatch != NULL) {
7297 // only create a new object
7298 if (strcmp(meta.if_nomatch, "*") == 0) {
7299 if (r == -EEXIST) {
7300 r = -ERR_PRECONDITION_FAILED;
7301 } else if (r == -ENOENT) {
7302 r = 0;
7303 }
7304 }
7305 }
7306 }
7307
7308 return r;
7309}
7310
7311int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7312 map<string, bufferlist>& attrs)
7313{
7314 RGWBucketInfo& bucket_info = target->get_bucket_info();
7315
7316 RGWRados::Bucket bop(target->get_store(), bucket_info);
7317 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
7318 index_op.set_zones_trace(meta.zones_trace);
7319
7c673cae
FG
7320 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7321 int r;
7322 if (assume_noent) {
181888fb 7323 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
7324 if (r == -EEXIST) {
7325 assume_noent = false;
7326 }
7327 }
7328 if (!assume_noent) {
181888fb 7329 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
7330 }
7331 return r;
7332}
7333
7334/** Write/overwrite a system object. */
7335int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7336 map<std::string, bufferlist>& attrs, int flags,
7337 bufferlist& data,
7338 RGWObjVersionTracker *objv_tracker,
7339 real_time set_mtime /* 0 for don't set */)
7340{
7c673cae 7341 rgw_rados_ref ref;
224ce89b 7342 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
7343 if (r < 0)
7344 return r;
7345
7346 ObjectWriteOperation op;
7347
7348 if (flags & PUT_OBJ_EXCL) {
7349 if (!(flags & PUT_OBJ_CREATE))
7350 return -EINVAL;
7351 op.create(true); // exclusive create
7352 } else {
7353 op.remove();
7354 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7355 op.create(false);
7356 }
7357
7358 if (objv_tracker) {
7359 objv_tracker->prepare_op_for_write(&op);
7360 }
7361
7362 if (real_clock::is_zero(set_mtime)) {
7363 set_mtime = real_clock::now();
7364 }
7365
7366 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7367 op.mtime2(&mtime_ts);
7368 op.write_full(data);
7369
7370 bufferlist acl_bl;
7371
7372 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7373 const string& name = iter->first;
7374 bufferlist& bl = iter->second;
7375
7376 if (!bl.length())
7377 continue;
7378
7379 op.setxattr(name.c_str(), bl);
7380 }
7381
7382 r = ref.ioctx.operate(ref.oid, &op);
7383 if (r < 0) {
7384 return r;
7385 }
7386
7387 if (objv_tracker) {
7388 objv_tracker->apply_write();
7389 }
7390
7391 if (mtime) {
7392 *mtime = set_mtime;
7393 }
7394
7395 return 0;
7396}
7397
7398int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7399 off_t ofs, bool exclusive,
7400 RGWObjVersionTracker *objv_tracker)
7401{
7402 rgw_rados_ref ref;
224ce89b 7403 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
7404 if (r < 0) {
7405 return r;
7406 }
7407
7408 ObjectWriteOperation op;
7409
7410 if (exclusive)
7411 op.create(true);
7412
7413 if (objv_tracker) {
7414 objv_tracker->prepare_op_for_write(&op);
7415 }
7416 if (ofs == -1) {
7417 op.write_full(bl);
7418 } else {
7419 op.write(ofs, bl);
7420 }
7421 r = ref.ioctx.operate(ref.oid, &op);
7422 if (r < 0)
7423 return r;
7424
7425 if (objv_tracker) {
7426 objv_tracker->apply_write();
7427 }
7428 return 0;
7429}
7430
7431/**
7432 * Write/overwrite an object to the bucket storage.
7433 * bucket: the bucket to store the object in
7434 * obj: the object name/key
7435 * data: the object contents/value
7436 * offset: the offet to write to in the object
7437 * If this is -1, we will overwrite the whole object.
7438 * size: the amount of data to write (data must be this long)
7439 * attrs: all the given attrs are written to bucket storage for the given object
7440 * Returns: 0 on success, -ERR# otherwise.
7441 */
7442
7443int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7444 off_t ofs, bool exclusive,
7445 void **handle)
7446{
7447 rgw_rados_ref ref;
7448 int r = get_raw_obj_ref(obj, &ref);
7449 if (r < 0) {
7450 return r;
7451 }
7452
7453 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7454 *handle = c;
7455
7456 ObjectWriteOperation op;
7457
7458 if (exclusive)
7459 op.create(true);
7460
7461 if (ofs == -1) {
7462 op.write_full(bl);
7463 } else {
7464 op.write(ofs, bl);
7465 }
7466 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7467 if (r < 0)
7468 return r;
7469
7470 return 0;
7471}
7472
7473int RGWRados::aio_wait(void *handle)
7474{
7475 AioCompletion *c = (AioCompletion *)handle;
7476 c->wait_for_safe();
7477 int ret = c->get_return_value();
7478 c->release();
7479 return ret;
7480}
7481
7482bool RGWRados::aio_completed(void *handle)
7483{
7484 AioCompletion *c = (AioCompletion *)handle;
7485 return c->is_safe();
7486}
7487
28e407b8
AA
7488// PutObj filter that buffers data so we don't try to compress tiny blocks.
7489// libcurl reads in 16k at a time, and we need at least 64k to get a good
7490// compression ratio
7491class RGWPutObj_Buffer : public RGWPutObj_Filter {
7492 const unsigned buffer_size;
7493 bufferlist buffer;
7494 public:
7495 RGWPutObj_Buffer(RGWPutObjDataProcessor* next, unsigned buffer_size)
7496 : RGWPutObj_Filter(next), buffer_size(buffer_size) {
7497 assert(ISP2(buffer_size)); // must be power of 2
7498 }
7499
7500 int handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj,
7501 bool *again) override {
7502 if (*again || !bl.length()) {
7503 // flush buffered data
7504 return RGWPutObj_Filter::handle_data(buffer, ofs, phandle, pobj, again);
7505 }
7506 // transform offset to the beginning of the buffer
7507 ofs = ofs - buffer.length();
7508 buffer.claim_append(bl);
7509 if (buffer.length() < buffer_size) {
7510 *again = false; // don't come back until there's more data
7511 return 0;
7512 }
7513 const auto count = P2ALIGN(buffer.length(), buffer_size);
7514 buffer.splice(0, count, &bl);
7515 return RGWPutObj_Filter::handle_data(bl, ofs, phandle, pobj, again);
7516 }
7517};
7518
7c673cae
FG
7519class RGWRadosPutObj : public RGWGetDataCB
7520{
7521 CephContext* cct;
7522 rgw_obj obj;
7523 RGWPutObjDataProcessor *filter;
7524 boost::optional<RGWPutObj_Compress>& compressor;
28e407b8 7525 boost::optional<RGWPutObj_Buffer> buffering;
7c673cae
FG
7526 CompressorRef& plugin;
7527 RGWPutObjProcessor_Atomic *processor;
7528 RGWOpStateSingleOp *opstate;
7529 void (*progress_cb)(off_t, void *);
7530 void *progress_data;
7531 bufferlist extra_data_bl;
b32b8144 7532 uint64_t extra_data_left;
7c673cae
FG
7533 uint64_t data_len;
7534 map<string, bufferlist> src_attrs;
7535public:
7536 RGWRadosPutObj(CephContext* cct,
7537 CompressorRef& plugin,
7538 boost::optional<RGWPutObj_Compress>& compressor,
7539 RGWPutObjProcessor_Atomic *p,
7540 RGWOpStateSingleOp *_ops,
7541 void (*_progress_cb)(off_t, void *),
7542 void *_progress_data) :
7543 cct(cct),
7544 filter(p),
7545 compressor(compressor),
7546 plugin(plugin),
7547 processor(p),
7548 opstate(_ops),
7549 progress_cb(_progress_cb),
7550 progress_data(_progress_data),
b32b8144 7551 extra_data_left(0),
7c673cae
FG
7552 data_len(0) {}
7553
7554 int process_attrs(void) {
7555 if (extra_data_bl.length()) {
7556 JSONParser jp;
7557 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7558 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7559 return -EIO;
7560 }
7561
7562 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7563
7564 src_attrs.erase(RGW_ATTR_COMPRESSION);
7565 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7566 }
7567
7568 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7569 //do not compress if object is encrypted
7570 compressor = boost::in_place(cct, plugin, filter);
28e407b8
AA
7571 constexpr unsigned buffer_size = 512 * 1024;
7572 buffering = boost::in_place(&*compressor, buffer_size);
7573 filter = &*buffering;
7c673cae
FG
7574 }
7575 return 0;
7576 }
7577
7578 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7579 if (progress_cb) {
7580 progress_cb(ofs, progress_data);
7581 }
b32b8144 7582 if (extra_data_left) {
7c673cae 7583 size_t extra_len = bl.length();
b32b8144
FG
7584 if (extra_len > extra_data_left)
7585 extra_len = extra_data_left;
7c673cae
FG
7586
7587 bufferlist extra;
7588 bl.splice(0, extra_len, &extra);
7589 extra_data_bl.append(extra);
7590
b32b8144
FG
7591 extra_data_left -= extra_len;
7592 if (extra_data_left == 0) {
7c673cae
FG
7593 int res = process_attrs();
7594 if (res < 0)
7595 return res;
7596 }
7597 if (bl.length() == 0) {
7598 return 0;
7599 }
b32b8144 7600 ofs += extra_len;
7c673cae 7601 }
b32b8144
FG
7602 // adjust ofs based on extra_data_len, so the result is a logical offset
7603 // into the object data
7604 assert(uint64_t(ofs) >= extra_data_len);
7605 ofs -= extra_data_len;
7606
7c673cae
FG
7607 data_len += bl.length();
7608 bool again = false;
7609
7610 bool need_opstate = true;
7611
7612 do {
7613 void *handle = NULL;
7614 rgw_raw_obj obj;
7615 uint64_t size = bl.length();
7616 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7617 if (ret < 0)
7618 return ret;
7619
7620 if (need_opstate && opstate) {
7621 /* need to update opstate repository with new state. This is ratelimited, so we're not
7622 * really doing it every time
7623 */
7624 ret = opstate->renew_state();
7625 if (ret < 0) {
7626 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7627 int r = filter->throttle_data(handle, obj, size, false);
7628 if (r < 0) {
7629 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7630 }
7631 /* could not renew state! might have been marked as cancelled */
7632 return ret;
7633 }
7634 need_opstate = false;
7635 }
7636
7637 ret = filter->throttle_data(handle, obj, size, false);
7638 if (ret < 0)
7639 return ret;
7640 } while (again);
7641
7642 return 0;
7643 }
7644
28e407b8
AA
7645 int flush() {
7646 bufferlist bl;
7647 return put_data_and_throttle(filter, bl, 0, false);
7648 }
7649
7c673cae
FG
7650 bufferlist& get_extra_data() { return extra_data_bl; }
7651
7652 map<string, bufferlist>& get_attrs() { return src_attrs; }
7653
7654 void set_extra_data_len(uint64_t len) override {
b32b8144
FG
7655 extra_data_left = len;
7656 RGWGetDataCB::set_extra_data_len(len);
7c673cae
FG
7657 }
7658
7659 uint64_t get_data_len() {
7660 return data_len;
7661 }
7662
7663 int complete(const string& etag, real_time *mtime, real_time set_mtime,
31f18b77
FG
7664 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7665 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7c673cae
FG
7666 }
7667
7668 bool is_canceled() {
7669 return processor->is_canceled();
7670 }
7671};
7672
7673/*
7674 * prepare attrset depending on attrs_mod.
7675 */
7676static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7677 map<string, bufferlist>& attrs,
7678 RGWRados::AttrsMod attrs_mod)
7679{
7680 switch (attrs_mod) {
7681 case RGWRados::ATTRSMOD_NONE:
7682 attrs = src_attrs;
7683 break;
7684 case RGWRados::ATTRSMOD_REPLACE:
7685 if (!attrs[RGW_ATTR_ETAG].length()) {
7686 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7687 }
181888fb
FG
7688 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7689 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7690 if (ttiter != src_attrs.end()) {
7691 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7692 }
7693 }
7c673cae
FG
7694 break;
7695 case RGWRados::ATTRSMOD_MERGE:
7696 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7697 if (attrs.find(it->first) == attrs.end()) {
7698 attrs[it->first] = it->second;
7699 }
7700 }
7701 break;
7702 }
7703}
7704
7705int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7706{
7707 map<string, bufferlist> attrset;
7708
7709 real_time mtime;
7710 uint64_t obj_size;
7711 RGWObjectCtx rctx(this);
7712
7713 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7714 RGWRados::Object::Read read_op(&op_target);
7715
7716 read_op.params.attrs = &attrset;
7717 read_op.params.lastmod = &mtime;
7718 read_op.params.obj_size = &obj_size;
7719
7720 int ret = read_op.prepare();
7721 if (ret < 0)
7722 return ret;
7723
7724 attrset.erase(RGW_ATTR_ID_TAG);
181888fb 7725 attrset.erase(RGW_ATTR_TAIL_TAG);
7c673cae
FG
7726
7727 uint64_t max_chunk_size;
7728
7729 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7730 if (ret < 0) {
7731 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7732 return ret;
7733 }
7734
b32b8144
FG
7735 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj,
7736 max_chunk_size, NULL, mtime, attrset,
7737 RGW_OBJ_CATEGORY_MAIN, 0, real_time(),
7738 (obj.key.instance.empty() ? NULL : &(obj.key.instance)),
7739 NULL, NULL);
7c673cae
FG
7740}
7741
7742struct obj_time_weight {
7743 real_time mtime;
7744 uint32_t zone_short_id;
7745 uint64_t pg_ver;
7746 bool high_precision;
7747
7748 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7749
7750 bool compare_low_precision(const obj_time_weight& rhs) {
7751 struct timespec l = ceph::real_clock::to_timespec(mtime);
7752 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7753 l.tv_nsec = 0;
7754 r.tv_nsec = 0;
7755 if (l > r) {
7756 return false;
7757 }
7758 if (l < r) {
7759 return true;
7760 }
7761 if (zone_short_id != rhs.zone_short_id) {
7762 return (zone_short_id < rhs.zone_short_id);
7763 }
7764 return (pg_ver < rhs.pg_ver);
7765
7766 }
7767
7768 bool operator<(const obj_time_weight& rhs) {
7769 if (!high_precision || !rhs.high_precision) {
7770 return compare_low_precision(rhs);
7771 }
7772 if (mtime > rhs.mtime) {
7773 return false;
7774 }
7775 if (mtime < rhs.mtime) {
7776 return true;
7777 }
7778 if (zone_short_id != rhs.zone_short_id) {
7779 return (zone_short_id < rhs.zone_short_id);
7780 }
7781 return (pg_ver < rhs.pg_ver);
7782 }
7783
7784 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7785 mtime = _mtime;
7786 zone_short_id = _short_id;
7787 pg_ver = _pg_ver;
7788 }
7789
7790 void init(RGWObjState *state) {
7791 mtime = state->mtime;
7792 zone_short_id = state->zone_short_id;
7793 pg_ver = state->pg_ver;
7794 }
7795};
7796
7797inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7798 out << o.mtime;
7799
7800 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7801 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7802 }
7803
7804 return out;
7805}
7806
7807class RGWGetExtraDataCB : public RGWGetDataCB {
7808 bufferlist extra_data;
7809public:
7810 RGWGetExtraDataCB() {}
7811 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7812 if (extra_data.length() < extra_data_len) {
7813 off_t max = extra_data_len - extra_data.length();
7814 if (max > bl_len) {
7815 max = bl_len;
7816 }
7817 bl.splice(0, max, &extra_data);
7818 }
7819 return bl_len;
7820 }
7821
7822 bufferlist& get_extra_data() {
7823 return extra_data;
7824 }
7825};
7826
7827int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7828 const rgw_user& user_id,
7829 const string& client_id,
7830 req_info *info,
7831 const string& source_zone,
7832 rgw_obj& src_obj,
7833 RGWBucketInfo& src_bucket_info,
7834 real_time *src_mtime,
7835 uint64_t *psize,
7836 const real_time *mod_ptr,
7837 const real_time *unmod_ptr,
7838 bool high_precision_time,
7839 const char *if_match,
7840 const char *if_nomatch,
7841 map<string, bufferlist> *pattrs,
7842 string *version_id,
7843 string *ptag,
7844 string *petag)
7845{
7846 /* source is in a different zonegroup, copy from there */
7847
7848 RGWRESTStreamRWRequest *in_stream_req;
7849 string tag;
7850 map<string, bufferlist> src_attrs;
7851 append_rand_alpha(cct, tag, tag, 32);
7852 obj_time_weight set_mtime_weight;
7853 set_mtime_weight.high_precision = high_precision_time;
7854
7855 RGWRESTConn *conn;
7856 if (source_zone.empty()) {
7857 if (src_bucket_info.zonegroup.empty()) {
7858 /* source is in the master zonegroup */
7859 conn = rest_master_conn;
7860 } else {
7861 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7862 if (iter == zonegroup_conn_map.end()) {
7863 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7864 return -ENOENT;
7865 }
7866 conn = iter->second;
7867 }
7868 } else {
7869 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7870 if (iter == zone_conn_map.end()) {
7871 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7872 return -ENOENT;
7873 }
7874 conn = iter->second;
7875 }
7876
7877 RGWGetExtraDataCB cb;
7878 string etag;
7879 map<string, string> req_headers;
7880 real_time set_mtime;
7881
7882 const real_time *pmod = mod_ptr;
7883
7884 obj_time_weight dest_mtime_weight;
7885
181888fb
FG
7886 constexpr bool prepend_meta = true;
7887 constexpr bool get_op = true;
7888 constexpr bool rgwx_stat = true;
7889 constexpr bool sync_manifest = true;
7890 constexpr bool skip_decrypt = true;
7c673cae
FG
7891 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7892 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb
FG
7893 prepend_meta, get_op, rgwx_stat,
7894 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7c673cae
FG
7895 if (ret < 0) {
7896 return ret;
7897 }
7898
7899 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7900 if (ret < 0) {
7901 return ret;
7902 }
7903
7904 bufferlist& extra_data_bl = cb.get_extra_data();
7905 if (extra_data_bl.length()) {
7906 JSONParser jp;
7907 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7908 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7909 return -EIO;
7910 }
7911
7912 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7913
7914 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7915 }
7916
7917 if (src_mtime) {
7918 *src_mtime = set_mtime;
7919 }
7920
7921 if (petag) {
7922 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7923 if (iter != src_attrs.end()) {
7924 bufferlist& etagbl = iter->second;
7925 *petag = etagbl.to_str();
7926 }
7927 }
7928
7929 if (pattrs) {
7930 *pattrs = src_attrs;
7931 }
7932
7933 return 0;
7934}
7935
7936int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7937 const rgw_user& user_id,
7938 const string& client_id,
7939 const string& op_id,
7940 bool record_op_state,
7941 req_info *info,
7942 const string& source_zone,
7943 rgw_obj& dest_obj,
7944 rgw_obj& src_obj,
7945 RGWBucketInfo& dest_bucket_info,
7946 RGWBucketInfo& src_bucket_info,
7947 real_time *src_mtime,
7948 real_time *mtime,
7949 const real_time *mod_ptr,
7950 const real_time *unmod_ptr,
7951 bool high_precision_time,
7952 const char *if_match,
7953 const char *if_nomatch,
7954 AttrsMod attrs_mod,
7955 bool copy_if_newer,
7956 map<string, bufferlist>& attrs,
7957 RGWObjCategory category,
91327a77 7958 boost::optional<uint64_t> olh_epoch,
7c673cae
FG
7959 real_time delete_at,
7960 string *version_id,
7961 string *ptag,
7962 ceph::buffer::list *petag,
7c673cae 7963 void (*progress_cb)(off_t, void *),
31f18b77
FG
7964 void *progress_data,
7965 rgw_zone_set *zones_trace)
7c673cae
FG
7966{
7967 /* source is in a different zonegroup, copy from there */
7968
7969 RGWRESTStreamRWRequest *in_stream_req;
7970 string tag;
7971 int i;
7972 append_rand_alpha(cct, tag, tag, 32);
7973 obj_time_weight set_mtime_weight;
7974 set_mtime_weight.high_precision = high_precision_time;
7975
7976 RGWPutObjProcessor_Atomic processor(obj_ctx,
7977 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7978 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7979 if (version_id && *version_id != "null") {
7980 processor.set_version_id(*version_id);
7981 }
91327a77
AA
7982 if (olh_epoch) {
7983 processor.set_olh_epoch(*olh_epoch);
7984 }
7c673cae
FG
7985 int ret = processor.prepare(this, NULL);
7986 if (ret < 0) {
7987 return ret;
7988 }
7989
7990 RGWRESTConn *conn;
7991 if (source_zone.empty()) {
7992 if (dest_bucket_info.zonegroup.empty()) {
7993 /* source is in the master zonegroup */
7994 conn = rest_master_conn;
7995 } else {
7996 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7997 if (iter == zonegroup_conn_map.end()) {
7998 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7999 return -ENOENT;
8000 }
8001 conn = iter->second;
8002 }
8003 } else {
8004 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
8005 if (iter == zone_conn_map.end()) {
8006 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
8007 return -ENOENT;
8008 }
8009 conn = iter->second;
8010 }
8011
8012 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
8013
8014 RGWOpStateSingleOp *opstate = NULL;
8015
8016 if (record_op_state) {
8017 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
8018
8019 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
8020 if (ret < 0) {
8021 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
8022 delete opstate;
8023 return ret;
8024 }
8025 }
8026
8027 boost::optional<RGWPutObj_Compress> compressor;
8028 CompressorRef plugin;
8029
8030 const auto& compression_type = zone_params.get_compression_type(
8031 dest_bucket_info.placement_rule);
8032 if (compression_type != "none") {
8033 plugin = Compressor::create(cct, compression_type);
8034 if (!plugin) {
8035 ldout(cct, 1) << "Cannot load plugin for compression type "
8036 << compression_type << dendl;
8037 }
8038 }
8039
8040 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
8041
8042 string etag;
8043 map<string, string> req_headers;
8044 real_time set_mtime;
8045
8046 RGWObjState *dest_state = NULL;
8047
8048 const real_time *pmod = mod_ptr;
8049
8050 obj_time_weight dest_mtime_weight;
8051
8052 if (copy_if_newer) {
8053 /* need to get mtime for destination */
8054 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
8055 if (ret < 0)
8056 goto set_err_state;
8057
8058 if (!real_clock::is_zero(dest_state->mtime)) {
8059 dest_mtime_weight.init(dest_state);
8060 pmod = &dest_mtime_weight.mtime;
8061 }
8062 }
8063
181888fb
FG
8064 static constexpr bool prepend_meta = true;
8065 static constexpr bool get_op = true;
8066 static constexpr bool rgwx_stat = false;
8067 static constexpr bool sync_manifest = true;
8068 static constexpr bool skip_decrypt = true;
7c673cae
FG
8069 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
8070 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb
FG
8071 prepend_meta, get_op, rgwx_stat,
8072 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7c673cae
FG
8073 if (ret < 0) {
8074 goto set_err_state;
8075 }
8076
8077 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
8078 if (ret < 0) {
8079 goto set_err_state;
8080 }
28e407b8
AA
8081 ret = cb.flush();
8082 if (ret < 0) {
8083 goto set_err_state;
8084 }
7c673cae
FG
8085 if (compressor && compressor->is_compressed()) {
8086 bufferlist tmp;
8087 RGWCompressionInfo cs_info;
8088 cs_info.compression_type = plugin->get_type_name();
8089 cs_info.orig_size = cb.get_data_len();
8090 cs_info.blocks = move(compressor->get_compression_blocks());
8091 ::encode(cs_info, tmp);
8092 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
8093 }
8094
8095 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
8096 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
8097 } else {
8098 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
8099 if (iter != cb.get_attrs().end()) {
8100 try {
8101 ::decode(delete_at, iter->second);
8102 } catch (buffer::error& err) {
8103 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
8104 }
8105 }
8106 }
8107
8108 if (src_mtime) {
8109 *src_mtime = set_mtime;
8110 }
8111
8112 if (petag) {
8113 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
8114 if (iter != cb.get_attrs().end()) {
8115 *petag = iter->second;
8116 }
8117 }
8118
8119 if (source_zone.empty()) {
8120 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
8121 } else {
8122 attrs = cb.get_attrs();
8123 }
8124
8125 if (copy_if_newer) {
8126 uint64_t pg_ver = 0;
8127 auto i = attrs.find(RGW_ATTR_PG_VER);
8128 if (i != attrs.end() && i->second.length() > 0) {
8129 bufferlist::iterator iter = i->second.begin();
8130 try {
8131 ::decode(pg_ver, iter);
8132 } catch (buffer::error& err) {
8133 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
8134 /* non critical error */
8135 }
8136 }
8137 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
8138 }
8139
8140#define MAX_COMPLETE_RETRY 100
8141 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
31f18b77 8142 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7c673cae
FG
8143 if (ret < 0) {
8144 goto set_err_state;
8145 }
8146 if (copy_if_newer && cb.is_canceled()) {
8147 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
8148 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
8149 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
8150 if (ret < 0) {
8151 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
8152 goto set_err_state;
8153 }
8154 dest_mtime_weight.init(dest_state);
8155 dest_mtime_weight.high_precision = high_precision_time;
8156 if (!dest_state->exists ||
8157 dest_mtime_weight < set_mtime_weight) {
8158 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8159 continue;
8160 } else {
8161 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8162 }
8163 }
8164 break;
8165 }
8166
8167 if (i == MAX_COMPLETE_RETRY) {
8168 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
8169 ret = -EIO;
8170 goto set_err_state;
8171 }
8172
8173 if (opstate) {
8174 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
8175 if (ret < 0) {
8176 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
8177 }
8178 delete opstate;
8179 }
8180
8181 return 0;
8182set_err_state:
8183 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
8184 // we may have already fetched during sync of OP_ADD, but were waiting
8185 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
8186 if (olh_epoch && *olh_epoch > 0) {
8187 constexpr bool log_data_change = true;
8188 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
8189 *olh_epoch, real_time(), false, zones_trace, log_data_change);
8190 } else {
8191 // we already have the latest copy
8192 ret = 0;
8193 }
7c673cae
FG
8194 }
8195 if (opstate) {
8196 RGWOpState::OpState state;
8197 if (ret < 0) {
8198 state = RGWOpState::OPSTATE_ERROR;
8199 } else {
8200 state = RGWOpState::OPSTATE_COMPLETE;
8201 }
8202 int r = opstate->set_state(state);
8203 if (r < 0) {
8204 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
8205 }
8206 delete opstate;
8207 }
8208 return ret;
8209}
8210
8211
8212int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
8213 map<string, bufferlist>& src_attrs,
8214 RGWRados::Object::Read& read_op,
8215 const rgw_user& user_id,
8216 rgw_obj& dest_obj,
8217 real_time *mtime)
8218{
8219 string etag;
8220
8221 RGWRESTStreamWriteRequest *out_stream_req;
8222
8223 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
8224 if (ret < 0) {
7c673cae
FG
8225 return ret;
8226 }
8227
8228 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
224ce89b
WB
8229 if (ret < 0) {
8230 delete out_stream_req;
7c673cae 8231 return ret;
224ce89b 8232 }
7c673cae
FG
8233
8234 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
8235 if (ret < 0)
8236 return ret;
8237
8238 return 0;
8239}
8240
8241/**
8242 * Copy an object.
8243 * dest_obj: the object to copy into
8244 * src_obj: the object to copy from
8245 * attrs: usage depends on attrs_mod parameter
8246 * attrs_mod: the modification mode of the attrs, may have the following values:
8247 * ATTRSMOD_NONE - the attributes of the source object will be
8248 * copied without modifications, attrs parameter is ignored;
8249 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
8250 * parameter, source object attributes are not copied;
8251 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
8252 * are overwritten by values contained in attrs parameter.
8253 * err: stores any errors resulting from the get of the original object
8254 * Returns: 0 on success, -ERR# otherwise.
8255 */
8256int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
8257 const rgw_user& user_id,
8258 const string& client_id,
8259 const string& op_id,
8260 req_info *info,
8261 const string& source_zone,
8262 rgw_obj& dest_obj,
8263 rgw_obj& src_obj,
8264 RGWBucketInfo& dest_bucket_info,
8265 RGWBucketInfo& src_bucket_info,
8266 real_time *src_mtime,
8267 real_time *mtime,
8268 const real_time *mod_ptr,
8269 const real_time *unmod_ptr,
8270 bool high_precision_time,
8271 const char *if_match,
8272 const char *if_nomatch,
8273 AttrsMod attrs_mod,
8274 bool copy_if_newer,
8275 map<string, bufferlist>& attrs,
8276 RGWObjCategory category,
8277 uint64_t olh_epoch,
8278 real_time delete_at,
8279 string *version_id,
8280 string *ptag,
8281 ceph::buffer::list *petag,
7c673cae
FG
8282 void (*progress_cb)(off_t, void *),
8283 void *progress_data)
8284{
8285 int ret;
8286 uint64_t obj_size;
8287 rgw_obj shadow_obj = dest_obj;
8288 string shadow_oid;
8289
8290 bool remote_src;
8291 bool remote_dest;
8292
8293 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
8294 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
8295
8296 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
8297 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
8298
8299 if (remote_src && remote_dest) {
8300 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
8301 return -EINVAL;
8302 }
8303
8304 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
8305
8306 if (remote_src || !source_zone.empty()) {
8307 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
8308 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
8309 unmod_ptr, high_precision_time,
8310 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
31f18b77 8311 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
7c673cae
FG
8312 }
8313
8314 map<string, bufferlist> src_attrs;
8315 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8316 RGWRados::Object::Read read_op(&src_op_target);
8317
8318 read_op.conds.mod_ptr = mod_ptr;
8319 read_op.conds.unmod_ptr = unmod_ptr;
8320 read_op.conds.high_precision_time = high_precision_time;
8321 read_op.conds.if_match = if_match;
8322 read_op.conds.if_nomatch = if_nomatch;
8323 read_op.params.attrs = &src_attrs;
8324 read_op.params.lastmod = src_mtime;
8325 read_op.params.obj_size = &obj_size;
7c673cae
FG
8326
8327 ret = read_op.prepare();
8328 if (ret < 0) {
8329 return ret;
8330 }
94b18763
FG
8331 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
8332 // Current implementation does not follow S3 spec and even
8333 // may result in data corruption silently when copying
8334 // multipart objects across pools. So reject COPY operations
8335 //on encrypted objects before it is fully functional.
8336 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
8337 << " has not been implemented." << dendl;
8338 return -ERR_NOT_IMPLEMENTED;
8339 }
7c673cae
FG
8340
8341 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8342 src_attrs.erase(RGW_ATTR_DELETE_AT);
8343
8344 set_copy_attrs(src_attrs, attrs, attrs_mod);
8345 attrs.erase(RGW_ATTR_ID_TAG);
8346 attrs.erase(RGW_ATTR_PG_VER);
8347 attrs.erase(RGW_ATTR_SOURCE_ZONE);
8348 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8349 if (cmp != src_attrs.end())
8350 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8351
8352 RGWObjManifest manifest;
8353 RGWObjState *astate = NULL;
8354
8355 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8356 if (ret < 0) {
8357 return ret;
8358 }
8359
8360 vector<rgw_raw_obj> ref_objs;
8361
8362 if (remote_dest) {
8363 /* dest is in a different zonegroup, copy it there */
8364 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8365 }
8366 uint64_t max_chunk_size;
8367
8368 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8369 if (ret < 0) {
8370 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8371 return ret;
8372 }
8373
8374 rgw_pool src_pool;
8375 rgw_pool dest_pool;
8376 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8377 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8378 return -EIO;
8379 }
8380 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8381 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8382 return -EIO;
8383 }
8384
8385
8386 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8387 bool copy_first = false;
8388 if (astate->has_manifest) {
8389 if (!astate->manifest.has_tail()) {
8390 copy_data = true;
8391 } else {
8392 uint64_t head_size = astate->manifest.get_head_size();
8393
8394 if (head_size > 0) {
8395 if (head_size > max_chunk_size) {
8396 copy_data = true;
8397 } else {
8398 copy_first = true;
8399 }
8400 }
8401 }
8402 }
8403
8404 if (petag) {
8405 const auto iter = attrs.find(RGW_ATTR_ETAG);
8406 if (iter != attrs.end()) {
8407 *petag = iter->second;
8408 }
8409 }
8410
8411 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8412 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8413 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
31f18b77 8414 version_id, ptag, petag);
7c673cae
FG
8415 }
8416
8417 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8418
8419 if (copy_first) { // we need to copy first chunk, not increase refcount
8420 ++miter;
8421 }
8422
8423 rgw_rados_ref ref;
8424 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8425 if (ret < 0) {
8426 return ret;
8427 }
8428
8429 bool versioned_dest = dest_bucket_info.versioning_enabled();
8430
8431 if (version_id && !version_id->empty()) {
8432 versioned_dest = true;
8433 dest_obj.key.set_instance(*version_id);
8434 } else if (versioned_dest) {
8435 gen_rand_obj_instance_name(&dest_obj);
8436 }
8437
8438 bufferlist first_chunk;
8439
8440 bool copy_itself = (dest_obj == src_obj);
8441 RGWObjManifest *pmanifest;
31f18b77 8442 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae
FG
8443
8444 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8445 RGWRados::Object::Write write_op(&dest_op_target);
8446
8447 string tag;
8448
8449 if (ptag) {
8450 tag = *ptag;
8451 }
8452
8453 if (tag.empty()) {
8454 append_rand_alpha(cct, tag, tag, 32);
8455 }
8456
8457 if (!copy_itself) {
181888fb 8458 attrs.erase(RGW_ATTR_TAIL_TAG);
7c673cae
FG
8459 manifest = astate->manifest;
8460 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8461 if (tail_placement.bucket.name.empty()) {
8462 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8463 }
3efd9988 8464 string ref_tag;
7c673cae
FG
8465 for (; miter != astate->manifest.obj_end(); ++miter) {
8466 ObjectWriteOperation op;
3efd9988
FG
8467 ref_tag = tag + '\0';
8468 cls_refcount_get(op, ref_tag, true);
7c673cae
FG
8469 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8470 ref.ioctx.locator_set_key(loc.loc);
8471
8472 ret = ref.ioctx.operate(loc.oid, &op);
8473 if (ret < 0) {
8474 goto done_ret;
8475 }
8476
8477 ref_objs.push_back(loc);
8478 }
8479
8480 pmanifest = &manifest;
8481 } else {
8482 pmanifest = &astate->manifest;
8483 /* don't send the object's tail for garbage collection */
8484 astate->keep_tail = true;
8485 }
8486
8487 if (copy_first) {
8488 ret = read_op.read(0, max_chunk_size, first_chunk);
8489 if (ret < 0) {
8490 goto done_ret;
8491 }
8492
8493 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8494 } else {
8495 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8496 }
8497
8498 write_op.meta.data = &first_chunk;
8499 write_op.meta.manifest = pmanifest;
8500 write_op.meta.ptag = &tag;
8501 write_op.meta.owner = dest_bucket_info.owner;
8502 write_op.meta.mtime = mtime;
8503 write_op.meta.flags = PUT_OBJ_CREATE;
8504 write_op.meta.category = category;
8505 write_op.meta.olh_epoch = olh_epoch;
8506 write_op.meta.delete_at = delete_at;
181888fb 8507 write_op.meta.modify_tail = !copy_itself;
7c673cae
FG
8508
8509 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8510 if (ret < 0) {
8511 goto done_ret;
8512 }
8513
8514 return 0;
8515
8516done_ret:
8517 if (!copy_itself) {
8518 vector<rgw_raw_obj>::iterator riter;
8519
7c673cae
FG
8520 /* rollback reference */
8521 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8522 ObjectWriteOperation op;
8523 cls_refcount_put(op, tag, true);
8524
8525 ref.ioctx.locator_set_key(riter->loc);
8526
8527 int r = ref.ioctx.operate(riter->oid, &op);
8528 if (r < 0) {
8529 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8530 }
8531 }
8532 }
8533 return ret;
8534}
8535
8536
8537int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8538 RGWBucketInfo& dest_bucket_info,
8539 RGWRados::Object::Read& read_op, off_t end,
8540 rgw_obj& dest_obj,
8541 rgw_obj& src_obj,
8542 uint64_t max_chunk_size,
8543 real_time *mtime,
8544 real_time set_mtime,
8545 map<string, bufferlist>& attrs,
8546 RGWObjCategory category,
8547 uint64_t olh_epoch,
8548 real_time delete_at,
8549 string *version_id,
8550 string *ptag,
31f18b77 8551 ceph::buffer::list *petag)
7c673cae
FG
8552{
8553 bufferlist first_chunk;
8554 RGWObjManifest manifest;
8555
8556 string tag;
8557 append_rand_alpha(cct, tag, tag, 32);
8558
8559 RGWPutObjProcessor_Atomic processor(obj_ctx,
b32b8144 8560 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7c673cae
FG
8561 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8562 if (version_id) {
8563 processor.set_version_id(*version_id);
8564 }
8565 processor.set_olh_epoch(olh_epoch);
8566 int ret = processor.prepare(this, NULL);
8567 if (ret < 0)
8568 return ret;
8569
8570 off_t ofs = 0;
8571
8572 do {
8573 bufferlist bl;
8574 ret = read_op.read(ofs, end, bl);
8575
8576 uint64_t read_len = ret;
8577 bool again;
8578
8579 do {
8580 void *handle;
8581 rgw_raw_obj obj;
8582
8583 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8584 if (ret < 0) {
8585 return ret;
8586 }
8587 ret = processor.throttle_data(handle, obj, read_len, false);
8588 if (ret < 0)
8589 return ret;
8590 } while (again);
8591
8592 ofs += read_len;
8593 } while (ofs <= end);
8594
8595 string etag;
8596 auto iter = attrs.find(RGW_ATTR_ETAG);
8597 if (iter != attrs.end()) {
8598 bufferlist& bl = iter->second;
8599 etag = string(bl.c_str(), bl.length());
8600 if (petag) {
8601 *petag = bl;
8602 }
8603 }
8604
8605 uint64_t accounted_size;
8606 {
8607 bool compressed{false};
8608 RGWCompressionInfo cs_info;
8609 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8610 if (ret < 0) {
8611 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8612 return ret;
8613 }
8614 // pass original size if compressed
8615 accounted_size = compressed ? cs_info.orig_size : ofs;
8616 }
8617
8618 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8619}
8620
8621bool RGWRados::is_meta_master()
8622{
31f18b77 8623 if (!get_zonegroup().is_master_zonegroup()) {
7c673cae
FG
8624 return false;
8625 }
8626
8627 return (get_zonegroup().master_zone == zone_public_config.id);
8628}
8629
8630/**
8631 * Check to see if the bucket metadata could be synced
8632 * bucket: the bucket to check
8633 * Returns false is the bucket is not synced
8634 */
8635bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8636{
8637
8638 /* no current period */
8639 if (current_period.get_id().empty()) {
8640 return false;
8641 }
8642
8643 /* zonegroup is not master zonegroup */
31f18b77 8644 if (!get_zonegroup().is_master_zonegroup()) {
7c673cae
FG
8645 return false;
8646 }
8647
8648 /* single zonegroup and a single zone */
224ce89b 8649 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
7c673cae
FG
8650 return false;
8651 }
8652
8653 /* zone is not master */
8654 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8655 return false;
8656 }
8657
8658 return true;
8659}
8660
8661int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8662{
1adf2230 8663 std::vector<rgw_bucket_dir_entry> ent_list;
7c673cae
FG
8664 rgw_obj_index_key marker;
8665 string prefix;
8666 bool is_truncated;
8667
8668 do {
1adf2230
AA
8669 constexpr uint NUM_ENTRIES = 1000u;
8670 int r = cls_bucket_list_unordered(bucket_info,
8671 RGW_NO_SHARD,
8672 marker,
8673 prefix,
8674 NUM_ENTRIES,
8675 true,
8676 ent_list,
8677 &is_truncated,
8678 &marker);
7c673cae
FG
8679 if (r < 0)
8680 return r;
8681
8682 string ns;
1adf2230 8683 for (auto const& dirent : ent_list) {
7c673cae
FG
8684 rgw_obj_key obj;
8685
1adf2230 8686 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
7c673cae
FG
8687 return -ENOTEMPTY;
8688 }
8689 } while (is_truncated);
1adf2230 8690
7c673cae
FG
8691 return 0;
8692}
8693
8694/**
8695 * Delete a bucket.
8696 * bucket: the name of the bucket to delete
8697 * Returns 0 on success, -ERR# otherwise.
8698 */
8699int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8700{
8701 const rgw_bucket& bucket = bucket_info.bucket;
8702 librados::IoCtx index_ctx;
8703 map<int, string> bucket_objs;
8704 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8705 if (r < 0)
8706 return r;
8707
8708 if (check_empty) {
8709 r = check_bucket_empty(bucket_info);
8710 if (r < 0) {
8711 return r;
8712 }
8713 }
8714
8715 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8716 if (r < 0)
8717 return r;
8718
8719 /* if the bucket is not synced we can remove the meta file */
8720 if (!is_syncing_bucket_meta(bucket)) {
8721 RGWObjVersionTracker objv_tracker;
f64942e4 8722 r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker);
7c673cae
FG
8723 if (r < 0) {
8724 return r;
8725 }
f64942e4
AA
8726
8727 /* remove bucket index objects asynchronously by best effort */
8728 (void) CLSRGWIssueBucketIndexClean(index_ctx,
8729 bucket_objs,
8730 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 8731 }
f64942e4 8732
7c673cae
FG
8733 return 0;
8734}
8735
8736int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8737{
8738 RGWBucketInfo info;
8739 map<string, bufferlist> attrs;
8740 RGWObjectCtx obj_ctx(this);
31f18b77
FG
8741 int r;
8742 if (bucket.bucket_id.empty()) {
8743 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8744 } else {
8745 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8746 }
7c673cae
FG
8747 if (r < 0) {
8748 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8749 return r;
8750 }
8751
8752 info.owner = owner.get_id();
8753
8754 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8755 if (r < 0) {
8756 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8757 return r;
8758 }
8759
8760 return 0;
8761}
8762
8763
8764int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8765{
8766 int ret = 0;
8767
8768 vector<rgw_bucket>::iterator iter;
8769
8770 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8771 rgw_bucket& bucket = *iter;
8772 if (enabled)
8773 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8774 else
8775 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8776
8777 RGWBucketInfo info;
8778 map<string, bufferlist> attrs;
8779 RGWObjectCtx obj_ctx(this);
8780 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8781 if (r < 0) {
8782 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8783 ret = r;
8784 continue;
8785 }
8786 if (enabled) {
8787 info.flags &= ~BUCKET_SUSPENDED;
8788 } else {
8789 info.flags |= BUCKET_SUSPENDED;
8790 }
8791
8792 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8793 if (r < 0) {
8794 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8795 ret = r;
8796 continue;
8797 }
8798 }
8799 return ret;
8800}
8801
8802int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8803{
8804 RGWBucketInfo bucket_info;
8805 RGWObjectCtx obj_ctx(this);
8806 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8807 if (ret < 0) {
8808 return ret;
8809 }
8810
8811 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8812 return 0;
8813}
8814
8815int RGWRados::Object::complete_atomic_modification()
8816{
8817 if (!state->has_manifest || state->keep_tail)
8818 return 0;
8819
8820 cls_rgw_obj_chain chain;
8821 store->update_gc_chain(obj, state->manifest, &chain);
8822
8823 if (chain.empty()) {
8824 return 0;
8825 }
8826
181888fb 8827 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
7c673cae
FG
8828 return store->gc->send_chain(chain, tag, false); // do it async
8829}
8830
8831void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8832{
8833 RGWObjManifest::obj_iterator iter;
8834 rgw_raw_obj raw_head;
8835 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8836 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8837 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8838 if (mobj == raw_head)
8839 continue;
8840 cls_rgw_obj_key key(mobj.oid);
8841 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8842 }
8843}
8844
8845int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8846{
8847 return gc->send_chain(chain, tag, sync);
8848}
8849
1adf2230
AA
8850int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8851 librados::IoCtx& index_ctx,
8852 string& bucket_oid)
7c673cae
FG
8853{
8854 const rgw_bucket& bucket = bucket_info.bucket;
8855 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8856 if (r < 0)
8857 return r;
8858
8859 if (bucket.bucket_id.empty()) {
8860 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8861 return -EIO;
8862 }
8863
8864 bucket_oid = dir_oid_prefix;
8865 bucket_oid.append(bucket.bucket_id);
8866
8867 return 0;
8868}
8869
1adf2230
AA
8870int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
8871 librados::IoCtx& index_ctx,
8872 string& bucket_oid_base) {
7c673cae
FG
8873 const rgw_bucket& bucket = bucket_info.bucket;
8874 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8875 if (r < 0)
8876 return r;
8877
8878 if (bucket.bucket_id.empty()) {
8879 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8880 return -EIO;
8881 }
8882
8883 bucket_oid_base = dir_oid_prefix;
8884 bucket_oid_base.append(bucket.bucket_id);
8885
8886 return 0;
8887
8888}
8889
1adf2230
AA
8890int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8891 librados::IoCtx& index_ctx,
8892 map<int, string>& bucket_objs,
8893 int shard_id,
8894 map<int, string> *bucket_instance_ids) {
7c673cae
FG
8895 string bucket_oid_base;
8896 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8897 if (ret < 0) {
8898 return ret;
8899 }
8900
8901 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8902 if (bucket_instance_ids) {
8903 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8904 }
8905 return 0;
8906}
8907
8908template<typename T>
8909int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8910 map<int, string>& oids, map<int, T>& bucket_objs,
8911 int shard_id, map<int, string> *bucket_instance_ids)
8912{
8913 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8914 if (ret < 0)
8915 return ret;
8916
8917 map<int, string>::const_iterator iter = oids.begin();
8918 for (; iter != oids.end(); ++iter) {
8919 bucket_objs[iter->first] = T();
8920 }
8921 return 0;
8922}
8923
8924int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8925 const string& obj_key, string *bucket_obj, int *shard_id)
8926{
8927 string bucket_oid_base;
8928 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8929 if (ret < 0)
8930 return ret;
8931
8932 RGWObjectCtx obj_ctx(this);
8933
8934 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8935 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8936 if (ret < 0) {
8937 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8938 return ret;
8939 }
8940 return 0;
8941}
8942
8943int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8944 int shard_id, string *bucket_obj)
8945{
8946 string bucket_oid_base;
8947 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8948 if (ret < 0)
8949 return ret;
8950
8951 RGWObjectCtx obj_ctx(this);
8952
8953 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8954 shard_id, bucket_obj);
8955 return 0;
8956}
8957
8958static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8959 map<RGWObjCategory, RGWStorageStats>& stats)
8960{
8961 for (const auto& pair : header.stats) {
8962 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8963 const rgw_bucket_category_stats& header_stats = pair.second;
8964
8965 RGWStorageStats& s = stats[category];
8966
8967 s.category = category;
8968 s.size += header_stats.total_size;
8969 s.size_rounded += header_stats.total_size_rounded;
8970 s.size_utilized += header_stats.actual_size;
8971 s.num_objects += header_stats.num_entries;
8972 }
8973}
8974
8975int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8976 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8977 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8978{
8979 librados::IoCtx index_ctx;
8980 // key - bucket index object id
8981 // value - bucket index check OP returned result with the given bucket index object (shard)
8982 map<int, string> oids;
8983 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
31f18b77 8984
7c673cae 8985 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
31f18b77
FG
8986 if (ret < 0) {
8987 return ret;
8988 }
7c673cae
FG
8989
8990 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77
FG
8991 if (ret < 0) {
8992 return ret;
8993 }
7c673cae
FG
8994
8995 // Aggregate results (from different shards if there is any)
8996 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8997 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8998 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8999 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
9000 }
9001
9002 return 0;
9003}
9004
9005int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
9006{
9007 librados::IoCtx index_ctx;
9008 map<int, string> bucket_objs;
31f18b77 9009
7c673cae 9010 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
31f18b77 9011 if (r < 0) {
7c673cae 9012 return r;
31f18b77 9013 }
7c673cae
FG
9014
9015 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
9016}
9017
f64942e4 9018int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77
FG
9019{
9020 librados::IoCtx index_ctx;
9021 map<int, string> bucket_objs;
9022
9023 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
9024 if (r < 0) {
9025 return r;
9026 }
9027
9028 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
9029}
7c673cae
FG
9030
9031int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
9032{
9033 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9034 std::string oid, key;
9035 get_obj_bucket_and_oid_loc(obj, oid, key);
9036 if (!rctx)
9037 return 0;
9038
9039 RGWObjState *state = NULL;
9040
9041 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
9042 if (r < 0)
9043 return r;
9044
9045 if (!state->is_atomic) {
9046 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
9047 return -EINVAL;
9048 }
9049
181888fb
FG
9050 string tag;
9051
9052 if (state->tail_tag.length() > 0) {
9053 tag = state->tail_tag.c_str();
9054 } else if (state->obj_tag.length() > 0) {
9055 tag = state->obj_tag.c_str();
9056 } else {
7c673cae
FG
9057 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
9058 return -EINVAL;
9059 }
9060
7c673cae
FG
9061 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
9062
9063 return gc->defer_chain(tag, false);
9064}
9065
9066void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
9067{
9068 list<string> prefixes;
9069 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
9070 cls_rgw_remove_obj(op, prefixes);
9071}
9072
9073void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
9074{
9075 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
9076}
9077
9078void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
9079{
9080 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
9081}
9082
9083
9084/**
9085 * Delete an object.
9086 * bucket: name of the bucket storing the object
9087 * obj: name of the object to delete
9088 * Returns: 0 on success, -ERR# otherwise.
9089 */
9090int RGWRados::Object::Delete::delete_obj()
9091{
9092 RGWRados *store = target->get_store();
9093 rgw_obj& src_obj = target->get_obj();
9094 const string& instance = src_obj.key.instance;
9095 rgw_obj obj = src_obj;
9096
9097 if (instance == "null") {
9098 obj.key.instance.clear();
9099 }
9100
9101 bool explicit_marker_version = (!params.marker_version_id.empty());
9102
9103 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
9104 if (instance.empty() || explicit_marker_version) {
9105 rgw_obj marker = obj;
9106
9107 if (!params.marker_version_id.empty()) {
9108 if (params.marker_version_id != "null") {
9109 marker.key.set_instance(params.marker_version_id);
9110 }
9111 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
9112 store->gen_rand_obj_instance_name(&marker);
9113 }
9114
9115 result.version_id = marker.key.instance;
91327a77
AA
9116 if (result.version_id.empty())
9117 result.version_id = "null";
7c673cae
FG
9118 result.delete_marker = true;
9119
9120 struct rgw_bucket_dir_entry_meta meta;
9121
9122 meta.owner = params.obj_owner.get_id().to_str();
9123 meta.owner_display_name = params.obj_owner.get_display_name();
9124
9125 if (real_clock::is_zero(params.mtime)) {
9126 meta.mtime = real_clock::now();
9127 } else {
9128 meta.mtime = params.mtime;
9129 }
9130
31f18b77 9131 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
7c673cae
FG
9132 if (r < 0) {
9133 return r;
9134 }
9135 } else {
9136 rgw_bucket_dir_entry dirent;
9137
9138 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
9139 if (r < 0) {
9140 return r;
9141 }
9142 result.delete_marker = dirent.is_delete_marker();
31f18b77 9143 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
7c673cae
FG
9144 if (r < 0) {
9145 return r;
9146 }
9147 result.version_id = instance;
9148 }
9149
9150 BucketShard *bs;
9151 int r = target->get_bucket_shard(&bs);
9152 if (r < 0) {
9153 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
9154 return r;
9155 }
9156
c07f9fc5
FG
9157 if (target->bucket_info.datasync_flag_enabled()) {
9158 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9159 if (r < 0) {
9160 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9161 return r;
9162 }
7c673cae
FG
9163 }
9164
9165 return 0;
9166 }
9167
9168 rgw_rados_ref ref;
9169 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
9170 if (r < 0) {
9171 return r;
9172 }
9173
9174 RGWObjState *state;
9175 r = target->get_state(&state, false);
9176 if (r < 0)
9177 return r;
9178
9179 ObjectWriteOperation op;
9180
9181 if (!real_clock::is_zero(params.unmod_since)) {
9182 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
9183 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
9184 if (!params.high_precision_time) {
9185 ctime.tv_nsec = 0;
9186 unmod.tv_nsec = 0;
9187 }
9188
9189 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
9190 if (ctime > unmod) {
9191 return -ERR_PRECONDITION_FAILED;
9192 }
9193
9194 /* only delete object if mtime is less than or equal to params.unmod_since */
9195 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
9196 }
9197 uint64_t obj_size = state->size;
9198
9199 if (!real_clock::is_zero(params.expiration_time)) {
9200 bufferlist bl;
9201 real_time delete_at;
9202
9203 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
9204 try {
9205 bufferlist::iterator iter = bl.begin();
9206 ::decode(delete_at, iter);
9207 } catch (buffer::error& err) {
9208 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
9209 return -EIO;
9210 }
9211
9212 if (params.expiration_time != delete_at) {
9213 return -ERR_PRECONDITION_FAILED;
9214 }
9215 } else {
9216 return -ERR_PRECONDITION_FAILED;
9217 }
9218 }
9219
9220 if (!state->exists) {
9221 target->invalidate_state();
9222 return -ENOENT;
9223 }
9224
181888fb 9225 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
7c673cae
FG
9226 if (r < 0)
9227 return r;
9228
9229 RGWBucketInfo& bucket_info = target->get_bucket_info();
9230
9231 RGWRados::Bucket bop(store, bucket_info);
9232 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
9233
9234 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
9235 index_op.set_bilog_flags(params.bilog_flags);
9236
7c673cae
FG
9237 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
9238 if (r < 0)
9239 return r;
9240
9241 store->remove_rgw_head_obj(op);
9242 r = ref.ioctx.operate(ref.oid, &op);
94b18763
FG
9243
9244 /* raced with another operation, object state is indeterminate */
9245 const bool need_invalidate = (r == -ECANCELED);
7c673cae
FG
9246
9247 int64_t poolid = ref.ioctx.get_id();
9248 if (r >= 0) {
9249 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
9250 if (obj_tombstone_cache) {
9251 tombstone_entry entry{*state};
9252 obj_tombstone_cache->add(obj, entry);
9253 }
9254 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 9255
7c673cae
FG
9256 int ret = target->complete_atomic_modification();
9257 if (ret < 0) {
9258 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
9259 }
9260 /* other than that, no need to propagate error */
224ce89b
WB
9261 } else {
9262 int ret = index_op.cancel();
9263 if (ret < 0) {
9264 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
9265 }
7c673cae
FG
9266 }
9267
9268 if (need_invalidate) {
9269 target->invalidate_state();
9270 }
9271
9272 if (r < 0)
9273 return r;
9274
9275 /* update quota cache */
9276 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
9277
9278 return 0;
9279}
9280
9281int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
9282 const RGWBucketInfo& bucket_info,
9283 const rgw_obj& obj,
9284 int versioning_status,
9285 uint16_t bilog_flags,
31f18b77
FG
9286 const real_time& expiration_time,
9287 rgw_zone_set *zones_trace)
7c673cae
FG
9288{
9289 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
9290 RGWRados::Object::Delete del_op(&del_target);
9291
9292 del_op.params.bucket_owner = bucket_info.owner;
9293 del_op.params.versioning_status = versioning_status;
9294 del_op.params.bilog_flags = bilog_flags;
9295 del_op.params.expiration_time = expiration_time;
31f18b77 9296 del_op.params.zones_trace = zones_trace;
7c673cae
FG
9297
9298 return del_op.delete_obj();
9299}
9300
9301int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
9302{
9303 rgw_rados_ref ref;
224ce89b 9304 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9305 if (r < 0) {
9306 return r;
9307 }
9308
9309 ObjectWriteOperation op;
9310
9311 op.remove();
9312 r = ref.ioctx.operate(ref.oid, &op);
9313 if (r < 0)
9314 return r;
9315
9316 return 0;
9317}
9318
9319int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
9320{
9321 if (obj.empty()) {
9322 ldout(cct, 1) << "delete_system_obj got empty object name "
9323 << obj << ", returning EINVAL" << dendl;
9324 return -EINVAL;
9325 }
9326 rgw_rados_ref ref;
224ce89b 9327 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9328 if (r < 0) {
9329 return r;
9330 }
9331
9332 ObjectWriteOperation op;
9333
9334 if (objv_tracker) {
9335 objv_tracker->prepare_op_for_write(&op);
9336 }
9337
9338 op.remove();
9339 r = ref.ioctx.operate(ref.oid, &op);
9340 if (r < 0)
9341 return r;
9342
9343 return 0;
9344}
9345
9346int RGWRados::delete_obj_index(const rgw_obj& obj)
9347{
9348 std::string oid, key;
9349 get_obj_bucket_and_oid_loc(obj, oid, key);
9350
9351 RGWObjectCtx obj_ctx(this);
9352
9353 RGWBucketInfo bucket_info;
9354 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9355 if (ret < 0) {
9356 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9357 return ret;
9358 }
9359
9360 RGWRados::Bucket bop(this, bucket_info);
9361 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9362
9363 real_time removed_mtime;
9364 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9365
9366 return r;
9367}
9368
9369static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9370{
9371 string tag;
9372
9373 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9374 if (mi != manifest.obj_end()) {
9375 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9376 ++mi;
9377 tag = mi.get_location().get_raw_obj(store).oid;
9378 tag.append("_");
9379 }
9380
9381 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9382 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9383 MD5 hash;
9384 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9385
9386 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9387 if (iter != attrset.end()) {
9388 bufferlist& bl = iter->second;
9389 hash.Update((const byte *)bl.c_str(), bl.length());
9390 }
9391
9392 hash.Final(md5);
9393 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9394 tag.append(md5_str);
9395
9396 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9397
9398 tag_bl.append(tag.c_str(), tag.size() + 1);
9399}
9400
9401static bool is_olh(map<string, bufferlist>& attrs)
9402{
9403 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9404 return (iter != attrs.end());
9405}
9406
9407static bool has_olh_tag(map<string, bufferlist>& attrs)
9408{
9409 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9410 return (iter != attrs.end());
9411}
9412
9413int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9414 RGWObjState *olh_state, RGWObjState **target_state)
9415{
9416 assert(olh_state->is_olh);
9417
9418 rgw_obj target;
9419 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9420 if (r < 0) {
9421 return r;
9422 }
9423 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9424 if (r < 0) {
9425 return r;
9426 }
9427
9428 return 0;
9429}
9430
9431int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9432{
9433 if (obj.empty()) {
9434 return -EINVAL;
9435 }
9436
9437 RGWRawObjState *s = rctx->raw.get_state(obj);
9438 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9439 *state = s;
9440 if (s->has_attrs) {
9441 return 0;
9442 }
9443
9444 s->obj = obj;
9445
9446 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9447 if (r == -ENOENT) {
9448 s->exists = false;
9449 s->has_attrs = true;
9450 s->mtime = real_time();
9451 return 0;
9452 }
9453 if (r < 0)
9454 return r;
9455
9456 s->exists = true;
9457 s->has_attrs = true;
9458 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9459
9460 if (s->obj_tag.length())
31f18b77
FG
9461 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9462 << s->obj_tag.c_str() << dendl;
7c673cae
FG
9463 else
9464 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9465
9466 return 0;
9467}
9468
9469int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9470{
9471 int ret;
9472
9473 do {
9474 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9475 } while (ret == -EAGAIN);
9476
9477 return ret;
9478}
9479
9480int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9481 RGWObjState **state, bool follow_olh, bool assume_noent)
9482{
9483 if (obj.empty()) {
9484 return -EINVAL;
9485 }
9486
9487 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9488
9489 RGWObjState *s = rctx->obj.get_state(obj);
9490 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9491 *state = s;
9492 if (s->has_attrs) {
9493 if (s->is_olh && need_follow_olh) {
9494 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9495 }
9496 return 0;
9497 }
9498
9499 s->obj = obj;
9500
9501 rgw_raw_obj raw_obj;
9502 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9503
9504 int r = -ENOENT;
9505
9506 if (!assume_noent) {
9507 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9508 }
9509
9510 if (r == -ENOENT) {
9511 s->exists = false;
9512 s->has_attrs = true;
9513 tombstone_entry entry;
9514 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9515 s->mtime = entry.mtime;
9516 s->zone_short_id = entry.zone_short_id;
9517 s->pg_ver = entry.pg_ver;
9518 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9519 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9520 } else {
9521 s->mtime = real_time();
9522 }
9523 return 0;
9524 }
9525 if (r < 0)
9526 return r;
9527
9528 s->exists = true;
9529 s->has_attrs = true;
9530 s->accounted_size = s->size;
9531
9532 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
9533 const bool compressed = (iter != s->attrset.end());
9534 if (compressed) {
7c673cae
FG
9535 // use uncompressed size for accounted_size
9536 try {
9537 RGWCompressionInfo info;
9538 auto p = iter->second.begin();
9539 ::decode(info, p);
31f18b77 9540 s->accounted_size = info.orig_size;
7c673cae
FG
9541 } catch (buffer::error&) {
9542 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9543 return -EIO;
9544 }
9545 }
9546
9547 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9548 if (iter != s->attrset.end()) {
9549 bufferlist bl = iter->second;
9550 bufferlist::iterator it = bl.begin();
9551 it.copy(bl.length(), s->shadow_obj);
9552 s->shadow_obj[bl.length()] = '\0';
9553 }
9554 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
9555 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9556 if (ttiter != s->attrset.end()) {
9557 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9558 }
7c673cae
FG
9559
9560 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9561 if (manifest_bl.length()) {
9562 bufferlist::iterator miter = manifest_bl.begin();
9563 try {
9564 ::decode(s->manifest, miter);
9565 s->has_manifest = true;
9566 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9567 broken due to old bugs */
9568 s->size = s->manifest.get_obj_size();
31f18b77
FG
9569 if (!compressed)
9570 s->accounted_size = s->size;
7c673cae
FG
9571 } catch (buffer::error& err) {
9572 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9573 return -EIO;
9574 }
9575 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9576 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9577 RGWObjManifest::obj_iterator mi;
9578 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9579 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9580 }
9581 }
9582
9583 if (!s->obj_tag.length()) {
9584 /*
9585 * Uh oh, something's wrong, object with manifest should have tag. Let's
9586 * create one out of the manifest, would be unique
9587 */
9588 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9589 s->fake_tag = true;
9590 }
9591 }
9592 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9593 if (aiter != s->attrset.end()) {
9594 bufferlist& pg_ver_bl = aiter->second;
9595 if (pg_ver_bl.length()) {
9596 bufferlist::iterator pgbl = pg_ver_bl.begin();
9597 try {
9598 ::decode(s->pg_ver, pgbl);
9599 } catch (buffer::error& err) {
9600 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9601 }
9602 }
9603 }
9604 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9605 if (aiter != s->attrset.end()) {
9606 bufferlist& zone_short_id_bl = aiter->second;
9607 if (zone_short_id_bl.length()) {
9608 bufferlist::iterator zbl = zone_short_id_bl.begin();
9609 try {
9610 ::decode(s->zone_short_id, zbl);
9611 } catch (buffer::error& err) {
9612 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9613 }
9614 }
9615 }
9616 if (s->obj_tag.length())
31f18b77 9617 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
7c673cae
FG
9618 else
9619 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9620
9621 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9622 * it exist, and not only if is_olh() returns true
9623 */
9624 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9625 if (iter != s->attrset.end()) {
9626 s->olh_tag = iter->second;
9627 }
9628
9629 if (is_olh(s->attrset)) {
9630 s->is_olh = true;
9631
9632 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9633
9634 if (need_follow_olh) {
9635 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9636 }
9637 }
9638
9639 return 0;
9640}
9641
9642int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9643 bool follow_olh, bool assume_noent)
9644{
9645 int ret;
9646
9647 do {
9648 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9649 } while (ret == -EAGAIN);
9650
9651 return ret;
9652}
9653
9654int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9655{
9656 RGWObjState *astate;
9657 int r = get_state(&astate, true);
9658 if (r < 0) {
9659 return r;
9660 }
9661
9662 *pmanifest = &astate->manifest;
9663
9664 return 0;
9665}
9666
9667int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9668{
9669 RGWObjState *state;
9670 int r = source->get_state(&state, true);
9671 if (r < 0)
9672 return r;
9673 if (!state->exists)
9674 return -ENOENT;
9675 if (!state->get_attr(name, dest))
9676 return -ENODATA;
9677
9678 return 0;
9679}
9680
9681
9682int RGWRados::Object::Stat::stat_async()
9683{
9684 RGWObjectCtx& ctx = source->get_ctx();
9685 rgw_obj& obj = source->get_obj();
9686 RGWRados *store = source->get_store();
9687
9688 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9689 result.obj = obj;
9690 if (s->has_attrs) {
9691 state.ret = 0;
9692 result.size = s->size;
9693 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9694 result.attrs = s->attrset;
9695 result.has_manifest = s->has_manifest;
9696 result.manifest = s->manifest;
9697 return 0;
9698 }
9699
9700 string oid;
9701 string loc;
9702 get_obj_bucket_and_oid_loc(obj, oid, loc);
9703
9704 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9705 if (r < 0) {
9706 return r;
9707 }
9708
9709 librados::ObjectReadOperation op;
9710 op.stat2(&result.size, &result.mtime, NULL);
9711 op.getxattrs(&result.attrs, NULL);
9712 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9713 state.io_ctx.locator_set_key(loc);
9714 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9715 if (r < 0) {
9716 ldout(store->ctx(), 5) << __func__
9717 << ": ERROR: aio_operate() returned ret=" << r
9718 << dendl;
9719 return r;
9720 }
9721
9722 return 0;
9723}
9724
9725
9726int RGWRados::Object::Stat::wait()
9727{
9728 if (!state.completion) {
9729 return state.ret;
9730 }
9731
9732 state.completion->wait_for_safe();
9733 state.ret = state.completion->get_return_value();
9734 state.completion->release();
9735
9736 if (state.ret != 0) {
9737 return state.ret;
9738 }
9739
9740 return finish();
9741}
9742
9743int RGWRados::Object::Stat::finish()
9744{
9745 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9746 if (iter != result.attrs.end()) {
9747 bufferlist& bl = iter->second;
9748 bufferlist::iterator biter = bl.begin();
9749 try {
9750 ::decode(result.manifest, biter);
9751 } catch (buffer::error& err) {
9752 RGWRados *store = source->get_store();
9753 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9754 return -EIO;
9755 }
9756 result.has_manifest = true;
9757 }
9758
9759 return 0;
9760}
9761
9762/**
31f18b77
FG
9763 * Get an attribute for a system object.
9764 * obj: the object to get attr
7c673cae
FG
9765 * name: name of the attr to retrieve
9766 * dest: bufferlist to store the result in
9767 * Returns: 0 on success, -ERR# otherwise.
9768 */
9769int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9770{
9771 rgw_rados_ref ref;
224ce89b 9772 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
9773 if (r < 0) {
9774 return r;
9775 }
9776
9777 ObjectReadOperation op;
9778
9779 int rval;
9780 op.getxattr(name, &dest, &rval);
9781
9782 r = ref.ioctx.operate(ref.oid, &op, NULL);
9783 if (r < 0)
9784 return r;
9785
9786 return 0;
9787}
9788
9789int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9790 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9791 ObjectOperation& op, RGWObjState **pstate)
9792{
9793 if (!rctx)
9794 return 0;
9795
9796 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9797 if (r < 0)
9798 return r;
9799
9800 RGWObjState *state = *pstate;
9801
9802 if (!state->is_atomic) {
9803 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9804 return 0;
9805 }
9806
9807 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9808 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9809 } else {
9810 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9811 }
9812 return 0;
9813}
9814
9815int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9816{
9817 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9818}
9819
9820void RGWRados::Object::invalidate_state()
9821{
9822 ctx.obj.invalidate(obj);
9823}
9824
9825void RGWRados::SystemObject::invalidate_state()
9826{
9827 ctx.raw.invalidate(obj);
9828}
9829
9830int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb
FG
9831 const char *if_match, const char *if_nomatch, bool removal_op,
9832 bool modify_tail)
7c673cae
FG
9833{
9834 int r = get_state(&state, false);
9835 if (r < 0)
9836 return r;
9837
9838 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9839 if_match != NULL || if_nomatch != NULL) &&
9840 (!state->fake_tag);
9841
9842 if (!state->is_atomic) {
9843 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9844
9845 if (reset_obj) {
9846 op.create(false);
9847 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9848 }
9849
9850 return 0;
9851 }
9852
9853 if (need_guard) {
9854 /* first verify that the object wasn't replaced under */
9855 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9856 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9857 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9858 }
9859
9860 if (if_match) {
9861 if (strcmp(if_match, "*") == 0) {
9862 // test the object is existing
9863 if (!state->exists) {
9864 return -ERR_PRECONDITION_FAILED;
9865 }
9866 } else {
9867 bufferlist bl;
9868 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9869 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9870 return -ERR_PRECONDITION_FAILED;
9871 }
9872 }
9873 }
9874
9875 if (if_nomatch) {
9876 if (strcmp(if_nomatch, "*") == 0) {
9877 // test the object is NOT existing
9878 if (state->exists) {
9879 return -ERR_PRECONDITION_FAILED;
9880 }
9881 } else {
9882 bufferlist bl;
9883 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9884 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9885 return -ERR_PRECONDITION_FAILED;
9886 }
9887 }
9888 }
9889 }
9890
9891 if (reset_obj) {
9892 if (state->exists) {
9893 op.create(false);
9894 store->remove_rgw_head_obj(op);
9895 } else {
9896 op.create(true);
9897 }
9898 }
9899
9900 if (removal_op) {
9901 /* the object is being removed, no need to update its tag */
9902 return 0;
9903 }
9904
9905 if (ptag) {
9906 state->write_tag = *ptag;
9907 } else {
9908 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9909 }
9910 bufferlist bl;
9911 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9912
9913 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9914
9915 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
9916 if (modify_tail) {
9917 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9918 }
7c673cae
FG
9919
9920 return 0;
9921}
9922
9923int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9924 RGWObjVersionTracker *objv_tracker)
9925{
9926 map<string, bufferlist> attrs;
9927 attrs[name] = bl;
9928 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9929}
9930
9931int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9932 map<string, bufferlist>& attrs,
9933 map<string, bufferlist>* rmattrs,
9934 RGWObjVersionTracker *objv_tracker)
9935{
9936 rgw_rados_ref ref;
224ce89b 9937 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
9938 if (r < 0) {
9939 return r;
9940 }
9941 ObjectWriteOperation op;
9942
9943 if (objv_tracker) {
9944 objv_tracker->prepare_op_for_write(&op);
9945 }
9946
9947 map<string, bufferlist>::iterator iter;
9948 if (rmattrs) {
9949 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9950 const string& name = iter->first;
9951 op.rmxattr(name.c_str());
9952 }
9953 }
9954
9955 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9956 const string& name = iter->first;
9957 bufferlist& bl = iter->second;
9958
9959 if (!bl.length())
9960 continue;
9961
9962 op.setxattr(name.c_str(), bl);
9963 }
9964
9965 if (!op.size())
9966 return 0;
9967
9968 bufferlist bl;
9969
9970 r = ref.ioctx.operate(ref.oid, &op);
9971 if (r < 0)
9972 return r;
9973
9974 return 0;
9975}
9976
9977/**
9978 * Set an attr on an object.
9979 * bucket: name of the bucket holding the object
9980 * obj: name of the object to set the attr on
9981 * name: the attr to set
9982 * bl: the contents of the attr
9983 * Returns: 0 on success, -ERR# otherwise.
9984 */
9985int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9986{
9987 map<string, bufferlist> attrs;
9988 attrs[name] = bl;
9989 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9990}
9991
9992int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9993 map<string, bufferlist>& attrs,
9994 map<string, bufferlist>* rmattrs)
9995{
9996 rgw_rados_ref ref;
9997 int r = get_obj_head_ref(bucket_info, obj, &ref);
9998 if (r < 0) {
9999 return r;
10000 }
10001 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10002
10003 ObjectWriteOperation op;
10004 RGWObjState *state = NULL;
10005
10006 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
10007 if (r < 0)
10008 return r;
10009
10010 map<string, bufferlist>::iterator iter;
10011 if (rmattrs) {
10012 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
10013 const string& name = iter->first;
10014 op.rmxattr(name.c_str());
10015 }
10016 }
10017
10018 const rgw_bucket& bucket = obj.bucket;
10019
10020 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
10021 const string& name = iter->first;
10022 bufferlist& bl = iter->second;
10023
10024 if (!bl.length())
10025 continue;
10026
10027 op.setxattr(name.c_str(), bl);
10028
10029 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
10030 real_time ts;
10031 try {
10032 ::decode(ts, bl);
10033
10034 rgw_obj_index_key obj_key;
10035 obj.key.get_index_key(&obj_key);
10036
10037 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
10038 } catch (buffer::error& err) {
10039 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
10040 }
10041 }
10042 }
10043
10044 if (!op.size())
10045 return 0;
10046
10047 RGWObjectCtx obj_ctx(this);
10048
10049 bufferlist bl;
10050 RGWRados::Bucket bop(this, bucket_info);
10051 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
10052
10053 if (state) {
10054 string tag;
10055 append_rand_alpha(cct, tag, tag, 32);
10056 state->write_tag = tag;
10057 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
10058
10059 if (r < 0)
10060 return r;
10061
10062 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
10063 op.setxattr(RGW_ATTR_ID_TAG, bl);
10064 }
10065
3efd9988
FG
10066
10067 real_time mtime = real_clock::now();
10068 struct timespec mtime_ts = real_clock::to_timespec(mtime);
10069 op.mtime2(&mtime_ts);
7c673cae
FG
10070 r = ref.ioctx.operate(ref.oid, &op);
10071 if (state) {
10072 if (r >= 0) {
10073 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
10074 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
10075 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
10076 string etag(etag_bl.c_str(), etag_bl.length());
10077 string content_type(content_type_bl.c_str(), content_type_bl.length());
10078 uint64_t epoch = ref.ioctx.get_last_version();
10079 int64_t poolid = ref.ioctx.get_id();
7c673cae
FG
10080 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
10081 mtime, etag, content_type, &acl_bl,
10082 RGW_OBJ_CATEGORY_MAIN, NULL);
10083 } else {
10084 int ret = index_op.cancel();
10085 if (ret < 0) {
10086 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
10087 }
10088 }
10089 }
10090 if (r < 0)
10091 return r;
10092
10093 if (state) {
10094 state->obj_tag.swap(bl);
10095 if (rmattrs) {
10096 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
10097 state->attrset.erase(iter->first);
10098 }
10099 }
10100 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
10101 state->attrset[iter->first] = iter->second;
10102 }
10103 }
10104
10105 return 0;
10106}
10107
7c673cae
FG
10108int RGWRados::Object::Read::prepare()
10109{
10110 RGWRados *store = source->get_store();
10111 CephContext *cct = store->ctx();
10112
10113 bufferlist etag;
10114
10115 map<string, bufferlist>::iterator iter;
10116
10117 RGWObjState *astate;
10118 int r = source->get_state(&astate, true);
10119 if (r < 0)
10120 return r;
10121
10122 if (!astate->exists) {
10123 return -ENOENT;
10124 }
10125
10126 const RGWBucketInfo& bucket_info = source->get_bucket_info();
10127
10128 state.obj = astate->obj;
10129 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
10130
10131 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
10132 if (r < 0) {
10133 return r;
10134 }
10135 if (params.attrs) {
10136 *params.attrs = astate->attrset;
10137 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10138 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
10139 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10140 }
10141 }
10142 }
10143
10144 /* Convert all times go GMT to make them compatible */
10145 if (conds.mod_ptr || conds.unmod_ptr) {
10146 obj_time_weight src_weight;
10147 src_weight.init(astate);
10148 src_weight.high_precision = conds.high_precision_time;
10149
10150 obj_time_weight dest_weight;
10151 dest_weight.high_precision = conds.high_precision_time;
10152
10153 if (conds.mod_ptr) {
10154 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10155 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10156 if (!(dest_weight < src_weight)) {
10157 return -ERR_NOT_MODIFIED;
10158 }
10159 }
10160
10161 if (conds.unmod_ptr) {
10162 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10163 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10164 if (dest_weight < src_weight) {
10165 return -ERR_PRECONDITION_FAILED;
10166 }
10167 }
10168 }
10169 if (conds.if_match || conds.if_nomatch) {
10170 r = get_attr(RGW_ATTR_ETAG, etag);
10171 if (r < 0)
10172 return r;
10173
10174 if (conds.if_match) {
10175 string if_match_str = rgw_string_unquote(conds.if_match);
10176 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
10177 if (if_match_str.compare(etag.c_str()) != 0) {
10178 return -ERR_PRECONDITION_FAILED;
10179 }
10180 }
10181
10182 if (conds.if_nomatch) {
10183 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
10184 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
10185 if (if_nomatch_str.compare(etag.c_str()) == 0) {
10186 return -ERR_NOT_MODIFIED;
10187 }
10188 }
10189 }
10190
10191 if (params.obj_size)
10192 *params.obj_size = astate->size;
10193 if (params.lastmod)
10194 *params.lastmod = astate->mtime;
10195
10196 return 0;
10197}
10198
10199int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
10200{
10201 if (ofs < 0) {
10202 ofs += obj_size;
10203 if (ofs < 0)
10204 ofs = 0;
10205 end = obj_size - 1;
10206 } else if (end < 0) {
10207 end = obj_size - 1;
10208 }
10209
10210 if (obj_size > 0) {
10211 if (ofs >= (off_t)obj_size) {
10212 return -ERANGE;
10213 }
10214 if (end >= (off_t)obj_size) {
10215 end = obj_size - 1;
10216 }
10217 }
10218 return 0;
10219}
10220
10221int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
10222{
10223 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
10224}
10225
10226int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
10227 RGWRados::SystemObject::Read::GetObjState& state,
10228 rgw_raw_obj& obj,
10229 map<string, bufferlist> *attrs,
10230 real_time *lastmod,
10231 uint64_t *obj_size,
10232 RGWObjVersionTracker *objv_tracker)
10233{
10234 RGWRawObjState *astate = NULL;
10235
10236 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
10237 if (r < 0)
10238 return r;
10239
10240 if (!astate->exists) {
10241 return -ENOENT;
10242 }
10243
10244 if (attrs) {
10245 *attrs = astate->attrset;
10246 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10247 map<string, bufferlist>::iterator iter;
10248 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
10249 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10250 }
10251 }
10252 }
10253
10254 if (obj_size)
10255 *obj_size = astate->size;
10256 if (lastmod)
10257 *lastmod = astate->mtime;
10258
10259 return 0;
10260}
10261
31f18b77
FG
10262
10263int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
10264{
10265 RGWRados *store = target->get_store();
10266 BucketShard *bs;
10267 int r;
10268
10269#define NUM_RESHARD_RETRIES 10
10270 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10271 int ret = get_bucket_shard(&bs);
10272 if (ret < 0) {
10273 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10274 return ret;
10275 }
10276 r = call(bs);
10277 if (r != -ERR_BUSY_RESHARDING) {
10278 break;
10279 }
10280 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10281 string new_bucket_id;
f64942e4 10282 r = store->block_while_resharding(bs, &new_bucket_id, target->bucket_info);
31f18b77
FG
10283 if (r == -ERR_BUSY_RESHARDING) {
10284 continue;
10285 }
10286 if (r < 0) {
10287 return r;
10288 }
10289 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10290 i = 0; /* resharding is finished, make sure we can retry */
10291 r = target->update_bucket_id(new_bucket_id);
10292 if (r < 0) {
10293 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
10294 return r;
10295 }
10296 invalidate_bs();
10297 }
10298
10299 if (r < 0) {
10300 return r;
10301 }
10302
10303 if (pbs) {
10304 *pbs = bs;
10305 }
10306
10307 return 0;
10308}
10309
7c673cae
FG
10310int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
10311{
10312 RGWRados *store = source->get_store();
10313 rgw_raw_obj& obj = source->get_obj();
10314
10315 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
10316 stat_params.lastmod, stat_params.obj_size, objv_tracker);
10317}
10318
10319int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
10320{
10321 if (blind) {
10322 return 0;
10323 }
10324 RGWRados *store = target->get_store();
7c673cae
FG
10325
10326 if (write_tag && write_tag->length()) {
10327 optag = string(write_tag->c_str(), write_tag->length());
10328 } else {
10329 if (optag.empty()) {
10330 append_rand_alpha(store->ctx(), optag, optag, 32);
10331 }
10332 }
10333
f64942e4
AA
10334 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10335 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10336 });
31f18b77 10337
7c673cae
FG
10338 if (r < 0) {
10339 return r;
10340 }
10341 prepared = true;
31f18b77 10342
7c673cae
FG
10343 return 0;
10344}
10345
10346int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10347 uint64_t size, uint64_t accounted_size,
10348 ceph::real_time& ut, const string& etag,
10349 const string& content_type,
10350 bufferlist *acl_bl,
10351 RGWObjCategory category,
10352 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10353{
10354 if (blind) {
10355 return 0;
10356 }
10357 RGWRados *store = target->get_store();
10358 BucketShard *bs;
31f18b77 10359
7c673cae
FG
10360 int ret = get_bucket_shard(&bs);
10361 if (ret < 0) {
10362 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10363 return ret;
10364 }
10365
10366 rgw_bucket_dir_entry ent;
10367 obj.key.get_index_key(&ent.key);
10368 ent.meta.size = size;
10369 ent.meta.accounted_size = accounted_size;
10370 ent.meta.mtime = ut;
10371 ent.meta.etag = etag;
10372 if (user_data)
10373 ent.meta.user_data = *user_data;
10374
10375 ACLOwner owner;
10376 if (acl_bl && acl_bl->length()) {
10377 int ret = store->decode_policy(*acl_bl, &owner);
10378 if (ret < 0) {
10379 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10380 }
10381 }
10382 ent.meta.owner = owner.get_id().to_str();
10383 ent.meta.owner_display_name = owner.get_display_name();
10384 ent.meta.content_type = content_type;
10385
31f18b77 10386 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 10387
c07f9fc5
FG
10388 if (target->bucket_info.datasync_flag_enabled()) {
10389 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10390 if (r < 0) {
10391 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10392 }
7c673cae
FG
10393 }
10394
10395 return ret;
10396}
10397
10398int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10399 real_time& removed_mtime,
10400 list<rgw_obj_index_key> *remove_objs)
10401{
10402 if (blind) {
10403 return 0;
10404 }
10405 RGWRados *store = target->get_store();
10406 BucketShard *bs;
31f18b77 10407
7c673cae
FG
10408 int ret = get_bucket_shard(&bs);
10409 if (ret < 0) {
10410 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10411 return ret;
10412 }
10413
31f18b77 10414 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 10415
c07f9fc5
FG
10416 if (target->bucket_info.datasync_flag_enabled()) {
10417 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10418 if (r < 0) {
10419 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10420 }
7c673cae
FG
10421 }
10422
10423 return ret;
10424}
10425
10426
10427int RGWRados::Bucket::UpdateIndex::cancel()
10428{
10429 if (blind) {
10430 return 0;
10431 }
10432 RGWRados *store = target->get_store();
10433 BucketShard *bs;
7c673cae 10434
f64942e4
AA
10435 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10436 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10437 });
7c673cae
FG
10438
10439 /*
10440 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10441 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10442 * have no way to tell that they're all caught up
10443 */
c07f9fc5
FG
10444 if (target->bucket_info.datasync_flag_enabled()) {
10445 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10446 if (r < 0) {
10447 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10448 }
7c673cae
FG
10449 }
10450
10451 return ret;
10452}
10453
10454int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10455{
10456 RGWRados *store = source->get_store();
10457 CephContext *cct = store->ctx();
10458
7c673cae
FG
10459 rgw_raw_obj read_obj;
10460 uint64_t read_ofs = ofs;
10461 uint64_t len, read_len;
10462 bool reading_from_head = true;
10463 ObjectReadOperation op;
10464
10465 bool merge_bl = false;
10466 bufferlist *pbl = &bl;
10467 bufferlist read_bl;
10468 uint64_t max_chunk_size;
10469
10470 RGWObjState *astate;
10471 int r = source->get_state(&astate, true);
10472 if (r < 0)
10473 return r;
10474
10475 if (end < 0)
10476 len = 0;
10477 else
10478 len = end - ofs + 1;
10479
10480 if (astate->has_manifest && astate->manifest.has_tail()) {
10481 /* now get the relevant object part */
10482 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10483
10484 uint64_t stripe_ofs = iter.get_stripe_ofs();
10485 read_obj = iter.get_location().get_raw_obj(store);
10486 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10487 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10488 reading_from_head = (read_obj == state.head_obj);
10489 } else {
10490 read_obj = state.head_obj;
10491 }
10492
10493 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10494 if (r < 0) {
10495 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10496 return r;
10497 }
10498
10499 if (len > max_chunk_size)
10500 len = max_chunk_size;
10501
10502
10503 state.io_ctx.locator_set_key(read_obj.loc);
10504
10505 read_len = len;
10506
10507 if (reading_from_head) {
10508 /* only when reading from the head object do we need to do the atomic test */
10509 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10510 if (r < 0)
10511 return r;
10512
10513 if (astate && astate->prefetch_data) {
10514 if (!ofs && astate->data.length() >= len) {
10515 bl = astate->data;
10516 return bl.length();
10517 }
10518
10519 if (ofs < astate->data.length()) {
10520 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10521 astate->data.copy(ofs, copy_len, bl);
10522 read_len -= copy_len;
10523 read_ofs += copy_len;
10524 if (!read_len)
10525 return bl.length();
10526
10527 merge_bl = true;
10528 pbl = &read_bl;
10529 }
10530 }
10531 }
10532
10533 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10534 op.read(read_ofs, read_len, pbl, NULL);
10535
10536 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10537 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10538
10539 if (r < 0) {
10540 return r;
10541 }
10542
10543 if (merge_bl) {
10544 bl.append(read_bl);
10545 }
10546
10547 return bl.length();
10548}
10549
10550int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10551{
10552 if (!has_ref) {
224ce89b 10553 int r = store->get_raw_obj_ref(obj, &ref);
7c673cae
FG
10554 if (r < 0) {
10555 return r;
10556 }
10557 has_ref = true;
10558 }
10559 *pref = &ref;
10560 return 0;
10561
10562}
10563
10564int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10565 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10566 bufferlist& bl, off_t ofs, off_t end,
10567 map<string, bufferlist> *attrs,
b32b8144
FG
10568 rgw_cache_entry_info *cache_info,
10569 boost::optional<obj_version>)
7c673cae
FG
10570{
10571 uint64_t len;
10572 ObjectReadOperation op;
10573
10574 if (end < 0)
10575 len = 0;
10576 else
10577 len = end - ofs + 1;
10578
10579 if (objv_tracker) {
10580 objv_tracker->prepare_op_for_read(&op);
10581 }
10582
10583 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10584 op.read(ofs, len, &bl, NULL);
10585
10586 if (attrs) {
10587 op.getxattrs(attrs, NULL);
10588 }
10589
10590 rgw_rados_ref *ref;
10591 int r = read_state.get_ref(this, obj, &ref);
10592 if (r < 0) {
10593 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10594 return r;
10595 }
10596 r = ref->ioctx.operate(ref->oid, &op, NULL);
10597 if (r < 0) {
10598 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10599 return r;
10600 }
10601 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10602
10603 uint64_t op_ver = ref->ioctx.get_last_version();
10604
10605 if (read_state.last_ver > 0 &&
10606 read_state.last_ver != op_ver) {
10607 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10608 return -ECANCELED;
10609 }
10610
10611 read_state.last_ver = op_ver;
10612
10613 return bl.length();
10614}
10615
b32b8144
FG
10616int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl,
10617 RGWObjVersionTracker *objv_tracker,
10618 boost::optional<obj_version> refresh_version)
7c673cae
FG
10619{
10620 RGWRados *store = source->get_store();
10621 rgw_raw_obj& obj = source->get_obj();
10622
b32b8144
FG
10623 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl,
10624 ofs, end, read_params.attrs,
10625 read_params.cache_info, refresh_version);
7c673cae
FG
10626}
10627
10628int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10629{
10630 RGWRados *store = source->get_store();
10631 rgw_raw_obj& obj = source->get_obj();
10632
10633 return store->system_obj_get_attr(obj, name, dest);
10634}
10635
10636struct get_obj_data;
10637
10638struct get_obj_aio_data {
10639 struct get_obj_data *op_data;
10640 off_t ofs;
10641 off_t len;
10642};
10643
10644struct get_obj_io {
10645 off_t len;
10646 bufferlist bl;
10647};
10648
10649static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10650
10651struct get_obj_data : public RefCountedObject {
10652 CephContext *cct;
10653 RGWRados *rados;
10654 RGWObjectCtx *ctx;
10655 IoCtx io_ctx;
10656 map<off_t, get_obj_io> io_map;
10657 map<off_t, librados::AioCompletion *> completion_map;
10658 uint64_t total_read;
10659 Mutex lock;
10660 Mutex data_lock;
10661 list<get_obj_aio_data> aio_data;
10662 RGWGetDataCB *client_cb;
10663 std::atomic<bool> cancelled = { false };
10664 std::atomic<int64_t> err_code = { 0 };
10665 Throttle throttle;
10666 list<bufferlist> read_list;
10667
10668 explicit get_obj_data(CephContext *_cct)
10669 : cct(_cct),
10670 rados(NULL), ctx(NULL),
10671 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10672 client_cb(NULL),
10673 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10674 ~get_obj_data() override { }
10675 void set_cancelled(int r) {
10676 cancelled = true;
10677 err_code = r;
10678 }
10679
10680 bool is_cancelled() {
10681 return cancelled;
10682 }
10683
10684 int get_err_code() {
10685 return err_code;
10686 }
10687
10688 int wait_next_io(bool *done) {
10689 lock.Lock();
10690 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10691 if (iter == completion_map.end()) {
10692 *done = true;
10693 lock.Unlock();
10694 return 0;
10695 }
10696 off_t cur_ofs = iter->first;
10697 librados::AioCompletion *c = iter->second;
10698 lock.Unlock();
10699
10700 c->wait_for_safe_and_cb();
10701 int r = c->get_return_value();
10702
10703 lock.Lock();
10704 completion_map.erase(cur_ofs);
10705
10706 if (completion_map.empty()) {
10707 *done = true;
10708 }
10709 lock.Unlock();
10710
10711 c->release();
10712
10713 return r;
10714 }
10715
10716 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10717 Mutex::Locker l(lock);
10718
10719 const auto& io_iter = io_map.insert(
10720 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10721
10722 assert(io_iter.second); // assert new insertion
10723
10724 get_obj_io& io = (io_iter.first)->second;
10725 *pbl = &io.bl;
10726
10727 struct get_obj_aio_data aio;
10728 aio.ofs = ofs;
10729 aio.len = len;
10730 aio.op_data = this;
10731
10732 aio_data.push_back(aio);
10733
10734 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10735
10736 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10737 completion_map[ofs] = c;
10738
10739 *pc = c;
10740
10741 /* we have a reference per IO, plus one reference for the calling function.
10742 * reference is dropped for each callback, plus when we're done iterating
10743 * over the parts */
10744 get();
10745 }
10746
10747 void cancel_io(off_t ofs) {
10748 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10749 lock.Lock();
10750 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10751 if (iter != completion_map.end()) {
10752 AioCompletion *c = iter->second;
10753 c->release();
10754 completion_map.erase(ofs);
10755 io_map.erase(ofs);
10756 }
10757 lock.Unlock();
10758
10759 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10760 * need IoCtx to live, as io callback may still be called
10761 */
10762 }
10763
10764 void cancel_all_io() {
10765 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10766 Mutex::Locker l(lock);
10767 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10768 iter != completion_map.end(); ++iter) {
10769 librados::AioCompletion *c = iter->second;
10770 c->release();
10771 }
10772 }
10773
10774 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10775 Mutex::Locker l(lock);
10776
10777 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10778
10779 if (liter == io_map.end() ||
10780 liter->first != ofs) {
10781 return 0;
10782 }
10783
10784 map<off_t, librados::AioCompletion *>::iterator aiter;
10785 aiter = completion_map.find(ofs);
10786 if (aiter == completion_map.end()) {
10787 /* completion map does not hold this io, it was cancelled */
10788 return 0;
10789 }
10790
10791 AioCompletion *completion = aiter->second;
10792 int r = completion->get_return_value();
10793 if (r < 0)
10794 return r;
10795
10796 for (; aiter != completion_map.end(); ++aiter) {
10797 completion = aiter->second;
10798 if (!completion->is_safe()) {
10799 /* reached a request that is not yet complete, stop */
10800 break;
10801 }
10802
10803 r = completion->get_return_value();
10804 if (r < 0) {
10805 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10806 return r;
10807 }
10808
10809 total_read += r;
10810
10811 map<off_t, get_obj_io>::iterator old_liter = liter++;
10812 bl_list.push_back(old_liter->second.bl);
10813 io_map.erase(old_liter);
10814 }
10815
10816 return 0;
10817 }
10818};
10819
10820static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10821{
10822 struct get_obj_data *d = (struct get_obj_data *)arg;
10823
10824 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10825}
10826
10827static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10828{
10829 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10830 struct get_obj_data *d = aio_data->op_data;
10831
10832 d->rados->get_obj_aio_completion_cb(cb, arg);
10833}
10834
10835
10836void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10837{
10838 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10839 struct get_obj_data *d = aio_data->op_data;
10840 off_t ofs = aio_data->ofs;
10841 off_t len = aio_data->len;
10842
10843 list<bufferlist> bl_list;
10844 list<bufferlist>::iterator iter;
10845 int r;
10846
10847 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10848 d->throttle.put(len);
10849
10850 r = rados_aio_get_return_value(c);
10851 if (r < 0) {
10852 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10853 d->set_cancelled(r);
10854 goto done;
10855 }
10856
10857 if (d->is_cancelled()) {
10858 goto done;
10859 }
10860
10861 d->data_lock.Lock();
10862
10863 r = d->get_complete_ios(ofs, bl_list);
10864 if (r < 0) {
10865 goto done_unlock;
10866 }
10867
10868 d->read_list.splice(d->read_list.end(), bl_list);
10869
10870done_unlock:
10871 d->data_lock.Unlock();
10872done:
10873 d->put();
10874 return;
10875}
10876
10877int RGWRados::flush_read_list(struct get_obj_data *d)
10878{
10879 d->data_lock.Lock();
10880 list<bufferlist> l;
10881 l.swap(d->read_list);
10882 d->get();
10883 d->read_list.clear();
10884
10885 d->data_lock.Unlock();
10886
10887 int r = 0;
10888
10889 list<bufferlist>::iterator iter;
10890 for (iter = l.begin(); iter != l.end(); ++iter) {
10891 bufferlist& bl = *iter;
10892 r = d->client_cb->handle_data(bl, 0, bl.length());
10893 if (r < 0) {
10894 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10895 break;
10896 }
10897 }
10898
10899 d->data_lock.Lock();
10900 d->put();
10901 if (r < 0) {
10902 d->set_cancelled(r);
10903 }
10904 d->data_lock.Unlock();
10905 return r;
10906}
10907
10908int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10909 const RGWBucketInfo& bucket_info,
10910 const rgw_obj& obj,
10911 const rgw_raw_obj& read_obj,
10912 off_t obj_ofs,
10913 off_t read_ofs, off_t len,
10914 bool is_head_obj, void *arg)
10915{
10916 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10917 ObjectReadOperation op;
10918 struct get_obj_data *d = (struct get_obj_data *)arg;
10919 string oid, key;
10920 bufferlist *pbl;
10921 AioCompletion *c;
10922
10923 int r;
10924
10925 if (is_head_obj) {
10926 /* only when reading from the head object do we need to do the atomic test */
10927 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10928 if (r < 0)
10929 return r;
10930
10931 if (astate &&
10932 obj_ofs < astate->data.length()) {
10933 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10934
10935 d->data_lock.Lock();
10936 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10937 d->data_lock.Unlock();
10938 if (r < 0)
10939 return r;
10940
10941 d->lock.Lock();
10942 d->total_read += chunk_len;
10943 d->lock.Unlock();
10944
10945 len -= chunk_len;
10946 read_ofs += chunk_len;
10947 obj_ofs += chunk_len;
10948 if (!len)
10949 return 0;
10950 }
10951 }
10952
10953 d->throttle.get(len);
10954 if (d->is_cancelled()) {
10955 return d->get_err_code();
10956 }
10957
10958 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10959 * cleaning up
10960 */
10961 d->add_io(obj_ofs, len, &pbl, &c);
10962
10963 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10964 op.read(read_ofs, len, pbl, NULL);
10965
10966 librados::IoCtx io_ctx(d->io_ctx);
10967 io_ctx.locator_set_key(read_obj.loc);
10968
10969 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10970 if (r < 0) {
10971 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10972 goto done_err;
10973 }
10974
10975 // Flush data to client if there is any
10976 r = flush_read_list(d);
10977 if (r < 0)
10978 return r;
10979
10980 return 0;
10981
10982done_err:
10983 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10984 d->set_cancelled(r);
10985 d->cancel_io(obj_ofs);
10986
10987 return r;
10988}
10989
10990int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10991{
10992 RGWRados *store = source->get_store();
10993 CephContext *cct = store->ctx();
10994
10995 struct get_obj_data *data = new get_obj_data(cct);
10996 bool done = false;
10997
10998 RGWObjectCtx& obj_ctx = source->get_ctx();
10999
11000 data->rados = store;
11001 data->io_ctx.dup(state.io_ctx);
11002 data->client_cb = cb;
11003
11004 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
11005 if (r < 0) {
11006 data->cancel_all_io();
11007 goto done;
11008 }
11009
11010 while (!done) {
11011 r = data->wait_next_io(&done);
11012 if (r < 0) {
11013 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
11014 data->cancel_all_io();
11015 break;
11016 }
11017 r = store->flush_read_list(data);
11018 if (r < 0) {
11019 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
11020 data->cancel_all_io();
11021 break;
11022 }
11023 }
11024
11025done:
11026 data->put();
11027 return r;
11028}
11029
11030int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
11031 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11032 off_t ofs, off_t end,
11033 uint64_t max_chunk_size,
11034 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
11035 const rgw_raw_obj&, off_t, off_t, off_t, bool,
11036 RGWObjState *, void *),
11037 void *arg)
11038{
11039 rgw_raw_obj head_obj;
11040 rgw_raw_obj read_obj;
11041 uint64_t read_ofs = ofs;
11042 uint64_t len;
11043 bool reading_from_head = true;
11044 RGWObjState *astate = NULL;
11045
11046 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
11047
11048 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
11049 if (r < 0) {
11050 return r;
11051 }
11052
11053 if (end < 0)
11054 len = 0;
11055 else
11056 len = end - ofs + 1;
11057
11058 if (astate->has_manifest) {
11059 /* now get the relevant object stripe */
11060 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
11061
11062 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
11063
11064 for (; iter != obj_end && ofs <= end; ++iter) {
11065 off_t stripe_ofs = iter.get_stripe_ofs();
11066 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
11067
11068 while (ofs < next_stripe_ofs && ofs <= end) {
11069 read_obj = iter.get_location().get_raw_obj(this);
11070 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
11071 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
11072
11073 if (read_len > max_chunk_size) {
11074 read_len = max_chunk_size;
11075 }
11076
11077 reading_from_head = (read_obj == head_obj);
11078 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
11079 if (r < 0) {
11080 return r;
11081 }
11082
11083 len -= read_len;
11084 ofs += read_len;
11085 }
11086 }
11087 } else {
11088 while (ofs <= end) {
11089 read_obj = head_obj;
11090 uint64_t read_len = min(len, max_chunk_size);
11091
11092 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
11093 if (r < 0) {
11094 return r;
11095 }
11096
11097 len -= read_len;
11098 ofs += read_len;
11099 }
11100 }
11101
11102 return 0;
11103}
11104
11105int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
11106{
11107 rgw_rados_ref ref;
11108 int r = get_obj_head_ref(bucket_info, obj, &ref);
11109 if (r < 0) {
11110 return r;
11111 }
11112
11113 return ref.ioctx.operate(ref.oid, op);
11114}
11115
11116int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
11117{
11118 rgw_rados_ref ref;
11119 int r = get_obj_head_ref(bucket_info, obj, &ref);
11120 if (r < 0) {
11121 return r;
11122 }
11123
11124 bufferlist outbl;
11125
11126 return ref.ioctx.operate(ref.oid, op, &outbl);
11127}
11128
11129int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
11130{
11131 ObjectWriteOperation op;
11132
11133 assert(olh_obj.key.instance.empty());
11134
11135 bool has_tag = (state.exists && has_olh_tag(state.attrset));
11136
11137 if (!state.exists) {
11138 op.create(true);
11139 } else {
11140 op.assert_exists();
b32b8144
FG
11141 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11142 op.mtime2(&mtime_ts);
7c673cae
FG
11143 }
11144
11145 /*
11146 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
11147 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
11148 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
11149 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
11150 * log will reflect that.
11151 *
11152 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
11153 * is used for object data instance, olh_tag for olh instance.
11154 */
11155 if (has_tag) {
11156 /* guard against racing writes */
11157 bucket_index_guard_olh_op(state, op);
11158 }
11159
11160 if (!has_tag) {
11161 /* obj tag */
11162 string obj_tag;
11163 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
11164 if (ret < 0) {
11165 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11166 return ret;
11167 }
11168 bufferlist bl;
11169 bl.append(obj_tag.c_str(), obj_tag.size());
11170 op.setxattr(RGW_ATTR_ID_TAG, bl);
11171
11172 state.attrset[RGW_ATTR_ID_TAG] = bl;
11173 state.obj_tag = bl;
11174
11175 /* olh tag */
11176 string olh_tag;
11177 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
11178 if (ret < 0) {
11179 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11180 return ret;
11181 }
11182 bufferlist olh_bl;
11183 olh_bl.append(olh_tag.c_str(), olh_tag.size());
11184 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
11185
11186 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
11187 state.olh_tag = olh_bl;
11188 state.is_olh = true;
11189
11190 bufferlist verbl;
11191 op.setxattr(RGW_ATTR_OLH_VER, verbl);
11192 }
11193
11194 bufferlist bl;
11195 RGWOLHPendingInfo pending_info;
11196 pending_info.time = real_clock::now();
11197 ::encode(pending_info, bl);
11198
11199#define OLH_PENDING_TAG_LEN 32
11200 /* tag will start with current time epoch, this so that entries are sorted by time */
11201 char buf[32];
11202 utime_t ut(pending_info.time);
11203 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
11204 *op_tag = buf;
11205
11206 string s;
11207 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
11208 if (ret < 0) {
11209 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11210 return ret;
11211 }
11212 op_tag->append(s);
11213
11214 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11215 attr_name.append(*op_tag);
11216
11217 op.setxattr(attr_name.c_str(), bl);
11218
11219 ret = obj_operate(bucket_info, olh_obj, &op);
11220 if (ret < 0) {
11221 return ret;
11222 }
11223
11224 state.exists = true;
11225 state.attrset[attr_name] = bl;
11226
11227 return 0;
11228}
11229
11230int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
11231{
11232 int ret;
11233
11234 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
11235 if (ret == -EEXIST) {
11236 ret = -ECANCELED;
11237 }
11238
11239 return ret;
11240}
11241
f64942e4
AA
11242int RGWRados::guard_reshard(BucketShard *bs,
11243 const rgw_obj& obj_instance,
11244 const RGWBucketInfo& bucket_info,
11245 std::function<int(BucketShard *)> call)
31f18b77
FG
11246{
11247 rgw_obj obj;
11248 const rgw_obj *pobj = &obj_instance;
11249 int r;
11250
11251 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
f64942e4 11252 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
31f18b77
FG
11253 if (r < 0) {
11254 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
11255 return r;
11256 }
11257 r = call(bs);
11258 if (r != -ERR_BUSY_RESHARDING) {
11259 break;
11260 }
11261 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
11262 string new_bucket_id;
f64942e4 11263 r = block_while_resharding(bs, &new_bucket_id, bucket_info);
31f18b77
FG
11264 if (r == -ERR_BUSY_RESHARDING) {
11265 continue;
11266 }
11267 if (r < 0) {
11268 return r;
11269 }
11270 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
11271 i = 0; /* resharding is finished, make sure we can retry */
11272
11273 obj = *pobj;
11274 obj.bucket.update_bucket_id(new_bucket_id);
11275 pobj = &obj;
11276 }
11277
11278 if (r < 0) {
11279 return r;
11280 }
11281
11282 return 0;
11283}
11284
f64942e4
AA
11285int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
11286 string *new_bucket_id,
11287 const RGWBucketInfo& bucket_info)
31f18b77
FG
11288{
11289 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
11290
f64942e4 11291 return waiter->block_while_resharding(bs, new_bucket_id, bucket_info);
31f18b77
FG
11292}
11293
7c673cae
FG
11294int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
11295 bool delete_marker,
11296 const string& op_tag,
11297 struct rgw_bucket_dir_entry_meta *meta,
11298 uint64_t olh_epoch,
91327a77
AA
11299 real_time unmod_since, bool high_precision_time,
11300 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
11301{
11302 rgw_rados_ref ref;
11303 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11304 if (r < 0) {
11305 return r;
11306 }
11307
31f18b77
FG
11308 rgw_zone_set zones_trace;
11309 if (_zones_trace) {
11310 zones_trace = *_zones_trace;
7c673cae 11311 }
1adf2230 11312 zones_trace.insert(get_zone().id);
7c673cae 11313
31f18b77
FG
11314 BucketShard bs(this);
11315
7c673cae 11316 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
11317 r = guard_reshard(&bs, obj_instance, bucket_info,
11318 [&](BucketShard *bs) -> int {
11319 librados::ObjectWriteOperation op;
11320 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11321 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
11322 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
11323 unmod_since, high_precision_time,
11324 get_zone().log_data, zones_trace);
31f18b77
FG
11325 });
11326 if (r < 0) {
11327 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11328 return r;
7c673cae
FG
11329 }
11330
91327a77
AA
11331 if (log_data_change && bucket_info.datasync_flag_enabled()) {
11332 data_log->add_entry(bs.bucket, bs.shard_id);
11333 }
11334
7c673cae
FG
11335 return 0;
11336}
11337
11338void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
11339{
11340 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
11341 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
11342}
11343
11344int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 11345 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
11346{
11347 rgw_rados_ref ref;
11348 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11349 if (r < 0) {
11350 return r;
11351 }
11352
31f18b77
FG
11353 rgw_zone_set zones_trace;
11354 if (_zones_trace) {
11355 zones_trace = *_zones_trace;
7c673cae 11356 }
31f18b77
FG
11357 zones_trace.insert(get_zone().id);
11358
11359 BucketShard bs(this);
7c673cae
FG
11360
11361 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
11362 r = guard_reshard(&bs, obj_instance, bucket_info,
11363 [&](BucketShard *bs) -> int {
11364 librados::ObjectWriteOperation op;
11365 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11366 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11367 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
31f18b77
FG
11368 });
11369 if (r < 0) {
11370 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11371 return r;
7c673cae
FG
11372 }
11373
11374 return 0;
11375}
11376
11377int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11378 const rgw_obj& obj_instance, uint64_t ver_marker,
11379 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11380 bool *is_truncated)
11381{
11382 rgw_rados_ref ref;
11383 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11384 if (r < 0) {
11385 return r;
11386 }
11387
11388 BucketShard bs(this);
f64942e4
AA
11389 int ret =
11390 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
11391 if (ret < 0) {
11392 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11393 return ret;
11394 }
11395
11396 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11397
11398 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11399
f64942e4
AA
11400 ret = guard_reshard(&bs, obj_instance, bucket_info,
11401 [&](BucketShard *bs) -> int {
11402 ObjectReadOperation op;
11403 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11404 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11405 key, ver_marker, olh_tag, log, is_truncated);
11406 });
31f18b77
FG
11407 if (ret < 0) {
11408 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 11409 return ret;
31f18b77 11410 }
7c673cae
FG
11411
11412 return 0;
11413}
11414
11415int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11416{
11417 rgw_rados_ref ref;
11418 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11419 if (r < 0) {
11420 return r;
11421 }
11422
11423 BucketShard bs(this);
f64942e4
AA
11424 int ret =
11425 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
11426 if (ret < 0) {
11427 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11428 return ret;
11429 }
11430
11431 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11432
11433 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11434
f64942e4
AA
11435 ret = guard_reshard(&bs, obj_instance, bucket_info,
11436 [&](BucketShard *pbs) -> int {
11437 ObjectWriteOperation op;
11438 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11439 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11440 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
31f18b77
FG
11441 });
11442 if (ret < 0) {
11443 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 11444 return ret;
31f18b77 11445 }
7c673cae
FG
11446
11447 return 0;
11448}
11449
11450int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11451{
11452 rgw_rados_ref ref;
11453 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11454 if (r < 0) {
11455 return r;
11456 }
11457
11458 BucketShard bs(this);
7c673cae
FG
11459
11460 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11461
11462 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11463
f64942e4
AA
11464 int ret = guard_reshard(&bs, obj_instance, bucket_info,
11465 [&](BucketShard *pbs) -> int {
11466 ObjectWriteOperation op;
11467 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11468 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
31f18b77 11469 });
7c673cae
FG
11470 if (ret < 0) {
11471 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11472 return ret;
11473 }
11474
11475 return 0;
11476}
11477
11478int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11479 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
31f18b77 11480 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7c673cae
FG
11481{
11482 if (log.empty()) {
11483 return 0;
11484 }
11485
11486 librados::ObjectWriteOperation op;
11487
11488 uint64_t last_ver = log.rbegin()->first;
11489 *plast_ver = last_ver;
11490
11491 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11492
11493 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11494 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11495
b32b8144
FG
11496 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11497 op.mtime2(&mtime_ts);
11498
7c673cae
FG
11499 bool need_to_link = false;
11500 cls_rgw_obj_key key;
11501 bool delete_marker = false;
11502 list<cls_rgw_obj_key> remove_instances;
11503 bool need_to_remove = false;
11504
11505 for (iter = log.begin(); iter != log.end(); ++iter) {
11506 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11507 for (; viter != iter->second.end(); ++viter) {
11508 rgw_bucket_olh_log_entry& entry = *viter;
11509
11510 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11511 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11512 << (entry.delete_marker ? "(delete)" : "") << dendl;
11513 switch (entry.op) {
11514 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11515 remove_instances.push_back(entry.key);
11516 break;
11517 case CLS_RGW_OLH_OP_LINK_OLH:
11518 need_to_link = true;
11519 need_to_remove = false;
11520 key = entry.key;
11521 delete_marker = entry.delete_marker;
11522 break;
11523 case CLS_RGW_OLH_OP_UNLINK_OLH:
11524 need_to_remove = true;
11525 need_to_link = false;
11526 break;
11527 default:
11528 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11529 return -EIO;
11530 }
11531 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11532 attr_name.append(entry.op_tag);
11533 op.rmxattr(attr_name.c_str());
11534 }
11535 }
11536
11537 rgw_rados_ref ref;
11538 int r = get_obj_head_ref(bucket_info, obj, &ref);
11539 if (r < 0) {
11540 return r;
11541 }
11542
11543 const rgw_bucket& bucket = obj.bucket;
11544
11545 if (need_to_link) {
11546 rgw_obj target(bucket, key);
11547 RGWOLHInfo info;
11548 info.target = target;
11549 info.removed = delete_marker;
11550 bufferlist bl;
11551 ::encode(info, bl);
11552 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11553 }
11554
11555 /* first remove object instances */
11556 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11557 liter != remove_instances.end(); ++liter) {
11558 cls_rgw_obj_key& key = *liter;
11559 rgw_obj obj_instance(bucket, key);
31f18b77 11560 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae
FG
11561 if (ret < 0 && ret != -ENOENT) {
11562 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11563 return ret;
11564 }
11565 }
11566
11567 /* update olh object */
11568 r = ref.ioctx.operate(ref.oid, &op);
11569 if (r == -ECANCELED) {
11570 r = 0;
11571 }
11572 if (r < 0) {
11573 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11574 return r;
11575 }
11576
11577 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11578 if (r < 0) {
11579 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11580 return r;
11581 }
11582
11583 if (need_to_remove) {
11584 ObjectWriteOperation rm_op;
11585
11586 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11587 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11588 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11589 rm_op.remove();
11590
11591 r = ref.ioctx.operate(ref.oid, &rm_op);
11592 if (r == -ECANCELED) {
11593 return 0; /* someone else won this race */
11594 } else {
11595 /*
11596 * only clear if was successful, otherwise we might clobber pending operations on this object
11597 */
11598 r = bucket_index_clear_olh(bucket_info, state, obj);
11599 if (r < 0) {
11600 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11601 return r;
11602 }
11603 }
11604 }
11605
11606 return 0;
11607}
11608
11609/*
11610 * read olh log and apply it
11611 */
31f18b77 11612int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
11613{
11614 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11615 bool is_truncated;
11616 uint64_t ver_marker = 0;
11617
11618 do {
11619 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11620 if (ret < 0) {
11621 return ret;
11622 }
31f18b77 11623 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
11624 if (ret < 0) {
11625 return ret;
11626 }
11627 } while (is_truncated);
11628
11629 return 0;
11630}
11631
11632int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77
AA
11633 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
11634 rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
11635{
11636 string op_tag;
11637
11638 rgw_obj olh_obj = target_obj;
11639 olh_obj.key.instance.clear();
11640
11641 RGWObjState *state = NULL;
11642
11643 int ret = 0;
11644 int i;
31f18b77 11645
7c673cae
FG
11646#define MAX_ECANCELED_RETRY 100
11647 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11648 if (ret == -ECANCELED) {
11649 obj_ctx.obj.invalidate(olh_obj);
11650 }
11651
11652 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11653 if (ret < 0) {
11654 return ret;
11655 }
11656
11657 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11658 if (ret < 0) {
11659 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11660 if (ret == -ECANCELED) {
11661 continue;
11662 }
11663 return ret;
11664 }
91327a77
AA
11665 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
11666 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
11667 zones_trace, log_data_change);
7c673cae
FG
11668 if (ret < 0) {
11669 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11670 if (ret == -ECANCELED) {
11671 continue;
11672 }
11673 return ret;
11674 }
11675 break;
11676 }
11677
11678 if (i == MAX_ECANCELED_RETRY) {
11679 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11680 return -EIO;
11681 }
11682
11683 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11684 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11685 ret = 0;
11686 }
11687 if (ret < 0) {
11688 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11689 return ret;
11690 }
11691
11692 return 0;
11693}
11694
11695int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
31f18b77 11696 uint64_t olh_epoch, rgw_zone_set *zones_trace)
7c673cae
FG
11697{
11698 string op_tag;
11699
11700 rgw_obj olh_obj = target_obj;
11701 olh_obj.key.instance.clear();
11702
11703 RGWObjState *state = NULL;
11704
11705 int ret = 0;
11706 int i;
11707
11708 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11709 if (ret == -ECANCELED) {
11710 obj_ctx.obj.invalidate(olh_obj);
11711 }
11712
11713 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11714 if (ret < 0)
11715 return ret;
11716
11717 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11718 if (ret < 0) {
11719 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11720 if (ret == -ECANCELED) {
11721 continue;
11722 }
11723 return ret;
11724 }
11725
11726 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11727
31f18b77 11728 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae
FG
11729 if (ret < 0) {
11730 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11731 if (ret == -ECANCELED) {
11732 continue;
11733 }
11734 return ret;
11735 }
11736 break;
11737 }
11738
11739 if (i == MAX_ECANCELED_RETRY) {
11740 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11741 return -EIO;
11742 }
11743
31f18b77 11744 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
11745 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11746 return 0;
11747 }
11748 if (ret < 0) {
11749 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11750 return ret;
11751 }
11752
11753 return 0;
11754}
11755
11756void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11757{
11758#define OBJ_INSTANCE_LEN 32
11759 char buf[OBJ_INSTANCE_LEN + 1];
11760
11761 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11762 no underscore for instance name due to the way we encode the raw keys */
11763
11764 target_obj->key.set_instance(buf);
11765}
11766
11767static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11768 map<string, bufferlist> *attrset)
11769{
11770 attrset->clear();
11771 map<string, bufferlist>::iterator iter;
11772 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11773 iter != unfiltered_attrset.end(); ++iter) {
11774 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11775 break;
11776 (*attrset)[iter->first] = iter->second;
11777 }
11778}
11779
11780int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11781{
11782 map<string, bufferlist> unfiltered_attrset;
11783
11784 ObjectReadOperation op;
11785 op.getxattrs(&unfiltered_attrset, NULL);
11786
11787 bufferlist outbl;
11788 int r = obj_operate(bucket_info, obj, &op);
11789
11790 if (r < 0) {
11791 return r;
11792 }
11793 map<string, bufferlist> attrset;
11794
11795 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11796
11797 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11798 if (iter == attrset.end()) { /* not an olh */
11799 return -EINVAL;
11800 }
11801
11802 try {
11803 bufferlist::iterator biter = iter->second.begin();
11804 ::decode(*olh, biter);
11805 } catch (buffer::error& err) {
11806 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11807 return -EIO;
11808 }
11809
11810 return 0;
11811}
11812
11813void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11814 map<string, bufferlist> *rm_pending_entries)
11815{
11816 map<string, bufferlist>::iterator iter = pending_entries.begin();
11817
11818 real_time now = real_clock::now();
11819
11820 while (iter != pending_entries.end()) {
11821 bufferlist::iterator biter = iter->second.begin();
11822 RGWOLHPendingInfo pending_info;
11823 try {
11824 ::decode(pending_info, biter);
11825 } catch (buffer::error& err) {
11826 /* skipping bad entry, we could remove it but it might hide a bug */
11827 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11828 ++iter;
11829 continue;
11830 }
11831
11832 map<string, bufferlist>::iterator cur_iter = iter;
11833 ++iter;
11834 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11835 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11836 pending_entries.erase(cur_iter);
11837 } else {
11838 /* entries names are sorted by time (rounded to a second) */
11839 break;
11840 }
11841 }
11842}
11843
11844int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11845{
11846 ObjectWriteOperation op;
11847
11848 bucket_index_guard_olh_op(state, op);
11849
11850 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11851 op.rmxattr(iter->first.c_str());
11852 }
11853
11854 rgw_rados_ref ref;
11855 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11856 if (r < 0) {
11857 return r;
11858 }
11859
11860 /* update olh object */
11861 r = ref.ioctx.operate(ref.oid, &op);
11862 if (r == -ENOENT || r == -ECANCELED) {
11863 /* raced with some other change, shouldn't sweat about it */
11864 r = 0;
11865 }
11866 if (r < 0) {
11867 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11868 return r;
11869 }
11870
11871 return 0;
11872}
11873
11874int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11875{
11876 map<string, bufferlist> pending_entries;
11877 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11878
11879 map<string, bufferlist> rm_pending_entries;
11880 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11881
11882 if (!rm_pending_entries.empty()) {
11883 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11884 if (ret < 0) {
11885 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11886 return ret;
11887 }
11888 }
11889 if (!pending_entries.empty()) {
11890 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11891
11892 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11893 if (ret < 0) {
11894 return ret;
11895 }
11896 }
11897
11898 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11899 assert(iter != state->attrset.end());
11900 RGWOLHInfo olh;
11901 try {
11902 bufferlist::iterator biter = iter->second.begin();
11903 ::decode(olh, biter);
11904 } catch (buffer::error& err) {
11905 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11906 return -EIO;
11907 }
11908
11909 if (olh.removed) {
11910 return -ENOENT;
11911 }
11912
11913 *target = olh.target;
11914
11915 return 0;
11916}
11917
11918int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11919 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11920 RGWObjVersionTracker *objv_tracker)
11921{
11922 rgw_rados_ref ref;
11923 int r = get_raw_obj_ref(obj, &ref);
11924 if (r < 0) {
11925 return r;
11926 }
11927
11928 map<string, bufferlist> unfiltered_attrset;
11929 uint64_t size = 0;
11930 struct timespec mtime_ts;
11931
11932 ObjectReadOperation op;
11933 if (objv_tracker) {
11934 objv_tracker->prepare_op_for_read(&op);
11935 }
11936 if (attrs) {
11937 op.getxattrs(&unfiltered_attrset, NULL);
11938 }
11939 if (psize || pmtime) {
11940 op.stat2(&size, &mtime_ts, NULL);
11941 }
11942 if (first_chunk) {
11943 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11944 }
11945 bufferlist outbl;
11946 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11947
11948 if (epoch) {
11949 *epoch = ref.ioctx.get_last_version();
11950 }
11951
11952 if (r < 0)
11953 return r;
11954
11955 if (psize)
11956 *psize = size;
11957 if (pmtime)
11958 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11959 if (attrs) {
11960 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11961 }
11962
11963 return 0;
11964}
11965
11966int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 11967 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae
FG
11968{
11969 map<string, rgw_bucket_dir_header> headers;
11970 map<int, string> bucket_instance_ids;
11971 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11972 if (r < 0) {
11973 return r;
11974 }
11975
11976 assert(headers.size() == bucket_instance_ids.size());
11977
11978 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11979 map<int, string>::iterator viter = bucket_instance_ids.begin();
11980 BucketIndexShardsManager ver_mgr;
11981 BucketIndexShardsManager master_ver_mgr;
11982 BucketIndexShardsManager marker_mgr;
7c673cae
FG
11983 char buf[64];
11984 for(; iter != headers.end(); ++iter, ++viter) {
11985 accumulate_raw_stats(iter->second, stats);
11986 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11987 ver_mgr.add(viter->first, string(buf));
11988 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11989 master_ver_mgr.add(viter->first, string(buf));
11990 if (shard_id >= 0) {
11991 *max_marker = iter->second.max_marker;
11992 } else {
11993 marker_mgr.add(viter->first, iter->second.max_marker);
11994 }
c07f9fc5
FG
11995 if (syncstopped != NULL)
11996 *syncstopped = iter->second.syncstopped;
7c673cae
FG
11997 }
11998 ver_mgr.to_string(bucket_ver);
11999 master_ver_mgr.to_string(master_ver);
12000 if (shard_id < 0) {
12001 marker_mgr.to_string(max_marker);
12002 }
12003 return 0;
12004}
12005
12006int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
12007 map<int, string>& markers)
12008{
12009 map<string, rgw_bucket_dir_header> headers;
12010 map<int, string> bucket_instance_ids;
12011 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
12012 if (r < 0)
12013 return r;
12014
12015 assert(headers.size() == bucket_instance_ids.size());
12016
12017 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
12018 map<int, string>::iterator viter = bucket_instance_ids.begin();
12019
12020 for(; iter != headers.end(); ++iter, ++viter) {
12021 if (shard_id >= 0) {
12022 markers[shard_id] = iter->second.max_marker;
12023 } else {
12024 markers[viter->first] = iter->second.max_marker;
12025 }
12026 }
12027 return 0;
12028}
12029
12030class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
12031 RGWGetBucketStats_CB *cb;
12032 uint32_t pendings;
12033 map<RGWObjCategory, RGWStorageStats> stats;
12034 int ret_code;
12035 bool should_cb;
12036 Mutex lock;
12037
12038public:
12039 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
12040 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
12041 lock("RGWGetBucketStatsContext") {}
12042
12043 void handle_response(int r, rgw_bucket_dir_header& header) override {
12044 Mutex::Locker l(lock);
12045 if (should_cb) {
12046 if ( r >= 0) {
12047 accumulate_raw_stats(header, stats);
12048 } else {
12049 ret_code = r;
12050 }
12051
12052 // Are we all done?
12053 if (--pendings == 0) {
12054 if (!ret_code) {
12055 cb->set_response(&stats);
12056 }
12057 cb->handle_response(ret_code);
12058 cb->put();
12059 }
12060 }
12061 }
12062
12063 void unset_cb() {
12064 Mutex::Locker l(lock);
12065 should_cb = false;
12066 }
12067};
12068
12069int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
12070{
12071 int num_aio = 0;
c07f9fc5 12072 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
7c673cae
FG
12073 assert(get_ctx);
12074 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
12075 if (r < 0) {
12076 ctx->put();
12077 if (num_aio) {
12078 get_ctx->unset_cb();
12079 }
12080 }
c07f9fc5 12081 get_ctx->put();
7c673cae
FG
12082 return r;
12083}
12084
12085class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
12086 RGWGetUserStats_CB *cb;
12087
12088public:
12089 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
12090 : cb(cb) {}
12091
12092 void handle_response(int r, cls_user_header& header) override {
12093 const cls_user_stats& hs = header.stats;
12094 if (r >= 0) {
12095 RGWStorageStats stats;
12096
12097 stats.size = hs.total_bytes;
12098 stats.size_rounded = hs.total_bytes_rounded;
12099 stats.num_objects = hs.total_entries;
12100
12101 cb->set_response(stats);
12102 }
12103
12104 cb->handle_response(r);
12105
12106 cb->put();
12107 }
12108};
12109
12110int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
12111{
12112 string user_str = user.to_str();
12113
12114 cls_user_header header;
12115 int r = cls_user_get_header(user_str, &header);
12116 if (r < 0)
12117 return r;
12118
12119 const cls_user_stats& hs = header.stats;
12120
12121 stats.size = hs.total_bytes;
12122 stats.size_rounded = hs.total_bytes_rounded;
12123 stats.num_objects = hs.total_entries;
12124
12125 return 0;
12126}
12127
12128int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
12129{
12130 string user_str = user.to_str();
12131
12132 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
12133 int r = cls_user_get_header_async(user_str, get_ctx);
12134 if (r < 0) {
12135 ctx->put();
12136 delete get_ctx;
12137 return r;
12138 }
12139
12140 return 0;
12141}
12142
12143void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
12144{
12145 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
12146}
12147
12148void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
12149{
12150 if (!bucket.oid.empty()) {
12151 obj.init(get_zone_params().domain_root, bucket.oid);
12152 } else {
12153 string oid;
12154 get_bucket_meta_oid(bucket, oid);
12155 obj.init(get_zone_params().domain_root, oid);
12156 }
12157}
12158
12159int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
12160 real_time *pmtime, map<string, bufferlist> *pattrs)
12161{
12162 size_t pos = meta_key.find(':');
12163 if (pos == string::npos) {
12164 return -EINVAL;
12165 }
12166 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
12167 rgw_bucket_instance_key_to_oid(oid);
12168
12169 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12170}
12171
12172int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
12173 real_time *pmtime, map<string, bufferlist> *pattrs)
12174{
12175 string oid;
12176 if (bucket.oid.empty()) {
12177 get_bucket_meta_oid(bucket, oid);
12178 } else {
12179 oid = bucket.oid;
12180 }
12181
12182 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12183}
12184
31f18b77 12185int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
7c673cae 12186 real_time *pmtime, map<string, bufferlist> *pattrs,
b32b8144
FG
12187 rgw_cache_entry_info *cache_info,
12188 boost::optional<obj_version> refresh_version)
7c673cae
FG
12189{
12190 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
12191
12192 bufferlist epbl;
12193
b32b8144
FG
12194 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12195 oid, epbl, &info.objv_tracker, pmtime, pattrs,
12196 cache_info, refresh_version);
7c673cae
FG
12197 if (ret < 0) {
12198 return ret;
12199 }
12200
12201 bufferlist::iterator iter = epbl.begin();
12202 try {
12203 ::decode(info, iter);
12204 } catch (buffer::error& err) {
12205 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12206 return -EIO;
12207 }
12208 info.bucket.oid = oid;
12209 return 0;
12210}
12211
12212int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
12213 const string& tenant_name,
12214 const string& bucket_name,
12215 RGWBucketEntryPoint& entry_point,
12216 RGWObjVersionTracker *objv_tracker,
12217 real_time *pmtime,
12218 map<string, bufferlist> *pattrs,
b32b8144
FG
12219 rgw_cache_entry_info *cache_info,
12220 boost::optional<obj_version> refresh_version)
7c673cae
FG
12221{
12222 bufferlist bl;
12223 string bucket_entry;
12224
12225 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
b32b8144
FG
12226 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12227 bucket_entry, bl, objv_tracker, pmtime, pattrs,
12228 cache_info, refresh_version);
7c673cae
FG
12229 if (ret < 0) {
12230 return ret;
12231 }
12232
12233 bufferlist::iterator iter = bl.begin();
12234 try {
12235 ::decode(entry_point, iter);
12236 } catch (buffer::error& err) {
12237 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12238 return -EIO;
12239 }
12240 return 0;
12241}
12242
12243int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
12244 const string& tenant_name,
12245 const string& bucket_name)
12246{
12247 RGWBucketEntryPoint entry_point;
12248 real_time ep_mtime;
12249 RGWObjVersionTracker ot;
12250 map<string, bufferlist> attrs;
12251 RGWBucketInfo info;
12252
12253 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
12254
12255 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
12256 if (ret < 0) {
12257 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
12258 return ret;
12259 }
12260
12261 if (!entry_point.has_bucket_info) {
12262 /* already converted! */
12263 return 0;
12264 }
12265
12266 info = entry_point.old_bucket_info;
12267 info.bucket.oid = bucket_name;
12268 info.ep_objv = ot.read_version;
12269
12270 ot.generate_new_write_ver(cct);
12271
12272 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
12273 if (ret < 0) {
12274 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
12275 return ret;
12276 }
12277
12278 return 0;
12279}
12280
b32b8144
FG
12281int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx,
12282 const string& tenant,
12283 const string& bucket_name,
12284 RGWBucketInfo& info,
12285 real_time *pmtime,
12286 map<string, bufferlist> *pattrs,
12287 boost::optional<obj_version> refresh_version)
7c673cae
FG
12288{
12289 bucket_info_entry e;
12290 string bucket_entry;
12291 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
12292
b32b8144 12293
7c673cae 12294 if (binfo_cache->find(bucket_entry, &e)) {
b32b8144
FG
12295 if (refresh_version &&
12296 e.info.objv_tracker.read_version.compare(&(*refresh_version))) {
12297 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
12298 << "a failure that should be debugged. I am a nice machine, "
12299 << "so I will try to recover." << dendl;
12300 binfo_cache->invalidate(bucket_entry);
12301 }
7c673cae
FG
12302 info = e.info;
12303 if (pattrs)
12304 *pattrs = e.attrs;
12305 if (pmtime)
12306 *pmtime = e.mtime;
12307 return 0;
12308 }
12309
12310 RGWBucketEntryPoint entry_point;
12311 real_time ep_mtime;
12312 RGWObjVersionTracker ot;
12313 rgw_cache_entry_info entry_cache_info;
b32b8144
FG
12314 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
12315 entry_point, &ot, &ep_mtime, pattrs,
12316 &entry_cache_info, refresh_version);
7c673cae
FG
12317 if (ret < 0) {
12318 /* only init these fields */
12319 info.bucket.tenant = tenant;
12320 info.bucket.name = bucket_name;
12321 return ret;
12322 }
12323
12324 if (entry_point.has_bucket_info) {
12325 info = entry_point.old_bucket_info;
12326 info.bucket.oid = bucket_name;
12327 info.bucket.tenant = tenant;
12328 info.ep_objv = ot.read_version;
12329 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
12330 return 0;
12331 }
12332
12333 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12334 * that we got
12335 */
12336 if (pattrs) {
12337 pattrs->clear();
12338 }
12339
12340 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
12341
12342
12343 /* read bucket instance info */
12344
12345 string oid;
12346 get_bucket_meta_oid(entry_point.bucket, oid);
12347
12348 rgw_cache_entry_info cache_info;
12349
b32b8144
FG
12350 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
12351 &cache_info, refresh_version);
7c673cae
FG
12352 e.info.ep_objv = ot.read_version;
12353 info = e.info;
12354 if (ret < 0) {
b32b8144 12355 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
7c673cae
FG
12356 info.bucket.tenant = tenant;
12357 info.bucket.name = bucket_name;
12358 // XXX and why return anything in case of an error anyway?
12359 return ret;
12360 }
12361
12362 if (pmtime)
12363 *pmtime = e.mtime;
12364 if (pattrs)
12365 *pattrs = e.attrs;
12366
12367 list<rgw_cache_entry_info *> cache_info_entries;
12368 cache_info_entries.push_back(&entry_cache_info);
12369 cache_info_entries.push_back(&cache_info);
12370
12371
12372 /* chain to both bucket entry point and bucket instance */
12373 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
12374 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
12375 }
12376
b32b8144
FG
12377 if (refresh_version &&
12378 refresh_version->compare(&info.objv_tracker.read_version)) {
12379 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
12380 << "have gone squirrelly. An administrator may have forced a "
12381 << "change; otherwise there is a problem somewhere." << dendl;
12382 }
12383
7c673cae
FG
12384 return 0;
12385}
12386
b32b8144
FG
12387int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
12388 const string& tenant, const string& bucket_name,
12389 RGWBucketInfo& info,
12390 real_time *pmtime, map<string, bufferlist> *pattrs)
12391{
12392 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
12393 pattrs, boost::none);
12394}
12395
12396int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
12397 ceph::real_time *pmtime,
12398 map<string, bufferlist> *pattrs)
12399{
12400 RGWObjectCtx obj_ctx(this);
12401
12402 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
12403 info, pmtime, pattrs, info.objv_tracker.read_version);
12404}
12405
7c673cae
FG
12406int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
12407 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
12408 map<string, bufferlist> *pattrs)
12409{
12410 bufferlist epbl;
12411 ::encode(entry_point, epbl);
12412 string bucket_entry;
12413 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12414 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12415}
12416
12417int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12418 real_time mtime, map<string, bufferlist> *pattrs)
12419{
12420 info.has_instance_obj = true;
12421 bufferlist bl;
12422
12423 ::encode(info, bl);
12424
12425 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12426 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12427 if (ret == -EEXIST) {
12428 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12429 * bucket operation on this specific bucket (e.g., being synced from the master), but
12430 * since bucket instace meta object is unique for this specific bucket instace, we don't
12431 * need to return an error.
12432 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12433 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12434 * locally, while in the sync thread we sync the new bucket.
12435 */
12436 ret = 0;
12437 }
12438 return ret;
12439}
12440
12441int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12442 map<string, bufferlist> *pattrs, bool create_entry_point)
12443{
12444 bool create_head = !info.has_instance_obj || create_entry_point;
12445
12446 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12447 if (ret < 0) {
12448 return ret;
12449 }
12450
12451 if (!create_head)
12452 return 0; /* done! */
12453
12454 RGWBucketEntryPoint entry_point;
12455 entry_point.bucket = info.bucket;
12456 entry_point.owner = info.owner;
12457 entry_point.creation_time = info.creation_time;
12458 entry_point.linked = true;
12459 RGWObjVersionTracker ot;
12460 if (pep_objv && !pep_objv->tag.empty()) {
12461 ot.write_version = *pep_objv;
12462 } else {
12463 ot.generate_new_write_ver(cct);
12464 if (pep_objv) {
12465 *pep_objv = ot.write_version;
12466 }
12467 }
12468 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12469 if (ret < 0)
12470 return ret;
12471
12472 return 0;
12473}
12474
12475int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12476{
12477 rgw_rados_ref ref;
12478 int r = get_raw_obj_ref(obj, &ref);
12479 if (r < 0) {
12480 return r;
12481 }
12482
12483 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12484 if (r < 0)
12485 return r;
12486
12487 return 0;
12488
12489}
12490
12491int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12492 std::map<string, bufferlist>& m)
12493{
12494 rgw_rados_ref ref;
12495 int r = get_raw_obj_ref(obj, &ref);
12496 if (r < 0) {
12497 return r;
12498 }
12499
12500#define MAX_OMAP_GET_ENTRIES 1024
12501 const int count = MAX_OMAP_GET_ENTRIES;
12502 string start_after;
12503
12504 while (true) {
12505 std::map<string, bufferlist> t;
12506 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12507 if (r < 0) {
12508 return r;
12509 }
12510 if (t.empty()) {
12511 break;
12512 }
12513 start_after = t.rbegin()->first;
12514 m.insert(t.begin(), t.end());
12515 }
12516 return 0;
12517}
12518
12519int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12520{
12521 rgw_rados_ref ref;
12522 int r = get_raw_obj_ref(obj, &ref);
12523 if (r < 0) {
12524 return r;
12525 }
12526 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12527
12528 map<string, bufferlist> m;
12529 m[key] = bl;
12530
12531 r = ref.ioctx.omap_set(ref.oid, m);
12532
12533 return r;
12534}
12535
12536int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12537{
12538 rgw_rados_ref ref;
12539 int r = get_raw_obj_ref(obj, &ref);
12540 if (r < 0) {
12541 return r;
12542 }
12543
12544 r = ref.ioctx.omap_set(ref.oid, m);
12545
12546 return r;
12547}
12548
12549int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12550{
12551 rgw_rados_ref ref;
12552 int r = get_raw_obj_ref(obj, &ref);
12553 if (r < 0) {
12554 return r;
12555 }
12556
12557 set<string> k;
12558 k.insert(key);
12559
12560 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12561 return r;
12562}
12563
12564int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12565{
12566 RGWObjectCtx obj_ctx(this);
12567
12568 map<string, RGWBucketEnt>::iterator iter;
12569 for (iter = m.begin(); iter != m.end(); ++iter) {
12570 RGWBucketEnt& ent = iter->second;
12571 rgw_bucket& bucket = ent.bucket;
12572 ent.count = 0;
12573 ent.size = 0;
12574 ent.size_rounded = 0;
12575
12576 map<string, rgw_bucket_dir_header> headers;
12577
12578 RGWBucketInfo bucket_info;
12579 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12580 if (ret < 0) {
12581 return ret;
12582 }
12583
12584 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12585 if (r < 0)
12586 return r;
12587
12588 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12589 for (; hiter != headers.end(); ++hiter) {
12590 RGWObjCategory category = main_category;
12591 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12592 if (iter != hiter->second.stats.end()) {
12593 struct rgw_bucket_category_stats& stats = iter->second;
12594 ent.count += stats.num_entries;
12595 ent.size += stats.total_size;
12596 ent.size_rounded += stats.total_size_rounded;
12597 }
12598 }
3efd9988
FG
12599
12600 // fill in placement_rule from the bucket instance for use in swift's
12601 // per-storage policy statistics
12602 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
12603 }
12604
12605 return m.size();
12606}
12607
12608int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12609{
12610 rgw_rados_ref ref;
12611 int r = get_raw_obj_ref(obj, &ref);
12612 if (r < 0) {
12613 return r;
12614 }
12615 librados::Rados *rad = get_rados_handle();
12616 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12617
12618 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12619 completion->release();
12620 return r;
12621}
12622
12623int RGWRados::distribute(const string& key, bufferlist& bl)
12624{
12625 /*
12626 * we were called before watch was initialized. This can only happen if we're updating some system
12627 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12628 * objects, they're currently only read on startup anyway.
12629 */
12630 if (!watch_initialized)
12631 return 0;
12632
12633 string notify_oid;
12634 pick_control_oid(key, notify_oid);
12635
12636 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12637 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12638}
12639
12640int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12641{
12642 librados::IoCtx& io_ctx = ctx.io_ctx;
12643 librados::NObjectIterator& iter = ctx.iter;
12644
12645 int r = open_pool_ctx(pool, io_ctx);
12646 if (r < 0)
12647 return r;
12648
12649 iter = io_ctx.nobjects_begin();
12650
12651 return 0;
12652}
12653
181888fb
FG
12654int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12655{
12656 librados::IoCtx& io_ctx = ctx.io_ctx;
12657 librados::NObjectIterator& iter = ctx.iter;
12658
12659 int r = open_pool_ctx(pool, io_ctx);
12660 if (r < 0)
12661 return r;
12662
12663 librados::ObjectCursor oc;
12664 if (!oc.from_str(cursor)) {
12665 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12666 return -EINVAL;
12667 }
12668
f64942e4
AA
12669 try {
12670 iter = io_ctx.nobjects_begin(oc);
12671 return 0;
12672 } catch (const std::system_error& e) {
12673 r = -e.code().value();
12674 ldout(cct, 10) << "nobjects_begin threw " << e.what()
12675 << ", returning " << r << dendl;
12676 return r;
12677 } catch (const std::exception& e) {
12678 ldout(cct, 10) << "nobjects_begin threw " << e.what()
12679 << ", returning -5" << dendl;
12680 return -EIO;
12681 }
181888fb
FG
12682}
12683
12684string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12685{
12686 return ctx.iter.get_cursor().to_str();
12687}
12688
f64942e4
AA
12689static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
12690 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
12691 bool *is_truncated, RGWAccessListFilter *filter)
12692{
12693 librados::IoCtx& io_ctx = ctx.io_ctx;
12694 librados::NObjectIterator& iter = ctx.iter;
12695
12696 if (iter == io_ctx.nobjects_end())
12697 return -ENOENT;
12698
12699 uint32_t i;
12700
12701 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12702 rgw_bucket_dir_entry e;
12703
12704 string oid = iter->get_oid();
12705 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12706
12707 // fill it in with initial values; we may correct later
12708 if (filter && !filter->filter(oid, oid))
12709 continue;
12710
12711 e.key = oid;
12712 objs.push_back(e);
12713 }
12714
12715 if (is_truncated)
12716 *is_truncated = (iter != io_ctx.nobjects_end());
12717
12718 return objs.size();
12719}
12720struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12721 string prefix;
12722
12723 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12724 bool filter(string& name, string& key) override {
12725 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12726 }
12727};
12728
f64942e4
AA
12729int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12730 bool *is_truncated, RGWAccessListFilter *filter)
12731{
12732 // catch exceptions from NObjectIterator::operator++()
12733 try {
12734 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
12735 } catch (const std::system_error& e) {
12736 int r = -e.code().value();
12737 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
12738 << ", returning " << r << dendl;
12739 return r;
12740 } catch (const std::exception& e) {
12741 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
12742 << ", returning -5" << dendl;
12743 return -EIO;
12744 }
12745}
12746
181888fb 12747int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 12748{
181888fb
FG
12749 if (!ctx->initialized) {
12750 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
7c673cae
FG
12751 if (r < 0) {
12752 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12753 return r;
12754 }
181888fb 12755 ctx->initialized = true;
7c673cae 12756 }
181888fb
FG
12757 return 0;
12758}
7c673cae 12759
181888fb
FG
12760int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12761 RGWListRawObjsCtx& ctx, list<string>& oids,
12762 bool *is_truncated)
12763{
12764 if (!ctx.initialized) {
12765 return -EINVAL;
12766 }
12767 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae
FG
12768 vector<rgw_bucket_dir_entry> objs;
12769 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12770 if (r < 0) {
12771 if(r != -ENOENT)
12772 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12773 return r;
12774 }
12775
12776 vector<rgw_bucket_dir_entry>::iterator iter;
12777 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12778 oids.push_back(iter->key.name);
12779 }
12780
12781 return oids.size();
12782}
12783
181888fb
FG
12784int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12785 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12786 bool *is_truncated)
12787{
12788 if (!ctx.initialized) {
12789 int r = list_raw_objects_init(pool, string(), &ctx);
12790 if (r < 0) {
12791 return r;
12792 }
12793 }
12794
12795 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12796}
12797
12798string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12799{
12800 return pool_iterate_get_cursor(ctx.iter_ctx);
12801}
12802
7c673cae
FG
12803int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12804 std::list<rgw_bi_log_entry>& result, bool *truncated)
12805{
12806 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12807 result.clear();
12808
12809 librados::IoCtx index_ctx;
12810 map<int, string> oids;
12811 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12812 map<int, string> bucket_instance_ids;
12813 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12814 if (r < 0)
12815 return r;
12816
12817 BucketIndexShardsManager marker_mgr;
12818 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12819 // If there are multiple shards for the bucket index object, the marker
12820 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12821 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12822 // only contain one record, and the key is the bucket instance id.
12823 r = marker_mgr.from_string(marker, shard_id);
12824 if (r < 0)
12825 return r;
12826
12827 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12828 if (r < 0)
12829 return r;
12830
12831 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12832 map<int, list<rgw_bi_log_entry>::iterator> vends;
12833 if (truncated) {
12834 *truncated = false;
12835 }
12836 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12837 for (; miter != bi_log_lists.end(); ++miter) {
12838 int shard_id = miter->first;
12839 vcurrents[shard_id] = miter->second.entries.begin();
12840 vends[shard_id] = miter->second.entries.end();
12841 if (truncated) {
12842 *truncated = (*truncated || miter->second.truncated);
12843 }
12844 }
12845
12846 size_t total = 0;
12847 bool has_more = true;
12848 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12849 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12850 while (total < max && has_more) {
12851 has_more = false;
12852
12853 viter = vcurrents.begin();
12854 eiter = vends.begin();
12855
12856 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12857 assert (eiter != vends.end());
12858
12859 int shard_id = viter->first;
12860 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12861
12862 if (liter == eiter->second){
12863 continue;
12864 }
12865 rgw_bi_log_entry& entry = *(liter);
12866 if (has_shards) {
12867 char buf[16];
12868 snprintf(buf, sizeof(buf), "%d", shard_id);
12869 string tmp_id;
12870 build_bucket_index_marker(buf, entry.id, &tmp_id);
12871 entry.id.swap(tmp_id);
12872 }
12873 marker_mgr.add(shard_id, entry.id);
12874 result.push_back(entry);
12875 total++;
12876 has_more = true;
12877 ++liter;
12878 }
12879 }
12880
12881 if (truncated) {
12882 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12883 assert (eiter != vends.end());
12884 *truncated = (*truncated || (viter->second != eiter->second));
12885 }
12886 }
12887
12888 // Refresh marker, if there are multiple shards, the output will look like
12889 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12890 // if there is no sharding, the simply marker (without oid) is returned
12891 if (has_shards) {
12892 marker_mgr.to_string(&marker);
12893 } else {
12894 if (!result.empty()) {
12895 marker = result.rbegin()->id;
12896 }
12897 }
12898
12899 return 0;
12900}
12901
12902int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12903{
12904 librados::IoCtx index_ctx;
12905 map<int, string> bucket_objs;
31f18b77
FG
12906
12907 BucketIndexShardsManager start_marker_mgr;
12908 BucketIndexShardsManager end_marker_mgr;
12909
7c673cae 12910 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
31f18b77 12911 if (r < 0) {
7c673cae 12912 return r;
31f18b77 12913 }
7c673cae 12914
7c673cae 12915 r = start_marker_mgr.from_string(start_marker, shard_id);
31f18b77 12916 if (r < 0) {
7c673cae 12917 return r;
31f18b77
FG
12918 }
12919
7c673cae 12920 r = end_marker_mgr.from_string(end_marker, shard_id);
31f18b77 12921 if (r < 0) {
7c673cae 12922 return r;
31f18b77 12923 }
7c673cae
FG
12924
12925 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
31f18b77
FG
12926 cct->_conf->rgw_bucket_index_max_aio)();
12927
12928 return r;
7c673cae
FG
12929}
12930
c07f9fc5
FG
12931int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12932{
12933 librados::IoCtx index_ctx;
12934 map<int, string> bucket_objs;
12935 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12936 if (r < 0)
12937 return r;
12938
12939 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12940}
12941
12942int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12943{
12944 librados::IoCtx index_ctx;
12945 map<int, string> bucket_objs;
12946 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12947 if (r < 0)
12948 return r;
12949
12950 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12951}
12952
7c673cae
FG
12953int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12954{
12955 rgw_rados_ref ref;
12956 int r = get_obj_head_ref(bucket_info, obj, &ref);
12957 if (r < 0) {
12958 return r;
12959 }
12960
12961 rgw_cls_bi_entry bi_entry;
12962 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12963 if (r < 0 && r != -ENOENT) {
12964 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12965 }
12966 if (r < 0) {
12967 return r;
12968 }
12969 bufferlist::iterator iter = bi_entry.data.begin();
12970 try {
12971 ::decode(*dirent, iter);
12972 } catch (buffer::error& err) {
12973 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12974 return -EIO;
12975 }
12976
12977 return 0;
12978}
12979
12980int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12981{
12982 BucketShard bs(this);
f64942e4 12983 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
12984 if (ret < 0) {
12985 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12986 return ret;
12987 }
12988
12989 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12990
12991 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12992 if (ret < 0)
12993 return ret;
12994
12995 return 0;
12996}
12997
12998void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12999{
13000 cls_rgw_bi_put(op, bs.bucket_obj, entry);
13001}
13002
13003int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
13004{
13005 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
13006 if (ret < 0)
13007 return ret;
13008
13009 return 0;
13010}
13011
13012int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
13013{
13014 BucketShard bs(this);
f64942e4 13015 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
13016 if (ret < 0) {
13017 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
13018 return ret;
13019 }
13020
13021 return bi_put(bs, entry);
13022}
13023
13024int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
13025{
13026 rgw_obj obj(bucket, obj_name);
13027 BucketShard bs(this);
f64942e4 13028 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
13029 if (ret < 0) {
13030 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
13031 return ret;
13032 }
13033
13034 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
31f18b77
FG
13035 if (ret == -ENOENT) {
13036 *is_truncated = false;
13037 }
7c673cae
FG
13038 if (ret < 0)
13039 return ret;
13040
13041 return 0;
13042}
13043
13044int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
13045{
13046 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
13047 if (ret < 0)
13048 return ret;
13049
13050 return 0;
13051}
13052
13053int RGWRados::bi_remove(BucketShard& bs)
13054{
13055 int ret = bs.index_ctx.remove(bs.bucket_obj);
13056 if (ret == -ENOENT) {
13057 ret = 0;
13058 }
13059 if (ret < 0) {
13060 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
13061 return ret;
13062 }
13063
13064 return 0;
13065}
13066
13067int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
13068{
13069 BucketShard bs(this);
f64942e4 13070 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
7c673cae
FG
13071 if (ret < 0) {
13072 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
13073 return ret;
13074 }
13075
13076 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
13077}
13078
13079int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
13080{
13081 return gc_pool_ctx.operate(oid, op);
13082}
13083
13084int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
13085{
13086 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13087 int r = gc_pool_ctx.aio_operate(oid, c, op);
13088 c->release();
13089 return r;
13090}
13091
13092int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
13093{
13094 return gc_pool_ctx.operate(oid, op, pbl);
13095}
13096
13097int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
13098{
13099 return gc->list(index, marker, max, expired_only, result, truncated);
13100}
13101
13102int RGWRados::process_gc()
13103{
13104 return gc->process();
13105}
13106
13107int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
13108{
13109 return lc->list_lc_progress(marker, max_entries, progress_map);
13110}
13111
13112int RGWRados::process_lc()
13113{
13114 return lc->process();
13115}
13116
1adf2230 13117bool RGWRados::process_expire_objects()
7c673cae 13118{
1adf2230 13119 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
7c673cae
FG
13120}
13121
7c673cae 13122int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
31f18b77 13123 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 13124{
31f18b77
FG
13125 rgw_zone_set zones_trace;
13126 if (_zones_trace) {
13127 zones_trace = *_zones_trace;
13128 }
1adf2230
AA
13129 zones_trace.insert(get_zone().id);
13130
7c673cae
FG
13131 ObjectWriteOperation o;
13132 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77
FG
13133 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13134 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
7c673cae
FG
13135 return bs.index_ctx.operate(bs.bucket_obj, &o);
13136}
13137
31f18b77 13138int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
13139 int64_t pool, uint64_t epoch,
13140 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 13141 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 13142{
7c673cae
FG
13143 ObjectWriteOperation o;
13144 rgw_bucket_dir_entry_meta dir_meta;
13145 dir_meta = ent.meta;
13146 dir_meta.category = category;
13147
1adf2230
AA
13148 rgw_zone_set zones_trace;
13149 if (_zones_trace) {
13150 zones_trace = *_zones_trace;
13151 }
13152 zones_trace.insert(get_zone().id);
13153
7c673cae
FG
13154 rgw_bucket_entry_ver ver;
13155 ver.pool = pool;
13156 ver.epoch = epoch;
13157 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
13158 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13159 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
1adf2230 13160 get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
13161 complete_op_data *arg;
13162 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
1adf2230 13163 get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77
FG
13164 librados::AioCompletion *completion = arg->rados_completion;
13165 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
13166 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
13167 return ret;
13168}
13169
31f18b77 13170int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
13171 int64_t pool, uint64_t epoch,
13172 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 13173 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 13174{
31f18b77 13175 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
13176}
13177
13178int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
13179 int64_t pool, uint64_t epoch,
13180 rgw_obj& obj,
13181 real_time& removed_mtime,
13182 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
13183 uint16_t bilog_flags,
13184 rgw_zone_set *zones_trace)
7c673cae
FG
13185{
13186 rgw_bucket_dir_entry ent;
13187 ent.meta.mtime = removed_mtime;
13188 obj.key.get_index_key(&ent.key);
31f18b77 13189 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
13190}
13191
31f18b77 13192int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
13193{
13194 rgw_bucket_dir_entry ent;
13195 obj.key.get_index_key(&ent.key);
31f18b77 13196 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
7c673cae
FG
13197}
13198
13199int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
13200{
13201 librados::IoCtx index_ctx;
13202 map<int, string> bucket_objs;
13203 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
13204 if (r < 0)
13205 return r;
13206
13207 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
13208}
13209
1adf2230
AA
13210
13211int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
13212 int shard_id,
13213 rgw_obj_index_key& start,
13214 const string& prefix,
13215 uint32_t num_entries,
13216 bool list_versions,
13217 map<string, rgw_bucket_dir_entry>& m,
13218 bool *is_truncated,
13219 rgw_obj_index_key *last_entry,
13220 bool (*force_check_filter)(const string& name))
7c673cae 13221{
1adf2230
AA
13222 ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
13223 " start " << start.name << "[" << start.instance << "] num_entries " <<
13224 num_entries << dendl;
7c673cae
FG
13225
13226 librados::IoCtx index_ctx;
13227 // key - oid (for different shards if there is any)
1adf2230
AA
13228 // value - list result for the corresponding oid (shard), it is filled by
13229 // the AIO callback
7c673cae
FG
13230 map<int, string> oids;
13231 map<int, struct rgw_cls_list_ret> list_results;
13232 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13233 if (r < 0)
13234 return r;
13235
13236 cls_rgw_obj_key start_key(start.name, start.instance);
1adf2230
AA
13237 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
13238 list_versions, oids, list_results,
13239 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
13240 if (r < 0)
13241 return r;
13242
13243 // Create a list of iterators that are used to iterate each shard
13244 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
13245 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
13246 vector<string> vnames(list_results.size());
13247 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13248 *is_truncated = false;
13249 for (; iter != list_results.end(); ++iter) {
13250 vcurrents.push_back(iter->second.dir.m.begin());
13251 vends.push_back(iter->second.dir.m.end());
13252 vnames.push_back(oids[iter->first]);
13253 *is_truncated = (*is_truncated || iter->second.is_truncated);
13254 }
13255
13256 // Create a map to track the next candidate entry from each shard, if the entry
13257 // from a specified shard is selected/erased, the next entry from that shard will
13258 // be inserted for next round selection
13259 map<string, size_t> candidates;
13260 for (size_t i = 0; i < vcurrents.size(); ++i) {
13261 if (vcurrents[i] != vends[i]) {
13262 candidates[vcurrents[i]->first] = i;
13263 }
13264 }
13265
13266 map<string, bufferlist> updates;
13267 uint32_t count = 0;
13268 while (count < num_entries && !candidates.empty()) {
13269 r = 0;
13270 // Select the next one
13271 int pos = candidates.begin()->second;
13272 const string& name = vcurrents[pos]->first;
13273 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
13274
3efd9988
FG
13275 bool force_check = force_check_filter &&
13276 force_check_filter(dirent.key.name);
13277 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13278 !dirent.pending_map.empty() ||
13279 force_check) {
7c673cae
FG
13280 /* there are uncommitted ops. We need to check the current state,
13281 * and if the tags are old we need to do cleanup as well. */
13282 librados::IoCtx sub_ctx;
13283 sub_ctx.dup(index_ctx);
1adf2230
AA
13284 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
13285 updates[vnames[pos]]);
7c673cae
FG
13286 if (r < 0 && r != -ENOENT) {
13287 return r;
13288 }
13289 }
13290 if (r >= 0) {
1adf2230
AA
13291 ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
13292 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
7c673cae
FG
13293 m[name] = std::move(dirent);
13294 ++count;
13295 }
13296
13297 // Refresh the candidates map
13298 candidates.erase(candidates.begin());
13299 ++vcurrents[pos];
13300 if (vcurrents[pos] != vends[pos]) {
13301 candidates[vcurrents[pos]->first] = pos;
13302 }
13303 }
13304
13305 // Suggest updates if there is any
13306 map<string, bufferlist>::iterator miter = updates.begin();
13307 for (; miter != updates.end(); ++miter) {
13308 if (miter->second.length()) {
13309 ObjectWriteOperation o;
13310 cls_rgw_suggest_changes(o, miter->second);
13311 // we don't care if we lose suggested updates, send them off blindly
13312 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13313 index_ctx.aio_operate(miter->first, c, &o);
1adf2230 13314 c->release();
7c673cae
FG
13315 }
13316 }
13317
13318 // Check if all the returned entries are consumed or not
13319 for (size_t i = 0; i < vcurrents.size(); ++i) {
1adf2230 13320 if (vcurrents[i] != vends[i]) {
7c673cae 13321 *is_truncated = true;
1adf2230
AA
13322 break;
13323 }
7c673cae
FG
13324 }
13325 if (!m.empty())
13326 *last_entry = m.rbegin()->first;
13327
13328 return 0;
13329}
13330
1adf2230
AA
13331
13332int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
13333 int shard_id,
13334 rgw_obj_index_key& start,
13335 const string& prefix,
13336 uint32_t num_entries,
13337 bool list_versions,
13338 std::vector<rgw_bucket_dir_entry>& ent_list,
13339 bool *is_truncated,
13340 rgw_obj_index_key *last_entry,
13341 bool (*force_check_filter)(const string& name)) {
13342 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
13343 " start " << start.name << "[" << start.instance <<
13344 "] num_entries " << num_entries << dendl;
13345
13346 *is_truncated = false;
13347 librados::IoCtx index_ctx;
13348
13349 rgw_obj_index_key my_start = start;
13350
13351 map<int, string> oids;
13352 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13353 if (r < 0)
13354 return r;
13355 const uint32_t num_shards = oids.size();
13356
13357 uint32_t current_shard;
13358 if (shard_id >= 0) {
13359 current_shard = shard_id;
13360 } else if (my_start.empty()) {
13361 current_shard = 0u;
13362 } else {
13363 current_shard =
13364 rgw_bucket_shard_index(my_start.name, num_shards);
13365 }
13366
13367 uint32_t count = 0u;
13368 map<string, bufferlist> updates;
13369 std::string last_added_entry;
13370 while (count <= num_entries &&
13371 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
13372 current_shard < num_shards)) {
13373 // key - oid (for different shards if there is any)
13374 // value - list result for the corresponding oid (shard), it is filled by
13375 // the AIO callback
13376 map<int, struct rgw_cls_list_ret> list_results;
13377 r = CLSRGWIssueBucketList(index_ctx, my_start, prefix, num_entries,
13378 list_versions, oids, list_results,
13379 cct->_conf->rgw_bucket_index_max_aio)();
13380 if (r < 0)
13381 return r;
13382
13383 const std::string& oid = oids[current_shard];
13384 assert(list_results.find(current_shard) != list_results.end());
13385 auto& result = list_results[current_shard];
13386 for (auto& entry : result.dir.m) {
13387 rgw_bucket_dir_entry& dirent = entry.second;
13388
13389 bool force_check = force_check_filter &&
13390 force_check_filter(dirent.key.name);
13391 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13392 !dirent.pending_map.empty() ||
13393 force_check) {
13394 /* there are uncommitted ops. We need to check the current state,
13395 * and if the tags are old we need to do cleanup as well. */
13396 librados::IoCtx sub_ctx;
13397 sub_ctx.dup(index_ctx);
13398 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
13399 if (r < 0 && r != -ENOENT) {
13400 return r;
13401 }
13402 }
13403
13404 // at this point either r >=0 or r == -ENOENT
13405 if (r >= 0) { // i.e., if r != -ENOENT
13406 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
13407 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
13408
13409 if (count < num_entries) {
13410 last_added_entry = entry.first;
13411 my_start = dirent.key;
13412 ent_list.emplace_back(std::move(dirent));
13413 ++count;
13414 } else {
13415 *is_truncated = true;
13416 goto check_updates;
13417 }
13418 } else { // r == -ENOENT
13419 // in the case of -ENOENT, make sure we're advancing marker
13420 // for possible next call to CLSRGWIssueBucketList
13421 my_start = dirent.key;
13422 }
13423 } // entry for loop
13424
13425 if (!result.is_truncated) {
13426 // if we reached the end of the shard read next shard
13427 ++current_shard;
13428 my_start = rgw_obj_index_key();
13429 }
13430 } // shard loop
13431
13432check_updates:
13433 // suggest updates if there is any
13434 map<string, bufferlist>::iterator miter = updates.begin();
13435 for (; miter != updates.end(); ++miter) {
13436 if (miter->second.length()) {
13437 ObjectWriteOperation o;
13438 cls_rgw_suggest_changes(o, miter->second);
13439 // we don't care if we lose suggested updates, send them off blindly
13440 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13441 index_ctx.aio_operate(miter->first, c, &o);
13442 c->release();
13443 }
13444 }
13445
13446 if (last_entry && !ent_list.empty()) {
13447 *last_entry = last_added_entry;
13448 }
13449
13450 return 0;
13451}
13452
13453
13454int RGWRados::cls_obj_usage_log_add(const string& oid,
13455 rgw_usage_log_info& info)
7c673cae
FG
13456{
13457 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13458
13459 rgw_rados_ref ref;
224ce89b 13460 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13461 if (r < 0) {
13462 return r;
13463 }
13464
13465 ObjectWriteOperation op;
13466 cls_rgw_usage_log_add(op, info);
13467
13468 r = ref.ioctx.operate(ref.oid, &op);
13469 return r;
13470}
13471
13472int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
13473 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
13474{
13475 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13476
13477 rgw_rados_ref ref;
224ce89b 13478 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13479 if (r < 0) {
13480 return r;
13481 }
13482
13483 *is_truncated = false;
13484
13485 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
13486 max_entries, read_iter, usage, is_truncated);
13487
13488 return r;
13489}
13490
13491int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
13492{
13493 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13494
13495 rgw_rados_ref ref;
224ce89b 13496 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13497 if (r < 0) {
13498 return r;
13499 }
13500
b32b8144 13501 r = cls_rgw_usage_log_trim(ref.ioctx, ref.oid, user, start_epoch, end_epoch);
7c673cae
FG
13502 return r;
13503}
13504
13505int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
13506{
13507 librados::IoCtx index_ctx;
13508 string dir_oid;
13509
13510 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13511
13512 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
13513 if (r < 0)
13514 return r;
13515
13516 bufferlist updates;
13517
13518 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
13519 rgw_bucket_dir_entry entry;
13520 entry.key = *iter;
13521 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
13522 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13523 updates.append(CEPH_RGW_REMOVE | suggest_flag);
13524 ::encode(entry, updates);
13525 }
13526
13527 bufferlist out;
13528
13529 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
13530
13531 return r;
13532}
13533
13534int RGWRados::check_disk_state(librados::IoCtx io_ctx,
13535 const RGWBucketInfo& bucket_info,
13536 rgw_bucket_dir_entry& list_state,
13537 rgw_bucket_dir_entry& object,
13538 bufferlist& suggested_updates)
13539{
13540 const rgw_bucket& bucket = bucket_info.bucket;
13541 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13542
13543 std::string loc;
13544
13545 rgw_obj obj(bucket, list_state.key);
13546
13547 string oid;
13548 get_obj_bucket_and_oid_loc(obj, oid, loc);
13549
13550 if (loc != list_state.locator) {
13551 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
13552 }
13553
13554 io_ctx.locator_set_key(list_state.locator);
13555
13556 RGWObjState *astate = NULL;
13557 RGWObjectCtx rctx(this);
13558 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
13559 if (r < 0)
13560 return r;
13561
13562 list_state.pending_map.clear(); // we don't need this and it inflates size
13563 if (!astate->exists) {
13564 /* object doesn't exist right now -- hopefully because it's
13565 * marked as !exists and got deleted */
13566 if (list_state.exists) {
13567 /* FIXME: what should happen now? Work out if there are any
13568 * non-bad ways this could happen (there probably are, but annoying
13569 * to handle!) */
13570 }
13571 // encode a suggested removal of that key
13572 list_state.ver.epoch = io_ctx.get_last_version();
13573 list_state.ver.pool = io_ctx.get_id();
13574 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
13575 return -ENOENT;
13576 }
13577
13578 string etag;
13579 string content_type;
13580 ACLOwner owner;
13581
13582 object.meta.size = astate->size;
13583 object.meta.accounted_size = astate->accounted_size;
13584 object.meta.mtime = astate->mtime;
13585
13586 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
13587 if (iter != astate->attrset.end()) {
13588 etag = iter->second.c_str();
13589 }
13590 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
13591 if (iter != astate->attrset.end()) {
13592 content_type = iter->second.c_str();
13593 }
13594 iter = astate->attrset.find(RGW_ATTR_ACL);
13595 if (iter != astate->attrset.end()) {
13596 r = decode_policy(iter->second, &owner);
13597 if (r < 0) {
13598 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
13599 }
13600 }
13601
13602 if (astate->has_manifest) {
13603 RGWObjManifest::obj_iterator miter;
13604 RGWObjManifest& manifest = astate->manifest;
13605 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
13606 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
13607 rgw_obj loc;
13608 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
13609
13610 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
13611 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
13612 r = delete_obj_index(loc);
13613 if (r < 0) {
13614 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
13615 }
13616 }
13617 }
13618 }
13619
13620 object.meta.etag = etag;
13621 object.meta.content_type = content_type;
13622 object.meta.owner = owner.get_id().to_str();
13623 object.meta.owner_display_name = owner.get_display_name();
13624
13625 // encode suggested updates
13626 list_state.ver.pool = io_ctx.get_id();
13627 list_state.ver.epoch = astate->epoch;
13628 list_state.meta.size = object.meta.size;
13629 list_state.meta.accounted_size = object.meta.accounted_size;
13630 list_state.meta.mtime = object.meta.mtime;
13631 list_state.meta.category = main_category;
13632 list_state.meta.etag = etag;
13633 list_state.meta.content_type = content_type;
13634 if (astate->obj_tag.length() > 0)
13635 list_state.tag = astate->obj_tag.c_str();
13636 list_state.meta.owner = owner.get_id().to_str();
13637 list_state.meta.owner_display_name = owner.get_display_name();
13638
13639 list_state.exists = true;
13640 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
13641 return 0;
13642}
13643
13644int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
13645{
13646 librados::IoCtx index_ctx;
13647 map<int, string> oids;
13648 map<int, struct rgw_cls_list_ret> list_results;
13649 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13650 if (r < 0)
13651 return r;
13652
13653 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13654 if (r < 0)
13655 return r;
13656
13657 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13658 for(; iter != list_results.end(); ++iter) {
13659 headers[oids[iter->first]] = iter->second.dir.header;
13660 }
13661 return 0;
13662}
13663
13664int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13665{
13666 librados::IoCtx index_ctx;
13667 map<int, string> bucket_objs;
13668 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13669 if (r < 0)
13670 return r;
13671
13672 map<int, string>::iterator iter = bucket_objs.begin();
13673 for (; iter != bucket_objs.end(); ++iter) {
13674 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13675 if (r < 0) {
13676 ctx->put();
13677 break;
13678 } else {
13679 (*num_aio)++;
13680 }
13681 }
13682 return r;
13683}
13684
13685int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13686{
13687 string buckets_obj_id;
13688 rgw_get_buckets_obj(user_id, buckets_obj_id);
13689 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13690
13691 rgw_rados_ref ref;
224ce89b 13692 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13693 if (r < 0) {
13694 return r;
13695 }
13696
13697 librados::ObjectReadOperation op;
13698 int rc;
13699 ::cls_user_get_header(op, header, &rc);
13700 bufferlist ibl;
13701 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13702 if (r < 0)
13703 return r;
13704 if (rc < 0)
13705 return rc;
13706
13707 return 0;
13708}
13709
94b18763
FG
13710int RGWRados::cls_user_reset_stats(const string& user_id)
13711{
13712 string buckets_obj_id;
13713 rgw_get_buckets_obj(user_id, buckets_obj_id);
13714 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13715
13716 rgw_rados_ref ref;
13717 int r = get_raw_obj_ref(obj, &ref);
13718 if (r < 0) {
13719 return r;
13720 }
13721
13722 librados::ObjectWriteOperation op;
13723 ::cls_user_reset_stats(op);
13724 return ref.ioctx.operate(ref.oid, &op);
13725}
13726
7c673cae
FG
13727int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13728{
13729 string buckets_obj_id;
13730 rgw_get_buckets_obj(user_id, buckets_obj_id);
13731 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13732
13733 rgw_rados_ref ref;
224ce89b 13734 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13735 if (r < 0) {
13736 return r;
13737 }
13738
13739 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13740 if (r < 0)
13741 return r;
13742
13743 return 0;
13744}
13745
13746int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13747{
13748 map<string, struct rgw_bucket_dir_header> headers;
13749 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13750 if (r < 0) {
13751 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13752 return r;
13753 }
13754
13755 cls_user_bucket_entry entry;
13756
13757 bucket_info.bucket.convert(&entry.bucket);
13758
c07f9fc5
FG
13759 for (const auto& hiter : headers) {
13760 for (const auto& iter : hiter.second.stats) {
13761 const struct rgw_bucket_category_stats& header_stats = iter.second;
7c673cae
FG
13762 entry.size += header_stats.total_size;
13763 entry.size_rounded += header_stats.total_size_rounded;
13764 entry.count += header_stats.num_entries;
13765 }
13766 }
13767
13768 list<cls_user_bucket_entry> entries;
13769 entries.push_back(entry);
13770
13771 r = cls_user_update_buckets(user_obj, entries, false);
13772 if (r < 0) {
13773 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13774 return r;
13775 }
13776
13777 return 0;
13778}
13779
c07f9fc5
FG
13780int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13781{
13782 map<string, struct rgw_bucket_dir_header> headers;
13783 RGWBucketInfo bucket_info;
13784 RGWObjectCtx obj_ctx(this);
13785 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13786 if (ret < 0) {
13787 return ret;
13788 }
13789
13790 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13791 if (ret < 0) {
13792 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13793 return ret;
13794 }
13795
13796 bucket.convert(&entry.bucket);
13797
13798 for (const auto& hiter : headers) {
13799 for (const auto& iter : hiter.second.stats) {
13800 const struct rgw_bucket_category_stats& header_stats = iter.second;
13801 entry.size += header_stats.total_size;
13802 entry.size_rounded += header_stats.total_size_rounded;
13803 entry.count += header_stats.num_entries;
13804 }
13805 }
13806
13807 return 0;
13808}
13809
7c673cae
FG
13810int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13811 const string& in_marker,
13812 const string& end_marker,
13813 const int max_entries,
13814 list<cls_user_bucket_entry>& entries,
13815 string * const out_marker,
13816 bool * const truncated)
13817{
13818 rgw_rados_ref ref;
224ce89b 13819 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13820 if (r < 0) {
13821 return r;
13822 }
13823
13824 librados::ObjectReadOperation op;
13825 int rc;
13826
13827 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13828 bufferlist ibl;
13829 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13830 if (r < 0)
13831 return r;
13832 if (rc < 0)
13833 return rc;
13834
13835 return 0;
13836}
13837
13838int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13839{
13840 rgw_rados_ref ref;
224ce89b 13841 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13842 if (r < 0) {
13843 return r;
13844 }
13845
13846 librados::ObjectWriteOperation op;
13847 cls_user_set_buckets(op, entries, add);
13848 r = ref.ioctx.operate(ref.oid, &op);
13849 if (r < 0)
13850 return r;
13851
13852 return 0;
13853}
13854
13855int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13856{
13857 string buckets_obj_id;
13858 rgw_get_buckets_obj(user_id, buckets_obj_id);
13859 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13860 return cls_user_complete_stats_sync(obj);
13861}
13862
13863int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13864{
13865 rgw_rados_ref ref;
224ce89b 13866 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13867 if (r < 0) {
13868 return r;
13869 }
13870
13871 librados::ObjectWriteOperation op;
13872 ::cls_user_complete_stats_sync(op);
13873 r = ref.ioctx.operate(ref.oid, &op);
13874 if (r < 0)
13875 return r;
13876
13877 return 0;
13878}
13879
13880int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13881{
13882 list<cls_user_bucket_entry> l;
13883 l.push_back(entry);
13884
13885 return cls_user_update_buckets(obj, l, true);
13886}
13887
13888int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13889{
7c673cae 13890 rgw_rados_ref ref;
224ce89b 13891 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
13892 if (r < 0) {
13893 return r;
13894 }
13895
13896 librados::ObjectWriteOperation op;
13897 ::cls_user_remove_bucket(op, bucket);
13898 r = ref.ioctx.operate(ref.oid, &op);
13899 if (r < 0)
13900 return r;
13901
13902 return 0;
13903}
13904
224ce89b 13905int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
31f18b77
FG
13906 RGWQuotaInfo& bucket_quota)
13907{
13908 if (!cct->_conf->rgw_dynamic_resharding) {
13909 return 0;
13910 }
13911
13912 bool need_resharding = false;
13913 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13914 uint32_t suggested_num_shards;
13915
13916 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13917 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13918 1, need_resharding, &suggested_num_shards);
13919 if (ret < 0) {
13920 return ret;
13921 }
13922
13923 if (need_resharding) {
224ce89b
WB
13924 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13925 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13926 dendl;
31f18b77
FG
13927 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13928 }
13929
13930 return ret;
13931}
13932
13933int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13934{
13935 RGWReshard reshard(this);
13936
13937 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13938
13939 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13940 if (new_num_shards <= num_source_shards) {
13941 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13942 return 0;
13943 }
13944
13945 cls_rgw_reshard_entry entry;
13946 entry.time = real_clock::now();
13947 entry.tenant = bucket_info.owner.tenant;
13948 entry.bucket_name = bucket_info.bucket.name;
13949 entry.bucket_id = bucket_info.bucket.bucket_id;
13950 entry.old_num_shards = num_source_shards;
13951 entry.new_num_shards = new_num_shards;
13952
13953 return reshard.add(entry);
13954}
13955
7c673cae
FG
13956int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13957 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13958{
13959 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13960}
13961
13962void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
1adf2230
AA
13963 uint32_t num_shards,
13964 map<int, string>& bucket_objects,
13965 int shard_id) {
7c673cae
FG
13966 if (!num_shards) {
13967 bucket_objects[0] = bucket_oid_base;
13968 } else {
13969 char buf[bucket_oid_base.size() + 32];
13970 if (shard_id < 0) {
13971 for (uint32_t i = 0; i < num_shards; ++i) {
13972 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13973 bucket_objects[i] = buf;
13974 }
13975 } else {
13976 if ((uint32_t)shard_id > num_shards) {
13977 return;
13978 }
13979 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13980 bucket_objects[shard_id] = buf;
13981 }
13982 }
13983}
13984
13985void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13986{
13987 const rgw_bucket& bucket = bucket_info.bucket;
13988 string plain_id = bucket.name + ":" + bucket.bucket_id;
13989 if (!bucket_info.num_shards) {
13990 (*result)[0] = plain_id;
13991 } else {
13992 char buf[16];
13993 if (shard_id < 0) {
13994 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13995 snprintf(buf, sizeof(buf), ":%d", i);
13996 (*result)[i] = plain_id + buf;
13997 }
13998 } else {
13999 if ((uint32_t)shard_id > bucket_info.num_shards) {
14000 return;
14001 }
14002 snprintf(buf, sizeof(buf), ":%d", shard_id);
14003 (*result)[shard_id] = plain_id + buf;
14004 }
14005 }
14006}
14007
14008int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
14009 int *shard_id)
14010{
14011 int r = 0;
14012 switch (bucket_info.bucket_index_shard_hash_type) {
14013 case RGWBucketInfo::MOD:
14014 if (!bucket_info.num_shards) {
14015 if (shard_id) {
14016 *shard_id = -1;
14017 }
14018 } else {
1adf2230 14019 uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
7c673cae
FG
14020 if (shard_id) {
14021 *shard_id = (int)sid;
14022 }
14023 }
14024 break;
14025 default:
14026 r = -ENOTSUP;
14027 }
14028 return r;
14029}
14030
14031void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
14032 int shard_id, string *bucket_obj)
14033{
14034 if (!num_shards) {
14035 // By default with no sharding, we use the bucket oid as itself
14036 (*bucket_obj) = bucket_oid_base;
14037 } else {
14038 char buf[bucket_oid_base.size() + 32];
14039 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
14040 (*bucket_obj) = buf;
14041 }
14042}
14043
14044int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
14045 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
14046{
14047 int r = 0;
14048 switch (hash_type) {
14049 case RGWBucketInfo::MOD:
14050 if (!num_shards) {
14051 // By default with no sharding, we use the bucket oid as itself
14052 (*bucket_obj) = bucket_oid_base;
14053 if (shard_id) {
14054 *shard_id = -1;
14055 }
14056 } else {
1adf2230 14057 uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
7c673cae
FG
14058 char buf[bucket_oid_base.size() + 32];
14059 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
14060 (*bucket_obj) = buf;
14061 if (shard_id) {
14062 *shard_id = (int)sid;
14063 }
14064 }
14065 break;
14066 default:
14067 r = -ENOTSUP;
14068 }
14069 return r;
14070}
14071
14072void RGWStateLog::oid_str(int shard, string& oid) {
14073 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
14074 char buf[16];
14075 snprintf(buf, sizeof(buf), "%d", shard);
14076 oid += buf;
14077}
14078
14079int RGWStateLog::get_shard_num(const string& object) {
14080 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
14081 return val % num_shards;
14082}
14083
14084string RGWStateLog::get_oid(const string& object) {
14085 int shard = get_shard_num(object);
14086 string oid;
14087 oid_str(shard, oid);
14088 return oid;
14089}
14090
14091int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
14092 rgw_pool pool;
14093 store->get_log_pool(pool);
14094 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
14095 if (r < 0) {
14096 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
14097 return r;
14098 }
14099 return 0;
14100}
14101
14102int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
14103 uint32_t state, bufferlist *bl, uint32_t *check_state)
14104{
14105 if (client_id.empty() ||
14106 op_id.empty() ||
14107 object.empty()) {
14108 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
14109 }
14110
14111 librados::IoCtx ioctx;
14112 int r = open_ioctx(ioctx);
14113 if (r < 0)
14114 return r;
14115
14116 string oid = get_oid(object);
14117
14118 librados::ObjectWriteOperation op;
14119 if (check_state) {
14120 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
14121 }
14122 utime_t ts = ceph_clock_now();
14123 bufferlist nobl;
14124 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
14125 r = ioctx.operate(oid, &op);
14126 if (r < 0) {
14127 return r;
14128 }
14129
14130 return 0;
14131}
14132
14133int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
14134{
14135 if (client_id.empty() ||
14136 op_id.empty() ||
14137 object.empty()) {
14138 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
14139 }
14140
14141 librados::IoCtx ioctx;
14142 int r = open_ioctx(ioctx);
14143 if (r < 0)
14144 return r;
14145
14146 string oid = get_oid(object);
14147
14148 librados::ObjectWriteOperation op;
14149 cls_statelog_remove_by_object(op, object, op_id);
14150 r = ioctx.operate(oid, &op);
14151 if (r < 0) {
14152 return r;
14153 }
14154
14155 return 0;
14156}
14157
14158void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
14159 void **handle)
14160{
14161 list_state *state = new list_state;
14162 state->client_id = client_id;
14163 state->op_id = op_id;
14164 state->object = object;
14165 if (object.empty()) {
14166 state->cur_shard = 0;
14167 state->max_shard = num_shards - 1;
14168 } else {
14169 state->cur_shard = state->max_shard = get_shard_num(object);
14170 }
14171 *handle = (void *)state;
14172}
14173
14174int RGWStateLog::list_entries(void *handle, int max_entries,
14175 list<cls_statelog_entry>& entries,
14176 bool *done)
14177{
14178 list_state *state = static_cast<list_state *>(handle);
14179
14180 librados::IoCtx ioctx;
14181 int r = open_ioctx(ioctx);
14182 if (r < 0)
14183 return r;
14184
14185 entries.clear();
14186
14187 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
14188 string oid;
14189 oid_str(state->cur_shard, oid);
14190
14191 librados::ObjectReadOperation op;
14192 list<cls_statelog_entry> ents;
14193 bool truncated;
14194 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
14195 max_entries, ents, &state->marker, &truncated);
14196 bufferlist ibl;
14197 r = ioctx.operate(oid, &op, &ibl);
14198 if (r == -ENOENT) {
14199 truncated = false;
14200 r = 0;
14201 }
14202 if (r < 0) {
14203 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
14204 return r;
14205 }
14206
14207 if (!truncated) {
14208 state->marker.clear();
14209 }
14210
14211 max_entries -= ents.size();
14212
14213 entries.splice(entries.end(), ents);
14214
14215 if (truncated)
14216 break;
14217 }
14218
14219 *done = (state->cur_shard > state->max_shard);
14220
14221 return 0;
14222}
14223
14224void RGWStateLog::finish_list_entries(void *handle)
14225{
14226 list_state *state = static_cast<list_state *>(handle);
14227 delete state;
14228}
14229
14230void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
14231{
14232 f->open_object_section("statelog_entry");
14233 f->dump_string("client_id", entry.client_id);
14234 f->dump_string("op_id", entry.op_id);
14235 f->dump_string("object", entry.object);
14236 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
14237 if (!dump_entry_internal(entry, f)) {
14238 f->dump_int("state", entry.state);
14239 }
14240 f->close_section();
14241}
14242
14243RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
14244{
14245}
14246
14247bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
14248{
14249 string s;
14250 switch ((OpState)entry.state) {
14251 case OPSTATE_UNKNOWN:
14252 s = "unknown";
14253 break;
14254 case OPSTATE_IN_PROGRESS:
14255 s = "in-progress";
14256 break;
14257 case OPSTATE_COMPLETE:
14258 s = "complete";
14259 break;
14260 case OPSTATE_ERROR:
14261 s = "error";
14262 break;
14263 case OPSTATE_ABORT:
14264 s = "abort";
14265 break;
14266 case OPSTATE_CANCELLED:
14267 s = "cancelled";
14268 break;
14269 default:
14270 s = "invalid";
14271 }
14272 f->dump_string("state", s);
14273 return true;
14274}
14275
14276int RGWOpState::state_from_str(const string& s, OpState *state)
14277{
14278 if (s == "unknown") {
14279 *state = OPSTATE_UNKNOWN;
14280 } else if (s == "in-progress") {
14281 *state = OPSTATE_IN_PROGRESS;
14282 } else if (s == "complete") {
14283 *state = OPSTATE_COMPLETE;
14284 } else if (s == "error") {
14285 *state = OPSTATE_ERROR;
14286 } else if (s == "abort") {
14287 *state = OPSTATE_ABORT;
14288 } else if (s == "cancelled") {
14289 *state = OPSTATE_CANCELLED;
14290 } else {
14291 return -EINVAL;
14292 }
14293
14294 return 0;
14295}
14296
14297int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
14298{
14299 uint32_t s = (uint32_t)state;
14300 return store_entry(client_id, op_id, object, s, NULL, NULL);
14301}
14302
14303int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
14304{
14305 uint32_t s = (uint32_t)state;
14306 return store_entry(client_id, op_id, object, s, NULL, &s);
14307}
14308
14309RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
14310 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
14311{
14312 cct = store->ctx();
14313 cur_state = RGWOpState::OPSTATE_UNKNOWN;
14314}
14315
14316int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
14317 last_update = real_clock::now();
14318 cur_state = state;
14319 return os.set_state(client_id, op_id, object, state);
14320}
14321
14322int RGWOpStateSingleOp::renew_state() {
14323 real_time now = real_clock::now();
14324
14325 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
14326
14327 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
14328 return 0;
14329 }
14330
14331 last_update = now;
14332 return os.renew_state(client_id, op_id, object, cur_state);
14333}
14334
14335
14336uint64_t RGWRados::instance_id()
14337{
14338 return get_rados_handle()->get_instance_id();
14339}
14340
14341uint64_t RGWRados::next_bucket_id()
14342{
14343 Mutex::Locker l(bucket_id_lock);
14344 return ++max_bucket_id;
14345}
14346
28e407b8
AA
14347RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
14348 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
7c673cae 14349{
7c673cae
FG
14350 RGWRados *store = NULL;
14351 if (!use_cache) {
14352 store = new RGWRados;
14353 } else {
28e407b8 14354 store = new RGWCache<RGWRados>;
7c673cae
FG
14355 }
14356
31f18b77 14357 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
7c673cae
FG
14358 delete store;
14359 return NULL;
14360 }
14361
14362 return store;
14363}
14364
14365RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
14366{
14367 RGWRados *store = NULL;
14368 store = new RGWRados;
14369
14370 store->set_context(cct);
14371
14372 if (store->init_rados() < 0) {
14373 delete store;
14374 return NULL;
14375 }
14376
14377 return store;
14378}
14379
14380void RGWStoreManager::close_storage(RGWRados *store)
14381{
14382 if (!store)
14383 return;
14384
14385 store->finalize();
14386
14387 delete store;
14388}
14389
14390librados::Rados* RGWRados::get_rados_handle()
14391{
14392 if (rados.size() == 1) {
14393 return &rados[0];
14394 } else {
14395 handle_lock.get_read();
14396 pthread_t id = pthread_self();
14397 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
14398
14399 if (it != rados_map.end()) {
14400 handle_lock.put_read();
14401 return &rados[it->second];
14402 } else {
14403 handle_lock.put_read();
14404 handle_lock.get_write();
14405 const uint32_t handle = next_rados_handle;
14406 rados_map[id] = handle;
14407 if (++next_rados_handle == rados.size()) {
14408 next_rados_handle = 0;
14409 }
14410 handle_lock.put_write();
14411 return &rados[handle];
14412 }
14413 }
14414}
14415
14416int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
14417{
14418 rgw_rados_ref ref;
14419 int ret = get_raw_obj_ref(obj, &ref);
14420 if (ret < 0) {
14421 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14422 return ret;
14423 }
14424
14425 ObjectWriteOperation op;
14426 list<string> prefixes;
14427 cls_rgw_remove_obj(op, prefixes);
14428
14429 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14430 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14431 if (ret < 0) {
14432 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14433 c->release();
14434 return ret;
14435 }
14436
14437 handles.push_back(c);
14438
14439 return 0;
14440}
14441
14442int RGWRados::delete_obj_aio(const rgw_obj& obj,
14443 RGWBucketInfo& bucket_info, RGWObjState *astate,
14444 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
14445{
14446 rgw_rados_ref ref;
14447 int ret = get_obj_head_ref(bucket_info, obj, &ref);
14448 if (ret < 0) {
14449 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14450 return ret;
14451 }
14452
14453 if (keep_index_consistent) {
14454 RGWRados::Bucket bop(this, bucket_info);
14455 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
14456
14457 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
14458 if (ret < 0) {
14459 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
14460 return ret;
14461 }
14462 }
14463
14464 ObjectWriteOperation op;
14465 list<string> prefixes;
14466 cls_rgw_remove_obj(op, prefixes);
14467
14468 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14469 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14470 if (ret < 0) {
14471 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14472 c->release();
14473 return ret;
14474 }
14475
14476 handles.push_back(c);
14477
14478 if (keep_index_consistent) {
14479 ret = delete_obj_index(obj);
14480 if (ret < 0) {
14481 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
14482 return ret;
14483 }
14484 }
14485 return ret;
14486}
14487
14488int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
14489 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
14490 if (value != attrs.end()) {
14491 bufferlist::iterator bliter = value->second.begin();
14492 try {
14493 ::decode(cs_info, bliter);
14494 } catch (buffer::error& err) {
14495 return -EIO;
14496 }
14497 if (cs_info.blocks.size() == 0) {
14498 return -EIO;
14499 }
14500 if (cs_info.compression_type != "none")
14501 need_decompress = true;
14502 else
14503 need_decompress = false;
14504 return 0;
14505 } else {
14506 need_decompress = false;
14507 return 0;
14508 }
14509}
14510
3a9019d9
FG
14511bool RGWRados::call(std::string command, cmdmap_t& cmdmap, std::string format,
14512 bufferlist& out)
14513{
14514 if (command == "cache list") {
14515 boost::optional<std::string> filter;
14516 auto i = cmdmap.find("filter");
14517 if (i != cmdmap.cend()) {
14518 filter = boost::get<std::string>(i->second);
14519 }
14520 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
14521 if (f) {
14522 f->open_array_section("cache_entries");
14523 call_list(filter, f.get());
14524 f->close_section();
14525 f->flush(out);
14526 return true;
14527 } else {
14528 out.append("Unable to create Formatter.\n");
14529 return false;
14530 }
14531 } else if (command == "cache inspect") {
14532 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
14533 if (f) {
14534 const auto& target = boost::get<std::string>(cmdmap["target"]);
14535 if (call_inspect(target, f.get())) {
14536 f->flush(out);
14537 return true;
14538 } else {
14539 out.append(string("Unable to find entry ") + target + string(".\n"));
14540 return false;
14541 }
14542 } else {
14543 out.append("Unable to create Formatter.\n");
14544 return false;
14545 }
14546 } else if (command == "cache erase") {
14547 const auto& target = boost::get<std::string>(cmdmap["target"]);
14548 if (call_erase(target)) {
14549 return true;
14550 } else {
14551 out.append(string("Unable to find entry ") + target + string(".\n"));
14552 return false;
14553 }
14554 } else if (command == "cache zap") {
14555 call_zap();
14556 return true;
14557 }
14558 return false;
14559}
14560
14561void RGWRados::call_list(const boost::optional<std::string>&,
14562 ceph::Formatter*)
14563{
14564 return;
14565}
14566
14567bool RGWRados::call_inspect(const std::string&, Formatter*)
14568{
14569 return false;
14570}
14571
14572bool RGWRados::call_erase(const std::string&) {
14573 return false;
14574}
14575
14576void RGWRados::call_zap() {
14577 return;
14578}