]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
bump version to 12.2.10-pve1
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
8#include <boost/algorithm/string.hpp>
9
10#include <boost/format.hpp>
11#include <boost/optional.hpp>
12#include <boost/utility/in_place_factory.hpp>
13
14#include "common/ceph_json.h"
15#include "common/utf8.h"
16
17#include "common/errno.h"
18#include "common/Formatter.h"
19#include "common/Throttle.h"
20#include "common/Finisher.h"
21
22#include "rgw_rados.h"
23#include "rgw_cache.h"
24#include "rgw_acl.h"
25#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26#include "rgw_metadata.h"
27#include "rgw_bucket.h"
28#include "rgw_rest_conn.h"
29#include "rgw_cr_rados.h"
30#include "rgw_cr_rest.h"
31
32#include "cls/rgw/cls_rgw_ops.h"
33#include "cls/rgw/cls_rgw_types.h"
34#include "cls/rgw/cls_rgw_client.h"
35#include "cls/rgw/cls_rgw_const.h"
36#include "cls/refcount/cls_refcount_client.h"
37#include "cls/version/cls_version_client.h"
38#include "cls/log/cls_log_client.h"
39#include "cls/statelog/cls_statelog_client.h"
40#include "cls/timeindex/cls_timeindex_client.h"
41#include "cls/lock/cls_lock_client.h"
42#include "cls/user/cls_user_client.h"
c07f9fc5 43#include "osd/osd_types.h"
7c673cae
FG
44
45#include "rgw_tools.h"
46#include "rgw_coroutine.h"
47#include "rgw_compression.h"
48
7c673cae
FG
49#undef fork // fails to compile RGWPeriod::fork() below
50
51#include "common/Clock.h"
52
53#include "include/rados/librados.hpp"
54using namespace librados;
55
56#include <string>
57#include <iostream>
58#include <vector>
59#include <atomic>
60#include <list>
61#include <map>
62#include "auth/Crypto.h" // get_random_bytes()
63
64#include "rgw_log.h"
65
66#include "rgw_gc.h"
67#include "rgw_lc.h"
68
69#include "rgw_object_expirer_core.h"
70#include "rgw_sync.h"
71#include "rgw_data_sync.h"
72#include "rgw_realm_watcher.h"
31f18b77 73#include "rgw_reshard.h"
7c673cae
FG
74
75#include "compressor/Compressor.h"
76
7c673cae
FG
77#define dout_context g_ceph_context
78#define dout_subsys ceph_subsys_rgw
79
80using namespace std;
81
// File-scope name constants. The values are object/pool names that end up in
// RADOS, so they are runtime data and must not be altered casually.
82static string notify_oid_prefix = "notify";
83static string *notify_oids = NULL;
84static string shadow_ns = "shadow";
85static string dir_oid_prefix = ".dir.";
86static string default_storage_pool_suffix = "rgw.buckets.data";
87static string default_bucket_index_pool_suffix = "rgw.buckets.index";
88static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
89static string avail_pools = ".pools.avail";
90
// OID prefixes / default OIDs for the system metadata objects (zone,
// zonegroup/region, realm, period). The zonegroup, realm and period ones are
// handed out by the get_*_oid*() accessors further down in this file.
91static string zone_info_oid_prefix = "zone_info.";
92static string zone_names_oid_prefix = "zone_names.";
93static string region_info_oid_prefix = "region_info.";
94static string zone_group_info_oid_prefix = "zonegroup_info.";
95static string realm_names_oid_prefix = "realms_names.";
96static string realm_info_oid_prefix = "realms.";
97static string default_region_info_oid = "default.region";
98static string default_zone_group_info_oid = "default.zonegroup";
99static string period_info_oid_prefix = "periods.";
100static string period_latest_epoch_info_oid = ".latest_epoch";
101static string region_map_oid = "region_map";
102static string zonegroup_map_oid = "zonegroup_map";
103static string log_lock_name = "rgw_log_lock";
104static string default_realm_info_oid = "default.realm";
// Names given to the zonegroup/zone created by the create_default() paths.
105const string default_zonegroup_name = "default";
106const string default_zone_name = "default";
107static string zonegroup_names_oid_prefix = "zonegroups_names.";
108static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
109#define RGW_USAGE_OBJ_PREFIX "usage."
110#define FIRST_EPOCH 1
// Fallback root pool names, used by the get_pool() accessors when the
// corresponding rgw_*_root_pool config option is empty.
111static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
112static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
113static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
114static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116#define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118#define dout_subsys ceph_subsys_rgw
119
120
121static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123{
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
31f18b77 133 *pool = placement.get_data_extra_pool();
7c673cae
FG
134 }
135 }
136
137 return true;
138}
139
140static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142{
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146}
147
148rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149{
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156}
157
158rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159{
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166}
167
168int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169{
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
28e407b8
AA
173 if (r == -ERANGE) {
174 dout(0)
175 << __func__
176 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
177 << " (this can be due to a pool or placement group misconfiguration, e.g."
178 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
179 << dendl;
180 }
7c673cae
FG
181 if (r < 0 && r != -EEXIST) {
182 return r;
183 }
184
185 r = rados->ioctx_create(pool.name.c_str(), ioctx);
c07f9fc5
FG
186 if (r < 0) {
187 return r;
188 }
189
190 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
191 if (r < 0 && r != -EOPNOTSUPP) {
192 return r;
193 }
194 } else if (r < 0) {
7c673cae
FG
195 return r;
196 }
197 if (!pool.ns.empty()) {
198 ioctx.set_namespace(pool.ns);
199 }
200 return 0;
201}
202
203template<>
204void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
205 RWLock::WLocker wl(lock);
206 auto iter = objs_state.find(obj);
207 if (iter == objs_state.end()) {
208 return;
209 }
210 bool is_atomic = iter->second.is_atomic;
211 bool prefetch_data = iter->second.prefetch_data;
212
213 objs_state.erase(iter);
214
215 if (is_atomic || prefetch_data) {
216 auto& s = objs_state[obj];
217 s.is_atomic = is_atomic;
218 s.prefetch_data = prefetch_data;
219 }
220}
221
222template<>
223void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
224 RWLock::WLocker wl(lock);
225 auto iter = objs_state.find(obj);
226 if (iter == objs_state.end()) {
227 return;
228 }
229
230 objs_state.erase(iter);
231}
232
233void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
234 encode_json("default_zonegroup", default_zonegroup, f);
235}
236
237void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
238
239 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
240 /* backward compatability with region */
241 if (default_zonegroup.empty()) {
242 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
243 }
244}
245
246rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
247{
248 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
249 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
250 }
251
252 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
253}
254
255int RGWZoneGroup::create_default(bool old_format)
256{
257 name = default_zonegroup_name;
258 is_master = true;
259
260 RGWZoneGroupPlacementTarget placement_target;
261 placement_target.name = "default-placement";
262 placement_targets[placement_target.name] = placement_target;
263 default_placement = "default-placement";
264
265 RGWZoneParams zone_params(default_zone_name);
266
267 int r = zone_params.init(cct, store, false);
268 if (r < 0) {
269 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
270 return r;
271 }
272
273 r = zone_params.create_default();
274 if (r < 0 && r != -EEXIST) {
275 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 } else if (r == -EEXIST) {
278 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
279 zone_params.clear_id();
280 r = zone_params.init(cct, store);
281 if (r < 0) {
282 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
283 return r;
284 }
285 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
286 << dendl;
287 }
288
289 RGWZone& default_zone = zones[zone_params.get_id()];
290 default_zone.name = zone_params.get_name();
291 default_zone.id = zone_params.get_id();
292 master_zone = default_zone.id;
293
294 r = create();
295 if (r < 0 && r != -EEXIST) {
296 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
297 return r;
298 }
299
300 if (r == -EEXIST) {
301 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
302 id.clear();
303 r = init(cct, store);
304 if (r < 0) {
305 return r;
306 }
307 }
308
309 if (old_format) {
310 name = id;
311 }
312
313 post_process_params();
314
315 return 0;
316}
317
318const string RGWZoneGroup::get_default_oid(bool old_region_format)
319{
320 if (old_region_format) {
321 if (cct->_conf->rgw_default_region_info_oid.empty()) {
322 return default_region_info_oid;
323 }
324 return cct->_conf->rgw_default_region_info_oid;
325 }
326
327 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
328
329 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
330 default_oid = default_zone_group_info_oid;
331 }
332
333 default_oid += "." + realm_id;
334
335 return default_oid;
336}
337
338const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
339{
340 if (old_region_format) {
341 return region_info_oid_prefix;
342 }
343 return zone_group_info_oid_prefix;
344}
345
346const string& RGWZoneGroup::get_names_oid_prefix()
347{
348 return zonegroup_names_oid_prefix;
349}
350
351const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
352 return cct->_conf->rgw_zonegroup;
353}
354
355int RGWZoneGroup::equals(const string& other_zonegroup) const
356{
357 if (is_master && other_zonegroup.empty())
358 return true;
359
360 return (id == other_zonegroup);
361}
362
363int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
364 const list<string>& endpoints, const string *ptier_type,
365 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
366{
367 auto& zone_id = zone_params.get_id();
368 auto& zone_name = zone_params.get_name();
369
370 // check for duplicate zone name on insert
371 if (!zones.count(zone_id)) {
372 for (const auto& zone : zones) {
373 if (zone.second.name == zone_name) {
374 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
375 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
376 return -EEXIST;
377 }
378 }
379 }
380
381 if (is_master) {
382 if (*is_master) {
383 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
384 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
385 }
386 master_zone = zone_params.get_id();
387 } else if (master_zone == zone_params.get_id()) {
388 master_zone.clear();
389 }
390 }
391
392 RGWZone& zone = zones[zone_params.get_id()];
393 zone.name = zone_params.get_name();
394 zone.id = zone_params.get_id();
395 if (!endpoints.empty()) {
396 zone.endpoints = endpoints;
397 }
398 if (read_only) {
399 zone.read_only = *read_only;
400 }
401 if (ptier_type) {
402 zone.tier_type = *ptier_type;
403 }
404
405 if (psync_from_all) {
406 zone.sync_from_all = *psync_from_all;
407 }
408
409 for (auto add : sync_from) {
410 zone.sync_from.insert(add);
411 }
412
413 for (auto rm : sync_from_rm) {
414 zone.sync_from.erase(rm);
415 }
416
417 post_process_params();
418
419 return update();
420}
421
422
423int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
424{
425 RGWZone& zone = zones[zone_params.get_id()];
426 zone.name = zone_params.get_name();
427
428 return update();
429}
430
431void RGWZoneGroup::post_process_params()
432{
433 bool log_data = zones.size() > 1;
434
435 if (master_zone.empty()) {
436 map<string, RGWZone>::iterator iter = zones.begin();
437 if (iter != zones.end()) {
438 master_zone = iter->first;
439 }
440 }
441
442 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
443 RGWZone& zone = iter->second;
444 zone.log_data = log_data;
7c673cae
FG
445
446 RGWZoneParams zone_params(zone.id, zone.name);
447 int ret = zone_params.init(cct, store);
448 if (ret < 0) {
449 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
450 continue;
451 }
452
453 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
454 iter != zone_params.placement_pools.end(); ++iter) {
455 const string& placement_name = iter->first;
456 if (placement_targets.find(placement_name) == placement_targets.end()) {
457 RGWZoneGroupPlacementTarget placement_target;
458 placement_target.name = placement_name;
459 placement_targets[placement_name] = placement_target;
460 }
461 }
462 }
463
464 if (default_placement.empty() && !placement_targets.empty()) {
465 default_placement = placement_targets.begin()->first;
466 }
467}
468
469int RGWZoneGroup::remove_zone(const std::string& zone_id)
470{
471 map<string, RGWZone>::iterator iter = zones.find(zone_id);
472 if (iter == zones.end()) {
473 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
474 << name << dendl;
475 return -ENOENT;
476 }
477
478 zones.erase(iter);
479
480 post_process_params();
481
482 return update();
483}
484
485int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
486{
487 if (realm_id.empty()) {
488 /* try using default realm */
489 RGWRealm realm;
490 int ret = realm.init(cct, store);
b32b8144 491 // no default realm exist
7c673cae 492 if (ret < 0) {
b32b8144 493 return read_id(default_zonegroup_name, default_id);
7c673cae
FG
494 }
495 realm_id = realm.get_id();
496 }
497
498 return RGWSystemMetaObj::read_default_id(default_id, old_format);
499}
500
501int RGWZoneGroup::set_as_default(bool exclusive)
502{
503 if (realm_id.empty()) {
504 /* try using default realm */
505 RGWRealm realm;
506 int ret = realm.init(cct, store);
507 if (ret < 0) {
508 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
509 return -EINVAL;
510 }
511 realm_id = realm.get_id();
512 }
513
514 return RGWSystemMetaObj::set_as_default(exclusive);
515}
516
517int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
518{
519 cct = _cct;
520 store = _store;
521
522 if (!setup_obj)
523 return 0;
524
525 if (old_format && id.empty()) {
526 id = name;
527 }
528
529 if (id.empty()) {
530 int r;
531 if (name.empty()) {
532 name = get_predefined_name(cct);
533 }
534 if (name.empty()) {
535 r = use_default(old_format);
536 if (r < 0) {
537 return r;
538 }
539 } else if (!old_format) {
540 r = read_id(name, id);
541 if (r < 0) {
542 if (r != -ENOENT) {
543 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
544 }
545 return r;
546 }
547 }
548 }
549
550 return read_info(id, old_format);
551}
552
553int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
554{
555 auto pool = get_pool(cct);
556 bufferlist bl;
557 RGWObjectCtx obj_ctx(store);
558 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
559 if (ret < 0)
560 return ret;
561
562 try {
563 bufferlist::iterator iter = bl.begin();
564 ::decode(default_info, iter);
565 } catch (buffer::error& err) {
566 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
567 return -EIO;
568 }
569
570 return 0;
571}
572
573int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
574{
575 RGWDefaultSystemMetaObjInfo default_info;
576
577 int ret = read_default(default_info, get_default_oid(old_format));
578 if (ret < 0) {
579 return ret;
580 }
581
582 default_id = default_info.default_id;
583
584 return 0;
585}
586
587int RGWSystemMetaObj::use_default(bool old_format)
588{
589 return read_default_id(id, old_format);
590}
591
592int RGWSystemMetaObj::set_as_default(bool exclusive)
593{
594 string oid = get_default_oid();
595
596 rgw_pool pool(get_pool(cct));
597 bufferlist bl;
598
599 RGWDefaultSystemMetaObjInfo default_info;
600 default_info.default_id = id;
601
602 ::encode(default_info, bl);
603
604 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
605 exclusive, NULL, real_time(), NULL);
606 if (ret < 0)
607 return ret;
608
609 return 0;
610}
611
612int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
613{
614 rgw_pool pool(get_pool(cct));
615 bufferlist bl;
616
617 string oid = get_names_oid_prefix() + obj_name;
618
619 RGWObjectCtx obj_ctx(store);
620 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
621 if (ret < 0) {
622 return ret;
623 }
624
625 RGWNameToId nameToId;
626 try {
627 bufferlist::iterator iter = bl.begin();
628 ::decode(nameToId, iter);
629 } catch (buffer::error& err) {
630 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
631 return -EIO;
632 }
633 object_id = nameToId.obj_id;
634 return 0;
635}
636
637int RGWSystemMetaObj::delete_obj(bool old_format)
638{
639 rgw_pool pool(get_pool(cct));
640
641 /* check to see if obj is the default */
642 RGWDefaultSystemMetaObjInfo default_info;
643 int ret = read_default(default_info, get_default_oid(old_format));
644 if (ret < 0 && ret != -ENOENT)
645 return ret;
646 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
647 string oid = get_default_oid(old_format);
648 rgw_raw_obj default_named_obj(pool, oid);
649 ret = store->delete_system_obj(default_named_obj);
650 if (ret < 0) {
651 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
652 return ret;
653 }
654 }
655 if (!old_format) {
656 string oid = get_names_oid_prefix() + name;
657 rgw_raw_obj object_name(pool, oid);
658 ret = store->delete_system_obj(object_name);
659 if (ret < 0) {
660 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
661 return ret;
662 }
663 }
664
665 string oid = get_info_oid_prefix(old_format);
666 if (old_format) {
667 oid += name;
668 } else {
669 oid += id;
670 }
671
672 rgw_raw_obj object_id(pool, oid);
673 ret = store->delete_system_obj(object_id);
674 if (ret < 0) {
675 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
676 }
677
678 return ret;
679}
680
681int RGWSystemMetaObj::store_name(bool exclusive)
682{
683 rgw_pool pool(get_pool(cct));
684 string oid = get_names_oid_prefix() + name;
685
686 RGWNameToId nameToId;
687 nameToId.obj_id = id;
688
689 bufferlist bl;
690 ::encode(nameToId, bl);
691 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
692}
693
694int RGWSystemMetaObj::rename(const string& new_name)
695{
696 string new_id;
697 int ret = read_id(new_name, new_id);
698 if (!ret) {
699 return -EEXIST;
700 }
701 if (ret < 0 && ret != -ENOENT) {
702 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 string old_name = name;
706 name = new_name;
707 ret = update();
708 if (ret < 0) {
709 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
710 return ret;
711 }
712 ret = store_name(true);
713 if (ret < 0) {
714 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
715 return ret;
716 }
717 /* delete old name */
718 rgw_pool pool(get_pool(cct));
719 string oid = get_names_oid_prefix() + old_name;
720 rgw_raw_obj old_name_obj(pool, oid);
721 ret = store->delete_system_obj(old_name_obj);
722 if (ret < 0) {
723 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
724 return ret;
725 }
726
727 return ret;
728}
729
730int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
731{
732 rgw_pool pool(get_pool(cct));
733
734 bufferlist bl;
735
736 string oid = get_info_oid_prefix(old_format) + obj_id;
737
738 RGWObjectCtx obj_ctx(store);
739 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
740 if (ret < 0) {
741 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
742 return ret;
743 }
744
745 try {
746 bufferlist::iterator iter = bl.begin();
747 ::decode(*this, iter);
748 } catch (buffer::error& err) {
749 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
750 return -EIO;
751 }
752
753 return 0;
754}
755
756int RGWSystemMetaObj::read()
757{
758 int ret = read_id(name, id);
759 if (ret < 0) {
760 return ret;
761 }
762
763 return read_info(id);
764}
765
766int RGWSystemMetaObj::create(bool exclusive)
767{
768 int ret;
769
770 /* check to see the name is not used */
771 ret = read_id(name, id);
772 if (exclusive && ret == 0) {
773 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
774 return -EEXIST;
775 } else if ( ret < 0 && ret != -ENOENT) {
776 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
777 return ret;
778 }
779
780 if (id.empty()) {
781 /* create unique id */
782 uuid_d new_uuid;
783 char uuid_str[37];
784 new_uuid.generate_random();
785 new_uuid.print(uuid_str);
786 id = uuid_str;
787 }
788
789 ret = store_info(exclusive);
790 if (ret < 0) {
791 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
792 return ret;
793 }
794
795 return store_name(exclusive);
796}
797
798int RGWSystemMetaObj::store_info(bool exclusive)
799{
800 rgw_pool pool(get_pool(cct));
801
802 string oid = get_info_oid_prefix() + id;
803
804 bufferlist bl;
805 ::encode(*this, bl);
806 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
807}
808
809int RGWSystemMetaObj::write(bool exclusive)
810{
811 int ret = store_info(exclusive);
812 if (ret < 0) {
813 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
814 return ret;
815 }
816 ret = store_name(exclusive);
817 if (ret < 0) {
818 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
819 return ret;
820 }
821 return 0;
822}
823
824
825const string& RGWRealm::get_predefined_name(CephContext *cct) {
826 return cct->_conf->rgw_realm;
827}
828
829int RGWRealm::create(bool exclusive)
830{
831 int ret = RGWSystemMetaObj::create(exclusive);
832 if (ret < 0) {
833 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
834 return ret;
835 }
836 // create the control object for watch/notify
837 ret = create_control(exclusive);
838 if (ret < 0) {
839 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
840 return ret;
841 }
842 RGWPeriod period;
843 if (current_period.empty()) {
844 /* create new period for the realm */
845 ret = period.init(cct, store, id, name, false);
846 if (ret < 0 ) {
847 return ret;
848 }
849 ret = period.create(true);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
852 return ret;
853 }
854 } else {
855 period = RGWPeriod(current_period, 0);
856 int ret = period.init(cct, store, id, name);
857 if (ret < 0) {
858 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
859 return ret;
860 }
861 }
862 ret = set_current_period(period);
863 if (ret < 0) {
864 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
865 return ret;
866 }
867 // try to set as default. may race with another create, so pass exclusive=true
868 // so we don't override an existing default
869 ret = set_as_default(true);
870 if (ret < 0 && ret != -EEXIST) {
871 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
872 }
873
874 return 0;
875}
876
877int RGWRealm::delete_obj()
878{
879 int ret = RGWSystemMetaObj::delete_obj();
880 if (ret < 0) {
881 return ret;
882 }
883 return delete_control();
884}
885
886int RGWRealm::create_control(bool exclusive)
887{
888 auto pool = rgw_pool{get_pool(cct)};
889 auto oid = get_control_oid();
890 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
891 nullptr, real_time(), nullptr);
892}
893
894int RGWRealm::delete_control()
895{
896 auto pool = rgw_pool{get_pool(cct)};
897 auto obj = rgw_raw_obj{pool, get_control_oid()};
898 return store->delete_system_obj(obj);
899}
900
901rgw_pool RGWRealm::get_pool(CephContext *cct)
902{
903 if (cct->_conf->rgw_realm_root_pool.empty()) {
904 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
905 }
906 return rgw_pool(cct->_conf->rgw_realm_root_pool);
907}
908
909const string RGWRealm::get_default_oid(bool old_format)
910{
911 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
912 return default_realm_info_oid;
913 }
914 return cct->_conf->rgw_default_realm_info_oid;
915}
916
917const string& RGWRealm::get_names_oid_prefix()
918{
919 return realm_names_oid_prefix;
920}
921
922const string& RGWRealm::get_info_oid_prefix(bool old_format)
923{
924 return realm_info_oid_prefix;
925}
926
927int RGWRealm::set_current_period(RGWPeriod& period)
928{
929 // update realm epoch to match the period's
930 if (epoch > period.get_realm_epoch()) {
931 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
932 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
933 return -EINVAL;
934 }
935 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
936 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
937 << period.get_realm_epoch() << ", but different period id "
938 << period.get_id() << " != " << current_period << dendl;
939 return -EINVAL;
940 }
941
942 epoch = period.get_realm_epoch();
943 current_period = period.get_id();
944
945 int ret = update();
946 if (ret < 0) {
947 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
948 return ret;
949 }
950
951 ret = period.reflect();
952 if (ret < 0) {
953 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
954 return ret;
955 }
956
957 return 0;
958}
959
960string RGWRealm::get_control_oid()
961{
962 return get_info_oid_prefix() + id + ".control";
963}
964
965int RGWRealm::notify_zone(bufferlist& bl)
966{
967 // open a context on the realm's pool
968 rgw_pool pool{get_pool(cct)};
969 librados::IoCtx ctx;
970 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
971 if (r < 0) {
972 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
973 return r;
974 }
975 // send a notify on the realm object
976 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
977 if (r < 0) {
978 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
979 return r;
980 }
981 return 0;
982}
983
984int RGWRealm::notify_new_period(const RGWPeriod& period)
985{
986 bufferlist bl;
987 // push the period to dependent zonegroups/zones
988 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
989 ::encode(period, bl);
990 // reload the gateway with the new period
991 ::encode(RGWRealmNotify::Reload, bl);
992
993 return notify_zone(bl);
994}
995
996std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
997{
998 if (realm_id.empty()) {
999 return "period_config.default";
1000 }
1001 return "period_config." + realm_id;
1002}
1003
1004rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
1005{
1006 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1007 if (pool_name.empty()) {
1008 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1009 }
1010 return {pool_name};
1011}
1012
1013int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1014{
1015 RGWObjectCtx obj_ctx(store);
1016 const auto& pool = get_pool(store->ctx());
1017 const auto& oid = get_oid(realm_id);
1018 bufferlist bl;
1019
1020 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1021 if (ret < 0) {
1022 return ret;
1023 }
1024 try {
1025 bufferlist::iterator iter = bl.begin();
1026 ::decode(*this, iter);
1027 } catch (buffer::error& err) {
1028 return -EIO;
1029 }
1030 return 0;
1031}
1032
1033int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1034{
1035 const auto& pool = get_pool(store->ctx());
1036 const auto& oid = get_oid(realm_id);
1037 bufferlist bl;
1038 ::encode(*this, bl);
1039 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1040 false, nullptr, real_time(), nullptr);
1041}
1042
1043int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1044 const string& period_realm_name, bool setup_obj)
1045{
1046 cct = _cct;
1047 store = _store;
1048 realm_id = period_realm_id;
1049 realm_name = period_realm_name;
1050
1051 if (!setup_obj)
1052 return 0;
1053
1054 return init(_cct, _store, setup_obj);
1055}
1056
1057
1058int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1059{
1060 cct = _cct;
1061 store = _store;
1062
1063 if (!setup_obj)
1064 return 0;
1065
1066 if (id.empty()) {
1067 RGWRealm realm(realm_id, realm_name);
1068 int ret = realm.init(cct, store);
1069 if (ret < 0) {
1070 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1071 cpp_strerror(-ret) << dendl;
1072 return ret;
1073 }
1074 id = realm.get_current_period();
1075 realm_id = realm.get_id();
1076 }
1077
1078 if (!epoch) {
1079 int ret = use_latest_epoch();
1080 if (ret < 0) {
1081 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1082 << " : " << cpp_strerror(-ret) << dendl;
1083 return ret;
1084 }
1085 }
1086
1087 return read_info();
1088}
1089
1090
1091int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1092 map<string, RGWZoneGroup>::const_iterator iter;
1093 if (!zonegroup_id.empty()) {
1094 iter = period_map.zonegroups.find(zonegroup_id);
1095 } else {
1096 iter = period_map.zonegroups.find("default");
1097 }
1098 if (iter != period_map.zonegroups.end()) {
1099 zonegroup = iter->second;
1100 return 0;
1101 }
1102
1103 return -ENOENT;
1104}
1105
7c673cae
FG
1106const string& RGWPeriod::get_latest_epoch_oid()
1107{
1108 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1109 return period_latest_epoch_info_oid;
1110 }
1111 return cct->_conf->rgw_period_latest_epoch_info_oid;
1112}
1113
1114const string& RGWPeriod::get_info_oid_prefix()
1115{
1116 return period_info_oid_prefix;
1117}
1118
1119const string RGWPeriod::get_period_oid_prefix()
1120{
1121 return get_info_oid_prefix() + id;
1122}
1123
1124const string RGWPeriod::get_period_oid()
1125{
1126 std::ostringstream oss;
1127 oss << get_period_oid_prefix();
1128 // skip the epoch for the staging period
1129 if (id != get_staging_id(realm_id))
1130 oss << "." << epoch;
1131 return oss.str();
1132}
1133
224ce89b
WB
1134int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1135 RGWObjVersionTracker *objv)
7c673cae
FG
1136{
1137 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1138
1139 rgw_pool pool(get_pool(cct));
1140 bufferlist bl;
1141 RGWObjectCtx obj_ctx(store);
224ce89b 1142 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
7c673cae
FG
1143 if (ret < 0) {
1144 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1145 return ret;
1146 }
1147 try {
1148 bufferlist::iterator iter = bl.begin();
1149 ::decode(info, iter);
1150 } catch (buffer::error& err) {
1151 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1152 return -EIO;
1153 }
1154
1155 return 0;
1156}
1157
1158int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1159{
1160 RGWPeriodLatestEpochInfo info;
1161
1162 int ret = read_latest_epoch(info);
1163 if (ret < 0) {
1164 return ret;
1165 }
1166
1167 latest_epoch = info.epoch;
1168
1169 return 0;
1170}
1171
1172int RGWPeriod::use_latest_epoch()
1173{
1174 RGWPeriodLatestEpochInfo info;
1175 int ret = read_latest_epoch(info);
1176 if (ret < 0) {
1177 return ret;
1178 }
1179
1180 epoch = info.epoch;
1181
1182 return 0;
1183}
1184
224ce89b
WB
1185int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1186 RGWObjVersionTracker *objv)
7c673cae
FG
1187{
1188 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1189
1190 rgw_pool pool(get_pool(cct));
1191 bufferlist bl;
1192
1193 RGWPeriodLatestEpochInfo info;
1194 info.epoch = epoch;
1195
1196 ::encode(info, bl);
1197
1198 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
224ce89b
WB
1199 exclusive, objv, real_time(), nullptr);
1200}
1201
1202int RGWPeriod::update_latest_epoch(epoch_t epoch)
1203{
1204 static constexpr int MAX_RETRIES = 20;
1205
1206 for (int i = 0; i < MAX_RETRIES; i++) {
1207 RGWPeriodLatestEpochInfo info;
1208 RGWObjVersionTracker objv;
1209 bool exclusive = false;
1210
1211 // read existing epoch
1212 int r = read_latest_epoch(info, &objv);
1213 if (r == -ENOENT) {
1214 // use an exclusive create to set the epoch atomically
1215 exclusive = true;
1216 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1217 << " for period=" << id << dendl;
1218 } else if (r < 0) {
1219 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1220 return r;
1221 } else if (epoch <= info.epoch) {
1222 r = -EEXIST; // fail with EEXIST if epoch is not newer
1223 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1224 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1225 return r;
1226 } else {
1227 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1228 << " -> " << epoch << " on period=" << id << dendl;
1229 }
1230
1231 r = set_latest_epoch(epoch, exclusive, &objv);
1232 if (r == -EEXIST) {
1233 continue; // exclusive create raced with another update, retry
1234 } else if (r == -ECANCELED) {
1235 continue; // write raced with a conflicting version, retry
1236 }
1237 if (r < 0) {
1238 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1239 return r;
1240 }
1241 return 0; // return success
1242 }
1243
1244 return -ECANCELED; // fail after max retries
7c673cae
FG
1245}
1246
1247int RGWPeriod::delete_obj()
1248{
1249 rgw_pool pool(get_pool(cct));
1250
1251 // delete the object for each period epoch
1252 for (epoch_t e = 1; e <= epoch; e++) {
1253 RGWPeriod p{get_id(), e};
1254 rgw_raw_obj oid{pool, p.get_period_oid()};
1255 int ret = store->delete_system_obj(oid);
1256 if (ret < 0) {
1257 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1258 << ": " << cpp_strerror(-ret) << dendl;
1259 }
1260 }
1261
1262 // delete the .latest_epoch object
1263 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1264 int ret = store->delete_system_obj(oid);
1265 if (ret < 0) {
1266 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1267 << ": " << cpp_strerror(-ret) << dendl;
1268 }
1269 return ret;
1270}
1271
1272int RGWPeriod::read_info()
1273{
1274 rgw_pool pool(get_pool(cct));
1275
1276 bufferlist bl;
1277
1278 RGWObjectCtx obj_ctx(store);
1279 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1280 if (ret < 0) {
1281 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1282 return ret;
1283 }
1284
1285 try {
1286 bufferlist::iterator iter = bl.begin();
1287 ::decode(*this, iter);
1288 } catch (buffer::error& err) {
1289 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1290 return -EIO;
1291 }
1292
1293 return 0;
1294}
1295
1296int RGWPeriod::create(bool exclusive)
1297{
1298 int ret;
1299
1300 /* create unique id */
1301 uuid_d new_uuid;
1302 char uuid_str[37];
1303 new_uuid.generate_random();
1304 new_uuid.print(uuid_str);
1305 id = uuid_str;
1306
1307 epoch = FIRST_EPOCH;
1308
1309 period_map.id = id;
1310
1311 ret = store_info(exclusive);
1312 if (ret < 0) {
1313 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
224ce89b 1314 return ret;
7c673cae
FG
1315 }
1316
1317 ret = set_latest_epoch(epoch);
1318 if (ret < 0) {
1319 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1320 }
1321
1322 return ret;
1323}
1324
1325int RGWPeriod::store_info(bool exclusive)
1326{
7c673cae
FG
1327 rgw_pool pool(get_pool(cct));
1328
1329 string oid = get_period_oid();
1330 bufferlist bl;
1331 ::encode(*this, bl);
224ce89b
WB
1332
1333 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1334 exclusive, NULL, real_time(), NULL);
7c673cae
FG
1335}
1336
1337rgw_pool RGWPeriod::get_pool(CephContext *cct)
1338{
1339 if (cct->_conf->rgw_period_root_pool.empty()) {
1340 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1341 }
1342 return rgw_pool(cct->_conf->rgw_period_root_pool);
1343}
1344
7c673cae
FG
1345int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1346{
1347 if (zonegroup.realm_id != realm_id) {
1348 return 0;
1349 }
1350 int ret = period_map.update(zonegroup, cct);
1351 if (ret < 0) {
1352 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1353 return ret;
1354 }
1355
1356 return store_info(false);
1357}
1358
1359int RGWPeriod::update()
1360{
1361 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1362 list<string> zonegroups;
1363 int ret = store->list_zonegroups(zonegroups);
1364 if (ret < 0) {
1365 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1366 return ret;
1367 }
1368
1369 // clear zone short ids of removed zones. period_map.update() will add the
1370 // remaining zones back
1371 period_map.short_zone_ids.clear();
1372
1373 for (auto& iter : zonegroups) {
1374 RGWZoneGroup zg(string(), iter);
1375 ret = zg.init(cct, store);
1376 if (ret < 0) {
1377 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1378 continue;
1379 }
1380
1381 if (zg.realm_id != realm_id) {
1382 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1383 continue;
1384 }
1385
1386 if (zg.master_zone.empty()) {
1387 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1388 return -EINVAL;
1389 }
1390
1391 if (zg.is_master_zonegroup()) {
1392 master_zonegroup = zg.get_id();
1393 master_zone = zg.master_zone;
1394 }
1395
1396 int ret = period_map.update(zg, cct);
1397 if (ret < 0) {
1398 return ret;
1399 }
1400 }
1401
1402 ret = period_config.read(store, realm_id);
1403 if (ret < 0 && ret != -ENOENT) {
1404 ldout(cct, 0) << "ERROR: failed to read period config: "
1405 << cpp_strerror(ret) << dendl;
1406 return ret;
1407 }
1408 return 0;
1409}
1410
1411int RGWPeriod::reflect()
1412{
1413 for (auto& iter : period_map.zonegroups) {
1414 RGWZoneGroup& zg = iter.second;
1415 zg.reinit_instance(cct, store);
1416 int r = zg.write(false);
1417 if (r < 0) {
1418 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1419 return r;
1420 }
1421 if (zg.is_master_zonegroup()) {
1422 // set master as default if no default exists
1423 r = zg.set_as_default(true);
1424 if (r == 0) {
1425 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1426 << " as the default" << dendl;
1427 }
1428 }
1429 }
1430
1431 int r = period_config.write(store, realm_id);
1432 if (r < 0) {
1433 ldout(cct, 0) << "ERROR: failed to store period config: "
1434 << cpp_strerror(-r) << dendl;
1435 return r;
1436 }
1437 return 0;
1438}
1439
1440void RGWPeriod::fork()
1441{
1442 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1443 predecessor_uuid = id;
1444 id = get_staging_id(realm_id);
1445 period_map.reset();
1446 realm_epoch++;
1447}
1448
1449static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1450{
1451 // initialize a sync status manager to read the status
1452 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1453 int r = mgr.init();
1454 if (r < 0) {
1455 return r;
1456 }
1457 r = mgr.read_sync_status(sync_status);
1458 mgr.stop();
1459 return r;
1460}
1461
1462int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1463 std::ostream& error_stream,
1464 bool force_if_stale)
1465{
1466 rgw_meta_sync_status status;
1467 int r = read_sync_status(store, &status);
1468 if (r < 0) {
1469 ldout(cct, 0) << "period failed to read sync status: "
1470 << cpp_strerror(-r) << dendl;
1471 return r;
1472 }
1473
1474 std::vector<std::string> markers;
1475
1476 const auto current_epoch = current_period.get_realm_epoch();
1477 if (current_epoch != status.sync_info.realm_epoch) {
1478 // no sync status markers for the current period
1479 assert(current_epoch > status.sync_info.realm_epoch);
1480 const int behind = current_epoch - status.sync_info.realm_epoch;
1481 if (!force_if_stale && current_epoch > 1) {
1482 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1483 "the current master zone in metadata sync. If this zone is promoted "
1484 "to master, any metadata changes during that time are likely to "
1485 "be lost.\n"
1486 "Waiting for this zone to catch up on metadata sync (see "
1487 "'radosgw-admin sync status') is recommended.\n"
1488 "To promote this zone to master anyway, add the flag "
1489 "--yes-i-really-mean-it." << std::endl;
1490 return -EINVAL;
1491 }
1492 // empty sync status markers - other zones will skip this period during
1493 // incremental metadata sync
1494 markers.resize(status.sync_info.num_shards);
1495 } else {
1496 markers.reserve(status.sync_info.num_shards);
1497 for (auto& i : status.sync_markers) {
1498 auto& marker = i.second;
1499 // filter out markers from other periods
1500 if (marker.realm_epoch != current_epoch) {
1501 marker.marker.clear();
1502 }
1503 markers.emplace_back(std::move(marker.marker));
1504 }
1505 }
1506
1507 std::swap(sync_status, markers);
1508 return 0;
1509}
1510
/*
 * Commit this (staging) period on top of current_period.
 *
 * Validation, in order: the local zone must be this period's master zone,
 * predecessor_uuid must equal the current period's id, and realm_epoch must
 * be exactly one past the current period's realm epoch. Each failure writes
 * an explanation to error_stream and returns -EINVAL.
 *
 * Then one of two paths is taken:
 *  - master zone changed: the sync status is captured (force_if_stale is
 *    forwarded to update_sync_status()), a new period object with a new id
 *    is created exclusively, the realm is pointed at it and a notification
 *    is sent.
 *  - master zone unchanged: the period takes over the current period's id,
 *    predecessor and realm epoch with epoch + 1, is stored, recorded as the
 *    latest epoch, reflected to local objects, and a notification is sent.
 *
 * Returns 0 on success (including when update_latest_epoch() reports
 * -EEXIST), negative errno otherwise.
 */
int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
                      std::ostream& error_stream, bool force_if_stale)
{
  ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
  // gateway must be in the master zone to commit
  if (master_zone != store->get_zone_params().get_id()) {
    error_stream << "Cannot commit period on zone "
        << store->get_zone_params().get_id() << ", it must be sent to "
        "the period's master zone " << master_zone << '.' << std::endl;
    return -EINVAL;
  }
  // period predecessor must match current period
  if (predecessor_uuid != current_period.get_id()) {
    error_stream << "Period predecessor " << predecessor_uuid
        << " does not match current period " << current_period.get_id()
        << ". Use 'period pull' to get the latest period from the master, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // realm epoch must be 1 greater than current period
  if (realm_epoch != current_period.get_realm_epoch() + 1) {
    error_stream << "Period's realm epoch " << realm_epoch
        << " does not come directly after current realm epoch "
        << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
        "latest realm and period from the master zone, reapply your changes, "
        "and try again." << std::endl;
    return -EINVAL;
  }
  // did the master zone change?
  if (master_zone != current_period.get_master_zone()) {
    // store the current metadata sync status in the period
    int r = update_sync_status(current_period, error_stream, force_if_stale);
    if (r < 0) {
      ldout(cct, 0) << "failed to update metadata sync status: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    // create an object with a new period id
    r = create(true);
    if (r < 0) {
      ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
      return r;
    }
    // set as current period
    r = realm.set_current_period(*this);
    if (r < 0) {
      ldout(cct, 0) << "failed to update realm's current period: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    ldout(cct, 4) << "Promoted to master zone and committed new period "
        << id << dendl;
    realm.notify_new_period(*this);
    return 0;
  }
  // period must be based on current epoch
  if (epoch != current_period.get_epoch()) {
    error_stream << "Period epoch " << epoch << " does not match "
        "predecessor epoch " << current_period.get_epoch()
        << ". Use 'period pull' to get the latest epoch from the master zone, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // set period as next epoch
  // (this undoes the id/predecessor/realm_epoch values checked above: the
  // result is a new epoch of the current period, not a new period)
  set_id(current_period.get_id());
  set_epoch(current_period.get_epoch() + 1);
  set_predecessor(current_period.get_predecessor());
  realm_epoch = current_period.get_realm_epoch();
  // write the period to rados
  int r = store_info(false);
  if (r < 0) {
    ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
    return r;
  }
  // set as latest epoch
  r = update_latest_epoch(epoch);
  if (r == -EEXIST) {
    // already have this epoch (or a more recent one)
    // note: returns before reflect() and notify_new_period()
    return 0;
  }
  if (r < 0) {
    ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
    return r;
  }
  r = reflect();
  if (r < 0) {
    ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
    return r;
  }
  ldout(cct, 4) << "Committed new epoch " << epoch
      << " for period " << id << dendl;
  realm.notify_new_period(*this);
  return 0;
}
1605
1606int RGWZoneParams::create_default(bool old_format)
1607{
1608 name = default_zone_name;
1609
1610 int r = create();
1611 if (r < 0) {
1612 return r;
1613 }
1614
1615 if (old_format) {
1616 name = id;
1617 }
1618
1619 return r;
1620}
1621
1622
1623int get_zones_pool_set(CephContext* cct,
1624 RGWRados* store,
1625 const list<string>& zones,
1626 const string& my_zone_id,
1627 set<rgw_pool>& pool_names)
1628{
1629 for(auto const& iter : zones) {
1630 RGWZoneParams zone(iter);
1631 int r = zone.init(cct, store);
1632 if (r < 0) {
1633 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1634 return r;
1635 }
1636 if (zone.get_id() != my_zone_id) {
1637 pool_names.insert(zone.domain_root);
1638 pool_names.insert(zone.metadata_heap);
1639 pool_names.insert(zone.control_pool);
1640 pool_names.insert(zone.gc_pool);
1641 pool_names.insert(zone.log_pool);
1642 pool_names.insert(zone.intent_log_pool);
1643 pool_names.insert(zone.usage_log_pool);
1644 pool_names.insert(zone.user_keys_pool);
1645 pool_names.insert(zone.user_email_pool);
1646 pool_names.insert(zone.user_swift_pool);
1647 pool_names.insert(zone.user_uid_pool);
1648 pool_names.insert(zone.roles_pool);
31f18b77 1649 pool_names.insert(zone.reshard_pool);
7c673cae
FG
1650 for(auto& iter : zone.placement_pools) {
1651 pool_names.insert(iter.second.index_pool);
1652 pool_names.insert(iter.second.data_pool);
1653 pool_names.insert(iter.second.data_extra_pool);
1654 }
1655 }
1656 }
1657 return 0;
1658}
1659
1660rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1661 const string& default_prefix,
1662 const string& default_suffix,
1663 const rgw_pool& suggested_pool)
1664{
1665 string suggested_name = suggested_pool.to_str();
1666
1667 string prefix = default_prefix;
1668 string suffix = default_suffix;
1669
1670 if (!suggested_pool.empty()) {
1671 prefix = suggested_name.substr(0, suggested_name.find("."));
1672 suffix = suggested_name.substr(prefix.length());
1673 }
1674
1675 rgw_pool pool(prefix + suffix);
1676
1677 if (pools.find(pool) == pools.end()) {
1678 return pool;
1679 } else {
1680 while(true) {
1681 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1682 if (pools.find(pool) == pools.end()) {
1683 return pool;
1684 }
1685 }
1686 }
1687}
1688
1689int RGWZoneParams::fix_pool_names()
1690{
1691
1692 list<string> zones;
1693 int r = store->list_zones(zones);
1694 if (r < 0) {
1695 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1696 }
1697
1698 set<rgw_pool> pools;
1699 r = get_zones_pool_set(cct, store, zones, id, pools);
1700 if (r < 0) {
1701 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1702 return r;
1703 }
1704
1705 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1706 if (!metadata_heap.name.empty()) {
1707 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1708 }
1709 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1710 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1711 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1712 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1713 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1714 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1715 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1716 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1717 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1718 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1719 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
31f18b77 1720 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
7c673cae
FG
1721
1722 for(auto& iter : placement_pools) {
1723 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1724 iter.second.index_pool);
1725 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1726 iter.second.data_pool);
1727 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1728 iter.second.data_extra_pool);
1729 }
1730
1731 return 0;
1732}
1733
1734int RGWZoneParams::create(bool exclusive)
1735{
1736 /* check for old pools config */
1737 rgw_raw_obj obj(domain_root, avail_pools);
1738 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1739 if (r < 0) {
1740 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1741 /* a new system, let's set new placement info */
1742 RGWZonePlacementInfo default_placement;
1743 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1744 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1745 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1746 placement_pools["default-placement"] = default_placement;
1747 }
1748
1749 r = fix_pool_names();
1750 if (r < 0) {
1751 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1752 return r;
1753 }
1754
1755 r = RGWSystemMetaObj::create(exclusive);
1756 if (r < 0) {
1757 return r;
1758 }
1759
1760 // try to set as default. may race with another create, so pass exclusive=true
1761 // so we don't override an existing default
1762 r = set_as_default(true);
1763 if (r < 0 && r != -EEXIST) {
1764 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1765 }
1766
1767 return 0;
1768}
1769
1770rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1771{
1772 if (cct->_conf->rgw_zone_root_pool.empty()) {
1773 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1774 }
1775
1776 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1777}
1778
1779const string RGWZoneParams::get_default_oid(bool old_format)
1780{
1781 if (old_format) {
1782 return cct->_conf->rgw_default_zone_info_oid;
1783 }
1784
1785 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1786}
1787
1788const string& RGWZoneParams::get_names_oid_prefix()
1789{
1790 return zone_names_oid_prefix;
1791}
1792
1793const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1794{
1795 return zone_info_oid_prefix;
1796}
1797
1798const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1799 return cct->_conf->rgw_zone;
1800}
1801
1802int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1803{
1804 if (name.empty()) {
1805 name = cct->_conf->rgw_zone;
1806 }
1807
1808 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1809}
1810
1811int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1812{
1813 if (realm_id.empty()) {
1814 /* try using default realm */
1815 RGWRealm realm;
1816 int ret = realm.init(cct, store);
b32b8144 1817 //no default realm exist
7c673cae 1818 if (ret < 0) {
b32b8144 1819 return read_id(default_zone_name, default_id);
7c673cae
FG
1820 }
1821 realm_id = realm.get_id();
1822 }
1823
1824 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1825}
1826
1827
1828int RGWZoneParams::set_as_default(bool exclusive)
1829{
1830 if (realm_id.empty()) {
1831 /* try using default realm */
1832 RGWRealm realm;
1833 int ret = realm.init(cct, store);
1834 if (ret < 0) {
1835 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1836 return -EINVAL;
1837 }
1838 realm_id = realm.get_id();
1839 }
1840
1841 return RGWSystemMetaObj::set_as_default(exclusive);
1842}
1843
1844const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1845{
1846 static const std::string NONE{"none"};
1847 auto p = placement_pools.find(placement_rule);
1848 if (p == placement_pools.end()) {
1849 return NONE;
1850 }
1851 const auto& type = p->second.compression_type;
1852 return !type.empty() ? type : NONE;
1853}
1854
/*
 * Serialize the period map. Fields are written in the order decode() reads
 * them back; short_zone_ids is the field decode() gates on struct_v >= 2.
 */
void RGWPeriodMap::encode(bufferlist& bl) const {
  ENCODE_START(2, 1, bl);
  ::encode(id, bl);
  ::encode(zonegroups, bl);
  ::encode(master_zonegroup, bl);
  ::encode(short_zone_ids, bl);
  ENCODE_FINISH(bl);
}
1863
1864void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1865 DECODE_START(2, bl);
1866 ::decode(id, bl);
1867 ::decode(zonegroups, bl);
1868 ::decode(master_zonegroup, bl);
1869 if (struct_v >= 2) {
1870 ::decode(short_zone_ids, bl);
1871 }
1872 DECODE_FINISH(bl);
1873
1874 zonegroups_by_api.clear();
1875 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1876 iter != zonegroups.end(); ++iter) {
1877 RGWZoneGroup& zonegroup = iter->second;
1878 zonegroups_by_api[zonegroup.api_name] = zonegroup;
31f18b77 1879 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
1880 master_zonegroup = zonegroup.get_id();
1881 }
1882 }
1883}
1884
1885// run an MD5 hash on the zone_id and return the first 32 bits
1886static uint32_t gen_short_zone_id(const std::string zone_id)
1887{
1888 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1889 MD5 hash;
1890 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1891 hash.Final(md5);
1892
1893 uint32_t short_id;
1894 memcpy((char *)&short_id, md5, sizeof(short_id));
1895 return std::max(short_id, 1u);
1896}
1897
1898int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1899{
31f18b77 1900 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
7c673cae
FG
1901 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1902 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1903 return -EINVAL;
1904 }
1905 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1906 if (iter != zonegroups.end()) {
1907 RGWZoneGroup& old_zonegroup = iter->second;
1908 if (!old_zonegroup.api_name.empty()) {
1909 zonegroups_by_api.erase(old_zonegroup.api_name);
1910 }
1911 }
1912 zonegroups[zonegroup.get_id()] = zonegroup;
1913
1914 if (!zonegroup.api_name.empty()) {
1915 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1916 }
1917
31f18b77 1918 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
1919 master_zonegroup = zonegroup.get_id();
1920 } else if (master_zonegroup == zonegroup.get_id()) {
1921 master_zonegroup = "";
1922 }
1923
1924 for (auto& i : zonegroup.zones) {
1925 auto& zone = i.second;
1926 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1927 continue;
1928 }
1929 // calculate the zone's short id
1930 uint32_t short_id = gen_short_zone_id(zone.id);
1931
1932 // search for an existing zone with the same short id
1933 for (auto& s : short_zone_ids) {
1934 if (s.second == short_id) {
1935 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1936 << ") generates the same short_zone_id " << short_id
1937 << " as existing zone id " << s.first << dendl;
1938 return -EEXIST;
1939 }
1940 }
1941
1942 short_zone_ids[zone.id] = short_id;
1943 }
1944
1945 return 0;
1946}
1947
1948uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1949{
1950 auto i = short_zone_ids.find(zone_id);
1951 if (i == short_zone_ids.end()) {
1952 return 0;
1953 }
1954 return i->second;
1955}
1956
1957int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1958{
1959
1960 RGWPeriod period;
1961 int ret = period.init(cct, store);
1962 if (ret < 0) {
1963 cerr << "failed to read current period info: " << cpp_strerror(ret);
1964 return ret;
1965 }
1966
1967 bucket_quota = period.get_config().bucket_quota;
1968 user_quota = period.get_config().user_quota;
1969 zonegroups = period.get_map().zonegroups;
1970 zonegroups_by_api = period.get_map().zonegroups_by_api;
1971 master_zonegroup = period.get_map().master_zonegroup;
1972
1973 return 0;
1974}
1975
/*
 * Serialize the region map. Fields are written in the order decode() reads
 * them back; decode() gates bucket_quota on struct_v >= 2 and user_quota on
 * struct_v >= 3.
 */
void RGWRegionMap::encode(bufferlist& bl) const {
  ENCODE_START( 3, 1, bl);
  ::encode(regions, bl);
  ::encode(master_region, bl);
  ::encode(bucket_quota, bl);
  ::encode(user_quota, bl);
  ENCODE_FINISH(bl);
}
1984
1985void RGWRegionMap::decode(bufferlist::iterator& bl) {
1986 DECODE_START(3, bl);
1987 ::decode(regions, bl);
1988 ::decode(master_region, bl);
1989 if (struct_v >= 2)
1990 ::decode(bucket_quota, bl);
1991 if (struct_v >= 3)
1992 ::decode(user_quota, bl);
1993 DECODE_FINISH(bl);
1994}
1995
/*
 * Serialize the zonegroup map. Fields are written in the order decode()
 * reads them back; decode() gates bucket_quota on struct_v >= 2 and
 * user_quota on struct_v >= 3. zonegroups_by_api is not written here;
 * decode() rebuilds it from zonegroups.
 */
void RGWZoneGroupMap::encode(bufferlist& bl) const {
  ENCODE_START( 3, 1, bl);
  ::encode(zonegroups, bl);
  ::encode(master_zonegroup, bl);
  ::encode(bucket_quota, bl);
  ::encode(user_quota, bl);
  ENCODE_FINISH(bl);
}
2004
2005void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
2006 DECODE_START(3, bl);
2007 ::decode(zonegroups, bl);
2008 ::decode(master_zonegroup, bl);
2009 if (struct_v >= 2)
2010 ::decode(bucket_quota, bl);
2011 if (struct_v >= 3)
2012 ::decode(user_quota, bl);
2013 DECODE_FINISH(bl);
2014
2015 zonegroups_by_api.clear();
2016 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2017 iter != zonegroups.end(); ++iter) {
2018 RGWZoneGroup& zonegroup = iter->second;
2019 zonegroups_by_api[zonegroup.api_name] = zonegroup;
31f18b77 2020 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
2021 master_zonegroup = zonegroup.get_name();
2022 }
2023 }
2024}
2025
2026void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2027{
2028 obj_version *check_objv = version_for_check();
2029
2030 if (check_objv) {
2031 cls_version_check(*op, *check_objv, VER_COND_EQ);
2032 }
2033
2034 cls_version_read(*op, &read_version);
2035}
2036
2037void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2038{
2039 obj_version *check_objv = version_for_check();
2040 obj_version *modify_version = version_for_write();
2041
2042 if (check_objv) {
2043 cls_version_check(*op, *check_objv, VER_COND_EQ);
2044 }
2045
2046 if (modify_version) {
2047 cls_version_set(*op, *modify_version);
2048 } else {
2049 cls_version_inc(*op);
2050 }
2051}
2052
/*
 * Advance the iterator to the next piece of the object.
 *
 * Explicit-objects manifests simply step through manifest->objs; reaching
 * the end parks the iterator at ofs == obj_size.
 *
 * Rule-based manifests: nothing happens if the iterator is already at the
 * end (ofs == obj_size) or there are no rules. From the head, the iterator
 * moves to the first stripe after the head. Otherwise it advances by one
 * stripe, and for multipart rules (part_size > 0) rolls over to the next
 * part - and possibly the next rule - once the stripe offset leaves the
 * current part. The resulting offset is clamped to obj_size.
 */
void RGWObjManifest::obj_iterator::operator++()
{
  if (manifest->explicit_objs) {
    ++explicit_iter;

    if (explicit_iter == manifest->objs.end()) {
      ofs = manifest->obj_size;
      return;
    }

    update_explicit_pos();

    update_location();
    return;
  }

  uint64_t obj_size = manifest->get_obj_size();
  uint64_t head_size = manifest->get_head_size();

  // already at the end: incrementing is a no-op
  if (ofs == obj_size) {
    return;
  }

  if (manifest->rules.empty()) {
    return;
  }

  /* are we still pointing at the head? */
  if (ofs < head_size) {
    rule_iter = manifest->rules.begin();
    RGWObjManifestRule *rule = &rule_iter->second;
    ofs = MIN(head_size, obj_size);
    stripe_ofs = ofs;
    cur_stripe = 1;
    stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
    if (rule->part_size > 0) {
      stripe_size = MIN(stripe_size, rule->part_size);
    }
    update_location();
    return;
  }

  RGWObjManifestRule *rule = &rule_iter->second;

  // step to the next stripe under the current rule
  stripe_ofs += rule->stripe_max_size;
  cur_stripe++;
  dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;

  if (rule->part_size > 0) {
    /* multi part, multi stripes object */

    dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;

    if (stripe_ofs >= part_ofs + rule->part_size) {
      /* moved to the next part */
      cur_stripe = 0;
      part_ofs += rule->part_size;
      stripe_ofs = part_ofs;

      bool last_rule = (next_rule_iter == manifest->rules.end());
      /* move to the next rule? */
      if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
        rule_iter = next_rule_iter;
        // next_rule_iter is unchanged since the check above, so this
        // re-evaluates to false and the increment below always runs
        last_rule = (next_rule_iter == manifest->rules.end());
        if (!last_rule) {
          ++next_rule_iter;
        }
        cur_part_id = rule_iter->second.start_part_num;
      } else {
        cur_part_id++;
      }

      rule = &rule_iter->second;
    }

    // a stripe never extends past the end of its part
    stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
  }

  cur_override_prefix = rule->override_prefix;

  ofs = stripe_ofs;
  if (ofs > obj_size) {
    // stepped past the end of the object: clamp to the end position
    ofs = obj_size;
    stripe_ofs = ofs;
    stripe_size = 0;
  }

  dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
  update_location();
}
2143
2144int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2145{
2146 manifest = _m;
2147
2148 manifest->set_tail_placement(placement_rule, _b);
2149 manifest->set_head(placement_rule, _obj, 0);
2150 last_ofs = 0;
2151
2152 if (manifest->get_prefix().empty()) {
2153 char buf[33];
2154 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2155
2156 string oid_prefix = ".";
2157 oid_prefix.append(buf);
2158 oid_prefix.append("_");
2159
2160 manifest->set_prefix(oid_prefix);
2161 }
2162
2163 bool found = manifest->get_rule(0, &rule);
2164 if (!found) {
2165 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2166 return -EIO;
2167 }
2168
2169 uint64_t head_size = manifest->get_head_size();
2170
2171 if (head_size > 0) {
2172 cur_stripe_size = head_size;
2173 } else {
2174 cur_stripe_size = rule.stripe_max_size;
2175 }
2176
2177 cur_part_id = rule.start_part_num;
2178
2179 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2180
2181 // Normal object which not generated through copy operation
2182 manifest->set_tail_instance(_obj.key.instance);
2183
2184 manifest->update_iterators();
2185
2186 return 0;
2187}
2188
2189int RGWObjManifest::generator::create_next(uint64_t ofs)
2190{
2191 if (ofs < last_ofs) /* only going forward */
2192 return -EINVAL;
2193
2194 uint64_t max_head_size = manifest->get_max_head_size();
2195
2196 if (ofs < max_head_size) {
2197 manifest->set_head_size(ofs);
2198 }
2199
2200 if (ofs >= max_head_size) {
2201 manifest->set_head_size(max_head_size);
2202 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2203 cur_stripe_size = rule.stripe_max_size;
2204
2205 if (cur_part_id == 0 && max_head_size > 0) {
2206 cur_stripe++;
2207 }
2208 }
2209
2210 last_ofs = ofs;
2211 manifest->set_obj_size(ofs);
2212
2213 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2214
2215 manifest->update_iterators();
2216
2217 return 0;
2218}
2219
2220const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2221{
2222 return begin_iter;
2223}
2224
2225const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2226{
2227 return end_iter;
2228}
2229
2230RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2231{
2232 if (ofs > obj_size) {
2233 ofs = obj_size;
2234 }
2235 RGWObjManifest::obj_iterator iter(this);
2236 iter.seek(ofs);
2237 return iter;
2238}
2239
2240int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2241{
2242 if (explicit_objs || m.explicit_objs) {
2243 return append_explicit(m, zonegroup, zone_params);
2244 }
2245
2246 if (rules.empty()) {
2247 *this = m;
2248 return 0;
2249 }
2250
2251 string override_prefix;
2252
2253 if (prefix.empty()) {
2254 prefix = m.prefix;
2255 }
2256
2257 if (prefix != m.prefix) {
2258 override_prefix = m.prefix;
2259 }
2260
2261 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2262 if (miter == m.rules.end()) {
2263 return append_explicit(m, zonegroup, zone_params);
2264 }
2265
2266 for (; miter != m.rules.end(); ++miter) {
2267 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2268
2269 RGWObjManifestRule& rule = last_rule->second;
2270
2271 if (rule.part_size == 0) {
2272 rule.part_size = obj_size - rule.start_ofs;
2273 }
2274
2275 RGWObjManifestRule& next_rule = miter->second;
2276 if (!next_rule.part_size) {
2277 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2278 }
2279
2280 string rule_prefix = prefix;
2281 if (!rule.override_prefix.empty()) {
2282 rule_prefix = rule.override_prefix;
2283 }
2284
2285 string next_rule_prefix = m.prefix;
2286 if (!next_rule.override_prefix.empty()) {
2287 next_rule_prefix = next_rule.override_prefix;
2288 }
2289
2290 if (rule.part_size != next_rule.part_size ||
2291 rule.stripe_max_size != next_rule.stripe_max_size ||
2292 rule_prefix != next_rule_prefix) {
2293 if (next_rule_prefix != prefix) {
2294 append_rules(m, miter, &next_rule_prefix);
2295 } else {
2296 append_rules(m, miter, NULL);
2297 }
2298 break;
2299 }
2300
2301 uint64_t expected_part_num = rule.start_part_num + 1;
2302 if (rule.part_size > 0) {
2303 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2304 }
2305
2306 if (expected_part_num != next_rule.start_part_num) {
2307 append_rules(m, miter, NULL);
2308 break;
2309 }
2310 }
2311
2312 set_obj_size(obj_size + m.obj_size);
2313
2314 return 0;
2315}
2316
2317int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2318{
2319 return append(m, store->get_zonegroup(), store->get_zone_params());
2320}
2321
2322void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2323 string *override_prefix)
2324{
2325 for (; miter != m.rules.end(); ++miter) {
2326 RGWObjManifestRule rule = miter->second;
2327 rule.start_ofs += obj_size;
2328 if (override_prefix)
2329 rule.override_prefix = *override_prefix;
2330 rules[rule.start_ofs] = rule;
2331 }
2332}
2333
2334void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2335{
2336 if (explicit_objs) {
2337 return;
2338 }
2339 obj_iterator iter = obj_begin();
2340
2341 while (iter != obj_end()) {
2342 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2343 const rgw_obj_select& os = iter.get_location();
2344 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2345 part.loc_ofs = 0;
2346
2347 uint64_t ofs = iter.get_stripe_ofs();
2348
2349 if (ofs == 0) {
2350 part.loc = obj;
2351 } else {
2352 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2353 }
2354 ++iter;
2355 uint64_t next_ofs = iter.get_stripe_ofs();
2356
2357 part.size = next_ofs - ofs;
2358 }
2359
2360 explicit_objs = true;
2361 rules.clear();
2362 prefix.clear();
2363}
2364
2365int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2366{
2367 if (!explicit_objs) {
2368 convert_to_explicit(zonegroup, zone_params);
2369 }
2370 if (!m.explicit_objs) {
2371 m.convert_to_explicit(zonegroup, zone_params);
2372 }
2373 map<uint64_t, RGWObjManifestPart>::iterator iter;
2374 uint64_t base = obj_size;
2375 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2376 RGWObjManifestPart& part = iter->second;
2377 objs[base + iter->first] = part;
2378 }
2379 obj_size += m.obj_size;
2380
2381 return 0;
2382}
2383
2384bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2385{
2386 if (rules.empty()) {
2387 return false;
2388 }
2389
2390 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2391 if (iter != rules.begin()) {
2392 --iter;
2393 }
2394
2395 *rule = iter->second;
2396
2397 return true;
2398}
2399
2400void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2401{
2402 write_version.ver = 1;
2403#define TAG_LEN 24
2404
2405 write_version.tag.clear();
2406 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2407}
2408
2409int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2410 real_time *mtime, real_time set_mtime,
2411 map<string, bufferlist>& attrs, real_time delete_at,
31f18b77
FG
2412 const char *if_match, const char *if_nomatch, const string *user_data,
2413 rgw_zone_set *zones_trace)
7c673cae 2414{
31f18b77 2415 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
7c673cae
FG
2416 if (r < 0)
2417 return r;
2418
2419 is_complete = !canceled;
2420 return 0;
2421}
2422
2423CephContext *RGWPutObjProcessor::ctx()
2424{
2425 return store->ctx();
2426}
2427
2428RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2429{
2430 drain_pending();
2431
2432 if (is_complete)
2433 return;
2434
2435 set<rgw_raw_obj>::iterator iter;
2436 bool need_to_remove_head = false;
2437 rgw_raw_obj raw_head;
2438
2439 if (!head_obj.empty()) {
2440 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2441 }
2442
2443 /**
2444 * We should delete the object in the "multipart" namespace to avoid race condition.
2445 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2446 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2447 * written by the second upload may be deleted by the first upload.
2448 * details is describled on #11749
2449 *
2450 * The above comment still stands, but instead of searching for a specific object in the multipart
2451 * namespace, we just make sure that we remove the object that is marked as the head object after
2452 * we remove all the other raw objects. Note that we use different call to remove the head object,
2453 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2454 */
2455 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2456 const rgw_raw_obj& obj = *iter;
2457 if (!head_obj.empty() && obj == raw_head) {
2458 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2459 need_to_remove_head = true;
2460 continue;
2461 }
2462
2463 int r = store->delete_raw_obj(obj);
2464 if (r < 0 && r != -ENOENT) {
2465 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2466 }
2467 }
2468
2469 if (need_to_remove_head) {
2470 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2471 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2472 if (r < 0 && r != -ENOENT) {
2473 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2474 }
2475 }
2476}
2477
2478int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2479{
2480 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2481 obj_len = abs_ofs + bl.length();
2482
2483 if (!(obj == last_written_obj)) {
2484 last_written_obj = obj;
2485 }
2486
2487 // For the first call pass -1 as the offset to
2488 // do a write_full.
2489 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2490}
2491
2492struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2493{
2494 struct put_obj_aio_info info;
2495 info = pending.front();
2496 pending.pop_front();
2497 pending_size -= info.size;
2498 return info;
2499}
2500
2501int RGWPutObjProcessor_Aio::wait_pending_front()
2502{
2503 if (pending.empty()) {
2504 return 0;
2505 }
2506 struct put_obj_aio_info info = pop_pending();
2507 int ret = store->aio_wait(info.handle);
2508
2509 if (ret >= 0) {
2510 add_written_obj(info.obj);
2511 }
2512
2513 return ret;
2514}
2515
2516bool RGWPutObjProcessor_Aio::pending_has_completed()
2517{
2518 if (pending.empty())
2519 return false;
2520
2521 struct put_obj_aio_info& info = pending.front();
2522 return store->aio_completed(info.handle);
2523}
2524
2525int RGWPutObjProcessor_Aio::drain_pending()
2526{
2527 int ret = 0;
2528 while (!pending.empty()) {
2529 int r = wait_pending_front();
2530 if (r < 0)
2531 ret = r;
2532 }
2533 return ret;
2534}
2535
2536int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2537{
2538 bool _wait = need_to_wait;
2539
2540 if (handle) {
2541 struct put_obj_aio_info info;
2542 info.handle = handle;
2543 info.obj = obj;
2544 info.size = size;
2545 pending_size += size;
2546 pending.push_back(info);
2547 }
2548 size_t orig_size = pending_size;
2549
2550 /* first drain complete IOs */
2551 while (pending_has_completed()) {
2552 int r = wait_pending_front();
2553 if (r < 0)
2554 return r;
2555
2556 _wait = false;
2557 }
2558
2559 /* resize window in case messages are draining too fast */
2560 if (orig_size - pending_size >= window_size) {
2561 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2562 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2563 if (window_size > max_window_size) {
2564 window_size = max_window_size;
2565 }
2566 }
2567
2568 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2569 if (pending_size > window_size || _wait) {
2570 int r = wait_pending_front();
2571 if (r < 0)
2572 return r;
2573 }
2574 return 0;
2575}
2576
2577int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2578{
2579 if (ofs >= next_part_ofs) {
2580 int r = prepare_next_part(ofs);
2581 if (r < 0) {
2582 return r;
2583 }
2584 }
2585
2586 *pobj = cur_obj;
2587
224ce89b
WB
2588 if (!bl.length()) {
2589 *phandle = nullptr;
7c673cae 2590 return 0;
224ce89b 2591 }
7c673cae
FG
2592
2593 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2594}
2595
2596int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2597{
2598 RGWPutObjProcessor::prepare(store, oid_rand);
2599
2600 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2601
2602 return 0;
2603}
2604
2605int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2606{
2607 *phandle = NULL;
2608 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2609
2610 pending_data_bl.claim_append(bl);
2611 if (pending_data_bl.length() < max_write_size) {
2612 *again = false;
2613 return 0;
2614 }
2615
2616 pending_data_bl.splice(0, max_write_size, &bl);
2617
2618 /* do we have enough data pending accumulated that needs to be written? */
2619 *again = (pending_data_bl.length() >= max_chunk_size);
2620
2621 if (!data_ofs && !immutable_head()) {
2622 first_chunk.claim(bl);
2623 obj_len = (uint64_t)first_chunk.length();
2624 int r = prepare_next_part(obj_len);
2625 if (r < 0) {
2626 return r;
2627 }
2628 data_ofs = obj_len;
2629 return 0;
2630 }
2631 off_t write_ofs = data_ofs;
2632 data_ofs = write_ofs + bl.length();
2633 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2634 we could be racing with another upload, to the same
2635 object and cleanup can be messy */
2636 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2637 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2638 bl.clear();
2639 }
2640 return ret;
2641}
2642
2643
2644int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2645{
2646 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2647
2648 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2649 if (r < 0) {
2650 return r;
2651 }
2652
2653 return 0;
2654}
2655
2656int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2657{
2658 head_obj.init(bucket, obj_str);
2659
2660 int r = prepare_init(store, oid_rand);
2661 if (r < 0) {
2662 return r;
2663 }
2664
2665 if (!version_id.empty()) {
2666 head_obj.key.set_instance(version_id);
2667 } else if (versioned_object) {
2668 store->gen_rand_obj_instance_name(&head_obj);
2669 }
2670
2671 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2672
2673 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2674 if (r < 0) {
2675 return r;
2676 }
2677
2678 return 0;
2679}
2680
2681int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2682
2683 int ret = manifest_gen.create_next(ofs);
2684 if (ret < 0) {
2685 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2686 return ret;
2687 }
2688 cur_part_ofs = ofs;
2689 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2690 cur_obj = manifest_gen.get_cur_obj(store);
2691
2692 return 0;
2693}
2694
2695int RGWPutObjProcessor_Atomic::complete_parts()
2696{
2697 if (obj_len > (uint64_t)cur_part_ofs) {
2698 return prepare_next_part(obj_len);
2699 }
2700 return 0;
2701}
2702
2703int RGWPutObjProcessor_Atomic::complete_writing_data()
2704{
2705 if (!data_ofs && !immutable_head()) {
2706 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2707 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2708 * clobber first_chunk
2709 */
2710 if (pending_data_bl.length() > 0) {
2711 first_chunk.claim(pending_data_bl);
2712 }
2713 obj_len = (uint64_t)first_chunk.length();
2714 }
2715 while (pending_data_bl.length()) {
224ce89b 2716 void *handle = nullptr;
7c673cae
FG
2717 rgw_raw_obj obj;
2718 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2719 if (max_write_size > pending_data_bl.length()) {
2720 max_write_size = pending_data_bl.length();
2721 }
2722 bufferlist bl;
2723 pending_data_bl.splice(0, max_write_size, &bl);
2724 uint64_t write_len = bl.length();
2725 int r = write_data(bl, data_ofs, &handle, &obj, false);
2726 if (r < 0) {
2727 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2728 return r;
2729 }
2730 data_ofs += write_len;
2731 r = throttle_data(handle, obj, write_len, false);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2734 return r;
2735 }
2736
2737 if (data_ofs >= next_part_ofs) {
2738 r = prepare_next_part(data_ofs);
2739 if (r < 0) {
2740 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2741 return r;
2742 }
2743 }
2744 }
2745 int r = complete_parts();
2746 if (r < 0) {
2747 return r;
2748 }
2749
2750 r = drain_pending();
2751 if (r < 0)
2752 return r;
2753
2754 return 0;
2755}
2756
2757int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2758 real_time *mtime, real_time set_mtime,
2759 map<string, bufferlist>& attrs,
2760 real_time delete_at,
2761 const char *if_match,
31f18b77
FG
2762 const char *if_nomatch, const string *user_data,
2763 rgw_zone_set *zones_trace) {
7c673cae
FG
2764 int r = complete_writing_data();
2765 if (r < 0)
2766 return r;
2767
2768 obj_ctx.obj.set_atomic(head_obj);
2769
2770 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2771
2772 /* some object types shouldn't be versioned, e.g., multipart parts */
2773 op_target.set_versioning_disabled(!versioned_object);
2774
2775 RGWRados::Object::Write obj_op(&op_target);
2776
2777 obj_op.meta.data = &first_chunk;
2778 obj_op.meta.manifest = &manifest;
2779 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2780 obj_op.meta.if_match = if_match;
2781 obj_op.meta.if_nomatch = if_nomatch;
2782 obj_op.meta.mtime = mtime;
2783 obj_op.meta.set_mtime = set_mtime;
2784 obj_op.meta.owner = bucket_info.owner;
2785 obj_op.meta.flags = PUT_OBJ_CREATE;
2786 obj_op.meta.olh_epoch = olh_epoch;
2787 obj_op.meta.delete_at = delete_at;
2788 obj_op.meta.user_data = user_data;
31f18b77 2789 obj_op.meta.zones_trace = zones_trace;
181888fb 2790 obj_op.meta.modify_tail = true;
7c673cae
FG
2791
2792 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2793 if (r < 0) {
2794 return r;
2795 }
2796
2797 canceled = obj_op.meta.canceled;
2798
2799 return 0;
2800}
2801
3a9019d9
FG
2802const char* RGWRados::admin_commands[4][3] = {
2803 { "cache list",
2804 "cache list name=filter,type=CephString,req=false",
2805 "cache list [filter_str]: list object cache, possibly matching substrings" },
2806 { "cache inspect",
2807 "cache inspect name=target,type=CephString,req=true",
2808 "cache inspect target: print cache element" },
2809 { "cache erase",
2810 "cache erase name=target,type=CephString,req=true",
2811 "cache erase target: erase element from cache" },
2812 { "cache zap",
2813 "cache zap",
2814 "cache zap: erase all elements from cache" }
2815};
2816
2817
7c673cae
FG
2818int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2819 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2820 if (r < 0)
2821 return r;
2822 return 0;
2823}
2824
2825int RGWRados::unwatch(uint64_t watch_handle)
2826{
2827 int r = control_pool_ctx.unwatch2(watch_handle);
2828 if (r < 0) {
2829 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2830 return r;
2831 }
2832 r = rados[0].watch_flush();
2833 if (r < 0) {
2834 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2835 return r;
2836 }
2837 return 0;
2838}
2839
2840void RGWRados::add_watcher(int i)
2841{
2842 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2843 Mutex::Locker l(watchers_lock);
2844 watchers_set.insert(i);
2845 if (watchers_set.size() == (size_t)num_watchers) {
2846 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2847 set_cache_enabled(true);
2848 }
2849}
2850
2851void RGWRados::remove_watcher(int i)
2852{
2853 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2854 Mutex::Locker l(watchers_lock);
2855 size_t orig_size = watchers_set.size();
2856 watchers_set.erase(i);
2857 if (orig_size == (size_t)num_watchers &&
2858 watchers_set.size() < orig_size) { /* actually removed */
2859 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2860 set_cache_enabled(false);
2861 }
2862}
2863
2864class RGWWatcher : public librados::WatchCtx2 {
2865 RGWRados *rados;
2866 int index;
2867 string oid;
2868 uint64_t watch_handle;
2869
2870 class C_ReinitWatch : public Context {
2871 RGWWatcher *watcher;
2872 public:
2873 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2874 void finish(int r) override {
2875 watcher->reinit();
2876 }
2877 };
2878public:
2879 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2880 void handle_notify(uint64_t notify_id,
2881 uint64_t cookie,
2882 uint64_t notifier_id,
2883 bufferlist& bl) override {
2884 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2885 << " notify_id " << notify_id
2886 << " cookie " << cookie
2887 << " notifier " << notifier_id
2888 << " bl.length()=" << bl.length() << dendl;
2889 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2890
2891 bufferlist reply_bl; // empty reply payload
2892 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2893 }
2894 void handle_error(uint64_t cookie, int err) override {
2895 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2896 << " err " << cpp_strerror(err) << dendl;
2897 rados->remove_watcher(index);
2898 rados->schedule_context(new C_ReinitWatch(this));
2899 }
2900
2901 void reinit() {
2902 int ret = unregister_watch();
2903 if (ret < 0) {
2904 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2905 return;
2906 }
2907 ret = register_watch();
2908 if (ret < 0) {
2909 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2910 return;
2911 }
2912 }
2913
2914 int unregister_watch() {
2915 int r = rados->unwatch(watch_handle);
2916 if (r < 0) {
2917 return r;
2918 }
2919 rados->remove_watcher(index);
2920 return 0;
2921 }
2922
2923 int register_watch() {
2924 int r = rados->watch(oid, &watch_handle, this);
2925 if (r < 0) {
2926 return r;
2927 }
2928 rados->add_watcher(index);
2929 return 0;
2930 }
2931};
2932
2933class RGWMetaNotifierManager : public RGWCoroutinesManager {
2934 RGWRados *store;
2935 RGWHTTPManager http_manager;
2936
2937public:
2938 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2939 http_manager(store->ctx(), completion_mgr) {
2940 http_manager.set_threaded();
2941 }
2942
2943 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2944 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2945 { "notify", NULL },
2946 { NULL, NULL } };
2947
2948 list<RGWCoroutinesStack *> stacks;
2949 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2950 RGWRESTConn *conn = iter->second;
2951 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2952 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2953
2954 stacks.push_back(stack);
2955 }
2956 return run(stacks);
2957 }
2958};
2959
2960class RGWDataNotifierManager : public RGWCoroutinesManager {
2961 RGWRados *store;
2962 RGWHTTPManager http_manager;
2963
2964public:
2965 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2966 http_manager(store->ctx(), completion_mgr) {
2967 http_manager.set_threaded();
2968 }
2969
2970 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2971 rgw_http_param_pair pairs[] = { { "type", "data" },
2972 { "notify", NULL },
2973 { "source-zone", store->get_zone_params().get_id().c_str() },
2974 { NULL, NULL } };
2975
2976 list<RGWCoroutinesStack *> stacks;
2977 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2978 RGWRESTConn *conn = iter->second;
2979 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2980 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2981
2982 stacks.push_back(stack);
2983 }
2984 return run(stacks);
2985 }
2986};
2987
2988class RGWRadosThread {
2989 class Worker : public Thread {
2990 CephContext *cct;
2991 RGWRadosThread *processor;
2992 Mutex lock;
2993 Cond cond;
2994
31f18b77
FG
2995 void wait() {
2996 Mutex::Locker l(lock);
2997 cond.Wait(lock);
2998 };
2999
3000 void wait_interval(const utime_t& wait_time) {
3001 Mutex::Locker l(lock);
3002 cond.WaitInterval(lock, wait_time);
3003 }
3004
7c673cae
FG
3005 public:
3006 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
3007 void *entry() override;
31f18b77 3008 void signal() {
7c673cae
FG
3009 Mutex::Locker l(lock);
3010 cond.Signal();
3011 }
3012 };
3013
3014 Worker *worker;
3015
3016protected:
3017 CephContext *cct;
3018 RGWRados *store;
3019
3020 std::atomic<bool> down_flag = { false };
3021
3022 string thread_name;
3023
3024 virtual uint64_t interval_msec() = 0;
3025 virtual void stop_process() {}
3026public:
3027 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3028 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3029 virtual ~RGWRadosThread() {
3030 stop();
3031 }
3032
3033 virtual int init() { return 0; }
3034 virtual int process() = 0;
3035
3036 bool going_down() { return down_flag; }
3037
3038 void start();
3039 void stop();
31f18b77
FG
3040
3041 void signal() {
3042 if (worker) {
3043 worker->signal();
3044 }
3045 }
7c673cae
FG
3046};
3047
3048void RGWRadosThread::start()
3049{
3050 worker = new Worker(cct, this);
3051 worker->create(thread_name.c_str());
3052}
3053
3054void RGWRadosThread::stop()
3055{
3056 down_flag = true;
3057 stop_process();
3058 if (worker) {
31f18b77 3059 worker->signal();
7c673cae
FG
3060 worker->join();
3061 }
3062 delete worker;
3063 worker = NULL;
3064}
3065
3066void *RGWRadosThread::Worker::entry() {
3067 uint64_t msec = processor->interval_msec();
3068 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3069
3070 do {
3071 utime_t start = ceph_clock_now();
3072 int r = processor->process();
3073 if (r < 0) {
3074 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3075 }
3076
3077 if (processor->going_down())
3078 break;
3079
3080 utime_t end = ceph_clock_now();
3081 end -= start;
3082
3083 uint64_t cur_msec = processor->interval_msec();
3084 if (cur_msec != msec) { /* was it reconfigured? */
3085 msec = cur_msec;
3086 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3087 }
3088
3089 if (cur_msec > 0) {
3090 if (interval <= end)
3091 continue; // next round
3092
3093 utime_t wait_time = interval;
3094 wait_time -= end;
3095
31f18b77 3096 wait_interval(wait_time);
7c673cae 3097 } else {
31f18b77 3098 wait();
7c673cae
FG
3099 }
3100 } while (!processor->going_down());
3101
3102 return NULL;
3103}
3104
3105class RGWMetaNotifier : public RGWRadosThread {
3106 RGWMetaNotifierManager notify_mgr;
3107 RGWMetadataLog *const log;
3108
3109 uint64_t interval_msec() override {
3110 return cct->_conf->rgw_md_notify_interval_msec;
3111 }
1adf2230
AA
3112 void stop_process() override {
3113 notify_mgr.stop();
3114 }
7c673cae
FG
3115public:
3116 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3117 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3118
3119 int process() override;
3120};
3121
3122int RGWMetaNotifier::process()
3123{
3124 set<int> shards;
3125
3126 log->read_clear_modified(shards);
3127
3128 if (shards.empty()) {
3129 return 0;
3130 }
3131
3132 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3133 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3134 }
3135
3136 notify_mgr.notify_all(store->zone_conn_map, shards);
3137
3138 return 0;
3139}
3140
3141class RGWDataNotifier : public RGWRadosThread {
3142 RGWDataNotifierManager notify_mgr;
3143
3144 uint64_t interval_msec() override {
d2e6a577 3145 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 3146 }
1adf2230
AA
3147 void stop_process() override {
3148 notify_mgr.stop();
3149 }
7c673cae
FG
3150public:
3151 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3152
3153 int process() override;
3154};
3155
3156int RGWDataNotifier::process()
3157{
3158 if (!store->data_log) {
3159 return 0;
3160 }
3161
3162 map<int, set<string> > shards;
3163
3164 store->data_log->read_clear_modified(shards);
3165
3166 if (shards.empty()) {
3167 return 0;
3168 }
3169
3170 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3171 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3172 }
3173
3174 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3175
3176 return 0;
3177}
3178
3179class RGWSyncProcessorThread : public RGWRadosThread {
3180public:
3181 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3182 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3183 ~RGWSyncProcessorThread() override {}
3184 int init() override = 0 ;
3185 int process() override = 0;
3186};
3187
3188class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3189{
3190 RGWMetaSyncStatusManager sync;
3191
3192 uint64_t interval_msec() override {
3193 return 0; /* no interval associated, it'll run once until stopped */
3194 }
3195 void stop_process() override {
3196 sync.stop();
3197 }
3198public:
3199 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3200 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3201
3202 void wakeup_sync_shards(set<int>& shard_ids) {
3203 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3204 sync.wakeup(*iter);
3205 }
3206 }
3207 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3208
3209 int init() override {
3210 int ret = sync.init();
3211 if (ret < 0) {
3212 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3213 return ret;
3214 }
3215 return 0;
3216 }
3217
3218 int process() override {
3219 sync.run();
3220 return 0;
3221 }
3222};
3223
3224class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3225{
3226 RGWDataSyncStatusManager sync;
3227 bool initialized;
3228
3229 uint64_t interval_msec() override {
3230 if (initialized) {
3231 return 0; /* no interval associated, it'll run once until stopped */
3232 } else {
3233#define DATA_SYNC_INIT_WAIT_SEC 20
3234 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3235 }
3236 }
3237 void stop_process() override {
3238 sync.stop();
3239 }
3240public:
3241 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
91327a77 3242 const string& _source_zone)
b32b8144 3243 : RGWSyncProcessorThread(_store, "data-sync"),
91327a77 3244 sync(_store, async_rados, _source_zone),
7c673cae
FG
3245 initialized(false) {}
3246
3247 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3248 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3249 sync.wakeup(iter->first, iter->second);
3250 }
3251 }
3252 RGWDataSyncStatusManager* get_manager() { return &sync; }
3253
3254 int init() override {
3255 return 0;
3256 }
3257
3258 int process() override {
3259 while (!initialized) {
3260 if (going_down()) {
3261 return 0;
3262 }
3263 int ret = sync.init();
3264 if (ret >= 0) {
3265 initialized = true;
3266 break;
3267 }
3268 /* we'll be back! */
3269 return 0;
3270 }
3271 sync.run();
3272 return 0;
3273 }
3274};
3275
3276class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3277{
3278 RGWCoroutinesManager crs;
3279 RGWRados *store;
b32b8144 3280 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
3281 RGWHTTPManager http;
3282 const utime_t trim_interval;
3283
3284 uint64_t interval_msec() override { return 0; }
3285 void stop_process() override { crs.stop(); }
3286public:
b32b8144
FG
3287 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
3288 int interval)
7c673cae
FG
3289 : RGWSyncProcessorThread(store, "sync-log-trim"),
3290 crs(store->ctx(), store->get_cr_registry()), store(store),
b32b8144 3291 bucket_trim(bucket_trim),
7c673cae
FG
3292 http(store->ctx(), crs.get_completion_mgr()),
3293 trim_interval(interval, 0)
3294 {}
3295
3296 int init() override {
3297 return http.set_threaded();
3298 }
3299 int process() override {
3300 list<RGWCoroutinesStack*> stacks;
3301 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3302 meta->call(create_meta_log_trim_cr(store, &http,
3303 cct->_conf->rgw_md_log_max_shards,
3304 trim_interval));
3305 stacks.push_back(meta);
3306
3307 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3308 data->call(create_data_log_trim_cr(store, &http,
3309 cct->_conf->rgw_data_log_num_shards,
3310 trim_interval));
3311 stacks.push_back(data);
3312
b32b8144
FG
3313 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
3314 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
3315 stacks.push_back(bucket);
3316
7c673cae
FG
3317 crs.run(stacks);
3318 return 0;
3319 }
3320};
3321
3322void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3323{
3324 Mutex::Locker l(meta_sync_thread_lock);
3325 if (meta_sync_processor_thread) {
3326 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3327 }
3328}
3329
3330void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3331{
3332 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3333 Mutex::Locker l(data_sync_thread_lock);
3334 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3335 if (iter == data_sync_processor_threads.end()) {
3336 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3337 return;
3338 }
3339
3340 RGWDataSyncProcessorThread *thread = iter->second;
3341 assert(thread);
3342 thread->wakeup_sync_shards(shard_ids);
3343}
3344
3345RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3346{
3347 Mutex::Locker l(meta_sync_thread_lock);
3348 if (meta_sync_processor_thread) {
3349 return meta_sync_processor_thread->get_manager();
3350 }
3351 return nullptr;
3352}
3353
3354RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3355{
3356 Mutex::Locker l(data_sync_thread_lock);
3357 auto thread = data_sync_processor_threads.find(source_zone);
3358 if (thread == data_sync_processor_threads.end()) {
3359 return nullptr;
3360 }
3361 return thread->second->get_manager();
3362}
3363
3364int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3365{
3366 IoCtx ioctx;
3367 int r = open_pool_ctx(pool, ioctx);
3368 if (r < 0) {
3369 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3370 return r;
3371 }
3372
3373 bool requires;
3374 r = ioctx.pool_requires_alignment2(&requires);
3375 if (r < 0) {
3376 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3377 << r << dendl;
3378 return r;
3379 }
3380
3381 if (!requires) {
3382 *alignment = 0;
3383 return 0;
3384 }
3385
3386 uint64_t align;
3387 r = ioctx.pool_required_alignment2(&align);
3388 if (r < 0) {
3389 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3390 << r << dendl;
3391 return r;
3392 }
3393 if (align != 0) {
3394 ldout(cct, 20) << "required alignment=" << align << dendl;
3395 }
3396 *alignment = align;
3397 return 0;
3398}
3399
3400int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3401{
224ce89b 3402 uint64_t alignment = 0;
7c673cae
FG
3403 int r = get_required_alignment(pool, &alignment);
3404 if (r < 0) {
3405 return r;
3406 }
3407
3408 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3409
3410 if (alignment == 0) {
3411 *max_chunk_size = config_chunk_size;
3412 return 0;
3413 }
3414
3415 if (config_chunk_size <= alignment) {
3416 *max_chunk_size = alignment;
3417 return 0;
3418 }
3419
3420 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3421
3422 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3423
3424 return 0;
3425}
3426
3427int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3428{
3429 rgw_pool pool;
3430 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3431 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3432 return -EIO;
3433 }
3434 return get_max_chunk_size(pool, max_chunk_size);
3435}
3436
31f18b77
FG
3437class RGWIndexCompletionManager;
3438
3439struct complete_op_data {
3440 Mutex lock{"complete_op_data"};
3441 AioCompletion *rados_completion{nullptr};
3442 int manager_shard_id{-1};
3443 RGWIndexCompletionManager *manager{nullptr};
3444 rgw_obj obj;
3445 RGWModifyOp op;
3446 string tag;
3447 rgw_bucket_entry_ver ver;
3448 cls_rgw_obj_key key;
3449 rgw_bucket_dir_entry_meta dir_meta;
3450 list<cls_rgw_obj_key> remove_objs;
3451 bool log_op;
3452 uint16_t bilog_op;
3453 rgw_zone_set zones_trace;
3454
3455 bool stopped{false};
3456
3457 void stop() {
3458 Mutex::Locker l(lock);
3459 stopped = true;
3460 }
3461};
3462
3463class RGWIndexCompletionThread : public RGWRadosThread {
3464 RGWRados *store;
3465
3466 uint64_t interval_msec() override {
3467 return 0;
3468 }
3469
3470 list<complete_op_data *> completions;
3471
3472 Mutex completions_lock;
3473public:
3474 RGWIndexCompletionThread(RGWRados *_store)
3475 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3476
3477 int process() override;
3478
3479 void add_completion(complete_op_data *completion) {
3480 {
3481 Mutex::Locker l(completions_lock);
3482 completions.push_back(completion);
3483 }
3484
3485 signal();
3486 }
3487};
3488
3489int RGWIndexCompletionThread::process()
3490{
3491 list<complete_op_data *> comps;
3492
3493 {
3494 Mutex::Locker l(completions_lock);
3495 completions.swap(comps);
3496 }
3497
3498 for (auto c : comps) {
3499 std::unique_ptr<complete_op_data> up{c};
3500
3501 if (going_down()) {
3502 continue;
3503 }
3504 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3505
3506 RGWRados::BucketShard bs(store);
3507
3508 int r = bs.init(c->obj.bucket, c->obj);
3509 if (r < 0) {
3510 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3511 /* not much to do */
3512 continue;
3513 }
3514
3515 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3516 librados::ObjectWriteOperation o;
3517 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3518 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3519 c->log_op, c->bilog_op, &c->zones_trace);
3520
3521 return bs->index_ctx.operate(bs->bucket_obj, &o);
3522 });
3523 if (r < 0) {
3524 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3525 /* ignoring error, can't do anything about it */
3526 continue;
3527 }
3528 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3529 if (r < 0) {
3530 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3531 }
3532 }
3533
3534 return 0;
3535}
3536
3537class RGWIndexCompletionManager {
3538 RGWRados *store{nullptr};
3539 vector<Mutex *> locks;
3540 vector<set<complete_op_data *> > completions;
3541
3542 RGWIndexCompletionThread *completion_thread{nullptr};
3543
3544 int num_shards;
3545
3546 std::atomic<int> cur_shard {0};
3547
3548
3549public:
3550 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3551 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3552
3553 for (int i = 0; i < num_shards; i++) {
3554 char buf[64];
3555 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3556 locks.push_back(new Mutex(buf));
3557 }
3558
3559 completions.resize(num_shards);
3560 }
3561 ~RGWIndexCompletionManager() {
3562 stop();
3563
3564 for (auto l : locks) {
3565 delete l;
3566 }
3567 }
3568
3569 int next_shard() {
3570 int result = cur_shard % num_shards;
3571 cur_shard++;
3572 return result;
3573 }
3574
3575 void create_completion(const rgw_obj& obj,
3576 RGWModifyOp op, string& tag,
3577 rgw_bucket_entry_ver& ver,
3578 const cls_rgw_obj_key& key,
3579 rgw_bucket_dir_entry_meta& dir_meta,
3580 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3581 uint16_t bilog_op,
3582 rgw_zone_set *zones_trace,
3583 complete_op_data **result);
3584 bool handle_completion(completion_t cb, complete_op_data *arg);
3585
3586 int start() {
3587 completion_thread = new RGWIndexCompletionThread(store);
3588 int ret = completion_thread->init();
3589 if (ret < 0) {
3590 return ret;
3591 }
3592 completion_thread->start();
3593 return 0;
3594 }
3595 void stop() {
3596 if (completion_thread) {
3597 completion_thread->stop();
3598 delete completion_thread;
3599 }
3600
3601 for (int i = 0; i < num_shards; ++i) {
3602 Mutex::Locker l(*locks[i]);
3603 for (auto c : completions[i]) {
31f18b77
FG
3604 c->stop();
3605 }
3606 }
3607 completions.clear();
3608 }
3609};
3610
3611static void obj_complete_cb(completion_t cb, void *arg)
3612{
3613 complete_op_data *completion = (complete_op_data *)arg;
3614 completion->lock.Lock();
3615 if (completion->stopped) {
3616 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3617 delete completion;
3618 return;
3619 }
3620 bool need_delete = completion->manager->handle_completion(cb, completion);
3621 completion->lock.Unlock();
3622 if (need_delete) {
3623 delete completion;
3624 }
3625}
3626
3627
3628void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3629 RGWModifyOp op, string& tag,
3630 rgw_bucket_entry_ver& ver,
3631 const cls_rgw_obj_key& key,
3632 rgw_bucket_dir_entry_meta& dir_meta,
3633 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3634 uint16_t bilog_op,
3635 rgw_zone_set *zones_trace,
3636 complete_op_data **result)
3637{
3638 complete_op_data *entry = new complete_op_data;
3639
3640 int shard_id = next_shard();
3641
3642 entry->manager_shard_id = shard_id;
3643 entry->manager = this;
3644 entry->obj = obj;
3645 entry->op = op;
3646 entry->tag = tag;
3647 entry->ver = ver;
3648 entry->key = key;
3649 entry->dir_meta = dir_meta;
3650 entry->log_op = log_op;
3651 entry->bilog_op = bilog_op;
3652
3653 if (remove_objs) {
3654 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3655 entry->remove_objs.push_back(*iter);
3656 }
3657 }
3658
3659 if (zones_trace) {
3660 entry->zones_trace = *zones_trace;
3661 } else {
3662 entry->zones_trace.insert(store->get_zone().id);
3663 }
3664
3665 *result = entry;
3666
3667 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3668
3669 Mutex::Locker l(*locks[shard_id]);
3670 completions[shard_id].insert(entry);
3671}
3672
3673bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3674{
3675 int shard_id = arg->manager_shard_id;
3676 {
3677 Mutex::Locker l(*locks[shard_id]);
3678
3679 auto& comps = completions[shard_id];
3680
3681 auto iter = comps.find(arg);
3682 if (iter == comps.end()) {
3683 return true;
3684 }
3685
3686 comps.erase(iter);
3687 }
3688
3689 int r = rados_aio_get_return_value(cb);
3690 if (r != -ERR_BUSY_RESHARDING) {
3691 return true;
3692 }
3693 completion_thread->add_completion(arg);
3694 return false;
3695}
3696
7c673cae
FG
3697void RGWRados::finalize()
3698{
3a9019d9
FG
3699 auto admin_socket = cct->get_admin_socket();
3700 for (auto cmd : admin_commands) {
3701 int r = admin_socket->unregister_command(cmd[0]);
3702 if (r < 0) {
3703 lderr(cct) << "ERROR: fail to unregister admin socket command (r=" << r
3704 << ")" << dendl;
3705 }
3706 }
3707
7c673cae
FG
3708 if (run_sync_thread) {
3709 Mutex::Locker l(meta_sync_thread_lock);
3710 meta_sync_processor_thread->stop();
3711
3712 Mutex::Locker dl(data_sync_thread_lock);
3713 for (auto iter : data_sync_processor_threads) {
3714 RGWDataSyncProcessorThread *thread = iter.second;
3715 thread->stop();
3716 }
3717 if (sync_log_trimmer) {
3718 sync_log_trimmer->stop();
3719 }
3720 }
3721 if (async_rados) {
3722 async_rados->stop();
3723 }
3724 if (run_sync_thread) {
3725 delete meta_sync_processor_thread;
3726 meta_sync_processor_thread = NULL;
3727 Mutex::Locker dl(data_sync_thread_lock);
3728 for (auto iter : data_sync_processor_threads) {
3729 RGWDataSyncProcessorThread *thread = iter.second;
3730 delete thread;
3731 }
3732 data_sync_processor_threads.clear();
3733 delete sync_log_trimmer;
3734 sync_log_trimmer = nullptr;
b32b8144 3735 bucket_trim = boost::none;
7c673cae
FG
3736 }
3737 if (finisher) {
3738 finisher->stop();
3739 }
3740 if (need_watch_notify()) {
3741 finalize_watch();
3742 }
3743 if (finisher) {
3744 /* delete finisher only after cleaning up watches, as watch error path might call
3745 * into finisher. We stop finisher before finalizing watch to make sure we don't
3746 * actually handle any racing work
3747 */
3748 delete finisher;
3749 }
3750 if (meta_notifier) {
3751 meta_notifier->stop();
3752 delete meta_notifier;
3753 }
3754 if (data_notifier) {
3755 data_notifier->stop();
3756 delete data_notifier;
3757 }
3758 delete data_log;
3759 if (async_rados) {
3760 delete async_rados;
3761 }
224ce89b 3762
c07f9fc5
FG
3763 delete lc;
3764 lc = NULL;
3765
7c673cae
FG
3766 delete gc;
3767 gc = NULL;
3768
7c673cae
FG
3769 delete obj_expirer;
3770 obj_expirer = NULL;
3771
3772 delete rest_master_conn;
3773
3774 map<string, RGWRESTConn *>::iterator iter;
3775 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3776 RGWRESTConn *conn = iter->second;
3777 delete conn;
3778 }
3779
3780 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3781 RGWRESTConn *conn = iter->second;
3782 delete conn;
3783 }
3784 RGWQuotaHandler::free_handler(quota_handler);
3785 if (cr_registry) {
3786 cr_registry->put();
3787 }
3788 delete meta_mgr;
3789 delete binfo_cache;
3790 delete obj_tombstone_cache;
3791 delete sync_modules_manager;
31f18b77
FG
3792
3793 if (reshard_wait.get()) {
3794 reshard_wait->stop();
3795 reshard_wait.reset();
3796 }
3797
3798 if (run_reshard_thread) {
3799 reshard->stop_processor();
3800 }
3801 delete reshard;
3802 delete index_completion_manager;
7c673cae
FG
3803}
3804
3805/**
3806 * Initialize the RADOS instance and prepare to do other ops
3807 * Returns 0 on success, -ERR# on failure.
3808 */
3809int RGWRados::init_rados()
3810{
3811 int ret = 0;
3a9019d9
FG
3812 auto admin_socket = cct->get_admin_socket();
3813 for (auto cmd : admin_commands) {
3814 int r = admin_socket->register_command(cmd[0], cmd[1], this,
3815 cmd[2]);
3816 if (r < 0) {
3817 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
3818 << ")" << dendl;
3819 return r;
3820 }
3821 }
3822
7c673cae
FG
3823 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3824
3825 for (auto& r : handles) {
3826 ret = r.init_with_context(cct);
3827 if (ret < 0) {
3828 return ret;
3829 }
7c673cae
FG
3830 ret = r.connect();
3831 if (ret < 0) {
3832 return ret;
3833 }
3834 }
3835
3836 sync_modules_manager = new RGWSyncModulesManager();
3837
3838 rgw_register_sync_modules(sync_modules_manager);
3839
3840 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3841 new RGWCoroutinesManagerRegistry(cct)};
3842 ret = crs->hook_to_admin_command("cr dump");
3843 if (ret < 0) {
3844 return ret;
3845 }
3846
3847 meta_mgr = new RGWMetadataManager(cct, this);
3848 data_log = new RGWDataChangesLog(cct, this);
3849 cr_registry = crs.release();
3850
3851 std::swap(handles, rados);
3852 return ret;
3853}
3854
224ce89b
WB
3855
3856int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3857{
3858 map<string,string> metadata = meta;
3859 metadata["num_handles"] = stringify(rados.size());
3860 metadata["zonegroup_id"] = zonegroup.get_id();
3861 metadata["zonegroup_name"] = zonegroup.get_name();
3862 metadata["zone_name"] = zone_name();
3863 metadata["zone_id"] = zone_id();;
3864 string name = cct->_conf->name.get_id();
3865 if (name.find("rgw.") == 0) {
3866 name = name.substr(4);
3867 }
3868 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3869 if (ret < 0) {
3870 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3871 return ret;
3872 }
3873
3874 return 0;
3875}
3876
7c673cae
FG
3877/**
3878 * Add new connection to connections map
3879 * @param zonegroup_conn_map map which new connection will be added to
3880 * @param zonegroup zonegroup which new connection will connect to
3881 * @param new_connection pointer to new connection instance
3882 */
3883static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3884 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3885{
3886 // Delete if connection is already exists
3887 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3888 if (iterZoneGroup != zonegroup_conn_map.end()) {
3889 delete iterZoneGroup->second;
3890 }
3891
3892 // Add new connection to connections map
3893 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3894}
3895
3896int RGWRados::convert_regionmap()
3897{
3898 RGWZoneGroupMap zonegroupmap;
3899
3900 string pool_name = cct->_conf->rgw_zone_root_pool;
3901 if (pool_name.empty()) {
3902 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3903 }
3904 string oid = region_map_oid;
3905
3906 rgw_pool pool(pool_name);
3907 bufferlist bl;
3908 RGWObjectCtx obj_ctx(this);
3909 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3910 if (ret < 0 && ret != -ENOENT) {
3911 return ret;
3912 } else if (ret == -ENOENT) {
3913 return 0;
3914 }
3915
3916 try {
3917 bufferlist::iterator iter = bl.begin();
3918 ::decode(zonegroupmap, iter);
3919 } catch (buffer::error& err) {
3920 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3921 return -EIO;
3922 }
3923
3924 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3925 iter != zonegroupmap.zonegroups.end(); ++iter) {
3926 RGWZoneGroup& zonegroup = iter->second;
3927 ret = zonegroup.init(cct, this, false);
3928 ret = zonegroup.update();
3929 if (ret < 0 && ret != -ENOENT) {
3930 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3931 cpp_strerror(-ret) << dendl;
3932 return ret;
3933 } else if (ret == -ENOENT) {
3934 ret = zonegroup.create();
3935 if (ret < 0) {
3936 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3937 cpp_strerror(-ret) << dendl;
3938 return ret;
3939 }
3940 }
3941 }
3942
3943 current_period.set_user_quota(zonegroupmap.user_quota);
3944 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3945
3946 // remove the region_map so we don't try to convert again
3947 rgw_raw_obj obj(pool, oid);
3948 ret = delete_system_obj(obj);
3949 if (ret < 0) {
3950 ldout(cct, 0) << "Error could not remove " << obj
3951 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3952 return ret;
3953 }
3954
3955 return 0;
3956}
3957
3958/**
3959 * Replace all region configuration with zonegroup for
3960 * backward compatability
3961 * Returns 0 on success, -ERR# on failure.
3962 */
3963int RGWRados::replace_region_with_zonegroup()
3964{
3965 /* copy default region */
3966 /* convert default region to default zonegroup */
3967 string default_oid = cct->_conf->rgw_default_region_info_oid;
3968 if (default_oid.empty()) {
3969 default_oid = default_region_info_oid;
3970 }
3971
3972
3973 RGWZoneGroup default_zonegroup;
3974 rgw_pool pool{default_zonegroup.get_pool(cct)};
3975 string oid = "converted";
3976 bufferlist bl;
3977 RGWObjectCtx obj_ctx(this);
3978
3979 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3980 if (ret < 0 && ret != -ENOENT) {
3981 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3982 << dendl;
3983 return ret;
3984 } else if (ret != -ENOENT) {
3985 ldout(cct, 20) << "System already converted " << dendl;
3986 return 0;
3987 }
3988
3989 string default_region;
3990 ret = default_zonegroup.init(cct, this, false, true);
3991 if (ret < 0) {
3992 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3993 return ret;
3994 }
3995 ret = default_zonegroup.read_default_id(default_region, true);
3996 if (ret < 0 && ret != -ENOENT) {
3997 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3998 return ret;
3999 }
4000
4001 /* convert regions to zonegroups */
4002 list<string> regions;
4003 ret = list_regions(regions);
4004 if (ret < 0 && ret != -ENOENT) {
4005 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4006 return ret;
4007 } else if (ret == -ENOENT || regions.empty()) {
4008 RGWZoneParams zoneparams(default_zone_name);
4009 int ret = zoneparams.init(cct, this);
4010 if (ret < 0 && ret != -ENOENT) {
4011 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
4012 return ret;
4013 }
4014 /* update master zone */
4015 RGWZoneGroup default_zg(default_zonegroup_name);
4016 ret = default_zg.init(cct, this);
4017 if (ret < 0 && ret != -ENOENT) {
4018 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
4019 return ret;
4020 }
4021 if (ret != -ENOENT && default_zg.master_zone.empty()) {
4022 default_zg.master_zone = zoneparams.get_id();
4023 return default_zg.update();
4024 }
4025 return 0;
4026 }
4027
4028 string master_region, master_zone;
4029 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
4030 if (*iter != default_zonegroup_name){
4031 RGWZoneGroup region(*iter);
4032 int ret = region.init(cct, this, true, true);
4033 if (ret < 0) {
4034 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
4035 return ret;
4036 }
31f18b77 4037 if (region.is_master_zonegroup()) {
7c673cae
FG
4038 master_region = region.get_id();
4039 master_zone = region.master_zone;
4040 }
4041 }
4042 }
4043
4044 /* create realm if there is none.
4045 The realm name will be the region and zone concatenated
4046 realm id will be mds of its name */
4047 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
4048 string new_realm_name = master_region + "." + master_zone;
4049 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
4050 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
4051 MD5 hash;
4052 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
4053 hash.Final(md5);
4054 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
4055 string new_realm_id(md5_str);
4056 RGWRealm new_realm(new_realm_id,new_realm_name);
4057 ret = new_realm.init(cct, this, false);
4058 if (ret < 0) {
4059 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4060 return ret;
4061 }
4062 ret = new_realm.create();
4063 if (ret < 0 && ret != -EEXIST) {
4064 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4065 return ret;
4066 }
4067 ret = new_realm.set_as_default();
4068 if (ret < 0) {
4069 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4070 return ret;
4071 }
4072 ret = realm.init(cct, this);
4073 if (ret < 0) {
4074 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4075 return ret;
4076 }
4077 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4078 if (ret < 0) {
4079 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4080 return ret;
4081 }
4082 }
4083
4084 list<string>::iterator iter;
4085 /* create zonegroups */
4086 for (iter = regions.begin(); iter != regions.end(); ++iter)
4087 {
4088 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4089 /* check to see if we don't have already a zonegroup with this name */
4090 RGWZoneGroup new_zonegroup(*iter);
4091 ret = new_zonegroup.init(cct , this);
4092 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4093 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4094 " skipping conversion " << dendl;
4095 continue;
4096 }
4097 RGWZoneGroup zonegroup(*iter);
4098 zonegroup.set_id(*iter);
4099 int ret = zonegroup.init(cct, this, true, true);
4100 if (ret < 0) {
4101 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4102 return ret;
4103 }
4104 zonegroup.realm_id = realm.get_id();
4105 /* fix default region master zone */
4106 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4107 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4108 zonegroup.master_zone = default_zone_name;
4109 }
4110 ret = zonegroup.update();
4111 if (ret < 0 && ret != -EEXIST) {
4112 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4113 << dendl;
4114 return ret;
4115 }
4116 ret = zonegroup.update_name();
4117 if (ret < 0 && ret != -EEXIST) {
4118 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4119 << dendl;
4120 return ret;
4121 }
4122 if (zonegroup.get_name() == default_region) {
4123 ret = zonegroup.set_as_default();
4124 if (ret < 0) {
4125 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4126 << dendl;
4127 return ret;
4128 }
4129 }
4130 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4131 ++iter) {
4132 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4133 RGWZoneParams zoneparams(iter->first, iter->first);
4134 zoneparams.set_id(iter->first);
4135 zoneparams.realm_id = realm.get_id();
4136 ret = zoneparams.init(cct, this);
4137 if (ret < 0 && ret != -ENOENT) {
4138 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4139 return ret;
4140 } else if (ret == -ENOENT) {
4141 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4142 continue;
4143 }
4144 zonegroup.realm_id = realm.get_id();
4145 ret = zoneparams.update();
4146 if (ret < 0 && ret != -EEXIST) {
4147 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4148 return ret;
4149 }
4150 ret = zoneparams.update_name();
4151 if (ret < 0 && ret != -EEXIST) {
4152 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4153 return ret;
4154 }
4155 }
4156
4157 if (!current_period.get_id().empty()) {
4158 ret = current_period.add_zonegroup(zonegroup);
4159 if (ret < 0) {
4160 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4161 return ret;
4162 }
4163 }
4164 }
4165
4166 if (!current_period.get_id().empty()) {
4167 ret = current_period.update();
4168 if (ret < 0) {
4169 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4170 return ret;
4171 }
4172 ret = current_period.store_info(false);
4173 if (ret < 0) {
4174 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4175 return ret;
4176 }
4177 ret = current_period.reflect();
4178 if (ret < 0) {
4179 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4180 return ret;
4181 }
4182 }
4183
4184 for (auto const& iter : regions) {
4185 RGWZoneGroup zonegroup(iter);
4186 int ret = zonegroup.init(cct, this, true, true);
4187 if (ret < 0) {
4188 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4189 return ret;
4190 }
4191 ret = zonegroup.delete_obj(true);
4192 if (ret < 0 && ret != -ENOENT) {
4193 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4194 << dendl;
4195 return ret;
4196 }
4197 }
4198
4199 /* mark as converted */
4200 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4201 true, NULL, real_time(), NULL);
4202 if (ret < 0 ) {
4203 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4204 << dendl;
4205 return ret;
4206 }
4207
4208 return 0;
4209}
4210
4211int RGWRados::init_zg_from_period(bool *initialized)
4212{
4213 *initialized = false;
4214
4215 if (current_period.get_id().empty()) {
4216 return 0;
4217 }
4218
4219 int ret = zonegroup.init(cct, this);
4220 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4221 if (ret == -ENOENT) {
4222 return 0;
4223 }
4224 if (ret < 0) {
4225 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4226 return ret;
4227 }
4228 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4229
4230 map<string, RGWZoneGroup>::const_iterator iter =
4231 current_period.get_map().zonegroups.find(zonegroup.get_id());
4232
4233 if (iter != current_period.get_map().zonegroups.end()) {
4234 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4235 zonegroup = iter->second;
4236 ret = zonegroup.init(cct, this, false);
4237 if (ret < 0) {
4238 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4239 return ret;
4240 }
4241 ret = zone_params.init(cct, this);
4242 if (ret < 0 && ret != -ENOENT) {
4243 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4244 return ret;
4245 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4246 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4247 zone_params.set_name(default_zone_name);
4248 ret = zone_params.init(cct, this);
4249 if (ret < 0 && ret != -ENOENT) {
4250 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4251 return ret;
4252 }
4253 }
4254 }
4255 for (iter = current_period.get_map().zonegroups.begin();
4256 iter != current_period.get_map().zonegroups.end(); ++iter){
4257 const RGWZoneGroup& zg = iter->second;
4258 // use endpoints from the zonegroup's master zone
4259 auto master = zg.zones.find(zg.master_zone);
4260 if (master == zg.zones.end()) {
4261 // fix missing master zone for a single zone zonegroup
4262 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4263 master = zg.zones.begin();
4264 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4265 master->second.name << " id:" << master->second.id << " as master" << dendl;
4266 if (zonegroup.get_id() == zg.get_id()) {
4267 zonegroup.master_zone = master->second.id;
4268 ret = zonegroup.update();
4269 if (ret < 0) {
4270 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4271 return ret;
4272 }
4273 } else {
4274 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4275 ret = fixed_zg.init(cct, this);
4276 if (ret < 0) {
4277 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4278 return ret;
4279 }
4280 fixed_zg.master_zone = master->second.id;
4281 ret = fixed_zg.update();
4282 if (ret < 0) {
4283 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4284 return ret;
4285 }
4286 }
4287 } else {
4288 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4289 zg.master_zone << dendl;
4290 return -EINVAL;
4291 }
4292 }
4293 const auto& endpoints = master->second.endpoints;
4294 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4295 if (!current_period.get_master_zonegroup().empty() &&
4296 zg.get_id() == current_period.get_master_zonegroup()) {
4297 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4298 }
4299 }
4300
4301 *initialized = true;
4302
4303 return 0;
4304}
4305
4306int RGWRados::init_zg_from_local(bool *creating_defaults)
4307{
4308 int ret = zonegroup.init(cct, this);
4309 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4310 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4311 return ret;
4312 } else if (ret == -ENOENT) {
4313 *creating_defaults = true;
4314 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4315 ret = zonegroup.create_default();
4316 if (ret < 0) {
4317 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4318 << dendl;
4319 return ret;
4320 }
4321 ret = zonegroup.init(cct, this);
4322 if (ret < 0) {
4323 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4324 << dendl;
4325 return ret;
4326 }
4327 }
4328 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
31f18b77 4329 if (zonegroup.is_master_zonegroup()) {
7c673cae
FG
4330 // use endpoints from the zonegroup's master zone
4331 auto master = zonegroup.zones.find(zonegroup.master_zone);
4332 if (master == zonegroup.zones.end()) {
4333 // fix missing master zone for a single zone zonegroup
4334 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4335 master = zonegroup.zones.begin();
4336 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4337 master->second.name << " id:" << master->second.id << " as master" << dendl;
4338 zonegroup.master_zone = master->second.id;
4339 ret = zonegroup.update();
4340 if (ret < 0) {
4341 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4342 return ret;
4343 }
4344 } else {
4345 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4346 "master_zone=" << zonegroup.master_zone << dendl;
4347 return -EINVAL;
4348 }
4349 }
4350 const auto& endpoints = master->second.endpoints;
4351 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4352 }
4353
4354 return 0;
4355}
4356
4357
4358bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4359{
4360 return target_zone.syncs_from(source_zone.name) &&
4361 sync_modules_manager->supports_data_export(source_zone.tier_type);
4362}
4363
4364/**
4365 * Initialize the RADOS instance and prepare to do other ops
4366 * Returns 0 on success, -ERR# on failure.
4367 */
4368int RGWRados::init_complete()
4369{
4370 int ret = realm.init(cct, this);
4371 if (ret < 0 && ret != -ENOENT) {
4372 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4373 return ret;
4374 } else if (ret != -ENOENT) {
4375 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4376 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4377 if (ret < 0 && ret != -ENOENT) {
4378 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4379 return ret;
4380 }
4381 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4382 }
4383
4384 ret = replace_region_with_zonegroup();
4385 if (ret < 0) {
4386 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4387 return ret;
4388 }
4389
4390 ret = convert_regionmap();
4391 if (ret < 0) {
4392 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4393 return ret;
4394 }
4395
4396 bool zg_initialized = false;
4397
4398 if (!current_period.get_id().empty()) {
4399 ret = init_zg_from_period(&zg_initialized);
4400 if (ret < 0) {
4401 return ret;
4402 }
4403 }
4404
4405 bool creating_defaults = false;
4406 bool using_local = (!zg_initialized);
4407 if (using_local) {
4408 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4409 ret = init_zg_from_local(&creating_defaults);
4410 if (ret < 0) {
4411 return ret;
4412 }
4413 // read period_config into current_period
4414 auto& period_config = current_period.get_config();
4415 ret = period_config.read(this, zonegroup.realm_id);
4416 if (ret < 0 && ret != -ENOENT) {
4417 ldout(cct, 0) << "ERROR: failed to read period config: "
4418 << cpp_strerror(ret) << dendl;
4419 return ret;
4420 }
4421 }
4422
4423 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4424 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4425 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4426 zone_params.set_name(default_zone_name);
4427 }
4428
4429 ret = zone_params.init(cct, this);
4430 if (ret < 0 && ret != -ENOENT) {
4431 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4432 return ret;
4433 }
4434 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4435 if (zone_iter == get_zonegroup().zones.end()) {
4436 if (using_local) {
4437 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4438 return -EINVAL;
4439 }
4440 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4441 ret = init_zg_from_local(&creating_defaults);
4442 if (ret < 0) {
4443 return ret;
4444 }
4445 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4446 }
4447 if (zone_iter != get_zonegroup().zones.end()) {
4448 zone_public_config = zone_iter->second;
4449 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4450 } else {
4451 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4452 return -EINVAL;
4453 }
4454
4455 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4456
31f18b77
FG
4457 if (run_sync_thread) {
4458 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4459 if (ret < 0) {
4460 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4461 return ret;
4462 }
7c673cae
FG
4463 }
4464
4465 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4466
4467 init_unique_trans_id_deps();
4468
4469 finisher = new Finisher(cct);
4470 finisher->start();
4471
4472 period_puller.reset(new RGWPeriodPuller(this));
4473 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4474 current_period));
4475
4476 if (need_watch_notify()) {
4477 ret = init_watch();
4478 if (ret < 0) {
4479 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4480 return ret;
4481 }
4482 }
4483
4484 /* first build all zones index */
4485 for (auto ziter : get_zonegroup().zones) {
4486 const string& id = ziter.first;
4487 RGWZone& z = ziter.second;
4488 zone_id_by_name[z.name] = id;
4489 zone_by_id[id] = z;
4490 }
31f18b77 4491
7c673cae
FG
4492 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4493 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4494 }
4495 zone_public_config = zone_by_id[zone_id()];
4496 for (auto ziter : get_zonegroup().zones) {
4497 const string& id = ziter.first;
4498 RGWZone& z = ziter.second;
4499 if (id == zone_id()) {
4500 continue;
4501 }
4502 if (z.endpoints.empty()) {
4503 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4504 continue;
4505 }
4506 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4507 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4508 zone_conn_map[id] = conn;
4509 if (zone_syncs_from(zone_public_config, z) ||
4510 zone_syncs_from(z, zone_public_config)) {
4511 if (zone_syncs_from(zone_public_config, z)) {
4512 zone_data_sync_from_map[id] = conn;
4513 }
4514 if (zone_syncs_from(z, zone_public_config)) {
4515 zone_data_notify_to_map[id] = conn;
4516 }
4517 } else {
4518 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4519 }
4520 }
4521
4522 ret = open_root_pool_ctx();
4523 if (ret < 0)
4524 return ret;
4525
4526 ret = open_gc_pool_ctx();
4527 if (ret < 0)
4528 return ret;
4529
4530 ret = open_lc_pool_ctx();
4531 if (ret < 0)
4532 return ret;
4533
4534 ret = open_objexp_pool_ctx();
4535 if (ret < 0)
4536 return ret;
4537
31f18b77
FG
4538 ret = open_reshard_pool_ctx();
4539 if (ret < 0)
4540 return ret;
4541
7c673cae
FG
4542 pools_initialized = true;
4543
4544 gc = new RGWGC();
4545 gc->initialize(cct, this);
4546
4547 obj_expirer = new RGWObjectExpirer(this);
4548
4549 if (use_gc_thread) {
4550 gc->start_processor();
4551 obj_expirer->start_processor();
4552 }
4553
7c673cae
FG
4554 /* no point of running sync thread if we don't have a master zone configured
4555 or there is no rest_master_conn */
4556 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4557 || current_period.get_id().empty()) {
4558 run_sync_thread = false;
4559 }
4560
b32b8144
FG
4561 if (run_sync_thread) {
4562 // initialize the log period history
4563 meta_mgr->init_oldest_log_period();
4564 }
4565
7c673cae
FG
4566 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4567 async_rados->start();
4568
4569 ret = meta_mgr->init(current_period.get_id());
4570 if (ret < 0) {
4571 lderr(cct) << "ERROR: failed to initialize metadata log: "
4572 << cpp_strerror(-ret) << dendl;
4573 return ret;
4574 }
4575
4576 if (is_meta_master()) {
4577 auto md_log = meta_mgr->get_log(current_period.get_id());
4578 meta_notifier = new RGWMetaNotifier(this, md_log);
4579 meta_notifier->start();
4580 }
4581
4582 if (run_sync_thread) {
4583 Mutex::Locker l(meta_sync_thread_lock);
4584 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4585 ret = meta_sync_processor_thread->init();
4586 if (ret < 0) {
4587 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4588 return ret;
4589 }
4590 meta_sync_processor_thread->start();
4591
b32b8144
FG
4592 // configure the bucket trim manager
4593 rgw::BucketTrimConfig config;
4594 rgw::configure_bucket_trim(cct, config);
4595
4596 bucket_trim.emplace(this, config);
4597 ret = bucket_trim->init();
4598 if (ret < 0) {
4599 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
4600 return ret;
4601 }
91327a77 4602 data_log->set_observer(&*bucket_trim);
b32b8144 4603
7c673cae
FG
4604 Mutex::Locker dl(data_sync_thread_lock);
4605 for (auto iter : zone_data_sync_from_map) {
4606 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
91327a77 4607 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
7c673cae
FG
4608 ret = thread->init();
4609 if (ret < 0) {
4610 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4611 return ret;
4612 }
4613 thread->start();
4614 data_sync_processor_threads[iter.first] = thread;
4615 }
4616 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4617 if (interval > 0) {
b32b8144 4618 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
7c673cae
FG
4619 ret = sync_log_trimmer->init();
4620 if (ret < 0) {
4621 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4622 return ret;
4623 }
4624 sync_log_trimmer->start();
4625 }
4626 }
4627 data_notifier = new RGWDataNotifier(this);
4628 data_notifier->start();
4629
4630 lc = new RGWLC();
4631 lc->initialize(cct, this);
31f18b77 4632
7c673cae
FG
4633 if (use_lc_thread)
4634 lc->start_processor();
31f18b77 4635
7c673cae
FG
4636 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4637
4638 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4639 get_zone().bucket_index_max_shards);
31f18b77
FG
4640 if (bucket_index_max_shards > get_max_bucket_shards()) {
4641 bucket_index_max_shards = get_max_bucket_shards();
7c673cae 4642 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 4643 << get_max_bucket_shards() << dendl;
7c673cae
FG
4644 }
4645 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4646
4647 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4648 binfo_cache->init(this);
4649
4650 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4651
4652 if (need_tombstone_cache) {
4653 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4654 }
4655
31f18b77
FG
4656 reshard_wait = std::make_shared<RGWReshardWait>(this);
4657
4658 reshard = new RGWReshard(this);
4659
4660 /* only the master zone in the zonegroup reshards buckets */
4661 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4662 if (run_reshard_thread) {
4663 reshard->start_processor();
4664 }
4665
4666 index_completion_manager = new RGWIndexCompletionManager(this);
4667 ret = index_completion_manager->start();
4668
7c673cae
FG
4669 return ret;
4670}
4671
4672/**
4673 * Initialize the RADOS instance and prepare to do other ops
4674 * Returns 0 on success, -ERR# on failure.
4675 */
4676int RGWRados::initialize()
4677{
4678 int ret;
4679
4680 ret = init_rados();
4681 if (ret < 0)
4682 return ret;
4683
4684 return init_complete();
4685}
4686
4687void RGWRados::finalize_watch()
4688{
4689 for (int i = 0; i < num_watchers; i++) {
4690 RGWWatcher *watcher = watchers[i];
4691 watcher->unregister_watch();
4692 delete watcher;
4693 }
4694
4695 delete[] notify_oids;
4696 delete[] watchers;
4697}
4698
4699void RGWRados::schedule_context(Context *c) {
4700 finisher->queue(c);
4701}
4702
4703int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4704{
4705 bool is_truncated;
4706 RGWListRawObjsCtx ctx;
4707 do {
4708 list<string> oids;
4709 int r = list_raw_objects(pool, prefix, 1000,
4710 ctx, oids, &is_truncated);
4711 if (r < 0) {
4712 return r;
4713 }
4714 list<string>::iterator iter;
4715 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4716 string& val = *iter;
4717 if (val.size() > prefix.size())
4718 result.push_back(val.substr(prefix.size()));
4719 }
4720 } while (is_truncated);
4721
4722 return 0;
4723}
4724
4725int RGWRados::list_regions(list<string>& regions)
4726{
4727 RGWZoneGroup zonegroup;
4728
4729 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4730}
4731
4732int RGWRados::list_zonegroups(list<string>& zonegroups)
4733{
4734 RGWZoneGroup zonegroup;
4735
4736 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4737}
4738
4739int RGWRados::list_zones(list<string>& zones)
4740{
4741 RGWZoneParams zoneparams;
4742
4743 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4744}
4745
4746int RGWRados::list_realms(list<string>& realms)
4747{
4748 RGWRealm realm(cct, this);
4749 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4750}
4751
4752int RGWRados::list_periods(list<string>& periods)
4753{
4754 RGWPeriod period;
4755 list<string> raw_periods;
4756 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4757 if (ret < 0) {
4758 return ret;
4759 }
4760 for (const auto& oid : raw_periods) {
4761 size_t pos = oid.find(".");
4762 if (pos != std::string::npos) {
4763 periods.push_back(oid.substr(0, pos));
4764 } else {
4765 periods.push_back(oid);
4766 }
4767 }
4768 periods.sort(); // unique() only detects duplicates if they're adjacent
4769 periods.unique();
4770 return 0;
4771}
4772
4773
4774int RGWRados::list_periods(const string& current_period, list<string>& periods)
4775{
4776 int ret = 0;
4777 string period_id = current_period;
4778 while(!period_id.empty()) {
4779 RGWPeriod period(period_id);
4780 ret = period.init(cct, this);
4781 if (ret < 0) {
4782 return ret;
4783 }
4784 periods.push_back(period.get_id());
4785 period_id = period.get_predecessor();
4786 }
4787
4788 return ret;
4789}
4790
4791/**
4792 * Open the pool used as root for this gateway
4793 * Returns: 0 on success, -ERR# otherwise.
4794 */
4795int RGWRados::open_root_pool_ctx()
4796{
4797 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4798}
4799
4800int RGWRados::open_gc_pool_ctx()
4801{
4802 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4803}
4804
4805int RGWRados::open_lc_pool_ctx()
4806{
4807 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4808}
4809
4810int RGWRados::open_objexp_pool_ctx()
4811{
4812 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4813}
4814
31f18b77
FG
4815int RGWRados::open_reshard_pool_ctx()
4816{
4817 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4818}
4819
7c673cae
FG
4820int RGWRados::init_watch()
4821{
4822 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4823 if (r < 0) {
4824 return r;
4825 }
4826
4827 num_watchers = cct->_conf->rgw_num_control_oids;
4828
4829 bool compat_oid = (num_watchers == 0);
4830
4831 if (num_watchers <= 0)
4832 num_watchers = 1;
4833
4834 notify_oids = new string[num_watchers];
4835 watchers = new RGWWatcher *[num_watchers];
4836
4837 for (int i=0; i < num_watchers; i++) {
4838 string& notify_oid = notify_oids[i];
4839 notify_oid = notify_oid_prefix;
4840 if (!compat_oid) {
4841 char buf[16];
4842 snprintf(buf, sizeof(buf), ".%d", i);
4843 notify_oid.append(buf);
4844 }
4845 r = control_pool_ctx.create(notify_oid, false);
4846 if (r < 0 && r != -EEXIST)
4847 return r;
4848
4849 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4850 watchers[i] = watcher;
4851
4852 r = watcher->register_watch();
4853 if (r < 0)
4854 return r;
4855 }
4856
4857 watch_initialized = true;
4858
4859 set_cache_enabled(true);
4860
4861 return 0;
4862}
4863
4864void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4865{
4866 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4867
4868 int i = r % num_watchers;
4869 char buf[16];
4870 snprintf(buf, sizeof(buf), ".%d", i);
4871
4872 notify_oid = notify_oid_prefix;
4873 notify_oid.append(buf);
4874}
4875
28e407b8 4876int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
7c673cae 4877{
28e407b8
AA
4878 constexpr bool create = true; // create the pool if it doesn't exist
4879 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
4880}
4881
4882void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4883 string *marker) {
4884 if (marker) {
4885 *marker = shard_id_str;
4886 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4887 marker->append(shard_marker);
4888 }
4889}
4890
4891int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4892{
3a9019d9
FG
4893 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
4894
4895 if (!explicit_pool.empty()) {
4896 return open_pool_ctx(explicit_pool, index_ctx);
4897 }
4898
7c673cae
FG
4899 const string *rule = &bucket_info.placement_rule;
4900 if (rule->empty()) {
4901 rule = &zonegroup.default_placement;
4902 }
4903 auto iter = zone_params.placement_pools.find(*rule);
4904 if (iter == zone_params.placement_pools.end()) {
4905 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4906 return -EINVAL;
4907 }
4908
4909 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4910 if (r < 0)
4911 return r;
4912
4913 return 0;
4914}
4915
4916/**
4917 * set up a bucket listing.
4918 * handle is filled in.
4919 * Returns 0 on success, -ERR# otherwise.
4920 */
4921int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4922{
4923 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4924 *handle = (RGWAccessHandle)state;
4925 return 0;
4926}
4927
4928/**
4929 * get the next bucket in the listing.
4930 * obj is filled in,
4931 * handle is updated.
4932 * returns 0 on success, -ERR# otherwise.
4933 */
4934int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4935{
4936 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4937
4938 do {
4939 if (*state == root_pool_ctx.nobjects_end()) {
4940 delete state;
4941 return -ENOENT;
4942 }
4943
4944 obj.key.name = (*state)->get_oid();
4945 if (obj.key.name[0] == '_') {
4946 obj.key.name = obj.key.name.substr(1);
4947 }
4948
4949 (*state)++;
4950 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4951
4952 return 0;
4953}
4954
4955
4956/**** logs ****/
4957
4958struct log_list_state {
4959 string prefix;
4960 librados::IoCtx io_ctx;
4961 librados::NObjectIterator obit;
4962};
4963
4964int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4965{
4966 log_list_state *state = new log_list_state;
4967 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4968 if (r < 0) {
4969 delete state;
4970 return r;
4971 }
4972 state->prefix = prefix;
4973 state->obit = state->io_ctx.nobjects_begin();
4974 *handle = (RGWAccessHandle)state;
4975 return 0;
4976}
4977
4978int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4979{
4980 log_list_state *state = static_cast<log_list_state *>(handle);
4981 while (true) {
4982 if (state->obit == state->io_ctx.nobjects_end()) {
4983 delete state;
4984 return -ENOENT;
4985 }
4986 if (state->prefix.length() &&
4987 state->obit->get_oid().find(state->prefix) != 0) {
4988 state->obit++;
4989 continue;
4990 }
4991 *name = state->obit->get_oid();
4992 state->obit++;
4993 break;
4994 }
4995 return 0;
4996}
4997
4998int RGWRados::log_remove(const string& name)
4999{
5000 librados::IoCtx io_ctx;
5001 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5002 if (r < 0)
5003 return r;
5004 return io_ctx.remove(name);
5005}
5006
5007struct log_show_state {
5008 librados::IoCtx io_ctx;
5009 bufferlist bl;
5010 bufferlist::iterator p;
5011 string name;
5012 uint64_t pos;
5013 bool eof;
5014 log_show_state() : pos(0), eof(false) {}
5015};
5016
5017int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
5018{
5019 log_show_state *state = new log_show_state;
5020 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5021 if (r < 0) {
5022 delete state;
5023 return r;
5024 }
5025 state->name = name;
5026 *handle = (RGWAccessHandle)state;
5027 return 0;
5028}
5029
5030int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
5031{
5032 log_show_state *state = static_cast<log_show_state *>(handle);
5033 off_t off = state->p.get_off();
5034
5035 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
5036 << " off " << off
5037 << " eof " << (int)state->eof
5038 << dendl;
5039 // read some?
5040 unsigned chunk = 1024*1024;
5041 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
5042 bufferlist more;
5043 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
5044 if (r < 0)
5045 return r;
5046 state->pos += r;
5047 bufferlist old;
5048 try {
5049 old.substr_of(state->bl, off, state->bl.length() - off);
5050 } catch (buffer::error& err) {
5051 return -EINVAL;
5052 }
5053 state->bl.clear();
5054 state->bl.claim(old);
5055 state->bl.claim_append(more);
5056 state->p = state->bl.begin();
5057 if ((unsigned)r < chunk)
5058 state->eof = true;
5059 ldout(cct, 10) << " read " << r << dendl;
5060 }
5061
5062 if (state->p.end())
5063 return 0; // end of file
5064 try {
5065 ::decode(*entry, state->p);
5066 }
5067 catch (const buffer::error &e) {
5068 return -EINVAL;
5069 }
5070 return 1;
5071}
5072
5073/**
5074 * usage_log_hash: get usage log key hash, based on name and index
5075 *
5076 * Get the usage object name. Since a user may have more than 1
5077 * object holding that info (multiple shards), we use index to
5078 * specify that shard number. Once index exceeds max shards it
5079 * wraps.
5080 * If name is not being set, results for all users will be returned
5081 * and index will wrap only after total shards number.
5082 *
5083 * @param cct [in] ceph context
5084 * @param name [in] user name
5085 * @param hash [out] hash value
5086 * @param index [in] shard index number
5087 */
5088static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5089{
5090 uint32_t val = index;
5091
5092 if (!name.empty()) {
c07f9fc5 5093 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
5094 val %= max_user_shards;
5095 val += ceph_str_hash_linux(name.c_str(), name.size());
5096 }
5097 char buf[17];
c07f9fc5 5098 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
5099 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5100 hash = buf;
5101}
5102
5103int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5104{
5105 uint32_t index = 0;
5106
5107 map<string, rgw_usage_log_info> log_objs;
5108
5109 string hash;
5110 string last_user;
5111
5112 /* restructure usage map, zone by object hash */
5113 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5114 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5115 const rgw_user_bucket& ub = iter->first;
5116 RGWUsageBatch& info = iter->second;
5117
5118 if (ub.user.empty()) {
5119 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5120 continue;
5121 }
5122
5123 if (ub.user != last_user) {
5124 /* index *should* be random, but why waste extra cycles
5125 in most cases max user shards is not going to exceed 1,
5126 so just incrementing it */
5127 usage_log_hash(cct, ub.user, hash, index++);
5128 }
5129 last_user = ub.user;
5130 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5131
5132 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5133 v.push_back(miter->second);
5134 }
5135 }
5136
5137 map<string, rgw_usage_log_info>::iterator liter;
5138
5139 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5140 int r = cls_obj_usage_log_add(liter->first, liter->second);
5141 if (r < 0)
5142 return r;
5143 }
5144 return 0;
5145}
5146
5147int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5148 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5149{
5150 uint32_t num = max_entries;
5151 string hash, first_hash;
5152 string user_str = user.to_str();
5153 usage_log_hash(cct, user_str, first_hash, 0);
5154
5155 if (usage_iter.index) {
5156 usage_log_hash(cct, user_str, hash, usage_iter.index);
5157 } else {
5158 hash = first_hash;
5159 }
5160
5161 usage.clear();
5162
5163 do {
5164 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5165 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5166
5167 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5168 usage_iter.read_iter, ret_usage, is_truncated);
5169 if (ret == -ENOENT)
5170 goto next;
5171
5172 if (ret < 0)
5173 return ret;
5174
5175 num -= ret_usage.size();
5176
5177 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5178 usage[iter->first].aggregate(iter->second);
5179 }
5180
5181next:
5182 if (!*is_truncated) {
5183 usage_iter.read_iter.clear();
5184 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5185 }
5186 } while (num && !*is_truncated && hash != first_hash);
5187 return 0;
5188}
5189
5190int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5191{
5192 uint32_t index = 0;
5193 string hash, first_hash;
5194 string user_str = user.to_str();
5195 usage_log_hash(cct, user_str, first_hash, index);
5196
5197 hash = first_hash;
7c673cae
FG
5198 do {
5199 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
7c673cae 5200
b32b8144 5201 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
5202 return ret;
5203
7c673cae
FG
5204 usage_log_hash(cct, user_str, hash, ++index);
5205 } while (hash != first_hash);
5206
5207 return 0;
5208}
5209
7c673cae
FG
5210int RGWRados::key_to_shard_id(const string& key, int max_shards)
5211{
1adf2230 5212 return rgw_shard_id(key, max_shards);
7c673cae
FG
5213}
5214
5215void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5216{
5217 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5218 char buf[16];
5219 if (shard_id) {
5220 *shard_id = val % max_shards;
5221 }
5222 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5223 name = prefix + buf;
5224}
5225
5226void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5227{
5228 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5229 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5230 char buf[16];
5231 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5232 name = prefix + buf;
5233}
5234
5235void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5236{
5237 char buf[16];
5238 snprintf(buf, sizeof(buf), "%u", shard_id);
5239 name = prefix + buf;
5240
5241}
5242
5243void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5244{
5245 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5246}
5247
5248int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5249{
5250 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5251
5252}
5253
5254int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5255{
5256 librados::IoCtx io_ctx;
5257
5258 int r = time_log_add_init(io_ctx);
5259 if (r < 0) {
5260 return r;
5261 }
5262
5263 ObjectWriteOperation op;
5264 utime_t t(ut);
5265 cls_log_add(op, t, section, key, bl);
5266
5267 return io_ctx.operate(oid, &op);
5268}
5269
5270int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5271 librados::AioCompletion *completion, bool monotonic_inc)
5272{
5273 librados::IoCtx io_ctx;
5274
5275 int r = time_log_add_init(io_ctx);
5276 if (r < 0) {
5277 return r;
5278 }
5279
5280 ObjectWriteOperation op;
5281 cls_log_add(op, entries, monotonic_inc);
5282
5283 if (!completion) {
5284 r = io_ctx.operate(oid, &op);
5285 } else {
5286 r = io_ctx.aio_operate(oid, completion, &op);
5287 }
5288 return r;
5289}
5290
5291int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5292 int max_entries, list<cls_log_entry>& entries,
5293 const string& marker,
5294 string *out_marker,
5295 bool *truncated)
5296{
5297 librados::IoCtx io_ctx;
5298
5299 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5300 if (r < 0)
5301 return r;
5302 librados::ObjectReadOperation op;
5303
5304 utime_t st(start_time);
5305 utime_t et(end_time);
5306
5307 cls_log_list(op, st, et, marker, max_entries, entries,
5308 out_marker, truncated);
5309
5310 bufferlist obl;
5311
5312 int ret = io_ctx.operate(oid, &op, &obl);
5313 if (ret < 0)
5314 return ret;
5315
5316 return 0;
5317}
5318
5319int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5320{
5321 librados::IoCtx io_ctx;
5322
5323 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5324 if (r < 0)
5325 return r;
5326 librados::ObjectReadOperation op;
5327
5328 cls_log_info(op, header);
5329
5330 bufferlist obl;
5331
5332 int ret = io_ctx.operate(oid, &op, &obl);
5333 if (ret < 0)
5334 return ret;
5335
5336 return 0;
5337}
5338
5339int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5340{
5341 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5342 if (r < 0)
5343 return r;
5344
5345 librados::ObjectReadOperation op;
5346
5347 cls_log_info(op, header);
5348
5349 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5350 if (ret < 0)
5351 return ret;
5352
5353 return 0;
5354}
5355
5356int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5357 const string& from_marker, const string& to_marker,
5358 librados::AioCompletion *completion)
5359{
5360 librados::IoCtx io_ctx;
5361
5362 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5363 if (r < 0)
5364 return r;
5365
5366 utime_t st(start_time);
5367 utime_t et(end_time);
5368
5369 ObjectWriteOperation op;
5370 cls_log_trim(op, st, et, from_marker, to_marker);
5371
5372 if (!completion) {
5373 r = io_ctx.operate(oid, &op);
5374 } else {
5375 r = io_ctx.aio_operate(oid, completion, &op);
5376 }
5377 return r;
5378}
5379
5380string RGWRados::objexp_hint_get_shardname(int shard_num)
5381{
5382 char buf[32];
5383 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5384
5385 string objname("obj_delete_at_hint.");
5386 return objname + buf;
5387}
5388
7c673cae
FG
5389int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5390{
5391 string obj_key = key.name + key.instance;
5392 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
1adf2230 5393 return rgw_bucket_shard_index(obj_key, num_shards);
7c673cae
FG
5394}
5395
5396static string objexp_hint_get_keyext(const string& tenant_name,
5397 const string& bucket_name,
5398 const string& bucket_id,
5399 const rgw_obj_key& obj_key)
5400{
5401 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5402 ":" + obj_key.name + ":" + obj_key.instance;
5403}
5404
5405int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5406 const string& tenant_name,
5407 const string& bucket_name,
5408 const string& bucket_id,
5409 const rgw_obj_index_key& obj_key)
5410{
5411 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5412 bucket_id, obj_key);
5413 objexp_hint_entry he = {
5414 .tenant = tenant_name,
5415 .bucket_name = bucket_name,
5416 .bucket_id = bucket_id,
5417 .obj_key = obj_key,
5418 .exp_time = delete_at };
5419 bufferlist hebl;
5420 ::encode(he, hebl);
5421 ObjectWriteOperation op;
5422 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5423
5424 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5425 return objexp_pool_ctx.operate(shard_name, &op);
5426}
5427
5428void RGWRados::objexp_get_shard(int shard_num,
5429 string& shard) /* out */
5430{
5431 shard = objexp_hint_get_shardname(shard_num);
5432}
5433
5434int RGWRados::objexp_hint_list(const string& oid,
5435 const ceph::real_time& start_time,
5436 const ceph::real_time& end_time,
5437 const int max_entries,
5438 const string& marker,
5439 list<cls_timeindex_entry>& entries, /* out */
5440 string *out_marker, /* out */
5441 bool *truncated) /* out */
5442{
5443 librados::ObjectReadOperation op;
5444 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5445 out_marker, truncated);
5446
5447 bufferlist obl;
5448 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5449
5450 if ((ret < 0 ) && (ret != -ENOENT)) {
5451 return ret;
5452 }
5453
5454 if ((ret == -ENOENT) && truncated) {
5455 *truncated = false;
5456 }
5457
5458 return 0;
5459}
5460
5461int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5462 objexp_hint_entry& hint_entry) /* out */
5463{
5464 try {
5465 bufferlist::iterator iter = ti_entry.value.begin();
5466 ::decode(hint_entry, iter);
5467 } catch (buffer::error& err) {
5468 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5469 }
5470
5471 return 0;
5472}
5473
5474int RGWRados::objexp_hint_trim(const string& oid,
5475 const ceph::real_time& start_time,
5476 const ceph::real_time& end_time,
5477 const string& from_marker,
5478 const string& to_marker)
5479{
5480 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5481 from_marker, to_marker);
5482 if ((ret < 0 ) && (ret != -ENOENT)) {
5483 return ret;
5484 }
5485
5486 return 0;
5487}
5488
5489int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5490 string& zone_id, string& owner_id) {
5491 librados::IoCtx io_ctx;
5492
5493 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5494 if (r < 0) {
5495 return r;
5496 }
5497 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5498 utime_t ut(msec / 1000, msec % 1000);
5499
5500 rados::cls::lock::Lock l(log_lock_name);
5501 l.set_duration(ut);
5502 l.set_cookie(owner_id);
5503 l.set_tag(zone_id);
5504 l.set_renew(true);
5505
5506 return l.lock_exclusive(&io_ctx, oid);
5507}
5508
5509int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5510 librados::IoCtx io_ctx;
5511
5512 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5513 if (r < 0) {
5514 return r;
5515 }
5516
5517 rados::cls::lock::Lock l(log_lock_name);
5518 l.set_tag(zone_id);
5519 l.set_cookie(owner_id);
5520
5521 return l.unlock(&io_ctx, oid);
5522}
5523
5524int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5525{
5526 bufferlist::iterator i = bl.begin();
5527 RGWAccessControlPolicy policy(cct);
5528 try {
5529 policy.decode_owner(i);
5530 } catch (buffer::error& err) {
5531 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5532 return -EIO;
5533 }
5534 *owner = policy.get_owner();
5535 return 0;
5536}
5537
5538int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5539{
5540 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5541 if (aiter == attrset.end())
5542 return -EIO;
5543
5544 bufferlist& bl = aiter->second;
5545 bufferlist::iterator iter = bl.begin();
5546 try {
5547 policy->decode(iter);
5548 } catch (buffer::error& err) {
5549 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5550 return -EIO;
5551 }
5552 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5553 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5554 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5555 s3policy->to_xml(*_dout);
5556 *_dout << dendl;
5557 }
5558 return 0;
5559}
5560
5561
31f18b77
FG
5562int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5563{
5564 rgw_bucket bucket = bucket_info.bucket;
5565 bucket.update_bucket_id(new_bucket_id);
5566
5567 RGWObjectCtx obj_ctx(store);
5568
5569 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5570 if (ret < 0) {
5571 return ret;
5572 }
5573
5574 return 0;
5575}
5576
1adf2230
AA
5577
5578/**
5579 * Get ordered listing of the objects in a bucket.
7c673cae
FG
5580 *
5581 * max: maximum number of results to return
5582 * bucket: bucket to list contents of
5583 * prefix: only return results that match this prefix
5584 * delim: do not include results that match this string.
5585 * Any skipped results will have the matching portion of their name
5586 * inserted in common_prefixes with a "true" mark.
5587 * marker: if filled in, begin the listing with this object.
5588 * end_marker: if filled in, end the listing with this object.
5589 * result: the objects are put in here.
5590 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5591 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5592 */
1adf2230
AA
5593int RGWRados::Bucket::List::list_objects_ordered(int64_t max,
5594 vector<rgw_bucket_dir_entry> *result,
5595 map<string, bool> *common_prefixes,
5596 bool *is_truncated)
7c673cae
FG
5597{
5598 RGWRados *store = target->get_store();
5599 CephContext *cct = store->ctx();
5600 int shard_id = target->get_shard_id();
5601
5602 int count = 0;
5603 bool truncated = true;
5604 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5605
5606 result->clear();
5607
5608 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
7c673cae
FG
5609 rgw_obj_index_key cur_marker;
5610 marker_obj.get_index_key(&cur_marker);
5611
3efd9988
FG
5612 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5613 params.ns);
5614 rgw_obj_index_key cur_end_marker;
5615 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
5616 const bool cur_end_marker_valid = !params.end_marker.empty();
5617
5618 rgw_obj_key prefix_obj(params.prefix);
5619 prefix_obj.ns = params.ns;
5620 string cur_prefix = prefix_obj.get_index_key_name();
5621
5622 string bigger_than_delim;
5623
5624 if (!params.delim.empty()) {
1adf2230
AA
5625 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(),
5626 params.delim.size());
7c673cae
FG
5627 char buf[params.delim.size() + 16];
5628 int r = encode_utf8(val + 1, (unsigned char *)buf);
5629 if (r < 0) {
5630 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5631 return -EINVAL;
5632 }
5633 buf[r] = '\0';
5634
5635 bigger_than_delim = buf;
5636
5637 /* if marker points at a common prefix, fast forward it into its upperbound string */
224ce89b 5638 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
5639 if (delim_pos >= 0) {
5640 string s = cur_marker.name.substr(0, delim_pos);
5641 s.append(bigger_than_delim);
5642 cur_marker = s;
5643 }
5644 }
1adf2230 5645
7c673cae
FG
5646 string skip_after_delim;
5647 while (truncated && count <= max) {
5648 if (skip_after_delim > cur_marker.name) {
5649 cur_marker = skip_after_delim;
5650 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5651 }
5652 std::map<string, rgw_bucket_dir_entry> ent_map;
1adf2230
AA
5653 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
5654 shard_id,
5655 cur_marker,
5656 cur_prefix,
5657 read_ahead + 1 - count,
5658 params.list_versions,
5659 ent_map,
5660 &truncated,
5661 &cur_marker);
7c673cae
FG
5662 if (r < 0)
5663 return r;
5664
1adf2230 5665 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
5666 rgw_bucket_dir_entry& entry = eiter->second;
5667 rgw_obj_index_key index_key = entry.key;
5668
5669 rgw_obj_key obj(index_key);
5670
1adf2230
AA
5671 /* note that parse_raw_oid() here will not set the correct
5672 * object's instance, as rgw_obj_index_key encodes that
5673 * separately. We don't need to set the instance because it's
5674 * not needed for the checks here and we end up using the raw
5675 * entry for the return vector
7c673cae
FG
5676 */
5677 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5678 if (!valid) {
5679 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5680 continue;
5681 }
5682 bool check_ns = (obj.ns == params.ns);
5683 if (!params.list_versions && !entry.is_visible()) {
5684 continue;
5685 }
5686
5687 if (params.enforce_ns && !check_ns) {
5688 if (!params.ns.empty()) {
5689 /* we've iterated past the namespace we're searching -- done now */
5690 truncated = false;
5691 goto done;
5692 }
5693
5694 /* we're not looking at the namespace this object is in, next! */
5695 continue;
5696 }
5697
5698 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5699 truncated = false;
5700 goto done;
5701 }
5702
5703 if (count < max) {
5704 params.marker = index_key;
5705 next_marker = index_key;
5706 }
5707
5708 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5709 continue;
5710
1adf2230
AA
5711 if (params.prefix.size() &&
5712 (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
7c673cae
FG
5713 continue;
5714
5715 if (!params.delim.empty()) {
5716 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5717
5718 if (delim_pos >= 0) {
5719 string prefix_key = obj.name.substr(0, delim_pos + 1);
5720
5721 if (common_prefixes &&
5722 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5723 if (count >= max) {
5724 truncated = true;
5725 goto done;
5726 }
5727 next_marker = prefix_key;
5728 (*common_prefixes)[prefix_key] = true;
5729
224ce89b
WB
5730 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5731
5732 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
7c673cae
FG
5733 skip_after_delim.append(bigger_than_delim);
5734
5735 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5736
5737 count++;
5738 }
5739
5740 continue;
5741 }
5742 }
5743
5744 if (count >= max) {
5745 truncated = true;
5746 goto done;
5747 }
5748
5749 result->emplace_back(std::move(entry));
5750 count++;
5751 }
7c673cae
FG
5752 }
5753
5754done:
5755 if (is_truncated)
5756 *is_truncated = truncated;
5757
5758 return 0;
1adf2230
AA
5759} // list_objects_ordered
5760
5761
5762/**
5763 * Get listing of the objects in a bucket and allow the results to be out
5764 * of order.
5765 *
5766 * Even though there are key differences with the ordered counterpart,
5767 * the parameters are the same to maintain some compatability.
5768 *
5769 * max: maximum number of results to return
5770 * bucket: bucket to list contents of
5771 * prefix: only return results that match this prefix
5772 * delim: should not be set; if it is we should have indicated an error
5773 * marker: if filled in, begin the listing with this object.
5774 * end_marker: if filled in, end the listing with this object.
5775 * result: the objects are put in here.
5776 * common_prefixes: this is never filled with an unordered list; the param
5777 * is maintained for compatibility
5778 * is_truncated: if number of objects in the bucket is bigger than max, then
5779 * truncated.
5780 */
5781int RGWRados::Bucket::List::list_objects_unordered(int64_t max,
5782 vector<rgw_bucket_dir_entry> *result,
5783 map<string, bool> *common_prefixes,
5784 bool *is_truncated)
5785{
5786 RGWRados *store = target->get_store();
5787 CephContext *cct = store->ctx();
5788 int shard_id = target->get_shard_id();
5789
5790 int count = 0;
5791 bool truncated = true;
5792
5793 // read a few extra in each call to cls_bucket_list_unordered in
5794 // case some are filtered out due to namespace matching, versioning,
5795 // filtering, etc.
5796 const int64_t max_read_ahead = 100;
5797 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
5798
5799 result->clear();
5800
5801 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5802 rgw_obj_index_key cur_marker;
5803 marker_obj.get_index_key(&cur_marker);
5804
5805 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5806 params.ns);
5807 rgw_obj_index_key cur_end_marker;
5808 end_marker_obj.get_index_key(&cur_end_marker);
5809 const bool cur_end_marker_valid = !params.end_marker.empty();
5810
5811 rgw_obj_key prefix_obj(params.prefix);
5812 prefix_obj.ns = params.ns;
5813 string cur_prefix = prefix_obj.get_index_key_name();
5814
5815 while (truncated && count <= max) {
5816 std::vector<rgw_bucket_dir_entry> ent_list;
5817 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
5818 shard_id,
5819 cur_marker,
5820 cur_prefix,
5821 read_ahead,
5822 params.list_versions,
5823 ent_list,
5824 &truncated,
5825 &cur_marker);
5826 if (r < 0)
5827 return r;
5828
5829 // NB: while regions of ent_list will be sorted, we have no
5830 // guarantee that all items will be sorted since they can cross
5831 // shard boundaries
5832
5833 for (auto& entry : ent_list) {
5834 rgw_obj_index_key index_key = entry.key;
5835 rgw_obj_key obj(index_key);
5836
5837 /* note that parse_raw_oid() here will not set the correct
5838 * object's instance, as rgw_obj_index_key encodes that
5839 * separately. We don't need to set the instance because it's
5840 * not needed for the checks here and we end up using the raw
5841 * entry for the return vector
5842 */
5843 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5844 if (!valid) {
5845 ldout(cct, 0) << "ERROR: could not parse object name: " <<
5846 obj.name << dendl;
5847 continue;
5848 }
5849
5850 if (!params.list_versions && !entry.is_visible()) {
5851 continue;
5852 }
5853
5854 if (params.enforce_ns && obj.ns != params.ns) {
5855 continue;
5856 }
5857
5858 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5859 // we're not guaranteed items will come in order, so we have
5860 // to loop through all
5861 continue;
5862 }
5863
5864 if (count < max) {
5865 params.marker = index_key;
5866 next_marker = index_key;
5867 }
5868
5869 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5870 continue;
5871
5872 if (params.prefix.size() &&
5873 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
5874 continue;
5875
5876 if (count >= max) {
5877 truncated = true;
5878 goto done;
5879 }
5880
5881 result->emplace_back(std::move(entry));
5882 count++;
5883 } // for (auto& entry : ent_list)
5884 } // while (truncated && count <= max)
5885
5886done:
5887 if (is_truncated)
5888 *is_truncated = truncated;
5889
5890 return 0;
5891} // list_objects_unordered
5892
7c673cae
FG
5893
5894/**
5895 * create a rados pool, associated meta info
5896 * returns 0 on success, -ERR# otherwise.
5897 */
5898int RGWRados::create_pool(const rgw_pool& pool)
5899{
c07f9fc5 5900 librados::IoCtx io_ctx;
28e407b8
AA
5901 constexpr bool create = true;
5902 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
5903}
5904
5905int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5906{
5907 librados::IoCtx index_ctx; // context for new bucket
5908
31f18b77 5909 string dir_oid = dir_oid_prefix;
7c673cae 5910 int r = open_bucket_index_ctx(bucket_info, index_ctx);
31f18b77 5911 if (r < 0) {
7c673cae 5912 return r;
31f18b77 5913 }
7c673cae 5914
7c673cae
FG
5915 dir_oid.append(bucket_info.bucket.bucket_id);
5916
5917 map<int, string> bucket_objs;
5918 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5919
5920 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5921}
5922
5923void RGWRados::create_bucket_id(string *bucket_id)
5924{
5925 uint64_t iid = instance_id();
5926 uint64_t bid = next_bucket_id();
5927 char buf[get_zone_params().get_id().size() + 48];
5928 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5929 *bucket_id = buf;
5930}
5931
7c673cae
FG
5932int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5933 const string& zonegroup_id,
5934 const string& placement_rule,
5935 const string& swift_ver_location,
5936 const RGWQuotaInfo * pquota_info,
5937 map<std::string, bufferlist>& attrs,
5938 RGWBucketInfo& info,
5939 obj_version *pobjv,
5940 obj_version *pep_objv,
5941 real_time creation_time,
5942 rgw_bucket *pmaster_bucket,
5943 uint32_t *pmaster_num_shards,
5944 bool exclusive)
5945{
5946#define MAX_CREATE_RETRIES 20 /* need to bound retries */
5947 string selected_placement_rule_name;
5948 RGWZonePlacementInfo rule_info;
5949
5950 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5951 int ret = 0;
5952 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5953 &selected_placement_rule_name, &rule_info);
5954 if (ret < 0)
5955 return ret;
5956
5957 if (!pmaster_bucket) {
5958 create_bucket_id(&bucket.marker);
5959 bucket.bucket_id = bucket.marker;
5960 } else {
5961 bucket.marker = pmaster_bucket->marker;
5962 bucket.bucket_id = pmaster_bucket->bucket_id;
5963 }
5964
5965 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5966
5967 if (pobjv) {
5968 objv_tracker.write_version = *pobjv;
5969 } else {
5970 objv_tracker.generate_new_write_ver(cct);
5971 }
5972
5973 info.bucket = bucket;
5974 info.owner = owner.user_id;
5975 info.zonegroup = zonegroup_id;
5976 info.placement_rule = selected_placement_rule_name;
5977 info.index_type = rule_info.index_type;
5978 info.swift_ver_location = swift_ver_location;
5979 info.swift_versioning = (!swift_ver_location.empty());
5980 if (pmaster_num_shards) {
5981 info.num_shards = *pmaster_num_shards;
5982 } else {
5983 info.num_shards = bucket_index_max_shards;
5984 }
5985 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5986 info.requester_pays = false;
5987 if (real_clock::is_zero(creation_time)) {
5988 info.creation_time = ceph::real_clock::now();
5989 } else {
5990 info.creation_time = creation_time;
5991 }
5992 if (pquota_info) {
5993 info.quota = *pquota_info;
5994 }
5995
5996 int r = init_bucket_index(info, info.num_shards);
5997 if (r < 0) {
5998 return r;
5999 }
6000
6001 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
6002 if (ret == -EEXIST) {
6003 librados::IoCtx index_ctx;
6004 map<int, string> bucket_objs;
6005 int r = open_bucket_index(info, index_ctx, bucket_objs);
6006 if (r < 0)
6007 return r;
6008
6009 /* we need to reread the info and return it, caller will have a use for it */
6010 RGWObjVersionTracker instance_ver = info.objv_tracker;
6011 info.objv_tracker.clear();
6012 RGWObjectCtx obj_ctx(this);
6013 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
6014 if (r < 0) {
6015 if (r == -ENOENT) {
6016 continue;
6017 }
6018 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
6019 return r;
6020 }
6021
6022 /* only remove it if it's a different bucket instance */
6023 if (info.bucket.bucket_id != bucket.bucket_id) {
6024 /* remove bucket meta instance */
6025 string entry = bucket.get_key();
6026 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
6027 if (r < 0)
6028 return r;
6029
6030 map<int, string>::const_iterator biter;
6031 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
6032 // Do best effort removal
6033 index_ctx.remove(biter->second);
6034 }
6035 }
6036 /* ret == -ENOENT here */
6037 }
6038 return ret;
6039 }
6040
6041 /* this is highly unlikely */
6042 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
6043 return -ENOENT;
6044}
6045
6046int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
6047 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6048
6049{
c07f9fc5 6050 /* first check that zonegroup exists within current period. */
7c673cae
FG
6051 RGWZoneGroup zonegroup;
6052 int ret = get_zonegroup(zonegroup_id, zonegroup);
6053 if (ret < 0) {
6054 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
6055 return ret;
6056 }
6057
7c673cae 6058 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
c07f9fc5
FG
6059 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
6060
6061 if (!request_rule.empty()) {
6062 titer = zonegroup.placement_targets.find(request_rule);
6063 if (titer == zonegroup.placement_targets.end()) {
6064 ldout(cct, 0) << "could not find requested placement id " << request_rule
6065 << " within zonegroup " << dendl;
6066 return -ERR_INVALID_LOCATION_CONSTRAINT;
6067 }
6068 } else if (!user_info.default_placement.empty()) {
6069 titer = zonegroup.placement_targets.find(user_info.default_placement);
6070 if (titer == zonegroup.placement_targets.end()) {
6071 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
6072 << " within zonegroup " << dendl;
6073 return -ERR_INVALID_LOCATION_CONSTRAINT;
6074 }
6075 } else {
6076 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
6077 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
6078 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
6079 } else {
6080 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
6081 if (titer == zonegroup.placement_targets.end()) {
6082 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
6083 << " within zonegroup " << dendl;
6084 return -ERR_INVALID_LOCATION_CONSTRAINT;
6085 }
6086 }
7c673cae
FG
6087 }
6088
6089 /* now check tag for the rule, whether user is permitted to use rule */
c07f9fc5 6090 const auto& target_rule = titer->second;
7c673cae 6091 if (!target_rule.user_permitted(user_info.placement_tags)) {
c07f9fc5 6092 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
7c673cae
FG
6093 return -EPERM;
6094 }
6095
6096 if (pselected_rule_name)
c07f9fc5 6097 *pselected_rule_name = titer->first;
7c673cae 6098
c07f9fc5 6099 return select_bucket_location_by_rule(titer->first, rule_info);
7c673cae
FG
6100}
6101
6102int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
6103{
6104 if (location_rule.empty()) {
6105 /* we can only reach here if we're trying to set a bucket location from a bucket
6106 * created on a different zone, using a legacy / default pool configuration
6107 */
6108 return select_legacy_bucket_placement(rule_info);
6109 }
6110
6111 /*
6112 * make sure that zone has this rule configured. We're
6113 * checking it for the local zone, because that's where this bucket object is going to
6114 * reside.
6115 */
6116 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
6117 if (piter == get_zone_params().placement_pools.end()) {
6118 /* couldn't find, means we cannot really place data for this bucket in this zone */
224ce89b 6119 if (get_zonegroup().equals(zonegroup.get_id())) {
7c673cae
FG
6120 /* that's a configuration error, zone should have that rule, as we're within the requested
6121 * zonegroup */
6122 return -EINVAL;
6123 } else {
6124 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
6125 return 0;
6126 }
6127 }
6128
6129 RGWZonePlacementInfo& placement_info = piter->second;
6130
6131 if (rule_info) {
6132 *rule_info = placement_info;
6133 }
6134
6135 return 0;
6136}
6137
6138int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
6139 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6140{
6141 if (!get_zone_params().placement_pools.empty()) {
6142 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
6143 pselected_rule_name, rule_info);
6144 }
6145
6146 if (pselected_rule_name) {
6147 pselected_rule_name->clear();
6148 }
6149
6150 return select_legacy_bucket_placement(rule_info);
6151}
6152
6153int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
6154{
6155 bufferlist map_bl;
6156 map<string, bufferlist> m;
6157 string pool_name;
6158 bool write_map = false;
6159
6160 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6161
6162 RGWObjectCtx obj_ctx(this);
6163 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6164 if (ret < 0) {
6165 goto read_omap;
6166 }
6167
6168 try {
6169 bufferlist::iterator iter = map_bl.begin();
6170 ::decode(m, iter);
6171 } catch (buffer::error& err) {
6172 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6173 }
6174
6175read_omap:
6176 if (m.empty()) {
6177 bufferlist header;
6178 ret = omap_get_all(obj, header, m);
6179
6180 write_map = true;
6181 }
6182
6183 if (ret < 0 || m.empty()) {
6184 vector<rgw_pool> pools;
6185 string s = string("default.") + default_storage_pool_suffix;
6186 pools.push_back(rgw_pool(s));
6187 vector<int> retcodes;
6188 bufferlist bl;
6189 ret = create_pools(pools, retcodes);
6190 if (ret < 0)
6191 return ret;
6192 ret = omap_set(obj, s, bl);
6193 if (ret < 0)
6194 return ret;
6195 m[s] = bl;
6196 }
6197
6198 if (write_map) {
6199 bufferlist new_bl;
6200 ::encode(m, new_bl);
6201 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6202 if (ret < 0) {
6203 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6204 }
6205 }
6206
6207 map<string, bufferlist>::iterator miter;
6208 if (m.size() > 1) {
6209 vector<string> v;
6210 for (miter = m.begin(); miter != m.end(); ++miter) {
6211 v.push_back(miter->first);
6212 }
6213
6214 uint32_t r;
6215 ret = get_random_bytes((char *)&r, sizeof(r));
6216 if (ret < 0)
6217 return ret;
6218
6219 int i = r % v.size();
6220 pool_name = v[i];
6221 } else {
6222 miter = m.begin();
6223 pool_name = miter->first;
6224 }
6225
6226 rule_info->data_pool = pool_name;
6227 rule_info->data_extra_pool = pool_name;
6228 rule_info->index_pool = pool_name;
6229 rule_info->index_type = RGWBIType_Normal;
6230
6231 return 0;
6232}
6233
6234bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6235{
6236 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6237}
6238
6239bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6240{
6241 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6242
6243 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6244}
6245
6246int RGWRados::update_placement_map()
6247{
6248 bufferlist header;
6249 map<string, bufferlist> m;
6250 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6251 int ret = omap_get_all(obj, header, m);
6252 if (ret < 0)
6253 return ret;
6254
6255 bufferlist new_bl;
6256 ::encode(m, new_bl);
6257 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6258 if (ret < 0) {
6259 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6260 }
6261
6262 return ret;
6263}
6264
6265int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6266{
6267 librados::Rados *rad = get_rados_handle();
6268 int ret = rad->pool_lookup(new_pool.name.c_str());
6269 if (ret < 0) // DNE, or something
6270 return ret;
6271
6272 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6273 bufferlist empty_bl;
6274 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6275
6276 // don't care about return value
6277 update_placement_map();
6278
6279 return ret;
6280}
6281
6282int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6283{
6284 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6285 int ret = omap_del(obj, old_pool.to_str());
6286
6287 // don't care about return value
6288 update_placement_map();
6289
6290 return ret;
6291}
6292
6293int RGWRados::list_placement_set(set<rgw_pool>& names)
6294{
6295 bufferlist header;
6296 map<string, bufferlist> m;
6297
6298 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6299 int ret = omap_get_all(obj, header, m);
6300 if (ret < 0)
6301 return ret;
6302
6303 names.clear();
6304 map<string, bufferlist>::iterator miter;
6305 for (miter = m.begin(); miter != m.end(); ++miter) {
6306 names.insert(rgw_pool(miter->first));
6307 }
6308
6309 return names.size();
6310}
6311
6312int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6313{
6314 vector<librados::PoolAsyncCompletion *> completions;
6315 vector<int> rets;
6316
6317 librados::Rados *rad = get_rados_handle();
6318 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6319 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6320 completions.push_back(c);
6321 rgw_pool& pool = *iter;
6322 int ret = rad->pool_create_async(pool.name.c_str(), c);
6323 rets.push_back(ret);
6324 }
6325
6326 vector<int>::iterator riter;
6327 vector<librados::PoolAsyncCompletion *>::iterator citer;
6328
c07f9fc5 6329 bool error = false;
7c673cae
FG
6330 assert(rets.size() == completions.size());
6331 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6332 int r = *riter;
6333 PoolAsyncCompletion *c = *citer;
6334 if (r == 0) {
6335 c->wait();
6336 r = c->get_return_value();
6337 if (r < 0) {
6338 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
c07f9fc5 6339 error = true;
7c673cae
FG
6340 }
6341 }
6342 c->release();
6343 retcodes.push_back(r);
6344 }
c07f9fc5
FG
6345 if (error) {
6346 return 0;
6347 }
6348
6349 std::vector<librados::IoCtx> io_ctxs;
6350 retcodes.clear();
6351 for (auto pool : pools) {
6352 io_ctxs.emplace_back();
6353 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6354 if (ret < 0) {
6355 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6356 error = true;
6357 }
6358 retcodes.push_back(ret);
6359 }
6360 if (error) {
6361 return 0;
6362 }
6363
6364 completions.clear();
6365 for (auto &io_ctx : io_ctxs) {
6366 librados::PoolAsyncCompletion *c =
6367 librados::Rados::pool_async_create_completion();
6368 completions.push_back(c);
6369 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6370 false, c);
6371 assert(ret == 0);
6372 }
6373
6374 retcodes.clear();
6375 for (auto c : completions) {
6376 c->wait();
6377 int ret = c->get_return_value();
6378 if (ret == -EOPNOTSUPP) {
6379 ret = 0;
6380 } else if (ret < 0) {
6381 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6382 << dendl;
6383 error = true;
6384 }
6385 c->release();
6386 retcodes.push_back(ret);
6387 }
7c673cae
FG
6388 return 0;
6389}
6390
6391int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6392{
6393 string oid, key;
6394 get_obj_bucket_and_oid_loc(obj, oid, key);
6395
6396 rgw_pool pool;
6397 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6398 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6399 return -EIO;
6400 }
6401
6402 int r = open_pool_ctx(pool, *ioctx);
6403 if (r < 0) {
6404 return r;
6405 }
6406
6407 ioctx->locator_set_key(key);
6408
6409 return 0;
6410}
6411
6412int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6413{
6414 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6415
6416 rgw_pool pool;
6417 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6418 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6419 return -EIO;
6420 }
6421
6422 int r = open_pool_ctx(pool, ref->ioctx);
6423 if (r < 0) {
6424 return r;
6425 }
6426
6427 ref->ioctx.locator_set_key(ref->key);
6428
6429 return 0;
6430}
6431
224ce89b 6432int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae
FG
6433{
6434 ref->oid = obj.oid;
6435 ref->key = obj.loc;
6436
6437 int r;
6438
6439 if (ref->oid.empty()) {
6440 ref->oid = obj.pool.to_str();
6441 ref->pool = get_zone_params().domain_root;
6442 } else {
6443 ref->pool = obj.pool;
6444 }
7c673cae
FG
6445 r = open_pool_ctx(ref->pool, ref->ioctx);
6446 if (r < 0)
6447 return r;
6448
6449 ref->ioctx.locator_set_key(ref->key);
6450
6451 return 0;
6452}
6453
224ce89b 6454int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 6455{
224ce89b 6456 return get_raw_obj_ref(obj, ref);
7c673cae
FG
6457}
6458
6459/*
6460 * fixes an issue where head objects were supposed to have a locator created, but ended
6461 * up without one
6462 */
6463int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6464{
6465 const rgw_bucket& bucket = bucket_info.bucket;
6466 string oid;
6467 string locator;
6468
6469 rgw_obj obj(bucket, key);
6470
6471 get_obj_bucket_and_oid_loc(obj, oid, locator);
6472
6473 if (locator.empty()) {
6474 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6475 return 0;
6476 }
6477
6478 librados::IoCtx ioctx;
6479
6480 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6481 if (ret < 0) {
6482 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6483 return ret;
6484 }
6485 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6486
6487 uint64_t size;
6488 bufferlist data;
6489
6490 struct timespec mtime_ts;
6491 map<string, bufferlist> attrs;
6492 librados::ObjectReadOperation op;
6493 op.getxattrs(&attrs, NULL);
6494 op.stat2(&size, &mtime_ts, NULL);
6495#define HEAD_SIZE 512 * 1024
6496 op.read(0, HEAD_SIZE, &data, NULL);
6497
6498 ret = ioctx.operate(oid, &op, NULL);
6499 if (ret < 0) {
6500 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6501 return ret;
6502 }
6503
6504 if (size > HEAD_SIZE) {
6505 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6506 return -EIO;
6507 }
6508
6509 if (size != data.length()) {
6510 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6511 return -EIO;
6512 }
6513
6514 if (copy_obj) {
6515 librados::ObjectWriteOperation wop;
6516
6517 wop.mtime2(&mtime_ts);
6518
6519 map<string, bufferlist>::iterator iter;
6520 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6521 wop.setxattr(iter->first.c_str(), iter->second);
6522 }
6523
6524 wop.write(0, data);
6525
6526 ioctx.locator_set_key(locator);
6527 ioctx.operate(oid, &wop);
6528 }
6529
6530 if (remove_bad) {
6531 ioctx.locator_set_key(string());
6532
6533 ret = ioctx.remove(oid);
6534 if (ret < 0) {
6535 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6536 return ret;
6537 }
6538 }
6539
6540 return 0;
6541}
6542
6543int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6544 const string& src_oid, const string& src_locator,
6545 librados::IoCtx& dst_ioctx,
6546 const string& dst_oid, const string& dst_locator)
6547{
6548
6549#define COPY_BUF_SIZE (4 * 1024 * 1024)
6550 bool done = false;
6551 uint64_t chunk_size = COPY_BUF_SIZE;
6552 uint64_t ofs = 0;
6553 int ret = 0;
6554 real_time mtime;
6555 struct timespec mtime_ts;
6556 uint64_t size;
6557
6558 if (src_oid == dst_oid && src_locator == dst_locator) {
6559 return 0;
6560 }
6561
6562 src_ioctx.locator_set_key(src_locator);
6563 dst_ioctx.locator_set_key(dst_locator);
6564
6565 do {
6566 bufferlist data;
6567 ObjectReadOperation rop;
6568 ObjectWriteOperation wop;
6569
6570 if (ofs == 0) {
6571 rop.stat2(&size, &mtime_ts, NULL);
6572 mtime = real_clock::from_timespec(mtime_ts);
6573 }
6574 rop.read(ofs, chunk_size, &data, NULL);
6575 ret = src_ioctx.operate(src_oid, &rop, NULL);
6576 if (ret < 0) {
6577 goto done_err;
6578 }
6579
6580 if (data.length() == 0) {
6581 break;
6582 }
6583
6584 if (ofs == 0) {
6585 wop.create(true); /* make it exclusive */
6586 wop.mtime2(&mtime_ts);
6587 mtime = real_clock::from_timespec(mtime_ts);
6588 }
6589 wop.write(ofs, data);
6590 ret = dst_ioctx.operate(dst_oid, &wop);
6591 ofs += data.length();
6592 done = data.length() != chunk_size;
6593 } while (!done);
6594
6595 if (ofs != size) {
6596 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6597 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6598 ret = -EIO;
6599 goto done_err;
6600 }
6601
6602 src_ioctx.remove(src_oid);
6603
6604 return 0;
6605
6606done_err:
6607 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6608 return ret;
6609}
6610
6611/*
6612 * fixes an issue where head objects were supposed to have a locator created, but ended
6613 * up without one
6614 */
6615int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6616{
6617 const rgw_bucket& bucket = bucket_info.bucket;
6618 rgw_obj obj(bucket, key);
6619
6620 if (need_fix) {
6621 *need_fix = false;
6622 }
6623
6624 rgw_rados_ref ref;
6625 int r = get_obj_head_ref(bucket_info, obj, &ref);
6626 if (r < 0) {
6627 return r;
6628 }
6629
6630 RGWObjState *astate = NULL;
6631 RGWObjectCtx rctx(this);
6632 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6633 if (r < 0)
6634 return r;
6635
6636 if (astate->has_manifest) {
6637 RGWObjManifest::obj_iterator miter;
6638 RGWObjManifest& manifest = astate->manifest;
6639 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6640 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6641 rgw_obj loc;
6642 string oid;
6643 string locator;
6644
6645 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6646
6647 if (loc.key.ns.empty()) {
6648 /* continue, we're only interested in tail objects */
6649 continue;
6650 }
6651
6652 get_obj_bucket_and_oid_loc(loc, oid, locator);
6653 ref.ioctx.locator_set_key(locator);
6654
6655 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6656
6657 r = ref.ioctx.stat(oid, NULL, NULL);
6658 if (r != -ENOENT) {
6659 continue;
6660 }
6661
6662 string bad_loc;
6663 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6664
6665 /* create a new ioctx with the bad locator */
6666 librados::IoCtx src_ioctx;
6667 src_ioctx.dup(ref.ioctx);
6668 src_ioctx.locator_set_key(bad_loc);
6669
6670 r = src_ioctx.stat(oid, NULL, NULL);
6671 if (r != 0) {
6672 /* cannot find a broken part */
6673 continue;
6674 }
6675 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6676 if (need_fix) {
6677 *need_fix = true;
6678 }
6679 if (fix) {
6680 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6681 if (r < 0) {
6682 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6683 }
6684 }
6685 }
6686 }
6687
6688 return 0;
6689}
6690
6691int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6692{
6693 bucket = _bucket;
6694
6695 RGWObjectCtx obj_ctx(store);
6696
6697 RGWBucketInfo bucket_info;
6698 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6699 if (ret < 0) {
6700 return ret;
6701 }
6702
6703 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6704 if (ret < 0) {
6705 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6706 return ret;
6707 }
6708 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6709
6710 return 0;
6711}
6712
6713int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6714{
6715 bucket = _bucket;
6716 shard_id = sid;
6717
6718 RGWObjectCtx obj_ctx(store);
6719
6720 RGWBucketInfo bucket_info;
6721 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6722 if (ret < 0) {
6723 return ret;
6724 }
6725
6726 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6727 if (ret < 0) {
6728 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6729 return ret;
6730 }
6731 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6732
6733 return 0;
6734}
6735
b32b8144
FG
6736int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
6737{
6738 bucket = bucket_info.bucket;
6739 shard_id = sid;
6740
6741 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6742 if (ret < 0) {
6743 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6744 return ret;
6745 }
6746 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6747
6748 return 0;
6749}
6750
7c673cae
FG
6751
6752/* Execute @handler on last item in bucket listing for bucket specified
6753 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6754 * to objects matching these criterias. */
6755int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6756 const std::string& obj_prefix,
6757 const std::string& obj_delim,
6758 std::function<int(const rgw_bucket_dir_entry&)> handler)
6759{
6760 RGWRados::Bucket target(this, bucket_info);
6761 RGWRados::Bucket::List list_op(&target);
6762
6763 list_op.params.prefix = obj_prefix;
6764 list_op.params.delim = obj_delim;
6765
6766 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6767 << ", obj_prefix=" << obj_prefix
6768 << ", obj_delim=" << obj_delim
6769 << dendl;
6770
6771 bool is_truncated = false;
6772
6773 boost::optional<rgw_bucket_dir_entry> last_entry;
6774 /* We need to rewind to the last object in a listing. */
6775 do {
6776 /* List bucket entries in chunks. */
6777 static constexpr int MAX_LIST_OBJS = 100;
6778 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6779
6780 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6781 &is_truncated);
6782 if (ret < 0) {
6783 return ret;
6784 } else if (!entries.empty()) {
6785 last_entry = entries.back();
6786 }
6787 } while (is_truncated);
6788
6789 if (last_entry) {
6790 return handler(*last_entry);
6791 }
6792
6793 /* Empty listing - no items we can run handler on. */
6794 return 0;
6795}
6796
6797
6798int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6799 const rgw_user& user,
6800 RGWBucketInfo& bucket_info,
6801 rgw_obj& obj)
6802{
6803 if (! swift_versioning_enabled(bucket_info)) {
6804 return 0;
6805 }
6806
6807 obj_ctx.obj.set_atomic(obj);
6808
6809 RGWObjState * state = nullptr;
6810 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6811 if (r < 0) {
6812 return r;
6813 }
6814
6815 if (!state->exists) {
6816 return 0;
6817 }
6818
6819 string client_id;
6820 string op_id;
6821
6822 const string& src_name = obj.get_oid();
6823 char buf[src_name.size() + 32];
6824 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6825 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6826 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6827
6828 RGWBucketInfo dest_bucket_info;
6829
6830 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6831 if (r < 0) {
6832 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6833 if (r == -ENOENT) {
6834 return -ERR_PRECONDITION_FAILED;
6835 }
6836 return r;
6837 }
6838
6839 if (dest_bucket_info.owner != bucket_info.owner) {
6840 return -ERR_PRECONDITION_FAILED;
6841 }
6842
6843 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6844 obj_ctx.obj.set_atomic(dest_obj);
6845
6846 string no_zone;
6847
6848 r = copy_obj(obj_ctx,
6849 user,
6850 client_id,
6851 op_id,
6852 NULL, /* req_info *info */
6853 no_zone,
6854 dest_obj,
6855 obj,
6856 dest_bucket_info,
6857 bucket_info,
6858 NULL, /* time_t *src_mtime */
6859 NULL, /* time_t *mtime */
6860 NULL, /* const time_t *mod_ptr */
6861 NULL, /* const time_t *unmod_ptr */
6862 false, /* bool high_precision_time */
6863 NULL, /* const char *if_match */
6864 NULL, /* const char *if_nomatch */
6865 RGWRados::ATTRSMOD_NONE,
6866 true, /* bool copy_if_newer */
6867 state->attrset,
6868 RGW_OBJ_CATEGORY_MAIN,
6869 0, /* uint64_t olh_epoch */
6870 real_time(), /* time_t delete_at */
6871 NULL, /* string *version_id */
6872 NULL, /* string *ptag */
6873 NULL, /* string *petag */
7c673cae
FG
6874 NULL, /* void (*progress_cb)(off_t, void *) */
6875 NULL); /* void *progress_data */
6876 if (r == -ECANCELED || r == -ENOENT) {
6877 /* Has already been overwritten, meaning another rgw process already
6878 * copied it out */
6879 return 0;
6880 }
6881
6882 return r;
6883}
6884
6885int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6886 const rgw_user& user,
6887 RGWBucketInfo& bucket_info,
6888 rgw_obj& obj,
6889 bool& restored) /* out */
6890{
6891 if (! swift_versioning_enabled(bucket_info)) {
6892 return 0;
6893 }
6894
6895 /* Bucket info of the bucket that stores previous versions of our object. */
6896 RGWBucketInfo archive_binfo;
6897
6898 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6899 bucket_info.swift_ver_location, archive_binfo,
6900 nullptr, nullptr);
6901 if (ret < 0) {
6902 return ret;
6903 }
6904
6905 /* Abort the operation if the bucket storing our archive belongs to someone
6906 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6907 * into consideration. For we can live with that.
6908 *
6909 * TODO: delegate this check to un upper layer and compare with ACLs. */
6910 if (bucket_info.owner != archive_binfo.owner) {
6911 return -EPERM;
6912 }
6913
6914 /* This code will be executed on latest version of the object. */
6915 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6916 std::string no_client_id;
6917 std::string no_op_id;
6918 std::string no_zone;
6919
6920 /* We don't support object versioning of Swift API on those buckets that
6921 * are already versioned using the S3 mechanism. This affects also bucket
6922 * storing archived objects. Otherwise the delete operation would create
6923 * a deletion marker. */
6924 if (archive_binfo.versioned()) {
6925 restored = false;
6926 return -ERR_PRECONDITION_FAILED;
6927 }
6928
6929 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6930 * irrelevant and may be safely skipped. */
6931 std::map<std::string, ceph::bufferlist> no_attrs;
6932
6933 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6934 obj_ctx.obj.set_atomic(archive_obj);
6935 obj_ctx.obj.set_atomic(obj);
6936
6937 int ret = copy_obj(obj_ctx,
6938 user,
6939 no_client_id,
6940 no_op_id,
6941 nullptr, /* req_info *info */
6942 no_zone,
6943 obj, /* dest obj */
6944 archive_obj, /* src obj */
6945 bucket_info, /* dest bucket info */
6946 archive_binfo, /* src bucket info */
6947 nullptr, /* time_t *src_mtime */
6948 nullptr, /* time_t *mtime */
6949 nullptr, /* const time_t *mod_ptr */
6950 nullptr, /* const time_t *unmod_ptr */
6951 false, /* bool high_precision_time */
6952 nullptr, /* const char *if_match */
6953 nullptr, /* const char *if_nomatch */
6954 RGWRados::ATTRSMOD_NONE,
6955 true, /* bool copy_if_newer */
6956 no_attrs,
6957 RGW_OBJ_CATEGORY_MAIN,
6958 0, /* uint64_t olh_epoch */
6959 real_time(), /* time_t delete_at */
6960 nullptr, /* string *version_id */
6961 nullptr, /* string *ptag */
6962 nullptr, /* string *petag */
7c673cae
FG
6963 nullptr, /* void (*progress_cb)(off_t, void *) */
6964 nullptr); /* void *progress_data */
6965 if (ret == -ECANCELED || ret == -ENOENT) {
6966 /* Has already been overwritten, meaning another rgw process already
6967 * copied it out */
6968 return 0;
6969 } else if (ret < 0) {
6970 return ret;
6971 } else {
6972 restored = true;
6973 }
6974
6975 /* Need to remove the archived copy. */
6976 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6977 archive_binfo.versioning_status());
6978
6979 return ret;
6980 };
6981
6982 const std::string& obj_name = obj.get_oid();
6983 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6984 % obj_name);
6985
6986 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6987 handler);
6988}
6989
/**
 * Write the head of the target object (metadata and, if meta.data is set,
 * its data) and record the change in the bucket index.
 *
 * size: object size passed on to the bucket index entry
 * accounted_size: original size of data before compression, encryption;
 *                 passed to the bucket index and to the quota cache
 * attrs: all the given attrs with a non-empty value are written as xattrs of
 *        the object; an existing manifest attr is erased from this map when
 *        meta.manifest is set
 * assume_noent: passed to get_state(); when set, an -EEXIST from the write
 *               invalidates the cached state and is returned to the caller
 *               without cancelling the index operation
 * modify_tail: passed through to prepare_atomic_modification()
 * _index_op: RGWRados::Bucket::UpdateIndex to prepare/complete/cancel,
 *            passed as void *
 * Returns: 0 on success, -ERR# otherwise. When the write loses a race
 * (-ECANCELED, -ENOENT, -EEXIST) the index operation is cancelled,
 * meta.canceled is set and the error is mapped according to
 * meta.if_match / meta.if_nomatch (see the end of the function).
 */
int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
                                           map<string, bufferlist>& attrs,
                                           bool assume_noent, bool modify_tail,
                                           void *_index_op)
{
  RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
  RGWRados *store = target->get_store();

  ObjectWriteOperation op;

  RGWObjState *state;
  int r = target->get_state(&state, false, assume_noent);
  if (r < 0)
    return r;

  rgw_obj& obj = target->get_obj();

  if (obj.get_oid().empty()) {
    ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
    return -EIO;
  }

  rgw_rados_ref ref;
  r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
  if (r < 0)
    return r;

  bool is_olh = state->is_olh;

  bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;

  /* prefer the caller supplied tag, fall back to the index op's tag */
  const string *ptag = meta.ptag;
  if (!ptag && !index_op->get_optag()->empty()) {
    ptag = index_op->get_optag();
  }
  r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
  if (r < 0)
    return r;

  /* no explicit mtime requested: stamp the object with the current time */
  if (real_clock::is_zero(meta.set_mtime)) {
    meta.set_mtime = real_clock::now();
  }

  if (state->is_olh) {
    op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
  }

  struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
  op.mtime2(&mtime_ts);

  if (meta.data) {
    /* if we want to overwrite the data, we also want to overwrite the
       xattrs, so just remove the object */
    op.write_full(*meta.data);
  }

  /* values picked out of attrs below and handed to the bucket index entry */
  string etag;
  string content_type;
  bufferlist acl_bl;

  map<string, bufferlist>::iterator iter;
  if (meta.rmattrs) {
    for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
      const string& name = iter->first;
      op.rmxattr(name.c_str());
    }
  }

  if (meta.manifest) {
    /* remove existing manifest attr */
    iter = attrs.find(RGW_ATTR_MANIFEST);
    if (iter != attrs.end())
      attrs.erase(iter);

    bufferlist bl;
    ::encode(*meta.manifest, bl);
    op.setxattr(RGW_ATTR_MANIFEST, bl);
  }

  for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
    const string& name = iter->first;
    bufferlist& bl = iter->second;

    /* attrs with an empty value are not written */
    if (!bl.length())
      continue;

    op.setxattr(name.c_str(), bl);

    if (name.compare(RGW_ATTR_ETAG) == 0) {
      etag = bl.c_str();
    } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
      content_type = bl.c_str();
    } else if (name.compare(RGW_ATTR_ACL) == 0) {
      acl_bl = bl;
    }
  }
  if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
    cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
  }

  if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
    bufferlist bl;
    ::encode(store->get_zone_short_id(), bl);
    op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
  }

  /* nothing was queued on the write op */
  if (!op.size())
    return 0;

  uint64_t epoch;
  int64_t poolid;
  /* previous existence/size of the object, used for the quota update */
  bool orig_exists;
  uint64_t orig_size;

  if (!reset_obj) {    //Multipart upload, it has immutable head.
    orig_exists = false;
    orig_size = 0;
  } else {
    orig_exists = state->exists;
    orig_size = state->accounted_size;
  }

  bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
                          !obj.key.instance.empty();

  bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);

  if (versioned_op) {
    index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
  }

  if (!index_op->is_prepared()) {
    r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
    if (r < 0)
      return r;
  }

  r = ref.ioctx.operate(ref.oid, &op);
  if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
                or -ENOENT if was removed, or -EEXIST if it did not exist
                before and now it does */
    if (r == -EEXIST && assume_noent) {
      /* the no-entry assumption was wrong: drop the cached state and hand
         the error back without cancelling the index op */
      target->invalidate_state();
      return r;
    }
    goto done_cancel;
  }

  epoch = ref.ioctx.get_last_version();
  poolid = ref.ioctx.get_id();

  r = target->complete_atomic_modification();
  if (r < 0) {
    /* logged only; the write itself has already gone through */
    ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
  }

  r = index_op->complete(poolid, epoch, size, accounted_size,
                         meta.set_mtime, etag, content_type, &acl_bl,
                         meta.category, meta.remove_objs, meta.user_data);
  if (r < 0)
    goto done_cancel;

  if (meta.mtime) {
    *meta.mtime = meta.set_mtime;
  }

  /* note that index_op was using state so we couldn't invalidate it earlier */
  target->invalidate_state();
  state = NULL;

  if (versioned_op && meta.olh_epoch) {
    r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, meta.zones_trace);
    if (r < 0) {
      return r;
    }
  }

  if (!real_clock::is_zero(meta.delete_at)) {
    rgw_obj_index_key obj_key;
    obj.key.get_index_key(&obj_key);

    r = store->objexp_hint_add(meta.delete_at,
            obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
    if (r < 0) {
      ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
      /* ignoring error, nothing we can do at this point */
    }
  }
  meta.canceled = false;

  /* update quota cache */
  if (meta.completeMultipart){
    /* multipart completion passes 0 as the added size */
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                       0, orig_size);
  }
  else {
    store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
                                       accounted_size, orig_size);
  }
  return 0;

done_cancel:
  /* r still holds the error that brought us here; ret is only for cancel() */
  int ret = index_op->cancel();
  if (ret < 0) {
    ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
  }

  meta.canceled = true;

  /* we lost in a race. There are a few options:
   * - existing object was rewritten (ECANCELED)
   * - non existing object was created (EEXIST)
   * - object was removed (ENOENT)
   * should treat it as a success
   */
  if (meta.if_match == NULL && meta.if_nomatch == NULL) {
    if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
      r = 0;
    }
  } else {
    if (meta.if_match != NULL) {
      // only overwrite existing object
      if (strcmp(meta.if_match, "*") == 0) {
        if (r == -ENOENT) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ECANCELED) {
          r = 0;
        }
      }
    }

    if (meta.if_nomatch != NULL) {
      // only create a new object
      if (strcmp(meta.if_nomatch, "*") == 0) {
        if (r == -EEXIST) {
          r = -ERR_PRECONDITION_FAILED;
        } else if (r == -ENOENT) {
          r = 0;
        }
      }
    }
  }

  return r;
}
7247
7248int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7249 map<string, bufferlist>& attrs)
7250{
7251 RGWBucketInfo& bucket_info = target->get_bucket_info();
7252
7253 RGWRados::Bucket bop(target->get_store(), bucket_info);
7254 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
7255 index_op.set_zones_trace(meta.zones_trace);
7256
7c673cae
FG
7257 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7258 int r;
7259 if (assume_noent) {
181888fb 7260 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
7261 if (r == -EEXIST) {
7262 assume_noent = false;
7263 }
7264 }
7265 if (!assume_noent) {
181888fb 7266 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
7267 }
7268 return r;
7269}
7270
7271/** Write/overwrite a system object. */
7272int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7273 map<std::string, bufferlist>& attrs, int flags,
7274 bufferlist& data,
7275 RGWObjVersionTracker *objv_tracker,
7276 real_time set_mtime /* 0 for don't set */)
7277{
7c673cae 7278 rgw_rados_ref ref;
224ce89b 7279 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
7280 if (r < 0)
7281 return r;
7282
7283 ObjectWriteOperation op;
7284
7285 if (flags & PUT_OBJ_EXCL) {
7286 if (!(flags & PUT_OBJ_CREATE))
7287 return -EINVAL;
7288 op.create(true); // exclusive create
7289 } else {
7290 op.remove();
7291 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7292 op.create(false);
7293 }
7294
7295 if (objv_tracker) {
7296 objv_tracker->prepare_op_for_write(&op);
7297 }
7298
7299 if (real_clock::is_zero(set_mtime)) {
7300 set_mtime = real_clock::now();
7301 }
7302
7303 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7304 op.mtime2(&mtime_ts);
7305 op.write_full(data);
7306
7307 bufferlist acl_bl;
7308
7309 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7310 const string& name = iter->first;
7311 bufferlist& bl = iter->second;
7312
7313 if (!bl.length())
7314 continue;
7315
7316 op.setxattr(name.c_str(), bl);
7317 }
7318
7319 r = ref.ioctx.operate(ref.oid, &op);
7320 if (r < 0) {
7321 return r;
7322 }
7323
7324 if (objv_tracker) {
7325 objv_tracker->apply_write();
7326 }
7327
7328 if (mtime) {
7329 *mtime = set_mtime;
7330 }
7331
7332 return 0;
7333}
7334
7335int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7336 off_t ofs, bool exclusive,
7337 RGWObjVersionTracker *objv_tracker)
7338{
7339 rgw_rados_ref ref;
224ce89b 7340 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
7341 if (r < 0) {
7342 return r;
7343 }
7344
7345 ObjectWriteOperation op;
7346
7347 if (exclusive)
7348 op.create(true);
7349
7350 if (objv_tracker) {
7351 objv_tracker->prepare_op_for_write(&op);
7352 }
7353 if (ofs == -1) {
7354 op.write_full(bl);
7355 } else {
7356 op.write(ofs, bl);
7357 }
7358 r = ref.ioctx.operate(ref.oid, &op);
7359 if (r < 0)
7360 return r;
7361
7362 if (objv_tracker) {
7363 objv_tracker->apply_write();
7364 }
7365 return 0;
7366}
7367
7368/**
7369 * Write/overwrite an object to the bucket storage.
7370 * bucket: the bucket to store the object in
7371 * obj: the object name/key
7372 * data: the object contents/value
7373 * offset: the offet to write to in the object
7374 * If this is -1, we will overwrite the whole object.
7375 * size: the amount of data to write (data must be this long)
7376 * attrs: all the given attrs are written to bucket storage for the given object
7377 * Returns: 0 on success, -ERR# otherwise.
7378 */
7379
7380int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7381 off_t ofs, bool exclusive,
7382 void **handle)
7383{
7384 rgw_rados_ref ref;
7385 int r = get_raw_obj_ref(obj, &ref);
7386 if (r < 0) {
7387 return r;
7388 }
7389
7390 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7391 *handle = c;
7392
7393 ObjectWriteOperation op;
7394
7395 if (exclusive)
7396 op.create(true);
7397
7398 if (ofs == -1) {
7399 op.write_full(bl);
7400 } else {
7401 op.write(ofs, bl);
7402 }
7403 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7404 if (r < 0)
7405 return r;
7406
7407 return 0;
7408}
7409
7410int RGWRados::aio_wait(void *handle)
7411{
7412 AioCompletion *c = (AioCompletion *)handle;
7413 c->wait_for_safe();
7414 int ret = c->get_return_value();
7415 c->release();
7416 return ret;
7417}
7418
7419bool RGWRados::aio_completed(void *handle)
7420{
7421 AioCompletion *c = (AioCompletion *)handle;
7422 return c->is_safe();
7423}
7424
28e407b8
AA
7425// PutObj filter that buffers data so we don't try to compress tiny blocks.
7426// libcurl reads in 16k at a time, and we need at least 64k to get a good
7427// compression ratio
7428class RGWPutObj_Buffer : public RGWPutObj_Filter {
7429 const unsigned buffer_size;
7430 bufferlist buffer;
7431 public:
7432 RGWPutObj_Buffer(RGWPutObjDataProcessor* next, unsigned buffer_size)
7433 : RGWPutObj_Filter(next), buffer_size(buffer_size) {
7434 assert(ISP2(buffer_size)); // must be power of 2
7435 }
7436
7437 int handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj,
7438 bool *again) override {
7439 if (*again || !bl.length()) {
7440 // flush buffered data
7441 return RGWPutObj_Filter::handle_data(buffer, ofs, phandle, pobj, again);
7442 }
7443 // transform offset to the beginning of the buffer
7444 ofs = ofs - buffer.length();
7445 buffer.claim_append(bl);
7446 if (buffer.length() < buffer_size) {
7447 *again = false; // don't come back until there's more data
7448 return 0;
7449 }
7450 const auto count = P2ALIGN(buffer.length(), buffer_size);
7451 buffer.splice(0, count, &bl);
7452 return RGWPutObj_Filter::handle_data(bl, ofs, phandle, pobj, again);
7453 }
7454};
7455
7c673cae
FG
7456class RGWRadosPutObj : public RGWGetDataCB
7457{
7458 CephContext* cct;
7459 rgw_obj obj;
7460 RGWPutObjDataProcessor *filter;
7461 boost::optional<RGWPutObj_Compress>& compressor;
28e407b8 7462 boost::optional<RGWPutObj_Buffer> buffering;
7c673cae
FG
7463 CompressorRef& plugin;
7464 RGWPutObjProcessor_Atomic *processor;
7465 RGWOpStateSingleOp *opstate;
7466 void (*progress_cb)(off_t, void *);
7467 void *progress_data;
7468 bufferlist extra_data_bl;
b32b8144 7469 uint64_t extra_data_left;
7c673cae
FG
7470 uint64_t data_len;
7471 map<string, bufferlist> src_attrs;
7472public:
7473 RGWRadosPutObj(CephContext* cct,
7474 CompressorRef& plugin,
7475 boost::optional<RGWPutObj_Compress>& compressor,
7476 RGWPutObjProcessor_Atomic *p,
7477 RGWOpStateSingleOp *_ops,
7478 void (*_progress_cb)(off_t, void *),
7479 void *_progress_data) :
7480 cct(cct),
7481 filter(p),
7482 compressor(compressor),
7483 plugin(plugin),
7484 processor(p),
7485 opstate(_ops),
7486 progress_cb(_progress_cb),
7487 progress_data(_progress_data),
b32b8144 7488 extra_data_left(0),
7c673cae
FG
7489 data_len(0) {}
7490
7491 int process_attrs(void) {
7492 if (extra_data_bl.length()) {
7493 JSONParser jp;
7494 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7495 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7496 return -EIO;
7497 }
7498
7499 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7500
7501 src_attrs.erase(RGW_ATTR_COMPRESSION);
7502 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7503 }
7504
7505 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7506 //do not compress if object is encrypted
7507 compressor = boost::in_place(cct, plugin, filter);
28e407b8
AA
7508 constexpr unsigned buffer_size = 512 * 1024;
7509 buffering = boost::in_place(&*compressor, buffer_size);
7510 filter = &*buffering;
7c673cae
FG
7511 }
7512 return 0;
7513 }
7514
7515 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7516 if (progress_cb) {
7517 progress_cb(ofs, progress_data);
7518 }
b32b8144 7519 if (extra_data_left) {
7c673cae 7520 size_t extra_len = bl.length();
b32b8144
FG
7521 if (extra_len > extra_data_left)
7522 extra_len = extra_data_left;
7c673cae
FG
7523
7524 bufferlist extra;
7525 bl.splice(0, extra_len, &extra);
7526 extra_data_bl.append(extra);
7527
b32b8144
FG
7528 extra_data_left -= extra_len;
7529 if (extra_data_left == 0) {
7c673cae
FG
7530 int res = process_attrs();
7531 if (res < 0)
7532 return res;
7533 }
7534 if (bl.length() == 0) {
7535 return 0;
7536 }
b32b8144 7537 ofs += extra_len;
7c673cae 7538 }
b32b8144
FG
7539 // adjust ofs based on extra_data_len, so the result is a logical offset
7540 // into the object data
7541 assert(uint64_t(ofs) >= extra_data_len);
7542 ofs -= extra_data_len;
7543
7c673cae
FG
7544 data_len += bl.length();
7545 bool again = false;
7546
7547 bool need_opstate = true;
7548
7549 do {
7550 void *handle = NULL;
7551 rgw_raw_obj obj;
7552 uint64_t size = bl.length();
7553 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7554 if (ret < 0)
7555 return ret;
7556
7557 if (need_opstate && opstate) {
7558 /* need to update opstate repository with new state. This is ratelimited, so we're not
7559 * really doing it every time
7560 */
7561 ret = opstate->renew_state();
7562 if (ret < 0) {
7563 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7564 int r = filter->throttle_data(handle, obj, size, false);
7565 if (r < 0) {
7566 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7567 }
7568 /* could not renew state! might have been marked as cancelled */
7569 return ret;
7570 }
7571 need_opstate = false;
7572 }
7573
7574 ret = filter->throttle_data(handle, obj, size, false);
7575 if (ret < 0)
7576 return ret;
7577 } while (again);
7578
7579 return 0;
7580 }
7581
28e407b8
AA
7582 int flush() {
7583 bufferlist bl;
7584 return put_data_and_throttle(filter, bl, 0, false);
7585 }
7586
7c673cae
FG
7587 bufferlist& get_extra_data() { return extra_data_bl; }
7588
7589 map<string, bufferlist>& get_attrs() { return src_attrs; }
7590
7591 void set_extra_data_len(uint64_t len) override {
b32b8144
FG
7592 extra_data_left = len;
7593 RGWGetDataCB::set_extra_data_len(len);
7c673cae
FG
7594 }
7595
7596 uint64_t get_data_len() {
7597 return data_len;
7598 }
7599
7600 int complete(const string& etag, real_time *mtime, real_time set_mtime,
31f18b77
FG
7601 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7602 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7c673cae
FG
7603 }
7604
7605 bool is_canceled() {
7606 return processor->is_canceled();
7607 }
7608};
7609
7610/*
7611 * prepare attrset depending on attrs_mod.
7612 */
7613static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7614 map<string, bufferlist>& attrs,
7615 RGWRados::AttrsMod attrs_mod)
7616{
7617 switch (attrs_mod) {
7618 case RGWRados::ATTRSMOD_NONE:
7619 attrs = src_attrs;
7620 break;
7621 case RGWRados::ATTRSMOD_REPLACE:
7622 if (!attrs[RGW_ATTR_ETAG].length()) {
7623 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7624 }
181888fb
FG
7625 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7626 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7627 if (ttiter != src_attrs.end()) {
7628 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7629 }
7630 }
7c673cae
FG
7631 break;
7632 case RGWRados::ATTRSMOD_MERGE:
7633 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7634 if (attrs.find(it->first) == attrs.end()) {
7635 attrs[it->first] = it->second;
7636 }
7637 }
7638 break;
7639 }
7640}
7641
7642int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7643{
7644 map<string, bufferlist> attrset;
7645
7646 real_time mtime;
7647 uint64_t obj_size;
7648 RGWObjectCtx rctx(this);
7649
7650 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7651 RGWRados::Object::Read read_op(&op_target);
7652
7653 read_op.params.attrs = &attrset;
7654 read_op.params.lastmod = &mtime;
7655 read_op.params.obj_size = &obj_size;
7656
7657 int ret = read_op.prepare();
7658 if (ret < 0)
7659 return ret;
7660
7661 attrset.erase(RGW_ATTR_ID_TAG);
181888fb 7662 attrset.erase(RGW_ATTR_TAIL_TAG);
7c673cae
FG
7663
7664 uint64_t max_chunk_size;
7665
7666 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7667 if (ret < 0) {
7668 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7669 return ret;
7670 }
7671
b32b8144
FG
7672 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj,
7673 max_chunk_size, NULL, mtime, attrset,
7674 RGW_OBJ_CATEGORY_MAIN, 0, real_time(),
7675 (obj.key.instance.empty() ? NULL : &(obj.key.instance)),
7676 NULL, NULL);
7c673cae
FG
7677}
7678
7679struct obj_time_weight {
7680 real_time mtime;
7681 uint32_t zone_short_id;
7682 uint64_t pg_ver;
7683 bool high_precision;
7684
7685 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7686
7687 bool compare_low_precision(const obj_time_weight& rhs) {
7688 struct timespec l = ceph::real_clock::to_timespec(mtime);
7689 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7690 l.tv_nsec = 0;
7691 r.tv_nsec = 0;
7692 if (l > r) {
7693 return false;
7694 }
7695 if (l < r) {
7696 return true;
7697 }
7698 if (zone_short_id != rhs.zone_short_id) {
7699 return (zone_short_id < rhs.zone_short_id);
7700 }
7701 return (pg_ver < rhs.pg_ver);
7702
7703 }
7704
7705 bool operator<(const obj_time_weight& rhs) {
7706 if (!high_precision || !rhs.high_precision) {
7707 return compare_low_precision(rhs);
7708 }
7709 if (mtime > rhs.mtime) {
7710 return false;
7711 }
7712 if (mtime < rhs.mtime) {
7713 return true;
7714 }
7715 if (zone_short_id != rhs.zone_short_id) {
7716 return (zone_short_id < rhs.zone_short_id);
7717 }
7718 return (pg_ver < rhs.pg_ver);
7719 }
7720
7721 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7722 mtime = _mtime;
7723 zone_short_id = _short_id;
7724 pg_ver = _pg_ver;
7725 }
7726
7727 void init(RGWObjState *state) {
7728 mtime = state->mtime;
7729 zone_short_id = state->zone_short_id;
7730 pg_ver = state->pg_ver;
7731 }
7732};
7733
7734inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7735 out << o.mtime;
7736
7737 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7738 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7739 }
7740
7741 return out;
7742}
7743
7744class RGWGetExtraDataCB : public RGWGetDataCB {
7745 bufferlist extra_data;
7746public:
7747 RGWGetExtraDataCB() {}
7748 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7749 if (extra_data.length() < extra_data_len) {
7750 off_t max = extra_data_len - extra_data.length();
7751 if (max > bl_len) {
7752 max = bl_len;
7753 }
7754 bl.splice(0, max, &extra_data);
7755 }
7756 return bl_len;
7757 }
7758
7759 bufferlist& get_extra_data() {
7760 return extra_data;
7761 }
7762};
7763
7764int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7765 const rgw_user& user_id,
7766 const string& client_id,
7767 req_info *info,
7768 const string& source_zone,
7769 rgw_obj& src_obj,
7770 RGWBucketInfo& src_bucket_info,
7771 real_time *src_mtime,
7772 uint64_t *psize,
7773 const real_time *mod_ptr,
7774 const real_time *unmod_ptr,
7775 bool high_precision_time,
7776 const char *if_match,
7777 const char *if_nomatch,
7778 map<string, bufferlist> *pattrs,
7779 string *version_id,
7780 string *ptag,
7781 string *petag)
7782{
7783 /* source is in a different zonegroup, copy from there */
7784
7785 RGWRESTStreamRWRequest *in_stream_req;
7786 string tag;
7787 map<string, bufferlist> src_attrs;
7788 append_rand_alpha(cct, tag, tag, 32);
7789 obj_time_weight set_mtime_weight;
7790 set_mtime_weight.high_precision = high_precision_time;
7791
7792 RGWRESTConn *conn;
7793 if (source_zone.empty()) {
7794 if (src_bucket_info.zonegroup.empty()) {
7795 /* source is in the master zonegroup */
7796 conn = rest_master_conn;
7797 } else {
7798 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7799 if (iter == zonegroup_conn_map.end()) {
7800 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7801 return -ENOENT;
7802 }
7803 conn = iter->second;
7804 }
7805 } else {
7806 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7807 if (iter == zone_conn_map.end()) {
7808 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7809 return -ENOENT;
7810 }
7811 conn = iter->second;
7812 }
7813
7814 RGWGetExtraDataCB cb;
7815 string etag;
7816 map<string, string> req_headers;
7817 real_time set_mtime;
7818
7819 const real_time *pmod = mod_ptr;
7820
7821 obj_time_weight dest_mtime_weight;
7822
181888fb
FG
7823 constexpr bool prepend_meta = true;
7824 constexpr bool get_op = true;
7825 constexpr bool rgwx_stat = true;
7826 constexpr bool sync_manifest = true;
7827 constexpr bool skip_decrypt = true;
7c673cae
FG
7828 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7829 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb
FG
7830 prepend_meta, get_op, rgwx_stat,
7831 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7c673cae
FG
7832 if (ret < 0) {
7833 return ret;
7834 }
7835
7836 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7837 if (ret < 0) {
7838 return ret;
7839 }
7840
7841 bufferlist& extra_data_bl = cb.get_extra_data();
7842 if (extra_data_bl.length()) {
7843 JSONParser jp;
7844 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7845 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7846 return -EIO;
7847 }
7848
7849 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7850
7851 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7852 }
7853
7854 if (src_mtime) {
7855 *src_mtime = set_mtime;
7856 }
7857
7858 if (petag) {
7859 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7860 if (iter != src_attrs.end()) {
7861 bufferlist& etagbl = iter->second;
7862 *petag = etagbl.to_str();
7863 }
7864 }
7865
7866 if (pattrs) {
7867 *pattrs = src_attrs;
7868 }
7869
7870 return 0;
7871}
7872
7873int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7874 const rgw_user& user_id,
7875 const string& client_id,
7876 const string& op_id,
7877 bool record_op_state,
7878 req_info *info,
7879 const string& source_zone,
7880 rgw_obj& dest_obj,
7881 rgw_obj& src_obj,
7882 RGWBucketInfo& dest_bucket_info,
7883 RGWBucketInfo& src_bucket_info,
7884 real_time *src_mtime,
7885 real_time *mtime,
7886 const real_time *mod_ptr,
7887 const real_time *unmod_ptr,
7888 bool high_precision_time,
7889 const char *if_match,
7890 const char *if_nomatch,
7891 AttrsMod attrs_mod,
7892 bool copy_if_newer,
7893 map<string, bufferlist>& attrs,
7894 RGWObjCategory category,
91327a77 7895 boost::optional<uint64_t> olh_epoch,
7c673cae
FG
7896 real_time delete_at,
7897 string *version_id,
7898 string *ptag,
7899 ceph::buffer::list *petag,
7c673cae 7900 void (*progress_cb)(off_t, void *),
31f18b77
FG
7901 void *progress_data,
7902 rgw_zone_set *zones_trace)
7c673cae
FG
7903{
7904 /* source is in a different zonegroup, copy from there */
7905
7906 RGWRESTStreamRWRequest *in_stream_req;
7907 string tag;
7908 int i;
7909 append_rand_alpha(cct, tag, tag, 32);
7910 obj_time_weight set_mtime_weight;
7911 set_mtime_weight.high_precision = high_precision_time;
7912
7913 RGWPutObjProcessor_Atomic processor(obj_ctx,
7914 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7915 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7916 if (version_id && *version_id != "null") {
7917 processor.set_version_id(*version_id);
7918 }
91327a77
AA
7919 if (olh_epoch) {
7920 processor.set_olh_epoch(*olh_epoch);
7921 }
7c673cae
FG
7922 int ret = processor.prepare(this, NULL);
7923 if (ret < 0) {
7924 return ret;
7925 }
7926
7927 RGWRESTConn *conn;
7928 if (source_zone.empty()) {
7929 if (dest_bucket_info.zonegroup.empty()) {
7930 /* source is in the master zonegroup */
7931 conn = rest_master_conn;
7932 } else {
7933 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7934 if (iter == zonegroup_conn_map.end()) {
7935 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7936 return -ENOENT;
7937 }
7938 conn = iter->second;
7939 }
7940 } else {
7941 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7942 if (iter == zone_conn_map.end()) {
7943 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7944 return -ENOENT;
7945 }
7946 conn = iter->second;
7947 }
7948
7949 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7950
7951 RGWOpStateSingleOp *opstate = NULL;
7952
7953 if (record_op_state) {
7954 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7955
7956 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7957 if (ret < 0) {
7958 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7959 delete opstate;
7960 return ret;
7961 }
7962 }
7963
7964 boost::optional<RGWPutObj_Compress> compressor;
7965 CompressorRef plugin;
7966
7967 const auto& compression_type = zone_params.get_compression_type(
7968 dest_bucket_info.placement_rule);
7969 if (compression_type != "none") {
7970 plugin = Compressor::create(cct, compression_type);
7971 if (!plugin) {
7972 ldout(cct, 1) << "Cannot load plugin for compression type "
7973 << compression_type << dendl;
7974 }
7975 }
7976
7977 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7978
7979 string etag;
7980 map<string, string> req_headers;
7981 real_time set_mtime;
7982
7983 RGWObjState *dest_state = NULL;
7984
7985 const real_time *pmod = mod_ptr;
7986
7987 obj_time_weight dest_mtime_weight;
7988
7989 if (copy_if_newer) {
7990 /* need to get mtime for destination */
7991 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7992 if (ret < 0)
7993 goto set_err_state;
7994
7995 if (!real_clock::is_zero(dest_state->mtime)) {
7996 dest_mtime_weight.init(dest_state);
7997 pmod = &dest_mtime_weight.mtime;
7998 }
7999 }
8000
181888fb
FG
8001 static constexpr bool prepend_meta = true;
8002 static constexpr bool get_op = true;
8003 static constexpr bool rgwx_stat = false;
8004 static constexpr bool sync_manifest = true;
8005 static constexpr bool skip_decrypt = true;
7c673cae
FG
8006 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
8007 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb
FG
8008 prepend_meta, get_op, rgwx_stat,
8009 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7c673cae
FG
8010 if (ret < 0) {
8011 goto set_err_state;
8012 }
8013
8014 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
8015 if (ret < 0) {
8016 goto set_err_state;
8017 }
28e407b8
AA
8018 ret = cb.flush();
8019 if (ret < 0) {
8020 goto set_err_state;
8021 }
7c673cae
FG
8022 if (compressor && compressor->is_compressed()) {
8023 bufferlist tmp;
8024 RGWCompressionInfo cs_info;
8025 cs_info.compression_type = plugin->get_type_name();
8026 cs_info.orig_size = cb.get_data_len();
8027 cs_info.blocks = move(compressor->get_compression_blocks());
8028 ::encode(cs_info, tmp);
8029 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
8030 }
8031
8032 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
8033 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
8034 } else {
8035 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
8036 if (iter != cb.get_attrs().end()) {
8037 try {
8038 ::decode(delete_at, iter->second);
8039 } catch (buffer::error& err) {
8040 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
8041 }
8042 }
8043 }
8044
8045 if (src_mtime) {
8046 *src_mtime = set_mtime;
8047 }
8048
8049 if (petag) {
8050 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
8051 if (iter != cb.get_attrs().end()) {
8052 *petag = iter->second;
8053 }
8054 }
8055
8056 if (source_zone.empty()) {
8057 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
8058 } else {
8059 attrs = cb.get_attrs();
8060 }
8061
8062 if (copy_if_newer) {
8063 uint64_t pg_ver = 0;
8064 auto i = attrs.find(RGW_ATTR_PG_VER);
8065 if (i != attrs.end() && i->second.length() > 0) {
8066 bufferlist::iterator iter = i->second.begin();
8067 try {
8068 ::decode(pg_ver, iter);
8069 } catch (buffer::error& err) {
8070 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
8071 /* non critical error */
8072 }
8073 }
8074 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
8075 }
8076
8077#define MAX_COMPLETE_RETRY 100
8078 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
31f18b77 8079 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7c673cae
FG
8080 if (ret < 0) {
8081 goto set_err_state;
8082 }
8083 if (copy_if_newer && cb.is_canceled()) {
8084 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
8085 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
8086 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
8087 if (ret < 0) {
8088 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
8089 goto set_err_state;
8090 }
8091 dest_mtime_weight.init(dest_state);
8092 dest_mtime_weight.high_precision = high_precision_time;
8093 if (!dest_state->exists ||
8094 dest_mtime_weight < set_mtime_weight) {
8095 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8096 continue;
8097 } else {
8098 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8099 }
8100 }
8101 break;
8102 }
8103
8104 if (i == MAX_COMPLETE_RETRY) {
8105 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
8106 ret = -EIO;
8107 goto set_err_state;
8108 }
8109
8110 if (opstate) {
8111 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
8112 if (ret < 0) {
8113 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
8114 }
8115 delete opstate;
8116 }
8117
8118 return 0;
8119set_err_state:
8120 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
8121 // we may have already fetched during sync of OP_ADD, but were waiting
8122 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
8123 if (olh_epoch && *olh_epoch > 0) {
8124 constexpr bool log_data_change = true;
8125 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
8126 *olh_epoch, real_time(), false, zones_trace, log_data_change);
8127 } else {
8128 // we already have the latest copy
8129 ret = 0;
8130 }
7c673cae
FG
8131 }
8132 if (opstate) {
8133 RGWOpState::OpState state;
8134 if (ret < 0) {
8135 state = RGWOpState::OPSTATE_ERROR;
8136 } else {
8137 state = RGWOpState::OPSTATE_COMPLETE;
8138 }
8139 int r = opstate->set_state(state);
8140 if (r < 0) {
8141 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
8142 }
8143 delete opstate;
8144 }
8145 return ret;
8146}
8147
8148
8149int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
8150 map<string, bufferlist>& src_attrs,
8151 RGWRados::Object::Read& read_op,
8152 const rgw_user& user_id,
8153 rgw_obj& dest_obj,
8154 real_time *mtime)
8155{
8156 string etag;
8157
8158 RGWRESTStreamWriteRequest *out_stream_req;
8159
8160 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
8161 if (ret < 0) {
7c673cae
FG
8162 return ret;
8163 }
8164
8165 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
224ce89b
WB
8166 if (ret < 0) {
8167 delete out_stream_req;
7c673cae 8168 return ret;
224ce89b 8169 }
7c673cae
FG
8170
8171 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
8172 if (ret < 0)
8173 return ret;
8174
8175 return 0;
8176}
8177
8178/**
8179 * Copy an object.
8180 * dest_obj: the object to copy into
8181 * src_obj: the object to copy from
8182 * attrs: usage depends on attrs_mod parameter
8183 * attrs_mod: the modification mode of the attrs, may have the following values:
8184 * ATTRSMOD_NONE - the attributes of the source object will be
8185 * copied without modifications, attrs parameter is ignored;
8186 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
8187 * parameter, source object attributes are not copied;
8188 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
8189 * are overwritten by values contained in attrs parameter.
8190 * err: stores any errors resulting from the get of the original object
8191 * Returns: 0 on success, -ERR# otherwise.
8192 */
8193int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
8194 const rgw_user& user_id,
8195 const string& client_id,
8196 const string& op_id,
8197 req_info *info,
8198 const string& source_zone,
8199 rgw_obj& dest_obj,
8200 rgw_obj& src_obj,
8201 RGWBucketInfo& dest_bucket_info,
8202 RGWBucketInfo& src_bucket_info,
8203 real_time *src_mtime,
8204 real_time *mtime,
8205 const real_time *mod_ptr,
8206 const real_time *unmod_ptr,
8207 bool high_precision_time,
8208 const char *if_match,
8209 const char *if_nomatch,
8210 AttrsMod attrs_mod,
8211 bool copy_if_newer,
8212 map<string, bufferlist>& attrs,
8213 RGWObjCategory category,
8214 uint64_t olh_epoch,
8215 real_time delete_at,
8216 string *version_id,
8217 string *ptag,
8218 ceph::buffer::list *petag,
7c673cae
FG
8219 void (*progress_cb)(off_t, void *),
8220 void *progress_data)
8221{
8222 int ret;
8223 uint64_t obj_size;
8224 rgw_obj shadow_obj = dest_obj;
8225 string shadow_oid;
8226
8227 bool remote_src;
8228 bool remote_dest;
8229
8230 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
8231 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
8232
8233 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
8234 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
8235
8236 if (remote_src && remote_dest) {
8237 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
8238 return -EINVAL;
8239 }
8240
8241 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
8242
8243 if (remote_src || !source_zone.empty()) {
8244 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
8245 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
8246 unmod_ptr, high_precision_time,
8247 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
31f18b77 8248 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
7c673cae
FG
8249 }
8250
8251 map<string, bufferlist> src_attrs;
8252 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8253 RGWRados::Object::Read read_op(&src_op_target);
8254
8255 read_op.conds.mod_ptr = mod_ptr;
8256 read_op.conds.unmod_ptr = unmod_ptr;
8257 read_op.conds.high_precision_time = high_precision_time;
8258 read_op.conds.if_match = if_match;
8259 read_op.conds.if_nomatch = if_nomatch;
8260 read_op.params.attrs = &src_attrs;
8261 read_op.params.lastmod = src_mtime;
8262 read_op.params.obj_size = &obj_size;
7c673cae
FG
8263
8264 ret = read_op.prepare();
8265 if (ret < 0) {
8266 return ret;
8267 }
94b18763
FG
8268 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
8269 // Current implementation does not follow S3 spec and even
8270 // may result in data corruption silently when copying
8271 // multipart objects acorss pools. So reject COPY operations
8272 //on encrypted objects before it is fully functional.
8273 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
8274 << " has not been implemented." << dendl;
8275 return -ERR_NOT_IMPLEMENTED;
8276 }
7c673cae
FG
8277
8278 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8279 src_attrs.erase(RGW_ATTR_DELETE_AT);
8280
8281 set_copy_attrs(src_attrs, attrs, attrs_mod);
8282 attrs.erase(RGW_ATTR_ID_TAG);
8283 attrs.erase(RGW_ATTR_PG_VER);
8284 attrs.erase(RGW_ATTR_SOURCE_ZONE);
8285 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8286 if (cmp != src_attrs.end())
8287 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8288
8289 RGWObjManifest manifest;
8290 RGWObjState *astate = NULL;
8291
8292 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8293 if (ret < 0) {
8294 return ret;
8295 }
8296
8297 vector<rgw_raw_obj> ref_objs;
8298
8299 if (remote_dest) {
8300 /* dest is in a different zonegroup, copy it there */
8301 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8302 }
8303 uint64_t max_chunk_size;
8304
8305 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8306 if (ret < 0) {
8307 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8308 return ret;
8309 }
8310
8311 rgw_pool src_pool;
8312 rgw_pool dest_pool;
8313 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8314 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8315 return -EIO;
8316 }
8317 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8318 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8319 return -EIO;
8320 }
8321
8322
8323 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8324 bool copy_first = false;
8325 if (astate->has_manifest) {
8326 if (!astate->manifest.has_tail()) {
8327 copy_data = true;
8328 } else {
8329 uint64_t head_size = astate->manifest.get_head_size();
8330
8331 if (head_size > 0) {
8332 if (head_size > max_chunk_size) {
8333 copy_data = true;
8334 } else {
8335 copy_first = true;
8336 }
8337 }
8338 }
8339 }
8340
8341 if (petag) {
8342 const auto iter = attrs.find(RGW_ATTR_ETAG);
8343 if (iter != attrs.end()) {
8344 *petag = iter->second;
8345 }
8346 }
8347
8348 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8349 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8350 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
31f18b77 8351 version_id, ptag, petag);
7c673cae
FG
8352 }
8353
8354 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8355
8356 if (copy_first) { // we need to copy first chunk, not increase refcount
8357 ++miter;
8358 }
8359
8360 rgw_rados_ref ref;
8361 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8362 if (ret < 0) {
8363 return ret;
8364 }
8365
8366 bool versioned_dest = dest_bucket_info.versioning_enabled();
8367
8368 if (version_id && !version_id->empty()) {
8369 versioned_dest = true;
8370 dest_obj.key.set_instance(*version_id);
8371 } else if (versioned_dest) {
8372 gen_rand_obj_instance_name(&dest_obj);
8373 }
8374
8375 bufferlist first_chunk;
8376
8377 bool copy_itself = (dest_obj == src_obj);
8378 RGWObjManifest *pmanifest;
31f18b77 8379 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae
FG
8380
8381 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8382 RGWRados::Object::Write write_op(&dest_op_target);
8383
8384 string tag;
8385
8386 if (ptag) {
8387 tag = *ptag;
8388 }
8389
8390 if (tag.empty()) {
8391 append_rand_alpha(cct, tag, tag, 32);
8392 }
8393
8394 if (!copy_itself) {
181888fb 8395 attrs.erase(RGW_ATTR_TAIL_TAG);
7c673cae
FG
8396 manifest = astate->manifest;
8397 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8398 if (tail_placement.bucket.name.empty()) {
8399 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8400 }
3efd9988 8401 string ref_tag;
7c673cae
FG
8402 for (; miter != astate->manifest.obj_end(); ++miter) {
8403 ObjectWriteOperation op;
3efd9988
FG
8404 ref_tag = tag + '\0';
8405 cls_refcount_get(op, ref_tag, true);
7c673cae
FG
8406 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8407 ref.ioctx.locator_set_key(loc.loc);
8408
8409 ret = ref.ioctx.operate(loc.oid, &op);
8410 if (ret < 0) {
8411 goto done_ret;
8412 }
8413
8414 ref_objs.push_back(loc);
8415 }
8416
8417 pmanifest = &manifest;
8418 } else {
8419 pmanifest = &astate->manifest;
8420 /* don't send the object's tail for garbage collection */
8421 astate->keep_tail = true;
8422 }
8423
8424 if (copy_first) {
8425 ret = read_op.read(0, max_chunk_size, first_chunk);
8426 if (ret < 0) {
8427 goto done_ret;
8428 }
8429
8430 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8431 } else {
8432 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8433 }
8434
8435 write_op.meta.data = &first_chunk;
8436 write_op.meta.manifest = pmanifest;
8437 write_op.meta.ptag = &tag;
8438 write_op.meta.owner = dest_bucket_info.owner;
8439 write_op.meta.mtime = mtime;
8440 write_op.meta.flags = PUT_OBJ_CREATE;
8441 write_op.meta.category = category;
8442 write_op.meta.olh_epoch = olh_epoch;
8443 write_op.meta.delete_at = delete_at;
181888fb 8444 write_op.meta.modify_tail = !copy_itself;
7c673cae
FG
8445
8446 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8447 if (ret < 0) {
8448 goto done_ret;
8449 }
8450
8451 return 0;
8452
8453done_ret:
8454 if (!copy_itself) {
8455 vector<rgw_raw_obj>::iterator riter;
8456
7c673cae
FG
8457 /* rollback reference */
8458 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8459 ObjectWriteOperation op;
8460 cls_refcount_put(op, tag, true);
8461
8462 ref.ioctx.locator_set_key(riter->loc);
8463
8464 int r = ref.ioctx.operate(riter->oid, &op);
8465 if (r < 0) {
8466 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8467 }
8468 }
8469 }
8470 return ret;
8471}
8472
8473
8474int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8475 RGWBucketInfo& dest_bucket_info,
8476 RGWRados::Object::Read& read_op, off_t end,
8477 rgw_obj& dest_obj,
8478 rgw_obj& src_obj,
8479 uint64_t max_chunk_size,
8480 real_time *mtime,
8481 real_time set_mtime,
8482 map<string, bufferlist>& attrs,
8483 RGWObjCategory category,
8484 uint64_t olh_epoch,
8485 real_time delete_at,
8486 string *version_id,
8487 string *ptag,
31f18b77 8488 ceph::buffer::list *petag)
7c673cae
FG
8489{
8490 bufferlist first_chunk;
8491 RGWObjManifest manifest;
8492
8493 string tag;
8494 append_rand_alpha(cct, tag, tag, 32);
8495
8496 RGWPutObjProcessor_Atomic processor(obj_ctx,
b32b8144 8497 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7c673cae
FG
8498 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8499 if (version_id) {
8500 processor.set_version_id(*version_id);
8501 }
8502 processor.set_olh_epoch(olh_epoch);
8503 int ret = processor.prepare(this, NULL);
8504 if (ret < 0)
8505 return ret;
8506
8507 off_t ofs = 0;
8508
8509 do {
8510 bufferlist bl;
8511 ret = read_op.read(ofs, end, bl);
8512
8513 uint64_t read_len = ret;
8514 bool again;
8515
8516 do {
8517 void *handle;
8518 rgw_raw_obj obj;
8519
8520 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8521 if (ret < 0) {
8522 return ret;
8523 }
8524 ret = processor.throttle_data(handle, obj, read_len, false);
8525 if (ret < 0)
8526 return ret;
8527 } while (again);
8528
8529 ofs += read_len;
8530 } while (ofs <= end);
8531
8532 string etag;
8533 auto iter = attrs.find(RGW_ATTR_ETAG);
8534 if (iter != attrs.end()) {
8535 bufferlist& bl = iter->second;
8536 etag = string(bl.c_str(), bl.length());
8537 if (petag) {
8538 *petag = bl;
8539 }
8540 }
8541
8542 uint64_t accounted_size;
8543 {
8544 bool compressed{false};
8545 RGWCompressionInfo cs_info;
8546 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8547 if (ret < 0) {
8548 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8549 return ret;
8550 }
8551 // pass original size if compressed
8552 accounted_size = compressed ? cs_info.orig_size : ofs;
8553 }
8554
8555 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8556}
8557
8558bool RGWRados::is_meta_master()
8559{
31f18b77 8560 if (!get_zonegroup().is_master_zonegroup()) {
7c673cae
FG
8561 return false;
8562 }
8563
8564 return (get_zonegroup().master_zone == zone_public_config.id);
8565}
8566
8567/**
8568 * Check to see if the bucket metadata could be synced
8569 * bucket: the bucket to check
8570 * Returns false is the bucket is not synced
8571 */
8572bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8573{
8574
8575 /* no current period */
8576 if (current_period.get_id().empty()) {
8577 return false;
8578 }
8579
8580 /* zonegroup is not master zonegroup */
31f18b77 8581 if (!get_zonegroup().is_master_zonegroup()) {
7c673cae
FG
8582 return false;
8583 }
8584
8585 /* single zonegroup and a single zone */
224ce89b 8586 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
7c673cae
FG
8587 return false;
8588 }
8589
8590 /* zone is not master */
8591 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8592 return false;
8593 }
8594
8595 return true;
8596}
8597
8598int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8599{
1adf2230 8600 std::vector<rgw_bucket_dir_entry> ent_list;
7c673cae
FG
8601 rgw_obj_index_key marker;
8602 string prefix;
8603 bool is_truncated;
8604
8605 do {
1adf2230
AA
8606 constexpr uint NUM_ENTRIES = 1000u;
8607 int r = cls_bucket_list_unordered(bucket_info,
8608 RGW_NO_SHARD,
8609 marker,
8610 prefix,
8611 NUM_ENTRIES,
8612 true,
8613 ent_list,
8614 &is_truncated,
8615 &marker);
7c673cae
FG
8616 if (r < 0)
8617 return r;
8618
8619 string ns;
1adf2230 8620 for (auto const& dirent : ent_list) {
7c673cae
FG
8621 rgw_obj_key obj;
8622
1adf2230 8623 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
7c673cae
FG
8624 return -ENOTEMPTY;
8625 }
8626 } while (is_truncated);
1adf2230 8627
7c673cae
FG
8628 return 0;
8629}
8630
8631/**
8632 * Delete a bucket.
8633 * bucket: the name of the bucket to delete
8634 * Returns 0 on success, -ERR# otherwise.
8635 */
8636int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8637{
8638 const rgw_bucket& bucket = bucket_info.bucket;
8639 librados::IoCtx index_ctx;
8640 map<int, string> bucket_objs;
8641 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8642 if (r < 0)
8643 return r;
8644
8645 if (check_empty) {
8646 r = check_bucket_empty(bucket_info);
8647 if (r < 0) {
8648 return r;
8649 }
8650 }
8651
8652 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8653 if (r < 0)
8654 return r;
8655
8656 /* if the bucket is not synced we can remove the meta file */
8657 if (!is_syncing_bucket_meta(bucket)) {
8658 RGWObjVersionTracker objv_tracker;
8659 string entry = bucket.get_key();
8660 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8661 if (r < 0) {
8662 return r;
8663 }
8664 /* remove bucket index objects*/
8665 map<int, string>::const_iterator biter;
8666 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8667 index_ctx.remove(biter->second);
8668 }
8669 }
8670 return 0;
8671}
8672
8673int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8674{
8675 RGWBucketInfo info;
8676 map<string, bufferlist> attrs;
8677 RGWObjectCtx obj_ctx(this);
31f18b77
FG
8678 int r;
8679 if (bucket.bucket_id.empty()) {
8680 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8681 } else {
8682 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8683 }
7c673cae
FG
8684 if (r < 0) {
8685 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8686 return r;
8687 }
8688
8689 info.owner = owner.get_id();
8690
8691 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8692 if (r < 0) {
8693 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8694 return r;
8695 }
8696
8697 return 0;
8698}
8699
8700
8701int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8702{
8703 int ret = 0;
8704
8705 vector<rgw_bucket>::iterator iter;
8706
8707 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8708 rgw_bucket& bucket = *iter;
8709 if (enabled)
8710 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8711 else
8712 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8713
8714 RGWBucketInfo info;
8715 map<string, bufferlist> attrs;
8716 RGWObjectCtx obj_ctx(this);
8717 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8718 if (r < 0) {
8719 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8720 ret = r;
8721 continue;
8722 }
8723 if (enabled) {
8724 info.flags &= ~BUCKET_SUSPENDED;
8725 } else {
8726 info.flags |= BUCKET_SUSPENDED;
8727 }
8728
8729 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8730 if (r < 0) {
8731 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8732 ret = r;
8733 continue;
8734 }
8735 }
8736 return ret;
8737}
8738
8739int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8740{
8741 RGWBucketInfo bucket_info;
8742 RGWObjectCtx obj_ctx(this);
8743 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8744 if (ret < 0) {
8745 return ret;
8746 }
8747
8748 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8749 return 0;
8750}
8751
8752int RGWRados::Object::complete_atomic_modification()
8753{
8754 if (!state->has_manifest || state->keep_tail)
8755 return 0;
8756
8757 cls_rgw_obj_chain chain;
8758 store->update_gc_chain(obj, state->manifest, &chain);
8759
8760 if (chain.empty()) {
8761 return 0;
8762 }
8763
181888fb 8764 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
7c673cae
FG
8765 return store->gc->send_chain(chain, tag, false); // do it async
8766}
8767
8768void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8769{
8770 RGWObjManifest::obj_iterator iter;
8771 rgw_raw_obj raw_head;
8772 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8773 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8774 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8775 if (mobj == raw_head)
8776 continue;
8777 cls_rgw_obj_key key(mobj.oid);
8778 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8779 }
8780}
8781
8782int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8783{
8784 return gc->send_chain(chain, tag, sync);
8785}
8786
1adf2230
AA
8787int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8788 librados::IoCtx& index_ctx,
8789 string& bucket_oid)
7c673cae
FG
8790{
8791 const rgw_bucket& bucket = bucket_info.bucket;
8792 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8793 if (r < 0)
8794 return r;
8795
8796 if (bucket.bucket_id.empty()) {
8797 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8798 return -EIO;
8799 }
8800
8801 bucket_oid = dir_oid_prefix;
8802 bucket_oid.append(bucket.bucket_id);
8803
8804 return 0;
8805}
8806
1adf2230
AA
8807int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
8808 librados::IoCtx& index_ctx,
8809 string& bucket_oid_base) {
7c673cae
FG
8810 const rgw_bucket& bucket = bucket_info.bucket;
8811 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8812 if (r < 0)
8813 return r;
8814
8815 if (bucket.bucket_id.empty()) {
8816 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8817 return -EIO;
8818 }
8819
8820 bucket_oid_base = dir_oid_prefix;
8821 bucket_oid_base.append(bucket.bucket_id);
8822
8823 return 0;
8824
8825}
8826
1adf2230
AA
8827int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8828 librados::IoCtx& index_ctx,
8829 map<int, string>& bucket_objs,
8830 int shard_id,
8831 map<int, string> *bucket_instance_ids) {
7c673cae
FG
8832 string bucket_oid_base;
8833 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8834 if (ret < 0) {
8835 return ret;
8836 }
8837
8838 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8839 if (bucket_instance_ids) {
8840 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8841 }
8842 return 0;
8843}
8844
8845template<typename T>
8846int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8847 map<int, string>& oids, map<int, T>& bucket_objs,
8848 int shard_id, map<int, string> *bucket_instance_ids)
8849{
8850 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8851 if (ret < 0)
8852 return ret;
8853
8854 map<int, string>::const_iterator iter = oids.begin();
8855 for (; iter != oids.end(); ++iter) {
8856 bucket_objs[iter->first] = T();
8857 }
8858 return 0;
8859}
8860
8861int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8862 const string& obj_key, string *bucket_obj, int *shard_id)
8863{
8864 string bucket_oid_base;
8865 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8866 if (ret < 0)
8867 return ret;
8868
8869 RGWObjectCtx obj_ctx(this);
8870
8871 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8872 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8873 if (ret < 0) {
8874 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8875 return ret;
8876 }
8877 return 0;
8878}
8879
8880int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8881 int shard_id, string *bucket_obj)
8882{
8883 string bucket_oid_base;
8884 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8885 if (ret < 0)
8886 return ret;
8887
8888 RGWObjectCtx obj_ctx(this);
8889
8890 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8891 shard_id, bucket_obj);
8892 return 0;
8893}
8894
8895static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8896 map<RGWObjCategory, RGWStorageStats>& stats)
8897{
8898 for (const auto& pair : header.stats) {
8899 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8900 const rgw_bucket_category_stats& header_stats = pair.second;
8901
8902 RGWStorageStats& s = stats[category];
8903
8904 s.category = category;
8905 s.size += header_stats.total_size;
8906 s.size_rounded += header_stats.total_size_rounded;
8907 s.size_utilized += header_stats.actual_size;
8908 s.num_objects += header_stats.num_entries;
8909 }
8910}
8911
8912int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8913 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8914 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8915{
8916 librados::IoCtx index_ctx;
8917 // key - bucket index object id
8918 // value - bucket index check OP returned result with the given bucket index object (shard)
8919 map<int, string> oids;
8920 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
31f18b77 8921
7c673cae 8922 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
31f18b77
FG
8923 if (ret < 0) {
8924 return ret;
8925 }
7c673cae
FG
8926
8927 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77
FG
8928 if (ret < 0) {
8929 return ret;
8930 }
7c673cae
FG
8931
8932 // Aggregate results (from different shards if there is any)
8933 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8934 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8935 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8936 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8937 }
8938
8939 return 0;
8940}
8941
8942int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8943{
8944 librados::IoCtx index_ctx;
8945 map<int, string> bucket_objs;
31f18b77 8946
7c673cae 8947 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
31f18b77 8948 if (r < 0) {
7c673cae 8949 return r;
31f18b77 8950 }
7c673cae
FG
8951
8952 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8953}
8954
31f18b77
FG
8955int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8956{
8957 librados::IoCtx index_ctx;
8958 map<int, string> bucket_objs;
8959
8960 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8961 if (r < 0) {
8962 return r;
8963 }
8964
8965 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8966}
7c673cae
FG
8967
8968int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8969{
8970 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8971 std::string oid, key;
8972 get_obj_bucket_and_oid_loc(obj, oid, key);
8973 if (!rctx)
8974 return 0;
8975
8976 RGWObjState *state = NULL;
8977
8978 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8979 if (r < 0)
8980 return r;
8981
8982 if (!state->is_atomic) {
8983 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8984 return -EINVAL;
8985 }
8986
181888fb
FG
8987 string tag;
8988
8989 if (state->tail_tag.length() > 0) {
8990 tag = state->tail_tag.c_str();
8991 } else if (state->obj_tag.length() > 0) {
8992 tag = state->obj_tag.c_str();
8993 } else {
7c673cae
FG
8994 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8995 return -EINVAL;
8996 }
8997
7c673cae
FG
8998 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8999
9000 return gc->defer_chain(tag, false);
9001}
9002
9003void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
9004{
9005 list<string> prefixes;
9006 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
9007 cls_rgw_remove_obj(op, prefixes);
9008}
9009
9010void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
9011{
9012 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
9013}
9014
9015void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
9016{
9017 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
9018}
9019
9020
9021/**
9022 * Delete an object.
9023 * bucket: name of the bucket storing the object
9024 * obj: name of the object to delete
9025 * Returns: 0 on success, -ERR# otherwise.
9026 */
9027int RGWRados::Object::Delete::delete_obj()
9028{
// Delete the object held by `target`.
//
// Versioned path (params.versioning_status has BUCKET_VERSIONED, or
// params.marker_version_id is set): either records a delete marker through
// set_olh() or unlinks one specific instance, then writes a data-log entry
// when the bucket has datasync enabled, and returns.
//
// Plain path: checks the unmod_since / expiration_time preconditions, removes
// the head object, completes (or cancels) the bucket index op and updates the
// quota cache.
//
// Returns 0 on success or a negative error code.
9029 RGWRados *store = target->get_store();
9030 rgw_obj& src_obj = target->get_obj();
9031 const string& instance = src_obj.key.instance;
9032 rgw_obj obj = src_obj;
9033
// the literal instance "null" is handled as "no instance": the working copy
// gets its instance cleared, the target's own object is left as is
9034 if (instance == "null") {
9035 obj.key.instance.clear();
9036 }
9037
9038 bool explicit_marker_version = (!params.marker_version_id.empty());
9039
9040 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
// no instance requested, or the caller dictates the marker's version id:
// create a delete marker
9041 if (instance.empty() || explicit_marker_version) {
9042 rgw_obj marker = obj;
9043
9044 if (!params.marker_version_id.empty()) {
9045 if (params.marker_version_id != "null") {
9046 marker.key.set_instance(params.marker_version_id);
9047 }
// a random instance name is generated only when the
// BUCKET_VERSIONS_SUSPENDED bit is clear
9048 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
9049 store->gen_rand_obj_instance_name(&marker);
9050 }
9051
9052 result.version_id = marker.key.instance;
91327a77
AA
// an empty instance is reported to the caller as "null"
9053 if (result.version_id.empty())
9054 result.version_id = "null";
7c673cae
FG
9055 result.delete_marker = true;
9056
9057 struct rgw_bucket_dir_entry_meta meta;
9058
9059 meta.owner = params.obj_owner.get_id().to_str();
9060 meta.owner_display_name = params.obj_owner.get_display_name();
9061
// a zero params.mtime means "use the current time"
9062 if (real_clock::is_zero(params.mtime)) {
9063 meta.mtime = real_clock::now();
9064 } else {
9065 meta.mtime = params.mtime;
9066 }
9067
31f18b77 9068 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
7c673cae
FG
9069 if (r < 0) {
9070 return r;
9071 }
9072 } else {
// an explicit instance was requested: read its index entry (to report
// whether it was a delete marker), then unlink that instance
9073 rgw_bucket_dir_entry dirent;
9074
9075 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
9076 if (r < 0) {
9077 return r;
9078 }
9079 result.delete_marker = dirent.is_delete_marker();
31f18b77 9080 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
7c673cae
FG
9081 if (r < 0) {
9082 return r;
9083 }
9084 result.version_id = instance;
9085 }
9086
9087 BucketShard *bs;
9088 int r = target->get_bucket_shard(&bs);
9089 if (r < 0) {
9090 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
9091 return r;
9092 }
9093
c07f9fc5
FG
// record the change in the data log, only when datasync is enabled for the bucket
9094 if (target->bucket_info.datasync_flag_enabled()) {
9095 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9096 if (r < 0) {
9097 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9098 return r;
9099 }
7c673cae
FG
9100 }
9101
9102 return 0;
9103 }
9104
// ---- plain (non-versioned) delete path ----
9105 rgw_rados_ref ref;
9106 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
9107 if (r < 0) {
9108 return r;
9109 }
9110
9111 RGWObjState *state;
9112 r = target->get_state(&state, false);
9113 if (r < 0)
9114 return r;
9115
9116 ObjectWriteOperation op;
9117
9118 if (!real_clock::is_zero(params.unmod_since)) {
9119 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
9120 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
// drop the nanoseconds from both sides unless high precision was requested
9121 if (!params.high_precision_time) {
9122 ctime.tv_nsec = 0;
9123 unmod.tv_nsec = 0;
9124 }
9125
9126 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
9127 if (ctime > unmod) {
9128 return -ERR_PRECONDITION_FAILED;
9129 }
9130
9131 /* only delete object if mtime is less than or equal to params.unmod_since */
9132 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
9133 }
9134 uint64_t obj_size = state->size;
9135
// when expiration_time is set, the stored RGW_ATTR_DELETE_AT must decode and
// equal it; a missing attr fails the precondition as well
9136 if (!real_clock::is_zero(params.expiration_time)) {
9137 bufferlist bl;
9138 real_time delete_at;
9139
9140 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
9141 try {
9142 bufferlist::iterator iter = bl.begin();
9143 ::decode(delete_at, iter);
9144 } catch (buffer::error& err) {
9145 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
9146 return -EIO;
9147 }
9148
9149 if (params.expiration_time != delete_at) {
9150 return -ERR_PRECONDITION_FAILED;
9151 }
9152 } else {
9153 return -ERR_PRECONDITION_FAILED;
9154 }
9155 }
9156
9157 if (!state->exists) {
9158 target->invalidate_state();
9159 return -ENOENT;
9160 }
9161
181888fb 9162 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
7c673cae
FG
9163 if (r < 0)
9164 return r;
9165
9166 RGWBucketInfo& bucket_info = target->get_bucket_info();
9167
9168 RGWRados::Bucket bop(store, bucket_info);
9169 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
9170
9171 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
9172 index_op.set_bilog_flags(params.bilog_flags);
9173
7c673cae
FG
9174 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
9175 if (r < 0)
9176 return r;
9177
9178 store->remove_rgw_head_obj(op);
9179 r = ref.ioctx.operate(ref.oid, &op);
94b18763
FG
9180
9181 /* raced with another operation, object state is indeterminate */
9182 const bool need_invalidate = (r == -ECANCELED);
7c673cae
FG
9183
9184 int64_t poolid = ref.ioctx.get_id();
// head removal succeeded: remember a tombstone (if the cache exists), finish
// the index op (its result becomes r) and run complete_atomic_modification(),
// whose failure is only logged
9185 if (r >= 0) {
9186 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
9187 if (obj_tombstone_cache) {
9188 tombstone_entry entry{*state};
9189 obj_tombstone_cache->add(obj, entry);
9190 }
9191 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 9192
7c673cae
FG
9193 int ret = target->complete_atomic_modification();
9194 if (ret < 0) {
9195 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
9196 }
9197 /* other than that, no need to propagate error */
224ce89b
WB
9198 } else {
// head removal failed: cancel the prepared index op; r keeps the original error
9199 int ret = index_op.cancel();
9200 if (ret < 0) {
9201 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
9202 }
7c673cae
FG
9203 }
9204
9205 if (need_invalidate) {
9206 target->invalidate_state();
9207 }
9208
9209 if (r < 0)
9210 return r;
9211
9212 /* update quota cache */
9213 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
9214
9215 return 0;
9216}
9217
9218int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
9219 const RGWBucketInfo& bucket_info,
9220 const rgw_obj& obj,
9221 int versioning_status,
9222 uint16_t bilog_flags,
31f18b77
FG
9223 const real_time& expiration_time,
9224 rgw_zone_set *zones_trace)
7c673cae
FG
9225{
9226 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
9227 RGWRados::Object::Delete del_op(&del_target);
9228
9229 del_op.params.bucket_owner = bucket_info.owner;
9230 del_op.params.versioning_status = versioning_status;
9231 del_op.params.bilog_flags = bilog_flags;
9232 del_op.params.expiration_time = expiration_time;
31f18b77 9233 del_op.params.zones_trace = zones_trace;
7c673cae
FG
9234
9235 return del_op.delete_obj();
9236}
9237
9238int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
9239{
9240 rgw_rados_ref ref;
224ce89b 9241 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9242 if (r < 0) {
9243 return r;
9244 }
9245
9246 ObjectWriteOperation op;
9247
9248 op.remove();
9249 r = ref.ioctx.operate(ref.oid, &op);
9250 if (r < 0)
9251 return r;
9252
9253 return 0;
9254}
9255
9256int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
9257{
9258 if (obj.empty()) {
9259 ldout(cct, 1) << "delete_system_obj got empty object name "
9260 << obj << ", returning EINVAL" << dendl;
9261 return -EINVAL;
9262 }
9263 rgw_rados_ref ref;
224ce89b 9264 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9265 if (r < 0) {
9266 return r;
9267 }
9268
9269 ObjectWriteOperation op;
9270
9271 if (objv_tracker) {
9272 objv_tracker->prepare_op_for_write(&op);
9273 }
9274
9275 op.remove();
9276 r = ref.ioctx.operate(ref.oid, &op);
9277 if (r < 0)
9278 return r;
9279
9280 return 0;
9281}
9282
9283int RGWRados::delete_obj_index(const rgw_obj& obj)
9284{
9285 std::string oid, key;
9286 get_obj_bucket_and_oid_loc(obj, oid, key);
9287
9288 RGWObjectCtx obj_ctx(this);
9289
9290 RGWBucketInfo bucket_info;
9291 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9292 if (ret < 0) {
9293 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9294 return ret;
9295 }
9296
9297 RGWRados::Bucket bop(this, bucket_info);
9298 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9299
9300 real_time removed_mtime;
9301 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9302
9303 return r;
9304}
9305
9306static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9307{
9308 string tag;
9309
9310 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9311 if (mi != manifest.obj_end()) {
9312 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9313 ++mi;
9314 tag = mi.get_location().get_raw_obj(store).oid;
9315 tag.append("_");
9316 }
9317
9318 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9319 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9320 MD5 hash;
9321 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9322
9323 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9324 if (iter != attrset.end()) {
9325 bufferlist& bl = iter->second;
9326 hash.Update((const byte *)bl.c_str(), bl.length());
9327 }
9328
9329 hash.Final(md5);
9330 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9331 tag.append(md5_str);
9332
9333 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9334
9335 tag_bl.append(tag.c_str(), tag.size() + 1);
9336}
9337
9338static bool is_olh(map<string, bufferlist>& attrs)
9339{
9340 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9341 return (iter != attrs.end());
9342}
9343
9344static bool has_olh_tag(map<string, bufferlist>& attrs)
9345{
9346 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9347 return (iter != attrs.end());
9348}
9349
9350int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9351 RGWObjState *olh_state, RGWObjState **target_state)
9352{
9353 assert(olh_state->is_olh);
9354
9355 rgw_obj target;
9356 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9357 if (r < 0) {
9358 return r;
9359 }
9360 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9361 if (r < 0) {
9362 return r;
9363 }
9364
9365 return 0;
9366}
9367
9368int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9369{
9370 if (obj.empty()) {
9371 return -EINVAL;
9372 }
9373
9374 RGWRawObjState *s = rctx->raw.get_state(obj);
9375 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9376 *state = s;
9377 if (s->has_attrs) {
9378 return 0;
9379 }
9380
9381 s->obj = obj;
9382
9383 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9384 if (r == -ENOENT) {
9385 s->exists = false;
9386 s->has_attrs = true;
9387 s->mtime = real_time();
9388 return 0;
9389 }
9390 if (r < 0)
9391 return r;
9392
9393 s->exists = true;
9394 s->has_attrs = true;
9395 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9396
9397 if (s->obj_tag.length())
31f18b77
FG
9398 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9399 << s->obj_tag.c_str() << dendl;
7c673cae
FG
9400 else
9401 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9402
9403 return 0;
9404}
9405
9406int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9407{
9408 int ret;
9409
9410 do {
9411 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9412 } while (ret == -EAGAIN);
9413
9414 return ret;
9415}
9416
9417int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9418 RGWObjState **state, bool follow_olh, bool assume_noent)
9419{
9420 if (obj.empty()) {
9421 return -EINVAL;
9422 }
9423
9424 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9425
9426 RGWObjState *s = rctx->obj.get_state(obj);
9427 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9428 *state = s;
9429 if (s->has_attrs) {
9430 if (s->is_olh && need_follow_olh) {
9431 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9432 }
9433 return 0;
9434 }
9435
9436 s->obj = obj;
9437
9438 rgw_raw_obj raw_obj;
9439 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9440
9441 int r = -ENOENT;
9442
9443 if (!assume_noent) {
9444 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9445 }
9446
9447 if (r == -ENOENT) {
9448 s->exists = false;
9449 s->has_attrs = true;
9450 tombstone_entry entry;
9451 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9452 s->mtime = entry.mtime;
9453 s->zone_short_id = entry.zone_short_id;
9454 s->pg_ver = entry.pg_ver;
9455 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9456 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9457 } else {
9458 s->mtime = real_time();
9459 }
9460 return 0;
9461 }
9462 if (r < 0)
9463 return r;
9464
9465 s->exists = true;
9466 s->has_attrs = true;
9467 s->accounted_size = s->size;
9468
9469 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
9470 const bool compressed = (iter != s->attrset.end());
9471 if (compressed) {
7c673cae
FG
9472 // use uncompressed size for accounted_size
9473 try {
9474 RGWCompressionInfo info;
9475 auto p = iter->second.begin();
9476 ::decode(info, p);
31f18b77 9477 s->accounted_size = info.orig_size;
7c673cae
FG
9478 } catch (buffer::error&) {
9479 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9480 return -EIO;
9481 }
9482 }
9483
9484 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9485 if (iter != s->attrset.end()) {
9486 bufferlist bl = iter->second;
9487 bufferlist::iterator it = bl.begin();
9488 it.copy(bl.length(), s->shadow_obj);
9489 s->shadow_obj[bl.length()] = '\0';
9490 }
9491 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
9492 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9493 if (ttiter != s->attrset.end()) {
9494 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9495 }
7c673cae
FG
9496
9497 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9498 if (manifest_bl.length()) {
9499 bufferlist::iterator miter = manifest_bl.begin();
9500 try {
9501 ::decode(s->manifest, miter);
9502 s->has_manifest = true;
9503 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9504 broken due to old bugs */
9505 s->size = s->manifest.get_obj_size();
31f18b77
FG
9506 if (!compressed)
9507 s->accounted_size = s->size;
7c673cae
FG
9508 } catch (buffer::error& err) {
9509 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9510 return -EIO;
9511 }
9512 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9513 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9514 RGWObjManifest::obj_iterator mi;
9515 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9516 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9517 }
9518 }
9519
9520 if (!s->obj_tag.length()) {
9521 /*
9522 * Uh oh, something's wrong, object with manifest should have tag. Let's
9523 * create one out of the manifest, would be unique
9524 */
9525 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9526 s->fake_tag = true;
9527 }
9528 }
9529 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9530 if (aiter != s->attrset.end()) {
9531 bufferlist& pg_ver_bl = aiter->second;
9532 if (pg_ver_bl.length()) {
9533 bufferlist::iterator pgbl = pg_ver_bl.begin();
9534 try {
9535 ::decode(s->pg_ver, pgbl);
9536 } catch (buffer::error& err) {
9537 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9538 }
9539 }
9540 }
9541 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9542 if (aiter != s->attrset.end()) {
9543 bufferlist& zone_short_id_bl = aiter->second;
9544 if (zone_short_id_bl.length()) {
9545 bufferlist::iterator zbl = zone_short_id_bl.begin();
9546 try {
9547 ::decode(s->zone_short_id, zbl);
9548 } catch (buffer::error& err) {
9549 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9550 }
9551 }
9552 }
9553 if (s->obj_tag.length())
31f18b77 9554 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
7c673cae
FG
9555 else
9556 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9557
9558 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9559 * it exist, and not only if is_olh() returns true
9560 */
9561 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9562 if (iter != s->attrset.end()) {
9563 s->olh_tag = iter->second;
9564 }
9565
9566 if (is_olh(s->attrset)) {
9567 s->is_olh = true;
9568
9569 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9570
9571 if (need_follow_olh) {
9572 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9573 }
9574 }
9575
9576 return 0;
9577}
9578
9579int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9580 bool follow_olh, bool assume_noent)
9581{
9582 int ret;
9583
9584 do {
9585 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9586 } while (ret == -EAGAIN);
9587
9588 return ret;
9589}
9590
9591int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9592{
9593 RGWObjState *astate;
9594 int r = get_state(&astate, true);
9595 if (r < 0) {
9596 return r;
9597 }
9598
9599 *pmanifest = &astate->manifest;
9600
9601 return 0;
9602}
9603
9604int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9605{
9606 RGWObjState *state;
9607 int r = source->get_state(&state, true);
9608 if (r < 0)
9609 return r;
9610 if (!state->exists)
9611 return -ENOENT;
9612 if (!state->get_attr(name, dest))
9613 return -ENODATA;
9614
9615 return 0;
9616}
9617
9618
9619int RGWRados::Object::Stat::stat_async()
9620{
9621 RGWObjectCtx& ctx = source->get_ctx();
9622 rgw_obj& obj = source->get_obj();
9623 RGWRados *store = source->get_store();
9624
9625 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9626 result.obj = obj;
9627 if (s->has_attrs) {
9628 state.ret = 0;
9629 result.size = s->size;
9630 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9631 result.attrs = s->attrset;
9632 result.has_manifest = s->has_manifest;
9633 result.manifest = s->manifest;
9634 return 0;
9635 }
9636
9637 string oid;
9638 string loc;
9639 get_obj_bucket_and_oid_loc(obj, oid, loc);
9640
9641 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9642 if (r < 0) {
9643 return r;
9644 }
9645
9646 librados::ObjectReadOperation op;
9647 op.stat2(&result.size, &result.mtime, NULL);
9648 op.getxattrs(&result.attrs, NULL);
9649 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9650 state.io_ctx.locator_set_key(loc);
9651 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9652 if (r < 0) {
9653 ldout(store->ctx(), 5) << __func__
9654 << ": ERROR: aio_operate() returned ret=" << r
9655 << dendl;
9656 return r;
9657 }
9658
9659 return 0;
9660}
9661
9662
9663int RGWRados::Object::Stat::wait()
9664{
9665 if (!state.completion) {
9666 return state.ret;
9667 }
9668
9669 state.completion->wait_for_safe();
9670 state.ret = state.completion->get_return_value();
9671 state.completion->release();
9672
9673 if (state.ret != 0) {
9674 return state.ret;
9675 }
9676
9677 return finish();
9678}
9679
9680int RGWRados::Object::Stat::finish()
9681{
9682 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9683 if (iter != result.attrs.end()) {
9684 bufferlist& bl = iter->second;
9685 bufferlist::iterator biter = bl.begin();
9686 try {
9687 ::decode(result.manifest, biter);
9688 } catch (buffer::error& err) {
9689 RGWRados *store = source->get_store();
9690 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9691 return -EIO;
9692 }
9693 result.has_manifest = true;
9694 }
9695
9696 return 0;
9697}
9698
9699/**
31f18b77
FG
9700 * Get an attribute for a system object.
9701 * obj: the object to get attr
7c673cae
FG
9702 * name: name of the attr to retrieve
9703 * dest: bufferlist to store the result in
9704 * Returns: 0 on success, -ERR# otherwise.
9705 */
9706int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9707{
9708 rgw_rados_ref ref;
224ce89b 9709 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
9710 if (r < 0) {
9711 return r;
9712 }
9713
9714 ObjectReadOperation op;
9715
9716 int rval;
9717 op.getxattr(name, &dest, &rval);
9718
9719 r = ref.ioctx.operate(ref.oid, &op, NULL);
9720 if (r < 0)
9721 return r;
9722
9723 return 0;
9724}
9725
9726int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9727 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9728 ObjectOperation& op, RGWObjState **pstate)
9729{
9730 if (!rctx)
9731 return 0;
9732
9733 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9734 if (r < 0)
9735 return r;
9736
9737 RGWObjState *state = *pstate;
9738
9739 if (!state->is_atomic) {
9740 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9741 return 0;
9742 }
9743
9744 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9745 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9746 } else {
9747 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9748 }
9749 return 0;
9750}
9751
9752int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9753{
9754 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9755}
9756
9757void RGWRados::Object::invalidate_state()
9758{
9759 ctx.obj.invalidate(obj);
9760}
9761
9762void RGWRados::SystemObject::invalidate_state()
9763{
9764 ctx.raw.invalidate(obj);
9765}
9766
9767int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb
FG
9768 const char *if_match, const char *if_nomatch, bool removal_op,
9769 bool modify_tail)
7c673cae
FG
9770{
9771 int r = get_state(&state, false);
9772 if (r < 0)
9773 return r;
9774
9775 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9776 if_match != NULL || if_nomatch != NULL) &&
9777 (!state->fake_tag);
9778
9779 if (!state->is_atomic) {
9780 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9781
9782 if (reset_obj) {
9783 op.create(false);
9784 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9785 }
9786
9787 return 0;
9788 }
9789
9790 if (need_guard) {
9791 /* first verify that the object wasn't replaced under */
9792 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9793 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9794 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9795 }
9796
9797 if (if_match) {
9798 if (strcmp(if_match, "*") == 0) {
9799 // test the object is existing
9800 if (!state->exists) {
9801 return -ERR_PRECONDITION_FAILED;
9802 }
9803 } else {
9804 bufferlist bl;
9805 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9806 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9807 return -ERR_PRECONDITION_FAILED;
9808 }
9809 }
9810 }
9811
9812 if (if_nomatch) {
9813 if (strcmp(if_nomatch, "*") == 0) {
9814 // test the object is NOT existing
9815 if (state->exists) {
9816 return -ERR_PRECONDITION_FAILED;
9817 }
9818 } else {
9819 bufferlist bl;
9820 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9821 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9822 return -ERR_PRECONDITION_FAILED;
9823 }
9824 }
9825 }
9826 }
9827
9828 if (reset_obj) {
9829 if (state->exists) {
9830 op.create(false);
9831 store->remove_rgw_head_obj(op);
9832 } else {
9833 op.create(true);
9834 }
9835 }
9836
9837 if (removal_op) {
9838 /* the object is being removed, no need to update its tag */
9839 return 0;
9840 }
9841
9842 if (ptag) {
9843 state->write_tag = *ptag;
9844 } else {
9845 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9846 }
9847 bufferlist bl;
9848 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9849
9850 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9851
9852 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
9853 if (modify_tail) {
9854 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9855 }
7c673cae
FG
9856
9857 return 0;
9858}
9859
9860int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9861 RGWObjVersionTracker *objv_tracker)
9862{
9863 map<string, bufferlist> attrs;
9864 attrs[name] = bl;
9865 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9866}
9867
9868int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9869 map<string, bufferlist>& attrs,
9870 map<string, bufferlist>* rmattrs,
9871 RGWObjVersionTracker *objv_tracker)
9872{
9873 rgw_rados_ref ref;
224ce89b 9874 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
9875 if (r < 0) {
9876 return r;
9877 }
9878 ObjectWriteOperation op;
9879
9880 if (objv_tracker) {
9881 objv_tracker->prepare_op_for_write(&op);
9882 }
9883
9884 map<string, bufferlist>::iterator iter;
9885 if (rmattrs) {
9886 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9887 const string& name = iter->first;
9888 op.rmxattr(name.c_str());
9889 }
9890 }
9891
9892 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9893 const string& name = iter->first;
9894 bufferlist& bl = iter->second;
9895
9896 if (!bl.length())
9897 continue;
9898
9899 op.setxattr(name.c_str(), bl);
9900 }
9901
9902 if (!op.size())
9903 return 0;
9904
9905 bufferlist bl;
9906
9907 r = ref.ioctx.operate(ref.oid, &op);
9908 if (r < 0)
9909 return r;
9910
9911 return 0;
9912}
9913
9914/**
9915 * Set an attr on an object.
9916 * bucket: name of the bucket holding the object
9917 * obj: name of the object to set the attr on
9918 * name: the attr to set
9919 * bl: the contents of the attr
9920 * Returns: 0 on success, -ERR# otherwise.
9921 */
9922int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9923{
9924 map<string, bufferlist> attrs;
9925 attrs[name] = bl;
9926 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9927}
9928
9929int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9930 map<string, bufferlist>& attrs,
9931 map<string, bufferlist>* rmattrs)
9932{
9933 rgw_rados_ref ref;
9934 int r = get_obj_head_ref(bucket_info, obj, &ref);
9935 if (r < 0) {
9936 return r;
9937 }
9938 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9939
9940 ObjectWriteOperation op;
9941 RGWObjState *state = NULL;
9942
9943 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9944 if (r < 0)
9945 return r;
9946
9947 map<string, bufferlist>::iterator iter;
9948 if (rmattrs) {
9949 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9950 const string& name = iter->first;
9951 op.rmxattr(name.c_str());
9952 }
9953 }
9954
9955 const rgw_bucket& bucket = obj.bucket;
9956
9957 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9958 const string& name = iter->first;
9959 bufferlist& bl = iter->second;
9960
9961 if (!bl.length())
9962 continue;
9963
9964 op.setxattr(name.c_str(), bl);
9965
9966 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9967 real_time ts;
9968 try {
9969 ::decode(ts, bl);
9970
9971 rgw_obj_index_key obj_key;
9972 obj.key.get_index_key(&obj_key);
9973
9974 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9975 } catch (buffer::error& err) {
9976 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9977 }
9978 }
9979 }
9980
9981 if (!op.size())
9982 return 0;
9983
9984 RGWObjectCtx obj_ctx(this);
9985
9986 bufferlist bl;
9987 RGWRados::Bucket bop(this, bucket_info);
9988 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9989
9990 if (state) {
9991 string tag;
9992 append_rand_alpha(cct, tag, tag, 32);
9993 state->write_tag = tag;
9994 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9995
9996 if (r < 0)
9997 return r;
9998
9999 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
10000 op.setxattr(RGW_ATTR_ID_TAG, bl);
10001 }
10002
3efd9988
FG
10003
10004 real_time mtime = real_clock::now();
10005 struct timespec mtime_ts = real_clock::to_timespec(mtime);
10006 op.mtime2(&mtime_ts);
7c673cae
FG
10007 r = ref.ioctx.operate(ref.oid, &op);
10008 if (state) {
10009 if (r >= 0) {
10010 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
10011 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
10012 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
10013 string etag(etag_bl.c_str(), etag_bl.length());
10014 string content_type(content_type_bl.c_str(), content_type_bl.length());
10015 uint64_t epoch = ref.ioctx.get_last_version();
10016 int64_t poolid = ref.ioctx.get_id();
7c673cae
FG
10017 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
10018 mtime, etag, content_type, &acl_bl,
10019 RGW_OBJ_CATEGORY_MAIN, NULL);
10020 } else {
10021 int ret = index_op.cancel();
10022 if (ret < 0) {
10023 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
10024 }
10025 }
10026 }
10027 if (r < 0)
10028 return r;
10029
10030 if (state) {
10031 state->obj_tag.swap(bl);
10032 if (rmattrs) {
10033 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
10034 state->attrset.erase(iter->first);
10035 }
10036 }
10037 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
10038 state->attrset[iter->first] = iter->second;
10039 }
10040 }
10041
10042 return 0;
10043}
10044
7c673cae
FG
10045int RGWRados::Object::Read::prepare()
10046{
10047 RGWRados *store = source->get_store();
10048 CephContext *cct = store->ctx();
10049
10050 bufferlist etag;
10051
10052 map<string, bufferlist>::iterator iter;
10053
10054 RGWObjState *astate;
10055 int r = source->get_state(&astate, true);
10056 if (r < 0)
10057 return r;
10058
10059 if (!astate->exists) {
10060 return -ENOENT;
10061 }
10062
10063 const RGWBucketInfo& bucket_info = source->get_bucket_info();
10064
10065 state.obj = astate->obj;
10066 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
10067
10068 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
10069 if (r < 0) {
10070 return r;
10071 }
10072 if (params.attrs) {
10073 *params.attrs = astate->attrset;
10074 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10075 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
10076 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10077 }
10078 }
10079 }
10080
10081 /* Convert all times go GMT to make them compatible */
10082 if (conds.mod_ptr || conds.unmod_ptr) {
10083 obj_time_weight src_weight;
10084 src_weight.init(astate);
10085 src_weight.high_precision = conds.high_precision_time;
10086
10087 obj_time_weight dest_weight;
10088 dest_weight.high_precision = conds.high_precision_time;
10089
10090 if (conds.mod_ptr) {
10091 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10092 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10093 if (!(dest_weight < src_weight)) {
10094 return -ERR_NOT_MODIFIED;
10095 }
10096 }
10097
10098 if (conds.unmod_ptr) {
10099 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10100 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10101 if (dest_weight < src_weight) {
10102 return -ERR_PRECONDITION_FAILED;
10103 }
10104 }
10105 }
10106 if (conds.if_match || conds.if_nomatch) {
10107 r = get_attr(RGW_ATTR_ETAG, etag);
10108 if (r < 0)
10109 return r;
10110
10111 if (conds.if_match) {
10112 string if_match_str = rgw_string_unquote(conds.if_match);
10113 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
10114 if (if_match_str.compare(etag.c_str()) != 0) {
10115 return -ERR_PRECONDITION_FAILED;
10116 }
10117 }
10118
10119 if (conds.if_nomatch) {
10120 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
10121 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
10122 if (if_nomatch_str.compare(etag.c_str()) == 0) {
10123 return -ERR_NOT_MODIFIED;
10124 }
10125 }
10126 }
10127
10128 if (params.obj_size)
10129 *params.obj_size = astate->size;
10130 if (params.lastmod)
10131 *params.lastmod = astate->mtime;
10132
10133 return 0;
10134}
10135
10136int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
10137{
10138 if (ofs < 0) {
10139 ofs += obj_size;
10140 if (ofs < 0)
10141 ofs = 0;
10142 end = obj_size - 1;
10143 } else if (end < 0) {
10144 end = obj_size - 1;
10145 }
10146
10147 if (obj_size > 0) {
10148 if (ofs >= (off_t)obj_size) {
10149 return -ERANGE;
10150 }
10151 if (end >= (off_t)obj_size) {
10152 end = obj_size - 1;
10153 }
10154 }
10155 return 0;
10156}
10157
10158int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
10159{
10160 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
10161}
10162
10163int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
10164 RGWRados::SystemObject::Read::GetObjState& state,
10165 rgw_raw_obj& obj,
10166 map<string, bufferlist> *attrs,
10167 real_time *lastmod,
10168 uint64_t *obj_size,
10169 RGWObjVersionTracker *objv_tracker)
10170{
10171 RGWRawObjState *astate = NULL;
10172
10173 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
10174 if (r < 0)
10175 return r;
10176
10177 if (!astate->exists) {
10178 return -ENOENT;
10179 }
10180
10181 if (attrs) {
10182 *attrs = astate->attrset;
10183 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10184 map<string, bufferlist>::iterator iter;
10185 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
10186 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10187 }
10188 }
10189 }
10190
10191 if (obj_size)
10192 *obj_size = astate->size;
10193 if (lastmod)
10194 *lastmod = astate->mtime;
10195
10196 return 0;
10197}
10198
31f18b77
FG
10199
10200int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
10201{
10202 RGWRados *store = target->get_store();
10203 BucketShard *bs;
10204 int r;
10205
10206#define NUM_RESHARD_RETRIES 10
10207 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10208 int ret = get_bucket_shard(&bs);
10209 if (ret < 0) {
10210 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10211 return ret;
10212 }
10213 r = call(bs);
10214 if (r != -ERR_BUSY_RESHARDING) {
10215 break;
10216 }
10217 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10218 string new_bucket_id;
10219 r = store->block_while_resharding(bs, &new_bucket_id);
10220 if (r == -ERR_BUSY_RESHARDING) {
10221 continue;
10222 }
10223 if (r < 0) {
10224 return r;
10225 }
10226 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10227 i = 0; /* resharding is finished, make sure we can retry */
10228 r = target->update_bucket_id(new_bucket_id);
10229 if (r < 0) {
10230 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
10231 return r;
10232 }
10233 invalidate_bs();
10234 }
10235
10236 if (r < 0) {
10237 return r;
10238 }
10239
10240 if (pbs) {
10241 *pbs = bs;
10242 }
10243
10244 return 0;
10245}
10246
7c673cae
FG
10247int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
10248{
10249 RGWRados *store = source->get_store();
10250 rgw_raw_obj& obj = source->get_obj();
10251
10252 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
10253 stat_params.lastmod, stat_params.obj_size, objv_tracker);
10254}
10255
10256int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
10257{
10258 if (blind) {
10259 return 0;
10260 }
10261 RGWRados *store = target->get_store();
7c673cae
FG
10262
10263 if (write_tag && write_tag->length()) {
10264 optag = string(write_tag->c_str(), write_tag->length());
10265 } else {
10266 if (optag.empty()) {
10267 append_rand_alpha(store->ctx(), optag, optag, 32);
10268 }
10269 }
10270
31f18b77
FG
10271 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10272 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10273 });
10274
7c673cae
FG
10275 if (r < 0) {
10276 return r;
10277 }
10278 prepared = true;
31f18b77 10279
7c673cae
FG
10280 return 0;
10281}
10282
10283int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10284 uint64_t size, uint64_t accounted_size,
10285 ceph::real_time& ut, const string& etag,
10286 const string& content_type,
10287 bufferlist *acl_bl,
10288 RGWObjCategory category,
10289 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10290{
10291 if (blind) {
10292 return 0;
10293 }
10294 RGWRados *store = target->get_store();
10295 BucketShard *bs;
31f18b77 10296
7c673cae
FG
10297 int ret = get_bucket_shard(&bs);
10298 if (ret < 0) {
10299 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10300 return ret;
10301 }
10302
10303 rgw_bucket_dir_entry ent;
10304 obj.key.get_index_key(&ent.key);
10305 ent.meta.size = size;
10306 ent.meta.accounted_size = accounted_size;
10307 ent.meta.mtime = ut;
10308 ent.meta.etag = etag;
10309 if (user_data)
10310 ent.meta.user_data = *user_data;
10311
10312 ACLOwner owner;
10313 if (acl_bl && acl_bl->length()) {
10314 int ret = store->decode_policy(*acl_bl, &owner);
10315 if (ret < 0) {
10316 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10317 }
10318 }
10319 ent.meta.owner = owner.get_id().to_str();
10320 ent.meta.owner_display_name = owner.get_display_name();
10321 ent.meta.content_type = content_type;
10322
31f18b77 10323 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 10324
c07f9fc5
FG
10325 if (target->bucket_info.datasync_flag_enabled()) {
10326 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10327 if (r < 0) {
10328 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10329 }
7c673cae
FG
10330 }
10331
10332 return ret;
10333}
10334
10335int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10336 real_time& removed_mtime,
10337 list<rgw_obj_index_key> *remove_objs)
10338{
10339 if (blind) {
10340 return 0;
10341 }
10342 RGWRados *store = target->get_store();
10343 BucketShard *bs;
31f18b77 10344
7c673cae
FG
10345 int ret = get_bucket_shard(&bs);
10346 if (ret < 0) {
10347 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10348 return ret;
10349 }
10350
31f18b77 10351 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 10352
c07f9fc5
FG
10353 if (target->bucket_info.datasync_flag_enabled()) {
10354 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10355 if (r < 0) {
10356 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10357 }
7c673cae
FG
10358 }
10359
10360 return ret;
10361}
10362
10363
10364int RGWRados::Bucket::UpdateIndex::cancel()
10365{
10366 if (blind) {
10367 return 0;
10368 }
10369 RGWRados *store = target->get_store();
10370 BucketShard *bs;
7c673cae 10371
31f18b77
FG
10372 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10373 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10374 });
7c673cae
FG
10375
10376 /*
10377 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10378 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10379 * have no way to tell that they're all caught up
10380 */
c07f9fc5
FG
10381 if (target->bucket_info.datasync_flag_enabled()) {
10382 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10383 if (r < 0) {
10384 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10385 }
7c673cae
FG
10386 }
10387
10388 return ret;
10389}
10390
10391int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10392{
10393 RGWRados *store = source->get_store();
10394 CephContext *cct = store->ctx();
10395
7c673cae
FG
10396 rgw_raw_obj read_obj;
10397 uint64_t read_ofs = ofs;
10398 uint64_t len, read_len;
10399 bool reading_from_head = true;
10400 ObjectReadOperation op;
10401
10402 bool merge_bl = false;
10403 bufferlist *pbl = &bl;
10404 bufferlist read_bl;
10405 uint64_t max_chunk_size;
10406
10407 RGWObjState *astate;
10408 int r = source->get_state(&astate, true);
10409 if (r < 0)
10410 return r;
10411
10412 if (end < 0)
10413 len = 0;
10414 else
10415 len = end - ofs + 1;
10416
10417 if (astate->has_manifest && astate->manifest.has_tail()) {
10418 /* now get the relevant object part */
10419 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10420
10421 uint64_t stripe_ofs = iter.get_stripe_ofs();
10422 read_obj = iter.get_location().get_raw_obj(store);
10423 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10424 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10425 reading_from_head = (read_obj == state.head_obj);
10426 } else {
10427 read_obj = state.head_obj;
10428 }
10429
10430 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10431 if (r < 0) {
10432 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10433 return r;
10434 }
10435
10436 if (len > max_chunk_size)
10437 len = max_chunk_size;
10438
10439
10440 state.io_ctx.locator_set_key(read_obj.loc);
10441
10442 read_len = len;
10443
10444 if (reading_from_head) {
10445 /* only when reading from the head object do we need to do the atomic test */
10446 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10447 if (r < 0)
10448 return r;
10449
10450 if (astate && astate->prefetch_data) {
10451 if (!ofs && astate->data.length() >= len) {
10452 bl = astate->data;
10453 return bl.length();
10454 }
10455
10456 if (ofs < astate->data.length()) {
10457 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10458 astate->data.copy(ofs, copy_len, bl);
10459 read_len -= copy_len;
10460 read_ofs += copy_len;
10461 if (!read_len)
10462 return bl.length();
10463
10464 merge_bl = true;
10465 pbl = &read_bl;
10466 }
10467 }
10468 }
10469
10470 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10471 op.read(read_ofs, read_len, pbl, NULL);
10472
10473 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10474 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10475
10476 if (r < 0) {
10477 return r;
10478 }
10479
10480 if (merge_bl) {
10481 bl.append(read_bl);
10482 }
10483
10484 return bl.length();
10485}
10486
10487int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10488{
10489 if (!has_ref) {
224ce89b 10490 int r = store->get_raw_obj_ref(obj, &ref);
7c673cae
FG
10491 if (r < 0) {
10492 return r;
10493 }
10494 has_ref = true;
10495 }
10496 *pref = &ref;
10497 return 0;
10498
10499}
10500
10501int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10502 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10503 bufferlist& bl, off_t ofs, off_t end,
10504 map<string, bufferlist> *attrs,
b32b8144
FG
10505 rgw_cache_entry_info *cache_info,
10506 boost::optional<obj_version>)
7c673cae
FG
10507{
10508 uint64_t len;
10509 ObjectReadOperation op;
10510
10511 if (end < 0)
10512 len = 0;
10513 else
10514 len = end - ofs + 1;
10515
10516 if (objv_tracker) {
10517 objv_tracker->prepare_op_for_read(&op);
10518 }
10519
10520 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10521 op.read(ofs, len, &bl, NULL);
10522
10523 if (attrs) {
10524 op.getxattrs(attrs, NULL);
10525 }
10526
10527 rgw_rados_ref *ref;
10528 int r = read_state.get_ref(this, obj, &ref);
10529 if (r < 0) {
10530 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10531 return r;
10532 }
10533 r = ref->ioctx.operate(ref->oid, &op, NULL);
10534 if (r < 0) {
10535 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10536 return r;
10537 }
10538 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10539
10540 uint64_t op_ver = ref->ioctx.get_last_version();
10541
10542 if (read_state.last_ver > 0 &&
10543 read_state.last_ver != op_ver) {
10544 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10545 return -ECANCELED;
10546 }
10547
10548 read_state.last_ver = op_ver;
10549
10550 return bl.length();
10551}
10552
b32b8144
FG
10553int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl,
10554 RGWObjVersionTracker *objv_tracker,
10555 boost::optional<obj_version> refresh_version)
7c673cae
FG
10556{
10557 RGWRados *store = source->get_store();
10558 rgw_raw_obj& obj = source->get_obj();
10559
b32b8144
FG
10560 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl,
10561 ofs, end, read_params.attrs,
10562 read_params.cache_info, refresh_version);
7c673cae
FG
10563}
10564
10565int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10566{
10567 RGWRados *store = source->get_store();
10568 rgw_raw_obj& obj = source->get_obj();
10569
10570 return store->system_obj_get_attr(obj, name, dest);
10571}
10572
10573struct get_obj_data;
10574
10575struct get_obj_aio_data {
10576 struct get_obj_data *op_data;
10577 off_t ofs;
10578 off_t len;
10579};
10580
10581struct get_obj_io {
10582 off_t len;
10583 bufferlist bl;
10584};
10585
10586static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10587
10588struct get_obj_data : public RefCountedObject {
10589 CephContext *cct;
10590 RGWRados *rados;
10591 RGWObjectCtx *ctx;
10592 IoCtx io_ctx;
10593 map<off_t, get_obj_io> io_map;
10594 map<off_t, librados::AioCompletion *> completion_map;
10595 uint64_t total_read;
10596 Mutex lock;
10597 Mutex data_lock;
10598 list<get_obj_aio_data> aio_data;
10599 RGWGetDataCB *client_cb;
10600 std::atomic<bool> cancelled = { false };
10601 std::atomic<int64_t> err_code = { 0 };
10602 Throttle throttle;
10603 list<bufferlist> read_list;
10604
10605 explicit get_obj_data(CephContext *_cct)
10606 : cct(_cct),
10607 rados(NULL), ctx(NULL),
10608 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10609 client_cb(NULL),
10610 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10611 ~get_obj_data() override { }
10612 void set_cancelled(int r) {
10613 cancelled = true;
10614 err_code = r;
10615 }
10616
10617 bool is_cancelled() {
10618 return cancelled;
10619 }
10620
10621 int get_err_code() {
10622 return err_code;
10623 }
10624
10625 int wait_next_io(bool *done) {
10626 lock.Lock();
10627 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10628 if (iter == completion_map.end()) {
10629 *done = true;
10630 lock.Unlock();
10631 return 0;
10632 }
10633 off_t cur_ofs = iter->first;
10634 librados::AioCompletion *c = iter->second;
10635 lock.Unlock();
10636
10637 c->wait_for_safe_and_cb();
10638 int r = c->get_return_value();
10639
10640 lock.Lock();
10641 completion_map.erase(cur_ofs);
10642
10643 if (completion_map.empty()) {
10644 *done = true;
10645 }
10646 lock.Unlock();
10647
10648 c->release();
10649
10650 return r;
10651 }
10652
10653 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10654 Mutex::Locker l(lock);
10655
10656 const auto& io_iter = io_map.insert(
10657 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10658
10659 assert(io_iter.second); // assert new insertion
10660
10661 get_obj_io& io = (io_iter.first)->second;
10662 *pbl = &io.bl;
10663
10664 struct get_obj_aio_data aio;
10665 aio.ofs = ofs;
10666 aio.len = len;
10667 aio.op_data = this;
10668
10669 aio_data.push_back(aio);
10670
10671 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10672
10673 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10674 completion_map[ofs] = c;
10675
10676 *pc = c;
10677
10678 /* we have a reference per IO, plus one reference for the calling function.
10679 * reference is dropped for each callback, plus when we're done iterating
10680 * over the parts */
10681 get();
10682 }
10683
10684 void cancel_io(off_t ofs) {
10685 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10686 lock.Lock();
10687 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10688 if (iter != completion_map.end()) {
10689 AioCompletion *c = iter->second;
10690 c->release();
10691 completion_map.erase(ofs);
10692 io_map.erase(ofs);
10693 }
10694 lock.Unlock();
10695
10696 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10697 * need IoCtx to live, as io callback may still be called
10698 */
10699 }
10700
10701 void cancel_all_io() {
10702 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10703 Mutex::Locker l(lock);
10704 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10705 iter != completion_map.end(); ++iter) {
10706 librados::AioCompletion *c = iter->second;
10707 c->release();
10708 }
10709 }
10710
10711 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10712 Mutex::Locker l(lock);
10713
10714 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10715
10716 if (liter == io_map.end() ||
10717 liter->first != ofs) {
10718 return 0;
10719 }
10720
10721 map<off_t, librados::AioCompletion *>::iterator aiter;
10722 aiter = completion_map.find(ofs);
10723 if (aiter == completion_map.end()) {
10724 /* completion map does not hold this io, it was cancelled */
10725 return 0;
10726 }
10727
10728 AioCompletion *completion = aiter->second;
10729 int r = completion->get_return_value();
10730 if (r < 0)
10731 return r;
10732
10733 for (; aiter != completion_map.end(); ++aiter) {
10734 completion = aiter->second;
10735 if (!completion->is_safe()) {
10736 /* reached a request that is not yet complete, stop */
10737 break;
10738 }
10739
10740 r = completion->get_return_value();
10741 if (r < 0) {
10742 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10743 return r;
10744 }
10745
10746 total_read += r;
10747
10748 map<off_t, get_obj_io>::iterator old_liter = liter++;
10749 bl_list.push_back(old_liter->second.bl);
10750 io_map.erase(old_liter);
10751 }
10752
10753 return 0;
10754 }
10755};
10756
10757static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10758{
10759 struct get_obj_data *d = (struct get_obj_data *)arg;
10760
10761 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10762}
10763
10764static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10765{
10766 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10767 struct get_obj_data *d = aio_data->op_data;
10768
10769 d->rados->get_obj_aio_completion_cb(cb, arg);
10770}
10771
10772
10773void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10774{
10775 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10776 struct get_obj_data *d = aio_data->op_data;
10777 off_t ofs = aio_data->ofs;
10778 off_t len = aio_data->len;
10779
10780 list<bufferlist> bl_list;
10781 list<bufferlist>::iterator iter;
10782 int r;
10783
10784 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10785 d->throttle.put(len);
10786
10787 r = rados_aio_get_return_value(c);
10788 if (r < 0) {
10789 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10790 d->set_cancelled(r);
10791 goto done;
10792 }
10793
10794 if (d->is_cancelled()) {
10795 goto done;
10796 }
10797
10798 d->data_lock.Lock();
10799
10800 r = d->get_complete_ios(ofs, bl_list);
10801 if (r < 0) {
10802 goto done_unlock;
10803 }
10804
10805 d->read_list.splice(d->read_list.end(), bl_list);
10806
10807done_unlock:
10808 d->data_lock.Unlock();
10809done:
10810 d->put();
10811 return;
10812}
10813
10814int RGWRados::flush_read_list(struct get_obj_data *d)
10815{
10816 d->data_lock.Lock();
10817 list<bufferlist> l;
10818 l.swap(d->read_list);
10819 d->get();
10820 d->read_list.clear();
10821
10822 d->data_lock.Unlock();
10823
10824 int r = 0;
10825
10826 list<bufferlist>::iterator iter;
10827 for (iter = l.begin(); iter != l.end(); ++iter) {
10828 bufferlist& bl = *iter;
10829 r = d->client_cb->handle_data(bl, 0, bl.length());
10830 if (r < 0) {
10831 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10832 break;
10833 }
10834 }
10835
10836 d->data_lock.Lock();
10837 d->put();
10838 if (r < 0) {
10839 d->set_cancelled(r);
10840 }
10841 d->data_lock.Unlock();
10842 return r;
10843}
10844
10845int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10846 const RGWBucketInfo& bucket_info,
10847 const rgw_obj& obj,
10848 const rgw_raw_obj& read_obj,
10849 off_t obj_ofs,
10850 off_t read_ofs, off_t len,
10851 bool is_head_obj, void *arg)
10852{
10853 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10854 ObjectReadOperation op;
10855 struct get_obj_data *d = (struct get_obj_data *)arg;
10856 string oid, key;
10857 bufferlist *pbl;
10858 AioCompletion *c;
10859
10860 int r;
10861
10862 if (is_head_obj) {
10863 /* only when reading from the head object do we need to do the atomic test */
10864 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10865 if (r < 0)
10866 return r;
10867
10868 if (astate &&
10869 obj_ofs < astate->data.length()) {
10870 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10871
10872 d->data_lock.Lock();
10873 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10874 d->data_lock.Unlock();
10875 if (r < 0)
10876 return r;
10877
10878 d->lock.Lock();
10879 d->total_read += chunk_len;
10880 d->lock.Unlock();
10881
10882 len -= chunk_len;
10883 read_ofs += chunk_len;
10884 obj_ofs += chunk_len;
10885 if (!len)
10886 return 0;
10887 }
10888 }
10889
10890 d->throttle.get(len);
10891 if (d->is_cancelled()) {
10892 return d->get_err_code();
10893 }
10894
10895 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10896 * cleaning up
10897 */
10898 d->add_io(obj_ofs, len, &pbl, &c);
10899
10900 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10901 op.read(read_ofs, len, pbl, NULL);
10902
10903 librados::IoCtx io_ctx(d->io_ctx);
10904 io_ctx.locator_set_key(read_obj.loc);
10905
10906 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10907 if (r < 0) {
10908 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10909 goto done_err;
10910 }
10911
10912 // Flush data to client if there is any
10913 r = flush_read_list(d);
10914 if (r < 0)
10915 return r;
10916
10917 return 0;
10918
10919done_err:
10920 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10921 d->set_cancelled(r);
10922 d->cancel_io(obj_ofs);
10923
10924 return r;
10925}
10926
10927int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10928{
10929 RGWRados *store = source->get_store();
10930 CephContext *cct = store->ctx();
10931
10932 struct get_obj_data *data = new get_obj_data(cct);
10933 bool done = false;
10934
10935 RGWObjectCtx& obj_ctx = source->get_ctx();
10936
10937 data->rados = store;
10938 data->io_ctx.dup(state.io_ctx);
10939 data->client_cb = cb;
10940
10941 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10942 if (r < 0) {
10943 data->cancel_all_io();
10944 goto done;
10945 }
10946
10947 while (!done) {
10948 r = data->wait_next_io(&done);
10949 if (r < 0) {
10950 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10951 data->cancel_all_io();
10952 break;
10953 }
10954 r = store->flush_read_list(data);
10955 if (r < 0) {
10956 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10957 data->cancel_all_io();
10958 break;
10959 }
10960 }
10961
10962done:
10963 data->put();
10964 return r;
10965}
10966
10967int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10968 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10969 off_t ofs, off_t end,
10970 uint64_t max_chunk_size,
10971 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10972 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10973 RGWObjState *, void *),
10974 void *arg)
10975{
10976 rgw_raw_obj head_obj;
10977 rgw_raw_obj read_obj;
10978 uint64_t read_ofs = ofs;
10979 uint64_t len;
10980 bool reading_from_head = true;
10981 RGWObjState *astate = NULL;
10982
10983 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10984
10985 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10986 if (r < 0) {
10987 return r;
10988 }
10989
10990 if (end < 0)
10991 len = 0;
10992 else
10993 len = end - ofs + 1;
10994
10995 if (astate->has_manifest) {
10996 /* now get the relevant object stripe */
10997 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10998
10999 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
11000
11001 for (; iter != obj_end && ofs <= end; ++iter) {
11002 off_t stripe_ofs = iter.get_stripe_ofs();
11003 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
11004
11005 while (ofs < next_stripe_ofs && ofs <= end) {
11006 read_obj = iter.get_location().get_raw_obj(this);
11007 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
11008 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
11009
11010 if (read_len > max_chunk_size) {
11011 read_len = max_chunk_size;
11012 }
11013
11014 reading_from_head = (read_obj == head_obj);
11015 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
11016 if (r < 0) {
11017 return r;
11018 }
11019
11020 len -= read_len;
11021 ofs += read_len;
11022 }
11023 }
11024 } else {
11025 while (ofs <= end) {
11026 read_obj = head_obj;
11027 uint64_t read_len = min(len, max_chunk_size);
11028
11029 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
11030 if (r < 0) {
11031 return r;
11032 }
11033
11034 len -= read_len;
11035 ofs += read_len;
11036 }
11037 }
11038
11039 return 0;
11040}
11041
11042int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
11043{
11044 rgw_rados_ref ref;
11045 int r = get_obj_head_ref(bucket_info, obj, &ref);
11046 if (r < 0) {
11047 return r;
11048 }
11049
11050 return ref.ioctx.operate(ref.oid, op);
11051}
11052
11053int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
11054{
11055 rgw_rados_ref ref;
11056 int r = get_obj_head_ref(bucket_info, obj, &ref);
11057 if (r < 0) {
11058 return r;
11059 }
11060
11061 bufferlist outbl;
11062
11063 return ref.ioctx.operate(ref.oid, op, &outbl);
11064}
11065
11066int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
11067{
11068 ObjectWriteOperation op;
11069
11070 assert(olh_obj.key.instance.empty());
11071
11072 bool has_tag = (state.exists && has_olh_tag(state.attrset));
11073
11074 if (!state.exists) {
11075 op.create(true);
11076 } else {
11077 op.assert_exists();
b32b8144
FG
11078 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11079 op.mtime2(&mtime_ts);
7c673cae
FG
11080 }
11081
11082 /*
11083 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
11084 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
11085 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
11086 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
11087 * log will reflect that.
11088 *
11089 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
11090 * is used for object data instance, olh_tag for olh instance.
11091 */
11092 if (has_tag) {
11093 /* guard against racing writes */
11094 bucket_index_guard_olh_op(state, op);
11095 }
11096
11097 if (!has_tag) {
11098 /* obj tag */
11099 string obj_tag;
11100 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
11101 if (ret < 0) {
11102 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11103 return ret;
11104 }
11105 bufferlist bl;
11106 bl.append(obj_tag.c_str(), obj_tag.size());
11107 op.setxattr(RGW_ATTR_ID_TAG, bl);
11108
11109 state.attrset[RGW_ATTR_ID_TAG] = bl;
11110 state.obj_tag = bl;
11111
11112 /* olh tag */
11113 string olh_tag;
11114 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
11115 if (ret < 0) {
11116 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11117 return ret;
11118 }
11119 bufferlist olh_bl;
11120 olh_bl.append(olh_tag.c_str(), olh_tag.size());
11121 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
11122
11123 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
11124 state.olh_tag = olh_bl;
11125 state.is_olh = true;
11126
11127 bufferlist verbl;
11128 op.setxattr(RGW_ATTR_OLH_VER, verbl);
11129 }
11130
11131 bufferlist bl;
11132 RGWOLHPendingInfo pending_info;
11133 pending_info.time = real_clock::now();
11134 ::encode(pending_info, bl);
11135
11136#define OLH_PENDING_TAG_LEN 32
11137 /* tag will start with current time epoch, this so that entries are sorted by time */
11138 char buf[32];
11139 utime_t ut(pending_info.time);
11140 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
11141 *op_tag = buf;
11142
11143 string s;
11144 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
11145 if (ret < 0) {
11146 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11147 return ret;
11148 }
11149 op_tag->append(s);
11150
11151 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11152 attr_name.append(*op_tag);
11153
11154 op.setxattr(attr_name.c_str(), bl);
11155
11156 ret = obj_operate(bucket_info, olh_obj, &op);
11157 if (ret < 0) {
11158 return ret;
11159 }
11160
11161 state.exists = true;
11162 state.attrset[attr_name] = bl;
11163
11164 return 0;
11165}
11166
11167int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
11168{
11169 int ret;
11170
11171 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
11172 if (ret == -EEXIST) {
11173 ret = -ECANCELED;
11174 }
11175
11176 return ret;
11177}
11178
31f18b77
FG
11179int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
11180{
11181 rgw_obj obj;
11182 const rgw_obj *pobj = &obj_instance;
11183 int r;
11184
11185 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
11186 r = bs->init(pobj->bucket, *pobj);
11187 if (r < 0) {
11188 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
11189 return r;
11190 }
11191 r = call(bs);
11192 if (r != -ERR_BUSY_RESHARDING) {
11193 break;
11194 }
11195 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
11196 string new_bucket_id;
11197 r = block_while_resharding(bs, &new_bucket_id);
11198 if (r == -ERR_BUSY_RESHARDING) {
11199 continue;
11200 }
11201 if (r < 0) {
11202 return r;
11203 }
11204 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
11205 i = 0; /* resharding is finished, make sure we can retry */
11206
11207 obj = *pobj;
11208 obj.bucket.update_bucket_id(new_bucket_id);
11209 pobj = &obj;
11210 }
11211
11212 if (r < 0) {
11213 return r;
11214 }
11215
11216 return 0;
11217}
11218
11219int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
11220{
11221 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
11222
11223 return waiter->block_while_resharding(bs, new_bucket_id);
11224}
11225
7c673cae
FG
11226int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
11227 bool delete_marker,
11228 const string& op_tag,
11229 struct rgw_bucket_dir_entry_meta *meta,
11230 uint64_t olh_epoch,
91327a77
AA
11231 real_time unmod_since, bool high_precision_time,
11232 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
11233{
11234 rgw_rados_ref ref;
11235 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11236 if (r < 0) {
11237 return r;
11238 }
11239
31f18b77
FG
11240 rgw_zone_set zones_trace;
11241 if (_zones_trace) {
11242 zones_trace = *_zones_trace;
7c673cae 11243 }
1adf2230 11244 zones_trace.insert(get_zone().id);
7c673cae 11245
31f18b77
FG
11246 BucketShard bs(this);
11247
7c673cae 11248 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
31f18b77
FG
11249 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11250 librados::ObjectWriteOperation op;
11251 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11252 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
11253 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
11254 unmod_since, high_precision_time,
11255 get_zone().log_data, zones_trace);
11256 });
11257 if (r < 0) {
11258 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11259 return r;
7c673cae
FG
11260 }
11261
91327a77
AA
11262 if (log_data_change && bucket_info.datasync_flag_enabled()) {
11263 data_log->add_entry(bs.bucket, bs.shard_id);
11264 }
11265
7c673cae
FG
11266 return 0;
11267}
11268
11269void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
11270{
11271 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
11272 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
11273}
11274
11275int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 11276 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
11277{
11278 rgw_rados_ref ref;
11279 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11280 if (r < 0) {
11281 return r;
11282 }
11283
31f18b77
FG
11284 rgw_zone_set zones_trace;
11285 if (_zones_trace) {
11286 zones_trace = *_zones_trace;
7c673cae 11287 }
31f18b77
FG
11288 zones_trace.insert(get_zone().id);
11289
11290 BucketShard bs(this);
7c673cae
FG
11291
11292 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
31f18b77
FG
11293 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11294 librados::ObjectWriteOperation op;
11295 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11296 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11297 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
11298 });
11299 if (r < 0) {
11300 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11301 return r;
7c673cae
FG
11302 }
11303
11304 return 0;
11305}
11306
11307int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11308 const rgw_obj& obj_instance, uint64_t ver_marker,
11309 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11310 bool *is_truncated)
11311{
11312 rgw_rados_ref ref;
11313 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11314 if (r < 0) {
11315 return r;
11316 }
11317
11318 BucketShard bs(this);
11319 int ret = bs.init(obj_instance.bucket, obj_instance);
11320 if (ret < 0) {
11321 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11322 return ret;
11323 }
11324
11325 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11326
11327 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11328
31f18b77
FG
11329 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11330 ObjectReadOperation op;
11331 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11332 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11333 key, ver_marker, olh_tag, log, is_truncated);
11334 });
11335 if (ret < 0) {
11336 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 11337 return ret;
31f18b77 11338 }
7c673cae
FG
11339
11340 return 0;
11341}
11342
11343int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11344{
11345 rgw_rados_ref ref;
11346 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11347 if (r < 0) {
11348 return r;
11349 }
11350
11351 BucketShard bs(this);
11352 int ret = bs.init(obj_instance.bucket, obj_instance);
11353 if (ret < 0) {
11354 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11355 return ret;
11356 }
11357
11358 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11359
11360 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11361
31f18b77
FG
11362 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11363 ObjectWriteOperation op;
11364 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11365 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11366 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
11367 });
11368 if (ret < 0) {
11369 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 11370 return ret;
31f18b77 11371 }
7c673cae
FG
11372
11373 return 0;
11374}
11375
11376int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11377{
11378 rgw_rados_ref ref;
11379 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11380 if (r < 0) {
11381 return r;
11382 }
11383
11384 BucketShard bs(this);
7c673cae
FG
11385
11386 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11387
11388 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11389
31f18b77
FG
11390 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11391 ObjectWriteOperation op;
11392 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11393 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11394 });
7c673cae
FG
11395 if (ret < 0) {
11396 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11397 return ret;
11398 }
11399
11400 return 0;
11401}
11402
11403int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11404 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
31f18b77 11405 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7c673cae
FG
11406{
11407 if (log.empty()) {
11408 return 0;
11409 }
11410
11411 librados::ObjectWriteOperation op;
11412
11413 uint64_t last_ver = log.rbegin()->first;
11414 *plast_ver = last_ver;
11415
11416 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11417
11418 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11419 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11420
b32b8144
FG
11421 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11422 op.mtime2(&mtime_ts);
11423
7c673cae
FG
11424 bool need_to_link = false;
11425 cls_rgw_obj_key key;
11426 bool delete_marker = false;
11427 list<cls_rgw_obj_key> remove_instances;
11428 bool need_to_remove = false;
11429
11430 for (iter = log.begin(); iter != log.end(); ++iter) {
11431 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11432 for (; viter != iter->second.end(); ++viter) {
11433 rgw_bucket_olh_log_entry& entry = *viter;
11434
11435 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11436 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11437 << (entry.delete_marker ? "(delete)" : "") << dendl;
11438 switch (entry.op) {
11439 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11440 remove_instances.push_back(entry.key);
11441 break;
11442 case CLS_RGW_OLH_OP_LINK_OLH:
11443 need_to_link = true;
11444 need_to_remove = false;
11445 key = entry.key;
11446 delete_marker = entry.delete_marker;
11447 break;
11448 case CLS_RGW_OLH_OP_UNLINK_OLH:
11449 need_to_remove = true;
11450 need_to_link = false;
11451 break;
11452 default:
11453 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11454 return -EIO;
11455 }
11456 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11457 attr_name.append(entry.op_tag);
11458 op.rmxattr(attr_name.c_str());
11459 }
11460 }
11461
11462 rgw_rados_ref ref;
11463 int r = get_obj_head_ref(bucket_info, obj, &ref);
11464 if (r < 0) {
11465 return r;
11466 }
11467
11468 const rgw_bucket& bucket = obj.bucket;
11469
11470 if (need_to_link) {
11471 rgw_obj target(bucket, key);
11472 RGWOLHInfo info;
11473 info.target = target;
11474 info.removed = delete_marker;
11475 bufferlist bl;
11476 ::encode(info, bl);
11477 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11478 }
11479
11480 /* first remove object instances */
11481 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11482 liter != remove_instances.end(); ++liter) {
11483 cls_rgw_obj_key& key = *liter;
11484 rgw_obj obj_instance(bucket, key);
31f18b77 11485 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae
FG
11486 if (ret < 0 && ret != -ENOENT) {
11487 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11488 return ret;
11489 }
11490 }
11491
11492 /* update olh object */
11493 r = ref.ioctx.operate(ref.oid, &op);
11494 if (r == -ECANCELED) {
11495 r = 0;
11496 }
11497 if (r < 0) {
11498 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11499 return r;
11500 }
11501
11502 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11503 if (r < 0) {
11504 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11505 return r;
11506 }
11507
11508 if (need_to_remove) {
11509 ObjectWriteOperation rm_op;
11510
11511 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11512 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11513 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11514 rm_op.remove();
11515
11516 r = ref.ioctx.operate(ref.oid, &rm_op);
11517 if (r == -ECANCELED) {
11518 return 0; /* someone else won this race */
11519 } else {
11520 /*
11521 * only clear if was successful, otherwise we might clobber pending operations on this object
11522 */
11523 r = bucket_index_clear_olh(bucket_info, state, obj);
11524 if (r < 0) {
11525 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11526 return r;
11527 }
11528 }
11529 }
11530
11531 return 0;
11532}
11533
11534/*
11535 * read olh log and apply it
11536 */
31f18b77 11537int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
11538{
11539 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11540 bool is_truncated;
11541 uint64_t ver_marker = 0;
11542
11543 do {
11544 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11545 if (ret < 0) {
11546 return ret;
11547 }
31f18b77 11548 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
11549 if (ret < 0) {
11550 return ret;
11551 }
11552 } while (is_truncated);
11553
11554 return 0;
11555}
11556
11557int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77
AA
11558 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
11559 rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
11560{
11561 string op_tag;
11562
11563 rgw_obj olh_obj = target_obj;
11564 olh_obj.key.instance.clear();
11565
11566 RGWObjState *state = NULL;
11567
11568 int ret = 0;
11569 int i;
31f18b77 11570
7c673cae
FG
11571#define MAX_ECANCELED_RETRY 100
11572 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11573 if (ret == -ECANCELED) {
11574 obj_ctx.obj.invalidate(olh_obj);
11575 }
11576
11577 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11578 if (ret < 0) {
11579 return ret;
11580 }
11581
11582 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11583 if (ret < 0) {
11584 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11585 if (ret == -ECANCELED) {
11586 continue;
11587 }
11588 return ret;
11589 }
91327a77
AA
11590 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
11591 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
11592 zones_trace, log_data_change);
7c673cae
FG
11593 if (ret < 0) {
11594 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11595 if (ret == -ECANCELED) {
11596 continue;
11597 }
11598 return ret;
11599 }
11600 break;
11601 }
11602
11603 if (i == MAX_ECANCELED_RETRY) {
11604 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11605 return -EIO;
11606 }
11607
11608 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11609 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11610 ret = 0;
11611 }
11612 if (ret < 0) {
11613 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11614 return ret;
11615 }
11616
11617 return 0;
11618}
11619
11620int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
31f18b77 11621 uint64_t olh_epoch, rgw_zone_set *zones_trace)
7c673cae
FG
11622{
11623 string op_tag;
11624
11625 rgw_obj olh_obj = target_obj;
11626 olh_obj.key.instance.clear();
11627
11628 RGWObjState *state = NULL;
11629
11630 int ret = 0;
11631 int i;
11632
11633 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11634 if (ret == -ECANCELED) {
11635 obj_ctx.obj.invalidate(olh_obj);
11636 }
11637
11638 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11639 if (ret < 0)
11640 return ret;
11641
11642 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11643 if (ret < 0) {
11644 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11645 if (ret == -ECANCELED) {
11646 continue;
11647 }
11648 return ret;
11649 }
11650
11651 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11652
31f18b77 11653 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae
FG
11654 if (ret < 0) {
11655 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11656 if (ret == -ECANCELED) {
11657 continue;
11658 }
11659 return ret;
11660 }
11661 break;
11662 }
11663
11664 if (i == MAX_ECANCELED_RETRY) {
11665 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11666 return -EIO;
11667 }
11668
31f18b77 11669 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
11670 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11671 return 0;
11672 }
11673 if (ret < 0) {
11674 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11675 return ret;
11676 }
11677
11678 return 0;
11679}
11680
11681void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11682{
11683#define OBJ_INSTANCE_LEN 32
11684 char buf[OBJ_INSTANCE_LEN + 1];
11685
11686 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11687 no underscore for instance name due to the way we encode the raw keys */
11688
11689 target_obj->key.set_instance(buf);
11690}
11691
11692static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11693 map<string, bufferlist> *attrset)
11694{
11695 attrset->clear();
11696 map<string, bufferlist>::iterator iter;
11697 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11698 iter != unfiltered_attrset.end(); ++iter) {
11699 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11700 break;
11701 (*attrset)[iter->first] = iter->second;
11702 }
11703}
11704
11705int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11706{
11707 map<string, bufferlist> unfiltered_attrset;
11708
11709 ObjectReadOperation op;
11710 op.getxattrs(&unfiltered_attrset, NULL);
11711
11712 bufferlist outbl;
11713 int r = obj_operate(bucket_info, obj, &op);
11714
11715 if (r < 0) {
11716 return r;
11717 }
11718 map<string, bufferlist> attrset;
11719
11720 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11721
11722 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11723 if (iter == attrset.end()) { /* not an olh */
11724 return -EINVAL;
11725 }
11726
11727 try {
11728 bufferlist::iterator biter = iter->second.begin();
11729 ::decode(*olh, biter);
11730 } catch (buffer::error& err) {
11731 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11732 return -EIO;
11733 }
11734
11735 return 0;
11736}
11737
11738void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11739 map<string, bufferlist> *rm_pending_entries)
11740{
11741 map<string, bufferlist>::iterator iter = pending_entries.begin();
11742
11743 real_time now = real_clock::now();
11744
11745 while (iter != pending_entries.end()) {
11746 bufferlist::iterator biter = iter->second.begin();
11747 RGWOLHPendingInfo pending_info;
11748 try {
11749 ::decode(pending_info, biter);
11750 } catch (buffer::error& err) {
11751 /* skipping bad entry, we could remove it but it might hide a bug */
11752 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11753 ++iter;
11754 continue;
11755 }
11756
11757 map<string, bufferlist>::iterator cur_iter = iter;
11758 ++iter;
11759 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11760 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11761 pending_entries.erase(cur_iter);
11762 } else {
11763 /* entries names are sorted by time (rounded to a second) */
11764 break;
11765 }
11766 }
11767}
11768
11769int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11770{
11771 ObjectWriteOperation op;
11772
11773 bucket_index_guard_olh_op(state, op);
11774
11775 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11776 op.rmxattr(iter->first.c_str());
11777 }
11778
11779 rgw_rados_ref ref;
11780 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11781 if (r < 0) {
11782 return r;
11783 }
11784
11785 /* update olh object */
11786 r = ref.ioctx.operate(ref.oid, &op);
11787 if (r == -ENOENT || r == -ECANCELED) {
11788 /* raced with some other change, shouldn't sweat about it */
11789 r = 0;
11790 }
11791 if (r < 0) {
11792 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11793 return r;
11794 }
11795
11796 return 0;
11797}
11798
11799int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11800{
11801 map<string, bufferlist> pending_entries;
11802 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11803
11804 map<string, bufferlist> rm_pending_entries;
11805 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11806
11807 if (!rm_pending_entries.empty()) {
11808 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11809 if (ret < 0) {
11810 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11811 return ret;
11812 }
11813 }
11814 if (!pending_entries.empty()) {
11815 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11816
11817 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11818 if (ret < 0) {
11819 return ret;
11820 }
11821 }
11822
11823 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11824 assert(iter != state->attrset.end());
11825 RGWOLHInfo olh;
11826 try {
11827 bufferlist::iterator biter = iter->second.begin();
11828 ::decode(olh, biter);
11829 } catch (buffer::error& err) {
11830 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11831 return -EIO;
11832 }
11833
11834 if (olh.removed) {
11835 return -ENOENT;
11836 }
11837
11838 *target = olh.target;
11839
11840 return 0;
11841}
11842
11843int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11844 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11845 RGWObjVersionTracker *objv_tracker)
11846{
11847 rgw_rados_ref ref;
11848 int r = get_raw_obj_ref(obj, &ref);
11849 if (r < 0) {
11850 return r;
11851 }
11852
11853 map<string, bufferlist> unfiltered_attrset;
11854 uint64_t size = 0;
11855 struct timespec mtime_ts;
11856
11857 ObjectReadOperation op;
11858 if (objv_tracker) {
11859 objv_tracker->prepare_op_for_read(&op);
11860 }
11861 if (attrs) {
11862 op.getxattrs(&unfiltered_attrset, NULL);
11863 }
11864 if (psize || pmtime) {
11865 op.stat2(&size, &mtime_ts, NULL);
11866 }
11867 if (first_chunk) {
11868 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11869 }
11870 bufferlist outbl;
11871 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11872
11873 if (epoch) {
11874 *epoch = ref.ioctx.get_last_version();
11875 }
11876
11877 if (r < 0)
11878 return r;
11879
11880 if (psize)
11881 *psize = size;
11882 if (pmtime)
11883 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11884 if (attrs) {
11885 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11886 }
11887
11888 return 0;
11889}
11890
11891int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 11892 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae
FG
11893{
11894 map<string, rgw_bucket_dir_header> headers;
11895 map<int, string> bucket_instance_ids;
11896 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11897 if (r < 0) {
11898 return r;
11899 }
11900
11901 assert(headers.size() == bucket_instance_ids.size());
11902
11903 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11904 map<int, string>::iterator viter = bucket_instance_ids.begin();
11905 BucketIndexShardsManager ver_mgr;
11906 BucketIndexShardsManager master_ver_mgr;
11907 BucketIndexShardsManager marker_mgr;
7c673cae
FG
11908 char buf[64];
11909 for(; iter != headers.end(); ++iter, ++viter) {
11910 accumulate_raw_stats(iter->second, stats);
11911 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11912 ver_mgr.add(viter->first, string(buf));
11913 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11914 master_ver_mgr.add(viter->first, string(buf));
11915 if (shard_id >= 0) {
11916 *max_marker = iter->second.max_marker;
11917 } else {
11918 marker_mgr.add(viter->first, iter->second.max_marker);
11919 }
c07f9fc5
FG
11920 if (syncstopped != NULL)
11921 *syncstopped = iter->second.syncstopped;
7c673cae
FG
11922 }
11923 ver_mgr.to_string(bucket_ver);
11924 master_ver_mgr.to_string(master_ver);
11925 if (shard_id < 0) {
11926 marker_mgr.to_string(max_marker);
11927 }
11928 return 0;
11929}
11930
11931int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11932 map<int, string>& markers)
11933{
11934 map<string, rgw_bucket_dir_header> headers;
11935 map<int, string> bucket_instance_ids;
11936 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11937 if (r < 0)
11938 return r;
11939
11940 assert(headers.size() == bucket_instance_ids.size());
11941
11942 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11943 map<int, string>::iterator viter = bucket_instance_ids.begin();
11944
11945 for(; iter != headers.end(); ++iter, ++viter) {
11946 if (shard_id >= 0) {
11947 markers[shard_id] = iter->second.max_marker;
11948 } else {
11949 markers[viter->first] = iter->second.max_marker;
11950 }
11951 }
11952 return 0;
11953}
11954
11955class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11956 RGWGetBucketStats_CB *cb;
11957 uint32_t pendings;
11958 map<RGWObjCategory, RGWStorageStats> stats;
11959 int ret_code;
11960 bool should_cb;
11961 Mutex lock;
11962
11963public:
11964 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11965 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11966 lock("RGWGetBucketStatsContext") {}
11967
11968 void handle_response(int r, rgw_bucket_dir_header& header) override {
11969 Mutex::Locker l(lock);
11970 if (should_cb) {
11971 if ( r >= 0) {
11972 accumulate_raw_stats(header, stats);
11973 } else {
11974 ret_code = r;
11975 }
11976
11977 // Are we all done?
11978 if (--pendings == 0) {
11979 if (!ret_code) {
11980 cb->set_response(&stats);
11981 }
11982 cb->handle_response(ret_code);
11983 cb->put();
11984 }
11985 }
11986 }
11987
11988 void unset_cb() {
11989 Mutex::Locker l(lock);
11990 should_cb = false;
11991 }
11992};
11993
11994int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11995{
11996 int num_aio = 0;
c07f9fc5 11997 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
7c673cae
FG
11998 assert(get_ctx);
11999 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
12000 if (r < 0) {
12001 ctx->put();
12002 if (num_aio) {
12003 get_ctx->unset_cb();
12004 }
12005 }
c07f9fc5 12006 get_ctx->put();
7c673cae
FG
12007 return r;
12008}
12009
12010class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
12011 RGWGetUserStats_CB *cb;
12012
12013public:
12014 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
12015 : cb(cb) {}
12016
12017 void handle_response(int r, cls_user_header& header) override {
12018 const cls_user_stats& hs = header.stats;
12019 if (r >= 0) {
12020 RGWStorageStats stats;
12021
12022 stats.size = hs.total_bytes;
12023 stats.size_rounded = hs.total_bytes_rounded;
12024 stats.num_objects = hs.total_entries;
12025
12026 cb->set_response(stats);
12027 }
12028
12029 cb->handle_response(r);
12030
12031 cb->put();
12032 }
12033};
12034
12035int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
12036{
12037 string user_str = user.to_str();
12038
12039 cls_user_header header;
12040 int r = cls_user_get_header(user_str, &header);
12041 if (r < 0)
12042 return r;
12043
12044 const cls_user_stats& hs = header.stats;
12045
12046 stats.size = hs.total_bytes;
12047 stats.size_rounded = hs.total_bytes_rounded;
12048 stats.num_objects = hs.total_entries;
12049
12050 return 0;
12051}
12052
12053int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
12054{
12055 string user_str = user.to_str();
12056
12057 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
12058 int r = cls_user_get_header_async(user_str, get_ctx);
12059 if (r < 0) {
12060 ctx->put();
12061 delete get_ctx;
12062 return r;
12063 }
12064
12065 return 0;
12066}
12067
12068void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
12069{
12070 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
12071}
12072
12073void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
12074{
12075 if (!bucket.oid.empty()) {
12076 obj.init(get_zone_params().domain_root, bucket.oid);
12077 } else {
12078 string oid;
12079 get_bucket_meta_oid(bucket, oid);
12080 obj.init(get_zone_params().domain_root, oid);
12081 }
12082}
12083
12084int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
12085 real_time *pmtime, map<string, bufferlist> *pattrs)
12086{
12087 size_t pos = meta_key.find(':');
12088 if (pos == string::npos) {
12089 return -EINVAL;
12090 }
12091 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
12092 rgw_bucket_instance_key_to_oid(oid);
12093
12094 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12095}
12096
12097int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
12098 real_time *pmtime, map<string, bufferlist> *pattrs)
12099{
12100 string oid;
12101 if (bucket.oid.empty()) {
12102 get_bucket_meta_oid(bucket, oid);
12103 } else {
12104 oid = bucket.oid;
12105 }
12106
12107 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12108}
12109
31f18b77 12110int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
7c673cae 12111 real_time *pmtime, map<string, bufferlist> *pattrs,
b32b8144
FG
12112 rgw_cache_entry_info *cache_info,
12113 boost::optional<obj_version> refresh_version)
7c673cae
FG
12114{
12115 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
12116
12117 bufferlist epbl;
12118
b32b8144
FG
12119 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12120 oid, epbl, &info.objv_tracker, pmtime, pattrs,
12121 cache_info, refresh_version);
7c673cae
FG
12122 if (ret < 0) {
12123 return ret;
12124 }
12125
12126 bufferlist::iterator iter = epbl.begin();
12127 try {
12128 ::decode(info, iter);
12129 } catch (buffer::error& err) {
12130 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12131 return -EIO;
12132 }
12133 info.bucket.oid = oid;
12134 return 0;
12135}
12136
12137int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
12138 const string& tenant_name,
12139 const string& bucket_name,
12140 RGWBucketEntryPoint& entry_point,
12141 RGWObjVersionTracker *objv_tracker,
12142 real_time *pmtime,
12143 map<string, bufferlist> *pattrs,
b32b8144
FG
12144 rgw_cache_entry_info *cache_info,
12145 boost::optional<obj_version> refresh_version)
7c673cae
FG
12146{
12147 bufferlist bl;
12148 string bucket_entry;
12149
12150 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
b32b8144
FG
12151 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12152 bucket_entry, bl, objv_tracker, pmtime, pattrs,
12153 cache_info, refresh_version);
7c673cae
FG
12154 if (ret < 0) {
12155 return ret;
12156 }
12157
12158 bufferlist::iterator iter = bl.begin();
12159 try {
12160 ::decode(entry_point, iter);
12161 } catch (buffer::error& err) {
12162 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12163 return -EIO;
12164 }
12165 return 0;
12166}
12167
12168int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
12169 const string& tenant_name,
12170 const string& bucket_name)
12171{
12172 RGWBucketEntryPoint entry_point;
12173 real_time ep_mtime;
12174 RGWObjVersionTracker ot;
12175 map<string, bufferlist> attrs;
12176 RGWBucketInfo info;
12177
12178 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
12179
12180 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
12181 if (ret < 0) {
12182 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
12183 return ret;
12184 }
12185
12186 if (!entry_point.has_bucket_info) {
12187 /* already converted! */
12188 return 0;
12189 }
12190
12191 info = entry_point.old_bucket_info;
12192 info.bucket.oid = bucket_name;
12193 info.ep_objv = ot.read_version;
12194
12195 ot.generate_new_write_ver(cct);
12196
12197 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
12198 if (ret < 0) {
12199 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
12200 return ret;
12201 }
12202
12203 return 0;
12204}
12205
b32b8144
FG
12206int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx,
12207 const string& tenant,
12208 const string& bucket_name,
12209 RGWBucketInfo& info,
12210 real_time *pmtime,
12211 map<string, bufferlist> *pattrs,
12212 boost::optional<obj_version> refresh_version)
7c673cae
FG
12213{
12214 bucket_info_entry e;
12215 string bucket_entry;
12216 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
12217
b32b8144 12218
7c673cae 12219 if (binfo_cache->find(bucket_entry, &e)) {
b32b8144
FG
12220 if (refresh_version &&
12221 e.info.objv_tracker.read_version.compare(&(*refresh_version))) {
12222 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
12223 << "a failure that should be debugged. I am a nice machine, "
12224 << "so I will try to recover." << dendl;
12225 binfo_cache->invalidate(bucket_entry);
12226 }
7c673cae
FG
12227 info = e.info;
12228 if (pattrs)
12229 *pattrs = e.attrs;
12230 if (pmtime)
12231 *pmtime = e.mtime;
12232 return 0;
12233 }
12234
12235 RGWBucketEntryPoint entry_point;
12236 real_time ep_mtime;
12237 RGWObjVersionTracker ot;
12238 rgw_cache_entry_info entry_cache_info;
b32b8144
FG
12239 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
12240 entry_point, &ot, &ep_mtime, pattrs,
12241 &entry_cache_info, refresh_version);
7c673cae
FG
12242 if (ret < 0) {
12243 /* only init these fields */
12244 info.bucket.tenant = tenant;
12245 info.bucket.name = bucket_name;
12246 return ret;
12247 }
12248
12249 if (entry_point.has_bucket_info) {
12250 info = entry_point.old_bucket_info;
12251 info.bucket.oid = bucket_name;
12252 info.bucket.tenant = tenant;
12253 info.ep_objv = ot.read_version;
12254 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
12255 return 0;
12256 }
12257
12258 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12259 * that we got
12260 */
12261 if (pattrs) {
12262 pattrs->clear();
12263 }
12264
12265 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
12266
12267
12268 /* read bucket instance info */
12269
12270 string oid;
12271 get_bucket_meta_oid(entry_point.bucket, oid);
12272
12273 rgw_cache_entry_info cache_info;
12274
b32b8144
FG
12275 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
12276 &cache_info, refresh_version);
7c673cae
FG
12277 e.info.ep_objv = ot.read_version;
12278 info = e.info;
12279 if (ret < 0) {
b32b8144 12280 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
7c673cae
FG
12281 info.bucket.tenant = tenant;
12282 info.bucket.name = bucket_name;
12283 // XXX and why return anything in case of an error anyway?
12284 return ret;
12285 }
12286
12287 if (pmtime)
12288 *pmtime = e.mtime;
12289 if (pattrs)
12290 *pattrs = e.attrs;
12291
12292 list<rgw_cache_entry_info *> cache_info_entries;
12293 cache_info_entries.push_back(&entry_cache_info);
12294 cache_info_entries.push_back(&cache_info);
12295
12296
12297 /* chain to both bucket entry point and bucket instance */
12298 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
12299 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
12300 }
12301
b32b8144
FG
12302 if (refresh_version &&
12303 refresh_version->compare(&info.objv_tracker.read_version)) {
12304 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
12305 << "have gone squirrelly. An administrator may have forced a "
12306 << "change; otherwise there is a problem somewhere." << dendl;
12307 }
12308
7c673cae
FG
12309 return 0;
12310}
12311
b32b8144
FG
12312int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
12313 const string& tenant, const string& bucket_name,
12314 RGWBucketInfo& info,
12315 real_time *pmtime, map<string, bufferlist> *pattrs)
12316{
12317 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
12318 pattrs, boost::none);
12319}
12320
12321int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
12322 ceph::real_time *pmtime,
12323 map<string, bufferlist> *pattrs)
12324{
12325 RGWObjectCtx obj_ctx(this);
12326
12327 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
12328 info, pmtime, pattrs, info.objv_tracker.read_version);
12329}
12330
7c673cae
FG
12331int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
12332 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
12333 map<string, bufferlist> *pattrs)
12334{
12335 bufferlist epbl;
12336 ::encode(entry_point, epbl);
12337 string bucket_entry;
12338 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12339 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12340}
12341
12342int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12343 real_time mtime, map<string, bufferlist> *pattrs)
12344{
12345 info.has_instance_obj = true;
12346 bufferlist bl;
12347
12348 ::encode(info, bl);
12349
12350 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12351 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12352 if (ret == -EEXIST) {
12353 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12354 * bucket operation on this specific bucket (e.g., being synced from the master), but
12355 * since bucket instace meta object is unique for this specific bucket instace, we don't
12356 * need to return an error.
12357 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12358 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12359 * locally, while in the sync thread we sync the new bucket.
12360 */
12361 ret = 0;
12362 }
12363 return ret;
12364}
12365
12366int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12367 map<string, bufferlist> *pattrs, bool create_entry_point)
12368{
12369 bool create_head = !info.has_instance_obj || create_entry_point;
12370
12371 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12372 if (ret < 0) {
12373 return ret;
12374 }
12375
12376 if (!create_head)
12377 return 0; /* done! */
12378
12379 RGWBucketEntryPoint entry_point;
12380 entry_point.bucket = info.bucket;
12381 entry_point.owner = info.owner;
12382 entry_point.creation_time = info.creation_time;
12383 entry_point.linked = true;
12384 RGWObjVersionTracker ot;
12385 if (pep_objv && !pep_objv->tag.empty()) {
12386 ot.write_version = *pep_objv;
12387 } else {
12388 ot.generate_new_write_ver(cct);
12389 if (pep_objv) {
12390 *pep_objv = ot.write_version;
12391 }
12392 }
12393 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12394 if (ret < 0)
12395 return ret;
12396
12397 return 0;
12398}
12399
12400int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12401{
12402 rgw_rados_ref ref;
12403 int r = get_raw_obj_ref(obj, &ref);
12404 if (r < 0) {
12405 return r;
12406 }
12407
12408 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12409 if (r < 0)
12410 return r;
12411
12412 return 0;
12413
12414}
12415
12416int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12417 std::map<string, bufferlist>& m)
12418{
12419 rgw_rados_ref ref;
12420 int r = get_raw_obj_ref(obj, &ref);
12421 if (r < 0) {
12422 return r;
12423 }
12424
12425#define MAX_OMAP_GET_ENTRIES 1024
12426 const int count = MAX_OMAP_GET_ENTRIES;
12427 string start_after;
12428
12429 while (true) {
12430 std::map<string, bufferlist> t;
12431 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12432 if (r < 0) {
12433 return r;
12434 }
12435 if (t.empty()) {
12436 break;
12437 }
12438 start_after = t.rbegin()->first;
12439 m.insert(t.begin(), t.end());
12440 }
12441 return 0;
12442}
12443
12444int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12445{
12446 rgw_rados_ref ref;
12447 int r = get_raw_obj_ref(obj, &ref);
12448 if (r < 0) {
12449 return r;
12450 }
12451 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12452
12453 map<string, bufferlist> m;
12454 m[key] = bl;
12455
12456 r = ref.ioctx.omap_set(ref.oid, m);
12457
12458 return r;
12459}
12460
12461int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12462{
12463 rgw_rados_ref ref;
12464 int r = get_raw_obj_ref(obj, &ref);
12465 if (r < 0) {
12466 return r;
12467 }
12468
12469 r = ref.ioctx.omap_set(ref.oid, m);
12470
12471 return r;
12472}
12473
12474int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12475{
12476 rgw_rados_ref ref;
12477 int r = get_raw_obj_ref(obj, &ref);
12478 if (r < 0) {
12479 return r;
12480 }
12481
12482 set<string> k;
12483 k.insert(key);
12484
12485 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12486 return r;
12487}
12488
12489int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12490{
12491 RGWObjectCtx obj_ctx(this);
12492
12493 map<string, RGWBucketEnt>::iterator iter;
12494 for (iter = m.begin(); iter != m.end(); ++iter) {
12495 RGWBucketEnt& ent = iter->second;
12496 rgw_bucket& bucket = ent.bucket;
12497 ent.count = 0;
12498 ent.size = 0;
12499 ent.size_rounded = 0;
12500
12501 map<string, rgw_bucket_dir_header> headers;
12502
12503 RGWBucketInfo bucket_info;
12504 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12505 if (ret < 0) {
12506 return ret;
12507 }
12508
12509 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12510 if (r < 0)
12511 return r;
12512
12513 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12514 for (; hiter != headers.end(); ++hiter) {
12515 RGWObjCategory category = main_category;
12516 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12517 if (iter != hiter->second.stats.end()) {
12518 struct rgw_bucket_category_stats& stats = iter->second;
12519 ent.count += stats.num_entries;
12520 ent.size += stats.total_size;
12521 ent.size_rounded += stats.total_size_rounded;
12522 }
12523 }
3efd9988
FG
12524
12525 // fill in placement_rule from the bucket instance for use in swift's
12526 // per-storage policy statistics
12527 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
12528 }
12529
12530 return m.size();
12531}
12532
12533int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12534{
12535 rgw_rados_ref ref;
12536 int r = get_raw_obj_ref(obj, &ref);
12537 if (r < 0) {
12538 return r;
12539 }
12540 librados::Rados *rad = get_rados_handle();
12541 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12542
12543 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12544 completion->release();
12545 return r;
12546}
12547
12548int RGWRados::distribute(const string& key, bufferlist& bl)
12549{
12550 /*
12551 * we were called before watch was initialized. This can only happen if we're updating some system
12552 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12553 * objects, they're currently only read on startup anyway.
12554 */
12555 if (!watch_initialized)
12556 return 0;
12557
12558 string notify_oid;
12559 pick_control_oid(key, notify_oid);
12560
12561 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12562 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12563}
12564
12565int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12566{
12567 librados::IoCtx& io_ctx = ctx.io_ctx;
12568 librados::NObjectIterator& iter = ctx.iter;
12569
12570 int r = open_pool_ctx(pool, io_ctx);
12571 if (r < 0)
12572 return r;
12573
12574 iter = io_ctx.nobjects_begin();
12575
12576 return 0;
12577}
12578
181888fb
FG
12579int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12580{
12581 librados::IoCtx& io_ctx = ctx.io_ctx;
12582 librados::NObjectIterator& iter = ctx.iter;
12583
12584 int r = open_pool_ctx(pool, io_ctx);
12585 if (r < 0)
12586 return r;
12587
12588 librados::ObjectCursor oc;
12589 if (!oc.from_str(cursor)) {
12590 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12591 return -EINVAL;
12592 }
12593
12594 iter = io_ctx.nobjects_begin(oc);
12595
12596 return 0;
12597}
12598
12599string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12600{
12601 return ctx.iter.get_cursor().to_str();
12602}
12603
7c673cae
FG
12604int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12605 bool *is_truncated, RGWAccessListFilter *filter)
12606{
12607 librados::IoCtx& io_ctx = ctx.io_ctx;
12608 librados::NObjectIterator& iter = ctx.iter;
12609
12610 if (iter == io_ctx.nobjects_end())
12611 return -ENOENT;
12612
12613 uint32_t i;
12614
12615 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12616 rgw_bucket_dir_entry e;
12617
12618 string oid = iter->get_oid();
12619 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12620
12621 // fill it in with initial values; we may correct later
12622 if (filter && !filter->filter(oid, oid))
12623 continue;
12624
12625 e.key = oid;
12626 objs.push_back(e);
12627 }
12628
12629 if (is_truncated)
12630 *is_truncated = (iter != io_ctx.nobjects_end());
12631
12632 return objs.size();
12633}
12634struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12635 string prefix;
12636
12637 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12638 bool filter(string& name, string& key) override {
12639 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12640 }
12641};
12642
181888fb 12643int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 12644{
181888fb
FG
12645 if (!ctx->initialized) {
12646 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
7c673cae
FG
12647 if (r < 0) {
12648 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12649 return r;
12650 }
181888fb 12651 ctx->initialized = true;
7c673cae 12652 }
181888fb
FG
12653 return 0;
12654}
7c673cae 12655
181888fb
FG
12656int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12657 RGWListRawObjsCtx& ctx, list<string>& oids,
12658 bool *is_truncated)
12659{
12660 if (!ctx.initialized) {
12661 return -EINVAL;
12662 }
12663 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae
FG
12664 vector<rgw_bucket_dir_entry> objs;
12665 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12666 if (r < 0) {
12667 if(r != -ENOENT)
12668 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12669 return r;
12670 }
12671
12672 vector<rgw_bucket_dir_entry>::iterator iter;
12673 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12674 oids.push_back(iter->key.name);
12675 }
12676
12677 return oids.size();
12678}
12679
181888fb
FG
12680int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12681 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12682 bool *is_truncated)
12683{
12684 if (!ctx.initialized) {
12685 int r = list_raw_objects_init(pool, string(), &ctx);
12686 if (r < 0) {
12687 return r;
12688 }
12689 }
12690
12691 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12692}
12693
12694string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12695{
12696 return pool_iterate_get_cursor(ctx.iter_ctx);
12697}
12698
7c673cae
FG
12699int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12700 std::list<rgw_bi_log_entry>& result, bool *truncated)
12701{
12702 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12703 result.clear();
12704
12705 librados::IoCtx index_ctx;
12706 map<int, string> oids;
12707 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12708 map<int, string> bucket_instance_ids;
12709 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12710 if (r < 0)
12711 return r;
12712
12713 BucketIndexShardsManager marker_mgr;
12714 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12715 // If there are multiple shards for the bucket index object, the marker
12716 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12717 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12718 // only contain one record, and the key is the bucket instance id.
12719 r = marker_mgr.from_string(marker, shard_id);
12720 if (r < 0)
12721 return r;
12722
12723 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12724 if (r < 0)
12725 return r;
12726
12727 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12728 map<int, list<rgw_bi_log_entry>::iterator> vends;
12729 if (truncated) {
12730 *truncated = false;
12731 }
12732 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12733 for (; miter != bi_log_lists.end(); ++miter) {
12734 int shard_id = miter->first;
12735 vcurrents[shard_id] = miter->second.entries.begin();
12736 vends[shard_id] = miter->second.entries.end();
12737 if (truncated) {
12738 *truncated = (*truncated || miter->second.truncated);
12739 }
12740 }
12741
12742 size_t total = 0;
12743 bool has_more = true;
12744 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12745 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12746 while (total < max && has_more) {
12747 has_more = false;
12748
12749 viter = vcurrents.begin();
12750 eiter = vends.begin();
12751
12752 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12753 assert (eiter != vends.end());
12754
12755 int shard_id = viter->first;
12756 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12757
12758 if (liter == eiter->second){
12759 continue;
12760 }
12761 rgw_bi_log_entry& entry = *(liter);
12762 if (has_shards) {
12763 char buf[16];
12764 snprintf(buf, sizeof(buf), "%d", shard_id);
12765 string tmp_id;
12766 build_bucket_index_marker(buf, entry.id, &tmp_id);
12767 entry.id.swap(tmp_id);
12768 }
12769 marker_mgr.add(shard_id, entry.id);
12770 result.push_back(entry);
12771 total++;
12772 has_more = true;
12773 ++liter;
12774 }
12775 }
12776
12777 if (truncated) {
12778 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12779 assert (eiter != vends.end());
12780 *truncated = (*truncated || (viter->second != eiter->second));
12781 }
12782 }
12783
12784 // Refresh marker, if there are multiple shards, the output will look like
12785 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12786 // if there is no sharding, the simply marker (without oid) is returned
12787 if (has_shards) {
12788 marker_mgr.to_string(&marker);
12789 } else {
12790 if (!result.empty()) {
12791 marker = result.rbegin()->id;
12792 }
12793 }
12794
12795 return 0;
12796}
12797
12798int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12799{
12800 librados::IoCtx index_ctx;
12801 map<int, string> bucket_objs;
31f18b77
FG
12802
12803 BucketIndexShardsManager start_marker_mgr;
12804 BucketIndexShardsManager end_marker_mgr;
12805
7c673cae 12806 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
31f18b77 12807 if (r < 0) {
7c673cae 12808 return r;
31f18b77 12809 }
7c673cae 12810
7c673cae 12811 r = start_marker_mgr.from_string(start_marker, shard_id);
31f18b77 12812 if (r < 0) {
7c673cae 12813 return r;
31f18b77
FG
12814 }
12815
7c673cae 12816 r = end_marker_mgr.from_string(end_marker, shard_id);
31f18b77 12817 if (r < 0) {
7c673cae 12818 return r;
31f18b77 12819 }
7c673cae
FG
12820
12821 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
31f18b77
FG
12822 cct->_conf->rgw_bucket_index_max_aio)();
12823
12824 return r;
7c673cae
FG
12825}
12826
c07f9fc5
FG
12827int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12828{
12829 librados::IoCtx index_ctx;
12830 map<int, string> bucket_objs;
12831 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12832 if (r < 0)
12833 return r;
12834
12835 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12836}
12837
12838int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12839{
12840 librados::IoCtx index_ctx;
12841 map<int, string> bucket_objs;
12842 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12843 if (r < 0)
12844 return r;
12845
12846 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12847}
12848
7c673cae
FG
12849int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12850{
12851 rgw_rados_ref ref;
12852 int r = get_obj_head_ref(bucket_info, obj, &ref);
12853 if (r < 0) {
12854 return r;
12855 }
12856
12857 rgw_cls_bi_entry bi_entry;
12858 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12859 if (r < 0 && r != -ENOENT) {
12860 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12861 }
12862 if (r < 0) {
12863 return r;
12864 }
12865 bufferlist::iterator iter = bi_entry.data.begin();
12866 try {
12867 ::decode(*dirent, iter);
12868 } catch (buffer::error& err) {
12869 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12870 return -EIO;
12871 }
12872
12873 return 0;
12874}
12875
12876int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12877{
12878 BucketShard bs(this);
12879 int ret = bs.init(bucket, obj);
12880 if (ret < 0) {
12881 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12882 return ret;
12883 }
12884
12885 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12886
12887 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12888 if (ret < 0)
12889 return ret;
12890
12891 return 0;
12892}
12893
12894void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12895{
12896 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12897}
12898
12899int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12900{
12901 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12902 if (ret < 0)
12903 return ret;
12904
12905 return 0;
12906}
12907
12908int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12909{
12910 BucketShard bs(this);
12911 int ret = bs.init(bucket, obj);
12912 if (ret < 0) {
12913 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12914 return ret;
12915 }
12916
12917 return bi_put(bs, entry);
12918}
12919
12920int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12921{
12922 rgw_obj obj(bucket, obj_name);
12923 BucketShard bs(this);
12924 int ret = bs.init(bucket, obj);
12925 if (ret < 0) {
12926 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12927 return ret;
12928 }
12929
12930 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
31f18b77
FG
12931 if (ret == -ENOENT) {
12932 *is_truncated = false;
12933 }
7c673cae
FG
12934 if (ret < 0)
12935 return ret;
12936
12937 return 0;
12938}
12939
12940int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12941{
12942 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12943 if (ret < 0)
12944 return ret;
12945
12946 return 0;
12947}
12948
12949int RGWRados::bi_remove(BucketShard& bs)
12950{
12951 int ret = bs.index_ctx.remove(bs.bucket_obj);
12952 if (ret == -ENOENT) {
12953 ret = 0;
12954 }
12955 if (ret < 0) {
12956 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12957 return ret;
12958 }
12959
12960 return 0;
12961}
12962
12963int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12964{
12965 BucketShard bs(this);
12966 int ret = bs.init(bucket, shard_id);
12967 if (ret < 0) {
12968 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12969 return ret;
12970 }
12971
12972 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12973}
12974
12975int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12976{
12977 return gc_pool_ctx.operate(oid, op);
12978}
12979
12980int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12981{
12982 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12983 int r = gc_pool_ctx.aio_operate(oid, c, op);
12984 c->release();
12985 return r;
12986}
12987
12988int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12989{
12990 return gc_pool_ctx.operate(oid, op, pbl);
12991}
12992
12993int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12994{
12995 return gc->list(index, marker, max, expired_only, result, truncated);
12996}
12997
12998int RGWRados::process_gc()
12999{
13000 return gc->process();
13001}
13002
13003int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
13004{
13005 return lc->list_lc_progress(marker, max_entries, progress_map);
13006}
13007
13008int RGWRados::process_lc()
13009{
13010 return lc->process();
13011}
13012
1adf2230 13013bool RGWRados::process_expire_objects()
7c673cae 13014{
1adf2230 13015 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
7c673cae
FG
13016}
13017
13018int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
13019{
13020 bufferlist in;
13021 cls_rgw_bucket_init(op);
13022 return index_ctx.operate(oid, &op);
13023}
13024
13025int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
31f18b77 13026 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 13027{
31f18b77
FG
13028 rgw_zone_set zones_trace;
13029 if (_zones_trace) {
13030 zones_trace = *_zones_trace;
13031 }
1adf2230
AA
13032 zones_trace.insert(get_zone().id);
13033
7c673cae
FG
13034 ObjectWriteOperation o;
13035 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77
FG
13036 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13037 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
7c673cae
FG
13038 return bs.index_ctx.operate(bs.bucket_obj, &o);
13039}
13040
31f18b77 13041int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
13042 int64_t pool, uint64_t epoch,
13043 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 13044 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 13045{
7c673cae
FG
13046 ObjectWriteOperation o;
13047 rgw_bucket_dir_entry_meta dir_meta;
13048 dir_meta = ent.meta;
13049 dir_meta.category = category;
13050
1adf2230
AA
13051 rgw_zone_set zones_trace;
13052 if (_zones_trace) {
13053 zones_trace = *_zones_trace;
13054 }
13055 zones_trace.insert(get_zone().id);
13056
7c673cae
FG
13057 rgw_bucket_entry_ver ver;
13058 ver.pool = pool;
13059 ver.epoch = epoch;
13060 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
13061 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13062 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
1adf2230 13063 get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
13064 complete_op_data *arg;
13065 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
1adf2230 13066 get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77
FG
13067 librados::AioCompletion *completion = arg->rados_completion;
13068 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
13069 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
13070 return ret;
13071}
13072
31f18b77 13073int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
13074 int64_t pool, uint64_t epoch,
13075 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 13076 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 13077{
31f18b77 13078 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
13079}
13080
13081int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
13082 int64_t pool, uint64_t epoch,
13083 rgw_obj& obj,
13084 real_time& removed_mtime,
13085 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
13086 uint16_t bilog_flags,
13087 rgw_zone_set *zones_trace)
7c673cae
FG
13088{
13089 rgw_bucket_dir_entry ent;
13090 ent.meta.mtime = removed_mtime;
13091 obj.key.get_index_key(&ent.key);
31f18b77 13092 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
13093}
13094
31f18b77 13095int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
13096{
13097 rgw_bucket_dir_entry ent;
13098 obj.key.get_index_key(&ent.key);
31f18b77 13099 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
7c673cae
FG
13100}
13101
13102int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
13103{
13104 librados::IoCtx index_ctx;
13105 map<int, string> bucket_objs;
13106 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
13107 if (r < 0)
13108 return r;
13109
13110 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
13111}
13112
1adf2230
AA
13113
13114int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
13115 int shard_id,
13116 rgw_obj_index_key& start,
13117 const string& prefix,
13118 uint32_t num_entries,
13119 bool list_versions,
13120 map<string, rgw_bucket_dir_entry>& m,
13121 bool *is_truncated,
13122 rgw_obj_index_key *last_entry,
13123 bool (*force_check_filter)(const string& name))
7c673cae 13124{
1adf2230
AA
13125 ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
13126 " start " << start.name << "[" << start.instance << "] num_entries " <<
13127 num_entries << dendl;
7c673cae
FG
13128
13129 librados::IoCtx index_ctx;
13130 // key - oid (for different shards if there is any)
1adf2230
AA
13131 // value - list result for the corresponding oid (shard), it is filled by
13132 // the AIO callback
7c673cae
FG
13133 map<int, string> oids;
13134 map<int, struct rgw_cls_list_ret> list_results;
13135 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13136 if (r < 0)
13137 return r;
13138
13139 cls_rgw_obj_key start_key(start.name, start.instance);
1adf2230
AA
13140 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
13141 list_versions, oids, list_results,
13142 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
13143 if (r < 0)
13144 return r;
13145
13146 // Create a list of iterators that are used to iterate each shard
13147 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
13148 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
13149 vector<string> vnames(list_results.size());
13150 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13151 *is_truncated = false;
13152 for (; iter != list_results.end(); ++iter) {
13153 vcurrents.push_back(iter->second.dir.m.begin());
13154 vends.push_back(iter->second.dir.m.end());
13155 vnames.push_back(oids[iter->first]);
13156 *is_truncated = (*is_truncated || iter->second.is_truncated);
13157 }
13158
13159 // Create a map to track the next candidate entry from each shard, if the entry
13160 // from a specified shard is selected/erased, the next entry from that shard will
13161 // be inserted for next round selection
13162 map<string, size_t> candidates;
13163 for (size_t i = 0; i < vcurrents.size(); ++i) {
13164 if (vcurrents[i] != vends[i]) {
13165 candidates[vcurrents[i]->first] = i;
13166 }
13167 }
13168
13169 map<string, bufferlist> updates;
13170 uint32_t count = 0;
13171 while (count < num_entries && !candidates.empty()) {
13172 r = 0;
13173 // Select the next one
13174 int pos = candidates.begin()->second;
13175 const string& name = vcurrents[pos]->first;
13176 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
13177
3efd9988
FG
13178 bool force_check = force_check_filter &&
13179 force_check_filter(dirent.key.name);
13180 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13181 !dirent.pending_map.empty() ||
13182 force_check) {
7c673cae
FG
13183 /* there are uncommitted ops. We need to check the current state,
13184 * and if the tags are old we need to do cleanup as well. */
13185 librados::IoCtx sub_ctx;
13186 sub_ctx.dup(index_ctx);
1adf2230
AA
13187 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
13188 updates[vnames[pos]]);
7c673cae
FG
13189 if (r < 0 && r != -ENOENT) {
13190 return r;
13191 }
13192 }
13193 if (r >= 0) {
1adf2230
AA
13194 ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
13195 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
7c673cae
FG
13196 m[name] = std::move(dirent);
13197 ++count;
13198 }
13199
13200 // Refresh the candidates map
13201 candidates.erase(candidates.begin());
13202 ++vcurrents[pos];
13203 if (vcurrents[pos] != vends[pos]) {
13204 candidates[vcurrents[pos]->first] = pos;
13205 }
13206 }
13207
13208 // Suggest updates if there is any
13209 map<string, bufferlist>::iterator miter = updates.begin();
13210 for (; miter != updates.end(); ++miter) {
13211 if (miter->second.length()) {
13212 ObjectWriteOperation o;
13213 cls_rgw_suggest_changes(o, miter->second);
13214 // we don't care if we lose suggested updates, send them off blindly
13215 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13216 index_ctx.aio_operate(miter->first, c, &o);
1adf2230 13217 c->release();
7c673cae
FG
13218 }
13219 }
13220
13221 // Check if all the returned entries are consumed or not
13222 for (size_t i = 0; i < vcurrents.size(); ++i) {
1adf2230 13223 if (vcurrents[i] != vends[i]) {
7c673cae 13224 *is_truncated = true;
1adf2230
AA
13225 break;
13226 }
7c673cae
FG
13227 }
13228 if (!m.empty())
13229 *last_entry = m.rbegin()->first;
13230
13231 return 0;
13232}
13233
1adf2230
AA
13234
13235int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
13236 int shard_id,
13237 rgw_obj_index_key& start,
13238 const string& prefix,
13239 uint32_t num_entries,
13240 bool list_versions,
13241 std::vector<rgw_bucket_dir_entry>& ent_list,
13242 bool *is_truncated,
13243 rgw_obj_index_key *last_entry,
13244 bool (*force_check_filter)(const string& name)) {
13245 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
13246 " start " << start.name << "[" << start.instance <<
13247 "] num_entries " << num_entries << dendl;
13248
13249 *is_truncated = false;
13250 librados::IoCtx index_ctx;
13251
13252 rgw_obj_index_key my_start = start;
13253
13254 map<int, string> oids;
13255 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13256 if (r < 0)
13257 return r;
13258 const uint32_t num_shards = oids.size();
13259
13260 uint32_t current_shard;
13261 if (shard_id >= 0) {
13262 current_shard = shard_id;
13263 } else if (my_start.empty()) {
13264 current_shard = 0u;
13265 } else {
13266 current_shard =
13267 rgw_bucket_shard_index(my_start.name, num_shards);
13268 }
13269
13270 uint32_t count = 0u;
13271 map<string, bufferlist> updates;
13272 std::string last_added_entry;
13273 while (count <= num_entries &&
13274 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
13275 current_shard < num_shards)) {
13276 // key - oid (for different shards if there is any)
13277 // value - list result for the corresponding oid (shard), it is filled by
13278 // the AIO callback
13279 map<int, struct rgw_cls_list_ret> list_results;
13280 r = CLSRGWIssueBucketList(index_ctx, my_start, prefix, num_entries,
13281 list_versions, oids, list_results,
13282 cct->_conf->rgw_bucket_index_max_aio)();
13283 if (r < 0)
13284 return r;
13285
13286 const std::string& oid = oids[current_shard];
13287 assert(list_results.find(current_shard) != list_results.end());
13288 auto& result = list_results[current_shard];
13289 for (auto& entry : result.dir.m) {
13290 rgw_bucket_dir_entry& dirent = entry.second;
13291
13292 bool force_check = force_check_filter &&
13293 force_check_filter(dirent.key.name);
13294 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13295 !dirent.pending_map.empty() ||
13296 force_check) {
13297 /* there are uncommitted ops. We need to check the current state,
13298 * and if the tags are old we need to do cleanup as well. */
13299 librados::IoCtx sub_ctx;
13300 sub_ctx.dup(index_ctx);
13301 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
13302 if (r < 0 && r != -ENOENT) {
13303 return r;
13304 }
13305 }
13306
13307 // at this point either r >=0 or r == -ENOENT
13308 if (r >= 0) { // i.e., if r != -ENOENT
13309 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
13310 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
13311
13312 if (count < num_entries) {
13313 last_added_entry = entry.first;
13314 my_start = dirent.key;
13315 ent_list.emplace_back(std::move(dirent));
13316 ++count;
13317 } else {
13318 *is_truncated = true;
13319 goto check_updates;
13320 }
13321 } else { // r == -ENOENT
13322 // in the case of -ENOENT, make sure we're advancing marker
13323 // for possible next call to CLSRGWIssueBucketList
13324 my_start = dirent.key;
13325 }
13326 } // entry for loop
13327
13328 if (!result.is_truncated) {
13329 // if we reached the end of the shard read next shard
13330 ++current_shard;
13331 my_start = rgw_obj_index_key();
13332 }
13333 } // shard loop
13334
13335check_updates:
13336 // suggest updates if there is any
13337 map<string, bufferlist>::iterator miter = updates.begin();
13338 for (; miter != updates.end(); ++miter) {
13339 if (miter->second.length()) {
13340 ObjectWriteOperation o;
13341 cls_rgw_suggest_changes(o, miter->second);
13342 // we don't care if we lose suggested updates, send them off blindly
13343 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13344 index_ctx.aio_operate(miter->first, c, &o);
13345 c->release();
13346 }
13347 }
13348
13349 if (last_entry && !ent_list.empty()) {
13350 *last_entry = last_added_entry;
13351 }
13352
13353 return 0;
13354}
13355
13356
13357int RGWRados::cls_obj_usage_log_add(const string& oid,
13358 rgw_usage_log_info& info)
7c673cae
FG
13359{
13360 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13361
13362 rgw_rados_ref ref;
224ce89b 13363 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13364 if (r < 0) {
13365 return r;
13366 }
13367
13368 ObjectWriteOperation op;
13369 cls_rgw_usage_log_add(op, info);
13370
13371 r = ref.ioctx.operate(ref.oid, &op);
13372 return r;
13373}
13374
13375int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
13376 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
13377{
13378 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13379
13380 rgw_rados_ref ref;
224ce89b 13381 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13382 if (r < 0) {
13383 return r;
13384 }
13385
13386 *is_truncated = false;
13387
13388 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
13389 max_entries, read_iter, usage, is_truncated);
13390
13391 return r;
13392}
13393
13394int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
13395{
13396 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13397
13398 rgw_rados_ref ref;
224ce89b 13399 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13400 if (r < 0) {
13401 return r;
13402 }
13403
b32b8144 13404 r = cls_rgw_usage_log_trim(ref.ioctx, ref.oid, user, start_epoch, end_epoch);
7c673cae
FG
13405 return r;
13406}
13407
13408int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
13409{
13410 librados::IoCtx index_ctx;
13411 string dir_oid;
13412
13413 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13414
13415 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
13416 if (r < 0)
13417 return r;
13418
13419 bufferlist updates;
13420
13421 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
13422 rgw_bucket_dir_entry entry;
13423 entry.key = *iter;
13424 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
13425 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13426 updates.append(CEPH_RGW_REMOVE | suggest_flag);
13427 ::encode(entry, updates);
13428 }
13429
13430 bufferlist out;
13431
13432 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
13433
13434 return r;
13435}
13436
13437int RGWRados::check_disk_state(librados::IoCtx io_ctx,
13438 const RGWBucketInfo& bucket_info,
13439 rgw_bucket_dir_entry& list_state,
13440 rgw_bucket_dir_entry& object,
13441 bufferlist& suggested_updates)
13442{
13443 const rgw_bucket& bucket = bucket_info.bucket;
13444 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13445
13446 std::string loc;
13447
13448 rgw_obj obj(bucket, list_state.key);
13449
13450 string oid;
13451 get_obj_bucket_and_oid_loc(obj, oid, loc);
13452
13453 if (loc != list_state.locator) {
13454 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
13455 }
13456
13457 io_ctx.locator_set_key(list_state.locator);
13458
13459 RGWObjState *astate = NULL;
13460 RGWObjectCtx rctx(this);
13461 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
13462 if (r < 0)
13463 return r;
13464
13465 list_state.pending_map.clear(); // we don't need this and it inflates size
13466 if (!astate->exists) {
13467 /* object doesn't exist right now -- hopefully because it's
13468 * marked as !exists and got deleted */
13469 if (list_state.exists) {
13470 /* FIXME: what should happen now? Work out if there are any
13471 * non-bad ways this could happen (there probably are, but annoying
13472 * to handle!) */
13473 }
13474 // encode a suggested removal of that key
13475 list_state.ver.epoch = io_ctx.get_last_version();
13476 list_state.ver.pool = io_ctx.get_id();
13477 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
13478 return -ENOENT;
13479 }
13480
13481 string etag;
13482 string content_type;
13483 ACLOwner owner;
13484
13485 object.meta.size = astate->size;
13486 object.meta.accounted_size = astate->accounted_size;
13487 object.meta.mtime = astate->mtime;
13488
13489 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
13490 if (iter != astate->attrset.end()) {
13491 etag = iter->second.c_str();
13492 }
13493 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
13494 if (iter != astate->attrset.end()) {
13495 content_type = iter->second.c_str();
13496 }
13497 iter = astate->attrset.find(RGW_ATTR_ACL);
13498 if (iter != astate->attrset.end()) {
13499 r = decode_policy(iter->second, &owner);
13500 if (r < 0) {
13501 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
13502 }
13503 }
13504
13505 if (astate->has_manifest) {
13506 RGWObjManifest::obj_iterator miter;
13507 RGWObjManifest& manifest = astate->manifest;
13508 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
13509 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
13510 rgw_obj loc;
13511 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
13512
13513 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
13514 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
13515 r = delete_obj_index(loc);
13516 if (r < 0) {
13517 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
13518 }
13519 }
13520 }
13521 }
13522
13523 object.meta.etag = etag;
13524 object.meta.content_type = content_type;
13525 object.meta.owner = owner.get_id().to_str();
13526 object.meta.owner_display_name = owner.get_display_name();
13527
13528 // encode suggested updates
13529 list_state.ver.pool = io_ctx.get_id();
13530 list_state.ver.epoch = astate->epoch;
13531 list_state.meta.size = object.meta.size;
13532 list_state.meta.accounted_size = object.meta.accounted_size;
13533 list_state.meta.mtime = object.meta.mtime;
13534 list_state.meta.category = main_category;
13535 list_state.meta.etag = etag;
13536 list_state.meta.content_type = content_type;
13537 if (astate->obj_tag.length() > 0)
13538 list_state.tag = astate->obj_tag.c_str();
13539 list_state.meta.owner = owner.get_id().to_str();
13540 list_state.meta.owner_display_name = owner.get_display_name();
13541
13542 list_state.exists = true;
13543 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
13544 return 0;
13545}
13546
13547int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
13548{
13549 librados::IoCtx index_ctx;
13550 map<int, string> oids;
13551 map<int, struct rgw_cls_list_ret> list_results;
13552 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13553 if (r < 0)
13554 return r;
13555
13556 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13557 if (r < 0)
13558 return r;
13559
13560 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13561 for(; iter != list_results.end(); ++iter) {
13562 headers[oids[iter->first]] = iter->second.dir.header;
13563 }
13564 return 0;
13565}
13566
13567int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13568{
13569 librados::IoCtx index_ctx;
13570 map<int, string> bucket_objs;
13571 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13572 if (r < 0)
13573 return r;
13574
13575 map<int, string>::iterator iter = bucket_objs.begin();
13576 for (; iter != bucket_objs.end(); ++iter) {
13577 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13578 if (r < 0) {
13579 ctx->put();
13580 break;
13581 } else {
13582 (*num_aio)++;
13583 }
13584 }
13585 return r;
13586}
13587
13588int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13589{
13590 string buckets_obj_id;
13591 rgw_get_buckets_obj(user_id, buckets_obj_id);
13592 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13593
13594 rgw_rados_ref ref;
224ce89b 13595 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13596 if (r < 0) {
13597 return r;
13598 }
13599
13600 librados::ObjectReadOperation op;
13601 int rc;
13602 ::cls_user_get_header(op, header, &rc);
13603 bufferlist ibl;
13604 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13605 if (r < 0)
13606 return r;
13607 if (rc < 0)
13608 return rc;
13609
13610 return 0;
13611}
13612
94b18763
FG
13613int RGWRados::cls_user_reset_stats(const string& user_id)
13614{
13615 string buckets_obj_id;
13616 rgw_get_buckets_obj(user_id, buckets_obj_id);
13617 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13618
13619 rgw_rados_ref ref;
13620 int r = get_raw_obj_ref(obj, &ref);
13621 if (r < 0) {
13622 return r;
13623 }
13624
13625 librados::ObjectWriteOperation op;
13626 ::cls_user_reset_stats(op);
13627 return ref.ioctx.operate(ref.oid, &op);
13628}
13629
7c673cae
FG
13630int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13631{
13632 string buckets_obj_id;
13633 rgw_get_buckets_obj(user_id, buckets_obj_id);
13634 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13635
13636 rgw_rados_ref ref;
224ce89b 13637 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13638 if (r < 0) {
13639 return r;
13640 }
13641
13642 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13643 if (r < 0)
13644 return r;
13645
13646 return 0;
13647}
13648
13649int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13650{
13651 map<string, struct rgw_bucket_dir_header> headers;
13652 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13653 if (r < 0) {
13654 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13655 return r;
13656 }
13657
13658 cls_user_bucket_entry entry;
13659
13660 bucket_info.bucket.convert(&entry.bucket);
13661
c07f9fc5
FG
13662 for (const auto& hiter : headers) {
13663 for (const auto& iter : hiter.second.stats) {
13664 const struct rgw_bucket_category_stats& header_stats = iter.second;
7c673cae
FG
13665 entry.size += header_stats.total_size;
13666 entry.size_rounded += header_stats.total_size_rounded;
13667 entry.count += header_stats.num_entries;
13668 }
13669 }
13670
13671 list<cls_user_bucket_entry> entries;
13672 entries.push_back(entry);
13673
13674 r = cls_user_update_buckets(user_obj, entries, false);
13675 if (r < 0) {
13676 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13677 return r;
13678 }
13679
13680 return 0;
13681}
13682
c07f9fc5
FG
13683int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13684{
13685 map<string, struct rgw_bucket_dir_header> headers;
13686 RGWBucketInfo bucket_info;
13687 RGWObjectCtx obj_ctx(this);
13688 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13689 if (ret < 0) {
13690 return ret;
13691 }
13692
13693 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13694 if (ret < 0) {
13695 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13696 return ret;
13697 }
13698
13699 bucket.convert(&entry.bucket);
13700
13701 for (const auto& hiter : headers) {
13702 for (const auto& iter : hiter.second.stats) {
13703 const struct rgw_bucket_category_stats& header_stats = iter.second;
13704 entry.size += header_stats.total_size;
13705 entry.size_rounded += header_stats.total_size_rounded;
13706 entry.count += header_stats.num_entries;
13707 }
13708 }
13709
13710 return 0;
13711}
13712
7c673cae
FG
13713int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13714 const string& in_marker,
13715 const string& end_marker,
13716 const int max_entries,
13717 list<cls_user_bucket_entry>& entries,
13718 string * const out_marker,
13719 bool * const truncated)
13720{
13721 rgw_rados_ref ref;
224ce89b 13722 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13723 if (r < 0) {
13724 return r;
13725 }
13726
13727 librados::ObjectReadOperation op;
13728 int rc;
13729
13730 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13731 bufferlist ibl;
13732 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13733 if (r < 0)
13734 return r;
13735 if (rc < 0)
13736 return rc;
13737
13738 return 0;
13739}
13740
13741int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13742{
13743 rgw_rados_ref ref;
224ce89b 13744 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13745 if (r < 0) {
13746 return r;
13747 }
13748
13749 librados::ObjectWriteOperation op;
13750 cls_user_set_buckets(op, entries, add);
13751 r = ref.ioctx.operate(ref.oid, &op);
13752 if (r < 0)
13753 return r;
13754
13755 return 0;
13756}
13757
13758int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13759{
13760 string buckets_obj_id;
13761 rgw_get_buckets_obj(user_id, buckets_obj_id);
13762 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13763 return cls_user_complete_stats_sync(obj);
13764}
13765
13766int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13767{
13768 rgw_rados_ref ref;
224ce89b 13769 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
13770 if (r < 0) {
13771 return r;
13772 }
13773
13774 librados::ObjectWriteOperation op;
13775 ::cls_user_complete_stats_sync(op);
13776 r = ref.ioctx.operate(ref.oid, &op);
13777 if (r < 0)
13778 return r;
13779
13780 return 0;
13781}
13782
13783int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13784{
13785 list<cls_user_bucket_entry> l;
13786 l.push_back(entry);
13787
13788 return cls_user_update_buckets(obj, l, true);
13789}
13790
13791int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13792{
7c673cae 13793 rgw_rados_ref ref;
224ce89b 13794 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
13795 if (r < 0) {
13796 return r;
13797 }
13798
13799 librados::ObjectWriteOperation op;
13800 ::cls_user_remove_bucket(op, bucket);
13801 r = ref.ioctx.operate(ref.oid, &op);
13802 if (r < 0)
13803 return r;
13804
13805 return 0;
13806}
13807
224ce89b 13808int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
31f18b77
FG
13809 RGWQuotaInfo& bucket_quota)
13810{
13811 if (!cct->_conf->rgw_dynamic_resharding) {
13812 return 0;
13813 }
13814
13815 bool need_resharding = false;
13816 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13817 uint32_t suggested_num_shards;
13818
13819 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13820 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13821 1, need_resharding, &suggested_num_shards);
13822 if (ret < 0) {
13823 return ret;
13824 }
13825
13826 if (need_resharding) {
224ce89b
WB
13827 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13828 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13829 dendl;
31f18b77
FG
13830 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13831 }
13832
13833 return ret;
13834}
13835
13836int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13837{
13838 RGWReshard reshard(this);
13839
13840 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13841
13842 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13843 if (new_num_shards <= num_source_shards) {
13844 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13845 return 0;
13846 }
13847
13848 cls_rgw_reshard_entry entry;
13849 entry.time = real_clock::now();
13850 entry.tenant = bucket_info.owner.tenant;
13851 entry.bucket_name = bucket_info.bucket.name;
13852 entry.bucket_id = bucket_info.bucket.bucket_id;
13853 entry.old_num_shards = num_source_shards;
13854 entry.new_num_shards = new_num_shards;
13855
13856 return reshard.add(entry);
13857}
13858
7c673cae
FG
13859int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13860 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13861{
13862 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13863}
13864
13865void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
1adf2230
AA
13866 uint32_t num_shards,
13867 map<int, string>& bucket_objects,
13868 int shard_id) {
7c673cae
FG
13869 if (!num_shards) {
13870 bucket_objects[0] = bucket_oid_base;
13871 } else {
13872 char buf[bucket_oid_base.size() + 32];
13873 if (shard_id < 0) {
13874 for (uint32_t i = 0; i < num_shards; ++i) {
13875 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13876 bucket_objects[i] = buf;
13877 }
13878 } else {
13879 if ((uint32_t)shard_id > num_shards) {
13880 return;
13881 }
13882 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13883 bucket_objects[shard_id] = buf;
13884 }
13885 }
13886}
13887
13888void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13889{
13890 const rgw_bucket& bucket = bucket_info.bucket;
13891 string plain_id = bucket.name + ":" + bucket.bucket_id;
13892 if (!bucket_info.num_shards) {
13893 (*result)[0] = plain_id;
13894 } else {
13895 char buf[16];
13896 if (shard_id < 0) {
13897 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13898 snprintf(buf, sizeof(buf), ":%d", i);
13899 (*result)[i] = plain_id + buf;
13900 }
13901 } else {
13902 if ((uint32_t)shard_id > bucket_info.num_shards) {
13903 return;
13904 }
13905 snprintf(buf, sizeof(buf), ":%d", shard_id);
13906 (*result)[shard_id] = plain_id + buf;
13907 }
13908 }
13909}
13910
13911int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13912 int *shard_id)
13913{
13914 int r = 0;
13915 switch (bucket_info.bucket_index_shard_hash_type) {
13916 case RGWBucketInfo::MOD:
13917 if (!bucket_info.num_shards) {
13918 if (shard_id) {
13919 *shard_id = -1;
13920 }
13921 } else {
1adf2230 13922 uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
7c673cae
FG
13923 if (shard_id) {
13924 *shard_id = (int)sid;
13925 }
13926 }
13927 break;
13928 default:
13929 r = -ENOTSUP;
13930 }
13931 return r;
13932}
13933
13934void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13935 int shard_id, string *bucket_obj)
13936{
13937 if (!num_shards) {
13938 // By default with no sharding, we use the bucket oid as itself
13939 (*bucket_obj) = bucket_oid_base;
13940 } else {
13941 char buf[bucket_oid_base.size() + 32];
13942 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13943 (*bucket_obj) = buf;
13944 }
13945}
13946
13947int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13948 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13949{
13950 int r = 0;
13951 switch (hash_type) {
13952 case RGWBucketInfo::MOD:
13953 if (!num_shards) {
13954 // By default with no sharding, we use the bucket oid as itself
13955 (*bucket_obj) = bucket_oid_base;
13956 if (shard_id) {
13957 *shard_id = -1;
13958 }
13959 } else {
1adf2230 13960 uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
7c673cae
FG
13961 char buf[bucket_oid_base.size() + 32];
13962 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13963 (*bucket_obj) = buf;
13964 if (shard_id) {
13965 *shard_id = (int)sid;
13966 }
13967 }
13968 break;
13969 default:
13970 r = -ENOTSUP;
13971 }
13972 return r;
13973}
13974
13975void RGWStateLog::oid_str(int shard, string& oid) {
13976 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13977 char buf[16];
13978 snprintf(buf, sizeof(buf), "%d", shard);
13979 oid += buf;
13980}
13981
13982int RGWStateLog::get_shard_num(const string& object) {
13983 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13984 return val % num_shards;
13985}
13986
13987string RGWStateLog::get_oid(const string& object) {
13988 int shard = get_shard_num(object);
13989 string oid;
13990 oid_str(shard, oid);
13991 return oid;
13992}
13993
13994int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13995 rgw_pool pool;
13996 store->get_log_pool(pool);
13997 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13998 if (r < 0) {
13999 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
14000 return r;
14001 }
14002 return 0;
14003}
14004
14005int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
14006 uint32_t state, bufferlist *bl, uint32_t *check_state)
14007{
14008 if (client_id.empty() ||
14009 op_id.empty() ||
14010 object.empty()) {
14011 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
14012 }
14013
14014 librados::IoCtx ioctx;
14015 int r = open_ioctx(ioctx);
14016 if (r < 0)
14017 return r;
14018
14019 string oid = get_oid(object);
14020
14021 librados::ObjectWriteOperation op;
14022 if (check_state) {
14023 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
14024 }
14025 utime_t ts = ceph_clock_now();
14026 bufferlist nobl;
14027 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
14028 r = ioctx.operate(oid, &op);
14029 if (r < 0) {
14030 return r;
14031 }
14032
14033 return 0;
14034}
14035
14036int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
14037{
14038 if (client_id.empty() ||
14039 op_id.empty() ||
14040 object.empty()) {
14041 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
14042 }
14043
14044 librados::IoCtx ioctx;
14045 int r = open_ioctx(ioctx);
14046 if (r < 0)
14047 return r;
14048
14049 string oid = get_oid(object);
14050
14051 librados::ObjectWriteOperation op;
14052 cls_statelog_remove_by_object(op, object, op_id);
14053 r = ioctx.operate(oid, &op);
14054 if (r < 0) {
14055 return r;
14056 }
14057
14058 return 0;
14059}
14060
14061void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
14062 void **handle)
14063{
14064 list_state *state = new list_state;
14065 state->client_id = client_id;
14066 state->op_id = op_id;
14067 state->object = object;
14068 if (object.empty()) {
14069 state->cur_shard = 0;
14070 state->max_shard = num_shards - 1;
14071 } else {
14072 state->cur_shard = state->max_shard = get_shard_num(object);
14073 }
14074 *handle = (void *)state;
14075}
14076
14077int RGWStateLog::list_entries(void *handle, int max_entries,
14078 list<cls_statelog_entry>& entries,
14079 bool *done)
14080{
14081 list_state *state = static_cast<list_state *>(handle);
14082
14083 librados::IoCtx ioctx;
14084 int r = open_ioctx(ioctx);
14085 if (r < 0)
14086 return r;
14087
14088 entries.clear();
14089
14090 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
14091 string oid;
14092 oid_str(state->cur_shard, oid);
14093
14094 librados::ObjectReadOperation op;
14095 list<cls_statelog_entry> ents;
14096 bool truncated;
14097 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
14098 max_entries, ents, &state->marker, &truncated);
14099 bufferlist ibl;
14100 r = ioctx.operate(oid, &op, &ibl);
14101 if (r == -ENOENT) {
14102 truncated = false;
14103 r = 0;
14104 }
14105 if (r < 0) {
14106 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
14107 return r;
14108 }
14109
14110 if (!truncated) {
14111 state->marker.clear();
14112 }
14113
14114 max_entries -= ents.size();
14115
14116 entries.splice(entries.end(), ents);
14117
14118 if (truncated)
14119 break;
14120 }
14121
14122 *done = (state->cur_shard > state->max_shard);
14123
14124 return 0;
14125}
14126
14127void RGWStateLog::finish_list_entries(void *handle)
14128{
14129 list_state *state = static_cast<list_state *>(handle);
14130 delete state;
14131}
14132
14133void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
14134{
14135 f->open_object_section("statelog_entry");
14136 f->dump_string("client_id", entry.client_id);
14137 f->dump_string("op_id", entry.op_id);
14138 f->dump_string("object", entry.object);
14139 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
14140 if (!dump_entry_internal(entry, f)) {
14141 f->dump_int("state", entry.state);
14142 }
14143 f->close_section();
14144}
14145
14146RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
14147{
14148}
14149
14150bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
14151{
14152 string s;
14153 switch ((OpState)entry.state) {
14154 case OPSTATE_UNKNOWN:
14155 s = "unknown";
14156 break;
14157 case OPSTATE_IN_PROGRESS:
14158 s = "in-progress";
14159 break;
14160 case OPSTATE_COMPLETE:
14161 s = "complete";
14162 break;
14163 case OPSTATE_ERROR:
14164 s = "error";
14165 break;
14166 case OPSTATE_ABORT:
14167 s = "abort";
14168 break;
14169 case OPSTATE_CANCELLED:
14170 s = "cancelled";
14171 break;
14172 default:
14173 s = "invalid";
14174 }
14175 f->dump_string("state", s);
14176 return true;
14177}
14178
14179int RGWOpState::state_from_str(const string& s, OpState *state)
14180{
14181 if (s == "unknown") {
14182 *state = OPSTATE_UNKNOWN;
14183 } else if (s == "in-progress") {
14184 *state = OPSTATE_IN_PROGRESS;
14185 } else if (s == "complete") {
14186 *state = OPSTATE_COMPLETE;
14187 } else if (s == "error") {
14188 *state = OPSTATE_ERROR;
14189 } else if (s == "abort") {
14190 *state = OPSTATE_ABORT;
14191 } else if (s == "cancelled") {
14192 *state = OPSTATE_CANCELLED;
14193 } else {
14194 return -EINVAL;
14195 }
14196
14197 return 0;
14198}
14199
14200int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
14201{
14202 uint32_t s = (uint32_t)state;
14203 return store_entry(client_id, op_id, object, s, NULL, NULL);
14204}
14205
14206int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
14207{
14208 uint32_t s = (uint32_t)state;
14209 return store_entry(client_id, op_id, object, s, NULL, &s);
14210}
14211
14212RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
14213 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
14214{
14215 cct = store->ctx();
14216 cur_state = RGWOpState::OPSTATE_UNKNOWN;
14217}
14218
14219int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
14220 last_update = real_clock::now();
14221 cur_state = state;
14222 return os.set_state(client_id, op_id, object, state);
14223}
14224
14225int RGWOpStateSingleOp::renew_state() {
14226 real_time now = real_clock::now();
14227
14228 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
14229
14230 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
14231 return 0;
14232 }
14233
14234 last_update = now;
14235 return os.renew_state(client_id, op_id, object, cur_state);
14236}
14237
14238
14239uint64_t RGWRados::instance_id()
14240{
14241 return get_rados_handle()->get_instance_id();
14242}
14243
14244uint64_t RGWRados::next_bucket_id()
14245{
14246 Mutex::Locker l(bucket_id_lock);
14247 return ++max_bucket_id;
14248}
14249
28e407b8
AA
14250RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
14251 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
7c673cae 14252{
7c673cae
FG
14253 RGWRados *store = NULL;
14254 if (!use_cache) {
14255 store = new RGWRados;
14256 } else {
28e407b8 14257 store = new RGWCache<RGWRados>;
7c673cae
FG
14258 }
14259
31f18b77 14260 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
7c673cae
FG
14261 delete store;
14262 return NULL;
14263 }
14264
14265 return store;
14266}
14267
14268RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
14269{
14270 RGWRados *store = NULL;
14271 store = new RGWRados;
14272
14273 store->set_context(cct);
14274
14275 if (store->init_rados() < 0) {
14276 delete store;
14277 return NULL;
14278 }
14279
14280 return store;
14281}
14282
14283void RGWStoreManager::close_storage(RGWRados *store)
14284{
14285 if (!store)
14286 return;
14287
14288 store->finalize();
14289
14290 delete store;
14291}
14292
14293librados::Rados* RGWRados::get_rados_handle()
14294{
14295 if (rados.size() == 1) {
14296 return &rados[0];
14297 } else {
14298 handle_lock.get_read();
14299 pthread_t id = pthread_self();
14300 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
14301
14302 if (it != rados_map.end()) {
14303 handle_lock.put_read();
14304 return &rados[it->second];
14305 } else {
14306 handle_lock.put_read();
14307 handle_lock.get_write();
14308 const uint32_t handle = next_rados_handle;
14309 rados_map[id] = handle;
14310 if (++next_rados_handle == rados.size()) {
14311 next_rados_handle = 0;
14312 }
14313 handle_lock.put_write();
14314 return &rados[handle];
14315 }
14316 }
14317}
14318
14319int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
14320{
14321 rgw_rados_ref ref;
14322 int ret = get_raw_obj_ref(obj, &ref);
14323 if (ret < 0) {
14324 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14325 return ret;
14326 }
14327
14328 ObjectWriteOperation op;
14329 list<string> prefixes;
14330 cls_rgw_remove_obj(op, prefixes);
14331
14332 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14333 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14334 if (ret < 0) {
14335 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14336 c->release();
14337 return ret;
14338 }
14339
14340 handles.push_back(c);
14341
14342 return 0;
14343}
14344
14345int RGWRados::delete_obj_aio(const rgw_obj& obj,
14346 RGWBucketInfo& bucket_info, RGWObjState *astate,
14347 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
14348{
14349 rgw_rados_ref ref;
14350 int ret = get_obj_head_ref(bucket_info, obj, &ref);
14351 if (ret < 0) {
14352 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14353 return ret;
14354 }
14355
14356 if (keep_index_consistent) {
14357 RGWRados::Bucket bop(this, bucket_info);
14358 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
14359
14360 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
14361 if (ret < 0) {
14362 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
14363 return ret;
14364 }
14365 }
14366
14367 ObjectWriteOperation op;
14368 list<string> prefixes;
14369 cls_rgw_remove_obj(op, prefixes);
14370
14371 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14372 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14373 if (ret < 0) {
14374 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14375 c->release();
14376 return ret;
14377 }
14378
14379 handles.push_back(c);
14380
14381 if (keep_index_consistent) {
14382 ret = delete_obj_index(obj);
14383 if (ret < 0) {
14384 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
14385 return ret;
14386 }
14387 }
14388 return ret;
14389}
14390
14391int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
14392 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
14393 if (value != attrs.end()) {
14394 bufferlist::iterator bliter = value->second.begin();
14395 try {
14396 ::decode(cs_info, bliter);
14397 } catch (buffer::error& err) {
14398 return -EIO;
14399 }
14400 if (cs_info.blocks.size() == 0) {
14401 return -EIO;
14402 }
14403 if (cs_info.compression_type != "none")
14404 need_decompress = true;
14405 else
14406 need_decompress = false;
14407 return 0;
14408 } else {
14409 need_decompress = false;
14410 return 0;
14411 }
14412}
14413
3a9019d9
FG
14414bool RGWRados::call(std::string command, cmdmap_t& cmdmap, std::string format,
14415 bufferlist& out)
14416{
14417 if (command == "cache list") {
14418 boost::optional<std::string> filter;
14419 auto i = cmdmap.find("filter");
14420 if (i != cmdmap.cend()) {
14421 filter = boost::get<std::string>(i->second);
14422 }
14423 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
14424 if (f) {
14425 f->open_array_section("cache_entries");
14426 call_list(filter, f.get());
14427 f->close_section();
14428 f->flush(out);
14429 return true;
14430 } else {
14431 out.append("Unable to create Formatter.\n");
14432 return false;
14433 }
14434 } else if (command == "cache inspect") {
14435 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
14436 if (f) {
14437 const auto& target = boost::get<std::string>(cmdmap["target"]);
14438 if (call_inspect(target, f.get())) {
14439 f->flush(out);
14440 return true;
14441 } else {
14442 out.append(string("Unable to find entry ") + target + string(".\n"));
14443 return false;
14444 }
14445 } else {
14446 out.append("Unable to create Formatter.\n");
14447 return false;
14448 }
14449 } else if (command == "cache erase") {
14450 const auto& target = boost::get<std::string>(cmdmap["target"]);
14451 if (call_erase(target)) {
14452 return true;
14453 } else {
14454 out.append(string("Unable to find entry ") + target + string(".\n"));
14455 return false;
14456 }
14457 } else if (command == "cache zap") {
14458 call_zap();
14459 return true;
14460 }
14461 return false;
14462}
14463
14464void RGWRados::call_list(const boost::optional<std::string>&,
14465 ceph::Formatter*)
14466{
14467 return;
14468}
14469
14470bool RGWRados::call_inspect(const std::string&, Formatter*)
14471{
14472 return false;
14473}
14474
14475bool RGWRados::call_erase(const std::string&) {
14476 return false;
14477}
14478
14479void RGWRados::call_zap() {
14480 return;
14481}