]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
bump version to 12.0.3-pve3
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
4#include <errno.h>
5#include <stdlib.h>
6#include <sys/types.h>
7#include <boost/algorithm/string.hpp>
8
9#include <boost/format.hpp>
10#include <boost/optional.hpp>
11#include <boost/utility/in_place_factory.hpp>
12
13#include "common/ceph_json.h"
14#include "common/utf8.h"
15
16#include "common/errno.h"
17#include "common/Formatter.h"
18#include "common/Throttle.h"
19#include "common/Finisher.h"
20
21#include "rgw_rados.h"
22#include "rgw_cache.h"
23#include "rgw_acl.h"
24#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
25#include "rgw_metadata.h"
26#include "rgw_bucket.h"
27#include "rgw_rest_conn.h"
28#include "rgw_cr_rados.h"
29#include "rgw_cr_rest.h"
30
31#include "cls/rgw/cls_rgw_ops.h"
32#include "cls/rgw/cls_rgw_types.h"
33#include "cls/rgw/cls_rgw_client.h"
34#include "cls/rgw/cls_rgw_const.h"
35#include "cls/refcount/cls_refcount_client.h"
36#include "cls/version/cls_version_client.h"
37#include "cls/log/cls_log_client.h"
38#include "cls/statelog/cls_statelog_client.h"
39#include "cls/timeindex/cls_timeindex_client.h"
40#include "cls/lock/cls_lock_client.h"
41#include "cls/user/cls_user_client.h"
42
43#include "rgw_tools.h"
44#include "rgw_coroutine.h"
45#include "rgw_compression.h"
46
47#include "rgw_boost_asio_yield.h"
48#undef fork // fails to compile RGWPeriod::fork() below
49
50#include "common/Clock.h"
51
52#include "include/rados/librados.hpp"
53using namespace librados;
54
55#include <string>
56#include <iostream>
57#include <vector>
58#include <atomic>
59#include <list>
60#include <map>
61#include "auth/Crypto.h" // get_random_bytes()
62
63#include "rgw_log.h"
64
65#include "rgw_gc.h"
66#include "rgw_lc.h"
67
68#include "rgw_object_expirer_core.h"
69#include "rgw_sync.h"
70#include "rgw_data_sync.h"
71#include "rgw_realm_watcher.h"
72
73#include "compressor/Compressor.h"
74
75#include <atomic>
76
77#define dout_context g_ceph_context
78#define dout_subsys ceph_subsys_rgw
79
80using namespace std;
81
82static string notify_oid_prefix = "notify";
83static string *notify_oids = NULL;
84static string shadow_ns = "shadow";
85static string dir_oid_prefix = ".dir.";
86static string default_storage_pool_suffix = "rgw.buckets.data";
87static string default_bucket_index_pool_suffix = "rgw.buckets.index";
88static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
89static string avail_pools = ".pools.avail";
90
91static string zone_info_oid_prefix = "zone_info.";
92static string zone_names_oid_prefix = "zone_names.";
93static string region_info_oid_prefix = "region_info.";
94static string zone_group_info_oid_prefix = "zonegroup_info.";
95static string realm_names_oid_prefix = "realms_names.";
96static string realm_info_oid_prefix = "realms.";
97static string default_region_info_oid = "default.region";
98static string default_zone_group_info_oid = "default.zonegroup";
99static string period_info_oid_prefix = "periods.";
100static string period_latest_epoch_info_oid = ".latest_epoch";
101static string region_map_oid = "region_map";
102static string zonegroup_map_oid = "zonegroup_map";
103static string log_lock_name = "rgw_log_lock";
104static string default_realm_info_oid = "default.realm";
105const string default_zonegroup_name = "default";
106const string default_zone_name = "default";
107static string zonegroup_names_oid_prefix = "zonegroups_names.";
108static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
109#define RGW_USAGE_OBJ_PREFIX "usage."
110#define FIRST_EPOCH 1
111static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
112static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
113static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
114static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116#define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118#define dout_subsys ceph_subsys_rgw
119
120
121static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123{
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
133 *pool = placement.data_extra_pool;
134 }
135 }
136
137 return true;
138}
139
140static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142{
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146}
147
148rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149{
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156}
157
158rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159{
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166}
167
168int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169{
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
173 if (r < 0 && r != -EEXIST) {
174 return r;
175 }
176
177 r = rados->ioctx_create(pool.name.c_str(), ioctx);
178 }
179 if (r < 0) {
180 return r;
181 }
182 if (!pool.ns.empty()) {
183 ioctx.set_namespace(pool.ns);
184 }
185 return 0;
186}
187
188template<>
189void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
190 RWLock::WLocker wl(lock);
191 auto iter = objs_state.find(obj);
192 if (iter == objs_state.end()) {
193 return;
194 }
195 bool is_atomic = iter->second.is_atomic;
196 bool prefetch_data = iter->second.prefetch_data;
197
198 objs_state.erase(iter);
199
200 if (is_atomic || prefetch_data) {
201 auto& s = objs_state[obj];
202 s.is_atomic = is_atomic;
203 s.prefetch_data = prefetch_data;
204 }
205}
206
207template<>
208void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
209 RWLock::WLocker wl(lock);
210 auto iter = objs_state.find(obj);
211 if (iter == objs_state.end()) {
212 return;
213 }
214
215 objs_state.erase(iter);
216}
217
218void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
219 encode_json("default_zonegroup", default_zonegroup, f);
220}
221
222void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
223
224 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
225 /* backward compatability with region */
226 if (default_zonegroup.empty()) {
227 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
228 }
229}
230
231rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
232{
233 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
234 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
235 }
236
237 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
238}
239
240int RGWZoneGroup::create_default(bool old_format)
241{
242 name = default_zonegroup_name;
243 is_master = true;
244
245 RGWZoneGroupPlacementTarget placement_target;
246 placement_target.name = "default-placement";
247 placement_targets[placement_target.name] = placement_target;
248 default_placement = "default-placement";
249
250 RGWZoneParams zone_params(default_zone_name);
251
252 int r = zone_params.init(cct, store, false);
253 if (r < 0) {
254 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
255 return r;
256 }
257
258 r = zone_params.create_default();
259 if (r < 0 && r != -EEXIST) {
260 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
261 return r;
262 } else if (r == -EEXIST) {
263 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
264 zone_params.clear_id();
265 r = zone_params.init(cct, store);
266 if (r < 0) {
267 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
268 return r;
269 }
270 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
271 << dendl;
272 }
273
274 RGWZone& default_zone = zones[zone_params.get_id()];
275 default_zone.name = zone_params.get_name();
276 default_zone.id = zone_params.get_id();
277 master_zone = default_zone.id;
278
279 r = create();
280 if (r < 0 && r != -EEXIST) {
281 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
282 return r;
283 }
284
285 if (r == -EEXIST) {
286 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
287 id.clear();
288 r = init(cct, store);
289 if (r < 0) {
290 return r;
291 }
292 }
293
294 if (old_format) {
295 name = id;
296 }
297
298 post_process_params();
299
300 return 0;
301}
302
303const string RGWZoneGroup::get_default_oid(bool old_region_format)
304{
305 if (old_region_format) {
306 if (cct->_conf->rgw_default_region_info_oid.empty()) {
307 return default_region_info_oid;
308 }
309 return cct->_conf->rgw_default_region_info_oid;
310 }
311
312 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
313
314 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
315 default_oid = default_zone_group_info_oid;
316 }
317
318 default_oid += "." + realm_id;
319
320 return default_oid;
321}
322
323const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
324{
325 if (old_region_format) {
326 return region_info_oid_prefix;
327 }
328 return zone_group_info_oid_prefix;
329}
330
331const string& RGWZoneGroup::get_names_oid_prefix()
332{
333 return zonegroup_names_oid_prefix;
334}
335
336const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
337 return cct->_conf->rgw_zonegroup;
338}
339
340int RGWZoneGroup::equals(const string& other_zonegroup) const
341{
342 if (is_master && other_zonegroup.empty())
343 return true;
344
345 return (id == other_zonegroup);
346}
347
348int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
349 const list<string>& endpoints, const string *ptier_type,
350 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
351{
352 auto& zone_id = zone_params.get_id();
353 auto& zone_name = zone_params.get_name();
354
355 // check for duplicate zone name on insert
356 if (!zones.count(zone_id)) {
357 for (const auto& zone : zones) {
358 if (zone.second.name == zone_name) {
359 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
360 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
361 return -EEXIST;
362 }
363 }
364 }
365
366 if (is_master) {
367 if (*is_master) {
368 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
369 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
370 }
371 master_zone = zone_params.get_id();
372 } else if (master_zone == zone_params.get_id()) {
373 master_zone.clear();
374 }
375 }
376
377 RGWZone& zone = zones[zone_params.get_id()];
378 zone.name = zone_params.get_name();
379 zone.id = zone_params.get_id();
380 if (!endpoints.empty()) {
381 zone.endpoints = endpoints;
382 }
383 if (read_only) {
384 zone.read_only = *read_only;
385 }
386 if (ptier_type) {
387 zone.tier_type = *ptier_type;
388 }
389
390 if (psync_from_all) {
391 zone.sync_from_all = *psync_from_all;
392 }
393
394 for (auto add : sync_from) {
395 zone.sync_from.insert(add);
396 }
397
398 for (auto rm : sync_from_rm) {
399 zone.sync_from.erase(rm);
400 }
401
402 post_process_params();
403
404 return update();
405}
406
407
408int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
409{
410 RGWZone& zone = zones[zone_params.get_id()];
411 zone.name = zone_params.get_name();
412
413 return update();
414}
415
416void RGWZoneGroup::post_process_params()
417{
418 bool log_data = zones.size() > 1;
419
420 if (master_zone.empty()) {
421 map<string, RGWZone>::iterator iter = zones.begin();
422 if (iter != zones.end()) {
423 master_zone = iter->first;
424 }
425 }
426
427 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
428 RGWZone& zone = iter->second;
429 zone.log_data = log_data;
430 zone.log_meta = (is_master && zone.id == master_zone);
431
432 RGWZoneParams zone_params(zone.id, zone.name);
433 int ret = zone_params.init(cct, store);
434 if (ret < 0) {
435 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
436 continue;
437 }
438
439 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
440 iter != zone_params.placement_pools.end(); ++iter) {
441 const string& placement_name = iter->first;
442 if (placement_targets.find(placement_name) == placement_targets.end()) {
443 RGWZoneGroupPlacementTarget placement_target;
444 placement_target.name = placement_name;
445 placement_targets[placement_name] = placement_target;
446 }
447 }
448 }
449
450 if (default_placement.empty() && !placement_targets.empty()) {
451 default_placement = placement_targets.begin()->first;
452 }
453}
454
455int RGWZoneGroup::remove_zone(const std::string& zone_id)
456{
457 map<string, RGWZone>::iterator iter = zones.find(zone_id);
458 if (iter == zones.end()) {
459 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
460 << name << dendl;
461 return -ENOENT;
462 }
463
464 zones.erase(iter);
465
466 post_process_params();
467
468 return update();
469}
470
471int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
472{
473 if (realm_id.empty()) {
474 /* try using default realm */
475 RGWRealm realm;
476 int ret = realm.init(cct, store);
477 if (ret < 0) {
478 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
479 return -ENOENT;
480 }
481 realm_id = realm.get_id();
482 }
483
484 return RGWSystemMetaObj::read_default_id(default_id, old_format);
485}
486
487int RGWZoneGroup::set_as_default(bool exclusive)
488{
489 if (realm_id.empty()) {
490 /* try using default realm */
491 RGWRealm realm;
492 int ret = realm.init(cct, store);
493 if (ret < 0) {
494 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
495 return -EINVAL;
496 }
497 realm_id = realm.get_id();
498 }
499
500 return RGWSystemMetaObj::set_as_default(exclusive);
501}
502
503int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
504{
505 cct = _cct;
506 store = _store;
507
508 if (!setup_obj)
509 return 0;
510
511 if (old_format && id.empty()) {
512 id = name;
513 }
514
515 if (id.empty()) {
516 int r;
517 if (name.empty()) {
518 name = get_predefined_name(cct);
519 }
520 if (name.empty()) {
521 r = use_default(old_format);
522 if (r < 0) {
523 return r;
524 }
525 } else if (!old_format) {
526 r = read_id(name, id);
527 if (r < 0) {
528 if (r != -ENOENT) {
529 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
530 }
531 return r;
532 }
533 }
534 }
535
536 return read_info(id, old_format);
537}
538
539int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
540{
541 auto pool = get_pool(cct);
542 bufferlist bl;
543 RGWObjectCtx obj_ctx(store);
544 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
545 if (ret < 0)
546 return ret;
547
548 try {
549 bufferlist::iterator iter = bl.begin();
550 ::decode(default_info, iter);
551 } catch (buffer::error& err) {
552 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
553 return -EIO;
554 }
555
556 return 0;
557}
558
559int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
560{
561 RGWDefaultSystemMetaObjInfo default_info;
562
563 int ret = read_default(default_info, get_default_oid(old_format));
564 if (ret < 0) {
565 return ret;
566 }
567
568 default_id = default_info.default_id;
569
570 return 0;
571}
572
573int RGWSystemMetaObj::use_default(bool old_format)
574{
575 return read_default_id(id, old_format);
576}
577
578int RGWSystemMetaObj::set_as_default(bool exclusive)
579{
580 string oid = get_default_oid();
581
582 rgw_pool pool(get_pool(cct));
583 bufferlist bl;
584
585 RGWDefaultSystemMetaObjInfo default_info;
586 default_info.default_id = id;
587
588 ::encode(default_info, bl);
589
590 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
591 exclusive, NULL, real_time(), NULL);
592 if (ret < 0)
593 return ret;
594
595 return 0;
596}
597
598int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
599{
600 rgw_pool pool(get_pool(cct));
601 bufferlist bl;
602
603 string oid = get_names_oid_prefix() + obj_name;
604
605 RGWObjectCtx obj_ctx(store);
606 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
607 if (ret < 0) {
608 return ret;
609 }
610
611 RGWNameToId nameToId;
612 try {
613 bufferlist::iterator iter = bl.begin();
614 ::decode(nameToId, iter);
615 } catch (buffer::error& err) {
616 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
617 return -EIO;
618 }
619 object_id = nameToId.obj_id;
620 return 0;
621}
622
623int RGWSystemMetaObj::delete_obj(bool old_format)
624{
625 rgw_pool pool(get_pool(cct));
626
627 /* check to see if obj is the default */
628 RGWDefaultSystemMetaObjInfo default_info;
629 int ret = read_default(default_info, get_default_oid(old_format));
630 if (ret < 0 && ret != -ENOENT)
631 return ret;
632 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
633 string oid = get_default_oid(old_format);
634 rgw_raw_obj default_named_obj(pool, oid);
635 ret = store->delete_system_obj(default_named_obj);
636 if (ret < 0) {
637 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
638 return ret;
639 }
640 }
641 if (!old_format) {
642 string oid = get_names_oid_prefix() + name;
643 rgw_raw_obj object_name(pool, oid);
644 ret = store->delete_system_obj(object_name);
645 if (ret < 0) {
646 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
647 return ret;
648 }
649 }
650
651 string oid = get_info_oid_prefix(old_format);
652 if (old_format) {
653 oid += name;
654 } else {
655 oid += id;
656 }
657
658 rgw_raw_obj object_id(pool, oid);
659 ret = store->delete_system_obj(object_id);
660 if (ret < 0) {
661 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
662 }
663
664 return ret;
665}
666
667int RGWSystemMetaObj::store_name(bool exclusive)
668{
669 rgw_pool pool(get_pool(cct));
670 string oid = get_names_oid_prefix() + name;
671
672 RGWNameToId nameToId;
673 nameToId.obj_id = id;
674
675 bufferlist bl;
676 ::encode(nameToId, bl);
677 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
678}
679
680int RGWSystemMetaObj::rename(const string& new_name)
681{
682 string new_id;
683 int ret = read_id(new_name, new_id);
684 if (!ret) {
685 return -EEXIST;
686 }
687 if (ret < 0 && ret != -ENOENT) {
688 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
689 return ret;
690 }
691 string old_name = name;
692 name = new_name;
693 ret = update();
694 if (ret < 0) {
695 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
696 return ret;
697 }
698 ret = store_name(true);
699 if (ret < 0) {
700 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
701 return ret;
702 }
703 /* delete old name */
704 rgw_pool pool(get_pool(cct));
705 string oid = get_names_oid_prefix() + old_name;
706 rgw_raw_obj old_name_obj(pool, oid);
707 ret = store->delete_system_obj(old_name_obj);
708 if (ret < 0) {
709 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
710 return ret;
711 }
712
713 return ret;
714}
715
716int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
717{
718 rgw_pool pool(get_pool(cct));
719
720 bufferlist bl;
721
722 string oid = get_info_oid_prefix(old_format) + obj_id;
723
724 RGWObjectCtx obj_ctx(store);
725 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
726 if (ret < 0) {
727 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
728 return ret;
729 }
730
731 try {
732 bufferlist::iterator iter = bl.begin();
733 ::decode(*this, iter);
734 } catch (buffer::error& err) {
735 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
736 return -EIO;
737 }
738
739 return 0;
740}
741
742int RGWSystemMetaObj::read()
743{
744 int ret = read_id(name, id);
745 if (ret < 0) {
746 return ret;
747 }
748
749 return read_info(id);
750}
751
752int RGWSystemMetaObj::create(bool exclusive)
753{
754 int ret;
755
756 /* check to see the name is not used */
757 ret = read_id(name, id);
758 if (exclusive && ret == 0) {
759 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
760 return -EEXIST;
761 } else if ( ret < 0 && ret != -ENOENT) {
762 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
763 return ret;
764 }
765
766 if (id.empty()) {
767 /* create unique id */
768 uuid_d new_uuid;
769 char uuid_str[37];
770 new_uuid.generate_random();
771 new_uuid.print(uuid_str);
772 id = uuid_str;
773 }
774
775 ret = store_info(exclusive);
776 if (ret < 0) {
777 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
778 return ret;
779 }
780
781 return store_name(exclusive);
782}
783
784int RGWSystemMetaObj::store_info(bool exclusive)
785{
786 rgw_pool pool(get_pool(cct));
787
788 string oid = get_info_oid_prefix() + id;
789
790 bufferlist bl;
791 ::encode(*this, bl);
792 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
793}
794
795int RGWSystemMetaObj::write(bool exclusive)
796{
797 int ret = store_info(exclusive);
798 if (ret < 0) {
799 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
800 return ret;
801 }
802 ret = store_name(exclusive);
803 if (ret < 0) {
804 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
805 return ret;
806 }
807 return 0;
808}
809
810
811const string& RGWRealm::get_predefined_name(CephContext *cct) {
812 return cct->_conf->rgw_realm;
813}
814
815int RGWRealm::create(bool exclusive)
816{
817 int ret = RGWSystemMetaObj::create(exclusive);
818 if (ret < 0) {
819 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
820 return ret;
821 }
822 // create the control object for watch/notify
823 ret = create_control(exclusive);
824 if (ret < 0) {
825 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
826 return ret;
827 }
828 RGWPeriod period;
829 if (current_period.empty()) {
830 /* create new period for the realm */
831 ret = period.init(cct, store, id, name, false);
832 if (ret < 0 ) {
833 return ret;
834 }
835 ret = period.create(true);
836 if (ret < 0) {
837 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
838 return ret;
839 }
840 } else {
841 period = RGWPeriod(current_period, 0);
842 int ret = period.init(cct, store, id, name);
843 if (ret < 0) {
844 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
845 return ret;
846 }
847 }
848 ret = set_current_period(period);
849 if (ret < 0) {
850 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
851 return ret;
852 }
853 // try to set as default. may race with another create, so pass exclusive=true
854 // so we don't override an existing default
855 ret = set_as_default(true);
856 if (ret < 0 && ret != -EEXIST) {
857 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
858 }
859
860 return 0;
861}
862
863int RGWRealm::delete_obj()
864{
865 int ret = RGWSystemMetaObj::delete_obj();
866 if (ret < 0) {
867 return ret;
868 }
869 return delete_control();
870}
871
872int RGWRealm::create_control(bool exclusive)
873{
874 auto pool = rgw_pool{get_pool(cct)};
875 auto oid = get_control_oid();
876 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
877 nullptr, real_time(), nullptr);
878}
879
880int RGWRealm::delete_control()
881{
882 auto pool = rgw_pool{get_pool(cct)};
883 auto obj = rgw_raw_obj{pool, get_control_oid()};
884 return store->delete_system_obj(obj);
885}
886
887rgw_pool RGWRealm::get_pool(CephContext *cct)
888{
889 if (cct->_conf->rgw_realm_root_pool.empty()) {
890 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
891 }
892 return rgw_pool(cct->_conf->rgw_realm_root_pool);
893}
894
895const string RGWRealm::get_default_oid(bool old_format)
896{
897 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
898 return default_realm_info_oid;
899 }
900 return cct->_conf->rgw_default_realm_info_oid;
901}
902
903const string& RGWRealm::get_names_oid_prefix()
904{
905 return realm_names_oid_prefix;
906}
907
908const string& RGWRealm::get_info_oid_prefix(bool old_format)
909{
910 return realm_info_oid_prefix;
911}
912
913int RGWRealm::set_current_period(RGWPeriod& period)
914{
915 // update realm epoch to match the period's
916 if (epoch > period.get_realm_epoch()) {
917 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
918 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
919 return -EINVAL;
920 }
921 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
922 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
923 << period.get_realm_epoch() << ", but different period id "
924 << period.get_id() << " != " << current_period << dendl;
925 return -EINVAL;
926 }
927
928 epoch = period.get_realm_epoch();
929 current_period = period.get_id();
930
931 int ret = update();
932 if (ret < 0) {
933 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
934 return ret;
935 }
936
937 ret = period.reflect();
938 if (ret < 0) {
939 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
940 return ret;
941 }
942
943 return 0;
944}
945
946string RGWRealm::get_control_oid()
947{
948 return get_info_oid_prefix() + id + ".control";
949}
950
951int RGWRealm::notify_zone(bufferlist& bl)
952{
953 // open a context on the realm's pool
954 rgw_pool pool{get_pool(cct)};
955 librados::IoCtx ctx;
956 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
957 if (r < 0) {
958 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
959 return r;
960 }
961 // send a notify on the realm object
962 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
963 if (r < 0) {
964 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
965 return r;
966 }
967 return 0;
968}
969
970int RGWRealm::notify_new_period(const RGWPeriod& period)
971{
972 bufferlist bl;
973 // push the period to dependent zonegroups/zones
974 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
975 ::encode(period, bl);
976 // reload the gateway with the new period
977 ::encode(RGWRealmNotify::Reload, bl);
978
979 return notify_zone(bl);
980}
981
982std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
983{
984 if (realm_id.empty()) {
985 return "period_config.default";
986 }
987 return "period_config." + realm_id;
988}
989
990rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
991{
992 const auto& pool_name = cct->_conf->rgw_period_root_pool;
993 if (pool_name.empty()) {
994 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
995 }
996 return {pool_name};
997}
998
999int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1000{
1001 RGWObjectCtx obj_ctx(store);
1002 const auto& pool = get_pool(store->ctx());
1003 const auto& oid = get_oid(realm_id);
1004 bufferlist bl;
1005
1006 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1007 if (ret < 0) {
1008 return ret;
1009 }
1010 try {
1011 bufferlist::iterator iter = bl.begin();
1012 ::decode(*this, iter);
1013 } catch (buffer::error& err) {
1014 return -EIO;
1015 }
1016 return 0;
1017}
1018
1019int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1020{
1021 const auto& pool = get_pool(store->ctx());
1022 const auto& oid = get_oid(realm_id);
1023 bufferlist bl;
1024 ::encode(*this, bl);
1025 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1026 false, nullptr, real_time(), nullptr);
1027}
1028
1029int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1030 const string& period_realm_name, bool setup_obj)
1031{
1032 cct = _cct;
1033 store = _store;
1034 realm_id = period_realm_id;
1035 realm_name = period_realm_name;
1036
1037 if (!setup_obj)
1038 return 0;
1039
1040 return init(_cct, _store, setup_obj);
1041}
1042
1043
1044int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1045{
1046 cct = _cct;
1047 store = _store;
1048
1049 if (!setup_obj)
1050 return 0;
1051
1052 if (id.empty()) {
1053 RGWRealm realm(realm_id, realm_name);
1054 int ret = realm.init(cct, store);
1055 if (ret < 0) {
1056 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1057 cpp_strerror(-ret) << dendl;
1058 return ret;
1059 }
1060 id = realm.get_current_period();
1061 realm_id = realm.get_id();
1062 }
1063
1064 if (!epoch) {
1065 int ret = use_latest_epoch();
1066 if (ret < 0) {
1067 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1068 << " : " << cpp_strerror(-ret) << dendl;
1069 return ret;
1070 }
1071 }
1072
1073 return read_info();
1074}
1075
1076
1077int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1078 map<string, RGWZoneGroup>::const_iterator iter;
1079 if (!zonegroup_id.empty()) {
1080 iter = period_map.zonegroups.find(zonegroup_id);
1081 } else {
1082 iter = period_map.zonegroups.find("default");
1083 }
1084 if (iter != period_map.zonegroups.end()) {
1085 zonegroup = iter->second;
1086 return 0;
1087 }
1088
1089 return -ENOENT;
1090}
1091
1092bool RGWPeriod::is_single_zonegroup(CephContext *cct, RGWRados *store)
1093{
1094 return (period_map.zonegroups.size() == 1);
1095}
1096
1097const string& RGWPeriod::get_latest_epoch_oid()
1098{
1099 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1100 return period_latest_epoch_info_oid;
1101 }
1102 return cct->_conf->rgw_period_latest_epoch_info_oid;
1103}
1104
1105const string& RGWPeriod::get_info_oid_prefix()
1106{
1107 return period_info_oid_prefix;
1108}
1109
1110const string RGWPeriod::get_period_oid_prefix()
1111{
1112 return get_info_oid_prefix() + id;
1113}
1114
1115const string RGWPeriod::get_period_oid()
1116{
1117 std::ostringstream oss;
1118 oss << get_period_oid_prefix();
1119 // skip the epoch for the staging period
1120 if (id != get_staging_id(realm_id))
1121 oss << "." << epoch;
1122 return oss.str();
1123}
1124
1125int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info)
1126{
1127 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1128
1129 rgw_pool pool(get_pool(cct));
1130 bufferlist bl;
1131 RGWObjectCtx obj_ctx(store);
1132 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
1133 if (ret < 0) {
1134 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1135 return ret;
1136 }
1137 try {
1138 bufferlist::iterator iter = bl.begin();
1139 ::decode(info, iter);
1140 } catch (buffer::error& err) {
1141 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1142 return -EIO;
1143 }
1144
1145 return 0;
1146}
1147
1148int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1149{
1150 RGWPeriodLatestEpochInfo info;
1151
1152 int ret = read_latest_epoch(info);
1153 if (ret < 0) {
1154 return ret;
1155 }
1156
1157 latest_epoch = info.epoch;
1158
1159 return 0;
1160}
1161
1162int RGWPeriod::use_latest_epoch()
1163{
1164 RGWPeriodLatestEpochInfo info;
1165 int ret = read_latest_epoch(info);
1166 if (ret < 0) {
1167 return ret;
1168 }
1169
1170 epoch = info.epoch;
1171
1172 return 0;
1173}
1174
1175int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive)
1176{
1177 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1178
1179 rgw_pool pool(get_pool(cct));
1180 bufferlist bl;
1181
1182 RGWPeriodLatestEpochInfo info;
1183 info.epoch = epoch;
1184
1185 ::encode(info, bl);
1186
1187 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1188 exclusive, NULL, real_time(), NULL);
1189}
1190
1191int RGWPeriod::delete_obj()
1192{
1193 rgw_pool pool(get_pool(cct));
1194
1195 // delete the object for each period epoch
1196 for (epoch_t e = 1; e <= epoch; e++) {
1197 RGWPeriod p{get_id(), e};
1198 rgw_raw_obj oid{pool, p.get_period_oid()};
1199 int ret = store->delete_system_obj(oid);
1200 if (ret < 0) {
1201 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1202 << ": " << cpp_strerror(-ret) << dendl;
1203 }
1204 }
1205
1206 // delete the .latest_epoch object
1207 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1208 int ret = store->delete_system_obj(oid);
1209 if (ret < 0) {
1210 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1211 << ": " << cpp_strerror(-ret) << dendl;
1212 }
1213 return ret;
1214}
1215
1216int RGWPeriod::read_info()
1217{
1218 rgw_pool pool(get_pool(cct));
1219
1220 bufferlist bl;
1221
1222 RGWObjectCtx obj_ctx(store);
1223 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1224 if (ret < 0) {
1225 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1226 return ret;
1227 }
1228
1229 try {
1230 bufferlist::iterator iter = bl.begin();
1231 ::decode(*this, iter);
1232 } catch (buffer::error& err) {
1233 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1234 return -EIO;
1235 }
1236
1237 return 0;
1238}
1239
1240int RGWPeriod::create(bool exclusive)
1241{
1242 int ret;
1243
1244 /* create unique id */
1245 uuid_d new_uuid;
1246 char uuid_str[37];
1247 new_uuid.generate_random();
1248 new_uuid.print(uuid_str);
1249 id = uuid_str;
1250
1251 epoch = FIRST_EPOCH;
1252
1253 period_map.id = id;
1254
1255 ret = store_info(exclusive);
1256 if (ret < 0) {
1257 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1258 }
1259
1260 ret = set_latest_epoch(epoch);
1261 if (ret < 0) {
1262 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1263 }
1264
1265 return ret;
1266}
1267
1268int RGWPeriod::store_info(bool exclusive)
1269{
1270 epoch_t latest_epoch = FIRST_EPOCH - 1;
1271 int ret = get_latest_epoch(latest_epoch);
1272 if (ret < 0 && ret != -ENOENT) {
1273 ldout(cct, 0) << "ERROR: RGWPeriod::get_latest_epoch() returned " << cpp_strerror(-ret) << dendl;
1274 return ret;
1275 }
1276
1277 rgw_pool pool(get_pool(cct));
1278
1279 string oid = get_period_oid();
1280 bufferlist bl;
1281 ::encode(*this, bl);
1282 ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
1283 if (ret < 0) {
1284 ldout(cct, 0) << "ERROR: rgw_put_system_obj(" << pool << ":" << oid << "): " << cpp_strerror(-ret) << dendl;
1285 return ret;
1286 }
1287 if (latest_epoch < epoch) {
1288 ret = set_latest_epoch(epoch);
1289 if (ret < 0) {
1290 ldout(cct, 0) << "ERROR: RGWPeriod::set_latest_epoch() returned " << cpp_strerror(-ret) << dendl;
1291 return ret;
1292 }
1293 }
1294 return 0;
1295}
1296
1297rgw_pool RGWPeriod::get_pool(CephContext *cct)
1298{
1299 if (cct->_conf->rgw_period_root_pool.empty()) {
1300 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1301 }
1302 return rgw_pool(cct->_conf->rgw_period_root_pool);
1303}
1304
1305int RGWPeriod::use_next_epoch()
1306{
1307 epoch_t latest_epoch;
1308 int ret = get_latest_epoch(latest_epoch);
1309 if (ret < 0) {
1310 return ret;
1311 }
1312 epoch = latest_epoch + 1;
1313 ret = read_info();
1314 if (ret < 0 && ret != -ENOENT) {
1315 return ret;
1316 }
1317 if (ret == -ENOENT) {
1318 ret = create();
1319 if (ret < 0) {
1320 ldout(cct, 0) << "Error creating new epoch " << epoch << dendl;
1321 return ret;
1322 }
1323 }
1324 return 0;
1325}
1326
1327int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1328{
1329 if (zonegroup.realm_id != realm_id) {
1330 return 0;
1331 }
1332 int ret = period_map.update(zonegroup, cct);
1333 if (ret < 0) {
1334 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1335 return ret;
1336 }
1337
1338 return store_info(false);
1339}
1340
1341int RGWPeriod::update()
1342{
1343 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1344 list<string> zonegroups;
1345 int ret = store->list_zonegroups(zonegroups);
1346 if (ret < 0) {
1347 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1348 return ret;
1349 }
1350
1351 // clear zone short ids of removed zones. period_map.update() will add the
1352 // remaining zones back
1353 period_map.short_zone_ids.clear();
1354
1355 for (auto& iter : zonegroups) {
1356 RGWZoneGroup zg(string(), iter);
1357 ret = zg.init(cct, store);
1358 if (ret < 0) {
1359 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1360 continue;
1361 }
1362
1363 if (zg.realm_id != realm_id) {
1364 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1365 continue;
1366 }
1367
1368 if (zg.master_zone.empty()) {
1369 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1370 return -EINVAL;
1371 }
1372
1373 if (zg.is_master_zonegroup()) {
1374 master_zonegroup = zg.get_id();
1375 master_zone = zg.master_zone;
1376 }
1377
1378 int ret = period_map.update(zg, cct);
1379 if (ret < 0) {
1380 return ret;
1381 }
1382 }
1383
1384 ret = period_config.read(store, realm_id);
1385 if (ret < 0 && ret != -ENOENT) {
1386 ldout(cct, 0) << "ERROR: failed to read period config: "
1387 << cpp_strerror(ret) << dendl;
1388 return ret;
1389 }
1390 return 0;
1391}
1392
1393int RGWPeriod::reflect()
1394{
1395 for (auto& iter : period_map.zonegroups) {
1396 RGWZoneGroup& zg = iter.second;
1397 zg.reinit_instance(cct, store);
1398 int r = zg.write(false);
1399 if (r < 0) {
1400 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1401 return r;
1402 }
1403 if (zg.is_master_zonegroup()) {
1404 // set master as default if no default exists
1405 r = zg.set_as_default(true);
1406 if (r == 0) {
1407 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1408 << " as the default" << dendl;
1409 }
1410 }
1411 }
1412
1413 int r = period_config.write(store, realm_id);
1414 if (r < 0) {
1415 ldout(cct, 0) << "ERROR: failed to store period config: "
1416 << cpp_strerror(-r) << dendl;
1417 return r;
1418 }
1419 return 0;
1420}
1421
1422void RGWPeriod::fork()
1423{
1424 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1425 predecessor_uuid = id;
1426 id = get_staging_id(realm_id);
1427 period_map.reset();
1428 realm_epoch++;
1429}
1430
1431static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1432{
1433 // initialize a sync status manager to read the status
1434 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1435 int r = mgr.init();
1436 if (r < 0) {
1437 return r;
1438 }
1439 r = mgr.read_sync_status(sync_status);
1440 mgr.stop();
1441 return r;
1442}
1443
1444int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1445 std::ostream& error_stream,
1446 bool force_if_stale)
1447{
1448 rgw_meta_sync_status status;
1449 int r = read_sync_status(store, &status);
1450 if (r < 0) {
1451 ldout(cct, 0) << "period failed to read sync status: "
1452 << cpp_strerror(-r) << dendl;
1453 return r;
1454 }
1455
1456 std::vector<std::string> markers;
1457
1458 const auto current_epoch = current_period.get_realm_epoch();
1459 if (current_epoch != status.sync_info.realm_epoch) {
1460 // no sync status markers for the current period
1461 assert(current_epoch > status.sync_info.realm_epoch);
1462 const int behind = current_epoch - status.sync_info.realm_epoch;
1463 if (!force_if_stale && current_epoch > 1) {
1464 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1465 "the current master zone in metadata sync. If this zone is promoted "
1466 "to master, any metadata changes during that time are likely to "
1467 "be lost.\n"
1468 "Waiting for this zone to catch up on metadata sync (see "
1469 "'radosgw-admin sync status') is recommended.\n"
1470 "To promote this zone to master anyway, add the flag "
1471 "--yes-i-really-mean-it." << std::endl;
1472 return -EINVAL;
1473 }
1474 // empty sync status markers - other zones will skip this period during
1475 // incremental metadata sync
1476 markers.resize(status.sync_info.num_shards);
1477 } else {
1478 markers.reserve(status.sync_info.num_shards);
1479 for (auto& i : status.sync_markers) {
1480 auto& marker = i.second;
1481 // filter out markers from other periods
1482 if (marker.realm_epoch != current_epoch) {
1483 marker.marker.clear();
1484 }
1485 markers.emplace_back(std::move(marker.marker));
1486 }
1487 }
1488
1489 std::swap(sync_status, markers);
1490 return 0;
1491}
1492
1493int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1494 std::ostream& error_stream, bool force_if_stale)
1495{
1496 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1497 // gateway must be in the master zone to commit
1498 if (master_zone != store->get_zone_params().get_id()) {
1499 error_stream << "Cannot commit period on zone "
1500 << store->get_zone_params().get_id() << ", it must be sent to "
1501 "the period's master zone " << master_zone << '.' << std::endl;
1502 return -EINVAL;
1503 }
1504 // period predecessor must match current period
1505 if (predecessor_uuid != current_period.get_id()) {
1506 error_stream << "Period predecessor " << predecessor_uuid
1507 << " does not match current period " << current_period.get_id()
1508 << ". Use 'period pull' to get the latest period from the master, "
1509 "reapply your changes, and try again." << std::endl;
1510 return -EINVAL;
1511 }
1512 // realm epoch must be 1 greater than current period
1513 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1514 error_stream << "Period's realm epoch " << realm_epoch
1515 << " does not come directly after current realm epoch "
1516 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1517 "latest realm and period from the master zone, reapply your changes, "
1518 "and try again." << std::endl;
1519 return -EINVAL;
1520 }
1521 // did the master zone change?
1522 if (master_zone != current_period.get_master_zone()) {
1523 // store the current metadata sync status in the period
1524 int r = update_sync_status(current_period, error_stream, force_if_stale);
1525 if (r < 0) {
1526 ldout(cct, 0) << "failed to update metadata sync status: "
1527 << cpp_strerror(-r) << dendl;
1528 return r;
1529 }
1530 // create an object with a new period id
1531 r = create(true);
1532 if (r < 0) {
1533 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1534 return r;
1535 }
1536 // set as current period
1537 r = realm.set_current_period(*this);
1538 if (r < 0) {
1539 ldout(cct, 0) << "failed to update realm's current period: "
1540 << cpp_strerror(-r) << dendl;
1541 return r;
1542 }
1543 ldout(cct, 4) << "Promoted to master zone and committed new period "
1544 << id << dendl;
1545 realm.notify_new_period(*this);
1546 return 0;
1547 }
1548 // period must be based on current epoch
1549 if (epoch != current_period.get_epoch()) {
1550 error_stream << "Period epoch " << epoch << " does not match "
1551 "predecessor epoch " << current_period.get_epoch()
1552 << ". Use 'period pull' to get the latest epoch from the master zone, "
1553 "reapply your changes, and try again." << std::endl;
1554 return -EINVAL;
1555 }
1556 // set period as next epoch
1557 set_id(current_period.get_id());
1558 set_epoch(current_period.get_epoch() + 1);
1559 set_predecessor(current_period.get_predecessor());
1560 realm_epoch = current_period.get_realm_epoch();
1561 // write the period to rados
1562 int r = store_info(false);
1563 if (r < 0) {
1564 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1565 return r;
1566 }
1567 // set as latest epoch
1568 r = set_latest_epoch(epoch);
1569 if (r < 0) {
1570 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1571 return r;
1572 }
1573 r = reflect();
1574 if (r < 0) {
1575 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1576 return r;
1577 }
1578 ldout(cct, 4) << "Committed new epoch " << epoch
1579 << " for period " << id << dendl;
1580 realm.notify_new_period(*this);
1581 return 0;
1582}
1583
1584int RGWZoneParams::create_default(bool old_format)
1585{
1586 name = default_zone_name;
1587
1588 int r = create();
1589 if (r < 0) {
1590 return r;
1591 }
1592
1593 if (old_format) {
1594 name = id;
1595 }
1596
1597 return r;
1598}
1599
1600
1601int get_zones_pool_set(CephContext* cct,
1602 RGWRados* store,
1603 const list<string>& zones,
1604 const string& my_zone_id,
1605 set<rgw_pool>& pool_names)
1606{
1607 for(auto const& iter : zones) {
1608 RGWZoneParams zone(iter);
1609 int r = zone.init(cct, store);
1610 if (r < 0) {
1611 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1612 return r;
1613 }
1614 if (zone.get_id() != my_zone_id) {
1615 pool_names.insert(zone.domain_root);
1616 pool_names.insert(zone.metadata_heap);
1617 pool_names.insert(zone.control_pool);
1618 pool_names.insert(zone.gc_pool);
1619 pool_names.insert(zone.log_pool);
1620 pool_names.insert(zone.intent_log_pool);
1621 pool_names.insert(zone.usage_log_pool);
1622 pool_names.insert(zone.user_keys_pool);
1623 pool_names.insert(zone.user_email_pool);
1624 pool_names.insert(zone.user_swift_pool);
1625 pool_names.insert(zone.user_uid_pool);
1626 pool_names.insert(zone.roles_pool);
1627 for(auto& iter : zone.placement_pools) {
1628 pool_names.insert(iter.second.index_pool);
1629 pool_names.insert(iter.second.data_pool);
1630 pool_names.insert(iter.second.data_extra_pool);
1631 }
1632 }
1633 }
1634 return 0;
1635}
1636
1637rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1638 const string& default_prefix,
1639 const string& default_suffix,
1640 const rgw_pool& suggested_pool)
1641{
1642 string suggested_name = suggested_pool.to_str();
1643
1644 string prefix = default_prefix;
1645 string suffix = default_suffix;
1646
1647 if (!suggested_pool.empty()) {
1648 prefix = suggested_name.substr(0, suggested_name.find("."));
1649 suffix = suggested_name.substr(prefix.length());
1650 }
1651
1652 rgw_pool pool(prefix + suffix);
1653
1654 if (pools.find(pool) == pools.end()) {
1655 return pool;
1656 } else {
1657 while(true) {
1658 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1659 if (pools.find(pool) == pools.end()) {
1660 return pool;
1661 }
1662 }
1663 }
1664}
1665
1666int RGWZoneParams::fix_pool_names()
1667{
1668
1669 list<string> zones;
1670 int r = store->list_zones(zones);
1671 if (r < 0) {
1672 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1673 }
1674
1675 set<rgw_pool> pools;
1676 r = get_zones_pool_set(cct, store, zones, id, pools);
1677 if (r < 0) {
1678 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1679 return r;
1680 }
1681
1682 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1683 if (!metadata_heap.name.empty()) {
1684 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1685 }
1686 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1687 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1688 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1689 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1690 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1691 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1692 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1693 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1694 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1695 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1696 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1697
1698 for(auto& iter : placement_pools) {
1699 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1700 iter.second.index_pool);
1701 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1702 iter.second.data_pool);
1703 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1704 iter.second.data_extra_pool);
1705 }
1706
1707 return 0;
1708}
1709
1710int RGWZoneParams::create(bool exclusive)
1711{
1712 /* check for old pools config */
1713 rgw_raw_obj obj(domain_root, avail_pools);
1714 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1715 if (r < 0) {
1716 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1717 /* a new system, let's set new placement info */
1718 RGWZonePlacementInfo default_placement;
1719 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1720 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1721 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1722 placement_pools["default-placement"] = default_placement;
1723 }
1724
1725 r = fix_pool_names();
1726 if (r < 0) {
1727 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1728 return r;
1729 }
1730
1731 r = RGWSystemMetaObj::create(exclusive);
1732 if (r < 0) {
1733 return r;
1734 }
1735
1736 // try to set as default. may race with another create, so pass exclusive=true
1737 // so we don't override an existing default
1738 r = set_as_default(true);
1739 if (r < 0 && r != -EEXIST) {
1740 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1741 }
1742
1743 return 0;
1744}
1745
1746rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1747{
1748 if (cct->_conf->rgw_zone_root_pool.empty()) {
1749 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1750 }
1751
1752 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1753}
1754
1755const string RGWZoneParams::get_default_oid(bool old_format)
1756{
1757 if (old_format) {
1758 return cct->_conf->rgw_default_zone_info_oid;
1759 }
1760
1761 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1762}
1763
// Returns zone_names_oid_prefix.
const string& RGWZoneParams::get_names_oid_prefix()
{
  return zone_names_oid_prefix;
}
1768
// Returns zone_info_oid_prefix; `old_format` does not affect the result.
const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
{
  return zone_info_oid_prefix;
}
1773
// Returns the zone name configured through the rgw_zone config option.
const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
  return cct->_conf->rgw_zone;
}
1777
1778int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1779{
1780 if (name.empty()) {
1781 name = cct->_conf->rgw_zone;
1782 }
1783
1784 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1785}
1786
1787int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1788{
1789 if (realm_id.empty()) {
1790 /* try using default realm */
1791 RGWRealm realm;
1792 int ret = realm.init(cct, store);
1793 if (ret < 0) {
1794 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1795 return -ENOENT;
1796 }
1797 realm_id = realm.get_id();
1798 }
1799
1800 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1801}
1802
1803
1804int RGWZoneParams::set_as_default(bool exclusive)
1805{
1806 if (realm_id.empty()) {
1807 /* try using default realm */
1808 RGWRealm realm;
1809 int ret = realm.init(cct, store);
1810 if (ret < 0) {
1811 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1812 return -EINVAL;
1813 }
1814 realm_id = realm.get_id();
1815 }
1816
1817 return RGWSystemMetaObj::set_as_default(exclusive);
1818}
1819
1820const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1821{
1822 static const std::string NONE{"none"};
1823 auto p = placement_pools.find(placement_rule);
1824 if (p == placement_pools.end()) {
1825 return NONE;
1826 }
1827 const auto& type = p->second.compression_type;
1828 return !type.empty() ? type : NONE;
1829}
1830
// Serialize the period map into `bl`: id, zonegroups, master_zonegroup and
// short_zone_ids, in that order. The field order must match
// RGWPeriodMap::decode().
void RGWPeriodMap::encode(bufferlist& bl) const {
  // NOTE(review): presumably (struct version 2, compat version 1) - confirm
  // against the ENCODE_START macro definition
  ENCODE_START(2, 1, bl);
  ::encode(id, bl);
  ::encode(zonegroups, bl);
  ::encode(master_zonegroup, bl);
  ::encode(short_zone_ids, bl);
  ENCODE_FINISH(bl);
}
1839
1840void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1841 DECODE_START(2, bl);
1842 ::decode(id, bl);
1843 ::decode(zonegroups, bl);
1844 ::decode(master_zonegroup, bl);
1845 if (struct_v >= 2) {
1846 ::decode(short_zone_ids, bl);
1847 }
1848 DECODE_FINISH(bl);
1849
1850 zonegroups_by_api.clear();
1851 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1852 iter != zonegroups.end(); ++iter) {
1853 RGWZoneGroup& zonegroup = iter->second;
1854 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1855 if (zonegroup.is_master) {
1856 master_zonegroup = zonegroup.get_id();
1857 }
1858 }
1859}
1860
1861// run an MD5 hash on the zone_id and return the first 32 bits
1862static uint32_t gen_short_zone_id(const std::string zone_id)
1863{
1864 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1865 MD5 hash;
1866 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1867 hash.Final(md5);
1868
1869 uint32_t short_id;
1870 memcpy((char *)&short_id, md5, sizeof(short_id));
1871 return std::max(short_id, 1u);
1872}
1873
1874int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1875{
1876 if (zonegroup.is_master && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1877 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1878 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1879 return -EINVAL;
1880 }
1881 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1882 if (iter != zonegroups.end()) {
1883 RGWZoneGroup& old_zonegroup = iter->second;
1884 if (!old_zonegroup.api_name.empty()) {
1885 zonegroups_by_api.erase(old_zonegroup.api_name);
1886 }
1887 }
1888 zonegroups[zonegroup.get_id()] = zonegroup;
1889
1890 if (!zonegroup.api_name.empty()) {
1891 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1892 }
1893
1894 if (zonegroup.is_master) {
1895 master_zonegroup = zonegroup.get_id();
1896 } else if (master_zonegroup == zonegroup.get_id()) {
1897 master_zonegroup = "";
1898 }
1899
1900 for (auto& i : zonegroup.zones) {
1901 auto& zone = i.second;
1902 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1903 continue;
1904 }
1905 // calculate the zone's short id
1906 uint32_t short_id = gen_short_zone_id(zone.id);
1907
1908 // search for an existing zone with the same short id
1909 for (auto& s : short_zone_ids) {
1910 if (s.second == short_id) {
1911 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1912 << ") generates the same short_zone_id " << short_id
1913 << " as existing zone id " << s.first << dendl;
1914 return -EEXIST;
1915 }
1916 }
1917
1918 short_zone_ids[zone.id] = short_id;
1919 }
1920
1921 return 0;
1922}
1923
1924uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1925{
1926 auto i = short_zone_ids.find(zone_id);
1927 if (i == short_zone_ids.end()) {
1928 return 0;
1929 }
1930 return i->second;
1931}
1932
1933int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1934{
1935
1936 RGWPeriod period;
1937 int ret = period.init(cct, store);
1938 if (ret < 0) {
1939 cerr << "failed to read current period info: " << cpp_strerror(ret);
1940 return ret;
1941 }
1942
1943 bucket_quota = period.get_config().bucket_quota;
1944 user_quota = period.get_config().user_quota;
1945 zonegroups = period.get_map().zonegroups;
1946 zonegroups_by_api = period.get_map().zonegroups_by_api;
1947 master_zonegroup = period.get_map().master_zonegroup;
1948
1949 return 0;
1950}
1951
// Serialize the region map into `bl`: regions, master_region, bucket_quota
// and user_quota, in that order. The field order must match
// RGWRegionMap::decode().
void RGWRegionMap::encode(bufferlist& bl) const {
  // NOTE(review): presumably (struct version 3, compat version 1) - confirm
  // against the ENCODE_START macro definition
  ENCODE_START( 3, 1, bl);
  ::encode(regions, bl);
  ::encode(master_region, bl);
  ::encode(bucket_quota, bl);
  ::encode(user_quota, bl);
  ENCODE_FINISH(bl);
}
1960
1961void RGWRegionMap::decode(bufferlist::iterator& bl) {
1962 DECODE_START(3, bl);
1963 ::decode(regions, bl);
1964 ::decode(master_region, bl);
1965 if (struct_v >= 2)
1966 ::decode(bucket_quota, bl);
1967 if (struct_v >= 3)
1968 ::decode(user_quota, bl);
1969 DECODE_FINISH(bl);
1970}
1971
// Serialize the zonegroup map into `bl`: zonegroups, master_zonegroup,
// bucket_quota and user_quota, in that order. The field order must match
// RGWZoneGroupMap::decode().
void RGWZoneGroupMap::encode(bufferlist& bl) const {
  // NOTE(review): presumably (struct version 3, compat version 1) - confirm
  // against the ENCODE_START macro definition
  ENCODE_START( 3, 1, bl);
  ::encode(zonegroups, bl);
  ::encode(master_zonegroup, bl);
  ::encode(bucket_quota, bl);
  ::encode(user_quota, bl);
  ENCODE_FINISH(bl);
}
1980
1981void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
1982 DECODE_START(3, bl);
1983 ::decode(zonegroups, bl);
1984 ::decode(master_zonegroup, bl);
1985 if (struct_v >= 2)
1986 ::decode(bucket_quota, bl);
1987 if (struct_v >= 3)
1988 ::decode(user_quota, bl);
1989 DECODE_FINISH(bl);
1990
1991 zonegroups_by_api.clear();
1992 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1993 iter != zonegroups.end(); ++iter) {
1994 RGWZoneGroup& zonegroup = iter->second;
1995 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1996 if (zonegroup.is_master) {
1997 master_zonegroup = zonegroup.get_name();
1998 }
1999 }
2000}
2001
2002void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2003{
2004 obj_version *check_objv = version_for_check();
2005
2006 if (check_objv) {
2007 cls_version_check(*op, *check_objv, VER_COND_EQ);
2008 }
2009
2010 cls_version_read(*op, &read_version);
2011}
2012
2013void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2014{
2015 obj_version *check_objv = version_for_check();
2016 obj_version *modify_version = version_for_write();
2017
2018 if (check_objv) {
2019 cls_version_check(*op, *check_objv, VER_COND_EQ);
2020 }
2021
2022 if (modify_version) {
2023 cls_version_set(*op, *modify_version);
2024 } else {
2025 cls_version_inc(*op);
2026 }
2027}
2028
// Advance the iterator to the next object/stripe described by the manifest.
//
// Manifests with explicit objects simply step explicit_iter. Otherwise the
// next position is computed from the manifest rules: first from the head to
// the first stripe, then stripe by stripe, moving to the next part (and
// possibly the next rule) when a part boundary is crossed. The resulting
// offset is clamped to the object size.
void RGWObjManifest::obj_iterator::operator++()
{
  if (manifest->explicit_objs) {
    ++explicit_iter;

    // ran off the explicit object list: park the offset at the object size
    if (explicit_iter == manifest->objs.end()) {
      ofs = manifest->obj_size;
      return;
    }

    update_explicit_pos();

    update_location();
    return;
  }

  uint64_t obj_size = manifest->get_obj_size();
  uint64_t head_size = manifest->get_head_size();

  // offset already equals the object size: nothing to advance to
  if (ofs == obj_size) {
    return;
  }

  // without rules there is nothing to derive the next stripe from
  if (manifest->rules.empty()) {
    return;
  }

  /* are we still pointing at the head? */
  if (ofs < head_size) {
    // step from the head to the first stripe, using the first rule
    rule_iter = manifest->rules.begin();
    RGWObjManifestRule *rule = &rule_iter->second;
    ofs = MIN(head_size, obj_size);
    stripe_ofs = ofs;
    cur_stripe = 1;
    // the stripe is limited by the remaining data, the rule's max stripe
    // size and, when the rule defines parts, the part size
    stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
    if (rule->part_size > 0) {
      stripe_size = MIN(stripe_size, rule->part_size);
    }
    update_location();
    return;
  }

  RGWObjManifestRule *rule = &rule_iter->second;

  // tentatively move one full stripe forward
  stripe_ofs += rule->stripe_max_size;
  cur_stripe++;
  dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;

  if (rule->part_size > 0) {
    /* multi part, multi stripes object */

    dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;

    if (stripe_ofs >= part_ofs + rule->part_size) {
      /* moved to the next part */
      cur_stripe = 0;
      part_ofs += rule->part_size;
      stripe_ofs = part_ofs;

      bool last_rule = (next_rule_iter == manifest->rules.end());
      /* move to the next rule? */
      if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
        rule_iter = next_rule_iter;
        // NOTE(review): next_rule_iter has not changed since the check
        // above, so this re-evaluation yields the same value - confirm intent
        last_rule = (next_rule_iter == manifest->rules.end());
        if (!last_rule) {
          ++next_rule_iter;
        }
        cur_part_id = rule_iter->second.start_part_num;
      } else {
        cur_part_id++;
      }

      // rule_iter may have moved; refresh the cached rule pointer
      rule = &rule_iter->second;
    }

    // stripe is limited by what is left of the part and the max stripe size
    stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
  }

  cur_override_prefix = rule->override_prefix;

  ofs = stripe_ofs;
  // clamp to the object size; a zero stripe size marks the end position
  if (ofs > obj_size) {
    ofs = obj_size;
    stripe_ofs = ofs;
    stripe_size = 0;
  }

  dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
  update_location();
}
2119
2120int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2121{
2122 manifest = _m;
2123
2124 manifest->set_tail_placement(placement_rule, _b);
2125 manifest->set_head(placement_rule, _obj, 0);
2126 last_ofs = 0;
2127
2128 if (manifest->get_prefix().empty()) {
2129 char buf[33];
2130 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2131
2132 string oid_prefix = ".";
2133 oid_prefix.append(buf);
2134 oid_prefix.append("_");
2135
2136 manifest->set_prefix(oid_prefix);
2137 }
2138
2139 bool found = manifest->get_rule(0, &rule);
2140 if (!found) {
2141 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2142 return -EIO;
2143 }
2144
2145 uint64_t head_size = manifest->get_head_size();
2146
2147 if (head_size > 0) {
2148 cur_stripe_size = head_size;
2149 } else {
2150 cur_stripe_size = rule.stripe_max_size;
2151 }
2152
2153 cur_part_id = rule.start_part_num;
2154
2155 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2156
2157 // Normal object which not generated through copy operation
2158 manifest->set_tail_instance(_obj.key.instance);
2159
2160 manifest->update_iterators();
2161
2162 return 0;
2163}
2164
2165int RGWObjManifest::generator::create_next(uint64_t ofs)
2166{
2167 if (ofs < last_ofs) /* only going forward */
2168 return -EINVAL;
2169
2170 uint64_t max_head_size = manifest->get_max_head_size();
2171
2172 if (ofs < max_head_size) {
2173 manifest->set_head_size(ofs);
2174 }
2175
2176 if (ofs >= max_head_size) {
2177 manifest->set_head_size(max_head_size);
2178 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2179 cur_stripe_size = rule.stripe_max_size;
2180
2181 if (cur_part_id == 0 && max_head_size > 0) {
2182 cur_stripe++;
2183 }
2184 }
2185
2186 last_ofs = ofs;
2187 manifest->set_obj_size(ofs);
2188
2189 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2190
2191 manifest->update_iterators();
2192
2193 return 0;
2194}
2195
2196const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2197{
2198 return begin_iter;
2199}
2200
2201const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2202{
2203 return end_iter;
2204}
2205
2206RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2207{
2208 if (ofs > obj_size) {
2209 ofs = obj_size;
2210 }
2211 RGWObjManifest::obj_iterator iter(this);
2212 iter.seek(ofs);
2213 return iter;
2214}
2215
2216int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2217{
2218 if (explicit_objs || m.explicit_objs) {
2219 return append_explicit(m, zonegroup, zone_params);
2220 }
2221
2222 if (rules.empty()) {
2223 *this = m;
2224 return 0;
2225 }
2226
2227 string override_prefix;
2228
2229 if (prefix.empty()) {
2230 prefix = m.prefix;
2231 }
2232
2233 if (prefix != m.prefix) {
2234 override_prefix = m.prefix;
2235 }
2236
2237 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2238 if (miter == m.rules.end()) {
2239 return append_explicit(m, zonegroup, zone_params);
2240 }
2241
2242 for (; miter != m.rules.end(); ++miter) {
2243 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2244
2245 RGWObjManifestRule& rule = last_rule->second;
2246
2247 if (rule.part_size == 0) {
2248 rule.part_size = obj_size - rule.start_ofs;
2249 }
2250
2251 RGWObjManifestRule& next_rule = miter->second;
2252 if (!next_rule.part_size) {
2253 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2254 }
2255
2256 string rule_prefix = prefix;
2257 if (!rule.override_prefix.empty()) {
2258 rule_prefix = rule.override_prefix;
2259 }
2260
2261 string next_rule_prefix = m.prefix;
2262 if (!next_rule.override_prefix.empty()) {
2263 next_rule_prefix = next_rule.override_prefix;
2264 }
2265
2266 if (rule.part_size != next_rule.part_size ||
2267 rule.stripe_max_size != next_rule.stripe_max_size ||
2268 rule_prefix != next_rule_prefix) {
2269 if (next_rule_prefix != prefix) {
2270 append_rules(m, miter, &next_rule_prefix);
2271 } else {
2272 append_rules(m, miter, NULL);
2273 }
2274 break;
2275 }
2276
2277 uint64_t expected_part_num = rule.start_part_num + 1;
2278 if (rule.part_size > 0) {
2279 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2280 }
2281
2282 if (expected_part_num != next_rule.start_part_num) {
2283 append_rules(m, miter, NULL);
2284 break;
2285 }
2286 }
2287
2288 set_obj_size(obj_size + m.obj_size);
2289
2290 return 0;
2291}
2292
2293int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2294{
2295 return append(m, store->get_zonegroup(), store->get_zone_params());
2296}
2297
2298void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2299 string *override_prefix)
2300{
2301 for (; miter != m.rules.end(); ++miter) {
2302 RGWObjManifestRule rule = miter->second;
2303 rule.start_ofs += obj_size;
2304 if (override_prefix)
2305 rule.override_prefix = *override_prefix;
2306 rules[rule.start_ofs] = rule;
2307 }
2308}
2309
2310void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2311{
2312 if (explicit_objs) {
2313 return;
2314 }
2315 obj_iterator iter = obj_begin();
2316
2317 while (iter != obj_end()) {
2318 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2319 const rgw_obj_select& os = iter.get_location();
2320 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2321 part.loc_ofs = 0;
2322
2323 uint64_t ofs = iter.get_stripe_ofs();
2324
2325 if (ofs == 0) {
2326 part.loc = obj;
2327 } else {
2328 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2329 }
2330 ++iter;
2331 uint64_t next_ofs = iter.get_stripe_ofs();
2332
2333 part.size = next_ofs - ofs;
2334 }
2335
2336 explicit_objs = true;
2337 rules.clear();
2338 prefix.clear();
2339}
2340
2341int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2342{
2343 if (!explicit_objs) {
2344 convert_to_explicit(zonegroup, zone_params);
2345 }
2346 if (!m.explicit_objs) {
2347 m.convert_to_explicit(zonegroup, zone_params);
2348 }
2349 map<uint64_t, RGWObjManifestPart>::iterator iter;
2350 uint64_t base = obj_size;
2351 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2352 RGWObjManifestPart& part = iter->second;
2353 objs[base + iter->first] = part;
2354 }
2355 obj_size += m.obj_size;
2356
2357 return 0;
2358}
2359
2360bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2361{
2362 if (rules.empty()) {
2363 return false;
2364 }
2365
2366 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2367 if (iter != rules.begin()) {
2368 --iter;
2369 }
2370
2371 *rule = iter->second;
2372
2373 return true;
2374}
2375
2376void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2377{
2378 write_version.ver = 1;
2379#define TAG_LEN 24
2380
2381 write_version.tag.clear();
2382 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2383}
2384
2385int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2386 real_time *mtime, real_time set_mtime,
2387 map<string, bufferlist>& attrs, real_time delete_at,
2388 const char *if_match, const char *if_nomatch, const string *user_data)
2389{
2390 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data);
2391 if (r < 0)
2392 return r;
2393
2394 is_complete = !canceled;
2395 return 0;
2396}
2397
2398CephContext *RGWPutObjProcessor::ctx()
2399{
2400 return store->ctx();
2401}
2402
2403RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2404{
2405 drain_pending();
2406
2407 if (is_complete)
2408 return;
2409
2410 set<rgw_raw_obj>::iterator iter;
2411 bool need_to_remove_head = false;
2412 rgw_raw_obj raw_head;
2413
2414 if (!head_obj.empty()) {
2415 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2416 }
2417
2418 /**
2419 * We should delete the object in the "multipart" namespace to avoid race condition.
2420 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2421 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2422 * written by the second upload may be deleted by the first upload.
2423 * details is describled on #11749
2424 *
2425 * The above comment still stands, but instead of searching for a specific object in the multipart
2426 * namespace, we just make sure that we remove the object that is marked as the head object after
2427 * we remove all the other raw objects. Note that we use different call to remove the head object,
2428 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2429 */
2430 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2431 const rgw_raw_obj& obj = *iter;
2432 if (!head_obj.empty() && obj == raw_head) {
2433 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2434 need_to_remove_head = true;
2435 continue;
2436 }
2437
2438 int r = store->delete_raw_obj(obj);
2439 if (r < 0 && r != -ENOENT) {
2440 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2441 }
2442 }
2443
2444 if (need_to_remove_head) {
2445 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2446 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2447 if (r < 0 && r != -ENOENT) {
2448 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2449 }
2450 }
2451}
2452
2453int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2454{
2455 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2456 obj_len = abs_ofs + bl.length();
2457
2458 if (!(obj == last_written_obj)) {
2459 last_written_obj = obj;
2460 }
2461
2462 // For the first call pass -1 as the offset to
2463 // do a write_full.
2464 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2465}
2466
2467struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2468{
2469 struct put_obj_aio_info info;
2470 info = pending.front();
2471 pending.pop_front();
2472 pending_size -= info.size;
2473 return info;
2474}
2475
2476int RGWPutObjProcessor_Aio::wait_pending_front()
2477{
2478 if (pending.empty()) {
2479 return 0;
2480 }
2481 struct put_obj_aio_info info = pop_pending();
2482 int ret = store->aio_wait(info.handle);
2483
2484 if (ret >= 0) {
2485 add_written_obj(info.obj);
2486 }
2487
2488 return ret;
2489}
2490
2491bool RGWPutObjProcessor_Aio::pending_has_completed()
2492{
2493 if (pending.empty())
2494 return false;
2495
2496 struct put_obj_aio_info& info = pending.front();
2497 return store->aio_completed(info.handle);
2498}
2499
2500int RGWPutObjProcessor_Aio::drain_pending()
2501{
2502 int ret = 0;
2503 while (!pending.empty()) {
2504 int r = wait_pending_front();
2505 if (r < 0)
2506 ret = r;
2507 }
2508 return ret;
2509}
2510
2511int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2512{
2513 bool _wait = need_to_wait;
2514
2515 if (handle) {
2516 struct put_obj_aio_info info;
2517 info.handle = handle;
2518 info.obj = obj;
2519 info.size = size;
2520 pending_size += size;
2521 pending.push_back(info);
2522 }
2523 size_t orig_size = pending_size;
2524
2525 /* first drain complete IOs */
2526 while (pending_has_completed()) {
2527 int r = wait_pending_front();
2528 if (r < 0)
2529 return r;
2530
2531 _wait = false;
2532 }
2533
2534 /* resize window in case messages are draining too fast */
2535 if (orig_size - pending_size >= window_size) {
2536 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2537 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2538 if (window_size > max_window_size) {
2539 window_size = max_window_size;
2540 }
2541 }
2542
2543 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2544 if (pending_size > window_size || _wait) {
2545 int r = wait_pending_front();
2546 if (r < 0)
2547 return r;
2548 }
2549 return 0;
2550}
2551
2552int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2553{
2554 if (ofs >= next_part_ofs) {
2555 int r = prepare_next_part(ofs);
2556 if (r < 0) {
2557 return r;
2558 }
2559 }
2560
2561 *pobj = cur_obj;
2562
2563 if (!bl.length())
2564 return 0;
2565
2566 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2567}
2568
2569int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2570{
2571 RGWPutObjProcessor::prepare(store, oid_rand);
2572
2573 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2574
2575 return 0;
2576}
2577
2578int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2579{
2580 *phandle = NULL;
2581 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2582
2583 pending_data_bl.claim_append(bl);
2584 if (pending_data_bl.length() < max_write_size) {
2585 *again = false;
2586 return 0;
2587 }
2588
2589 pending_data_bl.splice(0, max_write_size, &bl);
2590
2591 /* do we have enough data pending accumulated that needs to be written? */
2592 *again = (pending_data_bl.length() >= max_chunk_size);
2593
2594 if (!data_ofs && !immutable_head()) {
2595 first_chunk.claim(bl);
2596 obj_len = (uint64_t)first_chunk.length();
2597 int r = prepare_next_part(obj_len);
2598 if (r < 0) {
2599 return r;
2600 }
2601 data_ofs = obj_len;
2602 return 0;
2603 }
2604 off_t write_ofs = data_ofs;
2605 data_ofs = write_ofs + bl.length();
2606 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2607 we could be racing with another upload, to the same
2608 object and cleanup can be messy */
2609 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2610 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2611 bl.clear();
2612 }
2613 return ret;
2614}
2615
2616
2617int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2618{
2619 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2620
2621 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2622 if (r < 0) {
2623 return r;
2624 }
2625
2626 return 0;
2627}
2628
2629int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2630{
2631 head_obj.init(bucket, obj_str);
2632
2633 int r = prepare_init(store, oid_rand);
2634 if (r < 0) {
2635 return r;
2636 }
2637
2638 if (!version_id.empty()) {
2639 head_obj.key.set_instance(version_id);
2640 } else if (versioned_object) {
2641 store->gen_rand_obj_instance_name(&head_obj);
2642 }
2643
2644 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2645
2646 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2647 if (r < 0) {
2648 return r;
2649 }
2650
2651 return 0;
2652}
2653
2654int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2655
2656 int ret = manifest_gen.create_next(ofs);
2657 if (ret < 0) {
2658 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2659 return ret;
2660 }
2661 cur_part_ofs = ofs;
2662 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2663 cur_obj = manifest_gen.get_cur_obj(store);
2664
2665 return 0;
2666}
2667
2668int RGWPutObjProcessor_Atomic::complete_parts()
2669{
2670 if (obj_len > (uint64_t)cur_part_ofs) {
2671 return prepare_next_part(obj_len);
2672 }
2673 return 0;
2674}
2675
2676int RGWPutObjProcessor_Atomic::complete_writing_data()
2677{
2678 if (!data_ofs && !immutable_head()) {
2679 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2680 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2681 * clobber first_chunk
2682 */
2683 if (pending_data_bl.length() > 0) {
2684 first_chunk.claim(pending_data_bl);
2685 }
2686 obj_len = (uint64_t)first_chunk.length();
2687 }
2688 while (pending_data_bl.length()) {
2689 void *handle;
2690 rgw_raw_obj obj;
2691 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2692 if (max_write_size > pending_data_bl.length()) {
2693 max_write_size = pending_data_bl.length();
2694 }
2695 bufferlist bl;
2696 pending_data_bl.splice(0, max_write_size, &bl);
2697 uint64_t write_len = bl.length();
2698 int r = write_data(bl, data_ofs, &handle, &obj, false);
2699 if (r < 0) {
2700 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2701 return r;
2702 }
2703 data_ofs += write_len;
2704 r = throttle_data(handle, obj, write_len, false);
2705 if (r < 0) {
2706 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2707 return r;
2708 }
2709
2710 if (data_ofs >= next_part_ofs) {
2711 r = prepare_next_part(data_ofs);
2712 if (r < 0) {
2713 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2714 return r;
2715 }
2716 }
2717 }
2718 int r = complete_parts();
2719 if (r < 0) {
2720 return r;
2721 }
2722
2723 r = drain_pending();
2724 if (r < 0)
2725 return r;
2726
2727 return 0;
2728}
2729
2730int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2731 real_time *mtime, real_time set_mtime,
2732 map<string, bufferlist>& attrs,
2733 real_time delete_at,
2734 const char *if_match,
2735 const char *if_nomatch, const string *user_data) {
2736 int r = complete_writing_data();
2737 if (r < 0)
2738 return r;
2739
2740 obj_ctx.obj.set_atomic(head_obj);
2741
2742 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2743
2744 /* some object types shouldn't be versioned, e.g., multipart parts */
2745 op_target.set_versioning_disabled(!versioned_object);
2746
2747 RGWRados::Object::Write obj_op(&op_target);
2748
2749 obj_op.meta.data = &first_chunk;
2750 obj_op.meta.manifest = &manifest;
2751 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2752 obj_op.meta.if_match = if_match;
2753 obj_op.meta.if_nomatch = if_nomatch;
2754 obj_op.meta.mtime = mtime;
2755 obj_op.meta.set_mtime = set_mtime;
2756 obj_op.meta.owner = bucket_info.owner;
2757 obj_op.meta.flags = PUT_OBJ_CREATE;
2758 obj_op.meta.olh_epoch = olh_epoch;
2759 obj_op.meta.delete_at = delete_at;
2760 obj_op.meta.user_data = user_data;
2761
2762 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2763 if (r < 0) {
2764 return r;
2765 }
2766
2767 canceled = obj_op.meta.canceled;
2768
2769 return 0;
2770}
2771
2772int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2773 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2774 if (r < 0)
2775 return r;
2776 return 0;
2777}
2778
2779int RGWRados::unwatch(uint64_t watch_handle)
2780{
2781 int r = control_pool_ctx.unwatch2(watch_handle);
2782 if (r < 0) {
2783 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2784 return r;
2785 }
2786 r = rados[0].watch_flush();
2787 if (r < 0) {
2788 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2789 return r;
2790 }
2791 return 0;
2792}
2793
2794void RGWRados::add_watcher(int i)
2795{
2796 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2797 Mutex::Locker l(watchers_lock);
2798 watchers_set.insert(i);
2799 if (watchers_set.size() == (size_t)num_watchers) {
2800 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2801 set_cache_enabled(true);
2802 }
2803}
2804
2805void RGWRados::remove_watcher(int i)
2806{
2807 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2808 Mutex::Locker l(watchers_lock);
2809 size_t orig_size = watchers_set.size();
2810 watchers_set.erase(i);
2811 if (orig_size == (size_t)num_watchers &&
2812 watchers_set.size() < orig_size) { /* actually removed */
2813 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2814 set_cache_enabled(false);
2815 }
2816}
2817
2818class RGWWatcher : public librados::WatchCtx2 {
2819 RGWRados *rados;
2820 int index;
2821 string oid;
2822 uint64_t watch_handle;
2823
2824 class C_ReinitWatch : public Context {
2825 RGWWatcher *watcher;
2826 public:
2827 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2828 void finish(int r) override {
2829 watcher->reinit();
2830 }
2831 };
2832public:
2833 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2834 void handle_notify(uint64_t notify_id,
2835 uint64_t cookie,
2836 uint64_t notifier_id,
2837 bufferlist& bl) override {
2838 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2839 << " notify_id " << notify_id
2840 << " cookie " << cookie
2841 << " notifier " << notifier_id
2842 << " bl.length()=" << bl.length() << dendl;
2843 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2844
2845 bufferlist reply_bl; // empty reply payload
2846 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2847 }
2848 void handle_error(uint64_t cookie, int err) override {
2849 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2850 << " err " << cpp_strerror(err) << dendl;
2851 rados->remove_watcher(index);
2852 rados->schedule_context(new C_ReinitWatch(this));
2853 }
2854
2855 void reinit() {
2856 int ret = unregister_watch();
2857 if (ret < 0) {
2858 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2859 return;
2860 }
2861 ret = register_watch();
2862 if (ret < 0) {
2863 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2864 return;
2865 }
2866 }
2867
2868 int unregister_watch() {
2869 int r = rados->unwatch(watch_handle);
2870 if (r < 0) {
2871 return r;
2872 }
2873 rados->remove_watcher(index);
2874 return 0;
2875 }
2876
2877 int register_watch() {
2878 int r = rados->watch(oid, &watch_handle, this);
2879 if (r < 0) {
2880 return r;
2881 }
2882 rados->add_watcher(index);
2883 return 0;
2884 }
2885};
2886
2887class RGWMetaNotifierManager : public RGWCoroutinesManager {
2888 RGWRados *store;
2889 RGWHTTPManager http_manager;
2890
2891public:
2892 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2893 http_manager(store->ctx(), completion_mgr) {
2894 http_manager.set_threaded();
2895 }
2896
2897 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2898 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2899 { "notify", NULL },
2900 { NULL, NULL } };
2901
2902 list<RGWCoroutinesStack *> stacks;
2903 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2904 RGWRESTConn *conn = iter->second;
2905 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2906 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2907
2908 stacks.push_back(stack);
2909 }
2910 return run(stacks);
2911 }
2912};
2913
2914class RGWDataNotifierManager : public RGWCoroutinesManager {
2915 RGWRados *store;
2916 RGWHTTPManager http_manager;
2917
2918public:
2919 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2920 http_manager(store->ctx(), completion_mgr) {
2921 http_manager.set_threaded();
2922 }
2923
2924 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2925 rgw_http_param_pair pairs[] = { { "type", "data" },
2926 { "notify", NULL },
2927 { "source-zone", store->get_zone_params().get_id().c_str() },
2928 { NULL, NULL } };
2929
2930 list<RGWCoroutinesStack *> stacks;
2931 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2932 RGWRESTConn *conn = iter->second;
2933 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2934 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2935
2936 stacks.push_back(stack);
2937 }
2938 return run(stacks);
2939 }
2940};
2941
2942class RGWRadosThread {
2943 class Worker : public Thread {
2944 CephContext *cct;
2945 RGWRadosThread *processor;
2946 Mutex lock;
2947 Cond cond;
2948
2949 public:
2950 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
2951 void *entry() override;
2952 void stop() {
2953 Mutex::Locker l(lock);
2954 cond.Signal();
2955 }
2956 };
2957
2958 Worker *worker;
2959
2960protected:
2961 CephContext *cct;
2962 RGWRados *store;
2963
2964 std::atomic<bool> down_flag = { false };
2965
2966 string thread_name;
2967
2968 virtual uint64_t interval_msec() = 0;
2969 virtual void stop_process() {}
2970public:
2971 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
2972 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
2973 virtual ~RGWRadosThread() {
2974 stop();
2975 }
2976
2977 virtual int init() { return 0; }
2978 virtual int process() = 0;
2979
2980 bool going_down() { return down_flag; }
2981
2982 void start();
2983 void stop();
2984};
2985
2986void RGWRadosThread::start()
2987{
2988 worker = new Worker(cct, this);
2989 worker->create(thread_name.c_str());
2990}
2991
2992void RGWRadosThread::stop()
2993{
2994 down_flag = true;
2995 stop_process();
2996 if (worker) {
2997 worker->stop();
2998 worker->join();
2999 }
3000 delete worker;
3001 worker = NULL;
3002}
3003
3004void *RGWRadosThread::Worker::entry() {
3005 uint64_t msec = processor->interval_msec();
3006 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3007
3008 do {
3009 utime_t start = ceph_clock_now();
3010 int r = processor->process();
3011 if (r < 0) {
3012 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3013 }
3014
3015 if (processor->going_down())
3016 break;
3017
3018 utime_t end = ceph_clock_now();
3019 end -= start;
3020
3021 uint64_t cur_msec = processor->interval_msec();
3022 if (cur_msec != msec) { /* was it reconfigured? */
3023 msec = cur_msec;
3024 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3025 }
3026
3027 if (cur_msec > 0) {
3028 if (interval <= end)
3029 continue; // next round
3030
3031 utime_t wait_time = interval;
3032 wait_time -= end;
3033
3034 lock.Lock();
3035 cond.WaitInterval(lock, wait_time);
3036 lock.Unlock();
3037 } else {
3038 lock.Lock();
3039 cond.Wait(lock);
3040 lock.Unlock();
3041 }
3042 } while (!processor->going_down());
3043
3044 return NULL;
3045}
3046
3047class RGWMetaNotifier : public RGWRadosThread {
3048 RGWMetaNotifierManager notify_mgr;
3049 RGWMetadataLog *const log;
3050
3051 uint64_t interval_msec() override {
3052 return cct->_conf->rgw_md_notify_interval_msec;
3053 }
3054public:
3055 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3056 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3057
3058 int process() override;
3059};
3060
3061int RGWMetaNotifier::process()
3062{
3063 set<int> shards;
3064
3065 log->read_clear_modified(shards);
3066
3067 if (shards.empty()) {
3068 return 0;
3069 }
3070
3071 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3072 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3073 }
3074
3075 notify_mgr.notify_all(store->zone_conn_map, shards);
3076
3077 return 0;
3078}
3079
3080class RGWDataNotifier : public RGWRadosThread {
3081 RGWDataNotifierManager notify_mgr;
3082
3083 uint64_t interval_msec() override {
3084 return cct->_conf->rgw_md_notify_interval_msec;
3085 }
3086public:
3087 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3088
3089 int process() override;
3090};
3091
3092int RGWDataNotifier::process()
3093{
3094 if (!store->data_log) {
3095 return 0;
3096 }
3097
3098 map<int, set<string> > shards;
3099
3100 store->data_log->read_clear_modified(shards);
3101
3102 if (shards.empty()) {
3103 return 0;
3104 }
3105
3106 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3107 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3108 }
3109
3110 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3111
3112 return 0;
3113}
3114
3115class RGWSyncProcessorThread : public RGWRadosThread {
3116public:
3117 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3118 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3119 ~RGWSyncProcessorThread() override {}
3120 int init() override = 0 ;
3121 int process() override = 0;
3122};
3123
3124class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3125{
3126 RGWMetaSyncStatusManager sync;
3127
3128 uint64_t interval_msec() override {
3129 return 0; /* no interval associated, it'll run once until stopped */
3130 }
3131 void stop_process() override {
3132 sync.stop();
3133 }
3134public:
3135 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3136 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3137
3138 void wakeup_sync_shards(set<int>& shard_ids) {
3139 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3140 sync.wakeup(*iter);
3141 }
3142 }
3143 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3144
3145 int init() override {
3146 int ret = sync.init();
3147 if (ret < 0) {
3148 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3149 return ret;
3150 }
3151 return 0;
3152 }
3153
3154 int process() override {
3155 sync.run();
3156 return 0;
3157 }
3158};
3159
3160class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3161{
3162 RGWDataSyncStatusManager sync;
3163 bool initialized;
3164
3165 uint64_t interval_msec() override {
3166 if (initialized) {
3167 return 0; /* no interval associated, it'll run once until stopped */
3168 } else {
3169#define DATA_SYNC_INIT_WAIT_SEC 20
3170 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3171 }
3172 }
3173 void stop_process() override {
3174 sync.stop();
3175 }
3176public:
3177 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3178 const string& _source_zone)
3179 : RGWSyncProcessorThread(_store, "data-sync"), sync(_store, async_rados, _source_zone),
3180 initialized(false) {}
3181
3182 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3183 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3184 sync.wakeup(iter->first, iter->second);
3185 }
3186 }
3187 RGWDataSyncStatusManager* get_manager() { return &sync; }
3188
3189 int init() override {
3190 return 0;
3191 }
3192
3193 int process() override {
3194 while (!initialized) {
3195 if (going_down()) {
3196 return 0;
3197 }
3198 int ret = sync.init();
3199 if (ret >= 0) {
3200 initialized = true;
3201 break;
3202 }
3203 /* we'll be back! */
3204 return 0;
3205 }
3206 sync.run();
3207 return 0;
3208 }
3209};
3210
3211class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3212{
3213 RGWCoroutinesManager crs;
3214 RGWRados *store;
3215 RGWHTTPManager http;
3216 const utime_t trim_interval;
3217
3218 uint64_t interval_msec() override { return 0; }
3219 void stop_process() override { crs.stop(); }
3220public:
3221 RGWSyncLogTrimThread(RGWRados *store, int interval)
3222 : RGWSyncProcessorThread(store, "sync-log-trim"),
3223 crs(store->ctx(), store->get_cr_registry()), store(store),
3224 http(store->ctx(), crs.get_completion_mgr()),
3225 trim_interval(interval, 0)
3226 {}
3227
3228 int init() override {
3229 return http.set_threaded();
3230 }
3231 int process() override {
3232 list<RGWCoroutinesStack*> stacks;
3233 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3234 meta->call(create_meta_log_trim_cr(store, &http,
3235 cct->_conf->rgw_md_log_max_shards,
3236 trim_interval));
3237 stacks.push_back(meta);
3238
3239 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3240 data->call(create_data_log_trim_cr(store, &http,
3241 cct->_conf->rgw_data_log_num_shards,
3242 trim_interval));
3243 stacks.push_back(data);
3244
3245 crs.run(stacks);
3246 return 0;
3247 }
3248};
3249
3250void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3251{
3252 Mutex::Locker l(meta_sync_thread_lock);
3253 if (meta_sync_processor_thread) {
3254 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3255 }
3256}
3257
3258void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3259{
3260 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3261 Mutex::Locker l(data_sync_thread_lock);
3262 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3263 if (iter == data_sync_processor_threads.end()) {
3264 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3265 return;
3266 }
3267
3268 RGWDataSyncProcessorThread *thread = iter->second;
3269 assert(thread);
3270 thread->wakeup_sync_shards(shard_ids);
3271}
3272
3273RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3274{
3275 Mutex::Locker l(meta_sync_thread_lock);
3276 if (meta_sync_processor_thread) {
3277 return meta_sync_processor_thread->get_manager();
3278 }
3279 return nullptr;
3280}
3281
3282RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3283{
3284 Mutex::Locker l(data_sync_thread_lock);
3285 auto thread = data_sync_processor_threads.find(source_zone);
3286 if (thread == data_sync_processor_threads.end()) {
3287 return nullptr;
3288 }
3289 return thread->second->get_manager();
3290}
3291
3292int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3293{
3294 IoCtx ioctx;
3295 int r = open_pool_ctx(pool, ioctx);
3296 if (r < 0) {
3297 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3298 return r;
3299 }
3300
3301 bool requires;
3302 r = ioctx.pool_requires_alignment2(&requires);
3303 if (r < 0) {
3304 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3305 << r << dendl;
3306 return r;
3307 }
3308
3309 if (!requires) {
3310 *alignment = 0;
3311 return 0;
3312 }
3313
3314 uint64_t align;
3315 r = ioctx.pool_required_alignment2(&align);
3316 if (r < 0) {
3317 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3318 << r << dendl;
3319 return r;
3320 }
3321 if (align != 0) {
3322 ldout(cct, 20) << "required alignment=" << align << dendl;
3323 }
3324 *alignment = align;
3325 return 0;
3326}
3327
3328int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3329{
3330 uint64_t alignment;
3331 int r = get_required_alignment(pool, &alignment);
3332 if (r < 0) {
3333 return r;
3334 }
3335
3336 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3337
3338 if (alignment == 0) {
3339 *max_chunk_size = config_chunk_size;
3340 return 0;
3341 }
3342
3343 if (config_chunk_size <= alignment) {
3344 *max_chunk_size = alignment;
3345 return 0;
3346 }
3347
3348 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3349
3350 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3351
3352 return 0;
3353}
3354
3355int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3356{
3357 rgw_pool pool;
3358 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3359 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3360 return -EIO;
3361 }
3362 return get_max_chunk_size(pool, max_chunk_size);
3363}
3364
3365void RGWRados::finalize()
3366{
3367 if (run_sync_thread) {
3368 Mutex::Locker l(meta_sync_thread_lock);
3369 meta_sync_processor_thread->stop();
3370
3371 Mutex::Locker dl(data_sync_thread_lock);
3372 for (auto iter : data_sync_processor_threads) {
3373 RGWDataSyncProcessorThread *thread = iter.second;
3374 thread->stop();
3375 }
3376 if (sync_log_trimmer) {
3377 sync_log_trimmer->stop();
3378 }
3379 }
3380 if (async_rados) {
3381 async_rados->stop();
3382 }
3383 if (run_sync_thread) {
3384 delete meta_sync_processor_thread;
3385 meta_sync_processor_thread = NULL;
3386 Mutex::Locker dl(data_sync_thread_lock);
3387 for (auto iter : data_sync_processor_threads) {
3388 RGWDataSyncProcessorThread *thread = iter.second;
3389 delete thread;
3390 }
3391 data_sync_processor_threads.clear();
3392 delete sync_log_trimmer;
3393 sync_log_trimmer = nullptr;
3394 }
3395 if (finisher) {
3396 finisher->stop();
3397 }
3398 if (need_watch_notify()) {
3399 finalize_watch();
3400 }
3401 if (finisher) {
3402 /* delete finisher only after cleaning up watches, as watch error path might call
3403 * into finisher. We stop finisher before finalizing watch to make sure we don't
3404 * actually handle any racing work
3405 */
3406 delete finisher;
3407 }
3408 if (meta_notifier) {
3409 meta_notifier->stop();
3410 delete meta_notifier;
3411 }
3412 if (data_notifier) {
3413 data_notifier->stop();
3414 delete data_notifier;
3415 }
3416 delete data_log;
3417 if (async_rados) {
3418 delete async_rados;
3419 }
3420 if (use_gc_thread) {
3421 gc->stop_processor();
3422 obj_expirer->stop_processor();
3423 }
3424 delete gc;
3425 gc = NULL;
3426
3427 if (use_lc_thread) {
3428 lc->stop_processor();
3429 }
3430 delete lc;
3431 lc = NULL;
3432
3433 delete obj_expirer;
3434 obj_expirer = NULL;
3435
3436 delete rest_master_conn;
3437
3438 map<string, RGWRESTConn *>::iterator iter;
3439 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3440 RGWRESTConn *conn = iter->second;
3441 delete conn;
3442 }
3443
3444 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3445 RGWRESTConn *conn = iter->second;
3446 delete conn;
3447 }
3448 RGWQuotaHandler::free_handler(quota_handler);
3449 if (cr_registry) {
3450 cr_registry->put();
3451 }
3452 delete meta_mgr;
3453 delete binfo_cache;
3454 delete obj_tombstone_cache;
3455 delete sync_modules_manager;
3456}
3457
3458/**
3459 * Initialize the RADOS instance and prepare to do other ops
3460 * Returns 0 on success, -ERR# on failure.
3461 */
3462int RGWRados::init_rados()
3463{
3464 int ret = 0;
3465 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3466
3467 for (auto& r : handles) {
3468 ret = r.init_with_context(cct);
3469 if (ret < 0) {
3470 return ret;
3471 }
3472
3473 ret = r.connect();
3474 if (ret < 0) {
3475 return ret;
3476 }
3477 }
3478
3479 sync_modules_manager = new RGWSyncModulesManager();
3480
3481 rgw_register_sync_modules(sync_modules_manager);
3482
3483 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3484 new RGWCoroutinesManagerRegistry(cct)};
3485 ret = crs->hook_to_admin_command("cr dump");
3486 if (ret < 0) {
3487 return ret;
3488 }
3489
3490 meta_mgr = new RGWMetadataManager(cct, this);
3491 data_log = new RGWDataChangesLog(cct, this);
3492 cr_registry = crs.release();
3493
3494 std::swap(handles, rados);
3495 return ret;
3496}
3497
3498/**
3499 * Add new connection to connections map
3500 * @param zonegroup_conn_map map which new connection will be added to
3501 * @param zonegroup zonegroup which new connection will connect to
3502 * @param new_connection pointer to new connection instance
3503 */
3504static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3505 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3506{
3507 // Delete if connection is already exists
3508 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3509 if (iterZoneGroup != zonegroup_conn_map.end()) {
3510 delete iterZoneGroup->second;
3511 }
3512
3513 // Add new connection to connections map
3514 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3515}
3516
3517int RGWRados::convert_regionmap()
3518{
3519 RGWZoneGroupMap zonegroupmap;
3520
3521 string pool_name = cct->_conf->rgw_zone_root_pool;
3522 if (pool_name.empty()) {
3523 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3524 }
3525 string oid = region_map_oid;
3526
3527 rgw_pool pool(pool_name);
3528 bufferlist bl;
3529 RGWObjectCtx obj_ctx(this);
3530 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3531 if (ret < 0 && ret != -ENOENT) {
3532 return ret;
3533 } else if (ret == -ENOENT) {
3534 return 0;
3535 }
3536
3537 try {
3538 bufferlist::iterator iter = bl.begin();
3539 ::decode(zonegroupmap, iter);
3540 } catch (buffer::error& err) {
3541 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3542 return -EIO;
3543 }
3544
3545 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3546 iter != zonegroupmap.zonegroups.end(); ++iter) {
3547 RGWZoneGroup& zonegroup = iter->second;
3548 ret = zonegroup.init(cct, this, false);
3549 ret = zonegroup.update();
3550 if (ret < 0 && ret != -ENOENT) {
3551 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3552 cpp_strerror(-ret) << dendl;
3553 return ret;
3554 } else if (ret == -ENOENT) {
3555 ret = zonegroup.create();
3556 if (ret < 0) {
3557 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3558 cpp_strerror(-ret) << dendl;
3559 return ret;
3560 }
3561 }
3562 }
3563
3564 current_period.set_user_quota(zonegroupmap.user_quota);
3565 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3566
3567 // remove the region_map so we don't try to convert again
3568 rgw_raw_obj obj(pool, oid);
3569 ret = delete_system_obj(obj);
3570 if (ret < 0) {
3571 ldout(cct, 0) << "Error could not remove " << obj
3572 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3573 return ret;
3574 }
3575
3576 return 0;
3577}
3578
3579/**
3580 * Replace all region configuration with zonegroup for
3581 * backward compatability
3582 * Returns 0 on success, -ERR# on failure.
3583 */
3584int RGWRados::replace_region_with_zonegroup()
3585{
3586 /* copy default region */
3587 /* convert default region to default zonegroup */
3588 string default_oid = cct->_conf->rgw_default_region_info_oid;
3589 if (default_oid.empty()) {
3590 default_oid = default_region_info_oid;
3591 }
3592
3593
3594 RGWZoneGroup default_zonegroup;
3595 rgw_pool pool{default_zonegroup.get_pool(cct)};
3596 string oid = "converted";
3597 bufferlist bl;
3598 RGWObjectCtx obj_ctx(this);
3599
3600 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3601 if (ret < 0 && ret != -ENOENT) {
3602 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3603 << dendl;
3604 return ret;
3605 } else if (ret != -ENOENT) {
3606 ldout(cct, 20) << "System already converted " << dendl;
3607 return 0;
3608 }
3609
3610 string default_region;
3611 ret = default_zonegroup.init(cct, this, false, true);
3612 if (ret < 0) {
3613 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3614 return ret;
3615 }
3616 ret = default_zonegroup.read_default_id(default_region, true);
3617 if (ret < 0 && ret != -ENOENT) {
3618 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3619 return ret;
3620 }
3621
3622 /* convert regions to zonegroups */
3623 list<string> regions;
3624 ret = list_regions(regions);
3625 if (ret < 0 && ret != -ENOENT) {
3626 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3627 return ret;
3628 } else if (ret == -ENOENT || regions.empty()) {
3629 RGWZoneParams zoneparams(default_zone_name);
3630 int ret = zoneparams.init(cct, this);
3631 if (ret < 0 && ret != -ENOENT) {
3632 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
3633 return ret;
3634 }
3635 /* update master zone */
3636 RGWZoneGroup default_zg(default_zonegroup_name);
3637 ret = default_zg.init(cct, this);
3638 if (ret < 0 && ret != -ENOENT) {
3639 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
3640 return ret;
3641 }
3642 if (ret != -ENOENT && default_zg.master_zone.empty()) {
3643 default_zg.master_zone = zoneparams.get_id();
3644 return default_zg.update();
3645 }
3646 return 0;
3647 }
3648
3649 string master_region, master_zone;
3650 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
3651 if (*iter != default_zonegroup_name){
3652 RGWZoneGroup region(*iter);
3653 int ret = region.init(cct, this, true, true);
3654 if (ret < 0) {
3655 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
3656 return ret;
3657 }
3658 if (region.is_master) {
3659 master_region = region.get_id();
3660 master_zone = region.master_zone;
3661 }
3662 }
3663 }
3664
3665 /* create realm if there is none.
3666 The realm name will be the region and zone concatenated
3667 realm id will be mds of its name */
3668 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
3669 string new_realm_name = master_region + "." + master_zone;
3670 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
3671 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
3672 MD5 hash;
3673 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
3674 hash.Final(md5);
3675 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
3676 string new_realm_id(md5_str);
3677 RGWRealm new_realm(new_realm_id,new_realm_name);
3678 ret = new_realm.init(cct, this, false);
3679 if (ret < 0) {
3680 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
3681 return ret;
3682 }
3683 ret = new_realm.create();
3684 if (ret < 0 && ret != -EEXIST) {
3685 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
3686 return ret;
3687 }
3688 ret = new_realm.set_as_default();
3689 if (ret < 0) {
3690 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
3691 return ret;
3692 }
3693 ret = realm.init(cct, this);
3694 if (ret < 0) {
3695 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
3696 return ret;
3697 }
3698 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
3699 if (ret < 0) {
3700 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
3701 return ret;
3702 }
3703 }
3704
3705 list<string>::iterator iter;
3706 /* create zonegroups */
3707 for (iter = regions.begin(); iter != regions.end(); ++iter)
3708 {
3709 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
3710 /* check to see if we don't have already a zonegroup with this name */
3711 RGWZoneGroup new_zonegroup(*iter);
3712 ret = new_zonegroup.init(cct , this);
3713 if (ret == 0 && new_zonegroup.get_id() != *iter) {
3714 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
3715 " skipping conversion " << dendl;
3716 continue;
3717 }
3718 RGWZoneGroup zonegroup(*iter);
3719 zonegroup.set_id(*iter);
3720 int ret = zonegroup.init(cct, this, true, true);
3721 if (ret < 0) {
3722 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3723 return ret;
3724 }
3725 zonegroup.realm_id = realm.get_id();
3726 /* fix default region master zone */
3727 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
3728 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
3729 zonegroup.master_zone = default_zone_name;
3730 }
3731 ret = zonegroup.update();
3732 if (ret < 0 && ret != -EEXIST) {
3733 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
3734 << dendl;
3735 return ret;
3736 }
3737 ret = zonegroup.update_name();
3738 if (ret < 0 && ret != -EEXIST) {
3739 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
3740 << dendl;
3741 return ret;
3742 }
3743 if (zonegroup.get_name() == default_region) {
3744 ret = zonegroup.set_as_default();
3745 if (ret < 0) {
3746 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
3747 << dendl;
3748 return ret;
3749 }
3750 }
3751 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
3752 ++iter) {
3753 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
3754 RGWZoneParams zoneparams(iter->first, iter->first);
3755 zoneparams.set_id(iter->first);
3756 zoneparams.realm_id = realm.get_id();
3757 ret = zoneparams.init(cct, this);
3758 if (ret < 0 && ret != -ENOENT) {
3759 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
3760 return ret;
3761 } else if (ret == -ENOENT) {
3762 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
3763 continue;
3764 }
3765 zonegroup.realm_id = realm.get_id();
3766 ret = zoneparams.update();
3767 if (ret < 0 && ret != -EEXIST) {
3768 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
3769 return ret;
3770 }
3771 ret = zoneparams.update_name();
3772 if (ret < 0 && ret != -EEXIST) {
3773 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
3774 return ret;
3775 }
3776 }
3777
3778 if (!current_period.get_id().empty()) {
3779 ret = current_period.add_zonegroup(zonegroup);
3780 if (ret < 0) {
3781 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
3782 return ret;
3783 }
3784 }
3785 }
3786
3787 if (!current_period.get_id().empty()) {
3788 ret = current_period.update();
3789 if (ret < 0) {
3790 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
3791 return ret;
3792 }
3793 ret = current_period.store_info(false);
3794 if (ret < 0) {
3795 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
3796 return ret;
3797 }
3798 ret = current_period.reflect();
3799 if (ret < 0) {
3800 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
3801 return ret;
3802 }
3803 }
3804
3805 for (auto const& iter : regions) {
3806 RGWZoneGroup zonegroup(iter);
3807 int ret = zonegroup.init(cct, this, true, true);
3808 if (ret < 0) {
3809 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3810 return ret;
3811 }
3812 ret = zonegroup.delete_obj(true);
3813 if (ret < 0 && ret != -ENOENT) {
3814 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
3815 << dendl;
3816 return ret;
3817 }
3818 }
3819
3820 /* mark as converted */
3821 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
3822 true, NULL, real_time(), NULL);
3823 if (ret < 0 ) {
3824 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
3825 << dendl;
3826 return ret;
3827 }
3828
3829 return 0;
3830}
3831
3832int RGWRados::init_zg_from_period(bool *initialized)
3833{
3834 *initialized = false;
3835
3836 if (current_period.get_id().empty()) {
3837 return 0;
3838 }
3839
3840 int ret = zonegroup.init(cct, this);
3841 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
3842 if (ret == -ENOENT) {
3843 return 0;
3844 }
3845 if (ret < 0) {
3846 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
3847 return ret;
3848 }
3849 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
3850
3851 map<string, RGWZoneGroup>::const_iterator iter =
3852 current_period.get_map().zonegroups.find(zonegroup.get_id());
3853
3854 if (iter != current_period.get_map().zonegroups.end()) {
3855 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
3856 zonegroup = iter->second;
3857 ret = zonegroup.init(cct, this, false);
3858 if (ret < 0) {
3859 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
3860 return ret;
3861 }
3862 ret = zone_params.init(cct, this);
3863 if (ret < 0 && ret != -ENOENT) {
3864 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
3865 return ret;
3866 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
3867 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
3868 zone_params.set_name(default_zone_name);
3869 ret = zone_params.init(cct, this);
3870 if (ret < 0 && ret != -ENOENT) {
3871 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
3872 return ret;
3873 }
3874 }
3875 }
3876 for (iter = current_period.get_map().zonegroups.begin();
3877 iter != current_period.get_map().zonegroups.end(); ++iter){
3878 const RGWZoneGroup& zg = iter->second;
3879 // use endpoints from the zonegroup's master zone
3880 auto master = zg.zones.find(zg.master_zone);
3881 if (master == zg.zones.end()) {
3882 // fix missing master zone for a single zone zonegroup
3883 if (zg.master_zone.empty() && zg.zones.size() == 1) {
3884 master = zg.zones.begin();
3885 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
3886 master->second.name << " id:" << master->second.id << " as master" << dendl;
3887 if (zonegroup.get_id() == zg.get_id()) {
3888 zonegroup.master_zone = master->second.id;
3889 ret = zonegroup.update();
3890 if (ret < 0) {
3891 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
3892 return ret;
3893 }
3894 } else {
3895 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
3896 ret = fixed_zg.init(cct, this);
3897 if (ret < 0) {
3898 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
3899 return ret;
3900 }
3901 fixed_zg.master_zone = master->second.id;
3902 ret = fixed_zg.update();
3903 if (ret < 0) {
3904 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
3905 return ret;
3906 }
3907 }
3908 } else {
3909 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
3910 zg.master_zone << dendl;
3911 return -EINVAL;
3912 }
3913 }
3914 const auto& endpoints = master->second.endpoints;
3915 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
3916 if (!current_period.get_master_zonegroup().empty() &&
3917 zg.get_id() == current_period.get_master_zonegroup()) {
3918 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
3919 }
3920 }
3921
3922 *initialized = true;
3923
3924 return 0;
3925}
3926
3927int RGWRados::init_zg_from_local(bool *creating_defaults)
3928{
3929 int ret = zonegroup.init(cct, this);
3930 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
3931 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3932 return ret;
3933 } else if (ret == -ENOENT) {
3934 *creating_defaults = true;
3935 ldout(cct, 10) << "Creating default zonegroup " << dendl;
3936 ret = zonegroup.create_default();
3937 if (ret < 0) {
3938 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
3939 << dendl;
3940 return ret;
3941 }
3942 ret = zonegroup.init(cct, this);
3943 if (ret < 0) {
3944 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
3945 << dendl;
3946 return ret;
3947 }
3948 }
3949 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
3950 if (zonegroup.is_master) {
3951 // use endpoints from the zonegroup's master zone
3952 auto master = zonegroup.zones.find(zonegroup.master_zone);
3953 if (master == zonegroup.zones.end()) {
3954 // fix missing master zone for a single zone zonegroup
3955 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
3956 master = zonegroup.zones.begin();
3957 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
3958 master->second.name << " id:" << master->second.id << " as master" << dendl;
3959 zonegroup.master_zone = master->second.id;
3960 ret = zonegroup.update();
3961 if (ret < 0) {
3962 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
3963 return ret;
3964 }
3965 } else {
3966 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
3967 "master_zone=" << zonegroup.master_zone << dendl;
3968 return -EINVAL;
3969 }
3970 }
3971 const auto& endpoints = master->second.endpoints;
3972 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
3973 }
3974
3975 return 0;
3976}
3977
3978
3979bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
3980{
3981 return target_zone.syncs_from(source_zone.name) &&
3982 sync_modules_manager->supports_data_export(source_zone.tier_type);
3983}
3984
3985/**
3986 * Initialize the RADOS instance and prepare to do other ops
3987 * Returns 0 on success, -ERR# on failure.
3988 */
3989int RGWRados::init_complete()
3990{
3991 int ret = realm.init(cct, this);
3992 if (ret < 0 && ret != -ENOENT) {
3993 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3994 return ret;
3995 } else if (ret != -ENOENT) {
3996 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
3997 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
3998 if (ret < 0 && ret != -ENOENT) {
3999 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4000 return ret;
4001 }
4002 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4003 }
4004
4005 ret = replace_region_with_zonegroup();
4006 if (ret < 0) {
4007 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4008 return ret;
4009 }
4010
4011 ret = convert_regionmap();
4012 if (ret < 0) {
4013 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4014 return ret;
4015 }
4016
4017 bool zg_initialized = false;
4018
4019 if (!current_period.get_id().empty()) {
4020 ret = init_zg_from_period(&zg_initialized);
4021 if (ret < 0) {
4022 return ret;
4023 }
4024 }
4025
4026 bool creating_defaults = false;
4027 bool using_local = (!zg_initialized);
4028 if (using_local) {
4029 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4030 ret = init_zg_from_local(&creating_defaults);
4031 if (ret < 0) {
4032 return ret;
4033 }
4034 // read period_config into current_period
4035 auto& period_config = current_period.get_config();
4036 ret = period_config.read(this, zonegroup.realm_id);
4037 if (ret < 0 && ret != -ENOENT) {
4038 ldout(cct, 0) << "ERROR: failed to read period config: "
4039 << cpp_strerror(ret) << dendl;
4040 return ret;
4041 }
4042 }
4043
4044 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4045 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4046 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4047 zone_params.set_name(default_zone_name);
4048 }
4049
4050 ret = zone_params.init(cct, this);
4051 if (ret < 0 && ret != -ENOENT) {
4052 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4053 return ret;
4054 }
4055 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4056 if (zone_iter == get_zonegroup().zones.end()) {
4057 if (using_local) {
4058 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4059 return -EINVAL;
4060 }
4061 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4062 ret = init_zg_from_local(&creating_defaults);
4063 if (ret < 0) {
4064 return ret;
4065 }
4066 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4067 }
4068 if (zone_iter != get_zonegroup().zones.end()) {
4069 zone_public_config = zone_iter->second;
4070 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4071 } else {
4072 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4073 return -EINVAL;
4074 }
4075
4076 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4077
4078 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4079 if (ret < 0) {
4080 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4081 return ret;
4082 }
4083
4084 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4085
4086 init_unique_trans_id_deps();
4087
4088 finisher = new Finisher(cct);
4089 finisher->start();
4090
4091 period_puller.reset(new RGWPeriodPuller(this));
4092 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4093 current_period));
4094
4095 if (need_watch_notify()) {
4096 ret = init_watch();
4097 if (ret < 0) {
4098 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4099 return ret;
4100 }
4101 }
4102
4103 /* first build all zones index */
4104 for (auto ziter : get_zonegroup().zones) {
4105 const string& id = ziter.first;
4106 RGWZone& z = ziter.second;
4107 zone_id_by_name[z.name] = id;
4108 zone_by_id[id] = z;
4109 }
4110
4111 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4112 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4113 }
4114 zone_public_config = zone_by_id[zone_id()];
4115 for (auto ziter : get_zonegroup().zones) {
4116 const string& id = ziter.first;
4117 RGWZone& z = ziter.second;
4118 if (id == zone_id()) {
4119 continue;
4120 }
4121 if (z.endpoints.empty()) {
4122 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4123 continue;
4124 }
4125 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4126 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4127 zone_conn_map[id] = conn;
4128 if (zone_syncs_from(zone_public_config, z) ||
4129 zone_syncs_from(z, zone_public_config)) {
4130 if (zone_syncs_from(zone_public_config, z)) {
4131 zone_data_sync_from_map[id] = conn;
4132 }
4133 if (zone_syncs_from(z, zone_public_config)) {
4134 zone_data_notify_to_map[id] = conn;
4135 }
4136 } else {
4137 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4138 }
4139 }
4140
4141 ret = open_root_pool_ctx();
4142 if (ret < 0)
4143 return ret;
4144
4145 ret = open_gc_pool_ctx();
4146 if (ret < 0)
4147 return ret;
4148
4149 ret = open_lc_pool_ctx();
4150 if (ret < 0)
4151 return ret;
4152
4153 ret = open_objexp_pool_ctx();
4154 if (ret < 0)
4155 return ret;
4156
4157 pools_initialized = true;
4158
4159 gc = new RGWGC();
4160 gc->initialize(cct, this);
4161
4162 obj_expirer = new RGWObjectExpirer(this);
4163
4164 if (use_gc_thread) {
4165 gc->start_processor();
4166 obj_expirer->start_processor();
4167 }
4168
4169 if (run_sync_thread) {
4170 // initialize the log period history. we want to do this any time we're not
4171 // running under radosgw-admin, so we check run_sync_thread here before
4172 // disabling it based on the zone/zonegroup setup
4173 meta_mgr->init_oldest_log_period();
4174 }
4175
4176 /* no point of running sync thread if we don't have a master zone configured
4177 or there is no rest_master_conn */
4178 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4179 || current_period.get_id().empty()) {
4180 run_sync_thread = false;
4181 }
4182
4183 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4184 async_rados->start();
4185
4186 ret = meta_mgr->init(current_period.get_id());
4187 if (ret < 0) {
4188 lderr(cct) << "ERROR: failed to initialize metadata log: "
4189 << cpp_strerror(-ret) << dendl;
4190 return ret;
4191 }
4192
4193 if (is_meta_master()) {
4194 auto md_log = meta_mgr->get_log(current_period.get_id());
4195 meta_notifier = new RGWMetaNotifier(this, md_log);
4196 meta_notifier->start();
4197 }
4198
4199 if (run_sync_thread) {
4200 Mutex::Locker l(meta_sync_thread_lock);
4201 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4202 ret = meta_sync_processor_thread->init();
4203 if (ret < 0) {
4204 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4205 return ret;
4206 }
4207 meta_sync_processor_thread->start();
4208
4209 Mutex::Locker dl(data_sync_thread_lock);
4210 for (auto iter : zone_data_sync_from_map) {
4211 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4212 RGWDataSyncProcessorThread *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
4213 ret = thread->init();
4214 if (ret < 0) {
4215 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4216 return ret;
4217 }
4218 thread->start();
4219 data_sync_processor_threads[iter.first] = thread;
4220 }
4221 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4222 if (interval > 0) {
4223 sync_log_trimmer = new RGWSyncLogTrimThread(this, interval);
4224 ret = sync_log_trimmer->init();
4225 if (ret < 0) {
4226 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4227 return ret;
4228 }
4229 sync_log_trimmer->start();
4230 }
4231 }
4232 data_notifier = new RGWDataNotifier(this);
4233 data_notifier->start();
4234
4235 lc = new RGWLC();
4236 lc->initialize(cct, this);
4237
4238 if (use_lc_thread)
4239 lc->start_processor();
4240
4241 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4242
4243 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4244 get_zone().bucket_index_max_shards);
4245 if (bucket_index_max_shards > MAX_BUCKET_INDEX_SHARDS_PRIME) {
4246 bucket_index_max_shards = MAX_BUCKET_INDEX_SHARDS_PRIME;
4247 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4248 << MAX_BUCKET_INDEX_SHARDS_PRIME << dendl;
4249 }
4250 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4251
4252 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4253 binfo_cache->init(this);
4254
4255 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4256
4257 if (need_tombstone_cache) {
4258 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4259 }
4260
4261 return ret;
4262}
4263
4264/**
4265 * Initialize the RADOS instance and prepare to do other ops
4266 * Returns 0 on success, -ERR# on failure.
4267 */
4268int RGWRados::initialize()
4269{
4270 int ret;
4271
4272 ret = init_rados();
4273 if (ret < 0)
4274 return ret;
4275
4276 return init_complete();
4277}
4278
4279void RGWRados::finalize_watch()
4280{
4281 for (int i = 0; i < num_watchers; i++) {
4282 RGWWatcher *watcher = watchers[i];
4283 watcher->unregister_watch();
4284 delete watcher;
4285 }
4286
4287 delete[] notify_oids;
4288 delete[] watchers;
4289}
4290
4291void RGWRados::schedule_context(Context *c) {
4292 finisher->queue(c);
4293}
4294
4295int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4296{
4297 bool is_truncated;
4298 RGWListRawObjsCtx ctx;
4299 do {
4300 list<string> oids;
4301 int r = list_raw_objects(pool, prefix, 1000,
4302 ctx, oids, &is_truncated);
4303 if (r < 0) {
4304 return r;
4305 }
4306 list<string>::iterator iter;
4307 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4308 string& val = *iter;
4309 if (val.size() > prefix.size())
4310 result.push_back(val.substr(prefix.size()));
4311 }
4312 } while (is_truncated);
4313
4314 return 0;
4315}
4316
4317int RGWRados::list_regions(list<string>& regions)
4318{
4319 RGWZoneGroup zonegroup;
4320
4321 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4322}
4323
4324int RGWRados::list_zonegroups(list<string>& zonegroups)
4325{
4326 RGWZoneGroup zonegroup;
4327
4328 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4329}
4330
4331int RGWRados::list_zones(list<string>& zones)
4332{
4333 RGWZoneParams zoneparams;
4334
4335 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4336}
4337
4338int RGWRados::list_realms(list<string>& realms)
4339{
4340 RGWRealm realm(cct, this);
4341 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4342}
4343
4344int RGWRados::list_periods(list<string>& periods)
4345{
4346 RGWPeriod period;
4347 list<string> raw_periods;
4348 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4349 if (ret < 0) {
4350 return ret;
4351 }
4352 for (const auto& oid : raw_periods) {
4353 size_t pos = oid.find(".");
4354 if (pos != std::string::npos) {
4355 periods.push_back(oid.substr(0, pos));
4356 } else {
4357 periods.push_back(oid);
4358 }
4359 }
4360 periods.sort(); // unique() only detects duplicates if they're adjacent
4361 periods.unique();
4362 return 0;
4363}
4364
4365
4366int RGWRados::list_periods(const string& current_period, list<string>& periods)
4367{
4368 int ret = 0;
4369 string period_id = current_period;
4370 while(!period_id.empty()) {
4371 RGWPeriod period(period_id);
4372 ret = period.init(cct, this);
4373 if (ret < 0) {
4374 return ret;
4375 }
4376 periods.push_back(period.get_id());
4377 period_id = period.get_predecessor();
4378 }
4379
4380 return ret;
4381}
4382
4383/**
4384 * Open the pool used as root for this gateway
4385 * Returns: 0 on success, -ERR# otherwise.
4386 */
4387int RGWRados::open_root_pool_ctx()
4388{
4389 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4390}
4391
4392int RGWRados::open_gc_pool_ctx()
4393{
4394 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4395}
4396
4397int RGWRados::open_lc_pool_ctx()
4398{
4399 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4400}
4401
4402int RGWRados::open_objexp_pool_ctx()
4403{
4404 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4405}
4406
4407int RGWRados::init_watch()
4408{
4409 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4410 if (r < 0) {
4411 return r;
4412 }
4413
4414 num_watchers = cct->_conf->rgw_num_control_oids;
4415
4416 bool compat_oid = (num_watchers == 0);
4417
4418 if (num_watchers <= 0)
4419 num_watchers = 1;
4420
4421 notify_oids = new string[num_watchers];
4422 watchers = new RGWWatcher *[num_watchers];
4423
4424 for (int i=0; i < num_watchers; i++) {
4425 string& notify_oid = notify_oids[i];
4426 notify_oid = notify_oid_prefix;
4427 if (!compat_oid) {
4428 char buf[16];
4429 snprintf(buf, sizeof(buf), ".%d", i);
4430 notify_oid.append(buf);
4431 }
4432 r = control_pool_ctx.create(notify_oid, false);
4433 if (r < 0 && r != -EEXIST)
4434 return r;
4435
4436 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4437 watchers[i] = watcher;
4438
4439 r = watcher->register_watch();
4440 if (r < 0)
4441 return r;
4442 }
4443
4444 watch_initialized = true;
4445
4446 set_cache_enabled(true);
4447
4448 return 0;
4449}
4450
4451void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4452{
4453 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4454
4455 int i = r % num_watchers;
4456 char buf[16];
4457 snprintf(buf, sizeof(buf), ".%d", i);
4458
4459 notify_oid = notify_oid_prefix;
4460 notify_oid.append(buf);
4461}
4462
4463int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4464{
4465 librados::Rados *rad = get_rados_handle();
4466 int r = rgw_init_ioctx(rad, pool, io_ctx);
4467 if (r != -ENOENT)
4468 return r;
4469
4470 if (!pools_initialized)
4471 return r;
4472
4473 r = rad->pool_create(pool.name.c_str());
4474 if (r < 0 && r != -EEXIST)
4475 return r;
4476
4477 return rgw_init_ioctx(rad, pool, io_ctx);
4478}
4479
4480void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4481 string *marker) {
4482 if (marker) {
4483 *marker = shard_id_str;
4484 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4485 marker->append(shard_marker);
4486 }
4487}
4488
4489int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4490{
4491 const string *rule = &bucket_info.placement_rule;
4492 if (rule->empty()) {
4493 rule = &zonegroup.default_placement;
4494 }
4495 auto iter = zone_params.placement_pools.find(*rule);
4496 if (iter == zone_params.placement_pools.end()) {
4497 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4498 return -EINVAL;
4499 }
4500
4501 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4502 if (r < 0)
4503 return r;
4504
4505 return 0;
4506}
4507
4508/**
4509 * set up a bucket listing.
4510 * handle is filled in.
4511 * Returns 0 on success, -ERR# otherwise.
4512 */
4513int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4514{
4515 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4516 *handle = (RGWAccessHandle)state;
4517 return 0;
4518}
4519
4520/**
4521 * get the next bucket in the listing.
4522 * obj is filled in,
4523 * handle is updated.
4524 * returns 0 on success, -ERR# otherwise.
4525 */
4526int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4527{
4528 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4529
4530 do {
4531 if (*state == root_pool_ctx.nobjects_end()) {
4532 delete state;
4533 return -ENOENT;
4534 }
4535
4536 obj.key.name = (*state)->get_oid();
4537 if (obj.key.name[0] == '_') {
4538 obj.key.name = obj.key.name.substr(1);
4539 }
4540
4541 (*state)++;
4542 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4543
4544 return 0;
4545}
4546
4547
4548/**** logs ****/
4549
4550struct log_list_state {
4551 string prefix;
4552 librados::IoCtx io_ctx;
4553 librados::NObjectIterator obit;
4554};
4555
4556int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4557{
4558 log_list_state *state = new log_list_state;
4559 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4560 if (r < 0) {
4561 delete state;
4562 return r;
4563 }
4564 state->prefix = prefix;
4565 state->obit = state->io_ctx.nobjects_begin();
4566 *handle = (RGWAccessHandle)state;
4567 return 0;
4568}
4569
4570int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4571{
4572 log_list_state *state = static_cast<log_list_state *>(handle);
4573 while (true) {
4574 if (state->obit == state->io_ctx.nobjects_end()) {
4575 delete state;
4576 return -ENOENT;
4577 }
4578 if (state->prefix.length() &&
4579 state->obit->get_oid().find(state->prefix) != 0) {
4580 state->obit++;
4581 continue;
4582 }
4583 *name = state->obit->get_oid();
4584 state->obit++;
4585 break;
4586 }
4587 return 0;
4588}
4589
4590int RGWRados::log_remove(const string& name)
4591{
4592 librados::IoCtx io_ctx;
4593 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4594 if (r < 0)
4595 return r;
4596 return io_ctx.remove(name);
4597}
4598
4599struct log_show_state {
4600 librados::IoCtx io_ctx;
4601 bufferlist bl;
4602 bufferlist::iterator p;
4603 string name;
4604 uint64_t pos;
4605 bool eof;
4606 log_show_state() : pos(0), eof(false) {}
4607};
4608
4609int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
4610{
4611 log_show_state *state = new log_show_state;
4612 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4613 if (r < 0) {
4614 delete state;
4615 return r;
4616 }
4617 state->name = name;
4618 *handle = (RGWAccessHandle)state;
4619 return 0;
4620}
4621
4622int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
4623{
4624 log_show_state *state = static_cast<log_show_state *>(handle);
4625 off_t off = state->p.get_off();
4626
4627 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
4628 << " off " << off
4629 << " eof " << (int)state->eof
4630 << dendl;
4631 // read some?
4632 unsigned chunk = 1024*1024;
4633 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
4634 bufferlist more;
4635 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
4636 if (r < 0)
4637 return r;
4638 state->pos += r;
4639 bufferlist old;
4640 try {
4641 old.substr_of(state->bl, off, state->bl.length() - off);
4642 } catch (buffer::error& err) {
4643 return -EINVAL;
4644 }
4645 state->bl.clear();
4646 state->bl.claim(old);
4647 state->bl.claim_append(more);
4648 state->p = state->bl.begin();
4649 if ((unsigned)r < chunk)
4650 state->eof = true;
4651 ldout(cct, 10) << " read " << r << dendl;
4652 }
4653
4654 if (state->p.end())
4655 return 0; // end of file
4656 try {
4657 ::decode(*entry, state->p);
4658 }
4659 catch (const buffer::error &e) {
4660 return -EINVAL;
4661 }
4662 return 1;
4663}
4664
4665/**
4666 * usage_log_hash: get usage log key hash, based on name and index
4667 *
4668 * Get the usage object name. Since a user may have more than 1
4669 * object holding that info (multiple shards), we use index to
4670 * specify that shard number. Once index exceeds max shards it
4671 * wraps.
4672 * If name is not being set, results for all users will be returned
4673 * and index will wrap only after total shards number.
4674 *
4675 * @param cct [in] ceph context
4676 * @param name [in] user name
4677 * @param hash [out] hash value
4678 * @param index [in] shard index number
4679 */
4680static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
4681{
4682 uint32_t val = index;
4683
4684 if (!name.empty()) {
4685 int max_user_shards = max(cct->_conf->rgw_usage_max_user_shards, 1);
4686 val %= max_user_shards;
4687 val += ceph_str_hash_linux(name.c_str(), name.size());
4688 }
4689 char buf[17];
4690 int max_shards = max(cct->_conf->rgw_usage_max_shards, 1);
4691 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
4692 hash = buf;
4693}
4694
4695int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
4696{
4697 uint32_t index = 0;
4698
4699 map<string, rgw_usage_log_info> log_objs;
4700
4701 string hash;
4702 string last_user;
4703
4704 /* restructure usage map, zone by object hash */
4705 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
4706 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
4707 const rgw_user_bucket& ub = iter->first;
4708 RGWUsageBatch& info = iter->second;
4709
4710 if (ub.user.empty()) {
4711 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
4712 continue;
4713 }
4714
4715 if (ub.user != last_user) {
4716 /* index *should* be random, but why waste extra cycles
4717 in most cases max user shards is not going to exceed 1,
4718 so just incrementing it */
4719 usage_log_hash(cct, ub.user, hash, index++);
4720 }
4721 last_user = ub.user;
4722 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
4723
4724 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
4725 v.push_back(miter->second);
4726 }
4727 }
4728
4729 map<string, rgw_usage_log_info>::iterator liter;
4730
4731 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
4732 int r = cls_obj_usage_log_add(liter->first, liter->second);
4733 if (r < 0)
4734 return r;
4735 }
4736 return 0;
4737}
4738
4739int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
4740 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
4741{
4742 uint32_t num = max_entries;
4743 string hash, first_hash;
4744 string user_str = user.to_str();
4745 usage_log_hash(cct, user_str, first_hash, 0);
4746
4747 if (usage_iter.index) {
4748 usage_log_hash(cct, user_str, hash, usage_iter.index);
4749 } else {
4750 hash = first_hash;
4751 }
4752
4753 usage.clear();
4754
4755 do {
4756 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
4757 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
4758
4759 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
4760 usage_iter.read_iter, ret_usage, is_truncated);
4761 if (ret == -ENOENT)
4762 goto next;
4763
4764 if (ret < 0)
4765 return ret;
4766
4767 num -= ret_usage.size();
4768
4769 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
4770 usage[iter->first].aggregate(iter->second);
4771 }
4772
4773next:
4774 if (!*is_truncated) {
4775 usage_iter.read_iter.clear();
4776 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
4777 }
4778 } while (num && !*is_truncated && hash != first_hash);
4779 return 0;
4780}
4781
4782int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
4783{
4784 uint32_t index = 0;
4785 string hash, first_hash;
4786 string user_str = user.to_str();
4787 usage_log_hash(cct, user_str, first_hash, index);
4788
4789 hash = first_hash;
4790
4791 do {
4792 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
4793 if (ret == -ENOENT)
4794 goto next;
4795
4796 if (ret < 0)
4797 return ret;
4798
4799next:
4800 usage_log_hash(cct, user_str, hash, ++index);
4801 } while (hash != first_hash);
4802
4803 return 0;
4804}
4805
4806#define MAX_SHARDS_PRIME 7877
4807
4808int RGWRados::key_to_shard_id(const string& key, int max_shards)
4809{
4810 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size()) % MAX_SHARDS_PRIME;
4811 return val % max_shards;
4812}
4813
4814void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
4815{
4816 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
4817 char buf[16];
4818 if (shard_id) {
4819 *shard_id = val % max_shards;
4820 }
4821 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
4822 name = prefix + buf;
4823}
4824
4825void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
4826{
4827 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
4828 val ^= ceph_str_hash_linux(section.c_str(), section.size());
4829 char buf[16];
4830 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
4831 name = prefix + buf;
4832}
4833
4834void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
4835{
4836 char buf[16];
4837 snprintf(buf, sizeof(buf), "%u", shard_id);
4838 name = prefix + buf;
4839
4840}
4841
4842void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
4843{
4844 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
4845}
4846
4847int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
4848{
4849 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
4850
4851}
4852
4853int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
4854{
4855 librados::IoCtx io_ctx;
4856
4857 int r = time_log_add_init(io_ctx);
4858 if (r < 0) {
4859 return r;
4860 }
4861
4862 ObjectWriteOperation op;
4863 utime_t t(ut);
4864 cls_log_add(op, t, section, key, bl);
4865
4866 return io_ctx.operate(oid, &op);
4867}
4868
4869int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
4870 librados::AioCompletion *completion, bool monotonic_inc)
4871{
4872 librados::IoCtx io_ctx;
4873
4874 int r = time_log_add_init(io_ctx);
4875 if (r < 0) {
4876 return r;
4877 }
4878
4879 ObjectWriteOperation op;
4880 cls_log_add(op, entries, monotonic_inc);
4881
4882 if (!completion) {
4883 r = io_ctx.operate(oid, &op);
4884 } else {
4885 r = io_ctx.aio_operate(oid, completion, &op);
4886 }
4887 return r;
4888}
4889
4890int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
4891 int max_entries, list<cls_log_entry>& entries,
4892 const string& marker,
4893 string *out_marker,
4894 bool *truncated)
4895{
4896 librados::IoCtx io_ctx;
4897
4898 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4899 if (r < 0)
4900 return r;
4901 librados::ObjectReadOperation op;
4902
4903 utime_t st(start_time);
4904 utime_t et(end_time);
4905
4906 cls_log_list(op, st, et, marker, max_entries, entries,
4907 out_marker, truncated);
4908
4909 bufferlist obl;
4910
4911 int ret = io_ctx.operate(oid, &op, &obl);
4912 if (ret < 0)
4913 return ret;
4914
4915 return 0;
4916}
4917
4918int RGWRados::time_log_info(const string& oid, cls_log_header *header)
4919{
4920 librados::IoCtx io_ctx;
4921
4922 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4923 if (r < 0)
4924 return r;
4925 librados::ObjectReadOperation op;
4926
4927 cls_log_info(op, header);
4928
4929 bufferlist obl;
4930
4931 int ret = io_ctx.operate(oid, &op, &obl);
4932 if (ret < 0)
4933 return ret;
4934
4935 return 0;
4936}
4937
4938int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
4939{
4940 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4941 if (r < 0)
4942 return r;
4943
4944 librados::ObjectReadOperation op;
4945
4946 cls_log_info(op, header);
4947
4948 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
4949 if (ret < 0)
4950 return ret;
4951
4952 return 0;
4953}
4954
4955int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
4956 const string& from_marker, const string& to_marker,
4957 librados::AioCompletion *completion)
4958{
4959 librados::IoCtx io_ctx;
4960
4961 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4962 if (r < 0)
4963 return r;
4964
4965 utime_t st(start_time);
4966 utime_t et(end_time);
4967
4968 ObjectWriteOperation op;
4969 cls_log_trim(op, st, et, from_marker, to_marker);
4970
4971 if (!completion) {
4972 r = io_ctx.operate(oid, &op);
4973 } else {
4974 r = io_ctx.aio_operate(oid, completion, &op);
4975 }
4976 return r;
4977}
4978
4979string RGWRados::objexp_hint_get_shardname(int shard_num)
4980{
4981 char buf[32];
4982 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
4983
4984 string objname("obj_delete_at_hint.");
4985 return objname + buf;
4986}
4987
4988#define MAX_OBJEXP_SHARDS_PRIME 7877
4989
4990int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
4991{
4992 string obj_key = key.name + key.instance;
4993 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
4994 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
4995 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
4996 sid = sid2 % MAX_OBJEXP_SHARDS_PRIME % num_shards;
4997 return sid % num_shards;
4998}
4999
5000static string objexp_hint_get_keyext(const string& tenant_name,
5001 const string& bucket_name,
5002 const string& bucket_id,
5003 const rgw_obj_key& obj_key)
5004{
5005 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5006 ":" + obj_key.name + ":" + obj_key.instance;
5007}
5008
5009int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5010 const string& tenant_name,
5011 const string& bucket_name,
5012 const string& bucket_id,
5013 const rgw_obj_index_key& obj_key)
5014{
5015 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5016 bucket_id, obj_key);
5017 objexp_hint_entry he = {
5018 .tenant = tenant_name,
5019 .bucket_name = bucket_name,
5020 .bucket_id = bucket_id,
5021 .obj_key = obj_key,
5022 .exp_time = delete_at };
5023 bufferlist hebl;
5024 ::encode(he, hebl);
5025 ObjectWriteOperation op;
5026 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5027
5028 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5029 return objexp_pool_ctx.operate(shard_name, &op);
5030}
5031
5032void RGWRados::objexp_get_shard(int shard_num,
5033 string& shard) /* out */
5034{
5035 shard = objexp_hint_get_shardname(shard_num);
5036}
5037
5038int RGWRados::objexp_hint_list(const string& oid,
5039 const ceph::real_time& start_time,
5040 const ceph::real_time& end_time,
5041 const int max_entries,
5042 const string& marker,
5043 list<cls_timeindex_entry>& entries, /* out */
5044 string *out_marker, /* out */
5045 bool *truncated) /* out */
5046{
5047 librados::ObjectReadOperation op;
5048 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5049 out_marker, truncated);
5050
5051 bufferlist obl;
5052 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5053
5054 if ((ret < 0 ) && (ret != -ENOENT)) {
5055 return ret;
5056 }
5057
5058 if ((ret == -ENOENT) && truncated) {
5059 *truncated = false;
5060 }
5061
5062 return 0;
5063}
5064
5065int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5066 objexp_hint_entry& hint_entry) /* out */
5067{
5068 try {
5069 bufferlist::iterator iter = ti_entry.value.begin();
5070 ::decode(hint_entry, iter);
5071 } catch (buffer::error& err) {
5072 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5073 }
5074
5075 return 0;
5076}
5077
5078int RGWRados::objexp_hint_trim(const string& oid,
5079 const ceph::real_time& start_time,
5080 const ceph::real_time& end_time,
5081 const string& from_marker,
5082 const string& to_marker)
5083{
5084 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5085 from_marker, to_marker);
5086 if ((ret < 0 ) && (ret != -ENOENT)) {
5087 return ret;
5088 }
5089
5090 return 0;
5091}
5092
5093int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5094 string& zone_id, string& owner_id) {
5095 librados::IoCtx io_ctx;
5096
5097 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5098 if (r < 0) {
5099 return r;
5100 }
5101 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5102 utime_t ut(msec / 1000, msec % 1000);
5103
5104 rados::cls::lock::Lock l(log_lock_name);
5105 l.set_duration(ut);
5106 l.set_cookie(owner_id);
5107 l.set_tag(zone_id);
5108 l.set_renew(true);
5109
5110 return l.lock_exclusive(&io_ctx, oid);
5111}
5112
5113int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5114 librados::IoCtx io_ctx;
5115
5116 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5117 if (r < 0) {
5118 return r;
5119 }
5120
5121 rados::cls::lock::Lock l(log_lock_name);
5122 l.set_tag(zone_id);
5123 l.set_cookie(owner_id);
5124
5125 return l.unlock(&io_ctx, oid);
5126}
5127
5128int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5129{
5130 bufferlist::iterator i = bl.begin();
5131 RGWAccessControlPolicy policy(cct);
5132 try {
5133 policy.decode_owner(i);
5134 } catch (buffer::error& err) {
5135 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5136 return -EIO;
5137 }
5138 *owner = policy.get_owner();
5139 return 0;
5140}
5141
5142int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5143{
5144 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5145 if (aiter == attrset.end())
5146 return -EIO;
5147
5148 bufferlist& bl = aiter->second;
5149 bufferlist::iterator iter = bl.begin();
5150 try {
5151 policy->decode(iter);
5152 } catch (buffer::error& err) {
5153 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5154 return -EIO;
5155 }
5156 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5157 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5158 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5159 s3policy->to_xml(*_dout);
5160 *_dout << dendl;
5161 }
5162 return 0;
5163}
5164
5165
5166/**
5167 * get listing of the objects in a bucket.
5168 *
5169 * max: maximum number of results to return
5170 * bucket: bucket to list contents of
5171 * prefix: only return results that match this prefix
5172 * delim: do not include results that match this string.
5173 * Any skipped results will have the matching portion of their name
5174 * inserted in common_prefixes with a "true" mark.
5175 * marker: if filled in, begin the listing with this object.
5176 * end_marker: if filled in, end the listing with this object.
5177 * result: the objects are put in here.
5178 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5179 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5180 */
5181int RGWRados::Bucket::List::list_objects(int max, vector<rgw_bucket_dir_entry> *result,
5182 map<string, bool> *common_prefixes,
5183 bool *is_truncated)
5184{
5185 RGWRados *store = target->get_store();
5186 CephContext *cct = store->ctx();
5187 int shard_id = target->get_shard_id();
5188
5189 int count = 0;
5190 bool truncated = true;
5191 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5192
5193 result->clear();
5194
5195 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5196
5197 rgw_obj_key end_marker_obj;
5198 rgw_obj_index_key cur_end_marker;
5199 if (!params.ns.empty()) {
5200 end_marker_obj = rgw_obj_key(params.end_marker.name, params.end_marker.instance, params.ns);
5201 end_marker_obj.ns = params.ns;
5202 end_marker_obj.get_index_key(&cur_end_marker);
5203 }
5204 rgw_obj_index_key cur_marker;
5205 marker_obj.get_index_key(&cur_marker);
5206
5207 const bool cur_end_marker_valid = !params.end_marker.empty();
5208
5209 rgw_obj_key prefix_obj(params.prefix);
5210 prefix_obj.ns = params.ns;
5211 string cur_prefix = prefix_obj.get_index_key_name();
5212
5213 string bigger_than_delim;
5214
5215 if (!params.delim.empty()) {
5216 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
5217 char buf[params.delim.size() + 16];
5218 int r = encode_utf8(val + 1, (unsigned char *)buf);
5219 if (r < 0) {
5220 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5221 return -EINVAL;
5222 }
5223 buf[r] = '\0';
5224
5225 bigger_than_delim = buf;
5226
5227 /* if marker points at a common prefix, fast forward it into its upperbound string */
5228 int delim_pos = cur_marker.name.find(params.delim, params.prefix.size());
5229 if (delim_pos >= 0) {
5230 string s = cur_marker.name.substr(0, delim_pos);
5231 s.append(bigger_than_delim);
5232 cur_marker = s;
5233 }
5234 }
5235
5236 string skip_after_delim;
5237 while (truncated && count <= max) {
5238 if (skip_after_delim > cur_marker.name) {
5239 cur_marker = skip_after_delim;
5240 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5241 }
5242 std::map<string, rgw_bucket_dir_entry> ent_map;
5243 int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
5244 read_ahead + 1 - count, params.list_versions, ent_map,
5245 &truncated, &cur_marker);
5246 if (r < 0)
5247 return r;
5248
5249 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
5250 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5251 rgw_bucket_dir_entry& entry = eiter->second;
5252 rgw_obj_index_key index_key = entry.key;
5253
5254 rgw_obj_key obj(index_key);
5255
5256 /* note that parse_raw_oid() here will not set the correct object's instance, as
5257 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5258 * not needed for the checks here and we end up using the raw entry for the return vector
5259 */
5260 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5261 if (!valid) {
5262 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5263 continue;
5264 }
5265 bool check_ns = (obj.ns == params.ns);
5266 if (!params.list_versions && !entry.is_visible()) {
5267 continue;
5268 }
5269
5270 if (params.enforce_ns && !check_ns) {
5271 if (!params.ns.empty()) {
5272 /* we've iterated past the namespace we're searching -- done now */
5273 truncated = false;
5274 goto done;
5275 }
5276
5277 /* we're not looking at the namespace this object is in, next! */
5278 continue;
5279 }
5280
5281 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5282 truncated = false;
5283 goto done;
5284 }
5285
5286 if (count < max) {
5287 params.marker = index_key;
5288 next_marker = index_key;
5289 }
5290
5291 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5292 continue;
5293
5294 if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5295 continue;
5296
5297 if (!params.delim.empty()) {
5298 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5299
5300 if (delim_pos >= 0) {
5301 string prefix_key = obj.name.substr(0, delim_pos + 1);
5302
5303 if (common_prefixes &&
5304 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5305 if (count >= max) {
5306 truncated = true;
5307 goto done;
5308 }
5309 next_marker = prefix_key;
5310 (*common_prefixes)[prefix_key] = true;
5311
5312 skip_after_delim = obj.name.substr(0, delim_pos);
5313 skip_after_delim.append(bigger_than_delim);
5314
5315 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5316
5317 count++;
5318 }
5319
5320 continue;
5321 }
5322 }
5323
5324 if (count >= max) {
5325 truncated = true;
5326 goto done;
5327 }
5328
5329 result->emplace_back(std::move(entry));
5330 count++;
5331 }
5332
5333 // Either the back-end telling us truncated, or we don't consume all
5334 // items returned per the amount caller request
5335 truncated = (truncated || eiter != ent_map.end());
5336 }
5337
5338done:
5339 if (is_truncated)
5340 *is_truncated = truncated;
5341
5342 return 0;
5343}
5344
5345/**
5346 * create a rados pool, associated meta info
5347 * returns 0 on success, -ERR# otherwise.
5348 */
5349int RGWRados::create_pool(const rgw_pool& pool)
5350{
5351 int ret = 0;
5352
5353 librados::Rados *rad = get_rados_handle();
5354 ret = rad->pool_create(pool.name.c_str(), 0);
5355 if (ret == -EEXIST)
5356 ret = 0;
5357 else if (ret == -ERANGE) {
5358 ldout(cct, 0)
5359 << __func__
5360 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
5361 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5362 << dendl;
5363 }
5364 if (ret < 0)
5365 return ret;
5366
5367 return 0;
5368}
5369
5370int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5371{
5372 librados::IoCtx index_ctx; // context for new bucket
5373
5374 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5375 if (r < 0)
5376 return r;
5377
5378 string dir_oid = dir_oid_prefix;
5379 dir_oid.append(bucket_info.bucket.bucket_id);
5380
5381 map<int, string> bucket_objs;
5382 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5383
5384 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5385}
5386
5387void RGWRados::create_bucket_id(string *bucket_id)
5388{
5389 uint64_t iid = instance_id();
5390 uint64_t bid = next_bucket_id();
5391 char buf[get_zone_params().get_id().size() + 48];
5392 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5393 *bucket_id = buf;
5394}
5395
5396/**
5397 * create a bucket with name bucket and the given list of attrs
5398 * returns 0 on success, -ERR# otherwise.
5399 */
5400int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5401 const string& zonegroup_id,
5402 const string& placement_rule,
5403 const string& swift_ver_location,
5404 const RGWQuotaInfo * pquota_info,
5405 map<std::string, bufferlist>& attrs,
5406 RGWBucketInfo& info,
5407 obj_version *pobjv,
5408 obj_version *pep_objv,
5409 real_time creation_time,
5410 rgw_bucket *pmaster_bucket,
5411 uint32_t *pmaster_num_shards,
5412 bool exclusive)
5413{
5414#define MAX_CREATE_RETRIES 20 /* need to bound retries */
5415 string selected_placement_rule_name;
5416 RGWZonePlacementInfo rule_info;
5417
5418 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5419 int ret = 0;
5420 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5421 &selected_placement_rule_name, &rule_info);
5422 if (ret < 0)
5423 return ret;
5424
5425 if (!pmaster_bucket) {
5426 create_bucket_id(&bucket.marker);
5427 bucket.bucket_id = bucket.marker;
5428 } else {
5429 bucket.marker = pmaster_bucket->marker;
5430 bucket.bucket_id = pmaster_bucket->bucket_id;
5431 }
5432
5433 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5434
5435 if (pobjv) {
5436 objv_tracker.write_version = *pobjv;
5437 } else {
5438 objv_tracker.generate_new_write_ver(cct);
5439 }
5440
5441 info.bucket = bucket;
5442 info.owner = owner.user_id;
5443 info.zonegroup = zonegroup_id;
5444 info.placement_rule = selected_placement_rule_name;
5445 info.index_type = rule_info.index_type;
5446 info.swift_ver_location = swift_ver_location;
5447 info.swift_versioning = (!swift_ver_location.empty());
5448 if (pmaster_num_shards) {
5449 info.num_shards = *pmaster_num_shards;
5450 } else {
5451 info.num_shards = bucket_index_max_shards;
5452 }
5453 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5454 info.requester_pays = false;
5455 if (real_clock::is_zero(creation_time)) {
5456 info.creation_time = ceph::real_clock::now();
5457 } else {
5458 info.creation_time = creation_time;
5459 }
5460 if (pquota_info) {
5461 info.quota = *pquota_info;
5462 }
5463
5464 int r = init_bucket_index(info, info.num_shards);
5465 if (r < 0) {
5466 return r;
5467 }
5468
5469 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
5470 if (ret == -EEXIST) {
5471 librados::IoCtx index_ctx;
5472 map<int, string> bucket_objs;
5473 int r = open_bucket_index(info, index_ctx, bucket_objs);
5474 if (r < 0)
5475 return r;
5476
5477 /* we need to reread the info and return it, caller will have a use for it */
5478 RGWObjVersionTracker instance_ver = info.objv_tracker;
5479 info.objv_tracker.clear();
5480 RGWObjectCtx obj_ctx(this);
5481 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
5482 if (r < 0) {
5483 if (r == -ENOENT) {
5484 continue;
5485 }
5486 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
5487 return r;
5488 }
5489
5490 /* only remove it if it's a different bucket instance */
5491 if (info.bucket.bucket_id != bucket.bucket_id) {
5492 /* remove bucket meta instance */
5493 string entry = bucket.get_key();
5494 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
5495 if (r < 0)
5496 return r;
5497
5498 map<int, string>::const_iterator biter;
5499 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
5500 // Do best effort removal
5501 index_ctx.remove(biter->second);
5502 }
5503 }
5504 /* ret == -ENOENT here */
5505 }
5506 return ret;
5507 }
5508
5509 /* this is highly unlikely */
5510 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
5511 return -ENOENT;
5512}
5513
5514int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5515 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5516
5517{
5518 /* first check that rule exists within the specific zonegroup */
5519 RGWZoneGroup zonegroup;
5520 int ret = get_zonegroup(zonegroup_id, zonegroup);
5521 if (ret < 0) {
5522 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5523 return ret;
5524 }
5525
5526 /* now check that tag exists within zonegroup */
5527 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5528 string rule = request_rule;
5529 if (rule.empty()) {
5530 rule = user_info.default_placement;
5531 if (rule.empty())
5532 rule = zonegroup.default_placement;
5533 }
5534
5535 if (rule.empty()) {
5536 ldout(cct, 0) << "misconfiguration, should not have an empty placement rule name" << dendl;
5537 return -EIO;
5538 }
5539
5540 map<string, RGWZoneGroupPlacementTarget>::iterator titer = zonegroup.placement_targets.find(rule);
5541 if (titer == zonegroup.placement_targets.end()) {
5542 ldout(cct, 0) << "could not find placement rule " << rule << " within zonegroup " << dendl;
5543 return -EINVAL;
5544 }
5545
5546 /* now check tag for the rule, whether user is permitted to use rule */
5547 RGWZoneGroupPlacementTarget& target_rule = titer->second;
5548 if (!target_rule.user_permitted(user_info.placement_tags)) {
5549 ldout(cct, 0) << "user not permitted to use placement rule" << dendl;
5550 return -EPERM;
5551 }
5552
5553 if (pselected_rule_name)
5554 *pselected_rule_name = rule;
5555
5556 return select_bucket_location_by_rule(rule, rule_info);
5557}
5558
5559int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5560{
5561 if (location_rule.empty()) {
5562 /* we can only reach here if we're trying to set a bucket location from a bucket
5563 * created on a different zone, using a legacy / default pool configuration
5564 */
5565 return select_legacy_bucket_placement(rule_info);
5566 }
5567
5568 /*
5569 * make sure that zone has this rule configured. We're
5570 * checking it for the local zone, because that's where this bucket object is going to
5571 * reside.
5572 */
5573 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
5574 if (piter == get_zone_params().placement_pools.end()) {
5575 /* couldn't find, means we cannot really place data for this bucket in this zone */
5576 if (get_zonegroup().equals(zonegroup_id)) {
5577 /* that's a configuration error, zone should have that rule, as we're within the requested
5578 * zonegroup */
5579 return -EINVAL;
5580 } else {
5581 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5582 return 0;
5583 }
5584 }
5585
5586 RGWZonePlacementInfo& placement_info = piter->second;
5587
5588 if (rule_info) {
5589 *rule_info = placement_info;
5590 }
5591
5592 return 0;
5593}
5594
5595int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
5596 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5597{
5598 if (!get_zone_params().placement_pools.empty()) {
5599 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
5600 pselected_rule_name, rule_info);
5601 }
5602
5603 if (pselected_rule_name) {
5604 pselected_rule_name->clear();
5605 }
5606
5607 return select_legacy_bucket_placement(rule_info);
5608}
5609
5610int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
5611{
5612 bufferlist map_bl;
5613 map<string, bufferlist> m;
5614 string pool_name;
5615 bool write_map = false;
5616
5617 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5618
5619 RGWObjectCtx obj_ctx(this);
5620 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
5621 if (ret < 0) {
5622 goto read_omap;
5623 }
5624
5625 try {
5626 bufferlist::iterator iter = map_bl.begin();
5627 ::decode(m, iter);
5628 } catch (buffer::error& err) {
5629 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5630 }
5631
5632read_omap:
5633 if (m.empty()) {
5634 bufferlist header;
5635 ret = omap_get_all(obj, header, m);
5636
5637 write_map = true;
5638 }
5639
5640 if (ret < 0 || m.empty()) {
5641 vector<rgw_pool> pools;
5642 string s = string("default.") + default_storage_pool_suffix;
5643 pools.push_back(rgw_pool(s));
5644 vector<int> retcodes;
5645 bufferlist bl;
5646 ret = create_pools(pools, retcodes);
5647 if (ret < 0)
5648 return ret;
5649 ret = omap_set(obj, s, bl);
5650 if (ret < 0)
5651 return ret;
5652 m[s] = bl;
5653 }
5654
5655 if (write_map) {
5656 bufferlist new_bl;
5657 ::encode(m, new_bl);
5658 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
5659 if (ret < 0) {
5660 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
5661 }
5662 }
5663
5664 map<string, bufferlist>::iterator miter;
5665 if (m.size() > 1) {
5666 vector<string> v;
5667 for (miter = m.begin(); miter != m.end(); ++miter) {
5668 v.push_back(miter->first);
5669 }
5670
5671 uint32_t r;
5672 ret = get_random_bytes((char *)&r, sizeof(r));
5673 if (ret < 0)
5674 return ret;
5675
5676 int i = r % v.size();
5677 pool_name = v[i];
5678 } else {
5679 miter = m.begin();
5680 pool_name = miter->first;
5681 }
5682
5683 rule_info->data_pool = pool_name;
5684 rule_info->data_extra_pool = pool_name;
5685 rule_info->index_pool = pool_name;
5686 rule_info->index_type = RGWBIType_Normal;
5687
5688 return 0;
5689}
5690
5691bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
5692{
5693 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
5694}
5695
5696bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
5697{
5698 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
5699
5700 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
5701}
5702
5703int RGWRados::update_placement_map()
5704{
5705 bufferlist header;
5706 map<string, bufferlist> m;
5707 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5708 int ret = omap_get_all(obj, header, m);
5709 if (ret < 0)
5710 return ret;
5711
5712 bufferlist new_bl;
5713 ::encode(m, new_bl);
5714 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
5715 if (ret < 0) {
5716 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
5717 }
5718
5719 return ret;
5720}
5721
5722int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
5723{
5724 librados::Rados *rad = get_rados_handle();
5725 int ret = rad->pool_lookup(new_pool.name.c_str());
5726 if (ret < 0) // DNE, or something
5727 return ret;
5728
5729 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5730 bufferlist empty_bl;
5731 ret = omap_set(obj, new_pool.to_str(), empty_bl);
5732
5733 // don't care about return value
5734 update_placement_map();
5735
5736 return ret;
5737}
5738
5739int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
5740{
5741 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5742 int ret = omap_del(obj, old_pool.to_str());
5743
5744 // don't care about return value
5745 update_placement_map();
5746
5747 return ret;
5748}
5749
5750int RGWRados::list_placement_set(set<rgw_pool>& names)
5751{
5752 bufferlist header;
5753 map<string, bufferlist> m;
5754
5755 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5756 int ret = omap_get_all(obj, header, m);
5757 if (ret < 0)
5758 return ret;
5759
5760 names.clear();
5761 map<string, bufferlist>::iterator miter;
5762 for (miter = m.begin(); miter != m.end(); ++miter) {
5763 names.insert(rgw_pool(miter->first));
5764 }
5765
5766 return names.size();
5767}
5768
5769int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
5770{
5771 vector<librados::PoolAsyncCompletion *> completions;
5772 vector<int> rets;
5773
5774 librados::Rados *rad = get_rados_handle();
5775 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
5776 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
5777 completions.push_back(c);
5778 rgw_pool& pool = *iter;
5779 int ret = rad->pool_create_async(pool.name.c_str(), c);
5780 rets.push_back(ret);
5781 }
5782
5783 vector<int>::iterator riter;
5784 vector<librados::PoolAsyncCompletion *>::iterator citer;
5785
5786 assert(rets.size() == completions.size());
5787 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
5788 int r = *riter;
5789 PoolAsyncCompletion *c = *citer;
5790 if (r == 0) {
5791 c->wait();
5792 r = c->get_return_value();
5793 if (r < 0) {
5794 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
5795 }
5796 }
5797 c->release();
5798 retcodes.push_back(r);
5799 }
5800 return 0;
5801}
5802
5803int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
5804{
5805 string oid, key;
5806 get_obj_bucket_and_oid_loc(obj, oid, key);
5807
5808 rgw_pool pool;
5809 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
5810 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
5811 return -EIO;
5812 }
5813
5814 int r = open_pool_ctx(pool, *ioctx);
5815 if (r < 0) {
5816 return r;
5817 }
5818
5819 ioctx->locator_set_key(key);
5820
5821 return 0;
5822}
5823
5824int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
5825{
5826 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
5827
5828 rgw_pool pool;
5829 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
5830 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
5831 return -EIO;
5832 }
5833
5834 int r = open_pool_ctx(pool, ref->ioctx);
5835 if (r < 0) {
5836 return r;
5837 }
5838
5839 ref->ioctx.locator_set_key(ref->key);
5840
5841 return 0;
5842}
5843
5844int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref, rgw_pool *pool)
5845{
5846 ref->oid = obj.oid;
5847 ref->key = obj.loc;
5848
5849 int r;
5850
5851 if (ref->oid.empty()) {
5852 ref->oid = obj.pool.to_str();
5853 ref->pool = get_zone_params().domain_root;
5854 } else {
5855 ref->pool = obj.pool;
5856 }
5857 if (pool) {
5858 *pool = ref->pool;
5859 }
5860 r = open_pool_ctx(ref->pool, ref->ioctx);
5861 if (r < 0)
5862 return r;
5863
5864 ref->ioctx.locator_set_key(ref->key);
5865
5866 return 0;
5867}
5868
5869int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref, rgw_pool *pool)
5870{
5871 return get_raw_obj_ref(obj, ref, pool);
5872}
5873
5874/*
5875 * fixes an issue where head objects were supposed to have a locator created, but ended
5876 * up without one
5877 */
5878int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
5879{
5880 const rgw_bucket& bucket = bucket_info.bucket;
5881 string oid;
5882 string locator;
5883
5884 rgw_obj obj(bucket, key);
5885
5886 get_obj_bucket_and_oid_loc(obj, oid, locator);
5887
5888 if (locator.empty()) {
5889 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
5890 return 0;
5891 }
5892
5893 librados::IoCtx ioctx;
5894
5895 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
5896 if (ret < 0) {
5897 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
5898 return ret;
5899 }
5900 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
5901
5902 uint64_t size;
5903 bufferlist data;
5904
5905 struct timespec mtime_ts;
5906 map<string, bufferlist> attrs;
5907 librados::ObjectReadOperation op;
5908 op.getxattrs(&attrs, NULL);
5909 op.stat2(&size, &mtime_ts, NULL);
5910#define HEAD_SIZE 512 * 1024
5911 op.read(0, HEAD_SIZE, &data, NULL);
5912
5913 ret = ioctx.operate(oid, &op, NULL);
5914 if (ret < 0) {
5915 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
5916 return ret;
5917 }
5918
5919 if (size > HEAD_SIZE) {
5920 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
5921 return -EIO;
5922 }
5923
5924 if (size != data.length()) {
5925 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
5926 return -EIO;
5927 }
5928
5929 if (copy_obj) {
5930 librados::ObjectWriteOperation wop;
5931
5932 wop.mtime2(&mtime_ts);
5933
5934 map<string, bufferlist>::iterator iter;
5935 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5936 wop.setxattr(iter->first.c_str(), iter->second);
5937 }
5938
5939 wop.write(0, data);
5940
5941 ioctx.locator_set_key(locator);
5942 ioctx.operate(oid, &wop);
5943 }
5944
5945 if (remove_bad) {
5946 ioctx.locator_set_key(string());
5947
5948 ret = ioctx.remove(oid);
5949 if (ret < 0) {
5950 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
5951 return ret;
5952 }
5953 }
5954
5955 return 0;
5956}
5957
5958int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
5959 const string& src_oid, const string& src_locator,
5960 librados::IoCtx& dst_ioctx,
5961 const string& dst_oid, const string& dst_locator)
5962{
5963
5964#define COPY_BUF_SIZE (4 * 1024 * 1024)
5965 bool done = false;
5966 uint64_t chunk_size = COPY_BUF_SIZE;
5967 uint64_t ofs = 0;
5968 int ret = 0;
5969 real_time mtime;
5970 struct timespec mtime_ts;
5971 uint64_t size;
5972
5973 if (src_oid == dst_oid && src_locator == dst_locator) {
5974 return 0;
5975 }
5976
5977 src_ioctx.locator_set_key(src_locator);
5978 dst_ioctx.locator_set_key(dst_locator);
5979
5980 do {
5981 bufferlist data;
5982 ObjectReadOperation rop;
5983 ObjectWriteOperation wop;
5984
5985 if (ofs == 0) {
5986 rop.stat2(&size, &mtime_ts, NULL);
5987 mtime = real_clock::from_timespec(mtime_ts);
5988 }
5989 rop.read(ofs, chunk_size, &data, NULL);
5990 ret = src_ioctx.operate(src_oid, &rop, NULL);
5991 if (ret < 0) {
5992 goto done_err;
5993 }
5994
5995 if (data.length() == 0) {
5996 break;
5997 }
5998
5999 if (ofs == 0) {
6000 wop.create(true); /* make it exclusive */
6001 wop.mtime2(&mtime_ts);
6002 mtime = real_clock::from_timespec(mtime_ts);
6003 }
6004 wop.write(ofs, data);
6005 ret = dst_ioctx.operate(dst_oid, &wop);
6006 ofs += data.length();
6007 done = data.length() != chunk_size;
6008 } while (!done);
6009
6010 if (ofs != size) {
6011 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6012 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6013 ret = -EIO;
6014 goto done_err;
6015 }
6016
6017 src_ioctx.remove(src_oid);
6018
6019 return 0;
6020
6021done_err:
6022 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6023 return ret;
6024}
6025
6026/*
6027 * fixes an issue where head objects were supposed to have a locator created, but ended
6028 * up without one
6029 */
6030int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6031{
6032 const rgw_bucket& bucket = bucket_info.bucket;
6033 rgw_obj obj(bucket, key);
6034
6035 if (need_fix) {
6036 *need_fix = false;
6037 }
6038
6039 rgw_rados_ref ref;
6040 int r = get_obj_head_ref(bucket_info, obj, &ref);
6041 if (r < 0) {
6042 return r;
6043 }
6044
6045 RGWObjState *astate = NULL;
6046 RGWObjectCtx rctx(this);
6047 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6048 if (r < 0)
6049 return r;
6050
6051 if (astate->has_manifest) {
6052 RGWObjManifest::obj_iterator miter;
6053 RGWObjManifest& manifest = astate->manifest;
6054 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6055 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6056 rgw_obj loc;
6057 string oid;
6058 string locator;
6059
6060 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6061
6062 if (loc.key.ns.empty()) {
6063 /* continue, we're only interested in tail objects */
6064 continue;
6065 }
6066
6067 get_obj_bucket_and_oid_loc(loc, oid, locator);
6068 ref.ioctx.locator_set_key(locator);
6069
6070 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6071
6072 r = ref.ioctx.stat(oid, NULL, NULL);
6073 if (r != -ENOENT) {
6074 continue;
6075 }
6076
6077 string bad_loc;
6078 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6079
6080 /* create a new ioctx with the bad locator */
6081 librados::IoCtx src_ioctx;
6082 src_ioctx.dup(ref.ioctx);
6083 src_ioctx.locator_set_key(bad_loc);
6084
6085 r = src_ioctx.stat(oid, NULL, NULL);
6086 if (r != 0) {
6087 /* cannot find a broken part */
6088 continue;
6089 }
6090 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6091 if (need_fix) {
6092 *need_fix = true;
6093 }
6094 if (fix) {
6095 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6096 if (r < 0) {
6097 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6098 }
6099 }
6100 }
6101 }
6102
6103 return 0;
6104}
6105
6106int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6107{
6108 bucket = _bucket;
6109
6110 RGWObjectCtx obj_ctx(store);
6111
6112 RGWBucketInfo bucket_info;
6113 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6114 if (ret < 0) {
6115 return ret;
6116 }
6117
6118 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6119 if (ret < 0) {
6120 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6121 return ret;
6122 }
6123 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6124
6125 return 0;
6126}
6127
6128int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6129{
6130 bucket = _bucket;
6131 shard_id = sid;
6132
6133 RGWObjectCtx obj_ctx(store);
6134
6135 RGWBucketInfo bucket_info;
6136 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6137 if (ret < 0) {
6138 return ret;
6139 }
6140
6141 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6142 if (ret < 0) {
6143 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6144 return ret;
6145 }
6146 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6147
6148 return 0;
6149}
6150
6151
6152/* Execute @handler on last item in bucket listing for bucket specified
6153 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6154 * to objects matching these criterias. */
6155int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6156 const std::string& obj_prefix,
6157 const std::string& obj_delim,
6158 std::function<int(const rgw_bucket_dir_entry&)> handler)
6159{
6160 RGWRados::Bucket target(this, bucket_info);
6161 RGWRados::Bucket::List list_op(&target);
6162
6163 list_op.params.prefix = obj_prefix;
6164 list_op.params.delim = obj_delim;
6165
6166 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6167 << ", obj_prefix=" << obj_prefix
6168 << ", obj_delim=" << obj_delim
6169 << dendl;
6170
6171 bool is_truncated = false;
6172
6173 boost::optional<rgw_bucket_dir_entry> last_entry;
6174 /* We need to rewind to the last object in a listing. */
6175 do {
6176 /* List bucket entries in chunks. */
6177 static constexpr int MAX_LIST_OBJS = 100;
6178 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6179
6180 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6181 &is_truncated);
6182 if (ret < 0) {
6183 return ret;
6184 } else if (!entries.empty()) {
6185 last_entry = entries.back();
6186 }
6187 } while (is_truncated);
6188
6189 if (last_entry) {
6190 return handler(*last_entry);
6191 }
6192
6193 /* Empty listing - no items we can run handler on. */
6194 return 0;
6195}
6196
6197
6198int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6199 const rgw_user& user,
6200 RGWBucketInfo& bucket_info,
6201 rgw_obj& obj)
6202{
6203 if (! swift_versioning_enabled(bucket_info)) {
6204 return 0;
6205 }
6206
6207 obj_ctx.obj.set_atomic(obj);
6208
6209 RGWObjState * state = nullptr;
6210 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6211 if (r < 0) {
6212 return r;
6213 }
6214
6215 if (!state->exists) {
6216 return 0;
6217 }
6218
6219 string client_id;
6220 string op_id;
6221
6222 const string& src_name = obj.get_oid();
6223 char buf[src_name.size() + 32];
6224 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6225 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6226 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6227
6228 RGWBucketInfo dest_bucket_info;
6229
6230 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6231 if (r < 0) {
6232 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6233 if (r == -ENOENT) {
6234 return -ERR_PRECONDITION_FAILED;
6235 }
6236 return r;
6237 }
6238
6239 if (dest_bucket_info.owner != bucket_info.owner) {
6240 return -ERR_PRECONDITION_FAILED;
6241 }
6242
6243 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6244 obj_ctx.obj.set_atomic(dest_obj);
6245
6246 string no_zone;
6247
6248 r = copy_obj(obj_ctx,
6249 user,
6250 client_id,
6251 op_id,
6252 NULL, /* req_info *info */
6253 no_zone,
6254 dest_obj,
6255 obj,
6256 dest_bucket_info,
6257 bucket_info,
6258 NULL, /* time_t *src_mtime */
6259 NULL, /* time_t *mtime */
6260 NULL, /* const time_t *mod_ptr */
6261 NULL, /* const time_t *unmod_ptr */
6262 false, /* bool high_precision_time */
6263 NULL, /* const char *if_match */
6264 NULL, /* const char *if_nomatch */
6265 RGWRados::ATTRSMOD_NONE,
6266 true, /* bool copy_if_newer */
6267 state->attrset,
6268 RGW_OBJ_CATEGORY_MAIN,
6269 0, /* uint64_t olh_epoch */
6270 real_time(), /* time_t delete_at */
6271 NULL, /* string *version_id */
6272 NULL, /* string *ptag */
6273 NULL, /* string *petag */
6274 NULL, /* struct rgw_err *err */
6275 NULL, /* void (*progress_cb)(off_t, void *) */
6276 NULL); /* void *progress_data */
6277 if (r == -ECANCELED || r == -ENOENT) {
6278 /* Has already been overwritten, meaning another rgw process already
6279 * copied it out */
6280 return 0;
6281 }
6282
6283 return r;
6284}
6285
6286int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6287 const rgw_user& user,
6288 RGWBucketInfo& bucket_info,
6289 rgw_obj& obj,
6290 bool& restored) /* out */
6291{
6292 if (! swift_versioning_enabled(bucket_info)) {
6293 return 0;
6294 }
6295
6296 /* Bucket info of the bucket that stores previous versions of our object. */
6297 RGWBucketInfo archive_binfo;
6298
6299 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6300 bucket_info.swift_ver_location, archive_binfo,
6301 nullptr, nullptr);
6302 if (ret < 0) {
6303 return ret;
6304 }
6305
6306 /* Abort the operation if the bucket storing our archive belongs to someone
6307 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6308 * into consideration. For we can live with that.
6309 *
6310 * TODO: delegate this check to un upper layer and compare with ACLs. */
6311 if (bucket_info.owner != archive_binfo.owner) {
6312 return -EPERM;
6313 }
6314
6315 /* This code will be executed on latest version of the object. */
6316 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6317 std::string no_client_id;
6318 std::string no_op_id;
6319 std::string no_zone;
6320
6321 /* We don't support object versioning of Swift API on those buckets that
6322 * are already versioned using the S3 mechanism. This affects also bucket
6323 * storing archived objects. Otherwise the delete operation would create
6324 * a deletion marker. */
6325 if (archive_binfo.versioned()) {
6326 restored = false;
6327 return -ERR_PRECONDITION_FAILED;
6328 }
6329
6330 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6331 * irrelevant and may be safely skipped. */
6332 std::map<std::string, ceph::bufferlist> no_attrs;
6333
6334 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6335 obj_ctx.obj.set_atomic(archive_obj);
6336 obj_ctx.obj.set_atomic(obj);
6337
6338 int ret = copy_obj(obj_ctx,
6339 user,
6340 no_client_id,
6341 no_op_id,
6342 nullptr, /* req_info *info */
6343 no_zone,
6344 obj, /* dest obj */
6345 archive_obj, /* src obj */
6346 bucket_info, /* dest bucket info */
6347 archive_binfo, /* src bucket info */
6348 nullptr, /* time_t *src_mtime */
6349 nullptr, /* time_t *mtime */
6350 nullptr, /* const time_t *mod_ptr */
6351 nullptr, /* const time_t *unmod_ptr */
6352 false, /* bool high_precision_time */
6353 nullptr, /* const char *if_match */
6354 nullptr, /* const char *if_nomatch */
6355 RGWRados::ATTRSMOD_NONE,
6356 true, /* bool copy_if_newer */
6357 no_attrs,
6358 RGW_OBJ_CATEGORY_MAIN,
6359 0, /* uint64_t olh_epoch */
6360 real_time(), /* time_t delete_at */
6361 nullptr, /* string *version_id */
6362 nullptr, /* string *ptag */
6363 nullptr, /* string *petag */
6364 nullptr, /* struct rgw_err *err */
6365 nullptr, /* void (*progress_cb)(off_t, void *) */
6366 nullptr); /* void *progress_data */
6367 if (ret == -ECANCELED || ret == -ENOENT) {
6368 /* Has already been overwritten, meaning another rgw process already
6369 * copied it out */
6370 return 0;
6371 } else if (ret < 0) {
6372 return ret;
6373 } else {
6374 restored = true;
6375 }
6376
6377 /* Need to remove the archived copy. */
6378 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6379 archive_binfo.versioning_status());
6380
6381 return ret;
6382 };
6383
6384 const std::string& obj_name = obj.get_oid();
6385 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6386 % obj_name);
6387
6388 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6389 handler);
6390}
6391
6392/**
6393 * Write/overwrite an object to the bucket storage.
6394 * bucket: the bucket to store the object in
6395 * obj: the object name/key
6396 * data: the object contents/value
6397 * size: the amount of data to write (data must be this long)
6398 * accounted_size: original size of data before compression, encryption
6399 * mtime: if non-NULL, writes the given mtime to the bucket storage
6400 * attrs: all the given attrs are written to bucket storage for the given object
6401 * exclusive: create object exclusively
6402 * Returns: 0 on success, -ERR# otherwise.
6403 */
// Assemble one ObjectWriteOperation for the object's head (the ref returned
// by get_obj_head_ref()), apply it, and then complete or cancel the bucket
// index operation handed in through _index_op.
//
//   size, accounted_size: forwarded to index_op->complete(); accounted_size
//                  is also what the quota cache update receives.
//   attrs:         every non-empty entry is set as an xattr. NOTE: the map is
//                  modified - an RGW_ATTR_MANIFEST entry is erased when
//                  meta.manifest is set.
//   assume_noent:  forwarded to target->get_state(); when true, an -EEXIST
//                  from the write is returned as-is (cached state is
//                  invalidated first) instead of going through done_cancel.
//   _index_op:     must point to an RGWRados::Bucket::UpdateIndex (see the
//                  static_cast below).
//
// Returns 0 on success (or when op.size() reports nothing queued), otherwise
// a negative error. Errors that reach done_cancel may be remapped, see there.
6404int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6405 map<string, bufferlist>& attrs, bool assume_noent,
6406 void *_index_op)
6407{
6408 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6409 rgw_pool pool;
6410 rgw_rados_ref ref;
6411 RGWRados *store = target->get_store();
6412
6413 ObjectWriteOperation op;
6414
6415 RGWObjState *state;
6416 int r = target->get_state(&state, false, assume_noent);
6417 if (r < 0)
6418 return r;
6419
6420 rgw_obj& obj = target->get_obj();
6421
6422 if (obj.get_oid().empty()) {
6423 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6424 return -EIO;
6425 }
6426
6427 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6428 if (r < 0)
6429 return r;
6430
// Snapshot is_olh now: `state` is set to NULL further down, before the
// versioned_op decision made from this flag is acted upon.
6431 bool is_olh = state->is_olh;
6432
6433 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6434
// Use the caller-supplied tag if there is one, otherwise fall back to the
// index op's tag when that one is non-empty.
6435 const string *ptag = meta.ptag;
6436 if (!ptag && !index_op->get_optag()->empty()) {
6437 ptag = index_op->get_optag();
6438 }
6439 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false);
6440 if (r < 0)
6441 return r;
6442
// No explicit mtime requested: stamp the write with the current time. The
// chosen value is stored back into meta.set_mtime and reused below.
6443 if (real_clock::is_zero(meta.set_mtime)) {
6444 meta.set_mtime = real_clock::now();
6445 }
6446
6447 if (state->is_olh) {
6448 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6449 }
6450
6451 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6452 op.mtime2(&mtime_ts);
6453
// NOTE(review): the existing comment below talks about removing the object,
// but the code only queues write_full(); any removal would have to come from
// prepare_atomic_modification() above - confirm.
6454 if (meta.data) {
6455 /* if we want to overwrite the data, we also want to overwrite the
6456 xattrs, so just remove the object */
6457 op.write_full(*meta.data);
6458 }
6459
// Values picked out of attrs in the loop below; they are handed to
// index_op->complete() once the write has been applied.
6460 string etag;
6461 string content_type;
6462 bufferlist acl_bl;
6463
6464 map<string, bufferlist>::iterator iter;
6465 if (meta.rmattrs) {
6466 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6467 const string& name = iter->first;
6468 op.rmxattr(name.c_str());
6469 }
6470 }
6471
// When a manifest is supplied it takes precedence: drop any manifest entry
// from attrs so the loop below does not set the same xattr a second time.
6472 if (meta.manifest) {
6473 /* remove existing manifest attr */
6474 iter = attrs.find(RGW_ATTR_MANIFEST);
6475 if (iter != attrs.end())
6476 attrs.erase(iter);
6477
6478 bufferlist bl;
6479 ::encode(*meta.manifest, bl);
6480 op.setxattr(RGW_ATTR_MANIFEST, bl);
6481 }
6482
// Queue every non-empty attribute; empty bufferlists are skipped entirely.
6483 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6484 const string& name = iter->first;
6485 bufferlist& bl = iter->second;
6486
6487 if (!bl.length())
6488 continue;
6489
6490 op.setxattr(name.c_str(), bl);
6491
// NOTE(review): these assignments read bl.c_str() as a C string, i.e. up to
// the first NUL byte - presumably the stored values are NUL-terminated; confirm.
6492 if (name.compare(RGW_ATTR_ETAG) == 0) {
6493 etag = bl.c_str();
6494 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6495 content_type = bl.c_str();
6496 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6497 acl_bl = bl;
6498 }
6499 }
// Only when the caller did not pass these attributes explicitly: ask the
// rgw object class to store the PG version, and record this zone's short id
// as the source zone.
6500 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6501 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6502 }
6503
6504 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6505 bufferlist bl;
6506 ::encode(store->get_zone_short_id(), bl);
6507 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6508 }
6509
6510 if (!op.size())
6511 return 0;
6512
6513 uint64_t epoch;
6514 int64_t poolid;
6515
// Captured here because `state` is invalidated before the quota update at
// the end of the function needs these values.
6516 bool orig_exists = state->exists;
6517 uint64_t orig_size = state->accounted_size;
6518
6519 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
6520
6521 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
6522
6523 if (versioned_op) {
6524 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
6525 }
6526
// The index op may already have been prepared by an earlier attempt
// (write_meta() can call this function twice with the same index op).
6527 if (!index_op->is_prepared()) {
6528 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
6529 if (r < 0)
6530 return r;
6531 }
6532
6533 r = ref.ioctx.operate(ref.oid, &op);
6534 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
6535 or -ENOENT if was removed, or -EEXIST if it did not exist
6536 before and now it does */
// With assume_noent the -EEXIST is handed back untouched and the index op is
// NOT cancelled, so the caller can retry with the same (prepared) index op.
6537 if (r == -EEXIST && assume_noent) {
6538 target->invalidate_state();
6539 return r;
6540 }
6541 goto done_cancel;
6542 }
6543
6544 epoch = ref.ioctx.get_last_version();
6545 poolid = ref.ioctx.get_id();
6546
// A failure here is only logged; r is overwritten by the complete() call below.
6547 r = target->complete_atomic_modification();
6548 if (r < 0) {
6549 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
6550 }
6551
6552 r = index_op->complete(poolid, epoch, size, accounted_size,
6553 meta.set_mtime, etag, content_type, &acl_bl,
6554 meta.category, meta.remove_objs, meta.user_data);
6555 if (r < 0)
6556 goto done_cancel;
6557
// Report the mtime that was actually written back to the caller.
6558 if (meta.mtime) {
6559 *meta.mtime = meta.set_mtime;
6560 }
6561
6562 /* note that index_op was using state so we couldn't invalidate it earlier */
6563 target->invalidate_state();
6564 state = NULL;
6565
// NOTE: a set_olh() failure returns directly; meta.canceled is not touched
// and the quota cache is not updated on this path.
6566 if (versioned_op) {
6567 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false);
6568 if (r < 0) {
6569 return r;
6570 }
6571 }
6572
6573 if (!real_clock::is_zero(meta.delete_at)) {
6574 rgw_obj_index_key obj_key;
6575 obj.key.get_index_key(&obj_key);
6576
6577 r = store->objexp_hint_add(meta.delete_at,
6578 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
6579 if (r < 0) {
6580 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
6581 /* ignoring error, nothing we can do at this point */
6582 }
6583 }
6584 meta.canceled = false;
6585
// An object that did not exist before counts as one new object (1),
// an overwrite as zero (0).
6586 /* update quota cache */
6587 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
6588 accounted_size, orig_size);
6589 return 0;
6590
// Failure path: cancel the index op (a cancel failure is only logged), flag
// the write as canceled, then translate `r` according to the preconditions.
6591done_cancel:
6592 int ret = index_op->cancel();
6593 if (ret < 0) {
6594 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
6595 }
6596
6597 meta.canceled = true;
6598
6599 /* we lost in a race. There are a few options:
6600 * - existing object was rewritten (ECANCELED)
6601 * - non existing object was created (EEXIST)
6602 * - object was removed (ENOENT)
6603 * should treat it as a success
6604 */
// Without preconditions all three race errors become success. With
// if_match/if_nomatch, only the "*" wildcard is remapped below; any other
// precondition value leaves r unchanged.
6605 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
6606 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
6607 r = 0;
6608 }
6609 } else {
6610 if (meta.if_match != NULL) {
6611 // only overwrite existing object
6612 if (strcmp(meta.if_match, "*") == 0) {
6613 if (r == -ENOENT) {
6614 r = -ERR_PRECONDITION_FAILED;
6615 } else if (r == -ECANCELED) {
6616 r = 0;
6617 }
6618 }
6619 }
6620
6621 if (meta.if_nomatch != NULL) {
6622 // only create a new object
6623 if (strcmp(meta.if_nomatch, "*") == 0) {
6624 if (r == -EEXIST) {
6625 r = -ERR_PRECONDITION_FAILED;
6626 } else if (r == -ENOENT) {
6627 r = 0;
6628 }
6629 }
6630 }
6631 }
6632
6633 return r;
6634}
6635
6636int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
6637 map<string, bufferlist>& attrs)
6638{
6639 RGWBucketInfo& bucket_info = target->get_bucket_info();
6640
6641 RGWRados::Bucket bop(target->get_store(), bucket_info);
6642 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
6643
6644 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
6645 int r;
6646 if (assume_noent) {
6647 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
6648 if (r == -EEXIST) {
6649 assume_noent = false;
6650 }
6651 }
6652 if (!assume_noent) {
6653 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
6654 }
6655 return r;
6656}
6657
6658/** Write/overwrite a system object. */
6659int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
6660 map<std::string, bufferlist>& attrs, int flags,
6661 bufferlist& data,
6662 RGWObjVersionTracker *objv_tracker,
6663 real_time set_mtime /* 0 for don't set */)
6664{
6665 rgw_pool pool;
6666 rgw_rados_ref ref;
6667 int r = get_system_obj_ref(obj, &ref, &pool);
6668 if (r < 0)
6669 return r;
6670
6671 ObjectWriteOperation op;
6672
6673 if (flags & PUT_OBJ_EXCL) {
6674 if (!(flags & PUT_OBJ_CREATE))
6675 return -EINVAL;
6676 op.create(true); // exclusive create
6677 } else {
6678 op.remove();
6679 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
6680 op.create(false);
6681 }
6682
6683 if (objv_tracker) {
6684 objv_tracker->prepare_op_for_write(&op);
6685 }
6686
6687 if (real_clock::is_zero(set_mtime)) {
6688 set_mtime = real_clock::now();
6689 }
6690
6691 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
6692 op.mtime2(&mtime_ts);
6693 op.write_full(data);
6694
6695 bufferlist acl_bl;
6696
6697 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
6698 const string& name = iter->first;
6699 bufferlist& bl = iter->second;
6700
6701 if (!bl.length())
6702 continue;
6703
6704 op.setxattr(name.c_str(), bl);
6705 }
6706
6707 r = ref.ioctx.operate(ref.oid, &op);
6708 if (r < 0) {
6709 return r;
6710 }
6711
6712 if (objv_tracker) {
6713 objv_tracker->apply_write();
6714 }
6715
6716 if (mtime) {
6717 *mtime = set_mtime;
6718 }
6719
6720 return 0;
6721}
6722
6723int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
6724 off_t ofs, bool exclusive,
6725 RGWObjVersionTracker *objv_tracker)
6726{
6727 rgw_rados_ref ref;
6728 rgw_pool pool;
6729 int r = get_system_obj_ref(obj, &ref, &pool);
6730 if (r < 0) {
6731 return r;
6732 }
6733
6734 ObjectWriteOperation op;
6735
6736 if (exclusive)
6737 op.create(true);
6738
6739 if (objv_tracker) {
6740 objv_tracker->prepare_op_for_write(&op);
6741 }
6742 if (ofs == -1) {
6743 op.write_full(bl);
6744 } else {
6745 op.write(ofs, bl);
6746 }
6747 r = ref.ioctx.operate(ref.oid, &op);
6748 if (r < 0)
6749 return r;
6750
6751 if (objv_tracker) {
6752 objv_tracker->apply_write();
6753 }
6754 return 0;
6755}
6756
6757/**
6758 * Write/overwrite an object to the bucket storage.
6759 * bucket: the bucket to store the object in
6760 * obj: the object name/key
6761 * data: the object contents/value
6762 * offset: the offet to write to in the object
6763 * If this is -1, we will overwrite the whole object.
6764 * size: the amount of data to write (data must be this long)
6765 * attrs: all the given attrs are written to bucket storage for the given object
6766 * Returns: 0 on success, -ERR# otherwise.
6767 */
6768
6769int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
6770 off_t ofs, bool exclusive,
6771 void **handle)
6772{
6773 rgw_rados_ref ref;
6774 int r = get_raw_obj_ref(obj, &ref);
6775 if (r < 0) {
6776 return r;
6777 }
6778
6779 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
6780 *handle = c;
6781
6782 ObjectWriteOperation op;
6783
6784 if (exclusive)
6785 op.create(true);
6786
6787 if (ofs == -1) {
6788 op.write_full(bl);
6789 } else {
6790 op.write(ofs, bl);
6791 }
6792 r = ref.ioctx.aio_operate(ref.oid, c, &op);
6793 if (r < 0)
6794 return r;
6795
6796 return 0;
6797}
6798
6799int RGWRados::aio_wait(void *handle)
6800{
6801 AioCompletion *c = (AioCompletion *)handle;
6802 c->wait_for_safe();
6803 int ret = c->get_return_value();
6804 c->release();
6805 return ret;
6806}
6807
6808bool RGWRados::aio_completed(void *handle)
6809{
6810 AioCompletion *c = (AioCompletion *)handle;
6811 return c->is_safe();
6812}
6813
6814class RGWRadosPutObj : public RGWGetDataCB
6815{
6816 CephContext* cct;
6817 rgw_obj obj;
6818 RGWPutObjDataProcessor *filter;
6819 boost::optional<RGWPutObj_Compress>& compressor;
6820 CompressorRef& plugin;
6821 RGWPutObjProcessor_Atomic *processor;
6822 RGWOpStateSingleOp *opstate;
6823 void (*progress_cb)(off_t, void *);
6824 void *progress_data;
6825 bufferlist extra_data_bl;
6826 uint64_t extra_data_len;
6827 uint64_t data_len;
6828 map<string, bufferlist> src_attrs;
6829public:
6830 RGWRadosPutObj(CephContext* cct,
6831 CompressorRef& plugin,
6832 boost::optional<RGWPutObj_Compress>& compressor,
6833 RGWPutObjProcessor_Atomic *p,
6834 RGWOpStateSingleOp *_ops,
6835 void (*_progress_cb)(off_t, void *),
6836 void *_progress_data) :
6837 cct(cct),
6838 filter(p),
6839 compressor(compressor),
6840 plugin(plugin),
6841 processor(p),
6842 opstate(_ops),
6843 progress_cb(_progress_cb),
6844 progress_data(_progress_data),
6845 extra_data_len(0),
6846 data_len(0) {}
6847
6848 int process_attrs(void) {
6849 if (extra_data_bl.length()) {
6850 JSONParser jp;
6851 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
6852 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
6853 return -EIO;
6854 }
6855
6856 JSONDecoder::decode_json("attrs", src_attrs, &jp);
6857
6858 src_attrs.erase(RGW_ATTR_COMPRESSION);
6859 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
6860 }
6861
6862 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
6863 //do not compress if object is encrypted
6864 compressor = boost::in_place(cct, plugin, filter);
6865 filter = &*compressor;
6866 }
6867 return 0;
6868 }
6869
6870 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
6871 if (progress_cb) {
6872 progress_cb(ofs, progress_data);
6873 }
6874 if (extra_data_len) {
6875 size_t extra_len = bl.length();
6876 if (extra_len > extra_data_len)
6877 extra_len = extra_data_len;
6878
6879 bufferlist extra;
6880 bl.splice(0, extra_len, &extra);
6881 extra_data_bl.append(extra);
6882
6883 extra_data_len -= extra_len;
6884 if (extra_data_len == 0) {
6885 int res = process_attrs();
6886 if (res < 0)
6887 return res;
6888 }
6889 if (bl.length() == 0) {
6890 return 0;
6891 }
6892 }
6893 data_len += bl.length();
6894 bool again = false;
6895
6896 bool need_opstate = true;
6897
6898 do {
6899 void *handle = NULL;
6900 rgw_raw_obj obj;
6901 uint64_t size = bl.length();
6902 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
6903 if (ret < 0)
6904 return ret;
6905
6906 if (need_opstate && opstate) {
6907 /* need to update opstate repository with new state. This is ratelimited, so we're not
6908 * really doing it every time
6909 */
6910 ret = opstate->renew_state();
6911 if (ret < 0) {
6912 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
6913 int r = filter->throttle_data(handle, obj, size, false);
6914 if (r < 0) {
6915 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
6916 }
6917 /* could not renew state! might have been marked as cancelled */
6918 return ret;
6919 }
6920 need_opstate = false;
6921 }
6922
6923 ret = filter->throttle_data(handle, obj, size, false);
6924 if (ret < 0)
6925 return ret;
6926 } while (again);
6927
6928 return 0;
6929 }
6930
6931 bufferlist& get_extra_data() { return extra_data_bl; }
6932
6933 map<string, bufferlist>& get_attrs() { return src_attrs; }
6934
6935 void set_extra_data_len(uint64_t len) override {
6936 extra_data_len = len;
6937 }
6938
6939 uint64_t get_data_len() {
6940 return data_len;
6941 }
6942
6943 int complete(const string& etag, real_time *mtime, real_time set_mtime,
6944 map<string, bufferlist>& attrs, real_time delete_at) {
6945 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at);
6946 }
6947
6948 bool is_canceled() {
6949 return processor->is_canceled();
6950 }
6951};
6952
6953/*
6954 * prepare attrset depending on attrs_mod.
6955 */
6956static void set_copy_attrs(map<string, bufferlist>& src_attrs,
6957 map<string, bufferlist>& attrs,
6958 RGWRados::AttrsMod attrs_mod)
6959{
6960 switch (attrs_mod) {
6961 case RGWRados::ATTRSMOD_NONE:
6962 attrs = src_attrs;
6963 break;
6964 case RGWRados::ATTRSMOD_REPLACE:
6965 if (!attrs[RGW_ATTR_ETAG].length()) {
6966 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
6967 }
6968 break;
6969 case RGWRados::ATTRSMOD_MERGE:
6970 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
6971 if (attrs.find(it->first) == attrs.end()) {
6972 attrs[it->first] = it->second;
6973 }
6974 }
6975 break;
6976 }
6977}
6978
6979int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
6980{
6981 map<string, bufferlist> attrset;
6982
6983 real_time mtime;
6984 uint64_t obj_size;
6985 RGWObjectCtx rctx(this);
6986
6987 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
6988 RGWRados::Object::Read read_op(&op_target);
6989
6990 read_op.params.attrs = &attrset;
6991 read_op.params.lastmod = &mtime;
6992 read_op.params.obj_size = &obj_size;
6993
6994 int ret = read_op.prepare();
6995 if (ret < 0)
6996 return ret;
6997
6998 attrset.erase(RGW_ATTR_ID_TAG);
6999
7000 uint64_t max_chunk_size;
7001
7002 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7003 if (ret < 0) {
7004 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7005 return ret;
7006 }
7007
7008 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj, max_chunk_size, NULL, mtime, attrset,
7009 RGW_OBJ_CATEGORY_MAIN, 0, real_time(), NULL, NULL, NULL, NULL);
7010}
7011
7012struct obj_time_weight {
7013 real_time mtime;
7014 uint32_t zone_short_id;
7015 uint64_t pg_ver;
7016 bool high_precision;
7017
7018 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7019
7020 bool compare_low_precision(const obj_time_weight& rhs) {
7021 struct timespec l = ceph::real_clock::to_timespec(mtime);
7022 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7023 l.tv_nsec = 0;
7024 r.tv_nsec = 0;
7025 if (l > r) {
7026 return false;
7027 }
7028 if (l < r) {
7029 return true;
7030 }
7031 if (zone_short_id != rhs.zone_short_id) {
7032 return (zone_short_id < rhs.zone_short_id);
7033 }
7034 return (pg_ver < rhs.pg_ver);
7035
7036 }
7037
7038 bool operator<(const obj_time_weight& rhs) {
7039 if (!high_precision || !rhs.high_precision) {
7040 return compare_low_precision(rhs);
7041 }
7042 if (mtime > rhs.mtime) {
7043 return false;
7044 }
7045 if (mtime < rhs.mtime) {
7046 return true;
7047 }
7048 if (zone_short_id != rhs.zone_short_id) {
7049 return (zone_short_id < rhs.zone_short_id);
7050 }
7051 return (pg_ver < rhs.pg_ver);
7052 }
7053
7054 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7055 mtime = _mtime;
7056 zone_short_id = _short_id;
7057 pg_ver = _pg_ver;
7058 }
7059
7060 void init(RGWObjState *state) {
7061 mtime = state->mtime;
7062 zone_short_id = state->zone_short_id;
7063 pg_ver = state->pg_ver;
7064 }
7065};
7066
7067inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7068 out << o.mtime;
7069
7070 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7071 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7072 }
7073
7074 return out;
7075}
7076
7077class RGWGetExtraDataCB : public RGWGetDataCB {
7078 bufferlist extra_data;
7079public:
7080 RGWGetExtraDataCB() {}
7081 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7082 if (extra_data.length() < extra_data_len) {
7083 off_t max = extra_data_len - extra_data.length();
7084 if (max > bl_len) {
7085 max = bl_len;
7086 }
7087 bl.splice(0, max, &extra_data);
7088 }
7089 return bl_len;
7090 }
7091
7092 bufferlist& get_extra_data() {
7093 return extra_data;
7094 }
7095};
7096
7097int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7098 const rgw_user& user_id,
7099 const string& client_id,
7100 req_info *info,
7101 const string& source_zone,
7102 rgw_obj& src_obj,
7103 RGWBucketInfo& src_bucket_info,
7104 real_time *src_mtime,
7105 uint64_t *psize,
7106 const real_time *mod_ptr,
7107 const real_time *unmod_ptr,
7108 bool high_precision_time,
7109 const char *if_match,
7110 const char *if_nomatch,
7111 map<string, bufferlist> *pattrs,
7112 string *version_id,
7113 string *ptag,
7114 string *petag)
7115{
7116 /* source is in a different zonegroup, copy from there */
7117
7118 RGWRESTStreamRWRequest *in_stream_req;
7119 string tag;
7120 map<string, bufferlist> src_attrs;
7121 append_rand_alpha(cct, tag, tag, 32);
7122 obj_time_weight set_mtime_weight;
7123 set_mtime_weight.high_precision = high_precision_time;
7124
7125 RGWRESTConn *conn;
7126 if (source_zone.empty()) {
7127 if (src_bucket_info.zonegroup.empty()) {
7128 /* source is in the master zonegroup */
7129 conn = rest_master_conn;
7130 } else {
7131 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7132 if (iter == zonegroup_conn_map.end()) {
7133 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7134 return -ENOENT;
7135 }
7136 conn = iter->second;
7137 }
7138 } else {
7139 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7140 if (iter == zone_conn_map.end()) {
7141 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7142 return -ENOENT;
7143 }
7144 conn = iter->second;
7145 }
7146
7147 RGWGetExtraDataCB cb;
7148 string etag;
7149 map<string, string> req_headers;
7150 real_time set_mtime;
7151
7152 const real_time *pmod = mod_ptr;
7153
7154 obj_time_weight dest_mtime_weight;
7155
7156 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7157 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7158 true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
7159 true /* sync manifest */, &cb, &in_stream_req);
7160 if (ret < 0) {
7161 return ret;
7162 }
7163
7164 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7165 if (ret < 0) {
7166 return ret;
7167 }
7168
7169 bufferlist& extra_data_bl = cb.get_extra_data();
7170 if (extra_data_bl.length()) {
7171 JSONParser jp;
7172 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7173 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7174 return -EIO;
7175 }
7176
7177 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7178
7179 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7180 }
7181
7182 if (src_mtime) {
7183 *src_mtime = set_mtime;
7184 }
7185
7186 if (petag) {
7187 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7188 if (iter != src_attrs.end()) {
7189 bufferlist& etagbl = iter->second;
7190 *petag = etagbl.to_str();
7191 }
7192 }
7193
7194 if (pattrs) {
7195 *pattrs = src_attrs;
7196 }
7197
7198 return 0;
7199}
7200
7201int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7202 const rgw_user& user_id,
7203 const string& client_id,
7204 const string& op_id,
7205 bool record_op_state,
7206 req_info *info,
7207 const string& source_zone,
7208 rgw_obj& dest_obj,
7209 rgw_obj& src_obj,
7210 RGWBucketInfo& dest_bucket_info,
7211 RGWBucketInfo& src_bucket_info,
7212 real_time *src_mtime,
7213 real_time *mtime,
7214 const real_time *mod_ptr,
7215 const real_time *unmod_ptr,
7216 bool high_precision_time,
7217 const char *if_match,
7218 const char *if_nomatch,
7219 AttrsMod attrs_mod,
7220 bool copy_if_newer,
7221 map<string, bufferlist>& attrs,
7222 RGWObjCategory category,
7223 uint64_t olh_epoch,
7224 real_time delete_at,
7225 string *version_id,
7226 string *ptag,
7227 ceph::buffer::list *petag,
7228 struct rgw_err *err,
7229 void (*progress_cb)(off_t, void *),
7230 void *progress_data)
7231{
7232 /* source is in a different zonegroup, copy from there */
7233
7234 RGWRESTStreamRWRequest *in_stream_req;
7235 string tag;
7236 int i;
7237 append_rand_alpha(cct, tag, tag, 32);
7238 obj_time_weight set_mtime_weight;
7239 set_mtime_weight.high_precision = high_precision_time;
7240
7241 RGWPutObjProcessor_Atomic processor(obj_ctx,
7242 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7243 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7244 if (version_id && *version_id != "null") {
7245 processor.set_version_id(*version_id);
7246 }
7247 processor.set_olh_epoch(olh_epoch);
7248 int ret = processor.prepare(this, NULL);
7249 if (ret < 0) {
7250 return ret;
7251 }
7252
7253 RGWRESTConn *conn;
7254 if (source_zone.empty()) {
7255 if (dest_bucket_info.zonegroup.empty()) {
7256 /* source is in the master zonegroup */
7257 conn = rest_master_conn;
7258 } else {
7259 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7260 if (iter == zonegroup_conn_map.end()) {
7261 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7262 return -ENOENT;
7263 }
7264 conn = iter->second;
7265 }
7266 } else {
7267 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7268 if (iter == zone_conn_map.end()) {
7269 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7270 return -ENOENT;
7271 }
7272 conn = iter->second;
7273 }
7274
7275 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7276
7277 RGWOpStateSingleOp *opstate = NULL;
7278
7279 if (record_op_state) {
7280 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7281
7282 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7283 if (ret < 0) {
7284 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7285 delete opstate;
7286 return ret;
7287 }
7288 }
7289
7290 boost::optional<RGWPutObj_Compress> compressor;
7291 CompressorRef plugin;
7292
7293 const auto& compression_type = zone_params.get_compression_type(
7294 dest_bucket_info.placement_rule);
7295 if (compression_type != "none") {
7296 plugin = Compressor::create(cct, compression_type);
7297 if (!plugin) {
7298 ldout(cct, 1) << "Cannot load plugin for compression type "
7299 << compression_type << dendl;
7300 }
7301 }
7302
7303 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7304
7305 string etag;
7306 map<string, string> req_headers;
7307 real_time set_mtime;
7308
7309 RGWObjState *dest_state = NULL;
7310
7311 const real_time *pmod = mod_ptr;
7312
7313 obj_time_weight dest_mtime_weight;
7314
7315 if (copy_if_newer) {
7316 /* need to get mtime for destination */
7317 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7318 if (ret < 0)
7319 goto set_err_state;
7320
7321 if (!real_clock::is_zero(dest_state->mtime)) {
7322 dest_mtime_weight.init(dest_state);
7323 pmod = &dest_mtime_weight.mtime;
7324 }
7325 }
7326
7327 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7328 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7329 true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
7330 true /* sync manifest */, &cb, &in_stream_req);
7331 if (ret < 0) {
7332 goto set_err_state;
7333 }
7334
7335 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7336 if (ret < 0) {
7337 goto set_err_state;
7338 }
7339 if (compressor && compressor->is_compressed()) {
7340 bufferlist tmp;
7341 RGWCompressionInfo cs_info;
7342 cs_info.compression_type = plugin->get_type_name();
7343 cs_info.orig_size = cb.get_data_len();
7344 cs_info.blocks = move(compressor->get_compression_blocks());
7345 ::encode(cs_info, tmp);
7346 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7347 }
7348
7349 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7350 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7351 } else {
7352 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7353 if (iter != cb.get_attrs().end()) {
7354 try {
7355 ::decode(delete_at, iter->second);
7356 } catch (buffer::error& err) {
7357 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7358 }
7359 }
7360 }
7361
7362 if (src_mtime) {
7363 *src_mtime = set_mtime;
7364 }
7365
7366 if (petag) {
7367 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7368 if (iter != cb.get_attrs().end()) {
7369 *petag = iter->second;
7370 }
7371 }
7372
7373 if (source_zone.empty()) {
7374 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7375 } else {
7376 attrs = cb.get_attrs();
7377 }
7378
7379 if (copy_if_newer) {
7380 uint64_t pg_ver = 0;
7381 auto i = attrs.find(RGW_ATTR_PG_VER);
7382 if (i != attrs.end() && i->second.length() > 0) {
7383 bufferlist::iterator iter = i->second.begin();
7384 try {
7385 ::decode(pg_ver, iter);
7386 } catch (buffer::error& err) {
7387 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7388 /* non critical error */
7389 }
7390 }
7391 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7392 }
7393
7394#define MAX_COMPLETE_RETRY 100
7395 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7396 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at);
7397 if (ret < 0) {
7398 goto set_err_state;
7399 }
7400 if (copy_if_newer && cb.is_canceled()) {
7401 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7402 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7403 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7404 if (ret < 0) {
7405 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7406 goto set_err_state;
7407 }
7408 dest_mtime_weight.init(dest_state);
7409 dest_mtime_weight.high_precision = high_precision_time;
7410 if (!dest_state->exists ||
7411 dest_mtime_weight < set_mtime_weight) {
7412 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7413 continue;
7414 } else {
7415 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7416 }
7417 }
7418 break;
7419 }
7420
7421 if (i == MAX_COMPLETE_RETRY) {
7422 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7423 ret = -EIO;
7424 goto set_err_state;
7425 }
7426
7427 if (opstate) {
7428 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7429 if (ret < 0) {
7430 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7431 }
7432 delete opstate;
7433 }
7434
7435 return 0;
7436set_err_state:
7437 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7438 ret = 0;
7439 }
7440 if (opstate) {
7441 RGWOpState::OpState state;
7442 if (ret < 0) {
7443 state = RGWOpState::OPSTATE_ERROR;
7444 } else {
7445 state = RGWOpState::OPSTATE_COMPLETE;
7446 }
7447 int r = opstate->set_state(state);
7448 if (r < 0) {
7449 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7450 }
7451 delete opstate;
7452 }
7453 return ret;
7454}
7455
7456
7457int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7458 map<string, bufferlist>& src_attrs,
7459 RGWRados::Object::Read& read_op,
7460 const rgw_user& user_id,
7461 rgw_obj& dest_obj,
7462 real_time *mtime)
7463{
7464 string etag;
7465
7466 RGWRESTStreamWriteRequest *out_stream_req;
7467
7468 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
7469 if (ret < 0) {
7470 delete out_stream_req;
7471 return ret;
7472 }
7473
7474 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
7475 if (ret < 0)
7476 return ret;
7477
7478 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
7479 if (ret < 0)
7480 return ret;
7481
7482 return 0;
7483}
7484
7485/**
7486 * Copy an object.
7487 * dest_obj: the object to copy into
7488 * src_obj: the object to copy from
7489 * attrs: usage depends on attrs_mod parameter
7490 * attrs_mod: the modification mode of the attrs, may have the following values:
7491 * ATTRSMOD_NONE - the attributes of the source object will be
7492 * copied without modifications, attrs parameter is ignored;
7493 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
7494 * parameter, source object attributes are not copied;
7495 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
7496 * are overwritten by values contained in attrs parameter.
7497 * err: stores any errors resulting from the get of the original object
7498 * Returns: 0 on success, -ERR# otherwise.
7499 */
/*
 * Flow of this function, in order:
 *  1. a bucket counts as "remote" when its zonegroup differs from get_zonegroup();
 *     both src and dest remote is rejected with -EINVAL
 *  2. remote source (or a non-empty source_zone) is delegated to fetch_remote_obj()
 *  3. the source is opened with the caller's conditions (read_op.prepare()) and
 *     the attribute set for the destination is assembled
 *  4. remote destination is delegated to copy_obj_to_remote_dest()
 *  5. if the tail cannot be shared (copy_data), the data is copied via copy_obj_data()
 *  6. otherwise a reference is taken on every tail object of the source manifest
 *     and only a new head is written; on failure the references are dropped again
 */
7500int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
7501 const rgw_user& user_id,
7502 const string& client_id,
7503 const string& op_id,
7504 req_info *info,
7505 const string& source_zone,
7506 rgw_obj& dest_obj,
7507 rgw_obj& src_obj,
7508 RGWBucketInfo& dest_bucket_info,
7509 RGWBucketInfo& src_bucket_info,
7510 real_time *src_mtime,
7511 real_time *mtime,
7512 const real_time *mod_ptr,
7513 const real_time *unmod_ptr,
7514 bool high_precision_time,
7515 const char *if_match,
7516 const char *if_nomatch,
7517 AttrsMod attrs_mod,
7518 bool copy_if_newer,
7519 map<string, bufferlist>& attrs,
7520 RGWObjCategory category,
7521 uint64_t olh_epoch,
7522 real_time delete_at,
7523 string *version_id,
7524 string *ptag,
7525 ceph::buffer::list *petag,
7526 struct rgw_err *err,
7527 void (*progress_cb)(off_t, void *),
7528 void *progress_data)
7529{
7530 int ret;
7531 uint64_t obj_size;
/* NOTE(review): shadow_obj / shadow_oid are initialized below but not referenced again in this function */
7532 rgw_obj shadow_obj = dest_obj;
7533 string shadow_oid;
7534
7535 bool remote_src;
7536 bool remote_dest;
7537
7538 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
7539 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
7540
/* a side is remote when its bucket belongs to a zonegroup other than ours */
7541 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
7542 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
7543
7544 if (remote_src && remote_dest) {
7545 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7546 return -EINVAL;
7547 }
7548
7549 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
7550
/* remote source, or an explicitly named source zone: hand the whole copy to fetch_remote_obj() */
7551 if (remote_src || !source_zone.empty()) {
7552 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
7553 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
7554 unmod_ptr, high_precision_time,
7555 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
7556 olh_epoch, delete_at, version_id, ptag, petag, err, progress_cb, progress_data);
7557 }
7558
7559 map<string, bufferlist> src_attrs;
7560 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
7561 RGWRados::Object::Read read_op(&src_op_target);
7562
/* forward the caller's conditions and ask prepare() to fill in src_attrs, src_mtime and obj_size */
7563 read_op.conds.mod_ptr = mod_ptr;
7564 read_op.conds.unmod_ptr = unmod_ptr;
7565 read_op.conds.high_precision_time = high_precision_time;
7566 read_op.conds.if_match = if_match;
7567 read_op.conds.if_nomatch = if_nomatch;
7568 read_op.params.attrs = &src_attrs;
7569 read_op.params.lastmod = src_mtime;
7570 read_op.params.obj_size = &obj_size;
7571 read_op.params.perr = err;
7572
7573 ret = read_op.prepare();
7574 if (ret < 0) {
7575 return ret;
7576 }
7577
/* the ACL always comes from the caller's attrs; the source's delete-at attribute is dropped */
7578 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
7579 src_attrs.erase(RGW_ATTR_DELETE_AT);
7580
/* combine src_attrs and attrs according to attrs_mod, then strip the id tag, pg version
 * and source zone attributes from the resulting set */
7581 set_copy_attrs(src_attrs, attrs, attrs_mod);
7582 attrs.erase(RGW_ATTR_ID_TAG);
7583 attrs.erase(RGW_ATTR_PG_VER);
7584 attrs.erase(RGW_ATTR_SOURCE_ZONE);
/* the compression attribute of the source, when present, overrides whatever attrs holds */
7585 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
7586 if (cmp != src_attrs.end())
7587 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
7588
7589 RGWObjManifest manifest;
7590 RGWObjState *astate = NULL;
7591
7592 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
7593 if (ret < 0) {
7594 return ret;
7595 }
7596
/* tail objects on which a reference has been taken; used for rollback at done_ret */
7597 vector<rgw_raw_obj> ref_objs;
7598
7599 if (remote_dest) {
7600 /* dest is in a different zonegroup, copy it there */
7601 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
7602 }
7603 uint64_t max_chunk_size;
7604
7605 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
7606 if (ret < 0) {
7607 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
7608 return ret;
7609 }
7610
7611 rgw_pool src_pool;
7612 rgw_pool dest_pool;
7613 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
7614 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7615 return -EIO;
7616 }
7617 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
7618 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7619 return -EIO;
7620 }
7621
7622
/*
 * copy_data:  the data has to be copied in full (no manifest, different data pools,
 *             manifest without a tail, or a head larger than max_chunk_size)
 * copy_first: the tail can be shared, but the non-empty head must be copied
 */
7623 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
7624 bool copy_first = false;
7625 if (astate->has_manifest) {
7626 if (!astate->manifest.has_tail()) {
7627 copy_data = true;
7628 } else {
7629 uint64_t head_size = astate->manifest.get_head_size();
7630
7631 if (head_size > 0) {
7632 if (head_size > max_chunk_size) {
7633 copy_data = true;
7634 } else {
7635 copy_first = true;
7636 }
7637 }
7638 }
7639 }
7640
/* report the etag found in the final attribute set, if the caller asked for it */
7641 if (petag) {
7642 const auto iter = attrs.find(RGW_ATTR_ETAG);
7643 if (iter != attrs.end()) {
7644 *petag = iter->second;
7645 }
7646 }
7647
7648 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
7649 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
7650 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
7651 version_id, ptag, petag, err);
7652 }
7653
7654 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
7655
7656 if (copy_first) { // we need to copy first chunk, not increase refcount
7657 ++miter;
7658 }
7659
/* ref (and its ioctx) is resolved once, from the first object that will be refcounted,
 * and reused for every refcount operation below */
7660 rgw_rados_ref ref;
7661 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
7662 if (ret < 0) {
7663 return ret;
7664 }
7665
7666 bool versioned_dest = dest_bucket_info.versioning_enabled();
7667
/* an explicit version id wins; otherwise a versioned destination gets a random instance name */
7668 if (version_id && !version_id->empty()) {
7669 versioned_dest = true;
7670 dest_obj.key.set_instance(*version_id);
7671 } else if (versioned_dest) {
7672 gen_rand_obj_instance_name(&dest_obj);
7673 }
7674
7675 bufferlist first_chunk;
7676
/* evaluated after the instance name was set above, so it compares the final destination key */
7677 bool copy_itself = (dest_obj == src_obj);
7678 RGWObjManifest *pmanifest;
7679 ldout(cct, 0) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7680
7681 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
7682 RGWRados::Object::Write write_op(&dest_op_target);
7683
7684 string tag;
7685
/* use the caller's tag when given, else generate a random 32 character one */
7686 if (ptag) {
7687 tag = *ptag;
7688 }
7689
7690 if (tag.empty()) {
7691 append_rand_alpha(cct, tag, tag, 32);
7692 }
7693
7694 if (!copy_itself) {
/* work on a private copy of the source manifest; if it names no tail bucket, pin it to the source bucket */
7695 manifest = astate->manifest;
7696 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
7697 if (tail_placement.bucket.name.empty()) {
7698 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
7699 }
7700 string oid, key;
/* take one reference (keyed by tag) on each remaining manifest object and remember it for rollback */
7701 for (; miter != astate->manifest.obj_end(); ++miter) {
7702 ObjectWriteOperation op;
7703 cls_refcount_get(op, tag, true);
7704 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
7705 ref.ioctx.locator_set_key(loc.loc);
7706
7707 ret = ref.ioctx.operate(loc.oid, &op);
7708 if (ret < 0) {
7709 goto done_ret;
7710 }
7711
7712 ref_objs.push_back(loc);
7713 }
7714
7715 pmanifest = &manifest;
7716 } else {
7717 pmanifest = &astate->manifest;
7718 /* don't send the object's tail for garbage collection */
7719 astate->keep_tail = true;
7720 }
7721
/* the new head carries the first chunk's data when copy_first is set, and no data otherwise */
7722 if (copy_first) {
7723 ret = read_op.read(0, max_chunk_size, first_chunk);
7724 if (ret < 0) {
7725 goto done_ret;
7726 }
7727
7728 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
7729 } else {
7730 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
7731 }
7732
7733 write_op.meta.data = &first_chunk;
7734 write_op.meta.manifest = pmanifest;
7735 write_op.meta.ptag = &tag;
7736 write_op.meta.owner = dest_bucket_info.owner;
7737 write_op.meta.mtime = mtime;
7738 write_op.meta.flags = PUT_OBJ_CREATE;
7739 write_op.meta.category = category;
7740 write_op.meta.olh_epoch = olh_epoch;
7741 write_op.meta.delete_at = delete_at;
7742
7743 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
7744 if (ret < 0) {
7745 goto done_ret;
7746 }
7747
7748 return 0;
7749
/* error exit: drop every reference recorded in ref_objs; the original error in ret is returned */
7750done_ret:
7751 if (!copy_itself) {
7752 vector<rgw_raw_obj>::iterator riter;
7753
7754 string oid, key;
7755
7756 /* rollback reference */
7757 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
7758 ObjectWriteOperation op;
7759 cls_refcount_put(op, tag, true);
7760
7761 ref.ioctx.locator_set_key(riter->loc);
7762
7763 int r = ref.ioctx.operate(riter->oid, &op);
7764 if (r < 0) {
7765 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
7766 }
7767 }
7768 }
7769 return ret;
7770}
7771
7772
7773int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
7774 RGWBucketInfo& dest_bucket_info,
7775 RGWRados::Object::Read& read_op, off_t end,
7776 rgw_obj& dest_obj,
7777 rgw_obj& src_obj,
7778 uint64_t max_chunk_size,
7779 real_time *mtime,
7780 real_time set_mtime,
7781 map<string, bufferlist>& attrs,
7782 RGWObjCategory category,
7783 uint64_t olh_epoch,
7784 real_time delete_at,
7785 string *version_id,
7786 string *ptag,
7787 ceph::buffer::list *petag,
7788 struct rgw_err *err)
7789{
7790 bufferlist first_chunk;
7791 RGWObjManifest manifest;
7792
7793 string tag;
7794 append_rand_alpha(cct, tag, tag, 32);
7795
7796 RGWPutObjProcessor_Atomic processor(obj_ctx,
7797 dest_bucket_info, dest_obj.bucket, dest_obj.get_oid(),
7798 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7799 if (version_id) {
7800 processor.set_version_id(*version_id);
7801 }
7802 processor.set_olh_epoch(olh_epoch);
7803 int ret = processor.prepare(this, NULL);
7804 if (ret < 0)
7805 return ret;
7806
7807 off_t ofs = 0;
7808
7809 do {
7810 bufferlist bl;
7811 ret = read_op.read(ofs, end, bl);
7812
7813 uint64_t read_len = ret;
7814 bool again;
7815
7816 do {
7817 void *handle;
7818 rgw_raw_obj obj;
7819
7820 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
7821 if (ret < 0) {
7822 return ret;
7823 }
7824 ret = processor.throttle_data(handle, obj, read_len, false);
7825 if (ret < 0)
7826 return ret;
7827 } while (again);
7828
7829 ofs += read_len;
7830 } while (ofs <= end);
7831
7832 string etag;
7833 auto iter = attrs.find(RGW_ATTR_ETAG);
7834 if (iter != attrs.end()) {
7835 bufferlist& bl = iter->second;
7836 etag = string(bl.c_str(), bl.length());
7837 if (petag) {
7838 *petag = bl;
7839 }
7840 }
7841
7842 uint64_t accounted_size;
7843 {
7844 bool compressed{false};
7845 RGWCompressionInfo cs_info;
7846 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
7847 if (ret < 0) {
7848 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
7849 return ret;
7850 }
7851 // pass original size if compressed
7852 accounted_size = compressed ? cs_info.orig_size : ofs;
7853 }
7854
7855 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
7856}
7857
7858bool RGWRados::is_meta_master()
7859{
7860 if (!get_zonegroup().is_master) {
7861 return false;
7862 }
7863
7864 return (get_zonegroup().master_zone == zone_public_config.id);
7865}
7866
7867/**
7868 * Check to see if the bucket metadata could be synced
7869 * bucket: the bucket to check
7870 * Returns false is the bucket is not synced
7871 */
7872bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
7873{
7874
7875 /* no current period */
7876 if (current_period.get_id().empty()) {
7877 return false;
7878 }
7879
7880 /* zonegroup is not master zonegroup */
7881 if (!get_zonegroup().is_master) {
7882 return false;
7883 }
7884
7885 /* single zonegroup and a single zone */
7886 if (current_period.is_single_zonegroup(cct, this) && get_zonegroup().zones.size() == 1) {
7887 return false;
7888 }
7889
7890 /* zone is not master */
7891 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
7892 return false;
7893 }
7894
7895 return true;
7896}
7897
7898int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
7899{
7900 std::map<string, rgw_bucket_dir_entry> ent_map;
7901 rgw_obj_index_key marker;
7902 string prefix;
7903 bool is_truncated;
7904
7905 do {
7906#define NUM_ENTRIES 1000
7907 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
7908 &is_truncated, &marker);
7909 if (r < 0)
7910 return r;
7911
7912 string ns;
7913 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
7914 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7915 rgw_obj_key obj;
7916
7917 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
7918 return -ENOTEMPTY;
7919 }
7920 } while (is_truncated);
7921 return 0;
7922}
7923
7924/**
7925 * Delete a bucket.
7926 * bucket: the name of the bucket to delete
7927 * Returns 0 on success, -ERR# otherwise.
7928 */
7929int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
7930{
7931 const rgw_bucket& bucket = bucket_info.bucket;
7932 librados::IoCtx index_ctx;
7933 map<int, string> bucket_objs;
7934 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
7935 if (r < 0)
7936 return r;
7937
7938 if (check_empty) {
7939 r = check_bucket_empty(bucket_info);
7940 if (r < 0) {
7941 return r;
7942 }
7943 }
7944
7945 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
7946 if (r < 0)
7947 return r;
7948
7949 /* if the bucket is not synced we can remove the meta file */
7950 if (!is_syncing_bucket_meta(bucket)) {
7951 RGWObjVersionTracker objv_tracker;
7952 string entry = bucket.get_key();
7953 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
7954 if (r < 0) {
7955 return r;
7956 }
7957 /* remove bucket index objects*/
7958 map<int, string>::const_iterator biter;
7959 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
7960 index_ctx.remove(biter->second);
7961 }
7962 }
7963 return 0;
7964}
7965
7966int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
7967{
7968 RGWBucketInfo info;
7969 map<string, bufferlist> attrs;
7970 RGWObjectCtx obj_ctx(this);
7971 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
7972 if (r < 0) {
7973 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7974 return r;
7975 }
7976
7977 info.owner = owner.get_id();
7978
7979 r = put_bucket_instance_info(info, false, real_time(), &attrs);
7980 if (r < 0) {
7981 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7982 return r;
7983 }
7984
7985 return 0;
7986}
7987
7988
7989int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
7990{
7991 int ret = 0;
7992
7993 vector<rgw_bucket>::iterator iter;
7994
7995 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
7996 rgw_bucket& bucket = *iter;
7997 if (enabled)
7998 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
7999 else
8000 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8001
8002 RGWBucketInfo info;
8003 map<string, bufferlist> attrs;
8004 RGWObjectCtx obj_ctx(this);
8005 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8006 if (r < 0) {
8007 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8008 ret = r;
8009 continue;
8010 }
8011 if (enabled) {
8012 info.flags &= ~BUCKET_SUSPENDED;
8013 } else {
8014 info.flags |= BUCKET_SUSPENDED;
8015 }
8016
8017 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8018 if (r < 0) {
8019 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8020 ret = r;
8021 continue;
8022 }
8023 }
8024 return ret;
8025}
8026
8027int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8028{
8029 RGWBucketInfo bucket_info;
8030 RGWObjectCtx obj_ctx(this);
8031 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8032 if (ret < 0) {
8033 return ret;
8034 }
8035
8036 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8037 return 0;
8038}
8039
8040int RGWRados::Object::complete_atomic_modification()
8041{
8042 if (!state->has_manifest || state->keep_tail)
8043 return 0;
8044
8045 cls_rgw_obj_chain chain;
8046 store->update_gc_chain(obj, state->manifest, &chain);
8047
8048 if (chain.empty()) {
8049 return 0;
8050 }
8051
8052 string tag = state->obj_tag.to_str();
8053 return store->gc->send_chain(chain, tag, false); // do it async
8054}
8055
8056void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8057{
8058 RGWObjManifest::obj_iterator iter;
8059 rgw_raw_obj raw_head;
8060 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8061 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8062 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8063 if (mobj == raw_head)
8064 continue;
8065 cls_rgw_obj_key key(mobj.oid);
8066 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8067 }
8068}
8069
8070int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8071{
8072 return gc->send_chain(chain, tag, sync);
8073}
8074
8075int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8076{
8077 const rgw_bucket& bucket = bucket_info.bucket;
8078 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8079 if (r < 0)
8080 return r;
8081
8082 if (bucket.bucket_id.empty()) {
8083 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8084 return -EIO;
8085 }
8086
8087 bucket_oid = dir_oid_prefix;
8088 bucket_oid.append(bucket.bucket_id);
8089
8090 return 0;
8091}
8092
8093int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8094 string& bucket_oid_base) {
8095 const rgw_bucket& bucket = bucket_info.bucket;
8096 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8097 if (r < 0)
8098 return r;
8099
8100 if (bucket.bucket_id.empty()) {
8101 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8102 return -EIO;
8103 }
8104
8105 bucket_oid_base = dir_oid_prefix;
8106 bucket_oid_base.append(bucket.bucket_id);
8107
8108 return 0;
8109
8110}
8111
8112int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8113 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8114 string bucket_oid_base;
8115 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8116 if (ret < 0) {
8117 return ret;
8118 }
8119
8120 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8121 if (bucket_instance_ids) {
8122 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8123 }
8124 return 0;
8125}
8126
8127template<typename T>
8128int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8129 map<int, string>& oids, map<int, T>& bucket_objs,
8130 int shard_id, map<int, string> *bucket_instance_ids)
8131{
8132 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8133 if (ret < 0)
8134 return ret;
8135
8136 map<int, string>::const_iterator iter = oids.begin();
8137 for (; iter != oids.end(); ++iter) {
8138 bucket_objs[iter->first] = T();
8139 }
8140 return 0;
8141}
8142
8143int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8144 const string& obj_key, string *bucket_obj, int *shard_id)
8145{
8146 string bucket_oid_base;
8147 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8148 if (ret < 0)
8149 return ret;
8150
8151 RGWObjectCtx obj_ctx(this);
8152
8153 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8154 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8155 if (ret < 0) {
8156 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8157 return ret;
8158 }
8159 return 0;
8160}
8161
8162int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8163 int shard_id, string *bucket_obj)
8164{
8165 string bucket_oid_base;
8166 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8167 if (ret < 0)
8168 return ret;
8169
8170 RGWObjectCtx obj_ctx(this);
8171
8172 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8173 shard_id, bucket_obj);
8174 return 0;
8175}
8176
8177static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8178 map<RGWObjCategory, RGWStorageStats>& stats)
8179{
8180 for (const auto& pair : header.stats) {
8181 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8182 const rgw_bucket_category_stats& header_stats = pair.second;
8183
8184 RGWStorageStats& s = stats[category];
8185
8186 s.category = category;
8187 s.size += header_stats.total_size;
8188 s.size_rounded += header_stats.total_size_rounded;
8189 s.size_utilized += header_stats.actual_size;
8190 s.num_objects += header_stats.num_entries;
8191 }
8192}
8193
8194int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8195 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8196 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8197{
8198 librados::IoCtx index_ctx;
8199 // key - bucket index object id
8200 // value - bucket index check OP returned result with the given bucket index object (shard)
8201 map<int, string> oids;
8202 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8203 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8204 if (ret < 0)
8205 return ret;
8206
8207 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8208 if (ret < 0)
8209 return ret;
8210
8211 // Aggregate results (from different shards if there is any)
8212 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8213 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8214 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8215 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8216 }
8217
8218 return 0;
8219}
8220
8221int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8222{
8223 librados::IoCtx index_ctx;
8224 map<int, string> bucket_objs;
8225 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8226 if (r < 0)
8227 return r;
8228
8229 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8230}
8231
8232
8233int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8234{
8235 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8236 std::string oid, key;
8237 get_obj_bucket_and_oid_loc(obj, oid, key);
8238 if (!rctx)
8239 return 0;
8240
8241 RGWObjState *state = NULL;
8242
8243 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8244 if (r < 0)
8245 return r;
8246
8247 if (!state->is_atomic) {
8248 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8249 return -EINVAL;
8250 }
8251
8252 if (state->obj_tag.length() == 0) {// check for backward compatibility
8253 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8254 return -EINVAL;
8255 }
8256
8257 string tag = state->obj_tag.c_str();
8258
8259 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8260
8261 return gc->defer_chain(tag, false);
8262}
8263
8264void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8265{
8266 list<string> prefixes;
8267 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8268 cls_rgw_remove_obj(op, prefixes);
8269}
8270
8271void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
8272{
8273 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
8274}
8275
8276void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
8277{
8278 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
8279}
8280
8281
8282/**
8283 * Delete an object.
8284 * bucket: name of the bucket storing the object
8285 * obj: name of the object to delete
8286 * Returns: 0 on success, -ERR# otherwise.
8287 */
/*
 * Deletes the target object; all inputs come from `params`, outputs go to `result`.
 *
 * Two paths:
 *  - versioned (BUCKET_VERSIONED set in params.versioning_status, or an explicit
 *    params.marker_version_id): either a delete marker is set through set_olh(),
 *    or the given instance is unlinked; a data log entry is added and the
 *    function returns.
 *  - plain: preconditions (unmod_since, expiration_time, existence) are checked,
 *    then the head object is removed and the bucket index, gc and quota cache
 *    are updated.
 */
8288int RGWRados::Object::Delete::delete_obj()
8289{
8290 RGWRados *store = target->get_store();
8291 rgw_obj& src_obj = target->get_obj();
8292 const string& instance = src_obj.key.instance;
8293 rgw_obj obj = src_obj;
8294
/* the literal instance "null" is mapped to an empty instance on the working copy */
8295 if (instance == "null") {
8296 obj.key.instance.clear();
8297 }
8298
8299 bool explicit_marker_version = (!params.marker_version_id.empty());
8300
8301 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
/* no instance requested (or a marker version was given): set a delete marker */
8302 if (instance.empty() || explicit_marker_version) {
8303 rgw_obj marker = obj;
8304
/* marker instance: the explicit id (unless it is "null"), else a random name
 * unless BUCKET_VERSIONS_SUSPENDED is set */
8305 if (!params.marker_version_id.empty()) {
8306 if (params.marker_version_id != "null") {
8307 marker.key.set_instance(params.marker_version_id);
8308 }
8309 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
8310 store->gen_rand_obj_instance_name(&marker);
8311 }
8312
8313 result.version_id = marker.key.instance;
8314 result.delete_marker = true;
8315
8316 struct rgw_bucket_dir_entry_meta meta;
8317
8318 meta.owner = params.obj_owner.get_id().to_str();
8319 meta.owner_display_name = params.obj_owner.get_display_name();
8320
/* a zero params.mtime means "now" */
8321 if (real_clock::is_zero(params.mtime)) {
8322 meta.mtime = real_clock::now();
8323 } else {
8324 meta.mtime = params.mtime;
8325 }
8326
8327 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time);
8328 if (r < 0) {
8329 return r;
8330 }
8331 } else {
/* a specific instance was requested: report whether it is a delete marker, then unlink it */
8332 rgw_bucket_dir_entry dirent;
8333
8334 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
8335 if (r < 0) {
8336 return r;
8337 }
8338 result.delete_marker = dirent.is_delete_marker();
8339 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch);
8340 if (r < 0) {
8341 return r;
8342 }
8343 result.version_id = instance;
8344 }
8345
/* both versioned branches finish by adding a data log entry for the bucket shard */
8346 BucketShard *bs;
8347 int r = target->get_bucket_shard(&bs);
8348 if (r < 0) {
8349 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
8350 return r;
8351 }
8352
8353 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
8354 if (r < 0) {
8355 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
8356 return r;
8357 }
8358
8359 return 0;
8360 }
8361
/* ---- plain (non-versioned) path ---- */
8362 rgw_rados_ref ref;
8363 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
8364 if (r < 0) {
8365 return r;
8366 }
8367
8368 RGWObjState *state;
8369 r = target->get_state(&state, false);
8370 if (r < 0)
8371 return r;
8372
8373 ObjectWriteOperation op;
8374
/* If-Unmodified-Since: checked here against the loaded state and again inside the
 * write operation via cls_obj_check_mtime() */
8375 if (!real_clock::is_zero(params.unmod_since)) {
8376 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
8377 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
/* without high precision only whole seconds are compared */
8378 if (!params.high_precision_time) {
8379 ctime.tv_nsec = 0;
8380 unmod.tv_nsec = 0;
8381 }
8382
8383 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
8384 if (ctime > unmod) {
8385 return -ERR_PRECONDITION_FAILED;
8386 }
8387
8388 /* only delete object if mtime is less than or equal to params.unmod_since */
8389 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
8390 }
/* remembered now; passed to the quota update at the end */
8391 uint64_t obj_size = state->size;
8392
/* when an expiration time is given, the object's RGW_ATTR_DELETE_AT attribute must
 * exist and equal it, otherwise the precondition fails */
8393 if (!real_clock::is_zero(params.expiration_time)) {
8394 bufferlist bl;
8395 real_time delete_at;
8396
8397 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
8398 try {
8399 bufferlist::iterator iter = bl.begin();
8400 ::decode(delete_at, iter);
8401 } catch (buffer::error& err) {
8402 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
8403 return -EIO;
8404 }
8405
8406 if (params.expiration_time != delete_at) {
8407 return -ERR_PRECONDITION_FAILED;
8408 }
8409 } else {
8410 return -ERR_PRECONDITION_FAILED;
8411 }
8412 }
8413
8414 if (!state->exists) {
8415 target->invalidate_state();
8416 return -ENOENT;
8417 }
8418
8419 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true);
8420 if (r < 0)
8421 return r;
8422
8423 RGWBucketInfo& bucket_info = target->get_bucket_info();
8424
8425 RGWRados::Bucket bop(store, bucket_info);
8426 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8427
8428 index_op.set_bilog_flags(params.bilog_flags);
8429
8430
/* index update is prepared before the head object is touched, and completed or
 * cancelled afterwards depending on the outcome */
8431 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
8432 if (r < 0)
8433 return r;
8434
8435 store->remove_rgw_head_obj(op);
8436 r = ref.ioctx.operate(ref.oid, &op);
8437 bool need_invalidate = false;
8438 if (r == -ECANCELED) {
8439 /* raced with another operation, we can regard it as removed */
8440 need_invalidate = true;
8441 r = 0;
8442 }
/* captured before r is overwritten by complete_del() below */
8443 bool removed = (r >= 0);
8444
8445 int64_t poolid = ref.ioctx.get_id();
8446 if (r >= 0) {
/* record the removed object's state in the tombstone cache, when one is configured */
8447 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
8448 if (obj_tombstone_cache) {
8449 tombstone_entry entry{*state};
8450 obj_tombstone_cache->add(obj, entry);
8451 }
8452 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
8453 } else {
8454 int ret = index_op.cancel();
8455 if (ret < 0) {
8456 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
8457 }
8458 }
8459 if (removed) {
8460 int ret = target->complete_atomic_modification();
8461 if (ret < 0) {
8462 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
8463 }
8464 /* other than that, no need to propagate error */
8465 }
8466
8467 if (need_invalidate) {
8468 target->invalidate_state();
8469 }
8470
/* at this point r is the result of complete_del() or the error from the remove operation */
8471 if (r < 0)
8472 return r;
8473
8474 /* update quota cache */
/* NOTE(review): the -1 / 0 / obj_size arguments presumably are the object-count and size deltas -- confirm against update_stats() */
8475 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
8476
8477 return 0;
8478}
8479
8480int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
8481 const RGWBucketInfo& bucket_info,
8482 const rgw_obj& obj,
8483 int versioning_status,
8484 uint16_t bilog_flags,
8485 const real_time& expiration_time)
8486{
8487 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
8488 RGWRados::Object::Delete del_op(&del_target);
8489
8490 del_op.params.bucket_owner = bucket_info.owner;
8491 del_op.params.versioning_status = versioning_status;
8492 del_op.params.bilog_flags = bilog_flags;
8493 del_op.params.expiration_time = expiration_time;
8494
8495 return del_op.delete_obj();
8496}
8497
8498int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
8499{
8500 rgw_rados_ref ref;
8501 rgw_pool pool;
8502 int r = get_raw_obj_ref(obj, &ref, &pool);
8503 if (r < 0) {
8504 return r;
8505 }
8506
8507 ObjectWriteOperation op;
8508
8509 op.remove();
8510 r = ref.ioctx.operate(ref.oid, &op);
8511 if (r < 0)
8512 return r;
8513
8514 return 0;
8515}
8516
8517int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
8518{
8519 if (obj.empty()) {
8520 ldout(cct, 1) << "delete_system_obj got empty object name "
8521 << obj << ", returning EINVAL" << dendl;
8522 return -EINVAL;
8523 }
8524 rgw_rados_ref ref;
8525 rgw_pool pool;
8526 int r = get_raw_obj_ref(obj, &ref, &pool);
8527 if (r < 0) {
8528 return r;
8529 }
8530
8531 ObjectWriteOperation op;
8532
8533 if (objv_tracker) {
8534 objv_tracker->prepare_op_for_write(&op);
8535 }
8536
8537 op.remove();
8538 r = ref.ioctx.operate(ref.oid, &op);
8539 if (r < 0)
8540 return r;
8541
8542 return 0;
8543}
8544
8545int RGWRados::delete_obj_index(const rgw_obj& obj)
8546{
8547 std::string oid, key;
8548 get_obj_bucket_and_oid_loc(obj, oid, key);
8549
8550 RGWObjectCtx obj_ctx(this);
8551
8552 RGWBucketInfo bucket_info;
8553 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
8554 if (ret < 0) {
8555 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
8556 return ret;
8557 }
8558
8559 RGWRados::Bucket bop(this, bucket_info);
8560 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8561
8562 real_time removed_mtime;
8563 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
8564
8565 return r;
8566}
8567
8568static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
8569{
8570 string tag;
8571
8572 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
8573 if (mi != manifest.obj_end()) {
8574 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
8575 ++mi;
8576 tag = mi.get_location().get_raw_obj(store).oid;
8577 tag.append("_");
8578 }
8579
8580 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
8581 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
8582 MD5 hash;
8583 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
8584
8585 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
8586 if (iter != attrset.end()) {
8587 bufferlist& bl = iter->second;
8588 hash.Update((const byte *)bl.c_str(), bl.length());
8589 }
8590
8591 hash.Final(md5);
8592 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
8593 tag.append(md5_str);
8594
8595 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
8596
8597 tag_bl.append(tag.c_str(), tag.size() + 1);
8598}
8599
8600static bool is_olh(map<string, bufferlist>& attrs)
8601{
8602 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
8603 return (iter != attrs.end());
8604}
8605
8606static bool has_olh_tag(map<string, bufferlist>& attrs)
8607{
8608 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
8609 return (iter != attrs.end());
8610}
8611
8612int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8613 RGWObjState *olh_state, RGWObjState **target_state)
8614{
8615 assert(olh_state->is_olh);
8616
8617 rgw_obj target;
8618 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
8619 if (r < 0) {
8620 return r;
8621 }
8622 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
8623 if (r < 0) {
8624 return r;
8625 }
8626
8627 return 0;
8628}
8629
8630int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
8631{
8632 if (obj.empty()) {
8633 return -EINVAL;
8634 }
8635
8636 RGWRawObjState *s = rctx->raw.get_state(obj);
8637 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
8638 *state = s;
8639 if (s->has_attrs) {
8640 return 0;
8641 }
8642
8643 s->obj = obj;
8644
8645 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
8646 if (r == -ENOENT) {
8647 s->exists = false;
8648 s->has_attrs = true;
8649 s->mtime = real_time();
8650 return 0;
8651 }
8652 if (r < 0)
8653 return r;
8654
8655 s->exists = true;
8656 s->has_attrs = true;
8657 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
8658
8659 if (s->obj_tag.length())
8660 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
8661 << string(s->obj_tag.c_str(), s->obj_tag.length()) << dendl;
8662 else
8663 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
8664
8665 return 0;
8666}
8667
8668int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
8669{
8670 int ret;
8671
8672 do {
8673 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
8674 } while (ret == -EAGAIN);
8675
8676 return ret;
8677}
8678
8679int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8680 RGWObjState **state, bool follow_olh, bool assume_noent)
8681{
8682 if (obj.empty()) {
8683 return -EINVAL;
8684 }
8685
8686 bool need_follow_olh = follow_olh && obj.key.instance.empty();
8687
8688 RGWObjState *s = rctx->obj.get_state(obj);
8689 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
8690 *state = s;
8691 if (s->has_attrs) {
8692 if (s->is_olh && need_follow_olh) {
8693 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
8694 }
8695 return 0;
8696 }
8697
8698 s->obj = obj;
8699
8700 rgw_raw_obj raw_obj;
8701 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
8702
8703 int r = -ENOENT;
8704
8705 if (!assume_noent) {
8706 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
8707 }
8708
8709 if (r == -ENOENT) {
8710 s->exists = false;
8711 s->has_attrs = true;
8712 tombstone_entry entry;
8713 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
8714 s->mtime = entry.mtime;
8715 s->zone_short_id = entry.zone_short_id;
8716 s->pg_ver = entry.pg_ver;
8717 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
8718 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
8719 } else {
8720 s->mtime = real_time();
8721 }
8722 return 0;
8723 }
8724 if (r < 0)
8725 return r;
8726
8727 s->exists = true;
8728 s->has_attrs = true;
8729 s->accounted_size = s->size;
8730
8731 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
8732 if (iter != s->attrset.end()) {
8733 // use uncompressed size for accounted_size
8734 try {
8735 RGWCompressionInfo info;
8736 auto p = iter->second.begin();
8737 ::decode(info, p);
8738 s->accounted_size = info.orig_size;
8739 } catch (buffer::error&) {
8740 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
8741 return -EIO;
8742 }
8743 }
8744
8745 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
8746 if (iter != s->attrset.end()) {
8747 bufferlist bl = iter->second;
8748 bufferlist::iterator it = bl.begin();
8749 it.copy(bl.length(), s->shadow_obj);
8750 s->shadow_obj[bl.length()] = '\0';
8751 }
8752 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
8753
8754 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
8755 if (manifest_bl.length()) {
8756 bufferlist::iterator miter = manifest_bl.begin();
8757 try {
8758 ::decode(s->manifest, miter);
8759 s->has_manifest = true;
8760 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
8761 broken due to old bugs */
8762 s->size = s->manifest.get_obj_size();
8763 } catch (buffer::error& err) {
8764 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
8765 return -EIO;
8766 }
8767 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
8768 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
8769 RGWObjManifest::obj_iterator mi;
8770 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
8771 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
8772 }
8773 }
8774
8775 if (!s->obj_tag.length()) {
8776 /*
8777 * Uh oh, something's wrong, object with manifest should have tag. Let's
8778 * create one out of the manifest, would be unique
8779 */
8780 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
8781 s->fake_tag = true;
8782 }
8783 }
8784 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
8785 if (aiter != s->attrset.end()) {
8786 bufferlist& pg_ver_bl = aiter->second;
8787 if (pg_ver_bl.length()) {
8788 bufferlist::iterator pgbl = pg_ver_bl.begin();
8789 try {
8790 ::decode(s->pg_ver, pgbl);
8791 } catch (buffer::error& err) {
8792 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
8793 }
8794 }
8795 }
8796 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
8797 if (aiter != s->attrset.end()) {
8798 bufferlist& zone_short_id_bl = aiter->second;
8799 if (zone_short_id_bl.length()) {
8800 bufferlist::iterator zbl = zone_short_id_bl.begin();
8801 try {
8802 ::decode(s->zone_short_id, zbl);
8803 } catch (buffer::error& err) {
8804 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
8805 }
8806 }
8807 }
8808 if (s->obj_tag.length())
8809 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << string(s->obj_tag.c_str(), s->obj_tag.length()) << dendl;
8810 else
8811 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
8812
8813 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
8814 * it exist, and not only if is_olh() returns true
8815 */
8816 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
8817 if (iter != s->attrset.end()) {
8818 s->olh_tag = iter->second;
8819 }
8820
8821 if (is_olh(s->attrset)) {
8822 s->is_olh = true;
8823
8824 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
8825
8826 if (need_follow_olh) {
8827 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
8828 }
8829 }
8830
8831 return 0;
8832}
8833
8834int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
8835 bool follow_olh, bool assume_noent)
8836{
8837 int ret;
8838
8839 do {
8840 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
8841 } while (ret == -EAGAIN);
8842
8843 return ret;
8844}
8845
8846int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
8847{
8848 RGWObjState *astate;
8849 int r = get_state(&astate, true);
8850 if (r < 0) {
8851 return r;
8852 }
8853
8854 *pmanifest = &astate->manifest;
8855
8856 return 0;
8857}
8858
8859int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
8860{
8861 RGWObjState *state;
8862 int r = source->get_state(&state, true);
8863 if (r < 0)
8864 return r;
8865 if (!state->exists)
8866 return -ENOENT;
8867 if (!state->get_attr(name, dest))
8868 return -ENODATA;
8869
8870 return 0;
8871}
8872
8873
8874int RGWRados::Object::Stat::stat_async()
8875{
8876 RGWObjectCtx& ctx = source->get_ctx();
8877 rgw_obj& obj = source->get_obj();
8878 RGWRados *store = source->get_store();
8879
8880 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
8881 result.obj = obj;
8882 if (s->has_attrs) {
8883 state.ret = 0;
8884 result.size = s->size;
8885 result.mtime = ceph::real_clock::to_timespec(s->mtime);
8886 result.attrs = s->attrset;
8887 result.has_manifest = s->has_manifest;
8888 result.manifest = s->manifest;
8889 return 0;
8890 }
8891
8892 string oid;
8893 string loc;
8894 get_obj_bucket_and_oid_loc(obj, oid, loc);
8895
8896 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
8897 if (r < 0) {
8898 return r;
8899 }
8900
8901 librados::ObjectReadOperation op;
8902 op.stat2(&result.size, &result.mtime, NULL);
8903 op.getxattrs(&result.attrs, NULL);
8904 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
8905 state.io_ctx.locator_set_key(loc);
8906 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
8907 if (r < 0) {
8908 ldout(store->ctx(), 5) << __func__
8909 << ": ERROR: aio_operate() returned ret=" << r
8910 << dendl;
8911 return r;
8912 }
8913
8914 return 0;
8915}
8916
8917
8918int RGWRados::Object::Stat::wait()
8919{
8920 if (!state.completion) {
8921 return state.ret;
8922 }
8923
8924 state.completion->wait_for_safe();
8925 state.ret = state.completion->get_return_value();
8926 state.completion->release();
8927
8928 if (state.ret != 0) {
8929 return state.ret;
8930 }
8931
8932 return finish();
8933}
8934
8935int RGWRados::Object::Stat::finish()
8936{
8937 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
8938 if (iter != result.attrs.end()) {
8939 bufferlist& bl = iter->second;
8940 bufferlist::iterator biter = bl.begin();
8941 try {
8942 ::decode(result.manifest, biter);
8943 } catch (buffer::error& err) {
8944 RGWRados *store = source->get_store();
8945 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
8946 return -EIO;
8947 }
8948 result.has_manifest = true;
8949 }
8950
8951 return 0;
8952}
8953
8954/**
8955 * Get the attributes for an object.
8956 * bucket: name of the bucket holding the object.
8957 * obj: name of the object
8958 * name: name of the attr to retrieve
8959 * dest: bufferlist to store the result in
8960 * Returns: 0 on success, -ERR# otherwise.
8961 */
8962int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
8963{
8964 rgw_rados_ref ref;
8965 rgw_pool pool;
8966 int r = get_system_obj_ref(obj, &ref, &pool);
8967 if (r < 0) {
8968 return r;
8969 }
8970
8971 ObjectReadOperation op;
8972
8973 int rval;
8974 op.getxattr(name, &dest, &rval);
8975
8976 r = ref.ioctx.operate(ref.oid, &op, NULL);
8977 if (r < 0)
8978 return r;
8979
8980 return 0;
8981}
8982
8983int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
8984 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8985 ObjectOperation& op, RGWObjState **pstate)
8986{
8987 if (!rctx)
8988 return 0;
8989
8990 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
8991 if (r < 0)
8992 return r;
8993
8994 RGWObjState *state = *pstate;
8995
8996 if (!state->is_atomic) {
8997 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
8998 return 0;
8999 }
9000
9001 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9002 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9003 } else {
9004 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9005 }
9006 return 0;
9007}
9008
9009int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9010{
9011 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9012}
9013
9014void RGWRados::Object::invalidate_state()
9015{
9016 ctx.obj.invalidate(obj);
9017}
9018
9019void RGWRados::SystemObject::invalidate_state()
9020{
9021 ctx.raw.invalidate(obj);
9022}
9023
9024int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9025 const char *if_match, const char *if_nomatch, bool removal_op)
9026{
9027 int r = get_state(&state, false);
9028 if (r < 0)
9029 return r;
9030
9031 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9032 if_match != NULL || if_nomatch != NULL) &&
9033 (!state->fake_tag);
9034
9035 if (!state->is_atomic) {
9036 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9037
9038 if (reset_obj) {
9039 op.create(false);
9040 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9041 }
9042
9043 return 0;
9044 }
9045
9046 if (need_guard) {
9047 /* first verify that the object wasn't replaced under */
9048 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9049 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9050 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9051 }
9052
9053 if (if_match) {
9054 if (strcmp(if_match, "*") == 0) {
9055 // test the object is existing
9056 if (!state->exists) {
9057 return -ERR_PRECONDITION_FAILED;
9058 }
9059 } else {
9060 bufferlist bl;
9061 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9062 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9063 return -ERR_PRECONDITION_FAILED;
9064 }
9065 }
9066 }
9067
9068 if (if_nomatch) {
9069 if (strcmp(if_nomatch, "*") == 0) {
9070 // test the object is NOT existing
9071 if (state->exists) {
9072 return -ERR_PRECONDITION_FAILED;
9073 }
9074 } else {
9075 bufferlist bl;
9076 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9077 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9078 return -ERR_PRECONDITION_FAILED;
9079 }
9080 }
9081 }
9082 }
9083
9084 if (reset_obj) {
9085 if (state->exists) {
9086 op.create(false);
9087 store->remove_rgw_head_obj(op);
9088 } else {
9089 op.create(true);
9090 }
9091 }
9092
9093 if (removal_op) {
9094 /* the object is being removed, no need to update its tag */
9095 return 0;
9096 }
9097
9098 if (ptag) {
9099 state->write_tag = *ptag;
9100 } else {
9101 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9102 }
9103 bufferlist bl;
9104 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9105
9106 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9107
9108 op.setxattr(RGW_ATTR_ID_TAG, bl);
9109
9110 return 0;
9111}
9112
9113int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9114 RGWObjVersionTracker *objv_tracker)
9115{
9116 map<string, bufferlist> attrs;
9117 attrs[name] = bl;
9118 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9119}
9120
9121int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9122 map<string, bufferlist>& attrs,
9123 map<string, bufferlist>* rmattrs,
9124 RGWObjVersionTracker *objv_tracker)
9125{
9126 rgw_rados_ref ref;
9127 rgw_pool pool;
9128 int r = get_system_obj_ref(obj, &ref, &pool);
9129 if (r < 0) {
9130 return r;
9131 }
9132 ObjectWriteOperation op;
9133
9134 if (objv_tracker) {
9135 objv_tracker->prepare_op_for_write(&op);
9136 }
9137
9138 map<string, bufferlist>::iterator iter;
9139 if (rmattrs) {
9140 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9141 const string& name = iter->first;
9142 op.rmxattr(name.c_str());
9143 }
9144 }
9145
9146 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9147 const string& name = iter->first;
9148 bufferlist& bl = iter->second;
9149
9150 if (!bl.length())
9151 continue;
9152
9153 op.setxattr(name.c_str(), bl);
9154 }
9155
9156 if (!op.size())
9157 return 0;
9158
9159 bufferlist bl;
9160
9161 r = ref.ioctx.operate(ref.oid, &op);
9162 if (r < 0)
9163 return r;
9164
9165 return 0;
9166}
9167
9168/**
9169 * Set an attr on an object.
9170 * bucket: name of the bucket holding the object
9171 * obj: name of the object to set the attr on
9172 * name: the attr to set
9173 * bl: the contents of the attr
9174 * Returns: 0 on success, -ERR# otherwise.
9175 */
9176int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9177{
9178 map<string, bufferlist> attrs;
9179 attrs[name] = bl;
9180 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9181}
9182
9183int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9184 map<string, bufferlist>& attrs,
9185 map<string, bufferlist>* rmattrs)
9186{
9187 rgw_rados_ref ref;
9188 int r = get_obj_head_ref(bucket_info, obj, &ref);
9189 if (r < 0) {
9190 return r;
9191 }
9192 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9193
9194 ObjectWriteOperation op;
9195 RGWObjState *state = NULL;
9196
9197 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9198 if (r < 0)
9199 return r;
9200
9201 map<string, bufferlist>::iterator iter;
9202 if (rmattrs) {
9203 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9204 const string& name = iter->first;
9205 op.rmxattr(name.c_str());
9206 }
9207 }
9208
9209 const rgw_bucket& bucket = obj.bucket;
9210
9211 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9212 const string& name = iter->first;
9213 bufferlist& bl = iter->second;
9214
9215 if (!bl.length())
9216 continue;
9217
9218 op.setxattr(name.c_str(), bl);
9219
9220 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9221 real_time ts;
9222 try {
9223 ::decode(ts, bl);
9224
9225 rgw_obj_index_key obj_key;
9226 obj.key.get_index_key(&obj_key);
9227
9228 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9229 } catch (buffer::error& err) {
9230 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9231 }
9232 }
9233 }
9234
9235 if (!op.size())
9236 return 0;
9237
9238 RGWObjectCtx obj_ctx(this);
9239
9240 bufferlist bl;
9241 RGWRados::Bucket bop(this, bucket_info);
9242 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9243
9244 if (state) {
9245 string tag;
9246 append_rand_alpha(cct, tag, tag, 32);
9247 state->write_tag = tag;
9248 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9249
9250 if (r < 0)
9251 return r;
9252
9253 bl.append(tag.c_str(), tag.size() + 1);
9254
9255 op.setxattr(RGW_ATTR_ID_TAG, bl);
9256 }
9257
9258 r = ref.ioctx.operate(ref.oid, &op);
9259 if (state) {
9260 if (r >= 0) {
9261 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9262 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9263 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9264 string etag(etag_bl.c_str(), etag_bl.length());
9265 string content_type(content_type_bl.c_str(), content_type_bl.length());
9266 uint64_t epoch = ref.ioctx.get_last_version();
9267 int64_t poolid = ref.ioctx.get_id();
9268 real_time mtime = real_clock::now();
9269 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9270 mtime, etag, content_type, &acl_bl,
9271 RGW_OBJ_CATEGORY_MAIN, NULL);
9272 } else {
9273 int ret = index_op.cancel();
9274 if (ret < 0) {
9275 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9276 }
9277 }
9278 }
9279 if (r < 0)
9280 return r;
9281
9282 if (state) {
9283 state->obj_tag.swap(bl);
9284 if (rmattrs) {
9285 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9286 state->attrset.erase(iter->first);
9287 }
9288 }
9289 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9290 state->attrset[iter->first] = iter->second;
9291 }
9292 }
9293
9294 return 0;
9295}
9296
9297/**
9298 * Get data about an object out of RADOS and into memory.
9299 * bucket: name of the bucket the object is in.
9300 * obj: name/key of the object to read
9301 * data: if get_data==true, this pointer will be set
9302 * to an address containing the object's data/value
9303 * attrs: if non-NULL, the pointed-to map will contain
9304 * all the attrs of the object when this function returns
9305 * mod_ptr: if non-NULL, compares the object's mtime to *mod_ptr,
9306 * and if mtime is smaller it fails.
9307 * unmod_ptr: if non-NULL, compares the object's mtime to *unmod_ptr,
9308 * and if mtime is >= it fails.
9309 * if_match/nomatch: if non-NULL, compares the object's etag attr
9310 * to the string and, if it doesn't/does match, fails out.
9311 * get_data: if true, the object's data/value will be read out, otherwise not
9312 * err: Many errors will result in this structure being filled
9313 * with extra informatin on the error.
9314 * Returns: -ERR# on failure, otherwise
9315 * (if get_data==true) length of read data,
9316 * (if get_data==false) length of the object
9317 */
9318// P3 XXX get_data is not seen used anywhere.
9319int RGWRados::Object::Read::prepare()
9320{
9321 RGWRados *store = source->get_store();
9322 CephContext *cct = store->ctx();
9323
9324 bufferlist etag;
9325
9326 map<string, bufferlist>::iterator iter;
9327
9328 RGWObjState *astate;
9329 int r = source->get_state(&astate, true);
9330 if (r < 0)
9331 return r;
9332
9333 if (!astate->exists) {
9334 return -ENOENT;
9335 }
9336
9337 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9338
9339 state.obj = astate->obj;
9340 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9341
9342 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9343 if (r < 0) {
9344 return r;
9345 }
9346 if (params.attrs) {
9347 *params.attrs = astate->attrset;
9348 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9349 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9350 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9351 }
9352 }
9353 }
9354
9355 /* Convert all times go GMT to make them compatible */
9356 if (conds.mod_ptr || conds.unmod_ptr) {
9357 obj_time_weight src_weight;
9358 src_weight.init(astate);
9359 src_weight.high_precision = conds.high_precision_time;
9360
9361 obj_time_weight dest_weight;
9362 dest_weight.high_precision = conds.high_precision_time;
9363
9364 if (conds.mod_ptr) {
9365 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9366 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9367 if (!(dest_weight < src_weight)) {
9368 return -ERR_NOT_MODIFIED;
9369 }
9370 }
9371
9372 if (conds.unmod_ptr) {
9373 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9374 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9375 if (dest_weight < src_weight) {
9376 return -ERR_PRECONDITION_FAILED;
9377 }
9378 }
9379 }
9380 if (conds.if_match || conds.if_nomatch) {
9381 r = get_attr(RGW_ATTR_ETAG, etag);
9382 if (r < 0)
9383 return r;
9384
9385 if (conds.if_match) {
9386 string if_match_str = rgw_string_unquote(conds.if_match);
9387 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9388 if (if_match_str.compare(etag.c_str()) != 0) {
9389 return -ERR_PRECONDITION_FAILED;
9390 }
9391 }
9392
9393 if (conds.if_nomatch) {
9394 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9395 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9396 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9397 return -ERR_NOT_MODIFIED;
9398 }
9399 }
9400 }
9401
9402 if (params.obj_size)
9403 *params.obj_size = astate->size;
9404 if (params.lastmod)
9405 *params.lastmod = astate->mtime;
9406
9407 return 0;
9408}
9409
9410int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9411{
9412 if (ofs < 0) {
9413 ofs += obj_size;
9414 if (ofs < 0)
9415 ofs = 0;
9416 end = obj_size - 1;
9417 } else if (end < 0) {
9418 end = obj_size - 1;
9419 }
9420
9421 if (obj_size > 0) {
9422 if (ofs >= (off_t)obj_size) {
9423 return -ERANGE;
9424 }
9425 if (end >= (off_t)obj_size) {
9426 end = obj_size - 1;
9427 }
9428 }
9429 return 0;
9430}
9431
9432int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9433{
9434 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9435}
9436
9437int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9438 RGWRados::SystemObject::Read::GetObjState& state,
9439 rgw_raw_obj& obj,
9440 map<string, bufferlist> *attrs,
9441 real_time *lastmod,
9442 uint64_t *obj_size,
9443 RGWObjVersionTracker *objv_tracker)
9444{
9445 RGWRawObjState *astate = NULL;
9446
9447 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
9448 if (r < 0)
9449 return r;
9450
9451 if (!astate->exists) {
9452 return -ENOENT;
9453 }
9454
9455 if (attrs) {
9456 *attrs = astate->attrset;
9457 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9458 map<string, bufferlist>::iterator iter;
9459 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
9460 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9461 }
9462 }
9463 }
9464
9465 if (obj_size)
9466 *obj_size = astate->size;
9467 if (lastmod)
9468 *lastmod = astate->mtime;
9469
9470 return 0;
9471}
9472
9473int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
9474{
9475 RGWRados *store = source->get_store();
9476 rgw_raw_obj& obj = source->get_obj();
9477
9478 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
9479 stat_params.lastmod, stat_params.obj_size, objv_tracker);
9480}
9481
9482int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
9483{
9484 if (blind) {
9485 return 0;
9486 }
9487 RGWRados *store = target->get_store();
9488 BucketShard *bs;
9489 int ret = get_bucket_shard(&bs);
9490 if (ret < 0) {
9491 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9492 return ret;
9493 }
9494
9495 if (write_tag && write_tag->length()) {
9496 optag = string(write_tag->c_str(), write_tag->length());
9497 } else {
9498 if (optag.empty()) {
9499 append_rand_alpha(store->ctx(), optag, optag, 32);
9500 }
9501 }
9502
9503 int r = store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags);
9504 if (r < 0) {
9505 return r;
9506 }
9507 prepared = true;
9508 return 0;
9509}
9510
9511int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
9512 uint64_t size, uint64_t accounted_size,
9513 ceph::real_time& ut, const string& etag,
9514 const string& content_type,
9515 bufferlist *acl_bl,
9516 RGWObjCategory category,
9517 list<rgw_obj_index_key> *remove_objs, const string *user_data)
9518{
9519 if (blind) {
9520 return 0;
9521 }
9522 RGWRados *store = target->get_store();
9523 BucketShard *bs;
9524 int ret = get_bucket_shard(&bs);
9525 if (ret < 0) {
9526 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9527 return ret;
9528 }
9529
9530 rgw_bucket_dir_entry ent;
9531 obj.key.get_index_key(&ent.key);
9532 ent.meta.size = size;
9533 ent.meta.accounted_size = accounted_size;
9534 ent.meta.mtime = ut;
9535 ent.meta.etag = etag;
9536 if (user_data)
9537 ent.meta.user_data = *user_data;
9538
9539 ACLOwner owner;
9540 if (acl_bl && acl_bl->length()) {
9541 int ret = store->decode_policy(*acl_bl, &owner);
9542 if (ret < 0) {
9543 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
9544 }
9545 }
9546 ent.meta.owner = owner.get_id().to_str();
9547 ent.meta.owner_display_name = owner.get_display_name();
9548 ent.meta.content_type = content_type;
9549
9550 ret = store->cls_obj_complete_add(*bs, optag, poolid, epoch, ent, category, remove_objs, bilog_flags);
9551
9552 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9553 if (r < 0) {
9554 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9555 }
9556
9557 return ret;
9558}
9559
9560int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
9561 real_time& removed_mtime,
9562 list<rgw_obj_index_key> *remove_objs)
9563{
9564 if (blind) {
9565 return 0;
9566 }
9567 RGWRados *store = target->get_store();
9568 BucketShard *bs;
9569 int ret = get_bucket_shard(&bs);
9570 if (ret < 0) {
9571 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9572 return ret;
9573 }
9574
9575 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags);
9576
9577 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9578 if (r < 0) {
9579 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9580 }
9581
9582 return ret;
9583}
9584
9585
9586int RGWRados::Bucket::UpdateIndex::cancel()
9587{
9588 if (blind) {
9589 return 0;
9590 }
9591 RGWRados *store = target->get_store();
9592 BucketShard *bs;
9593 int ret = get_bucket_shard(&bs);
9594 if (ret < 0) {
9595 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9596 return ret;
9597 }
9598
9599 ret = store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags);
9600
9601 /*
9602 * need to update data log anyhow, so that whoever follows needs to update its internal markers
9603 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
9604 * have no way to tell that they're all caught up
9605 */
9606 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9607 if (r < 0) {
9608 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9609 }
9610
9611 return ret;
9612}
9613
9614int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
9615{
9616 RGWRados *store = source->get_store();
9617 CephContext *cct = store->ctx();
9618
9619 std::string oid, key;
9620 rgw_raw_obj read_obj;
9621 uint64_t read_ofs = ofs;
9622 uint64_t len, read_len;
9623 bool reading_from_head = true;
9624 ObjectReadOperation op;
9625
9626 bool merge_bl = false;
9627 bufferlist *pbl = &bl;
9628 bufferlist read_bl;
9629 uint64_t max_chunk_size;
9630
9631 RGWObjState *astate;
9632 int r = source->get_state(&astate, true);
9633 if (r < 0)
9634 return r;
9635
9636 if (end < 0)
9637 len = 0;
9638 else
9639 len = end - ofs + 1;
9640
9641 if (astate->has_manifest && astate->manifest.has_tail()) {
9642 /* now get the relevant object part */
9643 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
9644
9645 uint64_t stripe_ofs = iter.get_stripe_ofs();
9646 read_obj = iter.get_location().get_raw_obj(store);
9647 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
9648 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
9649 reading_from_head = (read_obj == state.head_obj);
9650 } else {
9651 read_obj = state.head_obj;
9652 }
9653
9654 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
9655 if (r < 0) {
9656 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
9657 return r;
9658 }
9659
9660 if (len > max_chunk_size)
9661 len = max_chunk_size;
9662
9663
9664 state.io_ctx.locator_set_key(read_obj.loc);
9665
9666 read_len = len;
9667
9668 if (reading_from_head) {
9669 /* only when reading from the head object do we need to do the atomic test */
9670 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
9671 if (r < 0)
9672 return r;
9673
9674 if (astate && astate->prefetch_data) {
9675 if (!ofs && astate->data.length() >= len) {
9676 bl = astate->data;
9677 return bl.length();
9678 }
9679
9680 if (ofs < astate->data.length()) {
9681 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
9682 astate->data.copy(ofs, copy_len, bl);
9683 read_len -= copy_len;
9684 read_ofs += copy_len;
9685 if (!read_len)
9686 return bl.length();
9687
9688 merge_bl = true;
9689 pbl = &read_bl;
9690 }
9691 }
9692 }
9693
9694 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
9695 op.read(read_ofs, read_len, pbl, NULL);
9696
9697 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
9698 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
9699
9700 if (r < 0) {
9701 return r;
9702 }
9703
9704 if (merge_bl) {
9705 bl.append(read_bl);
9706 }
9707
9708 return bl.length();
9709}
9710
9711int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
9712{
9713 if (!has_ref) {
9714 rgw_pool pool;
9715 int r = store->get_raw_obj_ref(obj, &ref, &pool);
9716 if (r < 0) {
9717 return r;
9718 }
9719 has_ref = true;
9720 }
9721 *pref = &ref;
9722 return 0;
9723
9724}
9725
9726int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
9727 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
9728 bufferlist& bl, off_t ofs, off_t end,
9729 map<string, bufferlist> *attrs,
9730 rgw_cache_entry_info *cache_info)
9731{
9732 uint64_t len;
9733 ObjectReadOperation op;
9734
9735 if (end < 0)
9736 len = 0;
9737 else
9738 len = end - ofs + 1;
9739
9740 if (objv_tracker) {
9741 objv_tracker->prepare_op_for_read(&op);
9742 }
9743
9744 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
9745 op.read(ofs, len, &bl, NULL);
9746
9747 if (attrs) {
9748 op.getxattrs(attrs, NULL);
9749 }
9750
9751 rgw_rados_ref *ref;
9752 int r = read_state.get_ref(this, obj, &ref);
9753 if (r < 0) {
9754 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
9755 return r;
9756 }
9757 r = ref->ioctx.operate(ref->oid, &op, NULL);
9758 if (r < 0) {
9759 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
9760 return r;
9761 }
9762 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
9763
9764 uint64_t op_ver = ref->ioctx.get_last_version();
9765
9766 if (read_state.last_ver > 0 &&
9767 read_state.last_ver != op_ver) {
9768 ldout(cct, 5) << "raced with an object write, abort" << dendl;
9769 return -ECANCELED;
9770 }
9771
9772 read_state.last_ver = op_ver;
9773
9774 return bl.length();
9775}
9776
9777int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl, RGWObjVersionTracker *objv_tracker)
9778{
9779 RGWRados *store = source->get_store();
9780 rgw_raw_obj& obj = source->get_obj();
9781
9782 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl, ofs, end, read_params.attrs, read_params.cache_info);
9783}
9784
9785int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
9786{
9787 RGWRados *store = source->get_store();
9788 rgw_raw_obj& obj = source->get_obj();
9789
9790 return store->system_obj_get_attr(obj, name, dest);
9791}
9792
struct get_obj_data;

/* Per-request record handed to the aio completion callback as its argument. */
struct get_obj_aio_data {
  get_obj_data *op_data;  /* the read operation this request belongs to */
  off_t ofs;              /* offset the request was registered under */
  off_t len;              /* length of the request */
};
9800
9801struct get_obj_io {
9802 off_t len;
9803 bufferlist bl;
9804};
9805
9806static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
9807
9808struct get_obj_data : public RefCountedObject {
9809 CephContext *cct;
9810 RGWRados *rados;
9811 RGWObjectCtx *ctx;
9812 IoCtx io_ctx;
9813 map<off_t, get_obj_io> io_map;
9814 map<off_t, librados::AioCompletion *> completion_map;
9815 uint64_t total_read;
9816 Mutex lock;
9817 Mutex data_lock;
9818 list<get_obj_aio_data> aio_data;
9819 RGWGetDataCB *client_cb;
9820 std::atomic<bool> cancelled = { false };
9821 std::atomic<int64_t> err_code = { 0 };
9822 Throttle throttle;
9823 list<bufferlist> read_list;
9824
9825 explicit get_obj_data(CephContext *_cct)
9826 : cct(_cct),
9827 rados(NULL), ctx(NULL),
9828 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
9829 client_cb(NULL),
9830 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
9831 ~get_obj_data() override { }
9832 void set_cancelled(int r) {
9833 cancelled = true;
9834 err_code = r;
9835 }
9836
9837 bool is_cancelled() {
9838 return cancelled;
9839 }
9840
9841 int get_err_code() {
9842 return err_code;
9843 }
9844
9845 int wait_next_io(bool *done) {
9846 lock.Lock();
9847 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
9848 if (iter == completion_map.end()) {
9849 *done = true;
9850 lock.Unlock();
9851 return 0;
9852 }
9853 off_t cur_ofs = iter->first;
9854 librados::AioCompletion *c = iter->second;
9855 lock.Unlock();
9856
9857 c->wait_for_safe_and_cb();
9858 int r = c->get_return_value();
9859
9860 lock.Lock();
9861 completion_map.erase(cur_ofs);
9862
9863 if (completion_map.empty()) {
9864 *done = true;
9865 }
9866 lock.Unlock();
9867
9868 c->release();
9869
9870 return r;
9871 }
9872
9873 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
9874 Mutex::Locker l(lock);
9875
9876 const auto& io_iter = io_map.insert(
9877 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
9878
9879 assert(io_iter.second); // assert new insertion
9880
9881 get_obj_io& io = (io_iter.first)->second;
9882 *pbl = &io.bl;
9883
9884 struct get_obj_aio_data aio;
9885 aio.ofs = ofs;
9886 aio.len = len;
9887 aio.op_data = this;
9888
9889 aio_data.push_back(aio);
9890
9891 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
9892
9893 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
9894 completion_map[ofs] = c;
9895
9896 *pc = c;
9897
9898 /* we have a reference per IO, plus one reference for the calling function.
9899 * reference is dropped for each callback, plus when we're done iterating
9900 * over the parts */
9901 get();
9902 }
9903
9904 void cancel_io(off_t ofs) {
9905 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
9906 lock.Lock();
9907 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
9908 if (iter != completion_map.end()) {
9909 AioCompletion *c = iter->second;
9910 c->release();
9911 completion_map.erase(ofs);
9912 io_map.erase(ofs);
9913 }
9914 lock.Unlock();
9915
9916 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
9917 * need IoCtx to live, as io callback may still be called
9918 */
9919 }
9920
9921 void cancel_all_io() {
9922 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
9923 Mutex::Locker l(lock);
9924 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
9925 iter != completion_map.end(); ++iter) {
9926 librados::AioCompletion *c = iter->second;
9927 c->release();
9928 }
9929 }
9930
9931 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
9932 Mutex::Locker l(lock);
9933
9934 map<off_t, get_obj_io>::iterator liter = io_map.begin();
9935
9936 if (liter == io_map.end() ||
9937 liter->first != ofs) {
9938 return 0;
9939 }
9940
9941 map<off_t, librados::AioCompletion *>::iterator aiter;
9942 aiter = completion_map.find(ofs);
9943 if (aiter == completion_map.end()) {
9944 /* completion map does not hold this io, it was cancelled */
9945 return 0;
9946 }
9947
9948 AioCompletion *completion = aiter->second;
9949 int r = completion->get_return_value();
9950 if (r < 0)
9951 return r;
9952
9953 for (; aiter != completion_map.end(); ++aiter) {
9954 completion = aiter->second;
9955 if (!completion->is_safe()) {
9956 /* reached a request that is not yet complete, stop */
9957 break;
9958 }
9959
9960 r = completion->get_return_value();
9961 if (r < 0) {
9962 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
9963 return r;
9964 }
9965
9966 total_read += r;
9967
9968 map<off_t, get_obj_io>::iterator old_liter = liter++;
9969 bl_list.push_back(old_liter->second.bl);
9970 io_map.erase(old_liter);
9971 }
9972
9973 return 0;
9974 }
9975};
9976
9977static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
9978{
9979 struct get_obj_data *d = (struct get_obj_data *)arg;
9980
9981 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
9982}
9983
9984static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
9985{
9986 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
9987 struct get_obj_data *d = aio_data->op_data;
9988
9989 d->rados->get_obj_aio_completion_cb(cb, arg);
9990}
9991
9992
9993void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
9994{
9995 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
9996 struct get_obj_data *d = aio_data->op_data;
9997 off_t ofs = aio_data->ofs;
9998 off_t len = aio_data->len;
9999
10000 list<bufferlist> bl_list;
10001 list<bufferlist>::iterator iter;
10002 int r;
10003
10004 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10005 d->throttle.put(len);
10006
10007 r = rados_aio_get_return_value(c);
10008 if (r < 0) {
10009 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10010 d->set_cancelled(r);
10011 goto done;
10012 }
10013
10014 if (d->is_cancelled()) {
10015 goto done;
10016 }
10017
10018 d->data_lock.Lock();
10019
10020 r = d->get_complete_ios(ofs, bl_list);
10021 if (r < 0) {
10022 goto done_unlock;
10023 }
10024
10025 d->read_list.splice(d->read_list.end(), bl_list);
10026
10027done_unlock:
10028 d->data_lock.Unlock();
10029done:
10030 d->put();
10031 return;
10032}
10033
10034int RGWRados::flush_read_list(struct get_obj_data *d)
10035{
10036 d->data_lock.Lock();
10037 list<bufferlist> l;
10038 l.swap(d->read_list);
10039 d->get();
10040 d->read_list.clear();
10041
10042 d->data_lock.Unlock();
10043
10044 int r = 0;
10045
10046 list<bufferlist>::iterator iter;
10047 for (iter = l.begin(); iter != l.end(); ++iter) {
10048 bufferlist& bl = *iter;
10049 r = d->client_cb->handle_data(bl, 0, bl.length());
10050 if (r < 0) {
10051 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10052 break;
10053 }
10054 }
10055
10056 d->data_lock.Lock();
10057 d->put();
10058 if (r < 0) {
10059 d->set_cancelled(r);
10060 }
10061 d->data_lock.Unlock();
10062 return r;
10063}
10064
10065int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10066 const RGWBucketInfo& bucket_info,
10067 const rgw_obj& obj,
10068 const rgw_raw_obj& read_obj,
10069 off_t obj_ofs,
10070 off_t read_ofs, off_t len,
10071 bool is_head_obj, void *arg)
10072{
10073 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10074 ObjectReadOperation op;
10075 struct get_obj_data *d = (struct get_obj_data *)arg;
10076 string oid, key;
10077 bufferlist *pbl;
10078 AioCompletion *c;
10079
10080 int r;
10081
10082 if (is_head_obj) {
10083 /* only when reading from the head object do we need to do the atomic test */
10084 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10085 if (r < 0)
10086 return r;
10087
10088 if (astate &&
10089 obj_ofs < astate->data.length()) {
10090 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10091
10092 d->data_lock.Lock();
10093 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10094 d->data_lock.Unlock();
10095 if (r < 0)
10096 return r;
10097
10098 d->lock.Lock();
10099 d->total_read += chunk_len;
10100 d->lock.Unlock();
10101
10102 len -= chunk_len;
10103 read_ofs += chunk_len;
10104 obj_ofs += chunk_len;
10105 if (!len)
10106 return 0;
10107 }
10108 }
10109
10110 d->throttle.get(len);
10111 if (d->is_cancelled()) {
10112 return d->get_err_code();
10113 }
10114
10115 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10116 * cleaning up
10117 */
10118 d->add_io(obj_ofs, len, &pbl, &c);
10119
10120 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10121 op.read(read_ofs, len, pbl, NULL);
10122
10123 librados::IoCtx io_ctx(d->io_ctx);
10124 io_ctx.locator_set_key(read_obj.loc);
10125
10126 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10127 if (r < 0) {
10128 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10129 goto done_err;
10130 }
10131
10132 // Flush data to client if there is any
10133 r = flush_read_list(d);
10134 if (r < 0)
10135 return r;
10136
10137 return 0;
10138
10139done_err:
10140 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10141 d->set_cancelled(r);
10142 d->cancel_io(obj_ofs);
10143
10144 return r;
10145}
10146
10147int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10148{
10149 RGWRados *store = source->get_store();
10150 CephContext *cct = store->ctx();
10151
10152 struct get_obj_data *data = new get_obj_data(cct);
10153 bool done = false;
10154
10155 RGWObjectCtx& obj_ctx = source->get_ctx();
10156
10157 data->rados = store;
10158 data->io_ctx.dup(state.io_ctx);
10159 data->client_cb = cb;
10160
10161 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10162 if (r < 0) {
10163 data->cancel_all_io();
10164 goto done;
10165 }
10166
10167 while (!done) {
10168 r = data->wait_next_io(&done);
10169 if (r < 0) {
10170 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10171 data->cancel_all_io();
10172 break;
10173 }
10174 r = store->flush_read_list(data);
10175 if (r < 0) {
10176 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10177 data->cancel_all_io();
10178 break;
10179 }
10180 }
10181
10182done:
10183 data->put();
10184 return r;
10185}
10186
10187int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10188 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10189 off_t ofs, off_t end,
10190 uint64_t max_chunk_size,
10191 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10192 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10193 RGWObjState *, void *),
10194 void *arg)
10195{
10196 rgw_raw_obj head_obj;
10197 rgw_raw_obj read_obj;
10198 uint64_t read_ofs = ofs;
10199 uint64_t len;
10200 bool reading_from_head = true;
10201 RGWObjState *astate = NULL;
10202
10203 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10204
10205 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10206 if (r < 0) {
10207 return r;
10208 }
10209
10210 if (end < 0)
10211 len = 0;
10212 else
10213 len = end - ofs + 1;
10214
10215 if (astate->has_manifest) {
10216 /* now get the relevant object stripe */
10217 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10218
10219 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10220
10221 for (; iter != obj_end && ofs <= end; ++iter) {
10222 off_t stripe_ofs = iter.get_stripe_ofs();
10223 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10224
10225 while (ofs < next_stripe_ofs && ofs <= end) {
10226 read_obj = iter.get_location().get_raw_obj(this);
10227 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10228 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10229
10230 if (read_len > max_chunk_size) {
10231 read_len = max_chunk_size;
10232 }
10233
10234 reading_from_head = (read_obj == head_obj);
10235 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10236 if (r < 0) {
10237 return r;
10238 }
10239
10240 len -= read_len;
10241 ofs += read_len;
10242 }
10243 }
10244 } else {
10245 while (ofs <= end) {
10246 read_obj = head_obj;
10247 uint64_t read_len = min(len, max_chunk_size);
10248
10249 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10250 if (r < 0) {
10251 return r;
10252 }
10253
10254 len -= read_len;
10255 ofs += read_len;
10256 }
10257 }
10258
10259 return 0;
10260}
10261
10262int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10263{
10264 rgw_rados_ref ref;
10265 int r = get_obj_head_ref(bucket_info, obj, &ref);
10266 if (r < 0) {
10267 return r;
10268 }
10269
10270 return ref.ioctx.operate(ref.oid, op);
10271}
10272
10273int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10274{
10275 rgw_rados_ref ref;
10276 int r = get_obj_head_ref(bucket_info, obj, &ref);
10277 if (r < 0) {
10278 return r;
10279 }
10280
10281 bufferlist outbl;
10282
10283 return ref.ioctx.operate(ref.oid, op, &outbl);
10284}
10285
10286int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10287{
10288 ObjectWriteOperation op;
10289
10290 assert(olh_obj.key.instance.empty());
10291
10292 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10293
10294 if (!state.exists) {
10295 op.create(true);
10296 } else {
10297 op.assert_exists();
10298 }
10299
10300 /*
10301 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10302 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10303 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10304 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10305 * log will reflect that.
10306 *
10307 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10308 * is used for object data instance, olh_tag for olh instance.
10309 */
10310 if (has_tag) {
10311 /* guard against racing writes */
10312 bucket_index_guard_olh_op(state, op);
10313 }
10314
10315 if (!has_tag) {
10316 /* obj tag */
10317 string obj_tag;
10318 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10319 if (ret < 0) {
10320 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10321 return ret;
10322 }
10323 bufferlist bl;
10324 bl.append(obj_tag.c_str(), obj_tag.size());
10325 op.setxattr(RGW_ATTR_ID_TAG, bl);
10326
10327 state.attrset[RGW_ATTR_ID_TAG] = bl;
10328 state.obj_tag = bl;
10329
10330 /* olh tag */
10331 string olh_tag;
10332 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10333 if (ret < 0) {
10334 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10335 return ret;
10336 }
10337 bufferlist olh_bl;
10338 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10339 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10340
10341 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10342 state.olh_tag = olh_bl;
10343 state.is_olh = true;
10344
10345 bufferlist verbl;
10346 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10347 }
10348
10349 bufferlist bl;
10350 RGWOLHPendingInfo pending_info;
10351 pending_info.time = real_clock::now();
10352 ::encode(pending_info, bl);
10353
10354#define OLH_PENDING_TAG_LEN 32
10355 /* tag will start with current time epoch, this so that entries are sorted by time */
10356 char buf[32];
10357 utime_t ut(pending_info.time);
10358 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10359 *op_tag = buf;
10360
10361 string s;
10362 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10363 if (ret < 0) {
10364 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10365 return ret;
10366 }
10367 op_tag->append(s);
10368
10369 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10370 attr_name.append(*op_tag);
10371
10372 op.setxattr(attr_name.c_str(), bl);
10373
10374 ret = obj_operate(bucket_info, olh_obj, &op);
10375 if (ret < 0) {
10376 return ret;
10377 }
10378
10379 state.exists = true;
10380 state.attrset[attr_name] = bl;
10381
10382 return 0;
10383}
10384
10385int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
10386{
10387 int ret;
10388
10389 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
10390 if (ret == -EEXIST) {
10391 ret = -ECANCELED;
10392 }
10393
10394 return ret;
10395}
10396
10397int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
10398 bool delete_marker,
10399 const string& op_tag,
10400 struct rgw_bucket_dir_entry_meta *meta,
10401 uint64_t olh_epoch,
10402 real_time unmod_since, bool high_precision_time)
10403{
10404 rgw_rados_ref ref;
10405 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10406 if (r < 0) {
10407 return r;
10408 }
10409
10410 BucketShard bs(this);
10411 int ret = bs.init(obj_instance.bucket, obj_instance);
10412 if (ret < 0) {
10413 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10414 return ret;
10415 }
10416
10417 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10418 ret = cls_rgw_bucket_link_olh(bs.index_ctx, bs.bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
10419 unmod_since, high_precision_time,
10420 get_zone().log_data);
10421 if (ret < 0) {
10422 return ret;
10423 }
10424
10425 return 0;
10426}
10427
10428void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
10429{
10430 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
10431 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
10432}
10433
10434int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
10435 const string& op_tag, const string& olh_tag, uint64_t olh_epoch)
10436{
10437 rgw_rados_ref ref;
10438 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10439 if (r < 0) {
10440 return r;
10441 }
10442
10443 BucketShard bs(this);
10444 int ret = bs.init(obj_instance.bucket, obj_instance);
10445 if (ret < 0) {
10446 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10447 return ret;
10448 }
10449
10450 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10451 ret = cls_rgw_bucket_unlink_instance(bs.index_ctx, bs.bucket_obj, key, op_tag, olh_tag, olh_epoch, get_zone().log_data);
10452 if (ret < 0) {
10453 return ret;
10454 }
10455
10456 return 0;
10457}
10458
10459int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
10460 const rgw_obj& obj_instance, uint64_t ver_marker,
10461 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
10462 bool *is_truncated)
10463{
10464 rgw_rados_ref ref;
10465 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10466 if (r < 0) {
10467 return r;
10468 }
10469
10470 BucketShard bs(this);
10471 int ret = bs.init(obj_instance.bucket, obj_instance);
10472 if (ret < 0) {
10473 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10474 return ret;
10475 }
10476
10477 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10478
10479 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10480
10481 ObjectReadOperation op;
10482
10483 ret = cls_rgw_get_olh_log(bs.index_ctx, bs.bucket_obj, op, key, ver_marker, olh_tag, log, is_truncated);
10484 if (ret < 0)
10485 return ret;
10486
10487 return 0;
10488}
10489
10490int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
10491{
10492 rgw_rados_ref ref;
10493 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10494 if (r < 0) {
10495 return r;
10496 }
10497
10498 BucketShard bs(this);
10499 int ret = bs.init(obj_instance.bucket, obj_instance);
10500 if (ret < 0) {
10501 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10502 return ret;
10503 }
10504
10505 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10506
10507 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10508
10509 ObjectWriteOperation op;
10510
10511 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
10512
10513 ret = bs.index_ctx.operate(bs.bucket_obj, &op);
10514 if (ret < 0)
10515 return ret;
10516
10517 return 0;
10518}
10519
10520int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
10521{
10522 rgw_rados_ref ref;
10523 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10524 if (r < 0) {
10525 return r;
10526 }
10527
10528 BucketShard bs(this);
10529 int ret = bs.init(obj_instance.bucket, obj_instance);
10530 if (ret < 0) {
10531 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10532 return ret;
10533 }
10534
10535 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10536
10537 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10538
10539 ret = cls_rgw_clear_olh(bs.index_ctx, bs.bucket_obj, key, olh_tag);
10540 if (ret < 0) {
10541 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
10542 return ret;
10543 }
10544
10545 return 0;
10546}
10547
10548int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10549 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
10550 uint64_t *plast_ver)
10551{
10552 if (log.empty()) {
10553 return 0;
10554 }
10555
10556 librados::ObjectWriteOperation op;
10557
10558 uint64_t last_ver = log.rbegin()->first;
10559 *plast_ver = last_ver;
10560
10561 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
10562
10563 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
10564 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
10565
10566 bool need_to_link = false;
10567 cls_rgw_obj_key key;
10568 bool delete_marker = false;
10569 list<cls_rgw_obj_key> remove_instances;
10570 bool need_to_remove = false;
10571
10572 for (iter = log.begin(); iter != log.end(); ++iter) {
10573 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
10574 for (; viter != iter->second.end(); ++viter) {
10575 rgw_bucket_olh_log_entry& entry = *viter;
10576
10577 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
10578 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
10579 << (entry.delete_marker ? "(delete)" : "") << dendl;
10580 switch (entry.op) {
10581 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
10582 remove_instances.push_back(entry.key);
10583 break;
10584 case CLS_RGW_OLH_OP_LINK_OLH:
10585 need_to_link = true;
10586 need_to_remove = false;
10587 key = entry.key;
10588 delete_marker = entry.delete_marker;
10589 break;
10590 case CLS_RGW_OLH_OP_UNLINK_OLH:
10591 need_to_remove = true;
10592 need_to_link = false;
10593 break;
10594 default:
10595 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
10596 return -EIO;
10597 }
10598 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10599 attr_name.append(entry.op_tag);
10600 op.rmxattr(attr_name.c_str());
10601 }
10602 }
10603
10604 rgw_rados_ref ref;
10605 int r = get_obj_head_ref(bucket_info, obj, &ref);
10606 if (r < 0) {
10607 return r;
10608 }
10609
10610 const rgw_bucket& bucket = obj.bucket;
10611
10612 if (need_to_link) {
10613 rgw_obj target(bucket, key);
10614 RGWOLHInfo info;
10615 info.target = target;
10616 info.removed = delete_marker;
10617 bufferlist bl;
10618 ::encode(info, bl);
10619 op.setxattr(RGW_ATTR_OLH_INFO, bl);
10620 }
10621
10622 /* first remove object instances */
10623 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
10624 liter != remove_instances.end(); ++liter) {
10625 cls_rgw_obj_key& key = *liter;
10626 rgw_obj obj_instance(bucket, key);
10627 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP);
10628 if (ret < 0 && ret != -ENOENT) {
10629 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
10630 return ret;
10631 }
10632 }
10633
10634 /* update olh object */
10635 r = ref.ioctx.operate(ref.oid, &op);
10636 if (r == -ECANCELED) {
10637 r = 0;
10638 }
10639 if (r < 0) {
10640 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
10641 return r;
10642 }
10643
10644 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
10645 if (r < 0) {
10646 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
10647 return r;
10648 }
10649
10650 if (need_to_remove) {
10651 ObjectWriteOperation rm_op;
10652
10653 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
10654 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
10655 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
10656 rm_op.remove();
10657
10658 r = ref.ioctx.operate(ref.oid, &rm_op);
10659 if (r == -ECANCELED) {
10660 return 0; /* someone else won this race */
10661 } else {
10662 /*
10663 * only clear if was successful, otherwise we might clobber pending operations on this object
10664 */
10665 r = bucket_index_clear_olh(bucket_info, state, obj);
10666 if (r < 0) {
10667 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
10668 return r;
10669 }
10670 }
10671 }
10672
10673 return 0;
10674}
10675
10676/*
10677 * read olh log and apply it
10678 */
10679int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
10680{
10681 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
10682 bool is_truncated;
10683 uint64_t ver_marker = 0;
10684
10685 do {
10686 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
10687 if (ret < 0) {
10688 return ret;
10689 }
10690 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker);
10691 if (ret < 0) {
10692 return ret;
10693 }
10694 } while (is_truncated);
10695
10696 return 0;
10697}
10698
10699int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
10700 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time)
10701{
10702 string op_tag;
10703
10704 rgw_obj olh_obj = target_obj;
10705 olh_obj.key.instance.clear();
10706
10707 RGWObjState *state = NULL;
10708
10709 int ret = 0;
10710 int i;
10711
10712#define MAX_ECANCELED_RETRY 100
10713 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
10714 if (ret == -ECANCELED) {
10715 obj_ctx.obj.invalidate(olh_obj);
10716 }
10717
10718 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
10719 if (ret < 0) {
10720 return ret;
10721 }
10722
10723 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
10724 if (ret < 0) {
10725 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
10726 if (ret == -ECANCELED) {
10727 continue;
10728 }
10729 return ret;
10730 }
10731 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time);
10732 if (ret < 0) {
10733 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
10734 if (ret == -ECANCELED) {
10735 continue;
10736 }
10737 return ret;
10738 }
10739 break;
10740 }
10741
10742 if (i == MAX_ECANCELED_RETRY) {
10743 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
10744 return -EIO;
10745 }
10746
10747 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
10748 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
10749 ret = 0;
10750 }
10751 if (ret < 0) {
10752 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
10753 return ret;
10754 }
10755
10756 return 0;
10757}
10758
10759int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
10760 uint64_t olh_epoch)
10761{
10762 string op_tag;
10763
10764 rgw_obj olh_obj = target_obj;
10765 olh_obj.key.instance.clear();
10766
10767 RGWObjState *state = NULL;
10768
10769 int ret = 0;
10770 int i;
10771
10772 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
10773 if (ret == -ECANCELED) {
10774 obj_ctx.obj.invalidate(olh_obj);
10775 }
10776
10777 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
10778 if (ret < 0)
10779 return ret;
10780
10781 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
10782 if (ret < 0) {
10783 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
10784 if (ret == -ECANCELED) {
10785 continue;
10786 }
10787 return ret;
10788 }
10789
10790 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
10791
10792 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch);
10793 if (ret < 0) {
10794 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
10795 if (ret == -ECANCELED) {
10796 continue;
10797 }
10798 return ret;
10799 }
10800 break;
10801 }
10802
10803 if (i == MAX_ECANCELED_RETRY) {
10804 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
10805 return -EIO;
10806 }
10807
10808 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
10809 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
10810 return 0;
10811 }
10812 if (ret < 0) {
10813 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
10814 return ret;
10815 }
10816
10817 return 0;
10818}
10819
10820void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
10821{
10822#define OBJ_INSTANCE_LEN 32
10823 char buf[OBJ_INSTANCE_LEN + 1];
10824
10825 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
10826 no underscore for instance name due to the way we encode the raw keys */
10827
10828 target_obj->key.set_instance(buf);
10829}
10830
10831static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
10832 map<string, bufferlist> *attrset)
10833{
10834 attrset->clear();
10835 map<string, bufferlist>::iterator iter;
10836 for (iter = unfiltered_attrset.lower_bound(check_prefix);
10837 iter != unfiltered_attrset.end(); ++iter) {
10838 if (!boost::algorithm::starts_with(iter->first, check_prefix))
10839 break;
10840 (*attrset)[iter->first] = iter->second;
10841 }
10842}
10843
10844int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
10845{
10846 map<string, bufferlist> unfiltered_attrset;
10847
10848 ObjectReadOperation op;
10849 op.getxattrs(&unfiltered_attrset, NULL);
10850
10851 bufferlist outbl;
10852 int r = obj_operate(bucket_info, obj, &op);
10853
10854 if (r < 0) {
10855 return r;
10856 }
10857 map<string, bufferlist> attrset;
10858
10859 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
10860
10861 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
10862 if (iter == attrset.end()) { /* not an olh */
10863 return -EINVAL;
10864 }
10865
10866 try {
10867 bufferlist::iterator biter = iter->second.begin();
10868 ::decode(*olh, biter);
10869 } catch (buffer::error& err) {
10870 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
10871 return -EIO;
10872 }
10873
10874 return 0;
10875}
10876
10877void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
10878 map<string, bufferlist> *rm_pending_entries)
10879{
10880 map<string, bufferlist>::iterator iter = pending_entries.begin();
10881
10882 real_time now = real_clock::now();
10883
10884 while (iter != pending_entries.end()) {
10885 bufferlist::iterator biter = iter->second.begin();
10886 RGWOLHPendingInfo pending_info;
10887 try {
10888 ::decode(pending_info, biter);
10889 } catch (buffer::error& err) {
10890 /* skipping bad entry, we could remove it but it might hide a bug */
10891 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
10892 ++iter;
10893 continue;
10894 }
10895
10896 map<string, bufferlist>::iterator cur_iter = iter;
10897 ++iter;
10898 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
10899 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
10900 pending_entries.erase(cur_iter);
10901 } else {
10902 /* entries names are sorted by time (rounded to a second) */
10903 break;
10904 }
10905 }
10906}
10907
10908int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
10909{
10910 ObjectWriteOperation op;
10911
10912 bucket_index_guard_olh_op(state, op);
10913
10914 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
10915 op.rmxattr(iter->first.c_str());
10916 }
10917
10918 rgw_rados_ref ref;
10919 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
10920 if (r < 0) {
10921 return r;
10922 }
10923
10924 /* update olh object */
10925 r = ref.ioctx.operate(ref.oid, &op);
10926 if (r == -ENOENT || r == -ECANCELED) {
10927 /* raced with some other change, shouldn't sweat about it */
10928 r = 0;
10929 }
10930 if (r < 0) {
10931 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
10932 return r;
10933 }
10934
10935 return 0;
10936}
10937
10938int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
10939{
10940 map<string, bufferlist> pending_entries;
10941 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
10942
10943 map<string, bufferlist> rm_pending_entries;
10944 check_pending_olh_entries(pending_entries, &rm_pending_entries);
10945
10946 if (!rm_pending_entries.empty()) {
10947 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
10948 if (ret < 0) {
10949 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
10950 return ret;
10951 }
10952 }
10953 if (!pending_entries.empty()) {
10954 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
10955
10956 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
10957 if (ret < 0) {
10958 return ret;
10959 }
10960 }
10961
10962 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
10963 assert(iter != state->attrset.end());
10964 RGWOLHInfo olh;
10965 try {
10966 bufferlist::iterator biter = iter->second.begin();
10967 ::decode(olh, biter);
10968 } catch (buffer::error& err) {
10969 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
10970 return -EIO;
10971 }
10972
10973 if (olh.removed) {
10974 return -ENOENT;
10975 }
10976
10977 *target = olh.target;
10978
10979 return 0;
10980}
10981
10982int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
10983 map<string, bufferlist> *attrs, bufferlist *first_chunk,
10984 RGWObjVersionTracker *objv_tracker)
10985{
10986 rgw_rados_ref ref;
10987 int r = get_raw_obj_ref(obj, &ref);
10988 if (r < 0) {
10989 return r;
10990 }
10991
10992 map<string, bufferlist> unfiltered_attrset;
10993 uint64_t size = 0;
10994 struct timespec mtime_ts;
10995
10996 ObjectReadOperation op;
10997 if (objv_tracker) {
10998 objv_tracker->prepare_op_for_read(&op);
10999 }
11000 if (attrs) {
11001 op.getxattrs(&unfiltered_attrset, NULL);
11002 }
11003 if (psize || pmtime) {
11004 op.stat2(&size, &mtime_ts, NULL);
11005 }
11006 if (first_chunk) {
11007 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11008 }
11009 bufferlist outbl;
11010 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11011
11012 if (epoch) {
11013 *epoch = ref.ioctx.get_last_version();
11014 }
11015
11016 if (r < 0)
11017 return r;
11018
11019 if (psize)
11020 *psize = size;
11021 if (pmtime)
11022 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11023 if (attrs) {
11024 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11025 }
11026
11027 return 0;
11028}
11029
11030int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11031 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker)
11032{
11033 map<string, rgw_bucket_dir_header> headers;
11034 map<int, string> bucket_instance_ids;
11035 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11036 if (r < 0) {
11037 return r;
11038 }
11039
11040 assert(headers.size() == bucket_instance_ids.size());
11041
11042 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11043 map<int, string>::iterator viter = bucket_instance_ids.begin();
11044 BucketIndexShardsManager ver_mgr;
11045 BucketIndexShardsManager master_ver_mgr;
11046 BucketIndexShardsManager marker_mgr;
11047 string shard_marker;
11048 char buf[64];
11049 for(; iter != headers.end(); ++iter, ++viter) {
11050 accumulate_raw_stats(iter->second, stats);
11051 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11052 ver_mgr.add(viter->first, string(buf));
11053 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11054 master_ver_mgr.add(viter->first, string(buf));
11055 if (shard_id >= 0) {
11056 *max_marker = iter->second.max_marker;
11057 } else {
11058 marker_mgr.add(viter->first, iter->second.max_marker);
11059 }
11060 }
11061 ver_mgr.to_string(bucket_ver);
11062 master_ver_mgr.to_string(master_ver);
11063 if (shard_id < 0) {
11064 marker_mgr.to_string(max_marker);
11065 }
11066 return 0;
11067}
11068
11069int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11070 map<int, string>& markers)
11071{
11072 map<string, rgw_bucket_dir_header> headers;
11073 map<int, string> bucket_instance_ids;
11074 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11075 if (r < 0)
11076 return r;
11077
11078 assert(headers.size() == bucket_instance_ids.size());
11079
11080 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11081 map<int, string>::iterator viter = bucket_instance_ids.begin();
11082
11083 for(; iter != headers.end(); ++iter, ++viter) {
11084 if (shard_id >= 0) {
11085 markers[shard_id] = iter->second.max_marker;
11086 } else {
11087 markers[viter->first] = iter->second.max_marker;
11088 }
11089 }
11090 return 0;
11091}
11092
11093class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11094 RGWGetBucketStats_CB *cb;
11095 uint32_t pendings;
11096 map<RGWObjCategory, RGWStorageStats> stats;
11097 int ret_code;
11098 bool should_cb;
11099 Mutex lock;
11100
11101public:
11102 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11103 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11104 lock("RGWGetBucketStatsContext") {}
11105
11106 void handle_response(int r, rgw_bucket_dir_header& header) override {
11107 Mutex::Locker l(lock);
11108 if (should_cb) {
11109 if ( r >= 0) {
11110 accumulate_raw_stats(header, stats);
11111 } else {
11112 ret_code = r;
11113 }
11114
11115 // Are we all done?
11116 if (--pendings == 0) {
11117 if (!ret_code) {
11118 cb->set_response(&stats);
11119 }
11120 cb->handle_response(ret_code);
11121 cb->put();
11122 }
11123 }
11124 }
11125
11126 void unset_cb() {
11127 Mutex::Locker l(lock);
11128 should_cb = false;
11129 }
11130};
11131
11132int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11133{
11134 int num_aio = 0;
11135 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards);
11136 assert(get_ctx);
11137 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11138 get_ctx->put();
11139 if (r < 0) {
11140 ctx->put();
11141 if (num_aio) {
11142 get_ctx->unset_cb();
11143 }
11144 }
11145 return r;
11146}
11147
11148class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11149 RGWGetUserStats_CB *cb;
11150
11151public:
11152 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11153 : cb(cb) {}
11154
11155 void handle_response(int r, cls_user_header& header) override {
11156 const cls_user_stats& hs = header.stats;
11157 if (r >= 0) {
11158 RGWStorageStats stats;
11159
11160 stats.size = hs.total_bytes;
11161 stats.size_rounded = hs.total_bytes_rounded;
11162 stats.num_objects = hs.total_entries;
11163
11164 cb->set_response(stats);
11165 }
11166
11167 cb->handle_response(r);
11168
11169 cb->put();
11170 }
11171};
11172
11173int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11174{
11175 string user_str = user.to_str();
11176
11177 cls_user_header header;
11178 int r = cls_user_get_header(user_str, &header);
11179 if (r < 0)
11180 return r;
11181
11182 const cls_user_stats& hs = header.stats;
11183
11184 stats.size = hs.total_bytes;
11185 stats.size_rounded = hs.total_bytes_rounded;
11186 stats.num_objects = hs.total_entries;
11187
11188 return 0;
11189}
11190
11191int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11192{
11193 string user_str = user.to_str();
11194
11195 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11196 int r = cls_user_get_header_async(user_str, get_ctx);
11197 if (r < 0) {
11198 ctx->put();
11199 delete get_ctx;
11200 return r;
11201 }
11202
11203 return 0;
11204}
11205
11206void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11207{
11208 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11209}
11210
11211void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11212{
11213 if (!bucket.oid.empty()) {
11214 obj.init(get_zone_params().domain_root, bucket.oid);
11215 } else {
11216 string oid;
11217 get_bucket_meta_oid(bucket, oid);
11218 obj.init(get_zone_params().domain_root, oid);
11219 }
11220}
11221
11222int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11223 real_time *pmtime, map<string, bufferlist> *pattrs)
11224{
11225 size_t pos = meta_key.find(':');
11226 if (pos == string::npos) {
11227 return -EINVAL;
11228 }
11229 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11230 rgw_bucket_instance_key_to_oid(oid);
11231
11232 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11233}
11234
11235int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11236 real_time *pmtime, map<string, bufferlist> *pattrs)
11237{
11238 string oid;
11239 if (bucket.oid.empty()) {
11240 get_bucket_meta_oid(bucket, oid);
11241 } else {
11242 oid = bucket.oid;
11243 }
11244
11245 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11246}
11247
11248int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, string& oid, RGWBucketInfo& info,
11249 real_time *pmtime, map<string, bufferlist> *pattrs,
11250 rgw_cache_entry_info *cache_info)
11251{
11252 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11253
11254 bufferlist epbl;
11255
11256 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, oid, epbl, &info.objv_tracker, pmtime, pattrs, cache_info);
11257 if (ret < 0) {
11258 return ret;
11259 }
11260
11261 bufferlist::iterator iter = epbl.begin();
11262 try {
11263 ::decode(info, iter);
11264 } catch (buffer::error& err) {
11265 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11266 return -EIO;
11267 }
11268 info.bucket.oid = oid;
11269 return 0;
11270}
11271
11272int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11273 const string& tenant_name,
11274 const string& bucket_name,
11275 RGWBucketEntryPoint& entry_point,
11276 RGWObjVersionTracker *objv_tracker,
11277 real_time *pmtime,
11278 map<string, bufferlist> *pattrs,
11279 rgw_cache_entry_info *cache_info)
11280{
11281 bufferlist bl;
11282 string bucket_entry;
11283
11284 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11285 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, bucket_entry, bl, objv_tracker, pmtime, pattrs, cache_info);
11286 if (ret < 0) {
11287 return ret;
11288 }
11289
11290 bufferlist::iterator iter = bl.begin();
11291 try {
11292 ::decode(entry_point, iter);
11293 } catch (buffer::error& err) {
11294 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11295 return -EIO;
11296 }
11297 return 0;
11298}
11299
11300int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11301 const string& tenant_name,
11302 const string& bucket_name)
11303{
11304 RGWBucketEntryPoint entry_point;
11305 real_time ep_mtime;
11306 RGWObjVersionTracker ot;
11307 map<string, bufferlist> attrs;
11308 RGWBucketInfo info;
11309
11310 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
11311
11312 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
11313 if (ret < 0) {
11314 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
11315 return ret;
11316 }
11317
11318 if (!entry_point.has_bucket_info) {
11319 /* already converted! */
11320 return 0;
11321 }
11322
11323 info = entry_point.old_bucket_info;
11324 info.bucket.oid = bucket_name;
11325 info.ep_objv = ot.read_version;
11326
11327 ot.generate_new_write_ver(cct);
11328
11329 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
11330 if (ret < 0) {
11331 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
11332 return ret;
11333 }
11334
11335 return 0;
11336}
11337
11338int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
11339 const string& tenant, const string& bucket_name, RGWBucketInfo& info,
11340 real_time *pmtime, map<string, bufferlist> *pattrs)
11341{
11342 bucket_info_entry e;
11343 string bucket_entry;
11344 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
11345
11346 if (binfo_cache->find(bucket_entry, &e)) {
11347 info = e.info;
11348 if (pattrs)
11349 *pattrs = e.attrs;
11350 if (pmtime)
11351 *pmtime = e.mtime;
11352 return 0;
11353 }
11354
11355 RGWBucketEntryPoint entry_point;
11356 real_time ep_mtime;
11357 RGWObjVersionTracker ot;
11358 rgw_cache_entry_info entry_cache_info;
11359 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name, entry_point, &ot, &ep_mtime, pattrs, &entry_cache_info);
11360 if (ret < 0) {
11361 /* only init these fields */
11362 info.bucket.tenant = tenant;
11363 info.bucket.name = bucket_name;
11364 return ret;
11365 }
11366
11367 if (entry_point.has_bucket_info) {
11368 info = entry_point.old_bucket_info;
11369 info.bucket.oid = bucket_name;
11370 info.bucket.tenant = tenant;
11371 info.ep_objv = ot.read_version;
11372 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
11373 return 0;
11374 }
11375
11376 /* data is in the bucket instance object, we need to get attributes from there, clear everything
11377 * that we got
11378 */
11379 if (pattrs) {
11380 pattrs->clear();
11381 }
11382
11383 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
11384
11385
11386 /* read bucket instance info */
11387
11388 string oid;
11389 get_bucket_meta_oid(entry_point.bucket, oid);
11390
11391 rgw_cache_entry_info cache_info;
11392
11393 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs, &cache_info);
11394 e.info.ep_objv = ot.read_version;
11395 info = e.info;
11396 if (ret < 0) {
11397 info.bucket.tenant = tenant;
11398 info.bucket.name = bucket_name;
11399 // XXX and why return anything in case of an error anyway?
11400 return ret;
11401 }
11402
11403 if (pmtime)
11404 *pmtime = e.mtime;
11405 if (pattrs)
11406 *pattrs = e.attrs;
11407
11408 list<rgw_cache_entry_info *> cache_info_entries;
11409 cache_info_entries.push_back(&entry_cache_info);
11410 cache_info_entries.push_back(&cache_info);
11411
11412
11413 /* chain to both bucket entry point and bucket instance */
11414 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
11415 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
11416 }
11417
11418 return 0;
11419}
11420
11421int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
11422 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
11423 map<string, bufferlist> *pattrs)
11424{
11425 bufferlist epbl;
11426 ::encode(entry_point, epbl);
11427 string bucket_entry;
11428 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11429 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
11430}
11431
11432int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
11433 real_time mtime, map<string, bufferlist> *pattrs)
11434{
11435 info.has_instance_obj = true;
11436 bufferlist bl;
11437
11438 ::encode(info, bl);
11439
11440 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
11441 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
11442 if (ret == -EEXIST) {
11443 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
11444 * bucket operation on this specific bucket (e.g., being synced from the master), but
11445 * since bucket instace meta object is unique for this specific bucket instace, we don't
11446 * need to return an error.
11447 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
11448 * master, creating a bucket, sending bucket creation to the master, we create the bucket
11449 * locally, while in the sync thread we sync the new bucket.
11450 */
11451 ret = 0;
11452 }
11453 return ret;
11454}
11455
11456int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
11457 map<string, bufferlist> *pattrs, bool create_entry_point)
11458{
11459 bool create_head = !info.has_instance_obj || create_entry_point;
11460
11461 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
11462 if (ret < 0) {
11463 return ret;
11464 }
11465
11466 if (!create_head)
11467 return 0; /* done! */
11468
11469 RGWBucketEntryPoint entry_point;
11470 entry_point.bucket = info.bucket;
11471 entry_point.owner = info.owner;
11472 entry_point.creation_time = info.creation_time;
11473 entry_point.linked = true;
11474 RGWObjVersionTracker ot;
11475 if (pep_objv && !pep_objv->tag.empty()) {
11476 ot.write_version = *pep_objv;
11477 } else {
11478 ot.generate_new_write_ver(cct);
11479 if (pep_objv) {
11480 *pep_objv = ot.write_version;
11481 }
11482 }
11483 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
11484 if (ret < 0)
11485 return ret;
11486
11487 return 0;
11488}
11489
11490int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
11491{
11492 rgw_rados_ref ref;
11493 int r = get_raw_obj_ref(obj, &ref);
11494 if (r < 0) {
11495 return r;
11496 }
11497
11498 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
11499 if (r < 0)
11500 return r;
11501
11502 return 0;
11503
11504}
11505
11506int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
11507 std::map<string, bufferlist>& m)
11508{
11509 rgw_rados_ref ref;
11510 int r = get_raw_obj_ref(obj, &ref);
11511 if (r < 0) {
11512 return r;
11513 }
11514
11515#define MAX_OMAP_GET_ENTRIES 1024
11516 const int count = MAX_OMAP_GET_ENTRIES;
11517 string start_after;
11518
11519 while (true) {
11520 std::map<string, bufferlist> t;
11521 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
11522 if (r < 0) {
11523 return r;
11524 }
11525 if (t.empty()) {
11526 break;
11527 }
11528 start_after = t.rbegin()->first;
11529 m.insert(t.begin(), t.end());
11530 }
11531 return 0;
11532}
11533
11534int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
11535{
11536 rgw_rados_ref ref;
11537 int r = get_raw_obj_ref(obj, &ref);
11538 if (r < 0) {
11539 return r;
11540 }
11541 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
11542
11543 map<string, bufferlist> m;
11544 m[key] = bl;
11545
11546 r = ref.ioctx.omap_set(ref.oid, m);
11547
11548 return r;
11549}
11550
11551int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
11552{
11553 rgw_rados_ref ref;
11554 int r = get_raw_obj_ref(obj, &ref);
11555 if (r < 0) {
11556 return r;
11557 }
11558
11559 r = ref.ioctx.omap_set(ref.oid, m);
11560
11561 return r;
11562}
11563
11564int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
11565{
11566 rgw_rados_ref ref;
11567 int r = get_raw_obj_ref(obj, &ref);
11568 if (r < 0) {
11569 return r;
11570 }
11571
11572 set<string> k;
11573 k.insert(key);
11574
11575 r = ref.ioctx.omap_rm_keys(ref.oid, k);
11576 return r;
11577}
11578
11579int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
11580{
11581 RGWObjectCtx obj_ctx(this);
11582
11583 map<string, RGWBucketEnt>::iterator iter;
11584 for (iter = m.begin(); iter != m.end(); ++iter) {
11585 RGWBucketEnt& ent = iter->second;
11586 rgw_bucket& bucket = ent.bucket;
11587 ent.count = 0;
11588 ent.size = 0;
11589 ent.size_rounded = 0;
11590
11591 map<string, rgw_bucket_dir_header> headers;
11592
11593 RGWBucketInfo bucket_info;
11594 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
11595 if (ret < 0) {
11596 return ret;
11597 }
11598
11599 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
11600 if (r < 0)
11601 return r;
11602
11603 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
11604 for (; hiter != headers.end(); ++hiter) {
11605 RGWObjCategory category = main_category;
11606 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
11607 if (iter != hiter->second.stats.end()) {
11608 struct rgw_bucket_category_stats& stats = iter->second;
11609 ent.count += stats.num_entries;
11610 ent.size += stats.total_size;
11611 ent.size_rounded += stats.total_size_rounded;
11612 }
11613 }
11614 }
11615
11616 return m.size();
11617}
11618
11619int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
11620{
11621 rgw_rados_ref ref;
11622 int r = get_raw_obj_ref(obj, &ref);
11623 if (r < 0) {
11624 return r;
11625 }
11626 librados::Rados *rad = get_rados_handle();
11627 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
11628
11629 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
11630 completion->release();
11631 return r;
11632}
11633
11634int RGWRados::distribute(const string& key, bufferlist& bl)
11635{
11636 /*
11637 * we were called before watch was initialized. This can only happen if we're updating some system
11638 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
11639 * objects, they're currently only read on startup anyway.
11640 */
11641 if (!watch_initialized)
11642 return 0;
11643
11644 string notify_oid;
11645 pick_control_oid(key, notify_oid);
11646
11647 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
11648 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
11649}
11650
11651int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
11652{
11653 librados::IoCtx& io_ctx = ctx.io_ctx;
11654 librados::NObjectIterator& iter = ctx.iter;
11655
11656 int r = open_pool_ctx(pool, io_ctx);
11657 if (r < 0)
11658 return r;
11659
11660 iter = io_ctx.nobjects_begin();
11661
11662 return 0;
11663}
11664
11665int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
11666 bool *is_truncated, RGWAccessListFilter *filter)
11667{
11668 librados::IoCtx& io_ctx = ctx.io_ctx;
11669 librados::NObjectIterator& iter = ctx.iter;
11670
11671 if (iter == io_ctx.nobjects_end())
11672 return -ENOENT;
11673
11674 uint32_t i;
11675
11676 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
11677 rgw_bucket_dir_entry e;
11678
11679 string oid = iter->get_oid();
11680 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
11681
11682 // fill it in with initial values; we may correct later
11683 if (filter && !filter->filter(oid, oid))
11684 continue;
11685
11686 e.key = oid;
11687 objs.push_back(e);
11688 }
11689
11690 if (is_truncated)
11691 *is_truncated = (iter != io_ctx.nobjects_end());
11692
11693 return objs.size();
11694}
11695struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
11696 string prefix;
11697
11698 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
11699 bool filter(string& name, string& key) override {
11700 return (prefix.compare(key.substr(0, prefix.size())) == 0);
11701 }
11702};
11703
11704int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
11705 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
11706 bool *is_truncated)
11707{
11708 RGWAccessListFilterPrefix filter(prefix_filter);
11709
11710 if (!ctx.initialized) {
11711 int r = pool_iterate_begin(pool, ctx.iter_ctx);
11712 if (r < 0) {
11713 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
11714 return r;
11715 }
11716 ctx.initialized = true;
11717 }
11718
11719 vector<rgw_bucket_dir_entry> objs;
11720 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
11721 if (r < 0) {
11722 if(r != -ENOENT)
11723 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
11724 return r;
11725 }
11726
11727 vector<rgw_bucket_dir_entry>::iterator iter;
11728 for (iter = objs.begin(); iter != objs.end(); ++iter) {
11729 oids.push_back(iter->key.name);
11730 }
11731
11732 return oids.size();
11733}
11734
11735int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
11736 std::list<rgw_bi_log_entry>& result, bool *truncated)
11737{
11738 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
11739 result.clear();
11740
11741 librados::IoCtx index_ctx;
11742 map<int, string> oids;
11743 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
11744 map<int, string> bucket_instance_ids;
11745 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
11746 if (r < 0)
11747 return r;
11748
11749 BucketIndexShardsManager marker_mgr;
11750 bool has_shards = (oids.size() > 1 || shard_id >= 0);
11751 // If there are multiple shards for the bucket index object, the marker
11752 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
11753 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
11754 // only contain one record, and the key is the bucket instance id.
11755 r = marker_mgr.from_string(marker, shard_id);
11756 if (r < 0)
11757 return r;
11758
11759 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
11760 if (r < 0)
11761 return r;
11762
11763 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
11764 map<int, list<rgw_bi_log_entry>::iterator> vends;
11765 if (truncated) {
11766 *truncated = false;
11767 }
11768 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
11769 for (; miter != bi_log_lists.end(); ++miter) {
11770 int shard_id = miter->first;
11771 vcurrents[shard_id] = miter->second.entries.begin();
11772 vends[shard_id] = miter->second.entries.end();
11773 if (truncated) {
11774 *truncated = (*truncated || miter->second.truncated);
11775 }
11776 }
11777
11778 size_t total = 0;
11779 bool has_more = true;
11780 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
11781 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
11782 while (total < max && has_more) {
11783 has_more = false;
11784
11785 viter = vcurrents.begin();
11786 eiter = vends.begin();
11787
11788 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
11789 assert (eiter != vends.end());
11790
11791 int shard_id = viter->first;
11792 list<rgw_bi_log_entry>::iterator& liter = viter->second;
11793
11794 if (liter == eiter->second){
11795 continue;
11796 }
11797 rgw_bi_log_entry& entry = *(liter);
11798 if (has_shards) {
11799 char buf[16];
11800 snprintf(buf, sizeof(buf), "%d", shard_id);
11801 string tmp_id;
11802 build_bucket_index_marker(buf, entry.id, &tmp_id);
11803 entry.id.swap(tmp_id);
11804 }
11805 marker_mgr.add(shard_id, entry.id);
11806 result.push_back(entry);
11807 total++;
11808 has_more = true;
11809 ++liter;
11810 }
11811 }
11812
11813 if (truncated) {
11814 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
11815 assert (eiter != vends.end());
11816 *truncated = (*truncated || (viter->second != eiter->second));
11817 }
11818 }
11819
11820 // Refresh marker, if there are multiple shards, the output will look like
11821 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
11822 // if there is no sharding, the simply marker (without oid) is returned
11823 if (has_shards) {
11824 marker_mgr.to_string(&marker);
11825 } else {
11826 if (!result.empty()) {
11827 marker = result.rbegin()->id;
11828 }
11829 }
11830
11831 return 0;
11832}
11833
11834int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
11835{
11836 librados::IoCtx index_ctx;
11837 map<int, string> bucket_objs;
11838 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
11839 if (r < 0)
11840 return r;
11841
11842 BucketIndexShardsManager start_marker_mgr;
11843 r = start_marker_mgr.from_string(start_marker, shard_id);
11844 if (r < 0)
11845 return r;
11846 BucketIndexShardsManager end_marker_mgr;
11847 r = end_marker_mgr.from_string(end_marker, shard_id);
11848 if (r < 0)
11849 return r;
11850
11851 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
11852 cct->_conf->rgw_bucket_index_max_aio)();
11853}
11854
11855int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
11856{
11857 rgw_rados_ref ref;
11858 int r = get_obj_head_ref(bucket_info, obj, &ref);
11859 if (r < 0) {
11860 return r;
11861 }
11862
11863 rgw_cls_bi_entry bi_entry;
11864 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
11865 if (r < 0 && r != -ENOENT) {
11866 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
11867 }
11868 if (r < 0) {
11869 return r;
11870 }
11871 bufferlist::iterator iter = bi_entry.data.begin();
11872 try {
11873 ::decode(*dirent, iter);
11874 } catch (buffer::error& err) {
11875 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
11876 return -EIO;
11877 }
11878
11879 return 0;
11880}
11881
11882int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
11883{
11884 BucketShard bs(this);
11885 int ret = bs.init(bucket, obj);
11886 if (ret < 0) {
11887 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11888 return ret;
11889 }
11890
11891 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
11892
11893 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
11894 if (ret < 0)
11895 return ret;
11896
11897 return 0;
11898}
11899
11900void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
11901{
11902 cls_rgw_bi_put(op, bs.bucket_obj, entry);
11903}
11904
11905int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
11906{
11907 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
11908 if (ret < 0)
11909 return ret;
11910
11911 return 0;
11912}
11913
11914int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
11915{
11916 BucketShard bs(this);
11917 int ret = bs.init(bucket, obj);
11918 if (ret < 0) {
11919 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11920 return ret;
11921 }
11922
11923 return bi_put(bs, entry);
11924}
11925
11926int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
11927{
11928 rgw_obj obj(bucket, obj_name);
11929 BucketShard bs(this);
11930 int ret = bs.init(bucket, obj);
11931 if (ret < 0) {
11932 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11933 return ret;
11934 }
11935
11936 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
11937 if (ret < 0)
11938 return ret;
11939
11940 return 0;
11941}
11942
11943int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
11944{
11945 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
11946 if (ret < 0)
11947 return ret;
11948
11949 return 0;
11950}
11951
11952int RGWRados::bi_remove(BucketShard& bs)
11953{
11954 int ret = bs.index_ctx.remove(bs.bucket_obj);
11955 if (ret == -ENOENT) {
11956 ret = 0;
11957 }
11958 if (ret < 0) {
11959 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
11960 return ret;
11961 }
11962
11963 return 0;
11964}
11965
11966int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
11967{
11968 BucketShard bs(this);
11969 int ret = bs.init(bucket, shard_id);
11970 if (ret < 0) {
11971 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11972 return ret;
11973 }
11974
11975 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
11976}
11977
11978int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
11979{
11980 return gc_pool_ctx.operate(oid, op);
11981}
11982
11983int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
11984{
11985 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
11986 int r = gc_pool_ctx.aio_operate(oid, c, op);
11987 c->release();
11988 return r;
11989}
11990
11991int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
11992{
11993 return gc_pool_ctx.operate(oid, op, pbl);
11994}
11995
11996int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
11997{
11998 return gc->list(index, marker, max, expired_only, result, truncated);
11999}
12000
12001int RGWRados::process_gc()
12002{
12003 return gc->process();
12004}
12005
12006int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12007{
12008 return lc->list_lc_progress(marker, max_entries, progress_map);
12009}
12010
12011int RGWRados::process_lc()
12012{
12013 return lc->process();
12014}
12015
12016int RGWRados::process_expire_objects()
12017{
12018 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12019 return 0;
12020}
12021
12022int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12023{
12024 bufferlist in;
12025 cls_rgw_bucket_init(op);
12026 return index_ctx.operate(oid, &op);
12027}
12028
12029int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12030 rgw_obj& obj, uint16_t bilog_flags)
12031{
12032 ObjectWriteOperation o;
12033 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12034 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags);
12035 return bs.index_ctx.operate(bs.bucket_obj, &o);
12036}
12037
12038int RGWRados::cls_obj_complete_op(BucketShard& bs, RGWModifyOp op, string& tag,
12039 int64_t pool, uint64_t epoch,
12040 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12041 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags)
12042{
12043 list<cls_rgw_obj_key> *pro = NULL;
12044 list<cls_rgw_obj_key> ro;
12045
12046 if (remove_objs) {
12047 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
12048 ro.push_back(*iter);
12049 }
12050 pro = &ro;
12051 }
12052
12053 ObjectWriteOperation o;
12054 rgw_bucket_dir_entry_meta dir_meta;
12055 dir_meta = ent.meta;
12056 dir_meta.category = category;
12057
12058 rgw_bucket_entry_ver ver;
12059 ver.pool = pool;
12060 ver.epoch = epoch;
12061 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12062 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, pro,
12063 get_zone().log_data, bilog_flags);
12064
12065 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12066 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, c, &o);
12067 c->release();
12068 return ret;
12069}
12070
12071int RGWRados::cls_obj_complete_add(BucketShard& bs, string& tag,
12072 int64_t pool, uint64_t epoch,
12073 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12074 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags)
12075{
12076 return cls_obj_complete_op(bs, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags);
12077}
12078
12079int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12080 int64_t pool, uint64_t epoch,
12081 rgw_obj& obj,
12082 real_time& removed_mtime,
12083 list<rgw_obj_index_key> *remove_objs,
12084 uint16_t bilog_flags)
12085{
12086 rgw_bucket_dir_entry ent;
12087 ent.meta.mtime = removed_mtime;
12088 obj.key.get_index_key(&ent.key);
12089 return cls_obj_complete_op(bs, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags);
12090}
12091
12092int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags)
12093{
12094 rgw_bucket_dir_entry ent;
12095 obj.key.get_index_key(&ent.key);
12096 return cls_obj_complete_op(bs, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags);
12097}
12098
12099int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12100{
12101 librados::IoCtx index_ctx;
12102 map<int, string> bucket_objs;
12103 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12104 if (r < 0)
12105 return r;
12106
12107 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12108}
12109
12110int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12111 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12112 bool *is_truncated, rgw_obj_index_key *last_entry,
12113 bool (*force_check_filter)(const string& name))
12114{
12115 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12116
12117 librados::IoCtx index_ctx;
12118 // key - oid (for different shards if there is any)
12119 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12120 map<int, string> oids;
12121 map<int, struct rgw_cls_list_ret> list_results;
12122 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12123 if (r < 0)
12124 return r;
12125
12126 cls_rgw_obj_key start_key(start.name, start.instance);
12127 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12128 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12129 if (r < 0)
12130 return r;
12131
12132 // Create a list of iterators that are used to iterate each shard
12133 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12134 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12135 vector<string> vnames(list_results.size());
12136 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12137 *is_truncated = false;
12138 for (; iter != list_results.end(); ++iter) {
12139 vcurrents.push_back(iter->second.dir.m.begin());
12140 vends.push_back(iter->second.dir.m.end());
12141 vnames.push_back(oids[iter->first]);
12142 *is_truncated = (*is_truncated || iter->second.is_truncated);
12143 }
12144
12145 // Create a map to track the next candidate entry from each shard, if the entry
12146 // from a specified shard is selected/erased, the next entry from that shard will
12147 // be inserted for next round selection
12148 map<string, size_t> candidates;
12149 for (size_t i = 0; i < vcurrents.size(); ++i) {
12150 if (vcurrents[i] != vends[i]) {
12151 candidates[vcurrents[i]->first] = i;
12152 }
12153 }
12154
12155 map<string, bufferlist> updates;
12156 uint32_t count = 0;
12157 while (count < num_entries && !candidates.empty()) {
12158 r = 0;
12159 // Select the next one
12160 int pos = candidates.begin()->second;
12161 const string& name = vcurrents[pos]->first;
12162 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12163
12164 bool force_check = force_check_filter && force_check_filter(dirent.key.name);
12165 if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) {
12166 /* there are uncommitted ops. We need to check the current state,
12167 * and if the tags are old we need to do cleanup as well. */
12168 librados::IoCtx sub_ctx;
12169 sub_ctx.dup(index_ctx);
12170 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
12171 if (r < 0 && r != -ENOENT) {
12172 return r;
12173 }
12174 }
12175 if (r >= 0) {
12176 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
12177 m[name] = std::move(dirent);
12178 ++count;
12179 }
12180
12181 // Refresh the candidates map
12182 candidates.erase(candidates.begin());
12183 ++vcurrents[pos];
12184 if (vcurrents[pos] != vends[pos]) {
12185 candidates[vcurrents[pos]->first] = pos;
12186 }
12187 }
12188
12189 // Suggest updates if there is any
12190 map<string, bufferlist>::iterator miter = updates.begin();
12191 for (; miter != updates.end(); ++miter) {
12192 if (miter->second.length()) {
12193 ObjectWriteOperation o;
12194 cls_rgw_suggest_changes(o, miter->second);
12195 // we don't care if we lose suggested updates, send them off blindly
12196 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12197 index_ctx.aio_operate(miter->first, c, &o);
12198 c->release();
12199 }
12200 }
12201
12202 // Check if all the returned entries are consumed or not
12203 for (size_t i = 0; i < vcurrents.size(); ++i) {
12204 if (vcurrents[i] != vends[i])
12205 *is_truncated = true;
12206 }
12207 if (!m.empty())
12208 *last_entry = m.rbegin()->first;
12209
12210 return 0;
12211}
12212
12213int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
12214{
12215 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12216
12217 rgw_rados_ref ref;
12218 rgw_pool pool;
12219 int r = get_raw_obj_ref(obj, &ref, &pool);
12220 if (r < 0) {
12221 return r;
12222 }
12223
12224 ObjectWriteOperation op;
12225 cls_rgw_usage_log_add(op, info);
12226
12227 r = ref.ioctx.operate(ref.oid, &op);
12228 return r;
12229}
12230
12231int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
12232 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
12233{
12234 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12235
12236 rgw_rados_ref ref;
12237 rgw_pool pool;
12238 int r = get_raw_obj_ref(obj, &ref, &pool);
12239 if (r < 0) {
12240 return r;
12241 }
12242
12243 *is_truncated = false;
12244
12245 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
12246 max_entries, read_iter, usage, is_truncated);
12247
12248 return r;
12249}
12250
12251int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
12252{
12253 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12254
12255 rgw_rados_ref ref;
12256 rgw_pool pool;
12257 int r = get_raw_obj_ref(obj, &ref, &pool);
12258 if (r < 0) {
12259 return r;
12260 }
12261
12262 ObjectWriteOperation op;
12263 cls_rgw_usage_log_trim(op, user, start_epoch, end_epoch);
12264
12265 r = ref.ioctx.operate(ref.oid, &op);
12266 return r;
12267}
12268
12269int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
12270{
12271 librados::IoCtx index_ctx;
12272 string dir_oid;
12273
12274 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12275
12276 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
12277 if (r < 0)
12278 return r;
12279
12280 bufferlist updates;
12281
12282 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
12283 rgw_bucket_dir_entry entry;
12284 entry.key = *iter;
12285 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
12286 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
12287 updates.append(CEPH_RGW_REMOVE | suggest_flag);
12288 ::encode(entry, updates);
12289 }
12290
12291 bufferlist out;
12292
12293 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
12294
12295 return r;
12296}
12297
12298int RGWRados::check_disk_state(librados::IoCtx io_ctx,
12299 const RGWBucketInfo& bucket_info,
12300 rgw_bucket_dir_entry& list_state,
12301 rgw_bucket_dir_entry& object,
12302 bufferlist& suggested_updates)
12303{
12304 const rgw_bucket& bucket = bucket_info.bucket;
12305 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12306
12307 std::string loc;
12308
12309 rgw_obj obj(bucket, list_state.key);
12310
12311 string oid;
12312 get_obj_bucket_and_oid_loc(obj, oid, loc);
12313
12314 if (loc != list_state.locator) {
12315 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
12316 }
12317
12318 io_ctx.locator_set_key(list_state.locator);
12319
12320 RGWObjState *astate = NULL;
12321 RGWObjectCtx rctx(this);
12322 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
12323 if (r < 0)
12324 return r;
12325
12326 list_state.pending_map.clear(); // we don't need this and it inflates size
12327 if (!astate->exists) {
12328 /* object doesn't exist right now -- hopefully because it's
12329 * marked as !exists and got deleted */
12330 if (list_state.exists) {
12331 /* FIXME: what should happen now? Work out if there are any
12332 * non-bad ways this could happen (there probably are, but annoying
12333 * to handle!) */
12334 }
12335 // encode a suggested removal of that key
12336 list_state.ver.epoch = io_ctx.get_last_version();
12337 list_state.ver.pool = io_ctx.get_id();
12338 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
12339 return -ENOENT;
12340 }
12341
12342 string etag;
12343 string content_type;
12344 ACLOwner owner;
12345
12346 object.meta.size = astate->size;
12347 object.meta.accounted_size = astate->accounted_size;
12348 object.meta.mtime = astate->mtime;
12349
12350 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
12351 if (iter != astate->attrset.end()) {
12352 etag = iter->second.c_str();
12353 }
12354 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
12355 if (iter != astate->attrset.end()) {
12356 content_type = iter->second.c_str();
12357 }
12358 iter = astate->attrset.find(RGW_ATTR_ACL);
12359 if (iter != astate->attrset.end()) {
12360 r = decode_policy(iter->second, &owner);
12361 if (r < 0) {
12362 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
12363 }
12364 }
12365
12366 if (astate->has_manifest) {
12367 RGWObjManifest::obj_iterator miter;
12368 RGWObjManifest& manifest = astate->manifest;
12369 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
12370 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
12371 rgw_obj loc;
12372 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
12373
12374 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
12375 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
12376 r = delete_obj_index(loc);
12377 if (r < 0) {
12378 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
12379 }
12380 }
12381 }
12382 }
12383
12384 object.meta.etag = etag;
12385 object.meta.content_type = content_type;
12386 object.meta.owner = owner.get_id().to_str();
12387 object.meta.owner_display_name = owner.get_display_name();
12388
12389 // encode suggested updates
12390 list_state.ver.pool = io_ctx.get_id();
12391 list_state.ver.epoch = astate->epoch;
12392 list_state.meta.size = object.meta.size;
12393 list_state.meta.accounted_size = object.meta.accounted_size;
12394 list_state.meta.mtime = object.meta.mtime;
12395 list_state.meta.category = main_category;
12396 list_state.meta.etag = etag;
12397 list_state.meta.content_type = content_type;
12398 if (astate->obj_tag.length() > 0)
12399 list_state.tag = astate->obj_tag.c_str();
12400 list_state.meta.owner = owner.get_id().to_str();
12401 list_state.meta.owner_display_name = owner.get_display_name();
12402
12403 list_state.exists = true;
12404 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
12405 return 0;
12406}
12407
12408int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
12409{
12410 librados::IoCtx index_ctx;
12411 map<int, string> oids;
12412 map<int, struct rgw_cls_list_ret> list_results;
12413 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
12414 if (r < 0)
12415 return r;
12416
12417 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12418 if (r < 0)
12419 return r;
12420
12421 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12422 for(; iter != list_results.end(); ++iter) {
12423 headers[oids[iter->first]] = iter->second.dir.header;
12424 }
12425 return 0;
12426}
12427
12428int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
12429{
12430 librados::IoCtx index_ctx;
12431 map<int, string> bucket_objs;
12432 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12433 if (r < 0)
12434 return r;
12435
12436 map<int, string>::iterator iter = bucket_objs.begin();
12437 for (; iter != bucket_objs.end(); ++iter) {
12438 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
12439 if (r < 0) {
12440 ctx->put();
12441 break;
12442 } else {
12443 (*num_aio)++;
12444 }
12445 }
12446 return r;
12447}
12448
12449int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
12450{
12451 string buckets_obj_id;
12452 rgw_get_buckets_obj(user_id, buckets_obj_id);
12453 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
12454
12455 rgw_rados_ref ref;
12456 rgw_pool pool;
12457 int r = get_raw_obj_ref(obj, &ref, &pool);
12458 if (r < 0) {
12459 return r;
12460 }
12461
12462 librados::ObjectReadOperation op;
12463 int rc;
12464 ::cls_user_get_header(op, header, &rc);
12465 bufferlist ibl;
12466 r = ref.ioctx.operate(ref.oid, &op, &ibl);
12467 if (r < 0)
12468 return r;
12469 if (rc < 0)
12470 return rc;
12471
12472 return 0;
12473}
12474
12475int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
12476{
12477 string buckets_obj_id;
12478 rgw_get_buckets_obj(user_id, buckets_obj_id);
12479 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
12480
12481 rgw_rados_ref ref;
12482 rgw_pool pool;
12483 int r = get_raw_obj_ref(obj, &ref, &pool);
12484 if (r < 0) {
12485 return r;
12486 }
12487
12488 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
12489 if (r < 0)
12490 return r;
12491
12492 return 0;
12493}
12494
12495int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
12496{
12497 map<string, struct rgw_bucket_dir_header> headers;
12498 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12499 if (r < 0) {
12500 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
12501 return r;
12502 }
12503
12504 cls_user_bucket_entry entry;
12505
12506 bucket_info.bucket.convert(&entry.bucket);
12507
12508 map<string, struct rgw_bucket_dir_header>::iterator hiter = headers.begin();
12509 for (; hiter != headers.end(); ++hiter) {
12510 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = hiter->second.stats.begin();
12511 for (; iter != hiter->second.stats.end(); ++iter) {
12512 struct rgw_bucket_category_stats& header_stats = iter->second;
12513 entry.size += header_stats.total_size;
12514 entry.size_rounded += header_stats.total_size_rounded;
12515 entry.count += header_stats.num_entries;
12516 }
12517 }
12518
12519 list<cls_user_bucket_entry> entries;
12520 entries.push_back(entry);
12521
12522 r = cls_user_update_buckets(user_obj, entries, false);
12523 if (r < 0) {
12524 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
12525 return r;
12526 }
12527
12528 return 0;
12529}
12530
12531int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
12532 const string& in_marker,
12533 const string& end_marker,
12534 const int max_entries,
12535 list<cls_user_bucket_entry>& entries,
12536 string * const out_marker,
12537 bool * const truncated)
12538{
12539 rgw_rados_ref ref;
12540 rgw_pool pool;
12541 int r = get_raw_obj_ref(obj, &ref, &pool);
12542 if (r < 0) {
12543 return r;
12544 }
12545
12546 librados::ObjectReadOperation op;
12547 int rc;
12548
12549 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
12550 bufferlist ibl;
12551 r = ref.ioctx.operate(ref.oid, &op, &ibl);
12552 if (r < 0)
12553 return r;
12554 if (rc < 0)
12555 return rc;
12556
12557 return 0;
12558}
12559
12560int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
12561{
12562 rgw_rados_ref ref;
12563 rgw_pool pool;
12564 int r = get_raw_obj_ref(obj, &ref, &pool);
12565 if (r < 0) {
12566 return r;
12567 }
12568
12569 librados::ObjectWriteOperation op;
12570 cls_user_set_buckets(op, entries, add);
12571 r = ref.ioctx.operate(ref.oid, &op);
12572 if (r < 0)
12573 return r;
12574
12575 return 0;
12576}
12577
12578int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
12579{
12580 string buckets_obj_id;
12581 rgw_get_buckets_obj(user_id, buckets_obj_id);
12582 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
12583 return cls_user_complete_stats_sync(obj);
12584}
12585
12586int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
12587{
12588 rgw_rados_ref ref;
12589 rgw_pool pool;
12590 int r = get_raw_obj_ref(obj, &ref, &pool);
12591 if (r < 0) {
12592 return r;
12593 }
12594
12595 librados::ObjectWriteOperation op;
12596 ::cls_user_complete_stats_sync(op);
12597 r = ref.ioctx.operate(ref.oid, &op);
12598 if (r < 0)
12599 return r;
12600
12601 return 0;
12602}
12603
12604int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
12605{
12606 list<cls_user_bucket_entry> l;
12607 l.push_back(entry);
12608
12609 return cls_user_update_buckets(obj, l, true);
12610}
12611
12612int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
12613{
12614 rgw_pool p;
12615 rgw_rados_ref ref;
12616 int r = get_system_obj_ref(obj, &ref, &p);
12617 if (r < 0) {
12618 return r;
12619 }
12620
12621 librados::ObjectWriteOperation op;
12622 ::cls_user_remove_bucket(op, bucket);
12623 r = ref.ioctx.operate(ref.oid, &op);
12624 if (r < 0)
12625 return r;
12626
12627 return 0;
12628}
12629
12630int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
12631 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
12632{
12633 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
12634}
12635
12636void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
12637 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
12638{
12639 if (!num_shards) {
12640 bucket_objects[0] = bucket_oid_base;
12641 } else {
12642 char buf[bucket_oid_base.size() + 32];
12643 if (shard_id < 0) {
12644 for (uint32_t i = 0; i < num_shards; ++i) {
12645 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
12646 bucket_objects[i] = buf;
12647 }
12648 } else {
12649 if ((uint32_t)shard_id > num_shards) {
12650 return;
12651 }
12652 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
12653 bucket_objects[shard_id] = buf;
12654 }
12655 }
12656}
12657
12658void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
12659{
12660 const rgw_bucket& bucket = bucket_info.bucket;
12661 string plain_id = bucket.name + ":" + bucket.bucket_id;
12662 if (!bucket_info.num_shards) {
12663 (*result)[0] = plain_id;
12664 } else {
12665 char buf[16];
12666 if (shard_id < 0) {
12667 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
12668 snprintf(buf, sizeof(buf), ":%d", i);
12669 (*result)[i] = plain_id + buf;
12670 }
12671 } else {
12672 if ((uint32_t)shard_id > bucket_info.num_shards) {
12673 return;
12674 }
12675 snprintf(buf, sizeof(buf), ":%d", shard_id);
12676 (*result)[shard_id] = plain_id + buf;
12677 }
12678 }
12679}
12680
12681int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
12682 int *shard_id)
12683{
12684 int r = 0;
12685 switch (bucket_info.bucket_index_shard_hash_type) {
12686 case RGWBucketInfo::MOD:
12687 if (!bucket_info.num_shards) {
12688 if (shard_id) {
12689 *shard_id = -1;
12690 }
12691 } else {
12692 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
12693 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
12694 sid = sid2 % MAX_BUCKET_INDEX_SHARDS_PRIME % bucket_info.num_shards;
12695 if (shard_id) {
12696 *shard_id = (int)sid;
12697 }
12698 }
12699 break;
12700 default:
12701 r = -ENOTSUP;
12702 }
12703 return r;
12704}
12705
12706void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
12707 int shard_id, string *bucket_obj)
12708{
12709 if (!num_shards) {
12710 // By default with no sharding, we use the bucket oid as itself
12711 (*bucket_obj) = bucket_oid_base;
12712 } else {
12713 char buf[bucket_oid_base.size() + 32];
12714 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
12715 (*bucket_obj) = buf;
12716 }
12717}
12718
12719int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
12720 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
12721{
12722 int r = 0;
12723 switch (hash_type) {
12724 case RGWBucketInfo::MOD:
12725 if (!num_shards) {
12726 // By default with no sharding, we use the bucket oid as itself
12727 (*bucket_obj) = bucket_oid_base;
12728 if (shard_id) {
12729 *shard_id = -1;
12730 }
12731 } else {
12732 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
12733 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
12734 sid = sid2 % MAX_BUCKET_INDEX_SHARDS_PRIME % num_shards;
12735 char buf[bucket_oid_base.size() + 32];
12736 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
12737 (*bucket_obj) = buf;
12738 if (shard_id) {
12739 *shard_id = (int)sid;
12740 }
12741 }
12742 break;
12743 default:
12744 r = -ENOTSUP;
12745 }
12746 return r;
12747}
12748
12749void RGWStateLog::oid_str(int shard, string& oid) {
12750 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
12751 char buf[16];
12752 snprintf(buf, sizeof(buf), "%d", shard);
12753 oid += buf;
12754}
12755
12756int RGWStateLog::get_shard_num(const string& object) {
12757 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
12758 return val % num_shards;
12759}
12760
12761string RGWStateLog::get_oid(const string& object) {
12762 int shard = get_shard_num(object);
12763 string oid;
12764 oid_str(shard, oid);
12765 return oid;
12766}
12767
12768int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
12769 rgw_pool pool;
12770 store->get_log_pool(pool);
12771 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
12772 if (r < 0) {
12773 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
12774 return r;
12775 }
12776 return 0;
12777}
12778
12779int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
12780 uint32_t state, bufferlist *bl, uint32_t *check_state)
12781{
12782 if (client_id.empty() ||
12783 op_id.empty() ||
12784 object.empty()) {
12785 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
12786 }
12787
12788 librados::IoCtx ioctx;
12789 int r = open_ioctx(ioctx);
12790 if (r < 0)
12791 return r;
12792
12793 string oid = get_oid(object);
12794
12795 librados::ObjectWriteOperation op;
12796 if (check_state) {
12797 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
12798 }
12799 utime_t ts = ceph_clock_now();
12800 bufferlist nobl;
12801 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
12802 r = ioctx.operate(oid, &op);
12803 if (r < 0) {
12804 return r;
12805 }
12806
12807 return 0;
12808}
12809
12810int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
12811{
12812 if (client_id.empty() ||
12813 op_id.empty() ||
12814 object.empty()) {
12815 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
12816 }
12817
12818 librados::IoCtx ioctx;
12819 int r = open_ioctx(ioctx);
12820 if (r < 0)
12821 return r;
12822
12823 string oid = get_oid(object);
12824
12825 librados::ObjectWriteOperation op;
12826 cls_statelog_remove_by_object(op, object, op_id);
12827 r = ioctx.operate(oid, &op);
12828 if (r < 0) {
12829 return r;
12830 }
12831
12832 return 0;
12833}
12834
12835void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
12836 void **handle)
12837{
12838 list_state *state = new list_state;
12839 state->client_id = client_id;
12840 state->op_id = op_id;
12841 state->object = object;
12842 if (object.empty()) {
12843 state->cur_shard = 0;
12844 state->max_shard = num_shards - 1;
12845 } else {
12846 state->cur_shard = state->max_shard = get_shard_num(object);
12847 }
12848 *handle = (void *)state;
12849}
12850
12851int RGWStateLog::list_entries(void *handle, int max_entries,
12852 list<cls_statelog_entry>& entries,
12853 bool *done)
12854{
12855 list_state *state = static_cast<list_state *>(handle);
12856
12857 librados::IoCtx ioctx;
12858 int r = open_ioctx(ioctx);
12859 if (r < 0)
12860 return r;
12861
12862 entries.clear();
12863
12864 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
12865 string oid;
12866 oid_str(state->cur_shard, oid);
12867
12868 librados::ObjectReadOperation op;
12869 list<cls_statelog_entry> ents;
12870 bool truncated;
12871 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
12872 max_entries, ents, &state->marker, &truncated);
12873 bufferlist ibl;
12874 r = ioctx.operate(oid, &op, &ibl);
12875 if (r == -ENOENT) {
12876 truncated = false;
12877 r = 0;
12878 }
12879 if (r < 0) {
12880 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
12881 return r;
12882 }
12883
12884 if (!truncated) {
12885 state->marker.clear();
12886 }
12887
12888 max_entries -= ents.size();
12889
12890 entries.splice(entries.end(), ents);
12891
12892 if (truncated)
12893 break;
12894 }
12895
12896 *done = (state->cur_shard > state->max_shard);
12897
12898 return 0;
12899}
12900
12901void RGWStateLog::finish_list_entries(void *handle)
12902{
12903 list_state *state = static_cast<list_state *>(handle);
12904 delete state;
12905}
12906
12907void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
12908{
12909 f->open_object_section("statelog_entry");
12910 f->dump_string("client_id", entry.client_id);
12911 f->dump_string("op_id", entry.op_id);
12912 f->dump_string("object", entry.object);
12913 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
12914 if (!dump_entry_internal(entry, f)) {
12915 f->dump_int("state", entry.state);
12916 }
12917 f->close_section();
12918}
12919
12920RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
12921{
12922}
12923
12924bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
12925{
12926 string s;
12927 switch ((OpState)entry.state) {
12928 case OPSTATE_UNKNOWN:
12929 s = "unknown";
12930 break;
12931 case OPSTATE_IN_PROGRESS:
12932 s = "in-progress";
12933 break;
12934 case OPSTATE_COMPLETE:
12935 s = "complete";
12936 break;
12937 case OPSTATE_ERROR:
12938 s = "error";
12939 break;
12940 case OPSTATE_ABORT:
12941 s = "abort";
12942 break;
12943 case OPSTATE_CANCELLED:
12944 s = "cancelled";
12945 break;
12946 default:
12947 s = "invalid";
12948 }
12949 f->dump_string("state", s);
12950 return true;
12951}
12952
12953int RGWOpState::state_from_str(const string& s, OpState *state)
12954{
12955 if (s == "unknown") {
12956 *state = OPSTATE_UNKNOWN;
12957 } else if (s == "in-progress") {
12958 *state = OPSTATE_IN_PROGRESS;
12959 } else if (s == "complete") {
12960 *state = OPSTATE_COMPLETE;
12961 } else if (s == "error") {
12962 *state = OPSTATE_ERROR;
12963 } else if (s == "abort") {
12964 *state = OPSTATE_ABORT;
12965 } else if (s == "cancelled") {
12966 *state = OPSTATE_CANCELLED;
12967 } else {
12968 return -EINVAL;
12969 }
12970
12971 return 0;
12972}
12973
12974int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
12975{
12976 uint32_t s = (uint32_t)state;
12977 return store_entry(client_id, op_id, object, s, NULL, NULL);
12978}
12979
12980int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
12981{
12982 uint32_t s = (uint32_t)state;
12983 return store_entry(client_id, op_id, object, s, NULL, &s);
12984}
12985
12986RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
12987 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
12988{
12989 cct = store->ctx();
12990 cur_state = RGWOpState::OPSTATE_UNKNOWN;
12991}
12992
12993int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
12994 last_update = real_clock::now();
12995 cur_state = state;
12996 return os.set_state(client_id, op_id, object, state);
12997}
12998
12999int RGWOpStateSingleOp::renew_state() {
13000 real_time now = real_clock::now();
13001
13002 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13003
13004 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13005 return 0;
13006 }
13007
13008 last_update = now;
13009 return os.renew_state(client_id, op_id, object, cur_state);
13010}
13011
13012
13013uint64_t RGWRados::instance_id()
13014{
13015 return get_rados_handle()->get_instance_id();
13016}
13017
13018uint64_t RGWRados::next_bucket_id()
13019{
13020 Mutex::Locker l(bucket_id_lock);
13021 return ++max_bucket_id;
13022}
13023
13024RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread)
13025{
13026 int use_cache = cct->_conf->rgw_cache_enabled;
13027 RGWRados *store = NULL;
13028 if (!use_cache) {
13029 store = new RGWRados;
13030 } else {
13031 store = new RGWCache<RGWRados>;
13032 }
13033
13034 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread) < 0) {
13035 delete store;
13036 return NULL;
13037 }
13038
13039 return store;
13040}
13041
13042RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13043{
13044 RGWRados *store = NULL;
13045 store = new RGWRados;
13046
13047 store->set_context(cct);
13048
13049 if (store->init_rados() < 0) {
13050 delete store;
13051 return NULL;
13052 }
13053
13054 return store;
13055}
13056
13057void RGWStoreManager::close_storage(RGWRados *store)
13058{
13059 if (!store)
13060 return;
13061
13062 store->finalize();
13063
13064 delete store;
13065}
13066
13067librados::Rados* RGWRados::get_rados_handle()
13068{
13069 if (rados.size() == 1) {
13070 return &rados[0];
13071 } else {
13072 handle_lock.get_read();
13073 pthread_t id = pthread_self();
13074 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13075
13076 if (it != rados_map.end()) {
13077 handle_lock.put_read();
13078 return &rados[it->second];
13079 } else {
13080 handle_lock.put_read();
13081 handle_lock.get_write();
13082 const uint32_t handle = next_rados_handle;
13083 rados_map[id] = handle;
13084 if (++next_rados_handle == rados.size()) {
13085 next_rados_handle = 0;
13086 }
13087 handle_lock.put_write();
13088 return &rados[handle];
13089 }
13090 }
13091}
13092
13093int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
13094{
13095 rgw_rados_ref ref;
13096 int ret = get_raw_obj_ref(obj, &ref);
13097 if (ret < 0) {
13098 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13099 return ret;
13100 }
13101
13102 ObjectWriteOperation op;
13103 list<string> prefixes;
13104 cls_rgw_remove_obj(op, prefixes);
13105
13106 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13107 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13108 if (ret < 0) {
13109 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13110 c->release();
13111 return ret;
13112 }
13113
13114 handles.push_back(c);
13115
13116 return 0;
13117}
13118
13119int RGWRados::delete_obj_aio(const rgw_obj& obj,
13120 RGWBucketInfo& bucket_info, RGWObjState *astate,
13121 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
13122{
13123 rgw_rados_ref ref;
13124 int ret = get_obj_head_ref(bucket_info, obj, &ref);
13125 if (ret < 0) {
13126 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13127 return ret;
13128 }
13129
13130 if (keep_index_consistent) {
13131 RGWRados::Bucket bop(this, bucket_info);
13132 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
13133
13134 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
13135 if (ret < 0) {
13136 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
13137 return ret;
13138 }
13139 }
13140
13141 ObjectWriteOperation op;
13142 list<string> prefixes;
13143 cls_rgw_remove_obj(op, prefixes);
13144
13145 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13146 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13147 if (ret < 0) {
13148 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13149 c->release();
13150 return ret;
13151 }
13152
13153 handles.push_back(c);
13154
13155 if (keep_index_consistent) {
13156 ret = delete_obj_index(obj);
13157 if (ret < 0) {
13158 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
13159 return ret;
13160 }
13161 }
13162 return ret;
13163}
13164
13165int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
13166 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
13167 if (value != attrs.end()) {
13168 bufferlist::iterator bliter = value->second.begin();
13169 try {
13170 ::decode(cs_info, bliter);
13171 } catch (buffer::error& err) {
13172 return -EIO;
13173 }
13174 if (cs_info.blocks.size() == 0) {
13175 return -EIO;
13176 }
13177 if (cs_info.compression_type != "none")
13178 need_decompress = true;
13179 else
13180 need_decompress = false;
13181 return 0;
13182 } else {
13183 need_decompress = false;
13184 return 0;
13185 }
13186}
13187