]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
update sources to v12.2.1
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1
2 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
3 // vim: ts=8 sw=2 smarttab
4
5 #include "include/compat.h"
6 #include <errno.h>
7 #include <stdlib.h>
8 #include <sys/types.h>
9 #include <boost/algorithm/string.hpp>
10
11 #include <boost/format.hpp>
12 #include <boost/optional.hpp>
13 #include <boost/utility/in_place_factory.hpp>
14
15 #include "common/ceph_json.h"
16 #include "common/utf8.h"
17
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
21 #include "common/Finisher.h"
22
23 #include "rgw_rados.h"
24 #include "rgw_cache.h"
25 #include "rgw_acl.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_metadata.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32
33 #include "cls/rgw/cls_rgw_ops.h"
34 #include "cls/rgw/cls_rgw_types.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/statelog/cls_statelog_client.h"
41 #include "cls/timeindex/cls_timeindex_client.h"
42 #include "cls/lock/cls_lock_client.h"
43 #include "cls/user/cls_user_client.h"
44 #include "osd/osd_types.h"
45
46 #include "rgw_tools.h"
47 #include "rgw_coroutine.h"
48 #include "rgw_compression.h"
49
50 #undef fork // fails to compile RGWPeriod::fork() below
51
52 #include "common/Clock.h"
53
54 #include "include/rados/librados.hpp"
55 using namespace librados;
56
57 #include <string>
58 #include <iostream>
59 #include <vector>
60 #include <atomic>
61 #include <list>
62 #include <map>
63 #include "auth/Crypto.h" // get_random_bytes()
64
65 #include "rgw_log.h"
66
67 #include "rgw_gc.h"
68 #include "rgw_lc.h"
69
70 #include "rgw_object_expirer_core.h"
71 #include "rgw_sync.h"
72 #include "rgw_data_sync.h"
73 #include "rgw_realm_watcher.h"
74 #include "rgw_reshard.h"
75
76 #include "compressor/Compressor.h"
77
78 #define dout_context g_ceph_context
79 #define dout_subsys ceph_subsys_rgw
80
81 using namespace std;
82
// File-scope defaults: object-name prefixes/suffixes, well-known object ids
// and default pool names referenced by the definitions in this file.
static string notify_oid_prefix = "notify";
static string *notify_oids = NULL;
static string shadow_ns = "shadow";
static string dir_oid_prefix = ".dir.";
static string default_storage_pool_suffix = "rgw.buckets.data";
static string default_bucket_index_pool_suffix = "rgw.buckets.index";
static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
static string avail_pools = ".pools.avail";

// Object-id prefixes and default object ids for the zone / zonegroup
// (formerly "region") / realm / period metadata objects. The *_names_*
// prefixes are combined with an object name, the *_info_* prefixes with an
// object id (see RGWSystemMetaObj::read_id() / read_info()).
static string zone_info_oid_prefix = "zone_info.";
static string zone_names_oid_prefix = "zone_names.";
static string region_info_oid_prefix = "region_info.";
static string zone_group_info_oid_prefix = "zonegroup_info.";
static string realm_names_oid_prefix = "realms_names.";
static string realm_info_oid_prefix = "realms.";
static string default_region_info_oid = "default.region";
static string default_zone_group_info_oid = "default.zonegroup";
static string period_info_oid_prefix = "periods.";
static string period_latest_epoch_info_oid = ".latest_epoch";
static string region_map_oid = "region_map";
static string zonegroup_map_oid = "zonegroup_map";
static string log_lock_name = "rgw_log_lock";
static string default_realm_info_oid = "default.realm";
// Names given to the zonegroup and zone built by RGWZoneGroup::create_default().
const string default_zonegroup_name = "default";
const string default_zone_name = "default";
static string zonegroup_names_oid_prefix = "zonegroups_names.";
static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
#define RGW_USAGE_OBJ_PREFIX "usage."
#define FIRST_EPOCH 1
// Fallback root pool names, used when the matching rgw_*_root_pool config
// option is empty (visible here for zonegroup, realm and period; the zone
// variant is presumably used the same way -- confirm against
// RGWZoneParams::get_pool()).
static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
116
117 #define RGW_STATELOG_OBJ_PREFIX "statelog."
118
119 #define dout_subsys ceph_subsys_rgw
120
121
122 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
123 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
124 {
125 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
126 RGWZonePlacementInfo placement;
127 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
128 return false;
129 }
130
131 if (!obj.in_extra_data) {
132 *pool = placement.data_pool;
133 } else {
134 *pool = placement.get_data_extra_pool();
135 }
136 }
137
138 return true;
139 }
140
141 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
142 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
143 {
144 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
145
146 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
147 }
148
149 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
150 {
151 if (!is_raw) {
152 rgw_raw_obj r;
153 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
154 return r;
155 }
156 return raw_obj;
157 }
158
159 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
160 {
161 if (!is_raw) {
162 rgw_raw_obj r;
163 store->obj_to_raw(placement_rule, obj, &r);
164 return r;
165 }
166 return raw_obj;
167 }
168
169 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
170 {
171 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
172 if (r == -ENOENT && create) {
173 r = rados->pool_create(pool.name.c_str());
174 if (r < 0 && r != -EEXIST) {
175 return r;
176 }
177
178 r = rados->ioctx_create(pool.name.c_str(), ioctx);
179 if (r < 0) {
180 return r;
181 }
182
183 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
184 if (r < 0 && r != -EOPNOTSUPP) {
185 return r;
186 }
187 } else if (r < 0) {
188 return r;
189 }
190 if (!pool.ns.empty()) {
191 ioctx.set_namespace(pool.ns);
192 }
193 return 0;
194 }
195
196 template<>
197 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
198 RWLock::WLocker wl(lock);
199 auto iter = objs_state.find(obj);
200 if (iter == objs_state.end()) {
201 return;
202 }
203 bool is_atomic = iter->second.is_atomic;
204 bool prefetch_data = iter->second.prefetch_data;
205
206 objs_state.erase(iter);
207
208 if (is_atomic || prefetch_data) {
209 auto& s = objs_state[obj];
210 s.is_atomic = is_atomic;
211 s.prefetch_data = prefetch_data;
212 }
213 }
214
215 template<>
216 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
217 RWLock::WLocker wl(lock);
218 auto iter = objs_state.find(obj);
219 if (iter == objs_state.end()) {
220 return;
221 }
222
223 objs_state.erase(iter);
224 }
225
226 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
227 encode_json("default_zonegroup", default_zonegroup, f);
228 }
229
230 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
231
232 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
233 /* backward compatability with region */
234 if (default_zonegroup.empty()) {
235 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
236 }
237 }
238
239 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
240 {
241 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
242 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
243 }
244
245 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
246 }
247
248 int RGWZoneGroup::create_default(bool old_format)
249 {
250 name = default_zonegroup_name;
251 is_master = true;
252
253 RGWZoneGroupPlacementTarget placement_target;
254 placement_target.name = "default-placement";
255 placement_targets[placement_target.name] = placement_target;
256 default_placement = "default-placement";
257
258 RGWZoneParams zone_params(default_zone_name);
259
260 int r = zone_params.init(cct, store, false);
261 if (r < 0) {
262 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
263 return r;
264 }
265
266 r = zone_params.create_default();
267 if (r < 0 && r != -EEXIST) {
268 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
269 return r;
270 } else if (r == -EEXIST) {
271 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
272 zone_params.clear_id();
273 r = zone_params.init(cct, store);
274 if (r < 0) {
275 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 }
278 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
279 << dendl;
280 }
281
282 RGWZone& default_zone = zones[zone_params.get_id()];
283 default_zone.name = zone_params.get_name();
284 default_zone.id = zone_params.get_id();
285 master_zone = default_zone.id;
286
287 r = create();
288 if (r < 0 && r != -EEXIST) {
289 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
290 return r;
291 }
292
293 if (r == -EEXIST) {
294 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
295 id.clear();
296 r = init(cct, store);
297 if (r < 0) {
298 return r;
299 }
300 }
301
302 if (old_format) {
303 name = id;
304 }
305
306 post_process_params();
307
308 return 0;
309 }
310
311 const string RGWZoneGroup::get_default_oid(bool old_region_format)
312 {
313 if (old_region_format) {
314 if (cct->_conf->rgw_default_region_info_oid.empty()) {
315 return default_region_info_oid;
316 }
317 return cct->_conf->rgw_default_region_info_oid;
318 }
319
320 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
321
322 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
323 default_oid = default_zone_group_info_oid;
324 }
325
326 default_oid += "." + realm_id;
327
328 return default_oid;
329 }
330
331 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
332 {
333 if (old_region_format) {
334 return region_info_oid_prefix;
335 }
336 return zone_group_info_oid_prefix;
337 }
338
339 const string& RGWZoneGroup::get_names_oid_prefix()
340 {
341 return zonegroup_names_oid_prefix;
342 }
343
344 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
345 return cct->_conf->rgw_zonegroup;
346 }
347
348 int RGWZoneGroup::equals(const string& other_zonegroup) const
349 {
350 if (is_master && other_zonegroup.empty())
351 return true;
352
353 return (id == other_zonegroup);
354 }
355
356 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
357 const list<string>& endpoints, const string *ptier_type,
358 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
359 {
360 auto& zone_id = zone_params.get_id();
361 auto& zone_name = zone_params.get_name();
362
363 // check for duplicate zone name on insert
364 if (!zones.count(zone_id)) {
365 for (const auto& zone : zones) {
366 if (zone.second.name == zone_name) {
367 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
368 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
369 return -EEXIST;
370 }
371 }
372 }
373
374 if (is_master) {
375 if (*is_master) {
376 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
377 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
378 }
379 master_zone = zone_params.get_id();
380 } else if (master_zone == zone_params.get_id()) {
381 master_zone.clear();
382 }
383 }
384
385 RGWZone& zone = zones[zone_params.get_id()];
386 zone.name = zone_params.get_name();
387 zone.id = zone_params.get_id();
388 if (!endpoints.empty()) {
389 zone.endpoints = endpoints;
390 }
391 if (read_only) {
392 zone.read_only = *read_only;
393 }
394 if (ptier_type) {
395 zone.tier_type = *ptier_type;
396 }
397
398 if (psync_from_all) {
399 zone.sync_from_all = *psync_from_all;
400 }
401
402 for (auto add : sync_from) {
403 zone.sync_from.insert(add);
404 }
405
406 for (auto rm : sync_from_rm) {
407 zone.sync_from.erase(rm);
408 }
409
410 post_process_params();
411
412 return update();
413 }
414
415
416 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
417 {
418 RGWZone& zone = zones[zone_params.get_id()];
419 zone.name = zone_params.get_name();
420
421 return update();
422 }
423
424 void RGWZoneGroup::post_process_params()
425 {
426 bool log_data = zones.size() > 1;
427
428 if (master_zone.empty()) {
429 map<string, RGWZone>::iterator iter = zones.begin();
430 if (iter != zones.end()) {
431 master_zone = iter->first;
432 }
433 }
434
435 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
436 RGWZone& zone = iter->second;
437 zone.log_data = log_data;
438
439 RGWZoneParams zone_params(zone.id, zone.name);
440 int ret = zone_params.init(cct, store);
441 if (ret < 0) {
442 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
443 continue;
444 }
445
446 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
447 iter != zone_params.placement_pools.end(); ++iter) {
448 const string& placement_name = iter->first;
449 if (placement_targets.find(placement_name) == placement_targets.end()) {
450 RGWZoneGroupPlacementTarget placement_target;
451 placement_target.name = placement_name;
452 placement_targets[placement_name] = placement_target;
453 }
454 }
455 }
456
457 if (default_placement.empty() && !placement_targets.empty()) {
458 default_placement = placement_targets.begin()->first;
459 }
460 }
461
462 int RGWZoneGroup::remove_zone(const std::string& zone_id)
463 {
464 map<string, RGWZone>::iterator iter = zones.find(zone_id);
465 if (iter == zones.end()) {
466 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
467 << name << dendl;
468 return -ENOENT;
469 }
470
471 zones.erase(iter);
472
473 post_process_params();
474
475 return update();
476 }
477
478 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
479 {
480 if (realm_id.empty()) {
481 /* try using default realm */
482 RGWRealm realm;
483 int ret = realm.init(cct, store);
484 if (ret < 0) {
485 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
486 return -ENOENT;
487 }
488 realm_id = realm.get_id();
489 }
490
491 return RGWSystemMetaObj::read_default_id(default_id, old_format);
492 }
493
494 int RGWZoneGroup::set_as_default(bool exclusive)
495 {
496 if (realm_id.empty()) {
497 /* try using default realm */
498 RGWRealm realm;
499 int ret = realm.init(cct, store);
500 if (ret < 0) {
501 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
502 return -EINVAL;
503 }
504 realm_id = realm.get_id();
505 }
506
507 return RGWSystemMetaObj::set_as_default(exclusive);
508 }
509
510 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
511 {
512 cct = _cct;
513 store = _store;
514
515 if (!setup_obj)
516 return 0;
517
518 if (old_format && id.empty()) {
519 id = name;
520 }
521
522 if (id.empty()) {
523 int r;
524 if (name.empty()) {
525 name = get_predefined_name(cct);
526 }
527 if (name.empty()) {
528 r = use_default(old_format);
529 if (r < 0) {
530 return r;
531 }
532 } else if (!old_format) {
533 r = read_id(name, id);
534 if (r < 0) {
535 if (r != -ENOENT) {
536 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
537 }
538 return r;
539 }
540 }
541 }
542
543 return read_info(id, old_format);
544 }
545
546 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
547 {
548 auto pool = get_pool(cct);
549 bufferlist bl;
550 RGWObjectCtx obj_ctx(store);
551 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
552 if (ret < 0)
553 return ret;
554
555 try {
556 bufferlist::iterator iter = bl.begin();
557 ::decode(default_info, iter);
558 } catch (buffer::error& err) {
559 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
560 return -EIO;
561 }
562
563 return 0;
564 }
565
566 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
567 {
568 RGWDefaultSystemMetaObjInfo default_info;
569
570 int ret = read_default(default_info, get_default_oid(old_format));
571 if (ret < 0) {
572 return ret;
573 }
574
575 default_id = default_info.default_id;
576
577 return 0;
578 }
579
580 int RGWSystemMetaObj::use_default(bool old_format)
581 {
582 return read_default_id(id, old_format);
583 }
584
585 int RGWSystemMetaObj::set_as_default(bool exclusive)
586 {
587 string oid = get_default_oid();
588
589 rgw_pool pool(get_pool(cct));
590 bufferlist bl;
591
592 RGWDefaultSystemMetaObjInfo default_info;
593 default_info.default_id = id;
594
595 ::encode(default_info, bl);
596
597 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
598 exclusive, NULL, real_time(), NULL);
599 if (ret < 0)
600 return ret;
601
602 return 0;
603 }
604
605 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
606 {
607 rgw_pool pool(get_pool(cct));
608 bufferlist bl;
609
610 string oid = get_names_oid_prefix() + obj_name;
611
612 RGWObjectCtx obj_ctx(store);
613 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
614 if (ret < 0) {
615 return ret;
616 }
617
618 RGWNameToId nameToId;
619 try {
620 bufferlist::iterator iter = bl.begin();
621 ::decode(nameToId, iter);
622 } catch (buffer::error& err) {
623 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
624 return -EIO;
625 }
626 object_id = nameToId.obj_id;
627 return 0;
628 }
629
630 int RGWSystemMetaObj::delete_obj(bool old_format)
631 {
632 rgw_pool pool(get_pool(cct));
633
634 /* check to see if obj is the default */
635 RGWDefaultSystemMetaObjInfo default_info;
636 int ret = read_default(default_info, get_default_oid(old_format));
637 if (ret < 0 && ret != -ENOENT)
638 return ret;
639 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
640 string oid = get_default_oid(old_format);
641 rgw_raw_obj default_named_obj(pool, oid);
642 ret = store->delete_system_obj(default_named_obj);
643 if (ret < 0) {
644 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
645 return ret;
646 }
647 }
648 if (!old_format) {
649 string oid = get_names_oid_prefix() + name;
650 rgw_raw_obj object_name(pool, oid);
651 ret = store->delete_system_obj(object_name);
652 if (ret < 0) {
653 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
654 return ret;
655 }
656 }
657
658 string oid = get_info_oid_prefix(old_format);
659 if (old_format) {
660 oid += name;
661 } else {
662 oid += id;
663 }
664
665 rgw_raw_obj object_id(pool, oid);
666 ret = store->delete_system_obj(object_id);
667 if (ret < 0) {
668 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
669 }
670
671 return ret;
672 }
673
674 int RGWSystemMetaObj::store_name(bool exclusive)
675 {
676 rgw_pool pool(get_pool(cct));
677 string oid = get_names_oid_prefix() + name;
678
679 RGWNameToId nameToId;
680 nameToId.obj_id = id;
681
682 bufferlist bl;
683 ::encode(nameToId, bl);
684 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
685 }
686
687 int RGWSystemMetaObj::rename(const string& new_name)
688 {
689 string new_id;
690 int ret = read_id(new_name, new_id);
691 if (!ret) {
692 return -EEXIST;
693 }
694 if (ret < 0 && ret != -ENOENT) {
695 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
696 return ret;
697 }
698 string old_name = name;
699 name = new_name;
700 ret = update();
701 if (ret < 0) {
702 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 ret = store_name(true);
706 if (ret < 0) {
707 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
708 return ret;
709 }
710 /* delete old name */
711 rgw_pool pool(get_pool(cct));
712 string oid = get_names_oid_prefix() + old_name;
713 rgw_raw_obj old_name_obj(pool, oid);
714 ret = store->delete_system_obj(old_name_obj);
715 if (ret < 0) {
716 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
717 return ret;
718 }
719
720 return ret;
721 }
722
723 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
724 {
725 rgw_pool pool(get_pool(cct));
726
727 bufferlist bl;
728
729 string oid = get_info_oid_prefix(old_format) + obj_id;
730
731 RGWObjectCtx obj_ctx(store);
732 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
733 if (ret < 0) {
734 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
735 return ret;
736 }
737
738 try {
739 bufferlist::iterator iter = bl.begin();
740 ::decode(*this, iter);
741 } catch (buffer::error& err) {
742 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
743 return -EIO;
744 }
745
746 return 0;
747 }
748
749 int RGWSystemMetaObj::read()
750 {
751 int ret = read_id(name, id);
752 if (ret < 0) {
753 return ret;
754 }
755
756 return read_info(id);
757 }
758
759 int RGWSystemMetaObj::create(bool exclusive)
760 {
761 int ret;
762
763 /* check to see the name is not used */
764 ret = read_id(name, id);
765 if (exclusive && ret == 0) {
766 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
767 return -EEXIST;
768 } else if ( ret < 0 && ret != -ENOENT) {
769 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
770 return ret;
771 }
772
773 if (id.empty()) {
774 /* create unique id */
775 uuid_d new_uuid;
776 char uuid_str[37];
777 new_uuid.generate_random();
778 new_uuid.print(uuid_str);
779 id = uuid_str;
780 }
781
782 ret = store_info(exclusive);
783 if (ret < 0) {
784 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
785 return ret;
786 }
787
788 return store_name(exclusive);
789 }
790
791 int RGWSystemMetaObj::store_info(bool exclusive)
792 {
793 rgw_pool pool(get_pool(cct));
794
795 string oid = get_info_oid_prefix() + id;
796
797 bufferlist bl;
798 ::encode(*this, bl);
799 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
800 }
801
802 int RGWSystemMetaObj::write(bool exclusive)
803 {
804 int ret = store_info(exclusive);
805 if (ret < 0) {
806 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
807 return ret;
808 }
809 ret = store_name(exclusive);
810 if (ret < 0) {
811 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
812 return ret;
813 }
814 return 0;
815 }
816
817
818 const string& RGWRealm::get_predefined_name(CephContext *cct) {
819 return cct->_conf->rgw_realm;
820 }
821
822 int RGWRealm::create(bool exclusive)
823 {
824 int ret = RGWSystemMetaObj::create(exclusive);
825 if (ret < 0) {
826 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
827 return ret;
828 }
829 // create the control object for watch/notify
830 ret = create_control(exclusive);
831 if (ret < 0) {
832 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
833 return ret;
834 }
835 RGWPeriod period;
836 if (current_period.empty()) {
837 /* create new period for the realm */
838 ret = period.init(cct, store, id, name, false);
839 if (ret < 0 ) {
840 return ret;
841 }
842 ret = period.create(true);
843 if (ret < 0) {
844 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
845 return ret;
846 }
847 } else {
848 period = RGWPeriod(current_period, 0);
849 int ret = period.init(cct, store, id, name);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
852 return ret;
853 }
854 }
855 ret = set_current_period(period);
856 if (ret < 0) {
857 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
858 return ret;
859 }
860 // try to set as default. may race with another create, so pass exclusive=true
861 // so we don't override an existing default
862 ret = set_as_default(true);
863 if (ret < 0 && ret != -EEXIST) {
864 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
865 }
866
867 return 0;
868 }
869
870 int RGWRealm::delete_obj()
871 {
872 int ret = RGWSystemMetaObj::delete_obj();
873 if (ret < 0) {
874 return ret;
875 }
876 return delete_control();
877 }
878
879 int RGWRealm::create_control(bool exclusive)
880 {
881 auto pool = rgw_pool{get_pool(cct)};
882 auto oid = get_control_oid();
883 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
884 nullptr, real_time(), nullptr);
885 }
886
887 int RGWRealm::delete_control()
888 {
889 auto pool = rgw_pool{get_pool(cct)};
890 auto obj = rgw_raw_obj{pool, get_control_oid()};
891 return store->delete_system_obj(obj);
892 }
893
894 rgw_pool RGWRealm::get_pool(CephContext *cct)
895 {
896 if (cct->_conf->rgw_realm_root_pool.empty()) {
897 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
898 }
899 return rgw_pool(cct->_conf->rgw_realm_root_pool);
900 }
901
902 const string RGWRealm::get_default_oid(bool old_format)
903 {
904 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
905 return default_realm_info_oid;
906 }
907 return cct->_conf->rgw_default_realm_info_oid;
908 }
909
910 const string& RGWRealm::get_names_oid_prefix()
911 {
912 return realm_names_oid_prefix;
913 }
914
915 const string& RGWRealm::get_info_oid_prefix(bool old_format)
916 {
917 return realm_info_oid_prefix;
918 }
919
920 int RGWRealm::set_current_period(RGWPeriod& period)
921 {
922 // update realm epoch to match the period's
923 if (epoch > period.get_realm_epoch()) {
924 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
925 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
926 return -EINVAL;
927 }
928 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
929 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
930 << period.get_realm_epoch() << ", but different period id "
931 << period.get_id() << " != " << current_period << dendl;
932 return -EINVAL;
933 }
934
935 epoch = period.get_realm_epoch();
936 current_period = period.get_id();
937
938 int ret = update();
939 if (ret < 0) {
940 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
941 return ret;
942 }
943
944 ret = period.reflect();
945 if (ret < 0) {
946 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
947 return ret;
948 }
949
950 return 0;
951 }
952
953 string RGWRealm::get_control_oid()
954 {
955 return get_info_oid_prefix() + id + ".control";
956 }
957
958 int RGWRealm::notify_zone(bufferlist& bl)
959 {
960 // open a context on the realm's pool
961 rgw_pool pool{get_pool(cct)};
962 librados::IoCtx ctx;
963 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
964 if (r < 0) {
965 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
966 return r;
967 }
968 // send a notify on the realm object
969 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
970 if (r < 0) {
971 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
972 return r;
973 }
974 return 0;
975 }
976
977 int RGWRealm::notify_new_period(const RGWPeriod& period)
978 {
979 bufferlist bl;
980 // push the period to dependent zonegroups/zones
981 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
982 ::encode(period, bl);
983 // reload the gateway with the new period
984 ::encode(RGWRealmNotify::Reload, bl);
985
986 return notify_zone(bl);
987 }
988
989 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
990 {
991 if (realm_id.empty()) {
992 return "period_config.default";
993 }
994 return "period_config." + realm_id;
995 }
996
997 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
998 {
999 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1000 if (pool_name.empty()) {
1001 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1002 }
1003 return {pool_name};
1004 }
1005
1006 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1007 {
1008 RGWObjectCtx obj_ctx(store);
1009 const auto& pool = get_pool(store->ctx());
1010 const auto& oid = get_oid(realm_id);
1011 bufferlist bl;
1012
1013 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1014 if (ret < 0) {
1015 return ret;
1016 }
1017 try {
1018 bufferlist::iterator iter = bl.begin();
1019 ::decode(*this, iter);
1020 } catch (buffer::error& err) {
1021 return -EIO;
1022 }
1023 return 0;
1024 }
1025
1026 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1027 {
1028 const auto& pool = get_pool(store->ctx());
1029 const auto& oid = get_oid(realm_id);
1030 bufferlist bl;
1031 ::encode(*this, bl);
1032 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1033 false, nullptr, real_time(), nullptr);
1034 }
1035
1036 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1037 const string& period_realm_name, bool setup_obj)
1038 {
1039 cct = _cct;
1040 store = _store;
1041 realm_id = period_realm_id;
1042 realm_name = period_realm_name;
1043
1044 if (!setup_obj)
1045 return 0;
1046
1047 return init(_cct, _store, setup_obj);
1048 }
1049
1050
1051 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1052 {
1053 cct = _cct;
1054 store = _store;
1055
1056 if (!setup_obj)
1057 return 0;
1058
1059 if (id.empty()) {
1060 RGWRealm realm(realm_id, realm_name);
1061 int ret = realm.init(cct, store);
1062 if (ret < 0) {
1063 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1064 cpp_strerror(-ret) << dendl;
1065 return ret;
1066 }
1067 id = realm.get_current_period();
1068 realm_id = realm.get_id();
1069 }
1070
1071 if (!epoch) {
1072 int ret = use_latest_epoch();
1073 if (ret < 0) {
1074 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1075 << " : " << cpp_strerror(-ret) << dendl;
1076 return ret;
1077 }
1078 }
1079
1080 return read_info();
1081 }
1082
1083
1084 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1085 map<string, RGWZoneGroup>::const_iterator iter;
1086 if (!zonegroup_id.empty()) {
1087 iter = period_map.zonegroups.find(zonegroup_id);
1088 } else {
1089 iter = period_map.zonegroups.find("default");
1090 }
1091 if (iter != period_map.zonegroups.end()) {
1092 zonegroup = iter->second;
1093 return 0;
1094 }
1095
1096 return -ENOENT;
1097 }
1098
1099 const string& RGWPeriod::get_latest_epoch_oid()
1100 {
1101 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1102 return period_latest_epoch_info_oid;
1103 }
1104 return cct->_conf->rgw_period_latest_epoch_info_oid;
1105 }
1106
1107 const string& RGWPeriod::get_info_oid_prefix()
1108 {
1109 return period_info_oid_prefix;
1110 }
1111
1112 const string RGWPeriod::get_period_oid_prefix()
1113 {
1114 return get_info_oid_prefix() + id;
1115 }
1116
1117 const string RGWPeriod::get_period_oid()
1118 {
1119 std::ostringstream oss;
1120 oss << get_period_oid_prefix();
1121 // skip the epoch for the staging period
1122 if (id != get_staging_id(realm_id))
1123 oss << "." << epoch;
1124 return oss.str();
1125 }
1126
1127 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1128 RGWObjVersionTracker *objv)
1129 {
1130 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1131
1132 rgw_pool pool(get_pool(cct));
1133 bufferlist bl;
1134 RGWObjectCtx obj_ctx(store);
1135 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
1136 if (ret < 0) {
1137 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1138 return ret;
1139 }
1140 try {
1141 bufferlist::iterator iter = bl.begin();
1142 ::decode(info, iter);
1143 } catch (buffer::error& err) {
1144 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1145 return -EIO;
1146 }
1147
1148 return 0;
1149 }
1150
1151 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1152 {
1153 RGWPeriodLatestEpochInfo info;
1154
1155 int ret = read_latest_epoch(info);
1156 if (ret < 0) {
1157 return ret;
1158 }
1159
1160 latest_epoch = info.epoch;
1161
1162 return 0;
1163 }
1164
1165 int RGWPeriod::use_latest_epoch()
1166 {
1167 RGWPeriodLatestEpochInfo info;
1168 int ret = read_latest_epoch(info);
1169 if (ret < 0) {
1170 return ret;
1171 }
1172
1173 epoch = info.epoch;
1174
1175 return 0;
1176 }
1177
1178 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1179 RGWObjVersionTracker *objv)
1180 {
1181 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1182
1183 rgw_pool pool(get_pool(cct));
1184 bufferlist bl;
1185
1186 RGWPeriodLatestEpochInfo info;
1187 info.epoch = epoch;
1188
1189 ::encode(info, bl);
1190
1191 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1192 exclusive, objv, real_time(), nullptr);
1193 }
1194
1195 int RGWPeriod::update_latest_epoch(epoch_t epoch)
1196 {
1197 static constexpr int MAX_RETRIES = 20;
1198
1199 for (int i = 0; i < MAX_RETRIES; i++) {
1200 RGWPeriodLatestEpochInfo info;
1201 RGWObjVersionTracker objv;
1202 bool exclusive = false;
1203
1204 // read existing epoch
1205 int r = read_latest_epoch(info, &objv);
1206 if (r == -ENOENT) {
1207 // use an exclusive create to set the epoch atomically
1208 exclusive = true;
1209 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1210 << " for period=" << id << dendl;
1211 } else if (r < 0) {
1212 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1213 return r;
1214 } else if (epoch <= info.epoch) {
1215 r = -EEXIST; // fail with EEXIST if epoch is not newer
1216 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1217 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1218 return r;
1219 } else {
1220 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1221 << " -> " << epoch << " on period=" << id << dendl;
1222 }
1223
1224 r = set_latest_epoch(epoch, exclusive, &objv);
1225 if (r == -EEXIST) {
1226 continue; // exclusive create raced with another update, retry
1227 } else if (r == -ECANCELED) {
1228 continue; // write raced with a conflicting version, retry
1229 }
1230 if (r < 0) {
1231 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1232 return r;
1233 }
1234 return 0; // return success
1235 }
1236
1237 return -ECANCELED; // fail after max retries
1238 }
1239
1240 int RGWPeriod::delete_obj()
1241 {
1242 rgw_pool pool(get_pool(cct));
1243
1244 // delete the object for each period epoch
1245 for (epoch_t e = 1; e <= epoch; e++) {
1246 RGWPeriod p{get_id(), e};
1247 rgw_raw_obj oid{pool, p.get_period_oid()};
1248 int ret = store->delete_system_obj(oid);
1249 if (ret < 0) {
1250 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1251 << ": " << cpp_strerror(-ret) << dendl;
1252 }
1253 }
1254
1255 // delete the .latest_epoch object
1256 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1257 int ret = store->delete_system_obj(oid);
1258 if (ret < 0) {
1259 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1260 << ": " << cpp_strerror(-ret) << dendl;
1261 }
1262 return ret;
1263 }
1264
1265 int RGWPeriod::read_info()
1266 {
1267 rgw_pool pool(get_pool(cct));
1268
1269 bufferlist bl;
1270
1271 RGWObjectCtx obj_ctx(store);
1272 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1273 if (ret < 0) {
1274 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1275 return ret;
1276 }
1277
1278 try {
1279 bufferlist::iterator iter = bl.begin();
1280 ::decode(*this, iter);
1281 } catch (buffer::error& err) {
1282 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1283 return -EIO;
1284 }
1285
1286 return 0;
1287 }
1288
1289 int RGWPeriod::create(bool exclusive)
1290 {
1291 int ret;
1292
1293 /* create unique id */
1294 uuid_d new_uuid;
1295 char uuid_str[37];
1296 new_uuid.generate_random();
1297 new_uuid.print(uuid_str);
1298 id = uuid_str;
1299
1300 epoch = FIRST_EPOCH;
1301
1302 period_map.id = id;
1303
1304 ret = store_info(exclusive);
1305 if (ret < 0) {
1306 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1307 return ret;
1308 }
1309
1310 ret = set_latest_epoch(epoch);
1311 if (ret < 0) {
1312 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1313 }
1314
1315 return ret;
1316 }
1317
1318 int RGWPeriod::store_info(bool exclusive)
1319 {
1320 rgw_pool pool(get_pool(cct));
1321
1322 string oid = get_period_oid();
1323 bufferlist bl;
1324 ::encode(*this, bl);
1325
1326 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1327 exclusive, NULL, real_time(), NULL);
1328 }
1329
1330 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1331 {
1332 if (cct->_conf->rgw_period_root_pool.empty()) {
1333 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1334 }
1335 return rgw_pool(cct->_conf->rgw_period_root_pool);
1336 }
1337
1338 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1339 {
1340 if (zonegroup.realm_id != realm_id) {
1341 return 0;
1342 }
1343 int ret = period_map.update(zonegroup, cct);
1344 if (ret < 0) {
1345 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1346 return ret;
1347 }
1348
1349 return store_info(false);
1350 }
1351
1352 int RGWPeriod::update()
1353 {
1354 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1355 list<string> zonegroups;
1356 int ret = store->list_zonegroups(zonegroups);
1357 if (ret < 0) {
1358 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1359 return ret;
1360 }
1361
1362 // clear zone short ids of removed zones. period_map.update() will add the
1363 // remaining zones back
1364 period_map.short_zone_ids.clear();
1365
1366 for (auto& iter : zonegroups) {
1367 RGWZoneGroup zg(string(), iter);
1368 ret = zg.init(cct, store);
1369 if (ret < 0) {
1370 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1371 continue;
1372 }
1373
1374 if (zg.realm_id != realm_id) {
1375 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1376 continue;
1377 }
1378
1379 if (zg.master_zone.empty()) {
1380 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1381 return -EINVAL;
1382 }
1383
1384 if (zg.is_master_zonegroup()) {
1385 master_zonegroup = zg.get_id();
1386 master_zone = zg.master_zone;
1387 }
1388
1389 int ret = period_map.update(zg, cct);
1390 if (ret < 0) {
1391 return ret;
1392 }
1393 }
1394
1395 ret = period_config.read(store, realm_id);
1396 if (ret < 0 && ret != -ENOENT) {
1397 ldout(cct, 0) << "ERROR: failed to read period config: "
1398 << cpp_strerror(ret) << dendl;
1399 return ret;
1400 }
1401 return 0;
1402 }
1403
1404 int RGWPeriod::reflect()
1405 {
1406 for (auto& iter : period_map.zonegroups) {
1407 RGWZoneGroup& zg = iter.second;
1408 zg.reinit_instance(cct, store);
1409 int r = zg.write(false);
1410 if (r < 0) {
1411 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1412 return r;
1413 }
1414 if (zg.is_master_zonegroup()) {
1415 // set master as default if no default exists
1416 r = zg.set_as_default(true);
1417 if (r == 0) {
1418 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1419 << " as the default" << dendl;
1420 }
1421 }
1422 }
1423
1424 int r = period_config.write(store, realm_id);
1425 if (r < 0) {
1426 ldout(cct, 0) << "ERROR: failed to store period config: "
1427 << cpp_strerror(-r) << dendl;
1428 return r;
1429 }
1430 return 0;
1431 }
1432
1433 void RGWPeriod::fork()
1434 {
1435 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1436 predecessor_uuid = id;
1437 id = get_staging_id(realm_id);
1438 period_map.reset();
1439 realm_epoch++;
1440 }
1441
1442 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1443 {
1444 // initialize a sync status manager to read the status
1445 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1446 int r = mgr.init();
1447 if (r < 0) {
1448 return r;
1449 }
1450 r = mgr.read_sync_status(sync_status);
1451 mgr.stop();
1452 return r;
1453 }
1454
1455 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1456 std::ostream& error_stream,
1457 bool force_if_stale)
1458 {
1459 rgw_meta_sync_status status;
1460 int r = read_sync_status(store, &status);
1461 if (r < 0) {
1462 ldout(cct, 0) << "period failed to read sync status: "
1463 << cpp_strerror(-r) << dendl;
1464 return r;
1465 }
1466
1467 std::vector<std::string> markers;
1468
1469 const auto current_epoch = current_period.get_realm_epoch();
1470 if (current_epoch != status.sync_info.realm_epoch) {
1471 // no sync status markers for the current period
1472 assert(current_epoch > status.sync_info.realm_epoch);
1473 const int behind = current_epoch - status.sync_info.realm_epoch;
1474 if (!force_if_stale && current_epoch > 1) {
1475 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1476 "the current master zone in metadata sync. If this zone is promoted "
1477 "to master, any metadata changes during that time are likely to "
1478 "be lost.\n"
1479 "Waiting for this zone to catch up on metadata sync (see "
1480 "'radosgw-admin sync status') is recommended.\n"
1481 "To promote this zone to master anyway, add the flag "
1482 "--yes-i-really-mean-it." << std::endl;
1483 return -EINVAL;
1484 }
1485 // empty sync status markers - other zones will skip this period during
1486 // incremental metadata sync
1487 markers.resize(status.sync_info.num_shards);
1488 } else {
1489 markers.reserve(status.sync_info.num_shards);
1490 for (auto& i : status.sync_markers) {
1491 auto& marker = i.second;
1492 // filter out markers from other periods
1493 if (marker.realm_epoch != current_epoch) {
1494 marker.marker.clear();
1495 }
1496 markers.emplace_back(std::move(marker.marker));
1497 }
1498 }
1499
1500 std::swap(sync_status, markers);
1501 return 0;
1502 }
1503
1504 int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1505 std::ostream& error_stream, bool force_if_stale)
1506 {
1507 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1508 // gateway must be in the master zone to commit
1509 if (master_zone != store->get_zone_params().get_id()) {
1510 error_stream << "Cannot commit period on zone "
1511 << store->get_zone_params().get_id() << ", it must be sent to "
1512 "the period's master zone " << master_zone << '.' << std::endl;
1513 return -EINVAL;
1514 }
1515 // period predecessor must match current period
1516 if (predecessor_uuid != current_period.get_id()) {
1517 error_stream << "Period predecessor " << predecessor_uuid
1518 << " does not match current period " << current_period.get_id()
1519 << ". Use 'period pull' to get the latest period from the master, "
1520 "reapply your changes, and try again." << std::endl;
1521 return -EINVAL;
1522 }
1523 // realm epoch must be 1 greater than current period
1524 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1525 error_stream << "Period's realm epoch " << realm_epoch
1526 << " does not come directly after current realm epoch "
1527 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1528 "latest realm and period from the master zone, reapply your changes, "
1529 "and try again." << std::endl;
1530 return -EINVAL;
1531 }
1532 // did the master zone change?
1533 if (master_zone != current_period.get_master_zone()) {
1534 // store the current metadata sync status in the period
1535 int r = update_sync_status(current_period, error_stream, force_if_stale);
1536 if (r < 0) {
1537 ldout(cct, 0) << "failed to update metadata sync status: "
1538 << cpp_strerror(-r) << dendl;
1539 return r;
1540 }
1541 // create an object with a new period id
1542 r = create(true);
1543 if (r < 0) {
1544 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1545 return r;
1546 }
1547 // set as current period
1548 r = realm.set_current_period(*this);
1549 if (r < 0) {
1550 ldout(cct, 0) << "failed to update realm's current period: "
1551 << cpp_strerror(-r) << dendl;
1552 return r;
1553 }
1554 ldout(cct, 4) << "Promoted to master zone and committed new period "
1555 << id << dendl;
1556 realm.notify_new_period(*this);
1557 return 0;
1558 }
1559 // period must be based on current epoch
1560 if (epoch != current_period.get_epoch()) {
1561 error_stream << "Period epoch " << epoch << " does not match "
1562 "predecessor epoch " << current_period.get_epoch()
1563 << ". Use 'period pull' to get the latest epoch from the master zone, "
1564 "reapply your changes, and try again." << std::endl;
1565 return -EINVAL;
1566 }
1567 // set period as next epoch
1568 set_id(current_period.get_id());
1569 set_epoch(current_period.get_epoch() + 1);
1570 set_predecessor(current_period.get_predecessor());
1571 realm_epoch = current_period.get_realm_epoch();
1572 // write the period to rados
1573 int r = store_info(false);
1574 if (r < 0) {
1575 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1576 return r;
1577 }
1578 // set as latest epoch
1579 r = update_latest_epoch(epoch);
1580 if (r == -EEXIST) {
1581 // already have this epoch (or a more recent one)
1582 return 0;
1583 }
1584 if (r < 0) {
1585 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1586 return r;
1587 }
1588 r = reflect();
1589 if (r < 0) {
1590 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1591 return r;
1592 }
1593 ldout(cct, 4) << "Committed new epoch " << epoch
1594 << " for period " << id << dendl;
1595 realm.notify_new_period(*this);
1596 return 0;
1597 }
1598
1599 int RGWZoneParams::create_default(bool old_format)
1600 {
1601 name = default_zone_name;
1602
1603 int r = create();
1604 if (r < 0) {
1605 return r;
1606 }
1607
1608 if (old_format) {
1609 name = id;
1610 }
1611
1612 return r;
1613 }
1614
1615
1616 int get_zones_pool_set(CephContext* cct,
1617 RGWRados* store,
1618 const list<string>& zones,
1619 const string& my_zone_id,
1620 set<rgw_pool>& pool_names)
1621 {
1622 for(auto const& iter : zones) {
1623 RGWZoneParams zone(iter);
1624 int r = zone.init(cct, store);
1625 if (r < 0) {
1626 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1627 return r;
1628 }
1629 if (zone.get_id() != my_zone_id) {
1630 pool_names.insert(zone.domain_root);
1631 pool_names.insert(zone.metadata_heap);
1632 pool_names.insert(zone.control_pool);
1633 pool_names.insert(zone.gc_pool);
1634 pool_names.insert(zone.log_pool);
1635 pool_names.insert(zone.intent_log_pool);
1636 pool_names.insert(zone.usage_log_pool);
1637 pool_names.insert(zone.user_keys_pool);
1638 pool_names.insert(zone.user_email_pool);
1639 pool_names.insert(zone.user_swift_pool);
1640 pool_names.insert(zone.user_uid_pool);
1641 pool_names.insert(zone.roles_pool);
1642 pool_names.insert(zone.reshard_pool);
1643 for(auto& iter : zone.placement_pools) {
1644 pool_names.insert(iter.second.index_pool);
1645 pool_names.insert(iter.second.data_pool);
1646 pool_names.insert(iter.second.data_extra_pool);
1647 }
1648 }
1649 }
1650 return 0;
1651 }
1652
1653 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1654 const string& default_prefix,
1655 const string& default_suffix,
1656 const rgw_pool& suggested_pool)
1657 {
1658 string suggested_name = suggested_pool.to_str();
1659
1660 string prefix = default_prefix;
1661 string suffix = default_suffix;
1662
1663 if (!suggested_pool.empty()) {
1664 prefix = suggested_name.substr(0, suggested_name.find("."));
1665 suffix = suggested_name.substr(prefix.length());
1666 }
1667
1668 rgw_pool pool(prefix + suffix);
1669
1670 if (pools.find(pool) == pools.end()) {
1671 return pool;
1672 } else {
1673 while(true) {
1674 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1675 if (pools.find(pool) == pools.end()) {
1676 return pool;
1677 }
1678 }
1679 }
1680 }
1681
1682 int RGWZoneParams::fix_pool_names()
1683 {
1684
1685 list<string> zones;
1686 int r = store->list_zones(zones);
1687 if (r < 0) {
1688 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1689 }
1690
1691 set<rgw_pool> pools;
1692 r = get_zones_pool_set(cct, store, zones, id, pools);
1693 if (r < 0) {
1694 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1695 return r;
1696 }
1697
1698 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1699 if (!metadata_heap.name.empty()) {
1700 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1701 }
1702 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1703 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1704 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1705 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1706 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1707 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1708 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1709 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1710 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1711 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1712 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1713 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1714
1715 for(auto& iter : placement_pools) {
1716 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1717 iter.second.index_pool);
1718 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1719 iter.second.data_pool);
1720 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1721 iter.second.data_extra_pool);
1722 }
1723
1724 return 0;
1725 }
1726
1727 int RGWZoneParams::create(bool exclusive)
1728 {
1729 /* check for old pools config */
1730 rgw_raw_obj obj(domain_root, avail_pools);
1731 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1732 if (r < 0) {
1733 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1734 /* a new system, let's set new placement info */
1735 RGWZonePlacementInfo default_placement;
1736 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1737 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1738 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1739 placement_pools["default-placement"] = default_placement;
1740 }
1741
1742 r = fix_pool_names();
1743 if (r < 0) {
1744 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1745 return r;
1746 }
1747
1748 r = RGWSystemMetaObj::create(exclusive);
1749 if (r < 0) {
1750 return r;
1751 }
1752
1753 // try to set as default. may race with another create, so pass exclusive=true
1754 // so we don't override an existing default
1755 r = set_as_default(true);
1756 if (r < 0 && r != -EEXIST) {
1757 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1758 }
1759
1760 return 0;
1761 }
1762
1763 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1764 {
1765 if (cct->_conf->rgw_zone_root_pool.empty()) {
1766 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1767 }
1768
1769 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1770 }
1771
1772 const string RGWZoneParams::get_default_oid(bool old_format)
1773 {
1774 if (old_format) {
1775 return cct->_conf->rgw_default_zone_info_oid;
1776 }
1777
1778 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1779 }
1780
1781 const string& RGWZoneParams::get_names_oid_prefix()
1782 {
1783 return zone_names_oid_prefix;
1784 }
1785
1786 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1787 {
1788 return zone_info_oid_prefix;
1789 }
1790
1791 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1792 return cct->_conf->rgw_zone;
1793 }
1794
1795 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1796 {
1797 if (name.empty()) {
1798 name = cct->_conf->rgw_zone;
1799 }
1800
1801 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1802 }
1803
1804 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1805 {
1806 if (realm_id.empty()) {
1807 /* try using default realm */
1808 RGWRealm realm;
1809 int ret = realm.init(cct, store);
1810 if (ret < 0) {
1811 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1812 return -ENOENT;
1813 }
1814 realm_id = realm.get_id();
1815 }
1816
1817 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1818 }
1819
1820
1821 int RGWZoneParams::set_as_default(bool exclusive)
1822 {
1823 if (realm_id.empty()) {
1824 /* try using default realm */
1825 RGWRealm realm;
1826 int ret = realm.init(cct, store);
1827 if (ret < 0) {
1828 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1829 return -EINVAL;
1830 }
1831 realm_id = realm.get_id();
1832 }
1833
1834 return RGWSystemMetaObj::set_as_default(exclusive);
1835 }
1836
1837 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1838 {
1839 static const std::string NONE{"none"};
1840 auto p = placement_pools.find(placement_rule);
1841 if (p == placement_pools.end()) {
1842 return NONE;
1843 }
1844 const auto& type = p->second.compression_type;
1845 return !type.empty() ? type : NONE;
1846 }
1847
1848 void RGWPeriodMap::encode(bufferlist& bl) const {
1849 ENCODE_START(2, 1, bl);
1850 ::encode(id, bl);
1851 ::encode(zonegroups, bl);
1852 ::encode(master_zonegroup, bl);
1853 ::encode(short_zone_ids, bl);
1854 ENCODE_FINISH(bl);
1855 }
1856
1857 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1858 DECODE_START(2, bl);
1859 ::decode(id, bl);
1860 ::decode(zonegroups, bl);
1861 ::decode(master_zonegroup, bl);
1862 if (struct_v >= 2) {
1863 ::decode(short_zone_ids, bl);
1864 }
1865 DECODE_FINISH(bl);
1866
1867 zonegroups_by_api.clear();
1868 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1869 iter != zonegroups.end(); ++iter) {
1870 RGWZoneGroup& zonegroup = iter->second;
1871 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1872 if (zonegroup.is_master_zonegroup()) {
1873 master_zonegroup = zonegroup.get_id();
1874 }
1875 }
1876 }
1877
1878 // run an MD5 hash on the zone_id and return the first 32 bits
1879 static uint32_t gen_short_zone_id(const std::string zone_id)
1880 {
1881 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1882 MD5 hash;
1883 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1884 hash.Final(md5);
1885
1886 uint32_t short_id;
1887 memcpy((char *)&short_id, md5, sizeof(short_id));
1888 return std::max(short_id, 1u);
1889 }
1890
1891 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1892 {
1893 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1894 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1895 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1896 return -EINVAL;
1897 }
1898 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1899 if (iter != zonegroups.end()) {
1900 RGWZoneGroup& old_zonegroup = iter->second;
1901 if (!old_zonegroup.api_name.empty()) {
1902 zonegroups_by_api.erase(old_zonegroup.api_name);
1903 }
1904 }
1905 zonegroups[zonegroup.get_id()] = zonegroup;
1906
1907 if (!zonegroup.api_name.empty()) {
1908 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1909 }
1910
1911 if (zonegroup.is_master_zonegroup()) {
1912 master_zonegroup = zonegroup.get_id();
1913 } else if (master_zonegroup == zonegroup.get_id()) {
1914 master_zonegroup = "";
1915 }
1916
1917 for (auto& i : zonegroup.zones) {
1918 auto& zone = i.second;
1919 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1920 continue;
1921 }
1922 // calculate the zone's short id
1923 uint32_t short_id = gen_short_zone_id(zone.id);
1924
1925 // search for an existing zone with the same short id
1926 for (auto& s : short_zone_ids) {
1927 if (s.second == short_id) {
1928 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1929 << ") generates the same short_zone_id " << short_id
1930 << " as existing zone id " << s.first << dendl;
1931 return -EEXIST;
1932 }
1933 }
1934
1935 short_zone_ids[zone.id] = short_id;
1936 }
1937
1938 return 0;
1939 }
1940
1941 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1942 {
1943 auto i = short_zone_ids.find(zone_id);
1944 if (i == short_zone_ids.end()) {
1945 return 0;
1946 }
1947 return i->second;
1948 }
1949
1950 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1951 {
1952
1953 RGWPeriod period;
1954 int ret = period.init(cct, store);
1955 if (ret < 0) {
1956 cerr << "failed to read current period info: " << cpp_strerror(ret);
1957 return ret;
1958 }
1959
1960 bucket_quota = period.get_config().bucket_quota;
1961 user_quota = period.get_config().user_quota;
1962 zonegroups = period.get_map().zonegroups;
1963 zonegroups_by_api = period.get_map().zonegroups_by_api;
1964 master_zonegroup = period.get_map().master_zonegroup;
1965
1966 return 0;
1967 }
1968
1969 void RGWRegionMap::encode(bufferlist& bl) const {
1970 ENCODE_START( 3, 1, bl);
1971 ::encode(regions, bl);
1972 ::encode(master_region, bl);
1973 ::encode(bucket_quota, bl);
1974 ::encode(user_quota, bl);
1975 ENCODE_FINISH(bl);
1976 }
1977
1978 void RGWRegionMap::decode(bufferlist::iterator& bl) {
1979 DECODE_START(3, bl);
1980 ::decode(regions, bl);
1981 ::decode(master_region, bl);
1982 if (struct_v >= 2)
1983 ::decode(bucket_quota, bl);
1984 if (struct_v >= 3)
1985 ::decode(user_quota, bl);
1986 DECODE_FINISH(bl);
1987 }
1988
1989 void RGWZoneGroupMap::encode(bufferlist& bl) const {
1990 ENCODE_START( 3, 1, bl);
1991 ::encode(zonegroups, bl);
1992 ::encode(master_zonegroup, bl);
1993 ::encode(bucket_quota, bl);
1994 ::encode(user_quota, bl);
1995 ENCODE_FINISH(bl);
1996 }
1997
1998 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
1999 DECODE_START(3, bl);
2000 ::decode(zonegroups, bl);
2001 ::decode(master_zonegroup, bl);
2002 if (struct_v >= 2)
2003 ::decode(bucket_quota, bl);
2004 if (struct_v >= 3)
2005 ::decode(user_quota, bl);
2006 DECODE_FINISH(bl);
2007
2008 zonegroups_by_api.clear();
2009 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2010 iter != zonegroups.end(); ++iter) {
2011 RGWZoneGroup& zonegroup = iter->second;
2012 zonegroups_by_api[zonegroup.api_name] = zonegroup;
2013 if (zonegroup.is_master_zonegroup()) {
2014 master_zonegroup = zonegroup.get_name();
2015 }
2016 }
2017 }
2018
2019 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2020 {
2021 obj_version *check_objv = version_for_check();
2022
2023 if (check_objv) {
2024 cls_version_check(*op, *check_objv, VER_COND_EQ);
2025 }
2026
2027 cls_version_read(*op, &read_version);
2028 }
2029
2030 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2031 {
2032 obj_version *check_objv = version_for_check();
2033 obj_version *modify_version = version_for_write();
2034
2035 if (check_objv) {
2036 cls_version_check(*op, *check_objv, VER_COND_EQ);
2037 }
2038
2039 if (modify_version) {
2040 cls_version_set(*op, *modify_version);
2041 } else {
2042 cls_version_inc(*op);
2043 }
2044 }
2045
2046 void RGWObjManifest::obj_iterator::operator++()
2047 {
2048 if (manifest->explicit_objs) {
2049 ++explicit_iter;
2050
2051 if (explicit_iter == manifest->objs.end()) {
2052 ofs = manifest->obj_size;
2053 return;
2054 }
2055
2056 update_explicit_pos();
2057
2058 update_location();
2059 return;
2060 }
2061
2062 uint64_t obj_size = manifest->get_obj_size();
2063 uint64_t head_size = manifest->get_head_size();
2064
2065 if (ofs == obj_size) {
2066 return;
2067 }
2068
2069 if (manifest->rules.empty()) {
2070 return;
2071 }
2072
2073 /* are we still pointing at the head? */
2074 if (ofs < head_size) {
2075 rule_iter = manifest->rules.begin();
2076 RGWObjManifestRule *rule = &rule_iter->second;
2077 ofs = MIN(head_size, obj_size);
2078 stripe_ofs = ofs;
2079 cur_stripe = 1;
2080 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2081 if (rule->part_size > 0) {
2082 stripe_size = MIN(stripe_size, rule->part_size);
2083 }
2084 update_location();
2085 return;
2086 }
2087
2088 RGWObjManifestRule *rule = &rule_iter->second;
2089
2090 stripe_ofs += rule->stripe_max_size;
2091 cur_stripe++;
2092 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2093
2094 if (rule->part_size > 0) {
2095 /* multi part, multi stripes object */
2096
2097 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2098
2099 if (stripe_ofs >= part_ofs + rule->part_size) {
2100 /* moved to the next part */
2101 cur_stripe = 0;
2102 part_ofs += rule->part_size;
2103 stripe_ofs = part_ofs;
2104
2105 bool last_rule = (next_rule_iter == manifest->rules.end());
2106 /* move to the next rule? */
2107 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2108 rule_iter = next_rule_iter;
2109 last_rule = (next_rule_iter == manifest->rules.end());
2110 if (!last_rule) {
2111 ++next_rule_iter;
2112 }
2113 cur_part_id = rule_iter->second.start_part_num;
2114 } else {
2115 cur_part_id++;
2116 }
2117
2118 rule = &rule_iter->second;
2119 }
2120
2121 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2122 }
2123
2124 cur_override_prefix = rule->override_prefix;
2125
2126 ofs = stripe_ofs;
2127 if (ofs > obj_size) {
2128 ofs = obj_size;
2129 stripe_ofs = ofs;
2130 stripe_size = 0;
2131 }
2132
2133 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2134 update_location();
2135 }
2136
2137 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2138 {
2139 manifest = _m;
2140
2141 manifest->set_tail_placement(placement_rule, _b);
2142 manifest->set_head(placement_rule, _obj, 0);
2143 last_ofs = 0;
2144
2145 if (manifest->get_prefix().empty()) {
2146 char buf[33];
2147 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2148
2149 string oid_prefix = ".";
2150 oid_prefix.append(buf);
2151 oid_prefix.append("_");
2152
2153 manifest->set_prefix(oid_prefix);
2154 }
2155
2156 bool found = manifest->get_rule(0, &rule);
2157 if (!found) {
2158 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2159 return -EIO;
2160 }
2161
2162 uint64_t head_size = manifest->get_head_size();
2163
2164 if (head_size > 0) {
2165 cur_stripe_size = head_size;
2166 } else {
2167 cur_stripe_size = rule.stripe_max_size;
2168 }
2169
2170 cur_part_id = rule.start_part_num;
2171
2172 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2173
2174 // Normal object which not generated through copy operation
2175 manifest->set_tail_instance(_obj.key.instance);
2176
2177 manifest->update_iterators();
2178
2179 return 0;
2180 }
2181
2182 int RGWObjManifest::generator::create_next(uint64_t ofs)
2183 {
2184 if (ofs < last_ofs) /* only going forward */
2185 return -EINVAL;
2186
2187 uint64_t max_head_size = manifest->get_max_head_size();
2188
2189 if (ofs < max_head_size) {
2190 manifest->set_head_size(ofs);
2191 }
2192
2193 if (ofs >= max_head_size) {
2194 manifest->set_head_size(max_head_size);
2195 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2196 cur_stripe_size = rule.stripe_max_size;
2197
2198 if (cur_part_id == 0 && max_head_size > 0) {
2199 cur_stripe++;
2200 }
2201 }
2202
2203 last_ofs = ofs;
2204 manifest->set_obj_size(ofs);
2205
2206 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2207
2208 manifest->update_iterators();
2209
2210 return 0;
2211 }
2212
2213 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2214 {
2215 return begin_iter;
2216 }
2217
2218 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2219 {
2220 return end_iter;
2221 }
2222
2223 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2224 {
2225 if (ofs > obj_size) {
2226 ofs = obj_size;
2227 }
2228 RGWObjManifest::obj_iterator iter(this);
2229 iter.seek(ofs);
2230 return iter;
2231 }
2232
2233 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2234 {
2235 if (explicit_objs || m.explicit_objs) {
2236 return append_explicit(m, zonegroup, zone_params);
2237 }
2238
2239 if (rules.empty()) {
2240 *this = m;
2241 return 0;
2242 }
2243
2244 string override_prefix;
2245
2246 if (prefix.empty()) {
2247 prefix = m.prefix;
2248 }
2249
2250 if (prefix != m.prefix) {
2251 override_prefix = m.prefix;
2252 }
2253
2254 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2255 if (miter == m.rules.end()) {
2256 return append_explicit(m, zonegroup, zone_params);
2257 }
2258
2259 for (; miter != m.rules.end(); ++miter) {
2260 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2261
2262 RGWObjManifestRule& rule = last_rule->second;
2263
2264 if (rule.part_size == 0) {
2265 rule.part_size = obj_size - rule.start_ofs;
2266 }
2267
2268 RGWObjManifestRule& next_rule = miter->second;
2269 if (!next_rule.part_size) {
2270 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2271 }
2272
2273 string rule_prefix = prefix;
2274 if (!rule.override_prefix.empty()) {
2275 rule_prefix = rule.override_prefix;
2276 }
2277
2278 string next_rule_prefix = m.prefix;
2279 if (!next_rule.override_prefix.empty()) {
2280 next_rule_prefix = next_rule.override_prefix;
2281 }
2282
2283 if (rule.part_size != next_rule.part_size ||
2284 rule.stripe_max_size != next_rule.stripe_max_size ||
2285 rule_prefix != next_rule_prefix) {
2286 if (next_rule_prefix != prefix) {
2287 append_rules(m, miter, &next_rule_prefix);
2288 } else {
2289 append_rules(m, miter, NULL);
2290 }
2291 break;
2292 }
2293
2294 uint64_t expected_part_num = rule.start_part_num + 1;
2295 if (rule.part_size > 0) {
2296 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2297 }
2298
2299 if (expected_part_num != next_rule.start_part_num) {
2300 append_rules(m, miter, NULL);
2301 break;
2302 }
2303 }
2304
2305 set_obj_size(obj_size + m.obj_size);
2306
2307 return 0;
2308 }
2309
2310 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2311 {
2312 return append(m, store->get_zonegroup(), store->get_zone_params());
2313 }
2314
2315 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2316 string *override_prefix)
2317 {
2318 for (; miter != m.rules.end(); ++miter) {
2319 RGWObjManifestRule rule = miter->second;
2320 rule.start_ofs += obj_size;
2321 if (override_prefix)
2322 rule.override_prefix = *override_prefix;
2323 rules[rule.start_ofs] = rule;
2324 }
2325 }
2326
2327 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2328 {
2329 if (explicit_objs) {
2330 return;
2331 }
2332 obj_iterator iter = obj_begin();
2333
2334 while (iter != obj_end()) {
2335 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2336 const rgw_obj_select& os = iter.get_location();
2337 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2338 part.loc_ofs = 0;
2339
2340 uint64_t ofs = iter.get_stripe_ofs();
2341
2342 if (ofs == 0) {
2343 part.loc = obj;
2344 } else {
2345 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2346 }
2347 ++iter;
2348 uint64_t next_ofs = iter.get_stripe_ofs();
2349
2350 part.size = next_ofs - ofs;
2351 }
2352
2353 explicit_objs = true;
2354 rules.clear();
2355 prefix.clear();
2356 }
2357
2358 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2359 {
2360 if (!explicit_objs) {
2361 convert_to_explicit(zonegroup, zone_params);
2362 }
2363 if (!m.explicit_objs) {
2364 m.convert_to_explicit(zonegroup, zone_params);
2365 }
2366 map<uint64_t, RGWObjManifestPart>::iterator iter;
2367 uint64_t base = obj_size;
2368 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2369 RGWObjManifestPart& part = iter->second;
2370 objs[base + iter->first] = part;
2371 }
2372 obj_size += m.obj_size;
2373
2374 return 0;
2375 }
2376
2377 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2378 {
2379 if (rules.empty()) {
2380 return false;
2381 }
2382
2383 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2384 if (iter != rules.begin()) {
2385 --iter;
2386 }
2387
2388 *rule = iter->second;
2389
2390 return true;
2391 }
2392
2393 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2394 {
2395 write_version.ver = 1;
2396 #define TAG_LEN 24
2397
2398 write_version.tag.clear();
2399 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2400 }
2401
2402 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2403 real_time *mtime, real_time set_mtime,
2404 map<string, bufferlist>& attrs, real_time delete_at,
2405 const char *if_match, const char *if_nomatch, const string *user_data,
2406 rgw_zone_set *zones_trace)
2407 {
2408 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2409 if (r < 0)
2410 return r;
2411
2412 is_complete = !canceled;
2413 return 0;
2414 }
2415
2416 CephContext *RGWPutObjProcessor::ctx()
2417 {
2418 return store->ctx();
2419 }
2420
2421 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2422 {
2423 drain_pending();
2424
2425 if (is_complete)
2426 return;
2427
2428 set<rgw_raw_obj>::iterator iter;
2429 bool need_to_remove_head = false;
2430 rgw_raw_obj raw_head;
2431
2432 if (!head_obj.empty()) {
2433 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2434 }
2435
2436 /**
2437 * We should delete the object in the "multipart" namespace to avoid race condition.
2438 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2439 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2440 * written by the second upload may be deleted by the first upload.
2441 * details is describled on #11749
2442 *
2443 * The above comment still stands, but instead of searching for a specific object in the multipart
2444 * namespace, we just make sure that we remove the object that is marked as the head object after
2445 * we remove all the other raw objects. Note that we use different call to remove the head object,
2446 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2447 */
2448 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2449 const rgw_raw_obj& obj = *iter;
2450 if (!head_obj.empty() && obj == raw_head) {
2451 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2452 need_to_remove_head = true;
2453 continue;
2454 }
2455
2456 int r = store->delete_raw_obj(obj);
2457 if (r < 0 && r != -ENOENT) {
2458 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2459 }
2460 }
2461
2462 if (need_to_remove_head) {
2463 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2464 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2465 if (r < 0 && r != -ENOENT) {
2466 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2467 }
2468 }
2469 }
2470
2471 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2472 {
2473 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2474 obj_len = abs_ofs + bl.length();
2475
2476 if (!(obj == last_written_obj)) {
2477 last_written_obj = obj;
2478 }
2479
2480 // For the first call pass -1 as the offset to
2481 // do a write_full.
2482 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2483 }
2484
2485 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2486 {
2487 struct put_obj_aio_info info;
2488 info = pending.front();
2489 pending.pop_front();
2490 pending_size -= info.size;
2491 return info;
2492 }
2493
2494 int RGWPutObjProcessor_Aio::wait_pending_front()
2495 {
2496 if (pending.empty()) {
2497 return 0;
2498 }
2499 struct put_obj_aio_info info = pop_pending();
2500 int ret = store->aio_wait(info.handle);
2501
2502 if (ret >= 0) {
2503 add_written_obj(info.obj);
2504 }
2505
2506 return ret;
2507 }
2508
2509 bool RGWPutObjProcessor_Aio::pending_has_completed()
2510 {
2511 if (pending.empty())
2512 return false;
2513
2514 struct put_obj_aio_info& info = pending.front();
2515 return store->aio_completed(info.handle);
2516 }
2517
2518 int RGWPutObjProcessor_Aio::drain_pending()
2519 {
2520 int ret = 0;
2521 while (!pending.empty()) {
2522 int r = wait_pending_front();
2523 if (r < 0)
2524 ret = r;
2525 }
2526 return ret;
2527 }
2528
2529 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2530 {
2531 bool _wait = need_to_wait;
2532
2533 if (handle) {
2534 struct put_obj_aio_info info;
2535 info.handle = handle;
2536 info.obj = obj;
2537 info.size = size;
2538 pending_size += size;
2539 pending.push_back(info);
2540 }
2541 size_t orig_size = pending_size;
2542
2543 /* first drain complete IOs */
2544 while (pending_has_completed()) {
2545 int r = wait_pending_front();
2546 if (r < 0)
2547 return r;
2548
2549 _wait = false;
2550 }
2551
2552 /* resize window in case messages are draining too fast */
2553 if (orig_size - pending_size >= window_size) {
2554 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2555 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2556 if (window_size > max_window_size) {
2557 window_size = max_window_size;
2558 }
2559 }
2560
2561 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2562 if (pending_size > window_size || _wait) {
2563 int r = wait_pending_front();
2564 if (r < 0)
2565 return r;
2566 }
2567 return 0;
2568 }
2569
2570 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2571 {
2572 if (ofs >= next_part_ofs) {
2573 int r = prepare_next_part(ofs);
2574 if (r < 0) {
2575 return r;
2576 }
2577 }
2578
2579 *pobj = cur_obj;
2580
2581 if (!bl.length()) {
2582 *phandle = nullptr;
2583 return 0;
2584 }
2585
2586 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2587 }
2588
2589 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2590 {
2591 RGWPutObjProcessor::prepare(store, oid_rand);
2592
2593 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2594
2595 return 0;
2596 }
2597
2598 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2599 {
2600 *phandle = NULL;
2601 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2602
2603 pending_data_bl.claim_append(bl);
2604 if (pending_data_bl.length() < max_write_size) {
2605 *again = false;
2606 return 0;
2607 }
2608
2609 pending_data_bl.splice(0, max_write_size, &bl);
2610
2611 /* do we have enough data pending accumulated that needs to be written? */
2612 *again = (pending_data_bl.length() >= max_chunk_size);
2613
2614 if (!data_ofs && !immutable_head()) {
2615 first_chunk.claim(bl);
2616 obj_len = (uint64_t)first_chunk.length();
2617 int r = prepare_next_part(obj_len);
2618 if (r < 0) {
2619 return r;
2620 }
2621 data_ofs = obj_len;
2622 return 0;
2623 }
2624 off_t write_ofs = data_ofs;
2625 data_ofs = write_ofs + bl.length();
2626 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2627 we could be racing with another upload, to the same
2628 object and cleanup can be messy */
2629 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2630 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2631 bl.clear();
2632 }
2633 return ret;
2634 }
2635
2636
2637 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2638 {
2639 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2640
2641 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2642 if (r < 0) {
2643 return r;
2644 }
2645
2646 return 0;
2647 }
2648
2649 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2650 {
2651 head_obj.init(bucket, obj_str);
2652
2653 int r = prepare_init(store, oid_rand);
2654 if (r < 0) {
2655 return r;
2656 }
2657
2658 if (!version_id.empty()) {
2659 head_obj.key.set_instance(version_id);
2660 } else if (versioned_object) {
2661 store->gen_rand_obj_instance_name(&head_obj);
2662 }
2663
2664 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2665
2666 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2667 if (r < 0) {
2668 return r;
2669 }
2670
2671 return 0;
2672 }
2673
2674 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2675
2676 int ret = manifest_gen.create_next(ofs);
2677 if (ret < 0) {
2678 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2679 return ret;
2680 }
2681 cur_part_ofs = ofs;
2682 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2683 cur_obj = manifest_gen.get_cur_obj(store);
2684
2685 return 0;
2686 }
2687
2688 int RGWPutObjProcessor_Atomic::complete_parts()
2689 {
2690 if (obj_len > (uint64_t)cur_part_ofs) {
2691 return prepare_next_part(obj_len);
2692 }
2693 return 0;
2694 }
2695
2696 int RGWPutObjProcessor_Atomic::complete_writing_data()
2697 {
2698 if (!data_ofs && !immutable_head()) {
2699 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2700 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2701 * clobber first_chunk
2702 */
2703 if (pending_data_bl.length() > 0) {
2704 first_chunk.claim(pending_data_bl);
2705 }
2706 obj_len = (uint64_t)first_chunk.length();
2707 }
2708 while (pending_data_bl.length()) {
2709 void *handle = nullptr;
2710 rgw_raw_obj obj;
2711 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2712 if (max_write_size > pending_data_bl.length()) {
2713 max_write_size = pending_data_bl.length();
2714 }
2715 bufferlist bl;
2716 pending_data_bl.splice(0, max_write_size, &bl);
2717 uint64_t write_len = bl.length();
2718 int r = write_data(bl, data_ofs, &handle, &obj, false);
2719 if (r < 0) {
2720 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2721 return r;
2722 }
2723 data_ofs += write_len;
2724 r = throttle_data(handle, obj, write_len, false);
2725 if (r < 0) {
2726 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2727 return r;
2728 }
2729
2730 if (data_ofs >= next_part_ofs) {
2731 r = prepare_next_part(data_ofs);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2734 return r;
2735 }
2736 }
2737 }
2738 int r = complete_parts();
2739 if (r < 0) {
2740 return r;
2741 }
2742
2743 r = drain_pending();
2744 if (r < 0)
2745 return r;
2746
2747 return 0;
2748 }
2749
2750 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2751 real_time *mtime, real_time set_mtime,
2752 map<string, bufferlist>& attrs,
2753 real_time delete_at,
2754 const char *if_match,
2755 const char *if_nomatch, const string *user_data,
2756 rgw_zone_set *zones_trace) {
2757 int r = complete_writing_data();
2758 if (r < 0)
2759 return r;
2760
2761 obj_ctx.obj.set_atomic(head_obj);
2762
2763 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2764
2765 /* some object types shouldn't be versioned, e.g., multipart parts */
2766 op_target.set_versioning_disabled(!versioned_object);
2767
2768 RGWRados::Object::Write obj_op(&op_target);
2769
2770 obj_op.meta.data = &first_chunk;
2771 obj_op.meta.manifest = &manifest;
2772 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2773 obj_op.meta.if_match = if_match;
2774 obj_op.meta.if_nomatch = if_nomatch;
2775 obj_op.meta.mtime = mtime;
2776 obj_op.meta.set_mtime = set_mtime;
2777 obj_op.meta.owner = bucket_info.owner;
2778 obj_op.meta.flags = PUT_OBJ_CREATE;
2779 obj_op.meta.olh_epoch = olh_epoch;
2780 obj_op.meta.delete_at = delete_at;
2781 obj_op.meta.user_data = user_data;
2782 obj_op.meta.zones_trace = zones_trace;
2783 obj_op.meta.modify_tail = true;
2784
2785 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2786 if (r < 0) {
2787 return r;
2788 }
2789
2790 canceled = obj_op.meta.canceled;
2791
2792 return 0;
2793 }
2794
2795 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2796 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2797 if (r < 0)
2798 return r;
2799 return 0;
2800 }
2801
2802 int RGWRados::unwatch(uint64_t watch_handle)
2803 {
2804 int r = control_pool_ctx.unwatch2(watch_handle);
2805 if (r < 0) {
2806 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2807 return r;
2808 }
2809 r = rados[0].watch_flush();
2810 if (r < 0) {
2811 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2812 return r;
2813 }
2814 return 0;
2815 }
2816
2817 void RGWRados::add_watcher(int i)
2818 {
2819 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2820 Mutex::Locker l(watchers_lock);
2821 watchers_set.insert(i);
2822 if (watchers_set.size() == (size_t)num_watchers) {
2823 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2824 set_cache_enabled(true);
2825 }
2826 }
2827
2828 void RGWRados::remove_watcher(int i)
2829 {
2830 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2831 Mutex::Locker l(watchers_lock);
2832 size_t orig_size = watchers_set.size();
2833 watchers_set.erase(i);
2834 if (orig_size == (size_t)num_watchers &&
2835 watchers_set.size() < orig_size) { /* actually removed */
2836 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2837 set_cache_enabled(false);
2838 }
2839 }
2840
2841 class RGWWatcher : public librados::WatchCtx2 {
2842 RGWRados *rados;
2843 int index;
2844 string oid;
2845 uint64_t watch_handle;
2846
2847 class C_ReinitWatch : public Context {
2848 RGWWatcher *watcher;
2849 public:
2850 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2851 void finish(int r) override {
2852 watcher->reinit();
2853 }
2854 };
2855 public:
2856 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2857 void handle_notify(uint64_t notify_id,
2858 uint64_t cookie,
2859 uint64_t notifier_id,
2860 bufferlist& bl) override {
2861 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2862 << " notify_id " << notify_id
2863 << " cookie " << cookie
2864 << " notifier " << notifier_id
2865 << " bl.length()=" << bl.length() << dendl;
2866 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2867
2868 bufferlist reply_bl; // empty reply payload
2869 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2870 }
2871 void handle_error(uint64_t cookie, int err) override {
2872 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2873 << " err " << cpp_strerror(err) << dendl;
2874 rados->remove_watcher(index);
2875 rados->schedule_context(new C_ReinitWatch(this));
2876 }
2877
2878 void reinit() {
2879 int ret = unregister_watch();
2880 if (ret < 0) {
2881 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2882 return;
2883 }
2884 ret = register_watch();
2885 if (ret < 0) {
2886 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2887 return;
2888 }
2889 }
2890
2891 int unregister_watch() {
2892 int r = rados->unwatch(watch_handle);
2893 if (r < 0) {
2894 return r;
2895 }
2896 rados->remove_watcher(index);
2897 return 0;
2898 }
2899
2900 int register_watch() {
2901 int r = rados->watch(oid, &watch_handle, this);
2902 if (r < 0) {
2903 return r;
2904 }
2905 rados->add_watcher(index);
2906 return 0;
2907 }
2908 };
2909
2910 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2911 RGWRados *store;
2912 RGWHTTPManager http_manager;
2913
2914 public:
2915 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2916 http_manager(store->ctx(), completion_mgr) {
2917 http_manager.set_threaded();
2918 }
2919
2920 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2921 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2922 { "notify", NULL },
2923 { NULL, NULL } };
2924
2925 list<RGWCoroutinesStack *> stacks;
2926 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2927 RGWRESTConn *conn = iter->second;
2928 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2929 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2930
2931 stacks.push_back(stack);
2932 }
2933 return run(stacks);
2934 }
2935 };
2936
2937 class RGWDataNotifierManager : public RGWCoroutinesManager {
2938 RGWRados *store;
2939 RGWHTTPManager http_manager;
2940
2941 public:
2942 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2943 http_manager(store->ctx(), completion_mgr) {
2944 http_manager.set_threaded();
2945 }
2946
2947 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2948 rgw_http_param_pair pairs[] = { { "type", "data" },
2949 { "notify", NULL },
2950 { "source-zone", store->get_zone_params().get_id().c_str() },
2951 { NULL, NULL } };
2952
2953 list<RGWCoroutinesStack *> stacks;
2954 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2955 RGWRESTConn *conn = iter->second;
2956 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2957 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2958
2959 stacks.push_back(stack);
2960 }
2961 return run(stacks);
2962 }
2963 };
2964
2965 class RGWRadosThread {
2966 class Worker : public Thread {
2967 CephContext *cct;
2968 RGWRadosThread *processor;
2969 Mutex lock;
2970 Cond cond;
2971
2972 void wait() {
2973 Mutex::Locker l(lock);
2974 cond.Wait(lock);
2975 };
2976
2977 void wait_interval(const utime_t& wait_time) {
2978 Mutex::Locker l(lock);
2979 cond.WaitInterval(lock, wait_time);
2980 }
2981
2982 public:
2983 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
2984 void *entry() override;
2985 void signal() {
2986 Mutex::Locker l(lock);
2987 cond.Signal();
2988 }
2989 };
2990
2991 Worker *worker;
2992
2993 protected:
2994 CephContext *cct;
2995 RGWRados *store;
2996
2997 std::atomic<bool> down_flag = { false };
2998
2999 string thread_name;
3000
3001 virtual uint64_t interval_msec() = 0;
3002 virtual void stop_process() {}
3003 public:
3004 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3005 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3006 virtual ~RGWRadosThread() {
3007 stop();
3008 }
3009
3010 virtual int init() { return 0; }
3011 virtual int process() = 0;
3012
3013 bool going_down() { return down_flag; }
3014
3015 void start();
3016 void stop();
3017
3018 void signal() {
3019 if (worker) {
3020 worker->signal();
3021 }
3022 }
3023 };
3024
3025 void RGWRadosThread::start()
3026 {
3027 worker = new Worker(cct, this);
3028 worker->create(thread_name.c_str());
3029 }
3030
3031 void RGWRadosThread::stop()
3032 {
3033 down_flag = true;
3034 stop_process();
3035 if (worker) {
3036 worker->signal();
3037 worker->join();
3038 }
3039 delete worker;
3040 worker = NULL;
3041 }
3042
3043 void *RGWRadosThread::Worker::entry() {
3044 uint64_t msec = processor->interval_msec();
3045 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3046
3047 do {
3048 utime_t start = ceph_clock_now();
3049 int r = processor->process();
3050 if (r < 0) {
3051 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3052 }
3053
3054 if (processor->going_down())
3055 break;
3056
3057 utime_t end = ceph_clock_now();
3058 end -= start;
3059
3060 uint64_t cur_msec = processor->interval_msec();
3061 if (cur_msec != msec) { /* was it reconfigured? */
3062 msec = cur_msec;
3063 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3064 }
3065
3066 if (cur_msec > 0) {
3067 if (interval <= end)
3068 continue; // next round
3069
3070 utime_t wait_time = interval;
3071 wait_time -= end;
3072
3073 wait_interval(wait_time);
3074 } else {
3075 wait();
3076 }
3077 } while (!processor->going_down());
3078
3079 return NULL;
3080 }
3081
3082 class RGWMetaNotifier : public RGWRadosThread {
3083 RGWMetaNotifierManager notify_mgr;
3084 RGWMetadataLog *const log;
3085
3086 uint64_t interval_msec() override {
3087 return cct->_conf->rgw_md_notify_interval_msec;
3088 }
3089 public:
3090 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3091 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3092
3093 int process() override;
3094 };
3095
3096 int RGWMetaNotifier::process()
3097 {
3098 set<int> shards;
3099
3100 log->read_clear_modified(shards);
3101
3102 if (shards.empty()) {
3103 return 0;
3104 }
3105
3106 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3107 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3108 }
3109
3110 notify_mgr.notify_all(store->zone_conn_map, shards);
3111
3112 return 0;
3113 }
3114
3115 class RGWDataNotifier : public RGWRadosThread {
3116 RGWDataNotifierManager notify_mgr;
3117
3118 uint64_t interval_msec() override {
3119 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
3120 }
3121 public:
3122 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3123
3124 int process() override;
3125 };
3126
3127 int RGWDataNotifier::process()
3128 {
3129 if (!store->data_log) {
3130 return 0;
3131 }
3132
3133 map<int, set<string> > shards;
3134
3135 store->data_log->read_clear_modified(shards);
3136
3137 if (shards.empty()) {
3138 return 0;
3139 }
3140
3141 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3142 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3143 }
3144
3145 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3146
3147 return 0;
3148 }
3149
3150 class RGWSyncProcessorThread : public RGWRadosThread {
3151 public:
3152 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3153 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3154 ~RGWSyncProcessorThread() override {}
3155 int init() override = 0 ;
3156 int process() override = 0;
3157 };
3158
3159 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3160 {
3161 RGWMetaSyncStatusManager sync;
3162
3163 uint64_t interval_msec() override {
3164 return 0; /* no interval associated, it'll run once until stopped */
3165 }
3166 void stop_process() override {
3167 sync.stop();
3168 }
3169 public:
3170 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3171 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3172
3173 void wakeup_sync_shards(set<int>& shard_ids) {
3174 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3175 sync.wakeup(*iter);
3176 }
3177 }
3178 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3179
3180 int init() override {
3181 int ret = sync.init();
3182 if (ret < 0) {
3183 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3184 return ret;
3185 }
3186 return 0;
3187 }
3188
3189 int process() override {
3190 sync.run();
3191 return 0;
3192 }
3193 };
3194
3195 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3196 {
3197 RGWDataSyncStatusManager sync;
3198 bool initialized;
3199
3200 uint64_t interval_msec() override {
3201 if (initialized) {
3202 return 0; /* no interval associated, it'll run once until stopped */
3203 } else {
3204 #define DATA_SYNC_INIT_WAIT_SEC 20
3205 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3206 }
3207 }
3208 void stop_process() override {
3209 sync.stop();
3210 }
3211 public:
3212 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3213 const string& _source_zone)
3214 : RGWSyncProcessorThread(_store, "data-sync"), sync(_store, async_rados, _source_zone),
3215 initialized(false) {}
3216
3217 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3218 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3219 sync.wakeup(iter->first, iter->second);
3220 }
3221 }
3222 RGWDataSyncStatusManager* get_manager() { return &sync; }
3223
3224 int init() override {
3225 return 0;
3226 }
3227
3228 int process() override {
3229 while (!initialized) {
3230 if (going_down()) {
3231 return 0;
3232 }
3233 int ret = sync.init();
3234 if (ret >= 0) {
3235 initialized = true;
3236 break;
3237 }
3238 /* we'll be back! */
3239 return 0;
3240 }
3241 sync.run();
3242 return 0;
3243 }
3244 };
3245
3246 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3247 {
3248 RGWCoroutinesManager crs;
3249 RGWRados *store;
3250 RGWHTTPManager http;
3251 const utime_t trim_interval;
3252
3253 uint64_t interval_msec() override { return 0; }
3254 void stop_process() override { crs.stop(); }
3255 public:
3256 RGWSyncLogTrimThread(RGWRados *store, int interval)
3257 : RGWSyncProcessorThread(store, "sync-log-trim"),
3258 crs(store->ctx(), store->get_cr_registry()), store(store),
3259 http(store->ctx(), crs.get_completion_mgr()),
3260 trim_interval(interval, 0)
3261 {}
3262
3263 int init() override {
3264 return http.set_threaded();
3265 }
3266 int process() override {
3267 list<RGWCoroutinesStack*> stacks;
3268 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3269 meta->call(create_meta_log_trim_cr(store, &http,
3270 cct->_conf->rgw_md_log_max_shards,
3271 trim_interval));
3272 stacks.push_back(meta);
3273
3274 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3275 data->call(create_data_log_trim_cr(store, &http,
3276 cct->_conf->rgw_data_log_num_shards,
3277 trim_interval));
3278 stacks.push_back(data);
3279
3280 crs.run(stacks);
3281 return 0;
3282 }
3283 };
3284
3285 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3286 {
3287 Mutex::Locker l(meta_sync_thread_lock);
3288 if (meta_sync_processor_thread) {
3289 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3290 }
3291 }
3292
3293 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3294 {
3295 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3296 Mutex::Locker l(data_sync_thread_lock);
3297 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3298 if (iter == data_sync_processor_threads.end()) {
3299 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3300 return;
3301 }
3302
3303 RGWDataSyncProcessorThread *thread = iter->second;
3304 assert(thread);
3305 thread->wakeup_sync_shards(shard_ids);
3306 }
3307
3308 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3309 {
3310 Mutex::Locker l(meta_sync_thread_lock);
3311 if (meta_sync_processor_thread) {
3312 return meta_sync_processor_thread->get_manager();
3313 }
3314 return nullptr;
3315 }
3316
3317 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3318 {
3319 Mutex::Locker l(data_sync_thread_lock);
3320 auto thread = data_sync_processor_threads.find(source_zone);
3321 if (thread == data_sync_processor_threads.end()) {
3322 return nullptr;
3323 }
3324 return thread->second->get_manager();
3325 }
3326
3327 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3328 {
3329 IoCtx ioctx;
3330 int r = open_pool_ctx(pool, ioctx);
3331 if (r < 0) {
3332 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3333 return r;
3334 }
3335
3336 bool requires;
3337 r = ioctx.pool_requires_alignment2(&requires);
3338 if (r < 0) {
3339 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3340 << r << dendl;
3341 return r;
3342 }
3343
3344 if (!requires) {
3345 *alignment = 0;
3346 return 0;
3347 }
3348
3349 uint64_t align;
3350 r = ioctx.pool_required_alignment2(&align);
3351 if (r < 0) {
3352 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3353 << r << dendl;
3354 return r;
3355 }
3356 if (align != 0) {
3357 ldout(cct, 20) << "required alignment=" << align << dendl;
3358 }
3359 *alignment = align;
3360 return 0;
3361 }
3362
3363 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3364 {
3365 uint64_t alignment = 0;
3366 int r = get_required_alignment(pool, &alignment);
3367 if (r < 0) {
3368 return r;
3369 }
3370
3371 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3372
3373 if (alignment == 0) {
3374 *max_chunk_size = config_chunk_size;
3375 return 0;
3376 }
3377
3378 if (config_chunk_size <= alignment) {
3379 *max_chunk_size = alignment;
3380 return 0;
3381 }
3382
3383 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3384
3385 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3386
3387 return 0;
3388 }
3389
3390 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3391 {
3392 rgw_pool pool;
3393 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3394 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3395 return -EIO;
3396 }
3397 return get_max_chunk_size(pool, max_chunk_size);
3398 }
3399
3400 class RGWIndexCompletionManager;
3401
3402 struct complete_op_data {
3403 Mutex lock{"complete_op_data"};
3404 AioCompletion *rados_completion{nullptr};
3405 int manager_shard_id{-1};
3406 RGWIndexCompletionManager *manager{nullptr};
3407 rgw_obj obj;
3408 RGWModifyOp op;
3409 string tag;
3410 rgw_bucket_entry_ver ver;
3411 cls_rgw_obj_key key;
3412 rgw_bucket_dir_entry_meta dir_meta;
3413 list<cls_rgw_obj_key> remove_objs;
3414 bool log_op;
3415 uint16_t bilog_op;
3416 rgw_zone_set zones_trace;
3417
3418 bool stopped{false};
3419
3420 void stop() {
3421 Mutex::Locker l(lock);
3422 stopped = true;
3423 }
3424 };
3425
3426 class RGWIndexCompletionThread : public RGWRadosThread {
3427 RGWRados *store;
3428
3429 uint64_t interval_msec() override {
3430 return 0;
3431 }
3432
3433 list<complete_op_data *> completions;
3434
3435 Mutex completions_lock;
3436 public:
3437 RGWIndexCompletionThread(RGWRados *_store)
3438 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3439
3440 int process() override;
3441
3442 void add_completion(complete_op_data *completion) {
3443 {
3444 Mutex::Locker l(completions_lock);
3445 completions.push_back(completion);
3446 }
3447
3448 signal();
3449 }
3450 };
3451
3452 int RGWIndexCompletionThread::process()
3453 {
3454 list<complete_op_data *> comps;
3455
3456 {
3457 Mutex::Locker l(completions_lock);
3458 completions.swap(comps);
3459 }
3460
3461 for (auto c : comps) {
3462 std::unique_ptr<complete_op_data> up{c};
3463
3464 if (going_down()) {
3465 continue;
3466 }
3467 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3468
3469 RGWRados::BucketShard bs(store);
3470
3471 int r = bs.init(c->obj.bucket, c->obj);
3472 if (r < 0) {
3473 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3474 /* not much to do */
3475 continue;
3476 }
3477
3478 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3479 librados::ObjectWriteOperation o;
3480 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3481 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3482 c->log_op, c->bilog_op, &c->zones_trace);
3483
3484 return bs->index_ctx.operate(bs->bucket_obj, &o);
3485 });
3486 if (r < 0) {
3487 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3488 /* ignoring error, can't do anything about it */
3489 continue;
3490 }
3491 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3492 if (r < 0) {
3493 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3494 }
3495 }
3496
3497 return 0;
3498 }
3499
3500 class RGWIndexCompletionManager {
3501 RGWRados *store{nullptr};
3502 vector<Mutex *> locks;
3503 vector<set<complete_op_data *> > completions;
3504
3505 RGWIndexCompletionThread *completion_thread{nullptr};
3506
3507 int num_shards;
3508
3509 std::atomic<int> cur_shard {0};
3510
3511
3512 public:
3513 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3514 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3515
3516 for (int i = 0; i < num_shards; i++) {
3517 char buf[64];
3518 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3519 locks.push_back(new Mutex(buf));
3520 }
3521
3522 completions.resize(num_shards);
3523 }
3524 ~RGWIndexCompletionManager() {
3525 stop();
3526
3527 for (auto l : locks) {
3528 delete l;
3529 }
3530 }
3531
3532 int next_shard() {
3533 int result = cur_shard % num_shards;
3534 cur_shard++;
3535 return result;
3536 }
3537
3538 void create_completion(const rgw_obj& obj,
3539 RGWModifyOp op, string& tag,
3540 rgw_bucket_entry_ver& ver,
3541 const cls_rgw_obj_key& key,
3542 rgw_bucket_dir_entry_meta& dir_meta,
3543 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3544 uint16_t bilog_op,
3545 rgw_zone_set *zones_trace,
3546 complete_op_data **result);
3547 bool handle_completion(completion_t cb, complete_op_data *arg);
3548
3549 int start() {
3550 completion_thread = new RGWIndexCompletionThread(store);
3551 int ret = completion_thread->init();
3552 if (ret < 0) {
3553 return ret;
3554 }
3555 completion_thread->start();
3556 return 0;
3557 }
3558 void stop() {
3559 if (completion_thread) {
3560 completion_thread->stop();
3561 delete completion_thread;
3562 }
3563
3564 for (int i = 0; i < num_shards; ++i) {
3565 Mutex::Locker l(*locks[i]);
3566 for (auto c : completions[i]) {
3567 Mutex::Locker cl(c->lock);
3568 c->stop();
3569 }
3570 }
3571 completions.clear();
3572 }
3573 };
3574
3575 static void obj_complete_cb(completion_t cb, void *arg)
3576 {
3577 complete_op_data *completion = (complete_op_data *)arg;
3578 completion->lock.Lock();
3579 if (completion->stopped) {
3580 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3581 delete completion;
3582 return;
3583 }
3584 bool need_delete = completion->manager->handle_completion(cb, completion);
3585 completion->lock.Unlock();
3586 if (need_delete) {
3587 delete completion;
3588 }
3589 }
3590
3591
3592 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3593 RGWModifyOp op, string& tag,
3594 rgw_bucket_entry_ver& ver,
3595 const cls_rgw_obj_key& key,
3596 rgw_bucket_dir_entry_meta& dir_meta,
3597 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3598 uint16_t bilog_op,
3599 rgw_zone_set *zones_trace,
3600 complete_op_data **result)
3601 {
3602 complete_op_data *entry = new complete_op_data;
3603
3604 int shard_id = next_shard();
3605
3606 entry->manager_shard_id = shard_id;
3607 entry->manager = this;
3608 entry->obj = obj;
3609 entry->op = op;
3610 entry->tag = tag;
3611 entry->ver = ver;
3612 entry->key = key;
3613 entry->dir_meta = dir_meta;
3614 entry->log_op = log_op;
3615 entry->bilog_op = bilog_op;
3616
3617 if (remove_objs) {
3618 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3619 entry->remove_objs.push_back(*iter);
3620 }
3621 }
3622
3623 if (zones_trace) {
3624 entry->zones_trace = *zones_trace;
3625 } else {
3626 entry->zones_trace.insert(store->get_zone().id);
3627 }
3628
3629 *result = entry;
3630
3631 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3632
3633 Mutex::Locker l(*locks[shard_id]);
3634 completions[shard_id].insert(entry);
3635 }
3636
3637 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3638 {
3639 int shard_id = arg->manager_shard_id;
3640 {
3641 Mutex::Locker l(*locks[shard_id]);
3642
3643 auto& comps = completions[shard_id];
3644
3645 auto iter = comps.find(arg);
3646 if (iter == comps.end()) {
3647 return true;
3648 }
3649
3650 comps.erase(iter);
3651 }
3652
3653 int r = rados_aio_get_return_value(cb);
3654 if (r != -ERR_BUSY_RESHARDING) {
3655 return true;
3656 }
3657 completion_thread->add_completion(arg);
3658 return false;
3659 }
3660
3661 void RGWRados::finalize()
3662 {
3663 if (run_sync_thread) {
3664 Mutex::Locker l(meta_sync_thread_lock);
3665 meta_sync_processor_thread->stop();
3666
3667 Mutex::Locker dl(data_sync_thread_lock);
3668 for (auto iter : data_sync_processor_threads) {
3669 RGWDataSyncProcessorThread *thread = iter.second;
3670 thread->stop();
3671 }
3672 if (sync_log_trimmer) {
3673 sync_log_trimmer->stop();
3674 }
3675 }
3676 if (async_rados) {
3677 async_rados->stop();
3678 }
3679 if (run_sync_thread) {
3680 delete meta_sync_processor_thread;
3681 meta_sync_processor_thread = NULL;
3682 Mutex::Locker dl(data_sync_thread_lock);
3683 for (auto iter : data_sync_processor_threads) {
3684 RGWDataSyncProcessorThread *thread = iter.second;
3685 delete thread;
3686 }
3687 data_sync_processor_threads.clear();
3688 delete sync_log_trimmer;
3689 sync_log_trimmer = nullptr;
3690 }
3691 if (finisher) {
3692 finisher->stop();
3693 }
3694 if (need_watch_notify()) {
3695 finalize_watch();
3696 }
3697 if (finisher) {
3698 /* delete finisher only after cleaning up watches, as watch error path might call
3699 * into finisher. We stop finisher before finalizing watch to make sure we don't
3700 * actually handle any racing work
3701 */
3702 delete finisher;
3703 }
3704 if (meta_notifier) {
3705 meta_notifier->stop();
3706 delete meta_notifier;
3707 }
3708 if (data_notifier) {
3709 data_notifier->stop();
3710 delete data_notifier;
3711 }
3712 delete data_log;
3713 if (async_rados) {
3714 delete async_rados;
3715 }
3716
3717 delete lc;
3718 lc = NULL;
3719
3720 delete gc;
3721 gc = NULL;
3722
3723 delete obj_expirer;
3724 obj_expirer = NULL;
3725
3726 delete rest_master_conn;
3727
3728 map<string, RGWRESTConn *>::iterator iter;
3729 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3730 RGWRESTConn *conn = iter->second;
3731 delete conn;
3732 }
3733
3734 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3735 RGWRESTConn *conn = iter->second;
3736 delete conn;
3737 }
3738 RGWQuotaHandler::free_handler(quota_handler);
3739 if (cr_registry) {
3740 cr_registry->put();
3741 }
3742 delete meta_mgr;
3743 delete binfo_cache;
3744 delete obj_tombstone_cache;
3745 delete sync_modules_manager;
3746
3747 if (reshard_wait.get()) {
3748 reshard_wait->stop();
3749 reshard_wait.reset();
3750 }
3751
3752 if (run_reshard_thread) {
3753 reshard->stop_processor();
3754 }
3755 delete reshard;
3756 delete index_completion_manager;
3757 }
3758
3759 /**
3760 * Initialize the RADOS instance and prepare to do other ops
3761 * Returns 0 on success, -ERR# on failure.
3762 */
3763 int RGWRados::init_rados()
3764 {
3765 int ret = 0;
3766 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3767
3768 for (auto& r : handles) {
3769 ret = r.init_with_context(cct);
3770 if (ret < 0) {
3771 return ret;
3772 }
3773 ret = r.connect();
3774 if (ret < 0) {
3775 return ret;
3776 }
3777 }
3778
3779 sync_modules_manager = new RGWSyncModulesManager();
3780
3781 rgw_register_sync_modules(sync_modules_manager);
3782
3783 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3784 new RGWCoroutinesManagerRegistry(cct)};
3785 ret = crs->hook_to_admin_command("cr dump");
3786 if (ret < 0) {
3787 return ret;
3788 }
3789
3790 meta_mgr = new RGWMetadataManager(cct, this);
3791 data_log = new RGWDataChangesLog(cct, this);
3792 cr_registry = crs.release();
3793
3794 std::swap(handles, rados);
3795 return ret;
3796 }
3797
3798
3799 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3800 {
3801 map<string,string> metadata = meta;
3802 metadata["num_handles"] = stringify(rados.size());
3803 metadata["zonegroup_id"] = zonegroup.get_id();
3804 metadata["zonegroup_name"] = zonegroup.get_name();
3805 metadata["zone_name"] = zone_name();
3806 metadata["zone_id"] = zone_id();;
3807 string name = cct->_conf->name.get_id();
3808 if (name.find("rgw.") == 0) {
3809 name = name.substr(4);
3810 }
3811 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3812 if (ret < 0) {
3813 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3814 return ret;
3815 }
3816
3817 return 0;
3818 }
3819
3820 /**
3821 * Add new connection to connections map
3822 * @param zonegroup_conn_map map which new connection will be added to
3823 * @param zonegroup zonegroup which new connection will connect to
3824 * @param new_connection pointer to new connection instance
3825 */
3826 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3827 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3828 {
3829 // Delete if connection is already exists
3830 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3831 if (iterZoneGroup != zonegroup_conn_map.end()) {
3832 delete iterZoneGroup->second;
3833 }
3834
3835 // Add new connection to connections map
3836 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3837 }
3838
3839 int RGWRados::convert_regionmap()
3840 {
3841 RGWZoneGroupMap zonegroupmap;
3842
3843 string pool_name = cct->_conf->rgw_zone_root_pool;
3844 if (pool_name.empty()) {
3845 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3846 }
3847 string oid = region_map_oid;
3848
3849 rgw_pool pool(pool_name);
3850 bufferlist bl;
3851 RGWObjectCtx obj_ctx(this);
3852 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3853 if (ret < 0 && ret != -ENOENT) {
3854 return ret;
3855 } else if (ret == -ENOENT) {
3856 return 0;
3857 }
3858
3859 try {
3860 bufferlist::iterator iter = bl.begin();
3861 ::decode(zonegroupmap, iter);
3862 } catch (buffer::error& err) {
3863 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3864 return -EIO;
3865 }
3866
3867 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3868 iter != zonegroupmap.zonegroups.end(); ++iter) {
3869 RGWZoneGroup& zonegroup = iter->second;
3870 ret = zonegroup.init(cct, this, false);
3871 ret = zonegroup.update();
3872 if (ret < 0 && ret != -ENOENT) {
3873 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3874 cpp_strerror(-ret) << dendl;
3875 return ret;
3876 } else if (ret == -ENOENT) {
3877 ret = zonegroup.create();
3878 if (ret < 0) {
3879 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3880 cpp_strerror(-ret) << dendl;
3881 return ret;
3882 }
3883 }
3884 }
3885
3886 current_period.set_user_quota(zonegroupmap.user_quota);
3887 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3888
3889 // remove the region_map so we don't try to convert again
3890 rgw_raw_obj obj(pool, oid);
3891 ret = delete_system_obj(obj);
3892 if (ret < 0) {
3893 ldout(cct, 0) << "Error could not remove " << obj
3894 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3895 return ret;
3896 }
3897
3898 return 0;
3899 }
3900
3901 /**
3902 * Replace all region configuration with zonegroup for
3903 * backward compatability
3904 * Returns 0 on success, -ERR# on failure.
3905 */
3906 int RGWRados::replace_region_with_zonegroup()
3907 {
3908 /* copy default region */
3909 /* convert default region to default zonegroup */
3910 string default_oid = cct->_conf->rgw_default_region_info_oid;
3911 if (default_oid.empty()) {
3912 default_oid = default_region_info_oid;
3913 }
3914
3915
3916 RGWZoneGroup default_zonegroup;
3917 rgw_pool pool{default_zonegroup.get_pool(cct)};
3918 string oid = "converted";
3919 bufferlist bl;
3920 RGWObjectCtx obj_ctx(this);
3921
3922 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3923 if (ret < 0 && ret != -ENOENT) {
3924 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3925 << dendl;
3926 return ret;
3927 } else if (ret != -ENOENT) {
3928 ldout(cct, 20) << "System already converted " << dendl;
3929 return 0;
3930 }
3931
3932 string default_region;
3933 ret = default_zonegroup.init(cct, this, false, true);
3934 if (ret < 0) {
3935 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3936 return ret;
3937 }
3938 ret = default_zonegroup.read_default_id(default_region, true);
3939 if (ret < 0 && ret != -ENOENT) {
3940 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3941 return ret;
3942 }
3943
3944 /* convert regions to zonegroups */
3945 list<string> regions;
3946 ret = list_regions(regions);
3947 if (ret < 0 && ret != -ENOENT) {
3948 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3949 return ret;
3950 } else if (ret == -ENOENT || regions.empty()) {
3951 RGWZoneParams zoneparams(default_zone_name);
3952 int ret = zoneparams.init(cct, this);
3953 if (ret < 0 && ret != -ENOENT) {
3954 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
3955 return ret;
3956 }
3957 /* update master zone */
3958 RGWZoneGroup default_zg(default_zonegroup_name);
3959 ret = default_zg.init(cct, this);
3960 if (ret < 0 && ret != -ENOENT) {
3961 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
3962 return ret;
3963 }
3964 if (ret != -ENOENT && default_zg.master_zone.empty()) {
3965 default_zg.master_zone = zoneparams.get_id();
3966 return default_zg.update();
3967 }
3968 return 0;
3969 }
3970
3971 string master_region, master_zone;
3972 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
3973 if (*iter != default_zonegroup_name){
3974 RGWZoneGroup region(*iter);
3975 int ret = region.init(cct, this, true, true);
3976 if (ret < 0) {
3977 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
3978 return ret;
3979 }
3980 if (region.is_master_zonegroup()) {
3981 master_region = region.get_id();
3982 master_zone = region.master_zone;
3983 }
3984 }
3985 }
3986
3987 /* create realm if there is none.
3988 The realm name will be the region and zone concatenated
3989 realm id will be mds of its name */
3990 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
3991 string new_realm_name = master_region + "." + master_zone;
3992 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
3993 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
3994 MD5 hash;
3995 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
3996 hash.Final(md5);
3997 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
3998 string new_realm_id(md5_str);
3999 RGWRealm new_realm(new_realm_id,new_realm_name);
4000 ret = new_realm.init(cct, this, false);
4001 if (ret < 0) {
4002 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4003 return ret;
4004 }
4005 ret = new_realm.create();
4006 if (ret < 0 && ret != -EEXIST) {
4007 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4008 return ret;
4009 }
4010 ret = new_realm.set_as_default();
4011 if (ret < 0) {
4012 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4013 return ret;
4014 }
4015 ret = realm.init(cct, this);
4016 if (ret < 0) {
4017 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4018 return ret;
4019 }
4020 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4021 if (ret < 0) {
4022 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4023 return ret;
4024 }
4025 }
4026
4027 list<string>::iterator iter;
4028 /* create zonegroups */
4029 for (iter = regions.begin(); iter != regions.end(); ++iter)
4030 {
4031 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4032 /* check to see if we don't have already a zonegroup with this name */
4033 RGWZoneGroup new_zonegroup(*iter);
4034 ret = new_zonegroup.init(cct , this);
4035 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4036 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4037 " skipping conversion " << dendl;
4038 continue;
4039 }
4040 RGWZoneGroup zonegroup(*iter);
4041 zonegroup.set_id(*iter);
4042 int ret = zonegroup.init(cct, this, true, true);
4043 if (ret < 0) {
4044 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4045 return ret;
4046 }
4047 zonegroup.realm_id = realm.get_id();
4048 /* fix default region master zone */
4049 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4050 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4051 zonegroup.master_zone = default_zone_name;
4052 }
4053 ret = zonegroup.update();
4054 if (ret < 0 && ret != -EEXIST) {
4055 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4056 << dendl;
4057 return ret;
4058 }
4059 ret = zonegroup.update_name();
4060 if (ret < 0 && ret != -EEXIST) {
4061 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4062 << dendl;
4063 return ret;
4064 }
4065 if (zonegroup.get_name() == default_region) {
4066 ret = zonegroup.set_as_default();
4067 if (ret < 0) {
4068 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4069 << dendl;
4070 return ret;
4071 }
4072 }
4073 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4074 ++iter) {
4075 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4076 RGWZoneParams zoneparams(iter->first, iter->first);
4077 zoneparams.set_id(iter->first);
4078 zoneparams.realm_id = realm.get_id();
4079 ret = zoneparams.init(cct, this);
4080 if (ret < 0 && ret != -ENOENT) {
4081 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4082 return ret;
4083 } else if (ret == -ENOENT) {
4084 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4085 continue;
4086 }
4087 zonegroup.realm_id = realm.get_id();
4088 ret = zoneparams.update();
4089 if (ret < 0 && ret != -EEXIST) {
4090 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4091 return ret;
4092 }
4093 ret = zoneparams.update_name();
4094 if (ret < 0 && ret != -EEXIST) {
4095 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4096 return ret;
4097 }
4098 }
4099
4100 if (!current_period.get_id().empty()) {
4101 ret = current_period.add_zonegroup(zonegroup);
4102 if (ret < 0) {
4103 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4104 return ret;
4105 }
4106 }
4107 }
4108
4109 if (!current_period.get_id().empty()) {
4110 ret = current_period.update();
4111 if (ret < 0) {
4112 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4113 return ret;
4114 }
4115 ret = current_period.store_info(false);
4116 if (ret < 0) {
4117 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4118 return ret;
4119 }
4120 ret = current_period.reflect();
4121 if (ret < 0) {
4122 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4123 return ret;
4124 }
4125 }
4126
4127 for (auto const& iter : regions) {
4128 RGWZoneGroup zonegroup(iter);
4129 int ret = zonegroup.init(cct, this, true, true);
4130 if (ret < 0) {
4131 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4132 return ret;
4133 }
4134 ret = zonegroup.delete_obj(true);
4135 if (ret < 0 && ret != -ENOENT) {
4136 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4137 << dendl;
4138 return ret;
4139 }
4140 }
4141
4142 /* mark as converted */
4143 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4144 true, NULL, real_time(), NULL);
4145 if (ret < 0 ) {
4146 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4147 << dendl;
4148 return ret;
4149 }
4150
4151 return 0;
4152 }
4153
4154 int RGWRados::init_zg_from_period(bool *initialized)
4155 {
4156 *initialized = false;
4157
4158 if (current_period.get_id().empty()) {
4159 return 0;
4160 }
4161
4162 int ret = zonegroup.init(cct, this);
4163 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4164 if (ret == -ENOENT) {
4165 return 0;
4166 }
4167 if (ret < 0) {
4168 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4169 return ret;
4170 }
4171 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4172
4173 map<string, RGWZoneGroup>::const_iterator iter =
4174 current_period.get_map().zonegroups.find(zonegroup.get_id());
4175
4176 if (iter != current_period.get_map().zonegroups.end()) {
4177 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4178 zonegroup = iter->second;
4179 ret = zonegroup.init(cct, this, false);
4180 if (ret < 0) {
4181 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4182 return ret;
4183 }
4184 ret = zone_params.init(cct, this);
4185 if (ret < 0 && ret != -ENOENT) {
4186 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4187 return ret;
4188 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4189 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4190 zone_params.set_name(default_zone_name);
4191 ret = zone_params.init(cct, this);
4192 if (ret < 0 && ret != -ENOENT) {
4193 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4194 return ret;
4195 }
4196 }
4197 }
4198 for (iter = current_period.get_map().zonegroups.begin();
4199 iter != current_period.get_map().zonegroups.end(); ++iter){
4200 const RGWZoneGroup& zg = iter->second;
4201 // use endpoints from the zonegroup's master zone
4202 auto master = zg.zones.find(zg.master_zone);
4203 if (master == zg.zones.end()) {
4204 // fix missing master zone for a single zone zonegroup
4205 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4206 master = zg.zones.begin();
4207 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4208 master->second.name << " id:" << master->second.id << " as master" << dendl;
4209 if (zonegroup.get_id() == zg.get_id()) {
4210 zonegroup.master_zone = master->second.id;
4211 ret = zonegroup.update();
4212 if (ret < 0) {
4213 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4214 return ret;
4215 }
4216 } else {
4217 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4218 ret = fixed_zg.init(cct, this);
4219 if (ret < 0) {
4220 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4221 return ret;
4222 }
4223 fixed_zg.master_zone = master->second.id;
4224 ret = fixed_zg.update();
4225 if (ret < 0) {
4226 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4227 return ret;
4228 }
4229 }
4230 } else {
4231 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4232 zg.master_zone << dendl;
4233 return -EINVAL;
4234 }
4235 }
4236 const auto& endpoints = master->second.endpoints;
4237 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4238 if (!current_period.get_master_zonegroup().empty() &&
4239 zg.get_id() == current_period.get_master_zonegroup()) {
4240 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4241 }
4242 }
4243
4244 *initialized = true;
4245
4246 return 0;
4247 }
4248
4249 int RGWRados::init_zg_from_local(bool *creating_defaults)
4250 {
4251 int ret = zonegroup.init(cct, this);
4252 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4253 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4254 return ret;
4255 } else if (ret == -ENOENT) {
4256 *creating_defaults = true;
4257 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4258 ret = zonegroup.create_default();
4259 if (ret < 0) {
4260 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4261 << dendl;
4262 return ret;
4263 }
4264 ret = zonegroup.init(cct, this);
4265 if (ret < 0) {
4266 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4267 << dendl;
4268 return ret;
4269 }
4270 }
4271 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4272 if (zonegroup.is_master_zonegroup()) {
4273 // use endpoints from the zonegroup's master zone
4274 auto master = zonegroup.zones.find(zonegroup.master_zone);
4275 if (master == zonegroup.zones.end()) {
4276 // fix missing master zone for a single zone zonegroup
4277 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4278 master = zonegroup.zones.begin();
4279 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4280 master->second.name << " id:" << master->second.id << " as master" << dendl;
4281 zonegroup.master_zone = master->second.id;
4282 ret = zonegroup.update();
4283 if (ret < 0) {
4284 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4285 return ret;
4286 }
4287 } else {
4288 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4289 "master_zone=" << zonegroup.master_zone << dendl;
4290 return -EINVAL;
4291 }
4292 }
4293 const auto& endpoints = master->second.endpoints;
4294 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4295 }
4296
4297 return 0;
4298 }
4299
4300
4301 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4302 {
4303 return target_zone.syncs_from(source_zone.name) &&
4304 sync_modules_manager->supports_data_export(source_zone.tier_type);
4305 }
4306
4307 /**
4308 * Initialize the RADOS instance and prepare to do other ops
4309 * Returns 0 on success, -ERR# on failure.
4310 */
4311 int RGWRados::init_complete()
4312 {
4313 int ret = realm.init(cct, this);
4314 if (ret < 0 && ret != -ENOENT) {
4315 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4316 return ret;
4317 } else if (ret != -ENOENT) {
4318 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4319 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4320 if (ret < 0 && ret != -ENOENT) {
4321 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4322 return ret;
4323 }
4324 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4325 }
4326
4327 ret = replace_region_with_zonegroup();
4328 if (ret < 0) {
4329 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4330 return ret;
4331 }
4332
4333 ret = convert_regionmap();
4334 if (ret < 0) {
4335 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4336 return ret;
4337 }
4338
4339 bool zg_initialized = false;
4340
4341 if (!current_period.get_id().empty()) {
4342 ret = init_zg_from_period(&zg_initialized);
4343 if (ret < 0) {
4344 return ret;
4345 }
4346 }
4347
4348 bool creating_defaults = false;
4349 bool using_local = (!zg_initialized);
4350 if (using_local) {
4351 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4352 ret = init_zg_from_local(&creating_defaults);
4353 if (ret < 0) {
4354 return ret;
4355 }
4356 // read period_config into current_period
4357 auto& period_config = current_period.get_config();
4358 ret = period_config.read(this, zonegroup.realm_id);
4359 if (ret < 0 && ret != -ENOENT) {
4360 ldout(cct, 0) << "ERROR: failed to read period config: "
4361 << cpp_strerror(ret) << dendl;
4362 return ret;
4363 }
4364 }
4365
4366 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4367 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4368 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4369 zone_params.set_name(default_zone_name);
4370 }
4371
4372 ret = zone_params.init(cct, this);
4373 if (ret < 0 && ret != -ENOENT) {
4374 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4375 return ret;
4376 }
4377 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4378 if (zone_iter == get_zonegroup().zones.end()) {
4379 if (using_local) {
4380 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4381 return -EINVAL;
4382 }
4383 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4384 ret = init_zg_from_local(&creating_defaults);
4385 if (ret < 0) {
4386 return ret;
4387 }
4388 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4389 }
4390 if (zone_iter != get_zonegroup().zones.end()) {
4391 zone_public_config = zone_iter->second;
4392 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4393 } else {
4394 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4395 return -EINVAL;
4396 }
4397
4398 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4399
4400 if (run_sync_thread) {
4401 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4402 if (ret < 0) {
4403 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4404 return ret;
4405 }
4406 }
4407
4408 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4409
4410 init_unique_trans_id_deps();
4411
4412 finisher = new Finisher(cct);
4413 finisher->start();
4414
4415 period_puller.reset(new RGWPeriodPuller(this));
4416 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4417 current_period));
4418
4419 if (need_watch_notify()) {
4420 ret = init_watch();
4421 if (ret < 0) {
4422 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4423 return ret;
4424 }
4425 }
4426
4427 /* first build all zones index */
4428 for (auto ziter : get_zonegroup().zones) {
4429 const string& id = ziter.first;
4430 RGWZone& z = ziter.second;
4431 zone_id_by_name[z.name] = id;
4432 zone_by_id[id] = z;
4433 }
4434
4435 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4436 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4437 }
4438 zone_public_config = zone_by_id[zone_id()];
4439 for (auto ziter : get_zonegroup().zones) {
4440 const string& id = ziter.first;
4441 RGWZone& z = ziter.second;
4442 if (id == zone_id()) {
4443 continue;
4444 }
4445 if (z.endpoints.empty()) {
4446 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4447 continue;
4448 }
4449 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4450 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4451 zone_conn_map[id] = conn;
4452 if (zone_syncs_from(zone_public_config, z) ||
4453 zone_syncs_from(z, zone_public_config)) {
4454 if (zone_syncs_from(zone_public_config, z)) {
4455 zone_data_sync_from_map[id] = conn;
4456 }
4457 if (zone_syncs_from(z, zone_public_config)) {
4458 zone_data_notify_to_map[id] = conn;
4459 }
4460 } else {
4461 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4462 }
4463 }
4464
4465 ret = open_root_pool_ctx();
4466 if (ret < 0)
4467 return ret;
4468
4469 ret = open_gc_pool_ctx();
4470 if (ret < 0)
4471 return ret;
4472
4473 ret = open_lc_pool_ctx();
4474 if (ret < 0)
4475 return ret;
4476
4477 ret = open_objexp_pool_ctx();
4478 if (ret < 0)
4479 return ret;
4480
4481 ret = open_reshard_pool_ctx();
4482 if (ret < 0)
4483 return ret;
4484
4485 pools_initialized = true;
4486
4487 gc = new RGWGC();
4488 gc->initialize(cct, this);
4489
4490 obj_expirer = new RGWObjectExpirer(this);
4491
4492 if (use_gc_thread) {
4493 gc->start_processor();
4494 obj_expirer->start_processor();
4495 }
4496
4497 if (run_sync_thread) {
4498 // initialize the log period history. we want to do this any time we're not
4499 // running under radosgw-admin, so we check run_sync_thread here before
4500 // disabling it based on the zone/zonegroup setup
4501 meta_mgr->init_oldest_log_period();
4502 }
4503
4504 /* no point of running sync thread if we don't have a master zone configured
4505 or there is no rest_master_conn */
4506 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4507 || current_period.get_id().empty()) {
4508 run_sync_thread = false;
4509 }
4510
4511 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4512 async_rados->start();
4513
4514 ret = meta_mgr->init(current_period.get_id());
4515 if (ret < 0) {
4516 lderr(cct) << "ERROR: failed to initialize metadata log: "
4517 << cpp_strerror(-ret) << dendl;
4518 return ret;
4519 }
4520
4521 if (is_meta_master()) {
4522 auto md_log = meta_mgr->get_log(current_period.get_id());
4523 meta_notifier = new RGWMetaNotifier(this, md_log);
4524 meta_notifier->start();
4525 }
4526
4527 if (run_sync_thread) {
4528 Mutex::Locker l(meta_sync_thread_lock);
4529 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4530 ret = meta_sync_processor_thread->init();
4531 if (ret < 0) {
4532 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4533 return ret;
4534 }
4535 meta_sync_processor_thread->start();
4536
4537 Mutex::Locker dl(data_sync_thread_lock);
4538 for (auto iter : zone_data_sync_from_map) {
4539 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4540 RGWDataSyncProcessorThread *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
4541 ret = thread->init();
4542 if (ret < 0) {
4543 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4544 return ret;
4545 }
4546 thread->start();
4547 data_sync_processor_threads[iter.first] = thread;
4548 }
4549 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4550 if (interval > 0) {
4551 sync_log_trimmer = new RGWSyncLogTrimThread(this, interval);
4552 ret = sync_log_trimmer->init();
4553 if (ret < 0) {
4554 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4555 return ret;
4556 }
4557 sync_log_trimmer->start();
4558 }
4559 }
4560 data_notifier = new RGWDataNotifier(this);
4561 data_notifier->start();
4562
4563 lc = new RGWLC();
4564 lc->initialize(cct, this);
4565
4566 if (use_lc_thread)
4567 lc->start_processor();
4568
4569 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4570
4571 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4572 get_zone().bucket_index_max_shards);
4573 if (bucket_index_max_shards > get_max_bucket_shards()) {
4574 bucket_index_max_shards = get_max_bucket_shards();
4575 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4576 << get_max_bucket_shards() << dendl;
4577 }
4578 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4579
4580 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4581 binfo_cache->init(this);
4582
4583 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4584
4585 if (need_tombstone_cache) {
4586 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4587 }
4588
4589 reshard_wait = std::make_shared<RGWReshardWait>(this);
4590
4591 reshard = new RGWReshard(this);
4592
4593 /* only the master zone in the zonegroup reshards buckets */
4594 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4595 if (run_reshard_thread) {
4596 reshard->start_processor();
4597 }
4598
4599 index_completion_manager = new RGWIndexCompletionManager(this);
4600 ret = index_completion_manager->start();
4601
4602 return ret;
4603 }
4604
4605 /**
4606 * Initialize the RADOS instance and prepare to do other ops
4607 * Returns 0 on success, -ERR# on failure.
4608 */
4609 int RGWRados::initialize()
4610 {
4611 int ret;
4612
4613 ret = init_rados();
4614 if (ret < 0)
4615 return ret;
4616
4617 return init_complete();
4618 }
4619
4620 void RGWRados::finalize_watch()
4621 {
4622 for (int i = 0; i < num_watchers; i++) {
4623 RGWWatcher *watcher = watchers[i];
4624 watcher->unregister_watch();
4625 delete watcher;
4626 }
4627
4628 delete[] notify_oids;
4629 delete[] watchers;
4630 }
4631
4632 void RGWRados::schedule_context(Context *c) {
4633 finisher->queue(c);
4634 }
4635
4636 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4637 {
4638 bool is_truncated;
4639 RGWListRawObjsCtx ctx;
4640 do {
4641 list<string> oids;
4642 int r = list_raw_objects(pool, prefix, 1000,
4643 ctx, oids, &is_truncated);
4644 if (r < 0) {
4645 return r;
4646 }
4647 list<string>::iterator iter;
4648 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4649 string& val = *iter;
4650 if (val.size() > prefix.size())
4651 result.push_back(val.substr(prefix.size()));
4652 }
4653 } while (is_truncated);
4654
4655 return 0;
4656 }
4657
4658 int RGWRados::list_regions(list<string>& regions)
4659 {
4660 RGWZoneGroup zonegroup;
4661
4662 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4663 }
4664
4665 int RGWRados::list_zonegroups(list<string>& zonegroups)
4666 {
4667 RGWZoneGroup zonegroup;
4668
4669 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4670 }
4671
4672 int RGWRados::list_zones(list<string>& zones)
4673 {
4674 RGWZoneParams zoneparams;
4675
4676 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4677 }
4678
4679 int RGWRados::list_realms(list<string>& realms)
4680 {
4681 RGWRealm realm(cct, this);
4682 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4683 }
4684
4685 int RGWRados::list_periods(list<string>& periods)
4686 {
4687 RGWPeriod period;
4688 list<string> raw_periods;
4689 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4690 if (ret < 0) {
4691 return ret;
4692 }
4693 for (const auto& oid : raw_periods) {
4694 size_t pos = oid.find(".");
4695 if (pos != std::string::npos) {
4696 periods.push_back(oid.substr(0, pos));
4697 } else {
4698 periods.push_back(oid);
4699 }
4700 }
4701 periods.sort(); // unique() only detects duplicates if they're adjacent
4702 periods.unique();
4703 return 0;
4704 }
4705
4706
4707 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4708 {
4709 int ret = 0;
4710 string period_id = current_period;
4711 while(!period_id.empty()) {
4712 RGWPeriod period(period_id);
4713 ret = period.init(cct, this);
4714 if (ret < 0) {
4715 return ret;
4716 }
4717 periods.push_back(period.get_id());
4718 period_id = period.get_predecessor();
4719 }
4720
4721 return ret;
4722 }
4723
4724 /**
4725 * Open the pool used as root for this gateway
4726 * Returns: 0 on success, -ERR# otherwise.
4727 */
4728 int RGWRados::open_root_pool_ctx()
4729 {
4730 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4731 }
4732
4733 int RGWRados::open_gc_pool_ctx()
4734 {
4735 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4736 }
4737
4738 int RGWRados::open_lc_pool_ctx()
4739 {
4740 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4741 }
4742
4743 int RGWRados::open_objexp_pool_ctx()
4744 {
4745 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4746 }
4747
4748 int RGWRados::open_reshard_pool_ctx()
4749 {
4750 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4751 }
4752
4753 int RGWRados::init_watch()
4754 {
4755 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4756 if (r < 0) {
4757 return r;
4758 }
4759
4760 num_watchers = cct->_conf->rgw_num_control_oids;
4761
4762 bool compat_oid = (num_watchers == 0);
4763
4764 if (num_watchers <= 0)
4765 num_watchers = 1;
4766
4767 notify_oids = new string[num_watchers];
4768 watchers = new RGWWatcher *[num_watchers];
4769
4770 for (int i=0; i < num_watchers; i++) {
4771 string& notify_oid = notify_oids[i];
4772 notify_oid = notify_oid_prefix;
4773 if (!compat_oid) {
4774 char buf[16];
4775 snprintf(buf, sizeof(buf), ".%d", i);
4776 notify_oid.append(buf);
4777 }
4778 r = control_pool_ctx.create(notify_oid, false);
4779 if (r < 0 && r != -EEXIST)
4780 return r;
4781
4782 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4783 watchers[i] = watcher;
4784
4785 r = watcher->register_watch();
4786 if (r < 0)
4787 return r;
4788 }
4789
4790 watch_initialized = true;
4791
4792 set_cache_enabled(true);
4793
4794 return 0;
4795 }
4796
4797 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4798 {
4799 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4800
4801 int i = r % num_watchers;
4802 char buf[16];
4803 snprintf(buf, sizeof(buf), ".%d", i);
4804
4805 notify_oid = notify_oid_prefix;
4806 notify_oid.append(buf);
4807 }
4808
4809 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4810 {
4811 librados::Rados *rad = get_rados_handle();
4812 int r = rgw_init_ioctx(rad, pool, io_ctx);
4813 if (r != -ENOENT)
4814 return r;
4815
4816 if (!pools_initialized)
4817 return r;
4818
4819 r = rad->pool_create(pool.name.c_str());
4820 if (r < 0 && r != -EEXIST)
4821 return r;
4822
4823 r = rgw_init_ioctx(rad, pool, io_ctx);
4824 if (r < 0)
4825 return r;
4826
4827 r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
4828 if (r < 0 && r != -EOPNOTSUPP)
4829 return r;
4830 return 0;
4831 }
4832
4833 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4834 string *marker) {
4835 if (marker) {
4836 *marker = shard_id_str;
4837 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4838 marker->append(shard_marker);
4839 }
4840 }
4841
4842 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4843 {
4844 const string *rule = &bucket_info.placement_rule;
4845 if (rule->empty()) {
4846 rule = &zonegroup.default_placement;
4847 }
4848 auto iter = zone_params.placement_pools.find(*rule);
4849 if (iter == zone_params.placement_pools.end()) {
4850 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4851 return -EINVAL;
4852 }
4853
4854 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4855 if (r < 0)
4856 return r;
4857
4858 return 0;
4859 }
4860
4861 /**
4862 * set up a bucket listing.
4863 * handle is filled in.
4864 * Returns 0 on success, -ERR# otherwise.
4865 */
4866 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4867 {
4868 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4869 *handle = (RGWAccessHandle)state;
4870 return 0;
4871 }
4872
4873 /**
4874 * get the next bucket in the listing.
4875 * obj is filled in,
4876 * handle is updated.
4877 * returns 0 on success, -ERR# otherwise.
4878 */
4879 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4880 {
4881 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4882
4883 do {
4884 if (*state == root_pool_ctx.nobjects_end()) {
4885 delete state;
4886 return -ENOENT;
4887 }
4888
4889 obj.key.name = (*state)->get_oid();
4890 if (obj.key.name[0] == '_') {
4891 obj.key.name = obj.key.name.substr(1);
4892 }
4893
4894 (*state)++;
4895 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4896
4897 return 0;
4898 }
4899
4900
4901 /**** logs ****/
4902
4903 struct log_list_state {
4904 string prefix;
4905 librados::IoCtx io_ctx;
4906 librados::NObjectIterator obit;
4907 };
4908
4909 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4910 {
4911 log_list_state *state = new log_list_state;
4912 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4913 if (r < 0) {
4914 delete state;
4915 return r;
4916 }
4917 state->prefix = prefix;
4918 state->obit = state->io_ctx.nobjects_begin();
4919 *handle = (RGWAccessHandle)state;
4920 return 0;
4921 }
4922
4923 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4924 {
4925 log_list_state *state = static_cast<log_list_state *>(handle);
4926 while (true) {
4927 if (state->obit == state->io_ctx.nobjects_end()) {
4928 delete state;
4929 return -ENOENT;
4930 }
4931 if (state->prefix.length() &&
4932 state->obit->get_oid().find(state->prefix) != 0) {
4933 state->obit++;
4934 continue;
4935 }
4936 *name = state->obit->get_oid();
4937 state->obit++;
4938 break;
4939 }
4940 return 0;
4941 }
4942
4943 int RGWRados::log_remove(const string& name)
4944 {
4945 librados::IoCtx io_ctx;
4946 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4947 if (r < 0)
4948 return r;
4949 return io_ctx.remove(name);
4950 }
4951
4952 struct log_show_state {
4953 librados::IoCtx io_ctx;
4954 bufferlist bl;
4955 bufferlist::iterator p;
4956 string name;
4957 uint64_t pos;
4958 bool eof;
4959 log_show_state() : pos(0), eof(false) {}
4960 };
4961
4962 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
4963 {
4964 log_show_state *state = new log_show_state;
4965 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4966 if (r < 0) {
4967 delete state;
4968 return r;
4969 }
4970 state->name = name;
4971 *handle = (RGWAccessHandle)state;
4972 return 0;
4973 }
4974
4975 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
4976 {
4977 log_show_state *state = static_cast<log_show_state *>(handle);
4978 off_t off = state->p.get_off();
4979
4980 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
4981 << " off " << off
4982 << " eof " << (int)state->eof
4983 << dendl;
4984 // read some?
4985 unsigned chunk = 1024*1024;
4986 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
4987 bufferlist more;
4988 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
4989 if (r < 0)
4990 return r;
4991 state->pos += r;
4992 bufferlist old;
4993 try {
4994 old.substr_of(state->bl, off, state->bl.length() - off);
4995 } catch (buffer::error& err) {
4996 return -EINVAL;
4997 }
4998 state->bl.clear();
4999 state->bl.claim(old);
5000 state->bl.claim_append(more);
5001 state->p = state->bl.begin();
5002 if ((unsigned)r < chunk)
5003 state->eof = true;
5004 ldout(cct, 10) << " read " << r << dendl;
5005 }
5006
5007 if (state->p.end())
5008 return 0; // end of file
5009 try {
5010 ::decode(*entry, state->p);
5011 }
5012 catch (const buffer::error &e) {
5013 return -EINVAL;
5014 }
5015 return 1;
5016 }
5017
5018 /**
5019 * usage_log_hash: get usage log key hash, based on name and index
5020 *
5021 * Get the usage object name. Since a user may have more than 1
5022 * object holding that info (multiple shards), we use index to
5023 * specify that shard number. Once index exceeds max shards it
5024 * wraps.
5025 * If name is not being set, results for all users will be returned
5026 * and index will wrap only after total shards number.
5027 *
5028 * @param cct [in] ceph context
5029 * @param name [in] user name
5030 * @param hash [out] hash value
5031 * @param index [in] shard index number
5032 */
5033 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5034 {
5035 uint32_t val = index;
5036
5037 if (!name.empty()) {
5038 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
5039 val %= max_user_shards;
5040 val += ceph_str_hash_linux(name.c_str(), name.size());
5041 }
5042 char buf[17];
5043 int max_shards = cct->_conf->rgw_usage_max_shards;
5044 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5045 hash = buf;
5046 }
5047
5048 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5049 {
5050 uint32_t index = 0;
5051
5052 map<string, rgw_usage_log_info> log_objs;
5053
5054 string hash;
5055 string last_user;
5056
5057 /* restructure usage map, zone by object hash */
5058 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5059 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5060 const rgw_user_bucket& ub = iter->first;
5061 RGWUsageBatch& info = iter->second;
5062
5063 if (ub.user.empty()) {
5064 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5065 continue;
5066 }
5067
5068 if (ub.user != last_user) {
5069 /* index *should* be random, but why waste extra cycles
5070 in most cases max user shards is not going to exceed 1,
5071 so just incrementing it */
5072 usage_log_hash(cct, ub.user, hash, index++);
5073 }
5074 last_user = ub.user;
5075 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5076
5077 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5078 v.push_back(miter->second);
5079 }
5080 }
5081
5082 map<string, rgw_usage_log_info>::iterator liter;
5083
5084 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5085 int r = cls_obj_usage_log_add(liter->first, liter->second);
5086 if (r < 0)
5087 return r;
5088 }
5089 return 0;
5090 }
5091
5092 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5093 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5094 {
5095 uint32_t num = max_entries;
5096 string hash, first_hash;
5097 string user_str = user.to_str();
5098 usage_log_hash(cct, user_str, first_hash, 0);
5099
5100 if (usage_iter.index) {
5101 usage_log_hash(cct, user_str, hash, usage_iter.index);
5102 } else {
5103 hash = first_hash;
5104 }
5105
5106 usage.clear();
5107
5108 do {
5109 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5110 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5111
5112 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5113 usage_iter.read_iter, ret_usage, is_truncated);
5114 if (ret == -ENOENT)
5115 goto next;
5116
5117 if (ret < 0)
5118 return ret;
5119
5120 num -= ret_usage.size();
5121
5122 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5123 usage[iter->first].aggregate(iter->second);
5124 }
5125
5126 next:
5127 if (!*is_truncated) {
5128 usage_iter.read_iter.clear();
5129 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5130 }
5131 } while (num && !*is_truncated && hash != first_hash);
5132 return 0;
5133 }
5134
5135 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5136 {
5137 uint32_t index = 0;
5138 string hash, first_hash;
5139 string user_str = user.to_str();
5140 usage_log_hash(cct, user_str, first_hash, index);
5141
5142 hash = first_hash;
5143
5144 do {
5145 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5146 if (ret == -ENOENT)
5147 goto next;
5148
5149 if (ret < 0)
5150 return ret;
5151
5152 next:
5153 usage_log_hash(cct, user_str, hash, ++index);
5154 } while (hash != first_hash);
5155
5156 return 0;
5157 }
5158
5159 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5160 {
5161 return rgw_shards_hash(key, max_shards);
5162 }
5163
5164 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5165 {
5166 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5167 char buf[16];
5168 if (shard_id) {
5169 *shard_id = val % max_shards;
5170 }
5171 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5172 name = prefix + buf;
5173 }
5174
5175 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5176 {
5177 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5178 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5179 char buf[16];
5180 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5181 name = prefix + buf;
5182 }
5183
5184 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5185 {
5186 char buf[16];
5187 snprintf(buf, sizeof(buf), "%u", shard_id);
5188 name = prefix + buf;
5189
5190 }
5191
5192 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5193 {
5194 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5195 }
5196
5197 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5198 {
5199 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5200
5201 }
5202
5203 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5204 {
5205 librados::IoCtx io_ctx;
5206
5207 int r = time_log_add_init(io_ctx);
5208 if (r < 0) {
5209 return r;
5210 }
5211
5212 ObjectWriteOperation op;
5213 utime_t t(ut);
5214 cls_log_add(op, t, section, key, bl);
5215
5216 return io_ctx.operate(oid, &op);
5217 }
5218
5219 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5220 librados::AioCompletion *completion, bool monotonic_inc)
5221 {
5222 librados::IoCtx io_ctx;
5223
5224 int r = time_log_add_init(io_ctx);
5225 if (r < 0) {
5226 return r;
5227 }
5228
5229 ObjectWriteOperation op;
5230 cls_log_add(op, entries, monotonic_inc);
5231
5232 if (!completion) {
5233 r = io_ctx.operate(oid, &op);
5234 } else {
5235 r = io_ctx.aio_operate(oid, completion, &op);
5236 }
5237 return r;
5238 }
5239
5240 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5241 int max_entries, list<cls_log_entry>& entries,
5242 const string& marker,
5243 string *out_marker,
5244 bool *truncated)
5245 {
5246 librados::IoCtx io_ctx;
5247
5248 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5249 if (r < 0)
5250 return r;
5251 librados::ObjectReadOperation op;
5252
5253 utime_t st(start_time);
5254 utime_t et(end_time);
5255
5256 cls_log_list(op, st, et, marker, max_entries, entries,
5257 out_marker, truncated);
5258
5259 bufferlist obl;
5260
5261 int ret = io_ctx.operate(oid, &op, &obl);
5262 if (ret < 0)
5263 return ret;
5264
5265 return 0;
5266 }
5267
5268 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5269 {
5270 librados::IoCtx io_ctx;
5271
5272 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5273 if (r < 0)
5274 return r;
5275 librados::ObjectReadOperation op;
5276
5277 cls_log_info(op, header);
5278
5279 bufferlist obl;
5280
5281 int ret = io_ctx.operate(oid, &op, &obl);
5282 if (ret < 0)
5283 return ret;
5284
5285 return 0;
5286 }
5287
5288 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5289 {
5290 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5291 if (r < 0)
5292 return r;
5293
5294 librados::ObjectReadOperation op;
5295
5296 cls_log_info(op, header);
5297
5298 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5299 if (ret < 0)
5300 return ret;
5301
5302 return 0;
5303 }
5304
5305 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5306 const string& from_marker, const string& to_marker,
5307 librados::AioCompletion *completion)
5308 {
5309 librados::IoCtx io_ctx;
5310
5311 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5312 if (r < 0)
5313 return r;
5314
5315 utime_t st(start_time);
5316 utime_t et(end_time);
5317
5318 ObjectWriteOperation op;
5319 cls_log_trim(op, st, et, from_marker, to_marker);
5320
5321 if (!completion) {
5322 r = io_ctx.operate(oid, &op);
5323 } else {
5324 r = io_ctx.aio_operate(oid, completion, &op);
5325 }
5326 return r;
5327 }
5328
5329 string RGWRados::objexp_hint_get_shardname(int shard_num)
5330 {
5331 char buf[32];
5332 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5333
5334 string objname("obj_delete_at_hint.");
5335 return objname + buf;
5336 }
5337
5338 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5339 {
5340 string obj_key = key.name + key.instance;
5341 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5342 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
5343 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
5344 sid = rgw_shards_mod(sid2, num_shards);
5345 return sid;
5346 }
5347
5348 static string objexp_hint_get_keyext(const string& tenant_name,
5349 const string& bucket_name,
5350 const string& bucket_id,
5351 const rgw_obj_key& obj_key)
5352 {
5353 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5354 ":" + obj_key.name + ":" + obj_key.instance;
5355 }
5356
5357 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5358 const string& tenant_name,
5359 const string& bucket_name,
5360 const string& bucket_id,
5361 const rgw_obj_index_key& obj_key)
5362 {
5363 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5364 bucket_id, obj_key);
5365 objexp_hint_entry he = {
5366 .tenant = tenant_name,
5367 .bucket_name = bucket_name,
5368 .bucket_id = bucket_id,
5369 .obj_key = obj_key,
5370 .exp_time = delete_at };
5371 bufferlist hebl;
5372 ::encode(he, hebl);
5373 ObjectWriteOperation op;
5374 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5375
5376 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5377 return objexp_pool_ctx.operate(shard_name, &op);
5378 }
5379
5380 void RGWRados::objexp_get_shard(int shard_num,
5381 string& shard) /* out */
5382 {
5383 shard = objexp_hint_get_shardname(shard_num);
5384 }
5385
5386 int RGWRados::objexp_hint_list(const string& oid,
5387 const ceph::real_time& start_time,
5388 const ceph::real_time& end_time,
5389 const int max_entries,
5390 const string& marker,
5391 list<cls_timeindex_entry>& entries, /* out */
5392 string *out_marker, /* out */
5393 bool *truncated) /* out */
5394 {
5395 librados::ObjectReadOperation op;
5396 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5397 out_marker, truncated);
5398
5399 bufferlist obl;
5400 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5401
5402 if ((ret < 0 ) && (ret != -ENOENT)) {
5403 return ret;
5404 }
5405
5406 if ((ret == -ENOENT) && truncated) {
5407 *truncated = false;
5408 }
5409
5410 return 0;
5411 }
5412
5413 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5414 objexp_hint_entry& hint_entry) /* out */
5415 {
5416 try {
5417 bufferlist::iterator iter = ti_entry.value.begin();
5418 ::decode(hint_entry, iter);
5419 } catch (buffer::error& err) {
5420 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5421 }
5422
5423 return 0;
5424 }
5425
5426 int RGWRados::objexp_hint_trim(const string& oid,
5427 const ceph::real_time& start_time,
5428 const ceph::real_time& end_time,
5429 const string& from_marker,
5430 const string& to_marker)
5431 {
5432 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5433 from_marker, to_marker);
5434 if ((ret < 0 ) && (ret != -ENOENT)) {
5435 return ret;
5436 }
5437
5438 return 0;
5439 }
5440
5441 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5442 string& zone_id, string& owner_id) {
5443 librados::IoCtx io_ctx;
5444
5445 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5446 if (r < 0) {
5447 return r;
5448 }
5449 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5450 utime_t ut(msec / 1000, msec % 1000);
5451
5452 rados::cls::lock::Lock l(log_lock_name);
5453 l.set_duration(ut);
5454 l.set_cookie(owner_id);
5455 l.set_tag(zone_id);
5456 l.set_renew(true);
5457
5458 return l.lock_exclusive(&io_ctx, oid);
5459 }
5460
5461 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5462 librados::IoCtx io_ctx;
5463
5464 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5465 if (r < 0) {
5466 return r;
5467 }
5468
5469 rados::cls::lock::Lock l(log_lock_name);
5470 l.set_tag(zone_id);
5471 l.set_cookie(owner_id);
5472
5473 return l.unlock(&io_ctx, oid);
5474 }
5475
5476 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5477 {
5478 bufferlist::iterator i = bl.begin();
5479 RGWAccessControlPolicy policy(cct);
5480 try {
5481 policy.decode_owner(i);
5482 } catch (buffer::error& err) {
5483 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5484 return -EIO;
5485 }
5486 *owner = policy.get_owner();
5487 return 0;
5488 }
5489
5490 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5491 {
5492 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5493 if (aiter == attrset.end())
5494 return -EIO;
5495
5496 bufferlist& bl = aiter->second;
5497 bufferlist::iterator iter = bl.begin();
5498 try {
5499 policy->decode(iter);
5500 } catch (buffer::error& err) {
5501 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5502 return -EIO;
5503 }
5504 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5505 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5506 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5507 s3policy->to_xml(*_dout);
5508 *_dout << dendl;
5509 }
5510 return 0;
5511 }
5512
5513
5514 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5515 {
5516 rgw_bucket bucket = bucket_info.bucket;
5517 bucket.update_bucket_id(new_bucket_id);
5518
5519 RGWObjectCtx obj_ctx(store);
5520
5521 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5522 if (ret < 0) {
5523 return ret;
5524 }
5525
5526 return 0;
5527 }
5528
5529 /**
5530 * get listing of the objects in a bucket.
5531 *
5532 * max: maximum number of results to return
5533 * bucket: bucket to list contents of
5534 * prefix: only return results that match this prefix
5535 * delim: do not include results that match this string.
5536 * Any skipped results will have the matching portion of their name
5537 * inserted in common_prefixes with a "true" mark.
5538 * marker: if filled in, begin the listing with this object.
5539 * end_marker: if filled in, end the listing with this object.
5540 * result: the objects are put in here.
5541 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5542 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5543 */
5544 int RGWRados::Bucket::List::list_objects(int64_t max,
5545 vector<rgw_bucket_dir_entry> *result,
5546 map<string, bool> *common_prefixes,
5547 bool *is_truncated)
5548 {
5549 RGWRados *store = target->get_store();
5550 CephContext *cct = store->ctx();
5551 int shard_id = target->get_shard_id();
5552
5553 int count = 0;
5554 bool truncated = true;
5555 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5556
5557 result->clear();
5558
5559 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5560
5561 rgw_obj_key end_marker_obj;
5562 rgw_obj_index_key cur_end_marker;
5563 if (!params.ns.empty()) {
5564 end_marker_obj = rgw_obj_key(params.end_marker.name, params.end_marker.instance, params.ns);
5565 end_marker_obj.ns = params.ns;
5566 end_marker_obj.get_index_key(&cur_end_marker);
5567 }
5568 rgw_obj_index_key cur_marker;
5569 marker_obj.get_index_key(&cur_marker);
5570
5571 const bool cur_end_marker_valid = !params.end_marker.empty();
5572
5573 rgw_obj_key prefix_obj(params.prefix);
5574 prefix_obj.ns = params.ns;
5575 string cur_prefix = prefix_obj.get_index_key_name();
5576
5577 string bigger_than_delim;
5578
5579 if (!params.delim.empty()) {
5580 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
5581 char buf[params.delim.size() + 16];
5582 int r = encode_utf8(val + 1, (unsigned char *)buf);
5583 if (r < 0) {
5584 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5585 return -EINVAL;
5586 }
5587 buf[r] = '\0';
5588
5589 bigger_than_delim = buf;
5590
5591 /* if marker points at a common prefix, fast forward it into its upperbound string */
5592 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5593 if (delim_pos >= 0) {
5594 string s = cur_marker.name.substr(0, delim_pos);
5595 s.append(bigger_than_delim);
5596 cur_marker = s;
5597 }
5598 }
5599
5600 string skip_after_delim;
5601 while (truncated && count <= max) {
5602 if (skip_after_delim > cur_marker.name) {
5603 cur_marker = skip_after_delim;
5604 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5605 }
5606 std::map<string, rgw_bucket_dir_entry> ent_map;
5607 int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
5608 read_ahead + 1 - count, params.list_versions, ent_map,
5609 &truncated, &cur_marker);
5610 if (r < 0)
5611 return r;
5612
5613 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
5614 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5615 rgw_bucket_dir_entry& entry = eiter->second;
5616 rgw_obj_index_key index_key = entry.key;
5617
5618 rgw_obj_key obj(index_key);
5619
5620 /* note that parse_raw_oid() here will not set the correct object's instance, as
5621 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5622 * not needed for the checks here and we end up using the raw entry for the return vector
5623 */
5624 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5625 if (!valid) {
5626 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5627 continue;
5628 }
5629 bool check_ns = (obj.ns == params.ns);
5630 if (!params.list_versions && !entry.is_visible()) {
5631 continue;
5632 }
5633
5634 if (params.enforce_ns && !check_ns) {
5635 if (!params.ns.empty()) {
5636 /* we've iterated past the namespace we're searching -- done now */
5637 truncated = false;
5638 goto done;
5639 }
5640
5641 /* we're not looking at the namespace this object is in, next! */
5642 continue;
5643 }
5644
5645 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5646 truncated = false;
5647 goto done;
5648 }
5649
5650 if (count < max) {
5651 params.marker = index_key;
5652 next_marker = index_key;
5653 }
5654
5655 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5656 continue;
5657
5658 if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5659 continue;
5660
5661 if (!params.delim.empty()) {
5662 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5663
5664 if (delim_pos >= 0) {
5665 string prefix_key = obj.name.substr(0, delim_pos + 1);
5666
5667 if (common_prefixes &&
5668 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5669 if (count >= max) {
5670 truncated = true;
5671 goto done;
5672 }
5673 next_marker = prefix_key;
5674 (*common_prefixes)[prefix_key] = true;
5675
5676 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5677
5678 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
5679 skip_after_delim.append(bigger_than_delim);
5680
5681 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5682
5683 count++;
5684 }
5685
5686 continue;
5687 }
5688 }
5689
5690 if (count >= max) {
5691 truncated = true;
5692 goto done;
5693 }
5694
5695 result->emplace_back(std::move(entry));
5696 count++;
5697 }
5698
5699 // Either the back-end telling us truncated, or we don't consume all
5700 // items returned per the amount caller request
5701 truncated = (truncated || eiter != ent_map.end());
5702 }
5703
5704 done:
5705 if (is_truncated)
5706 *is_truncated = truncated;
5707
5708 return 0;
5709 }
5710
5711 /**
5712 * create a rados pool, associated meta info
5713 * returns 0 on success, -ERR# otherwise.
5714 */
5715 int RGWRados::create_pool(const rgw_pool& pool)
5716 {
5717 int ret = 0;
5718
5719 librados::Rados *rad = get_rados_handle();
5720 ret = rad->pool_create(pool.name.c_str(), 0);
5721 if (ret == -EEXIST)
5722 ret = 0;
5723 else if (ret == -ERANGE) {
5724 ldout(cct, 0)
5725 << __func__
5726 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
5727 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5728 << dendl;
5729 }
5730 if (ret < 0)
5731 return ret;
5732
5733 librados::IoCtx io_ctx;
5734 ret = rad->ioctx_create(pool.name.c_str(), io_ctx);
5735 if (ret < 0)
5736 return ret;
5737
5738 ret = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
5739 if (ret < 0 && ret != -EOPNOTSUPP)
5740 return ret;
5741 return 0;
5742 }
5743
5744 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5745 {
5746 librados::IoCtx index_ctx; // context for new bucket
5747
5748 string dir_oid = dir_oid_prefix;
5749 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5750 if (r < 0) {
5751 return r;
5752 }
5753
5754 dir_oid.append(bucket_info.bucket.bucket_id);
5755
5756 map<int, string> bucket_objs;
5757 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5758
5759 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5760 }
5761
5762 void RGWRados::create_bucket_id(string *bucket_id)
5763 {
5764 uint64_t iid = instance_id();
5765 uint64_t bid = next_bucket_id();
5766 char buf[get_zone_params().get_id().size() + 48];
5767 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5768 *bucket_id = buf;
5769 }
5770
5771 int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5772 const string& zonegroup_id,
5773 const string& placement_rule,
5774 const string& swift_ver_location,
5775 const RGWQuotaInfo * pquota_info,
5776 map<std::string, bufferlist>& attrs,
5777 RGWBucketInfo& info,
5778 obj_version *pobjv,
5779 obj_version *pep_objv,
5780 real_time creation_time,
5781 rgw_bucket *pmaster_bucket,
5782 uint32_t *pmaster_num_shards,
5783 bool exclusive)
5784 {
5785 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5786 string selected_placement_rule_name;
5787 RGWZonePlacementInfo rule_info;
5788
5789 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5790 int ret = 0;
5791 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5792 &selected_placement_rule_name, &rule_info);
5793 if (ret < 0)
5794 return ret;
5795
5796 if (!pmaster_bucket) {
5797 create_bucket_id(&bucket.marker);
5798 bucket.bucket_id = bucket.marker;
5799 } else {
5800 bucket.marker = pmaster_bucket->marker;
5801 bucket.bucket_id = pmaster_bucket->bucket_id;
5802 }
5803
5804 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5805
5806 if (pobjv) {
5807 objv_tracker.write_version = *pobjv;
5808 } else {
5809 objv_tracker.generate_new_write_ver(cct);
5810 }
5811
5812 info.bucket = bucket;
5813 info.owner = owner.user_id;
5814 info.zonegroup = zonegroup_id;
5815 info.placement_rule = selected_placement_rule_name;
5816 info.index_type = rule_info.index_type;
5817 info.swift_ver_location = swift_ver_location;
5818 info.swift_versioning = (!swift_ver_location.empty());
5819 if (pmaster_num_shards) {
5820 info.num_shards = *pmaster_num_shards;
5821 } else {
5822 info.num_shards = bucket_index_max_shards;
5823 }
5824 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5825 info.requester_pays = false;
5826 if (real_clock::is_zero(creation_time)) {
5827 info.creation_time = ceph::real_clock::now();
5828 } else {
5829 info.creation_time = creation_time;
5830 }
5831 if (pquota_info) {
5832 info.quota = *pquota_info;
5833 }
5834
5835 int r = init_bucket_index(info, info.num_shards);
5836 if (r < 0) {
5837 return r;
5838 }
5839
5840 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
5841 if (ret == -EEXIST) {
5842 librados::IoCtx index_ctx;
5843 map<int, string> bucket_objs;
5844 int r = open_bucket_index(info, index_ctx, bucket_objs);
5845 if (r < 0)
5846 return r;
5847
5848 /* we need to reread the info and return it, caller will have a use for it */
5849 RGWObjVersionTracker instance_ver = info.objv_tracker;
5850 info.objv_tracker.clear();
5851 RGWObjectCtx obj_ctx(this);
5852 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
5853 if (r < 0) {
5854 if (r == -ENOENT) {
5855 continue;
5856 }
5857 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
5858 return r;
5859 }
5860
5861 /* only remove it if it's a different bucket instance */
5862 if (info.bucket.bucket_id != bucket.bucket_id) {
5863 /* remove bucket meta instance */
5864 string entry = bucket.get_key();
5865 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
5866 if (r < 0)
5867 return r;
5868
5869 map<int, string>::const_iterator biter;
5870 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
5871 // Do best effort removal
5872 index_ctx.remove(biter->second);
5873 }
5874 }
5875 /* ret == -ENOENT here */
5876 }
5877 return ret;
5878 }
5879
5880 /* this is highly unlikely */
5881 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
5882 return -ENOENT;
5883 }
5884
5885 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5886 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5887
5888 {
5889 /* first check that zonegroup exists within current period. */
5890 RGWZoneGroup zonegroup;
5891 int ret = get_zonegroup(zonegroup_id, zonegroup);
5892 if (ret < 0) {
5893 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5894 return ret;
5895 }
5896
5897 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5898 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
5899
5900 if (!request_rule.empty()) {
5901 titer = zonegroup.placement_targets.find(request_rule);
5902 if (titer == zonegroup.placement_targets.end()) {
5903 ldout(cct, 0) << "could not find requested placement id " << request_rule
5904 << " within zonegroup " << dendl;
5905 return -ERR_INVALID_LOCATION_CONSTRAINT;
5906 }
5907 } else if (!user_info.default_placement.empty()) {
5908 titer = zonegroup.placement_targets.find(user_info.default_placement);
5909 if (titer == zonegroup.placement_targets.end()) {
5910 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
5911 << " within zonegroup " << dendl;
5912 return -ERR_INVALID_LOCATION_CONSTRAINT;
5913 }
5914 } else {
5915 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
5916 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
5917 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
5918 } else {
5919 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
5920 if (titer == zonegroup.placement_targets.end()) {
5921 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
5922 << " within zonegroup " << dendl;
5923 return -ERR_INVALID_LOCATION_CONSTRAINT;
5924 }
5925 }
5926 }
5927
5928 /* now check tag for the rule, whether user is permitted to use rule */
5929 const auto& target_rule = titer->second;
5930 if (!target_rule.user_permitted(user_info.placement_tags)) {
5931 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
5932 return -EPERM;
5933 }
5934
5935 if (pselected_rule_name)
5936 *pselected_rule_name = titer->first;
5937
5938 return select_bucket_location_by_rule(titer->first, rule_info);
5939 }
5940
5941 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5942 {
5943 if (location_rule.empty()) {
5944 /* we can only reach here if we're trying to set a bucket location from a bucket
5945 * created on a different zone, using a legacy / default pool configuration
5946 */
5947 return select_legacy_bucket_placement(rule_info);
5948 }
5949
5950 /*
5951 * make sure that zone has this rule configured. We're
5952 * checking it for the local zone, because that's where this bucket object is going to
5953 * reside.
5954 */
5955 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
5956 if (piter == get_zone_params().placement_pools.end()) {
5957 /* couldn't find, means we cannot really place data for this bucket in this zone */
5958 if (get_zonegroup().equals(zonegroup.get_id())) {
5959 /* that's a configuration error, zone should have that rule, as we're within the requested
5960 * zonegroup */
5961 return -EINVAL;
5962 } else {
5963 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5964 return 0;
5965 }
5966 }
5967
5968 RGWZonePlacementInfo& placement_info = piter->second;
5969
5970 if (rule_info) {
5971 *rule_info = placement_info;
5972 }
5973
5974 return 0;
5975 }
5976
5977 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
5978 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5979 {
5980 if (!get_zone_params().placement_pools.empty()) {
5981 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
5982 pselected_rule_name, rule_info);
5983 }
5984
5985 if (pselected_rule_name) {
5986 pselected_rule_name->clear();
5987 }
5988
5989 return select_legacy_bucket_placement(rule_info);
5990 }
5991
5992 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
5993 {
5994 bufferlist map_bl;
5995 map<string, bufferlist> m;
5996 string pool_name;
5997 bool write_map = false;
5998
5999 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6000
6001 RGWObjectCtx obj_ctx(this);
6002 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6003 if (ret < 0) {
6004 goto read_omap;
6005 }
6006
6007 try {
6008 bufferlist::iterator iter = map_bl.begin();
6009 ::decode(m, iter);
6010 } catch (buffer::error& err) {
6011 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6012 }
6013
6014 read_omap:
6015 if (m.empty()) {
6016 bufferlist header;
6017 ret = omap_get_all(obj, header, m);
6018
6019 write_map = true;
6020 }
6021
6022 if (ret < 0 || m.empty()) {
6023 vector<rgw_pool> pools;
6024 string s = string("default.") + default_storage_pool_suffix;
6025 pools.push_back(rgw_pool(s));
6026 vector<int> retcodes;
6027 bufferlist bl;
6028 ret = create_pools(pools, retcodes);
6029 if (ret < 0)
6030 return ret;
6031 ret = omap_set(obj, s, bl);
6032 if (ret < 0)
6033 return ret;
6034 m[s] = bl;
6035 }
6036
6037 if (write_map) {
6038 bufferlist new_bl;
6039 ::encode(m, new_bl);
6040 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6041 if (ret < 0) {
6042 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6043 }
6044 }
6045
6046 map<string, bufferlist>::iterator miter;
6047 if (m.size() > 1) {
6048 vector<string> v;
6049 for (miter = m.begin(); miter != m.end(); ++miter) {
6050 v.push_back(miter->first);
6051 }
6052
6053 uint32_t r;
6054 ret = get_random_bytes((char *)&r, sizeof(r));
6055 if (ret < 0)
6056 return ret;
6057
6058 int i = r % v.size();
6059 pool_name = v[i];
6060 } else {
6061 miter = m.begin();
6062 pool_name = miter->first;
6063 }
6064
6065 rule_info->data_pool = pool_name;
6066 rule_info->data_extra_pool = pool_name;
6067 rule_info->index_pool = pool_name;
6068 rule_info->index_type = RGWBIType_Normal;
6069
6070 return 0;
6071 }
6072
6073 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6074 {
6075 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6076 }
6077
6078 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6079 {
6080 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6081
6082 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6083 }
6084
6085 int RGWRados::update_placement_map()
6086 {
6087 bufferlist header;
6088 map<string, bufferlist> m;
6089 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6090 int ret = omap_get_all(obj, header, m);
6091 if (ret < 0)
6092 return ret;
6093
6094 bufferlist new_bl;
6095 ::encode(m, new_bl);
6096 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6097 if (ret < 0) {
6098 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6099 }
6100
6101 return ret;
6102 }
6103
6104 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6105 {
6106 librados::Rados *rad = get_rados_handle();
6107 int ret = rad->pool_lookup(new_pool.name.c_str());
6108 if (ret < 0) // DNE, or something
6109 return ret;
6110
6111 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6112 bufferlist empty_bl;
6113 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6114
6115 // don't care about return value
6116 update_placement_map();
6117
6118 return ret;
6119 }
6120
6121 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6122 {
6123 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6124 int ret = omap_del(obj, old_pool.to_str());
6125
6126 // don't care about return value
6127 update_placement_map();
6128
6129 return ret;
6130 }
6131
6132 int RGWRados::list_placement_set(set<rgw_pool>& names)
6133 {
6134 bufferlist header;
6135 map<string, bufferlist> m;
6136
6137 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6138 int ret = omap_get_all(obj, header, m);
6139 if (ret < 0)
6140 return ret;
6141
6142 names.clear();
6143 map<string, bufferlist>::iterator miter;
6144 for (miter = m.begin(); miter != m.end(); ++miter) {
6145 names.insert(rgw_pool(miter->first));
6146 }
6147
6148 return names.size();
6149 }
6150
6151 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6152 {
6153 vector<librados::PoolAsyncCompletion *> completions;
6154 vector<int> rets;
6155
6156 librados::Rados *rad = get_rados_handle();
6157 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6158 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6159 completions.push_back(c);
6160 rgw_pool& pool = *iter;
6161 int ret = rad->pool_create_async(pool.name.c_str(), c);
6162 rets.push_back(ret);
6163 }
6164
6165 vector<int>::iterator riter;
6166 vector<librados::PoolAsyncCompletion *>::iterator citer;
6167
6168 bool error = false;
6169 assert(rets.size() == completions.size());
6170 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6171 int r = *riter;
6172 PoolAsyncCompletion *c = *citer;
6173 if (r == 0) {
6174 c->wait();
6175 r = c->get_return_value();
6176 if (r < 0) {
6177 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6178 error = true;
6179 }
6180 }
6181 c->release();
6182 retcodes.push_back(r);
6183 }
6184 if (error) {
6185 return 0;
6186 }
6187
6188 std::vector<librados::IoCtx> io_ctxs;
6189 retcodes.clear();
6190 for (auto pool : pools) {
6191 io_ctxs.emplace_back();
6192 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6193 if (ret < 0) {
6194 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6195 error = true;
6196 }
6197 retcodes.push_back(ret);
6198 }
6199 if (error) {
6200 return 0;
6201 }
6202
6203 completions.clear();
6204 for (auto &io_ctx : io_ctxs) {
6205 librados::PoolAsyncCompletion *c =
6206 librados::Rados::pool_async_create_completion();
6207 completions.push_back(c);
6208 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6209 false, c);
6210 assert(ret == 0);
6211 }
6212
6213 retcodes.clear();
6214 for (auto c : completions) {
6215 c->wait();
6216 int ret = c->get_return_value();
6217 if (ret == -EOPNOTSUPP) {
6218 ret = 0;
6219 } else if (ret < 0) {
6220 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6221 << dendl;
6222 error = true;
6223 }
6224 c->release();
6225 retcodes.push_back(ret);
6226 }
6227 return 0;
6228 }
6229
6230 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6231 {
6232 string oid, key;
6233 get_obj_bucket_and_oid_loc(obj, oid, key);
6234
6235 rgw_pool pool;
6236 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6237 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6238 return -EIO;
6239 }
6240
6241 int r = open_pool_ctx(pool, *ioctx);
6242 if (r < 0) {
6243 return r;
6244 }
6245
6246 ioctx->locator_set_key(key);
6247
6248 return 0;
6249 }
6250
6251 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6252 {
6253 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6254
6255 rgw_pool pool;
6256 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6257 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6258 return -EIO;
6259 }
6260
6261 int r = open_pool_ctx(pool, ref->ioctx);
6262 if (r < 0) {
6263 return r;
6264 }
6265
6266 ref->ioctx.locator_set_key(ref->key);
6267
6268 return 0;
6269 }
6270
6271 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6272 {
6273 ref->oid = obj.oid;
6274 ref->key = obj.loc;
6275
6276 int r;
6277
6278 if (ref->oid.empty()) {
6279 ref->oid = obj.pool.to_str();
6280 ref->pool = get_zone_params().domain_root;
6281 } else {
6282 ref->pool = obj.pool;
6283 }
6284 r = open_pool_ctx(ref->pool, ref->ioctx);
6285 if (r < 0)
6286 return r;
6287
6288 ref->ioctx.locator_set_key(ref->key);
6289
6290 return 0;
6291 }
6292
6293 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6294 {
6295 return get_raw_obj_ref(obj, ref);
6296 }
6297
6298 /*
6299 * fixes an issue where head objects were supposed to have a locator created, but ended
6300 * up without one
6301 */
6302 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6303 {
6304 const rgw_bucket& bucket = bucket_info.bucket;
6305 string oid;
6306 string locator;
6307
6308 rgw_obj obj(bucket, key);
6309
6310 get_obj_bucket_and_oid_loc(obj, oid, locator);
6311
6312 if (locator.empty()) {
6313 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6314 return 0;
6315 }
6316
6317 librados::IoCtx ioctx;
6318
6319 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6320 if (ret < 0) {
6321 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6322 return ret;
6323 }
6324 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6325
6326 uint64_t size;
6327 bufferlist data;
6328
6329 struct timespec mtime_ts;
6330 map<string, bufferlist> attrs;
6331 librados::ObjectReadOperation op;
6332 op.getxattrs(&attrs, NULL);
6333 op.stat2(&size, &mtime_ts, NULL);
6334 #define HEAD_SIZE 512 * 1024
6335 op.read(0, HEAD_SIZE, &data, NULL);
6336
6337 ret = ioctx.operate(oid, &op, NULL);
6338 if (ret < 0) {
6339 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6340 return ret;
6341 }
6342
6343 if (size > HEAD_SIZE) {
6344 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6345 return -EIO;
6346 }
6347
6348 if (size != data.length()) {
6349 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6350 return -EIO;
6351 }
6352
6353 if (copy_obj) {
6354 librados::ObjectWriteOperation wop;
6355
6356 wop.mtime2(&mtime_ts);
6357
6358 map<string, bufferlist>::iterator iter;
6359 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6360 wop.setxattr(iter->first.c_str(), iter->second);
6361 }
6362
6363 wop.write(0, data);
6364
6365 ioctx.locator_set_key(locator);
6366 ioctx.operate(oid, &wop);
6367 }
6368
6369 if (remove_bad) {
6370 ioctx.locator_set_key(string());
6371
6372 ret = ioctx.remove(oid);
6373 if (ret < 0) {
6374 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6375 return ret;
6376 }
6377 }
6378
6379 return 0;
6380 }
6381
6382 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6383 const string& src_oid, const string& src_locator,
6384 librados::IoCtx& dst_ioctx,
6385 const string& dst_oid, const string& dst_locator)
6386 {
6387
6388 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6389 bool done = false;
6390 uint64_t chunk_size = COPY_BUF_SIZE;
6391 uint64_t ofs = 0;
6392 int ret = 0;
6393 real_time mtime;
6394 struct timespec mtime_ts;
6395 uint64_t size;
6396
6397 if (src_oid == dst_oid && src_locator == dst_locator) {
6398 return 0;
6399 }
6400
6401 src_ioctx.locator_set_key(src_locator);
6402 dst_ioctx.locator_set_key(dst_locator);
6403
6404 do {
6405 bufferlist data;
6406 ObjectReadOperation rop;
6407 ObjectWriteOperation wop;
6408
6409 if (ofs == 0) {
6410 rop.stat2(&size, &mtime_ts, NULL);
6411 mtime = real_clock::from_timespec(mtime_ts);
6412 }
6413 rop.read(ofs, chunk_size, &data, NULL);
6414 ret = src_ioctx.operate(src_oid, &rop, NULL);
6415 if (ret < 0) {
6416 goto done_err;
6417 }
6418
6419 if (data.length() == 0) {
6420 break;
6421 }
6422
6423 if (ofs == 0) {
6424 wop.create(true); /* make it exclusive */
6425 wop.mtime2(&mtime_ts);
6426 mtime = real_clock::from_timespec(mtime_ts);
6427 }
6428 wop.write(ofs, data);
6429 ret = dst_ioctx.operate(dst_oid, &wop);
6430 ofs += data.length();
6431 done = data.length() != chunk_size;
6432 } while (!done);
6433
6434 if (ofs != size) {
6435 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6436 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6437 ret = -EIO;
6438 goto done_err;
6439 }
6440
6441 src_ioctx.remove(src_oid);
6442
6443 return 0;
6444
6445 done_err:
6446 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6447 return ret;
6448 }
6449
6450 /*
6451 * fixes an issue where head objects were supposed to have a locator created, but ended
6452 * up without one
6453 */
6454 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6455 {
6456 const rgw_bucket& bucket = bucket_info.bucket;
6457 rgw_obj obj(bucket, key);
6458
6459 if (need_fix) {
6460 *need_fix = false;
6461 }
6462
6463 rgw_rados_ref ref;
6464 int r = get_obj_head_ref(bucket_info, obj, &ref);
6465 if (r < 0) {
6466 return r;
6467 }
6468
6469 RGWObjState *astate = NULL;
6470 RGWObjectCtx rctx(this);
6471 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6472 if (r < 0)
6473 return r;
6474
6475 if (astate->has_manifest) {
6476 RGWObjManifest::obj_iterator miter;
6477 RGWObjManifest& manifest = astate->manifest;
6478 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6479 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6480 rgw_obj loc;
6481 string oid;
6482 string locator;
6483
6484 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6485
6486 if (loc.key.ns.empty()) {
6487 /* continue, we're only interested in tail objects */
6488 continue;
6489 }
6490
6491 get_obj_bucket_and_oid_loc(loc, oid, locator);
6492 ref.ioctx.locator_set_key(locator);
6493
6494 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6495
6496 r = ref.ioctx.stat(oid, NULL, NULL);
6497 if (r != -ENOENT) {
6498 continue;
6499 }
6500
6501 string bad_loc;
6502 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6503
6504 /* create a new ioctx with the bad locator */
6505 librados::IoCtx src_ioctx;
6506 src_ioctx.dup(ref.ioctx);
6507 src_ioctx.locator_set_key(bad_loc);
6508
6509 r = src_ioctx.stat(oid, NULL, NULL);
6510 if (r != 0) {
6511 /* cannot find a broken part */
6512 continue;
6513 }
6514 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6515 if (need_fix) {
6516 *need_fix = true;
6517 }
6518 if (fix) {
6519 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6520 if (r < 0) {
6521 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6522 }
6523 }
6524 }
6525 }
6526
6527 return 0;
6528 }
6529
6530 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6531 {
6532 bucket = _bucket;
6533
6534 RGWObjectCtx obj_ctx(store);
6535
6536 RGWBucketInfo bucket_info;
6537 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6538 if (ret < 0) {
6539 return ret;
6540 }
6541
6542 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6543 if (ret < 0) {
6544 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6545 return ret;
6546 }
6547 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6548
6549 return 0;
6550 }
6551
6552 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6553 {
6554 bucket = _bucket;
6555 shard_id = sid;
6556
6557 RGWObjectCtx obj_ctx(store);
6558
6559 RGWBucketInfo bucket_info;
6560 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6561 if (ret < 0) {
6562 return ret;
6563 }
6564
6565 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6566 if (ret < 0) {
6567 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6568 return ret;
6569 }
6570 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6571
6572 return 0;
6573 }
6574
6575
6576 /* Execute @handler on last item in bucket listing for bucket specified
6577 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6578 * to objects matching these criterias. */
6579 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6580 const std::string& obj_prefix,
6581 const std::string& obj_delim,
6582 std::function<int(const rgw_bucket_dir_entry&)> handler)
6583 {
6584 RGWRados::Bucket target(this, bucket_info);
6585 RGWRados::Bucket::List list_op(&target);
6586
6587 list_op.params.prefix = obj_prefix;
6588 list_op.params.delim = obj_delim;
6589
6590 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6591 << ", obj_prefix=" << obj_prefix
6592 << ", obj_delim=" << obj_delim
6593 << dendl;
6594
6595 bool is_truncated = false;
6596
6597 boost::optional<rgw_bucket_dir_entry> last_entry;
6598 /* We need to rewind to the last object in a listing. */
6599 do {
6600 /* List bucket entries in chunks. */
6601 static constexpr int MAX_LIST_OBJS = 100;
6602 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6603
6604 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6605 &is_truncated);
6606 if (ret < 0) {
6607 return ret;
6608 } else if (!entries.empty()) {
6609 last_entry = entries.back();
6610 }
6611 } while (is_truncated);
6612
6613 if (last_entry) {
6614 return handler(*last_entry);
6615 }
6616
6617 /* Empty listing - no items we can run handler on. */
6618 return 0;
6619 }
6620
6621
6622 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6623 const rgw_user& user,
6624 RGWBucketInfo& bucket_info,
6625 rgw_obj& obj)
6626 {
6627 if (! swift_versioning_enabled(bucket_info)) {
6628 return 0;
6629 }
6630
6631 obj_ctx.obj.set_atomic(obj);
6632
6633 RGWObjState * state = nullptr;
6634 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6635 if (r < 0) {
6636 return r;
6637 }
6638
6639 if (!state->exists) {
6640 return 0;
6641 }
6642
6643 string client_id;
6644 string op_id;
6645
6646 const string& src_name = obj.get_oid();
6647 char buf[src_name.size() + 32];
6648 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6649 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6650 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6651
6652 RGWBucketInfo dest_bucket_info;
6653
6654 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6655 if (r < 0) {
6656 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6657 if (r == -ENOENT) {
6658 return -ERR_PRECONDITION_FAILED;
6659 }
6660 return r;
6661 }
6662
6663 if (dest_bucket_info.owner != bucket_info.owner) {
6664 return -ERR_PRECONDITION_FAILED;
6665 }
6666
6667 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6668 obj_ctx.obj.set_atomic(dest_obj);
6669
6670 string no_zone;
6671
6672 r = copy_obj(obj_ctx,
6673 user,
6674 client_id,
6675 op_id,
6676 NULL, /* req_info *info */
6677 no_zone,
6678 dest_obj,
6679 obj,
6680 dest_bucket_info,
6681 bucket_info,
6682 NULL, /* time_t *src_mtime */
6683 NULL, /* time_t *mtime */
6684 NULL, /* const time_t *mod_ptr */
6685 NULL, /* const time_t *unmod_ptr */
6686 false, /* bool high_precision_time */
6687 NULL, /* const char *if_match */
6688 NULL, /* const char *if_nomatch */
6689 RGWRados::ATTRSMOD_NONE,
6690 true, /* bool copy_if_newer */
6691 state->attrset,
6692 RGW_OBJ_CATEGORY_MAIN,
6693 0, /* uint64_t olh_epoch */
6694 real_time(), /* time_t delete_at */
6695 NULL, /* string *version_id */
6696 NULL, /* string *ptag */
6697 NULL, /* string *petag */
6698 NULL, /* void (*progress_cb)(off_t, void *) */
6699 NULL); /* void *progress_data */
6700 if (r == -ECANCELED || r == -ENOENT) {
6701 /* Has already been overwritten, meaning another rgw process already
6702 * copied it out */
6703 return 0;
6704 }
6705
6706 return r;
6707 }
6708
6709 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6710 const rgw_user& user,
6711 RGWBucketInfo& bucket_info,
6712 rgw_obj& obj,
6713 bool& restored) /* out */
6714 {
6715 if (! swift_versioning_enabled(bucket_info)) {
6716 return 0;
6717 }
6718
6719 /* Bucket info of the bucket that stores previous versions of our object. */
6720 RGWBucketInfo archive_binfo;
6721
6722 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6723 bucket_info.swift_ver_location, archive_binfo,
6724 nullptr, nullptr);
6725 if (ret < 0) {
6726 return ret;
6727 }
6728
6729 /* Abort the operation if the bucket storing our archive belongs to someone
6730 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6731 * into consideration. For we can live with that.
6732 *
6733 * TODO: delegate this check to un upper layer and compare with ACLs. */
6734 if (bucket_info.owner != archive_binfo.owner) {
6735 return -EPERM;
6736 }
6737
6738 /* This code will be executed on latest version of the object. */
6739 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6740 std::string no_client_id;
6741 std::string no_op_id;
6742 std::string no_zone;
6743
6744 /* We don't support object versioning of Swift API on those buckets that
6745 * are already versioned using the S3 mechanism. This affects also bucket
6746 * storing archived objects. Otherwise the delete operation would create
6747 * a deletion marker. */
6748 if (archive_binfo.versioned()) {
6749 restored = false;
6750 return -ERR_PRECONDITION_FAILED;
6751 }
6752
6753 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6754 * irrelevant and may be safely skipped. */
6755 std::map<std::string, ceph::bufferlist> no_attrs;
6756
6757 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6758 obj_ctx.obj.set_atomic(archive_obj);
6759 obj_ctx.obj.set_atomic(obj);
6760
6761 int ret = copy_obj(obj_ctx,
6762 user,
6763 no_client_id,
6764 no_op_id,
6765 nullptr, /* req_info *info */
6766 no_zone,
6767 obj, /* dest obj */
6768 archive_obj, /* src obj */
6769 bucket_info, /* dest bucket info */
6770 archive_binfo, /* src bucket info */
6771 nullptr, /* time_t *src_mtime */
6772 nullptr, /* time_t *mtime */
6773 nullptr, /* const time_t *mod_ptr */
6774 nullptr, /* const time_t *unmod_ptr */
6775 false, /* bool high_precision_time */
6776 nullptr, /* const char *if_match */
6777 nullptr, /* const char *if_nomatch */
6778 RGWRados::ATTRSMOD_NONE,
6779 true, /* bool copy_if_newer */
6780 no_attrs,
6781 RGW_OBJ_CATEGORY_MAIN,
6782 0, /* uint64_t olh_epoch */
6783 real_time(), /* time_t delete_at */
6784 nullptr, /* string *version_id */
6785 nullptr, /* string *ptag */
6786 nullptr, /* string *petag */
6787 nullptr, /* void (*progress_cb)(off_t, void *) */
6788 nullptr); /* void *progress_data */
6789 if (ret == -ECANCELED || ret == -ENOENT) {
6790 /* Has already been overwritten, meaning another rgw process already
6791 * copied it out */
6792 return 0;
6793 } else if (ret < 0) {
6794 return ret;
6795 } else {
6796 restored = true;
6797 }
6798
6799 /* Need to remove the archived copy. */
6800 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6801 archive_binfo.versioning_status());
6802
6803 return ret;
6804 };
6805
6806 const std::string& obj_name = obj.get_oid();
6807 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6808 % obj_name);
6809
6810 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6811 handler);
6812 }
6813
6814 /**
 * Write the head object's metadata (and, if meta.data is set, its data) and
 * record the change in the bucket index.
 * size: size value handed to the bucket index on completion
 * accounted_size: original size of data before compression, encryption
 * attrs: every non-empty attr is set as an xattr on the head object
 * assume_noent: forwarded to get_state(); when set and the write fails with
 *               -EEXIST the cached state is invalidated and -EEXIST returned
 *               without cancelling the index op
 * modify_tail: forwarded to prepare_atomic_modification()
 * _index_op: the RGWRados::Bucket::UpdateIndex to use, passed as void *
 * Returns: 0 on success, -ERR# otherwise. meta.canceled tells the caller
 *          whether the write was cancelled (done_cancel path).
 */
6826 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6827 map<string, bufferlist>& attrs,
6828 bool assume_noent, bool modify_tail,
6829 void *_index_op)
6830 {
6831 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6832 RGWRados *store = target->get_store();
6833
6834 ObjectWriteOperation op;
6835
6836 RGWObjState *state;
6837 int r = target->get_state(&state, false, assume_noent);
6838 if (r < 0)
6839 return r;
6840
6841 rgw_obj& obj = target->get_obj();
6842
6843 if (obj.get_oid().empty()) {
6844 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6845 return -EIO;
6846 }
6847
6848 rgw_rados_ref ref;
6849 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6850 if (r < 0)
6851 return r;
6852
/* remember is_olh now: state is invalidated before it is needed again */
6853 bool is_olh = state->is_olh;
6854
6855 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6856
/* prefer the caller supplied tag, fall back to the index op's tag if set */
6857 const string *ptag = meta.ptag;
6858 if (!ptag && !index_op->get_optag()->empty()) {
6859 ptag = index_op->get_optag();
6860 }
6861 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
6862 if (r < 0)
6863 return r;
6864
6865 if (real_clock::is_zero(meta.set_mtime)) {
6866 meta.set_mtime = real_clock::now();
6867 }
6868
6869 if (state->is_olh) {
6870 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6871 }
6872
6873 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6874 op.mtime2(&mtime_ts);
6875
6876 if (meta.data) {
6877 /* if we want to overwrite the data, we also want to overwrite the
6878 xattrs, so just remove the object */
6879 op.write_full(*meta.data);
6880 }
6881
/* collected while walking attrs below; handed to the bucket index later */
6882 string etag;
6883 string content_type;
6884 bufferlist acl_bl;
6885
6886 map<string, bufferlist>::iterator iter;
6887 if (meta.rmattrs) {
6888 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6889 const string& name = iter->first;
6890 op.rmxattr(name.c_str());
6891 }
6892 }
6893
6894 if (meta.manifest) {
6895 /* remove existing manifest attr */
6896 iter = attrs.find(RGW_ATTR_MANIFEST);
6897 if (iter != attrs.end())
6898 attrs.erase(iter);
6899
6900 bufferlist bl;
6901 ::encode(*meta.manifest, bl);
6902 op.setxattr(RGW_ATTR_MANIFEST, bl);
6903 }
6904
6905 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6906 const string& name = iter->first;
6907 bufferlist& bl = iter->second;
6908
/* empty attrs are skipped, they are not written to the object */
6909 if (!bl.length())
6910 continue;
6911
6912 op.setxattr(name.c_str(), bl);
6913
6914 if (name.compare(RGW_ATTR_ETAG) == 0) {
6915 etag = bl.c_str();
6916 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6917 content_type = bl.c_str();
6918 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6919 acl_bl = bl;
6920 }
6921 }
6922 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6923 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6924 }
6925
6926 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6927 bufferlist bl;
6928 ::encode(store->get_zone_short_id(), bl);
6929 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6930 }
6931
6932 if (!op.size())
6933 return 0;
6934
6935 uint64_t epoch;
6936 int64_t poolid;
6937 bool orig_exists;
6938 uint64_t orig_size;
6939
/* snapshot what is needed for the quota update at the end; state is
 * invalidated before that point */
6940 if (!reset_obj) { //Multipart upload, it has immutable head.
6941 orig_exists = false;
6942 orig_size = 0;
6943 } else {
6944 orig_exists = state->exists;
6945 orig_size = state->accounted_size;
6946 }
6947
6948 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
6949
6950 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
6951
6952 if (versioned_op) {
6953 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
6954 }
6955
6956 if (!index_op->is_prepared()) {
6957 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
6958 if (r < 0)
6959 return r;
6960 }
6961
6962 r = ref.ioctx.operate(ref.oid, &op);
6963 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
6964 or -ENOENT if was removed, or -EEXIST if it did not exist
6965 before and now it does */
6966 if (r == -EEXIST && assume_noent) {
6967 target->invalidate_state();
6968 return r;
6969 }
6970 goto done_cancel;
6971 }
6972
6973 epoch = ref.ioctx.get_last_version();
6974 poolid = ref.ioctx.get_id();
6975
/* a failure here is only logged; the write itself already went through */
6976 r = target->complete_atomic_modification();
6977 if (r < 0) {
6978 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
6979 }
6980
6981 r = index_op->complete(poolid, epoch, size, accounted_size,
6982 meta.set_mtime, etag, content_type, &acl_bl,
6983 meta.category, meta.remove_objs, meta.user_data);
6984 if (r < 0)
6985 goto done_cancel;
6986
6987 if (meta.mtime) {
6988 *meta.mtime = meta.set_mtime;
6989 }
6990
6991 /* note that index_op was using state so we couldn't invalidate it earlier */
6992 target->invalidate_state();
6993 state = NULL;
6994
6995 if (versioned_op) {
6996 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
6997 if (r < 0) {
6998 return r;
6999 }
7000 }
7001
7002 if (!real_clock::is_zero(meta.delete_at)) {
7003 rgw_obj_index_key obj_key;
7004 obj.key.get_index_key(&obj_key);
7005
7006 r = store->objexp_hint_add(meta.delete_at,
7007 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7008 if (r < 0) {
7009 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7010 /* ignoring error, nothing we can do at this point */
7011 }
7012 }
7013 meta.canceled = false;
7014
7015 /* update quota cache */
7016 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7017 accounted_size, orig_size);
7018 return 0;
7019
/* reached when the head write or the index completion failed: cancel the
 * pending index op, then translate r as described below */
7020 done_cancel:
7021 int ret = index_op->cancel();
7022 if (ret < 0) {
7023 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7024 }
7025
7026 meta.canceled = true;
7027
7028 /* we lost in a race. There are a few options:
7029 * - existing object was rewritten (ECANCELED)
7030 * - non existing object was created (EEXIST)
7031 * - object was removed (ENOENT)
7032 * should treat it as a success
7033 */
7034 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7035 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7036 r = 0;
7037 }
7038 } else {
7039 if (meta.if_match != NULL) {
7040 // only overwrite existing object
7041 if (strcmp(meta.if_match, "*") == 0) {
7042 if (r == -ENOENT) {
7043 r = -ERR_PRECONDITION_FAILED;
7044 } else if (r == -ECANCELED) {
7045 r = 0;
7046 }
7047 }
7048 }
7049
7050 if (meta.if_nomatch != NULL) {
7051 // only create a new object
7052 if (strcmp(meta.if_nomatch, "*") == 0) {
7053 if (r == -EEXIST) {
7054 r = -ERR_PRECONDITION_FAILED;
7055 } else if (r == -ENOENT) {
7056 r = 0;
7057 }
7058 }
7059 }
7060 }
7061
7062 return r;
7063 }
7064
7065 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7066 map<string, bufferlist>& attrs)
7067 {
7068 RGWBucketInfo& bucket_info = target->get_bucket_info();
7069
7070 RGWRados::Bucket bop(target->get_store(), bucket_info);
7071 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
7072 index_op.set_zones_trace(meta.zones_trace);
7073
7074 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7075 int r;
7076 if (assume_noent) {
7077 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7078 if (r == -EEXIST) {
7079 assume_noent = false;
7080 }
7081 }
7082 if (!assume_noent) {
7083 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7084 }
7085 return r;
7086 }
7087
7088 /** Write/overwrite a system object. */
7089 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7090 map<std::string, bufferlist>& attrs, int flags,
7091 bufferlist& data,
7092 RGWObjVersionTracker *objv_tracker,
7093 real_time set_mtime /* 0 for don't set */)
7094 {
7095 rgw_rados_ref ref;
7096 int r = get_system_obj_ref(obj, &ref);
7097 if (r < 0)
7098 return r;
7099
7100 ObjectWriteOperation op;
7101
7102 if (flags & PUT_OBJ_EXCL) {
7103 if (!(flags & PUT_OBJ_CREATE))
7104 return -EINVAL;
7105 op.create(true); // exclusive create
7106 } else {
7107 op.remove();
7108 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7109 op.create(false);
7110 }
7111
7112 if (objv_tracker) {
7113 objv_tracker->prepare_op_for_write(&op);
7114 }
7115
7116 if (real_clock::is_zero(set_mtime)) {
7117 set_mtime = real_clock::now();
7118 }
7119
7120 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7121 op.mtime2(&mtime_ts);
7122 op.write_full(data);
7123
7124 bufferlist acl_bl;
7125
7126 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7127 const string& name = iter->first;
7128 bufferlist& bl = iter->second;
7129
7130 if (!bl.length())
7131 continue;
7132
7133 op.setxattr(name.c_str(), bl);
7134 }
7135
7136 r = ref.ioctx.operate(ref.oid, &op);
7137 if (r < 0) {
7138 return r;
7139 }
7140
7141 if (objv_tracker) {
7142 objv_tracker->apply_write();
7143 }
7144
7145 if (mtime) {
7146 *mtime = set_mtime;
7147 }
7148
7149 return 0;
7150 }
7151
7152 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7153 off_t ofs, bool exclusive,
7154 RGWObjVersionTracker *objv_tracker)
7155 {
7156 rgw_rados_ref ref;
7157 int r = get_system_obj_ref(obj, &ref);
7158 if (r < 0) {
7159 return r;
7160 }
7161
7162 ObjectWriteOperation op;
7163
7164 if (exclusive)
7165 op.create(true);
7166
7167 if (objv_tracker) {
7168 objv_tracker->prepare_op_for_write(&op);
7169 }
7170 if (ofs == -1) {
7171 op.write_full(bl);
7172 } else {
7173 op.write(ofs, bl);
7174 }
7175 r = ref.ioctx.operate(ref.oid, &op);
7176 if (r < 0)
7177 return r;
7178
7179 if (objv_tracker) {
7180 objv_tracker->apply_write();
7181 }
7182 return 0;
7183 }
7184
7185 /**
7186 * Write/overwrite an object to the bucket storage.
7187 * bucket: the bucket to store the object in
7188 * obj: the object name/key
7189 * data: the object contents/value
7190 * offset: the offet to write to in the object
7191 * If this is -1, we will overwrite the whole object.
7192 * size: the amount of data to write (data must be this long)
7193 * attrs: all the given attrs are written to bucket storage for the given object
7194 * Returns: 0 on success, -ERR# otherwise.
7195 */
7196
7197 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7198 off_t ofs, bool exclusive,
7199 void **handle)
7200 {
7201 rgw_rados_ref ref;
7202 int r = get_raw_obj_ref(obj, &ref);
7203 if (r < 0) {
7204 return r;
7205 }
7206
7207 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7208 *handle = c;
7209
7210 ObjectWriteOperation op;
7211
7212 if (exclusive)
7213 op.create(true);
7214
7215 if (ofs == -1) {
7216 op.write_full(bl);
7217 } else {
7218 op.write(ofs, bl);
7219 }
7220 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7221 if (r < 0)
7222 return r;
7223
7224 return 0;
7225 }
7226
7227 int RGWRados::aio_wait(void *handle)
7228 {
7229 AioCompletion *c = (AioCompletion *)handle;
7230 c->wait_for_safe();
7231 int ret = c->get_return_value();
7232 c->release();
7233 return ret;
7234 }
7235
7236 bool RGWRados::aio_completed(void *handle)
7237 {
7238 AioCompletion *c = (AioCompletion *)handle;
7239 return c->is_safe();
7240 }
7241
7242 class RGWRadosPutObj : public RGWGetDataCB
7243 {
7244 CephContext* cct;
7245 rgw_obj obj;
7246 RGWPutObjDataProcessor *filter;
7247 boost::optional<RGWPutObj_Compress>& compressor;
7248 CompressorRef& plugin;
7249 RGWPutObjProcessor_Atomic *processor;
7250 RGWOpStateSingleOp *opstate;
7251 void (*progress_cb)(off_t, void *);
7252 void *progress_data;
7253 bufferlist extra_data_bl;
7254 uint64_t extra_data_len;
7255 uint64_t data_len;
7256 map<string, bufferlist> src_attrs;
7257 public:
7258 RGWRadosPutObj(CephContext* cct,
7259 CompressorRef& plugin,
7260 boost::optional<RGWPutObj_Compress>& compressor,
7261 RGWPutObjProcessor_Atomic *p,
7262 RGWOpStateSingleOp *_ops,
7263 void (*_progress_cb)(off_t, void *),
7264 void *_progress_data) :
7265 cct(cct),
7266 filter(p),
7267 compressor(compressor),
7268 plugin(plugin),
7269 processor(p),
7270 opstate(_ops),
7271 progress_cb(_progress_cb),
7272 progress_data(_progress_data),
7273 extra_data_len(0),
7274 data_len(0) {}
7275
7276 int process_attrs(void) {
7277 if (extra_data_bl.length()) {
7278 JSONParser jp;
7279 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7280 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7281 return -EIO;
7282 }
7283
7284 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7285
7286 src_attrs.erase(RGW_ATTR_COMPRESSION);
7287 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7288 }
7289
7290 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7291 //do not compress if object is encrypted
7292 compressor = boost::in_place(cct, plugin, filter);
7293 filter = &*compressor;
7294 }
7295 return 0;
7296 }
7297
7298 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7299 if (progress_cb) {
7300 progress_cb(ofs, progress_data);
7301 }
7302 if (extra_data_len) {
7303 size_t extra_len = bl.length();
7304 if (extra_len > extra_data_len)
7305 extra_len = extra_data_len;
7306
7307 bufferlist extra;
7308 bl.splice(0, extra_len, &extra);
7309 extra_data_bl.append(extra);
7310
7311 extra_data_len -= extra_len;
7312 if (extra_data_len == 0) {
7313 int res = process_attrs();
7314 if (res < 0)
7315 return res;
7316 }
7317 if (bl.length() == 0) {
7318 return 0;
7319 }
7320 }
7321 data_len += bl.length();
7322 bool again = false;
7323
7324 bool need_opstate = true;
7325
7326 do {
7327 void *handle = NULL;
7328 rgw_raw_obj obj;
7329 uint64_t size = bl.length();
7330 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7331 if (ret < 0)
7332 return ret;
7333
7334 if (need_opstate && opstate) {
7335 /* need to update opstate repository with new state. This is ratelimited, so we're not
7336 * really doing it every time
7337 */
7338 ret = opstate->renew_state();
7339 if (ret < 0) {
7340 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7341 int r = filter->throttle_data(handle, obj, size, false);
7342 if (r < 0) {
7343 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7344 }
7345 /* could not renew state! might have been marked as cancelled */
7346 return ret;
7347 }
7348 need_opstate = false;
7349 }
7350
7351 ret = filter->throttle_data(handle, obj, size, false);
7352 if (ret < 0)
7353 return ret;
7354 } while (again);
7355
7356 return 0;
7357 }
7358
7359 bufferlist& get_extra_data() { return extra_data_bl; }
7360
7361 map<string, bufferlist>& get_attrs() { return src_attrs; }
7362
7363 void set_extra_data_len(uint64_t len) override {
7364 extra_data_len = len;
7365 }
7366
7367 uint64_t get_data_len() {
7368 return data_len;
7369 }
7370
7371 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7372 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7373 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7374 }
7375
7376 bool is_canceled() {
7377 return processor->is_canceled();
7378 }
7379 };
7380
7381 /*
7382 * prepare attrset depending on attrs_mod.
7383 */
7384 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7385 map<string, bufferlist>& attrs,
7386 RGWRados::AttrsMod attrs_mod)
7387 {
7388 switch (attrs_mod) {
7389 case RGWRados::ATTRSMOD_NONE:
7390 attrs = src_attrs;
7391 break;
7392 case RGWRados::ATTRSMOD_REPLACE:
7393 if (!attrs[RGW_ATTR_ETAG].length()) {
7394 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7395 }
7396 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7397 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7398 if (ttiter != src_attrs.end()) {
7399 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7400 }
7401 }
7402 break;
7403 case RGWRados::ATTRSMOD_MERGE:
7404 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7405 if (attrs.find(it->first) == attrs.end()) {
7406 attrs[it->first] = it->second;
7407 }
7408 }
7409 break;
7410 }
7411 }
7412
7413 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7414 {
7415 map<string, bufferlist> attrset;
7416
7417 real_time mtime;
7418 uint64_t obj_size;
7419 RGWObjectCtx rctx(this);
7420
7421 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7422 RGWRados::Object::Read read_op(&op_target);
7423
7424 read_op.params.attrs = &attrset;
7425 read_op.params.lastmod = &mtime;
7426 read_op.params.obj_size = &obj_size;
7427
7428 int ret = read_op.prepare();
7429 if (ret < 0)
7430 return ret;
7431
7432 attrset.erase(RGW_ATTR_ID_TAG);
7433 attrset.erase(RGW_ATTR_TAIL_TAG);
7434
7435 uint64_t max_chunk_size;
7436
7437 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7438 if (ret < 0) {
7439 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7440 return ret;
7441 }
7442
7443 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj, max_chunk_size, NULL, mtime, attrset,
7444 RGW_OBJ_CATEGORY_MAIN, 0, real_time(), NULL, NULL, NULL);
7445 }
7446
7447 struct obj_time_weight {
7448 real_time mtime;
7449 uint32_t zone_short_id;
7450 uint64_t pg_ver;
7451 bool high_precision;
7452
7453 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7454
7455 bool compare_low_precision(const obj_time_weight& rhs) {
7456 struct timespec l = ceph::real_clock::to_timespec(mtime);
7457 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7458 l.tv_nsec = 0;
7459 r.tv_nsec = 0;
7460 if (l > r) {
7461 return false;
7462 }
7463 if (l < r) {
7464 return true;
7465 }
7466 if (zone_short_id != rhs.zone_short_id) {
7467 return (zone_short_id < rhs.zone_short_id);
7468 }
7469 return (pg_ver < rhs.pg_ver);
7470
7471 }
7472
7473 bool operator<(const obj_time_weight& rhs) {
7474 if (!high_precision || !rhs.high_precision) {
7475 return compare_low_precision(rhs);
7476 }
7477 if (mtime > rhs.mtime) {
7478 return false;
7479 }
7480 if (mtime < rhs.mtime) {
7481 return true;
7482 }
7483 if (zone_short_id != rhs.zone_short_id) {
7484 return (zone_short_id < rhs.zone_short_id);
7485 }
7486 return (pg_ver < rhs.pg_ver);
7487 }
7488
7489 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7490 mtime = _mtime;
7491 zone_short_id = _short_id;
7492 pg_ver = _pg_ver;
7493 }
7494
7495 void init(RGWObjState *state) {
7496 mtime = state->mtime;
7497 zone_short_id = state->zone_short_id;
7498 pg_ver = state->pg_ver;
7499 }
7500 };
7501
7502 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7503 out << o.mtime;
7504
7505 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7506 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7507 }
7508
7509 return out;
7510 }
7511
7512 class RGWGetExtraDataCB : public RGWGetDataCB {
7513 bufferlist extra_data;
7514 public:
7515 RGWGetExtraDataCB() {}
7516 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7517 if (extra_data.length() < extra_data_len) {
7518 off_t max = extra_data_len - extra_data.length();
7519 if (max > bl_len) {
7520 max = bl_len;
7521 }
7522 bl.splice(0, max, &extra_data);
7523 }
7524 return bl_len;
7525 }
7526
7527 bufferlist& get_extra_data() {
7528 return extra_data;
7529 }
7530 };
7531
7532 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7533 const rgw_user& user_id,
7534 const string& client_id,
7535 req_info *info,
7536 const string& source_zone,
7537 rgw_obj& src_obj,
7538 RGWBucketInfo& src_bucket_info,
7539 real_time *src_mtime,
7540 uint64_t *psize,
7541 const real_time *mod_ptr,
7542 const real_time *unmod_ptr,
7543 bool high_precision_time,
7544 const char *if_match,
7545 const char *if_nomatch,
7546 map<string, bufferlist> *pattrs,
7547 string *version_id,
7548 string *ptag,
7549 string *petag)
7550 {
7551 /* source is in a different zonegroup, copy from there */
7552
7553 RGWRESTStreamRWRequest *in_stream_req;
7554 string tag;
7555 map<string, bufferlist> src_attrs;
7556 append_rand_alpha(cct, tag, tag, 32);
7557 obj_time_weight set_mtime_weight;
7558 set_mtime_weight.high_precision = high_precision_time;
7559
7560 RGWRESTConn *conn;
7561 if (source_zone.empty()) {
7562 if (src_bucket_info.zonegroup.empty()) {
7563 /* source is in the master zonegroup */
7564 conn = rest_master_conn;
7565 } else {
7566 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7567 if (iter == zonegroup_conn_map.end()) {
7568 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7569 return -ENOENT;
7570 }
7571 conn = iter->second;
7572 }
7573 } else {
7574 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7575 if (iter == zone_conn_map.end()) {
7576 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7577 return -ENOENT;
7578 }
7579 conn = iter->second;
7580 }
7581
7582 RGWGetExtraDataCB cb;
7583 string etag;
7584 map<string, string> req_headers;
7585 real_time set_mtime;
7586
7587 const real_time *pmod = mod_ptr;
7588
7589 obj_time_weight dest_mtime_weight;
7590
7591 constexpr bool prepend_meta = true;
7592 constexpr bool get_op = true;
7593 constexpr bool rgwx_stat = true;
7594 constexpr bool sync_manifest = true;
7595 constexpr bool skip_decrypt = true;
7596 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7597 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7598 prepend_meta, get_op, rgwx_stat,
7599 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7600 if (ret < 0) {
7601 return ret;
7602 }
7603
7604 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7605 if (ret < 0) {
7606 return ret;
7607 }
7608
7609 bufferlist& extra_data_bl = cb.get_extra_data();
7610 if (extra_data_bl.length()) {
7611 JSONParser jp;
7612 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7613 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7614 return -EIO;
7615 }
7616
7617 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7618
7619 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7620 }
7621
7622 if (src_mtime) {
7623 *src_mtime = set_mtime;
7624 }
7625
7626 if (petag) {
7627 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7628 if (iter != src_attrs.end()) {
7629 bufferlist& etagbl = iter->second;
7630 *petag = etagbl.to_str();
7631 }
7632 }
7633
7634 if (pattrs) {
7635 *pattrs = src_attrs;
7636 }
7637
7638 return 0;
7639 }
7640
7641 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7642 const rgw_user& user_id,
7643 const string& client_id,
7644 const string& op_id,
7645 bool record_op_state,
7646 req_info *info,
7647 const string& source_zone,
7648 rgw_obj& dest_obj,
7649 rgw_obj& src_obj,
7650 RGWBucketInfo& dest_bucket_info,
7651 RGWBucketInfo& src_bucket_info,
7652 real_time *src_mtime,
7653 real_time *mtime,
7654 const real_time *mod_ptr,
7655 const real_time *unmod_ptr,
7656 bool high_precision_time,
7657 const char *if_match,
7658 const char *if_nomatch,
7659 AttrsMod attrs_mod,
7660 bool copy_if_newer,
7661 map<string, bufferlist>& attrs,
7662 RGWObjCategory category,
7663 uint64_t olh_epoch,
7664 real_time delete_at,
7665 string *version_id,
7666 string *ptag,
7667 ceph::buffer::list *petag,
7668 void (*progress_cb)(off_t, void *),
7669 void *progress_data,
7670 rgw_zone_set *zones_trace)
7671 {
7672 /* source is in a different zonegroup, copy from there */
7673
7674 RGWRESTStreamRWRequest *in_stream_req;
7675 string tag;
7676 int i;
7677 append_rand_alpha(cct, tag, tag, 32);
7678 obj_time_weight set_mtime_weight;
7679 set_mtime_weight.high_precision = high_precision_time;
7680
7681 RGWPutObjProcessor_Atomic processor(obj_ctx,
7682 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7683 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7684 if (version_id && *version_id != "null") {
7685 processor.set_version_id(*version_id);
7686 }
7687 processor.set_olh_epoch(olh_epoch);
7688 int ret = processor.prepare(this, NULL);
7689 if (ret < 0) {
7690 return ret;
7691 }
7692
7693 RGWRESTConn *conn;
7694 if (source_zone.empty()) {
7695 if (dest_bucket_info.zonegroup.empty()) {
7696 /* source is in the master zonegroup */
7697 conn = rest_master_conn;
7698 } else {
7699 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7700 if (iter == zonegroup_conn_map.end()) {
7701 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7702 return -ENOENT;
7703 }
7704 conn = iter->second;
7705 }
7706 } else {
7707 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7708 if (iter == zone_conn_map.end()) {
7709 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7710 return -ENOENT;
7711 }
7712 conn = iter->second;
7713 }
7714
7715 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7716
7717 RGWOpStateSingleOp *opstate = NULL;
7718
7719 if (record_op_state) {
7720 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7721
7722 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7723 if (ret < 0) {
7724 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7725 delete opstate;
7726 return ret;
7727 }
7728 }
7729
7730 boost::optional<RGWPutObj_Compress> compressor;
7731 CompressorRef plugin;
7732
7733 const auto& compression_type = zone_params.get_compression_type(
7734 dest_bucket_info.placement_rule);
7735 if (compression_type != "none") {
7736 plugin = Compressor::create(cct, compression_type);
7737 if (!plugin) {
7738 ldout(cct, 1) << "Cannot load plugin for compression type "
7739 << compression_type << dendl;
7740 }
7741 }
7742
7743 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7744
7745 string etag;
7746 map<string, string> req_headers;
7747 real_time set_mtime;
7748
7749 RGWObjState *dest_state = NULL;
7750
7751 const real_time *pmod = mod_ptr;
7752
7753 obj_time_weight dest_mtime_weight;
7754
7755 if (copy_if_newer) {
7756 /* need to get mtime for destination */
7757 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7758 if (ret < 0)
7759 goto set_err_state;
7760
7761 if (!real_clock::is_zero(dest_state->mtime)) {
7762 dest_mtime_weight.init(dest_state);
7763 pmod = &dest_mtime_weight.mtime;
7764 }
7765 }
7766
7767 static constexpr bool prepend_meta = true;
7768 static constexpr bool get_op = true;
7769 static constexpr bool rgwx_stat = false;
7770 static constexpr bool sync_manifest = true;
7771 static constexpr bool skip_decrypt = true;
7772 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7773 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7774 prepend_meta, get_op, rgwx_stat,
7775 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7776 if (ret < 0) {
7777 goto set_err_state;
7778 }
7779
7780 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7781 if (ret < 0) {
7782 goto set_err_state;
7783 }
7784 if (compressor && compressor->is_compressed()) {
7785 bufferlist tmp;
7786 RGWCompressionInfo cs_info;
7787 cs_info.compression_type = plugin->get_type_name();
7788 cs_info.orig_size = cb.get_data_len();
7789 cs_info.blocks = move(compressor->get_compression_blocks());
7790 ::encode(cs_info, tmp);
7791 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7792 }
7793
7794 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7795 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7796 } else {
7797 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7798 if (iter != cb.get_attrs().end()) {
7799 try {
7800 ::decode(delete_at, iter->second);
7801 } catch (buffer::error& err) {
7802 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7803 }
7804 }
7805 }
7806
7807 if (src_mtime) {
7808 *src_mtime = set_mtime;
7809 }
7810
7811 if (petag) {
7812 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7813 if (iter != cb.get_attrs().end()) {
7814 *petag = iter->second;
7815 }
7816 }
7817
7818 if (source_zone.empty()) {
7819 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7820 } else {
7821 attrs = cb.get_attrs();
7822 }
7823
7824 if (copy_if_newer) {
7825 uint64_t pg_ver = 0;
7826 auto i = attrs.find(RGW_ATTR_PG_VER);
7827 if (i != attrs.end() && i->second.length() > 0) {
7828 bufferlist::iterator iter = i->second.begin();
7829 try {
7830 ::decode(pg_ver, iter);
7831 } catch (buffer::error& err) {
7832 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7833 /* non critical error */
7834 }
7835 }
7836 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7837 }
7838
7839 #define MAX_COMPLETE_RETRY 100
7840 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7841 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7842 if (ret < 0) {
7843 goto set_err_state;
7844 }
7845 if (copy_if_newer && cb.is_canceled()) {
7846 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7847 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7848 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7849 if (ret < 0) {
7850 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7851 goto set_err_state;
7852 }
7853 dest_mtime_weight.init(dest_state);
7854 dest_mtime_weight.high_precision = high_precision_time;
7855 if (!dest_state->exists ||
7856 dest_mtime_weight < set_mtime_weight) {
7857 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7858 continue;
7859 } else {
7860 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7861 }
7862 }
7863 break;
7864 }
7865
7866 if (i == MAX_COMPLETE_RETRY) {
7867 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7868 ret = -EIO;
7869 goto set_err_state;
7870 }
7871
7872 if (opstate) {
7873 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7874 if (ret < 0) {
7875 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7876 }
7877 delete opstate;
7878 }
7879
7880 return 0;
7881 set_err_state:
7882 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7883 ret = 0;
7884 }
7885 if (opstate) {
7886 RGWOpState::OpState state;
7887 if (ret < 0) {
7888 state = RGWOpState::OPSTATE_ERROR;
7889 } else {
7890 state = RGWOpState::OPSTATE_COMPLETE;
7891 }
7892 int r = opstate->set_state(state);
7893 if (r < 0) {
7894 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7895 }
7896 delete opstate;
7897 }
7898 return ret;
7899 }
7900
7901
7902 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7903 map<string, bufferlist>& src_attrs,
7904 RGWRados::Object::Read& read_op,
7905 const rgw_user& user_id,
7906 rgw_obj& dest_obj,
7907 real_time *mtime)
7908 {
7909 string etag;
7910
7911 RGWRESTStreamWriteRequest *out_stream_req;
7912
7913 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
7914 if (ret < 0) {
7915 return ret;
7916 }
7917
7918 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
7919 if (ret < 0) {
7920 delete out_stream_req;
7921 return ret;
7922 }
7923
7924 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
7925 if (ret < 0)
7926 return ret;
7927
7928 return 0;
7929 }
7930
7931 /**
7932 * Copy an object.
7933 * dest_obj: the object to copy into
7934 * src_obj: the object to copy from
7935 * attrs: usage depends on attrs_mod parameter
7936 * attrs_mod: the modification mode of the attrs, may have the following values:
7937 * ATTRSMOD_NONE - the attributes of the source object will be
7938 * copied without modifications, attrs parameter is ignored;
7939 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
7940 * parameter, source object attributes are not copied;
 *                    (set_copy_attrs() still fills an empty etag from the source,
 *                    and an empty tail tag when the source has one)
7941 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
7942 * are overwritten by values contained in attrs parameter.
7943 * err: stores any errors resulting from the get of the original object
 * NOTE(review): the signature below has no `err` parameter; the line above looks stale.
 *
 * Flow, as implemented below:
 *  - both buckets outside the local zonegroup: -EINVAL;
 *  - remote source, or a non-empty source_zone: delegated to fetch_remote_obj();
 *  - remote destination: delegated to copy_obj_to_remote_dest();
 *  - otherwise a local copy: either the data is copied through copy_obj_data(),
 *    or the destination head is written with a manifest that points at the
 *    source's tail objects after taking a reference on each of them.
 * src_mtime: passed to the read op as its lastmod output
 * mtime: passed on to whichever write path is taken
 * petag: if non-NULL and attrs holds an etag after attribute processing, receives it
7944 * Returns: 0 on success, -ERR# otherwise.
7945 */
7946 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
7947 const rgw_user& user_id,
7948 const string& client_id,
7949 const string& op_id,
7950 req_info *info,
7951 const string& source_zone,
7952 rgw_obj& dest_obj,
7953 rgw_obj& src_obj,
7954 RGWBucketInfo& dest_bucket_info,
7955 RGWBucketInfo& src_bucket_info,
7956 real_time *src_mtime,
7957 real_time *mtime,
7958 const real_time *mod_ptr,
7959 const real_time *unmod_ptr,
7960 bool high_precision_time,
7961 const char *if_match,
7962 const char *if_nomatch,
7963 AttrsMod attrs_mod,
7964 bool copy_if_newer,
7965 map<string, bufferlist>& attrs,
7966 RGWObjCategory category,
7967 uint64_t olh_epoch,
7968 real_time delete_at,
7969 string *version_id,
7970 string *ptag,
7971 ceph::buffer::list *petag,
7972 void (*progress_cb)(off_t, void *),
7973 void *progress_data)
7974 {
7975 int ret;
7976 uint64_t obj_size;
7977 rgw_obj shadow_obj = dest_obj;
7978 string shadow_oid;
7979
7980 bool remote_src;
7981 bool remote_dest;
7982
// NOTE(review): shadow_oid / shadow_obj are set up here but not referenced again in this function.
7983 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
7984 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
7985
// a bucket counts as remote when its zonegroup is not the local one
7986 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
7987 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
7988
7989 if (remote_src && remote_dest) {
7990 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7991 return -EINVAL;
7992 }
7993
7994 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
7995
// remote source (or an explicit source zone): fetch over REST, recording op state (the `true` argument)
7996 if (remote_src || !source_zone.empty()) {
7997 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
7998 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
7999 unmod_ptr, high_precision_time,
8000 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
8001 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
8002 }
8003
// local source: prepare a conditional read that also returns attrs, mtime and size
8004 map<string, bufferlist> src_attrs;
8005 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8006 RGWRados::Object::Read read_op(&src_op_target);
8007
8008 read_op.conds.mod_ptr = mod_ptr;
8009 read_op.conds.unmod_ptr = unmod_ptr;
8010 read_op.conds.high_precision_time = high_precision_time;
8011 read_op.conds.if_match = if_match;
8012 read_op.conds.if_nomatch = if_nomatch;
8013 read_op.params.attrs = &src_attrs;
8014 read_op.params.lastmod = src_mtime;
8015 read_op.params.obj_size = &obj_size;
8016
8017 ret = read_op.prepare();
8018 if (ret < 0) {
8019 return ret;
8020 }
8021
// the caller's ACL replaces the source's, and the source's expiration is not carried over
8022 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8023 src_attrs.erase(RGW_ATTR_DELETE_AT);
8024
8025 set_copy_attrs(src_attrs, attrs, attrs_mod);
8026 attrs.erase(RGW_ATTR_ID_TAG);
8027 attrs.erase(RGW_ATTR_PG_VER);
8028 attrs.erase(RGW_ATTR_SOURCE_ZONE);
// the source's compression attribute is always carried over, whatever attrs_mod says
8029 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8030 if (cmp != src_attrs.end())
8031 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8032
8033 RGWObjManifest manifest;
8034 RGWObjState *astate = NULL;
8035
8036 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8037 if (ret < 0) {
8038 return ret;
8039 }
8040
// tail objects on which a reference was taken; used for rollback at done_ret
8041 vector<rgw_raw_obj> ref_objs;
8042
8043 if (remote_dest) {
8044 /* dest is in a different zonegroup, copy it there */
8045 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8046 }
8047 uint64_t max_chunk_size;
8048
8049 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8050 if (ret < 0) {
8051 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8052 return ret;
8053 }
8054
8055 rgw_pool src_pool;
8056 rgw_pool dest_pool;
8057 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8058 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8059 return -EIO;
8060 }
8061 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8062 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8063 return -EIO;
8064 }
8065
8066
// copy_data: do a full data copy (no manifest, different pools, no tail, or a head larger than one chunk)
// copy_first: share the tail by reference but copy the head chunk into the new head object
8067 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8068 bool copy_first = false;
8069 if (astate->has_manifest) {
8070 if (!astate->manifest.has_tail()) {
8071 copy_data = true;
8072 } else {
8073 uint64_t head_size = astate->manifest.get_head_size();
8074
8075 if (head_size > 0) {
8076 if (head_size > max_chunk_size) {
8077 copy_data = true;
8078 } else {
8079 copy_first = true;
8080 }
8081 }
8082 }
8083 }
8084
8085 if (petag) {
8086 const auto iter = attrs.find(RGW_ATTR_ETAG);
8087 if (iter != attrs.end()) {
8088 *petag = iter->second;
8089 }
8090 }
8091
8092 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8093 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8094 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
8095 version_id, ptag, petag);
8096 }
8097
8098 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8099
8100 if (copy_first) { // we need to copy first chunk, not increase refcount
8101 ++miter;
8102 }
8103
// the ioctx resolved for this first tail object is reused for every refcount operation below
8104 rgw_rados_ref ref;
8105 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8106 if (ret < 0) {
8107 return ret;
8108 }
8109
8110 bool versioned_dest = dest_bucket_info.versioning_enabled();
8111
// an explicit version id wins; otherwise a versioned destination gets a random instance name
8112 if (version_id && !version_id->empty()) {
8113 versioned_dest = true;
8114 dest_obj.key.set_instance(*version_id);
8115 } else if (versioned_dest) {
8116 gen_rand_obj_instance_name(&dest_obj);
8117 }
8118
8119 bufferlist first_chunk;
8120
8121 bool copy_itself = (dest_obj == src_obj);
8122 RGWObjManifest *pmanifest;
8123 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
8124
8125 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8126 RGWRados::Object::Write write_op(&dest_op_target);
8127
// tag under which the references are taken: the caller's, or a random one when none/empty
8128 string tag;
8129
8130 if (ptag) {
8131 tag = *ptag;
8132 }
8133
8134 if (tag.empty()) {
8135 append_rand_alpha(cct, tag, tag, 32);
8136 }
8137
8138 if (!copy_itself) {
8139 attrs.erase(RGW_ATTR_TAIL_TAG);
// work on a copy of the source manifest; if its tail placement names no bucket, point it at the source bucket
8140 manifest = astate->manifest;
8141 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8142 if (tail_placement.bucket.name.empty()) {
8143 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8144 }
// take a reference on each remaining manifest object, remembering it for rollback
8145 for (; miter != astate->manifest.obj_end(); ++miter) {
8146 ObjectWriteOperation op;
8147 cls_refcount_get(op, tag, true);
8148 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8149 ref.ioctx.locator_set_key(loc.loc);
8150
8151 ret = ref.ioctx.operate(loc.oid, &op);
8152 if (ret < 0) {
8153 goto done_ret;
8154 }
8155
8156 ref_objs.push_back(loc);
8157 }
8158
8159 pmanifest = &manifest;
8160 } else {
// copying onto itself: reuse the existing manifest in place, no references taken
8161 pmanifest = &astate->manifest;
8162 /* don't send the object's tail for garbage collection */
8163 astate->keep_tail = true;
8164 }
8165
8166 if (copy_first) {
8167 ret = read_op.read(0, max_chunk_size, first_chunk);
8168 if (ret < 0) {
8169 goto done_ret;
8170 }
8171
8172 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8173 } else {
8174 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8175 }
8176
// write the destination head: first chunk (possibly empty), manifest and attributes
8177 write_op.meta.data = &first_chunk;
8178 write_op.meta.manifest = pmanifest;
8179 write_op.meta.ptag = &tag;
8180 write_op.meta.owner = dest_bucket_info.owner;
8181 write_op.meta.mtime = mtime;
8182 write_op.meta.flags = PUT_OBJ_CREATE;
8183 write_op.meta.category = category;
8184 write_op.meta.olh_epoch = olh_epoch;
8185 write_op.meta.delete_at = delete_at;
8186 write_op.meta.modify_tail = !copy_itself;
8187
8188 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8189 if (ret < 0) {
8190 goto done_ret;
8191 }
8192
8193 return 0;
8194
// error exit: drop the references taken above (best effort), then return the original error
8195 done_ret:
8196 if (!copy_itself) {
8197 vector<rgw_raw_obj>::iterator riter;
8198
8199 /* rollback reference */
8200 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8201 ObjectWriteOperation op;
8202 cls_refcount_put(op, tag, true);
8203
8204 ref.ioctx.locator_set_key(riter->loc);
8205
8206 int r = ref.ioctx.operate(riter->oid, &op);
8207 if (r < 0) {
8208 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8209 }
8210 }
8211 }
8212 return ret;
8213 }
8214
8215
8216 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8217 RGWBucketInfo& dest_bucket_info,
8218 RGWRados::Object::Read& read_op, off_t end,
8219 rgw_obj& dest_obj,
8220 rgw_obj& src_obj,
8221 uint64_t max_chunk_size,
8222 real_time *mtime,
8223 real_time set_mtime,
8224 map<string, bufferlist>& attrs,
8225 RGWObjCategory category,
8226 uint64_t olh_epoch,
8227 real_time delete_at,
8228 string *version_id,
8229 string *ptag,
8230 ceph::buffer::list *petag)
8231 {
8232 bufferlist first_chunk;
8233 RGWObjManifest manifest;
8234
8235 string tag;
8236 append_rand_alpha(cct, tag, tag, 32);
8237
8238 RGWPutObjProcessor_Atomic processor(obj_ctx,
8239 dest_bucket_info, dest_obj.bucket, dest_obj.get_oid(),
8240 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8241 if (version_id) {
8242 processor.set_version_id(*version_id);
8243 }
8244 processor.set_olh_epoch(olh_epoch);
8245 int ret = processor.prepare(this, NULL);
8246 if (ret < 0)
8247 return ret;
8248
8249 off_t ofs = 0;
8250
8251 do {
8252 bufferlist bl;
8253 ret = read_op.read(ofs, end, bl);
8254
8255 uint64_t read_len = ret;
8256 bool again;
8257
8258 do {
8259 void *handle;
8260 rgw_raw_obj obj;
8261
8262 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8263 if (ret < 0) {
8264 return ret;
8265 }
8266 ret = processor.throttle_data(handle, obj, read_len, false);
8267 if (ret < 0)
8268 return ret;
8269 } while (again);
8270
8271 ofs += read_len;
8272 } while (ofs <= end);
8273
8274 string etag;
8275 auto iter = attrs.find(RGW_ATTR_ETAG);
8276 if (iter != attrs.end()) {
8277 bufferlist& bl = iter->second;
8278 etag = string(bl.c_str(), bl.length());
8279 if (petag) {
8280 *petag = bl;
8281 }
8282 }
8283
8284 uint64_t accounted_size;
8285 {
8286 bool compressed{false};
8287 RGWCompressionInfo cs_info;
8288 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8289 if (ret < 0) {
8290 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8291 return ret;
8292 }
8293 // pass original size if compressed
8294 accounted_size = compressed ? cs_info.orig_size : ofs;
8295 }
8296
8297 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8298 }
8299
8300 bool RGWRados::is_meta_master()
8301 {
8302 if (!get_zonegroup().is_master_zonegroup()) {
8303 return false;
8304 }
8305
8306 return (get_zonegroup().master_zone == zone_public_config.id);
8307 }
8308
8309 /**
8310 * Check to see if the bucket metadata could be synced
8311 * bucket: the bucket to check
8312 * Returns false is the bucket is not synced
8313 */
8314 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8315 {
8316
8317 /* no current period */
8318 if (current_period.get_id().empty()) {
8319 return false;
8320 }
8321
8322 /* zonegroup is not master zonegroup */
8323 if (!get_zonegroup().is_master_zonegroup()) {
8324 return false;
8325 }
8326
8327 /* single zonegroup and a single zone */
8328 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
8329 return false;
8330 }
8331
8332 /* zone is not master */
8333 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8334 return false;
8335 }
8336
8337 return true;
8338 }
8339
8340 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8341 {
8342 std::map<string, rgw_bucket_dir_entry> ent_map;
8343 rgw_obj_index_key marker;
8344 string prefix;
8345 bool is_truncated;
8346
8347 do {
8348 #define NUM_ENTRIES 1000
8349 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
8350 &is_truncated, &marker);
8351 if (r < 0)
8352 return r;
8353
8354 string ns;
8355 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
8356 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
8357 rgw_obj_key obj;
8358
8359 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
8360 return -ENOTEMPTY;
8361 }
8362 } while (is_truncated);
8363 return 0;
8364 }
8365
8366 /**
8367 * Delete a bucket.
8368 * bucket: the name of the bucket to delete
8369 * Returns 0 on success, -ERR# otherwise.
8370 */
8371 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8372 {
8373 const rgw_bucket& bucket = bucket_info.bucket;
8374 librados::IoCtx index_ctx;
8375 map<int, string> bucket_objs;
8376 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8377 if (r < 0)
8378 return r;
8379
8380 if (check_empty) {
8381 r = check_bucket_empty(bucket_info);
8382 if (r < 0) {
8383 return r;
8384 }
8385 }
8386
8387 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8388 if (r < 0)
8389 return r;
8390
8391 /* if the bucket is not synced we can remove the meta file */
8392 if (!is_syncing_bucket_meta(bucket)) {
8393 RGWObjVersionTracker objv_tracker;
8394 string entry = bucket.get_key();
8395 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8396 if (r < 0) {
8397 return r;
8398 }
8399 /* remove bucket index objects*/
8400 map<int, string>::const_iterator biter;
8401 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8402 index_ctx.remove(biter->second);
8403 }
8404 }
8405 return 0;
8406 }
8407
8408 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8409 {
8410 RGWBucketInfo info;
8411 map<string, bufferlist> attrs;
8412 RGWObjectCtx obj_ctx(this);
8413 int r;
8414 if (bucket.bucket_id.empty()) {
8415 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8416 } else {
8417 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8418 }
8419 if (r < 0) {
8420 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8421 return r;
8422 }
8423
8424 info.owner = owner.get_id();
8425
8426 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8427 if (r < 0) {
8428 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8429 return r;
8430 }
8431
8432 return 0;
8433 }
8434
8435
8436 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8437 {
8438 int ret = 0;
8439
8440 vector<rgw_bucket>::iterator iter;
8441
8442 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8443 rgw_bucket& bucket = *iter;
8444 if (enabled)
8445 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8446 else
8447 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8448
8449 RGWBucketInfo info;
8450 map<string, bufferlist> attrs;
8451 RGWObjectCtx obj_ctx(this);
8452 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8453 if (r < 0) {
8454 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8455 ret = r;
8456 continue;
8457 }
8458 if (enabled) {
8459 info.flags &= ~BUCKET_SUSPENDED;
8460 } else {
8461 info.flags |= BUCKET_SUSPENDED;
8462 }
8463
8464 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8465 if (r < 0) {
8466 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8467 ret = r;
8468 continue;
8469 }
8470 }
8471 return ret;
8472 }
8473
8474 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8475 {
8476 RGWBucketInfo bucket_info;
8477 RGWObjectCtx obj_ctx(this);
8478 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8479 if (ret < 0) {
8480 return ret;
8481 }
8482
8483 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8484 return 0;
8485 }
8486
8487 int RGWRados::Object::complete_atomic_modification()
8488 {
8489 if (!state->has_manifest || state->keep_tail)
8490 return 0;
8491
8492 cls_rgw_obj_chain chain;
8493 store->update_gc_chain(obj, state->manifest, &chain);
8494
8495 if (chain.empty()) {
8496 return 0;
8497 }
8498
8499 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
8500 return store->gc->send_chain(chain, tag, false); // do it async
8501 }
8502
8503 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8504 {
8505 RGWObjManifest::obj_iterator iter;
8506 rgw_raw_obj raw_head;
8507 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8508 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8509 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8510 if (mobj == raw_head)
8511 continue;
8512 cls_rgw_obj_key key(mobj.oid);
8513 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8514 }
8515 }
8516
8517 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8518 {
8519 return gc->send_chain(chain, tag, sync);
8520 }
8521
8522 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8523 {
8524 const rgw_bucket& bucket = bucket_info.bucket;
8525 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8526 if (r < 0)
8527 return r;
8528
8529 if (bucket.bucket_id.empty()) {
8530 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8531 return -EIO;
8532 }
8533
8534 bucket_oid = dir_oid_prefix;
8535 bucket_oid.append(bucket.bucket_id);
8536
8537 return 0;
8538 }
8539
8540 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8541 string& bucket_oid_base) {
8542 const rgw_bucket& bucket = bucket_info.bucket;
8543 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8544 if (r < 0)
8545 return r;
8546
8547 if (bucket.bucket_id.empty()) {
8548 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8549 return -EIO;
8550 }
8551
8552 bucket_oid_base = dir_oid_prefix;
8553 bucket_oid_base.append(bucket.bucket_id);
8554
8555 return 0;
8556
8557 }
8558
8559 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8560 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8561 string bucket_oid_base;
8562 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8563 if (ret < 0) {
8564 return ret;
8565 }
8566
8567 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8568 if (bucket_instance_ids) {
8569 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8570 }
8571 return 0;
8572 }
8573
8574 template<typename T>
8575 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8576 map<int, string>& oids, map<int, T>& bucket_objs,
8577 int shard_id, map<int, string> *bucket_instance_ids)
8578 {
8579 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8580 if (ret < 0)
8581 return ret;
8582
8583 map<int, string>::const_iterator iter = oids.begin();
8584 for (; iter != oids.end(); ++iter) {
8585 bucket_objs[iter->first] = T();
8586 }
8587 return 0;
8588 }
8589
8590 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8591 const string& obj_key, string *bucket_obj, int *shard_id)
8592 {
8593 string bucket_oid_base;
8594 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8595 if (ret < 0)
8596 return ret;
8597
8598 RGWObjectCtx obj_ctx(this);
8599
8600 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8601 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8602 if (ret < 0) {
8603 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8604 return ret;
8605 }
8606 return 0;
8607 }
8608
8609 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8610 int shard_id, string *bucket_obj)
8611 {
8612 string bucket_oid_base;
8613 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8614 if (ret < 0)
8615 return ret;
8616
8617 RGWObjectCtx obj_ctx(this);
8618
8619 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8620 shard_id, bucket_obj);
8621 return 0;
8622 }
8623
8624 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8625 map<RGWObjCategory, RGWStorageStats>& stats)
8626 {
8627 for (const auto& pair : header.stats) {
8628 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8629 const rgw_bucket_category_stats& header_stats = pair.second;
8630
8631 RGWStorageStats& s = stats[category];
8632
8633 s.category = category;
8634 s.size += header_stats.total_size;
8635 s.size_rounded += header_stats.total_size_rounded;
8636 s.size_utilized += header_stats.actual_size;
8637 s.num_objects += header_stats.num_entries;
8638 }
8639 }
8640
8641 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8642 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8643 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8644 {
8645 librados::IoCtx index_ctx;
8646 // key - bucket index object id
8647 // value - bucket index check OP returned result with the given bucket index object (shard)
8648 map<int, string> oids;
8649 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8650
8651 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8652 if (ret < 0) {
8653 return ret;
8654 }
8655
8656 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8657 if (ret < 0) {
8658 return ret;
8659 }
8660
8661 // Aggregate results (from different shards if there is any)
8662 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8663 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8664 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8665 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8666 }
8667
8668 return 0;
8669 }
8670
8671 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8672 {
8673 librados::IoCtx index_ctx;
8674 map<int, string> bucket_objs;
8675
8676 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8677 if (r < 0) {
8678 return r;
8679 }
8680
8681 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8682 }
8683
8684 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8685 {
8686 librados::IoCtx index_ctx;
8687 map<int, string> bucket_objs;
8688
8689 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8690 if (r < 0) {
8691 return r;
8692 }
8693
8694 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8695 }
8696
8697 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8698 {
8699 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8700 std::string oid, key;
8701 get_obj_bucket_and_oid_loc(obj, oid, key);
8702 if (!rctx)
8703 return 0;
8704
8705 RGWObjState *state = NULL;
8706
8707 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8708 if (r < 0)
8709 return r;
8710
8711 if (!state->is_atomic) {
8712 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8713 return -EINVAL;
8714 }
8715
8716 string tag;
8717
8718 if (state->tail_tag.length() > 0) {
8719 tag = state->tail_tag.c_str();
8720 } else if (state->obj_tag.length() > 0) {
8721 tag = state->obj_tag.c_str();
8722 } else {
8723 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8724 return -EINVAL;
8725 }
8726
8727 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8728
8729 return gc->defer_chain(tag, false);
8730 }
8731
8732 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8733 {
8734 list<string> prefixes;
8735 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8736 cls_rgw_remove_obj(op, prefixes);
8737 }
8738
8739 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
8740 {
8741 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
8742 }
8743
8744 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
8745 {
8746 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
8747 }
8748
8749
8750 /**
8751 * Delete an object.
8752 * bucket: name of the bucket storing the object
8753 * obj: name of the object to delete
8754 * Returns: 0 on success, -ERR# otherwise.
8755 */
8756 int RGWRados::Object::Delete::delete_obj()
8757 {
8758 RGWRados *store = target->get_store();
8759 rgw_obj& src_obj = target->get_obj();
8760 const string& instance = src_obj.key.instance;
8761 rgw_obj obj = src_obj;
8762
8763 if (instance == "null") {
8764 obj.key.instance.clear();
8765 }
8766
8767 bool explicit_marker_version = (!params.marker_version_id.empty());
8768
8769 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
8770 if (instance.empty() || explicit_marker_version) {
8771 rgw_obj marker = obj;
8772
8773 if (!params.marker_version_id.empty()) {
8774 if (params.marker_version_id != "null") {
8775 marker.key.set_instance(params.marker_version_id);
8776 }
8777 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
8778 store->gen_rand_obj_instance_name(&marker);
8779 }
8780
8781 result.version_id = marker.key.instance;
8782 result.delete_marker = true;
8783
8784 struct rgw_bucket_dir_entry_meta meta;
8785
8786 meta.owner = params.obj_owner.get_id().to_str();
8787 meta.owner_display_name = params.obj_owner.get_display_name();
8788
8789 if (real_clock::is_zero(params.mtime)) {
8790 meta.mtime = real_clock::now();
8791 } else {
8792 meta.mtime = params.mtime;
8793 }
8794
8795 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
8796 if (r < 0) {
8797 return r;
8798 }
8799 } else {
8800 rgw_bucket_dir_entry dirent;
8801
8802 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
8803 if (r < 0) {
8804 return r;
8805 }
8806 result.delete_marker = dirent.is_delete_marker();
8807 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
8808 if (r < 0) {
8809 return r;
8810 }
8811 result.version_id = instance;
8812 }
8813
8814 BucketShard *bs;
8815 int r = target->get_bucket_shard(&bs);
8816 if (r < 0) {
8817 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
8818 return r;
8819 }
8820
8821 if (target->bucket_info.datasync_flag_enabled()) {
8822 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
8823 if (r < 0) {
8824 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
8825 return r;
8826 }
8827 }
8828
8829 return 0;
8830 }
8831
8832 rgw_rados_ref ref;
8833 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
8834 if (r < 0) {
8835 return r;
8836 }
8837
8838 RGWObjState *state;
8839 r = target->get_state(&state, false);
8840 if (r < 0)
8841 return r;
8842
8843 ObjectWriteOperation op;
8844
8845 if (!real_clock::is_zero(params.unmod_since)) {
8846 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
8847 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
8848 if (!params.high_precision_time) {
8849 ctime.tv_nsec = 0;
8850 unmod.tv_nsec = 0;
8851 }
8852
8853 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
8854 if (ctime > unmod) {
8855 return -ERR_PRECONDITION_FAILED;
8856 }
8857
8858 /* only delete object if mtime is less than or equal to params.unmod_since */
8859 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
8860 }
8861 uint64_t obj_size = state->size;
8862
8863 if (!real_clock::is_zero(params.expiration_time)) {
8864 bufferlist bl;
8865 real_time delete_at;
8866
8867 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
8868 try {
8869 bufferlist::iterator iter = bl.begin();
8870 ::decode(delete_at, iter);
8871 } catch (buffer::error& err) {
8872 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
8873 return -EIO;
8874 }
8875
8876 if (params.expiration_time != delete_at) {
8877 return -ERR_PRECONDITION_FAILED;
8878 }
8879 } else {
8880 return -ERR_PRECONDITION_FAILED;
8881 }
8882 }
8883
8884 if (!state->exists) {
8885 target->invalidate_state();
8886 return -ENOENT;
8887 }
8888
8889 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
8890 if (r < 0)
8891 return r;
8892
8893 RGWBucketInfo& bucket_info = target->get_bucket_info();
8894
8895 RGWRados::Bucket bop(store, bucket_info);
8896 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8897
8898 index_op.set_zones_trace(params.zones_trace);
8899 index_op.set_bilog_flags(params.bilog_flags);
8900
8901
8902 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
8903 if (r < 0)
8904 return r;
8905
8906 store->remove_rgw_head_obj(op);
8907 r = ref.ioctx.operate(ref.oid, &op);
8908 bool need_invalidate = false;
8909 if (r == -ECANCELED) {
8910 /* raced with another operation, we can regard it as removed */
8911 need_invalidate = true;
8912 r = 0;
8913 }
8914
8915 int64_t poolid = ref.ioctx.get_id();
8916 if (r >= 0) {
8917 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
8918 if (obj_tombstone_cache) {
8919 tombstone_entry entry{*state};
8920 obj_tombstone_cache->add(obj, entry);
8921 }
8922 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
8923
8924 int ret = target->complete_atomic_modification();
8925 if (ret < 0) {
8926 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
8927 }
8928 /* other than that, no need to propagate error */
8929 } else {
8930 int ret = index_op.cancel();
8931 if (ret < 0) {
8932 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
8933 }
8934 }
8935
8936 if (need_invalidate) {
8937 target->invalidate_state();
8938 }
8939
8940 if (r < 0)
8941 return r;
8942
8943 /* update quota cache */
8944 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
8945
8946 return 0;
8947 }
8948
8949 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
8950 const RGWBucketInfo& bucket_info,
8951 const rgw_obj& obj,
8952 int versioning_status,
8953 uint16_t bilog_flags,
8954 const real_time& expiration_time,
8955 rgw_zone_set *zones_trace)
8956 {
8957 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
8958 RGWRados::Object::Delete del_op(&del_target);
8959
8960 del_op.params.bucket_owner = bucket_info.owner;
8961 del_op.params.versioning_status = versioning_status;
8962 del_op.params.bilog_flags = bilog_flags;
8963 del_op.params.expiration_time = expiration_time;
8964 del_op.params.zones_trace = zones_trace;
8965
8966 return del_op.delete_obj();
8967 }
8968
8969 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
8970 {
8971 rgw_rados_ref ref;
8972 int r = get_raw_obj_ref(obj, &ref);
8973 if (r < 0) {
8974 return r;
8975 }
8976
8977 ObjectWriteOperation op;
8978
8979 op.remove();
8980 r = ref.ioctx.operate(ref.oid, &op);
8981 if (r < 0)
8982 return r;
8983
8984 return 0;
8985 }
8986
8987 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
8988 {
8989 if (obj.empty()) {
8990 ldout(cct, 1) << "delete_system_obj got empty object name "
8991 << obj << ", returning EINVAL" << dendl;
8992 return -EINVAL;
8993 }
8994 rgw_rados_ref ref;
8995 int r = get_raw_obj_ref(obj, &ref);
8996 if (r < 0) {
8997 return r;
8998 }
8999
9000 ObjectWriteOperation op;
9001
9002 if (objv_tracker) {
9003 objv_tracker->prepare_op_for_write(&op);
9004 }
9005
9006 op.remove();
9007 r = ref.ioctx.operate(ref.oid, &op);
9008 if (r < 0)
9009 return r;
9010
9011 return 0;
9012 }
9013
9014 int RGWRados::delete_obj_index(const rgw_obj& obj)
9015 {
9016 std::string oid, key;
9017 get_obj_bucket_and_oid_loc(obj, oid, key);
9018
9019 RGWObjectCtx obj_ctx(this);
9020
9021 RGWBucketInfo bucket_info;
9022 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9023 if (ret < 0) {
9024 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9025 return ret;
9026 }
9027
9028 RGWRados::Bucket bop(this, bucket_info);
9029 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9030
9031 real_time removed_mtime;
9032 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9033
9034 return r;
9035 }
9036
9037 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9038 {
9039 string tag;
9040
9041 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9042 if (mi != manifest.obj_end()) {
9043 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9044 ++mi;
9045 tag = mi.get_location().get_raw_obj(store).oid;
9046 tag.append("_");
9047 }
9048
9049 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9050 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9051 MD5 hash;
9052 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9053
9054 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9055 if (iter != attrset.end()) {
9056 bufferlist& bl = iter->second;
9057 hash.Update((const byte *)bl.c_str(), bl.length());
9058 }
9059
9060 hash.Final(md5);
9061 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9062 tag.append(md5_str);
9063
9064 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9065
9066 tag_bl.append(tag.c_str(), tag.size() + 1);
9067 }
9068
9069 static bool is_olh(map<string, bufferlist>& attrs)
9070 {
9071 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9072 return (iter != attrs.end());
9073 }
9074
9075 static bool has_olh_tag(map<string, bufferlist>& attrs)
9076 {
9077 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9078 return (iter != attrs.end());
9079 }
9080
9081 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9082 RGWObjState *olh_state, RGWObjState **target_state)
9083 {
9084 assert(olh_state->is_olh);
9085
9086 rgw_obj target;
9087 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9088 if (r < 0) {
9089 return r;
9090 }
9091 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9092 if (r < 0) {
9093 return r;
9094 }
9095
9096 return 0;
9097 }
9098
9099 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9100 {
9101 if (obj.empty()) {
9102 return -EINVAL;
9103 }
9104
9105 RGWRawObjState *s = rctx->raw.get_state(obj);
9106 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9107 *state = s;
9108 if (s->has_attrs) {
9109 return 0;
9110 }
9111
9112 s->obj = obj;
9113
9114 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9115 if (r == -ENOENT) {
9116 s->exists = false;
9117 s->has_attrs = true;
9118 s->mtime = real_time();
9119 return 0;
9120 }
9121 if (r < 0)
9122 return r;
9123
9124 s->exists = true;
9125 s->has_attrs = true;
9126 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9127
9128 if (s->obj_tag.length())
9129 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9130 << s->obj_tag.c_str() << dendl;
9131 else
9132 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9133
9134 return 0;
9135 }
9136
9137 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9138 {
9139 int ret;
9140
9141 do {
9142 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9143 } while (ret == -EAGAIN);
9144
9145 return ret;
9146 }
9147
9148 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9149 RGWObjState **state, bool follow_olh, bool assume_noent)
9150 {
9151 if (obj.empty()) {
9152 return -EINVAL;
9153 }
9154
9155 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9156
9157 RGWObjState *s = rctx->obj.get_state(obj);
9158 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9159 *state = s;
9160 if (s->has_attrs) {
9161 if (s->is_olh && need_follow_olh) {
9162 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9163 }
9164 return 0;
9165 }
9166
9167 s->obj = obj;
9168
9169 rgw_raw_obj raw_obj;
9170 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9171
9172 int r = -ENOENT;
9173
9174 if (!assume_noent) {
9175 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9176 }
9177
9178 if (r == -ENOENT) {
9179 s->exists = false;
9180 s->has_attrs = true;
9181 tombstone_entry entry;
9182 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9183 s->mtime = entry.mtime;
9184 s->zone_short_id = entry.zone_short_id;
9185 s->pg_ver = entry.pg_ver;
9186 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9187 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9188 } else {
9189 s->mtime = real_time();
9190 }
9191 return 0;
9192 }
9193 if (r < 0)
9194 return r;
9195
9196 s->exists = true;
9197 s->has_attrs = true;
9198 s->accounted_size = s->size;
9199
9200 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9201 const bool compressed = (iter != s->attrset.end());
9202 if (compressed) {
9203 // use uncompressed size for accounted_size
9204 try {
9205 RGWCompressionInfo info;
9206 auto p = iter->second.begin();
9207 ::decode(info, p);
9208 s->accounted_size = info.orig_size;
9209 } catch (buffer::error&) {
9210 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9211 return -EIO;
9212 }
9213 }
9214
9215 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9216 if (iter != s->attrset.end()) {
9217 bufferlist bl = iter->second;
9218 bufferlist::iterator it = bl.begin();
9219 it.copy(bl.length(), s->shadow_obj);
9220 s->shadow_obj[bl.length()] = '\0';
9221 }
9222 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9223 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9224 if (ttiter != s->attrset.end()) {
9225 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9226 }
9227
9228 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9229 if (manifest_bl.length()) {
9230 bufferlist::iterator miter = manifest_bl.begin();
9231 try {
9232 ::decode(s->manifest, miter);
9233 s->has_manifest = true;
9234 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9235 broken due to old bugs */
9236 s->size = s->manifest.get_obj_size();
9237 if (!compressed)
9238 s->accounted_size = s->size;
9239 } catch (buffer::error& err) {
9240 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9241 return -EIO;
9242 }
9243 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9244 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9245 RGWObjManifest::obj_iterator mi;
9246 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9247 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9248 }
9249 }
9250
9251 if (!s->obj_tag.length()) {
9252 /*
9253 * Uh oh, something's wrong, object with manifest should have tag. Let's
9254 * create one out of the manifest, would be unique
9255 */
9256 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9257 s->fake_tag = true;
9258 }
9259 }
9260 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9261 if (aiter != s->attrset.end()) {
9262 bufferlist& pg_ver_bl = aiter->second;
9263 if (pg_ver_bl.length()) {
9264 bufferlist::iterator pgbl = pg_ver_bl.begin();
9265 try {
9266 ::decode(s->pg_ver, pgbl);
9267 } catch (buffer::error& err) {
9268 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9269 }
9270 }
9271 }
9272 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9273 if (aiter != s->attrset.end()) {
9274 bufferlist& zone_short_id_bl = aiter->second;
9275 if (zone_short_id_bl.length()) {
9276 bufferlist::iterator zbl = zone_short_id_bl.begin();
9277 try {
9278 ::decode(s->zone_short_id, zbl);
9279 } catch (buffer::error& err) {
9280 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9281 }
9282 }
9283 }
9284 if (s->obj_tag.length())
9285 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9286 else
9287 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9288
9289 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9290 * it exist, and not only if is_olh() returns true
9291 */
9292 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9293 if (iter != s->attrset.end()) {
9294 s->olh_tag = iter->second;
9295 }
9296
9297 if (is_olh(s->attrset)) {
9298 s->is_olh = true;
9299
9300 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9301
9302 if (need_follow_olh) {
9303 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9304 }
9305 }
9306
9307 return 0;
9308 }
9309
9310 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9311 bool follow_olh, bool assume_noent)
9312 {
9313 int ret;
9314
9315 do {
9316 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9317 } while (ret == -EAGAIN);
9318
9319 return ret;
9320 }
9321
9322 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9323 {
9324 RGWObjState *astate;
9325 int r = get_state(&astate, true);
9326 if (r < 0) {
9327 return r;
9328 }
9329
9330 *pmanifest = &astate->manifest;
9331
9332 return 0;
9333 }
9334
9335 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9336 {
9337 RGWObjState *state;
9338 int r = source->get_state(&state, true);
9339 if (r < 0)
9340 return r;
9341 if (!state->exists)
9342 return -ENOENT;
9343 if (!state->get_attr(name, dest))
9344 return -ENODATA;
9345
9346 return 0;
9347 }
9348
9349
9350 int RGWRados::Object::Stat::stat_async()
9351 {
9352 RGWObjectCtx& ctx = source->get_ctx();
9353 rgw_obj& obj = source->get_obj();
9354 RGWRados *store = source->get_store();
9355
9356 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9357 result.obj = obj;
9358 if (s->has_attrs) {
9359 state.ret = 0;
9360 result.size = s->size;
9361 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9362 result.attrs = s->attrset;
9363 result.has_manifest = s->has_manifest;
9364 result.manifest = s->manifest;
9365 return 0;
9366 }
9367
9368 string oid;
9369 string loc;
9370 get_obj_bucket_and_oid_loc(obj, oid, loc);
9371
9372 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9373 if (r < 0) {
9374 return r;
9375 }
9376
9377 librados::ObjectReadOperation op;
9378 op.stat2(&result.size, &result.mtime, NULL);
9379 op.getxattrs(&result.attrs, NULL);
9380 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9381 state.io_ctx.locator_set_key(loc);
9382 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9383 if (r < 0) {
9384 ldout(store->ctx(), 5) << __func__
9385 << ": ERROR: aio_operate() returned ret=" << r
9386 << dendl;
9387 return r;
9388 }
9389
9390 return 0;
9391 }
9392
9393
9394 int RGWRados::Object::Stat::wait()
9395 {
9396 if (!state.completion) {
9397 return state.ret;
9398 }
9399
9400 state.completion->wait_for_safe();
9401 state.ret = state.completion->get_return_value();
9402 state.completion->release();
9403
9404 if (state.ret != 0) {
9405 return state.ret;
9406 }
9407
9408 return finish();
9409 }
9410
9411 int RGWRados::Object::Stat::finish()
9412 {
9413 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9414 if (iter != result.attrs.end()) {
9415 bufferlist& bl = iter->second;
9416 bufferlist::iterator biter = bl.begin();
9417 try {
9418 ::decode(result.manifest, biter);
9419 } catch (buffer::error& err) {
9420 RGWRados *store = source->get_store();
9421 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9422 return -EIO;
9423 }
9424 result.has_manifest = true;
9425 }
9426
9427 return 0;
9428 }
9429
9430 /**
9431 * Get an attribute for a system object.
9432 * obj: the object to get attr
9433 * name: name of the attr to retrieve
9434 * dest: bufferlist to store the result in
9435 * Returns: 0 on success, -ERR# otherwise.
9436 */
9437 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9438 {
9439 rgw_rados_ref ref;
9440 int r = get_system_obj_ref(obj, &ref);
9441 if (r < 0) {
9442 return r;
9443 }
9444
9445 ObjectReadOperation op;
9446
9447 int rval;
9448 op.getxattr(name, &dest, &rval);
9449
9450 r = ref.ioctx.operate(ref.oid, &op, NULL);
9451 if (r < 0)
9452 return r;
9453
9454 return 0;
9455 }
9456
9457 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9458 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9459 ObjectOperation& op, RGWObjState **pstate)
9460 {
9461 if (!rctx)
9462 return 0;
9463
9464 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9465 if (r < 0)
9466 return r;
9467
9468 RGWObjState *state = *pstate;
9469
9470 if (!state->is_atomic) {
9471 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9472 return 0;
9473 }
9474
9475 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9476 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9477 } else {
9478 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9479 }
9480 return 0;
9481 }
9482
9483 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9484 {
9485 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9486 }
9487
9488 void RGWRados::Object::invalidate_state()
9489 {
9490 ctx.obj.invalidate(obj);
9491 }
9492
9493 void RGWRados::SystemObject::invalidate_state()
9494 {
9495 ctx.raw.invalidate(obj);
9496 }
9497
9498 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9499 const char *if_match, const char *if_nomatch, bool removal_op,
9500 bool modify_tail)
9501 {
9502 int r = get_state(&state, false);
9503 if (r < 0)
9504 return r;
9505
9506 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9507 if_match != NULL || if_nomatch != NULL) &&
9508 (!state->fake_tag);
9509
9510 if (!state->is_atomic) {
9511 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9512
9513 if (reset_obj) {
9514 op.create(false);
9515 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9516 }
9517
9518 return 0;
9519 }
9520
9521 if (need_guard) {
9522 /* first verify that the object wasn't replaced under */
9523 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9524 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9525 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9526 }
9527
9528 if (if_match) {
9529 if (strcmp(if_match, "*") == 0) {
9530 // test the object is existing
9531 if (!state->exists) {
9532 return -ERR_PRECONDITION_FAILED;
9533 }
9534 } else {
9535 bufferlist bl;
9536 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9537 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9538 return -ERR_PRECONDITION_FAILED;
9539 }
9540 }
9541 }
9542
9543 if (if_nomatch) {
9544 if (strcmp(if_nomatch, "*") == 0) {
9545 // test the object is NOT existing
9546 if (state->exists) {
9547 return -ERR_PRECONDITION_FAILED;
9548 }
9549 } else {
9550 bufferlist bl;
9551 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9552 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9553 return -ERR_PRECONDITION_FAILED;
9554 }
9555 }
9556 }
9557 }
9558
9559 if (reset_obj) {
9560 if (state->exists) {
9561 op.create(false);
9562 store->remove_rgw_head_obj(op);
9563 } else {
9564 op.create(true);
9565 }
9566 }
9567
9568 if (removal_op) {
9569 /* the object is being removed, no need to update its tag */
9570 return 0;
9571 }
9572
9573 if (ptag) {
9574 state->write_tag = *ptag;
9575 } else {
9576 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9577 }
9578 bufferlist bl;
9579 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9580
9581 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9582
9583 op.setxattr(RGW_ATTR_ID_TAG, bl);
9584 if (modify_tail) {
9585 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9586 }
9587
9588 return 0;
9589 }
9590
9591 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9592 RGWObjVersionTracker *objv_tracker)
9593 {
9594 map<string, bufferlist> attrs;
9595 attrs[name] = bl;
9596 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9597 }
9598
9599 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9600 map<string, bufferlist>& attrs,
9601 map<string, bufferlist>* rmattrs,
9602 RGWObjVersionTracker *objv_tracker)
9603 {
9604 rgw_rados_ref ref;
9605 int r = get_system_obj_ref(obj, &ref);
9606 if (r < 0) {
9607 return r;
9608 }
9609 ObjectWriteOperation op;
9610
9611 if (objv_tracker) {
9612 objv_tracker->prepare_op_for_write(&op);
9613 }
9614
9615 map<string, bufferlist>::iterator iter;
9616 if (rmattrs) {
9617 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9618 const string& name = iter->first;
9619 op.rmxattr(name.c_str());
9620 }
9621 }
9622
9623 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9624 const string& name = iter->first;
9625 bufferlist& bl = iter->second;
9626
9627 if (!bl.length())
9628 continue;
9629
9630 op.setxattr(name.c_str(), bl);
9631 }
9632
9633 if (!op.size())
9634 return 0;
9635
9636 bufferlist bl;
9637
9638 r = ref.ioctx.operate(ref.oid, &op);
9639 if (r < 0)
9640 return r;
9641
9642 return 0;
9643 }
9644
9645 /**
9646 * Set an attr on an object.
9647 * bucket: name of the bucket holding the object
9648 * obj: name of the object to set the attr on
9649 * name: the attr to set
9650 * bl: the contents of the attr
9651 * Returns: 0 on success, -ERR# otherwise.
9652 */
9653 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9654 {
9655 map<string, bufferlist> attrs;
9656 attrs[name] = bl;
9657 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9658 }
9659
9660 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9661 map<string, bufferlist>& attrs,
9662 map<string, bufferlist>* rmattrs)
9663 {
9664 rgw_rados_ref ref;
9665 int r = get_obj_head_ref(bucket_info, obj, &ref);
9666 if (r < 0) {
9667 return r;
9668 }
9669 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9670
9671 ObjectWriteOperation op;
9672 RGWObjState *state = NULL;
9673
9674 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9675 if (r < 0)
9676 return r;
9677
9678 map<string, bufferlist>::iterator iter;
9679 if (rmattrs) {
9680 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9681 const string& name = iter->first;
9682 op.rmxattr(name.c_str());
9683 }
9684 }
9685
9686 const rgw_bucket& bucket = obj.bucket;
9687
9688 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9689 const string& name = iter->first;
9690 bufferlist& bl = iter->second;
9691
9692 if (!bl.length())
9693 continue;
9694
9695 op.setxattr(name.c_str(), bl);
9696
9697 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9698 real_time ts;
9699 try {
9700 ::decode(ts, bl);
9701
9702 rgw_obj_index_key obj_key;
9703 obj.key.get_index_key(&obj_key);
9704
9705 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9706 } catch (buffer::error& err) {
9707 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9708 }
9709 }
9710 }
9711
9712 if (!op.size())
9713 return 0;
9714
9715 RGWObjectCtx obj_ctx(this);
9716
9717 bufferlist bl;
9718 RGWRados::Bucket bop(this, bucket_info);
9719 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9720
9721 if (state) {
9722 string tag;
9723 append_rand_alpha(cct, tag, tag, 32);
9724 state->write_tag = tag;
9725 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9726
9727 if (r < 0)
9728 return r;
9729
9730 bl.append(tag.c_str(), tag.size() + 1);
9731
9732 op.setxattr(RGW_ATTR_ID_TAG, bl);
9733 }
9734
9735 r = ref.ioctx.operate(ref.oid, &op);
9736 if (state) {
9737 if (r >= 0) {
9738 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9739 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9740 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9741 string etag(etag_bl.c_str(), etag_bl.length());
9742 string content_type(content_type_bl.c_str(), content_type_bl.length());
9743 uint64_t epoch = ref.ioctx.get_last_version();
9744 int64_t poolid = ref.ioctx.get_id();
9745 real_time mtime = real_clock::now();
9746 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9747 mtime, etag, content_type, &acl_bl,
9748 RGW_OBJ_CATEGORY_MAIN, NULL);
9749 } else {
9750 int ret = index_op.cancel();
9751 if (ret < 0) {
9752 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9753 }
9754 }
9755 }
9756 if (r < 0)
9757 return r;
9758
9759 if (state) {
9760 state->obj_tag.swap(bl);
9761 if (rmattrs) {
9762 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9763 state->attrset.erase(iter->first);
9764 }
9765 }
9766 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9767 state->attrset[iter->first] = iter->second;
9768 }
9769 }
9770
9771 return 0;
9772 }
9773
9774 int RGWRados::Object::Read::prepare()
9775 {
9776 RGWRados *store = source->get_store();
9777 CephContext *cct = store->ctx();
9778
9779 bufferlist etag;
9780
9781 map<string, bufferlist>::iterator iter;
9782
9783 RGWObjState *astate;
9784 int r = source->get_state(&astate, true);
9785 if (r < 0)
9786 return r;
9787
9788 if (!astate->exists) {
9789 return -ENOENT;
9790 }
9791
9792 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9793
9794 state.obj = astate->obj;
9795 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9796
9797 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9798 if (r < 0) {
9799 return r;
9800 }
9801 if (params.attrs) {
9802 *params.attrs = astate->attrset;
9803 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9804 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9805 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9806 }
9807 }
9808 }
9809
9810 /* Convert all times go GMT to make them compatible */
9811 if (conds.mod_ptr || conds.unmod_ptr) {
9812 obj_time_weight src_weight;
9813 src_weight.init(astate);
9814 src_weight.high_precision = conds.high_precision_time;
9815
9816 obj_time_weight dest_weight;
9817 dest_weight.high_precision = conds.high_precision_time;
9818
9819 if (conds.mod_ptr) {
9820 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9821 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9822 if (!(dest_weight < src_weight)) {
9823 return -ERR_NOT_MODIFIED;
9824 }
9825 }
9826
9827 if (conds.unmod_ptr) {
9828 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9829 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9830 if (dest_weight < src_weight) {
9831 return -ERR_PRECONDITION_FAILED;
9832 }
9833 }
9834 }
9835 if (conds.if_match || conds.if_nomatch) {
9836 r = get_attr(RGW_ATTR_ETAG, etag);
9837 if (r < 0)
9838 return r;
9839
9840 if (conds.if_match) {
9841 string if_match_str = rgw_string_unquote(conds.if_match);
9842 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9843 if (if_match_str.compare(etag.c_str()) != 0) {
9844 return -ERR_PRECONDITION_FAILED;
9845 }
9846 }
9847
9848 if (conds.if_nomatch) {
9849 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9850 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9851 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9852 return -ERR_NOT_MODIFIED;
9853 }
9854 }
9855 }
9856
9857 if (params.obj_size)
9858 *params.obj_size = astate->size;
9859 if (params.lastmod)
9860 *params.lastmod = astate->mtime;
9861
9862 return 0;
9863 }
9864
9865 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9866 {
9867 if (ofs < 0) {
9868 ofs += obj_size;
9869 if (ofs < 0)
9870 ofs = 0;
9871 end = obj_size - 1;
9872 } else if (end < 0) {
9873 end = obj_size - 1;
9874 }
9875
9876 if (obj_size > 0) {
9877 if (ofs >= (off_t)obj_size) {
9878 return -ERANGE;
9879 }
9880 if (end >= (off_t)obj_size) {
9881 end = obj_size - 1;
9882 }
9883 }
9884 return 0;
9885 }
9886
9887 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9888 {
9889 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9890 }
9891
9892 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9893 RGWRados::SystemObject::Read::GetObjState& state,
9894 rgw_raw_obj& obj,
9895 map<string, bufferlist> *attrs,
9896 real_time *lastmod,
9897 uint64_t *obj_size,
9898 RGWObjVersionTracker *objv_tracker)
9899 {
9900 RGWRawObjState *astate = NULL;
9901
9902 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
9903 if (r < 0)
9904 return r;
9905
9906 if (!astate->exists) {
9907 return -ENOENT;
9908 }
9909
9910 if (attrs) {
9911 *attrs = astate->attrset;
9912 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9913 map<string, bufferlist>::iterator iter;
9914 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
9915 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9916 }
9917 }
9918 }
9919
9920 if (obj_size)
9921 *obj_size = astate->size;
9922 if (lastmod)
9923 *lastmod = astate->mtime;
9924
9925 return 0;
9926 }
9927
9928
9929 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
9930 {
9931 RGWRados *store = target->get_store();
9932 BucketShard *bs;
9933 int r;
9934
9935 #define NUM_RESHARD_RETRIES 10
9936 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
9937 int ret = get_bucket_shard(&bs);
9938 if (ret < 0) {
9939 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9940 return ret;
9941 }
9942 r = call(bs);
9943 if (r != -ERR_BUSY_RESHARDING) {
9944 break;
9945 }
9946 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
9947 string new_bucket_id;
9948 r = store->block_while_resharding(bs, &new_bucket_id);
9949 if (r == -ERR_BUSY_RESHARDING) {
9950 continue;
9951 }
9952 if (r < 0) {
9953 return r;
9954 }
9955 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
9956 i = 0; /* resharding is finished, make sure we can retry */
9957 r = target->update_bucket_id(new_bucket_id);
9958 if (r < 0) {
9959 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
9960 return r;
9961 }
9962 invalidate_bs();
9963 }
9964
9965 if (r < 0) {
9966 return r;
9967 }
9968
9969 if (pbs) {
9970 *pbs = bs;
9971 }
9972
9973 return 0;
9974 }
9975
9976 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
9977 {
9978 RGWRados *store = source->get_store();
9979 rgw_raw_obj& obj = source->get_obj();
9980
9981 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
9982 stat_params.lastmod, stat_params.obj_size, objv_tracker);
9983 }
9984
9985 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
9986 {
9987 if (blind) {
9988 return 0;
9989 }
9990 RGWRados *store = target->get_store();
9991
9992 if (write_tag && write_tag->length()) {
9993 optag = string(write_tag->c_str(), write_tag->length());
9994 } else {
9995 if (optag.empty()) {
9996 append_rand_alpha(store->ctx(), optag, optag, 32);
9997 }
9998 }
9999
10000 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10001 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10002 });
10003
10004 if (r < 0) {
10005 return r;
10006 }
10007 prepared = true;
10008
10009 return 0;
10010 }
10011
10012 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10013 uint64_t size, uint64_t accounted_size,
10014 ceph::real_time& ut, const string& etag,
10015 const string& content_type,
10016 bufferlist *acl_bl,
10017 RGWObjCategory category,
10018 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10019 {
10020 if (blind) {
10021 return 0;
10022 }
10023 RGWRados *store = target->get_store();
10024 BucketShard *bs;
10025
10026 int ret = get_bucket_shard(&bs);
10027 if (ret < 0) {
10028 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10029 return ret;
10030 }
10031
10032 rgw_bucket_dir_entry ent;
10033 obj.key.get_index_key(&ent.key);
10034 ent.meta.size = size;
10035 ent.meta.accounted_size = accounted_size;
10036 ent.meta.mtime = ut;
10037 ent.meta.etag = etag;
10038 if (user_data)
10039 ent.meta.user_data = *user_data;
10040
10041 ACLOwner owner;
10042 if (acl_bl && acl_bl->length()) {
10043 int ret = store->decode_policy(*acl_bl, &owner);
10044 if (ret < 0) {
10045 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10046 }
10047 }
10048 ent.meta.owner = owner.get_id().to_str();
10049 ent.meta.owner_display_name = owner.get_display_name();
10050 ent.meta.content_type = content_type;
10051
10052 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
10053
10054 if (target->bucket_info.datasync_flag_enabled()) {
10055 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10056 if (r < 0) {
10057 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10058 }
10059 }
10060
10061 return ret;
10062 }
10063
10064 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10065 real_time& removed_mtime,
10066 list<rgw_obj_index_key> *remove_objs)
10067 {
10068 if (blind) {
10069 return 0;
10070 }
10071 RGWRados *store = target->get_store();
10072 BucketShard *bs;
10073
10074 int ret = get_bucket_shard(&bs);
10075 if (ret < 0) {
10076 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10077 return ret;
10078 }
10079
10080 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
10081
10082 if (target->bucket_info.datasync_flag_enabled()) {
10083 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10084 if (r < 0) {
10085 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10086 }
10087 }
10088
10089 return ret;
10090 }
10091
10092
10093 int RGWRados::Bucket::UpdateIndex::cancel()
10094 {
10095 if (blind) {
10096 return 0;
10097 }
10098 RGWRados *store = target->get_store();
10099 BucketShard *bs;
10100
10101 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10102 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10103 });
10104
10105 /*
10106 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10107 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10108 * have no way to tell that they're all caught up
10109 */
10110 if (target->bucket_info.datasync_flag_enabled()) {
10111 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10112 if (r < 0) {
10113 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10114 }
10115 }
10116
10117 return ret;
10118 }
10119
10120 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10121 {
10122 RGWRados *store = source->get_store();
10123 CephContext *cct = store->ctx();
10124
10125 rgw_raw_obj read_obj;
10126 uint64_t read_ofs = ofs;
10127 uint64_t len, read_len;
10128 bool reading_from_head = true;
10129 ObjectReadOperation op;
10130
10131 bool merge_bl = false;
10132 bufferlist *pbl = &bl;
10133 bufferlist read_bl;
10134 uint64_t max_chunk_size;
10135
10136 RGWObjState *astate;
10137 int r = source->get_state(&astate, true);
10138 if (r < 0)
10139 return r;
10140
10141 if (end < 0)
10142 len = 0;
10143 else
10144 len = end - ofs + 1;
10145
10146 if (astate->has_manifest && astate->manifest.has_tail()) {
10147 /* now get the relevant object part */
10148 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10149
10150 uint64_t stripe_ofs = iter.get_stripe_ofs();
10151 read_obj = iter.get_location().get_raw_obj(store);
10152 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10153 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10154 reading_from_head = (read_obj == state.head_obj);
10155 } else {
10156 read_obj = state.head_obj;
10157 }
10158
10159 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10160 if (r < 0) {
10161 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10162 return r;
10163 }
10164
10165 if (len > max_chunk_size)
10166 len = max_chunk_size;
10167
10168
10169 state.io_ctx.locator_set_key(read_obj.loc);
10170
10171 read_len = len;
10172
10173 if (reading_from_head) {
10174 /* only when reading from the head object do we need to do the atomic test */
10175 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10176 if (r < 0)
10177 return r;
10178
10179 if (astate && astate->prefetch_data) {
10180 if (!ofs && astate->data.length() >= len) {
10181 bl = astate->data;
10182 return bl.length();
10183 }
10184
10185 if (ofs < astate->data.length()) {
10186 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10187 astate->data.copy(ofs, copy_len, bl);
10188 read_len -= copy_len;
10189 read_ofs += copy_len;
10190 if (!read_len)
10191 return bl.length();
10192
10193 merge_bl = true;
10194 pbl = &read_bl;
10195 }
10196 }
10197 }
10198
10199 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10200 op.read(read_ofs, read_len, pbl, NULL);
10201
10202 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10203 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10204
10205 if (r < 0) {
10206 return r;
10207 }
10208
10209 if (merge_bl) {
10210 bl.append(read_bl);
10211 }
10212
10213 return bl.length();
10214 }
10215
10216 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10217 {
10218 if (!has_ref) {
10219 int r = store->get_raw_obj_ref(obj, &ref);
10220 if (r < 0) {
10221 return r;
10222 }
10223 has_ref = true;
10224 }
10225 *pref = &ref;
10226 return 0;
10227
10228 }
10229
10230 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10231 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10232 bufferlist& bl, off_t ofs, off_t end,
10233 map<string, bufferlist> *attrs,
10234 rgw_cache_entry_info *cache_info)
10235 {
10236 uint64_t len;
10237 ObjectReadOperation op;
10238
10239 if (end < 0)
10240 len = 0;
10241 else
10242 len = end - ofs + 1;
10243
10244 if (objv_tracker) {
10245 objv_tracker->prepare_op_for_read(&op);
10246 }
10247
10248 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10249 op.read(ofs, len, &bl, NULL);
10250
10251 if (attrs) {
10252 op.getxattrs(attrs, NULL);
10253 }
10254
10255 rgw_rados_ref *ref;
10256 int r = read_state.get_ref(this, obj, &ref);
10257 if (r < 0) {
10258 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10259 return r;
10260 }
10261 r = ref->ioctx.operate(ref->oid, &op, NULL);
10262 if (r < 0) {
10263 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10264 return r;
10265 }
10266 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10267
10268 uint64_t op_ver = ref->ioctx.get_last_version();
10269
10270 if (read_state.last_ver > 0 &&
10271 read_state.last_ver != op_ver) {
10272 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10273 return -ECANCELED;
10274 }
10275
10276 read_state.last_ver = op_ver;
10277
10278 return bl.length();
10279 }
10280
10281 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl, RGWObjVersionTracker *objv_tracker)
10282 {
10283 RGWRados *store = source->get_store();
10284 rgw_raw_obj& obj = source->get_obj();
10285
10286 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl, ofs, end, read_params.attrs, read_params.cache_info);
10287 }
10288
10289 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10290 {
10291 RGWRados *store = source->get_store();
10292 rgw_raw_obj& obj = source->get_obj();
10293
10294 return store->system_obj_get_attr(obj, name, dest);
10295 }
10296
10297 struct get_obj_data;
10298
10299 struct get_obj_aio_data {
10300 struct get_obj_data *op_data;
10301 off_t ofs;
10302 off_t len;
10303 };
10304
10305 struct get_obj_io {
10306 off_t len;
10307 bufferlist bl;
10308 };
10309
10310 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10311
10312 struct get_obj_data : public RefCountedObject {
10313 CephContext *cct;
10314 RGWRados *rados;
10315 RGWObjectCtx *ctx;
10316 IoCtx io_ctx;
10317 map<off_t, get_obj_io> io_map;
10318 map<off_t, librados::AioCompletion *> completion_map;
10319 uint64_t total_read;
10320 Mutex lock;
10321 Mutex data_lock;
10322 list<get_obj_aio_data> aio_data;
10323 RGWGetDataCB *client_cb;
10324 std::atomic<bool> cancelled = { false };
10325 std::atomic<int64_t> err_code = { 0 };
10326 Throttle throttle;
10327 list<bufferlist> read_list;
10328
10329 explicit get_obj_data(CephContext *_cct)
10330 : cct(_cct),
10331 rados(NULL), ctx(NULL),
10332 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10333 client_cb(NULL),
10334 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10335 ~get_obj_data() override { }
10336 void set_cancelled(int r) {
10337 cancelled = true;
10338 err_code = r;
10339 }
10340
10341 bool is_cancelled() {
10342 return cancelled;
10343 }
10344
10345 int get_err_code() {
10346 return err_code;
10347 }
10348
10349 int wait_next_io(bool *done) {
10350 lock.Lock();
10351 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10352 if (iter == completion_map.end()) {
10353 *done = true;
10354 lock.Unlock();
10355 return 0;
10356 }
10357 off_t cur_ofs = iter->first;
10358 librados::AioCompletion *c = iter->second;
10359 lock.Unlock();
10360
10361 c->wait_for_safe_and_cb();
10362 int r = c->get_return_value();
10363
10364 lock.Lock();
10365 completion_map.erase(cur_ofs);
10366
10367 if (completion_map.empty()) {
10368 *done = true;
10369 }
10370 lock.Unlock();
10371
10372 c->release();
10373
10374 return r;
10375 }
10376
10377 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10378 Mutex::Locker l(lock);
10379
10380 const auto& io_iter = io_map.insert(
10381 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10382
10383 assert(io_iter.second); // assert new insertion
10384
10385 get_obj_io& io = (io_iter.first)->second;
10386 *pbl = &io.bl;
10387
10388 struct get_obj_aio_data aio;
10389 aio.ofs = ofs;
10390 aio.len = len;
10391 aio.op_data = this;
10392
10393 aio_data.push_back(aio);
10394
10395 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10396
10397 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10398 completion_map[ofs] = c;
10399
10400 *pc = c;
10401
10402 /* we have a reference per IO, plus one reference for the calling function.
10403 * reference is dropped for each callback, plus when we're done iterating
10404 * over the parts */
10405 get();
10406 }
10407
10408 void cancel_io(off_t ofs) {
10409 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10410 lock.Lock();
10411 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10412 if (iter != completion_map.end()) {
10413 AioCompletion *c = iter->second;
10414 c->release();
10415 completion_map.erase(ofs);
10416 io_map.erase(ofs);
10417 }
10418 lock.Unlock();
10419
10420 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10421 * need IoCtx to live, as io callback may still be called
10422 */
10423 }
10424
10425 void cancel_all_io() {
10426 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10427 Mutex::Locker l(lock);
10428 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10429 iter != completion_map.end(); ++iter) {
10430 librados::AioCompletion *c = iter->second;
10431 c->release();
10432 }
10433 }
10434
10435 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10436 Mutex::Locker l(lock);
10437
10438 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10439
10440 if (liter == io_map.end() ||
10441 liter->first != ofs) {
10442 return 0;
10443 }
10444
10445 map<off_t, librados::AioCompletion *>::iterator aiter;
10446 aiter = completion_map.find(ofs);
10447 if (aiter == completion_map.end()) {
10448 /* completion map does not hold this io, it was cancelled */
10449 return 0;
10450 }
10451
10452 AioCompletion *completion = aiter->second;
10453 int r = completion->get_return_value();
10454 if (r < 0)
10455 return r;
10456
10457 for (; aiter != completion_map.end(); ++aiter) {
10458 completion = aiter->second;
10459 if (!completion->is_safe()) {
10460 /* reached a request that is not yet complete, stop */
10461 break;
10462 }
10463
10464 r = completion->get_return_value();
10465 if (r < 0) {
10466 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10467 return r;
10468 }
10469
10470 total_read += r;
10471
10472 map<off_t, get_obj_io>::iterator old_liter = liter++;
10473 bl_list.push_back(old_liter->second.bl);
10474 io_map.erase(old_liter);
10475 }
10476
10477 return 0;
10478 }
10479 };
10480
10481 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10482 {
10483 struct get_obj_data *d = (struct get_obj_data *)arg;
10484
10485 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10486 }
10487
10488 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10489 {
10490 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10491 struct get_obj_data *d = aio_data->op_data;
10492
10493 d->rados->get_obj_aio_completion_cb(cb, arg);
10494 }
10495
10496
10497 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10498 {
10499 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10500 struct get_obj_data *d = aio_data->op_data;
10501 off_t ofs = aio_data->ofs;
10502 off_t len = aio_data->len;
10503
10504 list<bufferlist> bl_list;
10505 list<bufferlist>::iterator iter;
10506 int r;
10507
10508 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10509 d->throttle.put(len);
10510
10511 r = rados_aio_get_return_value(c);
10512 if (r < 0) {
10513 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10514 d->set_cancelled(r);
10515 goto done;
10516 }
10517
10518 if (d->is_cancelled()) {
10519 goto done;
10520 }
10521
10522 d->data_lock.Lock();
10523
10524 r = d->get_complete_ios(ofs, bl_list);
10525 if (r < 0) {
10526 goto done_unlock;
10527 }
10528
10529 d->read_list.splice(d->read_list.end(), bl_list);
10530
10531 done_unlock:
10532 d->data_lock.Unlock();
10533 done:
10534 d->put();
10535 return;
10536 }
10537
10538 int RGWRados::flush_read_list(struct get_obj_data *d)
10539 {
10540 d->data_lock.Lock();
10541 list<bufferlist> l;
10542 l.swap(d->read_list);
10543 d->get();
10544 d->read_list.clear();
10545
10546 d->data_lock.Unlock();
10547
10548 int r = 0;
10549
10550 list<bufferlist>::iterator iter;
10551 for (iter = l.begin(); iter != l.end(); ++iter) {
10552 bufferlist& bl = *iter;
10553 r = d->client_cb->handle_data(bl, 0, bl.length());
10554 if (r < 0) {
10555 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10556 break;
10557 }
10558 }
10559
10560 d->data_lock.Lock();
10561 d->put();
10562 if (r < 0) {
10563 d->set_cancelled(r);
10564 }
10565 d->data_lock.Unlock();
10566 return r;
10567 }
10568
10569 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10570 const RGWBucketInfo& bucket_info,
10571 const rgw_obj& obj,
10572 const rgw_raw_obj& read_obj,
10573 off_t obj_ofs,
10574 off_t read_ofs, off_t len,
10575 bool is_head_obj, void *arg)
10576 {
10577 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10578 ObjectReadOperation op;
10579 struct get_obj_data *d = (struct get_obj_data *)arg;
10580 string oid, key;
10581 bufferlist *pbl;
10582 AioCompletion *c;
10583
10584 int r;
10585
10586 if (is_head_obj) {
10587 /* only when reading from the head object do we need to do the atomic test */
10588 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10589 if (r < 0)
10590 return r;
10591
10592 if (astate &&
10593 obj_ofs < astate->data.length()) {
10594 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10595
10596 d->data_lock.Lock();
10597 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10598 d->data_lock.Unlock();
10599 if (r < 0)
10600 return r;
10601
10602 d->lock.Lock();
10603 d->total_read += chunk_len;
10604 d->lock.Unlock();
10605
10606 len -= chunk_len;
10607 read_ofs += chunk_len;
10608 obj_ofs += chunk_len;
10609 if (!len)
10610 return 0;
10611 }
10612 }
10613
10614 d->throttle.get(len);
10615 if (d->is_cancelled()) {
10616 return d->get_err_code();
10617 }
10618
10619 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10620 * cleaning up
10621 */
10622 d->add_io(obj_ofs, len, &pbl, &c);
10623
10624 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10625 op.read(read_ofs, len, pbl, NULL);
10626
10627 librados::IoCtx io_ctx(d->io_ctx);
10628 io_ctx.locator_set_key(read_obj.loc);
10629
10630 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10631 if (r < 0) {
10632 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10633 goto done_err;
10634 }
10635
10636 // Flush data to client if there is any
10637 r = flush_read_list(d);
10638 if (r < 0)
10639 return r;
10640
10641 return 0;
10642
10643 done_err:
10644 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10645 d->set_cancelled(r);
10646 d->cancel_io(obj_ofs);
10647
10648 return r;
10649 }
10650
10651 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10652 {
10653 RGWRados *store = source->get_store();
10654 CephContext *cct = store->ctx();
10655
10656 struct get_obj_data *data = new get_obj_data(cct);
10657 bool done = false;
10658
10659 RGWObjectCtx& obj_ctx = source->get_ctx();
10660
10661 data->rados = store;
10662 data->io_ctx.dup(state.io_ctx);
10663 data->client_cb = cb;
10664
10665 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10666 if (r < 0) {
10667 data->cancel_all_io();
10668 goto done;
10669 }
10670
10671 while (!done) {
10672 r = data->wait_next_io(&done);
10673 if (r < 0) {
10674 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10675 data->cancel_all_io();
10676 break;
10677 }
10678 r = store->flush_read_list(data);
10679 if (r < 0) {
10680 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10681 data->cancel_all_io();
10682 break;
10683 }
10684 }
10685
10686 done:
10687 data->put();
10688 return r;
10689 }
10690
10691 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10692 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10693 off_t ofs, off_t end,
10694 uint64_t max_chunk_size,
10695 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10696 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10697 RGWObjState *, void *),
10698 void *arg)
10699 {
10700 rgw_raw_obj head_obj;
10701 rgw_raw_obj read_obj;
10702 uint64_t read_ofs = ofs;
10703 uint64_t len;
10704 bool reading_from_head = true;
10705 RGWObjState *astate = NULL;
10706
10707 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10708
10709 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10710 if (r < 0) {
10711 return r;
10712 }
10713
10714 if (end < 0)
10715 len = 0;
10716 else
10717 len = end - ofs + 1;
10718
10719 if (astate->has_manifest) {
10720 /* now get the relevant object stripe */
10721 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10722
10723 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10724
10725 for (; iter != obj_end && ofs <= end; ++iter) {
10726 off_t stripe_ofs = iter.get_stripe_ofs();
10727 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10728
10729 while (ofs < next_stripe_ofs && ofs <= end) {
10730 read_obj = iter.get_location().get_raw_obj(this);
10731 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10732 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10733
10734 if (read_len > max_chunk_size) {
10735 read_len = max_chunk_size;
10736 }
10737
10738 reading_from_head = (read_obj == head_obj);
10739 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10740 if (r < 0) {
10741 return r;
10742 }
10743
10744 len -= read_len;
10745 ofs += read_len;
10746 }
10747 }
10748 } else {
10749 while (ofs <= end) {
10750 read_obj = head_obj;
10751 uint64_t read_len = min(len, max_chunk_size);
10752
10753 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10754 if (r < 0) {
10755 return r;
10756 }
10757
10758 len -= read_len;
10759 ofs += read_len;
10760 }
10761 }
10762
10763 return 0;
10764 }
10765
10766 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10767 {
10768 rgw_rados_ref ref;
10769 int r = get_obj_head_ref(bucket_info, obj, &ref);
10770 if (r < 0) {
10771 return r;
10772 }
10773
10774 return ref.ioctx.operate(ref.oid, op);
10775 }
10776
10777 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10778 {
10779 rgw_rados_ref ref;
10780 int r = get_obj_head_ref(bucket_info, obj, &ref);
10781 if (r < 0) {
10782 return r;
10783 }
10784
10785 bufferlist outbl;
10786
10787 return ref.ioctx.operate(ref.oid, op, &outbl);
10788 }
10789
10790 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10791 {
10792 ObjectWriteOperation op;
10793
10794 assert(olh_obj.key.instance.empty());
10795
10796 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10797
10798 if (!state.exists) {
10799 op.create(true);
10800 } else {
10801 op.assert_exists();
10802 }
10803
10804 /*
10805 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10806 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10807 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10808 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10809 * log will reflect that.
10810 *
10811 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10812 * is used for object data instance, olh_tag for olh instance.
10813 */
10814 if (has_tag) {
10815 /* guard against racing writes */
10816 bucket_index_guard_olh_op(state, op);
10817 }
10818
10819 if (!has_tag) {
10820 /* obj tag */
10821 string obj_tag;
10822 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10823 if (ret < 0) {
10824 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10825 return ret;
10826 }
10827 bufferlist bl;
10828 bl.append(obj_tag.c_str(), obj_tag.size());
10829 op.setxattr(RGW_ATTR_ID_TAG, bl);
10830
10831 state.attrset[RGW_ATTR_ID_TAG] = bl;
10832 state.obj_tag = bl;
10833
10834 /* olh tag */
10835 string olh_tag;
10836 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10837 if (ret < 0) {
10838 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10839 return ret;
10840 }
10841 bufferlist olh_bl;
10842 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10843 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10844
10845 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10846 state.olh_tag = olh_bl;
10847 state.is_olh = true;
10848
10849 bufferlist verbl;
10850 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10851 }
10852
10853 bufferlist bl;
10854 RGWOLHPendingInfo pending_info;
10855 pending_info.time = real_clock::now();
10856 ::encode(pending_info, bl);
10857
10858 #define OLH_PENDING_TAG_LEN 32
10859 /* tag will start with current time epoch, this so that entries are sorted by time */
10860 char buf[32];
10861 utime_t ut(pending_info.time);
10862 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10863 *op_tag = buf;
10864
10865 string s;
10866 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10867 if (ret < 0) {
10868 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10869 return ret;
10870 }
10871 op_tag->append(s);
10872
10873 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10874 attr_name.append(*op_tag);
10875
10876 op.setxattr(attr_name.c_str(), bl);
10877
10878 ret = obj_operate(bucket_info, olh_obj, &op);
10879 if (ret < 0) {
10880 return ret;
10881 }
10882
10883 state.exists = true;
10884 state.attrset[attr_name] = bl;
10885
10886 return 0;
10887 }
10888
10889 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
10890 {
10891 int ret;
10892
10893 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
10894 if (ret == -EEXIST) {
10895 ret = -ECANCELED;
10896 }
10897
10898 return ret;
10899 }
10900
10901 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
10902 {
10903 rgw_obj obj;
10904 const rgw_obj *pobj = &obj_instance;
10905 int r;
10906
10907 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10908 r = bs->init(pobj->bucket, *pobj);
10909 if (r < 0) {
10910 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
10911 return r;
10912 }
10913 r = call(bs);
10914 if (r != -ERR_BUSY_RESHARDING) {
10915 break;
10916 }
10917 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10918 string new_bucket_id;
10919 r = block_while_resharding(bs, &new_bucket_id);
10920 if (r == -ERR_BUSY_RESHARDING) {
10921 continue;
10922 }
10923 if (r < 0) {
10924 return r;
10925 }
10926 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10927 i = 0; /* resharding is finished, make sure we can retry */
10928
10929 obj = *pobj;
10930 obj.bucket.update_bucket_id(new_bucket_id);
10931 pobj = &obj;
10932 }
10933
10934 if (r < 0) {
10935 return r;
10936 }
10937
10938 return 0;
10939 }
10940
10941 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
10942 {
10943 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
10944
10945 return waiter->block_while_resharding(bs, new_bucket_id);
10946 }
10947
10948 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
10949 bool delete_marker,
10950 const string& op_tag,
10951 struct rgw_bucket_dir_entry_meta *meta,
10952 uint64_t olh_epoch,
10953 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
10954 {
10955 rgw_rados_ref ref;
10956 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10957 if (r < 0) {
10958 return r;
10959 }
10960
10961 rgw_zone_set zones_trace;
10962 if (_zones_trace) {
10963 zones_trace = *_zones_trace;
10964 } else {
10965 zones_trace.insert(get_zone().id);
10966 }
10967
10968 BucketShard bs(this);
10969
10970 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10971 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10972 librados::ObjectWriteOperation op;
10973 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10974 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
10975 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
10976 unmod_since, high_precision_time,
10977 get_zone().log_data, zones_trace);
10978 });
10979 if (r < 0) {
10980 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
10981 return r;
10982 }
10983
10984 return 0;
10985 }
10986
10987 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
10988 {
10989 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
10990 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
10991 }
10992
10993 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
10994 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
10995 {
10996 rgw_rados_ref ref;
10997 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10998 if (r < 0) {
10999 return r;
11000 }
11001
11002 rgw_zone_set zones_trace;
11003 if (_zones_trace) {
11004 zones_trace = *_zones_trace;
11005 }
11006 zones_trace.insert(get_zone().id);
11007
11008 BucketShard bs(this);
11009
11010 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11011 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11012 librados::ObjectWriteOperation op;
11013 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11014 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11015 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
11016 });
11017 if (r < 0) {
11018 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11019 return r;
11020 }
11021
11022 return 0;
11023 }
11024
11025 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11026 const rgw_obj& obj_instance, uint64_t ver_marker,
11027 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11028 bool *is_truncated)
11029 {
11030 rgw_rados_ref ref;
11031 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11032 if (r < 0) {
11033 return r;
11034 }
11035
11036 BucketShard bs(this);
11037 int ret = bs.init(obj_instance.bucket, obj_instance);
11038 if (ret < 0) {
11039 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11040 return ret;
11041 }
11042
11043 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11044
11045 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11046
11047 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11048 ObjectReadOperation op;
11049 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11050 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11051 key, ver_marker, olh_tag, log, is_truncated);
11052 });
11053 if (ret < 0) {
11054 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
11055 return ret;
11056 }
11057
11058 return 0;
11059 }
11060
11061 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11062 {
11063 rgw_rados_ref ref;
11064 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11065 if (r < 0) {
11066 return r;
11067 }
11068
11069 BucketShard bs(this);
11070 int ret = bs.init(obj_instance.bucket, obj_instance);
11071 if (ret < 0) {
11072 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11073 return ret;
11074 }
11075
11076 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11077
11078 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11079
11080 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11081 ObjectWriteOperation op;
11082 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11083 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11084 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
11085 });
11086 if (ret < 0) {
11087 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
11088 return ret;
11089 }
11090
11091 return 0;
11092 }
11093
11094 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11095 {
11096 rgw_rados_ref ref;
11097 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11098 if (r < 0) {
11099 return r;
11100 }
11101
11102 BucketShard bs(this);
11103
11104 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11105
11106 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11107
11108 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11109 ObjectWriteOperation op;
11110 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11111 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11112 });
11113 if (ret < 0) {
11114 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11115 return ret;
11116 }
11117
11118 return 0;
11119 }
11120
11121 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11122 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
11123 uint64_t *plast_ver, rgw_zone_set* zones_trace)
11124 {
11125 if (log.empty()) {
11126 return 0;
11127 }
11128
11129 librados::ObjectWriteOperation op;
11130
11131 uint64_t last_ver = log.rbegin()->first;
11132 *plast_ver = last_ver;
11133
11134 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11135
11136 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11137 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11138
11139 bool need_to_link = false;
11140 cls_rgw_obj_key key;
11141 bool delete_marker = false;
11142 list<cls_rgw_obj_key> remove_instances;
11143 bool need_to_remove = false;
11144
11145 for (iter = log.begin(); iter != log.end(); ++iter) {
11146 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11147 for (; viter != iter->second.end(); ++viter) {
11148 rgw_bucket_olh_log_entry& entry = *viter;
11149
11150 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11151 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11152 << (entry.delete_marker ? "(delete)" : "") << dendl;
11153 switch (entry.op) {
11154 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11155 remove_instances.push_back(entry.key);
11156 break;
11157 case CLS_RGW_OLH_OP_LINK_OLH:
11158 need_to_link = true;
11159 need_to_remove = false;
11160 key = entry.key;
11161 delete_marker = entry.delete_marker;
11162 break;
11163 case CLS_RGW_OLH_OP_UNLINK_OLH:
11164 need_to_remove = true;
11165 need_to_link = false;
11166 break;
11167 default:
11168 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11169 return -EIO;
11170 }
11171 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11172 attr_name.append(entry.op_tag);
11173 op.rmxattr(attr_name.c_str());
11174 }
11175 }
11176
11177 rgw_rados_ref ref;
11178 int r = get_obj_head_ref(bucket_info, obj, &ref);
11179 if (r < 0) {
11180 return r;
11181 }
11182
11183 const rgw_bucket& bucket = obj.bucket;
11184
11185 if (need_to_link) {
11186 rgw_obj target(bucket, key);
11187 RGWOLHInfo info;
11188 info.target = target;
11189 info.removed = delete_marker;
11190 bufferlist bl;
11191 ::encode(info, bl);
11192 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11193 }
11194
11195 /* first remove object instances */
11196 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11197 liter != remove_instances.end(); ++liter) {
11198 cls_rgw_obj_key& key = *liter;
11199 rgw_obj obj_instance(bucket, key);
11200 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11201 if (ret < 0 && ret != -ENOENT) {
11202 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11203 return ret;
11204 }
11205 }
11206
11207 /* update olh object */
11208 r = ref.ioctx.operate(ref.oid, &op);
11209 if (r == -ECANCELED) {
11210 r = 0;
11211 }
11212 if (r < 0) {
11213 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11214 return r;
11215 }
11216
11217 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11218 if (r < 0) {
11219 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11220 return r;
11221 }
11222
11223 if (need_to_remove) {
11224 ObjectWriteOperation rm_op;
11225
11226 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11227 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11228 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11229 rm_op.remove();
11230
11231 r = ref.ioctx.operate(ref.oid, &rm_op);
11232 if (r == -ECANCELED) {
11233 return 0; /* someone else won this race */
11234 } else {
11235 /*
11236 * only clear if was successful, otherwise we might clobber pending operations on this object
11237 */
11238 r = bucket_index_clear_olh(bucket_info, state, obj);
11239 if (r < 0) {
11240 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11241 return r;
11242 }
11243 }
11244 }
11245
11246 return 0;
11247 }
11248
11249 /*
11250 * read olh log and apply it
11251 */
11252 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11253 {
11254 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11255 bool is_truncated;
11256 uint64_t ver_marker = 0;
11257
11258 do {
11259 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11260 if (ret < 0) {
11261 return ret;
11262 }
11263 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11264 if (ret < 0) {
11265 return ret;
11266 }
11267 } while (is_truncated);
11268
11269 return 0;
11270 }
11271
11272 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11273 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11274 {
11275 string op_tag;
11276
11277 rgw_obj olh_obj = target_obj;
11278 olh_obj.key.instance.clear();
11279
11280 RGWObjState *state = NULL;
11281
11282 int ret = 0;
11283 int i;
11284
11285 #define MAX_ECANCELED_RETRY 100
11286 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11287 if (ret == -ECANCELED) {
11288 obj_ctx.obj.invalidate(olh_obj);
11289 }
11290
11291 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11292 if (ret < 0) {
11293 return ret;
11294 }
11295
11296 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11297 if (ret < 0) {
11298 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11299 if (ret == -ECANCELED) {
11300 continue;
11301 }
11302 return ret;
11303 }
11304 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11305 if (ret < 0) {
11306 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11307 if (ret == -ECANCELED) {
11308 continue;
11309 }
11310 return ret;
11311 }
11312 break;
11313 }
11314
11315 if (i == MAX_ECANCELED_RETRY) {
11316 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11317 return -EIO;
11318 }
11319
11320 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11321 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11322 ret = 0;
11323 }
11324 if (ret < 0) {
11325 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11326 return ret;
11327 }
11328
11329 return 0;
11330 }
11331
11332 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11333 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11334 {
11335 string op_tag;
11336
11337 rgw_obj olh_obj = target_obj;
11338 olh_obj.key.instance.clear();
11339
11340 RGWObjState *state = NULL;
11341
11342 int ret = 0;
11343 int i;
11344
11345 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11346 if (ret == -ECANCELED) {
11347 obj_ctx.obj.invalidate(olh_obj);
11348 }
11349
11350 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11351 if (ret < 0)
11352 return ret;
11353
11354 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11355 if (ret < 0) {
11356 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11357 if (ret == -ECANCELED) {
11358 continue;
11359 }
11360 return ret;
11361 }
11362
11363 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11364
11365 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11366 if (ret < 0) {
11367 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11368 if (ret == -ECANCELED) {
11369 continue;
11370 }
11371 return ret;
11372 }
11373 break;
11374 }
11375
11376 if (i == MAX_ECANCELED_RETRY) {
11377 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11378 return -EIO;
11379 }
11380
11381 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11382 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11383 return 0;
11384 }
11385 if (ret < 0) {
11386 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11387 return ret;
11388 }
11389
11390 return 0;
11391 }
11392
11393 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11394 {
11395 #define OBJ_INSTANCE_LEN 32
11396 char buf[OBJ_INSTANCE_LEN + 1];
11397
11398 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11399 no underscore for instance name due to the way we encode the raw keys */
11400
11401 target_obj->key.set_instance(buf);
11402 }
11403
11404 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11405 map<string, bufferlist> *attrset)
11406 {
11407 attrset->clear();
11408 map<string, bufferlist>::iterator iter;
11409 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11410 iter != unfiltered_attrset.end(); ++iter) {
11411 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11412 break;
11413 (*attrset)[iter->first] = iter->second;
11414 }
11415 }
11416
11417 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11418 {
11419 map<string, bufferlist> unfiltered_attrset;
11420
11421 ObjectReadOperation op;
11422 op.getxattrs(&unfiltered_attrset, NULL);
11423
11424 bufferlist outbl;
11425 int r = obj_operate(bucket_info, obj, &op);
11426
11427 if (r < 0) {
11428 return r;
11429 }
11430 map<string, bufferlist> attrset;
11431
11432 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11433
11434 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11435 if (iter == attrset.end()) { /* not an olh */
11436 return -EINVAL;
11437 }
11438
11439 try {
11440 bufferlist::iterator biter = iter->second.begin();
11441 ::decode(*olh, biter);
11442 } catch (buffer::error& err) {
11443 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11444 return -EIO;
11445 }
11446
11447 return 0;
11448 }
11449
11450 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11451 map<string, bufferlist> *rm_pending_entries)
11452 {
11453 map<string, bufferlist>::iterator iter = pending_entries.begin();
11454
11455 real_time now = real_clock::now();
11456
11457 while (iter != pending_entries.end()) {
11458 bufferlist::iterator biter = iter->second.begin();
11459 RGWOLHPendingInfo pending_info;
11460 try {
11461 ::decode(pending_info, biter);
11462 } catch (buffer::error& err) {
11463 /* skipping bad entry, we could remove it but it might hide a bug */
11464 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11465 ++iter;
11466 continue;
11467 }
11468
11469 map<string, bufferlist>::iterator cur_iter = iter;
11470 ++iter;
11471 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11472 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11473 pending_entries.erase(cur_iter);
11474 } else {
11475 /* entries names are sorted by time (rounded to a second) */
11476 break;
11477 }
11478 }
11479 }
11480
11481 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11482 {
11483 ObjectWriteOperation op;
11484
11485 bucket_index_guard_olh_op(state, op);
11486
11487 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11488 op.rmxattr(iter->first.c_str());
11489 }
11490
11491 rgw_rados_ref ref;
11492 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11493 if (r < 0) {
11494 return r;
11495 }
11496
11497 /* update olh object */
11498 r = ref.ioctx.operate(ref.oid, &op);
11499 if (r == -ENOENT || r == -ECANCELED) {
11500 /* raced with some other change, shouldn't sweat about it */
11501 r = 0;
11502 }
11503 if (r < 0) {
11504 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11505 return r;
11506 }
11507
11508 return 0;
11509 }
11510
11511 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11512 {
11513 map<string, bufferlist> pending_entries;
11514 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11515
11516 map<string, bufferlist> rm_pending_entries;
11517 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11518
11519 if (!rm_pending_entries.empty()) {
11520 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11521 if (ret < 0) {
11522 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11523 return ret;
11524 }
11525 }
11526 if (!pending_entries.empty()) {
11527 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11528
11529 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11530 if (ret < 0) {
11531 return ret;
11532 }
11533 }
11534
11535 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11536 assert(iter != state->attrset.end());
11537 RGWOLHInfo olh;
11538 try {
11539 bufferlist::iterator biter = iter->second.begin();
11540 ::decode(olh, biter);
11541 } catch (buffer::error& err) {
11542 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11543 return -EIO;
11544 }
11545
11546 if (olh.removed) {
11547 return -ENOENT;
11548 }
11549
11550 *target = olh.target;
11551
11552 return 0;
11553 }
11554
11555 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11556 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11557 RGWObjVersionTracker *objv_tracker)
11558 {
11559 rgw_rados_ref ref;
11560 int r = get_raw_obj_ref(obj, &ref);
11561 if (r < 0) {
11562 return r;
11563 }
11564
11565 map<string, bufferlist> unfiltered_attrset;
11566 uint64_t size = 0;
11567 struct timespec mtime_ts;
11568
11569 ObjectReadOperation op;
11570 if (objv_tracker) {
11571 objv_tracker->prepare_op_for_read(&op);
11572 }
11573 if (attrs) {
11574 op.getxattrs(&unfiltered_attrset, NULL);
11575 }
11576 if (psize || pmtime) {
11577 op.stat2(&size, &mtime_ts, NULL);
11578 }
11579 if (first_chunk) {
11580 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11581 }
11582 bufferlist outbl;
11583 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11584
11585 if (epoch) {
11586 *epoch = ref.ioctx.get_last_version();
11587 }
11588
11589 if (r < 0)
11590 return r;
11591
11592 if (psize)
11593 *psize = size;
11594 if (pmtime)
11595 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11596 if (attrs) {
11597 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11598 }
11599
11600 return 0;
11601 }
11602
11603 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11604 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
11605 {
11606 map<string, rgw_bucket_dir_header> headers;
11607 map<int, string> bucket_instance_ids;
11608 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11609 if (r < 0) {
11610 return r;
11611 }
11612
11613 assert(headers.size() == bucket_instance_ids.size());
11614
11615 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11616 map<int, string>::iterator viter = bucket_instance_ids.begin();
11617 BucketIndexShardsManager ver_mgr;
11618 BucketIndexShardsManager master_ver_mgr;
11619 BucketIndexShardsManager marker_mgr;
11620 char buf[64];
11621 for(; iter != headers.end(); ++iter, ++viter) {
11622 accumulate_raw_stats(iter->second, stats);
11623 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11624 ver_mgr.add(viter->first, string(buf));
11625 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11626 master_ver_mgr.add(viter->first, string(buf));
11627 if (shard_id >= 0) {
11628 *max_marker = iter->second.max_marker;
11629 } else {
11630 marker_mgr.add(viter->first, iter->second.max_marker);
11631 }
11632 if (syncstopped != NULL)
11633 *syncstopped = iter->second.syncstopped;
11634 }
11635 ver_mgr.to_string(bucket_ver);
11636 master_ver_mgr.to_string(master_ver);
11637 if (shard_id < 0) {
11638 marker_mgr.to_string(max_marker);
11639 }
11640 return 0;
11641 }
11642
11643 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11644 map<int, string>& markers)
11645 {
11646 map<string, rgw_bucket_dir_header> headers;
11647 map<int, string> bucket_instance_ids;
11648 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11649 if (r < 0)
11650 return r;
11651
11652 assert(headers.size() == bucket_instance_ids.size());
11653
11654 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11655 map<int, string>::iterator viter = bucket_instance_ids.begin();
11656
11657 for(; iter != headers.end(); ++iter, ++viter) {
11658 if (shard_id >= 0) {
11659 markers[shard_id] = iter->second.max_marker;
11660 } else {
11661 markers[viter->first] = iter->second.max_marker;
11662 }
11663 }
11664 return 0;
11665 }
11666
11667 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11668 RGWGetBucketStats_CB *cb;
11669 uint32_t pendings;
11670 map<RGWObjCategory, RGWStorageStats> stats;
11671 int ret_code;
11672 bool should_cb;
11673 Mutex lock;
11674
11675 public:
11676 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11677 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11678 lock("RGWGetBucketStatsContext") {}
11679
11680 void handle_response(int r, rgw_bucket_dir_header& header) override {
11681 Mutex::Locker l(lock);
11682 if (should_cb) {
11683 if ( r >= 0) {
11684 accumulate_raw_stats(header, stats);
11685 } else {
11686 ret_code = r;
11687 }
11688
11689 // Are we all done?
11690 if (--pendings == 0) {
11691 if (!ret_code) {
11692 cb->set_response(&stats);
11693 }
11694 cb->handle_response(ret_code);
11695 cb->put();
11696 }
11697 }
11698 }
11699
11700 void unset_cb() {
11701 Mutex::Locker l(lock);
11702 should_cb = false;
11703 }
11704 };
11705
11706 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11707 {
11708 int num_aio = 0;
11709 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11710 assert(get_ctx);
11711 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11712 if (r < 0) {
11713 ctx->put();
11714 if (num_aio) {
11715 get_ctx->unset_cb();
11716 }
11717 }
11718 get_ctx->put();
11719 return r;
11720 }
11721
11722 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11723 RGWGetUserStats_CB *cb;
11724
11725 public:
11726 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11727 : cb(cb) {}
11728
11729 void handle_response(int r, cls_user_header& header) override {
11730 const cls_user_stats& hs = header.stats;
11731 if (r >= 0) {
11732 RGWStorageStats stats;
11733
11734 stats.size = hs.total_bytes;
11735 stats.size_rounded = hs.total_bytes_rounded;
11736 stats.num_objects = hs.total_entries;
11737
11738 cb->set_response(stats);
11739 }
11740
11741 cb->handle_response(r);
11742
11743 cb->put();
11744 }
11745 };
11746
11747 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11748 {
11749 string user_str = user.to_str();
11750
11751 cls_user_header header;
11752 int r = cls_user_get_header(user_str, &header);
11753 if (r < 0)
11754 return r;
11755
11756 const cls_user_stats& hs = header.stats;
11757
11758 stats.size = hs.total_bytes;
11759 stats.size_rounded = hs.total_bytes_rounded;
11760 stats.num_objects = hs.total_entries;
11761
11762 return 0;
11763 }
11764
11765 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11766 {
11767 string user_str = user.to_str();
11768
11769 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11770 int r = cls_user_get_header_async(user_str, get_ctx);
11771 if (r < 0) {
11772 ctx->put();
11773 delete get_ctx;
11774 return r;
11775 }
11776
11777 return 0;
11778 }
11779
11780 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11781 {
11782 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11783 }
11784
11785 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11786 {
11787 if (!bucket.oid.empty()) {
11788 obj.init(get_zone_params().domain_root, bucket.oid);
11789 } else {
11790 string oid;
11791 get_bucket_meta_oid(bucket, oid);
11792 obj.init(get_zone_params().domain_root, oid);
11793 }
11794 }
11795
11796 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11797 real_time *pmtime, map<string, bufferlist> *pattrs)
11798 {
11799 size_t pos = meta_key.find(':');
11800 if (pos == string::npos) {
11801 return -EINVAL;
11802 }
11803 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11804 rgw_bucket_instance_key_to_oid(oid);
11805
11806 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11807 }
11808
11809 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11810 real_time *pmtime, map<string, bufferlist> *pattrs)
11811 {
11812 string oid;
11813 if (bucket.oid.empty()) {
11814 get_bucket_meta_oid(bucket, oid);
11815 } else {
11816 oid = bucket.oid;
11817 }
11818
11819 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11820 }
11821
11822 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
11823 real_time *pmtime, map<string, bufferlist> *pattrs,
11824 rgw_cache_entry_info *cache_info)
11825 {
11826 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11827
11828 bufferlist epbl;
11829
11830 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, oid, epbl, &info.objv_tracker, pmtime, pattrs, cache_info);
11831 if (ret < 0) {
11832 return ret;
11833 }
11834
11835 bufferlist::iterator iter = epbl.begin();
11836 try {
11837 ::decode(info, iter);
11838 } catch (buffer::error& err) {
11839 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11840 return -EIO;
11841 }
11842 info.bucket.oid = oid;
11843 return 0;
11844 }
11845
11846 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11847 const string& tenant_name,
11848 const string& bucket_name,
11849 RGWBucketEntryPoint& entry_point,
11850 RGWObjVersionTracker *objv_tracker,
11851 real_time *pmtime,
11852 map<string, bufferlist> *pattrs,
11853 rgw_cache_entry_info *cache_info)
11854 {
11855 bufferlist bl;
11856 string bucket_entry;
11857
11858 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11859 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, bucket_entry, bl, objv_tracker, pmtime, pattrs, cache_info);
11860 if (ret < 0) {
11861 return ret;
11862 }
11863
11864 bufferlist::iterator iter = bl.begin();
11865 try {
11866 ::decode(entry_point, iter);
11867 } catch (buffer::error& err) {
11868 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11869 return -EIO;
11870 }
11871 return 0;
11872 }
11873
11874 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11875 const string& tenant_name,
11876 const string& bucket_name)
11877 {
11878 RGWBucketEntryPoint entry_point;
11879 real_time ep_mtime;
11880 RGWObjVersionTracker ot;
11881 map<string, bufferlist> attrs;
11882 RGWBucketInfo info;
11883
11884 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
11885
11886 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
11887 if (ret < 0) {
11888 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
11889 return ret;
11890 }
11891
11892 if (!entry_point.has_bucket_info) {
11893 /* already converted! */
11894 return 0;
11895 }
11896
11897 info = entry_point.old_bucket_info;
11898 info.bucket.oid = bucket_name;
11899 info.ep_objv = ot.read_version;
11900
11901 ot.generate_new_write_ver(cct);
11902
11903 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
11904 if (ret < 0) {
11905 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
11906 return ret;
11907 }
11908
11909 return 0;
11910 }
11911
11912 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
11913 const string& tenant, const string& bucket_name, RGWBucketInfo& info,
11914 real_time *pmtime, map<string, bufferlist> *pattrs)
11915 {
11916 bucket_info_entry e;
11917 string bucket_entry;
11918 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
11919
11920 if (binfo_cache->find(bucket_entry, &e)) {
11921 info = e.info;
11922 if (pattrs)
11923 *pattrs = e.attrs;
11924 if (pmtime)
11925 *pmtime = e.mtime;
11926 return 0;
11927 }
11928
11929 RGWBucketEntryPoint entry_point;
11930 real_time ep_mtime;
11931 RGWObjVersionTracker ot;
11932 rgw_cache_entry_info entry_cache_info;
11933 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name, entry_point, &ot, &ep_mtime, pattrs, &entry_cache_info);
11934 if (ret < 0) {
11935 /* only init these fields */
11936 info.bucket.tenant = tenant;
11937 info.bucket.name = bucket_name;
11938 return ret;
11939 }
11940
11941 if (entry_point.has_bucket_info) {
11942 info = entry_point.old_bucket_info;
11943 info.bucket.oid = bucket_name;
11944 info.bucket.tenant = tenant;
11945 info.ep_objv = ot.read_version;
11946 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
11947 return 0;
11948 }
11949
11950 /* data is in the bucket instance object, we need to get attributes from there, clear everything
11951 * that we got
11952 */
11953 if (pattrs) {
11954 pattrs->clear();
11955 }
11956
11957 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
11958
11959
11960 /* read bucket instance info */
11961
11962 string oid;
11963 get_bucket_meta_oid(entry_point.bucket, oid);
11964
11965 rgw_cache_entry_info cache_info;
11966
11967 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs, &cache_info);
11968 e.info.ep_objv = ot.read_version;
11969 info = e.info;
11970 if (ret < 0) {
11971 info.bucket.tenant = tenant;
11972 info.bucket.name = bucket_name;
11973 // XXX and why return anything in case of an error anyway?
11974 return ret;
11975 }
11976
11977 if (pmtime)
11978 *pmtime = e.mtime;
11979 if (pattrs)
11980 *pattrs = e.attrs;
11981
11982 list<rgw_cache_entry_info *> cache_info_entries;
11983 cache_info_entries.push_back(&entry_cache_info);
11984 cache_info_entries.push_back(&cache_info);
11985
11986
11987 /* chain to both bucket entry point and bucket instance */
11988 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
11989 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
11990 }
11991
11992 return 0;
11993 }
11994
11995 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
11996 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
11997 map<string, bufferlist> *pattrs)
11998 {
11999 bufferlist epbl;
12000 ::encode(entry_point, epbl);
12001 string bucket_entry;
12002 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12003 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12004 }
12005
12006 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12007 real_time mtime, map<string, bufferlist> *pattrs)
12008 {
12009 info.has_instance_obj = true;
12010 bufferlist bl;
12011
12012 ::encode(info, bl);
12013
12014 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12015 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12016 if (ret == -EEXIST) {
12017 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12018 * bucket operation on this specific bucket (e.g., being synced from the master), but
12019 * since bucket instace meta object is unique for this specific bucket instace, we don't
12020 * need to return an error.
12021 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12022 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12023 * locally, while in the sync thread we sync the new bucket.
12024 */
12025 ret = 0;
12026 }
12027 return ret;
12028 }
12029
12030 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12031 map<string, bufferlist> *pattrs, bool create_entry_point)
12032 {
12033 bool create_head = !info.has_instance_obj || create_entry_point;
12034
12035 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12036 if (ret < 0) {
12037 return ret;
12038 }
12039
12040 if (!create_head)
12041 return 0; /* done! */
12042
12043 RGWBucketEntryPoint entry_point;
12044 entry_point.bucket = info.bucket;
12045 entry_point.owner = info.owner;
12046 entry_point.creation_time = info.creation_time;
12047 entry_point.linked = true;
12048 RGWObjVersionTracker ot;
12049 if (pep_objv && !pep_objv->tag.empty()) {
12050 ot.write_version = *pep_objv;
12051 } else {
12052 ot.generate_new_write_ver(cct);
12053 if (pep_objv) {
12054 *pep_objv = ot.write_version;
12055 }
12056 }
12057 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12058 if (ret < 0)
12059 return ret;
12060
12061 return 0;
12062 }
12063
12064 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12065 {
12066 rgw_rados_ref ref;
12067 int r = get_raw_obj_ref(obj, &ref);
12068 if (r < 0) {
12069 return r;
12070 }
12071
12072 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12073 if (r < 0)
12074 return r;
12075
12076 return 0;
12077
12078 }
12079
12080 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12081 std::map<string, bufferlist>& m)
12082 {
12083 rgw_rados_ref ref;
12084 int r = get_raw_obj_ref(obj, &ref);
12085 if (r < 0) {
12086 return r;
12087 }
12088
12089 #define MAX_OMAP_GET_ENTRIES 1024
12090 const int count = MAX_OMAP_GET_ENTRIES;
12091 string start_after;
12092
12093 while (true) {
12094 std::map<string, bufferlist> t;
12095 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12096 if (r < 0) {
12097 return r;
12098 }
12099 if (t.empty()) {
12100 break;
12101 }
12102 start_after = t.rbegin()->first;
12103 m.insert(t.begin(), t.end());
12104 }
12105 return 0;
12106 }
12107
12108 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12109 {
12110 rgw_rados_ref ref;
12111 int r = get_raw_obj_ref(obj, &ref);
12112 if (r < 0) {
12113 return r;
12114 }
12115 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12116
12117 map<string, bufferlist> m;
12118 m[key] = bl;
12119
12120 r = ref.ioctx.omap_set(ref.oid, m);
12121
12122 return r;
12123 }
12124
12125 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12126 {
12127 rgw_rados_ref ref;
12128 int r = get_raw_obj_ref(obj, &ref);
12129 if (r < 0) {
12130 return r;
12131 }
12132
12133 r = ref.ioctx.omap_set(ref.oid, m);
12134
12135 return r;
12136 }
12137
12138 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12139 {
12140 rgw_rados_ref ref;
12141 int r = get_raw_obj_ref(obj, &ref);
12142 if (r < 0) {
12143 return r;
12144 }
12145
12146 set<string> k;
12147 k.insert(key);
12148
12149 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12150 return r;
12151 }
12152
12153 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12154 {
12155 RGWObjectCtx obj_ctx(this);
12156
12157 map<string, RGWBucketEnt>::iterator iter;
12158 for (iter = m.begin(); iter != m.end(); ++iter) {
12159 RGWBucketEnt& ent = iter->second;
12160 rgw_bucket& bucket = ent.bucket;
12161 ent.count = 0;
12162 ent.size = 0;
12163 ent.size_rounded = 0;
12164
12165 map<string, rgw_bucket_dir_header> headers;
12166
12167 RGWBucketInfo bucket_info;
12168 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12169 if (ret < 0) {
12170 return ret;
12171 }
12172
12173 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12174 if (r < 0)
12175 return r;
12176
12177 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12178 for (; hiter != headers.end(); ++hiter) {
12179 RGWObjCategory category = main_category;
12180 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12181 if (iter != hiter->second.stats.end()) {
12182 struct rgw_bucket_category_stats& stats = iter->second;
12183 ent.count += stats.num_entries;
12184 ent.size += stats.total_size;
12185 ent.size_rounded += stats.total_size_rounded;
12186 }
12187 }
12188 }
12189
12190 return m.size();
12191 }
12192
12193 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12194 {
12195 rgw_rados_ref ref;
12196 int r = get_raw_obj_ref(obj, &ref);
12197 if (r < 0) {
12198 return r;
12199 }
12200 librados::Rados *rad = get_rados_handle();
12201 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12202
12203 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12204 completion->release();
12205 return r;
12206 }
12207
12208 int RGWRados::distribute(const string& key, bufferlist& bl)
12209 {
12210 /*
12211 * we were called before watch was initialized. This can only happen if we're updating some system
12212 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12213 * objects, they're currently only read on startup anyway.
12214 */
12215 if (!watch_initialized)
12216 return 0;
12217
12218 string notify_oid;
12219 pick_control_oid(key, notify_oid);
12220
12221 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12222 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12223 }
12224
12225 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12226 {
12227 librados::IoCtx& io_ctx = ctx.io_ctx;
12228 librados::NObjectIterator& iter = ctx.iter;
12229
12230 int r = open_pool_ctx(pool, io_ctx);
12231 if (r < 0)
12232 return r;
12233
12234 iter = io_ctx.nobjects_begin();
12235
12236 return 0;
12237 }
12238
12239 int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12240 {
12241 librados::IoCtx& io_ctx = ctx.io_ctx;
12242 librados::NObjectIterator& iter = ctx.iter;
12243
12244 int r = open_pool_ctx(pool, io_ctx);
12245 if (r < 0)
12246 return r;
12247
12248 librados::ObjectCursor oc;
12249 if (!oc.from_str(cursor)) {
12250 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12251 return -EINVAL;
12252 }
12253
12254 iter = io_ctx.nobjects_begin(oc);
12255
12256 return 0;
12257 }
12258
12259 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12260 {
12261 return ctx.iter.get_cursor().to_str();
12262 }
12263
12264 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12265 bool *is_truncated, RGWAccessListFilter *filter)
12266 {
12267 librados::IoCtx& io_ctx = ctx.io_ctx;
12268 librados::NObjectIterator& iter = ctx.iter;
12269
12270 if (iter == io_ctx.nobjects_end())
12271 return -ENOENT;
12272
12273 uint32_t i;
12274
12275 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12276 rgw_bucket_dir_entry e;
12277
12278 string oid = iter->get_oid();
12279 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12280
12281 // fill it in with initial values; we may correct later
12282 if (filter && !filter->filter(oid, oid))
12283 continue;
12284
12285 e.key = oid;
12286 objs.push_back(e);
12287 }
12288
12289 if (is_truncated)
12290 *is_truncated = (iter != io_ctx.nobjects_end());
12291
12292 return objs.size();
12293 }
12294 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12295 string prefix;
12296
12297 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12298 bool filter(string& name, string& key) override {
12299 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12300 }
12301 };
12302
12303 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
12304 {
12305 if (!ctx->initialized) {
12306 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
12307 if (r < 0) {
12308 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12309 return r;
12310 }
12311 ctx->initialized = true;
12312 }
12313 return 0;
12314 }
12315
12316 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12317 RGWListRawObjsCtx& ctx, list<string>& oids,
12318 bool *is_truncated)
12319 {
12320 if (!ctx.initialized) {
12321 return -EINVAL;
12322 }
12323 RGWAccessListFilterPrefix filter(prefix_filter);
12324 vector<rgw_bucket_dir_entry> objs;
12325 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12326 if (r < 0) {
12327 if(r != -ENOENT)
12328 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12329 return r;
12330 }
12331
12332 vector<rgw_bucket_dir_entry>::iterator iter;
12333 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12334 oids.push_back(iter->key.name);
12335 }
12336
12337 return oids.size();
12338 }
12339
12340 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12341 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12342 bool *is_truncated)
12343 {
12344 if (!ctx.initialized) {
12345 int r = list_raw_objects_init(pool, string(), &ctx);
12346 if (r < 0) {
12347 return r;
12348 }
12349 }
12350
12351 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12352 }
12353
12354 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12355 {
12356 return pool_iterate_get_cursor(ctx.iter_ctx);
12357 }
12358
12359 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12360 std::list<rgw_bi_log_entry>& result, bool *truncated)
12361 {
12362 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12363 result.clear();
12364
12365 librados::IoCtx index_ctx;
12366 map<int, string> oids;
12367 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12368 map<int, string> bucket_instance_ids;
12369 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12370 if (r < 0)
12371 return r;
12372
12373 BucketIndexShardsManager marker_mgr;
12374 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12375 // If there are multiple shards for the bucket index object, the marker
12376 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12377 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12378 // only contain one record, and the key is the bucket instance id.
12379 r = marker_mgr.from_string(marker, shard_id);
12380 if (r < 0)
12381 return r;
12382
12383 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12384 if (r < 0)
12385 return r;
12386
12387 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12388 map<int, list<rgw_bi_log_entry>::iterator> vends;
12389 if (truncated) {
12390 *truncated = false;
12391 }
12392 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12393 for (; miter != bi_log_lists.end(); ++miter) {
12394 int shard_id = miter->first;
12395 vcurrents[shard_id] = miter->second.entries.begin();
12396 vends[shard_id] = miter->second.entries.end();
12397 if (truncated) {
12398 *truncated = (*truncated || miter->second.truncated);
12399 }
12400 }
12401
12402 size_t total = 0;
12403 bool has_more = true;
12404 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12405 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12406 while (total < max && has_more) {
12407 has_more = false;
12408
12409 viter = vcurrents.begin();
12410 eiter = vends.begin();
12411
12412 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12413 assert (eiter != vends.end());
12414
12415 int shard_id = viter->first;
12416 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12417
12418 if (liter == eiter->second){
12419 continue;
12420 }
12421 rgw_bi_log_entry& entry = *(liter);
12422 if (has_shards) {
12423 char buf[16];
12424 snprintf(buf, sizeof(buf), "%d", shard_id);
12425 string tmp_id;
12426 build_bucket_index_marker(buf, entry.id, &tmp_id);
12427 entry.id.swap(tmp_id);
12428 }
12429 marker_mgr.add(shard_id, entry.id);
12430 result.push_back(entry);
12431 total++;
12432 has_more = true;
12433 ++liter;
12434 }
12435 }
12436
12437 if (truncated) {
12438 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12439 assert (eiter != vends.end());
12440 *truncated = (*truncated || (viter->second != eiter->second));
12441 }
12442 }
12443
12444 // Refresh marker, if there are multiple shards, the output will look like
12445 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12446 // if there is no sharding, the simply marker (without oid) is returned
12447 if (has_shards) {
12448 marker_mgr.to_string(&marker);
12449 } else {
12450 if (!result.empty()) {
12451 marker = result.rbegin()->id;
12452 }
12453 }
12454
12455 return 0;
12456 }
12457
12458 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12459 {
12460 librados::IoCtx index_ctx;
12461 map<int, string> bucket_objs;
12462
12463 BucketIndexShardsManager start_marker_mgr;
12464 BucketIndexShardsManager end_marker_mgr;
12465
12466 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12467 if (r < 0) {
12468 return r;
12469 }
12470
12471 r = start_marker_mgr.from_string(start_marker, shard_id);
12472 if (r < 0) {
12473 return r;
12474 }
12475
12476 r = end_marker_mgr.from_string(end_marker, shard_id);
12477 if (r < 0) {
12478 return r;
12479 }
12480
12481 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12482 cct->_conf->rgw_bucket_index_max_aio)();
12483
12484 return r;
12485 }
12486
12487 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12488 {
12489 librados::IoCtx index_ctx;
12490 map<int, string> bucket_objs;
12491 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12492 if (r < 0)
12493 return r;
12494
12495 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12496 }
12497
12498 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12499 {
12500 librados::IoCtx index_ctx;
12501 map<int, string> bucket_objs;
12502 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12503 if (r < 0)
12504 return r;
12505
12506 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12507 }
12508
12509 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12510 {
12511 rgw_rados_ref ref;
12512 int r = get_obj_head_ref(bucket_info, obj, &ref);
12513 if (r < 0) {
12514 return r;
12515 }
12516
12517 rgw_cls_bi_entry bi_entry;
12518 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12519 if (r < 0 && r != -ENOENT) {
12520 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12521 }
12522 if (r < 0) {
12523 return r;
12524 }
12525 bufferlist::iterator iter = bi_entry.data.begin();
12526 try {
12527 ::decode(*dirent, iter);
12528 } catch (buffer::error& err) {
12529 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12530 return -EIO;
12531 }
12532
12533 return 0;
12534 }
12535
12536 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12537 {
12538 BucketShard bs(this);
12539 int ret = bs.init(bucket, obj);
12540 if (ret < 0) {
12541 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12542 return ret;
12543 }
12544
12545 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12546
12547 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12548 if (ret < 0)
12549 return ret;
12550
12551 return 0;
12552 }
12553
12554 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12555 {
12556 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12557 }
12558
12559 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12560 {
12561 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12562 if (ret < 0)
12563 return ret;
12564
12565 return 0;
12566 }
12567
12568 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12569 {
12570 BucketShard bs(this);
12571 int ret = bs.init(bucket, obj);
12572 if (ret < 0) {
12573 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12574 return ret;
12575 }
12576
12577 return bi_put(bs, entry);
12578 }
12579
12580 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12581 {
12582 rgw_obj obj(bucket, obj_name);
12583 BucketShard bs(this);
12584 int ret = bs.init(bucket, obj);
12585 if (ret < 0) {
12586 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12587 return ret;
12588 }
12589
12590 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12591 if (ret == -ENOENT) {
12592 *is_truncated = false;
12593 }
12594 if (ret < 0)
12595 return ret;
12596
12597 return 0;
12598 }
12599
12600 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12601 {
12602 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12603 if (ret < 0)
12604 return ret;
12605
12606 return 0;
12607 }
12608
12609 int RGWRados::bi_remove(BucketShard& bs)
12610 {
12611 int ret = bs.index_ctx.remove(bs.bucket_obj);
12612 if (ret == -ENOENT) {
12613 ret = 0;
12614 }
12615 if (ret < 0) {
12616 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12617 return ret;
12618 }
12619
12620 return 0;
12621 }
12622
12623 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12624 {
12625 BucketShard bs(this);
12626 int ret = bs.init(bucket, shard_id);
12627 if (ret < 0) {
12628 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12629 return ret;
12630 }
12631
12632 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12633 }
12634
12635 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12636 {
12637 return gc_pool_ctx.operate(oid, op);
12638 }
12639
12640 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12641 {
12642 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12643 int r = gc_pool_ctx.aio_operate(oid, c, op);
12644 c->release();
12645 return r;
12646 }
12647
12648 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12649 {
12650 return gc_pool_ctx.operate(oid, op, pbl);
12651 }
12652
12653 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12654 {
12655 return gc->list(index, marker, max, expired_only, result, truncated);
12656 }
12657
12658 int RGWRados::process_gc()
12659 {
12660 return gc->process();
12661 }
12662
12663 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12664 {
12665 return lc->list_lc_progress(marker, max_entries, progress_map);
12666 }
12667
12668 int RGWRados::process_lc()
12669 {
12670 return lc->process();
12671 }
12672
12673 int RGWRados::process_expire_objects()
12674 {
12675 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12676 return 0;
12677 }
12678
12679 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12680 {
12681 bufferlist in;
12682 cls_rgw_bucket_init(op);
12683 return index_ctx.operate(oid, &op);
12684 }
12685
12686 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12687 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12688 {
12689 rgw_zone_set zones_trace;
12690 if (_zones_trace) {
12691 zones_trace = *_zones_trace;
12692 }
12693 else {
12694 zones_trace.insert(get_zone().id);
12695 }
12696
12697 ObjectWriteOperation o;
12698 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12699 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12700 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
12701 return bs.index_ctx.operate(bs.bucket_obj, &o);
12702 }
12703
12704 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
12705 int64_t pool, uint64_t epoch,
12706 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12707 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12708 {
12709 ObjectWriteOperation o;
12710 rgw_bucket_dir_entry_meta dir_meta;
12711 dir_meta = ent.meta;
12712 dir_meta.category = category;
12713
12714 rgw_bucket_entry_ver ver;
12715 ver.pool = pool;
12716 ver.epoch = epoch;
12717 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12718 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12719 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
12720 get_zone().log_data, bilog_flags, _zones_trace);
12721 complete_op_data *arg;
12722 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
12723 get_zone().log_data, bilog_flags, _zones_trace, &arg);
12724 librados::AioCompletion *completion = arg->rados_completion;
12725 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
12726 completion->release(); /* can't reference arg here, as it might have already been released */
12727 return ret;
12728 }
12729
12730 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
12731 int64_t pool, uint64_t epoch,
12732 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12733 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12734 {
12735 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
12736 }
12737
12738 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12739 int64_t pool, uint64_t epoch,
12740 rgw_obj& obj,
12741 real_time& removed_mtime,
12742 list<rgw_obj_index_key> *remove_objs,
12743 uint16_t bilog_flags,
12744 rgw_zone_set *zones_trace)
12745 {
12746 rgw_bucket_dir_entry ent;
12747 ent.meta.mtime = removed_mtime;
12748 obj.key.get_index_key(&ent.key);
12749 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
12750 }
12751
12752 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12753 {
12754 rgw_bucket_dir_entry ent;
12755 obj.key.get_index_key(&ent.key);
12756 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
12757 }
12758
12759 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12760 {
12761 librados::IoCtx index_ctx;
12762 map<int, string> bucket_objs;
12763 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12764 if (r < 0)
12765 return r;
12766
12767 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12768 }
12769
12770 int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12771 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12772 bool *is_truncated, rgw_obj_index_key *last_entry,
12773 bool (*force_check_filter)(const string& name))
12774 {
12775 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12776
12777 librados::IoCtx index_ctx;
12778 // key - oid (for different shards if there is any)
12779 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12780 map<int, string> oids;
12781 map<int, struct rgw_cls_list_ret> list_results;
12782 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12783 if (r < 0)
12784 return r;
12785
12786 cls_rgw_obj_key start_key(start.name, start.instance);
12787 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12788 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12789 if (r < 0)
12790 return r;
12791
12792 // Create a list of iterators that are used to iterate each shard
12793 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12794 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12795 vector<string> vnames(list_results.size());
12796 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12797 *is_truncated = false;
12798 for (; iter != list_results.end(); ++iter) {
12799 vcurrents.push_back(iter->second.dir.m.begin());
12800 vends.push_back(iter->second.dir.m.end());
12801 vnames.push_back(oids[iter->first]);
12802 *is_truncated = (*is_truncated || iter->second.is_truncated);
12803 }
12804
12805 // Create a map to track the next candidate entry from each shard, if the entry
12806 // from a specified shard is selected/erased, the next entry from that shard will
12807 // be inserted for next round selection
12808 map<string, size_t> candidates;
12809 for (size_t i = 0; i < vcurrents.size(); ++i) {
12810 if (vcurrents[i] != vends[i]) {
12811 candidates[vcurrents[i]->first] = i;
12812 }
12813 }
12814
12815 map<string, bufferlist> updates;
12816 uint32_t count = 0;
12817 while (count < num_entries && !candidates.empty()) {
12818 r = 0;
12819 // Select the next one
12820 int pos = candidates.begin()->second;
12821 const string& name = vcurrents[pos]->first;
12822 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12823
12824 bool force_check = force_check_filter && force_check_filter(dirent.key.name);
12825 if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) {
12826 /* there are uncommitted ops. We need to check the current state,
12827 * and if the tags are old we need to do cleanup as well. */
12828 librados::IoCtx sub_ctx;
12829 sub_ctx.dup(index_ctx);
12830 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
12831 if (r < 0 && r != -ENOENT) {
12832 return r;
12833 }
12834 }
12835 if (r >= 0) {
12836 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
12837 m[name] = std::move(dirent);
12838 ++count;
12839 }
12840
12841 // Refresh the candidates map
12842 candidates.erase(candidates.begin());
12843 ++vcurrents[pos];
12844 if (vcurrents[pos] != vends[pos]) {
12845 candidates[vcurrents[pos]->first] = pos;
12846 }
12847 }
12848
12849 // Suggest updates if there is any
12850 map<string, bufferlist>::iterator miter = updates.begin();
12851 for (; miter != updates.end(); ++miter) {
12852 if (miter->second.length()) {
12853 ObjectWriteOperation o;
12854 cls_rgw_suggest_changes(o, miter->second);
12855 // we don't care if we lose suggested updates, send them off blindly
12856 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12857 index_ctx.aio_operate(miter->first, c, &o);
12858 c->release();
12859 }
12860 }
12861
12862 // Check if all the returned entries are consumed or not
12863 for (size_t i = 0; i < vcurrents.size(); ++i) {
12864 if (vcurrents[i] != vends[i])
12865 *is_truncated = true;
12866 }
12867 if (!m.empty())
12868 *last_entry = m.rbegin()->first;
12869
12870 return 0;
12871 }
12872
12873 int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
12874 {
12875 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12876
12877 rgw_rados_ref ref;
12878 int r = get_raw_obj_ref(obj, &ref);
12879 if (r < 0) {
12880 return r;
12881 }
12882
12883 ObjectWriteOperation op;
12884 cls_rgw_usage_log_add(op, info);
12885
12886 r = ref.ioctx.operate(ref.oid, &op);
12887 return r;
12888 }
12889
12890 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
12891 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
12892 {
12893 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12894
12895 rgw_rados_ref ref;
12896 int r = get_raw_obj_ref(obj, &ref);
12897 if (r < 0) {
12898 return r;
12899 }
12900
12901 *is_truncated = false;
12902
12903 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
12904 max_entries, read_iter, usage, is_truncated);
12905
12906 return r;
12907 }
12908
12909 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
12910 {
12911 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12912
12913 rgw_rados_ref ref;
12914 int r = get_raw_obj_ref(obj, &ref);
12915 if (r < 0) {
12916 return r;
12917 }
12918
12919 ObjectWriteOperation op;
12920 cls_rgw_usage_log_trim(op, user, start_epoch, end_epoch);
12921
12922 r = ref.ioctx.operate(ref.oid, &op);
12923 return r;
12924 }
12925
12926 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
12927 {
12928 librados::IoCtx index_ctx;
12929 string dir_oid;
12930
12931 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12932
12933 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
12934 if (r < 0)
12935 return r;
12936
12937 bufferlist updates;
12938
12939 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
12940 rgw_bucket_dir_entry entry;
12941 entry.key = *iter;
12942 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
12943 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
12944 updates.append(CEPH_RGW_REMOVE | suggest_flag);
12945 ::encode(entry, updates);
12946 }
12947
12948 bufferlist out;
12949
12950 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
12951
12952 return r;
12953 }
12954
12955 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
12956 const RGWBucketInfo& bucket_info,
12957 rgw_bucket_dir_entry& list_state,
12958 rgw_bucket_dir_entry& object,
12959 bufferlist& suggested_updates)
12960 {
12961 const rgw_bucket& bucket = bucket_info.bucket;
12962 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12963
12964 std::string loc;
12965
12966 rgw_obj obj(bucket, list_state.key);
12967
12968 string oid;
12969 get_obj_bucket_and_oid_loc(obj, oid, loc);
12970
12971 if (loc != list_state.locator) {
12972 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
12973 }
12974
12975 io_ctx.locator_set_key(list_state.locator);
12976
12977 RGWObjState *astate = NULL;
12978 RGWObjectCtx rctx(this);
12979 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
12980 if (r < 0)
12981 return r;
12982
12983 list_state.pending_map.clear(); // we don't need this and it inflates size
12984 if (!astate->exists) {
12985 /* object doesn't exist right now -- hopefully because it's
12986 * marked as !exists and got deleted */
12987 if (list_state.exists) {
12988 /* FIXME: what should happen now? Work out if there are any
12989 * non-bad ways this could happen (there probably are, but annoying
12990 * to handle!) */
12991 }
12992 // encode a suggested removal of that key
12993 list_state.ver.epoch = io_ctx.get_last_version();
12994 list_state.ver.pool = io_ctx.get_id();
12995 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
12996 return -ENOENT;
12997 }
12998
12999 string etag;
13000 string content_type;
13001 ACLOwner owner;
13002
13003 object.meta.size = astate->size;
13004 object.meta.accounted_size = astate->accounted_size;
13005 object.meta.mtime = astate->mtime;
13006
13007 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
13008 if (iter != astate->attrset.end()) {
13009 etag = iter->second.c_str();
13010 }
13011 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
13012 if (iter != astate->attrset.end()) {
13013 content_type = iter->second.c_str();
13014 }
13015 iter = astate->attrset.find(RGW_ATTR_ACL);
13016 if (iter != astate->attrset.end()) {
13017 r = decode_policy(iter->second, &owner);
13018 if (r < 0) {
13019 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
13020 }
13021 }
13022
13023 if (astate->has_manifest) {
13024 RGWObjManifest::obj_iterator miter;
13025 RGWObjManifest& manifest = astate->manifest;
13026 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
13027 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
13028 rgw_obj loc;
13029 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
13030
13031 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
13032 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
13033 r = delete_obj_index(loc);
13034 if (r < 0) {
13035 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
13036 }
13037 }
13038 }
13039 }
13040
13041 object.meta.etag = etag;
13042 object.meta.content_type = content_type;
13043 object.meta.owner = owner.get_id().to_str();
13044 object.meta.owner_display_name = owner.get_display_name();
13045
13046 // encode suggested updates
13047 list_state.ver.pool = io_ctx.get_id();
13048 list_state.ver.epoch = astate->epoch;
13049 list_state.meta.size = object.meta.size;
13050 list_state.meta.accounted_size = object.meta.accounted_size;
13051 list_state.meta.mtime = object.meta.mtime;
13052 list_state.meta.category = main_category;
13053 list_state.meta.etag = etag;
13054 list_state.meta.content_type = content_type;
13055 if (astate->obj_tag.length() > 0)
13056 list_state.tag = astate->obj_tag.c_str();
13057 list_state.meta.owner = owner.get_id().to_str();
13058 list_state.meta.owner_display_name = owner.get_display_name();
13059
13060 list_state.exists = true;
13061 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
13062 return 0;
13063 }
13064
13065 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
13066 {
13067 librados::IoCtx index_ctx;
13068 map<int, string> oids;
13069 map<int, struct rgw_cls_list_ret> list_results;
13070 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13071 if (r < 0)
13072 return r;
13073
13074 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13075 if (r < 0)
13076 return r;
13077
13078 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13079 for(; iter != list_results.end(); ++iter) {
13080 headers[oids[iter->first]] = iter->second.dir.header;
13081 }
13082 return 0;
13083 }
13084
13085 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13086 {
13087 librados::IoCtx index_ctx;
13088 map<int, string> bucket_objs;
13089 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13090 if (r < 0)
13091 return r;
13092
13093 map<int, string>::iterator iter = bucket_objs.begin();
13094 for (; iter != bucket_objs.end(); ++iter) {
13095 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13096 if (r < 0) {
13097 ctx->put();
13098 break;
13099 } else {
13100 (*num_aio)++;
13101 }
13102 }
13103 return r;
13104 }
13105
13106 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13107 {
13108 string buckets_obj_id;
13109 rgw_get_buckets_obj(user_id, buckets_obj_id);
13110 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13111
13112 rgw_rados_ref ref;
13113 int r = get_raw_obj_ref(obj, &ref);
13114 if (r < 0) {
13115 return r;
13116 }
13117
13118 librados::ObjectReadOperation op;
13119 int rc;
13120 ::cls_user_get_header(op, header, &rc);
13121 bufferlist ibl;
13122 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13123 if (r < 0)
13124 return r;
13125 if (rc < 0)
13126 return rc;
13127
13128 return 0;
13129 }
13130
13131 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13132 {
13133 string buckets_obj_id;
13134 rgw_get_buckets_obj(user_id, buckets_obj_id);
13135 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13136
13137 rgw_rados_ref ref;
13138 int r = get_raw_obj_ref(obj, &ref);
13139 if (r < 0) {
13140 return r;
13141 }
13142
13143 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13144 if (r < 0)
13145 return r;
13146
13147 return 0;
13148 }
13149
13150 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13151 {
13152 map<string, struct rgw_bucket_dir_header> headers;
13153 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13154 if (r < 0) {
13155 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13156 return r;
13157 }
13158
13159 cls_user_bucket_entry entry;
13160
13161 bucket_info.bucket.convert(&entry.bucket);
13162
13163 for (const auto& hiter : headers) {
13164 for (const auto& iter : hiter.second.stats) {
13165 const struct rgw_bucket_category_stats& header_stats = iter.second;
13166 entry.size += header_stats.total_size;
13167 entry.size_rounded += header_stats.total_size_rounded;
13168 entry.count += header_stats.num_entries;
13169 }
13170 }
13171
13172 list<cls_user_bucket_entry> entries;
13173 entries.push_back(entry);
13174
13175 r = cls_user_update_buckets(user_obj, entries, false);
13176 if (r < 0) {
13177 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13178 return r;
13179 }
13180
13181 return 0;
13182 }
13183
13184 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13185 {
13186 map<string, struct rgw_bucket_dir_header> headers;
13187 RGWBucketInfo bucket_info;
13188 RGWObjectCtx obj_ctx(this);
13189 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13190 if (ret < 0) {
13191 return ret;
13192 }
13193
13194 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13195 if (ret < 0) {
13196 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13197 return ret;
13198 }
13199
13200 bucket.convert(&entry.bucket);
13201
13202 for (const auto& hiter : headers) {
13203 for (const auto& iter : hiter.second.stats) {
13204 const struct rgw_bucket_category_stats& header_stats = iter.second;
13205 entry.size += header_stats.total_size;
13206 entry.size_rounded += header_stats.total_size_rounded;
13207 entry.count += header_stats.num_entries;
13208 }
13209 }
13210
13211 return 0;
13212 }
13213
13214 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13215 const string& in_marker,
13216 const string& end_marker,
13217 const int max_entries,
13218 list<cls_user_bucket_entry>& entries,
13219 string * const out_marker,
13220 bool * const truncated)
13221 {
13222 rgw_rados_ref ref;
13223 int r = get_raw_obj_ref(obj, &ref);
13224 if (r < 0) {
13225 return r;
13226 }
13227
13228 librados::ObjectReadOperation op;
13229 int rc;
13230
13231 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13232 bufferlist ibl;
13233 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13234 if (r < 0)
13235 return r;
13236 if (rc < 0)
13237 return rc;
13238
13239 return 0;
13240 }
13241
13242 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13243 {
13244 rgw_rados_ref ref;
13245 int r = get_raw_obj_ref(obj, &ref);
13246 if (r < 0) {
13247 return r;
13248 }
13249
13250 librados::ObjectWriteOperation op;
13251 cls_user_set_buckets(op, entries, add);
13252 r = ref.ioctx.operate(ref.oid, &op);
13253 if (r < 0)
13254 return r;
13255
13256 return 0;
13257 }
13258
13259 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13260 {
13261 string buckets_obj_id;
13262 rgw_get_buckets_obj(user_id, buckets_obj_id);
13263 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13264 return cls_user_complete_stats_sync(obj);
13265 }
13266
13267 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13268 {
13269 rgw_rados_ref ref;
13270 int r = get_raw_obj_ref(obj, &ref);
13271 if (r < 0) {
13272 return r;
13273 }
13274
13275 librados::ObjectWriteOperation op;
13276 ::cls_user_complete_stats_sync(op);
13277 r = ref.ioctx.operate(ref.oid, &op);
13278 if (r < 0)
13279 return r;
13280
13281 return 0;
13282 }
13283
13284 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13285 {
13286 list<cls_user_bucket_entry> l;
13287 l.push_back(entry);
13288
13289 return cls_user_update_buckets(obj, l, true);
13290 }
13291
13292 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13293 {
13294 rgw_rados_ref ref;
13295 int r = get_system_obj_ref(obj, &ref);
13296 if (r < 0) {
13297 return r;
13298 }
13299
13300 librados::ObjectWriteOperation op;
13301 ::cls_user_remove_bucket(op, bucket);
13302 r = ref.ioctx.operate(ref.oid, &op);
13303 if (r < 0)
13304 return r;
13305
13306 return 0;
13307 }
13308
13309 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
13310 RGWQuotaInfo& bucket_quota)
13311 {
13312 if (!cct->_conf->rgw_dynamic_resharding) {
13313 return 0;
13314 }
13315
13316 bool need_resharding = false;
13317 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13318 uint32_t suggested_num_shards;
13319
13320 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13321 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13322 1, need_resharding, &suggested_num_shards);
13323 if (ret < 0) {
13324 return ret;
13325 }
13326
13327 if (need_resharding) {
13328 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13329 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13330 dendl;
13331 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13332 }
13333
13334 return ret;
13335 }
13336
13337 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13338 {
13339 RGWReshard reshard(this);
13340
13341 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13342
13343 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13344 if (new_num_shards <= num_source_shards) {
13345 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13346 return 0;
13347 }
13348
13349 cls_rgw_reshard_entry entry;
13350 entry.time = real_clock::now();
13351 entry.tenant = bucket_info.owner.tenant;
13352 entry.bucket_name = bucket_info.bucket.name;
13353 entry.bucket_id = bucket_info.bucket.bucket_id;
13354 entry.old_num_shards = num_source_shards;
13355 entry.new_num_shards = new_num_shards;
13356
13357 return reshard.add(entry);
13358 }
13359
13360 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13361 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13362 {
13363 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13364 }
13365
13366 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13367 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
13368 {
13369 if (!num_shards) {
13370 bucket_objects[0] = bucket_oid_base;
13371 } else {
13372 char buf[bucket_oid_base.size() + 32];
13373 if (shard_id < 0) {
13374 for (uint32_t i = 0; i < num_shards; ++i) {
13375 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13376 bucket_objects[i] = buf;
13377 }
13378 } else {
13379 if ((uint32_t)shard_id > num_shards) {
13380 return;
13381 }
13382 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13383 bucket_objects[shard_id] = buf;
13384 }
13385 }
13386 }
13387
13388 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13389 {
13390 const rgw_bucket& bucket = bucket_info.bucket;
13391 string plain_id = bucket.name + ":" + bucket.bucket_id;
13392 if (!bucket_info.num_shards) {
13393 (*result)[0] = plain_id;
13394 } else {
13395 char buf[16];
13396 if (shard_id < 0) {
13397 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13398 snprintf(buf, sizeof(buf), ":%d", i);
13399 (*result)[i] = plain_id + buf;
13400 }
13401 } else {
13402 if ((uint32_t)shard_id > bucket_info.num_shards) {
13403 return;
13404 }
13405 snprintf(buf, sizeof(buf), ":%d", shard_id);
13406 (*result)[shard_id] = plain_id + buf;
13407 }
13408 }
13409 }
13410
13411 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13412 int *shard_id)
13413 {
13414 int r = 0;
13415 switch (bucket_info.bucket_index_shard_hash_type) {
13416 case RGWBucketInfo::MOD:
13417 if (!bucket_info.num_shards) {
13418 if (shard_id) {
13419 *shard_id = -1;
13420 }
13421 } else {
13422 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13423 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13424 sid = rgw_shards_mod(sid2, bucket_info.num_shards);
13425 if (shard_id) {
13426 *shard_id = (int)sid;
13427 }
13428 }
13429 break;
13430 default:
13431 r = -ENOTSUP;
13432 }
13433 return r;
13434 }
13435
13436 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13437 int shard_id, string *bucket_obj)
13438 {
13439 if (!num_shards) {
13440 // By default with no sharding, we use the bucket oid as itself
13441 (*bucket_obj) = bucket_oid_base;
13442 } else {
13443 char buf[bucket_oid_base.size() + 32];
13444 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13445 (*bucket_obj) = buf;
13446 }
13447 }
13448
13449 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13450 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13451 {
13452 int r = 0;
13453 switch (hash_type) {
13454 case RGWBucketInfo::MOD:
13455 if (!num_shards) {
13456 // By default with no sharding, we use the bucket oid as itself
13457 (*bucket_obj) = bucket_oid_base;
13458 if (shard_id) {
13459 *shard_id = -1;
13460 }
13461 } else {
13462 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13463 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13464 sid = rgw_shards_mod(sid2, num_shards);
13465 char buf[bucket_oid_base.size() + 32];
13466 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13467 (*bucket_obj) = buf;
13468 if (shard_id) {
13469 *shard_id = (int)sid;
13470 }
13471 }
13472 break;
13473 default:
13474 r = -ENOTSUP;
13475 }
13476 return r;
13477 }
13478
13479 void RGWStateLog::oid_str(int shard, string& oid) {
13480 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13481 char buf[16];
13482 snprintf(buf, sizeof(buf), "%d", shard);
13483 oid += buf;
13484 }
13485
13486 int RGWStateLog::get_shard_num(const string& object) {
13487 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13488 return val % num_shards;
13489 }
13490
13491 string RGWStateLog::get_oid(const string& object) {
13492 int shard = get_shard_num(object);
13493 string oid;
13494 oid_str(shard, oid);
13495 return oid;
13496 }
13497
13498 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13499 rgw_pool pool;
13500 store->get_log_pool(pool);
13501 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13502 if (r < 0) {
13503 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13504 return r;
13505 }
13506 return 0;
13507 }
13508
13509 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13510 uint32_t state, bufferlist *bl, uint32_t *check_state)
13511 {
13512 if (client_id.empty() ||
13513 op_id.empty() ||
13514 object.empty()) {
13515 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13516 }
13517
13518 librados::IoCtx ioctx;
13519 int r = open_ioctx(ioctx);
13520 if (r < 0)
13521 return r;
13522
13523 string oid = get_oid(object);
13524
13525 librados::ObjectWriteOperation op;
13526 if (check_state) {
13527 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
13528 }
13529 utime_t ts = ceph_clock_now();
13530 bufferlist nobl;
13531 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
13532 r = ioctx.operate(oid, &op);
13533 if (r < 0) {
13534 return r;
13535 }
13536
13537 return 0;
13538 }
13539
13540 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
13541 {
13542 if (client_id.empty() ||
13543 op_id.empty() ||
13544 object.empty()) {
13545 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13546 }
13547
13548 librados::IoCtx ioctx;
13549 int r = open_ioctx(ioctx);
13550 if (r < 0)
13551 return r;
13552
13553 string oid = get_oid(object);
13554
13555 librados::ObjectWriteOperation op;
13556 cls_statelog_remove_by_object(op, object, op_id);
13557 r = ioctx.operate(oid, &op);
13558 if (r < 0) {
13559 return r;
13560 }
13561
13562 return 0;
13563 }
13564
13565 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
13566 void **handle)
13567 {
13568 list_state *state = new list_state;
13569 state->client_id = client_id;
13570 state->op_id = op_id;
13571 state->object = object;
13572 if (object.empty()) {
13573 state->cur_shard = 0;
13574 state->max_shard = num_shards - 1;
13575 } else {
13576 state->cur_shard = state->max_shard = get_shard_num(object);
13577 }
13578 *handle = (void *)state;
13579 }
13580
13581 int RGWStateLog::list_entries(void *handle, int max_entries,
13582 list<cls_statelog_entry>& entries,
13583 bool *done)
13584 {
13585 list_state *state = static_cast<list_state *>(handle);
13586
13587 librados::IoCtx ioctx;
13588 int r = open_ioctx(ioctx);
13589 if (r < 0)
13590 return r;
13591
13592 entries.clear();
13593
13594 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
13595 string oid;
13596 oid_str(state->cur_shard, oid);
13597
13598 librados::ObjectReadOperation op;
13599 list<cls_statelog_entry> ents;
13600 bool truncated;
13601 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
13602 max_entries, ents, &state->marker, &truncated);
13603 bufferlist ibl;
13604 r = ioctx.operate(oid, &op, &ibl);
13605 if (r == -ENOENT) {
13606 truncated = false;
13607 r = 0;
13608 }
13609 if (r < 0) {
13610 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
13611 return r;
13612 }
13613
13614 if (!truncated) {
13615 state->marker.clear();
13616 }
13617
13618 max_entries -= ents.size();
13619
13620 entries.splice(entries.end(), ents);
13621
13622 if (truncated)
13623 break;
13624 }
13625
13626 *done = (state->cur_shard > state->max_shard);
13627
13628 return 0;
13629 }
13630
13631 void RGWStateLog::finish_list_entries(void *handle)
13632 {
13633 list_state *state = static_cast<list_state *>(handle);
13634 delete state;
13635 }
13636
13637 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
13638 {
13639 f->open_object_section("statelog_entry");
13640 f->dump_string("client_id", entry.client_id);
13641 f->dump_string("op_id", entry.op_id);
13642 f->dump_string("object", entry.object);
13643 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
13644 if (!dump_entry_internal(entry, f)) {
13645 f->dump_int("state", entry.state);
13646 }
13647 f->close_section();
13648 }
13649
13650 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
13651 {
13652 }
13653
13654 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
13655 {
13656 string s;
13657 switch ((OpState)entry.state) {
13658 case OPSTATE_UNKNOWN:
13659 s = "unknown";
13660 break;
13661 case OPSTATE_IN_PROGRESS:
13662 s = "in-progress";
13663 break;
13664 case OPSTATE_COMPLETE:
13665 s = "complete";
13666 break;
13667 case OPSTATE_ERROR:
13668 s = "error";
13669 break;
13670 case OPSTATE_ABORT:
13671 s = "abort";
13672 break;
13673 case OPSTATE_CANCELLED:
13674 s = "cancelled";
13675 break;
13676 default:
13677 s = "invalid";
13678 }
13679 f->dump_string("state", s);
13680 return true;
13681 }
13682
13683 int RGWOpState::state_from_str(const string& s, OpState *state)
13684 {
13685 if (s == "unknown") {
13686 *state = OPSTATE_UNKNOWN;
13687 } else if (s == "in-progress") {
13688 *state = OPSTATE_IN_PROGRESS;
13689 } else if (s == "complete") {
13690 *state = OPSTATE_COMPLETE;
13691 } else if (s == "error") {
13692 *state = OPSTATE_ERROR;
13693 } else if (s == "abort") {
13694 *state = OPSTATE_ABORT;
13695 } else if (s == "cancelled") {
13696 *state = OPSTATE_CANCELLED;
13697 } else {
13698 return -EINVAL;
13699 }
13700
13701 return 0;
13702 }
13703
13704 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
13705 {
13706 uint32_t s = (uint32_t)state;
13707 return store_entry(client_id, op_id, object, s, NULL, NULL);
13708 }
13709
13710 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
13711 {
13712 uint32_t s = (uint32_t)state;
13713 return store_entry(client_id, op_id, object, s, NULL, &s);
13714 }
13715
13716 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
13717 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
13718 {
13719 cct = store->ctx();
13720 cur_state = RGWOpState::OPSTATE_UNKNOWN;
13721 }
13722
13723 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
13724 last_update = real_clock::now();
13725 cur_state = state;
13726 return os.set_state(client_id, op_id, object, state);
13727 }
13728
13729 int RGWOpStateSingleOp::renew_state() {
13730 real_time now = real_clock::now();
13731
13732 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13733
13734 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13735 return 0;
13736 }
13737
13738 last_update = now;
13739 return os.renew_state(client_id, op_id, object, cur_state);
13740 }
13741
13742
13743 uint64_t RGWRados::instance_id()
13744 {
13745 return get_rados_handle()->get_instance_id();
13746 }
13747
13748 uint64_t RGWRados::next_bucket_id()
13749 {
13750 Mutex::Locker l(bucket_id_lock);
13751 return ++max_bucket_id;
13752 }
13753
13754 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
13755 {
13756 int use_cache = cct->_conf->rgw_cache_enabled;
13757 RGWRados *store = NULL;
13758 if (!use_cache) {
13759 store = new RGWRados;
13760 } else {
13761 store = new RGWCache<RGWRados>;
13762 }
13763
13764 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
13765 delete store;
13766 return NULL;
13767 }
13768
13769 return store;
13770 }
13771
13772 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13773 {
13774 RGWRados *store = NULL;
13775 store = new RGWRados;
13776
13777 store->set_context(cct);
13778
13779 if (store->init_rados() < 0) {
13780 delete store;
13781 return NULL;
13782 }
13783
13784 return store;
13785 }
13786
13787 void RGWStoreManager::close_storage(RGWRados *store)
13788 {
13789 if (!store)
13790 return;
13791
13792 store->finalize();
13793
13794 delete store;
13795 }
13796
13797 librados::Rados* RGWRados::get_rados_handle()
13798 {
13799 if (rados.size() == 1) {
13800 return &rados[0];
13801 } else {
13802 handle_lock.get_read();
13803 pthread_t id = pthread_self();
13804 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13805
13806 if (it != rados_map.end()) {
13807 handle_lock.put_read();
13808 return &rados[it->second];
13809 } else {
13810 handle_lock.put_read();
13811 handle_lock.get_write();
13812 const uint32_t handle = next_rados_handle;
13813 rados_map[id] = handle;
13814 if (++next_rados_handle == rados.size()) {
13815 next_rados_handle = 0;
13816 }
13817 handle_lock.put_write();
13818 return &rados[handle];
13819 }
13820 }
13821 }
13822
13823 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
13824 {
13825 rgw_rados_ref ref;
13826 int ret = get_raw_obj_ref(obj, &ref);
13827 if (ret < 0) {
13828 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13829 return ret;
13830 }
13831
13832 ObjectWriteOperation op;
13833 list<string> prefixes;
13834 cls_rgw_remove_obj(op, prefixes);
13835
13836 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13837 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13838 if (ret < 0) {
13839 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13840 c->release();
13841 return ret;
13842 }
13843
13844 handles.push_back(c);
13845
13846 return 0;
13847 }
13848
13849 int RGWRados::delete_obj_aio(const rgw_obj& obj,
13850 RGWBucketInfo& bucket_info, RGWObjState *astate,
13851 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
13852 {
13853 rgw_rados_ref ref;
13854 int ret = get_obj_head_ref(bucket_info, obj, &ref);
13855 if (ret < 0) {
13856 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13857 return ret;
13858 }
13859
13860 if (keep_index_consistent) {
13861 RGWRados::Bucket bop(this, bucket_info);
13862 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
13863
13864 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
13865 if (ret < 0) {
13866 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
13867 return ret;
13868 }
13869 }
13870
13871 ObjectWriteOperation op;
13872 list<string> prefixes;
13873 cls_rgw_remove_obj(op, prefixes);
13874
13875 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13876 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13877 if (ret < 0) {
13878 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13879 c->release();
13880 return ret;
13881 }
13882
13883 handles.push_back(c);
13884
13885 if (keep_index_consistent) {
13886 ret = delete_obj_index(obj);
13887 if (ret < 0) {
13888 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
13889 return ret;
13890 }
13891 }
13892 return ret;
13893 }
13894
13895 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
13896 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
13897 if (value != attrs.end()) {
13898 bufferlist::iterator bliter = value->second.begin();
13899 try {
13900 ::decode(cs_info, bliter);
13901 } catch (buffer::error& err) {
13902 return -EIO;
13903 }
13904 if (cs_info.blocks.size() == 0) {
13905 return -EIO;
13906 }
13907 if (cs_info.compression_type != "none")
13908 need_decompress = true;
13909 else
13910 need_decompress = false;
13911 return 0;
13912 } else {
13913 need_decompress = false;
13914 return 0;
13915 }
13916 }
13917