]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
update sources to v12.1.3
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1
2 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
3 // vim: ts=8 sw=2 smarttab
4
5 #include "include/compat.h"
6 #include <errno.h>
7 #include <stdlib.h>
8 #include <sys/types.h>
9 #include <boost/algorithm/string.hpp>
10
11 #include <boost/format.hpp>
12 #include <boost/optional.hpp>
13 #include <boost/utility/in_place_factory.hpp>
14
15 #include "common/ceph_json.h"
16 #include "common/utf8.h"
17
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
21 #include "common/Finisher.h"
22
23 #include "rgw_rados.h"
24 #include "rgw_cache.h"
25 #include "rgw_acl.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_metadata.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32
33 #include "cls/rgw/cls_rgw_ops.h"
34 #include "cls/rgw/cls_rgw_types.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/statelog/cls_statelog_client.h"
41 #include "cls/timeindex/cls_timeindex_client.h"
42 #include "cls/lock/cls_lock_client.h"
43 #include "cls/user/cls_user_client.h"
44 #include "osd/osd_types.h"
45
46 #include "rgw_tools.h"
47 #include "rgw_coroutine.h"
48 #include "rgw_compression.h"
49
50 #undef fork // fails to compile RGWPeriod::fork() below
51
52 #include "common/Clock.h"
53
54 #include "include/rados/librados.hpp"
55 using namespace librados;
56
57 #include <string>
58 #include <iostream>
59 #include <vector>
60 #include <atomic>
61 #include <list>
62 #include <map>
63 #include "auth/Crypto.h" // get_random_bytes()
64
65 #include "rgw_log.h"
66
67 #include "rgw_gc.h"
68 #include "rgw_lc.h"
69
70 #include "rgw_object_expirer_core.h"
71 #include "rgw_sync.h"
72 #include "rgw_data_sync.h"
73 #include "rgw_realm_watcher.h"
74 #include "rgw_reshard.h"
75
76 #include "compressor/Compressor.h"
77
78 #define dout_context g_ceph_context
79 #define dout_subsys ceph_subsys_rgw
80
81 using namespace std;
82
83 static string notify_oid_prefix = "notify";
84 static string *notify_oids = NULL;
85 static string shadow_ns = "shadow";
86 static string dir_oid_prefix = ".dir.";
87 static string default_storage_pool_suffix = "rgw.buckets.data";
88 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
89 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
90 static string avail_pools = ".pools.avail";
91
92 static string zone_info_oid_prefix = "zone_info.";
93 static string zone_names_oid_prefix = "zone_names.";
94 static string region_info_oid_prefix = "region_info.";
95 static string zone_group_info_oid_prefix = "zonegroup_info.";
96 static string realm_names_oid_prefix = "realms_names.";
97 static string realm_info_oid_prefix = "realms.";
98 static string default_region_info_oid = "default.region";
99 static string default_zone_group_info_oid = "default.zonegroup";
100 static string period_info_oid_prefix = "periods.";
101 static string period_latest_epoch_info_oid = ".latest_epoch";
102 static string region_map_oid = "region_map";
103 static string zonegroup_map_oid = "zonegroup_map";
104 static string log_lock_name = "rgw_log_lock";
105 static string default_realm_info_oid = "default.realm";
106 const string default_zonegroup_name = "default";
107 const string default_zone_name = "default";
108 static string zonegroup_names_oid_prefix = "zonegroups_names.";
109 static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
110 #define RGW_USAGE_OBJ_PREFIX "usage."
111 #define FIRST_EPOCH 1
112 static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
113 static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
114 static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
115 static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
116
117 #define RGW_STATELOG_OBJ_PREFIX "statelog."
118
119 #define dout_subsys ceph_subsys_rgw
120
121
122 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
123 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
124 {
125 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
126 RGWZonePlacementInfo placement;
127 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
128 return false;
129 }
130
131 if (!obj.in_extra_data) {
132 *pool = placement.data_pool;
133 } else {
134 *pool = placement.get_data_extra_pool();
135 }
136 }
137
138 return true;
139 }
140
141 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
142 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
143 {
144 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
145
146 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
147 }
148
149 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
150 {
151 if (!is_raw) {
152 rgw_raw_obj r;
153 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
154 return r;
155 }
156 return raw_obj;
157 }
158
159 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
160 {
161 if (!is_raw) {
162 rgw_raw_obj r;
163 store->obj_to_raw(placement_rule, obj, &r);
164 return r;
165 }
166 return raw_obj;
167 }
168
169 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
170 {
171 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
172 if (r == -ENOENT && create) {
173 r = rados->pool_create(pool.name.c_str());
174 if (r < 0 && r != -EEXIST) {
175 return r;
176 }
177
178 r = rados->ioctx_create(pool.name.c_str(), ioctx);
179 if (r < 0) {
180 return r;
181 }
182
183 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
184 if (r < 0 && r != -EOPNOTSUPP) {
185 return r;
186 }
187 } else if (r < 0) {
188 return r;
189 }
190 if (!pool.ns.empty()) {
191 ioctx.set_namespace(pool.ns);
192 }
193 return 0;
194 }
195
196 template<>
197 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
198 RWLock::WLocker wl(lock);
199 auto iter = objs_state.find(obj);
200 if (iter == objs_state.end()) {
201 return;
202 }
203 bool is_atomic = iter->second.is_atomic;
204 bool prefetch_data = iter->second.prefetch_data;
205
206 objs_state.erase(iter);
207
208 if (is_atomic || prefetch_data) {
209 auto& s = objs_state[obj];
210 s.is_atomic = is_atomic;
211 s.prefetch_data = prefetch_data;
212 }
213 }
214
215 template<>
216 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
217 RWLock::WLocker wl(lock);
218 auto iter = objs_state.find(obj);
219 if (iter == objs_state.end()) {
220 return;
221 }
222
223 objs_state.erase(iter);
224 }
225
226 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
227 encode_json("default_zonegroup", default_zonegroup, f);
228 }
229
230 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
231
232 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
233 /* backward compatability with region */
234 if (default_zonegroup.empty()) {
235 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
236 }
237 }
238
239 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
240 {
241 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
242 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
243 }
244
245 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
246 }
247
248 int RGWZoneGroup::create_default(bool old_format)
249 {
250 name = default_zonegroup_name;
251 is_master = true;
252
253 RGWZoneGroupPlacementTarget placement_target;
254 placement_target.name = "default-placement";
255 placement_targets[placement_target.name] = placement_target;
256 default_placement = "default-placement";
257
258 RGWZoneParams zone_params(default_zone_name);
259
260 int r = zone_params.init(cct, store, false);
261 if (r < 0) {
262 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
263 return r;
264 }
265
266 r = zone_params.create_default();
267 if (r < 0 && r != -EEXIST) {
268 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
269 return r;
270 } else if (r == -EEXIST) {
271 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
272 zone_params.clear_id();
273 r = zone_params.init(cct, store);
274 if (r < 0) {
275 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 }
278 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
279 << dendl;
280 }
281
282 RGWZone& default_zone = zones[zone_params.get_id()];
283 default_zone.name = zone_params.get_name();
284 default_zone.id = zone_params.get_id();
285 master_zone = default_zone.id;
286
287 r = create();
288 if (r < 0 && r != -EEXIST) {
289 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
290 return r;
291 }
292
293 if (r == -EEXIST) {
294 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
295 id.clear();
296 r = init(cct, store);
297 if (r < 0) {
298 return r;
299 }
300 }
301
302 if (old_format) {
303 name = id;
304 }
305
306 post_process_params();
307
308 return 0;
309 }
310
311 const string RGWZoneGroup::get_default_oid(bool old_region_format)
312 {
313 if (old_region_format) {
314 if (cct->_conf->rgw_default_region_info_oid.empty()) {
315 return default_region_info_oid;
316 }
317 return cct->_conf->rgw_default_region_info_oid;
318 }
319
320 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
321
322 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
323 default_oid = default_zone_group_info_oid;
324 }
325
326 default_oid += "." + realm_id;
327
328 return default_oid;
329 }
330
331 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
332 {
333 if (old_region_format) {
334 return region_info_oid_prefix;
335 }
336 return zone_group_info_oid_prefix;
337 }
338
339 const string& RGWZoneGroup::get_names_oid_prefix()
340 {
341 return zonegroup_names_oid_prefix;
342 }
343
344 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
345 return cct->_conf->rgw_zonegroup;
346 }
347
348 int RGWZoneGroup::equals(const string& other_zonegroup) const
349 {
350 if (is_master && other_zonegroup.empty())
351 return true;
352
353 return (id == other_zonegroup);
354 }
355
356 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
357 const list<string>& endpoints, const string *ptier_type,
358 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
359 {
360 auto& zone_id = zone_params.get_id();
361 auto& zone_name = zone_params.get_name();
362
363 // check for duplicate zone name on insert
364 if (!zones.count(zone_id)) {
365 for (const auto& zone : zones) {
366 if (zone.second.name == zone_name) {
367 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
368 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
369 return -EEXIST;
370 }
371 }
372 }
373
374 if (is_master) {
375 if (*is_master) {
376 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
377 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
378 }
379 master_zone = zone_params.get_id();
380 } else if (master_zone == zone_params.get_id()) {
381 master_zone.clear();
382 }
383 }
384
385 RGWZone& zone = zones[zone_params.get_id()];
386 zone.name = zone_params.get_name();
387 zone.id = zone_params.get_id();
388 if (!endpoints.empty()) {
389 zone.endpoints = endpoints;
390 }
391 if (read_only) {
392 zone.read_only = *read_only;
393 }
394 if (ptier_type) {
395 zone.tier_type = *ptier_type;
396 }
397
398 if (psync_from_all) {
399 zone.sync_from_all = *psync_from_all;
400 }
401
402 for (auto add : sync_from) {
403 zone.sync_from.insert(add);
404 }
405
406 for (auto rm : sync_from_rm) {
407 zone.sync_from.erase(rm);
408 }
409
410 post_process_params();
411
412 return update();
413 }
414
415
416 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
417 {
418 RGWZone& zone = zones[zone_params.get_id()];
419 zone.name = zone_params.get_name();
420
421 return update();
422 }
423
424 void RGWZoneGroup::post_process_params()
425 {
426 bool log_data = zones.size() > 1;
427
428 if (master_zone.empty()) {
429 map<string, RGWZone>::iterator iter = zones.begin();
430 if (iter != zones.end()) {
431 master_zone = iter->first;
432 }
433 }
434
435 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
436 RGWZone& zone = iter->second;
437 zone.log_data = log_data;
438
439 RGWZoneParams zone_params(zone.id, zone.name);
440 int ret = zone_params.init(cct, store);
441 if (ret < 0) {
442 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
443 continue;
444 }
445
446 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
447 iter != zone_params.placement_pools.end(); ++iter) {
448 const string& placement_name = iter->first;
449 if (placement_targets.find(placement_name) == placement_targets.end()) {
450 RGWZoneGroupPlacementTarget placement_target;
451 placement_target.name = placement_name;
452 placement_targets[placement_name] = placement_target;
453 }
454 }
455 }
456
457 if (default_placement.empty() && !placement_targets.empty()) {
458 default_placement = placement_targets.begin()->first;
459 }
460 }
461
462 int RGWZoneGroup::remove_zone(const std::string& zone_id)
463 {
464 map<string, RGWZone>::iterator iter = zones.find(zone_id);
465 if (iter == zones.end()) {
466 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
467 << name << dendl;
468 return -ENOENT;
469 }
470
471 zones.erase(iter);
472
473 post_process_params();
474
475 return update();
476 }
477
478 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
479 {
480 if (realm_id.empty()) {
481 /* try using default realm */
482 RGWRealm realm;
483 int ret = realm.init(cct, store);
484 if (ret < 0) {
485 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
486 return -ENOENT;
487 }
488 realm_id = realm.get_id();
489 }
490
491 return RGWSystemMetaObj::read_default_id(default_id, old_format);
492 }
493
494 int RGWZoneGroup::set_as_default(bool exclusive)
495 {
496 if (realm_id.empty()) {
497 /* try using default realm */
498 RGWRealm realm;
499 int ret = realm.init(cct, store);
500 if (ret < 0) {
501 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
502 return -EINVAL;
503 }
504 realm_id = realm.get_id();
505 }
506
507 return RGWSystemMetaObj::set_as_default(exclusive);
508 }
509
510 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
511 {
512 cct = _cct;
513 store = _store;
514
515 if (!setup_obj)
516 return 0;
517
518 if (old_format && id.empty()) {
519 id = name;
520 }
521
522 if (id.empty()) {
523 int r;
524 if (name.empty()) {
525 name = get_predefined_name(cct);
526 }
527 if (name.empty()) {
528 r = use_default(old_format);
529 if (r < 0) {
530 return r;
531 }
532 } else if (!old_format) {
533 r = read_id(name, id);
534 if (r < 0) {
535 if (r != -ENOENT) {
536 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
537 }
538 return r;
539 }
540 }
541 }
542
543 return read_info(id, old_format);
544 }
545
546 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
547 {
548 auto pool = get_pool(cct);
549 bufferlist bl;
550 RGWObjectCtx obj_ctx(store);
551 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
552 if (ret < 0)
553 return ret;
554
555 try {
556 bufferlist::iterator iter = bl.begin();
557 ::decode(default_info, iter);
558 } catch (buffer::error& err) {
559 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
560 return -EIO;
561 }
562
563 return 0;
564 }
565
566 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
567 {
568 RGWDefaultSystemMetaObjInfo default_info;
569
570 int ret = read_default(default_info, get_default_oid(old_format));
571 if (ret < 0) {
572 return ret;
573 }
574
575 default_id = default_info.default_id;
576
577 return 0;
578 }
579
580 int RGWSystemMetaObj::use_default(bool old_format)
581 {
582 return read_default_id(id, old_format);
583 }
584
585 int RGWSystemMetaObj::set_as_default(bool exclusive)
586 {
587 string oid = get_default_oid();
588
589 rgw_pool pool(get_pool(cct));
590 bufferlist bl;
591
592 RGWDefaultSystemMetaObjInfo default_info;
593 default_info.default_id = id;
594
595 ::encode(default_info, bl);
596
597 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
598 exclusive, NULL, real_time(), NULL);
599 if (ret < 0)
600 return ret;
601
602 return 0;
603 }
604
605 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
606 {
607 rgw_pool pool(get_pool(cct));
608 bufferlist bl;
609
610 string oid = get_names_oid_prefix() + obj_name;
611
612 RGWObjectCtx obj_ctx(store);
613 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
614 if (ret < 0) {
615 return ret;
616 }
617
618 RGWNameToId nameToId;
619 try {
620 bufferlist::iterator iter = bl.begin();
621 ::decode(nameToId, iter);
622 } catch (buffer::error& err) {
623 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
624 return -EIO;
625 }
626 object_id = nameToId.obj_id;
627 return 0;
628 }
629
630 int RGWSystemMetaObj::delete_obj(bool old_format)
631 {
632 rgw_pool pool(get_pool(cct));
633
634 /* check to see if obj is the default */
635 RGWDefaultSystemMetaObjInfo default_info;
636 int ret = read_default(default_info, get_default_oid(old_format));
637 if (ret < 0 && ret != -ENOENT)
638 return ret;
639 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
640 string oid = get_default_oid(old_format);
641 rgw_raw_obj default_named_obj(pool, oid);
642 ret = store->delete_system_obj(default_named_obj);
643 if (ret < 0) {
644 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
645 return ret;
646 }
647 }
648 if (!old_format) {
649 string oid = get_names_oid_prefix() + name;
650 rgw_raw_obj object_name(pool, oid);
651 ret = store->delete_system_obj(object_name);
652 if (ret < 0) {
653 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
654 return ret;
655 }
656 }
657
658 string oid = get_info_oid_prefix(old_format);
659 if (old_format) {
660 oid += name;
661 } else {
662 oid += id;
663 }
664
665 rgw_raw_obj object_id(pool, oid);
666 ret = store->delete_system_obj(object_id);
667 if (ret < 0) {
668 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
669 }
670
671 return ret;
672 }
673
674 int RGWSystemMetaObj::store_name(bool exclusive)
675 {
676 rgw_pool pool(get_pool(cct));
677 string oid = get_names_oid_prefix() + name;
678
679 RGWNameToId nameToId;
680 nameToId.obj_id = id;
681
682 bufferlist bl;
683 ::encode(nameToId, bl);
684 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
685 }
686
687 int RGWSystemMetaObj::rename(const string& new_name)
688 {
689 string new_id;
690 int ret = read_id(new_name, new_id);
691 if (!ret) {
692 return -EEXIST;
693 }
694 if (ret < 0 && ret != -ENOENT) {
695 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
696 return ret;
697 }
698 string old_name = name;
699 name = new_name;
700 ret = update();
701 if (ret < 0) {
702 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 ret = store_name(true);
706 if (ret < 0) {
707 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
708 return ret;
709 }
710 /* delete old name */
711 rgw_pool pool(get_pool(cct));
712 string oid = get_names_oid_prefix() + old_name;
713 rgw_raw_obj old_name_obj(pool, oid);
714 ret = store->delete_system_obj(old_name_obj);
715 if (ret < 0) {
716 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
717 return ret;
718 }
719
720 return ret;
721 }
722
723 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
724 {
725 rgw_pool pool(get_pool(cct));
726
727 bufferlist bl;
728
729 string oid = get_info_oid_prefix(old_format) + obj_id;
730
731 RGWObjectCtx obj_ctx(store);
732 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
733 if (ret < 0) {
734 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
735 return ret;
736 }
737
738 try {
739 bufferlist::iterator iter = bl.begin();
740 ::decode(*this, iter);
741 } catch (buffer::error& err) {
742 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
743 return -EIO;
744 }
745
746 return 0;
747 }
748
749 int RGWSystemMetaObj::read()
750 {
751 int ret = read_id(name, id);
752 if (ret < 0) {
753 return ret;
754 }
755
756 return read_info(id);
757 }
758
759 int RGWSystemMetaObj::create(bool exclusive)
760 {
761 int ret;
762
763 /* check to see the name is not used */
764 ret = read_id(name, id);
765 if (exclusive && ret == 0) {
766 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
767 return -EEXIST;
768 } else if ( ret < 0 && ret != -ENOENT) {
769 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
770 return ret;
771 }
772
773 if (id.empty()) {
774 /* create unique id */
775 uuid_d new_uuid;
776 char uuid_str[37];
777 new_uuid.generate_random();
778 new_uuid.print(uuid_str);
779 id = uuid_str;
780 }
781
782 ret = store_info(exclusive);
783 if (ret < 0) {
784 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
785 return ret;
786 }
787
788 return store_name(exclusive);
789 }
790
791 int RGWSystemMetaObj::store_info(bool exclusive)
792 {
793 rgw_pool pool(get_pool(cct));
794
795 string oid = get_info_oid_prefix() + id;
796
797 bufferlist bl;
798 ::encode(*this, bl);
799 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
800 }
801
802 int RGWSystemMetaObj::write(bool exclusive)
803 {
804 int ret = store_info(exclusive);
805 if (ret < 0) {
806 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
807 return ret;
808 }
809 ret = store_name(exclusive);
810 if (ret < 0) {
811 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
812 return ret;
813 }
814 return 0;
815 }
816
817
818 const string& RGWRealm::get_predefined_name(CephContext *cct) {
819 return cct->_conf->rgw_realm;
820 }
821
822 int RGWRealm::create(bool exclusive)
823 {
824 int ret = RGWSystemMetaObj::create(exclusive);
825 if (ret < 0) {
826 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
827 return ret;
828 }
829 // create the control object for watch/notify
830 ret = create_control(exclusive);
831 if (ret < 0) {
832 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
833 return ret;
834 }
835 RGWPeriod period;
836 if (current_period.empty()) {
837 /* create new period for the realm */
838 ret = period.init(cct, store, id, name, false);
839 if (ret < 0 ) {
840 return ret;
841 }
842 ret = period.create(true);
843 if (ret < 0) {
844 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
845 return ret;
846 }
847 } else {
848 period = RGWPeriod(current_period, 0);
849 int ret = period.init(cct, store, id, name);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
852 return ret;
853 }
854 }
855 ret = set_current_period(period);
856 if (ret < 0) {
857 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
858 return ret;
859 }
860 // try to set as default. may race with another create, so pass exclusive=true
861 // so we don't override an existing default
862 ret = set_as_default(true);
863 if (ret < 0 && ret != -EEXIST) {
864 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
865 }
866
867 return 0;
868 }
869
870 int RGWRealm::delete_obj()
871 {
872 int ret = RGWSystemMetaObj::delete_obj();
873 if (ret < 0) {
874 return ret;
875 }
876 return delete_control();
877 }
878
879 int RGWRealm::create_control(bool exclusive)
880 {
881 auto pool = rgw_pool{get_pool(cct)};
882 auto oid = get_control_oid();
883 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
884 nullptr, real_time(), nullptr);
885 }
886
887 int RGWRealm::delete_control()
888 {
889 auto pool = rgw_pool{get_pool(cct)};
890 auto obj = rgw_raw_obj{pool, get_control_oid()};
891 return store->delete_system_obj(obj);
892 }
893
894 rgw_pool RGWRealm::get_pool(CephContext *cct)
895 {
896 if (cct->_conf->rgw_realm_root_pool.empty()) {
897 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
898 }
899 return rgw_pool(cct->_conf->rgw_realm_root_pool);
900 }
901
902 const string RGWRealm::get_default_oid(bool old_format)
903 {
904 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
905 return default_realm_info_oid;
906 }
907 return cct->_conf->rgw_default_realm_info_oid;
908 }
909
910 const string& RGWRealm::get_names_oid_prefix()
911 {
912 return realm_names_oid_prefix;
913 }
914
915 const string& RGWRealm::get_info_oid_prefix(bool old_format)
916 {
917 return realm_info_oid_prefix;
918 }
919
920 int RGWRealm::set_current_period(RGWPeriod& period)
921 {
922 // update realm epoch to match the period's
923 if (epoch > period.get_realm_epoch()) {
924 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
925 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
926 return -EINVAL;
927 }
928 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
929 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
930 << period.get_realm_epoch() << ", but different period id "
931 << period.get_id() << " != " << current_period << dendl;
932 return -EINVAL;
933 }
934
935 epoch = period.get_realm_epoch();
936 current_period = period.get_id();
937
938 int ret = update();
939 if (ret < 0) {
940 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
941 return ret;
942 }
943
944 ret = period.reflect();
945 if (ret < 0) {
946 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
947 return ret;
948 }
949
950 return 0;
951 }
952
953 string RGWRealm::get_control_oid()
954 {
955 return get_info_oid_prefix() + id + ".control";
956 }
957
958 int RGWRealm::notify_zone(bufferlist& bl)
959 {
960 // open a context on the realm's pool
961 rgw_pool pool{get_pool(cct)};
962 librados::IoCtx ctx;
963 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
964 if (r < 0) {
965 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
966 return r;
967 }
968 // send a notify on the realm object
969 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
970 if (r < 0) {
971 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
972 return r;
973 }
974 return 0;
975 }
976
977 int RGWRealm::notify_new_period(const RGWPeriod& period)
978 {
979 bufferlist bl;
980 // push the period to dependent zonegroups/zones
981 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
982 ::encode(period, bl);
983 // reload the gateway with the new period
984 ::encode(RGWRealmNotify::Reload, bl);
985
986 return notify_zone(bl);
987 }
988
989 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
990 {
991 if (realm_id.empty()) {
992 return "period_config.default";
993 }
994 return "period_config." + realm_id;
995 }
996
997 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
998 {
999 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1000 if (pool_name.empty()) {
1001 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1002 }
1003 return {pool_name};
1004 }
1005
1006 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1007 {
1008 RGWObjectCtx obj_ctx(store);
1009 const auto& pool = get_pool(store->ctx());
1010 const auto& oid = get_oid(realm_id);
1011 bufferlist bl;
1012
1013 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1014 if (ret < 0) {
1015 return ret;
1016 }
1017 try {
1018 bufferlist::iterator iter = bl.begin();
1019 ::decode(*this, iter);
1020 } catch (buffer::error& err) {
1021 return -EIO;
1022 }
1023 return 0;
1024 }
1025
1026 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1027 {
1028 const auto& pool = get_pool(store->ctx());
1029 const auto& oid = get_oid(realm_id);
1030 bufferlist bl;
1031 ::encode(*this, bl);
1032 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1033 false, nullptr, real_time(), nullptr);
1034 }
1035
1036 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1037 const string& period_realm_name, bool setup_obj)
1038 {
1039 cct = _cct;
1040 store = _store;
1041 realm_id = period_realm_id;
1042 realm_name = period_realm_name;
1043
1044 if (!setup_obj)
1045 return 0;
1046
1047 return init(_cct, _store, setup_obj);
1048 }
1049
1050
1051 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1052 {
1053 cct = _cct;
1054 store = _store;
1055
1056 if (!setup_obj)
1057 return 0;
1058
1059 if (id.empty()) {
1060 RGWRealm realm(realm_id, realm_name);
1061 int ret = realm.init(cct, store);
1062 if (ret < 0) {
1063 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1064 cpp_strerror(-ret) << dendl;
1065 return ret;
1066 }
1067 id = realm.get_current_period();
1068 realm_id = realm.get_id();
1069 }
1070
1071 if (!epoch) {
1072 int ret = use_latest_epoch();
1073 if (ret < 0) {
1074 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1075 << " : " << cpp_strerror(-ret) << dendl;
1076 return ret;
1077 }
1078 }
1079
1080 return read_info();
1081 }
1082
1083
1084 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1085 map<string, RGWZoneGroup>::const_iterator iter;
1086 if (!zonegroup_id.empty()) {
1087 iter = period_map.zonegroups.find(zonegroup_id);
1088 } else {
1089 iter = period_map.zonegroups.find("default");
1090 }
1091 if (iter != period_map.zonegroups.end()) {
1092 zonegroup = iter->second;
1093 return 0;
1094 }
1095
1096 return -ENOENT;
1097 }
1098
1099 const string& RGWPeriod::get_latest_epoch_oid()
1100 {
1101 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1102 return period_latest_epoch_info_oid;
1103 }
1104 return cct->_conf->rgw_period_latest_epoch_info_oid;
1105 }
1106
1107 const string& RGWPeriod::get_info_oid_prefix()
1108 {
1109 return period_info_oid_prefix;
1110 }
1111
1112 const string RGWPeriod::get_period_oid_prefix()
1113 {
1114 return get_info_oid_prefix() + id;
1115 }
1116
1117 const string RGWPeriod::get_period_oid()
1118 {
1119 std::ostringstream oss;
1120 oss << get_period_oid_prefix();
1121 // skip the epoch for the staging period
1122 if (id != get_staging_id(realm_id))
1123 oss << "." << epoch;
1124 return oss.str();
1125 }
1126
1127 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1128 RGWObjVersionTracker *objv)
1129 {
1130 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1131
1132 rgw_pool pool(get_pool(cct));
1133 bufferlist bl;
1134 RGWObjectCtx obj_ctx(store);
1135 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
1136 if (ret < 0) {
1137 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1138 return ret;
1139 }
1140 try {
1141 bufferlist::iterator iter = bl.begin();
1142 ::decode(info, iter);
1143 } catch (buffer::error& err) {
1144 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1145 return -EIO;
1146 }
1147
1148 return 0;
1149 }
1150
1151 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1152 {
1153 RGWPeriodLatestEpochInfo info;
1154
1155 int ret = read_latest_epoch(info);
1156 if (ret < 0) {
1157 return ret;
1158 }
1159
1160 latest_epoch = info.epoch;
1161
1162 return 0;
1163 }
1164
1165 int RGWPeriod::use_latest_epoch()
1166 {
1167 RGWPeriodLatestEpochInfo info;
1168 int ret = read_latest_epoch(info);
1169 if (ret < 0) {
1170 return ret;
1171 }
1172
1173 epoch = info.epoch;
1174
1175 return 0;
1176 }
1177
1178 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1179 RGWObjVersionTracker *objv)
1180 {
1181 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1182
1183 rgw_pool pool(get_pool(cct));
1184 bufferlist bl;
1185
1186 RGWPeriodLatestEpochInfo info;
1187 info.epoch = epoch;
1188
1189 ::encode(info, bl);
1190
1191 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1192 exclusive, objv, real_time(), nullptr);
1193 }
1194
1195 int RGWPeriod::update_latest_epoch(epoch_t epoch)
1196 {
1197 static constexpr int MAX_RETRIES = 20;
1198
1199 for (int i = 0; i < MAX_RETRIES; i++) {
1200 RGWPeriodLatestEpochInfo info;
1201 RGWObjVersionTracker objv;
1202 bool exclusive = false;
1203
1204 // read existing epoch
1205 int r = read_latest_epoch(info, &objv);
1206 if (r == -ENOENT) {
1207 // use an exclusive create to set the epoch atomically
1208 exclusive = true;
1209 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1210 << " for period=" << id << dendl;
1211 } else if (r < 0) {
1212 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1213 return r;
1214 } else if (epoch <= info.epoch) {
1215 r = -EEXIST; // fail with EEXIST if epoch is not newer
1216 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1217 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1218 return r;
1219 } else {
1220 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1221 << " -> " << epoch << " on period=" << id << dendl;
1222 }
1223
1224 r = set_latest_epoch(epoch, exclusive, &objv);
1225 if (r == -EEXIST) {
1226 continue; // exclusive create raced with another update, retry
1227 } else if (r == -ECANCELED) {
1228 continue; // write raced with a conflicting version, retry
1229 }
1230 if (r < 0) {
1231 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1232 return r;
1233 }
1234 return 0; // return success
1235 }
1236
1237 return -ECANCELED; // fail after max retries
1238 }
1239
1240 int RGWPeriod::delete_obj()
1241 {
1242 rgw_pool pool(get_pool(cct));
1243
1244 // delete the object for each period epoch
1245 for (epoch_t e = 1; e <= epoch; e++) {
1246 RGWPeriod p{get_id(), e};
1247 rgw_raw_obj oid{pool, p.get_period_oid()};
1248 int ret = store->delete_system_obj(oid);
1249 if (ret < 0) {
1250 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1251 << ": " << cpp_strerror(-ret) << dendl;
1252 }
1253 }
1254
1255 // delete the .latest_epoch object
1256 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1257 int ret = store->delete_system_obj(oid);
1258 if (ret < 0) {
1259 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1260 << ": " << cpp_strerror(-ret) << dendl;
1261 }
1262 return ret;
1263 }
1264
1265 int RGWPeriod::read_info()
1266 {
1267 rgw_pool pool(get_pool(cct));
1268
1269 bufferlist bl;
1270
1271 RGWObjectCtx obj_ctx(store);
1272 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1273 if (ret < 0) {
1274 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1275 return ret;
1276 }
1277
1278 try {
1279 bufferlist::iterator iter = bl.begin();
1280 ::decode(*this, iter);
1281 } catch (buffer::error& err) {
1282 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1283 return -EIO;
1284 }
1285
1286 return 0;
1287 }
1288
1289 int RGWPeriod::create(bool exclusive)
1290 {
1291 int ret;
1292
1293 /* create unique id */
1294 uuid_d new_uuid;
1295 char uuid_str[37];
1296 new_uuid.generate_random();
1297 new_uuid.print(uuid_str);
1298 id = uuid_str;
1299
1300 epoch = FIRST_EPOCH;
1301
1302 period_map.id = id;
1303
1304 ret = store_info(exclusive);
1305 if (ret < 0) {
1306 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1307 return ret;
1308 }
1309
1310 ret = set_latest_epoch(epoch);
1311 if (ret < 0) {
1312 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1313 }
1314
1315 return ret;
1316 }
1317
1318 int RGWPeriod::store_info(bool exclusive)
1319 {
1320 rgw_pool pool(get_pool(cct));
1321
1322 string oid = get_period_oid();
1323 bufferlist bl;
1324 ::encode(*this, bl);
1325
1326 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1327 exclusive, NULL, real_time(), NULL);
1328 }
1329
1330 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1331 {
1332 if (cct->_conf->rgw_period_root_pool.empty()) {
1333 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1334 }
1335 return rgw_pool(cct->_conf->rgw_period_root_pool);
1336 }
1337
1338 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1339 {
1340 if (zonegroup.realm_id != realm_id) {
1341 return 0;
1342 }
1343 int ret = period_map.update(zonegroup, cct);
1344 if (ret < 0) {
1345 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1346 return ret;
1347 }
1348
1349 return store_info(false);
1350 }
1351
1352 int RGWPeriod::update()
1353 {
1354 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1355 list<string> zonegroups;
1356 int ret = store->list_zonegroups(zonegroups);
1357 if (ret < 0) {
1358 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1359 return ret;
1360 }
1361
1362 // clear zone short ids of removed zones. period_map.update() will add the
1363 // remaining zones back
1364 period_map.short_zone_ids.clear();
1365
1366 for (auto& iter : zonegroups) {
1367 RGWZoneGroup zg(string(), iter);
1368 ret = zg.init(cct, store);
1369 if (ret < 0) {
1370 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1371 continue;
1372 }
1373
1374 if (zg.realm_id != realm_id) {
1375 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1376 continue;
1377 }
1378
1379 if (zg.master_zone.empty()) {
1380 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1381 return -EINVAL;
1382 }
1383
1384 if (zg.is_master_zonegroup()) {
1385 master_zonegroup = zg.get_id();
1386 master_zone = zg.master_zone;
1387 }
1388
1389 int ret = period_map.update(zg, cct);
1390 if (ret < 0) {
1391 return ret;
1392 }
1393 }
1394
1395 ret = period_config.read(store, realm_id);
1396 if (ret < 0 && ret != -ENOENT) {
1397 ldout(cct, 0) << "ERROR: failed to read period config: "
1398 << cpp_strerror(ret) << dendl;
1399 return ret;
1400 }
1401 return 0;
1402 }
1403
1404 int RGWPeriod::reflect()
1405 {
1406 for (auto& iter : period_map.zonegroups) {
1407 RGWZoneGroup& zg = iter.second;
1408 zg.reinit_instance(cct, store);
1409 int r = zg.write(false);
1410 if (r < 0) {
1411 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1412 return r;
1413 }
1414 if (zg.is_master_zonegroup()) {
1415 // set master as default if no default exists
1416 r = zg.set_as_default(true);
1417 if (r == 0) {
1418 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1419 << " as the default" << dendl;
1420 }
1421 }
1422 }
1423
1424 int r = period_config.write(store, realm_id);
1425 if (r < 0) {
1426 ldout(cct, 0) << "ERROR: failed to store period config: "
1427 << cpp_strerror(-r) << dendl;
1428 return r;
1429 }
1430 return 0;
1431 }
1432
1433 void RGWPeriod::fork()
1434 {
1435 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1436 predecessor_uuid = id;
1437 id = get_staging_id(realm_id);
1438 period_map.reset();
1439 realm_epoch++;
1440 }
1441
1442 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1443 {
1444 // initialize a sync status manager to read the status
1445 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1446 int r = mgr.init();
1447 if (r < 0) {
1448 return r;
1449 }
1450 r = mgr.read_sync_status(sync_status);
1451 mgr.stop();
1452 return r;
1453 }
1454
1455 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1456 std::ostream& error_stream,
1457 bool force_if_stale)
1458 {
1459 rgw_meta_sync_status status;
1460 int r = read_sync_status(store, &status);
1461 if (r < 0) {
1462 ldout(cct, 0) << "period failed to read sync status: "
1463 << cpp_strerror(-r) << dendl;
1464 return r;
1465 }
1466
1467 std::vector<std::string> markers;
1468
1469 const auto current_epoch = current_period.get_realm_epoch();
1470 if (current_epoch != status.sync_info.realm_epoch) {
1471 // no sync status markers for the current period
1472 assert(current_epoch > status.sync_info.realm_epoch);
1473 const int behind = current_epoch - status.sync_info.realm_epoch;
1474 if (!force_if_stale && current_epoch > 1) {
1475 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1476 "the current master zone in metadata sync. If this zone is promoted "
1477 "to master, any metadata changes during that time are likely to "
1478 "be lost.\n"
1479 "Waiting for this zone to catch up on metadata sync (see "
1480 "'radosgw-admin sync status') is recommended.\n"
1481 "To promote this zone to master anyway, add the flag "
1482 "--yes-i-really-mean-it." << std::endl;
1483 return -EINVAL;
1484 }
1485 // empty sync status markers - other zones will skip this period during
1486 // incremental metadata sync
1487 markers.resize(status.sync_info.num_shards);
1488 } else {
1489 markers.reserve(status.sync_info.num_shards);
1490 for (auto& i : status.sync_markers) {
1491 auto& marker = i.second;
1492 // filter out markers from other periods
1493 if (marker.realm_epoch != current_epoch) {
1494 marker.marker.clear();
1495 }
1496 markers.emplace_back(std::move(marker.marker));
1497 }
1498 }
1499
1500 std::swap(sync_status, markers);
1501 return 0;
1502 }
1503
1504 int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1505 std::ostream& error_stream, bool force_if_stale)
1506 {
1507 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1508 // gateway must be in the master zone to commit
1509 if (master_zone != store->get_zone_params().get_id()) {
1510 error_stream << "Cannot commit period on zone "
1511 << store->get_zone_params().get_id() << ", it must be sent to "
1512 "the period's master zone " << master_zone << '.' << std::endl;
1513 return -EINVAL;
1514 }
1515 // period predecessor must match current period
1516 if (predecessor_uuid != current_period.get_id()) {
1517 error_stream << "Period predecessor " << predecessor_uuid
1518 << " does not match current period " << current_period.get_id()
1519 << ". Use 'period pull' to get the latest period from the master, "
1520 "reapply your changes, and try again." << std::endl;
1521 return -EINVAL;
1522 }
1523 // realm epoch must be 1 greater than current period
1524 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1525 error_stream << "Period's realm epoch " << realm_epoch
1526 << " does not come directly after current realm epoch "
1527 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1528 "latest realm and period from the master zone, reapply your changes, "
1529 "and try again." << std::endl;
1530 return -EINVAL;
1531 }
1532 // did the master zone change?
1533 if (master_zone != current_period.get_master_zone()) {
1534 // store the current metadata sync status in the period
1535 int r = update_sync_status(current_period, error_stream, force_if_stale);
1536 if (r < 0) {
1537 ldout(cct, 0) << "failed to update metadata sync status: "
1538 << cpp_strerror(-r) << dendl;
1539 return r;
1540 }
1541 // create an object with a new period id
1542 r = create(true);
1543 if (r < 0) {
1544 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1545 return r;
1546 }
1547 // set as current period
1548 r = realm.set_current_period(*this);
1549 if (r < 0) {
1550 ldout(cct, 0) << "failed to update realm's current period: "
1551 << cpp_strerror(-r) << dendl;
1552 return r;
1553 }
1554 ldout(cct, 4) << "Promoted to master zone and committed new period "
1555 << id << dendl;
1556 realm.notify_new_period(*this);
1557 return 0;
1558 }
1559 // period must be based on current epoch
1560 if (epoch != current_period.get_epoch()) {
1561 error_stream << "Period epoch " << epoch << " does not match "
1562 "predecessor epoch " << current_period.get_epoch()
1563 << ". Use 'period pull' to get the latest epoch from the master zone, "
1564 "reapply your changes, and try again." << std::endl;
1565 return -EINVAL;
1566 }
1567 // set period as next epoch
1568 set_id(current_period.get_id());
1569 set_epoch(current_period.get_epoch() + 1);
1570 set_predecessor(current_period.get_predecessor());
1571 realm_epoch = current_period.get_realm_epoch();
1572 // write the period to rados
1573 int r = store_info(false);
1574 if (r < 0) {
1575 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1576 return r;
1577 }
1578 // set as latest epoch
1579 r = update_latest_epoch(epoch);
1580 if (r == -EEXIST) {
1581 // already have this epoch (or a more recent one)
1582 return 0;
1583 }
1584 if (r < 0) {
1585 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1586 return r;
1587 }
1588 r = reflect();
1589 if (r < 0) {
1590 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1591 return r;
1592 }
1593 ldout(cct, 4) << "Committed new epoch " << epoch
1594 << " for period " << id << dendl;
1595 realm.notify_new_period(*this);
1596 return 0;
1597 }
1598
1599 int RGWZoneParams::create_default(bool old_format)
1600 {
1601 name = default_zone_name;
1602
1603 int r = create();
1604 if (r < 0) {
1605 return r;
1606 }
1607
1608 if (old_format) {
1609 name = id;
1610 }
1611
1612 return r;
1613 }
1614
1615
1616 int get_zones_pool_set(CephContext* cct,
1617 RGWRados* store,
1618 const list<string>& zones,
1619 const string& my_zone_id,
1620 set<rgw_pool>& pool_names)
1621 {
1622 for(auto const& iter : zones) {
1623 RGWZoneParams zone(iter);
1624 int r = zone.init(cct, store);
1625 if (r < 0) {
1626 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1627 return r;
1628 }
1629 if (zone.get_id() != my_zone_id) {
1630 pool_names.insert(zone.domain_root);
1631 pool_names.insert(zone.metadata_heap);
1632 pool_names.insert(zone.control_pool);
1633 pool_names.insert(zone.gc_pool);
1634 pool_names.insert(zone.log_pool);
1635 pool_names.insert(zone.intent_log_pool);
1636 pool_names.insert(zone.usage_log_pool);
1637 pool_names.insert(zone.user_keys_pool);
1638 pool_names.insert(zone.user_email_pool);
1639 pool_names.insert(zone.user_swift_pool);
1640 pool_names.insert(zone.user_uid_pool);
1641 pool_names.insert(zone.roles_pool);
1642 pool_names.insert(zone.reshard_pool);
1643 for(auto& iter : zone.placement_pools) {
1644 pool_names.insert(iter.second.index_pool);
1645 pool_names.insert(iter.second.data_pool);
1646 pool_names.insert(iter.second.data_extra_pool);
1647 }
1648 }
1649 }
1650 return 0;
1651 }
1652
1653 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1654 const string& default_prefix,
1655 const string& default_suffix,
1656 const rgw_pool& suggested_pool)
1657 {
1658 string suggested_name = suggested_pool.to_str();
1659
1660 string prefix = default_prefix;
1661 string suffix = default_suffix;
1662
1663 if (!suggested_pool.empty()) {
1664 prefix = suggested_name.substr(0, suggested_name.find("."));
1665 suffix = suggested_name.substr(prefix.length());
1666 }
1667
1668 rgw_pool pool(prefix + suffix);
1669
1670 if (pools.find(pool) == pools.end()) {
1671 return pool;
1672 } else {
1673 while(true) {
1674 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1675 if (pools.find(pool) == pools.end()) {
1676 return pool;
1677 }
1678 }
1679 }
1680 }
1681
1682 int RGWZoneParams::fix_pool_names()
1683 {
1684
1685 list<string> zones;
1686 int r = store->list_zones(zones);
1687 if (r < 0) {
1688 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1689 }
1690
1691 set<rgw_pool> pools;
1692 r = get_zones_pool_set(cct, store, zones, id, pools);
1693 if (r < 0) {
1694 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1695 return r;
1696 }
1697
1698 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1699 if (!metadata_heap.name.empty()) {
1700 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1701 }
1702 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1703 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1704 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1705 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1706 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1707 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1708 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1709 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1710 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1711 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1712 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1713 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1714
1715 for(auto& iter : placement_pools) {
1716 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1717 iter.second.index_pool);
1718 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1719 iter.second.data_pool);
1720 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1721 iter.second.data_extra_pool);
1722 }
1723
1724 return 0;
1725 }
1726
1727 int RGWZoneParams::create(bool exclusive)
1728 {
1729 /* check for old pools config */
1730 rgw_raw_obj obj(domain_root, avail_pools);
1731 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1732 if (r < 0) {
1733 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1734 /* a new system, let's set new placement info */
1735 RGWZonePlacementInfo default_placement;
1736 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1737 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1738 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1739 placement_pools["default-placement"] = default_placement;
1740 }
1741
1742 r = fix_pool_names();
1743 if (r < 0) {
1744 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1745 return r;
1746 }
1747
1748 r = RGWSystemMetaObj::create(exclusive);
1749 if (r < 0) {
1750 return r;
1751 }
1752
1753 // try to set as default. may race with another create, so pass exclusive=true
1754 // so we don't override an existing default
1755 r = set_as_default(true);
1756 if (r < 0 && r != -EEXIST) {
1757 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1758 }
1759
1760 return 0;
1761 }
1762
1763 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1764 {
1765 if (cct->_conf->rgw_zone_root_pool.empty()) {
1766 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1767 }
1768
1769 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1770 }
1771
1772 const string RGWZoneParams::get_default_oid(bool old_format)
1773 {
1774 if (old_format) {
1775 return cct->_conf->rgw_default_zone_info_oid;
1776 }
1777
1778 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1779 }
1780
1781 const string& RGWZoneParams::get_names_oid_prefix()
1782 {
1783 return zone_names_oid_prefix;
1784 }
1785
1786 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1787 {
1788 return zone_info_oid_prefix;
1789 }
1790
1791 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1792 return cct->_conf->rgw_zone;
1793 }
1794
1795 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1796 {
1797 if (name.empty()) {
1798 name = cct->_conf->rgw_zone;
1799 }
1800
1801 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1802 }
1803
1804 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1805 {
1806 if (realm_id.empty()) {
1807 /* try using default realm */
1808 RGWRealm realm;
1809 int ret = realm.init(cct, store);
1810 if (ret < 0) {
1811 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1812 return -ENOENT;
1813 }
1814 realm_id = realm.get_id();
1815 }
1816
1817 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1818 }
1819
1820
1821 int RGWZoneParams::set_as_default(bool exclusive)
1822 {
1823 if (realm_id.empty()) {
1824 /* try using default realm */
1825 RGWRealm realm;
1826 int ret = realm.init(cct, store);
1827 if (ret < 0) {
1828 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1829 return -EINVAL;
1830 }
1831 realm_id = realm.get_id();
1832 }
1833
1834 return RGWSystemMetaObj::set_as_default(exclusive);
1835 }
1836
1837 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1838 {
1839 static const std::string NONE{"none"};
1840 auto p = placement_pools.find(placement_rule);
1841 if (p == placement_pools.end()) {
1842 return NONE;
1843 }
1844 const auto& type = p->second.compression_type;
1845 return !type.empty() ? type : NONE;
1846 }
1847
1848 void RGWPeriodMap::encode(bufferlist& bl) const {
1849 ENCODE_START(2, 1, bl);
1850 ::encode(id, bl);
1851 ::encode(zonegroups, bl);
1852 ::encode(master_zonegroup, bl);
1853 ::encode(short_zone_ids, bl);
1854 ENCODE_FINISH(bl);
1855 }
1856
1857 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1858 DECODE_START(2, bl);
1859 ::decode(id, bl);
1860 ::decode(zonegroups, bl);
1861 ::decode(master_zonegroup, bl);
1862 if (struct_v >= 2) {
1863 ::decode(short_zone_ids, bl);
1864 }
1865 DECODE_FINISH(bl);
1866
1867 zonegroups_by_api.clear();
1868 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1869 iter != zonegroups.end(); ++iter) {
1870 RGWZoneGroup& zonegroup = iter->second;
1871 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1872 if (zonegroup.is_master_zonegroup()) {
1873 master_zonegroup = zonegroup.get_id();
1874 }
1875 }
1876 }
1877
1878 // run an MD5 hash on the zone_id and return the first 32 bits
1879 static uint32_t gen_short_zone_id(const std::string zone_id)
1880 {
1881 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1882 MD5 hash;
1883 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1884 hash.Final(md5);
1885
1886 uint32_t short_id;
1887 memcpy((char *)&short_id, md5, sizeof(short_id));
1888 return std::max(short_id, 1u);
1889 }
1890
1891 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1892 {
1893 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1894 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1895 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1896 return -EINVAL;
1897 }
1898 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1899 if (iter != zonegroups.end()) {
1900 RGWZoneGroup& old_zonegroup = iter->second;
1901 if (!old_zonegroup.api_name.empty()) {
1902 zonegroups_by_api.erase(old_zonegroup.api_name);
1903 }
1904 }
1905 zonegroups[zonegroup.get_id()] = zonegroup;
1906
1907 if (!zonegroup.api_name.empty()) {
1908 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1909 }
1910
1911 if (zonegroup.is_master_zonegroup()) {
1912 master_zonegroup = zonegroup.get_id();
1913 } else if (master_zonegroup == zonegroup.get_id()) {
1914 master_zonegroup = "";
1915 }
1916
1917 for (auto& i : zonegroup.zones) {
1918 auto& zone = i.second;
1919 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1920 continue;
1921 }
1922 // calculate the zone's short id
1923 uint32_t short_id = gen_short_zone_id(zone.id);
1924
1925 // search for an existing zone with the same short id
1926 for (auto& s : short_zone_ids) {
1927 if (s.second == short_id) {
1928 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1929 << ") generates the same short_zone_id " << short_id
1930 << " as existing zone id " << s.first << dendl;
1931 return -EEXIST;
1932 }
1933 }
1934
1935 short_zone_ids[zone.id] = short_id;
1936 }
1937
1938 return 0;
1939 }
1940
1941 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1942 {
1943 auto i = short_zone_ids.find(zone_id);
1944 if (i == short_zone_ids.end()) {
1945 return 0;
1946 }
1947 return i->second;
1948 }
1949
1950 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1951 {
1952
1953 RGWPeriod period;
1954 int ret = period.init(cct, store);
1955 if (ret < 0) {
1956 cerr << "failed to read current period info: " << cpp_strerror(ret);
1957 return ret;
1958 }
1959
1960 bucket_quota = period.get_config().bucket_quota;
1961 user_quota = period.get_config().user_quota;
1962 zonegroups = period.get_map().zonegroups;
1963 zonegroups_by_api = period.get_map().zonegroups_by_api;
1964 master_zonegroup = period.get_map().master_zonegroup;
1965
1966 return 0;
1967 }
1968
1969 void RGWRegionMap::encode(bufferlist& bl) const {
1970 ENCODE_START( 3, 1, bl);
1971 ::encode(regions, bl);
1972 ::encode(master_region, bl);
1973 ::encode(bucket_quota, bl);
1974 ::encode(user_quota, bl);
1975 ENCODE_FINISH(bl);
1976 }
1977
1978 void RGWRegionMap::decode(bufferlist::iterator& bl) {
1979 DECODE_START(3, bl);
1980 ::decode(regions, bl);
1981 ::decode(master_region, bl);
1982 if (struct_v >= 2)
1983 ::decode(bucket_quota, bl);
1984 if (struct_v >= 3)
1985 ::decode(user_quota, bl);
1986 DECODE_FINISH(bl);
1987 }
1988
1989 void RGWZoneGroupMap::encode(bufferlist& bl) const {
1990 ENCODE_START( 3, 1, bl);
1991 ::encode(zonegroups, bl);
1992 ::encode(master_zonegroup, bl);
1993 ::encode(bucket_quota, bl);
1994 ::encode(user_quota, bl);
1995 ENCODE_FINISH(bl);
1996 }
1997
1998 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
1999 DECODE_START(3, bl);
2000 ::decode(zonegroups, bl);
2001 ::decode(master_zonegroup, bl);
2002 if (struct_v >= 2)
2003 ::decode(bucket_quota, bl);
2004 if (struct_v >= 3)
2005 ::decode(user_quota, bl);
2006 DECODE_FINISH(bl);
2007
2008 zonegroups_by_api.clear();
2009 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2010 iter != zonegroups.end(); ++iter) {
2011 RGWZoneGroup& zonegroup = iter->second;
2012 zonegroups_by_api[zonegroup.api_name] = zonegroup;
2013 if (zonegroup.is_master_zonegroup()) {
2014 master_zonegroup = zonegroup.get_name();
2015 }
2016 }
2017 }
2018
2019 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2020 {
2021 obj_version *check_objv = version_for_check();
2022
2023 if (check_objv) {
2024 cls_version_check(*op, *check_objv, VER_COND_EQ);
2025 }
2026
2027 cls_version_read(*op, &read_version);
2028 }
2029
2030 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2031 {
2032 obj_version *check_objv = version_for_check();
2033 obj_version *modify_version = version_for_write();
2034
2035 if (check_objv) {
2036 cls_version_check(*op, *check_objv, VER_COND_EQ);
2037 }
2038
2039 if (modify_version) {
2040 cls_version_set(*op, *modify_version);
2041 } else {
2042 cls_version_inc(*op);
2043 }
2044 }
2045
2046 void RGWObjManifest::obj_iterator::operator++()
2047 {
2048 if (manifest->explicit_objs) {
2049 ++explicit_iter;
2050
2051 if (explicit_iter == manifest->objs.end()) {
2052 ofs = manifest->obj_size;
2053 return;
2054 }
2055
2056 update_explicit_pos();
2057
2058 update_location();
2059 return;
2060 }
2061
2062 uint64_t obj_size = manifest->get_obj_size();
2063 uint64_t head_size = manifest->get_head_size();
2064
2065 if (ofs == obj_size) {
2066 return;
2067 }
2068
2069 if (manifest->rules.empty()) {
2070 return;
2071 }
2072
2073 /* are we still pointing at the head? */
2074 if (ofs < head_size) {
2075 rule_iter = manifest->rules.begin();
2076 RGWObjManifestRule *rule = &rule_iter->second;
2077 ofs = MIN(head_size, obj_size);
2078 stripe_ofs = ofs;
2079 cur_stripe = 1;
2080 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2081 if (rule->part_size > 0) {
2082 stripe_size = MIN(stripe_size, rule->part_size);
2083 }
2084 update_location();
2085 return;
2086 }
2087
2088 RGWObjManifestRule *rule = &rule_iter->second;
2089
2090 stripe_ofs += rule->stripe_max_size;
2091 cur_stripe++;
2092 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2093
2094 if (rule->part_size > 0) {
2095 /* multi part, multi stripes object */
2096
2097 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2098
2099 if (stripe_ofs >= part_ofs + rule->part_size) {
2100 /* moved to the next part */
2101 cur_stripe = 0;
2102 part_ofs += rule->part_size;
2103 stripe_ofs = part_ofs;
2104
2105 bool last_rule = (next_rule_iter == manifest->rules.end());
2106 /* move to the next rule? */
2107 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2108 rule_iter = next_rule_iter;
2109 last_rule = (next_rule_iter == manifest->rules.end());
2110 if (!last_rule) {
2111 ++next_rule_iter;
2112 }
2113 cur_part_id = rule_iter->second.start_part_num;
2114 } else {
2115 cur_part_id++;
2116 }
2117
2118 rule = &rule_iter->second;
2119 }
2120
2121 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2122 }
2123
2124 cur_override_prefix = rule->override_prefix;
2125
2126 ofs = stripe_ofs;
2127 if (ofs > obj_size) {
2128 ofs = obj_size;
2129 stripe_ofs = ofs;
2130 stripe_size = 0;
2131 }
2132
2133 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2134 update_location();
2135 }
2136
2137 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2138 {
2139 manifest = _m;
2140
2141 manifest->set_tail_placement(placement_rule, _b);
2142 manifest->set_head(placement_rule, _obj, 0);
2143 last_ofs = 0;
2144
2145 if (manifest->get_prefix().empty()) {
2146 char buf[33];
2147 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2148
2149 string oid_prefix = ".";
2150 oid_prefix.append(buf);
2151 oid_prefix.append("_");
2152
2153 manifest->set_prefix(oid_prefix);
2154 }
2155
2156 bool found = manifest->get_rule(0, &rule);
2157 if (!found) {
2158 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2159 return -EIO;
2160 }
2161
2162 uint64_t head_size = manifest->get_head_size();
2163
2164 if (head_size > 0) {
2165 cur_stripe_size = head_size;
2166 } else {
2167 cur_stripe_size = rule.stripe_max_size;
2168 }
2169
2170 cur_part_id = rule.start_part_num;
2171
2172 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2173
2174 // Normal object which not generated through copy operation
2175 manifest->set_tail_instance(_obj.key.instance);
2176
2177 manifest->update_iterators();
2178
2179 return 0;
2180 }
2181
2182 int RGWObjManifest::generator::create_next(uint64_t ofs)
2183 {
2184 if (ofs < last_ofs) /* only going forward */
2185 return -EINVAL;
2186
2187 uint64_t max_head_size = manifest->get_max_head_size();
2188
2189 if (ofs < max_head_size) {
2190 manifest->set_head_size(ofs);
2191 }
2192
2193 if (ofs >= max_head_size) {
2194 manifest->set_head_size(max_head_size);
2195 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2196 cur_stripe_size = rule.stripe_max_size;
2197
2198 if (cur_part_id == 0 && max_head_size > 0) {
2199 cur_stripe++;
2200 }
2201 }
2202
2203 last_ofs = ofs;
2204 manifest->set_obj_size(ofs);
2205
2206 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2207
2208 manifest->update_iterators();
2209
2210 return 0;
2211 }
2212
2213 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2214 {
2215 return begin_iter;
2216 }
2217
2218 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2219 {
2220 return end_iter;
2221 }
2222
2223 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2224 {
2225 if (ofs > obj_size) {
2226 ofs = obj_size;
2227 }
2228 RGWObjManifest::obj_iterator iter(this);
2229 iter.seek(ofs);
2230 return iter;
2231 }
2232
2233 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2234 {
2235 if (explicit_objs || m.explicit_objs) {
2236 return append_explicit(m, zonegroup, zone_params);
2237 }
2238
2239 if (rules.empty()) {
2240 *this = m;
2241 return 0;
2242 }
2243
2244 string override_prefix;
2245
2246 if (prefix.empty()) {
2247 prefix = m.prefix;
2248 }
2249
2250 if (prefix != m.prefix) {
2251 override_prefix = m.prefix;
2252 }
2253
2254 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2255 if (miter == m.rules.end()) {
2256 return append_explicit(m, zonegroup, zone_params);
2257 }
2258
2259 for (; miter != m.rules.end(); ++miter) {
2260 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2261
2262 RGWObjManifestRule& rule = last_rule->second;
2263
2264 if (rule.part_size == 0) {
2265 rule.part_size = obj_size - rule.start_ofs;
2266 }
2267
2268 RGWObjManifestRule& next_rule = miter->second;
2269 if (!next_rule.part_size) {
2270 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2271 }
2272
2273 string rule_prefix = prefix;
2274 if (!rule.override_prefix.empty()) {
2275 rule_prefix = rule.override_prefix;
2276 }
2277
2278 string next_rule_prefix = m.prefix;
2279 if (!next_rule.override_prefix.empty()) {
2280 next_rule_prefix = next_rule.override_prefix;
2281 }
2282
2283 if (rule.part_size != next_rule.part_size ||
2284 rule.stripe_max_size != next_rule.stripe_max_size ||
2285 rule_prefix != next_rule_prefix) {
2286 if (next_rule_prefix != prefix) {
2287 append_rules(m, miter, &next_rule_prefix);
2288 } else {
2289 append_rules(m, miter, NULL);
2290 }
2291 break;
2292 }
2293
2294 uint64_t expected_part_num = rule.start_part_num + 1;
2295 if (rule.part_size > 0) {
2296 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2297 }
2298
2299 if (expected_part_num != next_rule.start_part_num) {
2300 append_rules(m, miter, NULL);
2301 break;
2302 }
2303 }
2304
2305 set_obj_size(obj_size + m.obj_size);
2306
2307 return 0;
2308 }
2309
2310 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2311 {
2312 return append(m, store->get_zonegroup(), store->get_zone_params());
2313 }
2314
2315 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2316 string *override_prefix)
2317 {
2318 for (; miter != m.rules.end(); ++miter) {
2319 RGWObjManifestRule rule = miter->second;
2320 rule.start_ofs += obj_size;
2321 if (override_prefix)
2322 rule.override_prefix = *override_prefix;
2323 rules[rule.start_ofs] = rule;
2324 }
2325 }
2326
2327 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2328 {
2329 if (explicit_objs) {
2330 return;
2331 }
2332 obj_iterator iter = obj_begin();
2333
2334 while (iter != obj_end()) {
2335 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2336 const rgw_obj_select& os = iter.get_location();
2337 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2338 part.loc_ofs = 0;
2339
2340 uint64_t ofs = iter.get_stripe_ofs();
2341
2342 if (ofs == 0) {
2343 part.loc = obj;
2344 } else {
2345 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2346 }
2347 ++iter;
2348 uint64_t next_ofs = iter.get_stripe_ofs();
2349
2350 part.size = next_ofs - ofs;
2351 }
2352
2353 explicit_objs = true;
2354 rules.clear();
2355 prefix.clear();
2356 }
2357
2358 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2359 {
2360 if (!explicit_objs) {
2361 convert_to_explicit(zonegroup, zone_params);
2362 }
2363 if (!m.explicit_objs) {
2364 m.convert_to_explicit(zonegroup, zone_params);
2365 }
2366 map<uint64_t, RGWObjManifestPart>::iterator iter;
2367 uint64_t base = obj_size;
2368 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2369 RGWObjManifestPart& part = iter->second;
2370 objs[base + iter->first] = part;
2371 }
2372 obj_size += m.obj_size;
2373
2374 return 0;
2375 }
2376
2377 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2378 {
2379 if (rules.empty()) {
2380 return false;
2381 }
2382
2383 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2384 if (iter != rules.begin()) {
2385 --iter;
2386 }
2387
2388 *rule = iter->second;
2389
2390 return true;
2391 }
2392
2393 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2394 {
2395 write_version.ver = 1;
2396 #define TAG_LEN 24
2397
2398 write_version.tag.clear();
2399 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2400 }
2401
2402 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2403 real_time *mtime, real_time set_mtime,
2404 map<string, bufferlist>& attrs, real_time delete_at,
2405 const char *if_match, const char *if_nomatch, const string *user_data,
2406 rgw_zone_set *zones_trace)
2407 {
2408 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2409 if (r < 0)
2410 return r;
2411
2412 is_complete = !canceled;
2413 return 0;
2414 }
2415
2416 CephContext *RGWPutObjProcessor::ctx()
2417 {
2418 return store->ctx();
2419 }
2420
2421 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2422 {
2423 drain_pending();
2424
2425 if (is_complete)
2426 return;
2427
2428 set<rgw_raw_obj>::iterator iter;
2429 bool need_to_remove_head = false;
2430 rgw_raw_obj raw_head;
2431
2432 if (!head_obj.empty()) {
2433 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2434 }
2435
2436 /**
2437 * We should delete the object in the "multipart" namespace to avoid race condition.
2438 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2439 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2440 * written by the second upload may be deleted by the first upload.
2441 * details is describled on #11749
2442 *
2443 * The above comment still stands, but instead of searching for a specific object in the multipart
2444 * namespace, we just make sure that we remove the object that is marked as the head object after
2445 * we remove all the other raw objects. Note that we use different call to remove the head object,
2446 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2447 */
2448 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2449 const rgw_raw_obj& obj = *iter;
2450 if (!head_obj.empty() && obj == raw_head) {
2451 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2452 need_to_remove_head = true;
2453 continue;
2454 }
2455
2456 int r = store->delete_raw_obj(obj);
2457 if (r < 0 && r != -ENOENT) {
2458 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2459 }
2460 }
2461
2462 if (need_to_remove_head) {
2463 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2464 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2465 if (r < 0 && r != -ENOENT) {
2466 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2467 }
2468 }
2469 }
2470
2471 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2472 {
2473 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2474 obj_len = abs_ofs + bl.length();
2475
2476 if (!(obj == last_written_obj)) {
2477 last_written_obj = obj;
2478 }
2479
2480 // For the first call pass -1 as the offset to
2481 // do a write_full.
2482 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2483 }
2484
2485 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2486 {
2487 struct put_obj_aio_info info;
2488 info = pending.front();
2489 pending.pop_front();
2490 pending_size -= info.size;
2491 return info;
2492 }
2493
2494 int RGWPutObjProcessor_Aio::wait_pending_front()
2495 {
2496 if (pending.empty()) {
2497 return 0;
2498 }
2499 struct put_obj_aio_info info = pop_pending();
2500 int ret = store->aio_wait(info.handle);
2501
2502 if (ret >= 0) {
2503 add_written_obj(info.obj);
2504 }
2505
2506 return ret;
2507 }
2508
2509 bool RGWPutObjProcessor_Aio::pending_has_completed()
2510 {
2511 if (pending.empty())
2512 return false;
2513
2514 struct put_obj_aio_info& info = pending.front();
2515 return store->aio_completed(info.handle);
2516 }
2517
2518 int RGWPutObjProcessor_Aio::drain_pending()
2519 {
2520 int ret = 0;
2521 while (!pending.empty()) {
2522 int r = wait_pending_front();
2523 if (r < 0)
2524 ret = r;
2525 }
2526 return ret;
2527 }
2528
2529 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2530 {
2531 bool _wait = need_to_wait;
2532
2533 if (handle) {
2534 struct put_obj_aio_info info;
2535 info.handle = handle;
2536 info.obj = obj;
2537 info.size = size;
2538 pending_size += size;
2539 pending.push_back(info);
2540 }
2541 size_t orig_size = pending_size;
2542
2543 /* first drain complete IOs */
2544 while (pending_has_completed()) {
2545 int r = wait_pending_front();
2546 if (r < 0)
2547 return r;
2548
2549 _wait = false;
2550 }
2551
2552 /* resize window in case messages are draining too fast */
2553 if (orig_size - pending_size >= window_size) {
2554 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2555 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2556 if (window_size > max_window_size) {
2557 window_size = max_window_size;
2558 }
2559 }
2560
2561 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2562 if (pending_size > window_size || _wait) {
2563 int r = wait_pending_front();
2564 if (r < 0)
2565 return r;
2566 }
2567 return 0;
2568 }
2569
2570 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2571 {
2572 if (ofs >= next_part_ofs) {
2573 int r = prepare_next_part(ofs);
2574 if (r < 0) {
2575 return r;
2576 }
2577 }
2578
2579 *pobj = cur_obj;
2580
2581 if (!bl.length()) {
2582 *phandle = nullptr;
2583 return 0;
2584 }
2585
2586 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2587 }
2588
2589 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2590 {
2591 RGWPutObjProcessor::prepare(store, oid_rand);
2592
2593 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2594
2595 return 0;
2596 }
2597
2598 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2599 {
2600 *phandle = NULL;
2601 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2602
2603 pending_data_bl.claim_append(bl);
2604 if (pending_data_bl.length() < max_write_size) {
2605 *again = false;
2606 return 0;
2607 }
2608
2609 pending_data_bl.splice(0, max_write_size, &bl);
2610
2611 /* do we have enough data pending accumulated that needs to be written? */
2612 *again = (pending_data_bl.length() >= max_chunk_size);
2613
2614 if (!data_ofs && !immutable_head()) {
2615 first_chunk.claim(bl);
2616 obj_len = (uint64_t)first_chunk.length();
2617 int r = prepare_next_part(obj_len);
2618 if (r < 0) {
2619 return r;
2620 }
2621 data_ofs = obj_len;
2622 return 0;
2623 }
2624 off_t write_ofs = data_ofs;
2625 data_ofs = write_ofs + bl.length();
2626 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2627 we could be racing with another upload, to the same
2628 object and cleanup can be messy */
2629 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2630 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2631 bl.clear();
2632 }
2633 return ret;
2634 }
2635
2636
2637 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2638 {
2639 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2640
2641 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2642 if (r < 0) {
2643 return r;
2644 }
2645
2646 return 0;
2647 }
2648
2649 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2650 {
2651 head_obj.init(bucket, obj_str);
2652
2653 int r = prepare_init(store, oid_rand);
2654 if (r < 0) {
2655 return r;
2656 }
2657
2658 if (!version_id.empty()) {
2659 head_obj.key.set_instance(version_id);
2660 } else if (versioned_object) {
2661 store->gen_rand_obj_instance_name(&head_obj);
2662 }
2663
2664 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2665
2666 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2667 if (r < 0) {
2668 return r;
2669 }
2670
2671 return 0;
2672 }
2673
2674 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2675
2676 int ret = manifest_gen.create_next(ofs);
2677 if (ret < 0) {
2678 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2679 return ret;
2680 }
2681 cur_part_ofs = ofs;
2682 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2683 cur_obj = manifest_gen.get_cur_obj(store);
2684
2685 return 0;
2686 }
2687
2688 int RGWPutObjProcessor_Atomic::complete_parts()
2689 {
2690 if (obj_len > (uint64_t)cur_part_ofs) {
2691 return prepare_next_part(obj_len);
2692 }
2693 return 0;
2694 }
2695
2696 int RGWPutObjProcessor_Atomic::complete_writing_data()
2697 {
2698 if (!data_ofs && !immutable_head()) {
2699 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2700 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2701 * clobber first_chunk
2702 */
2703 if (pending_data_bl.length() > 0) {
2704 first_chunk.claim(pending_data_bl);
2705 }
2706 obj_len = (uint64_t)first_chunk.length();
2707 }
2708 while (pending_data_bl.length()) {
2709 void *handle = nullptr;
2710 rgw_raw_obj obj;
2711 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2712 if (max_write_size > pending_data_bl.length()) {
2713 max_write_size = pending_data_bl.length();
2714 }
2715 bufferlist bl;
2716 pending_data_bl.splice(0, max_write_size, &bl);
2717 uint64_t write_len = bl.length();
2718 int r = write_data(bl, data_ofs, &handle, &obj, false);
2719 if (r < 0) {
2720 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2721 return r;
2722 }
2723 data_ofs += write_len;
2724 r = throttle_data(handle, obj, write_len, false);
2725 if (r < 0) {
2726 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2727 return r;
2728 }
2729
2730 if (data_ofs >= next_part_ofs) {
2731 r = prepare_next_part(data_ofs);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2734 return r;
2735 }
2736 }
2737 }
2738 int r = complete_parts();
2739 if (r < 0) {
2740 return r;
2741 }
2742
2743 r = drain_pending();
2744 if (r < 0)
2745 return r;
2746
2747 return 0;
2748 }
2749
2750 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2751 real_time *mtime, real_time set_mtime,
2752 map<string, bufferlist>& attrs,
2753 real_time delete_at,
2754 const char *if_match,
2755 const char *if_nomatch, const string *user_data,
2756 rgw_zone_set *zones_trace) {
2757 int r = complete_writing_data();
2758 if (r < 0)
2759 return r;
2760
2761 obj_ctx.obj.set_atomic(head_obj);
2762
2763 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2764
2765 /* some object types shouldn't be versioned, e.g., multipart parts */
2766 op_target.set_versioning_disabled(!versioned_object);
2767
2768 RGWRados::Object::Write obj_op(&op_target);
2769
2770 obj_op.meta.data = &first_chunk;
2771 obj_op.meta.manifest = &manifest;
2772 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2773 obj_op.meta.if_match = if_match;
2774 obj_op.meta.if_nomatch = if_nomatch;
2775 obj_op.meta.mtime = mtime;
2776 obj_op.meta.set_mtime = set_mtime;
2777 obj_op.meta.owner = bucket_info.owner;
2778 obj_op.meta.flags = PUT_OBJ_CREATE;
2779 obj_op.meta.olh_epoch = olh_epoch;
2780 obj_op.meta.delete_at = delete_at;
2781 obj_op.meta.user_data = user_data;
2782 obj_op.meta.zones_trace = zones_trace;
2783
2784 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2785 if (r < 0) {
2786 return r;
2787 }
2788
2789 canceled = obj_op.meta.canceled;
2790
2791 return 0;
2792 }
2793
2794 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2795 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2796 if (r < 0)
2797 return r;
2798 return 0;
2799 }
2800
2801 int RGWRados::unwatch(uint64_t watch_handle)
2802 {
2803 int r = control_pool_ctx.unwatch2(watch_handle);
2804 if (r < 0) {
2805 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2806 return r;
2807 }
2808 r = rados[0].watch_flush();
2809 if (r < 0) {
2810 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2811 return r;
2812 }
2813 return 0;
2814 }
2815
2816 void RGWRados::add_watcher(int i)
2817 {
2818 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2819 Mutex::Locker l(watchers_lock);
2820 watchers_set.insert(i);
2821 if (watchers_set.size() == (size_t)num_watchers) {
2822 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2823 set_cache_enabled(true);
2824 }
2825 }
2826
2827 void RGWRados::remove_watcher(int i)
2828 {
2829 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2830 Mutex::Locker l(watchers_lock);
2831 size_t orig_size = watchers_set.size();
2832 watchers_set.erase(i);
2833 if (orig_size == (size_t)num_watchers &&
2834 watchers_set.size() < orig_size) { /* actually removed */
2835 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2836 set_cache_enabled(false);
2837 }
2838 }
2839
2840 class RGWWatcher : public librados::WatchCtx2 {
2841 RGWRados *rados;
2842 int index;
2843 string oid;
2844 uint64_t watch_handle;
2845
2846 class C_ReinitWatch : public Context {
2847 RGWWatcher *watcher;
2848 public:
2849 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2850 void finish(int r) override {
2851 watcher->reinit();
2852 }
2853 };
2854 public:
2855 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2856 void handle_notify(uint64_t notify_id,
2857 uint64_t cookie,
2858 uint64_t notifier_id,
2859 bufferlist& bl) override {
2860 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2861 << " notify_id " << notify_id
2862 << " cookie " << cookie
2863 << " notifier " << notifier_id
2864 << " bl.length()=" << bl.length() << dendl;
2865 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2866
2867 bufferlist reply_bl; // empty reply payload
2868 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2869 }
2870 void handle_error(uint64_t cookie, int err) override {
2871 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2872 << " err " << cpp_strerror(err) << dendl;
2873 rados->remove_watcher(index);
2874 rados->schedule_context(new C_ReinitWatch(this));
2875 }
2876
2877 void reinit() {
2878 int ret = unregister_watch();
2879 if (ret < 0) {
2880 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2881 return;
2882 }
2883 ret = register_watch();
2884 if (ret < 0) {
2885 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2886 return;
2887 }
2888 }
2889
2890 int unregister_watch() {
2891 int r = rados->unwatch(watch_handle);
2892 if (r < 0) {
2893 return r;
2894 }
2895 rados->remove_watcher(index);
2896 return 0;
2897 }
2898
2899 int register_watch() {
2900 int r = rados->watch(oid, &watch_handle, this);
2901 if (r < 0) {
2902 return r;
2903 }
2904 rados->add_watcher(index);
2905 return 0;
2906 }
2907 };
2908
2909 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2910 RGWRados *store;
2911 RGWHTTPManager http_manager;
2912
2913 public:
2914 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2915 http_manager(store->ctx(), completion_mgr) {
2916 http_manager.set_threaded();
2917 }
2918
2919 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2920 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2921 { "notify", NULL },
2922 { NULL, NULL } };
2923
2924 list<RGWCoroutinesStack *> stacks;
2925 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2926 RGWRESTConn *conn = iter->second;
2927 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2928 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2929
2930 stacks.push_back(stack);
2931 }
2932 return run(stacks);
2933 }
2934 };
2935
2936 class RGWDataNotifierManager : public RGWCoroutinesManager {
2937 RGWRados *store;
2938 RGWHTTPManager http_manager;
2939
2940 public:
2941 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2942 http_manager(store->ctx(), completion_mgr) {
2943 http_manager.set_threaded();
2944 }
2945
2946 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2947 rgw_http_param_pair pairs[] = { { "type", "data" },
2948 { "notify", NULL },
2949 { "source-zone", store->get_zone_params().get_id().c_str() },
2950 { NULL, NULL } };
2951
2952 list<RGWCoroutinesStack *> stacks;
2953 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2954 RGWRESTConn *conn = iter->second;
2955 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2956 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2957
2958 stacks.push_back(stack);
2959 }
2960 return run(stacks);
2961 }
2962 };
2963
2964 class RGWRadosThread {
2965 class Worker : public Thread {
2966 CephContext *cct;
2967 RGWRadosThread *processor;
2968 Mutex lock;
2969 Cond cond;
2970
2971 void wait() {
2972 Mutex::Locker l(lock);
2973 cond.Wait(lock);
2974 };
2975
2976 void wait_interval(const utime_t& wait_time) {
2977 Mutex::Locker l(lock);
2978 cond.WaitInterval(lock, wait_time);
2979 }
2980
2981 public:
2982 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
2983 void *entry() override;
2984 void signal() {
2985 Mutex::Locker l(lock);
2986 cond.Signal();
2987 }
2988 };
2989
2990 Worker *worker;
2991
2992 protected:
2993 CephContext *cct;
2994 RGWRados *store;
2995
2996 std::atomic<bool> down_flag = { false };
2997
2998 string thread_name;
2999
3000 virtual uint64_t interval_msec() = 0;
3001 virtual void stop_process() {}
3002 public:
3003 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3004 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3005 virtual ~RGWRadosThread() {
3006 stop();
3007 }
3008
3009 virtual int init() { return 0; }
3010 virtual int process() = 0;
3011
3012 bool going_down() { return down_flag; }
3013
3014 void start();
3015 void stop();
3016
3017 void signal() {
3018 if (worker) {
3019 worker->signal();
3020 }
3021 }
3022 };
3023
3024 void RGWRadosThread::start()
3025 {
3026 worker = new Worker(cct, this);
3027 worker->create(thread_name.c_str());
3028 }
3029
3030 void RGWRadosThread::stop()
3031 {
3032 down_flag = true;
3033 stop_process();
3034 if (worker) {
3035 worker->signal();
3036 worker->join();
3037 }
3038 delete worker;
3039 worker = NULL;
3040 }
3041
3042 void *RGWRadosThread::Worker::entry() {
3043 uint64_t msec = processor->interval_msec();
3044 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3045
3046 do {
3047 utime_t start = ceph_clock_now();
3048 int r = processor->process();
3049 if (r < 0) {
3050 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3051 }
3052
3053 if (processor->going_down())
3054 break;
3055
3056 utime_t end = ceph_clock_now();
3057 end -= start;
3058
3059 uint64_t cur_msec = processor->interval_msec();
3060 if (cur_msec != msec) { /* was it reconfigured? */
3061 msec = cur_msec;
3062 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3063 }
3064
3065 if (cur_msec > 0) {
3066 if (interval <= end)
3067 continue; // next round
3068
3069 utime_t wait_time = interval;
3070 wait_time -= end;
3071
3072 wait_interval(wait_time);
3073 } else {
3074 wait();
3075 }
3076 } while (!processor->going_down());
3077
3078 return NULL;
3079 }
3080
3081 class RGWMetaNotifier : public RGWRadosThread {
3082 RGWMetaNotifierManager notify_mgr;
3083 RGWMetadataLog *const log;
3084
3085 uint64_t interval_msec() override {
3086 return cct->_conf->rgw_md_notify_interval_msec;
3087 }
3088 public:
3089 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3090 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3091
3092 int process() override;
3093 };
3094
3095 int RGWMetaNotifier::process()
3096 {
3097 set<int> shards;
3098
3099 log->read_clear_modified(shards);
3100
3101 if (shards.empty()) {
3102 return 0;
3103 }
3104
3105 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3106 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3107 }
3108
3109 notify_mgr.notify_all(store->zone_conn_map, shards);
3110
3111 return 0;
3112 }
3113
3114 class RGWDataNotifier : public RGWRadosThread {
3115 RGWDataNotifierManager notify_mgr;
3116
3117 uint64_t interval_msec() override {
3118 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
3119 }
3120 public:
3121 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3122
3123 int process() override;
3124 };
3125
3126 int RGWDataNotifier::process()
3127 {
3128 if (!store->data_log) {
3129 return 0;
3130 }
3131
3132 map<int, set<string> > shards;
3133
3134 store->data_log->read_clear_modified(shards);
3135
3136 if (shards.empty()) {
3137 return 0;
3138 }
3139
3140 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3141 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3142 }
3143
3144 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3145
3146 return 0;
3147 }
3148
3149 class RGWSyncProcessorThread : public RGWRadosThread {
3150 public:
3151 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3152 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3153 ~RGWSyncProcessorThread() override {}
3154 int init() override = 0 ;
3155 int process() override = 0;
3156 };
3157
3158 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3159 {
3160 RGWMetaSyncStatusManager sync;
3161
3162 uint64_t interval_msec() override {
3163 return 0; /* no interval associated, it'll run once until stopped */
3164 }
3165 void stop_process() override {
3166 sync.stop();
3167 }
3168 public:
3169 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3170 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3171
3172 void wakeup_sync_shards(set<int>& shard_ids) {
3173 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3174 sync.wakeup(*iter);
3175 }
3176 }
3177 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3178
3179 int init() override {
3180 int ret = sync.init();
3181 if (ret < 0) {
3182 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3183 return ret;
3184 }
3185 return 0;
3186 }
3187
3188 int process() override {
3189 sync.run();
3190 return 0;
3191 }
3192 };
3193
3194 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3195 {
3196 RGWDataSyncStatusManager sync;
3197 bool initialized;
3198
3199 uint64_t interval_msec() override {
3200 if (initialized) {
3201 return 0; /* no interval associated, it'll run once until stopped */
3202 } else {
3203 #define DATA_SYNC_INIT_WAIT_SEC 20
3204 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3205 }
3206 }
3207 void stop_process() override {
3208 sync.stop();
3209 }
3210 public:
3211 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3212 const string& _source_zone)
3213 : RGWSyncProcessorThread(_store, "data-sync"), sync(_store, async_rados, _source_zone),
3214 initialized(false) {}
3215
3216 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3217 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3218 sync.wakeup(iter->first, iter->second);
3219 }
3220 }
3221 RGWDataSyncStatusManager* get_manager() { return &sync; }
3222
3223 int init() override {
3224 return 0;
3225 }
3226
3227 int process() override {
3228 while (!initialized) {
3229 if (going_down()) {
3230 return 0;
3231 }
3232 int ret = sync.init();
3233 if (ret >= 0) {
3234 initialized = true;
3235 break;
3236 }
3237 /* we'll be back! */
3238 return 0;
3239 }
3240 sync.run();
3241 return 0;
3242 }
3243 };
3244
3245 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3246 {
3247 RGWCoroutinesManager crs;
3248 RGWRados *store;
3249 RGWHTTPManager http;
3250 const utime_t trim_interval;
3251
3252 uint64_t interval_msec() override { return 0; }
3253 void stop_process() override { crs.stop(); }
3254 public:
3255 RGWSyncLogTrimThread(RGWRados *store, int interval)
3256 : RGWSyncProcessorThread(store, "sync-log-trim"),
3257 crs(store->ctx(), store->get_cr_registry()), store(store),
3258 http(store->ctx(), crs.get_completion_mgr()),
3259 trim_interval(interval, 0)
3260 {}
3261
3262 int init() override {
3263 return http.set_threaded();
3264 }
3265 int process() override {
3266 list<RGWCoroutinesStack*> stacks;
3267 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3268 meta->call(create_meta_log_trim_cr(store, &http,
3269 cct->_conf->rgw_md_log_max_shards,
3270 trim_interval));
3271 stacks.push_back(meta);
3272
3273 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3274 data->call(create_data_log_trim_cr(store, &http,
3275 cct->_conf->rgw_data_log_num_shards,
3276 trim_interval));
3277 stacks.push_back(data);
3278
3279 crs.run(stacks);
3280 return 0;
3281 }
3282 };
3283
3284 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3285 {
3286 Mutex::Locker l(meta_sync_thread_lock);
3287 if (meta_sync_processor_thread) {
3288 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3289 }
3290 }
3291
3292 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3293 {
3294 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3295 Mutex::Locker l(data_sync_thread_lock);
3296 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3297 if (iter == data_sync_processor_threads.end()) {
3298 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3299 return;
3300 }
3301
3302 RGWDataSyncProcessorThread *thread = iter->second;
3303 assert(thread);
3304 thread->wakeup_sync_shards(shard_ids);
3305 }
3306
3307 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3308 {
3309 Mutex::Locker l(meta_sync_thread_lock);
3310 if (meta_sync_processor_thread) {
3311 return meta_sync_processor_thread->get_manager();
3312 }
3313 return nullptr;
3314 }
3315
3316 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3317 {
3318 Mutex::Locker l(data_sync_thread_lock);
3319 auto thread = data_sync_processor_threads.find(source_zone);
3320 if (thread == data_sync_processor_threads.end()) {
3321 return nullptr;
3322 }
3323 return thread->second->get_manager();
3324 }
3325
3326 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3327 {
3328 IoCtx ioctx;
3329 int r = open_pool_ctx(pool, ioctx);
3330 if (r < 0) {
3331 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3332 return r;
3333 }
3334
3335 bool requires;
3336 r = ioctx.pool_requires_alignment2(&requires);
3337 if (r < 0) {
3338 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3339 << r << dendl;
3340 return r;
3341 }
3342
3343 if (!requires) {
3344 *alignment = 0;
3345 return 0;
3346 }
3347
3348 uint64_t align;
3349 r = ioctx.pool_required_alignment2(&align);
3350 if (r < 0) {
3351 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3352 << r << dendl;
3353 return r;
3354 }
3355 if (align != 0) {
3356 ldout(cct, 20) << "required alignment=" << align << dendl;
3357 }
3358 *alignment = align;
3359 return 0;
3360 }
3361
3362 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3363 {
3364 uint64_t alignment = 0;
3365 int r = get_required_alignment(pool, &alignment);
3366 if (r < 0) {
3367 return r;
3368 }
3369
3370 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3371
3372 if (alignment == 0) {
3373 *max_chunk_size = config_chunk_size;
3374 return 0;
3375 }
3376
3377 if (config_chunk_size <= alignment) {
3378 *max_chunk_size = alignment;
3379 return 0;
3380 }
3381
3382 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3383
3384 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3385
3386 return 0;
3387 }
3388
3389 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3390 {
3391 rgw_pool pool;
3392 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3393 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3394 return -EIO;
3395 }
3396 return get_max_chunk_size(pool, max_chunk_size);
3397 }
3398
3399 class RGWIndexCompletionManager;
3400
3401 struct complete_op_data {
3402 Mutex lock{"complete_op_data"};
3403 AioCompletion *rados_completion{nullptr};
3404 int manager_shard_id{-1};
3405 RGWIndexCompletionManager *manager{nullptr};
3406 rgw_obj obj;
3407 RGWModifyOp op;
3408 string tag;
3409 rgw_bucket_entry_ver ver;
3410 cls_rgw_obj_key key;
3411 rgw_bucket_dir_entry_meta dir_meta;
3412 list<cls_rgw_obj_key> remove_objs;
3413 bool log_op;
3414 uint16_t bilog_op;
3415 rgw_zone_set zones_trace;
3416
3417 bool stopped{false};
3418
3419 void stop() {
3420 Mutex::Locker l(lock);
3421 stopped = true;
3422 }
3423 };
3424
3425 class RGWIndexCompletionThread : public RGWRadosThread {
3426 RGWRados *store;
3427
3428 uint64_t interval_msec() override {
3429 return 0;
3430 }
3431
3432 list<complete_op_data *> completions;
3433
3434 Mutex completions_lock;
3435 public:
3436 RGWIndexCompletionThread(RGWRados *_store)
3437 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3438
3439 int process() override;
3440
3441 void add_completion(complete_op_data *completion) {
3442 {
3443 Mutex::Locker l(completions_lock);
3444 completions.push_back(completion);
3445 }
3446
3447 signal();
3448 }
3449 };
3450
3451 int RGWIndexCompletionThread::process()
3452 {
3453 list<complete_op_data *> comps;
3454
3455 {
3456 Mutex::Locker l(completions_lock);
3457 completions.swap(comps);
3458 }
3459
3460 for (auto c : comps) {
3461 std::unique_ptr<complete_op_data> up{c};
3462
3463 if (going_down()) {
3464 continue;
3465 }
3466 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3467
3468 RGWRados::BucketShard bs(store);
3469
3470 int r = bs.init(c->obj.bucket, c->obj);
3471 if (r < 0) {
3472 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3473 /* not much to do */
3474 continue;
3475 }
3476
3477 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3478 librados::ObjectWriteOperation o;
3479 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3480 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3481 c->log_op, c->bilog_op, &c->zones_trace);
3482
3483 return bs->index_ctx.operate(bs->bucket_obj, &o);
3484 });
3485 if (r < 0) {
3486 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3487 /* ignoring error, can't do anything about it */
3488 continue;
3489 }
3490 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3491 if (r < 0) {
3492 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3493 }
3494 }
3495
3496 return 0;
3497 }
3498
3499 class RGWIndexCompletionManager {
3500 RGWRados *store{nullptr};
3501 vector<Mutex *> locks;
3502 vector<set<complete_op_data *> > completions;
3503
3504 RGWIndexCompletionThread *completion_thread{nullptr};
3505
3506 int num_shards;
3507
3508 std::atomic<int> cur_shard {0};
3509
3510
3511 public:
3512 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3513 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3514
3515 for (int i = 0; i < num_shards; i++) {
3516 char buf[64];
3517 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3518 locks.push_back(new Mutex(buf));
3519 }
3520
3521 completions.resize(num_shards);
3522 }
3523 ~RGWIndexCompletionManager() {
3524 stop();
3525
3526 for (auto l : locks) {
3527 delete l;
3528 }
3529 }
3530
3531 int next_shard() {
3532 int result = cur_shard % num_shards;
3533 cur_shard++;
3534 return result;
3535 }
3536
3537 void create_completion(const rgw_obj& obj,
3538 RGWModifyOp op, string& tag,
3539 rgw_bucket_entry_ver& ver,
3540 const cls_rgw_obj_key& key,
3541 rgw_bucket_dir_entry_meta& dir_meta,
3542 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3543 uint16_t bilog_op,
3544 rgw_zone_set *zones_trace,
3545 complete_op_data **result);
3546 bool handle_completion(completion_t cb, complete_op_data *arg);
3547
3548 int start() {
3549 completion_thread = new RGWIndexCompletionThread(store);
3550 int ret = completion_thread->init();
3551 if (ret < 0) {
3552 return ret;
3553 }
3554 completion_thread->start();
3555 return 0;
3556 }
3557 void stop() {
3558 if (completion_thread) {
3559 completion_thread->stop();
3560 delete completion_thread;
3561 }
3562
3563 for (int i = 0; i < num_shards; ++i) {
3564 Mutex::Locker l(*locks[i]);
3565 for (auto c : completions[i]) {
3566 Mutex::Locker cl(c->lock);
3567 c->stop();
3568 }
3569 }
3570 completions.clear();
3571 }
3572 };
3573
3574 static void obj_complete_cb(completion_t cb, void *arg)
3575 {
3576 complete_op_data *completion = (complete_op_data *)arg;
3577 completion->lock.Lock();
3578 if (completion->stopped) {
3579 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3580 delete completion;
3581 return;
3582 }
3583 bool need_delete = completion->manager->handle_completion(cb, completion);
3584 completion->lock.Unlock();
3585 if (need_delete) {
3586 delete completion;
3587 }
3588 }
3589
3590
3591 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3592 RGWModifyOp op, string& tag,
3593 rgw_bucket_entry_ver& ver,
3594 const cls_rgw_obj_key& key,
3595 rgw_bucket_dir_entry_meta& dir_meta,
3596 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3597 uint16_t bilog_op,
3598 rgw_zone_set *zones_trace,
3599 complete_op_data **result)
3600 {
3601 complete_op_data *entry = new complete_op_data;
3602
3603 int shard_id = next_shard();
3604
3605 entry->manager_shard_id = shard_id;
3606 entry->manager = this;
3607 entry->obj = obj;
3608 entry->op = op;
3609 entry->tag = tag;
3610 entry->ver = ver;
3611 entry->key = key;
3612 entry->dir_meta = dir_meta;
3613 entry->log_op = log_op;
3614 entry->bilog_op = bilog_op;
3615
3616 if (remove_objs) {
3617 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3618 entry->remove_objs.push_back(*iter);
3619 }
3620 }
3621
3622 if (zones_trace) {
3623 entry->zones_trace = *zones_trace;
3624 } else {
3625 entry->zones_trace.insert(store->get_zone().id);
3626 }
3627
3628 *result = entry;
3629
3630 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3631
3632 Mutex::Locker l(*locks[shard_id]);
3633 completions[shard_id].insert(entry);
3634 }
3635
3636 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3637 {
3638 int shard_id = arg->manager_shard_id;
3639 {
3640 Mutex::Locker l(*locks[shard_id]);
3641
3642 auto& comps = completions[shard_id];
3643
3644 auto iter = comps.find(arg);
3645 if (iter == comps.end()) {
3646 return true;
3647 }
3648
3649 comps.erase(iter);
3650 }
3651
3652 int r = rados_aio_get_return_value(cb);
3653 if (r != -ERR_BUSY_RESHARDING) {
3654 return true;
3655 }
3656 completion_thread->add_completion(arg);
3657 return false;
3658 }
3659
3660 void RGWRados::finalize()
3661 {
3662 if (run_sync_thread) {
3663 Mutex::Locker l(meta_sync_thread_lock);
3664 meta_sync_processor_thread->stop();
3665
3666 Mutex::Locker dl(data_sync_thread_lock);
3667 for (auto iter : data_sync_processor_threads) {
3668 RGWDataSyncProcessorThread *thread = iter.second;
3669 thread->stop();
3670 }
3671 if (sync_log_trimmer) {
3672 sync_log_trimmer->stop();
3673 }
3674 }
3675 if (async_rados) {
3676 async_rados->stop();
3677 }
3678 if (run_sync_thread) {
3679 delete meta_sync_processor_thread;
3680 meta_sync_processor_thread = NULL;
3681 Mutex::Locker dl(data_sync_thread_lock);
3682 for (auto iter : data_sync_processor_threads) {
3683 RGWDataSyncProcessorThread *thread = iter.second;
3684 delete thread;
3685 }
3686 data_sync_processor_threads.clear();
3687 delete sync_log_trimmer;
3688 sync_log_trimmer = nullptr;
3689 }
3690 if (finisher) {
3691 finisher->stop();
3692 }
3693 if (need_watch_notify()) {
3694 finalize_watch();
3695 }
3696 if (finisher) {
3697 /* delete finisher only after cleaning up watches, as watch error path might call
3698 * into finisher. We stop finisher before finalizing watch to make sure we don't
3699 * actually handle any racing work
3700 */
3701 delete finisher;
3702 }
3703 if (meta_notifier) {
3704 meta_notifier->stop();
3705 delete meta_notifier;
3706 }
3707 if (data_notifier) {
3708 data_notifier->stop();
3709 delete data_notifier;
3710 }
3711 delete data_log;
3712 if (async_rados) {
3713 delete async_rados;
3714 }
3715
3716 delete lc;
3717 lc = NULL;
3718
3719 delete gc;
3720 gc = NULL;
3721
3722 delete obj_expirer;
3723 obj_expirer = NULL;
3724
3725 delete rest_master_conn;
3726
3727 map<string, RGWRESTConn *>::iterator iter;
3728 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3729 RGWRESTConn *conn = iter->second;
3730 delete conn;
3731 }
3732
3733 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3734 RGWRESTConn *conn = iter->second;
3735 delete conn;
3736 }
3737 RGWQuotaHandler::free_handler(quota_handler);
3738 if (cr_registry) {
3739 cr_registry->put();
3740 }
3741 delete meta_mgr;
3742 delete binfo_cache;
3743 delete obj_tombstone_cache;
3744 delete sync_modules_manager;
3745
3746 if (reshard_wait.get()) {
3747 reshard_wait->stop();
3748 reshard_wait.reset();
3749 }
3750
3751 if (run_reshard_thread) {
3752 reshard->stop_processor();
3753 }
3754 delete reshard;
3755 delete index_completion_manager;
3756 }
3757
3758 /**
3759 * Initialize the RADOS instance and prepare to do other ops
3760 * Returns 0 on success, -ERR# on failure.
3761 */
3762 int RGWRados::init_rados()
3763 {
3764 int ret = 0;
3765 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3766
3767 for (auto& r : handles) {
3768 ret = r.init_with_context(cct);
3769 if (ret < 0) {
3770 return ret;
3771 }
3772 ret = r.connect();
3773 if (ret < 0) {
3774 return ret;
3775 }
3776 }
3777
3778 sync_modules_manager = new RGWSyncModulesManager();
3779
3780 rgw_register_sync_modules(sync_modules_manager);
3781
3782 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3783 new RGWCoroutinesManagerRegistry(cct)};
3784 ret = crs->hook_to_admin_command("cr dump");
3785 if (ret < 0) {
3786 return ret;
3787 }
3788
3789 meta_mgr = new RGWMetadataManager(cct, this);
3790 data_log = new RGWDataChangesLog(cct, this);
3791 cr_registry = crs.release();
3792
3793 std::swap(handles, rados);
3794 return ret;
3795 }
3796
3797
3798 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3799 {
3800 map<string,string> metadata = meta;
3801 metadata["num_handles"] = stringify(rados.size());
3802 metadata["zonegroup_id"] = zonegroup.get_id();
3803 metadata["zonegroup_name"] = zonegroup.get_name();
3804 metadata["zone_name"] = zone_name();
3805 metadata["zone_id"] = zone_id();;
3806 string name = cct->_conf->name.get_id();
3807 if (name.find("rgw.") == 0) {
3808 name = name.substr(4);
3809 }
3810 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3811 if (ret < 0) {
3812 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3813 return ret;
3814 }
3815
3816 return 0;
3817 }
3818
3819 /**
3820 * Add new connection to connections map
3821 * @param zonegroup_conn_map map which new connection will be added to
3822 * @param zonegroup zonegroup which new connection will connect to
3823 * @param new_connection pointer to new connection instance
3824 */
3825 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3826 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3827 {
3828 // Delete if connection is already exists
3829 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3830 if (iterZoneGroup != zonegroup_conn_map.end()) {
3831 delete iterZoneGroup->second;
3832 }
3833
3834 // Add new connection to connections map
3835 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3836 }
3837
3838 int RGWRados::convert_regionmap()
3839 {
3840 RGWZoneGroupMap zonegroupmap;
3841
3842 string pool_name = cct->_conf->rgw_zone_root_pool;
3843 if (pool_name.empty()) {
3844 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3845 }
3846 string oid = region_map_oid;
3847
3848 rgw_pool pool(pool_name);
3849 bufferlist bl;
3850 RGWObjectCtx obj_ctx(this);
3851 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3852 if (ret < 0 && ret != -ENOENT) {
3853 return ret;
3854 } else if (ret == -ENOENT) {
3855 return 0;
3856 }
3857
3858 try {
3859 bufferlist::iterator iter = bl.begin();
3860 ::decode(zonegroupmap, iter);
3861 } catch (buffer::error& err) {
3862 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3863 return -EIO;
3864 }
3865
3866 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3867 iter != zonegroupmap.zonegroups.end(); ++iter) {
3868 RGWZoneGroup& zonegroup = iter->second;
3869 ret = zonegroup.init(cct, this, false);
3870 ret = zonegroup.update();
3871 if (ret < 0 && ret != -ENOENT) {
3872 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3873 cpp_strerror(-ret) << dendl;
3874 return ret;
3875 } else if (ret == -ENOENT) {
3876 ret = zonegroup.create();
3877 if (ret < 0) {
3878 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3879 cpp_strerror(-ret) << dendl;
3880 return ret;
3881 }
3882 }
3883 }
3884
3885 current_period.set_user_quota(zonegroupmap.user_quota);
3886 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3887
3888 // remove the region_map so we don't try to convert again
3889 rgw_raw_obj obj(pool, oid);
3890 ret = delete_system_obj(obj);
3891 if (ret < 0) {
3892 ldout(cct, 0) << "Error could not remove " << obj
3893 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3894 return ret;
3895 }
3896
3897 return 0;
3898 }
3899
3900 /**
3901 * Replace all region configuration with zonegroup for
3902 * backward compatability
3903 * Returns 0 on success, -ERR# on failure.
3904 */
3905 int RGWRados::replace_region_with_zonegroup()
3906 {
3907 /* copy default region */
3908 /* convert default region to default zonegroup */
3909 string default_oid = cct->_conf->rgw_default_region_info_oid;
3910 if (default_oid.empty()) {
3911 default_oid = default_region_info_oid;
3912 }
3913
3914
3915 RGWZoneGroup default_zonegroup;
3916 rgw_pool pool{default_zonegroup.get_pool(cct)};
3917 string oid = "converted";
3918 bufferlist bl;
3919 RGWObjectCtx obj_ctx(this);
3920
3921 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3922 if (ret < 0 && ret != -ENOENT) {
3923 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3924 << dendl;
3925 return ret;
3926 } else if (ret != -ENOENT) {
3927 ldout(cct, 20) << "System already converted " << dendl;
3928 return 0;
3929 }
3930
3931 string default_region;
3932 ret = default_zonegroup.init(cct, this, false, true);
3933 if (ret < 0) {
3934 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3935 return ret;
3936 }
3937 ret = default_zonegroup.read_default_id(default_region, true);
3938 if (ret < 0 && ret != -ENOENT) {
3939 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3940 return ret;
3941 }
3942
3943 /* convert regions to zonegroups */
3944 list<string> regions;
3945 ret = list_regions(regions);
3946 if (ret < 0 && ret != -ENOENT) {
3947 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3948 return ret;
3949 } else if (ret == -ENOENT || regions.empty()) {
3950 RGWZoneParams zoneparams(default_zone_name);
3951 int ret = zoneparams.init(cct, this);
3952 if (ret < 0 && ret != -ENOENT) {
3953 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
3954 return ret;
3955 }
3956 /* update master zone */
3957 RGWZoneGroup default_zg(default_zonegroup_name);
3958 ret = default_zg.init(cct, this);
3959 if (ret < 0 && ret != -ENOENT) {
3960 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
3961 return ret;
3962 }
3963 if (ret != -ENOENT && default_zg.master_zone.empty()) {
3964 default_zg.master_zone = zoneparams.get_id();
3965 return default_zg.update();
3966 }
3967 return 0;
3968 }
3969
3970 string master_region, master_zone;
3971 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
3972 if (*iter != default_zonegroup_name){
3973 RGWZoneGroup region(*iter);
3974 int ret = region.init(cct, this, true, true);
3975 if (ret < 0) {
3976 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
3977 return ret;
3978 }
3979 if (region.is_master_zonegroup()) {
3980 master_region = region.get_id();
3981 master_zone = region.master_zone;
3982 }
3983 }
3984 }
3985
3986 /* create realm if there is none.
3987 The realm name will be the region and zone concatenated
3988 realm id will be mds of its name */
3989 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
3990 string new_realm_name = master_region + "." + master_zone;
3991 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
3992 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
3993 MD5 hash;
3994 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
3995 hash.Final(md5);
3996 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
3997 string new_realm_id(md5_str);
3998 RGWRealm new_realm(new_realm_id,new_realm_name);
3999 ret = new_realm.init(cct, this, false);
4000 if (ret < 0) {
4001 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4002 return ret;
4003 }
4004 ret = new_realm.create();
4005 if (ret < 0 && ret != -EEXIST) {
4006 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4007 return ret;
4008 }
4009 ret = new_realm.set_as_default();
4010 if (ret < 0) {
4011 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4012 return ret;
4013 }
4014 ret = realm.init(cct, this);
4015 if (ret < 0) {
4016 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4017 return ret;
4018 }
4019 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4020 if (ret < 0) {
4021 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4022 return ret;
4023 }
4024 }
4025
4026 list<string>::iterator iter;
4027 /* create zonegroups */
4028 for (iter = regions.begin(); iter != regions.end(); ++iter)
4029 {
4030 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4031 /* check to see if we don't have already a zonegroup with this name */
4032 RGWZoneGroup new_zonegroup(*iter);
4033 ret = new_zonegroup.init(cct , this);
4034 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4035 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4036 " skipping conversion " << dendl;
4037 continue;
4038 }
4039 RGWZoneGroup zonegroup(*iter);
4040 zonegroup.set_id(*iter);
4041 int ret = zonegroup.init(cct, this, true, true);
4042 if (ret < 0) {
4043 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4044 return ret;
4045 }
4046 zonegroup.realm_id = realm.get_id();
4047 /* fix default region master zone */
4048 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4049 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4050 zonegroup.master_zone = default_zone_name;
4051 }
4052 ret = zonegroup.update();
4053 if (ret < 0 && ret != -EEXIST) {
4054 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4055 << dendl;
4056 return ret;
4057 }
4058 ret = zonegroup.update_name();
4059 if (ret < 0 && ret != -EEXIST) {
4060 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4061 << dendl;
4062 return ret;
4063 }
4064 if (zonegroup.get_name() == default_region) {
4065 ret = zonegroup.set_as_default();
4066 if (ret < 0) {
4067 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4068 << dendl;
4069 return ret;
4070 }
4071 }
4072 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4073 ++iter) {
4074 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4075 RGWZoneParams zoneparams(iter->first, iter->first);
4076 zoneparams.set_id(iter->first);
4077 zoneparams.realm_id = realm.get_id();
4078 ret = zoneparams.init(cct, this);
4079 if (ret < 0 && ret != -ENOENT) {
4080 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4081 return ret;
4082 } else if (ret == -ENOENT) {
4083 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4084 continue;
4085 }
4086 zonegroup.realm_id = realm.get_id();
4087 ret = zoneparams.update();
4088 if (ret < 0 && ret != -EEXIST) {
4089 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4090 return ret;
4091 }
4092 ret = zoneparams.update_name();
4093 if (ret < 0 && ret != -EEXIST) {
4094 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4095 return ret;
4096 }
4097 }
4098
4099 if (!current_period.get_id().empty()) {
4100 ret = current_period.add_zonegroup(zonegroup);
4101 if (ret < 0) {
4102 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4103 return ret;
4104 }
4105 }
4106 }
4107
4108 if (!current_period.get_id().empty()) {
4109 ret = current_period.update();
4110 if (ret < 0) {
4111 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4112 return ret;
4113 }
4114 ret = current_period.store_info(false);
4115 if (ret < 0) {
4116 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4117 return ret;
4118 }
4119 ret = current_period.reflect();
4120 if (ret < 0) {
4121 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4122 return ret;
4123 }
4124 }
4125
4126 for (auto const& iter : regions) {
4127 RGWZoneGroup zonegroup(iter);
4128 int ret = zonegroup.init(cct, this, true, true);
4129 if (ret < 0) {
4130 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4131 return ret;
4132 }
4133 ret = zonegroup.delete_obj(true);
4134 if (ret < 0 && ret != -ENOENT) {
4135 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4136 << dendl;
4137 return ret;
4138 }
4139 }
4140
4141 /* mark as converted */
4142 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4143 true, NULL, real_time(), NULL);
4144 if (ret < 0 ) {
4145 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4146 << dendl;
4147 return ret;
4148 }
4149
4150 return 0;
4151 }
4152
4153 int RGWRados::init_zg_from_period(bool *initialized)
4154 {
4155 *initialized = false;
4156
4157 if (current_period.get_id().empty()) {
4158 return 0;
4159 }
4160
4161 int ret = zonegroup.init(cct, this);
4162 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4163 if (ret == -ENOENT) {
4164 return 0;
4165 }
4166 if (ret < 0) {
4167 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4168 return ret;
4169 }
4170 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4171
4172 map<string, RGWZoneGroup>::const_iterator iter =
4173 current_period.get_map().zonegroups.find(zonegroup.get_id());
4174
4175 if (iter != current_period.get_map().zonegroups.end()) {
4176 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4177 zonegroup = iter->second;
4178 ret = zonegroup.init(cct, this, false);
4179 if (ret < 0) {
4180 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4181 return ret;
4182 }
4183 ret = zone_params.init(cct, this);
4184 if (ret < 0 && ret != -ENOENT) {
4185 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4186 return ret;
4187 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4188 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4189 zone_params.set_name(default_zone_name);
4190 ret = zone_params.init(cct, this);
4191 if (ret < 0 && ret != -ENOENT) {
4192 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4193 return ret;
4194 }
4195 }
4196 }
4197 for (iter = current_period.get_map().zonegroups.begin();
4198 iter != current_period.get_map().zonegroups.end(); ++iter){
4199 const RGWZoneGroup& zg = iter->second;
4200 // use endpoints from the zonegroup's master zone
4201 auto master = zg.zones.find(zg.master_zone);
4202 if (master == zg.zones.end()) {
4203 // fix missing master zone for a single zone zonegroup
4204 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4205 master = zg.zones.begin();
4206 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4207 master->second.name << " id:" << master->second.id << " as master" << dendl;
4208 if (zonegroup.get_id() == zg.get_id()) {
4209 zonegroup.master_zone = master->second.id;
4210 ret = zonegroup.update();
4211 if (ret < 0) {
4212 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4213 return ret;
4214 }
4215 } else {
4216 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4217 ret = fixed_zg.init(cct, this);
4218 if (ret < 0) {
4219 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4220 return ret;
4221 }
4222 fixed_zg.master_zone = master->second.id;
4223 ret = fixed_zg.update();
4224 if (ret < 0) {
4225 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4226 return ret;
4227 }
4228 }
4229 } else {
4230 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4231 zg.master_zone << dendl;
4232 return -EINVAL;
4233 }
4234 }
4235 const auto& endpoints = master->second.endpoints;
4236 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4237 if (!current_period.get_master_zonegroup().empty() &&
4238 zg.get_id() == current_period.get_master_zonegroup()) {
4239 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4240 }
4241 }
4242
4243 *initialized = true;
4244
4245 return 0;
4246 }
4247
4248 int RGWRados::init_zg_from_local(bool *creating_defaults)
4249 {
4250 int ret = zonegroup.init(cct, this);
4251 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4252 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4253 return ret;
4254 } else if (ret == -ENOENT) {
4255 *creating_defaults = true;
4256 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4257 ret = zonegroup.create_default();
4258 if (ret < 0) {
4259 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4260 << dendl;
4261 return ret;
4262 }
4263 ret = zonegroup.init(cct, this);
4264 if (ret < 0) {
4265 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4266 << dendl;
4267 return ret;
4268 }
4269 }
4270 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4271 if (zonegroup.is_master_zonegroup()) {
4272 // use endpoints from the zonegroup's master zone
4273 auto master = zonegroup.zones.find(zonegroup.master_zone);
4274 if (master == zonegroup.zones.end()) {
4275 // fix missing master zone for a single zone zonegroup
4276 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4277 master = zonegroup.zones.begin();
4278 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4279 master->second.name << " id:" << master->second.id << " as master" << dendl;
4280 zonegroup.master_zone = master->second.id;
4281 ret = zonegroup.update();
4282 if (ret < 0) {
4283 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4284 return ret;
4285 }
4286 } else {
4287 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4288 "master_zone=" << zonegroup.master_zone << dendl;
4289 return -EINVAL;
4290 }
4291 }
4292 const auto& endpoints = master->second.endpoints;
4293 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4294 }
4295
4296 return 0;
4297 }
4298
4299
4300 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4301 {
4302 return target_zone.syncs_from(source_zone.name) &&
4303 sync_modules_manager->supports_data_export(source_zone.tier_type);
4304 }
4305
4306 /**
4307 * Initialize the RADOS instance and prepare to do other ops
4308 * Returns 0 on success, -ERR# on failure.
4309 */
4310 int RGWRados::init_complete()
4311 {
4312 int ret = realm.init(cct, this);
4313 if (ret < 0 && ret != -ENOENT) {
4314 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4315 return ret;
4316 } else if (ret != -ENOENT) {
4317 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4318 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4319 if (ret < 0 && ret != -ENOENT) {
4320 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4321 return ret;
4322 }
4323 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4324 }
4325
4326 ret = replace_region_with_zonegroup();
4327 if (ret < 0) {
4328 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4329 return ret;
4330 }
4331
4332 ret = convert_regionmap();
4333 if (ret < 0) {
4334 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4335 return ret;
4336 }
4337
4338 bool zg_initialized = false;
4339
4340 if (!current_period.get_id().empty()) {
4341 ret = init_zg_from_period(&zg_initialized);
4342 if (ret < 0) {
4343 return ret;
4344 }
4345 }
4346
4347 bool creating_defaults = false;
4348 bool using_local = (!zg_initialized);
4349 if (using_local) {
4350 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4351 ret = init_zg_from_local(&creating_defaults);
4352 if (ret < 0) {
4353 return ret;
4354 }
4355 // read period_config into current_period
4356 auto& period_config = current_period.get_config();
4357 ret = period_config.read(this, zonegroup.realm_id);
4358 if (ret < 0 && ret != -ENOENT) {
4359 ldout(cct, 0) << "ERROR: failed to read period config: "
4360 << cpp_strerror(ret) << dendl;
4361 return ret;
4362 }
4363 }
4364
4365 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4366 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4367 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4368 zone_params.set_name(default_zone_name);
4369 }
4370
4371 ret = zone_params.init(cct, this);
4372 if (ret < 0 && ret != -ENOENT) {
4373 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4374 return ret;
4375 }
4376 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4377 if (zone_iter == get_zonegroup().zones.end()) {
4378 if (using_local) {
4379 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4380 return -EINVAL;
4381 }
4382 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4383 ret = init_zg_from_local(&creating_defaults);
4384 if (ret < 0) {
4385 return ret;
4386 }
4387 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4388 }
4389 if (zone_iter != get_zonegroup().zones.end()) {
4390 zone_public_config = zone_iter->second;
4391 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4392 } else {
4393 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4394 return -EINVAL;
4395 }
4396
4397 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4398
4399 if (run_sync_thread) {
4400 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4401 if (ret < 0) {
4402 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4403 return ret;
4404 }
4405 }
4406
4407 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4408
4409 init_unique_trans_id_deps();
4410
4411 finisher = new Finisher(cct);
4412 finisher->start();
4413
4414 period_puller.reset(new RGWPeriodPuller(this));
4415 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4416 current_period));
4417
4418 if (need_watch_notify()) {
4419 ret = init_watch();
4420 if (ret < 0) {
4421 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4422 return ret;
4423 }
4424 }
4425
4426 /* first build all zones index */
4427 for (auto ziter : get_zonegroup().zones) {
4428 const string& id = ziter.first;
4429 RGWZone& z = ziter.second;
4430 zone_id_by_name[z.name] = id;
4431 zone_by_id[id] = z;
4432 }
4433
4434 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4435 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4436 }
4437 zone_public_config = zone_by_id[zone_id()];
4438 for (auto ziter : get_zonegroup().zones) {
4439 const string& id = ziter.first;
4440 RGWZone& z = ziter.second;
4441 if (id == zone_id()) {
4442 continue;
4443 }
4444 if (z.endpoints.empty()) {
4445 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4446 continue;
4447 }
4448 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4449 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4450 zone_conn_map[id] = conn;
4451 if (zone_syncs_from(zone_public_config, z) ||
4452 zone_syncs_from(z, zone_public_config)) {
4453 if (zone_syncs_from(zone_public_config, z)) {
4454 zone_data_sync_from_map[id] = conn;
4455 }
4456 if (zone_syncs_from(z, zone_public_config)) {
4457 zone_data_notify_to_map[id] = conn;
4458 }
4459 } else {
4460 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4461 }
4462 }
4463
4464 ret = open_root_pool_ctx();
4465 if (ret < 0)
4466 return ret;
4467
4468 ret = open_gc_pool_ctx();
4469 if (ret < 0)
4470 return ret;
4471
4472 ret = open_lc_pool_ctx();
4473 if (ret < 0)
4474 return ret;
4475
4476 ret = open_objexp_pool_ctx();
4477 if (ret < 0)
4478 return ret;
4479
4480 ret = open_reshard_pool_ctx();
4481 if (ret < 0)
4482 return ret;
4483
4484 pools_initialized = true;
4485
4486 gc = new RGWGC();
4487 gc->initialize(cct, this);
4488
4489 obj_expirer = new RGWObjectExpirer(this);
4490
4491 if (use_gc_thread) {
4492 gc->start_processor();
4493 obj_expirer->start_processor();
4494 }
4495
4496 if (run_sync_thread) {
4497 // initialize the log period history. we want to do this any time we're not
4498 // running under radosgw-admin, so we check run_sync_thread here before
4499 // disabling it based on the zone/zonegroup setup
4500 meta_mgr->init_oldest_log_period();
4501 }
4502
4503 /* no point of running sync thread if we don't have a master zone configured
4504 or there is no rest_master_conn */
4505 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4506 || current_period.get_id().empty()) {
4507 run_sync_thread = false;
4508 }
4509
4510 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4511 async_rados->start();
4512
4513 ret = meta_mgr->init(current_period.get_id());
4514 if (ret < 0) {
4515 lderr(cct) << "ERROR: failed to initialize metadata log: "
4516 << cpp_strerror(-ret) << dendl;
4517 return ret;
4518 }
4519
4520 if (is_meta_master()) {
4521 auto md_log = meta_mgr->get_log(current_period.get_id());
4522 meta_notifier = new RGWMetaNotifier(this, md_log);
4523 meta_notifier->start();
4524 }
4525
4526 if (run_sync_thread) {
4527 Mutex::Locker l(meta_sync_thread_lock);
4528 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4529 ret = meta_sync_processor_thread->init();
4530 if (ret < 0) {
4531 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4532 return ret;
4533 }
4534 meta_sync_processor_thread->start();
4535
4536 Mutex::Locker dl(data_sync_thread_lock);
4537 for (auto iter : zone_data_sync_from_map) {
4538 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4539 RGWDataSyncProcessorThread *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
4540 ret = thread->init();
4541 if (ret < 0) {
4542 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4543 return ret;
4544 }
4545 thread->start();
4546 data_sync_processor_threads[iter.first] = thread;
4547 }
4548 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4549 if (interval > 0) {
4550 sync_log_trimmer = new RGWSyncLogTrimThread(this, interval);
4551 ret = sync_log_trimmer->init();
4552 if (ret < 0) {
4553 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4554 return ret;
4555 }
4556 sync_log_trimmer->start();
4557 }
4558 }
4559 data_notifier = new RGWDataNotifier(this);
4560 data_notifier->start();
4561
4562 lc = new RGWLC();
4563 lc->initialize(cct, this);
4564
4565 if (use_lc_thread)
4566 lc->start_processor();
4567
4568 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4569
4570 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4571 get_zone().bucket_index_max_shards);
4572 if (bucket_index_max_shards > get_max_bucket_shards()) {
4573 bucket_index_max_shards = get_max_bucket_shards();
4574 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4575 << get_max_bucket_shards() << dendl;
4576 }
4577 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4578
4579 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4580 binfo_cache->init(this);
4581
4582 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4583
4584 if (need_tombstone_cache) {
4585 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4586 }
4587
4588 reshard_wait = std::make_shared<RGWReshardWait>(this);
4589
4590 reshard = new RGWReshard(this);
4591
4592 /* only the master zone in the zonegroup reshards buckets */
4593 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4594 if (run_reshard_thread) {
4595 reshard->start_processor();
4596 }
4597
4598 index_completion_manager = new RGWIndexCompletionManager(this);
4599 ret = index_completion_manager->start();
4600
4601 return ret;
4602 }
4603
4604 /**
4605 * Initialize the RADOS instance and prepare to do other ops
4606 * Returns 0 on success, -ERR# on failure.
4607 */
4608 int RGWRados::initialize()
4609 {
4610 int ret;
4611
4612 ret = init_rados();
4613 if (ret < 0)
4614 return ret;
4615
4616 return init_complete();
4617 }
4618
4619 void RGWRados::finalize_watch()
4620 {
4621 for (int i = 0; i < num_watchers; i++) {
4622 RGWWatcher *watcher = watchers[i];
4623 watcher->unregister_watch();
4624 delete watcher;
4625 }
4626
4627 delete[] notify_oids;
4628 delete[] watchers;
4629 }
4630
4631 void RGWRados::schedule_context(Context *c) {
4632 finisher->queue(c);
4633 }
4634
4635 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4636 {
4637 bool is_truncated;
4638 RGWListRawObjsCtx ctx;
4639 do {
4640 list<string> oids;
4641 int r = list_raw_objects(pool, prefix, 1000,
4642 ctx, oids, &is_truncated);
4643 if (r < 0) {
4644 return r;
4645 }
4646 list<string>::iterator iter;
4647 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4648 string& val = *iter;
4649 if (val.size() > prefix.size())
4650 result.push_back(val.substr(prefix.size()));
4651 }
4652 } while (is_truncated);
4653
4654 return 0;
4655 }
4656
4657 int RGWRados::list_regions(list<string>& regions)
4658 {
4659 RGWZoneGroup zonegroup;
4660
4661 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4662 }
4663
4664 int RGWRados::list_zonegroups(list<string>& zonegroups)
4665 {
4666 RGWZoneGroup zonegroup;
4667
4668 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4669 }
4670
4671 int RGWRados::list_zones(list<string>& zones)
4672 {
4673 RGWZoneParams zoneparams;
4674
4675 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4676 }
4677
4678 int RGWRados::list_realms(list<string>& realms)
4679 {
4680 RGWRealm realm(cct, this);
4681 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4682 }
4683
4684 int RGWRados::list_periods(list<string>& periods)
4685 {
4686 RGWPeriod period;
4687 list<string> raw_periods;
4688 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4689 if (ret < 0) {
4690 return ret;
4691 }
4692 for (const auto& oid : raw_periods) {
4693 size_t pos = oid.find(".");
4694 if (pos != std::string::npos) {
4695 periods.push_back(oid.substr(0, pos));
4696 } else {
4697 periods.push_back(oid);
4698 }
4699 }
4700 periods.sort(); // unique() only detects duplicates if they're adjacent
4701 periods.unique();
4702 return 0;
4703 }
4704
4705
4706 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4707 {
4708 int ret = 0;
4709 string period_id = current_period;
4710 while(!period_id.empty()) {
4711 RGWPeriod period(period_id);
4712 ret = period.init(cct, this);
4713 if (ret < 0) {
4714 return ret;
4715 }
4716 periods.push_back(period.get_id());
4717 period_id = period.get_predecessor();
4718 }
4719
4720 return ret;
4721 }
4722
4723 /**
4724 * Open the pool used as root for this gateway
4725 * Returns: 0 on success, -ERR# otherwise.
4726 */
4727 int RGWRados::open_root_pool_ctx()
4728 {
4729 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4730 }
4731
4732 int RGWRados::open_gc_pool_ctx()
4733 {
4734 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4735 }
4736
4737 int RGWRados::open_lc_pool_ctx()
4738 {
4739 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4740 }
4741
4742 int RGWRados::open_objexp_pool_ctx()
4743 {
4744 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4745 }
4746
4747 int RGWRados::open_reshard_pool_ctx()
4748 {
4749 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4750 }
4751
4752 int RGWRados::init_watch()
4753 {
4754 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4755 if (r < 0) {
4756 return r;
4757 }
4758
4759 num_watchers = cct->_conf->rgw_num_control_oids;
4760
4761 bool compat_oid = (num_watchers == 0);
4762
4763 if (num_watchers <= 0)
4764 num_watchers = 1;
4765
4766 notify_oids = new string[num_watchers];
4767 watchers = new RGWWatcher *[num_watchers];
4768
4769 for (int i=0; i < num_watchers; i++) {
4770 string& notify_oid = notify_oids[i];
4771 notify_oid = notify_oid_prefix;
4772 if (!compat_oid) {
4773 char buf[16];
4774 snprintf(buf, sizeof(buf), ".%d", i);
4775 notify_oid.append(buf);
4776 }
4777 r = control_pool_ctx.create(notify_oid, false);
4778 if (r < 0 && r != -EEXIST)
4779 return r;
4780
4781 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4782 watchers[i] = watcher;
4783
4784 r = watcher->register_watch();
4785 if (r < 0)
4786 return r;
4787 }
4788
4789 watch_initialized = true;
4790
4791 set_cache_enabled(true);
4792
4793 return 0;
4794 }
4795
4796 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4797 {
4798 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4799
4800 int i = r % num_watchers;
4801 char buf[16];
4802 snprintf(buf, sizeof(buf), ".%d", i);
4803
4804 notify_oid = notify_oid_prefix;
4805 notify_oid.append(buf);
4806 }
4807
4808 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4809 {
4810 librados::Rados *rad = get_rados_handle();
4811 int r = rgw_init_ioctx(rad, pool, io_ctx);
4812 if (r != -ENOENT)
4813 return r;
4814
4815 if (!pools_initialized)
4816 return r;
4817
4818 r = rad->pool_create(pool.name.c_str());
4819 if (r < 0 && r != -EEXIST)
4820 return r;
4821
4822 r = rgw_init_ioctx(rad, pool, io_ctx);
4823 if (r < 0)
4824 return r;
4825
4826 r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
4827 if (r < 0 && r != -EOPNOTSUPP)
4828 return r;
4829 return 0;
4830 }
4831
4832 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4833 string *marker) {
4834 if (marker) {
4835 *marker = shard_id_str;
4836 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4837 marker->append(shard_marker);
4838 }
4839 }
4840
4841 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4842 {
4843 const string *rule = &bucket_info.placement_rule;
4844 if (rule->empty()) {
4845 rule = &zonegroup.default_placement;
4846 }
4847 auto iter = zone_params.placement_pools.find(*rule);
4848 if (iter == zone_params.placement_pools.end()) {
4849 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4850 return -EINVAL;
4851 }
4852
4853 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4854 if (r < 0)
4855 return r;
4856
4857 return 0;
4858 }
4859
4860 /**
4861 * set up a bucket listing.
4862 * handle is filled in.
4863 * Returns 0 on success, -ERR# otherwise.
4864 */
4865 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4866 {
4867 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4868 *handle = (RGWAccessHandle)state;
4869 return 0;
4870 }
4871
4872 /**
4873 * get the next bucket in the listing.
4874 * obj is filled in,
4875 * handle is updated.
4876 * returns 0 on success, -ERR# otherwise.
4877 */
4878 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4879 {
4880 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4881
4882 do {
4883 if (*state == root_pool_ctx.nobjects_end()) {
4884 delete state;
4885 return -ENOENT;
4886 }
4887
4888 obj.key.name = (*state)->get_oid();
4889 if (obj.key.name[0] == '_') {
4890 obj.key.name = obj.key.name.substr(1);
4891 }
4892
4893 (*state)++;
4894 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4895
4896 return 0;
4897 }
4898
4899
4900 /**** logs ****/
4901
4902 struct log_list_state {
4903 string prefix;
4904 librados::IoCtx io_ctx;
4905 librados::NObjectIterator obit;
4906 };
4907
4908 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4909 {
4910 log_list_state *state = new log_list_state;
4911 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4912 if (r < 0) {
4913 delete state;
4914 return r;
4915 }
4916 state->prefix = prefix;
4917 state->obit = state->io_ctx.nobjects_begin();
4918 *handle = (RGWAccessHandle)state;
4919 return 0;
4920 }
4921
4922 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4923 {
4924 log_list_state *state = static_cast<log_list_state *>(handle);
4925 while (true) {
4926 if (state->obit == state->io_ctx.nobjects_end()) {
4927 delete state;
4928 return -ENOENT;
4929 }
4930 if (state->prefix.length() &&
4931 state->obit->get_oid().find(state->prefix) != 0) {
4932 state->obit++;
4933 continue;
4934 }
4935 *name = state->obit->get_oid();
4936 state->obit++;
4937 break;
4938 }
4939 return 0;
4940 }
4941
4942 int RGWRados::log_remove(const string& name)
4943 {
4944 librados::IoCtx io_ctx;
4945 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4946 if (r < 0)
4947 return r;
4948 return io_ctx.remove(name);
4949 }
4950
4951 struct log_show_state {
4952 librados::IoCtx io_ctx;
4953 bufferlist bl;
4954 bufferlist::iterator p;
4955 string name;
4956 uint64_t pos;
4957 bool eof;
4958 log_show_state() : pos(0), eof(false) {}
4959 };
4960
4961 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
4962 {
4963 log_show_state *state = new log_show_state;
4964 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4965 if (r < 0) {
4966 delete state;
4967 return r;
4968 }
4969 state->name = name;
4970 *handle = (RGWAccessHandle)state;
4971 return 0;
4972 }
4973
4974 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
4975 {
4976 log_show_state *state = static_cast<log_show_state *>(handle);
4977 off_t off = state->p.get_off();
4978
4979 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
4980 << " off " << off
4981 << " eof " << (int)state->eof
4982 << dendl;
4983 // read some?
4984 unsigned chunk = 1024*1024;
4985 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
4986 bufferlist more;
4987 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
4988 if (r < 0)
4989 return r;
4990 state->pos += r;
4991 bufferlist old;
4992 try {
4993 old.substr_of(state->bl, off, state->bl.length() - off);
4994 } catch (buffer::error& err) {
4995 return -EINVAL;
4996 }
4997 state->bl.clear();
4998 state->bl.claim(old);
4999 state->bl.claim_append(more);
5000 state->p = state->bl.begin();
5001 if ((unsigned)r < chunk)
5002 state->eof = true;
5003 ldout(cct, 10) << " read " << r << dendl;
5004 }
5005
5006 if (state->p.end())
5007 return 0; // end of file
5008 try {
5009 ::decode(*entry, state->p);
5010 }
5011 catch (const buffer::error &e) {
5012 return -EINVAL;
5013 }
5014 return 1;
5015 }
5016
5017 /**
5018 * usage_log_hash: get usage log key hash, based on name and index
5019 *
5020 * Get the usage object name. Since a user may have more than 1
5021 * object holding that info (multiple shards), we use index to
5022 * specify that shard number. Once index exceeds max shards it
5023 * wraps.
5024 * If name is not being set, results for all users will be returned
5025 * and index will wrap only after total shards number.
5026 *
5027 * @param cct [in] ceph context
5028 * @param name [in] user name
5029 * @param hash [out] hash value
5030 * @param index [in] shard index number
5031 */
5032 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5033 {
5034 uint32_t val = index;
5035
5036 if (!name.empty()) {
5037 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
5038 val %= max_user_shards;
5039 val += ceph_str_hash_linux(name.c_str(), name.size());
5040 }
5041 char buf[17];
5042 int max_shards = cct->_conf->rgw_usage_max_shards;
5043 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5044 hash = buf;
5045 }
5046
5047 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5048 {
5049 uint32_t index = 0;
5050
5051 map<string, rgw_usage_log_info> log_objs;
5052
5053 string hash;
5054 string last_user;
5055
5056 /* restructure usage map, zone by object hash */
5057 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5058 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5059 const rgw_user_bucket& ub = iter->first;
5060 RGWUsageBatch& info = iter->second;
5061
5062 if (ub.user.empty()) {
5063 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5064 continue;
5065 }
5066
5067 if (ub.user != last_user) {
5068 /* index *should* be random, but why waste extra cycles
5069 in most cases max user shards is not going to exceed 1,
5070 so just incrementing it */
5071 usage_log_hash(cct, ub.user, hash, index++);
5072 }
5073 last_user = ub.user;
5074 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5075
5076 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5077 v.push_back(miter->second);
5078 }
5079 }
5080
5081 map<string, rgw_usage_log_info>::iterator liter;
5082
5083 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5084 int r = cls_obj_usage_log_add(liter->first, liter->second);
5085 if (r < 0)
5086 return r;
5087 }
5088 return 0;
5089 }
5090
5091 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5092 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5093 {
5094 uint32_t num = max_entries;
5095 string hash, first_hash;
5096 string user_str = user.to_str();
5097 usage_log_hash(cct, user_str, first_hash, 0);
5098
5099 if (usage_iter.index) {
5100 usage_log_hash(cct, user_str, hash, usage_iter.index);
5101 } else {
5102 hash = first_hash;
5103 }
5104
5105 usage.clear();
5106
5107 do {
5108 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5109 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5110
5111 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5112 usage_iter.read_iter, ret_usage, is_truncated);
5113 if (ret == -ENOENT)
5114 goto next;
5115
5116 if (ret < 0)
5117 return ret;
5118
5119 num -= ret_usage.size();
5120
5121 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5122 usage[iter->first].aggregate(iter->second);
5123 }
5124
5125 next:
5126 if (!*is_truncated) {
5127 usage_iter.read_iter.clear();
5128 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5129 }
5130 } while (num && !*is_truncated && hash != first_hash);
5131 return 0;
5132 }
5133
5134 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5135 {
5136 uint32_t index = 0;
5137 string hash, first_hash;
5138 string user_str = user.to_str();
5139 usage_log_hash(cct, user_str, first_hash, index);
5140
5141 hash = first_hash;
5142
5143 do {
5144 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5145 if (ret == -ENOENT)
5146 goto next;
5147
5148 if (ret < 0)
5149 return ret;
5150
5151 next:
5152 usage_log_hash(cct, user_str, hash, ++index);
5153 } while (hash != first_hash);
5154
5155 return 0;
5156 }
5157
5158 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5159 {
5160 return rgw_shards_hash(key, max_shards);
5161 }
5162
5163 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5164 {
5165 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5166 char buf[16];
5167 if (shard_id) {
5168 *shard_id = val % max_shards;
5169 }
5170 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5171 name = prefix + buf;
5172 }
5173
5174 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5175 {
5176 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5177 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5178 char buf[16];
5179 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5180 name = prefix + buf;
5181 }
5182
5183 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5184 {
5185 char buf[16];
5186 snprintf(buf, sizeof(buf), "%u", shard_id);
5187 name = prefix + buf;
5188
5189 }
5190
5191 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5192 {
5193 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5194 }
5195
5196 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5197 {
5198 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5199
5200 }
5201
5202 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5203 {
5204 librados::IoCtx io_ctx;
5205
5206 int r = time_log_add_init(io_ctx);
5207 if (r < 0) {
5208 return r;
5209 }
5210
5211 ObjectWriteOperation op;
5212 utime_t t(ut);
5213 cls_log_add(op, t, section, key, bl);
5214
5215 return io_ctx.operate(oid, &op);
5216 }
5217
5218 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5219 librados::AioCompletion *completion, bool monotonic_inc)
5220 {
5221 librados::IoCtx io_ctx;
5222
5223 int r = time_log_add_init(io_ctx);
5224 if (r < 0) {
5225 return r;
5226 }
5227
5228 ObjectWriteOperation op;
5229 cls_log_add(op, entries, monotonic_inc);
5230
5231 if (!completion) {
5232 r = io_ctx.operate(oid, &op);
5233 } else {
5234 r = io_ctx.aio_operate(oid, completion, &op);
5235 }
5236 return r;
5237 }
5238
5239 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5240 int max_entries, list<cls_log_entry>& entries,
5241 const string& marker,
5242 string *out_marker,
5243 bool *truncated)
5244 {
5245 librados::IoCtx io_ctx;
5246
5247 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5248 if (r < 0)
5249 return r;
5250 librados::ObjectReadOperation op;
5251
5252 utime_t st(start_time);
5253 utime_t et(end_time);
5254
5255 cls_log_list(op, st, et, marker, max_entries, entries,
5256 out_marker, truncated);
5257
5258 bufferlist obl;
5259
5260 int ret = io_ctx.operate(oid, &op, &obl);
5261 if (ret < 0)
5262 return ret;
5263
5264 return 0;
5265 }
5266
5267 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5268 {
5269 librados::IoCtx io_ctx;
5270
5271 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5272 if (r < 0)
5273 return r;
5274 librados::ObjectReadOperation op;
5275
5276 cls_log_info(op, header);
5277
5278 bufferlist obl;
5279
5280 int ret = io_ctx.operate(oid, &op, &obl);
5281 if (ret < 0)
5282 return ret;
5283
5284 return 0;
5285 }
5286
5287 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5288 {
5289 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5290 if (r < 0)
5291 return r;
5292
5293 librados::ObjectReadOperation op;
5294
5295 cls_log_info(op, header);
5296
5297 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5298 if (ret < 0)
5299 return ret;
5300
5301 return 0;
5302 }
5303
5304 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5305 const string& from_marker, const string& to_marker,
5306 librados::AioCompletion *completion)
5307 {
5308 librados::IoCtx io_ctx;
5309
5310 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5311 if (r < 0)
5312 return r;
5313
5314 utime_t st(start_time);
5315 utime_t et(end_time);
5316
5317 ObjectWriteOperation op;
5318 cls_log_trim(op, st, et, from_marker, to_marker);
5319
5320 if (!completion) {
5321 r = io_ctx.operate(oid, &op);
5322 } else {
5323 r = io_ctx.aio_operate(oid, completion, &op);
5324 }
5325 return r;
5326 }
5327
5328 string RGWRados::objexp_hint_get_shardname(int shard_num)
5329 {
5330 char buf[32];
5331 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5332
5333 string objname("obj_delete_at_hint.");
5334 return objname + buf;
5335 }
5336
5337 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5338 {
5339 string obj_key = key.name + key.instance;
5340 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5341 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
5342 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
5343 sid = rgw_shards_mod(sid2, num_shards);
5344 return sid;
5345 }
5346
5347 static string objexp_hint_get_keyext(const string& tenant_name,
5348 const string& bucket_name,
5349 const string& bucket_id,
5350 const rgw_obj_key& obj_key)
5351 {
5352 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5353 ":" + obj_key.name + ":" + obj_key.instance;
5354 }
5355
5356 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5357 const string& tenant_name,
5358 const string& bucket_name,
5359 const string& bucket_id,
5360 const rgw_obj_index_key& obj_key)
5361 {
5362 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5363 bucket_id, obj_key);
5364 objexp_hint_entry he = {
5365 .tenant = tenant_name,
5366 .bucket_name = bucket_name,
5367 .bucket_id = bucket_id,
5368 .obj_key = obj_key,
5369 .exp_time = delete_at };
5370 bufferlist hebl;
5371 ::encode(he, hebl);
5372 ObjectWriteOperation op;
5373 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5374
5375 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5376 return objexp_pool_ctx.operate(shard_name, &op);
5377 }
5378
5379 void RGWRados::objexp_get_shard(int shard_num,
5380 string& shard) /* out */
5381 {
5382 shard = objexp_hint_get_shardname(shard_num);
5383 }
5384
5385 int RGWRados::objexp_hint_list(const string& oid,
5386 const ceph::real_time& start_time,
5387 const ceph::real_time& end_time,
5388 const int max_entries,
5389 const string& marker,
5390 list<cls_timeindex_entry>& entries, /* out */
5391 string *out_marker, /* out */
5392 bool *truncated) /* out */
5393 {
5394 librados::ObjectReadOperation op;
5395 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5396 out_marker, truncated);
5397
5398 bufferlist obl;
5399 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5400
5401 if ((ret < 0 ) && (ret != -ENOENT)) {
5402 return ret;
5403 }
5404
5405 if ((ret == -ENOENT) && truncated) {
5406 *truncated = false;
5407 }
5408
5409 return 0;
5410 }
5411
5412 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5413 objexp_hint_entry& hint_entry) /* out */
5414 {
5415 try {
5416 bufferlist::iterator iter = ti_entry.value.begin();
5417 ::decode(hint_entry, iter);
5418 } catch (buffer::error& err) {
5419 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5420 }
5421
5422 return 0;
5423 }
5424
5425 int RGWRados::objexp_hint_trim(const string& oid,
5426 const ceph::real_time& start_time,
5427 const ceph::real_time& end_time,
5428 const string& from_marker,
5429 const string& to_marker)
5430 {
5431 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5432 from_marker, to_marker);
5433 if ((ret < 0 ) && (ret != -ENOENT)) {
5434 return ret;
5435 }
5436
5437 return 0;
5438 }
5439
5440 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5441 string& zone_id, string& owner_id) {
5442 librados::IoCtx io_ctx;
5443
5444 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5445 if (r < 0) {
5446 return r;
5447 }
5448 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5449 utime_t ut(msec / 1000, msec % 1000);
5450
5451 rados::cls::lock::Lock l(log_lock_name);
5452 l.set_duration(ut);
5453 l.set_cookie(owner_id);
5454 l.set_tag(zone_id);
5455 l.set_renew(true);
5456
5457 return l.lock_exclusive(&io_ctx, oid);
5458 }
5459
5460 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5461 librados::IoCtx io_ctx;
5462
5463 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5464 if (r < 0) {
5465 return r;
5466 }
5467
5468 rados::cls::lock::Lock l(log_lock_name);
5469 l.set_tag(zone_id);
5470 l.set_cookie(owner_id);
5471
5472 return l.unlock(&io_ctx, oid);
5473 }
5474
5475 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5476 {
5477 bufferlist::iterator i = bl.begin();
5478 RGWAccessControlPolicy policy(cct);
5479 try {
5480 policy.decode_owner(i);
5481 } catch (buffer::error& err) {
5482 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5483 return -EIO;
5484 }
5485 *owner = policy.get_owner();
5486 return 0;
5487 }
5488
5489 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5490 {
5491 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5492 if (aiter == attrset.end())
5493 return -EIO;
5494
5495 bufferlist& bl = aiter->second;
5496 bufferlist::iterator iter = bl.begin();
5497 try {
5498 policy->decode(iter);
5499 } catch (buffer::error& err) {
5500 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5501 return -EIO;
5502 }
5503 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5504 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5505 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5506 s3policy->to_xml(*_dout);
5507 *_dout << dendl;
5508 }
5509 return 0;
5510 }
5511
5512
5513 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5514 {
5515 rgw_bucket bucket = bucket_info.bucket;
5516 bucket.update_bucket_id(new_bucket_id);
5517
5518 RGWObjectCtx obj_ctx(store);
5519
5520 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5521 if (ret < 0) {
5522 return ret;
5523 }
5524
5525 return 0;
5526 }
5527
5528 /**
5529 * get listing of the objects in a bucket.
5530 *
5531 * max: maximum number of results to return
5532 * bucket: bucket to list contents of
5533 * prefix: only return results that match this prefix
5534 * delim: do not include results that match this string.
5535 * Any skipped results will have the matching portion of their name
5536 * inserted in common_prefixes with a "true" mark.
5537 * marker: if filled in, begin the listing with this object.
5538 * end_marker: if filled in, end the listing with this object.
5539 * result: the objects are put in here.
5540 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5541 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5542 */
5543 int RGWRados::Bucket::List::list_objects(int64_t max,
5544 vector<rgw_bucket_dir_entry> *result,
5545 map<string, bool> *common_prefixes,
5546 bool *is_truncated)
5547 {
5548 RGWRados *store = target->get_store();
5549 CephContext *cct = store->ctx();
5550 int shard_id = target->get_shard_id();
5551
5552 int count = 0;
5553 bool truncated = true;
5554 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5555
5556 result->clear();
5557
5558 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5559
5560 rgw_obj_key end_marker_obj;
5561 rgw_obj_index_key cur_end_marker;
5562 if (!params.ns.empty()) {
5563 end_marker_obj = rgw_obj_key(params.end_marker.name, params.end_marker.instance, params.ns);
5564 end_marker_obj.ns = params.ns;
5565 end_marker_obj.get_index_key(&cur_end_marker);
5566 }
5567 rgw_obj_index_key cur_marker;
5568 marker_obj.get_index_key(&cur_marker);
5569
5570 const bool cur_end_marker_valid = !params.end_marker.empty();
5571
5572 rgw_obj_key prefix_obj(params.prefix);
5573 prefix_obj.ns = params.ns;
5574 string cur_prefix = prefix_obj.get_index_key_name();
5575
5576 string bigger_than_delim;
5577
5578 if (!params.delim.empty()) {
5579 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
5580 char buf[params.delim.size() + 16];
5581 int r = encode_utf8(val + 1, (unsigned char *)buf);
5582 if (r < 0) {
5583 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5584 return -EINVAL;
5585 }
5586 buf[r] = '\0';
5587
5588 bigger_than_delim = buf;
5589
5590 /* if marker points at a common prefix, fast forward it into its upperbound string */
5591 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5592 if (delim_pos >= 0) {
5593 string s = cur_marker.name.substr(0, delim_pos);
5594 s.append(bigger_than_delim);
5595 cur_marker = s;
5596 }
5597 }
5598
5599 string skip_after_delim;
5600 while (truncated && count <= max) {
5601 if (skip_after_delim > cur_marker.name) {
5602 cur_marker = skip_after_delim;
5603 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5604 }
5605 std::map<string, rgw_bucket_dir_entry> ent_map;
5606 int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
5607 read_ahead + 1 - count, params.list_versions, ent_map,
5608 &truncated, &cur_marker);
5609 if (r < 0)
5610 return r;
5611
5612 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
5613 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5614 rgw_bucket_dir_entry& entry = eiter->second;
5615 rgw_obj_index_key index_key = entry.key;
5616
5617 rgw_obj_key obj(index_key);
5618
5619 /* note that parse_raw_oid() here will not set the correct object's instance, as
5620 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5621 * not needed for the checks here and we end up using the raw entry for the return vector
5622 */
5623 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5624 if (!valid) {
5625 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5626 continue;
5627 }
5628 bool check_ns = (obj.ns == params.ns);
5629 if (!params.list_versions && !entry.is_visible()) {
5630 continue;
5631 }
5632
5633 if (params.enforce_ns && !check_ns) {
5634 if (!params.ns.empty()) {
5635 /* we've iterated past the namespace we're searching -- done now */
5636 truncated = false;
5637 goto done;
5638 }
5639
5640 /* we're not looking at the namespace this object is in, next! */
5641 continue;
5642 }
5643
5644 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5645 truncated = false;
5646 goto done;
5647 }
5648
5649 if (count < max) {
5650 params.marker = index_key;
5651 next_marker = index_key;
5652 }
5653
5654 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5655 continue;
5656
5657 if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5658 continue;
5659
5660 if (!params.delim.empty()) {
5661 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5662
5663 if (delim_pos >= 0) {
5664 string prefix_key = obj.name.substr(0, delim_pos + 1);
5665
5666 if (common_prefixes &&
5667 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5668 if (count >= max) {
5669 truncated = true;
5670 goto done;
5671 }
5672 next_marker = prefix_key;
5673 (*common_prefixes)[prefix_key] = true;
5674
5675 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5676
5677 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
5678 skip_after_delim.append(bigger_than_delim);
5679
5680 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5681
5682 count++;
5683 }
5684
5685 continue;
5686 }
5687 }
5688
5689 if (count >= max) {
5690 truncated = true;
5691 goto done;
5692 }
5693
5694 result->emplace_back(std::move(entry));
5695 count++;
5696 }
5697
5698 // Either the back-end telling us truncated, or we don't consume all
5699 // items returned per the amount caller request
5700 truncated = (truncated || eiter != ent_map.end());
5701 }
5702
5703 done:
5704 if (is_truncated)
5705 *is_truncated = truncated;
5706
5707 return 0;
5708 }
5709
5710 /**
5711 * create a rados pool, associated meta info
5712 * returns 0 on success, -ERR# otherwise.
5713 */
5714 int RGWRados::create_pool(const rgw_pool& pool)
5715 {
5716 int ret = 0;
5717
5718 librados::Rados *rad = get_rados_handle();
5719 ret = rad->pool_create(pool.name.c_str(), 0);
5720 if (ret == -EEXIST)
5721 ret = 0;
5722 else if (ret == -ERANGE) {
5723 ldout(cct, 0)
5724 << __func__
5725 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
5726 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5727 << dendl;
5728 }
5729 if (ret < 0)
5730 return ret;
5731
5732 librados::IoCtx io_ctx;
5733 ret = rad->ioctx_create(pool.name.c_str(), io_ctx);
5734 if (ret < 0)
5735 return ret;
5736
5737 ret = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
5738 if (ret < 0 && ret != -EOPNOTSUPP)
5739 return ret;
5740 return 0;
5741 }
5742
5743 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5744 {
5745 librados::IoCtx index_ctx; // context for new bucket
5746
5747 string dir_oid = dir_oid_prefix;
5748 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5749 if (r < 0) {
5750 return r;
5751 }
5752
5753 dir_oid.append(bucket_info.bucket.bucket_id);
5754
5755 map<int, string> bucket_objs;
5756 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5757
5758 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5759 }
5760
5761 void RGWRados::create_bucket_id(string *bucket_id)
5762 {
5763 uint64_t iid = instance_id();
5764 uint64_t bid = next_bucket_id();
5765 char buf[get_zone_params().get_id().size() + 48];
5766 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5767 *bucket_id = buf;
5768 }
5769
5770 int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5771 const string& zonegroup_id,
5772 const string& placement_rule,
5773 const string& swift_ver_location,
5774 const RGWQuotaInfo * pquota_info,
5775 map<std::string, bufferlist>& attrs,
5776 RGWBucketInfo& info,
5777 obj_version *pobjv,
5778 obj_version *pep_objv,
5779 real_time creation_time,
5780 rgw_bucket *pmaster_bucket,
5781 uint32_t *pmaster_num_shards,
5782 bool exclusive)
5783 {
5784 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5785 string selected_placement_rule_name;
5786 RGWZonePlacementInfo rule_info;
5787
5788 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5789 int ret = 0;
5790 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5791 &selected_placement_rule_name, &rule_info);
5792 if (ret < 0)
5793 return ret;
5794
5795 if (!pmaster_bucket) {
5796 create_bucket_id(&bucket.marker);
5797 bucket.bucket_id = bucket.marker;
5798 } else {
5799 bucket.marker = pmaster_bucket->marker;
5800 bucket.bucket_id = pmaster_bucket->bucket_id;
5801 }
5802
5803 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5804
5805 if (pobjv) {
5806 objv_tracker.write_version = *pobjv;
5807 } else {
5808 objv_tracker.generate_new_write_ver(cct);
5809 }
5810
5811 info.bucket = bucket;
5812 info.owner = owner.user_id;
5813 info.zonegroup = zonegroup_id;
5814 info.placement_rule = selected_placement_rule_name;
5815 info.index_type = rule_info.index_type;
5816 info.swift_ver_location = swift_ver_location;
5817 info.swift_versioning = (!swift_ver_location.empty());
5818 if (pmaster_num_shards) {
5819 info.num_shards = *pmaster_num_shards;
5820 } else {
5821 info.num_shards = bucket_index_max_shards;
5822 }
5823 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5824 info.requester_pays = false;
5825 if (real_clock::is_zero(creation_time)) {
5826 info.creation_time = ceph::real_clock::now();
5827 } else {
5828 info.creation_time = creation_time;
5829 }
5830 if (pquota_info) {
5831 info.quota = *pquota_info;
5832 }
5833
5834 int r = init_bucket_index(info, info.num_shards);
5835 if (r < 0) {
5836 return r;
5837 }
5838
5839 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
5840 if (ret == -EEXIST) {
5841 librados::IoCtx index_ctx;
5842 map<int, string> bucket_objs;
5843 int r = open_bucket_index(info, index_ctx, bucket_objs);
5844 if (r < 0)
5845 return r;
5846
5847 /* we need to reread the info and return it, caller will have a use for it */
5848 RGWObjVersionTracker instance_ver = info.objv_tracker;
5849 info.objv_tracker.clear();
5850 RGWObjectCtx obj_ctx(this);
5851 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
5852 if (r < 0) {
5853 if (r == -ENOENT) {
5854 continue;
5855 }
5856 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
5857 return r;
5858 }
5859
5860 /* only remove it if it's a different bucket instance */
5861 if (info.bucket.bucket_id != bucket.bucket_id) {
5862 /* remove bucket meta instance */
5863 string entry = bucket.get_key();
5864 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
5865 if (r < 0)
5866 return r;
5867
5868 map<int, string>::const_iterator biter;
5869 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
5870 // Do best effort removal
5871 index_ctx.remove(biter->second);
5872 }
5873 }
5874 /* ret == -ENOENT here */
5875 }
5876 return ret;
5877 }
5878
5879 /* this is highly unlikely */
5880 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
5881 return -ENOENT;
5882 }
5883
5884 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5885 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5886
5887 {
5888 /* first check that zonegroup exists within current period. */
5889 RGWZoneGroup zonegroup;
5890 int ret = get_zonegroup(zonegroup_id, zonegroup);
5891 if (ret < 0) {
5892 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5893 return ret;
5894 }
5895
5896 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5897 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
5898
5899 if (!request_rule.empty()) {
5900 titer = zonegroup.placement_targets.find(request_rule);
5901 if (titer == zonegroup.placement_targets.end()) {
5902 ldout(cct, 0) << "could not find requested placement id " << request_rule
5903 << " within zonegroup " << dendl;
5904 return -ERR_INVALID_LOCATION_CONSTRAINT;
5905 }
5906 } else if (!user_info.default_placement.empty()) {
5907 titer = zonegroup.placement_targets.find(user_info.default_placement);
5908 if (titer == zonegroup.placement_targets.end()) {
5909 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
5910 << " within zonegroup " << dendl;
5911 return -ERR_INVALID_LOCATION_CONSTRAINT;
5912 }
5913 } else {
5914 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
5915 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
5916 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
5917 } else {
5918 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
5919 if (titer == zonegroup.placement_targets.end()) {
5920 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
5921 << " within zonegroup " << dendl;
5922 return -ERR_INVALID_LOCATION_CONSTRAINT;
5923 }
5924 }
5925 }
5926
5927 /* now check tag for the rule, whether user is permitted to use rule */
5928 const auto& target_rule = titer->second;
5929 if (!target_rule.user_permitted(user_info.placement_tags)) {
5930 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
5931 return -EPERM;
5932 }
5933
5934 if (pselected_rule_name)
5935 *pselected_rule_name = titer->first;
5936
5937 return select_bucket_location_by_rule(titer->first, rule_info);
5938 }
5939
5940 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5941 {
5942 if (location_rule.empty()) {
5943 /* we can only reach here if we're trying to set a bucket location from a bucket
5944 * created on a different zone, using a legacy / default pool configuration
5945 */
5946 return select_legacy_bucket_placement(rule_info);
5947 }
5948
5949 /*
5950 * make sure that zone has this rule configured. We're
5951 * checking it for the local zone, because that's where this bucket object is going to
5952 * reside.
5953 */
5954 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
5955 if (piter == get_zone_params().placement_pools.end()) {
5956 /* couldn't find, means we cannot really place data for this bucket in this zone */
5957 if (get_zonegroup().equals(zonegroup.get_id())) {
5958 /* that's a configuration error, zone should have that rule, as we're within the requested
5959 * zonegroup */
5960 return -EINVAL;
5961 } else {
5962 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5963 return 0;
5964 }
5965 }
5966
5967 RGWZonePlacementInfo& placement_info = piter->second;
5968
5969 if (rule_info) {
5970 *rule_info = placement_info;
5971 }
5972
5973 return 0;
5974 }
5975
5976 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
5977 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5978 {
5979 if (!get_zone_params().placement_pools.empty()) {
5980 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
5981 pselected_rule_name, rule_info);
5982 }
5983
5984 if (pselected_rule_name) {
5985 pselected_rule_name->clear();
5986 }
5987
5988 return select_legacy_bucket_placement(rule_info);
5989 }
5990
5991 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
5992 {
5993 bufferlist map_bl;
5994 map<string, bufferlist> m;
5995 string pool_name;
5996 bool write_map = false;
5997
5998 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5999
6000 RGWObjectCtx obj_ctx(this);
6001 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6002 if (ret < 0) {
6003 goto read_omap;
6004 }
6005
6006 try {
6007 bufferlist::iterator iter = map_bl.begin();
6008 ::decode(m, iter);
6009 } catch (buffer::error& err) {
6010 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6011 }
6012
6013 read_omap:
6014 if (m.empty()) {
6015 bufferlist header;
6016 ret = omap_get_all(obj, header, m);
6017
6018 write_map = true;
6019 }
6020
6021 if (ret < 0 || m.empty()) {
6022 vector<rgw_pool> pools;
6023 string s = string("default.") + default_storage_pool_suffix;
6024 pools.push_back(rgw_pool(s));
6025 vector<int> retcodes;
6026 bufferlist bl;
6027 ret = create_pools(pools, retcodes);
6028 if (ret < 0)
6029 return ret;
6030 ret = omap_set(obj, s, bl);
6031 if (ret < 0)
6032 return ret;
6033 m[s] = bl;
6034 }
6035
6036 if (write_map) {
6037 bufferlist new_bl;
6038 ::encode(m, new_bl);
6039 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6040 if (ret < 0) {
6041 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6042 }
6043 }
6044
6045 map<string, bufferlist>::iterator miter;
6046 if (m.size() > 1) {
6047 vector<string> v;
6048 for (miter = m.begin(); miter != m.end(); ++miter) {
6049 v.push_back(miter->first);
6050 }
6051
6052 uint32_t r;
6053 ret = get_random_bytes((char *)&r, sizeof(r));
6054 if (ret < 0)
6055 return ret;
6056
6057 int i = r % v.size();
6058 pool_name = v[i];
6059 } else {
6060 miter = m.begin();
6061 pool_name = miter->first;
6062 }
6063
6064 rule_info->data_pool = pool_name;
6065 rule_info->data_extra_pool = pool_name;
6066 rule_info->index_pool = pool_name;
6067 rule_info->index_type = RGWBIType_Normal;
6068
6069 return 0;
6070 }
6071
6072 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6073 {
6074 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6075 }
6076
6077 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6078 {
6079 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6080
6081 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6082 }
6083
6084 int RGWRados::update_placement_map()
6085 {
6086 bufferlist header;
6087 map<string, bufferlist> m;
6088 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6089 int ret = omap_get_all(obj, header, m);
6090 if (ret < 0)
6091 return ret;
6092
6093 bufferlist new_bl;
6094 ::encode(m, new_bl);
6095 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6096 if (ret < 0) {
6097 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6098 }
6099
6100 return ret;
6101 }
6102
6103 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6104 {
6105 librados::Rados *rad = get_rados_handle();
6106 int ret = rad->pool_lookup(new_pool.name.c_str());
6107 if (ret < 0) // DNE, or something
6108 return ret;
6109
6110 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6111 bufferlist empty_bl;
6112 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6113
6114 // don't care about return value
6115 update_placement_map();
6116
6117 return ret;
6118 }
6119
6120 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6121 {
6122 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6123 int ret = omap_del(obj, old_pool.to_str());
6124
6125 // don't care about return value
6126 update_placement_map();
6127
6128 return ret;
6129 }
6130
6131 int RGWRados::list_placement_set(set<rgw_pool>& names)
6132 {
6133 bufferlist header;
6134 map<string, bufferlist> m;
6135
6136 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6137 int ret = omap_get_all(obj, header, m);
6138 if (ret < 0)
6139 return ret;
6140
6141 names.clear();
6142 map<string, bufferlist>::iterator miter;
6143 for (miter = m.begin(); miter != m.end(); ++miter) {
6144 names.insert(rgw_pool(miter->first));
6145 }
6146
6147 return names.size();
6148 }
6149
6150 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6151 {
6152 vector<librados::PoolAsyncCompletion *> completions;
6153 vector<int> rets;
6154
6155 librados::Rados *rad = get_rados_handle();
6156 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6157 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6158 completions.push_back(c);
6159 rgw_pool& pool = *iter;
6160 int ret = rad->pool_create_async(pool.name.c_str(), c);
6161 rets.push_back(ret);
6162 }
6163
6164 vector<int>::iterator riter;
6165 vector<librados::PoolAsyncCompletion *>::iterator citer;
6166
6167 bool error = false;
6168 assert(rets.size() == completions.size());
6169 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6170 int r = *riter;
6171 PoolAsyncCompletion *c = *citer;
6172 if (r == 0) {
6173 c->wait();
6174 r = c->get_return_value();
6175 if (r < 0) {
6176 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6177 error = true;
6178 }
6179 }
6180 c->release();
6181 retcodes.push_back(r);
6182 }
6183 if (error) {
6184 return 0;
6185 }
6186
6187 std::vector<librados::IoCtx> io_ctxs;
6188 retcodes.clear();
6189 for (auto pool : pools) {
6190 io_ctxs.emplace_back();
6191 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6192 if (ret < 0) {
6193 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6194 error = true;
6195 }
6196 retcodes.push_back(ret);
6197 }
6198 if (error) {
6199 return 0;
6200 }
6201
6202 completions.clear();
6203 for (auto &io_ctx : io_ctxs) {
6204 librados::PoolAsyncCompletion *c =
6205 librados::Rados::pool_async_create_completion();
6206 completions.push_back(c);
6207 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6208 false, c);
6209 assert(ret == 0);
6210 }
6211
6212 retcodes.clear();
6213 for (auto c : completions) {
6214 c->wait();
6215 int ret = c->get_return_value();
6216 if (ret == -EOPNOTSUPP) {
6217 ret = 0;
6218 } else if (ret < 0) {
6219 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6220 << dendl;
6221 error = true;
6222 }
6223 c->release();
6224 retcodes.push_back(ret);
6225 }
6226 return 0;
6227 }
6228
6229 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6230 {
6231 string oid, key;
6232 get_obj_bucket_and_oid_loc(obj, oid, key);
6233
6234 rgw_pool pool;
6235 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6236 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6237 return -EIO;
6238 }
6239
6240 int r = open_pool_ctx(pool, *ioctx);
6241 if (r < 0) {
6242 return r;
6243 }
6244
6245 ioctx->locator_set_key(key);
6246
6247 return 0;
6248 }
6249
6250 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6251 {
6252 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6253
6254 rgw_pool pool;
6255 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6256 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6257 return -EIO;
6258 }
6259
6260 int r = open_pool_ctx(pool, ref->ioctx);
6261 if (r < 0) {
6262 return r;
6263 }
6264
6265 ref->ioctx.locator_set_key(ref->key);
6266
6267 return 0;
6268 }
6269
6270 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6271 {
6272 ref->oid = obj.oid;
6273 ref->key = obj.loc;
6274
6275 int r;
6276
6277 if (ref->oid.empty()) {
6278 ref->oid = obj.pool.to_str();
6279 ref->pool = get_zone_params().domain_root;
6280 } else {
6281 ref->pool = obj.pool;
6282 }
6283 r = open_pool_ctx(ref->pool, ref->ioctx);
6284 if (r < 0)
6285 return r;
6286
6287 ref->ioctx.locator_set_key(ref->key);
6288
6289 return 0;
6290 }
6291
6292 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6293 {
6294 return get_raw_obj_ref(obj, ref);
6295 }
6296
6297 /*
6298 * fixes an issue where head objects were supposed to have a locator created, but ended
6299 * up without one
6300 */
6301 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6302 {
6303 const rgw_bucket& bucket = bucket_info.bucket;
6304 string oid;
6305 string locator;
6306
6307 rgw_obj obj(bucket, key);
6308
6309 get_obj_bucket_and_oid_loc(obj, oid, locator);
6310
6311 if (locator.empty()) {
6312 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6313 return 0;
6314 }
6315
6316 librados::IoCtx ioctx;
6317
6318 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6319 if (ret < 0) {
6320 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6321 return ret;
6322 }
6323 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6324
6325 uint64_t size;
6326 bufferlist data;
6327
6328 struct timespec mtime_ts;
6329 map<string, bufferlist> attrs;
6330 librados::ObjectReadOperation op;
6331 op.getxattrs(&attrs, NULL);
6332 op.stat2(&size, &mtime_ts, NULL);
6333 #define HEAD_SIZE 512 * 1024
6334 op.read(0, HEAD_SIZE, &data, NULL);
6335
6336 ret = ioctx.operate(oid, &op, NULL);
6337 if (ret < 0) {
6338 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6339 return ret;
6340 }
6341
6342 if (size > HEAD_SIZE) {
6343 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6344 return -EIO;
6345 }
6346
6347 if (size != data.length()) {
6348 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6349 return -EIO;
6350 }
6351
6352 if (copy_obj) {
6353 librados::ObjectWriteOperation wop;
6354
6355 wop.mtime2(&mtime_ts);
6356
6357 map<string, bufferlist>::iterator iter;
6358 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6359 wop.setxattr(iter->first.c_str(), iter->second);
6360 }
6361
6362 wop.write(0, data);
6363
6364 ioctx.locator_set_key(locator);
6365 ioctx.operate(oid, &wop);
6366 }
6367
6368 if (remove_bad) {
6369 ioctx.locator_set_key(string());
6370
6371 ret = ioctx.remove(oid);
6372 if (ret < 0) {
6373 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6374 return ret;
6375 }
6376 }
6377
6378 return 0;
6379 }
6380
6381 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6382 const string& src_oid, const string& src_locator,
6383 librados::IoCtx& dst_ioctx,
6384 const string& dst_oid, const string& dst_locator)
6385 {
6386
6387 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6388 bool done = false;
6389 uint64_t chunk_size = COPY_BUF_SIZE;
6390 uint64_t ofs = 0;
6391 int ret = 0;
6392 real_time mtime;
6393 struct timespec mtime_ts;
6394 uint64_t size;
6395
6396 if (src_oid == dst_oid && src_locator == dst_locator) {
6397 return 0;
6398 }
6399
6400 src_ioctx.locator_set_key(src_locator);
6401 dst_ioctx.locator_set_key(dst_locator);
6402
6403 do {
6404 bufferlist data;
6405 ObjectReadOperation rop;
6406 ObjectWriteOperation wop;
6407
6408 if (ofs == 0) {
6409 rop.stat2(&size, &mtime_ts, NULL);
6410 mtime = real_clock::from_timespec(mtime_ts);
6411 }
6412 rop.read(ofs, chunk_size, &data, NULL);
6413 ret = src_ioctx.operate(src_oid, &rop, NULL);
6414 if (ret < 0) {
6415 goto done_err;
6416 }
6417
6418 if (data.length() == 0) {
6419 break;
6420 }
6421
6422 if (ofs == 0) {
6423 wop.create(true); /* make it exclusive */
6424 wop.mtime2(&mtime_ts);
6425 mtime = real_clock::from_timespec(mtime_ts);
6426 }
6427 wop.write(ofs, data);
6428 ret = dst_ioctx.operate(dst_oid, &wop);
6429 ofs += data.length();
6430 done = data.length() != chunk_size;
6431 } while (!done);
6432
6433 if (ofs != size) {
6434 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6435 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6436 ret = -EIO;
6437 goto done_err;
6438 }
6439
6440 src_ioctx.remove(src_oid);
6441
6442 return 0;
6443
6444 done_err:
6445 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6446 return ret;
6447 }
6448
6449 /*
6450 * fixes an issue where head objects were supposed to have a locator created, but ended
6451 * up without one
6452 */
6453 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6454 {
6455 const rgw_bucket& bucket = bucket_info.bucket;
6456 rgw_obj obj(bucket, key);
6457
6458 if (need_fix) {
6459 *need_fix = false;
6460 }
6461
6462 rgw_rados_ref ref;
6463 int r = get_obj_head_ref(bucket_info, obj, &ref);
6464 if (r < 0) {
6465 return r;
6466 }
6467
6468 RGWObjState *astate = NULL;
6469 RGWObjectCtx rctx(this);
6470 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6471 if (r < 0)
6472 return r;
6473
6474 if (astate->has_manifest) {
6475 RGWObjManifest::obj_iterator miter;
6476 RGWObjManifest& manifest = astate->manifest;
6477 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6478 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6479 rgw_obj loc;
6480 string oid;
6481 string locator;
6482
6483 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6484
6485 if (loc.key.ns.empty()) {
6486 /* continue, we're only interested in tail objects */
6487 continue;
6488 }
6489
6490 get_obj_bucket_and_oid_loc(loc, oid, locator);
6491 ref.ioctx.locator_set_key(locator);
6492
6493 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6494
6495 r = ref.ioctx.stat(oid, NULL, NULL);
6496 if (r != -ENOENT) {
6497 continue;
6498 }
6499
6500 string bad_loc;
6501 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6502
6503 /* create a new ioctx with the bad locator */
6504 librados::IoCtx src_ioctx;
6505 src_ioctx.dup(ref.ioctx);
6506 src_ioctx.locator_set_key(bad_loc);
6507
6508 r = src_ioctx.stat(oid, NULL, NULL);
6509 if (r != 0) {
6510 /* cannot find a broken part */
6511 continue;
6512 }
6513 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6514 if (need_fix) {
6515 *need_fix = true;
6516 }
6517 if (fix) {
6518 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6519 if (r < 0) {
6520 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6521 }
6522 }
6523 }
6524 }
6525
6526 return 0;
6527 }
6528
6529 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6530 {
6531 bucket = _bucket;
6532
6533 RGWObjectCtx obj_ctx(store);
6534
6535 RGWBucketInfo bucket_info;
6536 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6537 if (ret < 0) {
6538 return ret;
6539 }
6540
6541 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6542 if (ret < 0) {
6543 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6544 return ret;
6545 }
6546 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6547
6548 return 0;
6549 }
6550
6551 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6552 {
6553 bucket = _bucket;
6554 shard_id = sid;
6555
6556 RGWObjectCtx obj_ctx(store);
6557
6558 RGWBucketInfo bucket_info;
6559 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6560 if (ret < 0) {
6561 return ret;
6562 }
6563
6564 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6565 if (ret < 0) {
6566 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6567 return ret;
6568 }
6569 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6570
6571 return 0;
6572 }
6573
6574
6575 /* Execute @handler on last item in bucket listing for bucket specified
6576 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6577 * to objects matching these criterias. */
6578 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6579 const std::string& obj_prefix,
6580 const std::string& obj_delim,
6581 std::function<int(const rgw_bucket_dir_entry&)> handler)
6582 {
6583 RGWRados::Bucket target(this, bucket_info);
6584 RGWRados::Bucket::List list_op(&target);
6585
6586 list_op.params.prefix = obj_prefix;
6587 list_op.params.delim = obj_delim;
6588
6589 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6590 << ", obj_prefix=" << obj_prefix
6591 << ", obj_delim=" << obj_delim
6592 << dendl;
6593
6594 bool is_truncated = false;
6595
6596 boost::optional<rgw_bucket_dir_entry> last_entry;
6597 /* We need to rewind to the last object in a listing. */
6598 do {
6599 /* List bucket entries in chunks. */
6600 static constexpr int MAX_LIST_OBJS = 100;
6601 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6602
6603 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6604 &is_truncated);
6605 if (ret < 0) {
6606 return ret;
6607 } else if (!entries.empty()) {
6608 last_entry = entries.back();
6609 }
6610 } while (is_truncated);
6611
6612 if (last_entry) {
6613 return handler(*last_entry);
6614 }
6615
6616 /* Empty listing - no items we can run handler on. */
6617 return 0;
6618 }
6619
6620
6621 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6622 const rgw_user& user,
6623 RGWBucketInfo& bucket_info,
6624 rgw_obj& obj)
6625 {
6626 if (! swift_versioning_enabled(bucket_info)) {
6627 return 0;
6628 }
6629
6630 obj_ctx.obj.set_atomic(obj);
6631
6632 RGWObjState * state = nullptr;
6633 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6634 if (r < 0) {
6635 return r;
6636 }
6637
6638 if (!state->exists) {
6639 return 0;
6640 }
6641
6642 string client_id;
6643 string op_id;
6644
6645 const string& src_name = obj.get_oid();
6646 char buf[src_name.size() + 32];
6647 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6648 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6649 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6650
6651 RGWBucketInfo dest_bucket_info;
6652
6653 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6654 if (r < 0) {
6655 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6656 if (r == -ENOENT) {
6657 return -ERR_PRECONDITION_FAILED;
6658 }
6659 return r;
6660 }
6661
6662 if (dest_bucket_info.owner != bucket_info.owner) {
6663 return -ERR_PRECONDITION_FAILED;
6664 }
6665
6666 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6667 obj_ctx.obj.set_atomic(dest_obj);
6668
6669 string no_zone;
6670
6671 r = copy_obj(obj_ctx,
6672 user,
6673 client_id,
6674 op_id,
6675 NULL, /* req_info *info */
6676 no_zone,
6677 dest_obj,
6678 obj,
6679 dest_bucket_info,
6680 bucket_info,
6681 NULL, /* time_t *src_mtime */
6682 NULL, /* time_t *mtime */
6683 NULL, /* const time_t *mod_ptr */
6684 NULL, /* const time_t *unmod_ptr */
6685 false, /* bool high_precision_time */
6686 NULL, /* const char *if_match */
6687 NULL, /* const char *if_nomatch */
6688 RGWRados::ATTRSMOD_NONE,
6689 true, /* bool copy_if_newer */
6690 state->attrset,
6691 RGW_OBJ_CATEGORY_MAIN,
6692 0, /* uint64_t olh_epoch */
6693 real_time(), /* time_t delete_at */
6694 NULL, /* string *version_id */
6695 NULL, /* string *ptag */
6696 NULL, /* string *petag */
6697 NULL, /* void (*progress_cb)(off_t, void *) */
6698 NULL); /* void *progress_data */
6699 if (r == -ECANCELED || r == -ENOENT) {
6700 /* Has already been overwritten, meaning another rgw process already
6701 * copied it out */
6702 return 0;
6703 }
6704
6705 return r;
6706 }
6707
6708 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6709 const rgw_user& user,
6710 RGWBucketInfo& bucket_info,
6711 rgw_obj& obj,
6712 bool& restored) /* out */
6713 {
6714 if (! swift_versioning_enabled(bucket_info)) {
6715 return 0;
6716 }
6717
6718 /* Bucket info of the bucket that stores previous versions of our object. */
6719 RGWBucketInfo archive_binfo;
6720
6721 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6722 bucket_info.swift_ver_location, archive_binfo,
6723 nullptr, nullptr);
6724 if (ret < 0) {
6725 return ret;
6726 }
6727
6728 /* Abort the operation if the bucket storing our archive belongs to someone
6729 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6730 * into consideration. For we can live with that.
6731 *
6732 * TODO: delegate this check to un upper layer and compare with ACLs. */
6733 if (bucket_info.owner != archive_binfo.owner) {
6734 return -EPERM;
6735 }
6736
6737 /* This code will be executed on latest version of the object. */
6738 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6739 std::string no_client_id;
6740 std::string no_op_id;
6741 std::string no_zone;
6742
6743 /* We don't support object versioning of Swift API on those buckets that
6744 * are already versioned using the S3 mechanism. This affects also bucket
6745 * storing archived objects. Otherwise the delete operation would create
6746 * a deletion marker. */
6747 if (archive_binfo.versioned()) {
6748 restored = false;
6749 return -ERR_PRECONDITION_FAILED;
6750 }
6751
6752 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6753 * irrelevant and may be safely skipped. */
6754 std::map<std::string, ceph::bufferlist> no_attrs;
6755
6756 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6757 obj_ctx.obj.set_atomic(archive_obj);
6758 obj_ctx.obj.set_atomic(obj);
6759
6760 int ret = copy_obj(obj_ctx,
6761 user,
6762 no_client_id,
6763 no_op_id,
6764 nullptr, /* req_info *info */
6765 no_zone,
6766 obj, /* dest obj */
6767 archive_obj, /* src obj */
6768 bucket_info, /* dest bucket info */
6769 archive_binfo, /* src bucket info */
6770 nullptr, /* time_t *src_mtime */
6771 nullptr, /* time_t *mtime */
6772 nullptr, /* const time_t *mod_ptr */
6773 nullptr, /* const time_t *unmod_ptr */
6774 false, /* bool high_precision_time */
6775 nullptr, /* const char *if_match */
6776 nullptr, /* const char *if_nomatch */
6777 RGWRados::ATTRSMOD_NONE,
6778 true, /* bool copy_if_newer */
6779 no_attrs,
6780 RGW_OBJ_CATEGORY_MAIN,
6781 0, /* uint64_t olh_epoch */
6782 real_time(), /* time_t delete_at */
6783 nullptr, /* string *version_id */
6784 nullptr, /* string *ptag */
6785 nullptr, /* string *petag */
6786 nullptr, /* void (*progress_cb)(off_t, void *) */
6787 nullptr); /* void *progress_data */
6788 if (ret == -ECANCELED || ret == -ENOENT) {
6789 /* Has already been overwritten, meaning another rgw process already
6790 * copied it out */
6791 return 0;
6792 } else if (ret < 0) {
6793 return ret;
6794 } else {
6795 restored = true;
6796 }
6797
6798 /* Need to remove the archived copy. */
6799 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6800 archive_binfo.versioning_status());
6801
6802 return ret;
6803 };
6804
6805 const std::string& obj_name = obj.get_oid();
6806 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6807 % obj_name);
6808
6809 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6810 handler);
6811 }
6812
6813 /**
6814 * Write/overwrite an object to the bucket storage.
6815 * bucket: the bucket to store the object in
6816 * obj: the object name/key
6817 * data: the object contents/value
6818 * size: the amount of data to write (data must be this long)
6819 * accounted_size: original size of data before compression, encryption
6820 * mtime: if non-NULL, writes the given mtime to the bucket storage
6821 * attrs: all the given attrs are written to bucket storage for the given object
6822 * exclusive: create object exclusively
6823 * Returns: 0 on success, -ERR# otherwise.
6824 */
6825 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6826 map<string, bufferlist>& attrs, bool assume_noent,
6827 void *_index_op)
6828 {
6829 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6830 RGWRados *store = target->get_store();
6831
6832 ObjectWriteOperation op;
6833
6834 RGWObjState *state;
6835 int r = target->get_state(&state, false, assume_noent);
6836 if (r < 0)
6837 return r;
6838
6839 rgw_obj& obj = target->get_obj();
6840
6841 if (obj.get_oid().empty()) {
6842 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6843 return -EIO;
6844 }
6845
6846 rgw_rados_ref ref;
6847 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6848 if (r < 0)
6849 return r;
6850
6851 bool is_olh = state->is_olh;
6852
6853 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6854
6855 const string *ptag = meta.ptag;
6856 if (!ptag && !index_op->get_optag()->empty()) {
6857 ptag = index_op->get_optag();
6858 }
6859 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false);
6860 if (r < 0)
6861 return r;
6862
6863 if (real_clock::is_zero(meta.set_mtime)) {
6864 meta.set_mtime = real_clock::now();
6865 }
6866
6867 if (state->is_olh) {
6868 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6869 }
6870
6871 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6872 op.mtime2(&mtime_ts);
6873
6874 if (meta.data) {
6875 /* if we want to overwrite the data, we also want to overwrite the
6876 xattrs, so just remove the object */
6877 op.write_full(*meta.data);
6878 }
6879
6880 string etag;
6881 string content_type;
6882 bufferlist acl_bl;
6883
6884 map<string, bufferlist>::iterator iter;
6885 if (meta.rmattrs) {
6886 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6887 const string& name = iter->first;
6888 op.rmxattr(name.c_str());
6889 }
6890 }
6891
6892 if (meta.manifest) {
6893 /* remove existing manifest attr */
6894 iter = attrs.find(RGW_ATTR_MANIFEST);
6895 if (iter != attrs.end())
6896 attrs.erase(iter);
6897
6898 bufferlist bl;
6899 ::encode(*meta.manifest, bl);
6900 op.setxattr(RGW_ATTR_MANIFEST, bl);
6901 }
6902
6903 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6904 const string& name = iter->first;
6905 bufferlist& bl = iter->second;
6906
6907 if (!bl.length())
6908 continue;
6909
6910 op.setxattr(name.c_str(), bl);
6911
6912 if (name.compare(RGW_ATTR_ETAG) == 0) {
6913 etag = bl.c_str();
6914 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6915 content_type = bl.c_str();
6916 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6917 acl_bl = bl;
6918 }
6919 }
6920 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6921 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6922 }
6923
6924 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6925 bufferlist bl;
6926 ::encode(store->get_zone_short_id(), bl);
6927 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6928 }
6929
6930 if (!op.size())
6931 return 0;
6932
6933 uint64_t epoch;
6934 int64_t poolid;
6935 bool orig_exists;
6936 uint64_t orig_size;
6937
6938 if (!reset_obj) { //Multipart upload, it has immutable head.
6939 orig_exists = false;
6940 orig_size = 0;
6941 } else {
6942 orig_exists = state->exists;
6943 orig_size = state->accounted_size;
6944 }
6945
6946 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
6947
6948 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
6949
6950 if (versioned_op) {
6951 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
6952 }
6953
6954 if (!index_op->is_prepared()) {
6955 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
6956 if (r < 0)
6957 return r;
6958 }
6959
6960 r = ref.ioctx.operate(ref.oid, &op);
6961 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
6962 or -ENOENT if was removed, or -EEXIST if it did not exist
6963 before and now it does */
6964 if (r == -EEXIST && assume_noent) {
6965 target->invalidate_state();
6966 return r;
6967 }
6968 goto done_cancel;
6969 }
6970
6971 epoch = ref.ioctx.get_last_version();
6972 poolid = ref.ioctx.get_id();
6973
6974 r = target->complete_atomic_modification();
6975 if (r < 0) {
6976 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
6977 }
6978
6979 r = index_op->complete(poolid, epoch, size, accounted_size,
6980 meta.set_mtime, etag, content_type, &acl_bl,
6981 meta.category, meta.remove_objs, meta.user_data);
6982 if (r < 0)
6983 goto done_cancel;
6984
6985 if (meta.mtime) {
6986 *meta.mtime = meta.set_mtime;
6987 }
6988
6989 /* note that index_op was using state so we couldn't invalidate it earlier */
6990 target->invalidate_state();
6991 state = NULL;
6992
6993 if (versioned_op) {
6994 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
6995 if (r < 0) {
6996 return r;
6997 }
6998 }
6999
7000 if (!real_clock::is_zero(meta.delete_at)) {
7001 rgw_obj_index_key obj_key;
7002 obj.key.get_index_key(&obj_key);
7003
7004 r = store->objexp_hint_add(meta.delete_at,
7005 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7006 if (r < 0) {
7007 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7008 /* ignoring error, nothing we can do at this point */
7009 }
7010 }
7011 meta.canceled = false;
7012
7013 /* update quota cache */
7014 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7015 accounted_size, orig_size);
7016 return 0;
7017
7018 done_cancel:
7019 int ret = index_op->cancel();
7020 if (ret < 0) {
7021 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7022 }
7023
7024 meta.canceled = true;
7025
7026 /* we lost in a race. There are a few options:
7027 * - existing object was rewritten (ECANCELED)
7028 * - non existing object was created (EEXIST)
7029 * - object was removed (ENOENT)
7030 * should treat it as a success
7031 */
7032 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7033 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7034 r = 0;
7035 }
7036 } else {
7037 if (meta.if_match != NULL) {
7038 // only overwrite existing object
7039 if (strcmp(meta.if_match, "*") == 0) {
7040 if (r == -ENOENT) {
7041 r = -ERR_PRECONDITION_FAILED;
7042 } else if (r == -ECANCELED) {
7043 r = 0;
7044 }
7045 }
7046 }
7047
7048 if (meta.if_nomatch != NULL) {
7049 // only create a new object
7050 if (strcmp(meta.if_nomatch, "*") == 0) {
7051 if (r == -EEXIST) {
7052 r = -ERR_PRECONDITION_FAILED;
7053 } else if (r == -ENOENT) {
7054 r = 0;
7055 }
7056 }
7057 }
7058 }
7059
7060 return r;
7061 }
7062
7063 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7064 map<string, bufferlist>& attrs)
7065 {
7066 RGWBucketInfo& bucket_info = target->get_bucket_info();
7067
7068 RGWRados::Bucket bop(target->get_store(), bucket_info);
7069 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
7070 index_op.set_zones_trace(meta.zones_trace);
7071
7072 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7073 int r;
7074 if (assume_noent) {
7075 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
7076 if (r == -EEXIST) {
7077 assume_noent = false;
7078 }
7079 }
7080 if (!assume_noent) {
7081 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
7082 }
7083 return r;
7084 }
7085
7086 /** Write/overwrite a system object. */
7087 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7088 map<std::string, bufferlist>& attrs, int flags,
7089 bufferlist& data,
7090 RGWObjVersionTracker *objv_tracker,
7091 real_time set_mtime /* 0 for don't set */)
7092 {
7093 rgw_rados_ref ref;
7094 int r = get_system_obj_ref(obj, &ref);
7095 if (r < 0)
7096 return r;
7097
7098 ObjectWriteOperation op;
7099
7100 if (flags & PUT_OBJ_EXCL) {
7101 if (!(flags & PUT_OBJ_CREATE))
7102 return -EINVAL;
7103 op.create(true); // exclusive create
7104 } else {
7105 op.remove();
7106 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7107 op.create(false);
7108 }
7109
7110 if (objv_tracker) {
7111 objv_tracker->prepare_op_for_write(&op);
7112 }
7113
7114 if (real_clock::is_zero(set_mtime)) {
7115 set_mtime = real_clock::now();
7116 }
7117
7118 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7119 op.mtime2(&mtime_ts);
7120 op.write_full(data);
7121
7122 bufferlist acl_bl;
7123
7124 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7125 const string& name = iter->first;
7126 bufferlist& bl = iter->second;
7127
7128 if (!bl.length())
7129 continue;
7130
7131 op.setxattr(name.c_str(), bl);
7132 }
7133
7134 r = ref.ioctx.operate(ref.oid, &op);
7135 if (r < 0) {
7136 return r;
7137 }
7138
7139 if (objv_tracker) {
7140 objv_tracker->apply_write();
7141 }
7142
7143 if (mtime) {
7144 *mtime = set_mtime;
7145 }
7146
7147 return 0;
7148 }
7149
7150 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7151 off_t ofs, bool exclusive,
7152 RGWObjVersionTracker *objv_tracker)
7153 {
7154 rgw_rados_ref ref;
7155 int r = get_system_obj_ref(obj, &ref);
7156 if (r < 0) {
7157 return r;
7158 }
7159
7160 ObjectWriteOperation op;
7161
7162 if (exclusive)
7163 op.create(true);
7164
7165 if (objv_tracker) {
7166 objv_tracker->prepare_op_for_write(&op);
7167 }
7168 if (ofs == -1) {
7169 op.write_full(bl);
7170 } else {
7171 op.write(ofs, bl);
7172 }
7173 r = ref.ioctx.operate(ref.oid, &op);
7174 if (r < 0)
7175 return r;
7176
7177 if (objv_tracker) {
7178 objv_tracker->apply_write();
7179 }
7180 return 0;
7181 }
7182
7183 /**
7184 * Write/overwrite an object to the bucket storage.
7185 * bucket: the bucket to store the object in
7186 * obj: the object name/key
7187 * data: the object contents/value
7188 * offset: the offet to write to in the object
7189 * If this is -1, we will overwrite the whole object.
7190 * size: the amount of data to write (data must be this long)
7191 * attrs: all the given attrs are written to bucket storage for the given object
7192 * Returns: 0 on success, -ERR# otherwise.
7193 */
7194
7195 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7196 off_t ofs, bool exclusive,
7197 void **handle)
7198 {
7199 rgw_rados_ref ref;
7200 int r = get_raw_obj_ref(obj, &ref);
7201 if (r < 0) {
7202 return r;
7203 }
7204
7205 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7206 *handle = c;
7207
7208 ObjectWriteOperation op;
7209
7210 if (exclusive)
7211 op.create(true);
7212
7213 if (ofs == -1) {
7214 op.write_full(bl);
7215 } else {
7216 op.write(ofs, bl);
7217 }
7218 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7219 if (r < 0)
7220 return r;
7221
7222 return 0;
7223 }
7224
7225 int RGWRados::aio_wait(void *handle)
7226 {
7227 AioCompletion *c = (AioCompletion *)handle;
7228 c->wait_for_safe();
7229 int ret = c->get_return_value();
7230 c->release();
7231 return ret;
7232 }
7233
7234 bool RGWRados::aio_completed(void *handle)
7235 {
7236 AioCompletion *c = (AioCompletion *)handle;
7237 return c->is_safe();
7238 }
7239
7240 class RGWRadosPutObj : public RGWGetDataCB
7241 {
7242 CephContext* cct;
7243 rgw_obj obj;
7244 RGWPutObjDataProcessor *filter;
7245 boost::optional<RGWPutObj_Compress>& compressor;
7246 CompressorRef& plugin;
7247 RGWPutObjProcessor_Atomic *processor;
7248 RGWOpStateSingleOp *opstate;
7249 void (*progress_cb)(off_t, void *);
7250 void *progress_data;
7251 bufferlist extra_data_bl;
7252 uint64_t extra_data_len;
7253 uint64_t data_len;
7254 map<string, bufferlist> src_attrs;
7255 public:
7256 RGWRadosPutObj(CephContext* cct,
7257 CompressorRef& plugin,
7258 boost::optional<RGWPutObj_Compress>& compressor,
7259 RGWPutObjProcessor_Atomic *p,
7260 RGWOpStateSingleOp *_ops,
7261 void (*_progress_cb)(off_t, void *),
7262 void *_progress_data) :
7263 cct(cct),
7264 filter(p),
7265 compressor(compressor),
7266 plugin(plugin),
7267 processor(p),
7268 opstate(_ops),
7269 progress_cb(_progress_cb),
7270 progress_data(_progress_data),
7271 extra_data_len(0),
7272 data_len(0) {}
7273
7274 int process_attrs(void) {
7275 if (extra_data_bl.length()) {
7276 JSONParser jp;
7277 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7278 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7279 return -EIO;
7280 }
7281
7282 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7283
7284 src_attrs.erase(RGW_ATTR_COMPRESSION);
7285 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7286 }
7287
7288 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7289 //do not compress if object is encrypted
7290 compressor = boost::in_place(cct, plugin, filter);
7291 filter = &*compressor;
7292 }
7293 return 0;
7294 }
7295
7296 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7297 if (progress_cb) {
7298 progress_cb(ofs, progress_data);
7299 }
7300 if (extra_data_len) {
7301 size_t extra_len = bl.length();
7302 if (extra_len > extra_data_len)
7303 extra_len = extra_data_len;
7304
7305 bufferlist extra;
7306 bl.splice(0, extra_len, &extra);
7307 extra_data_bl.append(extra);
7308
7309 extra_data_len -= extra_len;
7310 if (extra_data_len == 0) {
7311 int res = process_attrs();
7312 if (res < 0)
7313 return res;
7314 }
7315 if (bl.length() == 0) {
7316 return 0;
7317 }
7318 }
7319 data_len += bl.length();
7320 bool again = false;
7321
7322 bool need_opstate = true;
7323
7324 do {
7325 void *handle = NULL;
7326 rgw_raw_obj obj;
7327 uint64_t size = bl.length();
7328 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7329 if (ret < 0)
7330 return ret;
7331
7332 if (need_opstate && opstate) {
7333 /* need to update opstate repository with new state. This is ratelimited, so we're not
7334 * really doing it every time
7335 */
7336 ret = opstate->renew_state();
7337 if (ret < 0) {
7338 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7339 int r = filter->throttle_data(handle, obj, size, false);
7340 if (r < 0) {
7341 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7342 }
7343 /* could not renew state! might have been marked as cancelled */
7344 return ret;
7345 }
7346 need_opstate = false;
7347 }
7348
7349 ret = filter->throttle_data(handle, obj, size, false);
7350 if (ret < 0)
7351 return ret;
7352 } while (again);
7353
7354 return 0;
7355 }
7356
7357 bufferlist& get_extra_data() { return extra_data_bl; }
7358
7359 map<string, bufferlist>& get_attrs() { return src_attrs; }
7360
7361 void set_extra_data_len(uint64_t len) override {
7362 extra_data_len = len;
7363 }
7364
7365 uint64_t get_data_len() {
7366 return data_len;
7367 }
7368
7369 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7370 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7371 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7372 }
7373
7374 bool is_canceled() {
7375 return processor->is_canceled();
7376 }
7377 };
7378
7379 /*
7380 * prepare attrset depending on attrs_mod.
7381 */
7382 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7383 map<string, bufferlist>& attrs,
7384 RGWRados::AttrsMod attrs_mod)
7385 {
7386 switch (attrs_mod) {
7387 case RGWRados::ATTRSMOD_NONE:
7388 attrs = src_attrs;
7389 break;
7390 case RGWRados::ATTRSMOD_REPLACE:
7391 if (!attrs[RGW_ATTR_ETAG].length()) {
7392 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7393 }
7394 break;
7395 case RGWRados::ATTRSMOD_MERGE:
7396 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7397 if (attrs.find(it->first) == attrs.end()) {
7398 attrs[it->first] = it->second;
7399 }
7400 }
7401 break;
7402 }
7403 }
7404
7405 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7406 {
7407 map<string, bufferlist> attrset;
7408
7409 real_time mtime;
7410 uint64_t obj_size;
7411 RGWObjectCtx rctx(this);
7412
7413 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7414 RGWRados::Object::Read read_op(&op_target);
7415
7416 read_op.params.attrs = &attrset;
7417 read_op.params.lastmod = &mtime;
7418 read_op.params.obj_size = &obj_size;
7419
7420 int ret = read_op.prepare();
7421 if (ret < 0)
7422 return ret;
7423
7424 attrset.erase(RGW_ATTR_ID_TAG);
7425
7426 uint64_t max_chunk_size;
7427
7428 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7429 if (ret < 0) {
7430 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7431 return ret;
7432 }
7433
7434 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj, max_chunk_size, NULL, mtime, attrset,
7435 RGW_OBJ_CATEGORY_MAIN, 0, real_time(), NULL, NULL, NULL);
7436 }
7437
7438 struct obj_time_weight {
7439 real_time mtime;
7440 uint32_t zone_short_id;
7441 uint64_t pg_ver;
7442 bool high_precision;
7443
7444 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7445
7446 bool compare_low_precision(const obj_time_weight& rhs) {
7447 struct timespec l = ceph::real_clock::to_timespec(mtime);
7448 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7449 l.tv_nsec = 0;
7450 r.tv_nsec = 0;
7451 if (l > r) {
7452 return false;
7453 }
7454 if (l < r) {
7455 return true;
7456 }
7457 if (zone_short_id != rhs.zone_short_id) {
7458 return (zone_short_id < rhs.zone_short_id);
7459 }
7460 return (pg_ver < rhs.pg_ver);
7461
7462 }
7463
7464 bool operator<(const obj_time_weight& rhs) {
7465 if (!high_precision || !rhs.high_precision) {
7466 return compare_low_precision(rhs);
7467 }
7468 if (mtime > rhs.mtime) {
7469 return false;
7470 }
7471 if (mtime < rhs.mtime) {
7472 return true;
7473 }
7474 if (zone_short_id != rhs.zone_short_id) {
7475 return (zone_short_id < rhs.zone_short_id);
7476 }
7477 return (pg_ver < rhs.pg_ver);
7478 }
7479
7480 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7481 mtime = _mtime;
7482 zone_short_id = _short_id;
7483 pg_ver = _pg_ver;
7484 }
7485
7486 void init(RGWObjState *state) {
7487 mtime = state->mtime;
7488 zone_short_id = state->zone_short_id;
7489 pg_ver = state->pg_ver;
7490 }
7491 };
7492
7493 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7494 out << o.mtime;
7495
7496 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7497 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7498 }
7499
7500 return out;
7501 }
7502
7503 class RGWGetExtraDataCB : public RGWGetDataCB {
7504 bufferlist extra_data;
7505 public:
7506 RGWGetExtraDataCB() {}
7507 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7508 if (extra_data.length() < extra_data_len) {
7509 off_t max = extra_data_len - extra_data.length();
7510 if (max > bl_len) {
7511 max = bl_len;
7512 }
7513 bl.splice(0, max, &extra_data);
7514 }
7515 return bl_len;
7516 }
7517
7518 bufferlist& get_extra_data() {
7519 return extra_data;
7520 }
7521 };
7522
7523 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7524 const rgw_user& user_id,
7525 const string& client_id,
7526 req_info *info,
7527 const string& source_zone,
7528 rgw_obj& src_obj,
7529 RGWBucketInfo& src_bucket_info,
7530 real_time *src_mtime,
7531 uint64_t *psize,
7532 const real_time *mod_ptr,
7533 const real_time *unmod_ptr,
7534 bool high_precision_time,
7535 const char *if_match,
7536 const char *if_nomatch,
7537 map<string, bufferlist> *pattrs,
7538 string *version_id,
7539 string *ptag,
7540 string *petag)
7541 {
7542 /* source is in a different zonegroup, copy from there */
7543
7544 RGWRESTStreamRWRequest *in_stream_req;
7545 string tag;
7546 map<string, bufferlist> src_attrs;
7547 append_rand_alpha(cct, tag, tag, 32);
7548 obj_time_weight set_mtime_weight;
7549 set_mtime_weight.high_precision = high_precision_time;
7550
7551 RGWRESTConn *conn;
7552 if (source_zone.empty()) {
7553 if (src_bucket_info.zonegroup.empty()) {
7554 /* source is in the master zonegroup */
7555 conn = rest_master_conn;
7556 } else {
7557 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7558 if (iter == zonegroup_conn_map.end()) {
7559 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7560 return -ENOENT;
7561 }
7562 conn = iter->second;
7563 }
7564 } else {
7565 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7566 if (iter == zone_conn_map.end()) {
7567 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7568 return -ENOENT;
7569 }
7570 conn = iter->second;
7571 }
7572
7573 RGWGetExtraDataCB cb;
7574 string etag;
7575 map<string, string> req_headers;
7576 real_time set_mtime;
7577
7578 const real_time *pmod = mod_ptr;
7579
7580 obj_time_weight dest_mtime_weight;
7581
7582 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7583 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7584 true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
7585 true /* sync manifest */, &cb, &in_stream_req);
7586 if (ret < 0) {
7587 return ret;
7588 }
7589
7590 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7591 if (ret < 0) {
7592 return ret;
7593 }
7594
7595 bufferlist& extra_data_bl = cb.get_extra_data();
7596 if (extra_data_bl.length()) {
7597 JSONParser jp;
7598 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7599 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7600 return -EIO;
7601 }
7602
7603 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7604
7605 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7606 }
7607
7608 if (src_mtime) {
7609 *src_mtime = set_mtime;
7610 }
7611
7612 if (petag) {
7613 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7614 if (iter != src_attrs.end()) {
7615 bufferlist& etagbl = iter->second;
7616 *petag = etagbl.to_str();
7617 }
7618 }
7619
7620 if (pattrs) {
7621 *pattrs = src_attrs;
7622 }
7623
7624 return 0;
7625 }
7626
7627 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7628 const rgw_user& user_id,
7629 const string& client_id,
7630 const string& op_id,
7631 bool record_op_state,
7632 req_info *info,
7633 const string& source_zone,
7634 rgw_obj& dest_obj,
7635 rgw_obj& src_obj,
7636 RGWBucketInfo& dest_bucket_info,
7637 RGWBucketInfo& src_bucket_info,
7638 real_time *src_mtime,
7639 real_time *mtime,
7640 const real_time *mod_ptr,
7641 const real_time *unmod_ptr,
7642 bool high_precision_time,
7643 const char *if_match,
7644 const char *if_nomatch,
7645 AttrsMod attrs_mod,
7646 bool copy_if_newer,
7647 map<string, bufferlist>& attrs,
7648 RGWObjCategory category,
7649 uint64_t olh_epoch,
7650 real_time delete_at,
7651 string *version_id,
7652 string *ptag,
7653 ceph::buffer::list *petag,
7654 void (*progress_cb)(off_t, void *),
7655 void *progress_data,
7656 rgw_zone_set *zones_trace)
7657 {
7658 /* source is in a different zonegroup, copy from there */
7659
7660 RGWRESTStreamRWRequest *in_stream_req;
7661 string tag;
7662 int i;
7663 append_rand_alpha(cct, tag, tag, 32);
7664 obj_time_weight set_mtime_weight;
7665 set_mtime_weight.high_precision = high_precision_time;
7666
7667 RGWPutObjProcessor_Atomic processor(obj_ctx,
7668 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7669 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7670 if (version_id && *version_id != "null") {
7671 processor.set_version_id(*version_id);
7672 }
7673 processor.set_olh_epoch(olh_epoch);
7674 int ret = processor.prepare(this, NULL);
7675 if (ret < 0) {
7676 return ret;
7677 }
7678
7679 RGWRESTConn *conn;
7680 if (source_zone.empty()) {
7681 if (dest_bucket_info.zonegroup.empty()) {
7682 /* source is in the master zonegroup */
7683 conn = rest_master_conn;
7684 } else {
7685 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7686 if (iter == zonegroup_conn_map.end()) {
7687 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7688 return -ENOENT;
7689 }
7690 conn = iter->second;
7691 }
7692 } else {
7693 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7694 if (iter == zone_conn_map.end()) {
7695 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7696 return -ENOENT;
7697 }
7698 conn = iter->second;
7699 }
7700
7701 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7702
7703 RGWOpStateSingleOp *opstate = NULL;
7704
7705 if (record_op_state) {
7706 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7707
7708 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7709 if (ret < 0) {
7710 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7711 delete opstate;
7712 return ret;
7713 }
7714 }
7715
7716 boost::optional<RGWPutObj_Compress> compressor;
7717 CompressorRef plugin;
7718
7719 const auto& compression_type = zone_params.get_compression_type(
7720 dest_bucket_info.placement_rule);
7721 if (compression_type != "none") {
7722 plugin = Compressor::create(cct, compression_type);
7723 if (!plugin) {
7724 ldout(cct, 1) << "Cannot load plugin for compression type "
7725 << compression_type << dendl;
7726 }
7727 }
7728
7729 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7730
7731 string etag;
7732 map<string, string> req_headers;
7733 real_time set_mtime;
7734
7735 RGWObjState *dest_state = NULL;
7736
7737 const real_time *pmod = mod_ptr;
7738
7739 obj_time_weight dest_mtime_weight;
7740
7741 if (copy_if_newer) {
7742 /* need to get mtime for destination */
7743 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7744 if (ret < 0)
7745 goto set_err_state;
7746
7747 if (!real_clock::is_zero(dest_state->mtime)) {
7748 dest_mtime_weight.init(dest_state);
7749 pmod = &dest_mtime_weight.mtime;
7750 }
7751 }
7752
7753 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7754 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7755 true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
7756 true /* sync manifest */, &cb, &in_stream_req);
7757 if (ret < 0) {
7758 goto set_err_state;
7759 }
7760
7761 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7762 if (ret < 0) {
7763 goto set_err_state;
7764 }
7765 if (compressor && compressor->is_compressed()) {
7766 bufferlist tmp;
7767 RGWCompressionInfo cs_info;
7768 cs_info.compression_type = plugin->get_type_name();
7769 cs_info.orig_size = cb.get_data_len();
7770 cs_info.blocks = move(compressor->get_compression_blocks());
7771 ::encode(cs_info, tmp);
7772 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7773 }
7774
7775 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7776 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7777 } else {
7778 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7779 if (iter != cb.get_attrs().end()) {
7780 try {
7781 ::decode(delete_at, iter->second);
7782 } catch (buffer::error& err) {
7783 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7784 }
7785 }
7786 }
7787
7788 if (src_mtime) {
7789 *src_mtime = set_mtime;
7790 }
7791
7792 if (petag) {
7793 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7794 if (iter != cb.get_attrs().end()) {
7795 *petag = iter->second;
7796 }
7797 }
7798
7799 if (source_zone.empty()) {
7800 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7801 } else {
7802 attrs = cb.get_attrs();
7803 }
7804
7805 if (copy_if_newer) {
7806 uint64_t pg_ver = 0;
7807 auto i = attrs.find(RGW_ATTR_PG_VER);
7808 if (i != attrs.end() && i->second.length() > 0) {
7809 bufferlist::iterator iter = i->second.begin();
7810 try {
7811 ::decode(pg_ver, iter);
7812 } catch (buffer::error& err) {
7813 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7814 /* non critical error */
7815 }
7816 }
7817 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7818 }
7819
7820 #define MAX_COMPLETE_RETRY 100
7821 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7822 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7823 if (ret < 0) {
7824 goto set_err_state;
7825 }
7826 if (copy_if_newer && cb.is_canceled()) {
7827 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7828 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7829 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7830 if (ret < 0) {
7831 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7832 goto set_err_state;
7833 }
7834 dest_mtime_weight.init(dest_state);
7835 dest_mtime_weight.high_precision = high_precision_time;
7836 if (!dest_state->exists ||
7837 dest_mtime_weight < set_mtime_weight) {
7838 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7839 continue;
7840 } else {
7841 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7842 }
7843 }
7844 break;
7845 }
7846
7847 if (i == MAX_COMPLETE_RETRY) {
7848 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7849 ret = -EIO;
7850 goto set_err_state;
7851 }
7852
7853 if (opstate) {
7854 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7855 if (ret < 0) {
7856 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7857 }
7858 delete opstate;
7859 }
7860
7861 return 0;
7862 set_err_state:
7863 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7864 ret = 0;
7865 }
7866 if (opstate) {
7867 RGWOpState::OpState state;
7868 if (ret < 0) {
7869 state = RGWOpState::OPSTATE_ERROR;
7870 } else {
7871 state = RGWOpState::OPSTATE_COMPLETE;
7872 }
7873 int r = opstate->set_state(state);
7874 if (r < 0) {
7875 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7876 }
7877 delete opstate;
7878 }
7879 return ret;
7880 }
7881
7882
7883 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7884 map<string, bufferlist>& src_attrs,
7885 RGWRados::Object::Read& read_op,
7886 const rgw_user& user_id,
7887 rgw_obj& dest_obj,
7888 real_time *mtime)
7889 {
7890 string etag;
7891
7892 RGWRESTStreamWriteRequest *out_stream_req;
7893
7894 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
7895 if (ret < 0) {
7896 return ret;
7897 }
7898
7899 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
7900 if (ret < 0) {
7901 delete out_stream_req;
7902 return ret;
7903 }
7904
7905 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
7906 if (ret < 0)
7907 return ret;
7908
7909 return 0;
7910 }
7911
/**
 * Copy an object.
 * dest_obj: the object to copy into
 * src_obj: the object to copy from
 * attrs: usage depends on attrs_mod parameter
 * attrs_mod: the modification mode of the attrs, may have the following values:
 *            ATTRSMOD_NONE - the attributes of the source object will be
 *                            copied without modifications, attrs parameter is ignored;
 *            ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
 *                               parameter, source object attributes are not copied;
 *            ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
 *                             are overwritten by values contained in attrs parameter.
 * source_zone: when non-empty (or when the source bucket belongs to another
 *              zonegroup) the copy is delegated to fetch_remote_obj()
 * src_mtime: optional out, modification time of the source object
 * mtime: passed on to whichever helper performs the write
 * petag: optional out, receives the RGW_ATTR_ETAG value of the resulting attrs
 *
 * For a purely local copy the data is either copied (copy_obj_data()) or,
 * when the source has a manifest with a tail in the same pool, the tail
 * objects get an extra reference and only the head is written.
 * Returns: 0 on success, -ERR# otherwise.
 */
int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
               const rgw_user& user_id,
               const string& client_id,
               const string& op_id,
               req_info *info,
               const string& source_zone,
               rgw_obj& dest_obj,
               rgw_obj& src_obj,
               RGWBucketInfo& dest_bucket_info,
               RGWBucketInfo& src_bucket_info,
               real_time *src_mtime,
               real_time *mtime,
               const real_time *mod_ptr,
               const real_time *unmod_ptr,
               bool high_precision_time,
               const char *if_match,
               const char *if_nomatch,
               AttrsMod attrs_mod,
               bool copy_if_newer,
               map<string, bufferlist>& attrs,
               RGWObjCategory category,
               uint64_t olh_epoch,
               real_time delete_at,
               string *version_id,
               string *ptag,
               ceph::buffer::list *petag,
               void (*progress_cb)(off_t, void *),
               void *progress_data)
{
  int ret;
  uint64_t obj_size;
  /* NOTE(review): shadow_obj / shadow_oid are initialized below but never
   * read again in this function */
  rgw_obj shadow_obj = dest_obj;
  string shadow_oid;

  bool remote_src;
  bool remote_dest;

  append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
  shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);

  /* a bucket is remote when its zonegroup is not the local one */
  remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
  remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);

  if (remote_src && remote_dest) {
    ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
    return -EINVAL;
  }

  ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;

  /* remote source (or explicit source zone): pull the object over REST,
   * recording the op state */
  if (remote_src || !source_zone.empty()) {
    return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
                            dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
                            unmod_ptr, high_precision_time,
                            if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
                            olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
  }

  /* local source: read its attrs and size, applying the request conditions */
  map<string, bufferlist> src_attrs;
  RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
  RGWRados::Object::Read read_op(&src_op_target);

  read_op.conds.mod_ptr = mod_ptr;
  read_op.conds.unmod_ptr = unmod_ptr;
  read_op.conds.high_precision_time = high_precision_time;
  read_op.conds.if_match = if_match;
  read_op.conds.if_nomatch = if_nomatch;
  read_op.params.attrs = &src_attrs;
  read_op.params.lastmod = src_mtime;
  read_op.params.obj_size = &obj_size;

  ret = read_op.prepare();
  if (ret < 0) {
    return ret;
  }

  /* the ACL always comes from the caller's attrs, and the source's
   * expiration attribute is not carried over */
  src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
  src_attrs.erase(RGW_ATTR_DELETE_AT);

  set_copy_attrs(src_attrs, attrs, attrs_mod);
  attrs.erase(RGW_ATTR_ID_TAG);
  attrs.erase(RGW_ATTR_PG_VER);
  attrs.erase(RGW_ATTR_SOURCE_ZONE);
  /* the source's compression info is kept regardless of attrs_mod */
  map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
  if (cmp != src_attrs.end())
    attrs[RGW_ATTR_COMPRESSION] = cmp->second;

  RGWObjManifest manifest;
  RGWObjState *astate = NULL;

  ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
  if (ret < 0) {
    return ret;
  }

  /* tail objects whose refcount was bumped; used for rollback on error */
  vector<rgw_raw_obj> ref_objs;

  if (remote_dest) {
    /* dest is in a different zonegroup, copy it there */
    return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
  }
  uint64_t max_chunk_size;

  ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
  if (ret < 0) {
    ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
    return ret;
  }

  rgw_pool src_pool;
  rgw_pool dest_pool;
  if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
    ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
    return -EIO;
  }
  if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
    ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
    return -EIO;
  }


  /* decide between a full data copy and sharing the tail:
   *  - no manifest, different pools, or no tail      -> copy the data
   *  - head holds more data than max_chunk_size      -> copy the data
   *  - head holds data that fits into one chunk      -> copy only the head
   *    (copy_first) and reference the tail */
  bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
  bool copy_first = false;
  if (astate->has_manifest) {
    if (!astate->manifest.has_tail()) {
      copy_data = true;
    } else {
      uint64_t head_size = astate->manifest.get_head_size();

      if (head_size > 0) {
        if (head_size > max_chunk_size) {
          copy_data = true;
        } else {
          copy_first = true;
        }
      }
    }
  }

  if (petag) {
    const auto iter = attrs.find(RGW_ATTR_ETAG);
    if (iter != attrs.end()) {
      *petag = iter->second;
    }
  }

  if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
    return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
                         max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
                         version_id, ptag, petag);
  }

  RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();

  if (copy_first) { // we need to copy first chunk, not increase refcount
    ++miter;
  }

  /* the ioctx obtained here is reused for every tail object below */
  rgw_rados_ref ref;
  ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
  if (ret < 0) {
    return ret;
  }

  bool versioned_dest = dest_bucket_info.versioning_enabled();

  /* an explicit version id wins; otherwise a versioned destination gets a
   * generated instance name */
  if (version_id && !version_id->empty()) {
    versioned_dest = true;
    dest_obj.key.set_instance(*version_id);
  } else if (versioned_dest) {
    gen_rand_obj_instance_name(&dest_obj);
  }

  bufferlist first_chunk;

  bool copy_itself = (dest_obj == src_obj);
  RGWObjManifest *pmanifest;
  ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;

  RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
  RGWRados::Object::Write write_op(&dest_op_target);

  /* tag used for the refcount operations and for the written head;
   * generated when the caller did not supply one */
  string tag;

  if (ptag) {
    tag = *ptag;
  }

  if (tag.empty()) {
    append_rand_alpha(cct, tag, tag, 32);
  }

  if (!copy_itself) {
    /* work on a copy of the source manifest and take a reference on each
     * remaining tail object */
    manifest = astate->manifest;
    const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
    if (tail_placement.bucket.name.empty()) {
      manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
    }
    for (; miter != astate->manifest.obj_end(); ++miter) {
      ObjectWriteOperation op;
      cls_refcount_get(op, tag, true);
      const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
      ref.ioctx.locator_set_key(loc.loc);

      ret = ref.ioctx.operate(loc.oid, &op);
      if (ret < 0) {
        goto done_ret;
      }

      ref_objs.push_back(loc);
    }

    pmanifest = &manifest;
  } else {
    pmanifest = &astate->manifest;
    /* don't send the object's tail for garbage collection */
    astate->keep_tail = true;
  }

  if (copy_first) {
    /* the head data is read and rewritten into the new head object */
    ret = read_op.read(0, max_chunk_size, first_chunk);
    if (ret < 0) {
      goto done_ret;
    }

    pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
  } else {
    pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
  }

  write_op.meta.data = &first_chunk;
  write_op.meta.manifest = pmanifest;
  write_op.meta.ptag = &tag;
  write_op.meta.owner = dest_bucket_info.owner;
  write_op.meta.mtime = mtime;
  write_op.meta.flags = PUT_OBJ_CREATE;
  write_op.meta.category = category;
  write_op.meta.olh_epoch = olh_epoch;
  write_op.meta.delete_at = delete_at;

  ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
  if (ret < 0) {
    goto done_ret;
  }

  return 0;

done_ret:
  /* error path: drop the references taken above (none were taken when
   * copying an object onto itself) */
  if (!copy_itself) {
    vector<rgw_raw_obj>::iterator riter;

    /* rollback reference */
    for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
      ObjectWriteOperation op;
      cls_refcount_put(op, tag, true);

      ref.ioctx.locator_set_key(riter->loc);

      int r = ref.ioctx.operate(riter->oid, &op);
      if (r < 0) {
        ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
      }
    }
  }
  return ret;
}
8193
8194
8195 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8196 RGWBucketInfo& dest_bucket_info,
8197 RGWRados::Object::Read& read_op, off_t end,
8198 rgw_obj& dest_obj,
8199 rgw_obj& src_obj,
8200 uint64_t max_chunk_size,
8201 real_time *mtime,
8202 real_time set_mtime,
8203 map<string, bufferlist>& attrs,
8204 RGWObjCategory category,
8205 uint64_t olh_epoch,
8206 real_time delete_at,
8207 string *version_id,
8208 string *ptag,
8209 ceph::buffer::list *petag)
8210 {
8211 bufferlist first_chunk;
8212 RGWObjManifest manifest;
8213
8214 string tag;
8215 append_rand_alpha(cct, tag, tag, 32);
8216
8217 RGWPutObjProcessor_Atomic processor(obj_ctx,
8218 dest_bucket_info, dest_obj.bucket, dest_obj.get_oid(),
8219 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8220 if (version_id) {
8221 processor.set_version_id(*version_id);
8222 }
8223 processor.set_olh_epoch(olh_epoch);
8224 int ret = processor.prepare(this, NULL);
8225 if (ret < 0)
8226 return ret;
8227
8228 off_t ofs = 0;
8229
8230 do {
8231 bufferlist bl;
8232 ret = read_op.read(ofs, end, bl);
8233
8234 uint64_t read_len = ret;
8235 bool again;
8236
8237 do {
8238 void *handle;
8239 rgw_raw_obj obj;
8240
8241 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8242 if (ret < 0) {
8243 return ret;
8244 }
8245 ret = processor.throttle_data(handle, obj, read_len, false);
8246 if (ret < 0)
8247 return ret;
8248 } while (again);
8249
8250 ofs += read_len;
8251 } while (ofs <= end);
8252
8253 string etag;
8254 auto iter = attrs.find(RGW_ATTR_ETAG);
8255 if (iter != attrs.end()) {
8256 bufferlist& bl = iter->second;
8257 etag = string(bl.c_str(), bl.length());
8258 if (petag) {
8259 *petag = bl;
8260 }
8261 }
8262
8263 uint64_t accounted_size;
8264 {
8265 bool compressed{false};
8266 RGWCompressionInfo cs_info;
8267 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8268 if (ret < 0) {
8269 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8270 return ret;
8271 }
8272 // pass original size if compressed
8273 accounted_size = compressed ? cs_info.orig_size : ofs;
8274 }
8275
8276 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8277 }
8278
8279 bool RGWRados::is_meta_master()
8280 {
8281 if (!get_zonegroup().is_master_zonegroup()) {
8282 return false;
8283 }
8284
8285 return (get_zonegroup().master_zone == zone_public_config.id);
8286 }
8287
8288 /**
8289 * Check to see if the bucket metadata could be synced
8290 * bucket: the bucket to check
8291 * Returns false is the bucket is not synced
8292 */
8293 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8294 {
8295
8296 /* no current period */
8297 if (current_period.get_id().empty()) {
8298 return false;
8299 }
8300
8301 /* zonegroup is not master zonegroup */
8302 if (!get_zonegroup().is_master_zonegroup()) {
8303 return false;
8304 }
8305
8306 /* single zonegroup and a single zone */
8307 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
8308 return false;
8309 }
8310
8311 /* zone is not master */
8312 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8313 return false;
8314 }
8315
8316 return true;
8317 }
8318
8319 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8320 {
8321 std::map<string, rgw_bucket_dir_entry> ent_map;
8322 rgw_obj_index_key marker;
8323 string prefix;
8324 bool is_truncated;
8325
8326 do {
8327 #define NUM_ENTRIES 1000
8328 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
8329 &is_truncated, &marker);
8330 if (r < 0)
8331 return r;
8332
8333 string ns;
8334 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
8335 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
8336 rgw_obj_key obj;
8337
8338 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
8339 return -ENOTEMPTY;
8340 }
8341 } while (is_truncated);
8342 return 0;
8343 }
8344
8345 /**
8346 * Delete a bucket.
8347 * bucket: the name of the bucket to delete
8348 * Returns 0 on success, -ERR# otherwise.
8349 */
8350 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8351 {
8352 const rgw_bucket& bucket = bucket_info.bucket;
8353 librados::IoCtx index_ctx;
8354 map<int, string> bucket_objs;
8355 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8356 if (r < 0)
8357 return r;
8358
8359 if (check_empty) {
8360 r = check_bucket_empty(bucket_info);
8361 if (r < 0) {
8362 return r;
8363 }
8364 }
8365
8366 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8367 if (r < 0)
8368 return r;
8369
8370 /* if the bucket is not synced we can remove the meta file */
8371 if (!is_syncing_bucket_meta(bucket)) {
8372 RGWObjVersionTracker objv_tracker;
8373 string entry = bucket.get_key();
8374 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8375 if (r < 0) {
8376 return r;
8377 }
8378 /* remove bucket index objects*/
8379 map<int, string>::const_iterator biter;
8380 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8381 index_ctx.remove(biter->second);
8382 }
8383 }
8384 return 0;
8385 }
8386
8387 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8388 {
8389 RGWBucketInfo info;
8390 map<string, bufferlist> attrs;
8391 RGWObjectCtx obj_ctx(this);
8392 int r;
8393 if (bucket.bucket_id.empty()) {
8394 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8395 } else {
8396 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8397 }
8398 if (r < 0) {
8399 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8400 return r;
8401 }
8402
8403 info.owner = owner.get_id();
8404
8405 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8406 if (r < 0) {
8407 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8408 return r;
8409 }
8410
8411 return 0;
8412 }
8413
8414
8415 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8416 {
8417 int ret = 0;
8418
8419 vector<rgw_bucket>::iterator iter;
8420
8421 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8422 rgw_bucket& bucket = *iter;
8423 if (enabled)
8424 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8425 else
8426 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8427
8428 RGWBucketInfo info;
8429 map<string, bufferlist> attrs;
8430 RGWObjectCtx obj_ctx(this);
8431 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8432 if (r < 0) {
8433 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8434 ret = r;
8435 continue;
8436 }
8437 if (enabled) {
8438 info.flags &= ~BUCKET_SUSPENDED;
8439 } else {
8440 info.flags |= BUCKET_SUSPENDED;
8441 }
8442
8443 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8444 if (r < 0) {
8445 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8446 ret = r;
8447 continue;
8448 }
8449 }
8450 return ret;
8451 }
8452
8453 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8454 {
8455 RGWBucketInfo bucket_info;
8456 RGWObjectCtx obj_ctx(this);
8457 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8458 if (ret < 0) {
8459 return ret;
8460 }
8461
8462 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8463 return 0;
8464 }
8465
8466 int RGWRados::Object::complete_atomic_modification()
8467 {
8468 if (!state->has_manifest || state->keep_tail)
8469 return 0;
8470
8471 cls_rgw_obj_chain chain;
8472 store->update_gc_chain(obj, state->manifest, &chain);
8473
8474 if (chain.empty()) {
8475 return 0;
8476 }
8477
8478 string tag = state->obj_tag.to_str();
8479 return store->gc->send_chain(chain, tag, false); // do it async
8480 }
8481
8482 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8483 {
8484 RGWObjManifest::obj_iterator iter;
8485 rgw_raw_obj raw_head;
8486 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8487 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8488 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8489 if (mobj == raw_head)
8490 continue;
8491 cls_rgw_obj_key key(mobj.oid);
8492 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8493 }
8494 }
8495
8496 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8497 {
8498 return gc->send_chain(chain, tag, sync);
8499 }
8500
8501 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8502 {
8503 const rgw_bucket& bucket = bucket_info.bucket;
8504 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8505 if (r < 0)
8506 return r;
8507
8508 if (bucket.bucket_id.empty()) {
8509 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8510 return -EIO;
8511 }
8512
8513 bucket_oid = dir_oid_prefix;
8514 bucket_oid.append(bucket.bucket_id);
8515
8516 return 0;
8517 }
8518
8519 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8520 string& bucket_oid_base) {
8521 const rgw_bucket& bucket = bucket_info.bucket;
8522 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8523 if (r < 0)
8524 return r;
8525
8526 if (bucket.bucket_id.empty()) {
8527 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8528 return -EIO;
8529 }
8530
8531 bucket_oid_base = dir_oid_prefix;
8532 bucket_oid_base.append(bucket.bucket_id);
8533
8534 return 0;
8535
8536 }
8537
8538 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8539 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8540 string bucket_oid_base;
8541 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8542 if (ret < 0) {
8543 return ret;
8544 }
8545
8546 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8547 if (bucket_instance_ids) {
8548 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8549 }
8550 return 0;
8551 }
8552
8553 template<typename T>
8554 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8555 map<int, string>& oids, map<int, T>& bucket_objs,
8556 int shard_id, map<int, string> *bucket_instance_ids)
8557 {
8558 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8559 if (ret < 0)
8560 return ret;
8561
8562 map<int, string>::const_iterator iter = oids.begin();
8563 for (; iter != oids.end(); ++iter) {
8564 bucket_objs[iter->first] = T();
8565 }
8566 return 0;
8567 }
8568
8569 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8570 const string& obj_key, string *bucket_obj, int *shard_id)
8571 {
8572 string bucket_oid_base;
8573 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8574 if (ret < 0)
8575 return ret;
8576
8577 RGWObjectCtx obj_ctx(this);
8578
8579 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8580 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8581 if (ret < 0) {
8582 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8583 return ret;
8584 }
8585 return 0;
8586 }
8587
8588 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8589 int shard_id, string *bucket_obj)
8590 {
8591 string bucket_oid_base;
8592 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8593 if (ret < 0)
8594 return ret;
8595
8596 RGWObjectCtx obj_ctx(this);
8597
8598 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8599 shard_id, bucket_obj);
8600 return 0;
8601 }
8602
8603 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8604 map<RGWObjCategory, RGWStorageStats>& stats)
8605 {
8606 for (const auto& pair : header.stats) {
8607 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8608 const rgw_bucket_category_stats& header_stats = pair.second;
8609
8610 RGWStorageStats& s = stats[category];
8611
8612 s.category = category;
8613 s.size += header_stats.total_size;
8614 s.size_rounded += header_stats.total_size_rounded;
8615 s.size_utilized += header_stats.actual_size;
8616 s.num_objects += header_stats.num_entries;
8617 }
8618 }
8619
8620 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8621 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8622 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8623 {
8624 librados::IoCtx index_ctx;
8625 // key - bucket index object id
8626 // value - bucket index check OP returned result with the given bucket index object (shard)
8627 map<int, string> oids;
8628 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8629
8630 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8631 if (ret < 0) {
8632 return ret;
8633 }
8634
8635 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8636 if (ret < 0) {
8637 return ret;
8638 }
8639
8640 // Aggregate results (from different shards if there is any)
8641 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8642 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8643 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8644 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8645 }
8646
8647 return 0;
8648 }
8649
8650 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8651 {
8652 librados::IoCtx index_ctx;
8653 map<int, string> bucket_objs;
8654
8655 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8656 if (r < 0) {
8657 return r;
8658 }
8659
8660 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8661 }
8662
8663 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8664 {
8665 librados::IoCtx index_ctx;
8666 map<int, string> bucket_objs;
8667
8668 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8669 if (r < 0) {
8670 return r;
8671 }
8672
8673 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8674 }
8675
8676 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8677 {
8678 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8679 std::string oid, key;
8680 get_obj_bucket_and_oid_loc(obj, oid, key);
8681 if (!rctx)
8682 return 0;
8683
8684 RGWObjState *state = NULL;
8685
8686 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8687 if (r < 0)
8688 return r;
8689
8690 if (!state->is_atomic) {
8691 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8692 return -EINVAL;
8693 }
8694
8695 if (state->obj_tag.length() == 0) {// check for backward compatibility
8696 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8697 return -EINVAL;
8698 }
8699
8700 string tag = state->obj_tag.c_str();
8701
8702 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8703
8704 return gc->defer_chain(tag, false);
8705 }
8706
8707 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8708 {
8709 list<string> prefixes;
8710 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8711 cls_rgw_remove_obj(op, prefixes);
8712 }
8713
8714 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
8715 {
8716 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
8717 }
8718
8719 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
8720 {
8721 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
8722 }
8723
8724
8725 /**
8726 * Delete an object.
8727 * bucket: name of the bucket storing the object
8728 * obj: name of the object to delete
8729 * Returns: 0 on success, -ERR# otherwise.
8730 */
8731 int RGWRados::Object::Delete::delete_obj()
8732 {
8733 RGWRados *store = target->get_store();
8734 rgw_obj& src_obj = target->get_obj();
8735 const string& instance = src_obj.key.instance;
8736 rgw_obj obj = src_obj;
8737
8738 if (instance == "null") {
8739 obj.key.instance.clear();
8740 }
8741
8742 bool explicit_marker_version = (!params.marker_version_id.empty());
8743
8744 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
8745 if (instance.empty() || explicit_marker_version) {
8746 rgw_obj marker = obj;
8747
8748 if (!params.marker_version_id.empty()) {
8749 if (params.marker_version_id != "null") {
8750 marker.key.set_instance(params.marker_version_id);
8751 }
8752 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
8753 store->gen_rand_obj_instance_name(&marker);
8754 }
8755
8756 result.version_id = marker.key.instance;
8757 result.delete_marker = true;
8758
8759 struct rgw_bucket_dir_entry_meta meta;
8760
8761 meta.owner = params.obj_owner.get_id().to_str();
8762 meta.owner_display_name = params.obj_owner.get_display_name();
8763
8764 if (real_clock::is_zero(params.mtime)) {
8765 meta.mtime = real_clock::now();
8766 } else {
8767 meta.mtime = params.mtime;
8768 }
8769
8770 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
8771 if (r < 0) {
8772 return r;
8773 }
8774 } else {
8775 rgw_bucket_dir_entry dirent;
8776
8777 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
8778 if (r < 0) {
8779 return r;
8780 }
8781 result.delete_marker = dirent.is_delete_marker();
8782 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
8783 if (r < 0) {
8784 return r;
8785 }
8786 result.version_id = instance;
8787 }
8788
8789 BucketShard *bs;
8790 int r = target->get_bucket_shard(&bs);
8791 if (r < 0) {
8792 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
8793 return r;
8794 }
8795
8796 if (target->bucket_info.datasync_flag_enabled()) {
8797 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
8798 if (r < 0) {
8799 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
8800 return r;
8801 }
8802 }
8803
8804 return 0;
8805 }
8806
8807 rgw_rados_ref ref;
8808 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
8809 if (r < 0) {
8810 return r;
8811 }
8812
8813 RGWObjState *state;
8814 r = target->get_state(&state, false);
8815 if (r < 0)
8816 return r;
8817
8818 ObjectWriteOperation op;
8819
8820 if (!real_clock::is_zero(params.unmod_since)) {
8821 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
8822 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
8823 if (!params.high_precision_time) {
8824 ctime.tv_nsec = 0;
8825 unmod.tv_nsec = 0;
8826 }
8827
8828 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
8829 if (ctime > unmod) {
8830 return -ERR_PRECONDITION_FAILED;
8831 }
8832
8833 /* only delete object if mtime is less than or equal to params.unmod_since */
8834 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
8835 }
8836 uint64_t obj_size = state->size;
8837
8838 if (!real_clock::is_zero(params.expiration_time)) {
8839 bufferlist bl;
8840 real_time delete_at;
8841
8842 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
8843 try {
8844 bufferlist::iterator iter = bl.begin();
8845 ::decode(delete_at, iter);
8846 } catch (buffer::error& err) {
8847 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
8848 return -EIO;
8849 }
8850
8851 if (params.expiration_time != delete_at) {
8852 return -ERR_PRECONDITION_FAILED;
8853 }
8854 } else {
8855 return -ERR_PRECONDITION_FAILED;
8856 }
8857 }
8858
8859 if (!state->exists) {
8860 target->invalidate_state();
8861 return -ENOENT;
8862 }
8863
8864 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true);
8865 if (r < 0)
8866 return r;
8867
8868 RGWBucketInfo& bucket_info = target->get_bucket_info();
8869
8870 RGWRados::Bucket bop(store, bucket_info);
8871 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8872
8873 index_op.set_zones_trace(params.zones_trace);
8874 index_op.set_bilog_flags(params.bilog_flags);
8875
8876
8877 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
8878 if (r < 0)
8879 return r;
8880
8881 store->remove_rgw_head_obj(op);
8882 r = ref.ioctx.operate(ref.oid, &op);
8883 bool need_invalidate = false;
8884 if (r == -ECANCELED) {
8885 /* raced with another operation, we can regard it as removed */
8886 need_invalidate = true;
8887 r = 0;
8888 }
8889
8890 int64_t poolid = ref.ioctx.get_id();
8891 if (r >= 0) {
8892 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
8893 if (obj_tombstone_cache) {
8894 tombstone_entry entry{*state};
8895 obj_tombstone_cache->add(obj, entry);
8896 }
8897 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
8898
8899 int ret = target->complete_atomic_modification();
8900 if (ret < 0) {
8901 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
8902 }
8903 /* other than that, no need to propagate error */
8904 } else {
8905 int ret = index_op.cancel();
8906 if (ret < 0) {
8907 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
8908 }
8909 }
8910
8911 if (need_invalidate) {
8912 target->invalidate_state();
8913 }
8914
8915 if (r < 0)
8916 return r;
8917
8918 /* update quota cache */
8919 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
8920
8921 return 0;
8922 }
8923
8924 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
8925 const RGWBucketInfo& bucket_info,
8926 const rgw_obj& obj,
8927 int versioning_status,
8928 uint16_t bilog_flags,
8929 const real_time& expiration_time,
8930 rgw_zone_set *zones_trace)
8931 {
8932 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
8933 RGWRados::Object::Delete del_op(&del_target);
8934
8935 del_op.params.bucket_owner = bucket_info.owner;
8936 del_op.params.versioning_status = versioning_status;
8937 del_op.params.bilog_flags = bilog_flags;
8938 del_op.params.expiration_time = expiration_time;
8939 del_op.params.zones_trace = zones_trace;
8940
8941 return del_op.delete_obj();
8942 }
8943
8944 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
8945 {
8946 rgw_rados_ref ref;
8947 int r = get_raw_obj_ref(obj, &ref);
8948 if (r < 0) {
8949 return r;
8950 }
8951
8952 ObjectWriteOperation op;
8953
8954 op.remove();
8955 r = ref.ioctx.operate(ref.oid, &op);
8956 if (r < 0)
8957 return r;
8958
8959 return 0;
8960 }
8961
8962 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
8963 {
8964 if (obj.empty()) {
8965 ldout(cct, 1) << "delete_system_obj got empty object name "
8966 << obj << ", returning EINVAL" << dendl;
8967 return -EINVAL;
8968 }
8969 rgw_rados_ref ref;
8970 int r = get_raw_obj_ref(obj, &ref);
8971 if (r < 0) {
8972 return r;
8973 }
8974
8975 ObjectWriteOperation op;
8976
8977 if (objv_tracker) {
8978 objv_tracker->prepare_op_for_write(&op);
8979 }
8980
8981 op.remove();
8982 r = ref.ioctx.operate(ref.oid, &op);
8983 if (r < 0)
8984 return r;
8985
8986 return 0;
8987 }
8988
8989 int RGWRados::delete_obj_index(const rgw_obj& obj)
8990 {
8991 std::string oid, key;
8992 get_obj_bucket_and_oid_loc(obj, oid, key);
8993
8994 RGWObjectCtx obj_ctx(this);
8995
8996 RGWBucketInfo bucket_info;
8997 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
8998 if (ret < 0) {
8999 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9000 return ret;
9001 }
9002
9003 RGWRados::Bucket bop(this, bucket_info);
9004 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9005
9006 real_time removed_mtime;
9007 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9008
9009 return r;
9010 }
9011
9012 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9013 {
9014 string tag;
9015
9016 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9017 if (mi != manifest.obj_end()) {
9018 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9019 ++mi;
9020 tag = mi.get_location().get_raw_obj(store).oid;
9021 tag.append("_");
9022 }
9023
9024 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9025 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9026 MD5 hash;
9027 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9028
9029 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9030 if (iter != attrset.end()) {
9031 bufferlist& bl = iter->second;
9032 hash.Update((const byte *)bl.c_str(), bl.length());
9033 }
9034
9035 hash.Final(md5);
9036 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9037 tag.append(md5_str);
9038
9039 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9040
9041 tag_bl.append(tag.c_str(), tag.size() + 1);
9042 }
9043
9044 static bool is_olh(map<string, bufferlist>& attrs)
9045 {
9046 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9047 return (iter != attrs.end());
9048 }
9049
9050 static bool has_olh_tag(map<string, bufferlist>& attrs)
9051 {
9052 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9053 return (iter != attrs.end());
9054 }
9055
9056 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9057 RGWObjState *olh_state, RGWObjState **target_state)
9058 {
9059 assert(olh_state->is_olh);
9060
9061 rgw_obj target;
9062 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9063 if (r < 0) {
9064 return r;
9065 }
9066 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9067 if (r < 0) {
9068 return r;
9069 }
9070
9071 return 0;
9072 }
9073
9074 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9075 {
9076 if (obj.empty()) {
9077 return -EINVAL;
9078 }
9079
9080 RGWRawObjState *s = rctx->raw.get_state(obj);
9081 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9082 *state = s;
9083 if (s->has_attrs) {
9084 return 0;
9085 }
9086
9087 s->obj = obj;
9088
9089 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9090 if (r == -ENOENT) {
9091 s->exists = false;
9092 s->has_attrs = true;
9093 s->mtime = real_time();
9094 return 0;
9095 }
9096 if (r < 0)
9097 return r;
9098
9099 s->exists = true;
9100 s->has_attrs = true;
9101 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9102
9103 if (s->obj_tag.length())
9104 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9105 << s->obj_tag.c_str() << dendl;
9106 else
9107 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9108
9109 return 0;
9110 }
9111
9112 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9113 {
9114 int ret;
9115
9116 do {
9117 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9118 } while (ret == -EAGAIN);
9119
9120 return ret;
9121 }
9122
9123 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9124 RGWObjState **state, bool follow_olh, bool assume_noent)
9125 {
9126 if (obj.empty()) {
9127 return -EINVAL;
9128 }
9129
9130 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9131
9132 RGWObjState *s = rctx->obj.get_state(obj);
9133 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9134 *state = s;
9135 if (s->has_attrs) {
9136 if (s->is_olh && need_follow_olh) {
9137 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9138 }
9139 return 0;
9140 }
9141
9142 s->obj = obj;
9143
9144 rgw_raw_obj raw_obj;
9145 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9146
9147 int r = -ENOENT;
9148
9149 if (!assume_noent) {
9150 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9151 }
9152
9153 if (r == -ENOENT) {
9154 s->exists = false;
9155 s->has_attrs = true;
9156 tombstone_entry entry;
9157 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9158 s->mtime = entry.mtime;
9159 s->zone_short_id = entry.zone_short_id;
9160 s->pg_ver = entry.pg_ver;
9161 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9162 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9163 } else {
9164 s->mtime = real_time();
9165 }
9166 return 0;
9167 }
9168 if (r < 0)
9169 return r;
9170
9171 s->exists = true;
9172 s->has_attrs = true;
9173 s->accounted_size = s->size;
9174
9175 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9176 const bool compressed = (iter != s->attrset.end());
9177 if (compressed) {
9178 // use uncompressed size for accounted_size
9179 try {
9180 RGWCompressionInfo info;
9181 auto p = iter->second.begin();
9182 ::decode(info, p);
9183 s->accounted_size = info.orig_size;
9184 } catch (buffer::error&) {
9185 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9186 return -EIO;
9187 }
9188 }
9189
9190 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9191 if (iter != s->attrset.end()) {
9192 bufferlist bl = iter->second;
9193 bufferlist::iterator it = bl.begin();
9194 it.copy(bl.length(), s->shadow_obj);
9195 s->shadow_obj[bl.length()] = '\0';
9196 }
9197 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9198
9199 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9200 if (manifest_bl.length()) {
9201 bufferlist::iterator miter = manifest_bl.begin();
9202 try {
9203 ::decode(s->manifest, miter);
9204 s->has_manifest = true;
9205 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9206 broken due to old bugs */
9207 s->size = s->manifest.get_obj_size();
9208 if (!compressed)
9209 s->accounted_size = s->size;
9210 } catch (buffer::error& err) {
9211 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9212 return -EIO;
9213 }
9214 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9215 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9216 RGWObjManifest::obj_iterator mi;
9217 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9218 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9219 }
9220 }
9221
9222 if (!s->obj_tag.length()) {
9223 /*
9224 * Uh oh, something's wrong, object with manifest should have tag. Let's
9225 * create one out of the manifest, would be unique
9226 */
9227 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9228 s->fake_tag = true;
9229 }
9230 }
9231 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9232 if (aiter != s->attrset.end()) {
9233 bufferlist& pg_ver_bl = aiter->second;
9234 if (pg_ver_bl.length()) {
9235 bufferlist::iterator pgbl = pg_ver_bl.begin();
9236 try {
9237 ::decode(s->pg_ver, pgbl);
9238 } catch (buffer::error& err) {
9239 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9240 }
9241 }
9242 }
9243 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9244 if (aiter != s->attrset.end()) {
9245 bufferlist& zone_short_id_bl = aiter->second;
9246 if (zone_short_id_bl.length()) {
9247 bufferlist::iterator zbl = zone_short_id_bl.begin();
9248 try {
9249 ::decode(s->zone_short_id, zbl);
9250 } catch (buffer::error& err) {
9251 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9252 }
9253 }
9254 }
9255 if (s->obj_tag.length())
9256 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9257 else
9258 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9259
9260 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9261 * it exist, and not only if is_olh() returns true
9262 */
9263 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9264 if (iter != s->attrset.end()) {
9265 s->olh_tag = iter->second;
9266 }
9267
9268 if (is_olh(s->attrset)) {
9269 s->is_olh = true;
9270
9271 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9272
9273 if (need_follow_olh) {
9274 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9275 }
9276 }
9277
9278 return 0;
9279 }
9280
9281 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9282 bool follow_olh, bool assume_noent)
9283 {
9284 int ret;
9285
9286 do {
9287 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9288 } while (ret == -EAGAIN);
9289
9290 return ret;
9291 }
9292
9293 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9294 {
9295 RGWObjState *astate;
9296 int r = get_state(&astate, true);
9297 if (r < 0) {
9298 return r;
9299 }
9300
9301 *pmanifest = &astate->manifest;
9302
9303 return 0;
9304 }
9305
9306 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9307 {
9308 RGWObjState *state;
9309 int r = source->get_state(&state, true);
9310 if (r < 0)
9311 return r;
9312 if (!state->exists)
9313 return -ENOENT;
9314 if (!state->get_attr(name, dest))
9315 return -ENODATA;
9316
9317 return 0;
9318 }
9319
9320
9321 int RGWRados::Object::Stat::stat_async()
9322 {
9323 RGWObjectCtx& ctx = source->get_ctx();
9324 rgw_obj& obj = source->get_obj();
9325 RGWRados *store = source->get_store();
9326
9327 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9328 result.obj = obj;
9329 if (s->has_attrs) {
9330 state.ret = 0;
9331 result.size = s->size;
9332 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9333 result.attrs = s->attrset;
9334 result.has_manifest = s->has_manifest;
9335 result.manifest = s->manifest;
9336 return 0;
9337 }
9338
9339 string oid;
9340 string loc;
9341 get_obj_bucket_and_oid_loc(obj, oid, loc);
9342
9343 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9344 if (r < 0) {
9345 return r;
9346 }
9347
9348 librados::ObjectReadOperation op;
9349 op.stat2(&result.size, &result.mtime, NULL);
9350 op.getxattrs(&result.attrs, NULL);
9351 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9352 state.io_ctx.locator_set_key(loc);
9353 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9354 if (r < 0) {
9355 ldout(store->ctx(), 5) << __func__
9356 << ": ERROR: aio_operate() returned ret=" << r
9357 << dendl;
9358 return r;
9359 }
9360
9361 return 0;
9362 }
9363
9364
9365 int RGWRados::Object::Stat::wait()
9366 {
9367 if (!state.completion) {
9368 return state.ret;
9369 }
9370
9371 state.completion->wait_for_safe();
9372 state.ret = state.completion->get_return_value();
9373 state.completion->release();
9374
9375 if (state.ret != 0) {
9376 return state.ret;
9377 }
9378
9379 return finish();
9380 }
9381
9382 int RGWRados::Object::Stat::finish()
9383 {
9384 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9385 if (iter != result.attrs.end()) {
9386 bufferlist& bl = iter->second;
9387 bufferlist::iterator biter = bl.begin();
9388 try {
9389 ::decode(result.manifest, biter);
9390 } catch (buffer::error& err) {
9391 RGWRados *store = source->get_store();
9392 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9393 return -EIO;
9394 }
9395 result.has_manifest = true;
9396 }
9397
9398 return 0;
9399 }
9400
9401 /**
9402 * Get an attribute for a system object.
9403 * obj: the object to get attr
9404 * name: name of the attr to retrieve
9405 * dest: bufferlist to store the result in
9406 * Returns: 0 on success, -ERR# otherwise.
9407 */
9408 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9409 {
9410 rgw_rados_ref ref;
9411 int r = get_system_obj_ref(obj, &ref);
9412 if (r < 0) {
9413 return r;
9414 }
9415
9416 ObjectReadOperation op;
9417
9418 int rval;
9419 op.getxattr(name, &dest, &rval);
9420
9421 r = ref.ioctx.operate(ref.oid, &op, NULL);
9422 if (r < 0)
9423 return r;
9424
9425 return 0;
9426 }
9427
9428 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9429 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9430 ObjectOperation& op, RGWObjState **pstate)
9431 {
9432 if (!rctx)
9433 return 0;
9434
9435 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9436 if (r < 0)
9437 return r;
9438
9439 RGWObjState *state = *pstate;
9440
9441 if (!state->is_atomic) {
9442 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9443 return 0;
9444 }
9445
9446 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9447 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9448 } else {
9449 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9450 }
9451 return 0;
9452 }
9453
9454 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9455 {
9456 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9457 }
9458
9459 void RGWRados::Object::invalidate_state()
9460 {
9461 ctx.obj.invalidate(obj);
9462 }
9463
9464 void RGWRados::SystemObject::invalidate_state()
9465 {
9466 ctx.raw.invalidate(obj);
9467 }
9468
9469 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9470 const char *if_match, const char *if_nomatch, bool removal_op)
9471 {
9472 int r = get_state(&state, false);
9473 if (r < 0)
9474 return r;
9475
9476 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9477 if_match != NULL || if_nomatch != NULL) &&
9478 (!state->fake_tag);
9479
9480 if (!state->is_atomic) {
9481 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9482
9483 if (reset_obj) {
9484 op.create(false);
9485 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9486 }
9487
9488 return 0;
9489 }
9490
9491 if (need_guard) {
9492 /* first verify that the object wasn't replaced under */
9493 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9494 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9495 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9496 }
9497
9498 if (if_match) {
9499 if (strcmp(if_match, "*") == 0) {
9500 // test the object is existing
9501 if (!state->exists) {
9502 return -ERR_PRECONDITION_FAILED;
9503 }
9504 } else {
9505 bufferlist bl;
9506 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9507 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9508 return -ERR_PRECONDITION_FAILED;
9509 }
9510 }
9511 }
9512
9513 if (if_nomatch) {
9514 if (strcmp(if_nomatch, "*") == 0) {
9515 // test the object is NOT existing
9516 if (state->exists) {
9517 return -ERR_PRECONDITION_FAILED;
9518 }
9519 } else {
9520 bufferlist bl;
9521 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9522 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9523 return -ERR_PRECONDITION_FAILED;
9524 }
9525 }
9526 }
9527 }
9528
9529 if (reset_obj) {
9530 if (state->exists) {
9531 op.create(false);
9532 store->remove_rgw_head_obj(op);
9533 } else {
9534 op.create(true);
9535 }
9536 }
9537
9538 if (removal_op) {
9539 /* the object is being removed, no need to update its tag */
9540 return 0;
9541 }
9542
9543 if (ptag) {
9544 state->write_tag = *ptag;
9545 } else {
9546 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9547 }
9548 bufferlist bl;
9549 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9550
9551 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9552
9553 op.setxattr(RGW_ATTR_ID_TAG, bl);
9554
9555 return 0;
9556 }
9557
9558 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9559 RGWObjVersionTracker *objv_tracker)
9560 {
9561 map<string, bufferlist> attrs;
9562 attrs[name] = bl;
9563 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9564 }
9565
9566 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9567 map<string, bufferlist>& attrs,
9568 map<string, bufferlist>* rmattrs,
9569 RGWObjVersionTracker *objv_tracker)
9570 {
9571 rgw_rados_ref ref;
9572 int r = get_system_obj_ref(obj, &ref);
9573 if (r < 0) {
9574 return r;
9575 }
9576 ObjectWriteOperation op;
9577
9578 if (objv_tracker) {
9579 objv_tracker->prepare_op_for_write(&op);
9580 }
9581
9582 map<string, bufferlist>::iterator iter;
9583 if (rmattrs) {
9584 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9585 const string& name = iter->first;
9586 op.rmxattr(name.c_str());
9587 }
9588 }
9589
9590 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9591 const string& name = iter->first;
9592 bufferlist& bl = iter->second;
9593
9594 if (!bl.length())
9595 continue;
9596
9597 op.setxattr(name.c_str(), bl);
9598 }
9599
9600 if (!op.size())
9601 return 0;
9602
9603 bufferlist bl;
9604
9605 r = ref.ioctx.operate(ref.oid, &op);
9606 if (r < 0)
9607 return r;
9608
9609 return 0;
9610 }
9611
9612 /**
9613 * Set an attr on an object.
9614 * bucket: name of the bucket holding the object
9615 * obj: name of the object to set the attr on
9616 * name: the attr to set
9617 * bl: the contents of the attr
9618 * Returns: 0 on success, -ERR# otherwise.
9619 */
9620 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9621 {
9622 map<string, bufferlist> attrs;
9623 attrs[name] = bl;
9624 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9625 }
9626
9627 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9628 map<string, bufferlist>& attrs,
9629 map<string, bufferlist>* rmattrs)
9630 {
9631 rgw_rados_ref ref;
9632 int r = get_obj_head_ref(bucket_info, obj, &ref);
9633 if (r < 0) {
9634 return r;
9635 }
9636 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9637
9638 ObjectWriteOperation op;
9639 RGWObjState *state = NULL;
9640
9641 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9642 if (r < 0)
9643 return r;
9644
9645 map<string, bufferlist>::iterator iter;
9646 if (rmattrs) {
9647 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9648 const string& name = iter->first;
9649 op.rmxattr(name.c_str());
9650 }
9651 }
9652
9653 const rgw_bucket& bucket = obj.bucket;
9654
9655 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9656 const string& name = iter->first;
9657 bufferlist& bl = iter->second;
9658
9659 if (!bl.length())
9660 continue;
9661
9662 op.setxattr(name.c_str(), bl);
9663
9664 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9665 real_time ts;
9666 try {
9667 ::decode(ts, bl);
9668
9669 rgw_obj_index_key obj_key;
9670 obj.key.get_index_key(&obj_key);
9671
9672 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9673 } catch (buffer::error& err) {
9674 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9675 }
9676 }
9677 }
9678
9679 if (!op.size())
9680 return 0;
9681
9682 RGWObjectCtx obj_ctx(this);
9683
9684 bufferlist bl;
9685 RGWRados::Bucket bop(this, bucket_info);
9686 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9687
9688 if (state) {
9689 string tag;
9690 append_rand_alpha(cct, tag, tag, 32);
9691 state->write_tag = tag;
9692 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9693
9694 if (r < 0)
9695 return r;
9696
9697 bl.append(tag.c_str(), tag.size() + 1);
9698
9699 op.setxattr(RGW_ATTR_ID_TAG, bl);
9700 }
9701
9702 r = ref.ioctx.operate(ref.oid, &op);
9703 if (state) {
9704 if (r >= 0) {
9705 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9706 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9707 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9708 string etag(etag_bl.c_str(), etag_bl.length());
9709 string content_type(content_type_bl.c_str(), content_type_bl.length());
9710 uint64_t epoch = ref.ioctx.get_last_version();
9711 int64_t poolid = ref.ioctx.get_id();
9712 real_time mtime = real_clock::now();
9713 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9714 mtime, etag, content_type, &acl_bl,
9715 RGW_OBJ_CATEGORY_MAIN, NULL);
9716 } else {
9717 int ret = index_op.cancel();
9718 if (ret < 0) {
9719 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9720 }
9721 }
9722 }
9723 if (r < 0)
9724 return r;
9725
9726 if (state) {
9727 state->obj_tag.swap(bl);
9728 if (rmattrs) {
9729 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9730 state->attrset.erase(iter->first);
9731 }
9732 }
9733 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9734 state->attrset[iter->first] = iter->second;
9735 }
9736 }
9737
9738 return 0;
9739 }
9740
9741 int RGWRados::Object::Read::prepare()
9742 {
9743 RGWRados *store = source->get_store();
9744 CephContext *cct = store->ctx();
9745
9746 bufferlist etag;
9747
9748 map<string, bufferlist>::iterator iter;
9749
9750 RGWObjState *astate;
9751 int r = source->get_state(&astate, true);
9752 if (r < 0)
9753 return r;
9754
9755 if (!astate->exists) {
9756 return -ENOENT;
9757 }
9758
9759 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9760
9761 state.obj = astate->obj;
9762 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9763
9764 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9765 if (r < 0) {
9766 return r;
9767 }
9768 if (params.attrs) {
9769 *params.attrs = astate->attrset;
9770 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9771 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9772 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9773 }
9774 }
9775 }
9776
9777 /* Convert all times go GMT to make them compatible */
9778 if (conds.mod_ptr || conds.unmod_ptr) {
9779 obj_time_weight src_weight;
9780 src_weight.init(astate);
9781 src_weight.high_precision = conds.high_precision_time;
9782
9783 obj_time_weight dest_weight;
9784 dest_weight.high_precision = conds.high_precision_time;
9785
9786 if (conds.mod_ptr) {
9787 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9788 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9789 if (!(dest_weight < src_weight)) {
9790 return -ERR_NOT_MODIFIED;
9791 }
9792 }
9793
9794 if (conds.unmod_ptr) {
9795 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9796 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9797 if (dest_weight < src_weight) {
9798 return -ERR_PRECONDITION_FAILED;
9799 }
9800 }
9801 }
9802 if (conds.if_match || conds.if_nomatch) {
9803 r = get_attr(RGW_ATTR_ETAG, etag);
9804 if (r < 0)
9805 return r;
9806
9807 if (conds.if_match) {
9808 string if_match_str = rgw_string_unquote(conds.if_match);
9809 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9810 if (if_match_str.compare(etag.c_str()) != 0) {
9811 return -ERR_PRECONDITION_FAILED;
9812 }
9813 }
9814
9815 if (conds.if_nomatch) {
9816 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9817 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9818 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9819 return -ERR_NOT_MODIFIED;
9820 }
9821 }
9822 }
9823
9824 if (params.obj_size)
9825 *params.obj_size = astate->size;
9826 if (params.lastmod)
9827 *params.lastmod = astate->mtime;
9828
9829 return 0;
9830 }
9831
9832 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9833 {
9834 if (ofs < 0) {
9835 ofs += obj_size;
9836 if (ofs < 0)
9837 ofs = 0;
9838 end = obj_size - 1;
9839 } else if (end < 0) {
9840 end = obj_size - 1;
9841 }
9842
9843 if (obj_size > 0) {
9844 if (ofs >= (off_t)obj_size) {
9845 return -ERANGE;
9846 }
9847 if (end >= (off_t)obj_size) {
9848 end = obj_size - 1;
9849 }
9850 }
9851 return 0;
9852 }
9853
9854 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9855 {
9856 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9857 }
9858
9859 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9860 RGWRados::SystemObject::Read::GetObjState& state,
9861 rgw_raw_obj& obj,
9862 map<string, bufferlist> *attrs,
9863 real_time *lastmod,
9864 uint64_t *obj_size,
9865 RGWObjVersionTracker *objv_tracker)
9866 {
9867 RGWRawObjState *astate = NULL;
9868
9869 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
9870 if (r < 0)
9871 return r;
9872
9873 if (!astate->exists) {
9874 return -ENOENT;
9875 }
9876
9877 if (attrs) {
9878 *attrs = astate->attrset;
9879 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9880 map<string, bufferlist>::iterator iter;
9881 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
9882 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9883 }
9884 }
9885 }
9886
9887 if (obj_size)
9888 *obj_size = astate->size;
9889 if (lastmod)
9890 *lastmod = astate->mtime;
9891
9892 return 0;
9893 }
9894
9895
9896 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
9897 {
9898 RGWRados *store = target->get_store();
9899 BucketShard *bs;
9900 int r;
9901
9902 #define NUM_RESHARD_RETRIES 10
9903 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
9904 int ret = get_bucket_shard(&bs);
9905 if (ret < 0) {
9906 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9907 return ret;
9908 }
9909 r = call(bs);
9910 if (r != -ERR_BUSY_RESHARDING) {
9911 break;
9912 }
9913 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
9914 string new_bucket_id;
9915 r = store->block_while_resharding(bs, &new_bucket_id);
9916 if (r == -ERR_BUSY_RESHARDING) {
9917 continue;
9918 }
9919 if (r < 0) {
9920 return r;
9921 }
9922 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
9923 i = 0; /* resharding is finished, make sure we can retry */
9924 r = target->update_bucket_id(new_bucket_id);
9925 if (r < 0) {
9926 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
9927 return r;
9928 }
9929 invalidate_bs();
9930 }
9931
9932 if (r < 0) {
9933 return r;
9934 }
9935
9936 if (pbs) {
9937 *pbs = bs;
9938 }
9939
9940 return 0;
9941 }
9942
9943 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
9944 {
9945 RGWRados *store = source->get_store();
9946 rgw_raw_obj& obj = source->get_obj();
9947
9948 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
9949 stat_params.lastmod, stat_params.obj_size, objv_tracker);
9950 }
9951
9952 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
9953 {
9954 if (blind) {
9955 return 0;
9956 }
9957 RGWRados *store = target->get_store();
9958
9959 if (write_tag && write_tag->length()) {
9960 optag = string(write_tag->c_str(), write_tag->length());
9961 } else {
9962 if (optag.empty()) {
9963 append_rand_alpha(store->ctx(), optag, optag, 32);
9964 }
9965 }
9966
9967 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
9968 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
9969 });
9970
9971 if (r < 0) {
9972 return r;
9973 }
9974 prepared = true;
9975
9976 return 0;
9977 }
9978
9979 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
9980 uint64_t size, uint64_t accounted_size,
9981 ceph::real_time& ut, const string& etag,
9982 const string& content_type,
9983 bufferlist *acl_bl,
9984 RGWObjCategory category,
9985 list<rgw_obj_index_key> *remove_objs, const string *user_data)
9986 {
9987 if (blind) {
9988 return 0;
9989 }
9990 RGWRados *store = target->get_store();
9991 BucketShard *bs;
9992
9993 int ret = get_bucket_shard(&bs);
9994 if (ret < 0) {
9995 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9996 return ret;
9997 }
9998
9999 rgw_bucket_dir_entry ent;
10000 obj.key.get_index_key(&ent.key);
10001 ent.meta.size = size;
10002 ent.meta.accounted_size = accounted_size;
10003 ent.meta.mtime = ut;
10004 ent.meta.etag = etag;
10005 if (user_data)
10006 ent.meta.user_data = *user_data;
10007
10008 ACLOwner owner;
10009 if (acl_bl && acl_bl->length()) {
10010 int ret = store->decode_policy(*acl_bl, &owner);
10011 if (ret < 0) {
10012 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10013 }
10014 }
10015 ent.meta.owner = owner.get_id().to_str();
10016 ent.meta.owner_display_name = owner.get_display_name();
10017 ent.meta.content_type = content_type;
10018
10019 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
10020
10021 if (target->bucket_info.datasync_flag_enabled()) {
10022 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10023 if (r < 0) {
10024 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10025 }
10026 }
10027
10028 return ret;
10029 }
10030
10031 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10032 real_time& removed_mtime,
10033 list<rgw_obj_index_key> *remove_objs)
10034 {
10035 if (blind) {
10036 return 0;
10037 }
10038 RGWRados *store = target->get_store();
10039 BucketShard *bs;
10040
10041 int ret = get_bucket_shard(&bs);
10042 if (ret < 0) {
10043 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10044 return ret;
10045 }
10046
10047 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
10048
10049 if (target->bucket_info.datasync_flag_enabled()) {
10050 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10051 if (r < 0) {
10052 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10053 }
10054 }
10055
10056 return ret;
10057 }
10058
10059
10060 int RGWRados::Bucket::UpdateIndex::cancel()
10061 {
10062 if (blind) {
10063 return 0;
10064 }
10065 RGWRados *store = target->get_store();
10066 BucketShard *bs;
10067
10068 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10069 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10070 });
10071
10072 /*
10073 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10074 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10075 * have no way to tell that they're all caught up
10076 */
10077 if (target->bucket_info.datasync_flag_enabled()) {
10078 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10079 if (r < 0) {
10080 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10081 }
10082 }
10083
10084 return ret;
10085 }
10086
10087 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10088 {
10089 RGWRados *store = source->get_store();
10090 CephContext *cct = store->ctx();
10091
10092 rgw_raw_obj read_obj;
10093 uint64_t read_ofs = ofs;
10094 uint64_t len, read_len;
10095 bool reading_from_head = true;
10096 ObjectReadOperation op;
10097
10098 bool merge_bl = false;
10099 bufferlist *pbl = &bl;
10100 bufferlist read_bl;
10101 uint64_t max_chunk_size;
10102
10103 RGWObjState *astate;
10104 int r = source->get_state(&astate, true);
10105 if (r < 0)
10106 return r;
10107
10108 if (end < 0)
10109 len = 0;
10110 else
10111 len = end - ofs + 1;
10112
10113 if (astate->has_manifest && astate->manifest.has_tail()) {
10114 /* now get the relevant object part */
10115 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10116
10117 uint64_t stripe_ofs = iter.get_stripe_ofs();
10118 read_obj = iter.get_location().get_raw_obj(store);
10119 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10120 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10121 reading_from_head = (read_obj == state.head_obj);
10122 } else {
10123 read_obj = state.head_obj;
10124 }
10125
10126 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10127 if (r < 0) {
10128 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10129 return r;
10130 }
10131
10132 if (len > max_chunk_size)
10133 len = max_chunk_size;
10134
10135
10136 state.io_ctx.locator_set_key(read_obj.loc);
10137
10138 read_len = len;
10139
10140 if (reading_from_head) {
10141 /* only when reading from the head object do we need to do the atomic test */
10142 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10143 if (r < 0)
10144 return r;
10145
10146 if (astate && astate->prefetch_data) {
10147 if (!ofs && astate->data.length() >= len) {
10148 bl = astate->data;
10149 return bl.length();
10150 }
10151
10152 if (ofs < astate->data.length()) {
10153 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10154 astate->data.copy(ofs, copy_len, bl);
10155 read_len -= copy_len;
10156 read_ofs += copy_len;
10157 if (!read_len)
10158 return bl.length();
10159
10160 merge_bl = true;
10161 pbl = &read_bl;
10162 }
10163 }
10164 }
10165
10166 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10167 op.read(read_ofs, read_len, pbl, NULL);
10168
10169 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10170 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10171
10172 if (r < 0) {
10173 return r;
10174 }
10175
10176 if (merge_bl) {
10177 bl.append(read_bl);
10178 }
10179
10180 return bl.length();
10181 }
10182
10183 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10184 {
10185 if (!has_ref) {
10186 int r = store->get_raw_obj_ref(obj, &ref);
10187 if (r < 0) {
10188 return r;
10189 }
10190 has_ref = true;
10191 }
10192 *pref = &ref;
10193 return 0;
10194
10195 }
10196
10197 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10198 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10199 bufferlist& bl, off_t ofs, off_t end,
10200 map<string, bufferlist> *attrs,
10201 rgw_cache_entry_info *cache_info)
10202 {
10203 uint64_t len;
10204 ObjectReadOperation op;
10205
10206 if (end < 0)
10207 len = 0;
10208 else
10209 len = end - ofs + 1;
10210
10211 if (objv_tracker) {
10212 objv_tracker->prepare_op_for_read(&op);
10213 }
10214
10215 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10216 op.read(ofs, len, &bl, NULL);
10217
10218 if (attrs) {
10219 op.getxattrs(attrs, NULL);
10220 }
10221
10222 rgw_rados_ref *ref;
10223 int r = read_state.get_ref(this, obj, &ref);
10224 if (r < 0) {
10225 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10226 return r;
10227 }
10228 r = ref->ioctx.operate(ref->oid, &op, NULL);
10229 if (r < 0) {
10230 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10231 return r;
10232 }
10233 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10234
10235 uint64_t op_ver = ref->ioctx.get_last_version();
10236
10237 if (read_state.last_ver > 0 &&
10238 read_state.last_ver != op_ver) {
10239 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10240 return -ECANCELED;
10241 }
10242
10243 read_state.last_ver = op_ver;
10244
10245 return bl.length();
10246 }
10247
10248 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl, RGWObjVersionTracker *objv_tracker)
10249 {
10250 RGWRados *store = source->get_store();
10251 rgw_raw_obj& obj = source->get_obj();
10252
10253 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl, ofs, end, read_params.attrs, read_params.cache_info);
10254 }
10255
10256 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10257 {
10258 RGWRados *store = source->get_store();
10259 rgw_raw_obj& obj = source->get_obj();
10260
10261 return store->system_obj_get_attr(obj, name, dest);
10262 }
10263
10264 struct get_obj_data;
10265
10266 struct get_obj_aio_data {
10267 struct get_obj_data *op_data;
10268 off_t ofs;
10269 off_t len;
10270 };
10271
10272 struct get_obj_io {
10273 off_t len;
10274 bufferlist bl;
10275 };
10276
10277 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10278
10279 struct get_obj_data : public RefCountedObject {
10280 CephContext *cct;
10281 RGWRados *rados;
10282 RGWObjectCtx *ctx;
10283 IoCtx io_ctx;
10284 map<off_t, get_obj_io> io_map;
10285 map<off_t, librados::AioCompletion *> completion_map;
10286 uint64_t total_read;
10287 Mutex lock;
10288 Mutex data_lock;
10289 list<get_obj_aio_data> aio_data;
10290 RGWGetDataCB *client_cb;
10291 std::atomic<bool> cancelled = { false };
10292 std::atomic<int64_t> err_code = { 0 };
10293 Throttle throttle;
10294 list<bufferlist> read_list;
10295
10296 explicit get_obj_data(CephContext *_cct)
10297 : cct(_cct),
10298 rados(NULL), ctx(NULL),
10299 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10300 client_cb(NULL),
10301 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10302 ~get_obj_data() override { }
10303 void set_cancelled(int r) {
10304 cancelled = true;
10305 err_code = r;
10306 }
10307
10308 bool is_cancelled() {
10309 return cancelled;
10310 }
10311
10312 int get_err_code() {
10313 return err_code;
10314 }
10315
10316 int wait_next_io(bool *done) {
10317 lock.Lock();
10318 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10319 if (iter == completion_map.end()) {
10320 *done = true;
10321 lock.Unlock();
10322 return 0;
10323 }
10324 off_t cur_ofs = iter->first;
10325 librados::AioCompletion *c = iter->second;
10326 lock.Unlock();
10327
10328 c->wait_for_safe_and_cb();
10329 int r = c->get_return_value();
10330
10331 lock.Lock();
10332 completion_map.erase(cur_ofs);
10333
10334 if (completion_map.empty()) {
10335 *done = true;
10336 }
10337 lock.Unlock();
10338
10339 c->release();
10340
10341 return r;
10342 }
10343
10344 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10345 Mutex::Locker l(lock);
10346
10347 const auto& io_iter = io_map.insert(
10348 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10349
10350 assert(io_iter.second); // assert new insertion
10351
10352 get_obj_io& io = (io_iter.first)->second;
10353 *pbl = &io.bl;
10354
10355 struct get_obj_aio_data aio;
10356 aio.ofs = ofs;
10357 aio.len = len;
10358 aio.op_data = this;
10359
10360 aio_data.push_back(aio);
10361
10362 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10363
10364 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10365 completion_map[ofs] = c;
10366
10367 *pc = c;
10368
10369 /* we have a reference per IO, plus one reference for the calling function.
10370 * reference is dropped for each callback, plus when we're done iterating
10371 * over the parts */
10372 get();
10373 }
10374
10375 void cancel_io(off_t ofs) {
10376 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10377 lock.Lock();
10378 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10379 if (iter != completion_map.end()) {
10380 AioCompletion *c = iter->second;
10381 c->release();
10382 completion_map.erase(ofs);
10383 io_map.erase(ofs);
10384 }
10385 lock.Unlock();
10386
10387 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10388 * need IoCtx to live, as io callback may still be called
10389 */
10390 }
10391
10392 void cancel_all_io() {
10393 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10394 Mutex::Locker l(lock);
10395 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10396 iter != completion_map.end(); ++iter) {
10397 librados::AioCompletion *c = iter->second;
10398 c->release();
10399 }
10400 }
10401
10402 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10403 Mutex::Locker l(lock);
10404
10405 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10406
10407 if (liter == io_map.end() ||
10408 liter->first != ofs) {
10409 return 0;
10410 }
10411
10412 map<off_t, librados::AioCompletion *>::iterator aiter;
10413 aiter = completion_map.find(ofs);
10414 if (aiter == completion_map.end()) {
10415 /* completion map does not hold this io, it was cancelled */
10416 return 0;
10417 }
10418
10419 AioCompletion *completion = aiter->second;
10420 int r = completion->get_return_value();
10421 if (r < 0)
10422 return r;
10423
10424 for (; aiter != completion_map.end(); ++aiter) {
10425 completion = aiter->second;
10426 if (!completion->is_safe()) {
10427 /* reached a request that is not yet complete, stop */
10428 break;
10429 }
10430
10431 r = completion->get_return_value();
10432 if (r < 0) {
10433 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10434 return r;
10435 }
10436
10437 total_read += r;
10438
10439 map<off_t, get_obj_io>::iterator old_liter = liter++;
10440 bl_list.push_back(old_liter->second.bl);
10441 io_map.erase(old_liter);
10442 }
10443
10444 return 0;
10445 }
10446 };
10447
10448 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10449 {
10450 struct get_obj_data *d = (struct get_obj_data *)arg;
10451
10452 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10453 }
10454
10455 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10456 {
10457 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10458 struct get_obj_data *d = aio_data->op_data;
10459
10460 d->rados->get_obj_aio_completion_cb(cb, arg);
10461 }
10462
10463
10464 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10465 {
10466 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10467 struct get_obj_data *d = aio_data->op_data;
10468 off_t ofs = aio_data->ofs;
10469 off_t len = aio_data->len;
10470
10471 list<bufferlist> bl_list;
10472 list<bufferlist>::iterator iter;
10473 int r;
10474
10475 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10476 d->throttle.put(len);
10477
10478 r = rados_aio_get_return_value(c);
10479 if (r < 0) {
10480 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10481 d->set_cancelled(r);
10482 goto done;
10483 }
10484
10485 if (d->is_cancelled()) {
10486 goto done;
10487 }
10488
10489 d->data_lock.Lock();
10490
10491 r = d->get_complete_ios(ofs, bl_list);
10492 if (r < 0) {
10493 goto done_unlock;
10494 }
10495
10496 d->read_list.splice(d->read_list.end(), bl_list);
10497
10498 done_unlock:
10499 d->data_lock.Unlock();
10500 done:
10501 d->put();
10502 return;
10503 }
10504
10505 int RGWRados::flush_read_list(struct get_obj_data *d)
10506 {
10507 d->data_lock.Lock();
10508 list<bufferlist> l;
10509 l.swap(d->read_list);
10510 d->get();
10511 d->read_list.clear();
10512
10513 d->data_lock.Unlock();
10514
10515 int r = 0;
10516
10517 list<bufferlist>::iterator iter;
10518 for (iter = l.begin(); iter != l.end(); ++iter) {
10519 bufferlist& bl = *iter;
10520 r = d->client_cb->handle_data(bl, 0, bl.length());
10521 if (r < 0) {
10522 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10523 break;
10524 }
10525 }
10526
10527 d->data_lock.Lock();
10528 d->put();
10529 if (r < 0) {
10530 d->set_cancelled(r);
10531 }
10532 d->data_lock.Unlock();
10533 return r;
10534 }
10535
10536 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10537 const RGWBucketInfo& bucket_info,
10538 const rgw_obj& obj,
10539 const rgw_raw_obj& read_obj,
10540 off_t obj_ofs,
10541 off_t read_ofs, off_t len,
10542 bool is_head_obj, void *arg)
10543 {
10544 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10545 ObjectReadOperation op;
10546 struct get_obj_data *d = (struct get_obj_data *)arg;
10547 string oid, key;
10548 bufferlist *pbl;
10549 AioCompletion *c;
10550
10551 int r;
10552
10553 if (is_head_obj) {
10554 /* only when reading from the head object do we need to do the atomic test */
10555 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10556 if (r < 0)
10557 return r;
10558
10559 if (astate &&
10560 obj_ofs < astate->data.length()) {
10561 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10562
10563 d->data_lock.Lock();
10564 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10565 d->data_lock.Unlock();
10566 if (r < 0)
10567 return r;
10568
10569 d->lock.Lock();
10570 d->total_read += chunk_len;
10571 d->lock.Unlock();
10572
10573 len -= chunk_len;
10574 read_ofs += chunk_len;
10575 obj_ofs += chunk_len;
10576 if (!len)
10577 return 0;
10578 }
10579 }
10580
10581 d->throttle.get(len);
10582 if (d->is_cancelled()) {
10583 return d->get_err_code();
10584 }
10585
10586 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10587 * cleaning up
10588 */
10589 d->add_io(obj_ofs, len, &pbl, &c);
10590
10591 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10592 op.read(read_ofs, len, pbl, NULL);
10593
10594 librados::IoCtx io_ctx(d->io_ctx);
10595 io_ctx.locator_set_key(read_obj.loc);
10596
10597 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10598 if (r < 0) {
10599 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10600 goto done_err;
10601 }
10602
10603 // Flush data to client if there is any
10604 r = flush_read_list(d);
10605 if (r < 0)
10606 return r;
10607
10608 return 0;
10609
10610 done_err:
10611 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10612 d->set_cancelled(r);
10613 d->cancel_io(obj_ofs);
10614
10615 return r;
10616 }
10617
10618 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10619 {
10620 RGWRados *store = source->get_store();
10621 CephContext *cct = store->ctx();
10622
10623 struct get_obj_data *data = new get_obj_data(cct);
10624 bool done = false;
10625
10626 RGWObjectCtx& obj_ctx = source->get_ctx();
10627
10628 data->rados = store;
10629 data->io_ctx.dup(state.io_ctx);
10630 data->client_cb = cb;
10631
10632 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10633 if (r < 0) {
10634 data->cancel_all_io();
10635 goto done;
10636 }
10637
10638 while (!done) {
10639 r = data->wait_next_io(&done);
10640 if (r < 0) {
10641 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10642 data->cancel_all_io();
10643 break;
10644 }
10645 r = store->flush_read_list(data);
10646 if (r < 0) {
10647 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10648 data->cancel_all_io();
10649 break;
10650 }
10651 }
10652
10653 done:
10654 data->put();
10655 return r;
10656 }
10657
10658 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10659 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10660 off_t ofs, off_t end,
10661 uint64_t max_chunk_size,
10662 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10663 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10664 RGWObjState *, void *),
10665 void *arg)
10666 {
10667 rgw_raw_obj head_obj;
10668 rgw_raw_obj read_obj;
10669 uint64_t read_ofs = ofs;
10670 uint64_t len;
10671 bool reading_from_head = true;
10672 RGWObjState *astate = NULL;
10673
10674 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10675
10676 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10677 if (r < 0) {
10678 return r;
10679 }
10680
10681 if (end < 0)
10682 len = 0;
10683 else
10684 len = end - ofs + 1;
10685
10686 if (astate->has_manifest) {
10687 /* now get the relevant object stripe */
10688 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10689
10690 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10691
10692 for (; iter != obj_end && ofs <= end; ++iter) {
10693 off_t stripe_ofs = iter.get_stripe_ofs();
10694 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10695
10696 while (ofs < next_stripe_ofs && ofs <= end) {
10697 read_obj = iter.get_location().get_raw_obj(this);
10698 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10699 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10700
10701 if (read_len > max_chunk_size) {
10702 read_len = max_chunk_size;
10703 }
10704
10705 reading_from_head = (read_obj == head_obj);
10706 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10707 if (r < 0) {
10708 return r;
10709 }
10710
10711 len -= read_len;
10712 ofs += read_len;
10713 }
10714 }
10715 } else {
10716 while (ofs <= end) {
10717 read_obj = head_obj;
10718 uint64_t read_len = min(len, max_chunk_size);
10719
10720 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10721 if (r < 0) {
10722 return r;
10723 }
10724
10725 len -= read_len;
10726 ofs += read_len;
10727 }
10728 }
10729
10730 return 0;
10731 }
10732
10733 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10734 {
10735 rgw_rados_ref ref;
10736 int r = get_obj_head_ref(bucket_info, obj, &ref);
10737 if (r < 0) {
10738 return r;
10739 }
10740
10741 return ref.ioctx.operate(ref.oid, op);
10742 }
10743
10744 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10745 {
10746 rgw_rados_ref ref;
10747 int r = get_obj_head_ref(bucket_info, obj, &ref);
10748 if (r < 0) {
10749 return r;
10750 }
10751
10752 bufferlist outbl;
10753
10754 return ref.ioctx.operate(ref.oid, op, &outbl);
10755 }
10756
10757 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10758 {
10759 ObjectWriteOperation op;
10760
10761 assert(olh_obj.key.instance.empty());
10762
10763 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10764
10765 if (!state.exists) {
10766 op.create(true);
10767 } else {
10768 op.assert_exists();
10769 }
10770
10771 /*
10772 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10773 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10774 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10775 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10776 * log will reflect that.
10777 *
10778 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10779 * is used for object data instance, olh_tag for olh instance.
10780 */
10781 if (has_tag) {
10782 /* guard against racing writes */
10783 bucket_index_guard_olh_op(state, op);
10784 }
10785
10786 if (!has_tag) {
10787 /* obj tag */
10788 string obj_tag;
10789 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10790 if (ret < 0) {
10791 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10792 return ret;
10793 }
10794 bufferlist bl;
10795 bl.append(obj_tag.c_str(), obj_tag.size());
10796 op.setxattr(RGW_ATTR_ID_TAG, bl);
10797
10798 state.attrset[RGW_ATTR_ID_TAG] = bl;
10799 state.obj_tag = bl;
10800
10801 /* olh tag */
10802 string olh_tag;
10803 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10804 if (ret < 0) {
10805 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10806 return ret;
10807 }
10808 bufferlist olh_bl;
10809 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10810 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10811
10812 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10813 state.olh_tag = olh_bl;
10814 state.is_olh = true;
10815
10816 bufferlist verbl;
10817 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10818 }
10819
10820 bufferlist bl;
10821 RGWOLHPendingInfo pending_info;
10822 pending_info.time = real_clock::now();
10823 ::encode(pending_info, bl);
10824
10825 #define OLH_PENDING_TAG_LEN 32
10826 /* tag will start with current time epoch, this so that entries are sorted by time */
10827 char buf[32];
10828 utime_t ut(pending_info.time);
10829 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10830 *op_tag = buf;
10831
10832 string s;
10833 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10834 if (ret < 0) {
10835 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10836 return ret;
10837 }
10838 op_tag->append(s);
10839
10840 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10841 attr_name.append(*op_tag);
10842
10843 op.setxattr(attr_name.c_str(), bl);
10844
10845 ret = obj_operate(bucket_info, olh_obj, &op);
10846 if (ret < 0) {
10847 return ret;
10848 }
10849
10850 state.exists = true;
10851 state.attrset[attr_name] = bl;
10852
10853 return 0;
10854 }
10855
10856 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
10857 {
10858 int ret;
10859
10860 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
10861 if (ret == -EEXIST) {
10862 ret = -ECANCELED;
10863 }
10864
10865 return ret;
10866 }
10867
10868 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
10869 {
10870 rgw_obj obj;
10871 const rgw_obj *pobj = &obj_instance;
10872 int r;
10873
10874 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10875 r = bs->init(pobj->bucket, *pobj);
10876 if (r < 0) {
10877 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
10878 return r;
10879 }
10880 r = call(bs);
10881 if (r != -ERR_BUSY_RESHARDING) {
10882 break;
10883 }
10884 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10885 string new_bucket_id;
10886 r = block_while_resharding(bs, &new_bucket_id);
10887 if (r == -ERR_BUSY_RESHARDING) {
10888 continue;
10889 }
10890 if (r < 0) {
10891 return r;
10892 }
10893 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10894 i = 0; /* resharding is finished, make sure we can retry */
10895
10896 obj = *pobj;
10897 obj.bucket.update_bucket_id(new_bucket_id);
10898 pobj = &obj;
10899 }
10900
10901 if (r < 0) {
10902 return r;
10903 }
10904
10905 return 0;
10906 }
10907
10908 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
10909 {
10910 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
10911
10912 return waiter->block_while_resharding(bs, new_bucket_id);
10913 }
10914
10915 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
10916 bool delete_marker,
10917 const string& op_tag,
10918 struct rgw_bucket_dir_entry_meta *meta,
10919 uint64_t olh_epoch,
10920 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
10921 {
10922 rgw_rados_ref ref;
10923 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10924 if (r < 0) {
10925 return r;
10926 }
10927
10928 rgw_zone_set zones_trace;
10929 if (_zones_trace) {
10930 zones_trace = *_zones_trace;
10931 } else {
10932 zones_trace.insert(get_zone().id);
10933 }
10934
10935 BucketShard bs(this);
10936
10937 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10938 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10939 librados::ObjectWriteOperation op;
10940 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10941 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
10942 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
10943 unmod_since, high_precision_time,
10944 get_zone().log_data, zones_trace);
10945 });
10946 if (r < 0) {
10947 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
10948 return r;
10949 }
10950
10951 return 0;
10952 }
10953
10954 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
10955 {
10956 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
10957 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
10958 }
10959
10960 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
10961 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
10962 {
10963 rgw_rados_ref ref;
10964 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10965 if (r < 0) {
10966 return r;
10967 }
10968
10969 rgw_zone_set zones_trace;
10970 if (_zones_trace) {
10971 zones_trace = *_zones_trace;
10972 }
10973 zones_trace.insert(get_zone().id);
10974
10975 BucketShard bs(this);
10976
10977 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10978 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10979 librados::ObjectWriteOperation op;
10980 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10981 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
10982 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
10983 });
10984 if (r < 0) {
10985 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
10986 return r;
10987 }
10988
10989 return 0;
10990 }
10991
10992 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
10993 const rgw_obj& obj_instance, uint64_t ver_marker,
10994 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
10995 bool *is_truncated)
10996 {
10997 rgw_rados_ref ref;
10998 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10999 if (r < 0) {
11000 return r;
11001 }
11002
11003 BucketShard bs(this);
11004 int ret = bs.init(obj_instance.bucket, obj_instance);
11005 if (ret < 0) {
11006 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11007 return ret;
11008 }
11009
11010 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11011
11012 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11013
11014 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11015 ObjectReadOperation op;
11016 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11017 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11018 key, ver_marker, olh_tag, log, is_truncated);
11019 });
11020 if (ret < 0) {
11021 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
11022 return ret;
11023 }
11024
11025 return 0;
11026 }
11027
11028 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11029 {
11030 rgw_rados_ref ref;
11031 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11032 if (r < 0) {
11033 return r;
11034 }
11035
11036 BucketShard bs(this);
11037 int ret = bs.init(obj_instance.bucket, obj_instance);
11038 if (ret < 0) {
11039 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11040 return ret;
11041 }
11042
11043 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11044
11045 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11046
11047 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11048 ObjectWriteOperation op;
11049 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11050 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11051 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
11052 });
11053 if (ret < 0) {
11054 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
11055 return ret;
11056 }
11057
11058 return 0;
11059 }
11060
11061 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11062 {
11063 rgw_rados_ref ref;
11064 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11065 if (r < 0) {
11066 return r;
11067 }
11068
11069 BucketShard bs(this);
11070
11071 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11072
11073 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11074
11075 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11076 ObjectWriteOperation op;
11077 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11078 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11079 });
11080 if (ret < 0) {
11081 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11082 return ret;
11083 }
11084
11085 return 0;
11086 }
11087
11088 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11089 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
11090 uint64_t *plast_ver, rgw_zone_set* zones_trace)
11091 {
11092 if (log.empty()) {
11093 return 0;
11094 }
11095
11096 librados::ObjectWriteOperation op;
11097
11098 uint64_t last_ver = log.rbegin()->first;
11099 *plast_ver = last_ver;
11100
11101 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11102
11103 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11104 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11105
11106 bool need_to_link = false;
11107 cls_rgw_obj_key key;
11108 bool delete_marker = false;
11109 list<cls_rgw_obj_key> remove_instances;
11110 bool need_to_remove = false;
11111
11112 for (iter = log.begin(); iter != log.end(); ++iter) {
11113 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11114 for (; viter != iter->second.end(); ++viter) {
11115 rgw_bucket_olh_log_entry& entry = *viter;
11116
11117 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11118 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11119 << (entry.delete_marker ? "(delete)" : "") << dendl;
11120 switch (entry.op) {
11121 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11122 remove_instances.push_back(entry.key);
11123 break;
11124 case CLS_RGW_OLH_OP_LINK_OLH:
11125 need_to_link = true;
11126 need_to_remove = false;
11127 key = entry.key;
11128 delete_marker = entry.delete_marker;
11129 break;
11130 case CLS_RGW_OLH_OP_UNLINK_OLH:
11131 need_to_remove = true;
11132 need_to_link = false;
11133 break;
11134 default:
11135 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11136 return -EIO;
11137 }
11138 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11139 attr_name.append(entry.op_tag);
11140 op.rmxattr(attr_name.c_str());
11141 }
11142 }
11143
11144 rgw_rados_ref ref;
11145 int r = get_obj_head_ref(bucket_info, obj, &ref);
11146 if (r < 0) {
11147 return r;
11148 }
11149
11150 const rgw_bucket& bucket = obj.bucket;
11151
11152 if (need_to_link) {
11153 rgw_obj target(bucket, key);
11154 RGWOLHInfo info;
11155 info.target = target;
11156 info.removed = delete_marker;
11157 bufferlist bl;
11158 ::encode(info, bl);
11159 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11160 }
11161
11162 /* first remove object instances */
11163 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11164 liter != remove_instances.end(); ++liter) {
11165 cls_rgw_obj_key& key = *liter;
11166 rgw_obj obj_instance(bucket, key);
11167 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11168 if (ret < 0 && ret != -ENOENT) {
11169 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11170 return ret;
11171 }
11172 }
11173
11174 /* update olh object */
11175 r = ref.ioctx.operate(ref.oid, &op);
11176 if (r == -ECANCELED) {
11177 r = 0;
11178 }
11179 if (r < 0) {
11180 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11181 return r;
11182 }
11183
11184 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11185 if (r < 0) {
11186 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11187 return r;
11188 }
11189
11190 if (need_to_remove) {
11191 ObjectWriteOperation rm_op;
11192
11193 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11194 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11195 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11196 rm_op.remove();
11197
11198 r = ref.ioctx.operate(ref.oid, &rm_op);
11199 if (r == -ECANCELED) {
11200 return 0; /* someone else won this race */
11201 } else {
11202 /*
11203 * only clear if was successful, otherwise we might clobber pending operations on this object
11204 */
11205 r = bucket_index_clear_olh(bucket_info, state, obj);
11206 if (r < 0) {
11207 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11208 return r;
11209 }
11210 }
11211 }
11212
11213 return 0;
11214 }
11215
11216 /*
11217 * read olh log and apply it
11218 */
11219 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11220 {
11221 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11222 bool is_truncated;
11223 uint64_t ver_marker = 0;
11224
11225 do {
11226 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11227 if (ret < 0) {
11228 return ret;
11229 }
11230 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11231 if (ret < 0) {
11232 return ret;
11233 }
11234 } while (is_truncated);
11235
11236 return 0;
11237 }
11238
11239 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11240 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11241 {
11242 string op_tag;
11243
11244 rgw_obj olh_obj = target_obj;
11245 olh_obj.key.instance.clear();
11246
11247 RGWObjState *state = NULL;
11248
11249 int ret = 0;
11250 int i;
11251
11252 #define MAX_ECANCELED_RETRY 100
11253 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11254 if (ret == -ECANCELED) {
11255 obj_ctx.obj.invalidate(olh_obj);
11256 }
11257
11258 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11259 if (ret < 0) {
11260 return ret;
11261 }
11262
11263 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11264 if (ret < 0) {
11265 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11266 if (ret == -ECANCELED) {
11267 continue;
11268 }
11269 return ret;
11270 }
11271 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11272 if (ret < 0) {
11273 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11274 if (ret == -ECANCELED) {
11275 continue;
11276 }
11277 return ret;
11278 }
11279 break;
11280 }
11281
11282 if (i == MAX_ECANCELED_RETRY) {
11283 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11284 return -EIO;
11285 }
11286
11287 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11288 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11289 ret = 0;
11290 }
11291 if (ret < 0) {
11292 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11293 return ret;
11294 }
11295
11296 return 0;
11297 }
11298
11299 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11300 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11301 {
11302 string op_tag;
11303
11304 rgw_obj olh_obj = target_obj;
11305 olh_obj.key.instance.clear();
11306
11307 RGWObjState *state = NULL;
11308
11309 int ret = 0;
11310 int i;
11311
11312 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11313 if (ret == -ECANCELED) {
11314 obj_ctx.obj.invalidate(olh_obj);
11315 }
11316
11317 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11318 if (ret < 0)
11319 return ret;
11320
11321 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11322 if (ret < 0) {
11323 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11324 if (ret == -ECANCELED) {
11325 continue;
11326 }
11327 return ret;
11328 }
11329
11330 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11331
11332 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11333 if (ret < 0) {
11334 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11335 if (ret == -ECANCELED) {
11336 continue;
11337 }
11338 return ret;
11339 }
11340 break;
11341 }
11342
11343 if (i == MAX_ECANCELED_RETRY) {
11344 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11345 return -EIO;
11346 }
11347
11348 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11349 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11350 return 0;
11351 }
11352 if (ret < 0) {
11353 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11354 return ret;
11355 }
11356
11357 return 0;
11358 }
11359
11360 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11361 {
11362 #define OBJ_INSTANCE_LEN 32
11363 char buf[OBJ_INSTANCE_LEN + 1];
11364
11365 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11366 no underscore for instance name due to the way we encode the raw keys */
11367
11368 target_obj->key.set_instance(buf);
11369 }
11370
11371 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11372 map<string, bufferlist> *attrset)
11373 {
11374 attrset->clear();
11375 map<string, bufferlist>::iterator iter;
11376 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11377 iter != unfiltered_attrset.end(); ++iter) {
11378 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11379 break;
11380 (*attrset)[iter->first] = iter->second;
11381 }
11382 }
11383
11384 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11385 {
11386 map<string, bufferlist> unfiltered_attrset;
11387
11388 ObjectReadOperation op;
11389 op.getxattrs(&unfiltered_attrset, NULL);
11390
11391 bufferlist outbl;
11392 int r = obj_operate(bucket_info, obj, &op);
11393
11394 if (r < 0) {
11395 return r;
11396 }
11397 map<string, bufferlist> attrset;
11398
11399 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11400
11401 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11402 if (iter == attrset.end()) { /* not an olh */
11403 return -EINVAL;
11404 }
11405
11406 try {
11407 bufferlist::iterator biter = iter->second.begin();
11408 ::decode(*olh, biter);
11409 } catch (buffer::error& err) {
11410 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11411 return -EIO;
11412 }
11413
11414 return 0;
11415 }
11416
11417 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11418 map<string, bufferlist> *rm_pending_entries)
11419 {
11420 map<string, bufferlist>::iterator iter = pending_entries.begin();
11421
11422 real_time now = real_clock::now();
11423
11424 while (iter != pending_entries.end()) {
11425 bufferlist::iterator biter = iter->second.begin();
11426 RGWOLHPendingInfo pending_info;
11427 try {
11428 ::decode(pending_info, biter);
11429 } catch (buffer::error& err) {
11430 /* skipping bad entry, we could remove it but it might hide a bug */
11431 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11432 ++iter;
11433 continue;
11434 }
11435
11436 map<string, bufferlist>::iterator cur_iter = iter;
11437 ++iter;
11438 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11439 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11440 pending_entries.erase(cur_iter);
11441 } else {
11442 /* entries names are sorted by time (rounded to a second) */
11443 break;
11444 }
11445 }
11446 }
11447
11448 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11449 {
11450 ObjectWriteOperation op;
11451
11452 bucket_index_guard_olh_op(state, op);
11453
11454 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11455 op.rmxattr(iter->first.c_str());
11456 }
11457
11458 rgw_rados_ref ref;
11459 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11460 if (r < 0) {
11461 return r;
11462 }
11463
11464 /* update olh object */
11465 r = ref.ioctx.operate(ref.oid, &op);
11466 if (r == -ENOENT || r == -ECANCELED) {
11467 /* raced with some other change, shouldn't sweat about it */
11468 r = 0;
11469 }
11470 if (r < 0) {
11471 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11472 return r;
11473 }
11474
11475 return 0;
11476 }
11477
11478 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11479 {
11480 map<string, bufferlist> pending_entries;
11481 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11482
11483 map<string, bufferlist> rm_pending_entries;
11484 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11485
11486 if (!rm_pending_entries.empty()) {
11487 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11488 if (ret < 0) {
11489 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11490 return ret;
11491 }
11492 }
11493 if (!pending_entries.empty()) {
11494 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11495
11496 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11497 if (ret < 0) {
11498 return ret;
11499 }
11500 }
11501
11502 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11503 assert(iter != state->attrset.end());
11504 RGWOLHInfo olh;
11505 try {
11506 bufferlist::iterator biter = iter->second.begin();
11507 ::decode(olh, biter);
11508 } catch (buffer::error& err) {
11509 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11510 return -EIO;
11511 }
11512
11513 if (olh.removed) {
11514 return -ENOENT;
11515 }
11516
11517 *target = olh.target;
11518
11519 return 0;
11520 }
11521
11522 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11523 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11524 RGWObjVersionTracker *objv_tracker)
11525 {
11526 rgw_rados_ref ref;
11527 int r = get_raw_obj_ref(obj, &ref);
11528 if (r < 0) {
11529 return r;
11530 }
11531
11532 map<string, bufferlist> unfiltered_attrset;
11533 uint64_t size = 0;
11534 struct timespec mtime_ts;
11535
11536 ObjectReadOperation op;
11537 if (objv_tracker) {
11538 objv_tracker->prepare_op_for_read(&op);
11539 }
11540 if (attrs) {
11541 op.getxattrs(&unfiltered_attrset, NULL);
11542 }
11543 if (psize || pmtime) {
11544 op.stat2(&size, &mtime_ts, NULL);
11545 }
11546 if (first_chunk) {
11547 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11548 }
11549 bufferlist outbl;
11550 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11551
11552 if (epoch) {
11553 *epoch = ref.ioctx.get_last_version();
11554 }
11555
11556 if (r < 0)
11557 return r;
11558
11559 if (psize)
11560 *psize = size;
11561 if (pmtime)
11562 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11563 if (attrs) {
11564 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11565 }
11566
11567 return 0;
11568 }
11569
11570 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11571 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
11572 {
11573 map<string, rgw_bucket_dir_header> headers;
11574 map<int, string> bucket_instance_ids;
11575 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11576 if (r < 0) {
11577 return r;
11578 }
11579
11580 assert(headers.size() == bucket_instance_ids.size());
11581
11582 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11583 map<int, string>::iterator viter = bucket_instance_ids.begin();
11584 BucketIndexShardsManager ver_mgr;
11585 BucketIndexShardsManager master_ver_mgr;
11586 BucketIndexShardsManager marker_mgr;
11587 char buf[64];
11588 for(; iter != headers.end(); ++iter, ++viter) {
11589 accumulate_raw_stats(iter->second, stats);
11590 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11591 ver_mgr.add(viter->first, string(buf));
11592 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11593 master_ver_mgr.add(viter->first, string(buf));
11594 if (shard_id >= 0) {
11595 *max_marker = iter->second.max_marker;
11596 } else {
11597 marker_mgr.add(viter->first, iter->second.max_marker);
11598 }
11599 if (syncstopped != NULL)
11600 *syncstopped = iter->second.syncstopped;
11601 }
11602 ver_mgr.to_string(bucket_ver);
11603 master_ver_mgr.to_string(master_ver);
11604 if (shard_id < 0) {
11605 marker_mgr.to_string(max_marker);
11606 }
11607 return 0;
11608 }
11609
11610 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11611 map<int, string>& markers)
11612 {
11613 map<string, rgw_bucket_dir_header> headers;
11614 map<int, string> bucket_instance_ids;
11615 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11616 if (r < 0)
11617 return r;
11618
11619 assert(headers.size() == bucket_instance_ids.size());
11620
11621 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11622 map<int, string>::iterator viter = bucket_instance_ids.begin();
11623
11624 for(; iter != headers.end(); ++iter, ++viter) {
11625 if (shard_id >= 0) {
11626 markers[shard_id] = iter->second.max_marker;
11627 } else {
11628 markers[viter->first] = iter->second.max_marker;
11629 }
11630 }
11631 return 0;
11632 }
11633
11634 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11635 RGWGetBucketStats_CB *cb;
11636 uint32_t pendings;
11637 map<RGWObjCategory, RGWStorageStats> stats;
11638 int ret_code;
11639 bool should_cb;
11640 Mutex lock;
11641
11642 public:
11643 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11644 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11645 lock("RGWGetBucketStatsContext") {}
11646
11647 void handle_response(int r, rgw_bucket_dir_header& header) override {
11648 Mutex::Locker l(lock);
11649 if (should_cb) {
11650 if ( r >= 0) {
11651 accumulate_raw_stats(header, stats);
11652 } else {
11653 ret_code = r;
11654 }
11655
11656 // Are we all done?
11657 if (--pendings == 0) {
11658 if (!ret_code) {
11659 cb->set_response(&stats);
11660 }
11661 cb->handle_response(ret_code);
11662 cb->put();
11663 }
11664 }
11665 }
11666
11667 void unset_cb() {
11668 Mutex::Locker l(lock);
11669 should_cb = false;
11670 }
11671 };
11672
11673 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11674 {
11675 int num_aio = 0;
11676 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11677 assert(get_ctx);
11678 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11679 if (r < 0) {
11680 ctx->put();
11681 if (num_aio) {
11682 get_ctx->unset_cb();
11683 }
11684 }
11685 get_ctx->put();
11686 return r;
11687 }
11688
11689 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11690 RGWGetUserStats_CB *cb;
11691
11692 public:
11693 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11694 : cb(cb) {}
11695
11696 void handle_response(int r, cls_user_header& header) override {
11697 const cls_user_stats& hs = header.stats;
11698 if (r >= 0) {
11699 RGWStorageStats stats;
11700
11701 stats.size = hs.total_bytes;
11702 stats.size_rounded = hs.total_bytes_rounded;
11703 stats.num_objects = hs.total_entries;
11704
11705 cb->set_response(stats);
11706 }
11707
11708 cb->handle_response(r);
11709
11710 cb->put();
11711 }
11712 };
11713
11714 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11715 {
11716 string user_str = user.to_str();
11717
11718 cls_user_header header;
11719 int r = cls_user_get_header(user_str, &header);
11720 if (r < 0)
11721 return r;
11722
11723 const cls_user_stats& hs = header.stats;
11724
11725 stats.size = hs.total_bytes;
11726 stats.size_rounded = hs.total_bytes_rounded;
11727 stats.num_objects = hs.total_entries;
11728
11729 return 0;
11730 }
11731
11732 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11733 {
11734 string user_str = user.to_str();
11735
11736 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11737 int r = cls_user_get_header_async(user_str, get_ctx);
11738 if (r < 0) {
11739 ctx->put();
11740 delete get_ctx;
11741 return r;
11742 }
11743
11744 return 0;
11745 }
11746
11747 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11748 {
11749 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11750 }
11751
11752 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11753 {
11754 if (!bucket.oid.empty()) {
11755 obj.init(get_zone_params().domain_root, bucket.oid);
11756 } else {
11757 string oid;
11758 get_bucket_meta_oid(bucket, oid);
11759 obj.init(get_zone_params().domain_root, oid);
11760 }
11761 }
11762
11763 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11764 real_time *pmtime, map<string, bufferlist> *pattrs)
11765 {
11766 size_t pos = meta_key.find(':');
11767 if (pos == string::npos) {
11768 return -EINVAL;
11769 }
11770 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11771 rgw_bucket_instance_key_to_oid(oid);
11772
11773 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11774 }
11775
11776 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11777 real_time *pmtime, map<string, bufferlist> *pattrs)
11778 {
11779 string oid;
11780 if (bucket.oid.empty()) {
11781 get_bucket_meta_oid(bucket, oid);
11782 } else {
11783 oid = bucket.oid;
11784 }
11785
11786 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11787 }
11788
11789 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
11790 real_time *pmtime, map<string, bufferlist> *pattrs,
11791 rgw_cache_entry_info *cache_info)
11792 {
11793 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11794
11795 bufferlist epbl;
11796
11797 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, oid, epbl, &info.objv_tracker, pmtime, pattrs, cache_info);
11798 if (ret < 0) {
11799 return ret;
11800 }
11801
11802 bufferlist::iterator iter = epbl.begin();
11803 try {
11804 ::decode(info, iter);
11805 } catch (buffer::error& err) {
11806 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11807 return -EIO;
11808 }
11809 info.bucket.oid = oid;
11810 return 0;
11811 }
11812
11813 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11814 const string& tenant_name,
11815 const string& bucket_name,
11816 RGWBucketEntryPoint& entry_point,
11817 RGWObjVersionTracker *objv_tracker,
11818 real_time *pmtime,
11819 map<string, bufferlist> *pattrs,
11820 rgw_cache_entry_info *cache_info)
11821 {
11822 bufferlist bl;
11823 string bucket_entry;
11824
11825 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11826 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, bucket_entry, bl, objv_tracker, pmtime, pattrs, cache_info);
11827 if (ret < 0) {
11828 return ret;
11829 }
11830
11831 bufferlist::iterator iter = bl.begin();
11832 try {
11833 ::decode(entry_point, iter);
11834 } catch (buffer::error& err) {
11835 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11836 return -EIO;
11837 }
11838 return 0;
11839 }
11840
11841 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11842 const string& tenant_name,
11843 const string& bucket_name)
11844 {
11845 RGWBucketEntryPoint entry_point;
11846 real_time ep_mtime;
11847 RGWObjVersionTracker ot;
11848 map<string, bufferlist> attrs;
11849 RGWBucketInfo info;
11850
11851 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
11852
11853 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
11854 if (ret < 0) {
11855 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
11856 return ret;
11857 }
11858
11859 if (!entry_point.has_bucket_info) {
11860 /* already converted! */
11861 return 0;
11862 }
11863
11864 info = entry_point.old_bucket_info;
11865 info.bucket.oid = bucket_name;
11866 info.ep_objv = ot.read_version;
11867
11868 ot.generate_new_write_ver(cct);
11869
11870 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
11871 if (ret < 0) {
11872 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
11873 return ret;
11874 }
11875
11876 return 0;
11877 }
11878
11879 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
11880 const string& tenant, const string& bucket_name, RGWBucketInfo& info,
11881 real_time *pmtime, map<string, bufferlist> *pattrs)
11882 {
11883 bucket_info_entry e;
11884 string bucket_entry;
11885 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
11886
11887 if (binfo_cache->find(bucket_entry, &e)) {
11888 info = e.info;
11889 if (pattrs)
11890 *pattrs = e.attrs;
11891 if (pmtime)
11892 *pmtime = e.mtime;
11893 return 0;
11894 }
11895
11896 RGWBucketEntryPoint entry_point;
11897 real_time ep_mtime;
11898 RGWObjVersionTracker ot;
11899 rgw_cache_entry_info entry_cache_info;
11900 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name, entry_point, &ot, &ep_mtime, pattrs, &entry_cache_info);
11901 if (ret < 0) {
11902 /* only init these fields */
11903 info.bucket.tenant = tenant;
11904 info.bucket.name = bucket_name;
11905 return ret;
11906 }
11907
11908 if (entry_point.has_bucket_info) {
11909 info = entry_point.old_bucket_info;
11910 info.bucket.oid = bucket_name;
11911 info.bucket.tenant = tenant;
11912 info.ep_objv = ot.read_version;
11913 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
11914 return 0;
11915 }
11916
11917 /* data is in the bucket instance object, we need to get attributes from there, clear everything
11918 * that we got
11919 */
11920 if (pattrs) {
11921 pattrs->clear();
11922 }
11923
11924 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
11925
11926
11927 /* read bucket instance info */
11928
11929 string oid;
11930 get_bucket_meta_oid(entry_point.bucket, oid);
11931
11932 rgw_cache_entry_info cache_info;
11933
11934 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs, &cache_info);
11935 e.info.ep_objv = ot.read_version;
11936 info = e.info;
11937 if (ret < 0) {
11938 info.bucket.tenant = tenant;
11939 info.bucket.name = bucket_name;
11940 // XXX and why return anything in case of an error anyway?
11941 return ret;
11942 }
11943
11944 if (pmtime)
11945 *pmtime = e.mtime;
11946 if (pattrs)
11947 *pattrs = e.attrs;
11948
11949 list<rgw_cache_entry_info *> cache_info_entries;
11950 cache_info_entries.push_back(&entry_cache_info);
11951 cache_info_entries.push_back(&cache_info);
11952
11953
11954 /* chain to both bucket entry point and bucket instance */
11955 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
11956 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
11957 }
11958
11959 return 0;
11960 }
11961
11962 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
11963 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
11964 map<string, bufferlist> *pattrs)
11965 {
11966 bufferlist epbl;
11967 ::encode(entry_point, epbl);
11968 string bucket_entry;
11969 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11970 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
11971 }
11972
11973 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
11974 real_time mtime, map<string, bufferlist> *pattrs)
11975 {
11976 info.has_instance_obj = true;
11977 bufferlist bl;
11978
11979 ::encode(info, bl);
11980
11981 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
11982 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
11983 if (ret == -EEXIST) {
11984 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
11985 * bucket operation on this specific bucket (e.g., being synced from the master), but
11986 * since bucket instace meta object is unique for this specific bucket instace, we don't
11987 * need to return an error.
11988 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
11989 * master, creating a bucket, sending bucket creation to the master, we create the bucket
11990 * locally, while in the sync thread we sync the new bucket.
11991 */
11992 ret = 0;
11993 }
11994 return ret;
11995 }
11996
11997 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
11998 map<string, bufferlist> *pattrs, bool create_entry_point)
11999 {
12000 bool create_head = !info.has_instance_obj || create_entry_point;
12001
12002 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12003 if (ret < 0) {
12004 return ret;
12005 }
12006
12007 if (!create_head)
12008 return 0; /* done! */
12009
12010 RGWBucketEntryPoint entry_point;
12011 entry_point.bucket = info.bucket;
12012 entry_point.owner = info.owner;
12013 entry_point.creation_time = info.creation_time;
12014 entry_point.linked = true;
12015 RGWObjVersionTracker ot;
12016 if (pep_objv && !pep_objv->tag.empty()) {
12017 ot.write_version = *pep_objv;
12018 } else {
12019 ot.generate_new_write_ver(cct);
12020 if (pep_objv) {
12021 *pep_objv = ot.write_version;
12022 }
12023 }
12024 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12025 if (ret < 0)
12026 return ret;
12027
12028 return 0;
12029 }
12030
12031 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12032 {
12033 rgw_rados_ref ref;
12034 int r = get_raw_obj_ref(obj, &ref);
12035 if (r < 0) {
12036 return r;
12037 }
12038
12039 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12040 if (r < 0)
12041 return r;
12042
12043 return 0;
12044
12045 }
12046
12047 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12048 std::map<string, bufferlist>& m)
12049 {
12050 rgw_rados_ref ref;
12051 int r = get_raw_obj_ref(obj, &ref);
12052 if (r < 0) {
12053 return r;
12054 }
12055
12056 #define MAX_OMAP_GET_ENTRIES 1024
12057 const int count = MAX_OMAP_GET_ENTRIES;
12058 string start_after;
12059
12060 while (true) {
12061 std::map<string, bufferlist> t;
12062 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12063 if (r < 0) {
12064 return r;
12065 }
12066 if (t.empty()) {
12067 break;
12068 }
12069 start_after = t.rbegin()->first;
12070 m.insert(t.begin(), t.end());
12071 }
12072 return 0;
12073 }
12074
12075 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12076 {
12077 rgw_rados_ref ref;
12078 int r = get_raw_obj_ref(obj, &ref);
12079 if (r < 0) {
12080 return r;
12081 }
12082 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12083
12084 map<string, bufferlist> m;
12085 m[key] = bl;
12086
12087 r = ref.ioctx.omap_set(ref.oid, m);
12088
12089 return r;
12090 }
12091
12092 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12093 {
12094 rgw_rados_ref ref;
12095 int r = get_raw_obj_ref(obj, &ref);
12096 if (r < 0) {
12097 return r;
12098 }
12099
12100 r = ref.ioctx.omap_set(ref.oid, m);
12101
12102 return r;
12103 }
12104
12105 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12106 {
12107 rgw_rados_ref ref;
12108 int r = get_raw_obj_ref(obj, &ref);
12109 if (r < 0) {
12110 return r;
12111 }
12112
12113 set<string> k;
12114 k.insert(key);
12115
12116 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12117 return r;
12118 }
12119
12120 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12121 {
12122 RGWObjectCtx obj_ctx(this);
12123
12124 map<string, RGWBucketEnt>::iterator iter;
12125 for (iter = m.begin(); iter != m.end(); ++iter) {
12126 RGWBucketEnt& ent = iter->second;
12127 rgw_bucket& bucket = ent.bucket;
12128 ent.count = 0;
12129 ent.size = 0;
12130 ent.size_rounded = 0;
12131
12132 map<string, rgw_bucket_dir_header> headers;
12133
12134 RGWBucketInfo bucket_info;
12135 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12136 if (ret < 0) {
12137 return ret;
12138 }
12139
12140 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12141 if (r < 0)
12142 return r;
12143
12144 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12145 for (; hiter != headers.end(); ++hiter) {
12146 RGWObjCategory category = main_category;
12147 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12148 if (iter != hiter->second.stats.end()) {
12149 struct rgw_bucket_category_stats& stats = iter->second;
12150 ent.count += stats.num_entries;
12151 ent.size += stats.total_size;
12152 ent.size_rounded += stats.total_size_rounded;
12153 }
12154 }
12155 }
12156
12157 return m.size();
12158 }
12159
12160 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12161 {
12162 rgw_rados_ref ref;
12163 int r = get_raw_obj_ref(obj, &ref);
12164 if (r < 0) {
12165 return r;
12166 }
12167 librados::Rados *rad = get_rados_handle();
12168 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12169
12170 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12171 completion->release();
12172 return r;
12173 }
12174
12175 int RGWRados::distribute(const string& key, bufferlist& bl)
12176 {
12177 /*
12178 * we were called before watch was initialized. This can only happen if we're updating some system
12179 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12180 * objects, they're currently only read on startup anyway.
12181 */
12182 if (!watch_initialized)
12183 return 0;
12184
12185 string notify_oid;
12186 pick_control_oid(key, notify_oid);
12187
12188 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12189 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12190 }
12191
12192 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12193 {
12194 librados::IoCtx& io_ctx = ctx.io_ctx;
12195 librados::NObjectIterator& iter = ctx.iter;
12196
12197 int r = open_pool_ctx(pool, io_ctx);
12198 if (r < 0)
12199 return r;
12200
12201 iter = io_ctx.nobjects_begin();
12202
12203 return 0;
12204 }
12205
12206 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12207 bool *is_truncated, RGWAccessListFilter *filter)
12208 {
12209 librados::IoCtx& io_ctx = ctx.io_ctx;
12210 librados::NObjectIterator& iter = ctx.iter;
12211
12212 if (iter == io_ctx.nobjects_end())
12213 return -ENOENT;
12214
12215 uint32_t i;
12216
12217 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12218 rgw_bucket_dir_entry e;
12219
12220 string oid = iter->get_oid();
12221 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12222
12223 // fill it in with initial values; we may correct later
12224 if (filter && !filter->filter(oid, oid))
12225 continue;
12226
12227 e.key = oid;
12228 objs.push_back(e);
12229 }
12230
12231 if (is_truncated)
12232 *is_truncated = (iter != io_ctx.nobjects_end());
12233
12234 return objs.size();
12235 }
12236 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12237 string prefix;
12238
12239 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12240 bool filter(string& name, string& key) override {
12241 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12242 }
12243 };
12244
12245 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12246 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12247 bool *is_truncated)
12248 {
12249 RGWAccessListFilterPrefix filter(prefix_filter);
12250
12251 if (!ctx.initialized) {
12252 int r = pool_iterate_begin(pool, ctx.iter_ctx);
12253 if (r < 0) {
12254 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12255 return r;
12256 }
12257 ctx.initialized = true;
12258 }
12259
12260 vector<rgw_bucket_dir_entry> objs;
12261 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12262 if (r < 0) {
12263 if(r != -ENOENT)
12264 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12265 return r;
12266 }
12267
12268 vector<rgw_bucket_dir_entry>::iterator iter;
12269 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12270 oids.push_back(iter->key.name);
12271 }
12272
12273 return oids.size();
12274 }
12275
12276 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12277 std::list<rgw_bi_log_entry>& result, bool *truncated)
12278 {
12279 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12280 result.clear();
12281
12282 librados::IoCtx index_ctx;
12283 map<int, string> oids;
12284 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12285 map<int, string> bucket_instance_ids;
12286 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12287 if (r < 0)
12288 return r;
12289
12290 BucketIndexShardsManager marker_mgr;
12291 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12292 // If there are multiple shards for the bucket index object, the marker
12293 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12294 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12295 // only contain one record, and the key is the bucket instance id.
12296 r = marker_mgr.from_string(marker, shard_id);
12297 if (r < 0)
12298 return r;
12299
12300 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12301 if (r < 0)
12302 return r;
12303
12304 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12305 map<int, list<rgw_bi_log_entry>::iterator> vends;
12306 if (truncated) {
12307 *truncated = false;
12308 }
12309 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12310 for (; miter != bi_log_lists.end(); ++miter) {
12311 int shard_id = miter->first;
12312 vcurrents[shard_id] = miter->second.entries.begin();
12313 vends[shard_id] = miter->second.entries.end();
12314 if (truncated) {
12315 *truncated = (*truncated || miter->second.truncated);
12316 }
12317 }
12318
12319 size_t total = 0;
12320 bool has_more = true;
12321 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12322 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12323 while (total < max && has_more) {
12324 has_more = false;
12325
12326 viter = vcurrents.begin();
12327 eiter = vends.begin();
12328
12329 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12330 assert (eiter != vends.end());
12331
12332 int shard_id = viter->first;
12333 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12334
12335 if (liter == eiter->second){
12336 continue;
12337 }
12338 rgw_bi_log_entry& entry = *(liter);
12339 if (has_shards) {
12340 char buf[16];
12341 snprintf(buf, sizeof(buf), "%d", shard_id);
12342 string tmp_id;
12343 build_bucket_index_marker(buf, entry.id, &tmp_id);
12344 entry.id.swap(tmp_id);
12345 }
12346 marker_mgr.add(shard_id, entry.id);
12347 result.push_back(entry);
12348 total++;
12349 has_more = true;
12350 ++liter;
12351 }
12352 }
12353
12354 if (truncated) {
12355 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12356 assert (eiter != vends.end());
12357 *truncated = (*truncated || (viter->second != eiter->second));
12358 }
12359 }
12360
12361 // Refresh marker, if there are multiple shards, the output will look like
12362 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12363 // if there is no sharding, the simply marker (without oid) is returned
12364 if (has_shards) {
12365 marker_mgr.to_string(&marker);
12366 } else {
12367 if (!result.empty()) {
12368 marker = result.rbegin()->id;
12369 }
12370 }
12371
12372 return 0;
12373 }
12374
12375 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12376 {
12377 librados::IoCtx index_ctx;
12378 map<int, string> bucket_objs;
12379
12380 BucketIndexShardsManager start_marker_mgr;
12381 BucketIndexShardsManager end_marker_mgr;
12382
12383 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12384 if (r < 0) {
12385 return r;
12386 }
12387
12388 r = start_marker_mgr.from_string(start_marker, shard_id);
12389 if (r < 0) {
12390 return r;
12391 }
12392
12393 r = end_marker_mgr.from_string(end_marker, shard_id);
12394 if (r < 0) {
12395 return r;
12396 }
12397
12398 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12399 cct->_conf->rgw_bucket_index_max_aio)();
12400
12401 return r;
12402 }
12403
12404 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12405 {
12406 librados::IoCtx index_ctx;
12407 map<int, string> bucket_objs;
12408 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12409 if (r < 0)
12410 return r;
12411
12412 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12413 }
12414
12415 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12416 {
12417 librados::IoCtx index_ctx;
12418 map<int, string> bucket_objs;
12419 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12420 if (r < 0)
12421 return r;
12422
12423 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12424 }
12425
12426 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12427 {
12428 rgw_rados_ref ref;
12429 int r = get_obj_head_ref(bucket_info, obj, &ref);
12430 if (r < 0) {
12431 return r;
12432 }
12433
12434 rgw_cls_bi_entry bi_entry;
12435 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12436 if (r < 0 && r != -ENOENT) {
12437 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12438 }
12439 if (r < 0) {
12440 return r;
12441 }
12442 bufferlist::iterator iter = bi_entry.data.begin();
12443 try {
12444 ::decode(*dirent, iter);
12445 } catch (buffer::error& err) {
12446 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12447 return -EIO;
12448 }
12449
12450 return 0;
12451 }
12452
12453 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12454 {
12455 BucketShard bs(this);
12456 int ret = bs.init(bucket, obj);
12457 if (ret < 0) {
12458 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12459 return ret;
12460 }
12461
12462 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12463
12464 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12465 if (ret < 0)
12466 return ret;
12467
12468 return 0;
12469 }
12470
12471 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12472 {
12473 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12474 }
12475
12476 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12477 {
12478 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12479 if (ret < 0)
12480 return ret;
12481
12482 return 0;
12483 }
12484
12485 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12486 {
12487 BucketShard bs(this);
12488 int ret = bs.init(bucket, obj);
12489 if (ret < 0) {
12490 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12491 return ret;
12492 }
12493
12494 return bi_put(bs, entry);
12495 }
12496
12497 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12498 {
12499 rgw_obj obj(bucket, obj_name);
12500 BucketShard bs(this);
12501 int ret = bs.init(bucket, obj);
12502 if (ret < 0) {
12503 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12504 return ret;
12505 }
12506
12507 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12508 if (ret == -ENOENT) {
12509 *is_truncated = false;
12510 }
12511 if (ret < 0)
12512 return ret;
12513
12514 return 0;
12515 }
12516
12517 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12518 {
12519 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12520 if (ret < 0)
12521 return ret;
12522
12523 return 0;
12524 }
12525
12526 int RGWRados::bi_remove(BucketShard& bs)
12527 {
12528 int ret = bs.index_ctx.remove(bs.bucket_obj);
12529 if (ret == -ENOENT) {
12530 ret = 0;
12531 }
12532 if (ret < 0) {
12533 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12534 return ret;
12535 }
12536
12537 return 0;
12538 }
12539
12540 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12541 {
12542 BucketShard bs(this);
12543 int ret = bs.init(bucket, shard_id);
12544 if (ret < 0) {
12545 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12546 return ret;
12547 }
12548
12549 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12550 }
12551
12552 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12553 {
12554 return gc_pool_ctx.operate(oid, op);
12555 }
12556
12557 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12558 {
12559 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12560 int r = gc_pool_ctx.aio_operate(oid, c, op);
12561 c->release();
12562 return r;
12563 }
12564
12565 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12566 {
12567 return gc_pool_ctx.operate(oid, op, pbl);
12568 }
12569
12570 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12571 {
12572 return gc->list(index, marker, max, expired_only, result, truncated);
12573 }
12574
12575 int RGWRados::process_gc()
12576 {
12577 return gc->process();
12578 }
12579
12580 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12581 {
12582 return lc->list_lc_progress(marker, max_entries, progress_map);
12583 }
12584
12585 int RGWRados::process_lc()
12586 {
12587 return lc->process();
12588 }
12589
12590 int RGWRados::process_expire_objects()
12591 {
12592 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12593 return 0;
12594 }
12595
12596 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12597 {
12598 bufferlist in;
12599 cls_rgw_bucket_init(op);
12600 return index_ctx.operate(oid, &op);
12601 }
12602
12603 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12604 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12605 {
12606 rgw_zone_set zones_trace;
12607 if (_zones_trace) {
12608 zones_trace = *_zones_trace;
12609 }
12610 else {
12611 zones_trace.insert(get_zone().id);
12612 }
12613
12614 ObjectWriteOperation o;
12615 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12616 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12617 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
12618 return bs.index_ctx.operate(bs.bucket_obj, &o);
12619 }
12620
12621 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
12622 int64_t pool, uint64_t epoch,
12623 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12624 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12625 {
12626 ObjectWriteOperation o;
12627 rgw_bucket_dir_entry_meta dir_meta;
12628 dir_meta = ent.meta;
12629 dir_meta.category = category;
12630
12631 rgw_bucket_entry_ver ver;
12632 ver.pool = pool;
12633 ver.epoch = epoch;
12634 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12635 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12636 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
12637 get_zone().log_data, bilog_flags, _zones_trace);
12638 complete_op_data *arg;
12639 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
12640 get_zone().log_data, bilog_flags, _zones_trace, &arg);
12641 librados::AioCompletion *completion = arg->rados_completion;
12642 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
12643 completion->release(); /* can't reference arg here, as it might have already been released */
12644 return ret;
12645 }
12646
12647 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
12648 int64_t pool, uint64_t epoch,
12649 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12650 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12651 {
12652 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
12653 }
12654
12655 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12656 int64_t pool, uint64_t epoch,
12657 rgw_obj& obj,
12658 real_time& removed_mtime,
12659 list<rgw_obj_index_key> *remove_objs,
12660 uint16_t bilog_flags,
12661 rgw_zone_set *zones_trace)
12662 {
12663 rgw_bucket_dir_entry ent;
12664 ent.meta.mtime = removed_mtime;
12665 obj.key.get_index_key(&ent.key);
12666 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
12667 }
12668
12669 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12670 {
12671 rgw_bucket_dir_entry ent;
12672 obj.key.get_index_key(&ent.key);
12673 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
12674 }
12675
12676 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12677 {
12678 librados::IoCtx index_ctx;
12679 map<int, string> bucket_objs;
12680 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12681 if (r < 0)
12682 return r;
12683
12684 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12685 }
12686
12687 int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12688 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12689 bool *is_truncated, rgw_obj_index_key *last_entry,
12690 bool (*force_check_filter)(const string& name))
12691 {
12692 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12693
12694 librados::IoCtx index_ctx;
12695 // key - oid (for different shards if there is any)
12696 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12697 map<int, string> oids;
12698 map<int, struct rgw_cls_list_ret> list_results;
12699 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12700 if (r < 0)
12701 return r;
12702
12703 cls_rgw_obj_key start_key(start.name, start.instance);
12704 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12705 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12706 if (r < 0)
12707 return r;
12708
12709 // Create a list of iterators that are used to iterate each shard
12710 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12711 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12712 vector<string> vnames(list_results.size());
12713 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12714 *is_truncated = false;
12715 for (; iter != list_results.end(); ++iter) {
12716 vcurrents.push_back(iter->second.dir.m.begin());
12717 vends.push_back(iter->second.dir.m.end());
12718 vnames.push_back(oids[iter->first]);
12719 *is_truncated = (*is_truncated || iter->second.is_truncated);
12720 }
12721
12722 // Create a map to track the next candidate entry from each shard, if the entry
12723 // from a specified shard is selected/erased, the next entry from that shard will
12724 // be inserted for next round selection
12725 map<string, size_t> candidates;
12726 for (size_t i = 0; i < vcurrents.size(); ++i) {
12727 if (vcurrents[i] != vends[i]) {
12728 candidates[vcurrents[i]->first] = i;
12729 }
12730 }
12731
12732 map<string, bufferlist> updates;
12733 uint32_t count = 0;
12734 while (count < num_entries && !candidates.empty()) {
12735 r = 0;
12736 // Select the next one
12737 int pos = candidates.begin()->second;
12738 const string& name = vcurrents[pos]->first;
12739 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12740
12741 bool force_check = force_check_filter && force_check_filter(dirent.key.name);
12742 if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) {
12743 /* there are uncommitted ops. We need to check the current state,
12744 * and if the tags are old we need to do cleanup as well. */
12745 librados::IoCtx sub_ctx;
12746 sub_ctx.dup(index_ctx);
12747 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
12748 if (r < 0 && r != -ENOENT) {
12749 return r;
12750 }
12751 }
12752 if (r >= 0) {
12753 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
12754 m[name] = std::move(dirent);
12755 ++count;
12756 }
12757
12758 // Refresh the candidates map
12759 candidates.erase(candidates.begin());
12760 ++vcurrents[pos];
12761 if (vcurrents[pos] != vends[pos]) {
12762 candidates[vcurrents[pos]->first] = pos;
12763 }
12764 }
12765
12766 // Suggest updates if there is any
12767 map<string, bufferlist>::iterator miter = updates.begin();
12768 for (; miter != updates.end(); ++miter) {
12769 if (miter->second.length()) {
12770 ObjectWriteOperation o;
12771 cls_rgw_suggest_changes(o, miter->second);
12772 // we don't care if we lose suggested updates, send them off blindly
12773 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12774 index_ctx.aio_operate(miter->first, c, &o);
12775 c->release();
12776 }
12777 }
12778
12779 // Check if all the returned entries are consumed or not
12780 for (size_t i = 0; i < vcurrents.size(); ++i) {
12781 if (vcurrents[i] != vends[i])
12782 *is_truncated = true;
12783 }
12784 if (!m.empty())
12785 *last_entry = m.rbegin()->first;
12786
12787 return 0;
12788 }
12789
12790 int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
12791 {
12792 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12793
12794 rgw_rados_ref ref;
12795 int r = get_raw_obj_ref(obj, &ref);
12796 if (r < 0) {
12797 return r;
12798 }
12799
12800 ObjectWriteOperation op;
12801 cls_rgw_usage_log_add(op, info);
12802
12803 r = ref.ioctx.operate(ref.oid, &op);
12804 return r;
12805 }
12806
12807 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
12808 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
12809 {
12810 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12811
12812 rgw_rados_ref ref;
12813 int r = get_raw_obj_ref(obj, &ref);
12814 if (r < 0) {
12815 return r;
12816 }
12817
12818 *is_truncated = false;
12819
12820 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
12821 max_entries, read_iter, usage, is_truncated);
12822
12823 return r;
12824 }
12825
12826 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
12827 {
12828 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12829
12830 rgw_rados_ref ref;
12831 int r = get_raw_obj_ref(obj, &ref);
12832 if (r < 0) {
12833 return r;
12834 }
12835
12836 ObjectWriteOperation op;
12837 cls_rgw_usage_log_trim(op, user, start_epoch, end_epoch);
12838
12839 r = ref.ioctx.operate(ref.oid, &op);
12840 return r;
12841 }
12842
12843 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
12844 {
12845 librados::IoCtx index_ctx;
12846 string dir_oid;
12847
12848 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12849
12850 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
12851 if (r < 0)
12852 return r;
12853
12854 bufferlist updates;
12855
12856 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
12857 rgw_bucket_dir_entry entry;
12858 entry.key = *iter;
12859 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
12860 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
12861 updates.append(CEPH_RGW_REMOVE | suggest_flag);
12862 ::encode(entry, updates);
12863 }
12864
12865 bufferlist out;
12866
12867 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
12868
12869 return r;
12870 }
12871
12872 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
12873 const RGWBucketInfo& bucket_info,
12874 rgw_bucket_dir_entry& list_state,
12875 rgw_bucket_dir_entry& object,
12876 bufferlist& suggested_updates)
12877 {
12878 const rgw_bucket& bucket = bucket_info.bucket;
12879 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12880
12881 std::string loc;
12882
12883 rgw_obj obj(bucket, list_state.key);
12884
12885 string oid;
12886 get_obj_bucket_and_oid_loc(obj, oid, loc);
12887
12888 if (loc != list_state.locator) {
12889 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
12890 }
12891
12892 io_ctx.locator_set_key(list_state.locator);
12893
12894 RGWObjState *astate = NULL;
12895 RGWObjectCtx rctx(this);
12896 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
12897 if (r < 0)
12898 return r;
12899
12900 list_state.pending_map.clear(); // we don't need this and it inflates size
12901 if (!astate->exists) {
12902 /* object doesn't exist right now -- hopefully because it's
12903 * marked as !exists and got deleted */
12904 if (list_state.exists) {
12905 /* FIXME: what should happen now? Work out if there are any
12906 * non-bad ways this could happen (there probably are, but annoying
12907 * to handle!) */
12908 }
12909 // encode a suggested removal of that key
12910 list_state.ver.epoch = io_ctx.get_last_version();
12911 list_state.ver.pool = io_ctx.get_id();
12912 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
12913 return -ENOENT;
12914 }
12915
12916 string etag;
12917 string content_type;
12918 ACLOwner owner;
12919
12920 object.meta.size = astate->size;
12921 object.meta.accounted_size = astate->accounted_size;
12922 object.meta.mtime = astate->mtime;
12923
12924 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
12925 if (iter != astate->attrset.end()) {
12926 etag = iter->second.c_str();
12927 }
12928 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
12929 if (iter != astate->attrset.end()) {
12930 content_type = iter->second.c_str();
12931 }
12932 iter = astate->attrset.find(RGW_ATTR_ACL);
12933 if (iter != astate->attrset.end()) {
12934 r = decode_policy(iter->second, &owner);
12935 if (r < 0) {
12936 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
12937 }
12938 }
12939
12940 if (astate->has_manifest) {
12941 RGWObjManifest::obj_iterator miter;
12942 RGWObjManifest& manifest = astate->manifest;
12943 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
12944 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
12945 rgw_obj loc;
12946 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
12947
12948 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
12949 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
12950 r = delete_obj_index(loc);
12951 if (r < 0) {
12952 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
12953 }
12954 }
12955 }
12956 }
12957
12958 object.meta.etag = etag;
12959 object.meta.content_type = content_type;
12960 object.meta.owner = owner.get_id().to_str();
12961 object.meta.owner_display_name = owner.get_display_name();
12962
12963 // encode suggested updates
12964 list_state.ver.pool = io_ctx.get_id();
12965 list_state.ver.epoch = astate->epoch;
12966 list_state.meta.size = object.meta.size;
12967 list_state.meta.accounted_size = object.meta.accounted_size;
12968 list_state.meta.mtime = object.meta.mtime;
12969 list_state.meta.category = main_category;
12970 list_state.meta.etag = etag;
12971 list_state.meta.content_type = content_type;
12972 if (astate->obj_tag.length() > 0)
12973 list_state.tag = astate->obj_tag.c_str();
12974 list_state.meta.owner = owner.get_id().to_str();
12975 list_state.meta.owner_display_name = owner.get_display_name();
12976
12977 list_state.exists = true;
12978 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
12979 return 0;
12980 }
12981
12982 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
12983 {
12984 librados::IoCtx index_ctx;
12985 map<int, string> oids;
12986 map<int, struct rgw_cls_list_ret> list_results;
12987 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
12988 if (r < 0)
12989 return r;
12990
12991 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12992 if (r < 0)
12993 return r;
12994
12995 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12996 for(; iter != list_results.end(); ++iter) {
12997 headers[oids[iter->first]] = iter->second.dir.header;
12998 }
12999 return 0;
13000 }
13001
13002 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13003 {
13004 librados::IoCtx index_ctx;
13005 map<int, string> bucket_objs;
13006 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13007 if (r < 0)
13008 return r;
13009
13010 map<int, string>::iterator iter = bucket_objs.begin();
13011 for (; iter != bucket_objs.end(); ++iter) {
13012 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13013 if (r < 0) {
13014 ctx->put();
13015 break;
13016 } else {
13017 (*num_aio)++;
13018 }
13019 }
13020 return r;
13021 }
13022
13023 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13024 {
13025 string buckets_obj_id;
13026 rgw_get_buckets_obj(user_id, buckets_obj_id);
13027 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13028
13029 rgw_rados_ref ref;
13030 int r = get_raw_obj_ref(obj, &ref);
13031 if (r < 0) {
13032 return r;
13033 }
13034
13035 librados::ObjectReadOperation op;
13036 int rc;
13037 ::cls_user_get_header(op, header, &rc);
13038 bufferlist ibl;
13039 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13040 if (r < 0)
13041 return r;
13042 if (rc < 0)
13043 return rc;
13044
13045 return 0;
13046 }
13047
13048 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13049 {
13050 string buckets_obj_id;
13051 rgw_get_buckets_obj(user_id, buckets_obj_id);
13052 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13053
13054 rgw_rados_ref ref;
13055 int r = get_raw_obj_ref(obj, &ref);
13056 if (r < 0) {
13057 return r;
13058 }
13059
13060 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13061 if (r < 0)
13062 return r;
13063
13064 return 0;
13065 }
13066
13067 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13068 {
13069 map<string, struct rgw_bucket_dir_header> headers;
13070 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13071 if (r < 0) {
13072 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13073 return r;
13074 }
13075
13076 cls_user_bucket_entry entry;
13077
13078 bucket_info.bucket.convert(&entry.bucket);
13079
13080 for (const auto& hiter : headers) {
13081 for (const auto& iter : hiter.second.stats) {
13082 const struct rgw_bucket_category_stats& header_stats = iter.second;
13083 entry.size += header_stats.total_size;
13084 entry.size_rounded += header_stats.total_size_rounded;
13085 entry.count += header_stats.num_entries;
13086 }
13087 }
13088
13089 list<cls_user_bucket_entry> entries;
13090 entries.push_back(entry);
13091
13092 r = cls_user_update_buckets(user_obj, entries, false);
13093 if (r < 0) {
13094 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13095 return r;
13096 }
13097
13098 return 0;
13099 }
13100
13101 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13102 {
13103 map<string, struct rgw_bucket_dir_header> headers;
13104 RGWBucketInfo bucket_info;
13105 RGWObjectCtx obj_ctx(this);
13106 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13107 if (ret < 0) {
13108 return ret;
13109 }
13110
13111 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13112 if (ret < 0) {
13113 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13114 return ret;
13115 }
13116
13117 bucket.convert(&entry.bucket);
13118
13119 for (const auto& hiter : headers) {
13120 for (const auto& iter : hiter.second.stats) {
13121 const struct rgw_bucket_category_stats& header_stats = iter.second;
13122 entry.size += header_stats.total_size;
13123 entry.size_rounded += header_stats.total_size_rounded;
13124 entry.count += header_stats.num_entries;
13125 }
13126 }
13127
13128 return 0;
13129 }
13130
13131 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13132 const string& in_marker,
13133 const string& end_marker,
13134 const int max_entries,
13135 list<cls_user_bucket_entry>& entries,
13136 string * const out_marker,
13137 bool * const truncated)
13138 {
13139 rgw_rados_ref ref;
13140 int r = get_raw_obj_ref(obj, &ref);
13141 if (r < 0) {
13142 return r;
13143 }
13144
13145 librados::ObjectReadOperation op;
13146 int rc;
13147
13148 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13149 bufferlist ibl;
13150 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13151 if (r < 0)
13152 return r;
13153 if (rc < 0)
13154 return rc;
13155
13156 return 0;
13157 }
13158
13159 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13160 {
13161 rgw_rados_ref ref;
13162 int r = get_raw_obj_ref(obj, &ref);
13163 if (r < 0) {
13164 return r;
13165 }
13166
13167 librados::ObjectWriteOperation op;
13168 cls_user_set_buckets(op, entries, add);
13169 r = ref.ioctx.operate(ref.oid, &op);
13170 if (r < 0)
13171 return r;
13172
13173 return 0;
13174 }
13175
13176 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13177 {
13178 string buckets_obj_id;
13179 rgw_get_buckets_obj(user_id, buckets_obj_id);
13180 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13181 return cls_user_complete_stats_sync(obj);
13182 }
13183
13184 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13185 {
13186 rgw_rados_ref ref;
13187 int r = get_raw_obj_ref(obj, &ref);
13188 if (r < 0) {
13189 return r;
13190 }
13191
13192 librados::ObjectWriteOperation op;
13193 ::cls_user_complete_stats_sync(op);
13194 r = ref.ioctx.operate(ref.oid, &op);
13195 if (r < 0)
13196 return r;
13197
13198 return 0;
13199 }
13200
13201 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13202 {
13203 list<cls_user_bucket_entry> l;
13204 l.push_back(entry);
13205
13206 return cls_user_update_buckets(obj, l, true);
13207 }
13208
13209 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13210 {
13211 rgw_rados_ref ref;
13212 int r = get_system_obj_ref(obj, &ref);
13213 if (r < 0) {
13214 return r;
13215 }
13216
13217 librados::ObjectWriteOperation op;
13218 ::cls_user_remove_bucket(op, bucket);
13219 r = ref.ioctx.operate(ref.oid, &op);
13220 if (r < 0)
13221 return r;
13222
13223 return 0;
13224 }
13225
13226 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
13227 RGWQuotaInfo& bucket_quota)
13228 {
13229 if (!cct->_conf->rgw_dynamic_resharding) {
13230 return 0;
13231 }
13232
13233 bool need_resharding = false;
13234 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13235 uint32_t suggested_num_shards;
13236
13237 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13238 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13239 1, need_resharding, &suggested_num_shards);
13240 if (ret < 0) {
13241 return ret;
13242 }
13243
13244 if (need_resharding) {
13245 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13246 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13247 dendl;
13248 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13249 }
13250
13251 return ret;
13252 }
13253
13254 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13255 {
13256 RGWReshard reshard(this);
13257
13258 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13259
13260 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13261 if (new_num_shards <= num_source_shards) {
13262 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13263 return 0;
13264 }
13265
13266 cls_rgw_reshard_entry entry;
13267 entry.time = real_clock::now();
13268 entry.tenant = bucket_info.owner.tenant;
13269 entry.bucket_name = bucket_info.bucket.name;
13270 entry.bucket_id = bucket_info.bucket.bucket_id;
13271 entry.old_num_shards = num_source_shards;
13272 entry.new_num_shards = new_num_shards;
13273
13274 return reshard.add(entry);
13275 }
13276
13277 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13278 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13279 {
13280 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13281 }
13282
13283 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13284 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
13285 {
13286 if (!num_shards) {
13287 bucket_objects[0] = bucket_oid_base;
13288 } else {
13289 char buf[bucket_oid_base.size() + 32];
13290 if (shard_id < 0) {
13291 for (uint32_t i = 0; i < num_shards; ++i) {
13292 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13293 bucket_objects[i] = buf;
13294 }
13295 } else {
13296 if ((uint32_t)shard_id > num_shards) {
13297 return;
13298 }
13299 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13300 bucket_objects[shard_id] = buf;
13301 }
13302 }
13303 }
13304
13305 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13306 {
13307 const rgw_bucket& bucket = bucket_info.bucket;
13308 string plain_id = bucket.name + ":" + bucket.bucket_id;
13309 if (!bucket_info.num_shards) {
13310 (*result)[0] = plain_id;
13311 } else {
13312 char buf[16];
13313 if (shard_id < 0) {
13314 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13315 snprintf(buf, sizeof(buf), ":%d", i);
13316 (*result)[i] = plain_id + buf;
13317 }
13318 } else {
13319 if ((uint32_t)shard_id > bucket_info.num_shards) {
13320 return;
13321 }
13322 snprintf(buf, sizeof(buf), ":%d", shard_id);
13323 (*result)[shard_id] = plain_id + buf;
13324 }
13325 }
13326 }
13327
13328 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13329 int *shard_id)
13330 {
13331 int r = 0;
13332 switch (bucket_info.bucket_index_shard_hash_type) {
13333 case RGWBucketInfo::MOD:
13334 if (!bucket_info.num_shards) {
13335 if (shard_id) {
13336 *shard_id = -1;
13337 }
13338 } else {
13339 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13340 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13341 sid = rgw_shards_mod(sid2, bucket_info.num_shards);
13342 if (shard_id) {
13343 *shard_id = (int)sid;
13344 }
13345 }
13346 break;
13347 default:
13348 r = -ENOTSUP;
13349 }
13350 return r;
13351 }
13352
13353 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13354 int shard_id, string *bucket_obj)
13355 {
13356 if (!num_shards) {
13357 // By default with no sharding, we use the bucket oid as itself
13358 (*bucket_obj) = bucket_oid_base;
13359 } else {
13360 char buf[bucket_oid_base.size() + 32];
13361 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13362 (*bucket_obj) = buf;
13363 }
13364 }
13365
13366 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13367 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13368 {
13369 int r = 0;
13370 switch (hash_type) {
13371 case RGWBucketInfo::MOD:
13372 if (!num_shards) {
13373 // By default with no sharding, we use the bucket oid as itself
13374 (*bucket_obj) = bucket_oid_base;
13375 if (shard_id) {
13376 *shard_id = -1;
13377 }
13378 } else {
13379 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13380 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13381 sid = rgw_shards_mod(sid2, num_shards);
13382 char buf[bucket_oid_base.size() + 32];
13383 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13384 (*bucket_obj) = buf;
13385 if (shard_id) {
13386 *shard_id = (int)sid;
13387 }
13388 }
13389 break;
13390 default:
13391 r = -ENOTSUP;
13392 }
13393 return r;
13394 }
13395
13396 void RGWStateLog::oid_str(int shard, string& oid) {
13397 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13398 char buf[16];
13399 snprintf(buf, sizeof(buf), "%d", shard);
13400 oid += buf;
13401 }
13402
13403 int RGWStateLog::get_shard_num(const string& object) {
13404 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13405 return val % num_shards;
13406 }
13407
13408 string RGWStateLog::get_oid(const string& object) {
13409 int shard = get_shard_num(object);
13410 string oid;
13411 oid_str(shard, oid);
13412 return oid;
13413 }
13414
13415 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13416 rgw_pool pool;
13417 store->get_log_pool(pool);
13418 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13419 if (r < 0) {
13420 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13421 return r;
13422 }
13423 return 0;
13424 }
13425
13426 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13427 uint32_t state, bufferlist *bl, uint32_t *check_state)
13428 {
13429 if (client_id.empty() ||
13430 op_id.empty() ||
13431 object.empty()) {
13432 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13433 }
13434
13435 librados::IoCtx ioctx;
13436 int r = open_ioctx(ioctx);
13437 if (r < 0)
13438 return r;
13439
13440 string oid = get_oid(object);
13441
13442 librados::ObjectWriteOperation op;
13443 if (check_state) {
13444 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
13445 }
13446 utime_t ts = ceph_clock_now();
13447 bufferlist nobl;
13448 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
13449 r = ioctx.operate(oid, &op);
13450 if (r < 0) {
13451 return r;
13452 }
13453
13454 return 0;
13455 }
13456
13457 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
13458 {
13459 if (client_id.empty() ||
13460 op_id.empty() ||
13461 object.empty()) {
13462 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13463 }
13464
13465 librados::IoCtx ioctx;
13466 int r = open_ioctx(ioctx);
13467 if (r < 0)
13468 return r;
13469
13470 string oid = get_oid(object);
13471
13472 librados::ObjectWriteOperation op;
13473 cls_statelog_remove_by_object(op, object, op_id);
13474 r = ioctx.operate(oid, &op);
13475 if (r < 0) {
13476 return r;
13477 }
13478
13479 return 0;
13480 }
13481
13482 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
13483 void **handle)
13484 {
13485 list_state *state = new list_state;
13486 state->client_id = client_id;
13487 state->op_id = op_id;
13488 state->object = object;
13489 if (object.empty()) {
13490 state->cur_shard = 0;
13491 state->max_shard = num_shards - 1;
13492 } else {
13493 state->cur_shard = state->max_shard = get_shard_num(object);
13494 }
13495 *handle = (void *)state;
13496 }
13497
13498 int RGWStateLog::list_entries(void *handle, int max_entries,
13499 list<cls_statelog_entry>& entries,
13500 bool *done)
13501 {
13502 list_state *state = static_cast<list_state *>(handle);
13503
13504 librados::IoCtx ioctx;
13505 int r = open_ioctx(ioctx);
13506 if (r < 0)
13507 return r;
13508
13509 entries.clear();
13510
13511 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
13512 string oid;
13513 oid_str(state->cur_shard, oid);
13514
13515 librados::ObjectReadOperation op;
13516 list<cls_statelog_entry> ents;
13517 bool truncated;
13518 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
13519 max_entries, ents, &state->marker, &truncated);
13520 bufferlist ibl;
13521 r = ioctx.operate(oid, &op, &ibl);
13522 if (r == -ENOENT) {
13523 truncated = false;
13524 r = 0;
13525 }
13526 if (r < 0) {
13527 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
13528 return r;
13529 }
13530
13531 if (!truncated) {
13532 state->marker.clear();
13533 }
13534
13535 max_entries -= ents.size();
13536
13537 entries.splice(entries.end(), ents);
13538
13539 if (truncated)
13540 break;
13541 }
13542
13543 *done = (state->cur_shard > state->max_shard);
13544
13545 return 0;
13546 }
13547
13548 void RGWStateLog::finish_list_entries(void *handle)
13549 {
13550 list_state *state = static_cast<list_state *>(handle);
13551 delete state;
13552 }
13553
13554 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
13555 {
13556 f->open_object_section("statelog_entry");
13557 f->dump_string("client_id", entry.client_id);
13558 f->dump_string("op_id", entry.op_id);
13559 f->dump_string("object", entry.object);
13560 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
13561 if (!dump_entry_internal(entry, f)) {
13562 f->dump_int("state", entry.state);
13563 }
13564 f->close_section();
13565 }
13566
13567 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
13568 {
13569 }
13570
13571 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
13572 {
13573 string s;
13574 switch ((OpState)entry.state) {
13575 case OPSTATE_UNKNOWN:
13576 s = "unknown";
13577 break;
13578 case OPSTATE_IN_PROGRESS:
13579 s = "in-progress";
13580 break;
13581 case OPSTATE_COMPLETE:
13582 s = "complete";
13583 break;
13584 case OPSTATE_ERROR:
13585 s = "error";
13586 break;
13587 case OPSTATE_ABORT:
13588 s = "abort";
13589 break;
13590 case OPSTATE_CANCELLED:
13591 s = "cancelled";
13592 break;
13593 default:
13594 s = "invalid";
13595 }
13596 f->dump_string("state", s);
13597 return true;
13598 }
13599
13600 int RGWOpState::state_from_str(const string& s, OpState *state)
13601 {
13602 if (s == "unknown") {
13603 *state = OPSTATE_UNKNOWN;
13604 } else if (s == "in-progress") {
13605 *state = OPSTATE_IN_PROGRESS;
13606 } else if (s == "complete") {
13607 *state = OPSTATE_COMPLETE;
13608 } else if (s == "error") {
13609 *state = OPSTATE_ERROR;
13610 } else if (s == "abort") {
13611 *state = OPSTATE_ABORT;
13612 } else if (s == "cancelled") {
13613 *state = OPSTATE_CANCELLED;
13614 } else {
13615 return -EINVAL;
13616 }
13617
13618 return 0;
13619 }
13620
13621 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
13622 {
13623 uint32_t s = (uint32_t)state;
13624 return store_entry(client_id, op_id, object, s, NULL, NULL);
13625 }
13626
13627 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
13628 {
13629 uint32_t s = (uint32_t)state;
13630 return store_entry(client_id, op_id, object, s, NULL, &s);
13631 }
13632
13633 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
13634 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
13635 {
13636 cct = store->ctx();
13637 cur_state = RGWOpState::OPSTATE_UNKNOWN;
13638 }
13639
13640 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
13641 last_update = real_clock::now();
13642 cur_state = state;
13643 return os.set_state(client_id, op_id, object, state);
13644 }
13645
13646 int RGWOpStateSingleOp::renew_state() {
13647 real_time now = real_clock::now();
13648
13649 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13650
13651 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13652 return 0;
13653 }
13654
13655 last_update = now;
13656 return os.renew_state(client_id, op_id, object, cur_state);
13657 }
13658
13659
13660 uint64_t RGWRados::instance_id()
13661 {
13662 return get_rados_handle()->get_instance_id();
13663 }
13664
13665 uint64_t RGWRados::next_bucket_id()
13666 {
13667 Mutex::Locker l(bucket_id_lock);
13668 return ++max_bucket_id;
13669 }
13670
13671 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
13672 {
13673 int use_cache = cct->_conf->rgw_cache_enabled;
13674 RGWRados *store = NULL;
13675 if (!use_cache) {
13676 store = new RGWRados;
13677 } else {
13678 store = new RGWCache<RGWRados>;
13679 }
13680
13681 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
13682 delete store;
13683 return NULL;
13684 }
13685
13686 return store;
13687 }
13688
13689 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13690 {
13691 RGWRados *store = NULL;
13692 store = new RGWRados;
13693
13694 store->set_context(cct);
13695
13696 if (store->init_rados() < 0) {
13697 delete store;
13698 return NULL;
13699 }
13700
13701 return store;
13702 }
13703
13704 void RGWStoreManager::close_storage(RGWRados *store)
13705 {
13706 if (!store)
13707 return;
13708
13709 store->finalize();
13710
13711 delete store;
13712 }
13713
13714 librados::Rados* RGWRados::get_rados_handle()
13715 {
13716 if (rados.size() == 1) {
13717 return &rados[0];
13718 } else {
13719 handle_lock.get_read();
13720 pthread_t id = pthread_self();
13721 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13722
13723 if (it != rados_map.end()) {
13724 handle_lock.put_read();
13725 return &rados[it->second];
13726 } else {
13727 handle_lock.put_read();
13728 handle_lock.get_write();
13729 const uint32_t handle = next_rados_handle;
13730 rados_map[id] = handle;
13731 if (++next_rados_handle == rados.size()) {
13732 next_rados_handle = 0;
13733 }
13734 handle_lock.put_write();
13735 return &rados[handle];
13736 }
13737 }
13738 }
13739
13740 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
13741 {
13742 rgw_rados_ref ref;
13743 int ret = get_raw_obj_ref(obj, &ref);
13744 if (ret < 0) {
13745 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13746 return ret;
13747 }
13748
13749 ObjectWriteOperation op;
13750 list<string> prefixes;
13751 cls_rgw_remove_obj(op, prefixes);
13752
13753 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13754 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13755 if (ret < 0) {
13756 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13757 c->release();
13758 return ret;
13759 }
13760
13761 handles.push_back(c);
13762
13763 return 0;
13764 }
13765
13766 int RGWRados::delete_obj_aio(const rgw_obj& obj,
13767 RGWBucketInfo& bucket_info, RGWObjState *astate,
13768 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
13769 {
13770 rgw_rados_ref ref;
13771 int ret = get_obj_head_ref(bucket_info, obj, &ref);
13772 if (ret < 0) {
13773 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13774 return ret;
13775 }
13776
13777 if (keep_index_consistent) {
13778 RGWRados::Bucket bop(this, bucket_info);
13779 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
13780
13781 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
13782 if (ret < 0) {
13783 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
13784 return ret;
13785 }
13786 }
13787
13788 ObjectWriteOperation op;
13789 list<string> prefixes;
13790 cls_rgw_remove_obj(op, prefixes);
13791
13792 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13793 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13794 if (ret < 0) {
13795 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13796 c->release();
13797 return ret;
13798 }
13799
13800 handles.push_back(c);
13801
13802 if (keep_index_consistent) {
13803 ret = delete_obj_index(obj);
13804 if (ret < 0) {
13805 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
13806 return ret;
13807 }
13808 }
13809 return ret;
13810 }
13811
13812 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
13813 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
13814 if (value != attrs.end()) {
13815 bufferlist::iterator bliter = value->second.begin();
13816 try {
13817 ::decode(cs_info, bliter);
13818 } catch (buffer::error& err) {
13819 return -EIO;
13820 }
13821 if (cs_info.blocks.size() == 0) {
13822 return -EIO;
13823 }
13824 if (cs_info.compression_type != "none")
13825 need_decompress = true;
13826 else
13827 need_decompress = false;
13828 return 0;
13829 } else {
13830 need_decompress = false;
13831 return 0;
13832 }
13833 }
13834