]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
update sources to v12.1.1
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1
2 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
3 // vim: ts=8 sw=2 smarttab
4
5 #include "include/compat.h"
6 #include <errno.h>
7 #include <stdlib.h>
8 #include <sys/types.h>
9 #include <boost/algorithm/string.hpp>
10
11 #include <boost/format.hpp>
12 #include <boost/optional.hpp>
13 #include <boost/utility/in_place_factory.hpp>
14
15 #include "common/ceph_json.h"
16 #include "common/utf8.h"
17
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
21 #include "common/Finisher.h"
22
23 #include "rgw_rados.h"
24 #include "rgw_cache.h"
25 #include "rgw_acl.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_metadata.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32
33 #include "cls/rgw/cls_rgw_ops.h"
34 #include "cls/rgw/cls_rgw_types.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/statelog/cls_statelog_client.h"
41 #include "cls/timeindex/cls_timeindex_client.h"
42 #include "cls/lock/cls_lock_client.h"
43 #include "cls/user/cls_user_client.h"
44
45 #include "rgw_tools.h"
46 #include "rgw_coroutine.h"
47 #include "rgw_compression.h"
48
49 #undef fork // fails to compile RGWPeriod::fork() below
50
51 #include "common/Clock.h"
52
53 #include "include/rados/librados.hpp"
54 using namespace librados;
55
56 #include <string>
57 #include <iostream>
58 #include <vector>
59 #include <atomic>
60 #include <list>
61 #include <map>
62 #include "auth/Crypto.h" // get_random_bytes()
63
64 #include "rgw_log.h"
65
66 #include "rgw_gc.h"
67 #include "rgw_lc.h"
68
69 #include "rgw_object_expirer_core.h"
70 #include "rgw_sync.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
74
75 #include "compressor/Compressor.h"
76
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_rgw
79
80 using namespace std;
81
82 static string notify_oid_prefix = "notify";
83 static string *notify_oids = NULL;
84 static string shadow_ns = "shadow";
85 static string dir_oid_prefix = ".dir.";
86 static string default_storage_pool_suffix = "rgw.buckets.data";
87 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
88 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
89 static string avail_pools = ".pools.avail";
90
91 static string zone_info_oid_prefix = "zone_info.";
92 static string zone_names_oid_prefix = "zone_names.";
93 static string region_info_oid_prefix = "region_info.";
94 static string zone_group_info_oid_prefix = "zonegroup_info.";
95 static string realm_names_oid_prefix = "realms_names.";
96 static string realm_info_oid_prefix = "realms.";
97 static string default_region_info_oid = "default.region";
98 static string default_zone_group_info_oid = "default.zonegroup";
99 static string period_info_oid_prefix = "periods.";
100 static string period_latest_epoch_info_oid = ".latest_epoch";
101 static string region_map_oid = "region_map";
102 static string zonegroup_map_oid = "zonegroup_map";
103 static string log_lock_name = "rgw_log_lock";
104 static string default_realm_info_oid = "default.realm";
105 const string default_zonegroup_name = "default";
106 const string default_zone_name = "default";
107 static string zonegroup_names_oid_prefix = "zonegroups_names.";
108 static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
109 #define RGW_USAGE_OBJ_PREFIX "usage."
110 #define FIRST_EPOCH 1
111 static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
112 static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
113 static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
114 static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116 #define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118 #define dout_subsys ceph_subsys_rgw
119
120
121 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123 {
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
133 *pool = placement.get_data_extra_pool();
134 }
135 }
136
137 return true;
138 }
139
140 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142 {
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146 }
147
148 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149 {
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156 }
157
158 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159 {
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166 }
167
168 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169 {
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
173 if (r < 0 && r != -EEXIST) {
174 return r;
175 }
176
177 r = rados->ioctx_create(pool.name.c_str(), ioctx);
178 }
179 if (r < 0) {
180 return r;
181 }
182 if (!pool.ns.empty()) {
183 ioctx.set_namespace(pool.ns);
184 }
185 return 0;
186 }
187
188 template<>
189 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
190 RWLock::WLocker wl(lock);
191 auto iter = objs_state.find(obj);
192 if (iter == objs_state.end()) {
193 return;
194 }
195 bool is_atomic = iter->second.is_atomic;
196 bool prefetch_data = iter->second.prefetch_data;
197
198 objs_state.erase(iter);
199
200 if (is_atomic || prefetch_data) {
201 auto& s = objs_state[obj];
202 s.is_atomic = is_atomic;
203 s.prefetch_data = prefetch_data;
204 }
205 }
206
207 template<>
208 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
209 RWLock::WLocker wl(lock);
210 auto iter = objs_state.find(obj);
211 if (iter == objs_state.end()) {
212 return;
213 }
214
215 objs_state.erase(iter);
216 }
217
218 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
219 encode_json("default_zonegroup", default_zonegroup, f);
220 }
221
222 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
223
224 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
225 /* backward compatability with region */
226 if (default_zonegroup.empty()) {
227 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
228 }
229 }
230
231 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
232 {
233 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
234 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
235 }
236
237 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
238 }
239
240 int RGWZoneGroup::create_default(bool old_format)
241 {
242 name = default_zonegroup_name;
243 is_master = true;
244
245 RGWZoneGroupPlacementTarget placement_target;
246 placement_target.name = "default-placement";
247 placement_targets[placement_target.name] = placement_target;
248 default_placement = "default-placement";
249
250 RGWZoneParams zone_params(default_zone_name);
251
252 int r = zone_params.init(cct, store, false);
253 if (r < 0) {
254 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
255 return r;
256 }
257
258 r = zone_params.create_default();
259 if (r < 0 && r != -EEXIST) {
260 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
261 return r;
262 } else if (r == -EEXIST) {
263 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
264 zone_params.clear_id();
265 r = zone_params.init(cct, store);
266 if (r < 0) {
267 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
268 return r;
269 }
270 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
271 << dendl;
272 }
273
274 RGWZone& default_zone = zones[zone_params.get_id()];
275 default_zone.name = zone_params.get_name();
276 default_zone.id = zone_params.get_id();
277 master_zone = default_zone.id;
278
279 r = create();
280 if (r < 0 && r != -EEXIST) {
281 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
282 return r;
283 }
284
285 if (r == -EEXIST) {
286 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
287 id.clear();
288 r = init(cct, store);
289 if (r < 0) {
290 return r;
291 }
292 }
293
294 if (old_format) {
295 name = id;
296 }
297
298 post_process_params();
299
300 return 0;
301 }
302
303 const string RGWZoneGroup::get_default_oid(bool old_region_format)
304 {
305 if (old_region_format) {
306 if (cct->_conf->rgw_default_region_info_oid.empty()) {
307 return default_region_info_oid;
308 }
309 return cct->_conf->rgw_default_region_info_oid;
310 }
311
312 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
313
314 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
315 default_oid = default_zone_group_info_oid;
316 }
317
318 default_oid += "." + realm_id;
319
320 return default_oid;
321 }
322
323 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
324 {
325 if (old_region_format) {
326 return region_info_oid_prefix;
327 }
328 return zone_group_info_oid_prefix;
329 }
330
331 const string& RGWZoneGroup::get_names_oid_prefix()
332 {
333 return zonegroup_names_oid_prefix;
334 }
335
336 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
337 return cct->_conf->rgw_zonegroup;
338 }
339
340 int RGWZoneGroup::equals(const string& other_zonegroup) const
341 {
342 if (is_master && other_zonegroup.empty())
343 return true;
344
345 return (id == other_zonegroup);
346 }
347
348 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
349 const list<string>& endpoints, const string *ptier_type,
350 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
351 {
352 auto& zone_id = zone_params.get_id();
353 auto& zone_name = zone_params.get_name();
354
355 // check for duplicate zone name on insert
356 if (!zones.count(zone_id)) {
357 for (const auto& zone : zones) {
358 if (zone.second.name == zone_name) {
359 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
360 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
361 return -EEXIST;
362 }
363 }
364 }
365
366 if (is_master) {
367 if (*is_master) {
368 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
369 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
370 }
371 master_zone = zone_params.get_id();
372 } else if (master_zone == zone_params.get_id()) {
373 master_zone.clear();
374 }
375 }
376
377 RGWZone& zone = zones[zone_params.get_id()];
378 zone.name = zone_params.get_name();
379 zone.id = zone_params.get_id();
380 if (!endpoints.empty()) {
381 zone.endpoints = endpoints;
382 }
383 if (read_only) {
384 zone.read_only = *read_only;
385 }
386 if (ptier_type) {
387 zone.tier_type = *ptier_type;
388 }
389
390 if (psync_from_all) {
391 zone.sync_from_all = *psync_from_all;
392 }
393
394 for (auto add : sync_from) {
395 zone.sync_from.insert(add);
396 }
397
398 for (auto rm : sync_from_rm) {
399 zone.sync_from.erase(rm);
400 }
401
402 post_process_params();
403
404 return update();
405 }
406
407
408 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
409 {
410 RGWZone& zone = zones[zone_params.get_id()];
411 zone.name = zone_params.get_name();
412
413 return update();
414 }
415
416 void RGWZoneGroup::post_process_params()
417 {
418 bool log_data = zones.size() > 1;
419
420 if (master_zone.empty()) {
421 map<string, RGWZone>::iterator iter = zones.begin();
422 if (iter != zones.end()) {
423 master_zone = iter->first;
424 }
425 }
426
427 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
428 RGWZone& zone = iter->second;
429 zone.log_data = log_data;
430
431 RGWZoneParams zone_params(zone.id, zone.name);
432 int ret = zone_params.init(cct, store);
433 if (ret < 0) {
434 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
435 continue;
436 }
437
438 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
439 iter != zone_params.placement_pools.end(); ++iter) {
440 const string& placement_name = iter->first;
441 if (placement_targets.find(placement_name) == placement_targets.end()) {
442 RGWZoneGroupPlacementTarget placement_target;
443 placement_target.name = placement_name;
444 placement_targets[placement_name] = placement_target;
445 }
446 }
447 }
448
449 if (default_placement.empty() && !placement_targets.empty()) {
450 default_placement = placement_targets.begin()->first;
451 }
452 }
453
454 int RGWZoneGroup::remove_zone(const std::string& zone_id)
455 {
456 map<string, RGWZone>::iterator iter = zones.find(zone_id);
457 if (iter == zones.end()) {
458 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
459 << name << dendl;
460 return -ENOENT;
461 }
462
463 zones.erase(iter);
464
465 post_process_params();
466
467 return update();
468 }
469
470 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
471 {
472 if (realm_id.empty()) {
473 /* try using default realm */
474 RGWRealm realm;
475 int ret = realm.init(cct, store);
476 if (ret < 0) {
477 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
478 return -ENOENT;
479 }
480 realm_id = realm.get_id();
481 }
482
483 return RGWSystemMetaObj::read_default_id(default_id, old_format);
484 }
485
486 int RGWZoneGroup::set_as_default(bool exclusive)
487 {
488 if (realm_id.empty()) {
489 /* try using default realm */
490 RGWRealm realm;
491 int ret = realm.init(cct, store);
492 if (ret < 0) {
493 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
494 return -EINVAL;
495 }
496 realm_id = realm.get_id();
497 }
498
499 return RGWSystemMetaObj::set_as_default(exclusive);
500 }
501
502 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
503 {
504 cct = _cct;
505 store = _store;
506
507 if (!setup_obj)
508 return 0;
509
510 if (old_format && id.empty()) {
511 id = name;
512 }
513
514 if (id.empty()) {
515 int r;
516 if (name.empty()) {
517 name = get_predefined_name(cct);
518 }
519 if (name.empty()) {
520 r = use_default(old_format);
521 if (r < 0) {
522 return r;
523 }
524 } else if (!old_format) {
525 r = read_id(name, id);
526 if (r < 0) {
527 if (r != -ENOENT) {
528 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
529 }
530 return r;
531 }
532 }
533 }
534
535 return read_info(id, old_format);
536 }
537
538 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
539 {
540 auto pool = get_pool(cct);
541 bufferlist bl;
542 RGWObjectCtx obj_ctx(store);
543 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
544 if (ret < 0)
545 return ret;
546
547 try {
548 bufferlist::iterator iter = bl.begin();
549 ::decode(default_info, iter);
550 } catch (buffer::error& err) {
551 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
552 return -EIO;
553 }
554
555 return 0;
556 }
557
558 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
559 {
560 RGWDefaultSystemMetaObjInfo default_info;
561
562 int ret = read_default(default_info, get_default_oid(old_format));
563 if (ret < 0) {
564 return ret;
565 }
566
567 default_id = default_info.default_id;
568
569 return 0;
570 }
571
572 int RGWSystemMetaObj::use_default(bool old_format)
573 {
574 return read_default_id(id, old_format);
575 }
576
577 int RGWSystemMetaObj::set_as_default(bool exclusive)
578 {
579 string oid = get_default_oid();
580
581 rgw_pool pool(get_pool(cct));
582 bufferlist bl;
583
584 RGWDefaultSystemMetaObjInfo default_info;
585 default_info.default_id = id;
586
587 ::encode(default_info, bl);
588
589 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
590 exclusive, NULL, real_time(), NULL);
591 if (ret < 0)
592 return ret;
593
594 return 0;
595 }
596
597 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
598 {
599 rgw_pool pool(get_pool(cct));
600 bufferlist bl;
601
602 string oid = get_names_oid_prefix() + obj_name;
603
604 RGWObjectCtx obj_ctx(store);
605 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
606 if (ret < 0) {
607 return ret;
608 }
609
610 RGWNameToId nameToId;
611 try {
612 bufferlist::iterator iter = bl.begin();
613 ::decode(nameToId, iter);
614 } catch (buffer::error& err) {
615 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
616 return -EIO;
617 }
618 object_id = nameToId.obj_id;
619 return 0;
620 }
621
622 int RGWSystemMetaObj::delete_obj(bool old_format)
623 {
624 rgw_pool pool(get_pool(cct));
625
626 /* check to see if obj is the default */
627 RGWDefaultSystemMetaObjInfo default_info;
628 int ret = read_default(default_info, get_default_oid(old_format));
629 if (ret < 0 && ret != -ENOENT)
630 return ret;
631 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
632 string oid = get_default_oid(old_format);
633 rgw_raw_obj default_named_obj(pool, oid);
634 ret = store->delete_system_obj(default_named_obj);
635 if (ret < 0) {
636 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
637 return ret;
638 }
639 }
640 if (!old_format) {
641 string oid = get_names_oid_prefix() + name;
642 rgw_raw_obj object_name(pool, oid);
643 ret = store->delete_system_obj(object_name);
644 if (ret < 0) {
645 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
646 return ret;
647 }
648 }
649
650 string oid = get_info_oid_prefix(old_format);
651 if (old_format) {
652 oid += name;
653 } else {
654 oid += id;
655 }
656
657 rgw_raw_obj object_id(pool, oid);
658 ret = store->delete_system_obj(object_id);
659 if (ret < 0) {
660 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
661 }
662
663 return ret;
664 }
665
666 int RGWSystemMetaObj::store_name(bool exclusive)
667 {
668 rgw_pool pool(get_pool(cct));
669 string oid = get_names_oid_prefix() + name;
670
671 RGWNameToId nameToId;
672 nameToId.obj_id = id;
673
674 bufferlist bl;
675 ::encode(nameToId, bl);
676 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
677 }
678
679 int RGWSystemMetaObj::rename(const string& new_name)
680 {
681 string new_id;
682 int ret = read_id(new_name, new_id);
683 if (!ret) {
684 return -EEXIST;
685 }
686 if (ret < 0 && ret != -ENOENT) {
687 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
688 return ret;
689 }
690 string old_name = name;
691 name = new_name;
692 ret = update();
693 if (ret < 0) {
694 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
695 return ret;
696 }
697 ret = store_name(true);
698 if (ret < 0) {
699 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
700 return ret;
701 }
702 /* delete old name */
703 rgw_pool pool(get_pool(cct));
704 string oid = get_names_oid_prefix() + old_name;
705 rgw_raw_obj old_name_obj(pool, oid);
706 ret = store->delete_system_obj(old_name_obj);
707 if (ret < 0) {
708 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
709 return ret;
710 }
711
712 return ret;
713 }
714
715 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
716 {
717 rgw_pool pool(get_pool(cct));
718
719 bufferlist bl;
720
721 string oid = get_info_oid_prefix(old_format) + obj_id;
722
723 RGWObjectCtx obj_ctx(store);
724 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
725 if (ret < 0) {
726 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
727 return ret;
728 }
729
730 try {
731 bufferlist::iterator iter = bl.begin();
732 ::decode(*this, iter);
733 } catch (buffer::error& err) {
734 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
735 return -EIO;
736 }
737
738 return 0;
739 }
740
741 int RGWSystemMetaObj::read()
742 {
743 int ret = read_id(name, id);
744 if (ret < 0) {
745 return ret;
746 }
747
748 return read_info(id);
749 }
750
751 int RGWSystemMetaObj::create(bool exclusive)
752 {
753 int ret;
754
755 /* check to see the name is not used */
756 ret = read_id(name, id);
757 if (exclusive && ret == 0) {
758 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
759 return -EEXIST;
760 } else if ( ret < 0 && ret != -ENOENT) {
761 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
762 return ret;
763 }
764
765 if (id.empty()) {
766 /* create unique id */
767 uuid_d new_uuid;
768 char uuid_str[37];
769 new_uuid.generate_random();
770 new_uuid.print(uuid_str);
771 id = uuid_str;
772 }
773
774 ret = store_info(exclusive);
775 if (ret < 0) {
776 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
777 return ret;
778 }
779
780 return store_name(exclusive);
781 }
782
783 int RGWSystemMetaObj::store_info(bool exclusive)
784 {
785 rgw_pool pool(get_pool(cct));
786
787 string oid = get_info_oid_prefix() + id;
788
789 bufferlist bl;
790 ::encode(*this, bl);
791 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
792 }
793
794 int RGWSystemMetaObj::write(bool exclusive)
795 {
796 int ret = store_info(exclusive);
797 if (ret < 0) {
798 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
799 return ret;
800 }
801 ret = store_name(exclusive);
802 if (ret < 0) {
803 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
804 return ret;
805 }
806 return 0;
807 }
808
809
810 const string& RGWRealm::get_predefined_name(CephContext *cct) {
811 return cct->_conf->rgw_realm;
812 }
813
814 int RGWRealm::create(bool exclusive)
815 {
816 int ret = RGWSystemMetaObj::create(exclusive);
817 if (ret < 0) {
818 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
819 return ret;
820 }
821 // create the control object for watch/notify
822 ret = create_control(exclusive);
823 if (ret < 0) {
824 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
825 return ret;
826 }
827 RGWPeriod period;
828 if (current_period.empty()) {
829 /* create new period for the realm */
830 ret = period.init(cct, store, id, name, false);
831 if (ret < 0 ) {
832 return ret;
833 }
834 ret = period.create(true);
835 if (ret < 0) {
836 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
837 return ret;
838 }
839 } else {
840 period = RGWPeriod(current_period, 0);
841 int ret = period.init(cct, store, id, name);
842 if (ret < 0) {
843 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
844 return ret;
845 }
846 }
847 ret = set_current_period(period);
848 if (ret < 0) {
849 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
850 return ret;
851 }
852 // try to set as default. may race with another create, so pass exclusive=true
853 // so we don't override an existing default
854 ret = set_as_default(true);
855 if (ret < 0 && ret != -EEXIST) {
856 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
857 }
858
859 return 0;
860 }
861
862 int RGWRealm::delete_obj()
863 {
864 int ret = RGWSystemMetaObj::delete_obj();
865 if (ret < 0) {
866 return ret;
867 }
868 return delete_control();
869 }
870
871 int RGWRealm::create_control(bool exclusive)
872 {
873 auto pool = rgw_pool{get_pool(cct)};
874 auto oid = get_control_oid();
875 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
876 nullptr, real_time(), nullptr);
877 }
878
879 int RGWRealm::delete_control()
880 {
881 auto pool = rgw_pool{get_pool(cct)};
882 auto obj = rgw_raw_obj{pool, get_control_oid()};
883 return store->delete_system_obj(obj);
884 }
885
886 rgw_pool RGWRealm::get_pool(CephContext *cct)
887 {
888 if (cct->_conf->rgw_realm_root_pool.empty()) {
889 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
890 }
891 return rgw_pool(cct->_conf->rgw_realm_root_pool);
892 }
893
894 const string RGWRealm::get_default_oid(bool old_format)
895 {
896 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
897 return default_realm_info_oid;
898 }
899 return cct->_conf->rgw_default_realm_info_oid;
900 }
901
902 const string& RGWRealm::get_names_oid_prefix()
903 {
904 return realm_names_oid_prefix;
905 }
906
907 const string& RGWRealm::get_info_oid_prefix(bool old_format)
908 {
909 return realm_info_oid_prefix;
910 }
911
912 int RGWRealm::set_current_period(RGWPeriod& period)
913 {
914 // update realm epoch to match the period's
915 if (epoch > period.get_realm_epoch()) {
916 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
917 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
918 return -EINVAL;
919 }
920 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
921 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
922 << period.get_realm_epoch() << ", but different period id "
923 << period.get_id() << " != " << current_period << dendl;
924 return -EINVAL;
925 }
926
927 epoch = period.get_realm_epoch();
928 current_period = period.get_id();
929
930 int ret = update();
931 if (ret < 0) {
932 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
933 return ret;
934 }
935
936 ret = period.reflect();
937 if (ret < 0) {
938 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
939 return ret;
940 }
941
942 return 0;
943 }
944
945 string RGWRealm::get_control_oid()
946 {
947 return get_info_oid_prefix() + id + ".control";
948 }
949
950 int RGWRealm::notify_zone(bufferlist& bl)
951 {
952 // open a context on the realm's pool
953 rgw_pool pool{get_pool(cct)};
954 librados::IoCtx ctx;
955 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
956 if (r < 0) {
957 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
958 return r;
959 }
960 // send a notify on the realm object
961 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
962 if (r < 0) {
963 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
964 return r;
965 }
966 return 0;
967 }
968
969 int RGWRealm::notify_new_period(const RGWPeriod& period)
970 {
971 bufferlist bl;
972 // push the period to dependent zonegroups/zones
973 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
974 ::encode(period, bl);
975 // reload the gateway with the new period
976 ::encode(RGWRealmNotify::Reload, bl);
977
978 return notify_zone(bl);
979 }
980
981 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
982 {
983 if (realm_id.empty()) {
984 return "period_config.default";
985 }
986 return "period_config." + realm_id;
987 }
988
989 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
990 {
991 const auto& pool_name = cct->_conf->rgw_period_root_pool;
992 if (pool_name.empty()) {
993 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
994 }
995 return {pool_name};
996 }
997
998 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
999 {
1000 RGWObjectCtx obj_ctx(store);
1001 const auto& pool = get_pool(store->ctx());
1002 const auto& oid = get_oid(realm_id);
1003 bufferlist bl;
1004
1005 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1006 if (ret < 0) {
1007 return ret;
1008 }
1009 try {
1010 bufferlist::iterator iter = bl.begin();
1011 ::decode(*this, iter);
1012 } catch (buffer::error& err) {
1013 return -EIO;
1014 }
1015 return 0;
1016 }
1017
1018 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1019 {
1020 const auto& pool = get_pool(store->ctx());
1021 const auto& oid = get_oid(realm_id);
1022 bufferlist bl;
1023 ::encode(*this, bl);
1024 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1025 false, nullptr, real_time(), nullptr);
1026 }
1027
1028 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1029 const string& period_realm_name, bool setup_obj)
1030 {
1031 cct = _cct;
1032 store = _store;
1033 realm_id = period_realm_id;
1034 realm_name = period_realm_name;
1035
1036 if (!setup_obj)
1037 return 0;
1038
1039 return init(_cct, _store, setup_obj);
1040 }
1041
1042
1043 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1044 {
1045 cct = _cct;
1046 store = _store;
1047
1048 if (!setup_obj)
1049 return 0;
1050
1051 if (id.empty()) {
1052 RGWRealm realm(realm_id, realm_name);
1053 int ret = realm.init(cct, store);
1054 if (ret < 0) {
1055 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1056 cpp_strerror(-ret) << dendl;
1057 return ret;
1058 }
1059 id = realm.get_current_period();
1060 realm_id = realm.get_id();
1061 }
1062
1063 if (!epoch) {
1064 int ret = use_latest_epoch();
1065 if (ret < 0) {
1066 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1067 << " : " << cpp_strerror(-ret) << dendl;
1068 return ret;
1069 }
1070 }
1071
1072 return read_info();
1073 }
1074
1075
1076 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1077 map<string, RGWZoneGroup>::const_iterator iter;
1078 if (!zonegroup_id.empty()) {
1079 iter = period_map.zonegroups.find(zonegroup_id);
1080 } else {
1081 iter = period_map.zonegroups.find("default");
1082 }
1083 if (iter != period_map.zonegroups.end()) {
1084 zonegroup = iter->second;
1085 return 0;
1086 }
1087
1088 return -ENOENT;
1089 }
1090
1091 const string& RGWPeriod::get_latest_epoch_oid()
1092 {
1093 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1094 return period_latest_epoch_info_oid;
1095 }
1096 return cct->_conf->rgw_period_latest_epoch_info_oid;
1097 }
1098
1099 const string& RGWPeriod::get_info_oid_prefix()
1100 {
1101 return period_info_oid_prefix;
1102 }
1103
1104 const string RGWPeriod::get_period_oid_prefix()
1105 {
1106 return get_info_oid_prefix() + id;
1107 }
1108
1109 const string RGWPeriod::get_period_oid()
1110 {
1111 std::ostringstream oss;
1112 oss << get_period_oid_prefix();
1113 // skip the epoch for the staging period
1114 if (id != get_staging_id(realm_id))
1115 oss << "." << epoch;
1116 return oss.str();
1117 }
1118
1119 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1120 RGWObjVersionTracker *objv)
1121 {
1122 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1123
1124 rgw_pool pool(get_pool(cct));
1125 bufferlist bl;
1126 RGWObjectCtx obj_ctx(store);
1127 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
1128 if (ret < 0) {
1129 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1130 return ret;
1131 }
1132 try {
1133 bufferlist::iterator iter = bl.begin();
1134 ::decode(info, iter);
1135 } catch (buffer::error& err) {
1136 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1137 return -EIO;
1138 }
1139
1140 return 0;
1141 }
1142
1143 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1144 {
1145 RGWPeriodLatestEpochInfo info;
1146
1147 int ret = read_latest_epoch(info);
1148 if (ret < 0) {
1149 return ret;
1150 }
1151
1152 latest_epoch = info.epoch;
1153
1154 return 0;
1155 }
1156
1157 int RGWPeriod::use_latest_epoch()
1158 {
1159 RGWPeriodLatestEpochInfo info;
1160 int ret = read_latest_epoch(info);
1161 if (ret < 0) {
1162 return ret;
1163 }
1164
1165 epoch = info.epoch;
1166
1167 return 0;
1168 }
1169
1170 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1171 RGWObjVersionTracker *objv)
1172 {
1173 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1174
1175 rgw_pool pool(get_pool(cct));
1176 bufferlist bl;
1177
1178 RGWPeriodLatestEpochInfo info;
1179 info.epoch = epoch;
1180
1181 ::encode(info, bl);
1182
1183 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1184 exclusive, objv, real_time(), nullptr);
1185 }
1186
1187 int RGWPeriod::update_latest_epoch(epoch_t epoch)
1188 {
1189 static constexpr int MAX_RETRIES = 20;
1190
1191 for (int i = 0; i < MAX_RETRIES; i++) {
1192 RGWPeriodLatestEpochInfo info;
1193 RGWObjVersionTracker objv;
1194 bool exclusive = false;
1195
1196 // read existing epoch
1197 int r = read_latest_epoch(info, &objv);
1198 if (r == -ENOENT) {
1199 // use an exclusive create to set the epoch atomically
1200 exclusive = true;
1201 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1202 << " for period=" << id << dendl;
1203 } else if (r < 0) {
1204 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1205 return r;
1206 } else if (epoch <= info.epoch) {
1207 r = -EEXIST; // fail with EEXIST if epoch is not newer
1208 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1209 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1210 return r;
1211 } else {
1212 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1213 << " -> " << epoch << " on period=" << id << dendl;
1214 }
1215
1216 r = set_latest_epoch(epoch, exclusive, &objv);
1217 if (r == -EEXIST) {
1218 continue; // exclusive create raced with another update, retry
1219 } else if (r == -ECANCELED) {
1220 continue; // write raced with a conflicting version, retry
1221 }
1222 if (r < 0) {
1223 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1224 return r;
1225 }
1226 return 0; // return success
1227 }
1228
1229 return -ECANCELED; // fail after max retries
1230 }
1231
1232 int RGWPeriod::delete_obj()
1233 {
1234 rgw_pool pool(get_pool(cct));
1235
1236 // delete the object for each period epoch
1237 for (epoch_t e = 1; e <= epoch; e++) {
1238 RGWPeriod p{get_id(), e};
1239 rgw_raw_obj oid{pool, p.get_period_oid()};
1240 int ret = store->delete_system_obj(oid);
1241 if (ret < 0) {
1242 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1243 << ": " << cpp_strerror(-ret) << dendl;
1244 }
1245 }
1246
1247 // delete the .latest_epoch object
1248 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1249 int ret = store->delete_system_obj(oid);
1250 if (ret < 0) {
1251 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1252 << ": " << cpp_strerror(-ret) << dendl;
1253 }
1254 return ret;
1255 }
1256
1257 int RGWPeriod::read_info()
1258 {
1259 rgw_pool pool(get_pool(cct));
1260
1261 bufferlist bl;
1262
1263 RGWObjectCtx obj_ctx(store);
1264 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1265 if (ret < 0) {
1266 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1267 return ret;
1268 }
1269
1270 try {
1271 bufferlist::iterator iter = bl.begin();
1272 ::decode(*this, iter);
1273 } catch (buffer::error& err) {
1274 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1275 return -EIO;
1276 }
1277
1278 return 0;
1279 }
1280
1281 int RGWPeriod::create(bool exclusive)
1282 {
1283 int ret;
1284
1285 /* create unique id */
1286 uuid_d new_uuid;
1287 char uuid_str[37];
1288 new_uuid.generate_random();
1289 new_uuid.print(uuid_str);
1290 id = uuid_str;
1291
1292 epoch = FIRST_EPOCH;
1293
1294 period_map.id = id;
1295
1296 ret = store_info(exclusive);
1297 if (ret < 0) {
1298 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1299 return ret;
1300 }
1301
1302 ret = set_latest_epoch(epoch);
1303 if (ret < 0) {
1304 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1305 }
1306
1307 return ret;
1308 }
1309
1310 int RGWPeriod::store_info(bool exclusive)
1311 {
1312 rgw_pool pool(get_pool(cct));
1313
1314 string oid = get_period_oid();
1315 bufferlist bl;
1316 ::encode(*this, bl);
1317
1318 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1319 exclusive, NULL, real_time(), NULL);
1320 }
1321
1322 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1323 {
1324 if (cct->_conf->rgw_period_root_pool.empty()) {
1325 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1326 }
1327 return rgw_pool(cct->_conf->rgw_period_root_pool);
1328 }
1329
1330 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1331 {
1332 if (zonegroup.realm_id != realm_id) {
1333 return 0;
1334 }
1335 int ret = period_map.update(zonegroup, cct);
1336 if (ret < 0) {
1337 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1338 return ret;
1339 }
1340
1341 return store_info(false);
1342 }
1343
1344 int RGWPeriod::update()
1345 {
1346 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1347 list<string> zonegroups;
1348 int ret = store->list_zonegroups(zonegroups);
1349 if (ret < 0) {
1350 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1351 return ret;
1352 }
1353
1354 // clear zone short ids of removed zones. period_map.update() will add the
1355 // remaining zones back
1356 period_map.short_zone_ids.clear();
1357
1358 for (auto& iter : zonegroups) {
1359 RGWZoneGroup zg(string(), iter);
1360 ret = zg.init(cct, store);
1361 if (ret < 0) {
1362 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1363 continue;
1364 }
1365
1366 if (zg.realm_id != realm_id) {
1367 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1368 continue;
1369 }
1370
1371 if (zg.master_zone.empty()) {
1372 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1373 return -EINVAL;
1374 }
1375
1376 if (zg.is_master_zonegroup()) {
1377 master_zonegroup = zg.get_id();
1378 master_zone = zg.master_zone;
1379 }
1380
1381 int ret = period_map.update(zg, cct);
1382 if (ret < 0) {
1383 return ret;
1384 }
1385 }
1386
1387 ret = period_config.read(store, realm_id);
1388 if (ret < 0 && ret != -ENOENT) {
1389 ldout(cct, 0) << "ERROR: failed to read period config: "
1390 << cpp_strerror(ret) << dendl;
1391 return ret;
1392 }
1393 return 0;
1394 }
1395
1396 int RGWPeriod::reflect()
1397 {
1398 for (auto& iter : period_map.zonegroups) {
1399 RGWZoneGroup& zg = iter.second;
1400 zg.reinit_instance(cct, store);
1401 int r = zg.write(false);
1402 if (r < 0) {
1403 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1404 return r;
1405 }
1406 if (zg.is_master_zonegroup()) {
1407 // set master as default if no default exists
1408 r = zg.set_as_default(true);
1409 if (r == 0) {
1410 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1411 << " as the default" << dendl;
1412 }
1413 }
1414 }
1415
1416 int r = period_config.write(store, realm_id);
1417 if (r < 0) {
1418 ldout(cct, 0) << "ERROR: failed to store period config: "
1419 << cpp_strerror(-r) << dendl;
1420 return r;
1421 }
1422 return 0;
1423 }
1424
1425 void RGWPeriod::fork()
1426 {
1427 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1428 predecessor_uuid = id;
1429 id = get_staging_id(realm_id);
1430 period_map.reset();
1431 realm_epoch++;
1432 }
1433
1434 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1435 {
1436 // initialize a sync status manager to read the status
1437 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1438 int r = mgr.init();
1439 if (r < 0) {
1440 return r;
1441 }
1442 r = mgr.read_sync_status(sync_status);
1443 mgr.stop();
1444 return r;
1445 }
1446
1447 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1448 std::ostream& error_stream,
1449 bool force_if_stale)
1450 {
1451 rgw_meta_sync_status status;
1452 int r = read_sync_status(store, &status);
1453 if (r < 0) {
1454 ldout(cct, 0) << "period failed to read sync status: "
1455 << cpp_strerror(-r) << dendl;
1456 return r;
1457 }
1458
1459 std::vector<std::string> markers;
1460
1461 const auto current_epoch = current_period.get_realm_epoch();
1462 if (current_epoch != status.sync_info.realm_epoch) {
1463 // no sync status markers for the current period
1464 assert(current_epoch > status.sync_info.realm_epoch);
1465 const int behind = current_epoch - status.sync_info.realm_epoch;
1466 if (!force_if_stale && current_epoch > 1) {
1467 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1468 "the current master zone in metadata sync. If this zone is promoted "
1469 "to master, any metadata changes during that time are likely to "
1470 "be lost.\n"
1471 "Waiting for this zone to catch up on metadata sync (see "
1472 "'radosgw-admin sync status') is recommended.\n"
1473 "To promote this zone to master anyway, add the flag "
1474 "--yes-i-really-mean-it." << std::endl;
1475 return -EINVAL;
1476 }
1477 // empty sync status markers - other zones will skip this period during
1478 // incremental metadata sync
1479 markers.resize(status.sync_info.num_shards);
1480 } else {
1481 markers.reserve(status.sync_info.num_shards);
1482 for (auto& i : status.sync_markers) {
1483 auto& marker = i.second;
1484 // filter out markers from other periods
1485 if (marker.realm_epoch != current_epoch) {
1486 marker.marker.clear();
1487 }
1488 markers.emplace_back(std::move(marker.marker));
1489 }
1490 }
1491
1492 std::swap(sync_status, markers);
1493 return 0;
1494 }
1495
/**
 * Commit this (staging) period on top of @p current_period.
 *
 * Validation, in order (each failure writes an explanation to
 * @p error_stream and returns -EINVAL):
 *  - this gateway's zone id must equal the period's master_zone;
 *  - predecessor_uuid must equal current_period's id;
 *  - realm_epoch must be current_period's realm epoch + 1.
 *
 * Two outcomes follow:
 *  - master zone changed: the sync status is captured, a new period object
 *    is created with a fresh id, it is made the realm's current period and
 *    a new-period notification is sent;
 *  - master zone unchanged: the period is stored as the next epoch of
 *    current_period (same id, predecessor and realm epoch), the latest
 *    epoch is advanced, local objects are updated via reflect() and a
 *    notification is sent. -EEXIST from update_latest_epoch() is treated as
 *    success and returns before reflect()/notify.
 *
 * @param realm           realm whose current period is updated / notified
 * @param current_period  the period this one is based on
 * @param error_stream    receives user-facing validation errors
 * @param force_if_stale  forwarded to update_sync_status()
 * @return 0 on success, -EINVAL on validation failure, or the negative
 *         error of the failing step
 */
1496 int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1497 std::ostream& error_stream, bool force_if_stale)
1498 {
1499 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1500 // gateway must be in the master zone to commit
1501 if (master_zone != store->get_zone_params().get_id()) {
1502 error_stream << "Cannot commit period on zone "
1503 << store->get_zone_params().get_id() << ", it must be sent to "
1504 "the period's master zone " << master_zone << '.' << std::endl;
1505 return -EINVAL;
1506 }
1507 // period predecessor must match current period
1508 if (predecessor_uuid != current_period.get_id()) {
1509 error_stream << "Period predecessor " << predecessor_uuid
1510 << " does not match current period " << current_period.get_id()
1511 << ". Use 'period pull' to get the latest period from the master, "
1512 "reapply your changes, and try again." << std::endl;
1513 return -EINVAL;
1514 }
1515 // realm epoch must be 1 greater than current period
1516 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1517 error_stream << "Period's realm epoch " << realm_epoch
1518 << " does not come directly after current realm epoch "
1519 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1520 "latest realm and period from the master zone, reapply your changes, "
1521 "and try again." << std::endl;
1522 return -EINVAL;
1523 }
1524 // did the master zone change?
1525 if (master_zone != current_period.get_master_zone()) {
1526 // store the current metadata sync status in the period
1527 int r = update_sync_status(current_period, error_stream, force_if_stale);
1528 if (r < 0) {
1529 ldout(cct, 0) << "failed to update metadata sync status: "
1530 << cpp_strerror(-r) << dendl;
1531 return r;
1532 }
1533 // create an object with a new period id
1534 r = create(true);
1535 if (r < 0) {
1536 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1537 return r;
1538 }
1539 // set as current period
1540 r = realm.set_current_period(*this);
1541 if (r < 0) {
1542 ldout(cct, 0) << "failed to update realm's current period: "
1543 << cpp_strerror(-r) << dendl;
1544 return r;
1545 }
1546 ldout(cct, 4) << "Promoted to master zone and committed new period "
1547 << id << dendl;
1548 realm.notify_new_period(*this);
1549 return 0;
1550 }
// master zone unchanged from here on: commit as a new epoch of the
// current period rather than as a new period
1551 // period must be based on current epoch
1552 if (epoch != current_period.get_epoch()) {
1553 error_stream << "Period epoch " << epoch << " does not match "
1554 "predecessor epoch " << current_period.get_epoch()
1555 << ". Use 'period pull' to get the latest epoch from the master zone, "
1556 "reapply your changes, and try again." << std::endl;
1557 return -EINVAL;
1558 }
1559 // set period as next epoch
1560 set_id(current_period.get_id());
1561 set_epoch(current_period.get_epoch() + 1);
1562 set_predecessor(current_period.get_predecessor());
// undo the realm_epoch increment checked above: this is not a new period
1563 realm_epoch = current_period.get_realm_epoch();
1564 // write the period to rados
1565 int r = store_info(false);
1566 if (r < 0) {
1567 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1568 return r;
1569 }
1570 // set as latest epoch
1571 r = update_latest_epoch(epoch);
1572 if (r == -EEXIST) {
1573 // already have this epoch (or a more recent one)
// note: returns success without calling reflect() or notifying
1574 return 0;
1575 }
1576 if (r < 0) {
1577 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1578 return r;
1579 }
1580 r = reflect();
1581 if (r < 0) {
1582 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1583 return r;
1584 }
1585 ldout(cct, 4) << "Committed new epoch " << epoch
1586 << " for period " << id << dendl;
1587 realm.notify_new_period(*this);
1588 return 0;
1589 }
1590
1591 int RGWZoneParams::create_default(bool old_format)
1592 {
1593 name = default_zone_name;
1594
1595 int r = create();
1596 if (r < 0) {
1597 return r;
1598 }
1599
1600 if (old_format) {
1601 name = id;
1602 }
1603
1604 return r;
1605 }
1606
1607
1608 int get_zones_pool_set(CephContext* cct,
1609 RGWRados* store,
1610 const list<string>& zones,
1611 const string& my_zone_id,
1612 set<rgw_pool>& pool_names)
1613 {
1614 for(auto const& iter : zones) {
1615 RGWZoneParams zone(iter);
1616 int r = zone.init(cct, store);
1617 if (r < 0) {
1618 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1619 return r;
1620 }
1621 if (zone.get_id() != my_zone_id) {
1622 pool_names.insert(zone.domain_root);
1623 pool_names.insert(zone.metadata_heap);
1624 pool_names.insert(zone.control_pool);
1625 pool_names.insert(zone.gc_pool);
1626 pool_names.insert(zone.log_pool);
1627 pool_names.insert(zone.intent_log_pool);
1628 pool_names.insert(zone.usage_log_pool);
1629 pool_names.insert(zone.user_keys_pool);
1630 pool_names.insert(zone.user_email_pool);
1631 pool_names.insert(zone.user_swift_pool);
1632 pool_names.insert(zone.user_uid_pool);
1633 pool_names.insert(zone.roles_pool);
1634 pool_names.insert(zone.reshard_pool);
1635 for(auto& iter : zone.placement_pools) {
1636 pool_names.insert(iter.second.index_pool);
1637 pool_names.insert(iter.second.data_pool);
1638 pool_names.insert(iter.second.data_extra_pool);
1639 }
1640 }
1641 }
1642 return 0;
1643 }
1644
1645 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1646 const string& default_prefix,
1647 const string& default_suffix,
1648 const rgw_pool& suggested_pool)
1649 {
1650 string suggested_name = suggested_pool.to_str();
1651
1652 string prefix = default_prefix;
1653 string suffix = default_suffix;
1654
1655 if (!suggested_pool.empty()) {
1656 prefix = suggested_name.substr(0, suggested_name.find("."));
1657 suffix = suggested_name.substr(prefix.length());
1658 }
1659
1660 rgw_pool pool(prefix + suffix);
1661
1662 if (pools.find(pool) == pools.end()) {
1663 return pool;
1664 } else {
1665 while(true) {
1666 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1667 if (pools.find(pool) == pools.end()) {
1668 return pool;
1669 }
1670 }
1671 }
1672 }
1673
1674 int RGWZoneParams::fix_pool_names()
1675 {
1676
1677 list<string> zones;
1678 int r = store->list_zones(zones);
1679 if (r < 0) {
1680 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1681 }
1682
1683 set<rgw_pool> pools;
1684 r = get_zones_pool_set(cct, store, zones, id, pools);
1685 if (r < 0) {
1686 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1687 return r;
1688 }
1689
1690 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1691 if (!metadata_heap.name.empty()) {
1692 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1693 }
1694 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1695 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1696 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1697 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1698 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1699 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1700 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1701 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1702 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1703 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1704 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1705 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1706
1707 for(auto& iter : placement_pools) {
1708 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1709 iter.second.index_pool);
1710 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1711 iter.second.data_pool);
1712 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1713 iter.second.data_extra_pool);
1714 }
1715
1716 return 0;
1717 }
1718
1719 int RGWZoneParams::create(bool exclusive)
1720 {
1721 /* check for old pools config */
1722 rgw_raw_obj obj(domain_root, avail_pools);
1723 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1724 if (r < 0) {
1725 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1726 /* a new system, let's set new placement info */
1727 RGWZonePlacementInfo default_placement;
1728 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1729 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1730 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1731 placement_pools["default-placement"] = default_placement;
1732 }
1733
1734 r = fix_pool_names();
1735 if (r < 0) {
1736 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1737 return r;
1738 }
1739
1740 r = RGWSystemMetaObj::create(exclusive);
1741 if (r < 0) {
1742 return r;
1743 }
1744
1745 // try to set as default. may race with another create, so pass exclusive=true
1746 // so we don't override an existing default
1747 r = set_as_default(true);
1748 if (r < 0 && r != -EEXIST) {
1749 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1750 }
1751
1752 return 0;
1753 }
1754
1755 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1756 {
1757 if (cct->_conf->rgw_zone_root_pool.empty()) {
1758 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1759 }
1760
1761 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1762 }
1763
1764 const string RGWZoneParams::get_default_oid(bool old_format)
1765 {
1766 if (old_format) {
1767 return cct->_conf->rgw_default_zone_info_oid;
1768 }
1769
1770 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1771 }
1772
1773 const string& RGWZoneParams::get_names_oid_prefix()
1774 {
1775 return zone_names_oid_prefix;
1776 }
1777
1778 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1779 {
1780 return zone_info_oid_prefix;
1781 }
1782
1783 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1784 return cct->_conf->rgw_zone;
1785 }
1786
1787 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1788 {
1789 if (name.empty()) {
1790 name = cct->_conf->rgw_zone;
1791 }
1792
1793 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1794 }
1795
1796 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1797 {
1798 if (realm_id.empty()) {
1799 /* try using default realm */
1800 RGWRealm realm;
1801 int ret = realm.init(cct, store);
1802 if (ret < 0) {
1803 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1804 return -ENOENT;
1805 }
1806 realm_id = realm.get_id();
1807 }
1808
1809 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1810 }
1811
1812
1813 int RGWZoneParams::set_as_default(bool exclusive)
1814 {
1815 if (realm_id.empty()) {
1816 /* try using default realm */
1817 RGWRealm realm;
1818 int ret = realm.init(cct, store);
1819 if (ret < 0) {
1820 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1821 return -EINVAL;
1822 }
1823 realm_id = realm.get_id();
1824 }
1825
1826 return RGWSystemMetaObj::set_as_default(exclusive);
1827 }
1828
1829 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1830 {
1831 static const std::string NONE{"none"};
1832 auto p = placement_pools.find(placement_rule);
1833 if (p == placement_pools.end()) {
1834 return NONE;
1835 }
1836 const auto& type = p->second.compression_type;
1837 return !type.empty() ? type : NONE;
1838 }
1839
// Serialise the period map into bl. Fields are written in a fixed order:
// id, zonegroups, master_zonegroup, short_zone_ids. decode() reads them back
// in the same order and only reads short_zone_ids when struct_v >= 2.
1840 void RGWPeriodMap::encode(bufferlist& bl) const {
1841 ENCODE_START(2, 1, bl);
1842 ::encode(id, bl);
1843 ::encode(zonegroups, bl);
1844 ::encode(master_zonegroup, bl);
1845 ::encode(short_zone_ids, bl);
1846 ENCODE_FINISH(bl);
1847 }
1848
1849 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1850 DECODE_START(2, bl);
1851 ::decode(id, bl);
1852 ::decode(zonegroups, bl);
1853 ::decode(master_zonegroup, bl);
1854 if (struct_v >= 2) {
1855 ::decode(short_zone_ids, bl);
1856 }
1857 DECODE_FINISH(bl);
1858
1859 zonegroups_by_api.clear();
1860 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1861 iter != zonegroups.end(); ++iter) {
1862 RGWZoneGroup& zonegroup = iter->second;
1863 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1864 if (zonegroup.is_master_zonegroup()) {
1865 master_zonegroup = zonegroup.get_id();
1866 }
1867 }
1868 }
1869
1870 // run an MD5 hash on the zone_id and return the first 32 bits
1871 static uint32_t gen_short_zone_id(const std::string zone_id)
1872 {
1873 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1874 MD5 hash;
1875 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1876 hash.Final(md5);
1877
1878 uint32_t short_id;
1879 memcpy((char *)&short_id, md5, sizeof(short_id));
1880 return std::max(short_id, 1u);
1881 }
1882
1883 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1884 {
1885 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1886 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1887 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1888 return -EINVAL;
1889 }
1890 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1891 if (iter != zonegroups.end()) {
1892 RGWZoneGroup& old_zonegroup = iter->second;
1893 if (!old_zonegroup.api_name.empty()) {
1894 zonegroups_by_api.erase(old_zonegroup.api_name);
1895 }
1896 }
1897 zonegroups[zonegroup.get_id()] = zonegroup;
1898
1899 if (!zonegroup.api_name.empty()) {
1900 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1901 }
1902
1903 if (zonegroup.is_master_zonegroup()) {
1904 master_zonegroup = zonegroup.get_id();
1905 } else if (master_zonegroup == zonegroup.get_id()) {
1906 master_zonegroup = "";
1907 }
1908
1909 for (auto& i : zonegroup.zones) {
1910 auto& zone = i.second;
1911 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1912 continue;
1913 }
1914 // calculate the zone's short id
1915 uint32_t short_id = gen_short_zone_id(zone.id);
1916
1917 // search for an existing zone with the same short id
1918 for (auto& s : short_zone_ids) {
1919 if (s.second == short_id) {
1920 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1921 << ") generates the same short_zone_id " << short_id
1922 << " as existing zone id " << s.first << dendl;
1923 return -EEXIST;
1924 }
1925 }
1926
1927 short_zone_ids[zone.id] = short_id;
1928 }
1929
1930 return 0;
1931 }
1932
1933 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1934 {
1935 auto i = short_zone_ids.find(zone_id);
1936 if (i == short_zone_ids.end()) {
1937 return 0;
1938 }
1939 return i->second;
1940 }
1941
1942 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1943 {
1944
1945 RGWPeriod period;
1946 int ret = period.init(cct, store);
1947 if (ret < 0) {
1948 cerr << "failed to read current period info: " << cpp_strerror(ret);
1949 return ret;
1950 }
1951
1952 bucket_quota = period.get_config().bucket_quota;
1953 user_quota = period.get_config().user_quota;
1954 zonegroups = period.get_map().zonegroups;
1955 zonegroups_by_api = period.get_map().zonegroups_by_api;
1956 master_zonegroup = period.get_map().master_zonegroup;
1957
1958 return 0;
1959 }
1960
// Serialise the region map into bl. Fields are written in a fixed order:
// regions, master_region, bucket_quota, user_quota. decode() reads
// bucket_quota only when struct_v >= 2 and user_quota only when
// struct_v >= 3.
1961 void RGWRegionMap::encode(bufferlist& bl) const {
1962 ENCODE_START( 3, 1, bl);
1963 ::encode(regions, bl);
1964 ::encode(master_region, bl);
1965 ::encode(bucket_quota, bl);
1966 ::encode(user_quota, bl);
1967 ENCODE_FINISH(bl);
1968 }
1969
1970 void RGWRegionMap::decode(bufferlist::iterator& bl) {
1971 DECODE_START(3, bl);
1972 ::decode(regions, bl);
1973 ::decode(master_region, bl);
1974 if (struct_v >= 2)
1975 ::decode(bucket_quota, bl);
1976 if (struct_v >= 3)
1977 ::decode(user_quota, bl);
1978 DECODE_FINISH(bl);
1979 }
1980
// Serialise the zonegroup map into bl. Fields are written in a fixed order:
// zonegroups, master_zonegroup, bucket_quota, user_quota. decode() reads
// bucket_quota only when struct_v >= 2 and user_quota only when
// struct_v >= 3. zonegroups_by_api is not encoded; decode() rebuilds it.
1981 void RGWZoneGroupMap::encode(bufferlist& bl) const {
1982 ENCODE_START( 3, 1, bl);
1983 ::encode(zonegroups, bl);
1984 ::encode(master_zonegroup, bl);
1985 ::encode(bucket_quota, bl);
1986 ::encode(user_quota, bl);
1987 ENCODE_FINISH(bl);
1988 }
1989
1990 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
1991 DECODE_START(3, bl);
1992 ::decode(zonegroups, bl);
1993 ::decode(master_zonegroup, bl);
1994 if (struct_v >= 2)
1995 ::decode(bucket_quota, bl);
1996 if (struct_v >= 3)
1997 ::decode(user_quota, bl);
1998 DECODE_FINISH(bl);
1999
2000 zonegroups_by_api.clear();
2001 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2002 iter != zonegroups.end(); ++iter) {
2003 RGWZoneGroup& zonegroup = iter->second;
2004 zonegroups_by_api[zonegroup.api_name] = zonegroup;
2005 if (zonegroup.is_master_zonegroup()) {
2006 master_zonegroup = zonegroup.get_name();
2007 }
2008 }
2009 }
2010
2011 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2012 {
2013 obj_version *check_objv = version_for_check();
2014
2015 if (check_objv) {
2016 cls_version_check(*op, *check_objv, VER_COND_EQ);
2017 }
2018
2019 cls_version_read(*op, &read_version);
2020 }
2021
2022 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2023 {
2024 obj_version *check_objv = version_for_check();
2025 obj_version *modify_version = version_for_write();
2026
2027 if (check_objv) {
2028 cls_version_check(*op, *check_objv, VER_COND_EQ);
2029 }
2030
2031 if (modify_version) {
2032 cls_version_set(*op, *modify_version);
2033 } else {
2034 cls_version_inc(*op);
2035 }
2036 }
2037
// Advance the iterator to the next stripe/object of the manifest.
//
// Explicit-object manifests simply step explicit_iter; reaching the end sets
// ofs to the manifest's obj_size. Rule-based manifests step from the head to
// the first stripe, then stripe by stripe, moving to the next part (and
// possibly the next rule) when a stripe offset passes the end of the current
// part. The final offset is clamped to the object size.
2038 void RGWObjManifest::obj_iterator::operator++()
2039 {
2040 if (manifest->explicit_objs) {
2041 ++explicit_iter;
2042
2043 if (explicit_iter == manifest->objs.end()) {
// walked past the last explicit object: park at the object size
2044 ofs = manifest->obj_size;
2045 return;
2046 }
2047
2048 update_explicit_pos();
2049
2050 update_location();
2051 return;
2052 }
2053
2054 uint64_t obj_size = manifest->get_obj_size();
2055 uint64_t head_size = manifest->get_head_size();
2056
// already at the end: nothing to advance to
2057 if (ofs == obj_size) {
2058 return;
2059 }
2060
// no rules: leave the iterator untouched
2061 if (manifest->rules.empty()) {
2062 return;
2063 }
2064
2065 /* are we still pointing at the head? */
2066 if (ofs < head_size) {
// step from the head to stripe 1 under the first rule; the stripe is
// bounded by the remaining data, stripe_max_size and (if set) part_size
2067 rule_iter = manifest->rules.begin();
2068 RGWObjManifestRule *rule = &rule_iter->second;
2069 ofs = MIN(head_size, obj_size);
2070 stripe_ofs = ofs;
2071 cur_stripe = 1;
2072 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2073 if (rule->part_size > 0) {
2074 stripe_size = MIN(stripe_size, rule->part_size);
2075 }
2076 update_location();
2077 return;
2078 }
2079
2080 RGWObjManifestRule *rule = &rule_iter->second;
2081
// regular step: one full stripe forward under the current rule
2082 stripe_ofs += rule->stripe_max_size;
2083 cur_stripe++;
2084 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2085
2086 if (rule->part_size > 0) {
2087 /* multi part, multi stripes object */
2088
2089 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2090
2091 if (stripe_ofs >= part_ofs + rule->part_size) {
2092 /* moved to the next part */
2093 cur_stripe = 0;
2094 part_ofs += rule->part_size;
2095 stripe_ofs = part_ofs;
2096
2097 bool last_rule = (next_rule_iter == manifest->rules.end());
2098 /* move to the next rule? */
2099 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2100 rule_iter = next_rule_iter;
// NOTE(review): next_rule_iter has not moved since the check above, so
// this re-evaluation yields the same (false) value and the increment
// below always runs on this path
2101 last_rule = (next_rule_iter == manifest->rules.end());
2102 if (!last_rule) {
2103 ++next_rule_iter;
2104 }
2105 cur_part_id = rule_iter->second.start_part_num;
2106 } else {
2107 cur_part_id++;
2108 }
2109
// rule_iter may have changed above; refresh the cached pointer
2110 rule = &rule_iter->second;
2111 }
2112
// stripe is bounded by what is left of the part and by stripe_max_size
2113 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2114 }
2115
2116 cur_override_prefix = rule->override_prefix;
2117
2118 ofs = stripe_ofs;
// clamp to the end of the object; the resulting stripe is empty
2119 if (ofs > obj_size) {
2120 ofs = obj_size;
2121 stripe_ofs = ofs;
2122 stripe_size = 0;
2123 }
2124
2125 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2126 update_location();
2127 }
2128
2129 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2130 {
2131 manifest = _m;
2132
2133 manifest->set_tail_placement(placement_rule, _b);
2134 manifest->set_head(placement_rule, _obj, 0);
2135 last_ofs = 0;
2136
2137 if (manifest->get_prefix().empty()) {
2138 char buf[33];
2139 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2140
2141 string oid_prefix = ".";
2142 oid_prefix.append(buf);
2143 oid_prefix.append("_");
2144
2145 manifest->set_prefix(oid_prefix);
2146 }
2147
2148 bool found = manifest->get_rule(0, &rule);
2149 if (!found) {
2150 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2151 return -EIO;
2152 }
2153
2154 uint64_t head_size = manifest->get_head_size();
2155
2156 if (head_size > 0) {
2157 cur_stripe_size = head_size;
2158 } else {
2159 cur_stripe_size = rule.stripe_max_size;
2160 }
2161
2162 cur_part_id = rule.start_part_num;
2163
2164 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2165
2166 // Normal object which not generated through copy operation
2167 manifest->set_tail_instance(_obj.key.instance);
2168
2169 manifest->update_iterators();
2170
2171 return 0;
2172 }
2173
2174 int RGWObjManifest::generator::create_next(uint64_t ofs)
2175 {
2176 if (ofs < last_ofs) /* only going forward */
2177 return -EINVAL;
2178
2179 uint64_t max_head_size = manifest->get_max_head_size();
2180
2181 if (ofs < max_head_size) {
2182 manifest->set_head_size(ofs);
2183 }
2184
2185 if (ofs >= max_head_size) {
2186 manifest->set_head_size(max_head_size);
2187 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2188 cur_stripe_size = rule.stripe_max_size;
2189
2190 if (cur_part_id == 0 && max_head_size > 0) {
2191 cur_stripe++;
2192 }
2193 }
2194
2195 last_ofs = ofs;
2196 manifest->set_obj_size(ofs);
2197
2198 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2199
2200 manifest->update_iterators();
2201
2202 return 0;
2203 }
2204
2205 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2206 {
2207 return begin_iter;
2208 }
2209
2210 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2211 {
2212 return end_iter;
2213 }
2214
2215 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2216 {
2217 if (ofs > obj_size) {
2218 ofs = obj_size;
2219 }
2220 RGWObjManifest::obj_iterator iter(this);
2221 iter.seek(ofs);
2222 return iter;
2223 }
2224
2225 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2226 {
2227 if (explicit_objs || m.explicit_objs) {
2228 return append_explicit(m, zonegroup, zone_params);
2229 }
2230
2231 if (rules.empty()) {
2232 *this = m;
2233 return 0;
2234 }
2235
2236 string override_prefix;
2237
2238 if (prefix.empty()) {
2239 prefix = m.prefix;
2240 }
2241
2242 if (prefix != m.prefix) {
2243 override_prefix = m.prefix;
2244 }
2245
2246 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2247 if (miter == m.rules.end()) {
2248 return append_explicit(m, zonegroup, zone_params);
2249 }
2250
2251 for (; miter != m.rules.end(); ++miter) {
2252 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2253
2254 RGWObjManifestRule& rule = last_rule->second;
2255
2256 if (rule.part_size == 0) {
2257 rule.part_size = obj_size - rule.start_ofs;
2258 }
2259
2260 RGWObjManifestRule& next_rule = miter->second;
2261 if (!next_rule.part_size) {
2262 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2263 }
2264
2265 string rule_prefix = prefix;
2266 if (!rule.override_prefix.empty()) {
2267 rule_prefix = rule.override_prefix;
2268 }
2269
2270 string next_rule_prefix = m.prefix;
2271 if (!next_rule.override_prefix.empty()) {
2272 next_rule_prefix = next_rule.override_prefix;
2273 }
2274
2275 if (rule.part_size != next_rule.part_size ||
2276 rule.stripe_max_size != next_rule.stripe_max_size ||
2277 rule_prefix != next_rule_prefix) {
2278 if (next_rule_prefix != prefix) {
2279 append_rules(m, miter, &next_rule_prefix);
2280 } else {
2281 append_rules(m, miter, NULL);
2282 }
2283 break;
2284 }
2285
2286 uint64_t expected_part_num = rule.start_part_num + 1;
2287 if (rule.part_size > 0) {
2288 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2289 }
2290
2291 if (expected_part_num != next_rule.start_part_num) {
2292 append_rules(m, miter, NULL);
2293 break;
2294 }
2295 }
2296
2297 set_obj_size(obj_size + m.obj_size);
2298
2299 return 0;
2300 }
2301
2302 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2303 {
2304 return append(m, store->get_zonegroup(), store->get_zone_params());
2305 }
2306
2307 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2308 string *override_prefix)
2309 {
2310 for (; miter != m.rules.end(); ++miter) {
2311 RGWObjManifestRule rule = miter->second;
2312 rule.start_ofs += obj_size;
2313 if (override_prefix)
2314 rule.override_prefix = *override_prefix;
2315 rules[rule.start_ofs] = rule;
2316 }
2317 }
2318
2319 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2320 {
2321 if (explicit_objs) {
2322 return;
2323 }
2324 obj_iterator iter = obj_begin();
2325
2326 while (iter != obj_end()) {
2327 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2328 const rgw_obj_select& os = iter.get_location();
2329 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2330 part.loc_ofs = 0;
2331
2332 uint64_t ofs = iter.get_stripe_ofs();
2333
2334 if (ofs == 0) {
2335 part.loc = obj;
2336 } else {
2337 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2338 }
2339 ++iter;
2340 uint64_t next_ofs = iter.get_stripe_ofs();
2341
2342 part.size = next_ofs - ofs;
2343 }
2344
2345 explicit_objs = true;
2346 rules.clear();
2347 prefix.clear();
2348 }
2349
2350 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2351 {
2352 if (!explicit_objs) {
2353 convert_to_explicit(zonegroup, zone_params);
2354 }
2355 if (!m.explicit_objs) {
2356 m.convert_to_explicit(zonegroup, zone_params);
2357 }
2358 map<uint64_t, RGWObjManifestPart>::iterator iter;
2359 uint64_t base = obj_size;
2360 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2361 RGWObjManifestPart& part = iter->second;
2362 objs[base + iter->first] = part;
2363 }
2364 obj_size += m.obj_size;
2365
2366 return 0;
2367 }
2368
2369 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2370 {
2371 if (rules.empty()) {
2372 return false;
2373 }
2374
2375 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2376 if (iter != rules.begin()) {
2377 --iter;
2378 }
2379
2380 *rule = iter->second;
2381
2382 return true;
2383 }
2384
2385 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2386 {
2387 write_version.ver = 1;
2388 #define TAG_LEN 24
2389
2390 write_version.tag.clear();
2391 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2392 }
2393
2394 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2395 real_time *mtime, real_time set_mtime,
2396 map<string, bufferlist>& attrs, real_time delete_at,
2397 const char *if_match, const char *if_nomatch, const string *user_data,
2398 rgw_zone_set *zones_trace)
2399 {
2400 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2401 if (r < 0)
2402 return r;
2403
2404 is_complete = !canceled;
2405 return 0;
2406 }
2407
2408 CephContext *RGWPutObjProcessor::ctx()
2409 {
2410 return store->ctx();
2411 }
2412
2413 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2414 {
2415 drain_pending();
2416
2417 if (is_complete)
2418 return;
2419
2420 set<rgw_raw_obj>::iterator iter;
2421 bool need_to_remove_head = false;
2422 rgw_raw_obj raw_head;
2423
2424 if (!head_obj.empty()) {
2425 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2426 }
2427
2428 /**
2429 * We should delete the object in the "multipart" namespace to avoid race condition.
2430 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2431 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2432 * written by the second upload may be deleted by the first upload.
2433 * details is describled on #11749
2434 *
2435 * The above comment still stands, but instead of searching for a specific object in the multipart
2436 * namespace, we just make sure that we remove the object that is marked as the head object after
2437 * we remove all the other raw objects. Note that we use different call to remove the head object,
2438 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2439 */
2440 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2441 const rgw_raw_obj& obj = *iter;
2442 if (!head_obj.empty() && obj == raw_head) {
2443 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2444 need_to_remove_head = true;
2445 continue;
2446 }
2447
2448 int r = store->delete_raw_obj(obj);
2449 if (r < 0 && r != -ENOENT) {
2450 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2451 }
2452 }
2453
2454 if (need_to_remove_head) {
2455 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2456 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2457 if (r < 0 && r != -ENOENT) {
2458 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2459 }
2460 }
2461 }
2462
2463 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2464 {
2465 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2466 obj_len = abs_ofs + bl.length();
2467
2468 if (!(obj == last_written_obj)) {
2469 last_written_obj = obj;
2470 }
2471
2472 // For the first call pass -1 as the offset to
2473 // do a write_full.
2474 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2475 }
2476
2477 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2478 {
2479 struct put_obj_aio_info info;
2480 info = pending.front();
2481 pending.pop_front();
2482 pending_size -= info.size;
2483 return info;
2484 }
2485
2486 int RGWPutObjProcessor_Aio::wait_pending_front()
2487 {
2488 if (pending.empty()) {
2489 return 0;
2490 }
2491 struct put_obj_aio_info info = pop_pending();
2492 int ret = store->aio_wait(info.handle);
2493
2494 if (ret >= 0) {
2495 add_written_obj(info.obj);
2496 }
2497
2498 return ret;
2499 }
2500
2501 bool RGWPutObjProcessor_Aio::pending_has_completed()
2502 {
2503 if (pending.empty())
2504 return false;
2505
2506 struct put_obj_aio_info& info = pending.front();
2507 return store->aio_completed(info.handle);
2508 }
2509
2510 int RGWPutObjProcessor_Aio::drain_pending()
2511 {
2512 int ret = 0;
2513 while (!pending.empty()) {
2514 int r = wait_pending_front();
2515 if (r < 0)
2516 ret = r;
2517 }
2518 return ret;
2519 }
2520
2521 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2522 {
2523 bool _wait = need_to_wait;
2524
2525 if (handle) {
2526 struct put_obj_aio_info info;
2527 info.handle = handle;
2528 info.obj = obj;
2529 info.size = size;
2530 pending_size += size;
2531 pending.push_back(info);
2532 }
2533 size_t orig_size = pending_size;
2534
2535 /* first drain complete IOs */
2536 while (pending_has_completed()) {
2537 int r = wait_pending_front();
2538 if (r < 0)
2539 return r;
2540
2541 _wait = false;
2542 }
2543
2544 /* resize window in case messages are draining too fast */
2545 if (orig_size - pending_size >= window_size) {
2546 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2547 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2548 if (window_size > max_window_size) {
2549 window_size = max_window_size;
2550 }
2551 }
2552
2553 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2554 if (pending_size > window_size || _wait) {
2555 int r = wait_pending_front();
2556 if (r < 0)
2557 return r;
2558 }
2559 return 0;
2560 }
2561
2562 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2563 {
2564 if (ofs >= next_part_ofs) {
2565 int r = prepare_next_part(ofs);
2566 if (r < 0) {
2567 return r;
2568 }
2569 }
2570
2571 *pobj = cur_obj;
2572
2573 if (!bl.length()) {
2574 *phandle = nullptr;
2575 return 0;
2576 }
2577
2578 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2579 }
2580
2581 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2582 {
2583 RGWPutObjProcessor::prepare(store, oid_rand);
2584
2585 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2586
2587 return 0;
2588 }
2589
2590 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2591 {
2592 *phandle = NULL;
2593 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2594
2595 pending_data_bl.claim_append(bl);
2596 if (pending_data_bl.length() < max_write_size) {
2597 *again = false;
2598 return 0;
2599 }
2600
2601 pending_data_bl.splice(0, max_write_size, &bl);
2602
2603 /* do we have enough data pending accumulated that needs to be written? */
2604 *again = (pending_data_bl.length() >= max_chunk_size);
2605
2606 if (!data_ofs && !immutable_head()) {
2607 first_chunk.claim(bl);
2608 obj_len = (uint64_t)first_chunk.length();
2609 int r = prepare_next_part(obj_len);
2610 if (r < 0) {
2611 return r;
2612 }
2613 data_ofs = obj_len;
2614 return 0;
2615 }
2616 off_t write_ofs = data_ofs;
2617 data_ofs = write_ofs + bl.length();
2618 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2619 we could be racing with another upload, to the same
2620 object and cleanup can be messy */
2621 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2622 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2623 bl.clear();
2624 }
2625 return ret;
2626 }
2627
2628
2629 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2630 {
2631 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2632
2633 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2634 if (r < 0) {
2635 return r;
2636 }
2637
2638 return 0;
2639 }
2640
2641 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2642 {
2643 head_obj.init(bucket, obj_str);
2644
2645 int r = prepare_init(store, oid_rand);
2646 if (r < 0) {
2647 return r;
2648 }
2649
2650 if (!version_id.empty()) {
2651 head_obj.key.set_instance(version_id);
2652 } else if (versioned_object) {
2653 store->gen_rand_obj_instance_name(&head_obj);
2654 }
2655
2656 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2657
2658 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2659 if (r < 0) {
2660 return r;
2661 }
2662
2663 return 0;
2664 }
2665
2666 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2667
2668 int ret = manifest_gen.create_next(ofs);
2669 if (ret < 0) {
2670 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2671 return ret;
2672 }
2673 cur_part_ofs = ofs;
2674 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2675 cur_obj = manifest_gen.get_cur_obj(store);
2676
2677 return 0;
2678 }
2679
2680 int RGWPutObjProcessor_Atomic::complete_parts()
2681 {
2682 if (obj_len > (uint64_t)cur_part_ofs) {
2683 return prepare_next_part(obj_len);
2684 }
2685 return 0;
2686 }
2687
2688 int RGWPutObjProcessor_Atomic::complete_writing_data()
2689 {
2690 if (!data_ofs && !immutable_head()) {
2691 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2692 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2693 * clobber first_chunk
2694 */
2695 if (pending_data_bl.length() > 0) {
2696 first_chunk.claim(pending_data_bl);
2697 }
2698 obj_len = (uint64_t)first_chunk.length();
2699 }
2700 while (pending_data_bl.length()) {
2701 void *handle = nullptr;
2702 rgw_raw_obj obj;
2703 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2704 if (max_write_size > pending_data_bl.length()) {
2705 max_write_size = pending_data_bl.length();
2706 }
2707 bufferlist bl;
2708 pending_data_bl.splice(0, max_write_size, &bl);
2709 uint64_t write_len = bl.length();
2710 int r = write_data(bl, data_ofs, &handle, &obj, false);
2711 if (r < 0) {
2712 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2713 return r;
2714 }
2715 data_ofs += write_len;
2716 r = throttle_data(handle, obj, write_len, false);
2717 if (r < 0) {
2718 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2719 return r;
2720 }
2721
2722 if (data_ofs >= next_part_ofs) {
2723 r = prepare_next_part(data_ofs);
2724 if (r < 0) {
2725 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2726 return r;
2727 }
2728 }
2729 }
2730 int r = complete_parts();
2731 if (r < 0) {
2732 return r;
2733 }
2734
2735 r = drain_pending();
2736 if (r < 0)
2737 return r;
2738
2739 return 0;
2740 }
2741
2742 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2743 real_time *mtime, real_time set_mtime,
2744 map<string, bufferlist>& attrs,
2745 real_time delete_at,
2746 const char *if_match,
2747 const char *if_nomatch, const string *user_data,
2748 rgw_zone_set *zones_trace) {
2749 int r = complete_writing_data();
2750 if (r < 0)
2751 return r;
2752
2753 obj_ctx.obj.set_atomic(head_obj);
2754
2755 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2756
2757 /* some object types shouldn't be versioned, e.g., multipart parts */
2758 op_target.set_versioning_disabled(!versioned_object);
2759
2760 RGWRados::Object::Write obj_op(&op_target);
2761
2762 obj_op.meta.data = &first_chunk;
2763 obj_op.meta.manifest = &manifest;
2764 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2765 obj_op.meta.if_match = if_match;
2766 obj_op.meta.if_nomatch = if_nomatch;
2767 obj_op.meta.mtime = mtime;
2768 obj_op.meta.set_mtime = set_mtime;
2769 obj_op.meta.owner = bucket_info.owner;
2770 obj_op.meta.flags = PUT_OBJ_CREATE;
2771 obj_op.meta.olh_epoch = olh_epoch;
2772 obj_op.meta.delete_at = delete_at;
2773 obj_op.meta.user_data = user_data;
2774 obj_op.meta.zones_trace = zones_trace;
2775
2776 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2777 if (r < 0) {
2778 return r;
2779 }
2780
2781 canceled = obj_op.meta.canceled;
2782
2783 return 0;
2784 }
2785
2786 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2787 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2788 if (r < 0)
2789 return r;
2790 return 0;
2791 }
2792
2793 int RGWRados::unwatch(uint64_t watch_handle)
2794 {
2795 int r = control_pool_ctx.unwatch2(watch_handle);
2796 if (r < 0) {
2797 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2798 return r;
2799 }
2800 r = rados[0].watch_flush();
2801 if (r < 0) {
2802 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2803 return r;
2804 }
2805 return 0;
2806 }
2807
2808 void RGWRados::add_watcher(int i)
2809 {
2810 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2811 Mutex::Locker l(watchers_lock);
2812 watchers_set.insert(i);
2813 if (watchers_set.size() == (size_t)num_watchers) {
2814 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2815 set_cache_enabled(true);
2816 }
2817 }
2818
2819 void RGWRados::remove_watcher(int i)
2820 {
2821 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2822 Mutex::Locker l(watchers_lock);
2823 size_t orig_size = watchers_set.size();
2824 watchers_set.erase(i);
2825 if (orig_size == (size_t)num_watchers &&
2826 watchers_set.size() < orig_size) { /* actually removed */
2827 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2828 set_cache_enabled(false);
2829 }
2830 }
2831
2832 class RGWWatcher : public librados::WatchCtx2 {
2833 RGWRados *rados;
2834 int index;
2835 string oid;
2836 uint64_t watch_handle;
2837
2838 class C_ReinitWatch : public Context {
2839 RGWWatcher *watcher;
2840 public:
2841 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2842 void finish(int r) override {
2843 watcher->reinit();
2844 }
2845 };
2846 public:
2847 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2848 void handle_notify(uint64_t notify_id,
2849 uint64_t cookie,
2850 uint64_t notifier_id,
2851 bufferlist& bl) override {
2852 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2853 << " notify_id " << notify_id
2854 << " cookie " << cookie
2855 << " notifier " << notifier_id
2856 << " bl.length()=" << bl.length() << dendl;
2857 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2858
2859 bufferlist reply_bl; // empty reply payload
2860 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2861 }
2862 void handle_error(uint64_t cookie, int err) override {
2863 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2864 << " err " << cpp_strerror(err) << dendl;
2865 rados->remove_watcher(index);
2866 rados->schedule_context(new C_ReinitWatch(this));
2867 }
2868
2869 void reinit() {
2870 int ret = unregister_watch();
2871 if (ret < 0) {
2872 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2873 return;
2874 }
2875 ret = register_watch();
2876 if (ret < 0) {
2877 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2878 return;
2879 }
2880 }
2881
2882 int unregister_watch() {
2883 int r = rados->unwatch(watch_handle);
2884 if (r < 0) {
2885 return r;
2886 }
2887 rados->remove_watcher(index);
2888 return 0;
2889 }
2890
2891 int register_watch() {
2892 int r = rados->watch(oid, &watch_handle, this);
2893 if (r < 0) {
2894 return r;
2895 }
2896 rados->add_watcher(index);
2897 return 0;
2898 }
2899 };
2900
2901 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2902 RGWRados *store;
2903 RGWHTTPManager http_manager;
2904
2905 public:
2906 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2907 http_manager(store->ctx(), completion_mgr) {
2908 http_manager.set_threaded();
2909 }
2910
2911 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2912 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2913 { "notify", NULL },
2914 { NULL, NULL } };
2915
2916 list<RGWCoroutinesStack *> stacks;
2917 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2918 RGWRESTConn *conn = iter->second;
2919 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2920 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2921
2922 stacks.push_back(stack);
2923 }
2924 return run(stacks);
2925 }
2926 };
2927
2928 class RGWDataNotifierManager : public RGWCoroutinesManager {
2929 RGWRados *store;
2930 RGWHTTPManager http_manager;
2931
2932 public:
2933 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2934 http_manager(store->ctx(), completion_mgr) {
2935 http_manager.set_threaded();
2936 }
2937
2938 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2939 rgw_http_param_pair pairs[] = { { "type", "data" },
2940 { "notify", NULL },
2941 { "source-zone", store->get_zone_params().get_id().c_str() },
2942 { NULL, NULL } };
2943
2944 list<RGWCoroutinesStack *> stacks;
2945 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2946 RGWRESTConn *conn = iter->second;
2947 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2948 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2949
2950 stacks.push_back(stack);
2951 }
2952 return run(stacks);
2953 }
2954 };
2955
2956 class RGWRadosThread {
2957 class Worker : public Thread {
2958 CephContext *cct;
2959 RGWRadosThread *processor;
2960 Mutex lock;
2961 Cond cond;
2962
2963 void wait() {
2964 Mutex::Locker l(lock);
2965 cond.Wait(lock);
2966 };
2967
2968 void wait_interval(const utime_t& wait_time) {
2969 Mutex::Locker l(lock);
2970 cond.WaitInterval(lock, wait_time);
2971 }
2972
2973 public:
2974 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
2975 void *entry() override;
2976 void signal() {
2977 Mutex::Locker l(lock);
2978 cond.Signal();
2979 }
2980 };
2981
2982 Worker *worker;
2983
2984 protected:
2985 CephContext *cct;
2986 RGWRados *store;
2987
2988 std::atomic<bool> down_flag = { false };
2989
2990 string thread_name;
2991
2992 virtual uint64_t interval_msec() = 0;
2993 virtual void stop_process() {}
2994 public:
2995 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
2996 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
2997 virtual ~RGWRadosThread() {
2998 stop();
2999 }
3000
3001 virtual int init() { return 0; }
3002 virtual int process() = 0;
3003
3004 bool going_down() { return down_flag; }
3005
3006 void start();
3007 void stop();
3008
3009 void signal() {
3010 if (worker) {
3011 worker->signal();
3012 }
3013 }
3014 };
3015
3016 void RGWRadosThread::start()
3017 {
3018 worker = new Worker(cct, this);
3019 worker->create(thread_name.c_str());
3020 }
3021
3022 void RGWRadosThread::stop()
3023 {
3024 down_flag = true;
3025 stop_process();
3026 if (worker) {
3027 worker->signal();
3028 worker->join();
3029 }
3030 delete worker;
3031 worker = NULL;
3032 }
3033
3034 void *RGWRadosThread::Worker::entry() {
3035 uint64_t msec = processor->interval_msec();
3036 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3037
3038 do {
3039 utime_t start = ceph_clock_now();
3040 int r = processor->process();
3041 if (r < 0) {
3042 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3043 }
3044
3045 if (processor->going_down())
3046 break;
3047
3048 utime_t end = ceph_clock_now();
3049 end -= start;
3050
3051 uint64_t cur_msec = processor->interval_msec();
3052 if (cur_msec != msec) { /* was it reconfigured? */
3053 msec = cur_msec;
3054 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3055 }
3056
3057 if (cur_msec > 0) {
3058 if (interval <= end)
3059 continue; // next round
3060
3061 utime_t wait_time = interval;
3062 wait_time -= end;
3063
3064 wait_interval(wait_time);
3065 } else {
3066 wait();
3067 }
3068 } while (!processor->going_down());
3069
3070 return NULL;
3071 }
3072
3073 class RGWMetaNotifier : public RGWRadosThread {
3074 RGWMetaNotifierManager notify_mgr;
3075 RGWMetadataLog *const log;
3076
3077 uint64_t interval_msec() override {
3078 return cct->_conf->rgw_md_notify_interval_msec;
3079 }
3080 public:
3081 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3082 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3083
3084 int process() override;
3085 };
3086
3087 int RGWMetaNotifier::process()
3088 {
3089 set<int> shards;
3090
3091 log->read_clear_modified(shards);
3092
3093 if (shards.empty()) {
3094 return 0;
3095 }
3096
3097 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3098 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3099 }
3100
3101 notify_mgr.notify_all(store->zone_conn_map, shards);
3102
3103 return 0;
3104 }
3105
3106 class RGWDataNotifier : public RGWRadosThread {
3107 RGWDataNotifierManager notify_mgr;
3108
3109 uint64_t interval_msec() override {
3110 return cct->_conf->rgw_md_notify_interval_msec;
3111 }
3112 public:
3113 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3114
3115 int process() override;
3116 };
3117
3118 int RGWDataNotifier::process()
3119 {
3120 if (!store->data_log) {
3121 return 0;
3122 }
3123
3124 map<int, set<string> > shards;
3125
3126 store->data_log->read_clear_modified(shards);
3127
3128 if (shards.empty()) {
3129 return 0;
3130 }
3131
3132 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3133 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3134 }
3135
3136 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3137
3138 return 0;
3139 }
3140
3141 class RGWSyncProcessorThread : public RGWRadosThread {
3142 public:
3143 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3144 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3145 ~RGWSyncProcessorThread() override {}
3146 int init() override = 0 ;
3147 int process() override = 0;
3148 };
3149
3150 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3151 {
3152 RGWMetaSyncStatusManager sync;
3153
3154 uint64_t interval_msec() override {
3155 return 0; /* no interval associated, it'll run once until stopped */
3156 }
3157 void stop_process() override {
3158 sync.stop();
3159 }
3160 public:
3161 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3162 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3163
3164 void wakeup_sync_shards(set<int>& shard_ids) {
3165 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3166 sync.wakeup(*iter);
3167 }
3168 }
3169 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3170
3171 int init() override {
3172 int ret = sync.init();
3173 if (ret < 0) {
3174 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3175 return ret;
3176 }
3177 return 0;
3178 }
3179
3180 int process() override {
3181 sync.run();
3182 return 0;
3183 }
3184 };
3185
3186 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3187 {
3188 RGWDataSyncStatusManager sync;
3189 bool initialized;
3190
3191 uint64_t interval_msec() override {
3192 if (initialized) {
3193 return 0; /* no interval associated, it'll run once until stopped */
3194 } else {
3195 #define DATA_SYNC_INIT_WAIT_SEC 20
3196 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3197 }
3198 }
3199 void stop_process() override {
3200 sync.stop();
3201 }
3202 public:
3203 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3204 const string& _source_zone)
3205 : RGWSyncProcessorThread(_store, "data-sync"), sync(_store, async_rados, _source_zone),
3206 initialized(false) {}
3207
3208 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3209 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3210 sync.wakeup(iter->first, iter->second);
3211 }
3212 }
3213 RGWDataSyncStatusManager* get_manager() { return &sync; }
3214
3215 int init() override {
3216 return 0;
3217 }
3218
3219 int process() override {
3220 while (!initialized) {
3221 if (going_down()) {
3222 return 0;
3223 }
3224 int ret = sync.init();
3225 if (ret >= 0) {
3226 initialized = true;
3227 break;
3228 }
3229 /* we'll be back! */
3230 return 0;
3231 }
3232 sync.run();
3233 return 0;
3234 }
3235 };
3236
3237 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3238 {
3239 RGWCoroutinesManager crs;
3240 RGWRados *store;
3241 RGWHTTPManager http;
3242 const utime_t trim_interval;
3243
3244 uint64_t interval_msec() override { return 0; }
3245 void stop_process() override { crs.stop(); }
3246 public:
3247 RGWSyncLogTrimThread(RGWRados *store, int interval)
3248 : RGWSyncProcessorThread(store, "sync-log-trim"),
3249 crs(store->ctx(), store->get_cr_registry()), store(store),
3250 http(store->ctx(), crs.get_completion_mgr()),
3251 trim_interval(interval, 0)
3252 {}
3253
3254 int init() override {
3255 return http.set_threaded();
3256 }
3257 int process() override {
3258 list<RGWCoroutinesStack*> stacks;
3259 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3260 meta->call(create_meta_log_trim_cr(store, &http,
3261 cct->_conf->rgw_md_log_max_shards,
3262 trim_interval));
3263 stacks.push_back(meta);
3264
3265 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3266 data->call(create_data_log_trim_cr(store, &http,
3267 cct->_conf->rgw_data_log_num_shards,
3268 trim_interval));
3269 stacks.push_back(data);
3270
3271 crs.run(stacks);
3272 return 0;
3273 }
3274 };
3275
3276 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3277 {
3278 Mutex::Locker l(meta_sync_thread_lock);
3279 if (meta_sync_processor_thread) {
3280 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3281 }
3282 }
3283
3284 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3285 {
3286 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3287 Mutex::Locker l(data_sync_thread_lock);
3288 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3289 if (iter == data_sync_processor_threads.end()) {
3290 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3291 return;
3292 }
3293
3294 RGWDataSyncProcessorThread *thread = iter->second;
3295 assert(thread);
3296 thread->wakeup_sync_shards(shard_ids);
3297 }
3298
3299 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3300 {
3301 Mutex::Locker l(meta_sync_thread_lock);
3302 if (meta_sync_processor_thread) {
3303 return meta_sync_processor_thread->get_manager();
3304 }
3305 return nullptr;
3306 }
3307
3308 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3309 {
3310 Mutex::Locker l(data_sync_thread_lock);
3311 auto thread = data_sync_processor_threads.find(source_zone);
3312 if (thread == data_sync_processor_threads.end()) {
3313 return nullptr;
3314 }
3315 return thread->second->get_manager();
3316 }
3317
3318 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3319 {
3320 IoCtx ioctx;
3321 int r = open_pool_ctx(pool, ioctx);
3322 if (r < 0) {
3323 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3324 return r;
3325 }
3326
3327 bool requires;
3328 r = ioctx.pool_requires_alignment2(&requires);
3329 if (r < 0) {
3330 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3331 << r << dendl;
3332 return r;
3333 }
3334
3335 if (!requires) {
3336 *alignment = 0;
3337 return 0;
3338 }
3339
3340 uint64_t align;
3341 r = ioctx.pool_required_alignment2(&align);
3342 if (r < 0) {
3343 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3344 << r << dendl;
3345 return r;
3346 }
3347 if (align != 0) {
3348 ldout(cct, 20) << "required alignment=" << align << dendl;
3349 }
3350 *alignment = align;
3351 return 0;
3352 }
3353
3354 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3355 {
3356 uint64_t alignment = 0;
3357 int r = get_required_alignment(pool, &alignment);
3358 if (r < 0) {
3359 return r;
3360 }
3361
3362 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3363
3364 if (alignment == 0) {
3365 *max_chunk_size = config_chunk_size;
3366 return 0;
3367 }
3368
3369 if (config_chunk_size <= alignment) {
3370 *max_chunk_size = alignment;
3371 return 0;
3372 }
3373
3374 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3375
3376 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3377
3378 return 0;
3379 }
3380
3381 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3382 {
3383 rgw_pool pool;
3384 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3385 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3386 return -EIO;
3387 }
3388 return get_max_chunk_size(pool, max_chunk_size);
3389 }
3390
3391 class RGWIndexCompletionManager;
3392
3393 struct complete_op_data {
3394 Mutex lock{"complete_op_data"};
3395 AioCompletion *rados_completion{nullptr};
3396 int manager_shard_id{-1};
3397 RGWIndexCompletionManager *manager{nullptr};
3398 rgw_obj obj;
3399 RGWModifyOp op;
3400 string tag;
3401 rgw_bucket_entry_ver ver;
3402 cls_rgw_obj_key key;
3403 rgw_bucket_dir_entry_meta dir_meta;
3404 list<cls_rgw_obj_key> remove_objs;
3405 bool log_op;
3406 uint16_t bilog_op;
3407 rgw_zone_set zones_trace;
3408
3409 bool stopped{false};
3410
3411 void stop() {
3412 Mutex::Locker l(lock);
3413 stopped = true;
3414 }
3415 };
3416
3417 class RGWIndexCompletionThread : public RGWRadosThread {
3418 RGWRados *store;
3419
3420 uint64_t interval_msec() override {
3421 return 0;
3422 }
3423
3424 list<complete_op_data *> completions;
3425
3426 Mutex completions_lock;
3427 public:
3428 RGWIndexCompletionThread(RGWRados *_store)
3429 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3430
3431 int process() override;
3432
3433 void add_completion(complete_op_data *completion) {
3434 {
3435 Mutex::Locker l(completions_lock);
3436 completions.push_back(completion);
3437 }
3438
3439 signal();
3440 }
3441 };
3442
3443 int RGWIndexCompletionThread::process()
3444 {
3445 list<complete_op_data *> comps;
3446
3447 {
3448 Mutex::Locker l(completions_lock);
3449 completions.swap(comps);
3450 }
3451
3452 for (auto c : comps) {
3453 std::unique_ptr<complete_op_data> up{c};
3454
3455 if (going_down()) {
3456 continue;
3457 }
3458 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3459
3460 RGWRados::BucketShard bs(store);
3461
3462 int r = bs.init(c->obj.bucket, c->obj);
3463 if (r < 0) {
3464 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3465 /* not much to do */
3466 continue;
3467 }
3468
3469 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3470 librados::ObjectWriteOperation o;
3471 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3472 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3473 c->log_op, c->bilog_op, &c->zones_trace);
3474
3475 return bs->index_ctx.operate(bs->bucket_obj, &o);
3476 });
3477 if (r < 0) {
3478 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3479 /* ignoring error, can't do anything about it */
3480 continue;
3481 }
3482 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3483 if (r < 0) {
3484 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3485 }
3486 }
3487
3488 return 0;
3489 }
3490
3491 class RGWIndexCompletionManager {
3492 RGWRados *store{nullptr};
3493 vector<Mutex *> locks;
3494 vector<set<complete_op_data *> > completions;
3495
3496 RGWIndexCompletionThread *completion_thread{nullptr};
3497
3498 int num_shards;
3499
3500 std::atomic<int> cur_shard {0};
3501
3502
3503 public:
3504 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3505 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3506
3507 for (int i = 0; i < num_shards; i++) {
3508 char buf[64];
3509 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3510 locks.push_back(new Mutex(buf));
3511 }
3512
3513 completions.resize(num_shards);
3514 }
3515 ~RGWIndexCompletionManager() {
3516 stop();
3517
3518 for (auto l : locks) {
3519 delete l;
3520 }
3521 }
3522
3523 int next_shard() {
3524 int result = cur_shard % num_shards;
3525 cur_shard++;
3526 return result;
3527 }
3528
3529 void create_completion(const rgw_obj& obj,
3530 RGWModifyOp op, string& tag,
3531 rgw_bucket_entry_ver& ver,
3532 const cls_rgw_obj_key& key,
3533 rgw_bucket_dir_entry_meta& dir_meta,
3534 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3535 uint16_t bilog_op,
3536 rgw_zone_set *zones_trace,
3537 complete_op_data **result);
3538 bool handle_completion(completion_t cb, complete_op_data *arg);
3539
3540 int start() {
3541 completion_thread = new RGWIndexCompletionThread(store);
3542 int ret = completion_thread->init();
3543 if (ret < 0) {
3544 return ret;
3545 }
3546 completion_thread->start();
3547 return 0;
3548 }
3549 void stop() {
3550 if (completion_thread) {
3551 completion_thread->stop();
3552 delete completion_thread;
3553 }
3554
3555 for (int i = 0; i < num_shards; ++i) {
3556 Mutex::Locker l(*locks[i]);
3557 for (auto c : completions[i]) {
3558 Mutex::Locker cl(c->lock);
3559 c->stop();
3560 }
3561 }
3562 completions.clear();
3563 }
3564 };
3565
3566 static void obj_complete_cb(completion_t cb, void *arg)
3567 {
3568 complete_op_data *completion = (complete_op_data *)arg;
3569 completion->lock.Lock();
3570 if (completion->stopped) {
3571 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3572 delete completion;
3573 return;
3574 }
3575 bool need_delete = completion->manager->handle_completion(cb, completion);
3576 completion->lock.Unlock();
3577 if (need_delete) {
3578 delete completion;
3579 }
3580 }
3581
3582
3583 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3584 RGWModifyOp op, string& tag,
3585 rgw_bucket_entry_ver& ver,
3586 const cls_rgw_obj_key& key,
3587 rgw_bucket_dir_entry_meta& dir_meta,
3588 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3589 uint16_t bilog_op,
3590 rgw_zone_set *zones_trace,
3591 complete_op_data **result)
3592 {
3593 complete_op_data *entry = new complete_op_data;
3594
3595 int shard_id = next_shard();
3596
3597 entry->manager_shard_id = shard_id;
3598 entry->manager = this;
3599 entry->obj = obj;
3600 entry->op = op;
3601 entry->tag = tag;
3602 entry->ver = ver;
3603 entry->key = key;
3604 entry->dir_meta = dir_meta;
3605 entry->log_op = log_op;
3606 entry->bilog_op = bilog_op;
3607
3608 if (remove_objs) {
3609 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3610 entry->remove_objs.push_back(*iter);
3611 }
3612 }
3613
3614 if (zones_trace) {
3615 entry->zones_trace = *zones_trace;
3616 } else {
3617 entry->zones_trace.insert(store->get_zone().id);
3618 }
3619
3620 *result = entry;
3621
3622 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3623
3624 Mutex::Locker l(*locks[shard_id]);
3625 completions[shard_id].insert(entry);
3626 }
3627
3628 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3629 {
3630 int shard_id = arg->manager_shard_id;
3631 {
3632 Mutex::Locker l(*locks[shard_id]);
3633
3634 auto& comps = completions[shard_id];
3635
3636 auto iter = comps.find(arg);
3637 if (iter == comps.end()) {
3638 return true;
3639 }
3640
3641 comps.erase(iter);
3642 }
3643
3644 int r = rados_aio_get_return_value(cb);
3645 if (r != -ERR_BUSY_RESHARDING) {
3646 return true;
3647 }
3648 completion_thread->add_completion(arg);
3649 return false;
3650 }
3651
3652 void RGWRados::finalize()
3653 {
3654 if (run_sync_thread) {
3655 Mutex::Locker l(meta_sync_thread_lock);
3656 meta_sync_processor_thread->stop();
3657
3658 Mutex::Locker dl(data_sync_thread_lock);
3659 for (auto iter : data_sync_processor_threads) {
3660 RGWDataSyncProcessorThread *thread = iter.second;
3661 thread->stop();
3662 }
3663 if (sync_log_trimmer) {
3664 sync_log_trimmer->stop();
3665 }
3666 }
3667 if (async_rados) {
3668 async_rados->stop();
3669 }
3670 if (run_sync_thread) {
3671 delete meta_sync_processor_thread;
3672 meta_sync_processor_thread = NULL;
3673 Mutex::Locker dl(data_sync_thread_lock);
3674 for (auto iter : data_sync_processor_threads) {
3675 RGWDataSyncProcessorThread *thread = iter.second;
3676 delete thread;
3677 }
3678 data_sync_processor_threads.clear();
3679 delete sync_log_trimmer;
3680 sync_log_trimmer = nullptr;
3681 }
3682 if (finisher) {
3683 finisher->stop();
3684 }
3685 if (need_watch_notify()) {
3686 finalize_watch();
3687 }
3688 if (finisher) {
3689 /* delete finisher only after cleaning up watches, as watch error path might call
3690 * into finisher. We stop finisher before finalizing watch to make sure we don't
3691 * actually handle any racing work
3692 */
3693 delete finisher;
3694 }
3695 if (meta_notifier) {
3696 meta_notifier->stop();
3697 delete meta_notifier;
3698 }
3699 if (data_notifier) {
3700 data_notifier->stop();
3701 delete data_notifier;
3702 }
3703 delete data_log;
3704 if (async_rados) {
3705 delete async_rados;
3706 }
3707
3708 delete gc;
3709 gc = NULL;
3710
3711 delete obj_expirer;
3712 obj_expirer = NULL;
3713
3714 delete lc;
3715 lc = NULL;
3716
3717 delete rest_master_conn;
3718
3719 map<string, RGWRESTConn *>::iterator iter;
3720 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3721 RGWRESTConn *conn = iter->second;
3722 delete conn;
3723 }
3724
3725 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3726 RGWRESTConn *conn = iter->second;
3727 delete conn;
3728 }
3729 RGWQuotaHandler::free_handler(quota_handler);
3730 if (cr_registry) {
3731 cr_registry->put();
3732 }
3733 delete meta_mgr;
3734 delete binfo_cache;
3735 delete obj_tombstone_cache;
3736 delete sync_modules_manager;
3737
3738 if (reshard_wait.get()) {
3739 reshard_wait->stop();
3740 reshard_wait.reset();
3741 }
3742
3743 if (run_reshard_thread) {
3744 reshard->stop_processor();
3745 }
3746 delete reshard;
3747 delete index_completion_manager;
3748 }
3749
3750 /**
3751 * Initialize the RADOS instance and prepare to do other ops
3752 * Returns 0 on success, -ERR# on failure.
3753 */
3754 int RGWRados::init_rados()
3755 {
3756 int ret = 0;
3757 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3758
3759 for (auto& r : handles) {
3760 ret = r.init_with_context(cct);
3761 if (ret < 0) {
3762 return ret;
3763 }
3764 ret = r.connect();
3765 if (ret < 0) {
3766 return ret;
3767 }
3768 }
3769
3770 sync_modules_manager = new RGWSyncModulesManager();
3771
3772 rgw_register_sync_modules(sync_modules_manager);
3773
3774 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3775 new RGWCoroutinesManagerRegistry(cct)};
3776 ret = crs->hook_to_admin_command("cr dump");
3777 if (ret < 0) {
3778 return ret;
3779 }
3780
3781 meta_mgr = new RGWMetadataManager(cct, this);
3782 data_log = new RGWDataChangesLog(cct, this);
3783 cr_registry = crs.release();
3784
3785 std::swap(handles, rados);
3786 return ret;
3787 }
3788
3789
3790 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3791 {
3792 map<string,string> metadata = meta;
3793 metadata["num_handles"] = stringify(rados.size());
3794 metadata["zonegroup_id"] = zonegroup.get_id();
3795 metadata["zonegroup_name"] = zonegroup.get_name();
3796 metadata["zone_name"] = zone_name();
3797 metadata["zone_id"] = zone_id();;
3798 string name = cct->_conf->name.get_id();
3799 if (name.find("rgw.") == 0) {
3800 name = name.substr(4);
3801 }
3802 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3803 if (ret < 0) {
3804 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3805 return ret;
3806 }
3807
3808 return 0;
3809 }
3810
3811 /**
3812 * Add new connection to connections map
3813 * @param zonegroup_conn_map map which new connection will be added to
3814 * @param zonegroup zonegroup which new connection will connect to
3815 * @param new_connection pointer to new connection instance
3816 */
3817 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3818 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3819 {
3820 // Delete if connection is already exists
3821 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3822 if (iterZoneGroup != zonegroup_conn_map.end()) {
3823 delete iterZoneGroup->second;
3824 }
3825
3826 // Add new connection to connections map
3827 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3828 }
3829
3830 int RGWRados::convert_regionmap()
3831 {
3832 RGWZoneGroupMap zonegroupmap;
3833
3834 string pool_name = cct->_conf->rgw_zone_root_pool;
3835 if (pool_name.empty()) {
3836 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3837 }
3838 string oid = region_map_oid;
3839
3840 rgw_pool pool(pool_name);
3841 bufferlist bl;
3842 RGWObjectCtx obj_ctx(this);
3843 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3844 if (ret < 0 && ret != -ENOENT) {
3845 return ret;
3846 } else if (ret == -ENOENT) {
3847 return 0;
3848 }
3849
3850 try {
3851 bufferlist::iterator iter = bl.begin();
3852 ::decode(zonegroupmap, iter);
3853 } catch (buffer::error& err) {
3854 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3855 return -EIO;
3856 }
3857
3858 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3859 iter != zonegroupmap.zonegroups.end(); ++iter) {
3860 RGWZoneGroup& zonegroup = iter->second;
3861 ret = zonegroup.init(cct, this, false);
3862 ret = zonegroup.update();
3863 if (ret < 0 && ret != -ENOENT) {
3864 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3865 cpp_strerror(-ret) << dendl;
3866 return ret;
3867 } else if (ret == -ENOENT) {
3868 ret = zonegroup.create();
3869 if (ret < 0) {
3870 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3871 cpp_strerror(-ret) << dendl;
3872 return ret;
3873 }
3874 }
3875 }
3876
3877 current_period.set_user_quota(zonegroupmap.user_quota);
3878 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3879
3880 // remove the region_map so we don't try to convert again
3881 rgw_raw_obj obj(pool, oid);
3882 ret = delete_system_obj(obj);
3883 if (ret < 0) {
3884 ldout(cct, 0) << "Error could not remove " << obj
3885 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3886 return ret;
3887 }
3888
3889 return 0;
3890 }
3891
3892 /**
3893 * Replace all region configuration with zonegroup for
3894 * backward compatability
3895 * Returns 0 on success, -ERR# on failure.
3896 */
3897 int RGWRados::replace_region_with_zonegroup()
3898 {
3899 /* copy default region */
3900 /* convert default region to default zonegroup */
3901 string default_oid = cct->_conf->rgw_default_region_info_oid;
3902 if (default_oid.empty()) {
3903 default_oid = default_region_info_oid;
3904 }
3905
3906
3907 RGWZoneGroup default_zonegroup;
3908 rgw_pool pool{default_zonegroup.get_pool(cct)};
3909 string oid = "converted";
3910 bufferlist bl;
3911 RGWObjectCtx obj_ctx(this);
3912
3913 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3914 if (ret < 0 && ret != -ENOENT) {
3915 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3916 << dendl;
3917 return ret;
3918 } else if (ret != -ENOENT) {
3919 ldout(cct, 20) << "System already converted " << dendl;
3920 return 0;
3921 }
3922
3923 string default_region;
3924 ret = default_zonegroup.init(cct, this, false, true);
3925 if (ret < 0) {
3926 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3927 return ret;
3928 }
3929 ret = default_zonegroup.read_default_id(default_region, true);
3930 if (ret < 0 && ret != -ENOENT) {
3931 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3932 return ret;
3933 }
3934
3935 /* convert regions to zonegroups */
3936 list<string> regions;
3937 ret = list_regions(regions);
3938 if (ret < 0 && ret != -ENOENT) {
3939 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3940 return ret;
3941 } else if (ret == -ENOENT || regions.empty()) {
3942 RGWZoneParams zoneparams(default_zone_name);
3943 int ret = zoneparams.init(cct, this);
3944 if (ret < 0 && ret != -ENOENT) {
3945 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
3946 return ret;
3947 }
3948 /* update master zone */
3949 RGWZoneGroup default_zg(default_zonegroup_name);
3950 ret = default_zg.init(cct, this);
3951 if (ret < 0 && ret != -ENOENT) {
3952 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
3953 return ret;
3954 }
3955 if (ret != -ENOENT && default_zg.master_zone.empty()) {
3956 default_zg.master_zone = zoneparams.get_id();
3957 return default_zg.update();
3958 }
3959 return 0;
3960 }
3961
3962 string master_region, master_zone;
3963 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
3964 if (*iter != default_zonegroup_name){
3965 RGWZoneGroup region(*iter);
3966 int ret = region.init(cct, this, true, true);
3967 if (ret < 0) {
3968 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
3969 return ret;
3970 }
3971 if (region.is_master_zonegroup()) {
3972 master_region = region.get_id();
3973 master_zone = region.master_zone;
3974 }
3975 }
3976 }
3977
3978 /* create realm if there is none.
3979 The realm name will be the region and zone concatenated
3980 realm id will be mds of its name */
3981 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
3982 string new_realm_name = master_region + "." + master_zone;
3983 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
3984 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
3985 MD5 hash;
3986 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
3987 hash.Final(md5);
3988 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
3989 string new_realm_id(md5_str);
3990 RGWRealm new_realm(new_realm_id,new_realm_name);
3991 ret = new_realm.init(cct, this, false);
3992 if (ret < 0) {
3993 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
3994 return ret;
3995 }
3996 ret = new_realm.create();
3997 if (ret < 0 && ret != -EEXIST) {
3998 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
3999 return ret;
4000 }
4001 ret = new_realm.set_as_default();
4002 if (ret < 0) {
4003 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4004 return ret;
4005 }
4006 ret = realm.init(cct, this);
4007 if (ret < 0) {
4008 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4009 return ret;
4010 }
4011 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4012 if (ret < 0) {
4013 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4014 return ret;
4015 }
4016 }
4017
4018 list<string>::iterator iter;
4019 /* create zonegroups */
4020 for (iter = regions.begin(); iter != regions.end(); ++iter)
4021 {
4022 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4023 /* check to see if we don't have already a zonegroup with this name */
4024 RGWZoneGroup new_zonegroup(*iter);
4025 ret = new_zonegroup.init(cct , this);
4026 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4027 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4028 " skipping conversion " << dendl;
4029 continue;
4030 }
4031 RGWZoneGroup zonegroup(*iter);
4032 zonegroup.set_id(*iter);
4033 int ret = zonegroup.init(cct, this, true, true);
4034 if (ret < 0) {
4035 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4036 return ret;
4037 }
4038 zonegroup.realm_id = realm.get_id();
4039 /* fix default region master zone */
4040 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4041 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4042 zonegroup.master_zone = default_zone_name;
4043 }
4044 ret = zonegroup.update();
4045 if (ret < 0 && ret != -EEXIST) {
4046 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4047 << dendl;
4048 return ret;
4049 }
4050 ret = zonegroup.update_name();
4051 if (ret < 0 && ret != -EEXIST) {
4052 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4053 << dendl;
4054 return ret;
4055 }
4056 if (zonegroup.get_name() == default_region) {
4057 ret = zonegroup.set_as_default();
4058 if (ret < 0) {
4059 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4060 << dendl;
4061 return ret;
4062 }
4063 }
4064 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4065 ++iter) {
4066 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4067 RGWZoneParams zoneparams(iter->first, iter->first);
4068 zoneparams.set_id(iter->first);
4069 zoneparams.realm_id = realm.get_id();
4070 ret = zoneparams.init(cct, this);
4071 if (ret < 0 && ret != -ENOENT) {
4072 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4073 return ret;
4074 } else if (ret == -ENOENT) {
4075 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4076 continue;
4077 }
4078 zonegroup.realm_id = realm.get_id();
4079 ret = zoneparams.update();
4080 if (ret < 0 && ret != -EEXIST) {
4081 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4082 return ret;
4083 }
4084 ret = zoneparams.update_name();
4085 if (ret < 0 && ret != -EEXIST) {
4086 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4087 return ret;
4088 }
4089 }
4090
4091 if (!current_period.get_id().empty()) {
4092 ret = current_period.add_zonegroup(zonegroup);
4093 if (ret < 0) {
4094 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4095 return ret;
4096 }
4097 }
4098 }
4099
4100 if (!current_period.get_id().empty()) {
4101 ret = current_period.update();
4102 if (ret < 0) {
4103 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4104 return ret;
4105 }
4106 ret = current_period.store_info(false);
4107 if (ret < 0) {
4108 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4109 return ret;
4110 }
4111 ret = current_period.reflect();
4112 if (ret < 0) {
4113 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4114 return ret;
4115 }
4116 }
4117
4118 for (auto const& iter : regions) {
4119 RGWZoneGroup zonegroup(iter);
4120 int ret = zonegroup.init(cct, this, true, true);
4121 if (ret < 0) {
4122 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4123 return ret;
4124 }
4125 ret = zonegroup.delete_obj(true);
4126 if (ret < 0 && ret != -ENOENT) {
4127 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4128 << dendl;
4129 return ret;
4130 }
4131 }
4132
4133 /* mark as converted */
4134 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4135 true, NULL, real_time(), NULL);
4136 if (ret < 0 ) {
4137 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4138 << dendl;
4139 return ret;
4140 }
4141
4142 return 0;
4143 }
4144
4145 int RGWRados::init_zg_from_period(bool *initialized)
4146 {
4147 *initialized = false;
4148
4149 if (current_period.get_id().empty()) {
4150 return 0;
4151 }
4152
4153 int ret = zonegroup.init(cct, this);
4154 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4155 if (ret == -ENOENT) {
4156 return 0;
4157 }
4158 if (ret < 0) {
4159 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4160 return ret;
4161 }
4162 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4163
4164 map<string, RGWZoneGroup>::const_iterator iter =
4165 current_period.get_map().zonegroups.find(zonegroup.get_id());
4166
4167 if (iter != current_period.get_map().zonegroups.end()) {
4168 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4169 zonegroup = iter->second;
4170 ret = zonegroup.init(cct, this, false);
4171 if (ret < 0) {
4172 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4173 return ret;
4174 }
4175 ret = zone_params.init(cct, this);
4176 if (ret < 0 && ret != -ENOENT) {
4177 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4178 return ret;
4179 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4180 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4181 zone_params.set_name(default_zone_name);
4182 ret = zone_params.init(cct, this);
4183 if (ret < 0 && ret != -ENOENT) {
4184 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4185 return ret;
4186 }
4187 }
4188 }
4189 for (iter = current_period.get_map().zonegroups.begin();
4190 iter != current_period.get_map().zonegroups.end(); ++iter){
4191 const RGWZoneGroup& zg = iter->second;
4192 // use endpoints from the zonegroup's master zone
4193 auto master = zg.zones.find(zg.master_zone);
4194 if (master == zg.zones.end()) {
4195 // fix missing master zone for a single zone zonegroup
4196 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4197 master = zg.zones.begin();
4198 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4199 master->second.name << " id:" << master->second.id << " as master" << dendl;
4200 if (zonegroup.get_id() == zg.get_id()) {
4201 zonegroup.master_zone = master->second.id;
4202 ret = zonegroup.update();
4203 if (ret < 0) {
4204 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4205 return ret;
4206 }
4207 } else {
4208 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4209 ret = fixed_zg.init(cct, this);
4210 if (ret < 0) {
4211 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4212 return ret;
4213 }
4214 fixed_zg.master_zone = master->second.id;
4215 ret = fixed_zg.update();
4216 if (ret < 0) {
4217 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4218 return ret;
4219 }
4220 }
4221 } else {
4222 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4223 zg.master_zone << dendl;
4224 return -EINVAL;
4225 }
4226 }
4227 const auto& endpoints = master->second.endpoints;
4228 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4229 if (!current_period.get_master_zonegroup().empty() &&
4230 zg.get_id() == current_period.get_master_zonegroup()) {
4231 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4232 }
4233 }
4234
4235 *initialized = true;
4236
4237 return 0;
4238 }
4239
4240 int RGWRados::init_zg_from_local(bool *creating_defaults)
4241 {
4242 int ret = zonegroup.init(cct, this);
4243 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4244 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4245 return ret;
4246 } else if (ret == -ENOENT) {
4247 *creating_defaults = true;
4248 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4249 ret = zonegroup.create_default();
4250 if (ret < 0) {
4251 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4252 << dendl;
4253 return ret;
4254 }
4255 ret = zonegroup.init(cct, this);
4256 if (ret < 0) {
4257 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4258 << dendl;
4259 return ret;
4260 }
4261 }
4262 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4263 if (zonegroup.is_master_zonegroup()) {
4264 // use endpoints from the zonegroup's master zone
4265 auto master = zonegroup.zones.find(zonegroup.master_zone);
4266 if (master == zonegroup.zones.end()) {
4267 // fix missing master zone for a single zone zonegroup
4268 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4269 master = zonegroup.zones.begin();
4270 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4271 master->second.name << " id:" << master->second.id << " as master" << dendl;
4272 zonegroup.master_zone = master->second.id;
4273 ret = zonegroup.update();
4274 if (ret < 0) {
4275 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4276 return ret;
4277 }
4278 } else {
4279 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4280 "master_zone=" << zonegroup.master_zone << dendl;
4281 return -EINVAL;
4282 }
4283 }
4284 const auto& endpoints = master->second.endpoints;
4285 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4286 }
4287
4288 return 0;
4289 }
4290
4291
4292 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4293 {
4294 return target_zone.syncs_from(source_zone.name) &&
4295 sync_modules_manager->supports_data_export(source_zone.tier_type);
4296 }
4297
4298 /**
4299 * Initialize the RADOS instance and prepare to do other ops
4300 * Returns 0 on success, -ERR# on failure.
4301 */
4302 int RGWRados::init_complete()
4303 {
4304 int ret = realm.init(cct, this);
4305 if (ret < 0 && ret != -ENOENT) {
4306 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4307 return ret;
4308 } else if (ret != -ENOENT) {
4309 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4310 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4311 if (ret < 0 && ret != -ENOENT) {
4312 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4313 return ret;
4314 }
4315 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4316 }
4317
4318 ret = replace_region_with_zonegroup();
4319 if (ret < 0) {
4320 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4321 return ret;
4322 }
4323
4324 ret = convert_regionmap();
4325 if (ret < 0) {
4326 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4327 return ret;
4328 }
4329
4330 bool zg_initialized = false;
4331
4332 if (!current_period.get_id().empty()) {
4333 ret = init_zg_from_period(&zg_initialized);
4334 if (ret < 0) {
4335 return ret;
4336 }
4337 }
4338
4339 bool creating_defaults = false;
4340 bool using_local = (!zg_initialized);
4341 if (using_local) {
4342 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4343 ret = init_zg_from_local(&creating_defaults);
4344 if (ret < 0) {
4345 return ret;
4346 }
4347 // read period_config into current_period
4348 auto& period_config = current_period.get_config();
4349 ret = period_config.read(this, zonegroup.realm_id);
4350 if (ret < 0 && ret != -ENOENT) {
4351 ldout(cct, 0) << "ERROR: failed to read period config: "
4352 << cpp_strerror(ret) << dendl;
4353 return ret;
4354 }
4355 }
4356
4357 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4358 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4359 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4360 zone_params.set_name(default_zone_name);
4361 }
4362
4363 ret = zone_params.init(cct, this);
4364 if (ret < 0 && ret != -ENOENT) {
4365 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4366 return ret;
4367 }
4368 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4369 if (zone_iter == get_zonegroup().zones.end()) {
4370 if (using_local) {
4371 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4372 return -EINVAL;
4373 }
4374 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4375 ret = init_zg_from_local(&creating_defaults);
4376 if (ret < 0) {
4377 return ret;
4378 }
4379 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4380 }
4381 if (zone_iter != get_zonegroup().zones.end()) {
4382 zone_public_config = zone_iter->second;
4383 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4384 } else {
4385 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4386 return -EINVAL;
4387 }
4388
4389 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4390
4391 if (run_sync_thread) {
4392 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4393 if (ret < 0) {
4394 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4395 return ret;
4396 }
4397 }
4398
4399 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4400
4401 init_unique_trans_id_deps();
4402
4403 finisher = new Finisher(cct);
4404 finisher->start();
4405
4406 period_puller.reset(new RGWPeriodPuller(this));
4407 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4408 current_period));
4409
4410 if (need_watch_notify()) {
4411 ret = init_watch();
4412 if (ret < 0) {
4413 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4414 return ret;
4415 }
4416 }
4417
4418 /* first build all zones index */
4419 for (auto ziter : get_zonegroup().zones) {
4420 const string& id = ziter.first;
4421 RGWZone& z = ziter.second;
4422 zone_id_by_name[z.name] = id;
4423 zone_by_id[id] = z;
4424 }
4425
4426 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4427 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4428 }
4429 zone_public_config = zone_by_id[zone_id()];
4430 for (auto ziter : get_zonegroup().zones) {
4431 const string& id = ziter.first;
4432 RGWZone& z = ziter.second;
4433 if (id == zone_id()) {
4434 continue;
4435 }
4436 if (z.endpoints.empty()) {
4437 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4438 continue;
4439 }
4440 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4441 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4442 zone_conn_map[id] = conn;
4443 if (zone_syncs_from(zone_public_config, z) ||
4444 zone_syncs_from(z, zone_public_config)) {
4445 if (zone_syncs_from(zone_public_config, z)) {
4446 zone_data_sync_from_map[id] = conn;
4447 }
4448 if (zone_syncs_from(z, zone_public_config)) {
4449 zone_data_notify_to_map[id] = conn;
4450 }
4451 } else {
4452 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4453 }
4454 }
4455
4456 ret = open_root_pool_ctx();
4457 if (ret < 0)
4458 return ret;
4459
4460 ret = open_gc_pool_ctx();
4461 if (ret < 0)
4462 return ret;
4463
4464 ret = open_lc_pool_ctx();
4465 if (ret < 0)
4466 return ret;
4467
4468 ret = open_objexp_pool_ctx();
4469 if (ret < 0)
4470 return ret;
4471
4472 ret = open_reshard_pool_ctx();
4473 if (ret < 0)
4474 return ret;
4475
4476 pools_initialized = true;
4477
4478 gc = new RGWGC();
4479 gc->initialize(cct, this);
4480
4481 obj_expirer = new RGWObjectExpirer(this);
4482
4483 if (use_gc_thread) {
4484 gc->start_processor();
4485 obj_expirer->start_processor();
4486 }
4487
4488 if (run_sync_thread) {
4489 // initialize the log period history. we want to do this any time we're not
4490 // running under radosgw-admin, so we check run_sync_thread here before
4491 // disabling it based on the zone/zonegroup setup
4492 meta_mgr->init_oldest_log_period();
4493 }
4494
4495 /* no point of running sync thread if we don't have a master zone configured
4496 or there is no rest_master_conn */
4497 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4498 || current_period.get_id().empty()) {
4499 run_sync_thread = false;
4500 }
4501
4502 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4503 async_rados->start();
4504
4505 ret = meta_mgr->init(current_period.get_id());
4506 if (ret < 0) {
4507 lderr(cct) << "ERROR: failed to initialize metadata log: "
4508 << cpp_strerror(-ret) << dendl;
4509 return ret;
4510 }
4511
4512 if (is_meta_master()) {
4513 auto md_log = meta_mgr->get_log(current_period.get_id());
4514 meta_notifier = new RGWMetaNotifier(this, md_log);
4515 meta_notifier->start();
4516 }
4517
4518 if (run_sync_thread) {
4519 Mutex::Locker l(meta_sync_thread_lock);
4520 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4521 ret = meta_sync_processor_thread->init();
4522 if (ret < 0) {
4523 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4524 return ret;
4525 }
4526 meta_sync_processor_thread->start();
4527
4528 Mutex::Locker dl(data_sync_thread_lock);
4529 for (auto iter : zone_data_sync_from_map) {
4530 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4531 RGWDataSyncProcessorThread *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
4532 ret = thread->init();
4533 if (ret < 0) {
4534 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4535 return ret;
4536 }
4537 thread->start();
4538 data_sync_processor_threads[iter.first] = thread;
4539 }
4540 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4541 if (interval > 0) {
4542 sync_log_trimmer = new RGWSyncLogTrimThread(this, interval);
4543 ret = sync_log_trimmer->init();
4544 if (ret < 0) {
4545 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4546 return ret;
4547 }
4548 sync_log_trimmer->start();
4549 }
4550 }
4551 data_notifier = new RGWDataNotifier(this);
4552 data_notifier->start();
4553
4554 lc = new RGWLC();
4555 lc->initialize(cct, this);
4556
4557 if (use_lc_thread)
4558 lc->start_processor();
4559
4560 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4561
4562 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4563 get_zone().bucket_index_max_shards);
4564 if (bucket_index_max_shards > get_max_bucket_shards()) {
4565 bucket_index_max_shards = get_max_bucket_shards();
4566 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4567 << get_max_bucket_shards() << dendl;
4568 }
4569 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4570
4571 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4572 binfo_cache->init(this);
4573
4574 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4575
4576 if (need_tombstone_cache) {
4577 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4578 }
4579
4580 reshard_wait = std::make_shared<RGWReshardWait>(this);
4581
4582 reshard = new RGWReshard(this);
4583
4584 /* only the master zone in the zonegroup reshards buckets */
4585 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4586 if (run_reshard_thread) {
4587 reshard->start_processor();
4588 }
4589
4590 index_completion_manager = new RGWIndexCompletionManager(this);
4591 ret = index_completion_manager->start();
4592
4593 return ret;
4594 }
4595
4596 /**
4597 * Initialize the RADOS instance and prepare to do other ops
4598 * Returns 0 on success, -ERR# on failure.
4599 */
4600 int RGWRados::initialize()
4601 {
4602 int ret;
4603
4604 ret = init_rados();
4605 if (ret < 0)
4606 return ret;
4607
4608 return init_complete();
4609 }
4610
4611 void RGWRados::finalize_watch()
4612 {
4613 for (int i = 0; i < num_watchers; i++) {
4614 RGWWatcher *watcher = watchers[i];
4615 watcher->unregister_watch();
4616 delete watcher;
4617 }
4618
4619 delete[] notify_oids;
4620 delete[] watchers;
4621 }
4622
4623 void RGWRados::schedule_context(Context *c) {
4624 finisher->queue(c);
4625 }
4626
4627 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4628 {
4629 bool is_truncated;
4630 RGWListRawObjsCtx ctx;
4631 do {
4632 list<string> oids;
4633 int r = list_raw_objects(pool, prefix, 1000,
4634 ctx, oids, &is_truncated);
4635 if (r < 0) {
4636 return r;
4637 }
4638 list<string>::iterator iter;
4639 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4640 string& val = *iter;
4641 if (val.size() > prefix.size())
4642 result.push_back(val.substr(prefix.size()));
4643 }
4644 } while (is_truncated);
4645
4646 return 0;
4647 }
4648
4649 int RGWRados::list_regions(list<string>& regions)
4650 {
4651 RGWZoneGroup zonegroup;
4652
4653 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4654 }
4655
4656 int RGWRados::list_zonegroups(list<string>& zonegroups)
4657 {
4658 RGWZoneGroup zonegroup;
4659
4660 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4661 }
4662
4663 int RGWRados::list_zones(list<string>& zones)
4664 {
4665 RGWZoneParams zoneparams;
4666
4667 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4668 }
4669
4670 int RGWRados::list_realms(list<string>& realms)
4671 {
4672 RGWRealm realm(cct, this);
4673 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4674 }
4675
4676 int RGWRados::list_periods(list<string>& periods)
4677 {
4678 RGWPeriod period;
4679 list<string> raw_periods;
4680 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4681 if (ret < 0) {
4682 return ret;
4683 }
4684 for (const auto& oid : raw_periods) {
4685 size_t pos = oid.find(".");
4686 if (pos != std::string::npos) {
4687 periods.push_back(oid.substr(0, pos));
4688 } else {
4689 periods.push_back(oid);
4690 }
4691 }
4692 periods.sort(); // unique() only detects duplicates if they're adjacent
4693 periods.unique();
4694 return 0;
4695 }
4696
4697
4698 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4699 {
4700 int ret = 0;
4701 string period_id = current_period;
4702 while(!period_id.empty()) {
4703 RGWPeriod period(period_id);
4704 ret = period.init(cct, this);
4705 if (ret < 0) {
4706 return ret;
4707 }
4708 periods.push_back(period.get_id());
4709 period_id = period.get_predecessor();
4710 }
4711
4712 return ret;
4713 }
4714
4715 /**
4716 * Open the pool used as root for this gateway
4717 * Returns: 0 on success, -ERR# otherwise.
4718 */
4719 int RGWRados::open_root_pool_ctx()
4720 {
4721 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4722 }
4723
4724 int RGWRados::open_gc_pool_ctx()
4725 {
4726 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4727 }
4728
4729 int RGWRados::open_lc_pool_ctx()
4730 {
4731 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4732 }
4733
4734 int RGWRados::open_objexp_pool_ctx()
4735 {
4736 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4737 }
4738
4739 int RGWRados::open_reshard_pool_ctx()
4740 {
4741 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4742 }
4743
4744 int RGWRados::init_watch()
4745 {
4746 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4747 if (r < 0) {
4748 return r;
4749 }
4750
4751 num_watchers = cct->_conf->rgw_num_control_oids;
4752
4753 bool compat_oid = (num_watchers == 0);
4754
4755 if (num_watchers <= 0)
4756 num_watchers = 1;
4757
4758 notify_oids = new string[num_watchers];
4759 watchers = new RGWWatcher *[num_watchers];
4760
4761 for (int i=0; i < num_watchers; i++) {
4762 string& notify_oid = notify_oids[i];
4763 notify_oid = notify_oid_prefix;
4764 if (!compat_oid) {
4765 char buf[16];
4766 snprintf(buf, sizeof(buf), ".%d", i);
4767 notify_oid.append(buf);
4768 }
4769 r = control_pool_ctx.create(notify_oid, false);
4770 if (r < 0 && r != -EEXIST)
4771 return r;
4772
4773 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4774 watchers[i] = watcher;
4775
4776 r = watcher->register_watch();
4777 if (r < 0)
4778 return r;
4779 }
4780
4781 watch_initialized = true;
4782
4783 set_cache_enabled(true);
4784
4785 return 0;
4786 }
4787
4788 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4789 {
4790 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4791
4792 int i = r % num_watchers;
4793 char buf[16];
4794 snprintf(buf, sizeof(buf), ".%d", i);
4795
4796 notify_oid = notify_oid_prefix;
4797 notify_oid.append(buf);
4798 }
4799
4800 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4801 {
4802 librados::Rados *rad = get_rados_handle();
4803 int r = rgw_init_ioctx(rad, pool, io_ctx);
4804 if (r != -ENOENT)
4805 return r;
4806
4807 if (!pools_initialized)
4808 return r;
4809
4810 r = rad->pool_create(pool.name.c_str());
4811 if (r < 0 && r != -EEXIST)
4812 return r;
4813
4814 return rgw_init_ioctx(rad, pool, io_ctx);
4815 }
4816
4817 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4818 string *marker) {
4819 if (marker) {
4820 *marker = shard_id_str;
4821 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4822 marker->append(shard_marker);
4823 }
4824 }
4825
4826 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4827 {
4828 const string *rule = &bucket_info.placement_rule;
4829 if (rule->empty()) {
4830 rule = &zonegroup.default_placement;
4831 }
4832 auto iter = zone_params.placement_pools.find(*rule);
4833 if (iter == zone_params.placement_pools.end()) {
4834 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4835 return -EINVAL;
4836 }
4837
4838 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4839 if (r < 0)
4840 return r;
4841
4842 return 0;
4843 }
4844
4845 /**
4846 * set up a bucket listing.
4847 * handle is filled in.
4848 * Returns 0 on success, -ERR# otherwise.
4849 */
4850 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4851 {
4852 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4853 *handle = (RGWAccessHandle)state;
4854 return 0;
4855 }
4856
4857 /**
4858 * get the next bucket in the listing.
4859 * obj is filled in,
4860 * handle is updated.
4861 * returns 0 on success, -ERR# otherwise.
4862 */
4863 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4864 {
4865 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4866
4867 do {
4868 if (*state == root_pool_ctx.nobjects_end()) {
4869 delete state;
4870 return -ENOENT;
4871 }
4872
4873 obj.key.name = (*state)->get_oid();
4874 if (obj.key.name[0] == '_') {
4875 obj.key.name = obj.key.name.substr(1);
4876 }
4877
4878 (*state)++;
4879 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4880
4881 return 0;
4882 }
4883
4884
4885 /**** logs ****/
4886
4887 struct log_list_state {
4888 string prefix;
4889 librados::IoCtx io_ctx;
4890 librados::NObjectIterator obit;
4891 };
4892
4893 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4894 {
4895 log_list_state *state = new log_list_state;
4896 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4897 if (r < 0) {
4898 delete state;
4899 return r;
4900 }
4901 state->prefix = prefix;
4902 state->obit = state->io_ctx.nobjects_begin();
4903 *handle = (RGWAccessHandle)state;
4904 return 0;
4905 }
4906
4907 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4908 {
4909 log_list_state *state = static_cast<log_list_state *>(handle);
4910 while (true) {
4911 if (state->obit == state->io_ctx.nobjects_end()) {
4912 delete state;
4913 return -ENOENT;
4914 }
4915 if (state->prefix.length() &&
4916 state->obit->get_oid().find(state->prefix) != 0) {
4917 state->obit++;
4918 continue;
4919 }
4920 *name = state->obit->get_oid();
4921 state->obit++;
4922 break;
4923 }
4924 return 0;
4925 }
4926
4927 int RGWRados::log_remove(const string& name)
4928 {
4929 librados::IoCtx io_ctx;
4930 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4931 if (r < 0)
4932 return r;
4933 return io_ctx.remove(name);
4934 }
4935
4936 struct log_show_state {
4937 librados::IoCtx io_ctx;
4938 bufferlist bl;
4939 bufferlist::iterator p;
4940 string name;
4941 uint64_t pos;
4942 bool eof;
4943 log_show_state() : pos(0), eof(false) {}
4944 };
4945
4946 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
4947 {
4948 log_show_state *state = new log_show_state;
4949 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4950 if (r < 0) {
4951 delete state;
4952 return r;
4953 }
4954 state->name = name;
4955 *handle = (RGWAccessHandle)state;
4956 return 0;
4957 }
4958
4959 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
4960 {
4961 log_show_state *state = static_cast<log_show_state *>(handle);
4962 off_t off = state->p.get_off();
4963
4964 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
4965 << " off " << off
4966 << " eof " << (int)state->eof
4967 << dendl;
4968 // read some?
4969 unsigned chunk = 1024*1024;
4970 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
4971 bufferlist more;
4972 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
4973 if (r < 0)
4974 return r;
4975 state->pos += r;
4976 bufferlist old;
4977 try {
4978 old.substr_of(state->bl, off, state->bl.length() - off);
4979 } catch (buffer::error& err) {
4980 return -EINVAL;
4981 }
4982 state->bl.clear();
4983 state->bl.claim(old);
4984 state->bl.claim_append(more);
4985 state->p = state->bl.begin();
4986 if ((unsigned)r < chunk)
4987 state->eof = true;
4988 ldout(cct, 10) << " read " << r << dendl;
4989 }
4990
4991 if (state->p.end())
4992 return 0; // end of file
4993 try {
4994 ::decode(*entry, state->p);
4995 }
4996 catch (const buffer::error &e) {
4997 return -EINVAL;
4998 }
4999 return 1;
5000 }
5001
5002 /**
5003 * usage_log_hash: get usage log key hash, based on name and index
5004 *
5005 * Get the usage object name. Since a user may have more than 1
5006 * object holding that info (multiple shards), we use index to
5007 * specify that shard number. Once index exceeds max shards it
5008 * wraps.
5009 * If name is not being set, results for all users will be returned
5010 * and index will wrap only after total shards number.
5011 *
5012 * @param cct [in] ceph context
5013 * @param name [in] user name
5014 * @param hash [out] hash value
5015 * @param index [in] shard index number
5016 */
5017 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5018 {
5019 uint32_t val = index;
5020
5021 if (!name.empty()) {
5022 int max_user_shards = max(cct->_conf->rgw_usage_max_user_shards, 1);
5023 val %= max_user_shards;
5024 val += ceph_str_hash_linux(name.c_str(), name.size());
5025 }
5026 char buf[17];
5027 int max_shards = max(cct->_conf->rgw_usage_max_shards, 1);
5028 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5029 hash = buf;
5030 }
5031
5032 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5033 {
5034 uint32_t index = 0;
5035
5036 map<string, rgw_usage_log_info> log_objs;
5037
5038 string hash;
5039 string last_user;
5040
5041 /* restructure usage map, zone by object hash */
5042 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5043 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5044 const rgw_user_bucket& ub = iter->first;
5045 RGWUsageBatch& info = iter->second;
5046
5047 if (ub.user.empty()) {
5048 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5049 continue;
5050 }
5051
5052 if (ub.user != last_user) {
5053 /* index *should* be random, but why waste extra cycles
5054 in most cases max user shards is not going to exceed 1,
5055 so just incrementing it */
5056 usage_log_hash(cct, ub.user, hash, index++);
5057 }
5058 last_user = ub.user;
5059 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5060
5061 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5062 v.push_back(miter->second);
5063 }
5064 }
5065
5066 map<string, rgw_usage_log_info>::iterator liter;
5067
5068 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5069 int r = cls_obj_usage_log_add(liter->first, liter->second);
5070 if (r < 0)
5071 return r;
5072 }
5073 return 0;
5074 }
5075
5076 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5077 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5078 {
5079 uint32_t num = max_entries;
5080 string hash, first_hash;
5081 string user_str = user.to_str();
5082 usage_log_hash(cct, user_str, first_hash, 0);
5083
5084 if (usage_iter.index) {
5085 usage_log_hash(cct, user_str, hash, usage_iter.index);
5086 } else {
5087 hash = first_hash;
5088 }
5089
5090 usage.clear();
5091
5092 do {
5093 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5094 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5095
5096 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5097 usage_iter.read_iter, ret_usage, is_truncated);
5098 if (ret == -ENOENT)
5099 goto next;
5100
5101 if (ret < 0)
5102 return ret;
5103
5104 num -= ret_usage.size();
5105
5106 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5107 usage[iter->first].aggregate(iter->second);
5108 }
5109
5110 next:
5111 if (!*is_truncated) {
5112 usage_iter.read_iter.clear();
5113 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5114 }
5115 } while (num && !*is_truncated && hash != first_hash);
5116 return 0;
5117 }
5118
5119 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5120 {
5121 uint32_t index = 0;
5122 string hash, first_hash;
5123 string user_str = user.to_str();
5124 usage_log_hash(cct, user_str, first_hash, index);
5125
5126 hash = first_hash;
5127
5128 do {
5129 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5130 if (ret == -ENOENT)
5131 goto next;
5132
5133 if (ret < 0)
5134 return ret;
5135
5136 next:
5137 usage_log_hash(cct, user_str, hash, ++index);
5138 } while (hash != first_hash);
5139
5140 return 0;
5141 }
5142
5143 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5144 {
5145 return rgw_shards_hash(key, max_shards);
5146 }
5147
5148 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5149 {
5150 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5151 char buf[16];
5152 if (shard_id) {
5153 *shard_id = val % max_shards;
5154 }
5155 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5156 name = prefix + buf;
5157 }
5158
5159 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5160 {
5161 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5162 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5163 char buf[16];
5164 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5165 name = prefix + buf;
5166 }
5167
5168 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5169 {
5170 char buf[16];
5171 snprintf(buf, sizeof(buf), "%u", shard_id);
5172 name = prefix + buf;
5173
5174 }
5175
5176 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5177 {
5178 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5179 }
5180
5181 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5182 {
5183 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5184
5185 }
5186
5187 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5188 {
5189 librados::IoCtx io_ctx;
5190
5191 int r = time_log_add_init(io_ctx);
5192 if (r < 0) {
5193 return r;
5194 }
5195
5196 ObjectWriteOperation op;
5197 utime_t t(ut);
5198 cls_log_add(op, t, section, key, bl);
5199
5200 return io_ctx.operate(oid, &op);
5201 }
5202
5203 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5204 librados::AioCompletion *completion, bool monotonic_inc)
5205 {
5206 librados::IoCtx io_ctx;
5207
5208 int r = time_log_add_init(io_ctx);
5209 if (r < 0) {
5210 return r;
5211 }
5212
5213 ObjectWriteOperation op;
5214 cls_log_add(op, entries, monotonic_inc);
5215
5216 if (!completion) {
5217 r = io_ctx.operate(oid, &op);
5218 } else {
5219 r = io_ctx.aio_operate(oid, completion, &op);
5220 }
5221 return r;
5222 }
5223
5224 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5225 int max_entries, list<cls_log_entry>& entries,
5226 const string& marker,
5227 string *out_marker,
5228 bool *truncated)
5229 {
5230 librados::IoCtx io_ctx;
5231
5232 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5233 if (r < 0)
5234 return r;
5235 librados::ObjectReadOperation op;
5236
5237 utime_t st(start_time);
5238 utime_t et(end_time);
5239
5240 cls_log_list(op, st, et, marker, max_entries, entries,
5241 out_marker, truncated);
5242
5243 bufferlist obl;
5244
5245 int ret = io_ctx.operate(oid, &op, &obl);
5246 if (ret < 0)
5247 return ret;
5248
5249 return 0;
5250 }
5251
5252 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5253 {
5254 librados::IoCtx io_ctx;
5255
5256 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5257 if (r < 0)
5258 return r;
5259 librados::ObjectReadOperation op;
5260
5261 cls_log_info(op, header);
5262
5263 bufferlist obl;
5264
5265 int ret = io_ctx.operate(oid, &op, &obl);
5266 if (ret < 0)
5267 return ret;
5268
5269 return 0;
5270 }
5271
5272 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5273 {
5274 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5275 if (r < 0)
5276 return r;
5277
5278 librados::ObjectReadOperation op;
5279
5280 cls_log_info(op, header);
5281
5282 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5283 if (ret < 0)
5284 return ret;
5285
5286 return 0;
5287 }
5288
5289 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5290 const string& from_marker, const string& to_marker,
5291 librados::AioCompletion *completion)
5292 {
5293 librados::IoCtx io_ctx;
5294
5295 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5296 if (r < 0)
5297 return r;
5298
5299 utime_t st(start_time);
5300 utime_t et(end_time);
5301
5302 ObjectWriteOperation op;
5303 cls_log_trim(op, st, et, from_marker, to_marker);
5304
5305 if (!completion) {
5306 r = io_ctx.operate(oid, &op);
5307 } else {
5308 r = io_ctx.aio_operate(oid, completion, &op);
5309 }
5310 return r;
5311 }
5312
5313 string RGWRados::objexp_hint_get_shardname(int shard_num)
5314 {
5315 char buf[32];
5316 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5317
5318 string objname("obj_delete_at_hint.");
5319 return objname + buf;
5320 }
5321
5322 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5323 {
5324 string obj_key = key.name + key.instance;
5325 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5326 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
5327 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
5328 sid = rgw_shards_mod(sid2, num_shards);
5329 return sid;
5330 }
5331
5332 static string objexp_hint_get_keyext(const string& tenant_name,
5333 const string& bucket_name,
5334 const string& bucket_id,
5335 const rgw_obj_key& obj_key)
5336 {
5337 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5338 ":" + obj_key.name + ":" + obj_key.instance;
5339 }
5340
5341 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5342 const string& tenant_name,
5343 const string& bucket_name,
5344 const string& bucket_id,
5345 const rgw_obj_index_key& obj_key)
5346 {
5347 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5348 bucket_id, obj_key);
5349 objexp_hint_entry he = {
5350 .tenant = tenant_name,
5351 .bucket_name = bucket_name,
5352 .bucket_id = bucket_id,
5353 .obj_key = obj_key,
5354 .exp_time = delete_at };
5355 bufferlist hebl;
5356 ::encode(he, hebl);
5357 ObjectWriteOperation op;
5358 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5359
5360 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5361 return objexp_pool_ctx.operate(shard_name, &op);
5362 }
5363
5364 void RGWRados::objexp_get_shard(int shard_num,
5365 string& shard) /* out */
5366 {
5367 shard = objexp_hint_get_shardname(shard_num);
5368 }
5369
5370 int RGWRados::objexp_hint_list(const string& oid,
5371 const ceph::real_time& start_time,
5372 const ceph::real_time& end_time,
5373 const int max_entries,
5374 const string& marker,
5375 list<cls_timeindex_entry>& entries, /* out */
5376 string *out_marker, /* out */
5377 bool *truncated) /* out */
5378 {
5379 librados::ObjectReadOperation op;
5380 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5381 out_marker, truncated);
5382
5383 bufferlist obl;
5384 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5385
5386 if ((ret < 0 ) && (ret != -ENOENT)) {
5387 return ret;
5388 }
5389
5390 if ((ret == -ENOENT) && truncated) {
5391 *truncated = false;
5392 }
5393
5394 return 0;
5395 }
5396
5397 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5398 objexp_hint_entry& hint_entry) /* out */
5399 {
5400 try {
5401 bufferlist::iterator iter = ti_entry.value.begin();
5402 ::decode(hint_entry, iter);
5403 } catch (buffer::error& err) {
5404 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5405 }
5406
5407 return 0;
5408 }
5409
5410 int RGWRados::objexp_hint_trim(const string& oid,
5411 const ceph::real_time& start_time,
5412 const ceph::real_time& end_time,
5413 const string& from_marker,
5414 const string& to_marker)
5415 {
5416 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5417 from_marker, to_marker);
5418 if ((ret < 0 ) && (ret != -ENOENT)) {
5419 return ret;
5420 }
5421
5422 return 0;
5423 }
5424
5425 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5426 string& zone_id, string& owner_id) {
5427 librados::IoCtx io_ctx;
5428
5429 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5430 if (r < 0) {
5431 return r;
5432 }
5433 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5434 utime_t ut(msec / 1000, msec % 1000);
5435
5436 rados::cls::lock::Lock l(log_lock_name);
5437 l.set_duration(ut);
5438 l.set_cookie(owner_id);
5439 l.set_tag(zone_id);
5440 l.set_renew(true);
5441
5442 return l.lock_exclusive(&io_ctx, oid);
5443 }
5444
5445 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5446 librados::IoCtx io_ctx;
5447
5448 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5449 if (r < 0) {
5450 return r;
5451 }
5452
5453 rados::cls::lock::Lock l(log_lock_name);
5454 l.set_tag(zone_id);
5455 l.set_cookie(owner_id);
5456
5457 return l.unlock(&io_ctx, oid);
5458 }
5459
5460 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5461 {
5462 bufferlist::iterator i = bl.begin();
5463 RGWAccessControlPolicy policy(cct);
5464 try {
5465 policy.decode_owner(i);
5466 } catch (buffer::error& err) {
5467 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5468 return -EIO;
5469 }
5470 *owner = policy.get_owner();
5471 return 0;
5472 }
5473
5474 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5475 {
5476 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5477 if (aiter == attrset.end())
5478 return -EIO;
5479
5480 bufferlist& bl = aiter->second;
5481 bufferlist::iterator iter = bl.begin();
5482 try {
5483 policy->decode(iter);
5484 } catch (buffer::error& err) {
5485 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5486 return -EIO;
5487 }
5488 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5489 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5490 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5491 s3policy->to_xml(*_dout);
5492 *_dout << dendl;
5493 }
5494 return 0;
5495 }
5496
5497
5498 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5499 {
5500 rgw_bucket bucket = bucket_info.bucket;
5501 bucket.update_bucket_id(new_bucket_id);
5502
5503 RGWObjectCtx obj_ctx(store);
5504
5505 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5506 if (ret < 0) {
5507 return ret;
5508 }
5509
5510 return 0;
5511 }
5512
5513 /**
5514 * get listing of the objects in a bucket.
5515 *
5516 * max: maximum number of results to return
5517 * bucket: bucket to list contents of
5518 * prefix: only return results that match this prefix
5519 * delim: do not include results that match this string.
5520 * Any skipped results will have the matching portion of their name
5521 * inserted in common_prefixes with a "true" mark.
5522 * marker: if filled in, begin the listing with this object.
5523 * end_marker: if filled in, end the listing with this object.
5524 * result: the objects are put in here.
5525 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5526 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5527 */
5528 int RGWRados::Bucket::List::list_objects(int max, vector<rgw_bucket_dir_entry> *result,
5529 map<string, bool> *common_prefixes,
5530 bool *is_truncated)
5531 {
5532 RGWRados *store = target->get_store();
5533 CephContext *cct = store->ctx();
5534 int shard_id = target->get_shard_id();
5535
5536 int count = 0;
5537 bool truncated = true;
5538 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5539
5540 result->clear();
5541
5542 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5543
5544 rgw_obj_key end_marker_obj;
5545 rgw_obj_index_key cur_end_marker;
5546 if (!params.ns.empty()) {
5547 end_marker_obj = rgw_obj_key(params.end_marker.name, params.end_marker.instance, params.ns);
5548 end_marker_obj.ns = params.ns;
5549 end_marker_obj.get_index_key(&cur_end_marker);
5550 }
5551 rgw_obj_index_key cur_marker;
5552 marker_obj.get_index_key(&cur_marker);
5553
5554 const bool cur_end_marker_valid = !params.end_marker.empty();
5555
5556 rgw_obj_key prefix_obj(params.prefix);
5557 prefix_obj.ns = params.ns;
5558 string cur_prefix = prefix_obj.get_index_key_name();
5559
5560 string bigger_than_delim;
5561
5562 if (!params.delim.empty()) {
5563 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
5564 char buf[params.delim.size() + 16];
5565 int r = encode_utf8(val + 1, (unsigned char *)buf);
5566 if (r < 0) {
5567 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5568 return -EINVAL;
5569 }
5570 buf[r] = '\0';
5571
5572 bigger_than_delim = buf;
5573
5574 /* if marker points at a common prefix, fast forward it into its upperbound string */
5575 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5576 if (delim_pos >= 0) {
5577 string s = cur_marker.name.substr(0, delim_pos);
5578 s.append(bigger_than_delim);
5579 cur_marker = s;
5580 }
5581 }
5582
5583 string skip_after_delim;
5584 while (truncated && count <= max) {
5585 if (skip_after_delim > cur_marker.name) {
5586 cur_marker = skip_after_delim;
5587 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5588 }
5589 std::map<string, rgw_bucket_dir_entry> ent_map;
5590 int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
5591 read_ahead + 1 - count, params.list_versions, ent_map,
5592 &truncated, &cur_marker);
5593 if (r < 0)
5594 return r;
5595
5596 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
5597 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5598 rgw_bucket_dir_entry& entry = eiter->second;
5599 rgw_obj_index_key index_key = entry.key;
5600
5601 rgw_obj_key obj(index_key);
5602
5603 /* note that parse_raw_oid() here will not set the correct object's instance, as
5604 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5605 * not needed for the checks here and we end up using the raw entry for the return vector
5606 */
5607 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5608 if (!valid) {
5609 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5610 continue;
5611 }
5612 bool check_ns = (obj.ns == params.ns);
5613 if (!params.list_versions && !entry.is_visible()) {
5614 continue;
5615 }
5616
5617 if (params.enforce_ns && !check_ns) {
5618 if (!params.ns.empty()) {
5619 /* we've iterated past the namespace we're searching -- done now */
5620 truncated = false;
5621 goto done;
5622 }
5623
5624 /* we're not looking at the namespace this object is in, next! */
5625 continue;
5626 }
5627
5628 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5629 truncated = false;
5630 goto done;
5631 }
5632
5633 if (count < max) {
5634 params.marker = index_key;
5635 next_marker = index_key;
5636 }
5637
5638 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5639 continue;
5640
5641 if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5642 continue;
5643
5644 if (!params.delim.empty()) {
5645 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5646
5647 if (delim_pos >= 0) {
5648 string prefix_key = obj.name.substr(0, delim_pos + 1);
5649
5650 if (common_prefixes &&
5651 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5652 if (count >= max) {
5653 truncated = true;
5654 goto done;
5655 }
5656 next_marker = prefix_key;
5657 (*common_prefixes)[prefix_key] = true;
5658
5659 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5660
5661 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
5662 skip_after_delim.append(bigger_than_delim);
5663
5664 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5665
5666 count++;
5667 }
5668
5669 continue;
5670 }
5671 }
5672
5673 if (count >= max) {
5674 truncated = true;
5675 goto done;
5676 }
5677
5678 result->emplace_back(std::move(entry));
5679 count++;
5680 }
5681
5682 // Either the back-end telling us truncated, or we don't consume all
5683 // items returned per the amount caller request
5684 truncated = (truncated || eiter != ent_map.end());
5685 }
5686
5687 done:
5688 if (is_truncated)
5689 *is_truncated = truncated;
5690
5691 return 0;
5692 }
5693
5694 /**
5695 * create a rados pool, associated meta info
5696 * returns 0 on success, -ERR# otherwise.
5697 */
5698 int RGWRados::create_pool(const rgw_pool& pool)
5699 {
5700 int ret = 0;
5701
5702 librados::Rados *rad = get_rados_handle();
5703 ret = rad->pool_create(pool.name.c_str(), 0);
5704 if (ret == -EEXIST)
5705 ret = 0;
5706 else if (ret == -ERANGE) {
5707 ldout(cct, 0)
5708 << __func__
5709 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
5710 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5711 << dendl;
5712 }
5713 if (ret < 0)
5714 return ret;
5715
5716 return 0;
5717 }
5718
5719 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5720 {
5721 librados::IoCtx index_ctx; // context for new bucket
5722
5723 string dir_oid = dir_oid_prefix;
5724 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5725 if (r < 0) {
5726 return r;
5727 }
5728
5729 dir_oid.append(bucket_info.bucket.bucket_id);
5730
5731 map<int, string> bucket_objs;
5732 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5733
5734 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5735 }
5736
5737 void RGWRados::create_bucket_id(string *bucket_id)
5738 {
5739 uint64_t iid = instance_id();
5740 uint64_t bid = next_bucket_id();
5741 char buf[get_zone_params().get_id().size() + 48];
5742 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5743 *bucket_id = buf;
5744 }
5745
5746 /**
5747 * create a bucket with name bucket and the given list of attrs
5748 * returns 0 on success, -ERR# otherwise.
5749 */
5750 int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5751 const string& zonegroup_id,
5752 const string& placement_rule,
5753 const string& swift_ver_location,
5754 const RGWQuotaInfo * pquota_info,
5755 map<std::string, bufferlist>& attrs,
5756 RGWBucketInfo& info,
5757 obj_version *pobjv,
5758 obj_version *pep_objv,
5759 real_time creation_time,
5760 rgw_bucket *pmaster_bucket,
5761 uint32_t *pmaster_num_shards,
5762 bool exclusive)
5763 {
5764 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5765 string selected_placement_rule_name;
5766 RGWZonePlacementInfo rule_info;
5767
5768 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5769 int ret = 0;
5770 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5771 &selected_placement_rule_name, &rule_info);
5772 if (ret < 0)
5773 return ret;
5774
5775 if (!pmaster_bucket) {
5776 create_bucket_id(&bucket.marker);
5777 bucket.bucket_id = bucket.marker;
5778 } else {
5779 bucket.marker = pmaster_bucket->marker;
5780 bucket.bucket_id = pmaster_bucket->bucket_id;
5781 }
5782
5783 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5784
5785 if (pobjv) {
5786 objv_tracker.write_version = *pobjv;
5787 } else {
5788 objv_tracker.generate_new_write_ver(cct);
5789 }
5790
5791 info.bucket = bucket;
5792 info.owner = owner.user_id;
5793 info.zonegroup = zonegroup_id;
5794 info.placement_rule = selected_placement_rule_name;
5795 info.index_type = rule_info.index_type;
5796 info.swift_ver_location = swift_ver_location;
5797 info.swift_versioning = (!swift_ver_location.empty());
5798 if (pmaster_num_shards) {
5799 info.num_shards = *pmaster_num_shards;
5800 } else {
5801 info.num_shards = bucket_index_max_shards;
5802 }
5803 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5804 info.requester_pays = false;
5805 if (real_clock::is_zero(creation_time)) {
5806 info.creation_time = ceph::real_clock::now();
5807 } else {
5808 info.creation_time = creation_time;
5809 }
5810 if (pquota_info) {
5811 info.quota = *pquota_info;
5812 }
5813
5814 int r = init_bucket_index(info, info.num_shards);
5815 if (r < 0) {
5816 return r;
5817 }
5818
5819 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
5820 if (ret == -EEXIST) {
5821 librados::IoCtx index_ctx;
5822 map<int, string> bucket_objs;
5823 int r = open_bucket_index(info, index_ctx, bucket_objs);
5824 if (r < 0)
5825 return r;
5826
5827 /* we need to reread the info and return it, caller will have a use for it */
5828 RGWObjVersionTracker instance_ver = info.objv_tracker;
5829 info.objv_tracker.clear();
5830 RGWObjectCtx obj_ctx(this);
5831 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
5832 if (r < 0) {
5833 if (r == -ENOENT) {
5834 continue;
5835 }
5836 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
5837 return r;
5838 }
5839
5840 /* only remove it if it's a different bucket instance */
5841 if (info.bucket.bucket_id != bucket.bucket_id) {
5842 /* remove bucket meta instance */
5843 string entry = bucket.get_key();
5844 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
5845 if (r < 0)
5846 return r;
5847
5848 map<int, string>::const_iterator biter;
5849 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
5850 // Do best effort removal
5851 index_ctx.remove(biter->second);
5852 }
5853 }
5854 /* ret == -ENOENT here */
5855 }
5856 return ret;
5857 }
5858
5859 /* this is highly unlikely */
5860 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
5861 return -ENOENT;
5862 }
5863
5864 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5865 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5866
5867 {
5868 /* first check that rule exists within the specific zonegroup */
5869 RGWZoneGroup zonegroup;
5870 int ret = get_zonegroup(zonegroup_id, zonegroup);
5871 if (ret < 0) {
5872 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5873 return ret;
5874 }
5875
5876 /* now check that tag exists within zonegroup */
5877 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5878 string rule = request_rule;
5879 if (rule.empty()) {
5880 rule = user_info.default_placement;
5881 if (rule.empty())
5882 rule = zonegroup.default_placement;
5883 }
5884
5885 if (rule.empty()) {
5886 ldout(cct, 0) << "misconfiguration, should not have an empty placement rule name" << dendl;
5887 return -EIO;
5888 }
5889
5890 map<string, RGWZoneGroupPlacementTarget>::iterator titer = zonegroup.placement_targets.find(rule);
5891 if (titer == zonegroup.placement_targets.end()) {
5892 ldout(cct, 0) << "could not find placement rule " << rule << " within zonegroup " << dendl;
5893 return -EINVAL;
5894 }
5895
5896 /* now check tag for the rule, whether user is permitted to use rule */
5897 RGWZoneGroupPlacementTarget& target_rule = titer->second;
5898 if (!target_rule.user_permitted(user_info.placement_tags)) {
5899 ldout(cct, 0) << "user not permitted to use placement rule" << dendl;
5900 return -EPERM;
5901 }
5902
5903 if (pselected_rule_name)
5904 *pselected_rule_name = rule;
5905
5906 return select_bucket_location_by_rule(rule, rule_info);
5907 }
5908
5909 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5910 {
5911 if (location_rule.empty()) {
5912 /* we can only reach here if we're trying to set a bucket location from a bucket
5913 * created on a different zone, using a legacy / default pool configuration
5914 */
5915 return select_legacy_bucket_placement(rule_info);
5916 }
5917
5918 /*
5919 * make sure that zone has this rule configured. We're
5920 * checking it for the local zone, because that's where this bucket object is going to
5921 * reside.
5922 */
5923 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
5924 if (piter == get_zone_params().placement_pools.end()) {
5925 /* couldn't find, means we cannot really place data for this bucket in this zone */
5926 if (get_zonegroup().equals(zonegroup.get_id())) {
5927 /* that's a configuration error, zone should have that rule, as we're within the requested
5928 * zonegroup */
5929 return -EINVAL;
5930 } else {
5931 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5932 return 0;
5933 }
5934 }
5935
5936 RGWZonePlacementInfo& placement_info = piter->second;
5937
5938 if (rule_info) {
5939 *rule_info = placement_info;
5940 }
5941
5942 return 0;
5943 }
5944
5945 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
5946 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5947 {
5948 if (!get_zone_params().placement_pools.empty()) {
5949 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
5950 pselected_rule_name, rule_info);
5951 }
5952
5953 if (pselected_rule_name) {
5954 pselected_rule_name->clear();
5955 }
5956
5957 return select_legacy_bucket_placement(rule_info);
5958 }
5959
5960 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
5961 {
5962 bufferlist map_bl;
5963 map<string, bufferlist> m;
5964 string pool_name;
5965 bool write_map = false;
5966
5967 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5968
5969 RGWObjectCtx obj_ctx(this);
5970 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
5971 if (ret < 0) {
5972 goto read_omap;
5973 }
5974
5975 try {
5976 bufferlist::iterator iter = map_bl.begin();
5977 ::decode(m, iter);
5978 } catch (buffer::error& err) {
5979 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5980 }
5981
5982 read_omap:
5983 if (m.empty()) {
5984 bufferlist header;
5985 ret = omap_get_all(obj, header, m);
5986
5987 write_map = true;
5988 }
5989
5990 if (ret < 0 || m.empty()) {
5991 vector<rgw_pool> pools;
5992 string s = string("default.") + default_storage_pool_suffix;
5993 pools.push_back(rgw_pool(s));
5994 vector<int> retcodes;
5995 bufferlist bl;
5996 ret = create_pools(pools, retcodes);
5997 if (ret < 0)
5998 return ret;
5999 ret = omap_set(obj, s, bl);
6000 if (ret < 0)
6001 return ret;
6002 m[s] = bl;
6003 }
6004
6005 if (write_map) {
6006 bufferlist new_bl;
6007 ::encode(m, new_bl);
6008 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6009 if (ret < 0) {
6010 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6011 }
6012 }
6013
6014 map<string, bufferlist>::iterator miter;
6015 if (m.size() > 1) {
6016 vector<string> v;
6017 for (miter = m.begin(); miter != m.end(); ++miter) {
6018 v.push_back(miter->first);
6019 }
6020
6021 uint32_t r;
6022 ret = get_random_bytes((char *)&r, sizeof(r));
6023 if (ret < 0)
6024 return ret;
6025
6026 int i = r % v.size();
6027 pool_name = v[i];
6028 } else {
6029 miter = m.begin();
6030 pool_name = miter->first;
6031 }
6032
6033 rule_info->data_pool = pool_name;
6034 rule_info->data_extra_pool = pool_name;
6035 rule_info->index_pool = pool_name;
6036 rule_info->index_type = RGWBIType_Normal;
6037
6038 return 0;
6039 }
6040
6041 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6042 {
6043 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6044 }
6045
6046 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6047 {
6048 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6049
6050 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6051 }
6052
6053 int RGWRados::update_placement_map()
6054 {
6055 bufferlist header;
6056 map<string, bufferlist> m;
6057 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6058 int ret = omap_get_all(obj, header, m);
6059 if (ret < 0)
6060 return ret;
6061
6062 bufferlist new_bl;
6063 ::encode(m, new_bl);
6064 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6065 if (ret < 0) {
6066 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6067 }
6068
6069 return ret;
6070 }
6071
6072 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6073 {
6074 librados::Rados *rad = get_rados_handle();
6075 int ret = rad->pool_lookup(new_pool.name.c_str());
6076 if (ret < 0) // DNE, or something
6077 return ret;
6078
6079 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6080 bufferlist empty_bl;
6081 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6082
6083 // don't care about return value
6084 update_placement_map();
6085
6086 return ret;
6087 }
6088
6089 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6090 {
6091 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6092 int ret = omap_del(obj, old_pool.to_str());
6093
6094 // don't care about return value
6095 update_placement_map();
6096
6097 return ret;
6098 }
6099
6100 int RGWRados::list_placement_set(set<rgw_pool>& names)
6101 {
6102 bufferlist header;
6103 map<string, bufferlist> m;
6104
6105 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6106 int ret = omap_get_all(obj, header, m);
6107 if (ret < 0)
6108 return ret;
6109
6110 names.clear();
6111 map<string, bufferlist>::iterator miter;
6112 for (miter = m.begin(); miter != m.end(); ++miter) {
6113 names.insert(rgw_pool(miter->first));
6114 }
6115
6116 return names.size();
6117 }
6118
6119 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6120 {
6121 vector<librados::PoolAsyncCompletion *> completions;
6122 vector<int> rets;
6123
6124 librados::Rados *rad = get_rados_handle();
6125 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6126 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6127 completions.push_back(c);
6128 rgw_pool& pool = *iter;
6129 int ret = rad->pool_create_async(pool.name.c_str(), c);
6130 rets.push_back(ret);
6131 }
6132
6133 vector<int>::iterator riter;
6134 vector<librados::PoolAsyncCompletion *>::iterator citer;
6135
6136 assert(rets.size() == completions.size());
6137 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6138 int r = *riter;
6139 PoolAsyncCompletion *c = *citer;
6140 if (r == 0) {
6141 c->wait();
6142 r = c->get_return_value();
6143 if (r < 0) {
6144 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6145 }
6146 }
6147 c->release();
6148 retcodes.push_back(r);
6149 }
6150 return 0;
6151 }
6152
6153 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6154 {
6155 string oid, key;
6156 get_obj_bucket_and_oid_loc(obj, oid, key);
6157
6158 rgw_pool pool;
6159 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6160 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6161 return -EIO;
6162 }
6163
6164 int r = open_pool_ctx(pool, *ioctx);
6165 if (r < 0) {
6166 return r;
6167 }
6168
6169 ioctx->locator_set_key(key);
6170
6171 return 0;
6172 }
6173
6174 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6175 {
6176 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6177
6178 rgw_pool pool;
6179 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6180 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6181 return -EIO;
6182 }
6183
6184 int r = open_pool_ctx(pool, ref->ioctx);
6185 if (r < 0) {
6186 return r;
6187 }
6188
6189 ref->ioctx.locator_set_key(ref->key);
6190
6191 return 0;
6192 }
6193
6194 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6195 {
6196 ref->oid = obj.oid;
6197 ref->key = obj.loc;
6198
6199 int r;
6200
6201 if (ref->oid.empty()) {
6202 ref->oid = obj.pool.to_str();
6203 ref->pool = get_zone_params().domain_root;
6204 } else {
6205 ref->pool = obj.pool;
6206 }
6207 r = open_pool_ctx(ref->pool, ref->ioctx);
6208 if (r < 0)
6209 return r;
6210
6211 ref->ioctx.locator_set_key(ref->key);
6212
6213 return 0;
6214 }
6215
6216 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6217 {
6218 return get_raw_obj_ref(obj, ref);
6219 }
6220
6221 /*
6222 * fixes an issue where head objects were supposed to have a locator created, but ended
6223 * up without one
6224 */
6225 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6226 {
6227 const rgw_bucket& bucket = bucket_info.bucket;
6228 string oid;
6229 string locator;
6230
6231 rgw_obj obj(bucket, key);
6232
6233 get_obj_bucket_and_oid_loc(obj, oid, locator);
6234
6235 if (locator.empty()) {
6236 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6237 return 0;
6238 }
6239
6240 librados::IoCtx ioctx;
6241
6242 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6243 if (ret < 0) {
6244 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6245 return ret;
6246 }
6247 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6248
6249 uint64_t size;
6250 bufferlist data;
6251
6252 struct timespec mtime_ts;
6253 map<string, bufferlist> attrs;
6254 librados::ObjectReadOperation op;
6255 op.getxattrs(&attrs, NULL);
6256 op.stat2(&size, &mtime_ts, NULL);
6257 #define HEAD_SIZE 512 * 1024
6258 op.read(0, HEAD_SIZE, &data, NULL);
6259
6260 ret = ioctx.operate(oid, &op, NULL);
6261 if (ret < 0) {
6262 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6263 return ret;
6264 }
6265
6266 if (size > HEAD_SIZE) {
6267 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6268 return -EIO;
6269 }
6270
6271 if (size != data.length()) {
6272 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6273 return -EIO;
6274 }
6275
6276 if (copy_obj) {
6277 librados::ObjectWriteOperation wop;
6278
6279 wop.mtime2(&mtime_ts);
6280
6281 map<string, bufferlist>::iterator iter;
6282 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6283 wop.setxattr(iter->first.c_str(), iter->second);
6284 }
6285
6286 wop.write(0, data);
6287
6288 ioctx.locator_set_key(locator);
6289 ioctx.operate(oid, &wop);
6290 }
6291
6292 if (remove_bad) {
6293 ioctx.locator_set_key(string());
6294
6295 ret = ioctx.remove(oid);
6296 if (ret < 0) {
6297 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6298 return ret;
6299 }
6300 }
6301
6302 return 0;
6303 }
6304
6305 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6306 const string& src_oid, const string& src_locator,
6307 librados::IoCtx& dst_ioctx,
6308 const string& dst_oid, const string& dst_locator)
6309 {
6310
6311 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6312 bool done = false;
6313 uint64_t chunk_size = COPY_BUF_SIZE;
6314 uint64_t ofs = 0;
6315 int ret = 0;
6316 real_time mtime;
6317 struct timespec mtime_ts;
6318 uint64_t size;
6319
6320 if (src_oid == dst_oid && src_locator == dst_locator) {
6321 return 0;
6322 }
6323
6324 src_ioctx.locator_set_key(src_locator);
6325 dst_ioctx.locator_set_key(dst_locator);
6326
6327 do {
6328 bufferlist data;
6329 ObjectReadOperation rop;
6330 ObjectWriteOperation wop;
6331
6332 if (ofs == 0) {
6333 rop.stat2(&size, &mtime_ts, NULL);
6334 mtime = real_clock::from_timespec(mtime_ts);
6335 }
6336 rop.read(ofs, chunk_size, &data, NULL);
6337 ret = src_ioctx.operate(src_oid, &rop, NULL);
6338 if (ret < 0) {
6339 goto done_err;
6340 }
6341
6342 if (data.length() == 0) {
6343 break;
6344 }
6345
6346 if (ofs == 0) {
6347 wop.create(true); /* make it exclusive */
6348 wop.mtime2(&mtime_ts);
6349 mtime = real_clock::from_timespec(mtime_ts);
6350 }
6351 wop.write(ofs, data);
6352 ret = dst_ioctx.operate(dst_oid, &wop);
6353 ofs += data.length();
6354 done = data.length() != chunk_size;
6355 } while (!done);
6356
6357 if (ofs != size) {
6358 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6359 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6360 ret = -EIO;
6361 goto done_err;
6362 }
6363
6364 src_ioctx.remove(src_oid);
6365
6366 return 0;
6367
6368 done_err:
6369 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6370 return ret;
6371 }
6372
6373 /*
6374 * fixes an issue where head objects were supposed to have a locator created, but ended
6375 * up without one
6376 */
6377 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6378 {
6379 const rgw_bucket& bucket = bucket_info.bucket;
6380 rgw_obj obj(bucket, key);
6381
6382 if (need_fix) {
6383 *need_fix = false;
6384 }
6385
6386 rgw_rados_ref ref;
6387 int r = get_obj_head_ref(bucket_info, obj, &ref);
6388 if (r < 0) {
6389 return r;
6390 }
6391
6392 RGWObjState *astate = NULL;
6393 RGWObjectCtx rctx(this);
6394 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6395 if (r < 0)
6396 return r;
6397
6398 if (astate->has_manifest) {
6399 RGWObjManifest::obj_iterator miter;
6400 RGWObjManifest& manifest = astate->manifest;
6401 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6402 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6403 rgw_obj loc;
6404 string oid;
6405 string locator;
6406
6407 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6408
6409 if (loc.key.ns.empty()) {
6410 /* continue, we're only interested in tail objects */
6411 continue;
6412 }
6413
6414 get_obj_bucket_and_oid_loc(loc, oid, locator);
6415 ref.ioctx.locator_set_key(locator);
6416
6417 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6418
6419 r = ref.ioctx.stat(oid, NULL, NULL);
6420 if (r != -ENOENT) {
6421 continue;
6422 }
6423
6424 string bad_loc;
6425 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6426
6427 /* create a new ioctx with the bad locator */
6428 librados::IoCtx src_ioctx;
6429 src_ioctx.dup(ref.ioctx);
6430 src_ioctx.locator_set_key(bad_loc);
6431
6432 r = src_ioctx.stat(oid, NULL, NULL);
6433 if (r != 0) {
6434 /* cannot find a broken part */
6435 continue;
6436 }
6437 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6438 if (need_fix) {
6439 *need_fix = true;
6440 }
6441 if (fix) {
6442 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6443 if (r < 0) {
6444 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6445 }
6446 }
6447 }
6448 }
6449
6450 return 0;
6451 }
6452
6453 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6454 {
6455 bucket = _bucket;
6456
6457 RGWObjectCtx obj_ctx(store);
6458
6459 RGWBucketInfo bucket_info;
6460 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6461 if (ret < 0) {
6462 return ret;
6463 }
6464
6465 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6466 if (ret < 0) {
6467 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6468 return ret;
6469 }
6470 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6471
6472 return 0;
6473 }
6474
6475 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6476 {
6477 bucket = _bucket;
6478 shard_id = sid;
6479
6480 RGWObjectCtx obj_ctx(store);
6481
6482 RGWBucketInfo bucket_info;
6483 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6484 if (ret < 0) {
6485 return ret;
6486 }
6487
6488 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6489 if (ret < 0) {
6490 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6491 return ret;
6492 }
6493 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6494
6495 return 0;
6496 }
6497
6498
6499 /* Execute @handler on last item in bucket listing for bucket specified
6500 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6501 * to objects matching these criterias. */
6502 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6503 const std::string& obj_prefix,
6504 const std::string& obj_delim,
6505 std::function<int(const rgw_bucket_dir_entry&)> handler)
6506 {
6507 RGWRados::Bucket target(this, bucket_info);
6508 RGWRados::Bucket::List list_op(&target);
6509
6510 list_op.params.prefix = obj_prefix;
6511 list_op.params.delim = obj_delim;
6512
6513 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6514 << ", obj_prefix=" << obj_prefix
6515 << ", obj_delim=" << obj_delim
6516 << dendl;
6517
6518 bool is_truncated = false;
6519
6520 boost::optional<rgw_bucket_dir_entry> last_entry;
6521 /* We need to rewind to the last object in a listing. */
6522 do {
6523 /* List bucket entries in chunks. */
6524 static constexpr int MAX_LIST_OBJS = 100;
6525 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6526
6527 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6528 &is_truncated);
6529 if (ret < 0) {
6530 return ret;
6531 } else if (!entries.empty()) {
6532 last_entry = entries.back();
6533 }
6534 } while (is_truncated);
6535
6536 if (last_entry) {
6537 return handler(*last_entry);
6538 }
6539
6540 /* Empty listing - no items we can run handler on. */
6541 return 0;
6542 }
6543
6544
6545 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6546 const rgw_user& user,
6547 RGWBucketInfo& bucket_info,
6548 rgw_obj& obj)
6549 {
6550 if (! swift_versioning_enabled(bucket_info)) {
6551 return 0;
6552 }
6553
6554 obj_ctx.obj.set_atomic(obj);
6555
6556 RGWObjState * state = nullptr;
6557 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6558 if (r < 0) {
6559 return r;
6560 }
6561
6562 if (!state->exists) {
6563 return 0;
6564 }
6565
6566 string client_id;
6567 string op_id;
6568
6569 const string& src_name = obj.get_oid();
6570 char buf[src_name.size() + 32];
6571 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6572 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6573 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6574
6575 RGWBucketInfo dest_bucket_info;
6576
6577 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6578 if (r < 0) {
6579 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6580 if (r == -ENOENT) {
6581 return -ERR_PRECONDITION_FAILED;
6582 }
6583 return r;
6584 }
6585
6586 if (dest_bucket_info.owner != bucket_info.owner) {
6587 return -ERR_PRECONDITION_FAILED;
6588 }
6589
6590 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6591 obj_ctx.obj.set_atomic(dest_obj);
6592
6593 string no_zone;
6594
6595 r = copy_obj(obj_ctx,
6596 user,
6597 client_id,
6598 op_id,
6599 NULL, /* req_info *info */
6600 no_zone,
6601 dest_obj,
6602 obj,
6603 dest_bucket_info,
6604 bucket_info,
6605 NULL, /* time_t *src_mtime */
6606 NULL, /* time_t *mtime */
6607 NULL, /* const time_t *mod_ptr */
6608 NULL, /* const time_t *unmod_ptr */
6609 false, /* bool high_precision_time */
6610 NULL, /* const char *if_match */
6611 NULL, /* const char *if_nomatch */
6612 RGWRados::ATTRSMOD_NONE,
6613 true, /* bool copy_if_newer */
6614 state->attrset,
6615 RGW_OBJ_CATEGORY_MAIN,
6616 0, /* uint64_t olh_epoch */
6617 real_time(), /* time_t delete_at */
6618 NULL, /* string *version_id */
6619 NULL, /* string *ptag */
6620 NULL, /* string *petag */
6621 NULL, /* void (*progress_cb)(off_t, void *) */
6622 NULL); /* void *progress_data */
6623 if (r == -ECANCELED || r == -ENOENT) {
6624 /* Has already been overwritten, meaning another rgw process already
6625 * copied it out */
6626 return 0;
6627 }
6628
6629 return r;
6630 }
6631
6632 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6633 const rgw_user& user,
6634 RGWBucketInfo& bucket_info,
6635 rgw_obj& obj,
6636 bool& restored) /* out */
6637 {
6638 if (! swift_versioning_enabled(bucket_info)) {
6639 return 0;
6640 }
6641
6642 /* Bucket info of the bucket that stores previous versions of our object. */
6643 RGWBucketInfo archive_binfo;
6644
6645 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6646 bucket_info.swift_ver_location, archive_binfo,
6647 nullptr, nullptr);
6648 if (ret < 0) {
6649 return ret;
6650 }
6651
6652 /* Abort the operation if the bucket storing our archive belongs to someone
6653 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6654 * into consideration. For we can live with that.
6655 *
6656 * TODO: delegate this check to un upper layer and compare with ACLs. */
6657 if (bucket_info.owner != archive_binfo.owner) {
6658 return -EPERM;
6659 }
6660
6661 /* This code will be executed on latest version of the object. */
6662 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6663 std::string no_client_id;
6664 std::string no_op_id;
6665 std::string no_zone;
6666
6667 /* We don't support object versioning of Swift API on those buckets that
6668 * are already versioned using the S3 mechanism. This affects also bucket
6669 * storing archived objects. Otherwise the delete operation would create
6670 * a deletion marker. */
6671 if (archive_binfo.versioned()) {
6672 restored = false;
6673 return -ERR_PRECONDITION_FAILED;
6674 }
6675
6676 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6677 * irrelevant and may be safely skipped. */
6678 std::map<std::string, ceph::bufferlist> no_attrs;
6679
6680 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6681 obj_ctx.obj.set_atomic(archive_obj);
6682 obj_ctx.obj.set_atomic(obj);
6683
6684 int ret = copy_obj(obj_ctx,
6685 user,
6686 no_client_id,
6687 no_op_id,
6688 nullptr, /* req_info *info */
6689 no_zone,
6690 obj, /* dest obj */
6691 archive_obj, /* src obj */
6692 bucket_info, /* dest bucket info */
6693 archive_binfo, /* src bucket info */
6694 nullptr, /* time_t *src_mtime */
6695 nullptr, /* time_t *mtime */
6696 nullptr, /* const time_t *mod_ptr */
6697 nullptr, /* const time_t *unmod_ptr */
6698 false, /* bool high_precision_time */
6699 nullptr, /* const char *if_match */
6700 nullptr, /* const char *if_nomatch */
6701 RGWRados::ATTRSMOD_NONE,
6702 true, /* bool copy_if_newer */
6703 no_attrs,
6704 RGW_OBJ_CATEGORY_MAIN,
6705 0, /* uint64_t olh_epoch */
6706 real_time(), /* time_t delete_at */
6707 nullptr, /* string *version_id */
6708 nullptr, /* string *ptag */
6709 nullptr, /* string *petag */
6710 nullptr, /* void (*progress_cb)(off_t, void *) */
6711 nullptr); /* void *progress_data */
6712 if (ret == -ECANCELED || ret == -ENOENT) {
6713 /* Has already been overwritten, meaning another rgw process already
6714 * copied it out */
6715 return 0;
6716 } else if (ret < 0) {
6717 return ret;
6718 } else {
6719 restored = true;
6720 }
6721
6722 /* Need to remove the archived copy. */
6723 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6724 archive_binfo.versioning_status());
6725
6726 return ret;
6727 };
6728
6729 const std::string& obj_name = obj.get_oid();
6730 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6731 % obj_name);
6732
6733 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6734 handler);
6735 }
6736
6737 /**
6738 * Write/overwrite an object to the bucket storage.
6739 * bucket: the bucket to store the object in
6740 * obj: the object name/key
6741 * data: the object contents/value
6742 * size: the amount of data to write (data must be this long)
6743 * accounted_size: original size of data before compression, encryption
6744 * mtime: if non-NULL, writes the given mtime to the bucket storage
6745 * attrs: all the given attrs are written to bucket storage for the given object
6746 * exclusive: create object exclusively
6747 * Returns: 0 on success, -ERR# otherwise.
6748 */
6749 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6750 map<string, bufferlist>& attrs, bool assume_noent,
6751 void *_index_op)
6752 {
6753 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6754 RGWRados *store = target->get_store();
6755
6756 ObjectWriteOperation op;
6757
6758 RGWObjState *state;
6759 int r = target->get_state(&state, false, assume_noent);
6760 if (r < 0)
6761 return r;
6762
6763 rgw_obj& obj = target->get_obj();
6764
6765 if (obj.get_oid().empty()) {
6766 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6767 return -EIO;
6768 }
6769
6770 rgw_rados_ref ref;
6771 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6772 if (r < 0)
6773 return r;
6774
6775 bool is_olh = state->is_olh;
6776
6777 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6778
6779 const string *ptag = meta.ptag;
6780 if (!ptag && !index_op->get_optag()->empty()) {
6781 ptag = index_op->get_optag();
6782 }
6783 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false);
6784 if (r < 0)
6785 return r;
6786
6787 if (real_clock::is_zero(meta.set_mtime)) {
6788 meta.set_mtime = real_clock::now();
6789 }
6790
6791 if (state->is_olh) {
6792 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6793 }
6794
6795 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6796 op.mtime2(&mtime_ts);
6797
6798 if (meta.data) {
6799 /* if we want to overwrite the data, we also want to overwrite the
6800 xattrs, so just remove the object */
6801 op.write_full(*meta.data);
6802 }
6803
6804 string etag;
6805 string content_type;
6806 bufferlist acl_bl;
6807
6808 map<string, bufferlist>::iterator iter;
6809 if (meta.rmattrs) {
6810 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6811 const string& name = iter->first;
6812 op.rmxattr(name.c_str());
6813 }
6814 }
6815
6816 if (meta.manifest) {
6817 /* remove existing manifest attr */
6818 iter = attrs.find(RGW_ATTR_MANIFEST);
6819 if (iter != attrs.end())
6820 attrs.erase(iter);
6821
6822 bufferlist bl;
6823 ::encode(*meta.manifest, bl);
6824 op.setxattr(RGW_ATTR_MANIFEST, bl);
6825 }
6826
6827 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6828 const string& name = iter->first;
6829 bufferlist& bl = iter->second;
6830
6831 if (!bl.length())
6832 continue;
6833
6834 op.setxattr(name.c_str(), bl);
6835
6836 if (name.compare(RGW_ATTR_ETAG) == 0) {
6837 etag = bl.c_str();
6838 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6839 content_type = bl.c_str();
6840 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6841 acl_bl = bl;
6842 }
6843 }
6844 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6845 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6846 }
6847
6848 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6849 bufferlist bl;
6850 ::encode(store->get_zone_short_id(), bl);
6851 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6852 }
6853
6854 if (!op.size())
6855 return 0;
6856
6857 uint64_t epoch;
6858 int64_t poolid;
6859 bool orig_exists;
6860 uint64_t orig_size;
6861
6862 if (!reset_obj) { //Multipart upload, it has immutable head.
6863 orig_exists = false;
6864 orig_size = 0;
6865 } else {
6866 orig_exists = state->exists;
6867 orig_size = state->accounted_size;
6868 }
6869
6870 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
6871
6872 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
6873
6874 if (versioned_op) {
6875 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
6876 }
6877
6878 if (!index_op->is_prepared()) {
6879 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
6880 if (r < 0)
6881 return r;
6882 }
6883
6884 r = ref.ioctx.operate(ref.oid, &op);
6885 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
6886 or -ENOENT if was removed, or -EEXIST if it did not exist
6887 before and now it does */
6888 if (r == -EEXIST && assume_noent) {
6889 target->invalidate_state();
6890 return r;
6891 }
6892 goto done_cancel;
6893 }
6894
6895 epoch = ref.ioctx.get_last_version();
6896 poolid = ref.ioctx.get_id();
6897
6898 r = target->complete_atomic_modification();
6899 if (r < 0) {
6900 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
6901 }
6902
6903 r = index_op->complete(poolid, epoch, size, accounted_size,
6904 meta.set_mtime, etag, content_type, &acl_bl,
6905 meta.category, meta.remove_objs, meta.user_data);
6906 if (r < 0)
6907 goto done_cancel;
6908
6909 if (meta.mtime) {
6910 *meta.mtime = meta.set_mtime;
6911 }
6912
6913 /* note that index_op was using state so we couldn't invalidate it earlier */
6914 target->invalidate_state();
6915 state = NULL;
6916
6917 if (versioned_op) {
6918 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
6919 if (r < 0) {
6920 return r;
6921 }
6922 }
6923
6924 if (!real_clock::is_zero(meta.delete_at)) {
6925 rgw_obj_index_key obj_key;
6926 obj.key.get_index_key(&obj_key);
6927
6928 r = store->objexp_hint_add(meta.delete_at,
6929 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
6930 if (r < 0) {
6931 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
6932 /* ignoring error, nothing we can do at this point */
6933 }
6934 }
6935 meta.canceled = false;
6936
6937 /* update quota cache */
6938 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
6939 accounted_size, orig_size);
6940 return 0;
6941
6942 done_cancel:
6943 int ret = index_op->cancel();
6944 if (ret < 0) {
6945 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
6946 }
6947
6948 meta.canceled = true;
6949
6950 /* we lost in a race. There are a few options:
6951 * - existing object was rewritten (ECANCELED)
6952 * - non existing object was created (EEXIST)
6953 * - object was removed (ENOENT)
6954 * should treat it as a success
6955 */
6956 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
6957 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
6958 r = 0;
6959 }
6960 } else {
6961 if (meta.if_match != NULL) {
6962 // only overwrite existing object
6963 if (strcmp(meta.if_match, "*") == 0) {
6964 if (r == -ENOENT) {
6965 r = -ERR_PRECONDITION_FAILED;
6966 } else if (r == -ECANCELED) {
6967 r = 0;
6968 }
6969 }
6970 }
6971
6972 if (meta.if_nomatch != NULL) {
6973 // only create a new object
6974 if (strcmp(meta.if_nomatch, "*") == 0) {
6975 if (r == -EEXIST) {
6976 r = -ERR_PRECONDITION_FAILED;
6977 } else if (r == -ENOENT) {
6978 r = 0;
6979 }
6980 }
6981 }
6982 }
6983
6984 return r;
6985 }
6986
6987 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
6988 map<string, bufferlist>& attrs)
6989 {
6990 RGWBucketInfo& bucket_info = target->get_bucket_info();
6991
6992 RGWRados::Bucket bop(target->get_store(), bucket_info);
6993 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
6994 index_op.set_zones_trace(meta.zones_trace);
6995
6996 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
6997 int r;
6998 if (assume_noent) {
6999 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
7000 if (r == -EEXIST) {
7001 assume_noent = false;
7002 }
7003 }
7004 if (!assume_noent) {
7005 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
7006 }
7007 return r;
7008 }
7009
7010 /** Write/overwrite a system object. */
7011 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7012 map<std::string, bufferlist>& attrs, int flags,
7013 bufferlist& data,
7014 RGWObjVersionTracker *objv_tracker,
7015 real_time set_mtime /* 0 for don't set */)
7016 {
7017 rgw_rados_ref ref;
7018 int r = get_system_obj_ref(obj, &ref);
7019 if (r < 0)
7020 return r;
7021
7022 ObjectWriteOperation op;
7023
7024 if (flags & PUT_OBJ_EXCL) {
7025 if (!(flags & PUT_OBJ_CREATE))
7026 return -EINVAL;
7027 op.create(true); // exclusive create
7028 } else {
7029 op.remove();
7030 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7031 op.create(false);
7032 }
7033
7034 if (objv_tracker) {
7035 objv_tracker->prepare_op_for_write(&op);
7036 }
7037
7038 if (real_clock::is_zero(set_mtime)) {
7039 set_mtime = real_clock::now();
7040 }
7041
7042 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7043 op.mtime2(&mtime_ts);
7044 op.write_full(data);
7045
7046 bufferlist acl_bl;
7047
7048 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7049 const string& name = iter->first;
7050 bufferlist& bl = iter->second;
7051
7052 if (!bl.length())
7053 continue;
7054
7055 op.setxattr(name.c_str(), bl);
7056 }
7057
7058 r = ref.ioctx.operate(ref.oid, &op);
7059 if (r < 0) {
7060 return r;
7061 }
7062
7063 if (objv_tracker) {
7064 objv_tracker->apply_write();
7065 }
7066
7067 if (mtime) {
7068 *mtime = set_mtime;
7069 }
7070
7071 return 0;
7072 }
7073
7074 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7075 off_t ofs, bool exclusive,
7076 RGWObjVersionTracker *objv_tracker)
7077 {
7078 rgw_rados_ref ref;
7079 int r = get_system_obj_ref(obj, &ref);
7080 if (r < 0) {
7081 return r;
7082 }
7083
7084 ObjectWriteOperation op;
7085
7086 if (exclusive)
7087 op.create(true);
7088
7089 if (objv_tracker) {
7090 objv_tracker->prepare_op_for_write(&op);
7091 }
7092 if (ofs == -1) {
7093 op.write_full(bl);
7094 } else {
7095 op.write(ofs, bl);
7096 }
7097 r = ref.ioctx.operate(ref.oid, &op);
7098 if (r < 0)
7099 return r;
7100
7101 if (objv_tracker) {
7102 objv_tracker->apply_write();
7103 }
7104 return 0;
7105 }
7106
7107 /**
7108 * Write/overwrite an object to the bucket storage.
7109 * bucket: the bucket to store the object in
7110 * obj: the object name/key
7111 * data: the object contents/value
7112 * offset: the offet to write to in the object
7113 * If this is -1, we will overwrite the whole object.
7114 * size: the amount of data to write (data must be this long)
7115 * attrs: all the given attrs are written to bucket storage for the given object
7116 * Returns: 0 on success, -ERR# otherwise.
7117 */
7118
7119 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7120 off_t ofs, bool exclusive,
7121 void **handle)
7122 {
7123 rgw_rados_ref ref;
7124 int r = get_raw_obj_ref(obj, &ref);
7125 if (r < 0) {
7126 return r;
7127 }
7128
7129 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7130 *handle = c;
7131
7132 ObjectWriteOperation op;
7133
7134 if (exclusive)
7135 op.create(true);
7136
7137 if (ofs == -1) {
7138 op.write_full(bl);
7139 } else {
7140 op.write(ofs, bl);
7141 }
7142 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7143 if (r < 0)
7144 return r;
7145
7146 return 0;
7147 }
7148
7149 int RGWRados::aio_wait(void *handle)
7150 {
7151 AioCompletion *c = (AioCompletion *)handle;
7152 c->wait_for_safe();
7153 int ret = c->get_return_value();
7154 c->release();
7155 return ret;
7156 }
7157
7158 bool RGWRados::aio_completed(void *handle)
7159 {
7160 AioCompletion *c = (AioCompletion *)handle;
7161 return c->is_safe();
7162 }
7163
7164 class RGWRadosPutObj : public RGWGetDataCB
7165 {
7166 CephContext* cct;
7167 rgw_obj obj;
7168 RGWPutObjDataProcessor *filter;
7169 boost::optional<RGWPutObj_Compress>& compressor;
7170 CompressorRef& plugin;
7171 RGWPutObjProcessor_Atomic *processor;
7172 RGWOpStateSingleOp *opstate;
7173 void (*progress_cb)(off_t, void *);
7174 void *progress_data;
7175 bufferlist extra_data_bl;
7176 uint64_t extra_data_len;
7177 uint64_t data_len;
7178 map<string, bufferlist> src_attrs;
7179 public:
7180 RGWRadosPutObj(CephContext* cct,
7181 CompressorRef& plugin,
7182 boost::optional<RGWPutObj_Compress>& compressor,
7183 RGWPutObjProcessor_Atomic *p,
7184 RGWOpStateSingleOp *_ops,
7185 void (*_progress_cb)(off_t, void *),
7186 void *_progress_data) :
7187 cct(cct),
7188 filter(p),
7189 compressor(compressor),
7190 plugin(plugin),
7191 processor(p),
7192 opstate(_ops),
7193 progress_cb(_progress_cb),
7194 progress_data(_progress_data),
7195 extra_data_len(0),
7196 data_len(0) {}
7197
7198 int process_attrs(void) {
7199 if (extra_data_bl.length()) {
7200 JSONParser jp;
7201 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7202 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7203 return -EIO;
7204 }
7205
7206 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7207
7208 src_attrs.erase(RGW_ATTR_COMPRESSION);
7209 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7210 }
7211
7212 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7213 //do not compress if object is encrypted
7214 compressor = boost::in_place(cct, plugin, filter);
7215 filter = &*compressor;
7216 }
7217 return 0;
7218 }
7219
7220 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7221 if (progress_cb) {
7222 progress_cb(ofs, progress_data);
7223 }
7224 if (extra_data_len) {
7225 size_t extra_len = bl.length();
7226 if (extra_len > extra_data_len)
7227 extra_len = extra_data_len;
7228
7229 bufferlist extra;
7230 bl.splice(0, extra_len, &extra);
7231 extra_data_bl.append(extra);
7232
7233 extra_data_len -= extra_len;
7234 if (extra_data_len == 0) {
7235 int res = process_attrs();
7236 if (res < 0)
7237 return res;
7238 }
7239 if (bl.length() == 0) {
7240 return 0;
7241 }
7242 }
7243 data_len += bl.length();
7244 bool again = false;
7245
7246 bool need_opstate = true;
7247
7248 do {
7249 void *handle = NULL;
7250 rgw_raw_obj obj;
7251 uint64_t size = bl.length();
7252 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7253 if (ret < 0)
7254 return ret;
7255
7256 if (need_opstate && opstate) {
7257 /* need to update opstate repository with new state. This is ratelimited, so we're not
7258 * really doing it every time
7259 */
7260 ret = opstate->renew_state();
7261 if (ret < 0) {
7262 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7263 int r = filter->throttle_data(handle, obj, size, false);
7264 if (r < 0) {
7265 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7266 }
7267 /* could not renew state! might have been marked as cancelled */
7268 return ret;
7269 }
7270 need_opstate = false;
7271 }
7272
7273 ret = filter->throttle_data(handle, obj, size, false);
7274 if (ret < 0)
7275 return ret;
7276 } while (again);
7277
7278 return 0;
7279 }
7280
7281 bufferlist& get_extra_data() { return extra_data_bl; }
7282
7283 map<string, bufferlist>& get_attrs() { return src_attrs; }
7284
7285 void set_extra_data_len(uint64_t len) override {
7286 extra_data_len = len;
7287 }
7288
7289 uint64_t get_data_len() {
7290 return data_len;
7291 }
7292
7293 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7294 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7295 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7296 }
7297
7298 bool is_canceled() {
7299 return processor->is_canceled();
7300 }
7301 };
7302
7303 /*
7304 * prepare attrset depending on attrs_mod.
7305 */
7306 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7307 map<string, bufferlist>& attrs,
7308 RGWRados::AttrsMod attrs_mod)
7309 {
7310 switch (attrs_mod) {
7311 case RGWRados::ATTRSMOD_NONE:
7312 attrs = src_attrs;
7313 break;
7314 case RGWRados::ATTRSMOD_REPLACE:
7315 if (!attrs[RGW_ATTR_ETAG].length()) {
7316 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7317 }
7318 break;
7319 case RGWRados::ATTRSMOD_MERGE:
7320 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7321 if (attrs.find(it->first) == attrs.end()) {
7322 attrs[it->first] = it->second;
7323 }
7324 }
7325 break;
7326 }
7327 }
7328
7329 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7330 {
7331 map<string, bufferlist> attrset;
7332
7333 real_time mtime;
7334 uint64_t obj_size;
7335 RGWObjectCtx rctx(this);
7336
7337 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7338 RGWRados::Object::Read read_op(&op_target);
7339
7340 read_op.params.attrs = &attrset;
7341 read_op.params.lastmod = &mtime;
7342 read_op.params.obj_size = &obj_size;
7343
7344 int ret = read_op.prepare();
7345 if (ret < 0)
7346 return ret;
7347
7348 attrset.erase(RGW_ATTR_ID_TAG);
7349
7350 uint64_t max_chunk_size;
7351
7352 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7353 if (ret < 0) {
7354 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7355 return ret;
7356 }
7357
7358 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj, max_chunk_size, NULL, mtime, attrset,
7359 RGW_OBJ_CATEGORY_MAIN, 0, real_time(), NULL, NULL, NULL);
7360 }
7361
7362 struct obj_time_weight {
7363 real_time mtime;
7364 uint32_t zone_short_id;
7365 uint64_t pg_ver;
7366 bool high_precision;
7367
7368 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7369
7370 bool compare_low_precision(const obj_time_weight& rhs) {
7371 struct timespec l = ceph::real_clock::to_timespec(mtime);
7372 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7373 l.tv_nsec = 0;
7374 r.tv_nsec = 0;
7375 if (l > r) {
7376 return false;
7377 }
7378 if (l < r) {
7379 return true;
7380 }
7381 if (zone_short_id != rhs.zone_short_id) {
7382 return (zone_short_id < rhs.zone_short_id);
7383 }
7384 return (pg_ver < rhs.pg_ver);
7385
7386 }
7387
7388 bool operator<(const obj_time_weight& rhs) {
7389 if (!high_precision || !rhs.high_precision) {
7390 return compare_low_precision(rhs);
7391 }
7392 if (mtime > rhs.mtime) {
7393 return false;
7394 }
7395 if (mtime < rhs.mtime) {
7396 return true;
7397 }
7398 if (zone_short_id != rhs.zone_short_id) {
7399 return (zone_short_id < rhs.zone_short_id);
7400 }
7401 return (pg_ver < rhs.pg_ver);
7402 }
7403
7404 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7405 mtime = _mtime;
7406 zone_short_id = _short_id;
7407 pg_ver = _pg_ver;
7408 }
7409
7410 void init(RGWObjState *state) {
7411 mtime = state->mtime;
7412 zone_short_id = state->zone_short_id;
7413 pg_ver = state->pg_ver;
7414 }
7415 };
7416
7417 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7418 out << o.mtime;
7419
7420 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7421 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7422 }
7423
7424 return out;
7425 }
7426
7427 class RGWGetExtraDataCB : public RGWGetDataCB {
7428 bufferlist extra_data;
7429 public:
7430 RGWGetExtraDataCB() {}
7431 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7432 if (extra_data.length() < extra_data_len) {
7433 off_t max = extra_data_len - extra_data.length();
7434 if (max > bl_len) {
7435 max = bl_len;
7436 }
7437 bl.splice(0, max, &extra_data);
7438 }
7439 return bl_len;
7440 }
7441
7442 bufferlist& get_extra_data() {
7443 return extra_data;
7444 }
7445 };
7446
7447 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7448 const rgw_user& user_id,
7449 const string& client_id,
7450 req_info *info,
7451 const string& source_zone,
7452 rgw_obj& src_obj,
7453 RGWBucketInfo& src_bucket_info,
7454 real_time *src_mtime,
7455 uint64_t *psize,
7456 const real_time *mod_ptr,
7457 const real_time *unmod_ptr,
7458 bool high_precision_time,
7459 const char *if_match,
7460 const char *if_nomatch,
7461 map<string, bufferlist> *pattrs,
7462 string *version_id,
7463 string *ptag,
7464 string *petag)
7465 {
7466 /* source is in a different zonegroup, copy from there */
7467
7468 RGWRESTStreamRWRequest *in_stream_req;
7469 string tag;
7470 map<string, bufferlist> src_attrs;
7471 append_rand_alpha(cct, tag, tag, 32);
7472 obj_time_weight set_mtime_weight;
7473 set_mtime_weight.high_precision = high_precision_time;
7474
7475 RGWRESTConn *conn;
7476 if (source_zone.empty()) {
7477 if (src_bucket_info.zonegroup.empty()) {
7478 /* source is in the master zonegroup */
7479 conn = rest_master_conn;
7480 } else {
7481 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7482 if (iter == zonegroup_conn_map.end()) {
7483 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7484 return -ENOENT;
7485 }
7486 conn = iter->second;
7487 }
7488 } else {
7489 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7490 if (iter == zone_conn_map.end()) {
7491 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7492 return -ENOENT;
7493 }
7494 conn = iter->second;
7495 }
7496
7497 RGWGetExtraDataCB cb;
7498 string etag;
7499 map<string, string> req_headers;
7500 real_time set_mtime;
7501
7502 const real_time *pmod = mod_ptr;
7503
7504 obj_time_weight dest_mtime_weight;
7505
7506 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7507 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7508 true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
7509 true /* sync manifest */, &cb, &in_stream_req);
7510 if (ret < 0) {
7511 return ret;
7512 }
7513
7514 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7515 if (ret < 0) {
7516 return ret;
7517 }
7518
7519 bufferlist& extra_data_bl = cb.get_extra_data();
7520 if (extra_data_bl.length()) {
7521 JSONParser jp;
7522 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7523 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7524 return -EIO;
7525 }
7526
7527 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7528
7529 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7530 }
7531
7532 if (src_mtime) {
7533 *src_mtime = set_mtime;
7534 }
7535
7536 if (petag) {
7537 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7538 if (iter != src_attrs.end()) {
7539 bufferlist& etagbl = iter->second;
7540 *petag = etagbl.to_str();
7541 }
7542 }
7543
7544 if (pattrs) {
7545 *pattrs = src_attrs;
7546 }
7547
7548 return 0;
7549 }
7550
7551 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7552 const rgw_user& user_id,
7553 const string& client_id,
7554 const string& op_id,
7555 bool record_op_state,
7556 req_info *info,
7557 const string& source_zone,
7558 rgw_obj& dest_obj,
7559 rgw_obj& src_obj,
7560 RGWBucketInfo& dest_bucket_info,
7561 RGWBucketInfo& src_bucket_info,
7562 real_time *src_mtime,
7563 real_time *mtime,
7564 const real_time *mod_ptr,
7565 const real_time *unmod_ptr,
7566 bool high_precision_time,
7567 const char *if_match,
7568 const char *if_nomatch,
7569 AttrsMod attrs_mod,
7570 bool copy_if_newer,
7571 map<string, bufferlist>& attrs,
7572 RGWObjCategory category,
7573 uint64_t olh_epoch,
7574 real_time delete_at,
7575 string *version_id,
7576 string *ptag,
7577 ceph::buffer::list *petag,
7578 void (*progress_cb)(off_t, void *),
7579 void *progress_data,
7580 rgw_zone_set *zones_trace)
7581 {
7582 /* source is in a different zonegroup, copy from there */
7583
7584 RGWRESTStreamRWRequest *in_stream_req;
7585 string tag;
7586 int i;
7587 append_rand_alpha(cct, tag, tag, 32);
7588 obj_time_weight set_mtime_weight;
7589 set_mtime_weight.high_precision = high_precision_time;
7590
7591 RGWPutObjProcessor_Atomic processor(obj_ctx,
7592 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7593 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7594 if (version_id && *version_id != "null") {
7595 processor.set_version_id(*version_id);
7596 }
7597 processor.set_olh_epoch(olh_epoch);
7598 int ret = processor.prepare(this, NULL);
7599 if (ret < 0) {
7600 return ret;
7601 }
7602
7603 RGWRESTConn *conn;
7604 if (source_zone.empty()) {
7605 if (dest_bucket_info.zonegroup.empty()) {
7606 /* source is in the master zonegroup */
7607 conn = rest_master_conn;
7608 } else {
7609 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7610 if (iter == zonegroup_conn_map.end()) {
7611 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7612 return -ENOENT;
7613 }
7614 conn = iter->second;
7615 }
7616 } else {
7617 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7618 if (iter == zone_conn_map.end()) {
7619 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7620 return -ENOENT;
7621 }
7622 conn = iter->second;
7623 }
7624
7625 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7626
7627 RGWOpStateSingleOp *opstate = NULL;
7628
7629 if (record_op_state) {
7630 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7631
7632 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7633 if (ret < 0) {
7634 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7635 delete opstate;
7636 return ret;
7637 }
7638 }
7639
7640 boost::optional<RGWPutObj_Compress> compressor;
7641 CompressorRef plugin;
7642
7643 const auto& compression_type = zone_params.get_compression_type(
7644 dest_bucket_info.placement_rule);
7645 if (compression_type != "none") {
7646 plugin = Compressor::create(cct, compression_type);
7647 if (!plugin) {
7648 ldout(cct, 1) << "Cannot load plugin for compression type "
7649 << compression_type << dendl;
7650 }
7651 }
7652
7653 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7654
7655 string etag;
7656 map<string, string> req_headers;
7657 real_time set_mtime;
7658
7659 RGWObjState *dest_state = NULL;
7660
7661 const real_time *pmod = mod_ptr;
7662
7663 obj_time_weight dest_mtime_weight;
7664
7665 if (copy_if_newer) {
7666 /* need to get mtime for destination */
7667 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7668 if (ret < 0)
7669 goto set_err_state;
7670
7671 if (!real_clock::is_zero(dest_state->mtime)) {
7672 dest_mtime_weight.init(dest_state);
7673 pmod = &dest_mtime_weight.mtime;
7674 }
7675 }
7676
7677 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7678 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7679 true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
7680 true /* sync manifest */, &cb, &in_stream_req);
7681 if (ret < 0) {
7682 goto set_err_state;
7683 }
7684
7685 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7686 if (ret < 0) {
7687 goto set_err_state;
7688 }
7689 if (compressor && compressor->is_compressed()) {
7690 bufferlist tmp;
7691 RGWCompressionInfo cs_info;
7692 cs_info.compression_type = plugin->get_type_name();
7693 cs_info.orig_size = cb.get_data_len();
7694 cs_info.blocks = move(compressor->get_compression_blocks());
7695 ::encode(cs_info, tmp);
7696 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7697 }
7698
7699 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7700 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7701 } else {
7702 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7703 if (iter != cb.get_attrs().end()) {
7704 try {
7705 ::decode(delete_at, iter->second);
7706 } catch (buffer::error& err) {
7707 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7708 }
7709 }
7710 }
7711
7712 if (src_mtime) {
7713 *src_mtime = set_mtime;
7714 }
7715
7716 if (petag) {
7717 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7718 if (iter != cb.get_attrs().end()) {
7719 *petag = iter->second;
7720 }
7721 }
7722
7723 if (source_zone.empty()) {
7724 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7725 } else {
7726 attrs = cb.get_attrs();
7727 }
7728
7729 if (copy_if_newer) {
7730 uint64_t pg_ver = 0;
7731 auto i = attrs.find(RGW_ATTR_PG_VER);
7732 if (i != attrs.end() && i->second.length() > 0) {
7733 bufferlist::iterator iter = i->second.begin();
7734 try {
7735 ::decode(pg_ver, iter);
7736 } catch (buffer::error& err) {
7737 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7738 /* non critical error */
7739 }
7740 }
7741 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7742 }
7743
7744 #define MAX_COMPLETE_RETRY 100
7745 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7746 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7747 if (ret < 0) {
7748 goto set_err_state;
7749 }
7750 if (copy_if_newer && cb.is_canceled()) {
7751 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7752 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7753 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7754 if (ret < 0) {
7755 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7756 goto set_err_state;
7757 }
7758 dest_mtime_weight.init(dest_state);
7759 dest_mtime_weight.high_precision = high_precision_time;
7760 if (!dest_state->exists ||
7761 dest_mtime_weight < set_mtime_weight) {
7762 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7763 continue;
7764 } else {
7765 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7766 }
7767 }
7768 break;
7769 }
7770
7771 if (i == MAX_COMPLETE_RETRY) {
7772 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7773 ret = -EIO;
7774 goto set_err_state;
7775 }
7776
7777 if (opstate) {
7778 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7779 if (ret < 0) {
7780 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7781 }
7782 delete opstate;
7783 }
7784
7785 return 0;
7786 set_err_state:
7787 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7788 ret = 0;
7789 }
7790 if (opstate) {
7791 RGWOpState::OpState state;
7792 if (ret < 0) {
7793 state = RGWOpState::OPSTATE_ERROR;
7794 } else {
7795 state = RGWOpState::OPSTATE_COMPLETE;
7796 }
7797 int r = opstate->set_state(state);
7798 if (r < 0) {
7799 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7800 }
7801 delete opstate;
7802 }
7803 return ret;
7804 }
7805
7806
7807 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7808 map<string, bufferlist>& src_attrs,
7809 RGWRados::Object::Read& read_op,
7810 const rgw_user& user_id,
7811 rgw_obj& dest_obj,
7812 real_time *mtime)
7813 {
7814 string etag;
7815
7816 RGWRESTStreamWriteRequest *out_stream_req;
7817
7818 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
7819 if (ret < 0) {
7820 delete out_stream_req;
7821 return ret;
7822 }
7823
7824 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
7825 if (ret < 0) {
7826 delete out_stream_req;
7827 return ret;
7828 }
7829
7830 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
7831 if (ret < 0)
7832 return ret;
7833
7834 return 0;
7835 }
7836
7837 /**
7838 * Copy an object.
7839 * dest_obj: the object to copy into
7840 * src_obj: the object to copy from
7841 * attrs: usage depends on attrs_mod parameter
7842 * attrs_mod: the modification mode of the attrs, may have the following values:
7843 * ATTRSMOD_NONE - the attributes of the source object will be
7844 * copied without modifications, attrs parameter is ignored;
7845 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
7846 * parameter, source object attributes are not copied;
7847 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
7848 * are overwritten by values contained in attrs parameter.
7849 * err: stores any errors resulting from the get of the original object
7850 * Returns: 0 on success, -ERR# otherwise.
7851 */
7852 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
7853 const rgw_user& user_id,
7854 const string& client_id,
7855 const string& op_id,
7856 req_info *info,
7857 const string& source_zone,
7858 rgw_obj& dest_obj,
7859 rgw_obj& src_obj,
7860 RGWBucketInfo& dest_bucket_info,
7861 RGWBucketInfo& src_bucket_info,
7862 real_time *src_mtime,
7863 real_time *mtime,
7864 const real_time *mod_ptr,
7865 const real_time *unmod_ptr,
7866 bool high_precision_time,
7867 const char *if_match,
7868 const char *if_nomatch,
7869 AttrsMod attrs_mod,
7870 bool copy_if_newer,
7871 map<string, bufferlist>& attrs,
7872 RGWObjCategory category,
7873 uint64_t olh_epoch,
7874 real_time delete_at,
7875 string *version_id,
7876 string *ptag,
7877 ceph::buffer::list *petag,
7878 void (*progress_cb)(off_t, void *),
7879 void *progress_data)
7880 {
7881 int ret;
7882 uint64_t obj_size;
7883 rgw_obj shadow_obj = dest_obj;
7884 string shadow_oid;
7885
7886 bool remote_src;
7887 bool remote_dest;
7888
7889 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
7890 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
7891
7892 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
7893 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
7894
7895 if (remote_src && remote_dest) {
7896 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7897 return -EINVAL;
7898 }
7899
7900 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
7901
7902 if (remote_src || !source_zone.empty()) {
7903 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
7904 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
7905 unmod_ptr, high_precision_time,
7906 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
7907 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
7908 }
7909
7910 map<string, bufferlist> src_attrs;
7911 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
7912 RGWRados::Object::Read read_op(&src_op_target);
7913
7914 read_op.conds.mod_ptr = mod_ptr;
7915 read_op.conds.unmod_ptr = unmod_ptr;
7916 read_op.conds.high_precision_time = high_precision_time;
7917 read_op.conds.if_match = if_match;
7918 read_op.conds.if_nomatch = if_nomatch;
7919 read_op.params.attrs = &src_attrs;
7920 read_op.params.lastmod = src_mtime;
7921 read_op.params.obj_size = &obj_size;
7922
7923 ret = read_op.prepare();
7924 if (ret < 0) {
7925 return ret;
7926 }
7927
7928 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
7929 src_attrs.erase(RGW_ATTR_DELETE_AT);
7930
7931 set_copy_attrs(src_attrs, attrs, attrs_mod);
7932 attrs.erase(RGW_ATTR_ID_TAG);
7933 attrs.erase(RGW_ATTR_PG_VER);
7934 attrs.erase(RGW_ATTR_SOURCE_ZONE);
7935 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
7936 if (cmp != src_attrs.end())
7937 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
7938
7939 RGWObjManifest manifest;
7940 RGWObjState *astate = NULL;
7941
7942 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
7943 if (ret < 0) {
7944 return ret;
7945 }
7946
7947 vector<rgw_raw_obj> ref_objs;
7948
7949 if (remote_dest) {
7950 /* dest is in a different zonegroup, copy it there */
7951 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
7952 }
7953 uint64_t max_chunk_size;
7954
7955 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
7956 if (ret < 0) {
7957 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
7958 return ret;
7959 }
7960
7961 rgw_pool src_pool;
7962 rgw_pool dest_pool;
7963 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
7964 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7965 return -EIO;
7966 }
7967 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
7968 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7969 return -EIO;
7970 }
7971
7972
7973 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
7974 bool copy_first = false;
7975 if (astate->has_manifest) {
7976 if (!astate->manifest.has_tail()) {
7977 copy_data = true;
7978 } else {
7979 uint64_t head_size = astate->manifest.get_head_size();
7980
7981 if (head_size > 0) {
7982 if (head_size > max_chunk_size) {
7983 copy_data = true;
7984 } else {
7985 copy_first = true;
7986 }
7987 }
7988 }
7989 }
7990
7991 if (petag) {
7992 const auto iter = attrs.find(RGW_ATTR_ETAG);
7993 if (iter != attrs.end()) {
7994 *petag = iter->second;
7995 }
7996 }
7997
7998 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
7999 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8000 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
8001 version_id, ptag, petag);
8002 }
8003
8004 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8005
8006 if (copy_first) { // we need to copy first chunk, not increase refcount
8007 ++miter;
8008 }
8009
8010 rgw_rados_ref ref;
8011 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8012 if (ret < 0) {
8013 return ret;
8014 }
8015
8016 bool versioned_dest = dest_bucket_info.versioning_enabled();
8017
8018 if (version_id && !version_id->empty()) {
8019 versioned_dest = true;
8020 dest_obj.key.set_instance(*version_id);
8021 } else if (versioned_dest) {
8022 gen_rand_obj_instance_name(&dest_obj);
8023 }
8024
8025 bufferlist first_chunk;
8026
8027 bool copy_itself = (dest_obj == src_obj);
8028 RGWObjManifest *pmanifest;
8029 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
8030
8031 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8032 RGWRados::Object::Write write_op(&dest_op_target);
8033
8034 string tag;
8035
8036 if (ptag) {
8037 tag = *ptag;
8038 }
8039
8040 if (tag.empty()) {
8041 append_rand_alpha(cct, tag, tag, 32);
8042 }
8043
8044 if (!copy_itself) {
8045 manifest = astate->manifest;
8046 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8047 if (tail_placement.bucket.name.empty()) {
8048 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8049 }
8050 string oid, key;
8051 for (; miter != astate->manifest.obj_end(); ++miter) {
8052 ObjectWriteOperation op;
8053 cls_refcount_get(op, tag, true);
8054 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8055 ref.ioctx.locator_set_key(loc.loc);
8056
8057 ret = ref.ioctx.operate(loc.oid, &op);
8058 if (ret < 0) {
8059 goto done_ret;
8060 }
8061
8062 ref_objs.push_back(loc);
8063 }
8064
8065 pmanifest = &manifest;
8066 } else {
8067 pmanifest = &astate->manifest;
8068 /* don't send the object's tail for garbage collection */
8069 astate->keep_tail = true;
8070 }
8071
8072 if (copy_first) {
8073 ret = read_op.read(0, max_chunk_size, first_chunk);
8074 if (ret < 0) {
8075 goto done_ret;
8076 }
8077
8078 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8079 } else {
8080 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8081 }
8082
8083 write_op.meta.data = &first_chunk;
8084 write_op.meta.manifest = pmanifest;
8085 write_op.meta.ptag = &tag;
8086 write_op.meta.owner = dest_bucket_info.owner;
8087 write_op.meta.mtime = mtime;
8088 write_op.meta.flags = PUT_OBJ_CREATE;
8089 write_op.meta.category = category;
8090 write_op.meta.olh_epoch = olh_epoch;
8091 write_op.meta.delete_at = delete_at;
8092
8093 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8094 if (ret < 0) {
8095 goto done_ret;
8096 }
8097
8098 return 0;
8099
8100 done_ret:
8101 if (!copy_itself) {
8102 vector<rgw_raw_obj>::iterator riter;
8103
8104 string oid, key;
8105
8106 /* rollback reference */
8107 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8108 ObjectWriteOperation op;
8109 cls_refcount_put(op, tag, true);
8110
8111 ref.ioctx.locator_set_key(riter->loc);
8112
8113 int r = ref.ioctx.operate(riter->oid, &op);
8114 if (r < 0) {
8115 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8116 }
8117 }
8118 }
8119 return ret;
8120 }
8121
8122
8123 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8124 RGWBucketInfo& dest_bucket_info,
8125 RGWRados::Object::Read& read_op, off_t end,
8126 rgw_obj& dest_obj,
8127 rgw_obj& src_obj,
8128 uint64_t max_chunk_size,
8129 real_time *mtime,
8130 real_time set_mtime,
8131 map<string, bufferlist>& attrs,
8132 RGWObjCategory category,
8133 uint64_t olh_epoch,
8134 real_time delete_at,
8135 string *version_id,
8136 string *ptag,
8137 ceph::buffer::list *petag)
8138 {
8139 bufferlist first_chunk;
8140 RGWObjManifest manifest;
8141
8142 string tag;
8143 append_rand_alpha(cct, tag, tag, 32);
8144
8145 RGWPutObjProcessor_Atomic processor(obj_ctx,
8146 dest_bucket_info, dest_obj.bucket, dest_obj.get_oid(),
8147 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8148 if (version_id) {
8149 processor.set_version_id(*version_id);
8150 }
8151 processor.set_olh_epoch(olh_epoch);
8152 int ret = processor.prepare(this, NULL);
8153 if (ret < 0)
8154 return ret;
8155
8156 off_t ofs = 0;
8157
8158 do {
8159 bufferlist bl;
8160 ret = read_op.read(ofs, end, bl);
8161
8162 uint64_t read_len = ret;
8163 bool again;
8164
8165 do {
8166 void *handle;
8167 rgw_raw_obj obj;
8168
8169 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8170 if (ret < 0) {
8171 return ret;
8172 }
8173 ret = processor.throttle_data(handle, obj, read_len, false);
8174 if (ret < 0)
8175 return ret;
8176 } while (again);
8177
8178 ofs += read_len;
8179 } while (ofs <= end);
8180
8181 string etag;
8182 auto iter = attrs.find(RGW_ATTR_ETAG);
8183 if (iter != attrs.end()) {
8184 bufferlist& bl = iter->second;
8185 etag = string(bl.c_str(), bl.length());
8186 if (petag) {
8187 *petag = bl;
8188 }
8189 }
8190
8191 uint64_t accounted_size;
8192 {
8193 bool compressed{false};
8194 RGWCompressionInfo cs_info;
8195 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8196 if (ret < 0) {
8197 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8198 return ret;
8199 }
8200 // pass original size if compressed
8201 accounted_size = compressed ? cs_info.orig_size : ofs;
8202 }
8203
8204 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8205 }
8206
8207 bool RGWRados::is_meta_master()
8208 {
8209 if (!get_zonegroup().is_master_zonegroup()) {
8210 return false;
8211 }
8212
8213 return (get_zonegroup().master_zone == zone_public_config.id);
8214 }
8215
8216 /**
8217 * Check to see if the bucket metadata could be synced
8218 * bucket: the bucket to check
8219 * Returns false is the bucket is not synced
8220 */
8221 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8222 {
8223
8224 /* no current period */
8225 if (current_period.get_id().empty()) {
8226 return false;
8227 }
8228
8229 /* zonegroup is not master zonegroup */
8230 if (!get_zonegroup().is_master_zonegroup()) {
8231 return false;
8232 }
8233
8234 /* single zonegroup and a single zone */
8235 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
8236 return false;
8237 }
8238
8239 /* zone is not master */
8240 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8241 return false;
8242 }
8243
8244 return true;
8245 }
8246
8247 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8248 {
8249 std::map<string, rgw_bucket_dir_entry> ent_map;
8250 rgw_obj_index_key marker;
8251 string prefix;
8252 bool is_truncated;
8253
8254 do {
8255 #define NUM_ENTRIES 1000
8256 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
8257 &is_truncated, &marker);
8258 if (r < 0)
8259 return r;
8260
8261 string ns;
8262 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
8263 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
8264 rgw_obj_key obj;
8265
8266 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
8267 return -ENOTEMPTY;
8268 }
8269 } while (is_truncated);
8270 return 0;
8271 }
8272
8273 /**
8274 * Delete a bucket.
8275 * bucket: the name of the bucket to delete
8276 * Returns 0 on success, -ERR# otherwise.
8277 */
8278 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8279 {
8280 const rgw_bucket& bucket = bucket_info.bucket;
8281 librados::IoCtx index_ctx;
8282 map<int, string> bucket_objs;
8283 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8284 if (r < 0)
8285 return r;
8286
8287 if (check_empty) {
8288 r = check_bucket_empty(bucket_info);
8289 if (r < 0) {
8290 return r;
8291 }
8292 }
8293
8294 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8295 if (r < 0)
8296 return r;
8297
8298 /* if the bucket is not synced we can remove the meta file */
8299 if (!is_syncing_bucket_meta(bucket)) {
8300 RGWObjVersionTracker objv_tracker;
8301 string entry = bucket.get_key();
8302 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8303 if (r < 0) {
8304 return r;
8305 }
8306 /* remove bucket index objects*/
8307 map<int, string>::const_iterator biter;
8308 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8309 index_ctx.remove(biter->second);
8310 }
8311 }
8312 return 0;
8313 }
8314
8315 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8316 {
8317 RGWBucketInfo info;
8318 map<string, bufferlist> attrs;
8319 RGWObjectCtx obj_ctx(this);
8320 int r;
8321 if (bucket.bucket_id.empty()) {
8322 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8323 } else {
8324 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8325 }
8326 if (r < 0) {
8327 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8328 return r;
8329 }
8330
8331 info.owner = owner.get_id();
8332
8333 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8334 if (r < 0) {
8335 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8336 return r;
8337 }
8338
8339 return 0;
8340 }
8341
8342
8343 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8344 {
8345 int ret = 0;
8346
8347 vector<rgw_bucket>::iterator iter;
8348
8349 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8350 rgw_bucket& bucket = *iter;
8351 if (enabled)
8352 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8353 else
8354 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8355
8356 RGWBucketInfo info;
8357 map<string, bufferlist> attrs;
8358 RGWObjectCtx obj_ctx(this);
8359 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8360 if (r < 0) {
8361 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8362 ret = r;
8363 continue;
8364 }
8365 if (enabled) {
8366 info.flags &= ~BUCKET_SUSPENDED;
8367 } else {
8368 info.flags |= BUCKET_SUSPENDED;
8369 }
8370
8371 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8372 if (r < 0) {
8373 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8374 ret = r;
8375 continue;
8376 }
8377 }
8378 return ret;
8379 }
8380
8381 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8382 {
8383 RGWBucketInfo bucket_info;
8384 RGWObjectCtx obj_ctx(this);
8385 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8386 if (ret < 0) {
8387 return ret;
8388 }
8389
8390 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8391 return 0;
8392 }
8393
8394 int RGWRados::Object::complete_atomic_modification()
8395 {
8396 if (!state->has_manifest || state->keep_tail)
8397 return 0;
8398
8399 cls_rgw_obj_chain chain;
8400 store->update_gc_chain(obj, state->manifest, &chain);
8401
8402 if (chain.empty()) {
8403 return 0;
8404 }
8405
8406 string tag = state->obj_tag.to_str();
8407 return store->gc->send_chain(chain, tag, false); // do it async
8408 }
8409
8410 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8411 {
8412 RGWObjManifest::obj_iterator iter;
8413 rgw_raw_obj raw_head;
8414 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8415 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8416 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8417 if (mobj == raw_head)
8418 continue;
8419 cls_rgw_obj_key key(mobj.oid);
8420 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8421 }
8422 }
8423
8424 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8425 {
8426 return gc->send_chain(chain, tag, sync);
8427 }
8428
8429 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8430 {
8431 const rgw_bucket& bucket = bucket_info.bucket;
8432 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8433 if (r < 0)
8434 return r;
8435
8436 if (bucket.bucket_id.empty()) {
8437 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8438 return -EIO;
8439 }
8440
8441 bucket_oid = dir_oid_prefix;
8442 bucket_oid.append(bucket.bucket_id);
8443
8444 return 0;
8445 }
8446
8447 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8448 string& bucket_oid_base) {
8449 const rgw_bucket& bucket = bucket_info.bucket;
8450 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8451 if (r < 0)
8452 return r;
8453
8454 if (bucket.bucket_id.empty()) {
8455 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8456 return -EIO;
8457 }
8458
8459 bucket_oid_base = dir_oid_prefix;
8460 bucket_oid_base.append(bucket.bucket_id);
8461
8462 return 0;
8463
8464 }
8465
8466 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8467 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8468 string bucket_oid_base;
8469 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8470 if (ret < 0) {
8471 return ret;
8472 }
8473
8474 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8475 if (bucket_instance_ids) {
8476 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8477 }
8478 return 0;
8479 }
8480
8481 template<typename T>
8482 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8483 map<int, string>& oids, map<int, T>& bucket_objs,
8484 int shard_id, map<int, string> *bucket_instance_ids)
8485 {
8486 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8487 if (ret < 0)
8488 return ret;
8489
8490 map<int, string>::const_iterator iter = oids.begin();
8491 for (; iter != oids.end(); ++iter) {
8492 bucket_objs[iter->first] = T();
8493 }
8494 return 0;
8495 }
8496
8497 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8498 const string& obj_key, string *bucket_obj, int *shard_id)
8499 {
8500 string bucket_oid_base;
8501 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8502 if (ret < 0)
8503 return ret;
8504
8505 RGWObjectCtx obj_ctx(this);
8506
8507 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8508 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8509 if (ret < 0) {
8510 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8511 return ret;
8512 }
8513 return 0;
8514 }
8515
8516 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8517 int shard_id, string *bucket_obj)
8518 {
8519 string bucket_oid_base;
8520 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8521 if (ret < 0)
8522 return ret;
8523
8524 RGWObjectCtx obj_ctx(this);
8525
8526 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8527 shard_id, bucket_obj);
8528 return 0;
8529 }
8530
8531 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8532 map<RGWObjCategory, RGWStorageStats>& stats)
8533 {
8534 for (const auto& pair : header.stats) {
8535 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8536 const rgw_bucket_category_stats& header_stats = pair.second;
8537
8538 RGWStorageStats& s = stats[category];
8539
8540 s.category = category;
8541 s.size += header_stats.total_size;
8542 s.size_rounded += header_stats.total_size_rounded;
8543 s.size_utilized += header_stats.actual_size;
8544 s.num_objects += header_stats.num_entries;
8545 }
8546 }
8547
8548 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8549 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8550 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8551 {
8552 librados::IoCtx index_ctx;
8553 // key - bucket index object id
8554 // value - bucket index check OP returned result with the given bucket index object (shard)
8555 map<int, string> oids;
8556 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8557
8558 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8559 if (ret < 0) {
8560 return ret;
8561 }
8562
8563 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8564 if (ret < 0) {
8565 return ret;
8566 }
8567
8568 // Aggregate results (from different shards if there is any)
8569 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8570 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8571 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8572 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8573 }
8574
8575 return 0;
8576 }
8577
8578 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8579 {
8580 librados::IoCtx index_ctx;
8581 map<int, string> bucket_objs;
8582
8583 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8584 if (r < 0) {
8585 return r;
8586 }
8587
8588 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8589 }
8590
8591 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8592 {
8593 librados::IoCtx index_ctx;
8594 map<int, string> bucket_objs;
8595
8596 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8597 if (r < 0) {
8598 return r;
8599 }
8600
8601 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8602 }
8603
8604 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8605 {
8606 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8607 std::string oid, key;
8608 get_obj_bucket_and_oid_loc(obj, oid, key);
8609 if (!rctx)
8610 return 0;
8611
8612 RGWObjState *state = NULL;
8613
8614 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8615 if (r < 0)
8616 return r;
8617
8618 if (!state->is_atomic) {
8619 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8620 return -EINVAL;
8621 }
8622
8623 if (state->obj_tag.length() == 0) {// check for backward compatibility
8624 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8625 return -EINVAL;
8626 }
8627
8628 string tag = state->obj_tag.c_str();
8629
8630 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8631
8632 return gc->defer_chain(tag, false);
8633 }
8634
8635 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8636 {
8637 list<string> prefixes;
8638 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8639 cls_rgw_remove_obj(op, prefixes);
8640 }
8641
8642 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
8643 {
8644 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
8645 }
8646
8647 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
8648 {
8649 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
8650 }
8651
8652
8653 /**
8654 * Delete an object.
8655 * bucket: name of the bucket storing the object
8656 * obj: name of the object to delete
8657 * Returns: 0 on success, -ERR# otherwise.
8658 */
8659 int RGWRados::Object::Delete::delete_obj()
8660 {
8661 RGWRados *store = target->get_store();
8662 rgw_obj& src_obj = target->get_obj();
8663 const string& instance = src_obj.key.instance;
8664 rgw_obj obj = src_obj;
8665
8666 if (instance == "null") {
8667 obj.key.instance.clear();
8668 }
8669
8670 bool explicit_marker_version = (!params.marker_version_id.empty());
8671
8672 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
8673 if (instance.empty() || explicit_marker_version) {
8674 rgw_obj marker = obj;
8675
8676 if (!params.marker_version_id.empty()) {
8677 if (params.marker_version_id != "null") {
8678 marker.key.set_instance(params.marker_version_id);
8679 }
8680 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
8681 store->gen_rand_obj_instance_name(&marker);
8682 }
8683
8684 result.version_id = marker.key.instance;
8685 result.delete_marker = true;
8686
8687 struct rgw_bucket_dir_entry_meta meta;
8688
8689 meta.owner = params.obj_owner.get_id().to_str();
8690 meta.owner_display_name = params.obj_owner.get_display_name();
8691
8692 if (real_clock::is_zero(params.mtime)) {
8693 meta.mtime = real_clock::now();
8694 } else {
8695 meta.mtime = params.mtime;
8696 }
8697
8698 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
8699 if (r < 0) {
8700 return r;
8701 }
8702 } else {
8703 rgw_bucket_dir_entry dirent;
8704
8705 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
8706 if (r < 0) {
8707 return r;
8708 }
8709 result.delete_marker = dirent.is_delete_marker();
8710 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
8711 if (r < 0) {
8712 return r;
8713 }
8714 result.version_id = instance;
8715 }
8716
8717 BucketShard *bs;
8718 int r = target->get_bucket_shard(&bs);
8719 if (r < 0) {
8720 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
8721 return r;
8722 }
8723
8724 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
8725 if (r < 0) {
8726 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
8727 return r;
8728 }
8729
8730 return 0;
8731 }
8732
8733 rgw_rados_ref ref;
8734 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
8735 if (r < 0) {
8736 return r;
8737 }
8738
8739 RGWObjState *state;
8740 r = target->get_state(&state, false);
8741 if (r < 0)
8742 return r;
8743
8744 ObjectWriteOperation op;
8745
8746 if (!real_clock::is_zero(params.unmod_since)) {
8747 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
8748 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
8749 if (!params.high_precision_time) {
8750 ctime.tv_nsec = 0;
8751 unmod.tv_nsec = 0;
8752 }
8753
8754 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
8755 if (ctime > unmod) {
8756 return -ERR_PRECONDITION_FAILED;
8757 }
8758
8759 /* only delete object if mtime is less than or equal to params.unmod_since */
8760 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
8761 }
8762 uint64_t obj_size = state->size;
8763
8764 if (!real_clock::is_zero(params.expiration_time)) {
8765 bufferlist bl;
8766 real_time delete_at;
8767
8768 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
8769 try {
8770 bufferlist::iterator iter = bl.begin();
8771 ::decode(delete_at, iter);
8772 } catch (buffer::error& err) {
8773 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
8774 return -EIO;
8775 }
8776
8777 if (params.expiration_time != delete_at) {
8778 return -ERR_PRECONDITION_FAILED;
8779 }
8780 } else {
8781 return -ERR_PRECONDITION_FAILED;
8782 }
8783 }
8784
8785 if (!state->exists) {
8786 target->invalidate_state();
8787 return -ENOENT;
8788 }
8789
8790 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true);
8791 if (r < 0)
8792 return r;
8793
8794 RGWBucketInfo& bucket_info = target->get_bucket_info();
8795
8796 RGWRados::Bucket bop(store, bucket_info);
8797 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8798
8799 index_op.set_zones_trace(params.zones_trace);
8800 index_op.set_bilog_flags(params.bilog_flags);
8801
8802
8803 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
8804 if (r < 0)
8805 return r;
8806
8807 store->remove_rgw_head_obj(op);
8808 r = ref.ioctx.operate(ref.oid, &op);
8809 bool need_invalidate = false;
8810 if (r == -ECANCELED) {
8811 /* raced with another operation, we can regard it as removed */
8812 need_invalidate = true;
8813 r = 0;
8814 }
8815
8816 int64_t poolid = ref.ioctx.get_id();
8817 if (r >= 0) {
8818 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
8819 if (obj_tombstone_cache) {
8820 tombstone_entry entry{*state};
8821 obj_tombstone_cache->add(obj, entry);
8822 }
8823 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
8824
8825 int ret = target->complete_atomic_modification();
8826 if (ret < 0) {
8827 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
8828 }
8829 /* other than that, no need to propagate error */
8830 } else {
8831 int ret = index_op.cancel();
8832 if (ret < 0) {
8833 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
8834 }
8835 }
8836
8837 if (need_invalidate) {
8838 target->invalidate_state();
8839 }
8840
8841 if (r < 0)
8842 return r;
8843
8844 /* update quota cache */
8845 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
8846
8847 return 0;
8848 }
8849
8850 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
8851 const RGWBucketInfo& bucket_info,
8852 const rgw_obj& obj,
8853 int versioning_status,
8854 uint16_t bilog_flags,
8855 const real_time& expiration_time,
8856 rgw_zone_set *zones_trace)
8857 {
8858 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
8859 RGWRados::Object::Delete del_op(&del_target);
8860
8861 del_op.params.bucket_owner = bucket_info.owner;
8862 del_op.params.versioning_status = versioning_status;
8863 del_op.params.bilog_flags = bilog_flags;
8864 del_op.params.expiration_time = expiration_time;
8865 del_op.params.zones_trace = zones_trace;
8866
8867 return del_op.delete_obj();
8868 }
8869
8870 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
8871 {
8872 rgw_rados_ref ref;
8873 int r = get_raw_obj_ref(obj, &ref);
8874 if (r < 0) {
8875 return r;
8876 }
8877
8878 ObjectWriteOperation op;
8879
8880 op.remove();
8881 r = ref.ioctx.operate(ref.oid, &op);
8882 if (r < 0)
8883 return r;
8884
8885 return 0;
8886 }
8887
8888 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
8889 {
8890 if (obj.empty()) {
8891 ldout(cct, 1) << "delete_system_obj got empty object name "
8892 << obj << ", returning EINVAL" << dendl;
8893 return -EINVAL;
8894 }
8895 rgw_rados_ref ref;
8896 int r = get_raw_obj_ref(obj, &ref);
8897 if (r < 0) {
8898 return r;
8899 }
8900
8901 ObjectWriteOperation op;
8902
8903 if (objv_tracker) {
8904 objv_tracker->prepare_op_for_write(&op);
8905 }
8906
8907 op.remove();
8908 r = ref.ioctx.operate(ref.oid, &op);
8909 if (r < 0)
8910 return r;
8911
8912 return 0;
8913 }
8914
8915 int RGWRados::delete_obj_index(const rgw_obj& obj)
8916 {
8917 std::string oid, key;
8918 get_obj_bucket_and_oid_loc(obj, oid, key);
8919
8920 RGWObjectCtx obj_ctx(this);
8921
8922 RGWBucketInfo bucket_info;
8923 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
8924 if (ret < 0) {
8925 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
8926 return ret;
8927 }
8928
8929 RGWRados::Bucket bop(this, bucket_info);
8930 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8931
8932 real_time removed_mtime;
8933 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
8934
8935 return r;
8936 }
8937
8938 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
8939 {
8940 string tag;
8941
8942 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
8943 if (mi != manifest.obj_end()) {
8944 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
8945 ++mi;
8946 tag = mi.get_location().get_raw_obj(store).oid;
8947 tag.append("_");
8948 }
8949
8950 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
8951 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
8952 MD5 hash;
8953 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
8954
8955 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
8956 if (iter != attrset.end()) {
8957 bufferlist& bl = iter->second;
8958 hash.Update((const byte *)bl.c_str(), bl.length());
8959 }
8960
8961 hash.Final(md5);
8962 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
8963 tag.append(md5_str);
8964
8965 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
8966
8967 tag_bl.append(tag.c_str(), tag.size() + 1);
8968 }
8969
8970 static bool is_olh(map<string, bufferlist>& attrs)
8971 {
8972 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
8973 return (iter != attrs.end());
8974 }
8975
8976 static bool has_olh_tag(map<string, bufferlist>& attrs)
8977 {
8978 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
8979 return (iter != attrs.end());
8980 }
8981
8982 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8983 RGWObjState *olh_state, RGWObjState **target_state)
8984 {
8985 assert(olh_state->is_olh);
8986
8987 rgw_obj target;
8988 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
8989 if (r < 0) {
8990 return r;
8991 }
8992 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
8993 if (r < 0) {
8994 return r;
8995 }
8996
8997 return 0;
8998 }
8999
9000 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9001 {
9002 if (obj.empty()) {
9003 return -EINVAL;
9004 }
9005
9006 RGWRawObjState *s = rctx->raw.get_state(obj);
9007 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9008 *state = s;
9009 if (s->has_attrs) {
9010 return 0;
9011 }
9012
9013 s->obj = obj;
9014
9015 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9016 if (r == -ENOENT) {
9017 s->exists = false;
9018 s->has_attrs = true;
9019 s->mtime = real_time();
9020 return 0;
9021 }
9022 if (r < 0)
9023 return r;
9024
9025 s->exists = true;
9026 s->has_attrs = true;
9027 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9028
9029 if (s->obj_tag.length())
9030 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9031 << s->obj_tag.c_str() << dendl;
9032 else
9033 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9034
9035 return 0;
9036 }
9037
9038 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9039 {
9040 int ret;
9041
9042 do {
9043 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9044 } while (ret == -EAGAIN);
9045
9046 return ret;
9047 }
9048
9049 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9050 RGWObjState **state, bool follow_olh, bool assume_noent)
9051 {
9052 if (obj.empty()) {
9053 return -EINVAL;
9054 }
9055
9056 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9057
9058 RGWObjState *s = rctx->obj.get_state(obj);
9059 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9060 *state = s;
9061 if (s->has_attrs) {
9062 if (s->is_olh && need_follow_olh) {
9063 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9064 }
9065 return 0;
9066 }
9067
9068 s->obj = obj;
9069
9070 rgw_raw_obj raw_obj;
9071 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9072
9073 int r = -ENOENT;
9074
9075 if (!assume_noent) {
9076 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9077 }
9078
9079 if (r == -ENOENT) {
9080 s->exists = false;
9081 s->has_attrs = true;
9082 tombstone_entry entry;
9083 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9084 s->mtime = entry.mtime;
9085 s->zone_short_id = entry.zone_short_id;
9086 s->pg_ver = entry.pg_ver;
9087 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9088 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9089 } else {
9090 s->mtime = real_time();
9091 }
9092 return 0;
9093 }
9094 if (r < 0)
9095 return r;
9096
9097 s->exists = true;
9098 s->has_attrs = true;
9099 s->accounted_size = s->size;
9100
9101 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9102 const bool compressed = (iter != s->attrset.end());
9103 if (compressed) {
9104 // use uncompressed size for accounted_size
9105 try {
9106 RGWCompressionInfo info;
9107 auto p = iter->second.begin();
9108 ::decode(info, p);
9109 s->accounted_size = info.orig_size;
9110 } catch (buffer::error&) {
9111 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9112 return -EIO;
9113 }
9114 }
9115
9116 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9117 if (iter != s->attrset.end()) {
9118 bufferlist bl = iter->second;
9119 bufferlist::iterator it = bl.begin();
9120 it.copy(bl.length(), s->shadow_obj);
9121 s->shadow_obj[bl.length()] = '\0';
9122 }
9123 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9124
9125 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9126 if (manifest_bl.length()) {
9127 bufferlist::iterator miter = manifest_bl.begin();
9128 try {
9129 ::decode(s->manifest, miter);
9130 s->has_manifest = true;
9131 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9132 broken due to old bugs */
9133 s->size = s->manifest.get_obj_size();
9134 if (!compressed)
9135 s->accounted_size = s->size;
9136 } catch (buffer::error& err) {
9137 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9138 return -EIO;
9139 }
9140 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9141 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9142 RGWObjManifest::obj_iterator mi;
9143 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9144 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9145 }
9146 }
9147
9148 if (!s->obj_tag.length()) {
9149 /*
9150 * Uh oh, something's wrong, object with manifest should have tag. Let's
9151 * create one out of the manifest, would be unique
9152 */
9153 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9154 s->fake_tag = true;
9155 }
9156 }
9157 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9158 if (aiter != s->attrset.end()) {
9159 bufferlist& pg_ver_bl = aiter->second;
9160 if (pg_ver_bl.length()) {
9161 bufferlist::iterator pgbl = pg_ver_bl.begin();
9162 try {
9163 ::decode(s->pg_ver, pgbl);
9164 } catch (buffer::error& err) {
9165 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9166 }
9167 }
9168 }
9169 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9170 if (aiter != s->attrset.end()) {
9171 bufferlist& zone_short_id_bl = aiter->second;
9172 if (zone_short_id_bl.length()) {
9173 bufferlist::iterator zbl = zone_short_id_bl.begin();
9174 try {
9175 ::decode(s->zone_short_id, zbl);
9176 } catch (buffer::error& err) {
9177 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9178 }
9179 }
9180 }
9181 if (s->obj_tag.length())
9182 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9183 else
9184 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9185
9186 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9187 * it exist, and not only if is_olh() returns true
9188 */
9189 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9190 if (iter != s->attrset.end()) {
9191 s->olh_tag = iter->second;
9192 }
9193
9194 if (is_olh(s->attrset)) {
9195 s->is_olh = true;
9196
9197 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9198
9199 if (need_follow_olh) {
9200 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9201 }
9202 }
9203
9204 return 0;
9205 }
9206
9207 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9208 bool follow_olh, bool assume_noent)
9209 {
9210 int ret;
9211
9212 do {
9213 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9214 } while (ret == -EAGAIN);
9215
9216 return ret;
9217 }
9218
9219 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9220 {
9221 RGWObjState *astate;
9222 int r = get_state(&astate, true);
9223 if (r < 0) {
9224 return r;
9225 }
9226
9227 *pmanifest = &astate->manifest;
9228
9229 return 0;
9230 }
9231
9232 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9233 {
9234 RGWObjState *state;
9235 int r = source->get_state(&state, true);
9236 if (r < 0)
9237 return r;
9238 if (!state->exists)
9239 return -ENOENT;
9240 if (!state->get_attr(name, dest))
9241 return -ENODATA;
9242
9243 return 0;
9244 }
9245
9246
9247 int RGWRados::Object::Stat::stat_async()
9248 {
9249 RGWObjectCtx& ctx = source->get_ctx();
9250 rgw_obj& obj = source->get_obj();
9251 RGWRados *store = source->get_store();
9252
9253 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9254 result.obj = obj;
9255 if (s->has_attrs) {
9256 state.ret = 0;
9257 result.size = s->size;
9258 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9259 result.attrs = s->attrset;
9260 result.has_manifest = s->has_manifest;
9261 result.manifest = s->manifest;
9262 return 0;
9263 }
9264
9265 string oid;
9266 string loc;
9267 get_obj_bucket_and_oid_loc(obj, oid, loc);
9268
9269 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9270 if (r < 0) {
9271 return r;
9272 }
9273
9274 librados::ObjectReadOperation op;
9275 op.stat2(&result.size, &result.mtime, NULL);
9276 op.getxattrs(&result.attrs, NULL);
9277 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9278 state.io_ctx.locator_set_key(loc);
9279 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9280 if (r < 0) {
9281 ldout(store->ctx(), 5) << __func__
9282 << ": ERROR: aio_operate() returned ret=" << r
9283 << dendl;
9284 return r;
9285 }
9286
9287 return 0;
9288 }
9289
9290
9291 int RGWRados::Object::Stat::wait()
9292 {
9293 if (!state.completion) {
9294 return state.ret;
9295 }
9296
9297 state.completion->wait_for_safe();
9298 state.ret = state.completion->get_return_value();
9299 state.completion->release();
9300
9301 if (state.ret != 0) {
9302 return state.ret;
9303 }
9304
9305 return finish();
9306 }
9307
9308 int RGWRados::Object::Stat::finish()
9309 {
9310 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9311 if (iter != result.attrs.end()) {
9312 bufferlist& bl = iter->second;
9313 bufferlist::iterator biter = bl.begin();
9314 try {
9315 ::decode(result.manifest, biter);
9316 } catch (buffer::error& err) {
9317 RGWRados *store = source->get_store();
9318 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9319 return -EIO;
9320 }
9321 result.has_manifest = true;
9322 }
9323
9324 return 0;
9325 }
9326
9327 /**
9328 * Get an attribute for a system object.
9329 * obj: the object to get attr
9330 * name: name of the attr to retrieve
9331 * dest: bufferlist to store the result in
9332 * Returns: 0 on success, -ERR# otherwise.
9333 */
9334 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9335 {
9336 rgw_rados_ref ref;
9337 int r = get_system_obj_ref(obj, &ref);
9338 if (r < 0) {
9339 return r;
9340 }
9341
9342 ObjectReadOperation op;
9343
9344 int rval;
9345 op.getxattr(name, &dest, &rval);
9346
9347 r = ref.ioctx.operate(ref.oid, &op, NULL);
9348 if (r < 0)
9349 return r;
9350
9351 return 0;
9352 }
9353
9354 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9355 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9356 ObjectOperation& op, RGWObjState **pstate)
9357 {
9358 if (!rctx)
9359 return 0;
9360
9361 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9362 if (r < 0)
9363 return r;
9364
9365 RGWObjState *state = *pstate;
9366
9367 if (!state->is_atomic) {
9368 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9369 return 0;
9370 }
9371
9372 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9373 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9374 } else {
9375 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9376 }
9377 return 0;
9378 }
9379
9380 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9381 {
9382 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9383 }
9384
9385 void RGWRados::Object::invalidate_state()
9386 {
9387 ctx.obj.invalidate(obj);
9388 }
9389
9390 void RGWRados::SystemObject::invalidate_state()
9391 {
9392 ctx.raw.invalidate(obj);
9393 }
9394
9395 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9396 const char *if_match, const char *if_nomatch, bool removal_op)
9397 {
9398 int r = get_state(&state, false);
9399 if (r < 0)
9400 return r;
9401
9402 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9403 if_match != NULL || if_nomatch != NULL) &&
9404 (!state->fake_tag);
9405
9406 if (!state->is_atomic) {
9407 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9408
9409 if (reset_obj) {
9410 op.create(false);
9411 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9412 }
9413
9414 return 0;
9415 }
9416
9417 if (need_guard) {
9418 /* first verify that the object wasn't replaced under */
9419 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9420 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9421 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9422 }
9423
9424 if (if_match) {
9425 if (strcmp(if_match, "*") == 0) {
9426 // test the object is existing
9427 if (!state->exists) {
9428 return -ERR_PRECONDITION_FAILED;
9429 }
9430 } else {
9431 bufferlist bl;
9432 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9433 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9434 return -ERR_PRECONDITION_FAILED;
9435 }
9436 }
9437 }
9438
9439 if (if_nomatch) {
9440 if (strcmp(if_nomatch, "*") == 0) {
9441 // test the object is NOT existing
9442 if (state->exists) {
9443 return -ERR_PRECONDITION_FAILED;
9444 }
9445 } else {
9446 bufferlist bl;
9447 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9448 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9449 return -ERR_PRECONDITION_FAILED;
9450 }
9451 }
9452 }
9453 }
9454
9455 if (reset_obj) {
9456 if (state->exists) {
9457 op.create(false);
9458 store->remove_rgw_head_obj(op);
9459 } else {
9460 op.create(true);
9461 }
9462 }
9463
9464 if (removal_op) {
9465 /* the object is being removed, no need to update its tag */
9466 return 0;
9467 }
9468
9469 if (ptag) {
9470 state->write_tag = *ptag;
9471 } else {
9472 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9473 }
9474 bufferlist bl;
9475 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9476
9477 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9478
9479 op.setxattr(RGW_ATTR_ID_TAG, bl);
9480
9481 return 0;
9482 }
9483
9484 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9485 RGWObjVersionTracker *objv_tracker)
9486 {
9487 map<string, bufferlist> attrs;
9488 attrs[name] = bl;
9489 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9490 }
9491
9492 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9493 map<string, bufferlist>& attrs,
9494 map<string, bufferlist>* rmattrs,
9495 RGWObjVersionTracker *objv_tracker)
9496 {
9497 rgw_rados_ref ref;
9498 int r = get_system_obj_ref(obj, &ref);
9499 if (r < 0) {
9500 return r;
9501 }
9502 ObjectWriteOperation op;
9503
9504 if (objv_tracker) {
9505 objv_tracker->prepare_op_for_write(&op);
9506 }
9507
9508 map<string, bufferlist>::iterator iter;
9509 if (rmattrs) {
9510 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9511 const string& name = iter->first;
9512 op.rmxattr(name.c_str());
9513 }
9514 }
9515
9516 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9517 const string& name = iter->first;
9518 bufferlist& bl = iter->second;
9519
9520 if (!bl.length())
9521 continue;
9522
9523 op.setxattr(name.c_str(), bl);
9524 }
9525
9526 if (!op.size())
9527 return 0;
9528
9529 bufferlist bl;
9530
9531 r = ref.ioctx.operate(ref.oid, &op);
9532 if (r < 0)
9533 return r;
9534
9535 return 0;
9536 }
9537
9538 /**
9539 * Set an attr on an object.
9540 * bucket: name of the bucket holding the object
9541 * obj: name of the object to set the attr on
9542 * name: the attr to set
9543 * bl: the contents of the attr
9544 * Returns: 0 on success, -ERR# otherwise.
9545 */
9546 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9547 {
9548 map<string, bufferlist> attrs;
9549 attrs[name] = bl;
9550 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9551 }
9552
9553 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9554 map<string, bufferlist>& attrs,
9555 map<string, bufferlist>* rmattrs)
9556 {
9557 rgw_rados_ref ref;
9558 int r = get_obj_head_ref(bucket_info, obj, &ref);
9559 if (r < 0) {
9560 return r;
9561 }
9562 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9563
9564 ObjectWriteOperation op;
9565 RGWObjState *state = NULL;
9566
9567 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9568 if (r < 0)
9569 return r;
9570
9571 map<string, bufferlist>::iterator iter;
9572 if (rmattrs) {
9573 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9574 const string& name = iter->first;
9575 op.rmxattr(name.c_str());
9576 }
9577 }
9578
9579 const rgw_bucket& bucket = obj.bucket;
9580
9581 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9582 const string& name = iter->first;
9583 bufferlist& bl = iter->second;
9584
9585 if (!bl.length())
9586 continue;
9587
9588 op.setxattr(name.c_str(), bl);
9589
9590 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9591 real_time ts;
9592 try {
9593 ::decode(ts, bl);
9594
9595 rgw_obj_index_key obj_key;
9596 obj.key.get_index_key(&obj_key);
9597
9598 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9599 } catch (buffer::error& err) {
9600 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9601 }
9602 }
9603 }
9604
9605 if (!op.size())
9606 return 0;
9607
9608 RGWObjectCtx obj_ctx(this);
9609
9610 bufferlist bl;
9611 RGWRados::Bucket bop(this, bucket_info);
9612 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9613
9614 if (state) {
9615 string tag;
9616 append_rand_alpha(cct, tag, tag, 32);
9617 state->write_tag = tag;
9618 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9619
9620 if (r < 0)
9621 return r;
9622
9623 bl.append(tag.c_str(), tag.size() + 1);
9624
9625 op.setxattr(RGW_ATTR_ID_TAG, bl);
9626 }
9627
9628 r = ref.ioctx.operate(ref.oid, &op);
9629 if (state) {
9630 if (r >= 0) {
9631 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9632 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9633 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9634 string etag(etag_bl.c_str(), etag_bl.length());
9635 string content_type(content_type_bl.c_str(), content_type_bl.length());
9636 uint64_t epoch = ref.ioctx.get_last_version();
9637 int64_t poolid = ref.ioctx.get_id();
9638 real_time mtime = real_clock::now();
9639 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9640 mtime, etag, content_type, &acl_bl,
9641 RGW_OBJ_CATEGORY_MAIN, NULL);
9642 } else {
9643 int ret = index_op.cancel();
9644 if (ret < 0) {
9645 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9646 }
9647 }
9648 }
9649 if (r < 0)
9650 return r;
9651
9652 if (state) {
9653 state->obj_tag.swap(bl);
9654 if (rmattrs) {
9655 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9656 state->attrset.erase(iter->first);
9657 }
9658 }
9659 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9660 state->attrset[iter->first] = iter->second;
9661 }
9662 }
9663
9664 return 0;
9665 }
9666
9667 int RGWRados::Object::Read::prepare()
9668 {
9669 RGWRados *store = source->get_store();
9670 CephContext *cct = store->ctx();
9671
9672 bufferlist etag;
9673
9674 map<string, bufferlist>::iterator iter;
9675
9676 RGWObjState *astate;
9677 int r = source->get_state(&astate, true);
9678 if (r < 0)
9679 return r;
9680
9681 if (!astate->exists) {
9682 return -ENOENT;
9683 }
9684
9685 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9686
9687 state.obj = astate->obj;
9688 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9689
9690 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9691 if (r < 0) {
9692 return r;
9693 }
9694 if (params.attrs) {
9695 *params.attrs = astate->attrset;
9696 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9697 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9698 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9699 }
9700 }
9701 }
9702
9703 /* Convert all times go GMT to make them compatible */
9704 if (conds.mod_ptr || conds.unmod_ptr) {
9705 obj_time_weight src_weight;
9706 src_weight.init(astate);
9707 src_weight.high_precision = conds.high_precision_time;
9708
9709 obj_time_weight dest_weight;
9710 dest_weight.high_precision = conds.high_precision_time;
9711
9712 if (conds.mod_ptr) {
9713 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9714 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9715 if (!(dest_weight < src_weight)) {
9716 return -ERR_NOT_MODIFIED;
9717 }
9718 }
9719
9720 if (conds.unmod_ptr) {
9721 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9722 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9723 if (dest_weight < src_weight) {
9724 return -ERR_PRECONDITION_FAILED;
9725 }
9726 }
9727 }
9728 if (conds.if_match || conds.if_nomatch) {
9729 r = get_attr(RGW_ATTR_ETAG, etag);
9730 if (r < 0)
9731 return r;
9732
9733 if (conds.if_match) {
9734 string if_match_str = rgw_string_unquote(conds.if_match);
9735 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9736 if (if_match_str.compare(etag.c_str()) != 0) {
9737 return -ERR_PRECONDITION_FAILED;
9738 }
9739 }
9740
9741 if (conds.if_nomatch) {
9742 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9743 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9744 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9745 return -ERR_NOT_MODIFIED;
9746 }
9747 }
9748 }
9749
9750 if (params.obj_size)
9751 *params.obj_size = astate->size;
9752 if (params.lastmod)
9753 *params.lastmod = astate->mtime;
9754
9755 return 0;
9756 }
9757
9758 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9759 {
9760 if (ofs < 0) {
9761 ofs += obj_size;
9762 if (ofs < 0)
9763 ofs = 0;
9764 end = obj_size - 1;
9765 } else if (end < 0) {
9766 end = obj_size - 1;
9767 }
9768
9769 if (obj_size > 0) {
9770 if (ofs >= (off_t)obj_size) {
9771 return -ERANGE;
9772 }
9773 if (end >= (off_t)obj_size) {
9774 end = obj_size - 1;
9775 }
9776 }
9777 return 0;
9778 }
9779
9780 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9781 {
9782 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9783 }
9784
9785 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9786 RGWRados::SystemObject::Read::GetObjState& state,
9787 rgw_raw_obj& obj,
9788 map<string, bufferlist> *attrs,
9789 real_time *lastmod,
9790 uint64_t *obj_size,
9791 RGWObjVersionTracker *objv_tracker)
9792 {
9793 RGWRawObjState *astate = NULL;
9794
9795 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
9796 if (r < 0)
9797 return r;
9798
9799 if (!astate->exists) {
9800 return -ENOENT;
9801 }
9802
9803 if (attrs) {
9804 *attrs = astate->attrset;
9805 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9806 map<string, bufferlist>::iterator iter;
9807 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
9808 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9809 }
9810 }
9811 }
9812
9813 if (obj_size)
9814 *obj_size = astate->size;
9815 if (lastmod)
9816 *lastmod = astate->mtime;
9817
9818 return 0;
9819 }
9820
9821
9822 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
9823 {
9824 RGWRados *store = target->get_store();
9825 BucketShard *bs;
9826 int r;
9827
9828 #define NUM_RESHARD_RETRIES 10
9829 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
9830 int ret = get_bucket_shard(&bs);
9831 if (ret < 0) {
9832 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9833 return ret;
9834 }
9835 r = call(bs);
9836 if (r != -ERR_BUSY_RESHARDING) {
9837 break;
9838 }
9839 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
9840 string new_bucket_id;
9841 r = store->block_while_resharding(bs, &new_bucket_id);
9842 if (r == -ERR_BUSY_RESHARDING) {
9843 continue;
9844 }
9845 if (r < 0) {
9846 return r;
9847 }
9848 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
9849 i = 0; /* resharding is finished, make sure we can retry */
9850 r = target->update_bucket_id(new_bucket_id);
9851 if (r < 0) {
9852 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
9853 return r;
9854 }
9855 invalidate_bs();
9856 }
9857
9858 if (r < 0) {
9859 return r;
9860 }
9861
9862 if (pbs) {
9863 *pbs = bs;
9864 }
9865
9866 return 0;
9867 }
9868
9869 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
9870 {
9871 RGWRados *store = source->get_store();
9872 rgw_raw_obj& obj = source->get_obj();
9873
9874 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
9875 stat_params.lastmod, stat_params.obj_size, objv_tracker);
9876 }
9877
9878 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
9879 {
9880 if (blind) {
9881 return 0;
9882 }
9883 RGWRados *store = target->get_store();
9884
9885 if (write_tag && write_tag->length()) {
9886 optag = string(write_tag->c_str(), write_tag->length());
9887 } else {
9888 if (optag.empty()) {
9889 append_rand_alpha(store->ctx(), optag, optag, 32);
9890 }
9891 }
9892
9893 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
9894 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
9895 });
9896
9897 if (r < 0) {
9898 return r;
9899 }
9900 prepared = true;
9901
9902 return 0;
9903 }
9904
9905 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
9906 uint64_t size, uint64_t accounted_size,
9907 ceph::real_time& ut, const string& etag,
9908 const string& content_type,
9909 bufferlist *acl_bl,
9910 RGWObjCategory category,
9911 list<rgw_obj_index_key> *remove_objs, const string *user_data)
9912 {
9913 if (blind) {
9914 return 0;
9915 }
9916 RGWRados *store = target->get_store();
9917 BucketShard *bs;
9918
9919 int ret = get_bucket_shard(&bs);
9920 if (ret < 0) {
9921 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9922 return ret;
9923 }
9924
9925 rgw_bucket_dir_entry ent;
9926 obj.key.get_index_key(&ent.key);
9927 ent.meta.size = size;
9928 ent.meta.accounted_size = accounted_size;
9929 ent.meta.mtime = ut;
9930 ent.meta.etag = etag;
9931 if (user_data)
9932 ent.meta.user_data = *user_data;
9933
9934 ACLOwner owner;
9935 if (acl_bl && acl_bl->length()) {
9936 int ret = store->decode_policy(*acl_bl, &owner);
9937 if (ret < 0) {
9938 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
9939 }
9940 }
9941 ent.meta.owner = owner.get_id().to_str();
9942 ent.meta.owner_display_name = owner.get_display_name();
9943 ent.meta.content_type = content_type;
9944
9945 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
9946
9947 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9948 if (r < 0) {
9949 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9950 }
9951
9952 return ret;
9953 }
9954
9955 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
9956 real_time& removed_mtime,
9957 list<rgw_obj_index_key> *remove_objs)
9958 {
9959 if (blind) {
9960 return 0;
9961 }
9962 RGWRados *store = target->get_store();
9963 BucketShard *bs;
9964
9965 int ret = get_bucket_shard(&bs);
9966 if (ret < 0) {
9967 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9968 return ret;
9969 }
9970
9971 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
9972
9973 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9974 if (r < 0) {
9975 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9976 }
9977
9978 return ret;
9979 }
9980
9981
9982 int RGWRados::Bucket::UpdateIndex::cancel()
9983 {
9984 if (blind) {
9985 return 0;
9986 }
9987 RGWRados *store = target->get_store();
9988 BucketShard *bs;
9989
9990 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
9991 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
9992 });
9993
9994 /*
9995 * need to update data log anyhow, so that whoever follows needs to update its internal markers
9996 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
9997 * have no way to tell that they're all caught up
9998 */
9999 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10000 if (r < 0) {
10001 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10002 }
10003
10004 return ret;
10005 }
10006
10007 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10008 {
10009 RGWRados *store = source->get_store();
10010 CephContext *cct = store->ctx();
10011
10012 std::string oid, key;
10013 rgw_raw_obj read_obj;
10014 uint64_t read_ofs = ofs;
10015 uint64_t len, read_len;
10016 bool reading_from_head = true;
10017 ObjectReadOperation op;
10018
10019 bool merge_bl = false;
10020 bufferlist *pbl = &bl;
10021 bufferlist read_bl;
10022 uint64_t max_chunk_size;
10023
10024 RGWObjState *astate;
10025 int r = source->get_state(&astate, true);
10026 if (r < 0)
10027 return r;
10028
10029 if (end < 0)
10030 len = 0;
10031 else
10032 len = end - ofs + 1;
10033
10034 if (astate->has_manifest && astate->manifest.has_tail()) {
10035 /* now get the relevant object part */
10036 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10037
10038 uint64_t stripe_ofs = iter.get_stripe_ofs();
10039 read_obj = iter.get_location().get_raw_obj(store);
10040 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10041 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10042 reading_from_head = (read_obj == state.head_obj);
10043 } else {
10044 read_obj = state.head_obj;
10045 }
10046
10047 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10048 if (r < 0) {
10049 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10050 return r;
10051 }
10052
10053 if (len > max_chunk_size)
10054 len = max_chunk_size;
10055
10056
10057 state.io_ctx.locator_set_key(read_obj.loc);
10058
10059 read_len = len;
10060
10061 if (reading_from_head) {
10062 /* only when reading from the head object do we need to do the atomic test */
10063 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10064 if (r < 0)
10065 return r;
10066
10067 if (astate && astate->prefetch_data) {
10068 if (!ofs && astate->data.length() >= len) {
10069 bl = astate->data;
10070 return bl.length();
10071 }
10072
10073 if (ofs < astate->data.length()) {
10074 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10075 astate->data.copy(ofs, copy_len, bl);
10076 read_len -= copy_len;
10077 read_ofs += copy_len;
10078 if (!read_len)
10079 return bl.length();
10080
10081 merge_bl = true;
10082 pbl = &read_bl;
10083 }
10084 }
10085 }
10086
10087 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10088 op.read(read_ofs, read_len, pbl, NULL);
10089
10090 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10091 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10092
10093 if (r < 0) {
10094 return r;
10095 }
10096
10097 if (merge_bl) {
10098 bl.append(read_bl);
10099 }
10100
10101 return bl.length();
10102 }
10103
10104 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10105 {
10106 if (!has_ref) {
10107 int r = store->get_raw_obj_ref(obj, &ref);
10108 if (r < 0) {
10109 return r;
10110 }
10111 has_ref = true;
10112 }
10113 *pref = &ref;
10114 return 0;
10115
10116 }
10117
10118 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10119 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10120 bufferlist& bl, off_t ofs, off_t end,
10121 map<string, bufferlist> *attrs,
10122 rgw_cache_entry_info *cache_info)
10123 {
10124 uint64_t len;
10125 ObjectReadOperation op;
10126
10127 if (end < 0)
10128 len = 0;
10129 else
10130 len = end - ofs + 1;
10131
10132 if (objv_tracker) {
10133 objv_tracker->prepare_op_for_read(&op);
10134 }
10135
10136 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10137 op.read(ofs, len, &bl, NULL);
10138
10139 if (attrs) {
10140 op.getxattrs(attrs, NULL);
10141 }
10142
10143 rgw_rados_ref *ref;
10144 int r = read_state.get_ref(this, obj, &ref);
10145 if (r < 0) {
10146 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10147 return r;
10148 }
10149 r = ref->ioctx.operate(ref->oid, &op, NULL);
10150 if (r < 0) {
10151 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10152 return r;
10153 }
10154 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10155
10156 uint64_t op_ver = ref->ioctx.get_last_version();
10157
10158 if (read_state.last_ver > 0 &&
10159 read_state.last_ver != op_ver) {
10160 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10161 return -ECANCELED;
10162 }
10163
10164 read_state.last_ver = op_ver;
10165
10166 return bl.length();
10167 }
10168
10169 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl, RGWObjVersionTracker *objv_tracker)
10170 {
10171 RGWRados *store = source->get_store();
10172 rgw_raw_obj& obj = source->get_obj();
10173
10174 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl, ofs, end, read_params.attrs, read_params.cache_info);
10175 }
10176
10177 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10178 {
10179 RGWRados *store = source->get_store();
10180 rgw_raw_obj& obj = source->get_obj();
10181
10182 return store->system_obj_get_attr(obj, name, dest);
10183 }
10184
10185 struct get_obj_data;
10186
/* Per-IO argument handed to the aio completion callback. */
struct get_obj_aio_data {
  struct get_obj_data *op_data; /* the read operation this IO belongs to */
  off_t ofs;                    /* object offset the IO was issued for */
  off_t len;                    /* length of the IO; returned to the throttle on completion */
};
10192
/* Buffer slot for one in-flight read, kept in get_obj_data::io_map keyed by offset. */
struct get_obj_io {
  off_t len;
  bufferlist bl; /* destination buffer of the read */
};
10197
10198 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10199
10200 struct get_obj_data : public RefCountedObject {
10201 CephContext *cct;
10202 RGWRados *rados;
10203 RGWObjectCtx *ctx;
10204 IoCtx io_ctx;
10205 map<off_t, get_obj_io> io_map;
10206 map<off_t, librados::AioCompletion *> completion_map;
10207 uint64_t total_read;
10208 Mutex lock;
10209 Mutex data_lock;
10210 list<get_obj_aio_data> aio_data;
10211 RGWGetDataCB *client_cb;
10212 std::atomic<bool> cancelled = { false };
10213 std::atomic<int64_t> err_code = { 0 };
10214 Throttle throttle;
10215 list<bufferlist> read_list;
10216
10217 explicit get_obj_data(CephContext *_cct)
10218 : cct(_cct),
10219 rados(NULL), ctx(NULL),
10220 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10221 client_cb(NULL),
10222 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10223 ~get_obj_data() override { }
10224 void set_cancelled(int r) {
10225 cancelled = true;
10226 err_code = r;
10227 }
10228
10229 bool is_cancelled() {
10230 return cancelled;
10231 }
10232
10233 int get_err_code() {
10234 return err_code;
10235 }
10236
10237 int wait_next_io(bool *done) {
10238 lock.Lock();
10239 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10240 if (iter == completion_map.end()) {
10241 *done = true;
10242 lock.Unlock();
10243 return 0;
10244 }
10245 off_t cur_ofs = iter->first;
10246 librados::AioCompletion *c = iter->second;
10247 lock.Unlock();
10248
10249 c->wait_for_safe_and_cb();
10250 int r = c->get_return_value();
10251
10252 lock.Lock();
10253 completion_map.erase(cur_ofs);
10254
10255 if (completion_map.empty()) {
10256 *done = true;
10257 }
10258 lock.Unlock();
10259
10260 c->release();
10261
10262 return r;
10263 }
10264
10265 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10266 Mutex::Locker l(lock);
10267
10268 const auto& io_iter = io_map.insert(
10269 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10270
10271 assert(io_iter.second); // assert new insertion
10272
10273 get_obj_io& io = (io_iter.first)->second;
10274 *pbl = &io.bl;
10275
10276 struct get_obj_aio_data aio;
10277 aio.ofs = ofs;
10278 aio.len = len;
10279 aio.op_data = this;
10280
10281 aio_data.push_back(aio);
10282
10283 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10284
10285 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10286 completion_map[ofs] = c;
10287
10288 *pc = c;
10289
10290 /* we have a reference per IO, plus one reference for the calling function.
10291 * reference is dropped for each callback, plus when we're done iterating
10292 * over the parts */
10293 get();
10294 }
10295
10296 void cancel_io(off_t ofs) {
10297 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10298 lock.Lock();
10299 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10300 if (iter != completion_map.end()) {
10301 AioCompletion *c = iter->second;
10302 c->release();
10303 completion_map.erase(ofs);
10304 io_map.erase(ofs);
10305 }
10306 lock.Unlock();
10307
10308 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10309 * need IoCtx to live, as io callback may still be called
10310 */
10311 }
10312
10313 void cancel_all_io() {
10314 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10315 Mutex::Locker l(lock);
10316 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10317 iter != completion_map.end(); ++iter) {
10318 librados::AioCompletion *c = iter->second;
10319 c->release();
10320 }
10321 }
10322
10323 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10324 Mutex::Locker l(lock);
10325
10326 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10327
10328 if (liter == io_map.end() ||
10329 liter->first != ofs) {
10330 return 0;
10331 }
10332
10333 map<off_t, librados::AioCompletion *>::iterator aiter;
10334 aiter = completion_map.find(ofs);
10335 if (aiter == completion_map.end()) {
10336 /* completion map does not hold this io, it was cancelled */
10337 return 0;
10338 }
10339
10340 AioCompletion *completion = aiter->second;
10341 int r = completion->get_return_value();
10342 if (r < 0)
10343 return r;
10344
10345 for (; aiter != completion_map.end(); ++aiter) {
10346 completion = aiter->second;
10347 if (!completion->is_safe()) {
10348 /* reached a request that is not yet complete, stop */
10349 break;
10350 }
10351
10352 r = completion->get_return_value();
10353 if (r < 0) {
10354 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10355 return r;
10356 }
10357
10358 total_read += r;
10359
10360 map<off_t, get_obj_io>::iterator old_liter = liter++;
10361 bl_list.push_back(old_liter->second.bl);
10362 io_map.erase(old_liter);
10363 }
10364
10365 return 0;
10366 }
10367 };
10368
10369 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10370 {
10371 struct get_obj_data *d = (struct get_obj_data *)arg;
10372
10373 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10374 }
10375
10376 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10377 {
10378 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10379 struct get_obj_data *d = aio_data->op_data;
10380
10381 d->rados->get_obj_aio_completion_cb(cb, arg);
10382 }
10383
10384
10385 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10386 {
10387 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10388 struct get_obj_data *d = aio_data->op_data;
10389 off_t ofs = aio_data->ofs;
10390 off_t len = aio_data->len;
10391
10392 list<bufferlist> bl_list;
10393 list<bufferlist>::iterator iter;
10394 int r;
10395
10396 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10397 d->throttle.put(len);
10398
10399 r = rados_aio_get_return_value(c);
10400 if (r < 0) {
10401 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10402 d->set_cancelled(r);
10403 goto done;
10404 }
10405
10406 if (d->is_cancelled()) {
10407 goto done;
10408 }
10409
10410 d->data_lock.Lock();
10411
10412 r = d->get_complete_ios(ofs, bl_list);
10413 if (r < 0) {
10414 goto done_unlock;
10415 }
10416
10417 d->read_list.splice(d->read_list.end(), bl_list);
10418
10419 done_unlock:
10420 d->data_lock.Unlock();
10421 done:
10422 d->put();
10423 return;
10424 }
10425
10426 int RGWRados::flush_read_list(struct get_obj_data *d)
10427 {
10428 d->data_lock.Lock();
10429 list<bufferlist> l;
10430 l.swap(d->read_list);
10431 d->get();
10432 d->read_list.clear();
10433
10434 d->data_lock.Unlock();
10435
10436 int r = 0;
10437
10438 list<bufferlist>::iterator iter;
10439 for (iter = l.begin(); iter != l.end(); ++iter) {
10440 bufferlist& bl = *iter;
10441 r = d->client_cb->handle_data(bl, 0, bl.length());
10442 if (r < 0) {
10443 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10444 break;
10445 }
10446 }
10447
10448 d->data_lock.Lock();
10449 d->put();
10450 if (r < 0) {
10451 d->set_cancelled(r);
10452 }
10453 d->data_lock.Unlock();
10454 return r;
10455 }
10456
10457 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10458 const RGWBucketInfo& bucket_info,
10459 const rgw_obj& obj,
10460 const rgw_raw_obj& read_obj,
10461 off_t obj_ofs,
10462 off_t read_ofs, off_t len,
10463 bool is_head_obj, void *arg)
10464 {
10465 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10466 ObjectReadOperation op;
10467 struct get_obj_data *d = (struct get_obj_data *)arg;
10468 string oid, key;
10469 bufferlist *pbl;
10470 AioCompletion *c;
10471
10472 int r;
10473
10474 if (is_head_obj) {
10475 /* only when reading from the head object do we need to do the atomic test */
10476 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10477 if (r < 0)
10478 return r;
10479
10480 if (astate &&
10481 obj_ofs < astate->data.length()) {
10482 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10483
10484 d->data_lock.Lock();
10485 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10486 d->data_lock.Unlock();
10487 if (r < 0)
10488 return r;
10489
10490 d->lock.Lock();
10491 d->total_read += chunk_len;
10492 d->lock.Unlock();
10493
10494 len -= chunk_len;
10495 read_ofs += chunk_len;
10496 obj_ofs += chunk_len;
10497 if (!len)
10498 return 0;
10499 }
10500 }
10501
10502 d->throttle.get(len);
10503 if (d->is_cancelled()) {
10504 return d->get_err_code();
10505 }
10506
10507 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10508 * cleaning up
10509 */
10510 d->add_io(obj_ofs, len, &pbl, &c);
10511
10512 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10513 op.read(read_ofs, len, pbl, NULL);
10514
10515 librados::IoCtx io_ctx(d->io_ctx);
10516 io_ctx.locator_set_key(read_obj.loc);
10517
10518 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10519 if (r < 0) {
10520 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10521 goto done_err;
10522 }
10523
10524 // Flush data to client if there is any
10525 r = flush_read_list(d);
10526 if (r < 0)
10527 return r;
10528
10529 return 0;
10530
10531 done_err:
10532 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10533 d->set_cancelled(r);
10534 d->cancel_io(obj_ofs);
10535
10536 return r;
10537 }
10538
10539 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10540 {
10541 RGWRados *store = source->get_store();
10542 CephContext *cct = store->ctx();
10543
10544 struct get_obj_data *data = new get_obj_data(cct);
10545 bool done = false;
10546
10547 RGWObjectCtx& obj_ctx = source->get_ctx();
10548
10549 data->rados = store;
10550 data->io_ctx.dup(state.io_ctx);
10551 data->client_cb = cb;
10552
10553 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10554 if (r < 0) {
10555 data->cancel_all_io();
10556 goto done;
10557 }
10558
10559 while (!done) {
10560 r = data->wait_next_io(&done);
10561 if (r < 0) {
10562 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10563 data->cancel_all_io();
10564 break;
10565 }
10566 r = store->flush_read_list(data);
10567 if (r < 0) {
10568 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10569 data->cancel_all_io();
10570 break;
10571 }
10572 }
10573
10574 done:
10575 data->put();
10576 return r;
10577 }
10578
10579 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10580 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10581 off_t ofs, off_t end,
10582 uint64_t max_chunk_size,
10583 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10584 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10585 RGWObjState *, void *),
10586 void *arg)
10587 {
10588 rgw_raw_obj head_obj;
10589 rgw_raw_obj read_obj;
10590 uint64_t read_ofs = ofs;
10591 uint64_t len;
10592 bool reading_from_head = true;
10593 RGWObjState *astate = NULL;
10594
10595 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10596
10597 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10598 if (r < 0) {
10599 return r;
10600 }
10601
10602 if (end < 0)
10603 len = 0;
10604 else
10605 len = end - ofs + 1;
10606
10607 if (astate->has_manifest) {
10608 /* now get the relevant object stripe */
10609 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10610
10611 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10612
10613 for (; iter != obj_end && ofs <= end; ++iter) {
10614 off_t stripe_ofs = iter.get_stripe_ofs();
10615 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10616
10617 while (ofs < next_stripe_ofs && ofs <= end) {
10618 read_obj = iter.get_location().get_raw_obj(this);
10619 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10620 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10621
10622 if (read_len > max_chunk_size) {
10623 read_len = max_chunk_size;
10624 }
10625
10626 reading_from_head = (read_obj == head_obj);
10627 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10628 if (r < 0) {
10629 return r;
10630 }
10631
10632 len -= read_len;
10633 ofs += read_len;
10634 }
10635 }
10636 } else {
10637 while (ofs <= end) {
10638 read_obj = head_obj;
10639 uint64_t read_len = min(len, max_chunk_size);
10640
10641 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10642 if (r < 0) {
10643 return r;
10644 }
10645
10646 len -= read_len;
10647 ofs += read_len;
10648 }
10649 }
10650
10651 return 0;
10652 }
10653
10654 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10655 {
10656 rgw_rados_ref ref;
10657 int r = get_obj_head_ref(bucket_info, obj, &ref);
10658 if (r < 0) {
10659 return r;
10660 }
10661
10662 return ref.ioctx.operate(ref.oid, op);
10663 }
10664
10665 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10666 {
10667 rgw_rados_ref ref;
10668 int r = get_obj_head_ref(bucket_info, obj, &ref);
10669 if (r < 0) {
10670 return r;
10671 }
10672
10673 bufferlist outbl;
10674
10675 return ref.ioctx.operate(ref.oid, op, &outbl);
10676 }
10677
10678 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10679 {
10680 ObjectWriteOperation op;
10681
10682 assert(olh_obj.key.instance.empty());
10683
10684 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10685
10686 if (!state.exists) {
10687 op.create(true);
10688 } else {
10689 op.assert_exists();
10690 }
10691
10692 /*
10693 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10694 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10695 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10696 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10697 * log will reflect that.
10698 *
10699 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10700 * is used for object data instance, olh_tag for olh instance.
10701 */
10702 if (has_tag) {
10703 /* guard against racing writes */
10704 bucket_index_guard_olh_op(state, op);
10705 }
10706
10707 if (!has_tag) {
10708 /* obj tag */
10709 string obj_tag;
10710 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10711 if (ret < 0) {
10712 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10713 return ret;
10714 }
10715 bufferlist bl;
10716 bl.append(obj_tag.c_str(), obj_tag.size());
10717 op.setxattr(RGW_ATTR_ID_TAG, bl);
10718
10719 state.attrset[RGW_ATTR_ID_TAG] = bl;
10720 state.obj_tag = bl;
10721
10722 /* olh tag */
10723 string olh_tag;
10724 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10725 if (ret < 0) {
10726 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10727 return ret;
10728 }
10729 bufferlist olh_bl;
10730 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10731 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10732
10733 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10734 state.olh_tag = olh_bl;
10735 state.is_olh = true;
10736
10737 bufferlist verbl;
10738 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10739 }
10740
10741 bufferlist bl;
10742 RGWOLHPendingInfo pending_info;
10743 pending_info.time = real_clock::now();
10744 ::encode(pending_info, bl);
10745
10746 #define OLH_PENDING_TAG_LEN 32
10747 /* tag will start with current time epoch, this so that entries are sorted by time */
10748 char buf[32];
10749 utime_t ut(pending_info.time);
10750 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10751 *op_tag = buf;
10752
10753 string s;
10754 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10755 if (ret < 0) {
10756 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10757 return ret;
10758 }
10759 op_tag->append(s);
10760
10761 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10762 attr_name.append(*op_tag);
10763
10764 op.setxattr(attr_name.c_str(), bl);
10765
10766 ret = obj_operate(bucket_info, olh_obj, &op);
10767 if (ret < 0) {
10768 return ret;
10769 }
10770
10771 state.exists = true;
10772 state.attrset[attr_name] = bl;
10773
10774 return 0;
10775 }
10776
10777 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
10778 {
10779 int ret;
10780
10781 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
10782 if (ret == -EEXIST) {
10783 ret = -ECANCELED;
10784 }
10785
10786 return ret;
10787 }
10788
10789 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
10790 {
10791 rgw_obj obj;
10792 const rgw_obj *pobj = &obj_instance;
10793 int r;
10794
10795 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10796 r = bs->init(pobj->bucket, *pobj);
10797 if (r < 0) {
10798 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
10799 return r;
10800 }
10801 r = call(bs);
10802 if (r != -ERR_BUSY_RESHARDING) {
10803 break;
10804 }
10805 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10806 string new_bucket_id;
10807 r = block_while_resharding(bs, &new_bucket_id);
10808 if (r == -ERR_BUSY_RESHARDING) {
10809 continue;
10810 }
10811 if (r < 0) {
10812 return r;
10813 }
10814 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10815 i = 0; /* resharding is finished, make sure we can retry */
10816
10817 obj = *pobj;
10818 obj.bucket.update_bucket_id(new_bucket_id);
10819 pobj = &obj;
10820 }
10821
10822 if (r < 0) {
10823 return r;
10824 }
10825
10826 return 0;
10827 }
10828
10829 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
10830 {
10831 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
10832
10833 return waiter->block_while_resharding(bs, new_bucket_id);
10834 }
10835
10836 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
10837 bool delete_marker,
10838 const string& op_tag,
10839 struct rgw_bucket_dir_entry_meta *meta,
10840 uint64_t olh_epoch,
10841 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
10842 {
10843 rgw_rados_ref ref;
10844 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10845 if (r < 0) {
10846 return r;
10847 }
10848
10849 rgw_zone_set zones_trace;
10850 if (_zones_trace) {
10851 zones_trace = *_zones_trace;
10852 } else {
10853 zones_trace.insert(get_zone().id);
10854 }
10855
10856 BucketShard bs(this);
10857
10858 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10859 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10860 librados::ObjectWriteOperation op;
10861 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10862 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
10863 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
10864 unmod_since, high_precision_time,
10865 get_zone().log_data, zones_trace);
10866 });
10867 if (r < 0) {
10868 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
10869 return r;
10870 }
10871
10872 return 0;
10873 }
10874
10875 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
10876 {
10877 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
10878 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
10879 }
10880
10881 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
10882 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
10883 {
10884 rgw_rados_ref ref;
10885 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10886 if (r < 0) {
10887 return r;
10888 }
10889
10890 rgw_zone_set zones_trace;
10891 if (_zones_trace) {
10892 zones_trace = *_zones_trace;
10893 }
10894 zones_trace.insert(get_zone().id);
10895
10896 BucketShard bs(this);
10897
10898 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10899 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10900 librados::ObjectWriteOperation op;
10901 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10902 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
10903 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
10904 });
10905 if (r < 0) {
10906 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
10907 return r;
10908 }
10909
10910 return 0;
10911 }
10912
10913 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
10914 const rgw_obj& obj_instance, uint64_t ver_marker,
10915 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
10916 bool *is_truncated)
10917 {
10918 rgw_rados_ref ref;
10919 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10920 if (r < 0) {
10921 return r;
10922 }
10923
10924 BucketShard bs(this);
10925 int ret = bs.init(obj_instance.bucket, obj_instance);
10926 if (ret < 0) {
10927 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10928 return ret;
10929 }
10930
10931 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10932
10933 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10934
10935 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10936 ObjectReadOperation op;
10937 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10938 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
10939 key, ver_marker, olh_tag, log, is_truncated);
10940 });
10941 if (ret < 0) {
10942 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
10943 return ret;
10944 }
10945
10946 return 0;
10947 }
10948
10949 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
10950 {
10951 rgw_rados_ref ref;
10952 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10953 if (r < 0) {
10954 return r;
10955 }
10956
10957 BucketShard bs(this);
10958 int ret = bs.init(obj_instance.bucket, obj_instance);
10959 if (ret < 0) {
10960 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10961 return ret;
10962 }
10963
10964 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10965
10966 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10967
10968 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
10969 ObjectWriteOperation op;
10970 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10971 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
10972 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
10973 });
10974 if (ret < 0) {
10975 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
10976 return ret;
10977 }
10978
10979 return 0;
10980 }
10981
10982 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
10983 {
10984 rgw_rados_ref ref;
10985 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10986 if (r < 0) {
10987 return r;
10988 }
10989
10990 BucketShard bs(this);
10991
10992 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10993
10994 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10995
10996 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
10997 ObjectWriteOperation op;
10998 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10999 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11000 });
11001 if (ret < 0) {
11002 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11003 return ret;
11004 }
11005
11006 return 0;
11007 }
11008
11009 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11010 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
11011 uint64_t *plast_ver, rgw_zone_set* zones_trace)
11012 {
11013 if (log.empty()) {
11014 return 0;
11015 }
11016
11017 librados::ObjectWriteOperation op;
11018
11019 uint64_t last_ver = log.rbegin()->first;
11020 *plast_ver = last_ver;
11021
11022 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11023
11024 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11025 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11026
11027 bool need_to_link = false;
11028 cls_rgw_obj_key key;
11029 bool delete_marker = false;
11030 list<cls_rgw_obj_key> remove_instances;
11031 bool need_to_remove = false;
11032
11033 for (iter = log.begin(); iter != log.end(); ++iter) {
11034 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11035 for (; viter != iter->second.end(); ++viter) {
11036 rgw_bucket_olh_log_entry& entry = *viter;
11037
11038 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11039 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11040 << (entry.delete_marker ? "(delete)" : "") << dendl;
11041 switch (entry.op) {
11042 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11043 remove_instances.push_back(entry.key);
11044 break;
11045 case CLS_RGW_OLH_OP_LINK_OLH:
11046 need_to_link = true;
11047 need_to_remove = false;
11048 key = entry.key;
11049 delete_marker = entry.delete_marker;
11050 break;
11051 case CLS_RGW_OLH_OP_UNLINK_OLH:
11052 need_to_remove = true;
11053 need_to_link = false;
11054 break;
11055 default:
11056 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11057 return -EIO;
11058 }
11059 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11060 attr_name.append(entry.op_tag);
11061 op.rmxattr(attr_name.c_str());
11062 }
11063 }
11064
11065 rgw_rados_ref ref;
11066 int r = get_obj_head_ref(bucket_info, obj, &ref);
11067 if (r < 0) {
11068 return r;
11069 }
11070
11071 const rgw_bucket& bucket = obj.bucket;
11072
11073 if (need_to_link) {
11074 rgw_obj target(bucket, key);
11075 RGWOLHInfo info;
11076 info.target = target;
11077 info.removed = delete_marker;
11078 bufferlist bl;
11079 ::encode(info, bl);
11080 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11081 }
11082
11083 /* first remove object instances */
11084 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11085 liter != remove_instances.end(); ++liter) {
11086 cls_rgw_obj_key& key = *liter;
11087 rgw_obj obj_instance(bucket, key);
11088 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11089 if (ret < 0 && ret != -ENOENT) {
11090 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11091 return ret;
11092 }
11093 }
11094
11095 /* update olh object */
11096 r = ref.ioctx.operate(ref.oid, &op);
11097 if (r == -ECANCELED) {
11098 r = 0;
11099 }
11100 if (r < 0) {
11101 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11102 return r;
11103 }
11104
11105 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11106 if (r < 0) {
11107 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11108 return r;
11109 }
11110
11111 if (need_to_remove) {
11112 ObjectWriteOperation rm_op;
11113
11114 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11115 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11116 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11117 rm_op.remove();
11118
11119 r = ref.ioctx.operate(ref.oid, &rm_op);
11120 if (r == -ECANCELED) {
11121 return 0; /* someone else won this race */
11122 } else {
11123 /*
11124 * only clear if was successful, otherwise we might clobber pending operations on this object
11125 */
11126 r = bucket_index_clear_olh(bucket_info, state, obj);
11127 if (r < 0) {
11128 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11129 return r;
11130 }
11131 }
11132 }
11133
11134 return 0;
11135 }
11136
11137 /*
11138 * read olh log and apply it
11139 */
11140 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11141 {
11142 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11143 bool is_truncated;
11144 uint64_t ver_marker = 0;
11145
11146 do {
11147 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11148 if (ret < 0) {
11149 return ret;
11150 }
11151 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11152 if (ret < 0) {
11153 return ret;
11154 }
11155 } while (is_truncated);
11156
11157 return 0;
11158 }
11159
11160 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11161 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11162 {
11163 string op_tag;
11164
11165 rgw_obj olh_obj = target_obj;
11166 olh_obj.key.instance.clear();
11167
11168 RGWObjState *state = NULL;
11169
11170 int ret = 0;
11171 int i;
11172
11173 #define MAX_ECANCELED_RETRY 100
11174 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11175 if (ret == -ECANCELED) {
11176 obj_ctx.obj.invalidate(olh_obj);
11177 }
11178
11179 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11180 if (ret < 0) {
11181 return ret;
11182 }
11183
11184 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11185 if (ret < 0) {
11186 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11187 if (ret == -ECANCELED) {
11188 continue;
11189 }
11190 return ret;
11191 }
11192 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11193 if (ret < 0) {
11194 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11195 if (ret == -ECANCELED) {
11196 continue;
11197 }
11198 return ret;
11199 }
11200 break;
11201 }
11202
11203 if (i == MAX_ECANCELED_RETRY) {
11204 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11205 return -EIO;
11206 }
11207
11208 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11209 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11210 ret = 0;
11211 }
11212 if (ret < 0) {
11213 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11214 return ret;
11215 }
11216
11217 return 0;
11218 }
11219
11220 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11221 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11222 {
11223 string op_tag;
11224
11225 rgw_obj olh_obj = target_obj;
11226 olh_obj.key.instance.clear();
11227
11228 RGWObjState *state = NULL;
11229
11230 int ret = 0;
11231 int i;
11232
11233 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11234 if (ret == -ECANCELED) {
11235 obj_ctx.obj.invalidate(olh_obj);
11236 }
11237
11238 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11239 if (ret < 0)
11240 return ret;
11241
11242 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11243 if (ret < 0) {
11244 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11245 if (ret == -ECANCELED) {
11246 continue;
11247 }
11248 return ret;
11249 }
11250
11251 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11252
11253 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11254 if (ret < 0) {
11255 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11256 if (ret == -ECANCELED) {
11257 continue;
11258 }
11259 return ret;
11260 }
11261 break;
11262 }
11263
11264 if (i == MAX_ECANCELED_RETRY) {
11265 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11266 return -EIO;
11267 }
11268
11269 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11270 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11271 return 0;
11272 }
11273 if (ret < 0) {
11274 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11275 return ret;
11276 }
11277
11278 return 0;
11279 }
11280
11281 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11282 {
11283 #define OBJ_INSTANCE_LEN 32
11284 char buf[OBJ_INSTANCE_LEN + 1];
11285
11286 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11287 no underscore for instance name due to the way we encode the raw keys */
11288
11289 target_obj->key.set_instance(buf);
11290 }
11291
11292 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11293 map<string, bufferlist> *attrset)
11294 {
11295 attrset->clear();
11296 map<string, bufferlist>::iterator iter;
11297 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11298 iter != unfiltered_attrset.end(); ++iter) {
11299 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11300 break;
11301 (*attrset)[iter->first] = iter->second;
11302 }
11303 }
11304
11305 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11306 {
11307 map<string, bufferlist> unfiltered_attrset;
11308
11309 ObjectReadOperation op;
11310 op.getxattrs(&unfiltered_attrset, NULL);
11311
11312 bufferlist outbl;
11313 int r = obj_operate(bucket_info, obj, &op);
11314
11315 if (r < 0) {
11316 return r;
11317 }
11318 map<string, bufferlist> attrset;
11319
11320 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11321
11322 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11323 if (iter == attrset.end()) { /* not an olh */
11324 return -EINVAL;
11325 }
11326
11327 try {
11328 bufferlist::iterator biter = iter->second.begin();
11329 ::decode(*olh, biter);
11330 } catch (buffer::error& err) {
11331 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11332 return -EIO;
11333 }
11334
11335 return 0;
11336 }
11337
11338 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11339 map<string, bufferlist> *rm_pending_entries)
11340 {
11341 map<string, bufferlist>::iterator iter = pending_entries.begin();
11342
11343 real_time now = real_clock::now();
11344
11345 while (iter != pending_entries.end()) {
11346 bufferlist::iterator biter = iter->second.begin();
11347 RGWOLHPendingInfo pending_info;
11348 try {
11349 ::decode(pending_info, biter);
11350 } catch (buffer::error& err) {
11351 /* skipping bad entry, we could remove it but it might hide a bug */
11352 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11353 ++iter;
11354 continue;
11355 }
11356
11357 map<string, bufferlist>::iterator cur_iter = iter;
11358 ++iter;
11359 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11360 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11361 pending_entries.erase(cur_iter);
11362 } else {
11363 /* entries names are sorted by time (rounded to a second) */
11364 break;
11365 }
11366 }
11367 }
11368
11369 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11370 {
11371 ObjectWriteOperation op;
11372
11373 bucket_index_guard_olh_op(state, op);
11374
11375 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11376 op.rmxattr(iter->first.c_str());
11377 }
11378
11379 rgw_rados_ref ref;
11380 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11381 if (r < 0) {
11382 return r;
11383 }
11384
11385 /* update olh object */
11386 r = ref.ioctx.operate(ref.oid, &op);
11387 if (r == -ENOENT || r == -ECANCELED) {
11388 /* raced with some other change, shouldn't sweat about it */
11389 r = 0;
11390 }
11391 if (r < 0) {
11392 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11393 return r;
11394 }
11395
11396 return 0;
11397 }
11398
11399 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11400 {
11401 map<string, bufferlist> pending_entries;
11402 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11403
11404 map<string, bufferlist> rm_pending_entries;
11405 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11406
11407 if (!rm_pending_entries.empty()) {
11408 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11409 if (ret < 0) {
11410 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11411 return ret;
11412 }
11413 }
11414 if (!pending_entries.empty()) {
11415 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11416
11417 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11418 if (ret < 0) {
11419 return ret;
11420 }
11421 }
11422
11423 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11424 assert(iter != state->attrset.end());
11425 RGWOLHInfo olh;
11426 try {
11427 bufferlist::iterator biter = iter->second.begin();
11428 ::decode(olh, biter);
11429 } catch (buffer::error& err) {
11430 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11431 return -EIO;
11432 }
11433
11434 if (olh.removed) {
11435 return -ENOENT;
11436 }
11437
11438 *target = olh.target;
11439
11440 return 0;
11441 }
11442
11443 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11444 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11445 RGWObjVersionTracker *objv_tracker)
11446 {
11447 rgw_rados_ref ref;
11448 int r = get_raw_obj_ref(obj, &ref);
11449 if (r < 0) {
11450 return r;
11451 }
11452
11453 map<string, bufferlist> unfiltered_attrset;
11454 uint64_t size = 0;
11455 struct timespec mtime_ts;
11456
11457 ObjectReadOperation op;
11458 if (objv_tracker) {
11459 objv_tracker->prepare_op_for_read(&op);
11460 }
11461 if (attrs) {
11462 op.getxattrs(&unfiltered_attrset, NULL);
11463 }
11464 if (psize || pmtime) {
11465 op.stat2(&size, &mtime_ts, NULL);
11466 }
11467 if (first_chunk) {
11468 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11469 }
11470 bufferlist outbl;
11471 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11472
11473 if (epoch) {
11474 *epoch = ref.ioctx.get_last_version();
11475 }
11476
11477 if (r < 0)
11478 return r;
11479
11480 if (psize)
11481 *psize = size;
11482 if (pmtime)
11483 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11484 if (attrs) {
11485 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11486 }
11487
11488 return 0;
11489 }
11490
11491 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11492 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker)
11493 {
11494 map<string, rgw_bucket_dir_header> headers;
11495 map<int, string> bucket_instance_ids;
11496 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11497 if (r < 0) {
11498 return r;
11499 }
11500
11501 assert(headers.size() == bucket_instance_ids.size());
11502
11503 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11504 map<int, string>::iterator viter = bucket_instance_ids.begin();
11505 BucketIndexShardsManager ver_mgr;
11506 BucketIndexShardsManager master_ver_mgr;
11507 BucketIndexShardsManager marker_mgr;
11508 string shard_marker;
11509 char buf[64];
11510 for(; iter != headers.end(); ++iter, ++viter) {
11511 accumulate_raw_stats(iter->second, stats);
11512 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11513 ver_mgr.add(viter->first, string(buf));
11514 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11515 master_ver_mgr.add(viter->first, string(buf));
11516 if (shard_id >= 0) {
11517 *max_marker = iter->second.max_marker;
11518 } else {
11519 marker_mgr.add(viter->first, iter->second.max_marker);
11520 }
11521 }
11522 ver_mgr.to_string(bucket_ver);
11523 master_ver_mgr.to_string(master_ver);
11524 if (shard_id < 0) {
11525 marker_mgr.to_string(max_marker);
11526 }
11527 return 0;
11528 }
11529
11530 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11531 map<int, string>& markers)
11532 {
11533 map<string, rgw_bucket_dir_header> headers;
11534 map<int, string> bucket_instance_ids;
11535 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11536 if (r < 0)
11537 return r;
11538
11539 assert(headers.size() == bucket_instance_ids.size());
11540
11541 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11542 map<int, string>::iterator viter = bucket_instance_ids.begin();
11543
11544 for(; iter != headers.end(); ++iter, ++viter) {
11545 if (shard_id >= 0) {
11546 markers[shard_id] = iter->second.max_marker;
11547 } else {
11548 markers[viter->first] = iter->second.max_marker;
11549 }
11550 }
11551 return 0;
11552 }
11553
11554 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11555 RGWGetBucketStats_CB *cb;
11556 uint32_t pendings;
11557 map<RGWObjCategory, RGWStorageStats> stats;
11558 int ret_code;
11559 bool should_cb;
11560 Mutex lock;
11561
11562 public:
11563 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11564 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11565 lock("RGWGetBucketStatsContext") {}
11566
11567 void handle_response(int r, rgw_bucket_dir_header& header) override {
11568 Mutex::Locker l(lock);
11569 if (should_cb) {
11570 if ( r >= 0) {
11571 accumulate_raw_stats(header, stats);
11572 } else {
11573 ret_code = r;
11574 }
11575
11576 // Are we all done?
11577 if (--pendings == 0) {
11578 if (!ret_code) {
11579 cb->set_response(&stats);
11580 }
11581 cb->handle_response(ret_code);
11582 cb->put();
11583 }
11584 }
11585 }
11586
11587 void unset_cb() {
11588 Mutex::Locker l(lock);
11589 should_cb = false;
11590 }
11591 };
11592
11593 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11594 {
11595 int num_aio = 0;
11596 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards);
11597 assert(get_ctx);
11598 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11599 get_ctx->put();
11600 if (r < 0) {
11601 ctx->put();
11602 if (num_aio) {
11603 get_ctx->unset_cb();
11604 }
11605 }
11606 return r;
11607 }
11608
11609 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11610 RGWGetUserStats_CB *cb;
11611
11612 public:
11613 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11614 : cb(cb) {}
11615
11616 void handle_response(int r, cls_user_header& header) override {
11617 const cls_user_stats& hs = header.stats;
11618 if (r >= 0) {
11619 RGWStorageStats stats;
11620
11621 stats.size = hs.total_bytes;
11622 stats.size_rounded = hs.total_bytes_rounded;
11623 stats.num_objects = hs.total_entries;
11624
11625 cb->set_response(stats);
11626 }
11627
11628 cb->handle_response(r);
11629
11630 cb->put();
11631 }
11632 };
11633
11634 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11635 {
11636 string user_str = user.to_str();
11637
11638 cls_user_header header;
11639 int r = cls_user_get_header(user_str, &header);
11640 if (r < 0)
11641 return r;
11642
11643 const cls_user_stats& hs = header.stats;
11644
11645 stats.size = hs.total_bytes;
11646 stats.size_rounded = hs.total_bytes_rounded;
11647 stats.num_objects = hs.total_entries;
11648
11649 return 0;
11650 }
11651
11652 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11653 {
11654 string user_str = user.to_str();
11655
11656 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11657 int r = cls_user_get_header_async(user_str, get_ctx);
11658 if (r < 0) {
11659 ctx->put();
11660 delete get_ctx;
11661 return r;
11662 }
11663
11664 return 0;
11665 }
11666
11667 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11668 {
11669 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11670 }
11671
11672 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11673 {
11674 if (!bucket.oid.empty()) {
11675 obj.init(get_zone_params().domain_root, bucket.oid);
11676 } else {
11677 string oid;
11678 get_bucket_meta_oid(bucket, oid);
11679 obj.init(get_zone_params().domain_root, oid);
11680 }
11681 }
11682
11683 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11684 real_time *pmtime, map<string, bufferlist> *pattrs)
11685 {
11686 size_t pos = meta_key.find(':');
11687 if (pos == string::npos) {
11688 return -EINVAL;
11689 }
11690 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11691 rgw_bucket_instance_key_to_oid(oid);
11692
11693 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11694 }
11695
11696 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11697 real_time *pmtime, map<string, bufferlist> *pattrs)
11698 {
11699 string oid;
11700 if (bucket.oid.empty()) {
11701 get_bucket_meta_oid(bucket, oid);
11702 } else {
11703 oid = bucket.oid;
11704 }
11705
11706 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11707 }
11708
11709 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
11710 real_time *pmtime, map<string, bufferlist> *pattrs,
11711 rgw_cache_entry_info *cache_info)
11712 {
11713 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11714
11715 bufferlist epbl;
11716
11717 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, oid, epbl, &info.objv_tracker, pmtime, pattrs, cache_info);
11718 if (ret < 0) {
11719 return ret;
11720 }
11721
11722 bufferlist::iterator iter = epbl.begin();
11723 try {
11724 ::decode(info, iter);
11725 } catch (buffer::error& err) {
11726 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11727 return -EIO;
11728 }
11729 info.bucket.oid = oid;
11730 return 0;
11731 }
11732
11733 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11734 const string& tenant_name,
11735 const string& bucket_name,
11736 RGWBucketEntryPoint& entry_point,
11737 RGWObjVersionTracker *objv_tracker,
11738 real_time *pmtime,
11739 map<string, bufferlist> *pattrs,
11740 rgw_cache_entry_info *cache_info)
11741 {
11742 bufferlist bl;
11743 string bucket_entry;
11744
11745 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11746 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, bucket_entry, bl, objv_tracker, pmtime, pattrs, cache_info);
11747 if (ret < 0) {
11748 return ret;
11749 }
11750
11751 bufferlist::iterator iter = bl.begin();
11752 try {
11753 ::decode(entry_point, iter);
11754 } catch (buffer::error& err) {
11755 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11756 return -EIO;
11757 }
11758 return 0;
11759 }
11760
11761 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11762 const string& tenant_name,
11763 const string& bucket_name)
11764 {
11765 RGWBucketEntryPoint entry_point;
11766 real_time ep_mtime;
11767 RGWObjVersionTracker ot;
11768 map<string, bufferlist> attrs;
11769 RGWBucketInfo info;
11770
11771 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
11772
11773 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
11774 if (ret < 0) {
11775 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
11776 return ret;
11777 }
11778
11779 if (!entry_point.has_bucket_info) {
11780 /* already converted! */
11781 return 0;
11782 }
11783
11784 info = entry_point.old_bucket_info;
11785 info.bucket.oid = bucket_name;
11786 info.ep_objv = ot.read_version;
11787
11788 ot.generate_new_write_ver(cct);
11789
11790 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
11791 if (ret < 0) {
11792 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
11793 return ret;
11794 }
11795
11796 return 0;
11797 }
11798
11799 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
11800 const string& tenant, const string& bucket_name, RGWBucketInfo& info,
11801 real_time *pmtime, map<string, bufferlist> *pattrs)
11802 {
11803 bucket_info_entry e;
11804 string bucket_entry;
11805 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
11806
11807 if (binfo_cache->find(bucket_entry, &e)) {
11808 info = e.info;
11809 if (pattrs)
11810 *pattrs = e.attrs;
11811 if (pmtime)
11812 *pmtime = e.mtime;
11813 return 0;
11814 }
11815
11816 RGWBucketEntryPoint entry_point;
11817 real_time ep_mtime;
11818 RGWObjVersionTracker ot;
11819 rgw_cache_entry_info entry_cache_info;
11820 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name, entry_point, &ot, &ep_mtime, pattrs, &entry_cache_info);
11821 if (ret < 0) {
11822 /* only init these fields */
11823 info.bucket.tenant = tenant;
11824 info.bucket.name = bucket_name;
11825 return ret;
11826 }
11827
11828 if (entry_point.has_bucket_info) {
11829 info = entry_point.old_bucket_info;
11830 info.bucket.oid = bucket_name;
11831 info.bucket.tenant = tenant;
11832 info.ep_objv = ot.read_version;
11833 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
11834 return 0;
11835 }
11836
11837 /* data is in the bucket instance object, we need to get attributes from there, clear everything
11838 * that we got
11839 */
11840 if (pattrs) {
11841 pattrs->clear();
11842 }
11843
11844 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
11845
11846
11847 /* read bucket instance info */
11848
11849 string oid;
11850 get_bucket_meta_oid(entry_point.bucket, oid);
11851
11852 rgw_cache_entry_info cache_info;
11853
11854 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs, &cache_info);
11855 e.info.ep_objv = ot.read_version;
11856 info = e.info;
11857 if (ret < 0) {
11858 info.bucket.tenant = tenant;
11859 info.bucket.name = bucket_name;
11860 // XXX and why return anything in case of an error anyway?
11861 return ret;
11862 }
11863
11864 if (pmtime)
11865 *pmtime = e.mtime;
11866 if (pattrs)
11867 *pattrs = e.attrs;
11868
11869 list<rgw_cache_entry_info *> cache_info_entries;
11870 cache_info_entries.push_back(&entry_cache_info);
11871 cache_info_entries.push_back(&cache_info);
11872
11873
11874 /* chain to both bucket entry point and bucket instance */
11875 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
11876 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
11877 }
11878
11879 return 0;
11880 }
11881
11882 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
11883 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
11884 map<string, bufferlist> *pattrs)
11885 {
11886 bufferlist epbl;
11887 ::encode(entry_point, epbl);
11888 string bucket_entry;
11889 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11890 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
11891 }
11892
11893 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
11894 real_time mtime, map<string, bufferlist> *pattrs)
11895 {
11896 info.has_instance_obj = true;
11897 bufferlist bl;
11898
11899 ::encode(info, bl);
11900
11901 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
11902 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
11903 if (ret == -EEXIST) {
11904 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
11905 * bucket operation on this specific bucket (e.g., being synced from the master), but
11906 * since bucket instace meta object is unique for this specific bucket instace, we don't
11907 * need to return an error.
11908 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
11909 * master, creating a bucket, sending bucket creation to the master, we create the bucket
11910 * locally, while in the sync thread we sync the new bucket.
11911 */
11912 ret = 0;
11913 }
11914 return ret;
11915 }
11916
11917 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
11918 map<string, bufferlist> *pattrs, bool create_entry_point)
11919 {
11920 bool create_head = !info.has_instance_obj || create_entry_point;
11921
11922 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
11923 if (ret < 0) {
11924 return ret;
11925 }
11926
11927 if (!create_head)
11928 return 0; /* done! */
11929
11930 RGWBucketEntryPoint entry_point;
11931 entry_point.bucket = info.bucket;
11932 entry_point.owner = info.owner;
11933 entry_point.creation_time = info.creation_time;
11934 entry_point.linked = true;
11935 RGWObjVersionTracker ot;
11936 if (pep_objv && !pep_objv->tag.empty()) {
11937 ot.write_version = *pep_objv;
11938 } else {
11939 ot.generate_new_write_ver(cct);
11940 if (pep_objv) {
11941 *pep_objv = ot.write_version;
11942 }
11943 }
11944 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
11945 if (ret < 0)
11946 return ret;
11947
11948 return 0;
11949 }
11950
11951 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
11952 {
11953 rgw_rados_ref ref;
11954 int r = get_raw_obj_ref(obj, &ref);
11955 if (r < 0) {
11956 return r;
11957 }
11958
11959 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
11960 if (r < 0)
11961 return r;
11962
11963 return 0;
11964
11965 }
11966
11967 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
11968 std::map<string, bufferlist>& m)
11969 {
11970 rgw_rados_ref ref;
11971 int r = get_raw_obj_ref(obj, &ref);
11972 if (r < 0) {
11973 return r;
11974 }
11975
11976 #define MAX_OMAP_GET_ENTRIES 1024
11977 const int count = MAX_OMAP_GET_ENTRIES;
11978 string start_after;
11979
11980 while (true) {
11981 std::map<string, bufferlist> t;
11982 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
11983 if (r < 0) {
11984 return r;
11985 }
11986 if (t.empty()) {
11987 break;
11988 }
11989 start_after = t.rbegin()->first;
11990 m.insert(t.begin(), t.end());
11991 }
11992 return 0;
11993 }
11994
11995 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
11996 {
11997 rgw_rados_ref ref;
11998 int r = get_raw_obj_ref(obj, &ref);
11999 if (r < 0) {
12000 return r;
12001 }
12002 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12003
12004 map<string, bufferlist> m;
12005 m[key] = bl;
12006
12007 r = ref.ioctx.omap_set(ref.oid, m);
12008
12009 return r;
12010 }
12011
12012 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12013 {
12014 rgw_rados_ref ref;
12015 int r = get_raw_obj_ref(obj, &ref);
12016 if (r < 0) {
12017 return r;
12018 }
12019
12020 r = ref.ioctx.omap_set(ref.oid, m);
12021
12022 return r;
12023 }
12024
12025 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12026 {
12027 rgw_rados_ref ref;
12028 int r = get_raw_obj_ref(obj, &ref);
12029 if (r < 0) {
12030 return r;
12031 }
12032
12033 set<string> k;
12034 k.insert(key);
12035
12036 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12037 return r;
12038 }
12039
12040 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12041 {
12042 RGWObjectCtx obj_ctx(this);
12043
12044 map<string, RGWBucketEnt>::iterator iter;
12045 for (iter = m.begin(); iter != m.end(); ++iter) {
12046 RGWBucketEnt& ent = iter->second;
12047 rgw_bucket& bucket = ent.bucket;
12048 ent.count = 0;
12049 ent.size = 0;
12050 ent.size_rounded = 0;
12051
12052 map<string, rgw_bucket_dir_header> headers;
12053
12054 RGWBucketInfo bucket_info;
12055 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12056 if (ret < 0) {
12057 return ret;
12058 }
12059
12060 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12061 if (r < 0)
12062 return r;
12063
12064 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12065 for (; hiter != headers.end(); ++hiter) {
12066 RGWObjCategory category = main_category;
12067 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12068 if (iter != hiter->second.stats.end()) {
12069 struct rgw_bucket_category_stats& stats = iter->second;
12070 ent.count += stats.num_entries;
12071 ent.size += stats.total_size;
12072 ent.size_rounded += stats.total_size_rounded;
12073 }
12074 }
12075 }
12076
12077 return m.size();
12078 }
12079
12080 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12081 {
12082 rgw_rados_ref ref;
12083 int r = get_raw_obj_ref(obj, &ref);
12084 if (r < 0) {
12085 return r;
12086 }
12087 librados::Rados *rad = get_rados_handle();
12088 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12089
12090 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12091 completion->release();
12092 return r;
12093 }
12094
12095 int RGWRados::distribute(const string& key, bufferlist& bl)
12096 {
12097 /*
12098 * we were called before watch was initialized. This can only happen if we're updating some system
12099 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12100 * objects, they're currently only read on startup anyway.
12101 */
12102 if (!watch_initialized)
12103 return 0;
12104
12105 string notify_oid;
12106 pick_control_oid(key, notify_oid);
12107
12108 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12109 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12110 }
12111
12112 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12113 {
12114 librados::IoCtx& io_ctx = ctx.io_ctx;
12115 librados::NObjectIterator& iter = ctx.iter;
12116
12117 int r = open_pool_ctx(pool, io_ctx);
12118 if (r < 0)
12119 return r;
12120
12121 iter = io_ctx.nobjects_begin();
12122
12123 return 0;
12124 }
12125
12126 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12127 bool *is_truncated, RGWAccessListFilter *filter)
12128 {
12129 librados::IoCtx& io_ctx = ctx.io_ctx;
12130 librados::NObjectIterator& iter = ctx.iter;
12131
12132 if (iter == io_ctx.nobjects_end())
12133 return -ENOENT;
12134
12135 uint32_t i;
12136
12137 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12138 rgw_bucket_dir_entry e;
12139
12140 string oid = iter->get_oid();
12141 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12142
12143 // fill it in with initial values; we may correct later
12144 if (filter && !filter->filter(oid, oid))
12145 continue;
12146
12147 e.key = oid;
12148 objs.push_back(e);
12149 }
12150
12151 if (is_truncated)
12152 *is_truncated = (iter != io_ctx.nobjects_end());
12153
12154 return objs.size();
12155 }
12156 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12157 string prefix;
12158
12159 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12160 bool filter(string& name, string& key) override {
12161 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12162 }
12163 };
12164
12165 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12166 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12167 bool *is_truncated)
12168 {
12169 RGWAccessListFilterPrefix filter(prefix_filter);
12170
12171 if (!ctx.initialized) {
12172 int r = pool_iterate_begin(pool, ctx.iter_ctx);
12173 if (r < 0) {
12174 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12175 return r;
12176 }
12177 ctx.initialized = true;
12178 }
12179
12180 vector<rgw_bucket_dir_entry> objs;
12181 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12182 if (r < 0) {
12183 if(r != -ENOENT)
12184 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12185 return r;
12186 }
12187
12188 vector<rgw_bucket_dir_entry>::iterator iter;
12189 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12190 oids.push_back(iter->key.name);
12191 }
12192
12193 return oids.size();
12194 }
12195
12196 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12197 std::list<rgw_bi_log_entry>& result, bool *truncated)
12198 {
12199 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12200 result.clear();
12201
12202 librados::IoCtx index_ctx;
12203 map<int, string> oids;
12204 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12205 map<int, string> bucket_instance_ids;
12206 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12207 if (r < 0)
12208 return r;
12209
12210 BucketIndexShardsManager marker_mgr;
12211 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12212 // If there are multiple shards for the bucket index object, the marker
12213 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12214 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12215 // only contain one record, and the key is the bucket instance id.
12216 r = marker_mgr.from_string(marker, shard_id);
12217 if (r < 0)
12218 return r;
12219
12220 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12221 if (r < 0)
12222 return r;
12223
12224 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12225 map<int, list<rgw_bi_log_entry>::iterator> vends;
12226 if (truncated) {
12227 *truncated = false;
12228 }
12229 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12230 for (; miter != bi_log_lists.end(); ++miter) {
12231 int shard_id = miter->first;
12232 vcurrents[shard_id] = miter->second.entries.begin();
12233 vends[shard_id] = miter->second.entries.end();
12234 if (truncated) {
12235 *truncated = (*truncated || miter->second.truncated);
12236 }
12237 }
12238
12239 size_t total = 0;
12240 bool has_more = true;
12241 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12242 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12243 while (total < max && has_more) {
12244 has_more = false;
12245
12246 viter = vcurrents.begin();
12247 eiter = vends.begin();
12248
12249 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12250 assert (eiter != vends.end());
12251
12252 int shard_id = viter->first;
12253 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12254
12255 if (liter == eiter->second){
12256 continue;
12257 }
12258 rgw_bi_log_entry& entry = *(liter);
12259 if (has_shards) {
12260 char buf[16];
12261 snprintf(buf, sizeof(buf), "%d", shard_id);
12262 string tmp_id;
12263 build_bucket_index_marker(buf, entry.id, &tmp_id);
12264 entry.id.swap(tmp_id);
12265 }
12266 marker_mgr.add(shard_id, entry.id);
12267 result.push_back(entry);
12268 total++;
12269 has_more = true;
12270 ++liter;
12271 }
12272 }
12273
12274 if (truncated) {
12275 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12276 assert (eiter != vends.end());
12277 *truncated = (*truncated || (viter->second != eiter->second));
12278 }
12279 }
12280
12281 // Refresh marker, if there are multiple shards, the output will look like
12282 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12283 // if there is no sharding, the simply marker (without oid) is returned
12284 if (has_shards) {
12285 marker_mgr.to_string(&marker);
12286 } else {
12287 if (!result.empty()) {
12288 marker = result.rbegin()->id;
12289 }
12290 }
12291
12292 return 0;
12293 }
12294
12295 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12296 {
12297 librados::IoCtx index_ctx;
12298 map<int, string> bucket_objs;
12299
12300 BucketIndexShardsManager start_marker_mgr;
12301 BucketIndexShardsManager end_marker_mgr;
12302
12303 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12304 if (r < 0) {
12305 return r;
12306 }
12307
12308 r = start_marker_mgr.from_string(start_marker, shard_id);
12309 if (r < 0) {
12310 return r;
12311 }
12312
12313 r = end_marker_mgr.from_string(end_marker, shard_id);
12314 if (r < 0) {
12315 return r;
12316 }
12317
12318 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12319 cct->_conf->rgw_bucket_index_max_aio)();
12320
12321 return r;
12322 }
12323
12324 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12325 {
12326 rgw_rados_ref ref;
12327 int r = get_obj_head_ref(bucket_info, obj, &ref);
12328 if (r < 0) {
12329 return r;
12330 }
12331
12332 rgw_cls_bi_entry bi_entry;
12333 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12334 if (r < 0 && r != -ENOENT) {
12335 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12336 }
12337 if (r < 0) {
12338 return r;
12339 }
12340 bufferlist::iterator iter = bi_entry.data.begin();
12341 try {
12342 ::decode(*dirent, iter);
12343 } catch (buffer::error& err) {
12344 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12345 return -EIO;
12346 }
12347
12348 return 0;
12349 }
12350
12351 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12352 {
12353 BucketShard bs(this);
12354 int ret = bs.init(bucket, obj);
12355 if (ret < 0) {
12356 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12357 return ret;
12358 }
12359
12360 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12361
12362 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12363 if (ret < 0)
12364 return ret;
12365
12366 return 0;
12367 }
12368
12369 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12370 {
12371 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12372 }
12373
12374 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12375 {
12376 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12377 if (ret < 0)
12378 return ret;
12379
12380 return 0;
12381 }
12382
12383 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12384 {
12385 BucketShard bs(this);
12386 int ret = bs.init(bucket, obj);
12387 if (ret < 0) {
12388 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12389 return ret;
12390 }
12391
12392 return bi_put(bs, entry);
12393 }
12394
12395 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12396 {
12397 rgw_obj obj(bucket, obj_name);
12398 BucketShard bs(this);
12399 int ret = bs.init(bucket, obj);
12400 if (ret < 0) {
12401 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12402 return ret;
12403 }
12404
12405 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12406 if (ret == -ENOENT) {
12407 *is_truncated = false;
12408 }
12409 if (ret < 0)
12410 return ret;
12411
12412 return 0;
12413 }
12414
12415 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12416 {
12417 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12418 if (ret < 0)
12419 return ret;
12420
12421 return 0;
12422 }
12423
12424 int RGWRados::bi_remove(BucketShard& bs)
12425 {
12426 int ret = bs.index_ctx.remove(bs.bucket_obj);
12427 if (ret == -ENOENT) {
12428 ret = 0;
12429 }
12430 if (ret < 0) {
12431 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12432 return ret;
12433 }
12434
12435 return 0;
12436 }
12437
12438 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12439 {
12440 BucketShard bs(this);
12441 int ret = bs.init(bucket, shard_id);
12442 if (ret < 0) {
12443 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12444 return ret;
12445 }
12446
12447 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12448 }
12449
12450 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12451 {
12452 return gc_pool_ctx.operate(oid, op);
12453 }
12454
12455 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12456 {
12457 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12458 int r = gc_pool_ctx.aio_operate(oid, c, op);
12459 c->release();
12460 return r;
12461 }
12462
12463 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12464 {
12465 return gc_pool_ctx.operate(oid, op, pbl);
12466 }
12467
12468 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12469 {
12470 return gc->list(index, marker, max, expired_only, result, truncated);
12471 }
12472
12473 int RGWRados::process_gc()
12474 {
12475 return gc->process();
12476 }
12477
12478 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12479 {
12480 return lc->list_lc_progress(marker, max_entries, progress_map);
12481 }
12482
12483 int RGWRados::process_lc()
12484 {
12485 return lc->process();
12486 }
12487
12488 int RGWRados::process_expire_objects()
12489 {
12490 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12491 return 0;
12492 }
12493
12494 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12495 {
12496 bufferlist in;
12497 cls_rgw_bucket_init(op);
12498 return index_ctx.operate(oid, &op);
12499 }
12500
12501 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12502 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12503 {
12504 rgw_zone_set zones_trace;
12505 if (_zones_trace) {
12506 zones_trace = *_zones_trace;
12507 }
12508 else {
12509 zones_trace.insert(get_zone().id);
12510 }
12511
12512 ObjectWriteOperation o;
12513 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12514 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12515 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
12516 return bs.index_ctx.operate(bs.bucket_obj, &o);
12517 }
12518
12519 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
12520 int64_t pool, uint64_t epoch,
12521 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12522 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12523 {
12524 ObjectWriteOperation o;
12525 rgw_bucket_dir_entry_meta dir_meta;
12526 dir_meta = ent.meta;
12527 dir_meta.category = category;
12528
12529 rgw_bucket_entry_ver ver;
12530 ver.pool = pool;
12531 ver.epoch = epoch;
12532 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12533 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12534 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
12535 get_zone().log_data, bilog_flags, _zones_trace);
12536 complete_op_data *arg;
12537 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
12538 get_zone().log_data, bilog_flags, _zones_trace, &arg);
12539 librados::AioCompletion *completion = arg->rados_completion;
12540 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
12541 completion->release(); /* can't reference arg here, as it might have already been released */
12542 return ret;
12543 }
12544
12545 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
12546 int64_t pool, uint64_t epoch,
12547 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12548 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12549 {
12550 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
12551 }
12552
12553 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12554 int64_t pool, uint64_t epoch,
12555 rgw_obj& obj,
12556 real_time& removed_mtime,
12557 list<rgw_obj_index_key> *remove_objs,
12558 uint16_t bilog_flags,
12559 rgw_zone_set *zones_trace)
12560 {
12561 rgw_bucket_dir_entry ent;
12562 ent.meta.mtime = removed_mtime;
12563 obj.key.get_index_key(&ent.key);
12564 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
12565 }
12566
12567 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12568 {
12569 rgw_bucket_dir_entry ent;
12570 obj.key.get_index_key(&ent.key);
12571 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
12572 }
12573
12574 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12575 {
12576 librados::IoCtx index_ctx;
12577 map<int, string> bucket_objs;
12578 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12579 if (r < 0)
12580 return r;
12581
12582 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12583 }
12584
12585 int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12586 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12587 bool *is_truncated, rgw_obj_index_key *last_entry,
12588 bool (*force_check_filter)(const string& name))
12589 {
12590 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12591
12592 librados::IoCtx index_ctx;
12593 // key - oid (for different shards if there is any)
12594 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12595 map<int, string> oids;
12596 map<int, struct rgw_cls_list_ret> list_results;
12597 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12598 if (r < 0)
12599 return r;
12600
12601 cls_rgw_obj_key start_key(start.name, start.instance);
12602 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12603 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12604 if (r < 0)
12605 return r;
12606
12607 // Create a list of iterators that are used to iterate each shard
12608 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12609 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12610 vector<string> vnames(list_results.size());
12611 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12612 *is_truncated = false;
12613 for (; iter != list_results.end(); ++iter) {
12614 vcurrents.push_back(iter->second.dir.m.begin());
12615 vends.push_back(iter->second.dir.m.end());
12616 vnames.push_back(oids[iter->first]);
12617 *is_truncated = (*is_truncated || iter->second.is_truncated);
12618 }
12619
12620 // Create a map to track the next candidate entry from each shard, if the entry
12621 // from a specified shard is selected/erased, the next entry from that shard will
12622 // be inserted for next round selection
12623 map<string, size_t> candidates;
12624 for (size_t i = 0; i < vcurrents.size(); ++i) {
12625 if (vcurrents[i] != vends[i]) {
12626 candidates[vcurrents[i]->first] = i;
12627 }
12628 }
12629
12630 map<string, bufferlist> updates;
12631 uint32_t count = 0;
12632 while (count < num_entries && !candidates.empty()) {
12633 r = 0;
12634 // Select the next one
12635 int pos = candidates.begin()->second;
12636 const string& name = vcurrents[pos]->first;
12637 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12638
12639 bool force_check = force_check_filter && force_check_filter(dirent.key.name);
12640 if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) {
12641 /* there are uncommitted ops. We need to check the current state,
12642 * and if the tags are old we need to do cleanup as well. */
12643 librados::IoCtx sub_ctx;
12644 sub_ctx.dup(index_ctx);
12645 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
12646 if (r < 0 && r != -ENOENT) {
12647 return r;
12648 }
12649 }
12650 if (r >= 0) {
12651 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
12652 m[name] = std::move(dirent);
12653 ++count;
12654 }
12655
12656 // Refresh the candidates map
12657 candidates.erase(candidates.begin());
12658 ++vcurrents[pos];
12659 if (vcurrents[pos] != vends[pos]) {
12660 candidates[vcurrents[pos]->first] = pos;
12661 }
12662 }
12663
12664 // Suggest updates if there is any
12665 map<string, bufferlist>::iterator miter = updates.begin();
12666 for (; miter != updates.end(); ++miter) {
12667 if (miter->second.length()) {
12668 ObjectWriteOperation o;
12669 cls_rgw_suggest_changes(o, miter->second);
12670 // we don't care if we lose suggested updates, send them off blindly
12671 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12672 index_ctx.aio_operate(miter->first, c, &o);
12673 c->release();
12674 }
12675 }
12676
12677 // Check if all the returned entries are consumed or not
12678 for (size_t i = 0; i < vcurrents.size(); ++i) {
12679 if (vcurrents[i] != vends[i])
12680 *is_truncated = true;
12681 }
12682 if (!m.empty())
12683 *last_entry = m.rbegin()->first;
12684
12685 return 0;
12686 }
12687
12688 int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
12689 {
12690 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12691
12692 rgw_rados_ref ref;
12693 int r = get_raw_obj_ref(obj, &ref);
12694 if (r < 0) {
12695 return r;
12696 }
12697
12698 ObjectWriteOperation op;
12699 cls_rgw_usage_log_add(op, info);
12700
12701 r = ref.ioctx.operate(ref.oid, &op);
12702 return r;
12703 }
12704
12705 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
12706 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
12707 {
12708 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12709
12710 rgw_rados_ref ref;
12711 int r = get_raw_obj_ref(obj, &ref);
12712 if (r < 0) {
12713 return r;
12714 }
12715
12716 *is_truncated = false;
12717
12718 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
12719 max_entries, read_iter, usage, is_truncated);
12720
12721 return r;
12722 }
12723
12724 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
12725 {
12726 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12727
12728 rgw_rados_ref ref;
12729 int r = get_raw_obj_ref(obj, &ref);
12730 if (r < 0) {
12731 return r;
12732 }
12733
12734 ObjectWriteOperation op;
12735 cls_rgw_usage_log_trim(op, user, start_epoch, end_epoch);
12736
12737 r = ref.ioctx.operate(ref.oid, &op);
12738 return r;
12739 }
12740
12741 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
12742 {
12743 librados::IoCtx index_ctx;
12744 string dir_oid;
12745
12746 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12747
12748 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
12749 if (r < 0)
12750 return r;
12751
12752 bufferlist updates;
12753
12754 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
12755 rgw_bucket_dir_entry entry;
12756 entry.key = *iter;
12757 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
12758 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
12759 updates.append(CEPH_RGW_REMOVE | suggest_flag);
12760 ::encode(entry, updates);
12761 }
12762
12763 bufferlist out;
12764
12765 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
12766
12767 return r;
12768 }
12769
12770 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
12771 const RGWBucketInfo& bucket_info,
12772 rgw_bucket_dir_entry& list_state,
12773 rgw_bucket_dir_entry& object,
12774 bufferlist& suggested_updates)
12775 {
12776 const rgw_bucket& bucket = bucket_info.bucket;
12777 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12778
12779 std::string loc;
12780
12781 rgw_obj obj(bucket, list_state.key);
12782
12783 string oid;
12784 get_obj_bucket_and_oid_loc(obj, oid, loc);
12785
12786 if (loc != list_state.locator) {
12787 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
12788 }
12789
12790 io_ctx.locator_set_key(list_state.locator);
12791
12792 RGWObjState *astate = NULL;
12793 RGWObjectCtx rctx(this);
12794 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
12795 if (r < 0)
12796 return r;
12797
12798 list_state.pending_map.clear(); // we don't need this and it inflates size
12799 if (!astate->exists) {
12800 /* object doesn't exist right now -- hopefully because it's
12801 * marked as !exists and got deleted */
12802 if (list_state.exists) {
12803 /* FIXME: what should happen now? Work out if there are any
12804 * non-bad ways this could happen (there probably are, but annoying
12805 * to handle!) */
12806 }
12807 // encode a suggested removal of that key
12808 list_state.ver.epoch = io_ctx.get_last_version();
12809 list_state.ver.pool = io_ctx.get_id();
12810 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
12811 return -ENOENT;
12812 }
12813
12814 string etag;
12815 string content_type;
12816 ACLOwner owner;
12817
12818 object.meta.size = astate->size;
12819 object.meta.accounted_size = astate->accounted_size;
12820 object.meta.mtime = astate->mtime;
12821
12822 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
12823 if (iter != astate->attrset.end()) {
12824 etag = iter->second.c_str();
12825 }
12826 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
12827 if (iter != astate->attrset.end()) {
12828 content_type = iter->second.c_str();
12829 }
12830 iter = astate->attrset.find(RGW_ATTR_ACL);
12831 if (iter != astate->attrset.end()) {
12832 r = decode_policy(iter->second, &owner);
12833 if (r < 0) {
12834 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
12835 }
12836 }
12837
12838 if (astate->has_manifest) {
12839 RGWObjManifest::obj_iterator miter;
12840 RGWObjManifest& manifest = astate->manifest;
12841 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
12842 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
12843 rgw_obj loc;
12844 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
12845
12846 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
12847 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
12848 r = delete_obj_index(loc);
12849 if (r < 0) {
12850 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
12851 }
12852 }
12853 }
12854 }
12855
12856 object.meta.etag = etag;
12857 object.meta.content_type = content_type;
12858 object.meta.owner = owner.get_id().to_str();
12859 object.meta.owner_display_name = owner.get_display_name();
12860
12861 // encode suggested updates
12862 list_state.ver.pool = io_ctx.get_id();
12863 list_state.ver.epoch = astate->epoch;
12864 list_state.meta.size = object.meta.size;
12865 list_state.meta.accounted_size = object.meta.accounted_size;
12866 list_state.meta.mtime = object.meta.mtime;
12867 list_state.meta.category = main_category;
12868 list_state.meta.etag = etag;
12869 list_state.meta.content_type = content_type;
12870 if (astate->obj_tag.length() > 0)
12871 list_state.tag = astate->obj_tag.c_str();
12872 list_state.meta.owner = owner.get_id().to_str();
12873 list_state.meta.owner_display_name = owner.get_display_name();
12874
12875 list_state.exists = true;
12876 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
12877 return 0;
12878 }
12879
12880 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
12881 {
12882 librados::IoCtx index_ctx;
12883 map<int, string> oids;
12884 map<int, struct rgw_cls_list_ret> list_results;
12885 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
12886 if (r < 0)
12887 return r;
12888
12889 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12890 if (r < 0)
12891 return r;
12892
12893 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12894 for(; iter != list_results.end(); ++iter) {
12895 headers[oids[iter->first]] = iter->second.dir.header;
12896 }
12897 return 0;
12898 }
12899
12900 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
12901 {
12902 librados::IoCtx index_ctx;
12903 map<int, string> bucket_objs;
12904 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12905 if (r < 0)
12906 return r;
12907
12908 map<int, string>::iterator iter = bucket_objs.begin();
12909 for (; iter != bucket_objs.end(); ++iter) {
12910 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
12911 if (r < 0) {
12912 ctx->put();
12913 break;
12914 } else {
12915 (*num_aio)++;
12916 }
12917 }
12918 return r;
12919 }
12920
12921 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
12922 {
12923 string buckets_obj_id;
12924 rgw_get_buckets_obj(user_id, buckets_obj_id);
12925 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
12926
12927 rgw_rados_ref ref;
12928 int r = get_raw_obj_ref(obj, &ref);
12929 if (r < 0) {
12930 return r;
12931 }
12932
12933 librados::ObjectReadOperation op;
12934 int rc;
12935 ::cls_user_get_header(op, header, &rc);
12936 bufferlist ibl;
12937 r = ref.ioctx.operate(ref.oid, &op, &ibl);
12938 if (r < 0)
12939 return r;
12940 if (rc < 0)
12941 return rc;
12942
12943 return 0;
12944 }
12945
12946 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
12947 {
12948 string buckets_obj_id;
12949 rgw_get_buckets_obj(user_id, buckets_obj_id);
12950 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
12951
12952 rgw_rados_ref ref;
12953 int r = get_raw_obj_ref(obj, &ref);
12954 if (r < 0) {
12955 return r;
12956 }
12957
12958 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
12959 if (r < 0)
12960 return r;
12961
12962 return 0;
12963 }
12964
12965 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
12966 {
12967 map<string, struct rgw_bucket_dir_header> headers;
12968 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12969 if (r < 0) {
12970 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
12971 return r;
12972 }
12973
12974 cls_user_bucket_entry entry;
12975
12976 bucket_info.bucket.convert(&entry.bucket);
12977
12978 map<string, struct rgw_bucket_dir_header>::iterator hiter = headers.begin();
12979 for (; hiter != headers.end(); ++hiter) {
12980 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = hiter->second.stats.begin();
12981 for (; iter != hiter->second.stats.end(); ++iter) {
12982 struct rgw_bucket_category_stats& header_stats = iter->second;
12983 entry.size += header_stats.total_size;
12984 entry.size_rounded += header_stats.total_size_rounded;
12985 entry.count += header_stats.num_entries;
12986 }
12987 }
12988
12989 list<cls_user_bucket_entry> entries;
12990 entries.push_back(entry);
12991
12992 r = cls_user_update_buckets(user_obj, entries, false);
12993 if (r < 0) {
12994 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
12995 return r;
12996 }
12997
12998 return 0;
12999 }
13000
13001 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13002 const string& in_marker,
13003 const string& end_marker,
13004 const int max_entries,
13005 list<cls_user_bucket_entry>& entries,
13006 string * const out_marker,
13007 bool * const truncated)
13008 {
13009 rgw_rados_ref ref;
13010 int r = get_raw_obj_ref(obj, &ref);
13011 if (r < 0) {
13012 return r;
13013 }
13014
13015 librados::ObjectReadOperation op;
13016 int rc;
13017
13018 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13019 bufferlist ibl;
13020 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13021 if (r < 0)
13022 return r;
13023 if (rc < 0)
13024 return rc;
13025
13026 return 0;
13027 }
13028
13029 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13030 {
13031 rgw_rados_ref ref;
13032 int r = get_raw_obj_ref(obj, &ref);
13033 if (r < 0) {
13034 return r;
13035 }
13036
13037 librados::ObjectWriteOperation op;
13038 cls_user_set_buckets(op, entries, add);
13039 r = ref.ioctx.operate(ref.oid, &op);
13040 if (r < 0)
13041 return r;
13042
13043 return 0;
13044 }
13045
13046 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13047 {
13048 string buckets_obj_id;
13049 rgw_get_buckets_obj(user_id, buckets_obj_id);
13050 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13051 return cls_user_complete_stats_sync(obj);
13052 }
13053
13054 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13055 {
13056 rgw_rados_ref ref;
13057 int r = get_raw_obj_ref(obj, &ref);
13058 if (r < 0) {
13059 return r;
13060 }
13061
13062 librados::ObjectWriteOperation op;
13063 ::cls_user_complete_stats_sync(op);
13064 r = ref.ioctx.operate(ref.oid, &op);
13065 if (r < 0)
13066 return r;
13067
13068 return 0;
13069 }
13070
13071 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13072 {
13073 list<cls_user_bucket_entry> l;
13074 l.push_back(entry);
13075
13076 return cls_user_update_buckets(obj, l, true);
13077 }
13078
13079 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13080 {
13081 rgw_rados_ref ref;
13082 int r = get_system_obj_ref(obj, &ref);
13083 if (r < 0) {
13084 return r;
13085 }
13086
13087 librados::ObjectWriteOperation op;
13088 ::cls_user_remove_bucket(op, bucket);
13089 r = ref.ioctx.operate(ref.oid, &op);
13090 if (r < 0)
13091 return r;
13092
13093 return 0;
13094 }
13095
13096 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
13097 RGWQuotaInfo& bucket_quota)
13098 {
13099 if (!cct->_conf->rgw_dynamic_resharding) {
13100 return 0;
13101 }
13102
13103 bool need_resharding = false;
13104 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13105 uint32_t suggested_num_shards;
13106
13107 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13108 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13109 1, need_resharding, &suggested_num_shards);
13110 if (ret < 0) {
13111 return ret;
13112 }
13113
13114 if (need_resharding) {
13115 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13116 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13117 dendl;
13118 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13119 }
13120
13121 return ret;
13122 }
13123
13124 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13125 {
13126 RGWReshard reshard(this);
13127
13128 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13129
13130 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13131 if (new_num_shards <= num_source_shards) {
13132 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13133 return 0;
13134 }
13135
13136 cls_rgw_reshard_entry entry;
13137 entry.time = real_clock::now();
13138 entry.tenant = bucket_info.owner.tenant;
13139 entry.bucket_name = bucket_info.bucket.name;
13140 entry.bucket_id = bucket_info.bucket.bucket_id;
13141 entry.old_num_shards = num_source_shards;
13142 entry.new_num_shards = new_num_shards;
13143
13144 return reshard.add(entry);
13145 }
13146
13147 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13148 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13149 {
13150 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13151 }
13152
13153 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13154 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
13155 {
13156 if (!num_shards) {
13157 bucket_objects[0] = bucket_oid_base;
13158 } else {
13159 char buf[bucket_oid_base.size() + 32];
13160 if (shard_id < 0) {
13161 for (uint32_t i = 0; i < num_shards; ++i) {
13162 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13163 bucket_objects[i] = buf;
13164 }
13165 } else {
13166 if ((uint32_t)shard_id > num_shards) {
13167 return;
13168 }
13169 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13170 bucket_objects[shard_id] = buf;
13171 }
13172 }
13173 }
13174
13175 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13176 {
13177 const rgw_bucket& bucket = bucket_info.bucket;
13178 string plain_id = bucket.name + ":" + bucket.bucket_id;
13179 if (!bucket_info.num_shards) {
13180 (*result)[0] = plain_id;
13181 } else {
13182 char buf[16];
13183 if (shard_id < 0) {
13184 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13185 snprintf(buf, sizeof(buf), ":%d", i);
13186 (*result)[i] = plain_id + buf;
13187 }
13188 } else {
13189 if ((uint32_t)shard_id > bucket_info.num_shards) {
13190 return;
13191 }
13192 snprintf(buf, sizeof(buf), ":%d", shard_id);
13193 (*result)[shard_id] = plain_id + buf;
13194 }
13195 }
13196 }
13197
13198 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13199 int *shard_id)
13200 {
13201 int r = 0;
13202 switch (bucket_info.bucket_index_shard_hash_type) {
13203 case RGWBucketInfo::MOD:
13204 if (!bucket_info.num_shards) {
13205 if (shard_id) {
13206 *shard_id = -1;
13207 }
13208 } else {
13209 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13210 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13211 sid = rgw_shards_mod(sid2, bucket_info.num_shards);
13212 if (shard_id) {
13213 *shard_id = (int)sid;
13214 }
13215 }
13216 break;
13217 default:
13218 r = -ENOTSUP;
13219 }
13220 return r;
13221 }
13222
13223 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13224 int shard_id, string *bucket_obj)
13225 {
13226 if (!num_shards) {
13227 // By default with no sharding, we use the bucket oid as itself
13228 (*bucket_obj) = bucket_oid_base;
13229 } else {
13230 char buf[bucket_oid_base.size() + 32];
13231 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13232 (*bucket_obj) = buf;
13233 }
13234 }
13235
13236 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13237 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13238 {
13239 int r = 0;
13240 switch (hash_type) {
13241 case RGWBucketInfo::MOD:
13242 if (!num_shards) {
13243 // By default with no sharding, we use the bucket oid as itself
13244 (*bucket_obj) = bucket_oid_base;
13245 if (shard_id) {
13246 *shard_id = -1;
13247 }
13248 } else {
13249 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13250 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13251 sid = rgw_shards_mod(sid2, num_shards);
13252 char buf[bucket_oid_base.size() + 32];
13253 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13254 (*bucket_obj) = buf;
13255 if (shard_id) {
13256 *shard_id = (int)sid;
13257 }
13258 }
13259 break;
13260 default:
13261 r = -ENOTSUP;
13262 }
13263 return r;
13264 }
13265
13266 void RGWStateLog::oid_str(int shard, string& oid) {
13267 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13268 char buf[16];
13269 snprintf(buf, sizeof(buf), "%d", shard);
13270 oid += buf;
13271 }
13272
13273 int RGWStateLog::get_shard_num(const string& object) {
13274 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13275 return val % num_shards;
13276 }
13277
13278 string RGWStateLog::get_oid(const string& object) {
13279 int shard = get_shard_num(object);
13280 string oid;
13281 oid_str(shard, oid);
13282 return oid;
13283 }
13284
13285 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13286 rgw_pool pool;
13287 store->get_log_pool(pool);
13288 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13289 if (r < 0) {
13290 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13291 return r;
13292 }
13293 return 0;
13294 }
13295
13296 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13297 uint32_t state, bufferlist *bl, uint32_t *check_state)
13298 {
13299 if (client_id.empty() ||
13300 op_id.empty() ||
13301 object.empty()) {
13302 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13303 }
13304
13305 librados::IoCtx ioctx;
13306 int r = open_ioctx(ioctx);
13307 if (r < 0)
13308 return r;
13309
13310 string oid = get_oid(object);
13311
13312 librados::ObjectWriteOperation op;
13313 if (check_state) {
13314 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
13315 }
13316 utime_t ts = ceph_clock_now();
13317 bufferlist nobl;
13318 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
13319 r = ioctx.operate(oid, &op);
13320 if (r < 0) {
13321 return r;
13322 }
13323
13324 return 0;
13325 }
13326
13327 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
13328 {
13329 if (client_id.empty() ||
13330 op_id.empty() ||
13331 object.empty()) {
13332 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13333 }
13334
13335 librados::IoCtx ioctx;
13336 int r = open_ioctx(ioctx);
13337 if (r < 0)
13338 return r;
13339
13340 string oid = get_oid(object);
13341
13342 librados::ObjectWriteOperation op;
13343 cls_statelog_remove_by_object(op, object, op_id);
13344 r = ioctx.operate(oid, &op);
13345 if (r < 0) {
13346 return r;
13347 }
13348
13349 return 0;
13350 }
13351
13352 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
13353 void **handle)
13354 {
13355 list_state *state = new list_state;
13356 state->client_id = client_id;
13357 state->op_id = op_id;
13358 state->object = object;
13359 if (object.empty()) {
13360 state->cur_shard = 0;
13361 state->max_shard = num_shards - 1;
13362 } else {
13363 state->cur_shard = state->max_shard = get_shard_num(object);
13364 }
13365 *handle = (void *)state;
13366 }
13367
13368 int RGWStateLog::list_entries(void *handle, int max_entries,
13369 list<cls_statelog_entry>& entries,
13370 bool *done)
13371 {
13372 list_state *state = static_cast<list_state *>(handle);
13373
13374 librados::IoCtx ioctx;
13375 int r = open_ioctx(ioctx);
13376 if (r < 0)
13377 return r;
13378
13379 entries.clear();
13380
13381 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
13382 string oid;
13383 oid_str(state->cur_shard, oid);
13384
13385 librados::ObjectReadOperation op;
13386 list<cls_statelog_entry> ents;
13387 bool truncated;
13388 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
13389 max_entries, ents, &state->marker, &truncated);
13390 bufferlist ibl;
13391 r = ioctx.operate(oid, &op, &ibl);
13392 if (r == -ENOENT) {
13393 truncated = false;
13394 r = 0;
13395 }
13396 if (r < 0) {
13397 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
13398 return r;
13399 }
13400
13401 if (!truncated) {
13402 state->marker.clear();
13403 }
13404
13405 max_entries -= ents.size();
13406
13407 entries.splice(entries.end(), ents);
13408
13409 if (truncated)
13410 break;
13411 }
13412
13413 *done = (state->cur_shard > state->max_shard);
13414
13415 return 0;
13416 }
13417
13418 void RGWStateLog::finish_list_entries(void *handle)
13419 {
13420 list_state *state = static_cast<list_state *>(handle);
13421 delete state;
13422 }
13423
13424 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
13425 {
13426 f->open_object_section("statelog_entry");
13427 f->dump_string("client_id", entry.client_id);
13428 f->dump_string("op_id", entry.op_id);
13429 f->dump_string("object", entry.object);
13430 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
13431 if (!dump_entry_internal(entry, f)) {
13432 f->dump_int("state", entry.state);
13433 }
13434 f->close_section();
13435 }
13436
13437 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
13438 {
13439 }
13440
13441 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
13442 {
13443 string s;
13444 switch ((OpState)entry.state) {
13445 case OPSTATE_UNKNOWN:
13446 s = "unknown";
13447 break;
13448 case OPSTATE_IN_PROGRESS:
13449 s = "in-progress";
13450 break;
13451 case OPSTATE_COMPLETE:
13452 s = "complete";
13453 break;
13454 case OPSTATE_ERROR:
13455 s = "error";
13456 break;
13457 case OPSTATE_ABORT:
13458 s = "abort";
13459 break;
13460 case OPSTATE_CANCELLED:
13461 s = "cancelled";
13462 break;
13463 default:
13464 s = "invalid";
13465 }
13466 f->dump_string("state", s);
13467 return true;
13468 }
13469
13470 int RGWOpState::state_from_str(const string& s, OpState *state)
13471 {
13472 if (s == "unknown") {
13473 *state = OPSTATE_UNKNOWN;
13474 } else if (s == "in-progress") {
13475 *state = OPSTATE_IN_PROGRESS;
13476 } else if (s == "complete") {
13477 *state = OPSTATE_COMPLETE;
13478 } else if (s == "error") {
13479 *state = OPSTATE_ERROR;
13480 } else if (s == "abort") {
13481 *state = OPSTATE_ABORT;
13482 } else if (s == "cancelled") {
13483 *state = OPSTATE_CANCELLED;
13484 } else {
13485 return -EINVAL;
13486 }
13487
13488 return 0;
13489 }
13490
13491 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
13492 {
13493 uint32_t s = (uint32_t)state;
13494 return store_entry(client_id, op_id, object, s, NULL, NULL);
13495 }
13496
13497 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
13498 {
13499 uint32_t s = (uint32_t)state;
13500 return store_entry(client_id, op_id, object, s, NULL, &s);
13501 }
13502
13503 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
13504 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
13505 {
13506 cct = store->ctx();
13507 cur_state = RGWOpState::OPSTATE_UNKNOWN;
13508 }
13509
13510 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
13511 last_update = real_clock::now();
13512 cur_state = state;
13513 return os.set_state(client_id, op_id, object, state);
13514 }
13515
13516 int RGWOpStateSingleOp::renew_state() {
13517 real_time now = real_clock::now();
13518
13519 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13520
13521 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13522 return 0;
13523 }
13524
13525 last_update = now;
13526 return os.renew_state(client_id, op_id, object, cur_state);
13527 }
13528
13529
13530 uint64_t RGWRados::instance_id()
13531 {
13532 return get_rados_handle()->get_instance_id();
13533 }
13534
13535 uint64_t RGWRados::next_bucket_id()
13536 {
13537 Mutex::Locker l(bucket_id_lock);
13538 return ++max_bucket_id;
13539 }
13540
13541 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
13542 {
13543 int use_cache = cct->_conf->rgw_cache_enabled;
13544 RGWRados *store = NULL;
13545 if (!use_cache) {
13546 store = new RGWRados;
13547 } else {
13548 store = new RGWCache<RGWRados>;
13549 }
13550
13551 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
13552 delete store;
13553 return NULL;
13554 }
13555
13556 return store;
13557 }
13558
13559 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13560 {
13561 RGWRados *store = NULL;
13562 store = new RGWRados;
13563
13564 store->set_context(cct);
13565
13566 if (store->init_rados() < 0) {
13567 delete store;
13568 return NULL;
13569 }
13570
13571 return store;
13572 }
13573
13574 void RGWStoreManager::close_storage(RGWRados *store)
13575 {
13576 if (!store)
13577 return;
13578
13579 store->finalize();
13580
13581 delete store;
13582 }
13583
13584 librados::Rados* RGWRados::get_rados_handle()
13585 {
13586 if (rados.size() == 1) {
13587 return &rados[0];
13588 } else {
13589 handle_lock.get_read();
13590 pthread_t id = pthread_self();
13591 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13592
13593 if (it != rados_map.end()) {
13594 handle_lock.put_read();
13595 return &rados[it->second];
13596 } else {
13597 handle_lock.put_read();
13598 handle_lock.get_write();
13599 const uint32_t handle = next_rados_handle;
13600 rados_map[id] = handle;
13601 if (++next_rados_handle == rados.size()) {
13602 next_rados_handle = 0;
13603 }
13604 handle_lock.put_write();
13605 return &rados[handle];
13606 }
13607 }
13608 }
13609
13610 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
13611 {
13612 rgw_rados_ref ref;
13613 int ret = get_raw_obj_ref(obj, &ref);
13614 if (ret < 0) {
13615 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13616 return ret;
13617 }
13618
13619 ObjectWriteOperation op;
13620 list<string> prefixes;
13621 cls_rgw_remove_obj(op, prefixes);
13622
13623 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13624 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13625 if (ret < 0) {
13626 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13627 c->release();
13628 return ret;
13629 }
13630
13631 handles.push_back(c);
13632
13633 return 0;
13634 }
13635
13636 int RGWRados::delete_obj_aio(const rgw_obj& obj,
13637 RGWBucketInfo& bucket_info, RGWObjState *astate,
13638 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
13639 {
13640 rgw_rados_ref ref;
13641 int ret = get_obj_head_ref(bucket_info, obj, &ref);
13642 if (ret < 0) {
13643 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13644 return ret;
13645 }
13646
13647 if (keep_index_consistent) {
13648 RGWRados::Bucket bop(this, bucket_info);
13649 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
13650
13651 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
13652 if (ret < 0) {
13653 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
13654 return ret;
13655 }
13656 }
13657
13658 ObjectWriteOperation op;
13659 list<string> prefixes;
13660 cls_rgw_remove_obj(op, prefixes);
13661
13662 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13663 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13664 if (ret < 0) {
13665 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13666 c->release();
13667 return ret;
13668 }
13669
13670 handles.push_back(c);
13671
13672 if (keep_index_consistent) {
13673 ret = delete_obj_index(obj);
13674 if (ret < 0) {
13675 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
13676 return ret;
13677 }
13678 }
13679 return ret;
13680 }
13681
13682 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
13683 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
13684 if (value != attrs.end()) {
13685 bufferlist::iterator bliter = value->second.begin();
13686 try {
13687 ::decode(cs_info, bliter);
13688 } catch (buffer::error& err) {
13689 return -EIO;
13690 }
13691 if (cs_info.blocks.size() == 0) {
13692 return -EIO;
13693 }
13694 if (cs_info.compression_type != "none")
13695 need_decompress = true;
13696 else
13697 need_decompress = false;
13698 return 0;
13699 } else {
13700 need_decompress = false;
13701 return 0;
13702 }
13703 }
13704