]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
update sources to v12.2.5
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <boost/algorithm/string.hpp>
9
10 #include <boost/format.hpp>
11 #include <boost/optional.hpp>
12 #include <boost/utility/in_place_factory.hpp>
13
14 #include "common/ceph_json.h"
15 #include "common/utf8.h"
16
17 #include "common/errno.h"
18 #include "common/Formatter.h"
19 #include "common/Throttle.h"
20 #include "common/Finisher.h"
21
22 #include "rgw_rados.h"
23 #include "rgw_cache.h"
24 #include "rgw_acl.h"
25 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26 #include "rgw_metadata.h"
27 #include "rgw_bucket.h"
28 #include "rgw_rest_conn.h"
29 #include "rgw_cr_rados.h"
30 #include "rgw_cr_rest.h"
31
32 #include "cls/rgw/cls_rgw_ops.h"
33 #include "cls/rgw/cls_rgw_types.h"
34 #include "cls/rgw/cls_rgw_client.h"
35 #include "cls/rgw/cls_rgw_const.h"
36 #include "cls/refcount/cls_refcount_client.h"
37 #include "cls/version/cls_version_client.h"
38 #include "cls/log/cls_log_client.h"
39 #include "cls/statelog/cls_statelog_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "osd/osd_types.h"
44
45 #include "rgw_tools.h"
46 #include "rgw_coroutine.h"
47 #include "rgw_compression.h"
48
49 #undef fork // fails to compile RGWPeriod::fork() below
50
51 #include "common/Clock.h"
52
53 #include "include/rados/librados.hpp"
54 using namespace librados;
55
56 #include <string>
57 #include <iostream>
58 #include <vector>
59 #include <atomic>
60 #include <list>
61 #include <map>
62 #include "auth/Crypto.h" // get_random_bytes()
63
64 #include "rgw_log.h"
65
66 #include "rgw_gc.h"
67 #include "rgw_lc.h"
68
69 #include "rgw_object_expirer_core.h"
70 #include "rgw_sync.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
74
75 #include "compressor/Compressor.h"
76
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_rgw
79
80 using namespace std;
81
// File-scope string constants: object-name prefixes, a namespace name and
// default pool-name suffixes.
82 static string notify_oid_prefix = "notify";
83 static string *notify_oids = NULL;
84 static string shadow_ns = "shadow";
85 static string dir_oid_prefix = ".dir.";
86 static string default_storage_pool_suffix = "rgw.buckets.data";
87 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
88 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
89 static string avail_pools = ".pools.avail";
90
// Object ids and oid prefixes for system metadata objects (zone, region,
// zonegroup, realm, period). Several of them are handed out by the
// get_*_oid*() accessors defined further down in this file, typically as the
// fallback when the corresponding configuration option is empty.
91 static string zone_info_oid_prefix = "zone_info.";
92 static string zone_names_oid_prefix = "zone_names.";
93 static string region_info_oid_prefix = "region_info.";
94 static string zone_group_info_oid_prefix = "zonegroup_info.";
95 static string realm_names_oid_prefix = "realms_names.";
96 static string realm_info_oid_prefix = "realms.";
97 static string default_region_info_oid = "default.region";
98 static string default_zone_group_info_oid = "default.zonegroup";
99 static string period_info_oid_prefix = "periods.";
100 static string period_latest_epoch_info_oid = ".latest_epoch";
101 static string region_map_oid = "region_map";
102 static string zonegroup_map_oid = "zonegroup_map";
103 static string log_lock_name = "rgw_log_lock";
104 static string default_realm_info_oid = "default.realm";
// Names given to the zonegroup / zone created by the create_default() paths.
// These two are not static, so they have external linkage.
105 const string default_zonegroup_name = "default";
106 const string default_zone_name = "default";
107 static string zonegroup_names_oid_prefix = "zonegroups_names.";
108 static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
109 #define RGW_USAGE_OBJ_PREFIX "usage."
110 #define FIRST_EPOCH 1
// Default root pool names; the get_pool() implementations visible in this
// file (zonegroup, realm, period config) return these when the matching
// rgw_*_root_pool option is empty.
111 static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
112 static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
113 static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
114 static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116 #define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118 #define dout_subsys ceph_subsys_rgw
119
120
121 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123 {
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
133 *pool = placement.get_data_extra_pool();
134 }
135 }
136
137 return true;
138 }
139
140 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142 {
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146 }
147
148 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149 {
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156 }
157
158 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159 {
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166 }
167
168 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169 {
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
173 if (r < 0 && r != -EEXIST) {
174 return r;
175 }
176
177 r = rados->ioctx_create(pool.name.c_str(), ioctx);
178 if (r < 0) {
179 return r;
180 }
181
182 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
183 if (r < 0 && r != -EOPNOTSUPP) {
184 return r;
185 }
186 } else if (r < 0) {
187 return r;
188 }
189 if (!pool.ns.empty()) {
190 ioctx.set_namespace(pool.ns);
191 }
192 return 0;
193 }
194
195 template<>
196 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
197 RWLock::WLocker wl(lock);
198 auto iter = objs_state.find(obj);
199 if (iter == objs_state.end()) {
200 return;
201 }
202 bool is_atomic = iter->second.is_atomic;
203 bool prefetch_data = iter->second.prefetch_data;
204
205 objs_state.erase(iter);
206
207 if (is_atomic || prefetch_data) {
208 auto& s = objs_state[obj];
209 s.is_atomic = is_atomic;
210 s.prefetch_data = prefetch_data;
211 }
212 }
213
214 template<>
215 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
216 RWLock::WLocker wl(lock);
217 auto iter = objs_state.find(obj);
218 if (iter == objs_state.end()) {
219 return;
220 }
221
222 objs_state.erase(iter);
223 }
224
225 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
226 encode_json("default_zonegroup", default_zonegroup, f);
227 }
228
229 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
230
231 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
232 /* backward compatability with region */
233 if (default_zonegroup.empty()) {
234 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
235 }
236 }
237
238 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
239 {
240 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
241 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
242 }
243
244 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
245 }
246
247 int RGWZoneGroup::create_default(bool old_format)
248 {
249 name = default_zonegroup_name;
250 is_master = true;
251
252 RGWZoneGroupPlacementTarget placement_target;
253 placement_target.name = "default-placement";
254 placement_targets[placement_target.name] = placement_target;
255 default_placement = "default-placement";
256
257 RGWZoneParams zone_params(default_zone_name);
258
259 int r = zone_params.init(cct, store, false);
260 if (r < 0) {
261 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
262 return r;
263 }
264
265 r = zone_params.create_default();
266 if (r < 0 && r != -EEXIST) {
267 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
268 return r;
269 } else if (r == -EEXIST) {
270 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
271 zone_params.clear_id();
272 r = zone_params.init(cct, store);
273 if (r < 0) {
274 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
275 return r;
276 }
277 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
278 << dendl;
279 }
280
281 RGWZone& default_zone = zones[zone_params.get_id()];
282 default_zone.name = zone_params.get_name();
283 default_zone.id = zone_params.get_id();
284 master_zone = default_zone.id;
285
286 r = create();
287 if (r < 0 && r != -EEXIST) {
288 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
289 return r;
290 }
291
292 if (r == -EEXIST) {
293 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
294 id.clear();
295 r = init(cct, store);
296 if (r < 0) {
297 return r;
298 }
299 }
300
301 if (old_format) {
302 name = id;
303 }
304
305 post_process_params();
306
307 return 0;
308 }
309
310 const string RGWZoneGroup::get_default_oid(bool old_region_format)
311 {
312 if (old_region_format) {
313 if (cct->_conf->rgw_default_region_info_oid.empty()) {
314 return default_region_info_oid;
315 }
316 return cct->_conf->rgw_default_region_info_oid;
317 }
318
319 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
320
321 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
322 default_oid = default_zone_group_info_oid;
323 }
324
325 default_oid += "." + realm_id;
326
327 return default_oid;
328 }
329
330 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
331 {
332 if (old_region_format) {
333 return region_info_oid_prefix;
334 }
335 return zone_group_info_oid_prefix;
336 }
337
338 const string& RGWZoneGroup::get_names_oid_prefix()
339 {
340 return zonegroup_names_oid_prefix;
341 }
342
343 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
344 return cct->_conf->rgw_zonegroup;
345 }
346
347 int RGWZoneGroup::equals(const string& other_zonegroup) const
348 {
349 if (is_master && other_zonegroup.empty())
350 return true;
351
352 return (id == other_zonegroup);
353 }
354
355 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
356 const list<string>& endpoints, const string *ptier_type,
357 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
358 {
359 auto& zone_id = zone_params.get_id();
360 auto& zone_name = zone_params.get_name();
361
362 // check for duplicate zone name on insert
363 if (!zones.count(zone_id)) {
364 for (const auto& zone : zones) {
365 if (zone.second.name == zone_name) {
366 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
367 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
368 return -EEXIST;
369 }
370 }
371 }
372
373 if (is_master) {
374 if (*is_master) {
375 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
376 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
377 }
378 master_zone = zone_params.get_id();
379 } else if (master_zone == zone_params.get_id()) {
380 master_zone.clear();
381 }
382 }
383
384 RGWZone& zone = zones[zone_params.get_id()];
385 zone.name = zone_params.get_name();
386 zone.id = zone_params.get_id();
387 if (!endpoints.empty()) {
388 zone.endpoints = endpoints;
389 }
390 if (read_only) {
391 zone.read_only = *read_only;
392 }
393 if (ptier_type) {
394 zone.tier_type = *ptier_type;
395 }
396
397 if (psync_from_all) {
398 zone.sync_from_all = *psync_from_all;
399 }
400
401 for (auto add : sync_from) {
402 zone.sync_from.insert(add);
403 }
404
405 for (auto rm : sync_from_rm) {
406 zone.sync_from.erase(rm);
407 }
408
409 post_process_params();
410
411 return update();
412 }
413
414
415 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
416 {
417 RGWZone& zone = zones[zone_params.get_id()];
418 zone.name = zone_params.get_name();
419
420 return update();
421 }
422
423 void RGWZoneGroup::post_process_params()
424 {
425 bool log_data = zones.size() > 1;
426
427 if (master_zone.empty()) {
428 map<string, RGWZone>::iterator iter = zones.begin();
429 if (iter != zones.end()) {
430 master_zone = iter->first;
431 }
432 }
433
434 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
435 RGWZone& zone = iter->second;
436 zone.log_data = log_data;
437
438 RGWZoneParams zone_params(zone.id, zone.name);
439 int ret = zone_params.init(cct, store);
440 if (ret < 0) {
441 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
442 continue;
443 }
444
445 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
446 iter != zone_params.placement_pools.end(); ++iter) {
447 const string& placement_name = iter->first;
448 if (placement_targets.find(placement_name) == placement_targets.end()) {
449 RGWZoneGroupPlacementTarget placement_target;
450 placement_target.name = placement_name;
451 placement_targets[placement_name] = placement_target;
452 }
453 }
454 }
455
456 if (default_placement.empty() && !placement_targets.empty()) {
457 default_placement = placement_targets.begin()->first;
458 }
459 }
460
461 int RGWZoneGroup::remove_zone(const std::string& zone_id)
462 {
463 map<string, RGWZone>::iterator iter = zones.find(zone_id);
464 if (iter == zones.end()) {
465 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
466 << name << dendl;
467 return -ENOENT;
468 }
469
470 zones.erase(iter);
471
472 post_process_params();
473
474 return update();
475 }
476
477 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
478 {
479 if (realm_id.empty()) {
480 /* try using default realm */
481 RGWRealm realm;
482 int ret = realm.init(cct, store);
483 // no default realm exist
484 if (ret < 0) {
485 return read_id(default_zonegroup_name, default_id);
486 }
487 realm_id = realm.get_id();
488 }
489
490 return RGWSystemMetaObj::read_default_id(default_id, old_format);
491 }
492
493 int RGWZoneGroup::set_as_default(bool exclusive)
494 {
495 if (realm_id.empty()) {
496 /* try using default realm */
497 RGWRealm realm;
498 int ret = realm.init(cct, store);
499 if (ret < 0) {
500 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
501 return -EINVAL;
502 }
503 realm_id = realm.get_id();
504 }
505
506 return RGWSystemMetaObj::set_as_default(exclusive);
507 }
508
509 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
510 {
511 cct = _cct;
512 store = _store;
513
514 if (!setup_obj)
515 return 0;
516
517 if (old_format && id.empty()) {
518 id = name;
519 }
520
521 if (id.empty()) {
522 int r;
523 if (name.empty()) {
524 name = get_predefined_name(cct);
525 }
526 if (name.empty()) {
527 r = use_default(old_format);
528 if (r < 0) {
529 return r;
530 }
531 } else if (!old_format) {
532 r = read_id(name, id);
533 if (r < 0) {
534 if (r != -ENOENT) {
535 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
536 }
537 return r;
538 }
539 }
540 }
541
542 return read_info(id, old_format);
543 }
544
545 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
546 {
547 auto pool = get_pool(cct);
548 bufferlist bl;
549 RGWObjectCtx obj_ctx(store);
550 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
551 if (ret < 0)
552 return ret;
553
554 try {
555 bufferlist::iterator iter = bl.begin();
556 ::decode(default_info, iter);
557 } catch (buffer::error& err) {
558 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
559 return -EIO;
560 }
561
562 return 0;
563 }
564
565 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
566 {
567 RGWDefaultSystemMetaObjInfo default_info;
568
569 int ret = read_default(default_info, get_default_oid(old_format));
570 if (ret < 0) {
571 return ret;
572 }
573
574 default_id = default_info.default_id;
575
576 return 0;
577 }
578
579 int RGWSystemMetaObj::use_default(bool old_format)
580 {
581 return read_default_id(id, old_format);
582 }
583
584 int RGWSystemMetaObj::set_as_default(bool exclusive)
585 {
586 string oid = get_default_oid();
587
588 rgw_pool pool(get_pool(cct));
589 bufferlist bl;
590
591 RGWDefaultSystemMetaObjInfo default_info;
592 default_info.default_id = id;
593
594 ::encode(default_info, bl);
595
596 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
597 exclusive, NULL, real_time(), NULL);
598 if (ret < 0)
599 return ret;
600
601 return 0;
602 }
603
604 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
605 {
606 rgw_pool pool(get_pool(cct));
607 bufferlist bl;
608
609 string oid = get_names_oid_prefix() + obj_name;
610
611 RGWObjectCtx obj_ctx(store);
612 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
613 if (ret < 0) {
614 return ret;
615 }
616
617 RGWNameToId nameToId;
618 try {
619 bufferlist::iterator iter = bl.begin();
620 ::decode(nameToId, iter);
621 } catch (buffer::error& err) {
622 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
623 return -EIO;
624 }
625 object_id = nameToId.obj_id;
626 return 0;
627 }
628
629 int RGWSystemMetaObj::delete_obj(bool old_format)
630 {
631 rgw_pool pool(get_pool(cct));
632
633 /* check to see if obj is the default */
634 RGWDefaultSystemMetaObjInfo default_info;
635 int ret = read_default(default_info, get_default_oid(old_format));
636 if (ret < 0 && ret != -ENOENT)
637 return ret;
638 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
639 string oid = get_default_oid(old_format);
640 rgw_raw_obj default_named_obj(pool, oid);
641 ret = store->delete_system_obj(default_named_obj);
642 if (ret < 0) {
643 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
644 return ret;
645 }
646 }
647 if (!old_format) {
648 string oid = get_names_oid_prefix() + name;
649 rgw_raw_obj object_name(pool, oid);
650 ret = store->delete_system_obj(object_name);
651 if (ret < 0) {
652 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
653 return ret;
654 }
655 }
656
657 string oid = get_info_oid_prefix(old_format);
658 if (old_format) {
659 oid += name;
660 } else {
661 oid += id;
662 }
663
664 rgw_raw_obj object_id(pool, oid);
665 ret = store->delete_system_obj(object_id);
666 if (ret < 0) {
667 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
668 }
669
670 return ret;
671 }
672
673 int RGWSystemMetaObj::store_name(bool exclusive)
674 {
675 rgw_pool pool(get_pool(cct));
676 string oid = get_names_oid_prefix() + name;
677
678 RGWNameToId nameToId;
679 nameToId.obj_id = id;
680
681 bufferlist bl;
682 ::encode(nameToId, bl);
683 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
684 }
685
686 int RGWSystemMetaObj::rename(const string& new_name)
687 {
688 string new_id;
689 int ret = read_id(new_name, new_id);
690 if (!ret) {
691 return -EEXIST;
692 }
693 if (ret < 0 && ret != -ENOENT) {
694 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
695 return ret;
696 }
697 string old_name = name;
698 name = new_name;
699 ret = update();
700 if (ret < 0) {
701 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
702 return ret;
703 }
704 ret = store_name(true);
705 if (ret < 0) {
706 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
707 return ret;
708 }
709 /* delete old name */
710 rgw_pool pool(get_pool(cct));
711 string oid = get_names_oid_prefix() + old_name;
712 rgw_raw_obj old_name_obj(pool, oid);
713 ret = store->delete_system_obj(old_name_obj);
714 if (ret < 0) {
715 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
716 return ret;
717 }
718
719 return ret;
720 }
721
722 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
723 {
724 rgw_pool pool(get_pool(cct));
725
726 bufferlist bl;
727
728 string oid = get_info_oid_prefix(old_format) + obj_id;
729
730 RGWObjectCtx obj_ctx(store);
731 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
732 if (ret < 0) {
733 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
734 return ret;
735 }
736
737 try {
738 bufferlist::iterator iter = bl.begin();
739 ::decode(*this, iter);
740 } catch (buffer::error& err) {
741 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
742 return -EIO;
743 }
744
745 return 0;
746 }
747
748 int RGWSystemMetaObj::read()
749 {
750 int ret = read_id(name, id);
751 if (ret < 0) {
752 return ret;
753 }
754
755 return read_info(id);
756 }
757
758 int RGWSystemMetaObj::create(bool exclusive)
759 {
760 int ret;
761
762 /* check to see the name is not used */
763 ret = read_id(name, id);
764 if (exclusive && ret == 0) {
765 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
766 return -EEXIST;
767 } else if ( ret < 0 && ret != -ENOENT) {
768 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
769 return ret;
770 }
771
772 if (id.empty()) {
773 /* create unique id */
774 uuid_d new_uuid;
775 char uuid_str[37];
776 new_uuid.generate_random();
777 new_uuid.print(uuid_str);
778 id = uuid_str;
779 }
780
781 ret = store_info(exclusive);
782 if (ret < 0) {
783 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
784 return ret;
785 }
786
787 return store_name(exclusive);
788 }
789
790 int RGWSystemMetaObj::store_info(bool exclusive)
791 {
792 rgw_pool pool(get_pool(cct));
793
794 string oid = get_info_oid_prefix() + id;
795
796 bufferlist bl;
797 ::encode(*this, bl);
798 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
799 }
800
801 int RGWSystemMetaObj::write(bool exclusive)
802 {
803 int ret = store_info(exclusive);
804 if (ret < 0) {
805 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
806 return ret;
807 }
808 ret = store_name(exclusive);
809 if (ret < 0) {
810 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
811 return ret;
812 }
813 return 0;
814 }
815
816
817 const string& RGWRealm::get_predefined_name(CephContext *cct) {
818 return cct->_conf->rgw_realm;
819 }
820
821 int RGWRealm::create(bool exclusive)
822 {
823 int ret = RGWSystemMetaObj::create(exclusive);
824 if (ret < 0) {
825 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
826 return ret;
827 }
828 // create the control object for watch/notify
829 ret = create_control(exclusive);
830 if (ret < 0) {
831 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
832 return ret;
833 }
834 RGWPeriod period;
835 if (current_period.empty()) {
836 /* create new period for the realm */
837 ret = period.init(cct, store, id, name, false);
838 if (ret < 0 ) {
839 return ret;
840 }
841 ret = period.create(true);
842 if (ret < 0) {
843 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
844 return ret;
845 }
846 } else {
847 period = RGWPeriod(current_period, 0);
848 int ret = period.init(cct, store, id, name);
849 if (ret < 0) {
850 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
851 return ret;
852 }
853 }
854 ret = set_current_period(period);
855 if (ret < 0) {
856 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
857 return ret;
858 }
859 // try to set as default. may race with another create, so pass exclusive=true
860 // so we don't override an existing default
861 ret = set_as_default(true);
862 if (ret < 0 && ret != -EEXIST) {
863 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
864 }
865
866 return 0;
867 }
868
869 int RGWRealm::delete_obj()
870 {
871 int ret = RGWSystemMetaObj::delete_obj();
872 if (ret < 0) {
873 return ret;
874 }
875 return delete_control();
876 }
877
878 int RGWRealm::create_control(bool exclusive)
879 {
880 auto pool = rgw_pool{get_pool(cct)};
881 auto oid = get_control_oid();
882 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
883 nullptr, real_time(), nullptr);
884 }
885
886 int RGWRealm::delete_control()
887 {
888 auto pool = rgw_pool{get_pool(cct)};
889 auto obj = rgw_raw_obj{pool, get_control_oid()};
890 return store->delete_system_obj(obj);
891 }
892
893 rgw_pool RGWRealm::get_pool(CephContext *cct)
894 {
895 if (cct->_conf->rgw_realm_root_pool.empty()) {
896 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
897 }
898 return rgw_pool(cct->_conf->rgw_realm_root_pool);
899 }
900
901 const string RGWRealm::get_default_oid(bool old_format)
902 {
903 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
904 return default_realm_info_oid;
905 }
906 return cct->_conf->rgw_default_realm_info_oid;
907 }
908
909 const string& RGWRealm::get_names_oid_prefix()
910 {
911 return realm_names_oid_prefix;
912 }
913
914 const string& RGWRealm::get_info_oid_prefix(bool old_format)
915 {
916 return realm_info_oid_prefix;
917 }
918
919 int RGWRealm::set_current_period(RGWPeriod& period)
920 {
921 // update realm epoch to match the period's
922 if (epoch > period.get_realm_epoch()) {
923 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
924 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
925 return -EINVAL;
926 }
927 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
928 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
929 << period.get_realm_epoch() << ", but different period id "
930 << period.get_id() << " != " << current_period << dendl;
931 return -EINVAL;
932 }
933
934 epoch = period.get_realm_epoch();
935 current_period = period.get_id();
936
937 int ret = update();
938 if (ret < 0) {
939 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
940 return ret;
941 }
942
943 ret = period.reflect();
944 if (ret < 0) {
945 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
946 return ret;
947 }
948
949 return 0;
950 }
951
952 string RGWRealm::get_control_oid()
953 {
954 return get_info_oid_prefix() + id + ".control";
955 }
956
957 int RGWRealm::notify_zone(bufferlist& bl)
958 {
959 // open a context on the realm's pool
960 rgw_pool pool{get_pool(cct)};
961 librados::IoCtx ctx;
962 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
963 if (r < 0) {
964 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
965 return r;
966 }
967 // send a notify on the realm object
968 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
969 if (r < 0) {
970 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
971 return r;
972 }
973 return 0;
974 }
975
976 int RGWRealm::notify_new_period(const RGWPeriod& period)
977 {
978 bufferlist bl;
979 // push the period to dependent zonegroups/zones
980 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
981 ::encode(period, bl);
982 // reload the gateway with the new period
983 ::encode(RGWRealmNotify::Reload, bl);
984
985 return notify_zone(bl);
986 }
987
988 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
989 {
990 if (realm_id.empty()) {
991 return "period_config.default";
992 }
993 return "period_config." + realm_id;
994 }
995
996 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
997 {
998 const auto& pool_name = cct->_conf->rgw_period_root_pool;
999 if (pool_name.empty()) {
1000 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1001 }
1002 return {pool_name};
1003 }
1004
1005 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1006 {
1007 RGWObjectCtx obj_ctx(store);
1008 const auto& pool = get_pool(store->ctx());
1009 const auto& oid = get_oid(realm_id);
1010 bufferlist bl;
1011
1012 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1013 if (ret < 0) {
1014 return ret;
1015 }
1016 try {
1017 bufferlist::iterator iter = bl.begin();
1018 ::decode(*this, iter);
1019 } catch (buffer::error& err) {
1020 return -EIO;
1021 }
1022 return 0;
1023 }
1024
1025 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1026 {
1027 const auto& pool = get_pool(store->ctx());
1028 const auto& oid = get_oid(realm_id);
1029 bufferlist bl;
1030 ::encode(*this, bl);
1031 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1032 false, nullptr, real_time(), nullptr);
1033 }
1034
1035 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1036 const string& period_realm_name, bool setup_obj)
1037 {
1038 cct = _cct;
1039 store = _store;
1040 realm_id = period_realm_id;
1041 realm_name = period_realm_name;
1042
1043 if (!setup_obj)
1044 return 0;
1045
1046 return init(_cct, _store, setup_obj);
1047 }
1048
1049
1050 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1051 {
1052 cct = _cct;
1053 store = _store;
1054
1055 if (!setup_obj)
1056 return 0;
1057
1058 if (id.empty()) {
1059 RGWRealm realm(realm_id, realm_name);
1060 int ret = realm.init(cct, store);
1061 if (ret < 0) {
1062 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1063 cpp_strerror(-ret) << dendl;
1064 return ret;
1065 }
1066 id = realm.get_current_period();
1067 realm_id = realm.get_id();
1068 }
1069
1070 if (!epoch) {
1071 int ret = use_latest_epoch();
1072 if (ret < 0) {
1073 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1074 << " : " << cpp_strerror(-ret) << dendl;
1075 return ret;
1076 }
1077 }
1078
1079 return read_info();
1080 }
1081
1082
1083 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1084 map<string, RGWZoneGroup>::const_iterator iter;
1085 if (!zonegroup_id.empty()) {
1086 iter = period_map.zonegroups.find(zonegroup_id);
1087 } else {
1088 iter = period_map.zonegroups.find("default");
1089 }
1090 if (iter != period_map.zonegroups.end()) {
1091 zonegroup = iter->second;
1092 return 0;
1093 }
1094
1095 return -ENOENT;
1096 }
1097
1098 const string& RGWPeriod::get_latest_epoch_oid()
1099 {
1100 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1101 return period_latest_epoch_info_oid;
1102 }
1103 return cct->_conf->rgw_period_latest_epoch_info_oid;
1104 }
1105
1106 const string& RGWPeriod::get_info_oid_prefix()
1107 {
1108 return period_info_oid_prefix;
1109 }
1110
1111 const string RGWPeriod::get_period_oid_prefix()
1112 {
1113 return get_info_oid_prefix() + id;
1114 }
1115
1116 const string RGWPeriod::get_period_oid()
1117 {
1118 std::ostringstream oss;
1119 oss << get_period_oid_prefix();
1120 // skip the epoch for the staging period
1121 if (id != get_staging_id(realm_id))
1122 oss << "." << epoch;
1123 return oss.str();
1124 }
1125
1126 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1127 RGWObjVersionTracker *objv)
1128 {
1129 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1130
1131 rgw_pool pool(get_pool(cct));
1132 bufferlist bl;
1133 RGWObjectCtx obj_ctx(store);
1134 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
1135 if (ret < 0) {
1136 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1137 return ret;
1138 }
1139 try {
1140 bufferlist::iterator iter = bl.begin();
1141 ::decode(info, iter);
1142 } catch (buffer::error& err) {
1143 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1144 return -EIO;
1145 }
1146
1147 return 0;
1148 }
1149
1150 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1151 {
1152 RGWPeriodLatestEpochInfo info;
1153
1154 int ret = read_latest_epoch(info);
1155 if (ret < 0) {
1156 return ret;
1157 }
1158
1159 latest_epoch = info.epoch;
1160
1161 return 0;
1162 }
1163
1164 int RGWPeriod::use_latest_epoch()
1165 {
1166 RGWPeriodLatestEpochInfo info;
1167 int ret = read_latest_epoch(info);
1168 if (ret < 0) {
1169 return ret;
1170 }
1171
1172 epoch = info.epoch;
1173
1174 return 0;
1175 }
1176
1177 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1178 RGWObjVersionTracker *objv)
1179 {
1180 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1181
1182 rgw_pool pool(get_pool(cct));
1183 bufferlist bl;
1184
1185 RGWPeriodLatestEpochInfo info;
1186 info.epoch = epoch;
1187
1188 ::encode(info, bl);
1189
1190 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1191 exclusive, objv, real_time(), nullptr);
1192 }
1193
1194 int RGWPeriod::update_latest_epoch(epoch_t epoch)
1195 {
1196 static constexpr int MAX_RETRIES = 20;
1197
1198 for (int i = 0; i < MAX_RETRIES; i++) {
1199 RGWPeriodLatestEpochInfo info;
1200 RGWObjVersionTracker objv;
1201 bool exclusive = false;
1202
1203 // read existing epoch
1204 int r = read_latest_epoch(info, &objv);
1205 if (r == -ENOENT) {
1206 // use an exclusive create to set the epoch atomically
1207 exclusive = true;
1208 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1209 << " for period=" << id << dendl;
1210 } else if (r < 0) {
1211 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1212 return r;
1213 } else if (epoch <= info.epoch) {
1214 r = -EEXIST; // fail with EEXIST if epoch is not newer
1215 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1216 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1217 return r;
1218 } else {
1219 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1220 << " -> " << epoch << " on period=" << id << dendl;
1221 }
1222
1223 r = set_latest_epoch(epoch, exclusive, &objv);
1224 if (r == -EEXIST) {
1225 continue; // exclusive create raced with another update, retry
1226 } else if (r == -ECANCELED) {
1227 continue; // write raced with a conflicting version, retry
1228 }
1229 if (r < 0) {
1230 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1231 return r;
1232 }
1233 return 0; // return success
1234 }
1235
1236 return -ECANCELED; // fail after max retries
1237 }
1238
1239 int RGWPeriod::delete_obj()
1240 {
1241 rgw_pool pool(get_pool(cct));
1242
1243 // delete the object for each period epoch
1244 for (epoch_t e = 1; e <= epoch; e++) {
1245 RGWPeriod p{get_id(), e};
1246 rgw_raw_obj oid{pool, p.get_period_oid()};
1247 int ret = store->delete_system_obj(oid);
1248 if (ret < 0) {
1249 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1250 << ": " << cpp_strerror(-ret) << dendl;
1251 }
1252 }
1253
1254 // delete the .latest_epoch object
1255 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1256 int ret = store->delete_system_obj(oid);
1257 if (ret < 0) {
1258 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1259 << ": " << cpp_strerror(-ret) << dendl;
1260 }
1261 return ret;
1262 }
1263
1264 int RGWPeriod::read_info()
1265 {
1266 rgw_pool pool(get_pool(cct));
1267
1268 bufferlist bl;
1269
1270 RGWObjectCtx obj_ctx(store);
1271 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1272 if (ret < 0) {
1273 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1274 return ret;
1275 }
1276
1277 try {
1278 bufferlist::iterator iter = bl.begin();
1279 ::decode(*this, iter);
1280 } catch (buffer::error& err) {
1281 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1282 return -EIO;
1283 }
1284
1285 return 0;
1286 }
1287
1288 int RGWPeriod::create(bool exclusive)
1289 {
1290 int ret;
1291
1292 /* create unique id */
1293 uuid_d new_uuid;
1294 char uuid_str[37];
1295 new_uuid.generate_random();
1296 new_uuid.print(uuid_str);
1297 id = uuid_str;
1298
1299 epoch = FIRST_EPOCH;
1300
1301 period_map.id = id;
1302
1303 ret = store_info(exclusive);
1304 if (ret < 0) {
1305 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1306 return ret;
1307 }
1308
1309 ret = set_latest_epoch(epoch);
1310 if (ret < 0) {
1311 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1312 }
1313
1314 return ret;
1315 }
1316
1317 int RGWPeriod::store_info(bool exclusive)
1318 {
1319 rgw_pool pool(get_pool(cct));
1320
1321 string oid = get_period_oid();
1322 bufferlist bl;
1323 ::encode(*this, bl);
1324
1325 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1326 exclusive, NULL, real_time(), NULL);
1327 }
1328
1329 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1330 {
1331 if (cct->_conf->rgw_period_root_pool.empty()) {
1332 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1333 }
1334 return rgw_pool(cct->_conf->rgw_period_root_pool);
1335 }
1336
1337 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1338 {
1339 if (zonegroup.realm_id != realm_id) {
1340 return 0;
1341 }
1342 int ret = period_map.update(zonegroup, cct);
1343 if (ret < 0) {
1344 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1345 return ret;
1346 }
1347
1348 return store_info(false);
1349 }
1350
1351 int RGWPeriod::update()
1352 {
1353 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1354 list<string> zonegroups;
1355 int ret = store->list_zonegroups(zonegroups);
1356 if (ret < 0) {
1357 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1358 return ret;
1359 }
1360
1361 // clear zone short ids of removed zones. period_map.update() will add the
1362 // remaining zones back
1363 period_map.short_zone_ids.clear();
1364
1365 for (auto& iter : zonegroups) {
1366 RGWZoneGroup zg(string(), iter);
1367 ret = zg.init(cct, store);
1368 if (ret < 0) {
1369 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1370 continue;
1371 }
1372
1373 if (zg.realm_id != realm_id) {
1374 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1375 continue;
1376 }
1377
1378 if (zg.master_zone.empty()) {
1379 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1380 return -EINVAL;
1381 }
1382
1383 if (zg.is_master_zonegroup()) {
1384 master_zonegroup = zg.get_id();
1385 master_zone = zg.master_zone;
1386 }
1387
1388 int ret = period_map.update(zg, cct);
1389 if (ret < 0) {
1390 return ret;
1391 }
1392 }
1393
1394 ret = period_config.read(store, realm_id);
1395 if (ret < 0 && ret != -ENOENT) {
1396 ldout(cct, 0) << "ERROR: failed to read period config: "
1397 << cpp_strerror(ret) << dendl;
1398 return ret;
1399 }
1400 return 0;
1401 }
1402
1403 int RGWPeriod::reflect()
1404 {
1405 for (auto& iter : period_map.zonegroups) {
1406 RGWZoneGroup& zg = iter.second;
1407 zg.reinit_instance(cct, store);
1408 int r = zg.write(false);
1409 if (r < 0) {
1410 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1411 return r;
1412 }
1413 if (zg.is_master_zonegroup()) {
1414 // set master as default if no default exists
1415 r = zg.set_as_default(true);
1416 if (r == 0) {
1417 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1418 << " as the default" << dendl;
1419 }
1420 }
1421 }
1422
1423 int r = period_config.write(store, realm_id);
1424 if (r < 0) {
1425 ldout(cct, 0) << "ERROR: failed to store period config: "
1426 << cpp_strerror(-r) << dendl;
1427 return r;
1428 }
1429 return 0;
1430 }
1431
1432 void RGWPeriod::fork()
1433 {
1434 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1435 predecessor_uuid = id;
1436 id = get_staging_id(realm_id);
1437 period_map.reset();
1438 realm_epoch++;
1439 }
1440
1441 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1442 {
1443 // initialize a sync status manager to read the status
1444 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1445 int r = mgr.init();
1446 if (r < 0) {
1447 return r;
1448 }
1449 r = mgr.read_sync_status(sync_status);
1450 mgr.stop();
1451 return r;
1452 }
1453
1454 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1455 std::ostream& error_stream,
1456 bool force_if_stale)
1457 {
1458 rgw_meta_sync_status status;
1459 int r = read_sync_status(store, &status);
1460 if (r < 0) {
1461 ldout(cct, 0) << "period failed to read sync status: "
1462 << cpp_strerror(-r) << dendl;
1463 return r;
1464 }
1465
1466 std::vector<std::string> markers;
1467
1468 const auto current_epoch = current_period.get_realm_epoch();
1469 if (current_epoch != status.sync_info.realm_epoch) {
1470 // no sync status markers for the current period
1471 assert(current_epoch > status.sync_info.realm_epoch);
1472 const int behind = current_epoch - status.sync_info.realm_epoch;
1473 if (!force_if_stale && current_epoch > 1) {
1474 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1475 "the current master zone in metadata sync. If this zone is promoted "
1476 "to master, any metadata changes during that time are likely to "
1477 "be lost.\n"
1478 "Waiting for this zone to catch up on metadata sync (see "
1479 "'radosgw-admin sync status') is recommended.\n"
1480 "To promote this zone to master anyway, add the flag "
1481 "--yes-i-really-mean-it." << std::endl;
1482 return -EINVAL;
1483 }
1484 // empty sync status markers - other zones will skip this period during
1485 // incremental metadata sync
1486 markers.resize(status.sync_info.num_shards);
1487 } else {
1488 markers.reserve(status.sync_info.num_shards);
1489 for (auto& i : status.sync_markers) {
1490 auto& marker = i.second;
1491 // filter out markers from other periods
1492 if (marker.realm_epoch != current_epoch) {
1493 marker.marker.clear();
1494 }
1495 markers.emplace_back(std::move(marker.marker));
1496 }
1497 }
1498
1499 std::swap(sync_status, markers);
1500 return 0;
1501 }
1502
1503 int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1504 std::ostream& error_stream, bool force_if_stale)
1505 {
1506 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1507 // gateway must be in the master zone to commit
1508 if (master_zone != store->get_zone_params().get_id()) {
1509 error_stream << "Cannot commit period on zone "
1510 << store->get_zone_params().get_id() << ", it must be sent to "
1511 "the period's master zone " << master_zone << '.' << std::endl;
1512 return -EINVAL;
1513 }
1514 // period predecessor must match current period
1515 if (predecessor_uuid != current_period.get_id()) {
1516 error_stream << "Period predecessor " << predecessor_uuid
1517 << " does not match current period " << current_period.get_id()
1518 << ". Use 'period pull' to get the latest period from the master, "
1519 "reapply your changes, and try again." << std::endl;
1520 return -EINVAL;
1521 }
1522 // realm epoch must be 1 greater than current period
1523 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1524 error_stream << "Period's realm epoch " << realm_epoch
1525 << " does not come directly after current realm epoch "
1526 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1527 "latest realm and period from the master zone, reapply your changes, "
1528 "and try again." << std::endl;
1529 return -EINVAL;
1530 }
1531 // did the master zone change?
1532 if (master_zone != current_period.get_master_zone()) {
1533 // store the current metadata sync status in the period
1534 int r = update_sync_status(current_period, error_stream, force_if_stale);
1535 if (r < 0) {
1536 ldout(cct, 0) << "failed to update metadata sync status: "
1537 << cpp_strerror(-r) << dendl;
1538 return r;
1539 }
1540 // create an object with a new period id
1541 r = create(true);
1542 if (r < 0) {
1543 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1544 return r;
1545 }
1546 // set as current period
1547 r = realm.set_current_period(*this);
1548 if (r < 0) {
1549 ldout(cct, 0) << "failed to update realm's current period: "
1550 << cpp_strerror(-r) << dendl;
1551 return r;
1552 }
1553 ldout(cct, 4) << "Promoted to master zone and committed new period "
1554 << id << dendl;
1555 realm.notify_new_period(*this);
1556 return 0;
1557 }
1558 // period must be based on current epoch
1559 if (epoch != current_period.get_epoch()) {
1560 error_stream << "Period epoch " << epoch << " does not match "
1561 "predecessor epoch " << current_period.get_epoch()
1562 << ". Use 'period pull' to get the latest epoch from the master zone, "
1563 "reapply your changes, and try again." << std::endl;
1564 return -EINVAL;
1565 }
1566 // set period as next epoch
1567 set_id(current_period.get_id());
1568 set_epoch(current_period.get_epoch() + 1);
1569 set_predecessor(current_period.get_predecessor());
1570 realm_epoch = current_period.get_realm_epoch();
1571 // write the period to rados
1572 int r = store_info(false);
1573 if (r < 0) {
1574 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1575 return r;
1576 }
1577 // set as latest epoch
1578 r = update_latest_epoch(epoch);
1579 if (r == -EEXIST) {
1580 // already have this epoch (or a more recent one)
1581 return 0;
1582 }
1583 if (r < 0) {
1584 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1585 return r;
1586 }
1587 r = reflect();
1588 if (r < 0) {
1589 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1590 return r;
1591 }
1592 ldout(cct, 4) << "Committed new epoch " << epoch
1593 << " for period " << id << dendl;
1594 realm.notify_new_period(*this);
1595 return 0;
1596 }
1597
1598 int RGWZoneParams::create_default(bool old_format)
1599 {
1600 name = default_zone_name;
1601
1602 int r = create();
1603 if (r < 0) {
1604 return r;
1605 }
1606
1607 if (old_format) {
1608 name = id;
1609 }
1610
1611 return r;
1612 }
1613
1614
1615 int get_zones_pool_set(CephContext* cct,
1616 RGWRados* store,
1617 const list<string>& zones,
1618 const string& my_zone_id,
1619 set<rgw_pool>& pool_names)
1620 {
1621 for(auto const& iter : zones) {
1622 RGWZoneParams zone(iter);
1623 int r = zone.init(cct, store);
1624 if (r < 0) {
1625 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1626 return r;
1627 }
1628 if (zone.get_id() != my_zone_id) {
1629 pool_names.insert(zone.domain_root);
1630 pool_names.insert(zone.metadata_heap);
1631 pool_names.insert(zone.control_pool);
1632 pool_names.insert(zone.gc_pool);
1633 pool_names.insert(zone.log_pool);
1634 pool_names.insert(zone.intent_log_pool);
1635 pool_names.insert(zone.usage_log_pool);
1636 pool_names.insert(zone.user_keys_pool);
1637 pool_names.insert(zone.user_email_pool);
1638 pool_names.insert(zone.user_swift_pool);
1639 pool_names.insert(zone.user_uid_pool);
1640 pool_names.insert(zone.roles_pool);
1641 pool_names.insert(zone.reshard_pool);
1642 for(auto& iter : zone.placement_pools) {
1643 pool_names.insert(iter.second.index_pool);
1644 pool_names.insert(iter.second.data_pool);
1645 pool_names.insert(iter.second.data_extra_pool);
1646 }
1647 }
1648 }
1649 return 0;
1650 }
1651
1652 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1653 const string& default_prefix,
1654 const string& default_suffix,
1655 const rgw_pool& suggested_pool)
1656 {
1657 string suggested_name = suggested_pool.to_str();
1658
1659 string prefix = default_prefix;
1660 string suffix = default_suffix;
1661
1662 if (!suggested_pool.empty()) {
1663 prefix = suggested_name.substr(0, suggested_name.find("."));
1664 suffix = suggested_name.substr(prefix.length());
1665 }
1666
1667 rgw_pool pool(prefix + suffix);
1668
1669 if (pools.find(pool) == pools.end()) {
1670 return pool;
1671 } else {
1672 while(true) {
1673 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1674 if (pools.find(pool) == pools.end()) {
1675 return pool;
1676 }
1677 }
1678 }
1679 }
1680
1681 int RGWZoneParams::fix_pool_names()
1682 {
1683
1684 list<string> zones;
1685 int r = store->list_zones(zones);
1686 if (r < 0) {
1687 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1688 }
1689
1690 set<rgw_pool> pools;
1691 r = get_zones_pool_set(cct, store, zones, id, pools);
1692 if (r < 0) {
1693 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1694 return r;
1695 }
1696
1697 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1698 if (!metadata_heap.name.empty()) {
1699 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1700 }
1701 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1702 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1703 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1704 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1705 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1706 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1707 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1708 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1709 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1710 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1711 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1712 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1713
1714 for(auto& iter : placement_pools) {
1715 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1716 iter.second.index_pool);
1717 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1718 iter.second.data_pool);
1719 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1720 iter.second.data_extra_pool);
1721 }
1722
1723 return 0;
1724 }
1725
1726 int RGWZoneParams::create(bool exclusive)
1727 {
1728 /* check for old pools config */
1729 rgw_raw_obj obj(domain_root, avail_pools);
1730 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1731 if (r < 0) {
1732 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1733 /* a new system, let's set new placement info */
1734 RGWZonePlacementInfo default_placement;
1735 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1736 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1737 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1738 placement_pools["default-placement"] = default_placement;
1739 }
1740
1741 r = fix_pool_names();
1742 if (r < 0) {
1743 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1744 return r;
1745 }
1746
1747 r = RGWSystemMetaObj::create(exclusive);
1748 if (r < 0) {
1749 return r;
1750 }
1751
1752 // try to set as default. may race with another create, so pass exclusive=true
1753 // so we don't override an existing default
1754 r = set_as_default(true);
1755 if (r < 0 && r != -EEXIST) {
1756 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1757 }
1758
1759 return 0;
1760 }
1761
1762 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1763 {
1764 if (cct->_conf->rgw_zone_root_pool.empty()) {
1765 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1766 }
1767
1768 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1769 }
1770
1771 const string RGWZoneParams::get_default_oid(bool old_format)
1772 {
1773 if (old_format) {
1774 return cct->_conf->rgw_default_zone_info_oid;
1775 }
1776
1777 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1778 }
1779
1780 const string& RGWZoneParams::get_names_oid_prefix()
1781 {
1782 return zone_names_oid_prefix;
1783 }
1784
1785 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1786 {
1787 return zone_info_oid_prefix;
1788 }
1789
1790 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1791 return cct->_conf->rgw_zone;
1792 }
1793
1794 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1795 {
1796 if (name.empty()) {
1797 name = cct->_conf->rgw_zone;
1798 }
1799
1800 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1801 }
1802
1803 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1804 {
1805 if (realm_id.empty()) {
1806 /* try using default realm */
1807 RGWRealm realm;
1808 int ret = realm.init(cct, store);
1809 //no default realm exist
1810 if (ret < 0) {
1811 return read_id(default_zone_name, default_id);
1812 }
1813 realm_id = realm.get_id();
1814 }
1815
1816 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1817 }
1818
1819
1820 int RGWZoneParams::set_as_default(bool exclusive)
1821 {
1822 if (realm_id.empty()) {
1823 /* try using default realm */
1824 RGWRealm realm;
1825 int ret = realm.init(cct, store);
1826 if (ret < 0) {
1827 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1828 return -EINVAL;
1829 }
1830 realm_id = realm.get_id();
1831 }
1832
1833 return RGWSystemMetaObj::set_as_default(exclusive);
1834 }
1835
1836 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1837 {
1838 static const std::string NONE{"none"};
1839 auto p = placement_pools.find(placement_rule);
1840 if (p == placement_pools.end()) {
1841 return NONE;
1842 }
1843 const auto& type = p->second.compression_type;
1844 return !type.empty() ? type : NONE;
1845 }
1846
1847 void RGWPeriodMap::encode(bufferlist& bl) const {
1848 ENCODE_START(2, 1, bl);
1849 ::encode(id, bl);
1850 ::encode(zonegroups, bl);
1851 ::encode(master_zonegroup, bl);
1852 ::encode(short_zone_ids, bl);
1853 ENCODE_FINISH(bl);
1854 }
1855
1856 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1857 DECODE_START(2, bl);
1858 ::decode(id, bl);
1859 ::decode(zonegroups, bl);
1860 ::decode(master_zonegroup, bl);
1861 if (struct_v >= 2) {
1862 ::decode(short_zone_ids, bl);
1863 }
1864 DECODE_FINISH(bl);
1865
1866 zonegroups_by_api.clear();
1867 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1868 iter != zonegroups.end(); ++iter) {
1869 RGWZoneGroup& zonegroup = iter->second;
1870 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1871 if (zonegroup.is_master_zonegroup()) {
1872 master_zonegroup = zonegroup.get_id();
1873 }
1874 }
1875 }
1876
1877 // run an MD5 hash on the zone_id and return the first 32 bits
1878 static uint32_t gen_short_zone_id(const std::string zone_id)
1879 {
1880 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1881 MD5 hash;
1882 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1883 hash.Final(md5);
1884
1885 uint32_t short_id;
1886 memcpy((char *)&short_id, md5, sizeof(short_id));
1887 return std::max(short_id, 1u);
1888 }
1889
1890 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1891 {
1892 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1893 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1894 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1895 return -EINVAL;
1896 }
1897 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1898 if (iter != zonegroups.end()) {
1899 RGWZoneGroup& old_zonegroup = iter->second;
1900 if (!old_zonegroup.api_name.empty()) {
1901 zonegroups_by_api.erase(old_zonegroup.api_name);
1902 }
1903 }
1904 zonegroups[zonegroup.get_id()] = zonegroup;
1905
1906 if (!zonegroup.api_name.empty()) {
1907 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1908 }
1909
1910 if (zonegroup.is_master_zonegroup()) {
1911 master_zonegroup = zonegroup.get_id();
1912 } else if (master_zonegroup == zonegroup.get_id()) {
1913 master_zonegroup = "";
1914 }
1915
1916 for (auto& i : zonegroup.zones) {
1917 auto& zone = i.second;
1918 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1919 continue;
1920 }
1921 // calculate the zone's short id
1922 uint32_t short_id = gen_short_zone_id(zone.id);
1923
1924 // search for an existing zone with the same short id
1925 for (auto& s : short_zone_ids) {
1926 if (s.second == short_id) {
1927 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1928 << ") generates the same short_zone_id " << short_id
1929 << " as existing zone id " << s.first << dendl;
1930 return -EEXIST;
1931 }
1932 }
1933
1934 short_zone_ids[zone.id] = short_id;
1935 }
1936
1937 return 0;
1938 }
1939
1940 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1941 {
1942 auto i = short_zone_ids.find(zone_id);
1943 if (i == short_zone_ids.end()) {
1944 return 0;
1945 }
1946 return i->second;
1947 }
1948
1949 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1950 {
1951
1952 RGWPeriod period;
1953 int ret = period.init(cct, store);
1954 if (ret < 0) {
1955 cerr << "failed to read current period info: " << cpp_strerror(ret);
1956 return ret;
1957 }
1958
1959 bucket_quota = period.get_config().bucket_quota;
1960 user_quota = period.get_config().user_quota;
1961 zonegroups = period.get_map().zonegroups;
1962 zonegroups_by_api = period.get_map().zonegroups_by_api;
1963 master_zonegroup = period.get_map().master_zonegroup;
1964
1965 return 0;
1966 }
1967
1968 void RGWRegionMap::encode(bufferlist& bl) const {
1969 ENCODE_START( 3, 1, bl);
1970 ::encode(regions, bl);
1971 ::encode(master_region, bl);
1972 ::encode(bucket_quota, bl);
1973 ::encode(user_quota, bl);
1974 ENCODE_FINISH(bl);
1975 }
1976
1977 void RGWRegionMap::decode(bufferlist::iterator& bl) {
1978 DECODE_START(3, bl);
1979 ::decode(regions, bl);
1980 ::decode(master_region, bl);
1981 if (struct_v >= 2)
1982 ::decode(bucket_quota, bl);
1983 if (struct_v >= 3)
1984 ::decode(user_quota, bl);
1985 DECODE_FINISH(bl);
1986 }
1987
1988 void RGWZoneGroupMap::encode(bufferlist& bl) const {
1989 ENCODE_START( 3, 1, bl);
1990 ::encode(zonegroups, bl);
1991 ::encode(master_zonegroup, bl);
1992 ::encode(bucket_quota, bl);
1993 ::encode(user_quota, bl);
1994 ENCODE_FINISH(bl);
1995 }
1996
1997 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
1998 DECODE_START(3, bl);
1999 ::decode(zonegroups, bl);
2000 ::decode(master_zonegroup, bl);
2001 if (struct_v >= 2)
2002 ::decode(bucket_quota, bl);
2003 if (struct_v >= 3)
2004 ::decode(user_quota, bl);
2005 DECODE_FINISH(bl);
2006
2007 zonegroups_by_api.clear();
2008 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2009 iter != zonegroups.end(); ++iter) {
2010 RGWZoneGroup& zonegroup = iter->second;
2011 zonegroups_by_api[zonegroup.api_name] = zonegroup;
2012 if (zonegroup.is_master_zonegroup()) {
2013 master_zonegroup = zonegroup.get_name();
2014 }
2015 }
2016 }
2017
2018 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2019 {
2020 obj_version *check_objv = version_for_check();
2021
2022 if (check_objv) {
2023 cls_version_check(*op, *check_objv, VER_COND_EQ);
2024 }
2025
2026 cls_version_read(*op, &read_version);
2027 }
2028
2029 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2030 {
2031 obj_version *check_objv = version_for_check();
2032 obj_version *modify_version = version_for_write();
2033
2034 if (check_objv) {
2035 cls_version_check(*op, *check_objv, VER_COND_EQ);
2036 }
2037
2038 if (modify_version) {
2039 cls_version_set(*op, *modify_version);
2040 } else {
2041 cls_version_inc(*op);
2042 }
2043 }
2044
2045 void RGWObjManifest::obj_iterator::operator++()
2046 {
2047 if (manifest->explicit_objs) {
2048 ++explicit_iter;
2049
2050 if (explicit_iter == manifest->objs.end()) {
2051 ofs = manifest->obj_size;
2052 return;
2053 }
2054
2055 update_explicit_pos();
2056
2057 update_location();
2058 return;
2059 }
2060
2061 uint64_t obj_size = manifest->get_obj_size();
2062 uint64_t head_size = manifest->get_head_size();
2063
2064 if (ofs == obj_size) {
2065 return;
2066 }
2067
2068 if (manifest->rules.empty()) {
2069 return;
2070 }
2071
2072 /* are we still pointing at the head? */
2073 if (ofs < head_size) {
2074 rule_iter = manifest->rules.begin();
2075 RGWObjManifestRule *rule = &rule_iter->second;
2076 ofs = MIN(head_size, obj_size);
2077 stripe_ofs = ofs;
2078 cur_stripe = 1;
2079 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2080 if (rule->part_size > 0) {
2081 stripe_size = MIN(stripe_size, rule->part_size);
2082 }
2083 update_location();
2084 return;
2085 }
2086
2087 RGWObjManifestRule *rule = &rule_iter->second;
2088
2089 stripe_ofs += rule->stripe_max_size;
2090 cur_stripe++;
2091 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2092
2093 if (rule->part_size > 0) {
2094 /* multi part, multi stripes object */
2095
2096 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2097
2098 if (stripe_ofs >= part_ofs + rule->part_size) {
2099 /* moved to the next part */
2100 cur_stripe = 0;
2101 part_ofs += rule->part_size;
2102 stripe_ofs = part_ofs;
2103
2104 bool last_rule = (next_rule_iter == manifest->rules.end());
2105 /* move to the next rule? */
2106 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2107 rule_iter = next_rule_iter;
2108 last_rule = (next_rule_iter == manifest->rules.end());
2109 if (!last_rule) {
2110 ++next_rule_iter;
2111 }
2112 cur_part_id = rule_iter->second.start_part_num;
2113 } else {
2114 cur_part_id++;
2115 }
2116
2117 rule = &rule_iter->second;
2118 }
2119
2120 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2121 }
2122
2123 cur_override_prefix = rule->override_prefix;
2124
2125 ofs = stripe_ofs;
2126 if (ofs > obj_size) {
2127 ofs = obj_size;
2128 stripe_ofs = ofs;
2129 stripe_size = 0;
2130 }
2131
2132 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2133 update_location();
2134 }
2135
2136 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2137 {
2138 manifest = _m;
2139
2140 manifest->set_tail_placement(placement_rule, _b);
2141 manifest->set_head(placement_rule, _obj, 0);
2142 last_ofs = 0;
2143
2144 if (manifest->get_prefix().empty()) {
2145 char buf[33];
2146 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2147
2148 string oid_prefix = ".";
2149 oid_prefix.append(buf);
2150 oid_prefix.append("_");
2151
2152 manifest->set_prefix(oid_prefix);
2153 }
2154
2155 bool found = manifest->get_rule(0, &rule);
2156 if (!found) {
2157 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2158 return -EIO;
2159 }
2160
2161 uint64_t head_size = manifest->get_head_size();
2162
2163 if (head_size > 0) {
2164 cur_stripe_size = head_size;
2165 } else {
2166 cur_stripe_size = rule.stripe_max_size;
2167 }
2168
2169 cur_part_id = rule.start_part_num;
2170
2171 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2172
2173 // Normal object which not generated through copy operation
2174 manifest->set_tail_instance(_obj.key.instance);
2175
2176 manifest->update_iterators();
2177
2178 return 0;
2179 }
2180
2181 int RGWObjManifest::generator::create_next(uint64_t ofs)
2182 {
2183 if (ofs < last_ofs) /* only going forward */
2184 return -EINVAL;
2185
2186 uint64_t max_head_size = manifest->get_max_head_size();
2187
2188 if (ofs < max_head_size) {
2189 manifest->set_head_size(ofs);
2190 }
2191
2192 if (ofs >= max_head_size) {
2193 manifest->set_head_size(max_head_size);
2194 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2195 cur_stripe_size = rule.stripe_max_size;
2196
2197 if (cur_part_id == 0 && max_head_size > 0) {
2198 cur_stripe++;
2199 }
2200 }
2201
2202 last_ofs = ofs;
2203 manifest->set_obj_size(ofs);
2204
2205 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2206
2207 manifest->update_iterators();
2208
2209 return 0;
2210 }
2211
2212 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2213 {
2214 return begin_iter;
2215 }
2216
2217 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2218 {
2219 return end_iter;
2220 }
2221
2222 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2223 {
2224 if (ofs > obj_size) {
2225 ofs = obj_size;
2226 }
2227 RGWObjManifest::obj_iterator iter(this);
2228 iter.seek(ofs);
2229 return iter;
2230 }
2231
2232 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2233 {
2234 if (explicit_objs || m.explicit_objs) {
2235 return append_explicit(m, zonegroup, zone_params);
2236 }
2237
2238 if (rules.empty()) {
2239 *this = m;
2240 return 0;
2241 }
2242
2243 string override_prefix;
2244
2245 if (prefix.empty()) {
2246 prefix = m.prefix;
2247 }
2248
2249 if (prefix != m.prefix) {
2250 override_prefix = m.prefix;
2251 }
2252
2253 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2254 if (miter == m.rules.end()) {
2255 return append_explicit(m, zonegroup, zone_params);
2256 }
2257
2258 for (; miter != m.rules.end(); ++miter) {
2259 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2260
2261 RGWObjManifestRule& rule = last_rule->second;
2262
2263 if (rule.part_size == 0) {
2264 rule.part_size = obj_size - rule.start_ofs;
2265 }
2266
2267 RGWObjManifestRule& next_rule = miter->second;
2268 if (!next_rule.part_size) {
2269 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2270 }
2271
2272 string rule_prefix = prefix;
2273 if (!rule.override_prefix.empty()) {
2274 rule_prefix = rule.override_prefix;
2275 }
2276
2277 string next_rule_prefix = m.prefix;
2278 if (!next_rule.override_prefix.empty()) {
2279 next_rule_prefix = next_rule.override_prefix;
2280 }
2281
2282 if (rule.part_size != next_rule.part_size ||
2283 rule.stripe_max_size != next_rule.stripe_max_size ||
2284 rule_prefix != next_rule_prefix) {
2285 if (next_rule_prefix != prefix) {
2286 append_rules(m, miter, &next_rule_prefix);
2287 } else {
2288 append_rules(m, miter, NULL);
2289 }
2290 break;
2291 }
2292
2293 uint64_t expected_part_num = rule.start_part_num + 1;
2294 if (rule.part_size > 0) {
2295 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2296 }
2297
2298 if (expected_part_num != next_rule.start_part_num) {
2299 append_rules(m, miter, NULL);
2300 break;
2301 }
2302 }
2303
2304 set_obj_size(obj_size + m.obj_size);
2305
2306 return 0;
2307 }
2308
2309 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2310 {
2311 return append(m, store->get_zonegroup(), store->get_zone_params());
2312 }
2313
2314 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2315 string *override_prefix)
2316 {
2317 for (; miter != m.rules.end(); ++miter) {
2318 RGWObjManifestRule rule = miter->second;
2319 rule.start_ofs += obj_size;
2320 if (override_prefix)
2321 rule.override_prefix = *override_prefix;
2322 rules[rule.start_ofs] = rule;
2323 }
2324 }
2325
2326 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2327 {
2328 if (explicit_objs) {
2329 return;
2330 }
2331 obj_iterator iter = obj_begin();
2332
2333 while (iter != obj_end()) {
2334 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2335 const rgw_obj_select& os = iter.get_location();
2336 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2337 part.loc_ofs = 0;
2338
2339 uint64_t ofs = iter.get_stripe_ofs();
2340
2341 if (ofs == 0) {
2342 part.loc = obj;
2343 } else {
2344 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2345 }
2346 ++iter;
2347 uint64_t next_ofs = iter.get_stripe_ofs();
2348
2349 part.size = next_ofs - ofs;
2350 }
2351
2352 explicit_objs = true;
2353 rules.clear();
2354 prefix.clear();
2355 }
2356
2357 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2358 {
2359 if (!explicit_objs) {
2360 convert_to_explicit(zonegroup, zone_params);
2361 }
2362 if (!m.explicit_objs) {
2363 m.convert_to_explicit(zonegroup, zone_params);
2364 }
2365 map<uint64_t, RGWObjManifestPart>::iterator iter;
2366 uint64_t base = obj_size;
2367 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2368 RGWObjManifestPart& part = iter->second;
2369 objs[base + iter->first] = part;
2370 }
2371 obj_size += m.obj_size;
2372
2373 return 0;
2374 }
2375
2376 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2377 {
2378 if (rules.empty()) {
2379 return false;
2380 }
2381
2382 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2383 if (iter != rules.begin()) {
2384 --iter;
2385 }
2386
2387 *rule = iter->second;
2388
2389 return true;
2390 }
2391
2392 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2393 {
2394 write_version.ver = 1;
2395 #define TAG_LEN 24
2396
2397 write_version.tag.clear();
2398 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2399 }
2400
2401 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2402 real_time *mtime, real_time set_mtime,
2403 map<string, bufferlist>& attrs, real_time delete_at,
2404 const char *if_match, const char *if_nomatch, const string *user_data,
2405 rgw_zone_set *zones_trace)
2406 {
2407 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2408 if (r < 0)
2409 return r;
2410
2411 is_complete = !canceled;
2412 return 0;
2413 }
2414
2415 CephContext *RGWPutObjProcessor::ctx()
2416 {
2417 return store->ctx();
2418 }
2419
2420 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2421 {
2422 drain_pending();
2423
2424 if (is_complete)
2425 return;
2426
2427 set<rgw_raw_obj>::iterator iter;
2428 bool need_to_remove_head = false;
2429 rgw_raw_obj raw_head;
2430
2431 if (!head_obj.empty()) {
2432 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2433 }
2434
2435 /**
2436 * We should delete the object in the "multipart" namespace to avoid race condition.
2437 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2438 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2439 * written by the second upload may be deleted by the first upload.
2440 * details is describled on #11749
2441 *
2442 * The above comment still stands, but instead of searching for a specific object in the multipart
2443 * namespace, we just make sure that we remove the object that is marked as the head object after
2444 * we remove all the other raw objects. Note that we use different call to remove the head object,
2445 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2446 */
2447 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2448 const rgw_raw_obj& obj = *iter;
2449 if (!head_obj.empty() && obj == raw_head) {
2450 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2451 need_to_remove_head = true;
2452 continue;
2453 }
2454
2455 int r = store->delete_raw_obj(obj);
2456 if (r < 0 && r != -ENOENT) {
2457 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2458 }
2459 }
2460
2461 if (need_to_remove_head) {
2462 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2463 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2464 if (r < 0 && r != -ENOENT) {
2465 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2466 }
2467 }
2468 }
2469
2470 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2471 {
2472 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2473 obj_len = abs_ofs + bl.length();
2474
2475 if (!(obj == last_written_obj)) {
2476 last_written_obj = obj;
2477 }
2478
2479 // For the first call pass -1 as the offset to
2480 // do a write_full.
2481 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2482 }
2483
2484 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2485 {
2486 struct put_obj_aio_info info;
2487 info = pending.front();
2488 pending.pop_front();
2489 pending_size -= info.size;
2490 return info;
2491 }
2492
2493 int RGWPutObjProcessor_Aio::wait_pending_front()
2494 {
2495 if (pending.empty()) {
2496 return 0;
2497 }
2498 struct put_obj_aio_info info = pop_pending();
2499 int ret = store->aio_wait(info.handle);
2500
2501 if (ret >= 0) {
2502 add_written_obj(info.obj);
2503 }
2504
2505 return ret;
2506 }
2507
2508 bool RGWPutObjProcessor_Aio::pending_has_completed()
2509 {
2510 if (pending.empty())
2511 return false;
2512
2513 struct put_obj_aio_info& info = pending.front();
2514 return store->aio_completed(info.handle);
2515 }
2516
2517 int RGWPutObjProcessor_Aio::drain_pending()
2518 {
2519 int ret = 0;
2520 while (!pending.empty()) {
2521 int r = wait_pending_front();
2522 if (r < 0)
2523 ret = r;
2524 }
2525 return ret;
2526 }
2527
2528 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2529 {
2530 bool _wait = need_to_wait;
2531
2532 if (handle) {
2533 struct put_obj_aio_info info;
2534 info.handle = handle;
2535 info.obj = obj;
2536 info.size = size;
2537 pending_size += size;
2538 pending.push_back(info);
2539 }
2540 size_t orig_size = pending_size;
2541
2542 /* first drain complete IOs */
2543 while (pending_has_completed()) {
2544 int r = wait_pending_front();
2545 if (r < 0)
2546 return r;
2547
2548 _wait = false;
2549 }
2550
2551 /* resize window in case messages are draining too fast */
2552 if (orig_size - pending_size >= window_size) {
2553 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2554 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2555 if (window_size > max_window_size) {
2556 window_size = max_window_size;
2557 }
2558 }
2559
2560 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2561 if (pending_size > window_size || _wait) {
2562 int r = wait_pending_front();
2563 if (r < 0)
2564 return r;
2565 }
2566 return 0;
2567 }
2568
2569 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2570 {
2571 if (ofs >= next_part_ofs) {
2572 int r = prepare_next_part(ofs);
2573 if (r < 0) {
2574 return r;
2575 }
2576 }
2577
2578 *pobj = cur_obj;
2579
2580 if (!bl.length()) {
2581 *phandle = nullptr;
2582 return 0;
2583 }
2584
2585 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2586 }
2587
2588 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2589 {
2590 RGWPutObjProcessor::prepare(store, oid_rand);
2591
2592 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2593
2594 return 0;
2595 }
2596
2597 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2598 {
2599 *phandle = NULL;
2600 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2601
2602 pending_data_bl.claim_append(bl);
2603 if (pending_data_bl.length() < max_write_size) {
2604 *again = false;
2605 return 0;
2606 }
2607
2608 pending_data_bl.splice(0, max_write_size, &bl);
2609
2610 /* do we have enough data pending accumulated that needs to be written? */
2611 *again = (pending_data_bl.length() >= max_chunk_size);
2612
2613 if (!data_ofs && !immutable_head()) {
2614 first_chunk.claim(bl);
2615 obj_len = (uint64_t)first_chunk.length();
2616 int r = prepare_next_part(obj_len);
2617 if (r < 0) {
2618 return r;
2619 }
2620 data_ofs = obj_len;
2621 return 0;
2622 }
2623 off_t write_ofs = data_ofs;
2624 data_ofs = write_ofs + bl.length();
2625 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2626 we could be racing with another upload, to the same
2627 object and cleanup can be messy */
2628 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2629 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2630 bl.clear();
2631 }
2632 return ret;
2633 }
2634
2635
2636 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2637 {
2638 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2639
2640 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2641 if (r < 0) {
2642 return r;
2643 }
2644
2645 return 0;
2646 }
2647
2648 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2649 {
2650 head_obj.init(bucket, obj_str);
2651
2652 int r = prepare_init(store, oid_rand);
2653 if (r < 0) {
2654 return r;
2655 }
2656
2657 if (!version_id.empty()) {
2658 head_obj.key.set_instance(version_id);
2659 } else if (versioned_object) {
2660 store->gen_rand_obj_instance_name(&head_obj);
2661 }
2662
2663 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2664
2665 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2666 if (r < 0) {
2667 return r;
2668 }
2669
2670 return 0;
2671 }
2672
2673 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2674
2675 int ret = manifest_gen.create_next(ofs);
2676 if (ret < 0) {
2677 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2678 return ret;
2679 }
2680 cur_part_ofs = ofs;
2681 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2682 cur_obj = manifest_gen.get_cur_obj(store);
2683
2684 return 0;
2685 }
2686
2687 int RGWPutObjProcessor_Atomic::complete_parts()
2688 {
2689 if (obj_len > (uint64_t)cur_part_ofs) {
2690 return prepare_next_part(obj_len);
2691 }
2692 return 0;
2693 }
2694
2695 int RGWPutObjProcessor_Atomic::complete_writing_data()
2696 {
2697 if (!data_ofs && !immutable_head()) {
2698 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2699 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2700 * clobber first_chunk
2701 */
2702 if (pending_data_bl.length() > 0) {
2703 first_chunk.claim(pending_data_bl);
2704 }
2705 obj_len = (uint64_t)first_chunk.length();
2706 }
2707 while (pending_data_bl.length()) {
2708 void *handle = nullptr;
2709 rgw_raw_obj obj;
2710 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2711 if (max_write_size > pending_data_bl.length()) {
2712 max_write_size = pending_data_bl.length();
2713 }
2714 bufferlist bl;
2715 pending_data_bl.splice(0, max_write_size, &bl);
2716 uint64_t write_len = bl.length();
2717 int r = write_data(bl, data_ofs, &handle, &obj, false);
2718 if (r < 0) {
2719 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2720 return r;
2721 }
2722 data_ofs += write_len;
2723 r = throttle_data(handle, obj, write_len, false);
2724 if (r < 0) {
2725 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2726 return r;
2727 }
2728
2729 if (data_ofs >= next_part_ofs) {
2730 r = prepare_next_part(data_ofs);
2731 if (r < 0) {
2732 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2733 return r;
2734 }
2735 }
2736 }
2737 int r = complete_parts();
2738 if (r < 0) {
2739 return r;
2740 }
2741
2742 r = drain_pending();
2743 if (r < 0)
2744 return r;
2745
2746 return 0;
2747 }
2748
2749 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2750 real_time *mtime, real_time set_mtime,
2751 map<string, bufferlist>& attrs,
2752 real_time delete_at,
2753 const char *if_match,
2754 const char *if_nomatch, const string *user_data,
2755 rgw_zone_set *zones_trace) {
2756 int r = complete_writing_data();
2757 if (r < 0)
2758 return r;
2759
2760 obj_ctx.obj.set_atomic(head_obj);
2761
2762 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2763
2764 /* some object types shouldn't be versioned, e.g., multipart parts */
2765 op_target.set_versioning_disabled(!versioned_object);
2766
2767 RGWRados::Object::Write obj_op(&op_target);
2768
2769 obj_op.meta.data = &first_chunk;
2770 obj_op.meta.manifest = &manifest;
2771 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2772 obj_op.meta.if_match = if_match;
2773 obj_op.meta.if_nomatch = if_nomatch;
2774 obj_op.meta.mtime = mtime;
2775 obj_op.meta.set_mtime = set_mtime;
2776 obj_op.meta.owner = bucket_info.owner;
2777 obj_op.meta.flags = PUT_OBJ_CREATE;
2778 obj_op.meta.olh_epoch = olh_epoch;
2779 obj_op.meta.delete_at = delete_at;
2780 obj_op.meta.user_data = user_data;
2781 obj_op.meta.zones_trace = zones_trace;
2782 obj_op.meta.modify_tail = true;
2783
2784 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2785 if (r < 0) {
2786 return r;
2787 }
2788
2789 canceled = obj_op.meta.canceled;
2790
2791 return 0;
2792 }
2793
2794 const char* RGWRados::admin_commands[4][3] = {
2795 { "cache list",
2796 "cache list name=filter,type=CephString,req=false",
2797 "cache list [filter_str]: list object cache, possibly matching substrings" },
2798 { "cache inspect",
2799 "cache inspect name=target,type=CephString,req=true",
2800 "cache inspect target: print cache element" },
2801 { "cache erase",
2802 "cache erase name=target,type=CephString,req=true",
2803 "cache erase target: erase element from cache" },
2804 { "cache zap",
2805 "cache zap",
2806 "cache zap: erase all elements from cache" }
2807 };
2808
2809
2810 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2811 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2812 if (r < 0)
2813 return r;
2814 return 0;
2815 }
2816
2817 int RGWRados::unwatch(uint64_t watch_handle)
2818 {
2819 int r = control_pool_ctx.unwatch2(watch_handle);
2820 if (r < 0) {
2821 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2822 return r;
2823 }
2824 r = rados[0].watch_flush();
2825 if (r < 0) {
2826 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2827 return r;
2828 }
2829 return 0;
2830 }
2831
2832 void RGWRados::add_watcher(int i)
2833 {
2834 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2835 Mutex::Locker l(watchers_lock);
2836 watchers_set.insert(i);
2837 if (watchers_set.size() == (size_t)num_watchers) {
2838 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2839 set_cache_enabled(true);
2840 }
2841 }
2842
2843 void RGWRados::remove_watcher(int i)
2844 {
2845 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2846 Mutex::Locker l(watchers_lock);
2847 size_t orig_size = watchers_set.size();
2848 watchers_set.erase(i);
2849 if (orig_size == (size_t)num_watchers &&
2850 watchers_set.size() < orig_size) { /* actually removed */
2851 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2852 set_cache_enabled(false);
2853 }
2854 }
2855
2856 class RGWWatcher : public librados::WatchCtx2 {
2857 RGWRados *rados;
2858 int index;
2859 string oid;
2860 uint64_t watch_handle;
2861
2862 class C_ReinitWatch : public Context {
2863 RGWWatcher *watcher;
2864 public:
2865 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2866 void finish(int r) override {
2867 watcher->reinit();
2868 }
2869 };
2870 public:
2871 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2872 void handle_notify(uint64_t notify_id,
2873 uint64_t cookie,
2874 uint64_t notifier_id,
2875 bufferlist& bl) override {
2876 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2877 << " notify_id " << notify_id
2878 << " cookie " << cookie
2879 << " notifier " << notifier_id
2880 << " bl.length()=" << bl.length() << dendl;
2881 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2882
2883 bufferlist reply_bl; // empty reply payload
2884 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2885 }
2886 void handle_error(uint64_t cookie, int err) override {
2887 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2888 << " err " << cpp_strerror(err) << dendl;
2889 rados->remove_watcher(index);
2890 rados->schedule_context(new C_ReinitWatch(this));
2891 }
2892
2893 void reinit() {
2894 int ret = unregister_watch();
2895 if (ret < 0) {
2896 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2897 return;
2898 }
2899 ret = register_watch();
2900 if (ret < 0) {
2901 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2902 return;
2903 }
2904 }
2905
2906 int unregister_watch() {
2907 int r = rados->unwatch(watch_handle);
2908 if (r < 0) {
2909 return r;
2910 }
2911 rados->remove_watcher(index);
2912 return 0;
2913 }
2914
2915 int register_watch() {
2916 int r = rados->watch(oid, &watch_handle, this);
2917 if (r < 0) {
2918 return r;
2919 }
2920 rados->add_watcher(index);
2921 return 0;
2922 }
2923 };
2924
2925 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2926 RGWRados *store;
2927 RGWHTTPManager http_manager;
2928
2929 public:
2930 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2931 http_manager(store->ctx(), completion_mgr) {
2932 http_manager.set_threaded();
2933 }
2934
2935 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2936 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2937 { "notify", NULL },
2938 { NULL, NULL } };
2939
2940 list<RGWCoroutinesStack *> stacks;
2941 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2942 RGWRESTConn *conn = iter->second;
2943 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2944 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2945
2946 stacks.push_back(stack);
2947 }
2948 return run(stacks);
2949 }
2950 };
2951
2952 class RGWDataNotifierManager : public RGWCoroutinesManager {
2953 RGWRados *store;
2954 RGWHTTPManager http_manager;
2955
2956 public:
2957 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2958 http_manager(store->ctx(), completion_mgr) {
2959 http_manager.set_threaded();
2960 }
2961
2962 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2963 rgw_http_param_pair pairs[] = { { "type", "data" },
2964 { "notify", NULL },
2965 { "source-zone", store->get_zone_params().get_id().c_str() },
2966 { NULL, NULL } };
2967
2968 list<RGWCoroutinesStack *> stacks;
2969 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2970 RGWRESTConn *conn = iter->second;
2971 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2972 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2973
2974 stacks.push_back(stack);
2975 }
2976 return run(stacks);
2977 }
2978 };
2979
2980 class RGWRadosThread {
2981 class Worker : public Thread {
2982 CephContext *cct;
2983 RGWRadosThread *processor;
2984 Mutex lock;
2985 Cond cond;
2986
2987 void wait() {
2988 Mutex::Locker l(lock);
2989 cond.Wait(lock);
2990 };
2991
2992 void wait_interval(const utime_t& wait_time) {
2993 Mutex::Locker l(lock);
2994 cond.WaitInterval(lock, wait_time);
2995 }
2996
2997 public:
2998 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
2999 void *entry() override;
3000 void signal() {
3001 Mutex::Locker l(lock);
3002 cond.Signal();
3003 }
3004 };
3005
3006 Worker *worker;
3007
3008 protected:
3009 CephContext *cct;
3010 RGWRados *store;
3011
3012 std::atomic<bool> down_flag = { false };
3013
3014 string thread_name;
3015
3016 virtual uint64_t interval_msec() = 0;
3017 virtual void stop_process() {}
3018 public:
3019 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3020 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3021 virtual ~RGWRadosThread() {
3022 stop();
3023 }
3024
3025 virtual int init() { return 0; }
3026 virtual int process() = 0;
3027
3028 bool going_down() { return down_flag; }
3029
3030 void start();
3031 void stop();
3032
3033 void signal() {
3034 if (worker) {
3035 worker->signal();
3036 }
3037 }
3038 };
3039
3040 void RGWRadosThread::start()
3041 {
3042 worker = new Worker(cct, this);
3043 worker->create(thread_name.c_str());
3044 }
3045
3046 void RGWRadosThread::stop()
3047 {
3048 down_flag = true;
3049 stop_process();
3050 if (worker) {
3051 worker->signal();
3052 worker->join();
3053 }
3054 delete worker;
3055 worker = NULL;
3056 }
3057
3058 void *RGWRadosThread::Worker::entry() {
3059 uint64_t msec = processor->interval_msec();
3060 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3061
3062 do {
3063 utime_t start = ceph_clock_now();
3064 int r = processor->process();
3065 if (r < 0) {
3066 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3067 }
3068
3069 if (processor->going_down())
3070 break;
3071
3072 utime_t end = ceph_clock_now();
3073 end -= start;
3074
3075 uint64_t cur_msec = processor->interval_msec();
3076 if (cur_msec != msec) { /* was it reconfigured? */
3077 msec = cur_msec;
3078 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3079 }
3080
3081 if (cur_msec > 0) {
3082 if (interval <= end)
3083 continue; // next round
3084
3085 utime_t wait_time = interval;
3086 wait_time -= end;
3087
3088 wait_interval(wait_time);
3089 } else {
3090 wait();
3091 }
3092 } while (!processor->going_down());
3093
3094 return NULL;
3095 }
3096
3097 class RGWMetaNotifier : public RGWRadosThread {
3098 RGWMetaNotifierManager notify_mgr;
3099 RGWMetadataLog *const log;
3100
3101 uint64_t interval_msec() override {
3102 return cct->_conf->rgw_md_notify_interval_msec;
3103 }
3104 public:
3105 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3106 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3107
3108 int process() override;
3109 };
3110
3111 int RGWMetaNotifier::process()
3112 {
3113 set<int> shards;
3114
3115 log->read_clear_modified(shards);
3116
3117 if (shards.empty()) {
3118 return 0;
3119 }
3120
3121 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3122 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3123 }
3124
3125 notify_mgr.notify_all(store->zone_conn_map, shards);
3126
3127 return 0;
3128 }
3129
3130 class RGWDataNotifier : public RGWRadosThread {
3131 RGWDataNotifierManager notify_mgr;
3132
3133 uint64_t interval_msec() override {
3134 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
3135 }
3136 public:
3137 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3138
3139 int process() override;
3140 };
3141
3142 int RGWDataNotifier::process()
3143 {
3144 if (!store->data_log) {
3145 return 0;
3146 }
3147
3148 map<int, set<string> > shards;
3149
3150 store->data_log->read_clear_modified(shards);
3151
3152 if (shards.empty()) {
3153 return 0;
3154 }
3155
3156 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3157 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3158 }
3159
3160 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3161
3162 return 0;
3163 }
3164
3165 class RGWSyncProcessorThread : public RGWRadosThread {
3166 public:
3167 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3168 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3169 ~RGWSyncProcessorThread() override {}
3170 int init() override = 0 ;
3171 int process() override = 0;
3172 };
3173
3174 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3175 {
3176 RGWMetaSyncStatusManager sync;
3177
3178 uint64_t interval_msec() override {
3179 return 0; /* no interval associated, it'll run once until stopped */
3180 }
3181 void stop_process() override {
3182 sync.stop();
3183 }
3184 public:
3185 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3186 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3187
3188 void wakeup_sync_shards(set<int>& shard_ids) {
3189 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3190 sync.wakeup(*iter);
3191 }
3192 }
3193 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3194
3195 int init() override {
3196 int ret = sync.init();
3197 if (ret < 0) {
3198 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3199 return ret;
3200 }
3201 return 0;
3202 }
3203
3204 int process() override {
3205 sync.run();
3206 return 0;
3207 }
3208 };
3209
3210 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3211 {
3212 RGWDataSyncStatusManager sync;
3213 bool initialized;
3214
3215 uint64_t interval_msec() override {
3216 if (initialized) {
3217 return 0; /* no interval associated, it'll run once until stopped */
3218 } else {
3219 #define DATA_SYNC_INIT_WAIT_SEC 20
3220 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3221 }
3222 }
3223 void stop_process() override {
3224 sync.stop();
3225 }
3226 public:
3227 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3228 const string& _source_zone,
3229 rgw::BucketChangeObserver *observer)
3230 : RGWSyncProcessorThread(_store, "data-sync"),
3231 sync(_store, async_rados, _source_zone, observer),
3232 initialized(false) {}
3233
3234 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3235 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3236 sync.wakeup(iter->first, iter->second);
3237 }
3238 }
3239 RGWDataSyncStatusManager* get_manager() { return &sync; }
3240
3241 int init() override {
3242 return 0;
3243 }
3244
3245 int process() override {
3246 while (!initialized) {
3247 if (going_down()) {
3248 return 0;
3249 }
3250 int ret = sync.init();
3251 if (ret >= 0) {
3252 initialized = true;
3253 break;
3254 }
3255 /* we'll be back! */
3256 return 0;
3257 }
3258 sync.run();
3259 return 0;
3260 }
3261 };
3262
3263 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3264 {
3265 RGWCoroutinesManager crs;
3266 RGWRados *store;
3267 rgw::BucketTrimManager *bucket_trim;
3268 RGWHTTPManager http;
3269 const utime_t trim_interval;
3270
3271 uint64_t interval_msec() override { return 0; }
3272 void stop_process() override { crs.stop(); }
3273 public:
3274 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
3275 int interval)
3276 : RGWSyncProcessorThread(store, "sync-log-trim"),
3277 crs(store->ctx(), store->get_cr_registry()), store(store),
3278 bucket_trim(bucket_trim),
3279 http(store->ctx(), crs.get_completion_mgr()),
3280 trim_interval(interval, 0)
3281 {}
3282
3283 int init() override {
3284 return http.set_threaded();
3285 }
3286 int process() override {
3287 list<RGWCoroutinesStack*> stacks;
3288 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3289 meta->call(create_meta_log_trim_cr(store, &http,
3290 cct->_conf->rgw_md_log_max_shards,
3291 trim_interval));
3292 stacks.push_back(meta);
3293
3294 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3295 data->call(create_data_log_trim_cr(store, &http,
3296 cct->_conf->rgw_data_log_num_shards,
3297 trim_interval));
3298 stacks.push_back(data);
3299
3300 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
3301 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
3302 stacks.push_back(bucket);
3303
3304 crs.run(stacks);
3305 return 0;
3306 }
3307 };
3308
3309 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3310 {
3311 Mutex::Locker l(meta_sync_thread_lock);
3312 if (meta_sync_processor_thread) {
3313 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3314 }
3315 }
3316
3317 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3318 {
3319 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3320 Mutex::Locker l(data_sync_thread_lock);
3321 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3322 if (iter == data_sync_processor_threads.end()) {
3323 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3324 return;
3325 }
3326
3327 RGWDataSyncProcessorThread *thread = iter->second;
3328 assert(thread);
3329 thread->wakeup_sync_shards(shard_ids);
3330 }
3331
3332 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3333 {
3334 Mutex::Locker l(meta_sync_thread_lock);
3335 if (meta_sync_processor_thread) {
3336 return meta_sync_processor_thread->get_manager();
3337 }
3338 return nullptr;
3339 }
3340
3341 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3342 {
3343 Mutex::Locker l(data_sync_thread_lock);
3344 auto thread = data_sync_processor_threads.find(source_zone);
3345 if (thread == data_sync_processor_threads.end()) {
3346 return nullptr;
3347 }
3348 return thread->second->get_manager();
3349 }
3350
3351 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3352 {
3353 IoCtx ioctx;
3354 int r = open_pool_ctx(pool, ioctx);
3355 if (r < 0) {
3356 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3357 return r;
3358 }
3359
3360 bool requires;
3361 r = ioctx.pool_requires_alignment2(&requires);
3362 if (r < 0) {
3363 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3364 << r << dendl;
3365 return r;
3366 }
3367
3368 if (!requires) {
3369 *alignment = 0;
3370 return 0;
3371 }
3372
3373 uint64_t align;
3374 r = ioctx.pool_required_alignment2(&align);
3375 if (r < 0) {
3376 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3377 << r << dendl;
3378 return r;
3379 }
3380 if (align != 0) {
3381 ldout(cct, 20) << "required alignment=" << align << dendl;
3382 }
3383 *alignment = align;
3384 return 0;
3385 }
3386
3387 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3388 {
3389 uint64_t alignment = 0;
3390 int r = get_required_alignment(pool, &alignment);
3391 if (r < 0) {
3392 return r;
3393 }
3394
3395 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3396
3397 if (alignment == 0) {
3398 *max_chunk_size = config_chunk_size;
3399 return 0;
3400 }
3401
3402 if (config_chunk_size <= alignment) {
3403 *max_chunk_size = alignment;
3404 return 0;
3405 }
3406
3407 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3408
3409 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3410
3411 return 0;
3412 }
3413
3414 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3415 {
3416 rgw_pool pool;
3417 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3418 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3419 return -EIO;
3420 }
3421 return get_max_chunk_size(pool, max_chunk_size);
3422 }
3423
3424 class RGWIndexCompletionManager;
3425
3426 struct complete_op_data {
3427 Mutex lock{"complete_op_data"};
3428 AioCompletion *rados_completion{nullptr};
3429 int manager_shard_id{-1};
3430 RGWIndexCompletionManager *manager{nullptr};
3431 rgw_obj obj;
3432 RGWModifyOp op;
3433 string tag;
3434 rgw_bucket_entry_ver ver;
3435 cls_rgw_obj_key key;
3436 rgw_bucket_dir_entry_meta dir_meta;
3437 list<cls_rgw_obj_key> remove_objs;
3438 bool log_op;
3439 uint16_t bilog_op;
3440 rgw_zone_set zones_trace;
3441
3442 bool stopped{false};
3443
3444 void stop() {
3445 Mutex::Locker l(lock);
3446 stopped = true;
3447 }
3448 };
3449
3450 class RGWIndexCompletionThread : public RGWRadosThread {
3451 RGWRados *store;
3452
3453 uint64_t interval_msec() override {
3454 return 0;
3455 }
3456
3457 list<complete_op_data *> completions;
3458
3459 Mutex completions_lock;
3460 public:
3461 RGWIndexCompletionThread(RGWRados *_store)
3462 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3463
3464 int process() override;
3465
3466 void add_completion(complete_op_data *completion) {
3467 {
3468 Mutex::Locker l(completions_lock);
3469 completions.push_back(completion);
3470 }
3471
3472 signal();
3473 }
3474 };
3475
3476 int RGWIndexCompletionThread::process()
3477 {
3478 list<complete_op_data *> comps;
3479
3480 {
3481 Mutex::Locker l(completions_lock);
3482 completions.swap(comps);
3483 }
3484
3485 for (auto c : comps) {
3486 std::unique_ptr<complete_op_data> up{c};
3487
3488 if (going_down()) {
3489 continue;
3490 }
3491 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3492
3493 RGWRados::BucketShard bs(store);
3494
3495 int r = bs.init(c->obj.bucket, c->obj);
3496 if (r < 0) {
3497 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3498 /* not much to do */
3499 continue;
3500 }
3501
3502 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3503 librados::ObjectWriteOperation o;
3504 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3505 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3506 c->log_op, c->bilog_op, &c->zones_trace);
3507
3508 return bs->index_ctx.operate(bs->bucket_obj, &o);
3509 });
3510 if (r < 0) {
3511 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3512 /* ignoring error, can't do anything about it */
3513 continue;
3514 }
3515 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3516 if (r < 0) {
3517 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3518 }
3519 }
3520
3521 return 0;
3522 }
3523
3524 class RGWIndexCompletionManager {
3525 RGWRados *store{nullptr};
3526 vector<Mutex *> locks;
3527 vector<set<complete_op_data *> > completions;
3528
3529 RGWIndexCompletionThread *completion_thread{nullptr};
3530
3531 int num_shards;
3532
3533 std::atomic<int> cur_shard {0};
3534
3535
3536 public:
3537 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3538 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3539
3540 for (int i = 0; i < num_shards; i++) {
3541 char buf[64];
3542 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3543 locks.push_back(new Mutex(buf));
3544 }
3545
3546 completions.resize(num_shards);
3547 }
3548 ~RGWIndexCompletionManager() {
3549 stop();
3550
3551 for (auto l : locks) {
3552 delete l;
3553 }
3554 }
3555
3556 int next_shard() {
3557 int result = cur_shard % num_shards;
3558 cur_shard++;
3559 return result;
3560 }
3561
3562 void create_completion(const rgw_obj& obj,
3563 RGWModifyOp op, string& tag,
3564 rgw_bucket_entry_ver& ver,
3565 const cls_rgw_obj_key& key,
3566 rgw_bucket_dir_entry_meta& dir_meta,
3567 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3568 uint16_t bilog_op,
3569 rgw_zone_set *zones_trace,
3570 complete_op_data **result);
3571 bool handle_completion(completion_t cb, complete_op_data *arg);
3572
3573 int start() {
3574 completion_thread = new RGWIndexCompletionThread(store);
3575 int ret = completion_thread->init();
3576 if (ret < 0) {
3577 return ret;
3578 }
3579 completion_thread->start();
3580 return 0;
3581 }
3582 void stop() {
3583 if (completion_thread) {
3584 completion_thread->stop();
3585 delete completion_thread;
3586 }
3587
3588 for (int i = 0; i < num_shards; ++i) {
3589 Mutex::Locker l(*locks[i]);
3590 for (auto c : completions[i]) {
3591 Mutex::Locker cl(c->lock);
3592 c->stop();
3593 }
3594 }
3595 completions.clear();
3596 }
3597 };
3598
3599 static void obj_complete_cb(completion_t cb, void *arg)
3600 {
3601 complete_op_data *completion = (complete_op_data *)arg;
3602 completion->lock.Lock();
3603 if (completion->stopped) {
3604 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3605 delete completion;
3606 return;
3607 }
3608 bool need_delete = completion->manager->handle_completion(cb, completion);
3609 completion->lock.Unlock();
3610 if (need_delete) {
3611 delete completion;
3612 }
3613 }
3614
3615
3616 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3617 RGWModifyOp op, string& tag,
3618 rgw_bucket_entry_ver& ver,
3619 const cls_rgw_obj_key& key,
3620 rgw_bucket_dir_entry_meta& dir_meta,
3621 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3622 uint16_t bilog_op,
3623 rgw_zone_set *zones_trace,
3624 complete_op_data **result)
3625 {
3626 complete_op_data *entry = new complete_op_data;
3627
3628 int shard_id = next_shard();
3629
3630 entry->manager_shard_id = shard_id;
3631 entry->manager = this;
3632 entry->obj = obj;
3633 entry->op = op;
3634 entry->tag = tag;
3635 entry->ver = ver;
3636 entry->key = key;
3637 entry->dir_meta = dir_meta;
3638 entry->log_op = log_op;
3639 entry->bilog_op = bilog_op;
3640
3641 if (remove_objs) {
3642 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3643 entry->remove_objs.push_back(*iter);
3644 }
3645 }
3646
3647 if (zones_trace) {
3648 entry->zones_trace = *zones_trace;
3649 } else {
3650 entry->zones_trace.insert(store->get_zone().id);
3651 }
3652
3653 *result = entry;
3654
3655 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3656
3657 Mutex::Locker l(*locks[shard_id]);
3658 completions[shard_id].insert(entry);
3659 }
3660
3661 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3662 {
3663 int shard_id = arg->manager_shard_id;
3664 {
3665 Mutex::Locker l(*locks[shard_id]);
3666
3667 auto& comps = completions[shard_id];
3668
3669 auto iter = comps.find(arg);
3670 if (iter == comps.end()) {
3671 return true;
3672 }
3673
3674 comps.erase(iter);
3675 }
3676
3677 int r = rados_aio_get_return_value(cb);
3678 if (r != -ERR_BUSY_RESHARDING) {
3679 return true;
3680 }
3681 completion_thread->add_completion(arg);
3682 return false;
3683 }
3684
3685 void RGWRados::finalize()
3686 {
3687 auto admin_socket = cct->get_admin_socket();
3688 for (auto cmd : admin_commands) {
3689 int r = admin_socket->unregister_command(cmd[0]);
3690 if (r < 0) {
3691 lderr(cct) << "ERROR: fail to unregister admin socket command (r=" << r
3692 << ")" << dendl;
3693 }
3694 }
3695
3696 if (run_sync_thread) {
3697 Mutex::Locker l(meta_sync_thread_lock);
3698 meta_sync_processor_thread->stop();
3699
3700 Mutex::Locker dl(data_sync_thread_lock);
3701 for (auto iter : data_sync_processor_threads) {
3702 RGWDataSyncProcessorThread *thread = iter.second;
3703 thread->stop();
3704 }
3705 if (sync_log_trimmer) {
3706 sync_log_trimmer->stop();
3707 }
3708 }
3709 if (async_rados) {
3710 async_rados->stop();
3711 }
3712 if (run_sync_thread) {
3713 delete meta_sync_processor_thread;
3714 meta_sync_processor_thread = NULL;
3715 Mutex::Locker dl(data_sync_thread_lock);
3716 for (auto iter : data_sync_processor_threads) {
3717 RGWDataSyncProcessorThread *thread = iter.second;
3718 delete thread;
3719 }
3720 data_sync_processor_threads.clear();
3721 delete sync_log_trimmer;
3722 sync_log_trimmer = nullptr;
3723 bucket_trim = boost::none;
3724 }
3725 if (finisher) {
3726 finisher->stop();
3727 }
3728 if (need_watch_notify()) {
3729 finalize_watch();
3730 }
3731 if (finisher) {
3732 /* delete finisher only after cleaning up watches, as watch error path might call
3733 * into finisher. We stop finisher before finalizing watch to make sure we don't
3734 * actually handle any racing work
3735 */
3736 delete finisher;
3737 }
3738 if (meta_notifier) {
3739 meta_notifier->stop();
3740 delete meta_notifier;
3741 }
3742 if (data_notifier) {
3743 data_notifier->stop();
3744 delete data_notifier;
3745 }
3746 delete data_log;
3747 if (async_rados) {
3748 delete async_rados;
3749 }
3750
3751 delete lc;
3752 lc = NULL;
3753
3754 delete gc;
3755 gc = NULL;
3756
3757 delete obj_expirer;
3758 obj_expirer = NULL;
3759
3760 delete rest_master_conn;
3761
3762 map<string, RGWRESTConn *>::iterator iter;
3763 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3764 RGWRESTConn *conn = iter->second;
3765 delete conn;
3766 }
3767
3768 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3769 RGWRESTConn *conn = iter->second;
3770 delete conn;
3771 }
3772 RGWQuotaHandler::free_handler(quota_handler);
3773 if (cr_registry) {
3774 cr_registry->put();
3775 }
3776 delete meta_mgr;
3777 delete binfo_cache;
3778 delete obj_tombstone_cache;
3779 delete sync_modules_manager;
3780
3781 if (reshard_wait.get()) {
3782 reshard_wait->stop();
3783 reshard_wait.reset();
3784 }
3785
3786 if (run_reshard_thread) {
3787 reshard->stop_processor();
3788 }
3789 delete reshard;
3790 delete index_completion_manager;
3791 }
3792
3793 /**
3794 * Initialize the RADOS instance and prepare to do other ops
3795 * Returns 0 on success, -ERR# on failure.
3796 */
3797 int RGWRados::init_rados()
3798 {
3799 int ret = 0;
3800 auto admin_socket = cct->get_admin_socket();
3801 for (auto cmd : admin_commands) {
3802 int r = admin_socket->register_command(cmd[0], cmd[1], this,
3803 cmd[2]);
3804 if (r < 0) {
3805 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
3806 << ")" << dendl;
3807 return r;
3808 }
3809 }
3810
3811 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3812
3813 for (auto& r : handles) {
3814 ret = r.init_with_context(cct);
3815 if (ret < 0) {
3816 return ret;
3817 }
3818 ret = r.connect();
3819 if (ret < 0) {
3820 return ret;
3821 }
3822 }
3823
3824 sync_modules_manager = new RGWSyncModulesManager();
3825
3826 rgw_register_sync_modules(sync_modules_manager);
3827
3828 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3829 new RGWCoroutinesManagerRegistry(cct)};
3830 ret = crs->hook_to_admin_command("cr dump");
3831 if (ret < 0) {
3832 return ret;
3833 }
3834
3835 meta_mgr = new RGWMetadataManager(cct, this);
3836 data_log = new RGWDataChangesLog(cct, this);
3837 cr_registry = crs.release();
3838
3839 std::swap(handles, rados);
3840 return ret;
3841 }
3842
3843
3844 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3845 {
3846 map<string,string> metadata = meta;
3847 metadata["num_handles"] = stringify(rados.size());
3848 metadata["zonegroup_id"] = zonegroup.get_id();
3849 metadata["zonegroup_name"] = zonegroup.get_name();
3850 metadata["zone_name"] = zone_name();
3851 metadata["zone_id"] = zone_id();;
3852 string name = cct->_conf->name.get_id();
3853 if (name.find("rgw.") == 0) {
3854 name = name.substr(4);
3855 }
3856 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3857 if (ret < 0) {
3858 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3859 return ret;
3860 }
3861
3862 return 0;
3863 }
3864
3865 /**
3866 * Add new connection to connections map
3867 * @param zonegroup_conn_map map which new connection will be added to
3868 * @param zonegroup zonegroup which new connection will connect to
3869 * @param new_connection pointer to new connection instance
3870 */
3871 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3872 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3873 {
3874 // Delete if connection is already exists
3875 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3876 if (iterZoneGroup != zonegroup_conn_map.end()) {
3877 delete iterZoneGroup->second;
3878 }
3879
3880 // Add new connection to connections map
3881 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3882 }
3883
3884 int RGWRados::convert_regionmap()
3885 {
3886 RGWZoneGroupMap zonegroupmap;
3887
3888 string pool_name = cct->_conf->rgw_zone_root_pool;
3889 if (pool_name.empty()) {
3890 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3891 }
3892 string oid = region_map_oid;
3893
3894 rgw_pool pool(pool_name);
3895 bufferlist bl;
3896 RGWObjectCtx obj_ctx(this);
3897 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3898 if (ret < 0 && ret != -ENOENT) {
3899 return ret;
3900 } else if (ret == -ENOENT) {
3901 return 0;
3902 }
3903
3904 try {
3905 bufferlist::iterator iter = bl.begin();
3906 ::decode(zonegroupmap, iter);
3907 } catch (buffer::error& err) {
3908 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3909 return -EIO;
3910 }
3911
3912 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3913 iter != zonegroupmap.zonegroups.end(); ++iter) {
3914 RGWZoneGroup& zonegroup = iter->second;
3915 ret = zonegroup.init(cct, this, false);
3916 ret = zonegroup.update();
3917 if (ret < 0 && ret != -ENOENT) {
3918 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3919 cpp_strerror(-ret) << dendl;
3920 return ret;
3921 } else if (ret == -ENOENT) {
3922 ret = zonegroup.create();
3923 if (ret < 0) {
3924 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3925 cpp_strerror(-ret) << dendl;
3926 return ret;
3927 }
3928 }
3929 }
3930
3931 current_period.set_user_quota(zonegroupmap.user_quota);
3932 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3933
3934 // remove the region_map so we don't try to convert again
3935 rgw_raw_obj obj(pool, oid);
3936 ret = delete_system_obj(obj);
3937 if (ret < 0) {
3938 ldout(cct, 0) << "Error could not remove " << obj
3939 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3940 return ret;
3941 }
3942
3943 return 0;
3944 }
3945
3946 /**
3947 * Replace all region configuration with zonegroup for
3948 * backward compatability
3949 * Returns 0 on success, -ERR# on failure.
3950 */
3951 int RGWRados::replace_region_with_zonegroup()
3952 {
3953 /* copy default region */
3954 /* convert default region to default zonegroup */
3955 string default_oid = cct->_conf->rgw_default_region_info_oid;
3956 if (default_oid.empty()) {
3957 default_oid = default_region_info_oid;
3958 }
3959
3960
3961 RGWZoneGroup default_zonegroup;
3962 rgw_pool pool{default_zonegroup.get_pool(cct)};
3963 string oid = "converted";
3964 bufferlist bl;
3965 RGWObjectCtx obj_ctx(this);
3966
3967 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3968 if (ret < 0 && ret != -ENOENT) {
3969 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3970 << dendl;
3971 return ret;
3972 } else if (ret != -ENOENT) {
3973 ldout(cct, 20) << "System already converted " << dendl;
3974 return 0;
3975 }
3976
3977 string default_region;
3978 ret = default_zonegroup.init(cct, this, false, true);
3979 if (ret < 0) {
3980 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3981 return ret;
3982 }
3983 ret = default_zonegroup.read_default_id(default_region, true);
3984 if (ret < 0 && ret != -ENOENT) {
3985 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3986 return ret;
3987 }
3988
3989 /* convert regions to zonegroups */
3990 list<string> regions;
3991 ret = list_regions(regions);
3992 if (ret < 0 && ret != -ENOENT) {
3993 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3994 return ret;
3995 } else if (ret == -ENOENT || regions.empty()) {
3996 RGWZoneParams zoneparams(default_zone_name);
3997 int ret = zoneparams.init(cct, this);
3998 if (ret < 0 && ret != -ENOENT) {
3999 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
4000 return ret;
4001 }
4002 /* update master zone */
4003 RGWZoneGroup default_zg(default_zonegroup_name);
4004 ret = default_zg.init(cct, this);
4005 if (ret < 0 && ret != -ENOENT) {
4006 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
4007 return ret;
4008 }
4009 if (ret != -ENOENT && default_zg.master_zone.empty()) {
4010 default_zg.master_zone = zoneparams.get_id();
4011 return default_zg.update();
4012 }
4013 return 0;
4014 }
4015
4016 string master_region, master_zone;
4017 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
4018 if (*iter != default_zonegroup_name){
4019 RGWZoneGroup region(*iter);
4020 int ret = region.init(cct, this, true, true);
4021 if (ret < 0) {
4022 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
4023 return ret;
4024 }
4025 if (region.is_master_zonegroup()) {
4026 master_region = region.get_id();
4027 master_zone = region.master_zone;
4028 }
4029 }
4030 }
4031
4032 /* create realm if there is none.
4033 The realm name will be the region and zone concatenated
4034 realm id will be mds of its name */
4035 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
4036 string new_realm_name = master_region + "." + master_zone;
4037 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
4038 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
4039 MD5 hash;
4040 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
4041 hash.Final(md5);
4042 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
4043 string new_realm_id(md5_str);
4044 RGWRealm new_realm(new_realm_id,new_realm_name);
4045 ret = new_realm.init(cct, this, false);
4046 if (ret < 0) {
4047 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4048 return ret;
4049 }
4050 ret = new_realm.create();
4051 if (ret < 0 && ret != -EEXIST) {
4052 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4053 return ret;
4054 }
4055 ret = new_realm.set_as_default();
4056 if (ret < 0) {
4057 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4058 return ret;
4059 }
4060 ret = realm.init(cct, this);
4061 if (ret < 0) {
4062 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4063 return ret;
4064 }
4065 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4066 if (ret < 0) {
4067 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4068 return ret;
4069 }
4070 }
4071
4072 list<string>::iterator iter;
4073 /* create zonegroups */
4074 for (iter = regions.begin(); iter != regions.end(); ++iter)
4075 {
4076 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4077 /* check to see if we don't have already a zonegroup with this name */
4078 RGWZoneGroup new_zonegroup(*iter);
4079 ret = new_zonegroup.init(cct , this);
4080 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4081 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4082 " skipping conversion " << dendl;
4083 continue;
4084 }
4085 RGWZoneGroup zonegroup(*iter);
4086 zonegroup.set_id(*iter);
4087 int ret = zonegroup.init(cct, this, true, true);
4088 if (ret < 0) {
4089 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4090 return ret;
4091 }
4092 zonegroup.realm_id = realm.get_id();
4093 /* fix default region master zone */
4094 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4095 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4096 zonegroup.master_zone = default_zone_name;
4097 }
4098 ret = zonegroup.update();
4099 if (ret < 0 && ret != -EEXIST) {
4100 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4101 << dendl;
4102 return ret;
4103 }
4104 ret = zonegroup.update_name();
4105 if (ret < 0 && ret != -EEXIST) {
4106 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4107 << dendl;
4108 return ret;
4109 }
4110 if (zonegroup.get_name() == default_region) {
4111 ret = zonegroup.set_as_default();
4112 if (ret < 0) {
4113 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4114 << dendl;
4115 return ret;
4116 }
4117 }
4118 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4119 ++iter) {
4120 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4121 RGWZoneParams zoneparams(iter->first, iter->first);
4122 zoneparams.set_id(iter->first);
4123 zoneparams.realm_id = realm.get_id();
4124 ret = zoneparams.init(cct, this);
4125 if (ret < 0 && ret != -ENOENT) {
4126 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4127 return ret;
4128 } else if (ret == -ENOENT) {
4129 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4130 continue;
4131 }
4132 zonegroup.realm_id = realm.get_id();
4133 ret = zoneparams.update();
4134 if (ret < 0 && ret != -EEXIST) {
4135 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4136 return ret;
4137 }
4138 ret = zoneparams.update_name();
4139 if (ret < 0 && ret != -EEXIST) {
4140 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4141 return ret;
4142 }
4143 }
4144
4145 if (!current_period.get_id().empty()) {
4146 ret = current_period.add_zonegroup(zonegroup);
4147 if (ret < 0) {
4148 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4149 return ret;
4150 }
4151 }
4152 }
4153
4154 if (!current_period.get_id().empty()) {
4155 ret = current_period.update();
4156 if (ret < 0) {
4157 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4158 return ret;
4159 }
4160 ret = current_period.store_info(false);
4161 if (ret < 0) {
4162 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4163 return ret;
4164 }
4165 ret = current_period.reflect();
4166 if (ret < 0) {
4167 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4168 return ret;
4169 }
4170 }
4171
4172 for (auto const& iter : regions) {
4173 RGWZoneGroup zonegroup(iter);
4174 int ret = zonegroup.init(cct, this, true, true);
4175 if (ret < 0) {
4176 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4177 return ret;
4178 }
4179 ret = zonegroup.delete_obj(true);
4180 if (ret < 0 && ret != -ENOENT) {
4181 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4182 << dendl;
4183 return ret;
4184 }
4185 }
4186
4187 /* mark as converted */
4188 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4189 true, NULL, real_time(), NULL);
4190 if (ret < 0 ) {
4191 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4192 << dendl;
4193 return ret;
4194 }
4195
4196 return 0;
4197 }
4198
4199 int RGWRados::init_zg_from_period(bool *initialized)
4200 {
4201 *initialized = false;
4202
4203 if (current_period.get_id().empty()) {
4204 return 0;
4205 }
4206
4207 int ret = zonegroup.init(cct, this);
4208 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4209 if (ret == -ENOENT) {
4210 return 0;
4211 }
4212 if (ret < 0) {
4213 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4214 return ret;
4215 }
4216 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4217
4218 map<string, RGWZoneGroup>::const_iterator iter =
4219 current_period.get_map().zonegroups.find(zonegroup.get_id());
4220
4221 if (iter != current_period.get_map().zonegroups.end()) {
4222 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4223 zonegroup = iter->second;
4224 ret = zonegroup.init(cct, this, false);
4225 if (ret < 0) {
4226 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4227 return ret;
4228 }
4229 ret = zone_params.init(cct, this);
4230 if (ret < 0 && ret != -ENOENT) {
4231 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4232 return ret;
4233 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4234 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4235 zone_params.set_name(default_zone_name);
4236 ret = zone_params.init(cct, this);
4237 if (ret < 0 && ret != -ENOENT) {
4238 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4239 return ret;
4240 }
4241 }
4242 }
4243 for (iter = current_period.get_map().zonegroups.begin();
4244 iter != current_period.get_map().zonegroups.end(); ++iter){
4245 const RGWZoneGroup& zg = iter->second;
4246 // use endpoints from the zonegroup's master zone
4247 auto master = zg.zones.find(zg.master_zone);
4248 if (master == zg.zones.end()) {
4249 // fix missing master zone for a single zone zonegroup
4250 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4251 master = zg.zones.begin();
4252 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4253 master->second.name << " id:" << master->second.id << " as master" << dendl;
4254 if (zonegroup.get_id() == zg.get_id()) {
4255 zonegroup.master_zone = master->second.id;
4256 ret = zonegroup.update();
4257 if (ret < 0) {
4258 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4259 return ret;
4260 }
4261 } else {
4262 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4263 ret = fixed_zg.init(cct, this);
4264 if (ret < 0) {
4265 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4266 return ret;
4267 }
4268 fixed_zg.master_zone = master->second.id;
4269 ret = fixed_zg.update();
4270 if (ret < 0) {
4271 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4272 return ret;
4273 }
4274 }
4275 } else {
4276 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4277 zg.master_zone << dendl;
4278 return -EINVAL;
4279 }
4280 }
4281 const auto& endpoints = master->second.endpoints;
4282 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4283 if (!current_period.get_master_zonegroup().empty() &&
4284 zg.get_id() == current_period.get_master_zonegroup()) {
4285 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4286 }
4287 }
4288
4289 *initialized = true;
4290
4291 return 0;
4292 }
4293
4294 int RGWRados::init_zg_from_local(bool *creating_defaults)
4295 {
4296 int ret = zonegroup.init(cct, this);
4297 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4298 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4299 return ret;
4300 } else if (ret == -ENOENT) {
4301 *creating_defaults = true;
4302 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4303 ret = zonegroup.create_default();
4304 if (ret < 0) {
4305 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4306 << dendl;
4307 return ret;
4308 }
4309 ret = zonegroup.init(cct, this);
4310 if (ret < 0) {
4311 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4312 << dendl;
4313 return ret;
4314 }
4315 }
4316 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4317 if (zonegroup.is_master_zonegroup()) {
4318 // use endpoints from the zonegroup's master zone
4319 auto master = zonegroup.zones.find(zonegroup.master_zone);
4320 if (master == zonegroup.zones.end()) {
4321 // fix missing master zone for a single zone zonegroup
4322 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4323 master = zonegroup.zones.begin();
4324 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4325 master->second.name << " id:" << master->second.id << " as master" << dendl;
4326 zonegroup.master_zone = master->second.id;
4327 ret = zonegroup.update();
4328 if (ret < 0) {
4329 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4330 return ret;
4331 }
4332 } else {
4333 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4334 "master_zone=" << zonegroup.master_zone << dendl;
4335 return -EINVAL;
4336 }
4337 }
4338 const auto& endpoints = master->second.endpoints;
4339 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4340 }
4341
4342 return 0;
4343 }
4344
4345
4346 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4347 {
4348 return target_zone.syncs_from(source_zone.name) &&
4349 sync_modules_manager->supports_data_export(source_zone.tier_type);
4350 }
4351
4352 /**
4353 * Initialize the RADOS instance and prepare to do other ops
4354 * Returns 0 on success, -ERR# on failure.
4355 */
4356 int RGWRados::init_complete()
4357 {
4358 int ret = realm.init(cct, this);
4359 if (ret < 0 && ret != -ENOENT) {
4360 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4361 return ret;
4362 } else if (ret != -ENOENT) {
4363 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4364 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4365 if (ret < 0 && ret != -ENOENT) {
4366 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4367 return ret;
4368 }
4369 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4370 }
4371
4372 ret = replace_region_with_zonegroup();
4373 if (ret < 0) {
4374 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4375 return ret;
4376 }
4377
4378 ret = convert_regionmap();
4379 if (ret < 0) {
4380 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4381 return ret;
4382 }
4383
4384 bool zg_initialized = false;
4385
4386 if (!current_period.get_id().empty()) {
4387 ret = init_zg_from_period(&zg_initialized);
4388 if (ret < 0) {
4389 return ret;
4390 }
4391 }
4392
4393 bool creating_defaults = false;
4394 bool using_local = (!zg_initialized);
4395 if (using_local) {
4396 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4397 ret = init_zg_from_local(&creating_defaults);
4398 if (ret < 0) {
4399 return ret;
4400 }
4401 // read period_config into current_period
4402 auto& period_config = current_period.get_config();
4403 ret = period_config.read(this, zonegroup.realm_id);
4404 if (ret < 0 && ret != -ENOENT) {
4405 ldout(cct, 0) << "ERROR: failed to read period config: "
4406 << cpp_strerror(ret) << dendl;
4407 return ret;
4408 }
4409 }
4410
4411 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4412 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4413 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4414 zone_params.set_name(default_zone_name);
4415 }
4416
4417 ret = zone_params.init(cct, this);
4418 if (ret < 0 && ret != -ENOENT) {
4419 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4420 return ret;
4421 }
4422 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4423 if (zone_iter == get_zonegroup().zones.end()) {
4424 if (using_local) {
4425 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4426 return -EINVAL;
4427 }
4428 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4429 ret = init_zg_from_local(&creating_defaults);
4430 if (ret < 0) {
4431 return ret;
4432 }
4433 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4434 }
4435 if (zone_iter != get_zonegroup().zones.end()) {
4436 zone_public_config = zone_iter->second;
4437 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4438 } else {
4439 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4440 return -EINVAL;
4441 }
4442
4443 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4444
4445 if (run_sync_thread) {
4446 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4447 if (ret < 0) {
4448 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4449 return ret;
4450 }
4451 }
4452
4453 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4454
4455 init_unique_trans_id_deps();
4456
4457 finisher = new Finisher(cct);
4458 finisher->start();
4459
4460 period_puller.reset(new RGWPeriodPuller(this));
4461 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4462 current_period));
4463
4464 if (need_watch_notify()) {
4465 ret = init_watch();
4466 if (ret < 0) {
4467 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4468 return ret;
4469 }
4470 }
4471
4472 /* first build all zones index */
4473 for (auto ziter : get_zonegroup().zones) {
4474 const string& id = ziter.first;
4475 RGWZone& z = ziter.second;
4476 zone_id_by_name[z.name] = id;
4477 zone_by_id[id] = z;
4478 }
4479
4480 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4481 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4482 }
4483 zone_public_config = zone_by_id[zone_id()];
4484 for (auto ziter : get_zonegroup().zones) {
4485 const string& id = ziter.first;
4486 RGWZone& z = ziter.second;
4487 if (id == zone_id()) {
4488 continue;
4489 }
4490 if (z.endpoints.empty()) {
4491 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4492 continue;
4493 }
4494 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4495 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4496 zone_conn_map[id] = conn;
4497 if (zone_syncs_from(zone_public_config, z) ||
4498 zone_syncs_from(z, zone_public_config)) {
4499 if (zone_syncs_from(zone_public_config, z)) {
4500 zone_data_sync_from_map[id] = conn;
4501 }
4502 if (zone_syncs_from(z, zone_public_config)) {
4503 zone_data_notify_to_map[id] = conn;
4504 }
4505 } else {
4506 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4507 }
4508 }
4509
4510 ret = open_root_pool_ctx();
4511 if (ret < 0)
4512 return ret;
4513
4514 ret = open_gc_pool_ctx();
4515 if (ret < 0)
4516 return ret;
4517
4518 ret = open_lc_pool_ctx();
4519 if (ret < 0)
4520 return ret;
4521
4522 ret = open_objexp_pool_ctx();
4523 if (ret < 0)
4524 return ret;
4525
4526 ret = open_reshard_pool_ctx();
4527 if (ret < 0)
4528 return ret;
4529
4530 pools_initialized = true;
4531
4532 gc = new RGWGC();
4533 gc->initialize(cct, this);
4534
4535 obj_expirer = new RGWObjectExpirer(this);
4536
4537 if (use_gc_thread) {
4538 gc->start_processor();
4539 obj_expirer->start_processor();
4540 }
4541
4542 /* no point of running sync thread if we don't have a master zone configured
4543 or there is no rest_master_conn */
4544 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4545 || current_period.get_id().empty()) {
4546 run_sync_thread = false;
4547 }
4548
4549 if (run_sync_thread) {
4550 // initialize the log period history
4551 meta_mgr->init_oldest_log_period();
4552 }
4553
4554 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4555 async_rados->start();
4556
4557 ret = meta_mgr->init(current_period.get_id());
4558 if (ret < 0) {
4559 lderr(cct) << "ERROR: failed to initialize metadata log: "
4560 << cpp_strerror(-ret) << dendl;
4561 return ret;
4562 }
4563
4564 if (is_meta_master()) {
4565 auto md_log = meta_mgr->get_log(current_period.get_id());
4566 meta_notifier = new RGWMetaNotifier(this, md_log);
4567 meta_notifier->start();
4568 }
4569
4570 if (run_sync_thread) {
4571 Mutex::Locker l(meta_sync_thread_lock);
4572 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4573 ret = meta_sync_processor_thread->init();
4574 if (ret < 0) {
4575 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4576 return ret;
4577 }
4578 meta_sync_processor_thread->start();
4579
4580 // configure the bucket trim manager
4581 rgw::BucketTrimConfig config;
4582 rgw::configure_bucket_trim(cct, config);
4583
4584 bucket_trim.emplace(this, config);
4585 ret = bucket_trim->init();
4586 if (ret < 0) {
4587 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
4588 return ret;
4589 }
4590
4591 Mutex::Locker dl(data_sync_thread_lock);
4592 for (auto iter : zone_data_sync_from_map) {
4593 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4594 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first,
4595 &*bucket_trim);
4596 ret = thread->init();
4597 if (ret < 0) {
4598 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4599 return ret;
4600 }
4601 thread->start();
4602 data_sync_processor_threads[iter.first] = thread;
4603 }
4604 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4605 if (interval > 0) {
4606 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
4607 ret = sync_log_trimmer->init();
4608 if (ret < 0) {
4609 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4610 return ret;
4611 }
4612 sync_log_trimmer->start();
4613 }
4614 }
4615 data_notifier = new RGWDataNotifier(this);
4616 data_notifier->start();
4617
4618 lc = new RGWLC();
4619 lc->initialize(cct, this);
4620
4621 if (use_lc_thread)
4622 lc->start_processor();
4623
4624 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4625
4626 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4627 get_zone().bucket_index_max_shards);
4628 if (bucket_index_max_shards > get_max_bucket_shards()) {
4629 bucket_index_max_shards = get_max_bucket_shards();
4630 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4631 << get_max_bucket_shards() << dendl;
4632 }
4633 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4634
4635 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4636 binfo_cache->init(this);
4637
4638 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4639
4640 if (need_tombstone_cache) {
4641 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4642 }
4643
4644 reshard_wait = std::make_shared<RGWReshardWait>(this);
4645
4646 reshard = new RGWReshard(this);
4647
4648 /* only the master zone in the zonegroup reshards buckets */
4649 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4650 if (run_reshard_thread) {
4651 reshard->start_processor();
4652 }
4653
4654 index_completion_manager = new RGWIndexCompletionManager(this);
4655 ret = index_completion_manager->start();
4656
4657 return ret;
4658 }
4659
4660 /**
4661 * Initialize the RADOS instance and prepare to do other ops
4662 * Returns 0 on success, -ERR# on failure.
4663 */
4664 int RGWRados::initialize()
4665 {
4666 int ret;
4667
4668 ret = init_rados();
4669 if (ret < 0)
4670 return ret;
4671
4672 return init_complete();
4673 }
4674
4675 void RGWRados::finalize_watch()
4676 {
4677 for (int i = 0; i < num_watchers; i++) {
4678 RGWWatcher *watcher = watchers[i];
4679 watcher->unregister_watch();
4680 delete watcher;
4681 }
4682
4683 delete[] notify_oids;
4684 delete[] watchers;
4685 }
4686
4687 void RGWRados::schedule_context(Context *c) {
4688 finisher->queue(c);
4689 }
4690
4691 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4692 {
4693 bool is_truncated;
4694 RGWListRawObjsCtx ctx;
4695 do {
4696 list<string> oids;
4697 int r = list_raw_objects(pool, prefix, 1000,
4698 ctx, oids, &is_truncated);
4699 if (r < 0) {
4700 return r;
4701 }
4702 list<string>::iterator iter;
4703 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4704 string& val = *iter;
4705 if (val.size() > prefix.size())
4706 result.push_back(val.substr(prefix.size()));
4707 }
4708 } while (is_truncated);
4709
4710 return 0;
4711 }
4712
4713 int RGWRados::list_regions(list<string>& regions)
4714 {
4715 RGWZoneGroup zonegroup;
4716
4717 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4718 }
4719
4720 int RGWRados::list_zonegroups(list<string>& zonegroups)
4721 {
4722 RGWZoneGroup zonegroup;
4723
4724 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4725 }
4726
4727 int RGWRados::list_zones(list<string>& zones)
4728 {
4729 RGWZoneParams zoneparams;
4730
4731 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4732 }
4733
4734 int RGWRados::list_realms(list<string>& realms)
4735 {
4736 RGWRealm realm(cct, this);
4737 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4738 }
4739
4740 int RGWRados::list_periods(list<string>& periods)
4741 {
4742 RGWPeriod period;
4743 list<string> raw_periods;
4744 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4745 if (ret < 0) {
4746 return ret;
4747 }
4748 for (const auto& oid : raw_periods) {
4749 size_t pos = oid.find(".");
4750 if (pos != std::string::npos) {
4751 periods.push_back(oid.substr(0, pos));
4752 } else {
4753 periods.push_back(oid);
4754 }
4755 }
4756 periods.sort(); // unique() only detects duplicates if they're adjacent
4757 periods.unique();
4758 return 0;
4759 }
4760
4761
4762 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4763 {
4764 int ret = 0;
4765 string period_id = current_period;
4766 while(!period_id.empty()) {
4767 RGWPeriod period(period_id);
4768 ret = period.init(cct, this);
4769 if (ret < 0) {
4770 return ret;
4771 }
4772 periods.push_back(period.get_id());
4773 period_id = period.get_predecessor();
4774 }
4775
4776 return ret;
4777 }
4778
4779 /**
4780 * Open the pool used as root for this gateway
4781 * Returns: 0 on success, -ERR# otherwise.
4782 */
4783 int RGWRados::open_root_pool_ctx()
4784 {
4785 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4786 }
4787
4788 int RGWRados::open_gc_pool_ctx()
4789 {
4790 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4791 }
4792
4793 int RGWRados::open_lc_pool_ctx()
4794 {
4795 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4796 }
4797
4798 int RGWRados::open_objexp_pool_ctx()
4799 {
4800 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4801 }
4802
4803 int RGWRados::open_reshard_pool_ctx()
4804 {
4805 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4806 }
4807
4808 int RGWRados::init_watch()
4809 {
4810 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4811 if (r < 0) {
4812 return r;
4813 }
4814
4815 num_watchers = cct->_conf->rgw_num_control_oids;
4816
4817 bool compat_oid = (num_watchers == 0);
4818
4819 if (num_watchers <= 0)
4820 num_watchers = 1;
4821
4822 notify_oids = new string[num_watchers];
4823 watchers = new RGWWatcher *[num_watchers];
4824
4825 for (int i=0; i < num_watchers; i++) {
4826 string& notify_oid = notify_oids[i];
4827 notify_oid = notify_oid_prefix;
4828 if (!compat_oid) {
4829 char buf[16];
4830 snprintf(buf, sizeof(buf), ".%d", i);
4831 notify_oid.append(buf);
4832 }
4833 r = control_pool_ctx.create(notify_oid, false);
4834 if (r < 0 && r != -EEXIST)
4835 return r;
4836
4837 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4838 watchers[i] = watcher;
4839
4840 r = watcher->register_watch();
4841 if (r < 0)
4842 return r;
4843 }
4844
4845 watch_initialized = true;
4846
4847 set_cache_enabled(true);
4848
4849 return 0;
4850 }
4851
4852 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4853 {
4854 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4855
4856 int i = r % num_watchers;
4857 char buf[16];
4858 snprintf(buf, sizeof(buf), ".%d", i);
4859
4860 notify_oid = notify_oid_prefix;
4861 notify_oid.append(buf);
4862 }
4863
4864 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4865 {
4866 librados::Rados *rad = get_rados_handle();
4867 int r = rgw_init_ioctx(rad, pool, io_ctx);
4868 if (r != -ENOENT)
4869 return r;
4870
4871 if (!pools_initialized)
4872 return r;
4873
4874 r = rad->pool_create(pool.name.c_str());
4875 if (r < 0 && r != -EEXIST)
4876 return r;
4877
4878 r = rgw_init_ioctx(rad, pool, io_ctx);
4879 if (r < 0)
4880 return r;
4881
4882 r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
4883 if (r < 0 && r != -EOPNOTSUPP)
4884 return r;
4885 return 0;
4886 }
4887
4888 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4889 string *marker) {
4890 if (marker) {
4891 *marker = shard_id_str;
4892 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4893 marker->append(shard_marker);
4894 }
4895 }
4896
4897 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4898 {
4899 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
4900
4901 if (!explicit_pool.empty()) {
4902 return open_pool_ctx(explicit_pool, index_ctx);
4903 }
4904
4905 const string *rule = &bucket_info.placement_rule;
4906 if (rule->empty()) {
4907 rule = &zonegroup.default_placement;
4908 }
4909 auto iter = zone_params.placement_pools.find(*rule);
4910 if (iter == zone_params.placement_pools.end()) {
4911 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4912 return -EINVAL;
4913 }
4914
4915 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4916 if (r < 0)
4917 return r;
4918
4919 return 0;
4920 }
4921
4922 /**
4923 * set up a bucket listing.
4924 * handle is filled in.
4925 * Returns 0 on success, -ERR# otherwise.
4926 */
4927 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4928 {
4929 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4930 *handle = (RGWAccessHandle)state;
4931 return 0;
4932 }
4933
4934 /**
4935 * get the next bucket in the listing.
4936 * obj is filled in,
4937 * handle is updated.
4938 * returns 0 on success, -ERR# otherwise.
4939 */
4940 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4941 {
4942 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4943
4944 do {
4945 if (*state == root_pool_ctx.nobjects_end()) {
4946 delete state;
4947 return -ENOENT;
4948 }
4949
4950 obj.key.name = (*state)->get_oid();
4951 if (obj.key.name[0] == '_') {
4952 obj.key.name = obj.key.name.substr(1);
4953 }
4954
4955 (*state)++;
4956 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4957
4958 return 0;
4959 }
4960
4961
4962 /**** logs ****/
4963
4964 struct log_list_state {
4965 string prefix;
4966 librados::IoCtx io_ctx;
4967 librados::NObjectIterator obit;
4968 };
4969
4970 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4971 {
4972 log_list_state *state = new log_list_state;
4973 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4974 if (r < 0) {
4975 delete state;
4976 return r;
4977 }
4978 state->prefix = prefix;
4979 state->obit = state->io_ctx.nobjects_begin();
4980 *handle = (RGWAccessHandle)state;
4981 return 0;
4982 }
4983
4984 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4985 {
4986 log_list_state *state = static_cast<log_list_state *>(handle);
4987 while (true) {
4988 if (state->obit == state->io_ctx.nobjects_end()) {
4989 delete state;
4990 return -ENOENT;
4991 }
4992 if (state->prefix.length() &&
4993 state->obit->get_oid().find(state->prefix) != 0) {
4994 state->obit++;
4995 continue;
4996 }
4997 *name = state->obit->get_oid();
4998 state->obit++;
4999 break;
5000 }
5001 return 0;
5002 }
5003
5004 int RGWRados::log_remove(const string& name)
5005 {
5006 librados::IoCtx io_ctx;
5007 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5008 if (r < 0)
5009 return r;
5010 return io_ctx.remove(name);
5011 }
5012
5013 struct log_show_state {
5014 librados::IoCtx io_ctx;
5015 bufferlist bl;
5016 bufferlist::iterator p;
5017 string name;
5018 uint64_t pos;
5019 bool eof;
5020 log_show_state() : pos(0), eof(false) {}
5021 };
5022
5023 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
5024 {
5025 log_show_state *state = new log_show_state;
5026 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5027 if (r < 0) {
5028 delete state;
5029 return r;
5030 }
5031 state->name = name;
5032 *handle = (RGWAccessHandle)state;
5033 return 0;
5034 }
5035
5036 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
5037 {
5038 log_show_state *state = static_cast<log_show_state *>(handle);
5039 off_t off = state->p.get_off();
5040
5041 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
5042 << " off " << off
5043 << " eof " << (int)state->eof
5044 << dendl;
5045 // read some?
5046 unsigned chunk = 1024*1024;
5047 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
5048 bufferlist more;
5049 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
5050 if (r < 0)
5051 return r;
5052 state->pos += r;
5053 bufferlist old;
5054 try {
5055 old.substr_of(state->bl, off, state->bl.length() - off);
5056 } catch (buffer::error& err) {
5057 return -EINVAL;
5058 }
5059 state->bl.clear();
5060 state->bl.claim(old);
5061 state->bl.claim_append(more);
5062 state->p = state->bl.begin();
5063 if ((unsigned)r < chunk)
5064 state->eof = true;
5065 ldout(cct, 10) << " read " << r << dendl;
5066 }
5067
5068 if (state->p.end())
5069 return 0; // end of file
5070 try {
5071 ::decode(*entry, state->p);
5072 }
5073 catch (const buffer::error &e) {
5074 return -EINVAL;
5075 }
5076 return 1;
5077 }
5078
5079 /**
5080 * usage_log_hash: get usage log key hash, based on name and index
5081 *
5082 * Get the usage object name. Since a user may have more than 1
5083 * object holding that info (multiple shards), we use index to
5084 * specify that shard number. Once index exceeds max shards it
5085 * wraps.
5086 * If name is not being set, results for all users will be returned
5087 * and index will wrap only after total shards number.
5088 *
5089 * @param cct [in] ceph context
5090 * @param name [in] user name
5091 * @param hash [out] hash value
5092 * @param index [in] shard index number
5093 */
5094 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5095 {
5096 uint32_t val = index;
5097
5098 if (!name.empty()) {
5099 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
5100 val %= max_user_shards;
5101 val += ceph_str_hash_linux(name.c_str(), name.size());
5102 }
5103 char buf[17];
5104 int max_shards = cct->_conf->rgw_usage_max_shards;
5105 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5106 hash = buf;
5107 }
5108
5109 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5110 {
5111 uint32_t index = 0;
5112
5113 map<string, rgw_usage_log_info> log_objs;
5114
5115 string hash;
5116 string last_user;
5117
5118 /* restructure usage map, zone by object hash */
5119 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5120 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5121 const rgw_user_bucket& ub = iter->first;
5122 RGWUsageBatch& info = iter->second;
5123
5124 if (ub.user.empty()) {
5125 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5126 continue;
5127 }
5128
5129 if (ub.user != last_user) {
5130 /* index *should* be random, but why waste extra cycles
5131 in most cases max user shards is not going to exceed 1,
5132 so just incrementing it */
5133 usage_log_hash(cct, ub.user, hash, index++);
5134 }
5135 last_user = ub.user;
5136 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5137
5138 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5139 v.push_back(miter->second);
5140 }
5141 }
5142
5143 map<string, rgw_usage_log_info>::iterator liter;
5144
5145 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5146 int r = cls_obj_usage_log_add(liter->first, liter->second);
5147 if (r < 0)
5148 return r;
5149 }
5150 return 0;
5151 }
5152
5153 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5154 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5155 {
5156 uint32_t num = max_entries;
5157 string hash, first_hash;
5158 string user_str = user.to_str();
5159 usage_log_hash(cct, user_str, first_hash, 0);
5160
5161 if (usage_iter.index) {
5162 usage_log_hash(cct, user_str, hash, usage_iter.index);
5163 } else {
5164 hash = first_hash;
5165 }
5166
5167 usage.clear();
5168
5169 do {
5170 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5171 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5172
5173 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5174 usage_iter.read_iter, ret_usage, is_truncated);
5175 if (ret == -ENOENT)
5176 goto next;
5177
5178 if (ret < 0)
5179 return ret;
5180
5181 num -= ret_usage.size();
5182
5183 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5184 usage[iter->first].aggregate(iter->second);
5185 }
5186
5187 next:
5188 if (!*is_truncated) {
5189 usage_iter.read_iter.clear();
5190 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5191 }
5192 } while (num && !*is_truncated && hash != first_hash);
5193 return 0;
5194 }
5195
5196 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5197 {
5198 uint32_t index = 0;
5199 string hash, first_hash;
5200 string user_str = user.to_str();
5201 usage_log_hash(cct, user_str, first_hash, index);
5202
5203 hash = first_hash;
5204 do {
5205 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5206
5207 if (ret < 0 && ret != -ENOENT)
5208 return ret;
5209
5210 usage_log_hash(cct, user_str, hash, ++index);
5211 } while (hash != first_hash);
5212
5213 return 0;
5214 }
5215
5216 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5217 {
5218 return rgw_shards_hash(key, max_shards);
5219 }
5220
5221 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5222 {
5223 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5224 char buf[16];
5225 if (shard_id) {
5226 *shard_id = val % max_shards;
5227 }
5228 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5229 name = prefix + buf;
5230 }
5231
5232 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5233 {
5234 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5235 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5236 char buf[16];
5237 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5238 name = prefix + buf;
5239 }
5240
5241 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5242 {
5243 char buf[16];
5244 snprintf(buf, sizeof(buf), "%u", shard_id);
5245 name = prefix + buf;
5246
5247 }
5248
5249 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5250 {
5251 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5252 }
5253
5254 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5255 {
5256 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5257
5258 }
5259
5260 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5261 {
5262 librados::IoCtx io_ctx;
5263
5264 int r = time_log_add_init(io_ctx);
5265 if (r < 0) {
5266 return r;
5267 }
5268
5269 ObjectWriteOperation op;
5270 utime_t t(ut);
5271 cls_log_add(op, t, section, key, bl);
5272
5273 return io_ctx.operate(oid, &op);
5274 }
5275
5276 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5277 librados::AioCompletion *completion, bool monotonic_inc)
5278 {
5279 librados::IoCtx io_ctx;
5280
5281 int r = time_log_add_init(io_ctx);
5282 if (r < 0) {
5283 return r;
5284 }
5285
5286 ObjectWriteOperation op;
5287 cls_log_add(op, entries, monotonic_inc);
5288
5289 if (!completion) {
5290 r = io_ctx.operate(oid, &op);
5291 } else {
5292 r = io_ctx.aio_operate(oid, completion, &op);
5293 }
5294 return r;
5295 }
5296
5297 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5298 int max_entries, list<cls_log_entry>& entries,
5299 const string& marker,
5300 string *out_marker,
5301 bool *truncated)
5302 {
5303 librados::IoCtx io_ctx;
5304
5305 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5306 if (r < 0)
5307 return r;
5308 librados::ObjectReadOperation op;
5309
5310 utime_t st(start_time);
5311 utime_t et(end_time);
5312
5313 cls_log_list(op, st, et, marker, max_entries, entries,
5314 out_marker, truncated);
5315
5316 bufferlist obl;
5317
5318 int ret = io_ctx.operate(oid, &op, &obl);
5319 if (ret < 0)
5320 return ret;
5321
5322 return 0;
5323 }
5324
5325 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5326 {
5327 librados::IoCtx io_ctx;
5328
5329 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5330 if (r < 0)
5331 return r;
5332 librados::ObjectReadOperation op;
5333
5334 cls_log_info(op, header);
5335
5336 bufferlist obl;
5337
5338 int ret = io_ctx.operate(oid, &op, &obl);
5339 if (ret < 0)
5340 return ret;
5341
5342 return 0;
5343 }
5344
5345 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5346 {
5347 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5348 if (r < 0)
5349 return r;
5350
5351 librados::ObjectReadOperation op;
5352
5353 cls_log_info(op, header);
5354
5355 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5356 if (ret < 0)
5357 return ret;
5358
5359 return 0;
5360 }
5361
5362 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5363 const string& from_marker, const string& to_marker,
5364 librados::AioCompletion *completion)
5365 {
5366 librados::IoCtx io_ctx;
5367
5368 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5369 if (r < 0)
5370 return r;
5371
5372 utime_t st(start_time);
5373 utime_t et(end_time);
5374
5375 ObjectWriteOperation op;
5376 cls_log_trim(op, st, et, from_marker, to_marker);
5377
5378 if (!completion) {
5379 r = io_ctx.operate(oid, &op);
5380 } else {
5381 r = io_ctx.aio_operate(oid, completion, &op);
5382 }
5383 return r;
5384 }
5385
5386 string RGWRados::objexp_hint_get_shardname(int shard_num)
5387 {
5388 char buf[32];
5389 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5390
5391 string objname("obj_delete_at_hint.");
5392 return objname + buf;
5393 }
5394
5395 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5396 {
5397 string obj_key = key.name + key.instance;
5398 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5399 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
5400 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
5401 sid = rgw_shards_mod(sid2, num_shards);
5402 return sid;
5403 }
5404
5405 static string objexp_hint_get_keyext(const string& tenant_name,
5406 const string& bucket_name,
5407 const string& bucket_id,
5408 const rgw_obj_key& obj_key)
5409 {
5410 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5411 ":" + obj_key.name + ":" + obj_key.instance;
5412 }
5413
5414 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5415 const string& tenant_name,
5416 const string& bucket_name,
5417 const string& bucket_id,
5418 const rgw_obj_index_key& obj_key)
5419 {
5420 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5421 bucket_id, obj_key);
5422 objexp_hint_entry he = {
5423 .tenant = tenant_name,
5424 .bucket_name = bucket_name,
5425 .bucket_id = bucket_id,
5426 .obj_key = obj_key,
5427 .exp_time = delete_at };
5428 bufferlist hebl;
5429 ::encode(he, hebl);
5430 ObjectWriteOperation op;
5431 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5432
5433 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5434 return objexp_pool_ctx.operate(shard_name, &op);
5435 }
5436
5437 void RGWRados::objexp_get_shard(int shard_num,
5438 string& shard) /* out */
5439 {
5440 shard = objexp_hint_get_shardname(shard_num);
5441 }
5442
5443 int RGWRados::objexp_hint_list(const string& oid,
5444 const ceph::real_time& start_time,
5445 const ceph::real_time& end_time,
5446 const int max_entries,
5447 const string& marker,
5448 list<cls_timeindex_entry>& entries, /* out */
5449 string *out_marker, /* out */
5450 bool *truncated) /* out */
5451 {
5452 librados::ObjectReadOperation op;
5453 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5454 out_marker, truncated);
5455
5456 bufferlist obl;
5457 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5458
5459 if ((ret < 0 ) && (ret != -ENOENT)) {
5460 return ret;
5461 }
5462
5463 if ((ret == -ENOENT) && truncated) {
5464 *truncated = false;
5465 }
5466
5467 return 0;
5468 }
5469
5470 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5471 objexp_hint_entry& hint_entry) /* out */
5472 {
5473 try {
5474 bufferlist::iterator iter = ti_entry.value.begin();
5475 ::decode(hint_entry, iter);
5476 } catch (buffer::error& err) {
5477 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5478 }
5479
5480 return 0;
5481 }
5482
5483 int RGWRados::objexp_hint_trim(const string& oid,
5484 const ceph::real_time& start_time,
5485 const ceph::real_time& end_time,
5486 const string& from_marker,
5487 const string& to_marker)
5488 {
5489 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5490 from_marker, to_marker);
5491 if ((ret < 0 ) && (ret != -ENOENT)) {
5492 return ret;
5493 }
5494
5495 return 0;
5496 }
5497
5498 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5499 string& zone_id, string& owner_id) {
5500 librados::IoCtx io_ctx;
5501
5502 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5503 if (r < 0) {
5504 return r;
5505 }
5506 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5507 utime_t ut(msec / 1000, msec % 1000);
5508
5509 rados::cls::lock::Lock l(log_lock_name);
5510 l.set_duration(ut);
5511 l.set_cookie(owner_id);
5512 l.set_tag(zone_id);
5513 l.set_renew(true);
5514
5515 return l.lock_exclusive(&io_ctx, oid);
5516 }
5517
5518 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5519 librados::IoCtx io_ctx;
5520
5521 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5522 if (r < 0) {
5523 return r;
5524 }
5525
5526 rados::cls::lock::Lock l(log_lock_name);
5527 l.set_tag(zone_id);
5528 l.set_cookie(owner_id);
5529
5530 return l.unlock(&io_ctx, oid);
5531 }
5532
5533 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5534 {
5535 bufferlist::iterator i = bl.begin();
5536 RGWAccessControlPolicy policy(cct);
5537 try {
5538 policy.decode_owner(i);
5539 } catch (buffer::error& err) {
5540 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5541 return -EIO;
5542 }
5543 *owner = policy.get_owner();
5544 return 0;
5545 }
5546
5547 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5548 {
5549 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5550 if (aiter == attrset.end())
5551 return -EIO;
5552
5553 bufferlist& bl = aiter->second;
5554 bufferlist::iterator iter = bl.begin();
5555 try {
5556 policy->decode(iter);
5557 } catch (buffer::error& err) {
5558 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5559 return -EIO;
5560 }
5561 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5562 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5563 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5564 s3policy->to_xml(*_dout);
5565 *_dout << dendl;
5566 }
5567 return 0;
5568 }
5569
5570
5571 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5572 {
5573 rgw_bucket bucket = bucket_info.bucket;
5574 bucket.update_bucket_id(new_bucket_id);
5575
5576 RGWObjectCtx obj_ctx(store);
5577
5578 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5579 if (ret < 0) {
5580 return ret;
5581 }
5582
5583 return 0;
5584 }
5585
5586 /**
5587 * get listing of the objects in a bucket.
5588 *
5589 * max: maximum number of results to return
5590 * bucket: bucket to list contents of
5591 * prefix: only return results that match this prefix
5592 * delim: do not include results that match this string.
5593 * Any skipped results will have the matching portion of their name
5594 * inserted in common_prefixes with a "true" mark.
5595 * marker: if filled in, begin the listing with this object.
5596 * end_marker: if filled in, end the listing with this object.
5597 * result: the objects are put in here.
5598 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5599 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5600 */
// Lists up to `max` entries of the target bucket into *result. Names that
// contain params.delim after params.prefix are not returned individually;
// their prefix (up to and including the delimiter position) is recorded once
// in *common_prefixes when that pointer is non-null, and skipped otherwise.
// Returns 0 on success, -EINVAL if encode_utf8() fails for the delimiter, or
// the negative value returned by cls_bucket_list(). *is_truncated is written
// only when the pointer is non-null.
5601 int RGWRados::Bucket::List::list_objects(int64_t max,
5602 vector<rgw_bucket_dir_entry> *result,
5603 map<string, bool> *common_prefixes,
5604 bool *is_truncated)
5605 {
5606 RGWRados *store = target->get_store();
5607 CephContext *cct = store->ctx();
5608 int shard_id = target->get_shard_id();
5609
5610 int count = 0;
5611 bool truncated = true;
// request at least rgw_list_bucket_min_readahead entries per index call;
// note that `max` is int64_t and the result is narrowed to int here
5612 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5613
5614 result->clear();
5615
// translate the caller's start marker into an index key
5616 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5617 rgw_obj_index_key cur_marker;
5618 marker_obj.get_index_key(&cur_marker);
5619
// same for the optional end marker; it is only honoured when non-empty
5620 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5621 params.ns);
5622 rgw_obj_index_key cur_end_marker;
5623 end_marker_obj.get_index_key(&cur_end_marker);
5624 const bool cur_end_marker_valid = !params.end_marker.empty();
5625
// index-key form of the prefix within the requested namespace
5626 rgw_obj_key prefix_obj(params.prefix);
5627 prefix_obj.ns = params.ns;
5628 string cur_prefix = prefix_obj.get_index_key_name();
5629
5630 string bigger_than_delim;
5631
5632 if (!params.delim.empty()) {
// decode the delimiter as a UTF-8 code point and re-encode (code point + 1);
// the result is appended to a common prefix below to build a marker that
// skips past that prefix
5633 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
// NOTE(review): variable-length array, a compiler extension in C++
5634 char buf[params.delim.size() + 16];
5635 int r = encode_utf8(val + 1, (unsigned char *)buf);
5636 if (r < 0) {
5637 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5638 return -EINVAL;
5639 }
5640 buf[r] = '\0';
5641
5642 bigger_than_delim = buf;
5643
5644 /* if marker points at a common prefix, fast forward it into its upperbound string */
5645 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5646 if (delim_pos >= 0) {
5647 string s = cur_marker.name.substr(0, delim_pos);
5648 s.append(bigger_than_delim);
5649 cur_marker = s;
5650 }
5651 }
5652
5653 string skip_after_delim;
// keep fetching from the index while it reports more entries and no more
// than `max` results have been counted
5654 while (truncated && count <= max) {
// a common prefix was recorded in an earlier pass: advance the marker to the
// computed skip target if that is further along
5655 if (skip_after_delim > cur_marker.name) {
5656 cur_marker = skip_after_delim;
5657 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5658 }
5659 std::map<string, rgw_bucket_dir_entry> ent_map;
// ask for the remaining read-ahead budget plus one; &truncated and
// &cur_marker are passed so the call can report back (presumably whether more
// entries exist and where to resume -- confirm against cls_bucket_list)
5660 int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
5661 read_ahead + 1 - count, params.list_versions, ent_map,
5662 &truncated, &cur_marker);
5663 if (r < 0)
5664 return r;
5665
5666 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
5667 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5668 rgw_bucket_dir_entry& entry = eiter->second;
5669 rgw_obj_index_key index_key = entry.key;
5670
5671 rgw_obj_key obj(index_key);
5672
5673 /* note that parse_raw_oid() here will not set the correct object's instance, as
5674 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5675 * not needed for the checks here and we end up using the raw entry for the return vector
5676 */
5677 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5678 if (!valid) {
5679 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5680 continue;
5681 }
5682 bool check_ns = (obj.ns == params.ns);
// entries that are not visible are skipped unless versions are being listed
5683 if (!params.list_versions && !entry.is_visible()) {
5684 continue;
5685 }
5686
5687 if (params.enforce_ns && !check_ns) {
5688 if (!params.ns.empty()) {
5689 /* we've iterated past the namespace we're searching -- done now */
5690 truncated = false;
5691 goto done;
5692 }
5693
5694 /* we're not looking at the namespace this object is in, next! */
5695 continue;
5696 }
5697
// reached (or passed) the end marker: the listing is complete
5698 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5699 truncated = false;
5700 goto done;
5701 }
5702
// while still under the limit, remember the last key examined in both
// params.marker and next_marker (next_marker is not declared in this function)
5703 if (count < max) {
5704 params.marker = index_key;
5705 next_marker = index_key;
5706 }
5707
5708 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5709 continue;
5710
5711 if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5712 continue;
5713
5714 if (!params.delim.empty()) {
// look for the delimiter only in the part of the name after the prefix
5715 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5716
5717 if (delim_pos >= 0) {
5718 string prefix_key = obj.name.substr(0, delim_pos + 1);
5719
5720 if (common_prefixes &&
5721 common_prefixes->find(prefix_key) == common_prefixes->end()) {
// limit already reached before this new common prefix could be reported
5722 if (count >= max) {
5723 truncated = true;
5724 goto done;
5725 }
5726 next_marker = prefix_key;
5727 (*common_prefixes)[prefix_key] = true;
5728
// build the skip target from cur_marker: its name up to the first delimiter
// after the prefix, followed by bigger_than_delim
5729 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5730
5731 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
5732 skip_after_delim.append(bigger_than_delim);
5733
5734 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5735
// each newly recorded common prefix counts toward the limit
5736 count++;
5737 }
5738
// entries under a common prefix are never added to *result
5739 continue;
5740 }
5741 }
5742
// limit reached with at least one more matching entry pending
5743 if (count >= max) {
5744 truncated = true;
5745 goto done;
5746 }
5747
5748 result->emplace_back(std::move(entry));
5749 count++;
5750 }
5751
5752 // Either the back-end telling us truncated, or we don't consume all
5753 // items returned per the amount caller request
5754 truncated = (truncated || eiter != ent_map.end());
5755 }
5756
5757 done:
5758 if (is_truncated)
5759 *is_truncated = truncated;
5760
5761 return 0;
5762 }
5763
5764 /**
5765 * create a rados pool, associated meta info
5766 * returns 0 on success, -ERR# otherwise.
5767 */
5768 int RGWRados::create_pool(const rgw_pool& pool)
5769 {
5770 int ret = 0;
5771
5772 librados::Rados *rad = get_rados_handle();
5773 ret = rad->pool_create(pool.name.c_str(), 0);
5774 if (ret == -EEXIST)
5775 ret = 0;
5776 else if (ret == -ERANGE) {
5777 ldout(cct, 0)
5778 << __func__
5779 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
5780 << " (this can be due to a pool or placement group misconfiguration, e.g."
5781 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
5782 << dendl;
5783 }
5784 if (ret < 0)
5785 return ret;
5786
5787 librados::IoCtx io_ctx;
5788 ret = rad->ioctx_create(pool.name.c_str(), io_ctx);
5789 if (ret < 0)
5790 return ret;
5791
5792 ret = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
5793 if (ret < 0 && ret != -EOPNOTSUPP)
5794 return ret;
5795 return 0;
5796 }
5797
5798 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5799 {
5800 librados::IoCtx index_ctx; // context for new bucket
5801
5802 string dir_oid = dir_oid_prefix;
5803 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5804 if (r < 0) {
5805 return r;
5806 }
5807
5808 dir_oid.append(bucket_info.bucket.bucket_id);
5809
5810 map<int, string> bucket_objs;
5811 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5812
5813 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5814 }
5815
5816 void RGWRados::create_bucket_id(string *bucket_id)
5817 {
5818 uint64_t iid = instance_id();
5819 uint64_t bid = next_bucket_id();
5820 char buf[get_zone_params().get_id().size() + 48];
5821 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5822 *bucket_id = buf;
5823 }
5824
5825 int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5826 const string& zonegroup_id,
5827 const string& placement_rule,
5828 const string& swift_ver_location,
5829 const RGWQuotaInfo * pquota_info,
5830 map<std::string, bufferlist>& attrs,
5831 RGWBucketInfo& info,
5832 obj_version *pobjv,
5833 obj_version *pep_objv,
5834 real_time creation_time,
5835 rgw_bucket *pmaster_bucket,
5836 uint32_t *pmaster_num_shards,
5837 bool exclusive)
5838 {
5839 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5840 string selected_placement_rule_name;
5841 RGWZonePlacementInfo rule_info;
5842
5843 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5844 int ret = 0;
5845 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5846 &selected_placement_rule_name, &rule_info);
5847 if (ret < 0)
5848 return ret;
5849
5850 if (!pmaster_bucket) {
5851 create_bucket_id(&bucket.marker);
5852 bucket.bucket_id = bucket.marker;
5853 } else {
5854 bucket.marker = pmaster_bucket->marker;
5855 bucket.bucket_id = pmaster_bucket->bucket_id;
5856 }
5857
5858 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5859
5860 if (pobjv) {
5861 objv_tracker.write_version = *pobjv;
5862 } else {
5863 objv_tracker.generate_new_write_ver(cct);
5864 }
5865
5866 info.bucket = bucket;
5867 info.owner = owner.user_id;
5868 info.zonegroup = zonegroup_id;
5869 info.placement_rule = selected_placement_rule_name;
5870 info.index_type = rule_info.index_type;
5871 info.swift_ver_location = swift_ver_location;
5872 info.swift_versioning = (!swift_ver_location.empty());
5873 if (pmaster_num_shards) {
5874 info.num_shards = *pmaster_num_shards;
5875 } else {
5876 info.num_shards = bucket_index_max_shards;
5877 }
5878 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5879 info.requester_pays = false;
5880 if (real_clock::is_zero(creation_time)) {
5881 info.creation_time = ceph::real_clock::now();
5882 } else {
5883 info.creation_time = creation_time;
5884 }
5885 if (pquota_info) {
5886 info.quota = *pquota_info;
5887 }
5888
5889 int r = init_bucket_index(info, info.num_shards);
5890 if (r < 0) {
5891 return r;
5892 }
5893
5894 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
5895 if (ret == -EEXIST) {
5896 librados::IoCtx index_ctx;
5897 map<int, string> bucket_objs;
5898 int r = open_bucket_index(info, index_ctx, bucket_objs);
5899 if (r < 0)
5900 return r;
5901
5902 /* we need to reread the info and return it, caller will have a use for it */
5903 RGWObjVersionTracker instance_ver = info.objv_tracker;
5904 info.objv_tracker.clear();
5905 RGWObjectCtx obj_ctx(this);
5906 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
5907 if (r < 0) {
5908 if (r == -ENOENT) {
5909 continue;
5910 }
5911 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
5912 return r;
5913 }
5914
5915 /* only remove it if it's a different bucket instance */
5916 if (info.bucket.bucket_id != bucket.bucket_id) {
5917 /* remove bucket meta instance */
5918 string entry = bucket.get_key();
5919 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
5920 if (r < 0)
5921 return r;
5922
5923 map<int, string>::const_iterator biter;
5924 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
5925 // Do best effort removal
5926 index_ctx.remove(biter->second);
5927 }
5928 }
5929 /* ret == -ENOENT here */
5930 }
5931 return ret;
5932 }
5933
5934 /* this is highly unlikely */
5935 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
5936 return -ENOENT;
5937 }
5938
5939 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5940 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5941
5942 {
5943 /* first check that zonegroup exists within current period. */
5944 RGWZoneGroup zonegroup;
5945 int ret = get_zonegroup(zonegroup_id, zonegroup);
5946 if (ret < 0) {
5947 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5948 return ret;
5949 }
5950
5951 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5952 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
5953
5954 if (!request_rule.empty()) {
5955 titer = zonegroup.placement_targets.find(request_rule);
5956 if (titer == zonegroup.placement_targets.end()) {
5957 ldout(cct, 0) << "could not find requested placement id " << request_rule
5958 << " within zonegroup " << dendl;
5959 return -ERR_INVALID_LOCATION_CONSTRAINT;
5960 }
5961 } else if (!user_info.default_placement.empty()) {
5962 titer = zonegroup.placement_targets.find(user_info.default_placement);
5963 if (titer == zonegroup.placement_targets.end()) {
5964 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
5965 << " within zonegroup " << dendl;
5966 return -ERR_INVALID_LOCATION_CONSTRAINT;
5967 }
5968 } else {
5969 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
5970 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
5971 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
5972 } else {
5973 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
5974 if (titer == zonegroup.placement_targets.end()) {
5975 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
5976 << " within zonegroup " << dendl;
5977 return -ERR_INVALID_LOCATION_CONSTRAINT;
5978 }
5979 }
5980 }
5981
5982 /* now check tag for the rule, whether user is permitted to use rule */
5983 const auto& target_rule = titer->second;
5984 if (!target_rule.user_permitted(user_info.placement_tags)) {
5985 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
5986 return -EPERM;
5987 }
5988
5989 if (pselected_rule_name)
5990 *pselected_rule_name = titer->first;
5991
5992 return select_bucket_location_by_rule(titer->first, rule_info);
5993 }
5994
5995 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5996 {
5997 if (location_rule.empty()) {
5998 /* we can only reach here if we're trying to set a bucket location from a bucket
5999 * created on a different zone, using a legacy / default pool configuration
6000 */
6001 return select_legacy_bucket_placement(rule_info);
6002 }
6003
6004 /*
6005 * make sure that zone has this rule configured. We're
6006 * checking it for the local zone, because that's where this bucket object is going to
6007 * reside.
6008 */
6009 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
6010 if (piter == get_zone_params().placement_pools.end()) {
6011 /* couldn't find, means we cannot really place data for this bucket in this zone */
6012 if (get_zonegroup().equals(zonegroup.get_id())) {
6013 /* that's a configuration error, zone should have that rule, as we're within the requested
6014 * zonegroup */
6015 return -EINVAL;
6016 } else {
6017 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
6018 return 0;
6019 }
6020 }
6021
6022 RGWZonePlacementInfo& placement_info = piter->second;
6023
6024 if (rule_info) {
6025 *rule_info = placement_info;
6026 }
6027
6028 return 0;
6029 }
6030
6031 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
6032 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6033 {
6034 if (!get_zone_params().placement_pools.empty()) {
6035 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
6036 pselected_rule_name, rule_info);
6037 }
6038
6039 if (pselected_rule_name) {
6040 pselected_rule_name->clear();
6041 }
6042
6043 return select_legacy_bucket_placement(rule_info);
6044 }
6045
6046 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
6047 {
6048 bufferlist map_bl;
6049 map<string, bufferlist> m;
6050 string pool_name;
6051 bool write_map = false;
6052
6053 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6054
6055 RGWObjectCtx obj_ctx(this);
6056 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6057 if (ret < 0) {
6058 goto read_omap;
6059 }
6060
6061 try {
6062 bufferlist::iterator iter = map_bl.begin();
6063 ::decode(m, iter);
6064 } catch (buffer::error& err) {
6065 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6066 }
6067
6068 read_omap:
6069 if (m.empty()) {
6070 bufferlist header;
6071 ret = omap_get_all(obj, header, m);
6072
6073 write_map = true;
6074 }
6075
6076 if (ret < 0 || m.empty()) {
6077 vector<rgw_pool> pools;
6078 string s = string("default.") + default_storage_pool_suffix;
6079 pools.push_back(rgw_pool(s));
6080 vector<int> retcodes;
6081 bufferlist bl;
6082 ret = create_pools(pools, retcodes);
6083 if (ret < 0)
6084 return ret;
6085 ret = omap_set(obj, s, bl);
6086 if (ret < 0)
6087 return ret;
6088 m[s] = bl;
6089 }
6090
6091 if (write_map) {
6092 bufferlist new_bl;
6093 ::encode(m, new_bl);
6094 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6095 if (ret < 0) {
6096 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6097 }
6098 }
6099
6100 map<string, bufferlist>::iterator miter;
6101 if (m.size() > 1) {
6102 vector<string> v;
6103 for (miter = m.begin(); miter != m.end(); ++miter) {
6104 v.push_back(miter->first);
6105 }
6106
6107 uint32_t r;
6108 ret = get_random_bytes((char *)&r, sizeof(r));
6109 if (ret < 0)
6110 return ret;
6111
6112 int i = r % v.size();
6113 pool_name = v[i];
6114 } else {
6115 miter = m.begin();
6116 pool_name = miter->first;
6117 }
6118
6119 rule_info->data_pool = pool_name;
6120 rule_info->data_extra_pool = pool_name;
6121 rule_info->index_pool = pool_name;
6122 rule_info->index_type = RGWBIType_Normal;
6123
6124 return 0;
6125 }
6126
6127 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6128 {
6129 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6130 }
6131
6132 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6133 {
6134 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6135
6136 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6137 }
6138
6139 int RGWRados::update_placement_map()
6140 {
6141 bufferlist header;
6142 map<string, bufferlist> m;
6143 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6144 int ret = omap_get_all(obj, header, m);
6145 if (ret < 0)
6146 return ret;
6147
6148 bufferlist new_bl;
6149 ::encode(m, new_bl);
6150 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6151 if (ret < 0) {
6152 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6153 }
6154
6155 return ret;
6156 }
6157
6158 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6159 {
6160 librados::Rados *rad = get_rados_handle();
6161 int ret = rad->pool_lookup(new_pool.name.c_str());
6162 if (ret < 0) // DNE, or something
6163 return ret;
6164
6165 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6166 bufferlist empty_bl;
6167 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6168
6169 // don't care about return value
6170 update_placement_map();
6171
6172 return ret;
6173 }
6174
6175 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6176 {
6177 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6178 int ret = omap_del(obj, old_pool.to_str());
6179
6180 // don't care about return value
6181 update_placement_map();
6182
6183 return ret;
6184 }
6185
6186 int RGWRados::list_placement_set(set<rgw_pool>& names)
6187 {
6188 bufferlist header;
6189 map<string, bufferlist> m;
6190
6191 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6192 int ret = omap_get_all(obj, header, m);
6193 if (ret < 0)
6194 return ret;
6195
6196 names.clear();
6197 map<string, bufferlist>::iterator miter;
6198 for (miter = m.begin(); miter != m.end(); ++miter) {
6199 names.insert(rgw_pool(miter->first));
6200 }
6201
6202 return names.size();
6203 }
6204
6205 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6206 {
6207 vector<librados::PoolAsyncCompletion *> completions;
6208 vector<int> rets;
6209
6210 librados::Rados *rad = get_rados_handle();
6211 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6212 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6213 completions.push_back(c);
6214 rgw_pool& pool = *iter;
6215 int ret = rad->pool_create_async(pool.name.c_str(), c);
6216 rets.push_back(ret);
6217 }
6218
6219 vector<int>::iterator riter;
6220 vector<librados::PoolAsyncCompletion *>::iterator citer;
6221
6222 bool error = false;
6223 assert(rets.size() == completions.size());
6224 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6225 int r = *riter;
6226 PoolAsyncCompletion *c = *citer;
6227 if (r == 0) {
6228 c->wait();
6229 r = c->get_return_value();
6230 if (r < 0) {
6231 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6232 error = true;
6233 }
6234 }
6235 c->release();
6236 retcodes.push_back(r);
6237 }
6238 if (error) {
6239 return 0;
6240 }
6241
6242 std::vector<librados::IoCtx> io_ctxs;
6243 retcodes.clear();
6244 for (auto pool : pools) {
6245 io_ctxs.emplace_back();
6246 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6247 if (ret < 0) {
6248 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6249 error = true;
6250 }
6251 retcodes.push_back(ret);
6252 }
6253 if (error) {
6254 return 0;
6255 }
6256
6257 completions.clear();
6258 for (auto &io_ctx : io_ctxs) {
6259 librados::PoolAsyncCompletion *c =
6260 librados::Rados::pool_async_create_completion();
6261 completions.push_back(c);
6262 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6263 false, c);
6264 assert(ret == 0);
6265 }
6266
6267 retcodes.clear();
6268 for (auto c : completions) {
6269 c->wait();
6270 int ret = c->get_return_value();
6271 if (ret == -EOPNOTSUPP) {
6272 ret = 0;
6273 } else if (ret < 0) {
6274 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6275 << dendl;
6276 error = true;
6277 }
6278 c->release();
6279 retcodes.push_back(ret);
6280 }
6281 return 0;
6282 }
6283
6284 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6285 {
6286 string oid, key;
6287 get_obj_bucket_and_oid_loc(obj, oid, key);
6288
6289 rgw_pool pool;
6290 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6291 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6292 return -EIO;
6293 }
6294
6295 int r = open_pool_ctx(pool, *ioctx);
6296 if (r < 0) {
6297 return r;
6298 }
6299
6300 ioctx->locator_set_key(key);
6301
6302 return 0;
6303 }
6304
6305 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6306 {
6307 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6308
6309 rgw_pool pool;
6310 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6311 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6312 return -EIO;
6313 }
6314
6315 int r = open_pool_ctx(pool, ref->ioctx);
6316 if (r < 0) {
6317 return r;
6318 }
6319
6320 ref->ioctx.locator_set_key(ref->key);
6321
6322 return 0;
6323 }
6324
6325 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6326 {
6327 ref->oid = obj.oid;
6328 ref->key = obj.loc;
6329
6330 int r;
6331
6332 if (ref->oid.empty()) {
6333 ref->oid = obj.pool.to_str();
6334 ref->pool = get_zone_params().domain_root;
6335 } else {
6336 ref->pool = obj.pool;
6337 }
6338 r = open_pool_ctx(ref->pool, ref->ioctx);
6339 if (r < 0)
6340 return r;
6341
6342 ref->ioctx.locator_set_key(ref->key);
6343
6344 return 0;
6345 }
6346
6347 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6348 {
6349 return get_raw_obj_ref(obj, ref);
6350 }
6351
6352 /*
6353 * fixes an issue where head objects were supposed to have a locator created, but ended
6354 * up without one
6355 */
6356 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6357 {
6358 const rgw_bucket& bucket = bucket_info.bucket;
6359 string oid;
6360 string locator;
6361
6362 rgw_obj obj(bucket, key);
6363
6364 get_obj_bucket_and_oid_loc(obj, oid, locator);
6365
6366 if (locator.empty()) {
6367 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6368 return 0;
6369 }
6370
6371 librados::IoCtx ioctx;
6372
6373 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6374 if (ret < 0) {
6375 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6376 return ret;
6377 }
6378 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6379
6380 uint64_t size;
6381 bufferlist data;
6382
6383 struct timespec mtime_ts;
6384 map<string, bufferlist> attrs;
6385 librados::ObjectReadOperation op;
6386 op.getxattrs(&attrs, NULL);
6387 op.stat2(&size, &mtime_ts, NULL);
6388 #define HEAD_SIZE 512 * 1024
6389 op.read(0, HEAD_SIZE, &data, NULL);
6390
6391 ret = ioctx.operate(oid, &op, NULL);
6392 if (ret < 0) {
6393 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6394 return ret;
6395 }
6396
6397 if (size > HEAD_SIZE) {
6398 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6399 return -EIO;
6400 }
6401
6402 if (size != data.length()) {
6403 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6404 return -EIO;
6405 }
6406
6407 if (copy_obj) {
6408 librados::ObjectWriteOperation wop;
6409
6410 wop.mtime2(&mtime_ts);
6411
6412 map<string, bufferlist>::iterator iter;
6413 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6414 wop.setxattr(iter->first.c_str(), iter->second);
6415 }
6416
6417 wop.write(0, data);
6418
6419 ioctx.locator_set_key(locator);
6420 ioctx.operate(oid, &wop);
6421 }
6422
6423 if (remove_bad) {
6424 ioctx.locator_set_key(string());
6425
6426 ret = ioctx.remove(oid);
6427 if (ret < 0) {
6428 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6429 return ret;
6430 }
6431 }
6432
6433 return 0;
6434 }
6435
6436 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6437 const string& src_oid, const string& src_locator,
6438 librados::IoCtx& dst_ioctx,
6439 const string& dst_oid, const string& dst_locator)
6440 {
6441
6442 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6443 bool done = false;
6444 uint64_t chunk_size = COPY_BUF_SIZE;
6445 uint64_t ofs = 0;
6446 int ret = 0;
6447 real_time mtime;
6448 struct timespec mtime_ts;
6449 uint64_t size;
6450
6451 if (src_oid == dst_oid && src_locator == dst_locator) {
6452 return 0;
6453 }
6454
6455 src_ioctx.locator_set_key(src_locator);
6456 dst_ioctx.locator_set_key(dst_locator);
6457
6458 do {
6459 bufferlist data;
6460 ObjectReadOperation rop;
6461 ObjectWriteOperation wop;
6462
6463 if (ofs == 0) {
6464 rop.stat2(&size, &mtime_ts, NULL);
6465 mtime = real_clock::from_timespec(mtime_ts);
6466 }
6467 rop.read(ofs, chunk_size, &data, NULL);
6468 ret = src_ioctx.operate(src_oid, &rop, NULL);
6469 if (ret < 0) {
6470 goto done_err;
6471 }
6472
6473 if (data.length() == 0) {
6474 break;
6475 }
6476
6477 if (ofs == 0) {
6478 wop.create(true); /* make it exclusive */
6479 wop.mtime2(&mtime_ts);
6480 mtime = real_clock::from_timespec(mtime_ts);
6481 }
6482 wop.write(ofs, data);
6483 ret = dst_ioctx.operate(dst_oid, &wop);
6484 ofs += data.length();
6485 done = data.length() != chunk_size;
6486 } while (!done);
6487
6488 if (ofs != size) {
6489 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6490 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6491 ret = -EIO;
6492 goto done_err;
6493 }
6494
6495 src_ioctx.remove(src_oid);
6496
6497 return 0;
6498
6499 done_err:
6500 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6501 return ret;
6502 }
6503
6504 /*
6505 * fixes an issue where head objects were supposed to have a locator created, but ended
6506 * up without one
6507 */
6508 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6509 {
6510 const rgw_bucket& bucket = bucket_info.bucket;
6511 rgw_obj obj(bucket, key);
6512
6513 if (need_fix) {
6514 *need_fix = false;
6515 }
6516
6517 rgw_rados_ref ref;
6518 int r = get_obj_head_ref(bucket_info, obj, &ref);
6519 if (r < 0) {
6520 return r;
6521 }
6522
6523 RGWObjState *astate = NULL;
6524 RGWObjectCtx rctx(this);
6525 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6526 if (r < 0)
6527 return r;
6528
6529 if (astate->has_manifest) {
6530 RGWObjManifest::obj_iterator miter;
6531 RGWObjManifest& manifest = astate->manifest;
6532 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6533 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6534 rgw_obj loc;
6535 string oid;
6536 string locator;
6537
6538 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6539
6540 if (loc.key.ns.empty()) {
6541 /* continue, we're only interested in tail objects */
6542 continue;
6543 }
6544
6545 get_obj_bucket_and_oid_loc(loc, oid, locator);
6546 ref.ioctx.locator_set_key(locator);
6547
6548 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6549
6550 r = ref.ioctx.stat(oid, NULL, NULL);
6551 if (r != -ENOENT) {
6552 continue;
6553 }
6554
6555 string bad_loc;
6556 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6557
6558 /* create a new ioctx with the bad locator */
6559 librados::IoCtx src_ioctx;
6560 src_ioctx.dup(ref.ioctx);
6561 src_ioctx.locator_set_key(bad_loc);
6562
6563 r = src_ioctx.stat(oid, NULL, NULL);
6564 if (r != 0) {
6565 /* cannot find a broken part */
6566 continue;
6567 }
6568 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6569 if (need_fix) {
6570 *need_fix = true;
6571 }
6572 if (fix) {
6573 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6574 if (r < 0) {
6575 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6576 }
6577 }
6578 }
6579 }
6580
6581 return 0;
6582 }
6583
6584 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6585 {
6586 bucket = _bucket;
6587
6588 RGWObjectCtx obj_ctx(store);
6589
6590 RGWBucketInfo bucket_info;
6591 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6592 if (ret < 0) {
6593 return ret;
6594 }
6595
6596 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6597 if (ret < 0) {
6598 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6599 return ret;
6600 }
6601 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6602
6603 return 0;
6604 }
6605
6606 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6607 {
6608 bucket = _bucket;
6609 shard_id = sid;
6610
6611 RGWObjectCtx obj_ctx(store);
6612
6613 RGWBucketInfo bucket_info;
6614 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6615 if (ret < 0) {
6616 return ret;
6617 }
6618
6619 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6620 if (ret < 0) {
6621 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6622 return ret;
6623 }
6624 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6625
6626 return 0;
6627 }
6628
6629 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
6630 {
6631 bucket = bucket_info.bucket;
6632 shard_id = sid;
6633
6634 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6635 if (ret < 0) {
6636 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6637 return ret;
6638 }
6639 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6640
6641 return 0;
6642 }
6643
6644
6645 /* Execute @handler on last item in bucket listing for bucket specified
6646 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6647 * to objects matching these criterias. */
6648 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6649 const std::string& obj_prefix,
6650 const std::string& obj_delim,
6651 std::function<int(const rgw_bucket_dir_entry&)> handler)
6652 {
6653 RGWRados::Bucket target(this, bucket_info);
6654 RGWRados::Bucket::List list_op(&target);
6655
6656 list_op.params.prefix = obj_prefix;
6657 list_op.params.delim = obj_delim;
6658
6659 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6660 << ", obj_prefix=" << obj_prefix
6661 << ", obj_delim=" << obj_delim
6662 << dendl;
6663
6664 bool is_truncated = false;
6665
6666 boost::optional<rgw_bucket_dir_entry> last_entry;
6667 /* We need to rewind to the last object in a listing. */
6668 do {
6669 /* List bucket entries in chunks. */
6670 static constexpr int MAX_LIST_OBJS = 100;
6671 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6672
6673 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6674 &is_truncated);
6675 if (ret < 0) {
6676 return ret;
6677 } else if (!entries.empty()) {
6678 last_entry = entries.back();
6679 }
6680 } while (is_truncated);
6681
6682 if (last_entry) {
6683 return handler(*last_entry);
6684 }
6685
6686 /* Empty listing - no items we can run handler on. */
6687 return 0;
6688 }
6689
6690
6691 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6692 const rgw_user& user,
6693 RGWBucketInfo& bucket_info,
6694 rgw_obj& obj)
6695 {
6696 if (! swift_versioning_enabled(bucket_info)) {
6697 return 0;
6698 }
6699
6700 obj_ctx.obj.set_atomic(obj);
6701
6702 RGWObjState * state = nullptr;
6703 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6704 if (r < 0) {
6705 return r;
6706 }
6707
6708 if (!state->exists) {
6709 return 0;
6710 }
6711
6712 string client_id;
6713 string op_id;
6714
6715 const string& src_name = obj.get_oid();
6716 char buf[src_name.size() + 32];
6717 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6718 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6719 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6720
6721 RGWBucketInfo dest_bucket_info;
6722
6723 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6724 if (r < 0) {
6725 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6726 if (r == -ENOENT) {
6727 return -ERR_PRECONDITION_FAILED;
6728 }
6729 return r;
6730 }
6731
6732 if (dest_bucket_info.owner != bucket_info.owner) {
6733 return -ERR_PRECONDITION_FAILED;
6734 }
6735
6736 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6737 obj_ctx.obj.set_atomic(dest_obj);
6738
6739 string no_zone;
6740
6741 r = copy_obj(obj_ctx,
6742 user,
6743 client_id,
6744 op_id,
6745 NULL, /* req_info *info */
6746 no_zone,
6747 dest_obj,
6748 obj,
6749 dest_bucket_info,
6750 bucket_info,
6751 NULL, /* time_t *src_mtime */
6752 NULL, /* time_t *mtime */
6753 NULL, /* const time_t *mod_ptr */
6754 NULL, /* const time_t *unmod_ptr */
6755 false, /* bool high_precision_time */
6756 NULL, /* const char *if_match */
6757 NULL, /* const char *if_nomatch */
6758 RGWRados::ATTRSMOD_NONE,
6759 true, /* bool copy_if_newer */
6760 state->attrset,
6761 RGW_OBJ_CATEGORY_MAIN,
6762 0, /* uint64_t olh_epoch */
6763 real_time(), /* time_t delete_at */
6764 NULL, /* string *version_id */
6765 NULL, /* string *ptag */
6766 NULL, /* string *petag */
6767 NULL, /* void (*progress_cb)(off_t, void *) */
6768 NULL); /* void *progress_data */
6769 if (r == -ECANCELED || r == -ENOENT) {
6770 /* Has already been overwritten, meaning another rgw process already
6771 * copied it out */
6772 return 0;
6773 }
6774
6775 return r;
6776 }
6777
6778 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6779 const rgw_user& user,
6780 RGWBucketInfo& bucket_info,
6781 rgw_obj& obj,
6782 bool& restored) /* out */
6783 {
6784 if (! swift_versioning_enabled(bucket_info)) {
6785 return 0;
6786 }
6787
6788 /* Bucket info of the bucket that stores previous versions of our object. */
6789 RGWBucketInfo archive_binfo;
6790
6791 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6792 bucket_info.swift_ver_location, archive_binfo,
6793 nullptr, nullptr);
6794 if (ret < 0) {
6795 return ret;
6796 }
6797
6798 /* Abort the operation if the bucket storing our archive belongs to someone
6799 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6800 * into consideration. For we can live with that.
6801 *
6802 * TODO: delegate this check to un upper layer and compare with ACLs. */
6803 if (bucket_info.owner != archive_binfo.owner) {
6804 return -EPERM;
6805 }
6806
6807 /* This code will be executed on latest version of the object. */
6808 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6809 std::string no_client_id;
6810 std::string no_op_id;
6811 std::string no_zone;
6812
6813 /* We don't support object versioning of Swift API on those buckets that
6814 * are already versioned using the S3 mechanism. This affects also bucket
6815 * storing archived objects. Otherwise the delete operation would create
6816 * a deletion marker. */
6817 if (archive_binfo.versioned()) {
6818 restored = false;
6819 return -ERR_PRECONDITION_FAILED;
6820 }
6821
6822 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6823 * irrelevant and may be safely skipped. */
6824 std::map<std::string, ceph::bufferlist> no_attrs;
6825
6826 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6827 obj_ctx.obj.set_atomic(archive_obj);
6828 obj_ctx.obj.set_atomic(obj);
6829
6830 int ret = copy_obj(obj_ctx,
6831 user,
6832 no_client_id,
6833 no_op_id,
6834 nullptr, /* req_info *info */
6835 no_zone,
6836 obj, /* dest obj */
6837 archive_obj, /* src obj */
6838 bucket_info, /* dest bucket info */
6839 archive_binfo, /* src bucket info */
6840 nullptr, /* time_t *src_mtime */
6841 nullptr, /* time_t *mtime */
6842 nullptr, /* const time_t *mod_ptr */
6843 nullptr, /* const time_t *unmod_ptr */
6844 false, /* bool high_precision_time */
6845 nullptr, /* const char *if_match */
6846 nullptr, /* const char *if_nomatch */
6847 RGWRados::ATTRSMOD_NONE,
6848 true, /* bool copy_if_newer */
6849 no_attrs,
6850 RGW_OBJ_CATEGORY_MAIN,
6851 0, /* uint64_t olh_epoch */
6852 real_time(), /* time_t delete_at */
6853 nullptr, /* string *version_id */
6854 nullptr, /* string *ptag */
6855 nullptr, /* string *petag */
6856 nullptr, /* void (*progress_cb)(off_t, void *) */
6857 nullptr); /* void *progress_data */
6858 if (ret == -ECANCELED || ret == -ENOENT) {
6859 /* Has already been overwritten, meaning another rgw process already
6860 * copied it out */
6861 return 0;
6862 } else if (ret < 0) {
6863 return ret;
6864 } else {
6865 restored = true;
6866 }
6867
6868 /* Need to remove the archived copy. */
6869 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6870 archive_binfo.versioning_status());
6871
6872 return ret;
6873 };
6874
6875 const std::string& obj_name = obj.get_oid();
6876 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6877 % obj_name);
6878
6879 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6880 handler);
6881 }
6882
6883 /**
6884 * Write/overwrite an object to the bucket storage.
6885 * bucket: the bucket to store the object in
6886 * obj: the object name/key
6887 * data: the object contents/value
6888 * size: the amount of data to write (data must be this long)
6889 * accounted_size: original size of data before compression, encryption
6890 * mtime: if non-NULL, writes the given mtime to the bucket storage
6891 * attrs: all the given attrs are written to bucket storage for the given object
6892 * exclusive: create object exclusively
6893 * Returns: 0 on success, -ERR# otherwise.
6894 */
/*
 * NOTE(review): the header above does not match the current signature
 * (bucket, obj, data, mtime and exclusive are not parameters). As visible in
 * the body below:
 *  - size and accounted_size are forwarded to index_op->complete();
 *    accounted_size is also passed to the quota cache update unless
 *    meta.completeMultipart is set, in which case 0 is passed.
 *  - attrs are set as xattrs on the head object; entries with an empty value
 *    are skipped, and RGW_ATTR_MANIFEST is erased from attrs when
 *    meta.manifest is set.
 *  - assume_noent is forwarded to target->get_state(); when the write then
 *    fails with -EEXIST the cached state is invalidated and -EEXIST is
 *    returned unchanged (write_meta() retries in that case).
 *  - modify_tail is forwarded to target->prepare_atomic_modification().
 *  - _index_op is cast to RGWRados::Bucket::UpdateIndex *.
 * On the error path the result depends on meta.if_match / meta.if_nomatch,
 * see the comment after the done_cancel label.
 */
6895 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6896 map<string, bufferlist>& attrs,
6897 bool assume_noent, bool modify_tail,
6898 void *_index_op)
6899 {
6900 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6901 RGWRados *store = target->get_store();
6902
6903 ObjectWriteOperation op;
6904
6905 RGWObjState *state;
6906 int r = target->get_state(&state, false, assume_noent);
6907 if (r < 0)
6908 return r;
6909
6910 rgw_obj& obj = target->get_obj();
6911
6912 if (obj.get_oid().empty()) {
6913 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6914 return -EIO;
6915 }
6916
6917 rgw_rados_ref ref;
6918 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6919 if (r < 0)
6920 return r;
6921
// is_olh is captured here because state is reset to NULL further down, before versioned_op is used
6922 bool is_olh = state->is_olh;
6923
6924 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6925
// use the caller supplied tag if there is one, otherwise the index op's tag when that is non-empty
6926 const string *ptag = meta.ptag;
6927 if (!ptag && !index_op->get_optag()->empty()) {
6928 ptag = index_op->get_optag();
6929 }
6930 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
6931 if (r < 0)
6932 return r;
6933
// a zero set_mtime means "not specified": stamp the write with the current time
6934 if (real_clock::is_zero(meta.set_mtime)) {
6935 meta.set_mtime = real_clock::now();
6936 }
6937
6938 if (state->is_olh) {
6939 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6940 }
6941
6942 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6943 op.mtime2(&mtime_ts);
6944
// NOTE(review): the comment below talks about removing the object, but the code issues a write_full()
6945 if (meta.data) {
6946 /* if we want to overwrite the data, we also want to overwrite the
6947 xattrs, so just remove the object */
6948 op.write_full(*meta.data);
6949 }
6950
// etag, content_type and acl_bl are picked out of attrs below and handed to index_op->complete()
6951 string etag;
6952 string content_type;
6953 bufferlist acl_bl;
6954
6955 map<string, bufferlist>::iterator iter;
6956 if (meta.rmattrs) {
6957 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6958 const string& name = iter->first;
6959 op.rmxattr(name.c_str());
6960 }
6961 }
6962
6963 if (meta.manifest) {
6964 /* remove existing manifest attr */
6965 iter = attrs.find(RGW_ATTR_MANIFEST);
6966 if (iter != attrs.end())
6967 attrs.erase(iter);
6968
6969 bufferlist bl;
6970 ::encode(*meta.manifest, bl);
6971 op.setxattr(RGW_ATTR_MANIFEST, bl);
6972 }
6973
6974 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6975 const string& name = iter->first;
6976 bufferlist& bl = iter->second;
6977
6978 if (!bl.length())
6979 continue;
6980
6981 op.setxattr(name.c_str(), bl);
6982
6983 if (name.compare(RGW_ATTR_ETAG) == 0) {
6984 etag = bl.c_str();
6985 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6986 content_type = bl.c_str();
6987 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6988 acl_bl = bl;
6989 }
6990 }
// only add the pg version / source zone attrs when the caller did not provide them in attrs
6991 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6992 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6993 }
6994
6995 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6996 bufferlist bl;
6997 ::encode(store->get_zone_short_id(), bl);
6998 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6999 }
7000
7001 if (!op.size())
7002 return 0;
7003
7004 uint64_t epoch;
7005 int64_t poolid;
7006 bool orig_exists;
7007 uint64_t orig_size;
7008
7009 if (!reset_obj) { //Multipart upload, it has immutable head.
7010 orig_exists = false;
7011 orig_size = 0;
7012 } else {
7013 orig_exists = state->exists;
7014 orig_size = state->accounted_size;
7015 }
7016
7017 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
7018
7019 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
7020
7021 if (versioned_op) {
7022 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
7023 }
7024
// prepare the bucket index update unless the index op has already been prepared
7025 if (!index_op->is_prepared()) {
7026 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
7027 if (r < 0)
7028 return r;
7029 }
7030
7031 r = ref.ioctx.operate(ref.oid, &op);
7032 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
7033 or -ENOENT if was removed, or -EEXIST if it did not exist
7034 before and now it does */
7035 if (r == -EEXIST && assume_noent) {
7036 target->invalidate_state();
7037 return r;
7038 }
7039 goto done_cancel;
7040 }
7041
7042 epoch = ref.ioctx.get_last_version();
7043 poolid = ref.ioctx.get_id();
7044
// a failure here is only logged; the head object write above has already succeeded
7045 r = target->complete_atomic_modification();
7046 if (r < 0) {
7047 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7048 }
7049
7050 r = index_op->complete(poolid, epoch, size, accounted_size,
7051 meta.set_mtime, etag, content_type, &acl_bl,
7052 meta.category, meta.remove_objs, meta.user_data);
7053 if (r < 0)
7054 goto done_cancel;
7055
7056 if (meta.mtime) {
7057 *meta.mtime = meta.set_mtime;
7058 }
7059
7060 /* note that index_op was using state so we couldn't invalidate it earlier */
7061 target->invalidate_state();
7062 state = NULL;
7063
7064 if (versioned_op) {
7065 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
7066 if (r < 0) {
7067 return r;
7068 }
7069 }
7070
7071 if (!real_clock::is_zero(meta.delete_at)) {
7072 rgw_obj_index_key obj_key;
7073 obj.key.get_index_key(&obj_key);
7074
7075 r = store->objexp_hint_add(meta.delete_at,
7076 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7077 if (r < 0) {
7078 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7079 /* ignoring error, nothing we can do at this point */
7080 }
7081 }
7082 meta.canceled = false;
7083
7084 /* update quota cache */
7085 if (meta.completeMultipart){
7086 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7087 0, orig_size);
7088 }
7089 else {
7090 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7091 accounted_size, orig_size);
7092 }
7093 return 0;
7094
// error path shared by a failed head write and a failed index completion:
// cancel the prepared index op (a cancel failure is only logged), flag the
// write as canceled and translate r as described below
7095 done_cancel:
7096 int ret = index_op->cancel();
7097 if (ret < 0) {
7098 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7099 }
7100
7101 meta.canceled = true;
7102
7103 /* we lost in a race. There are a few options:
7104 * - existing object was rewritten (ECANCELED)
7105 * - non existing object was created (EEXIST)
7106 * - object was removed (ENOENT)
7107 * should treat it as a success
7108 */
7109 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7110 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7111 r = 0;
7112 }
7113 } else {
7114 if (meta.if_match != NULL) {
7115 // only overwrite existing object
7116 if (strcmp(meta.if_match, "*") == 0) {
7117 if (r == -ENOENT) {
7118 r = -ERR_PRECONDITION_FAILED;
7119 } else if (r == -ECANCELED) {
7120 r = 0;
7121 }
7122 }
7123 }
7124
7125 if (meta.if_nomatch != NULL) {
7126 // only create a new object
7127 if (strcmp(meta.if_nomatch, "*") == 0) {
7128 if (r == -EEXIST) {
7129 r = -ERR_PRECONDITION_FAILED;
7130 } else if (r == -ENOENT) {
7131 r = 0;
7132 }
7133 }
7134 }
7135 }
7136
7137 return r;
7138 }
7139
7140 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7141 map<string, bufferlist>& attrs)
7142 {
7143 RGWBucketInfo& bucket_info = target->get_bucket_info();
7144
7145 RGWRados::Bucket bop(target->get_store(), bucket_info);
7146 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
7147 index_op.set_zones_trace(meta.zones_trace);
7148
7149 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7150 int r;
7151 if (assume_noent) {
7152 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7153 if (r == -EEXIST) {
7154 assume_noent = false;
7155 }
7156 }
7157 if (!assume_noent) {
7158 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7159 }
7160 return r;
7161 }
7162
7163 /** Write/overwrite a system object. */
7164 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7165 map<std::string, bufferlist>& attrs, int flags,
7166 bufferlist& data,
7167 RGWObjVersionTracker *objv_tracker,
7168 real_time set_mtime /* 0 for don't set */)
7169 {
7170 rgw_rados_ref ref;
7171 int r = get_system_obj_ref(obj, &ref);
7172 if (r < 0)
7173 return r;
7174
7175 ObjectWriteOperation op;
7176
7177 if (flags & PUT_OBJ_EXCL) {
7178 if (!(flags & PUT_OBJ_CREATE))
7179 return -EINVAL;
7180 op.create(true); // exclusive create
7181 } else {
7182 op.remove();
7183 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7184 op.create(false);
7185 }
7186
7187 if (objv_tracker) {
7188 objv_tracker->prepare_op_for_write(&op);
7189 }
7190
7191 if (real_clock::is_zero(set_mtime)) {
7192 set_mtime = real_clock::now();
7193 }
7194
7195 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7196 op.mtime2(&mtime_ts);
7197 op.write_full(data);
7198
7199 bufferlist acl_bl;
7200
7201 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7202 const string& name = iter->first;
7203 bufferlist& bl = iter->second;
7204
7205 if (!bl.length())
7206 continue;
7207
7208 op.setxattr(name.c_str(), bl);
7209 }
7210
7211 r = ref.ioctx.operate(ref.oid, &op);
7212 if (r < 0) {
7213 return r;
7214 }
7215
7216 if (objv_tracker) {
7217 objv_tracker->apply_write();
7218 }
7219
7220 if (mtime) {
7221 *mtime = set_mtime;
7222 }
7223
7224 return 0;
7225 }
7226
7227 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7228 off_t ofs, bool exclusive,
7229 RGWObjVersionTracker *objv_tracker)
7230 {
7231 rgw_rados_ref ref;
7232 int r = get_system_obj_ref(obj, &ref);
7233 if (r < 0) {
7234 return r;
7235 }
7236
7237 ObjectWriteOperation op;
7238
7239 if (exclusive)
7240 op.create(true);
7241
7242 if (objv_tracker) {
7243 objv_tracker->prepare_op_for_write(&op);
7244 }
7245 if (ofs == -1) {
7246 op.write_full(bl);
7247 } else {
7248 op.write(ofs, bl);
7249 }
7250 r = ref.ioctx.operate(ref.oid, &op);
7251 if (r < 0)
7252 return r;
7253
7254 if (objv_tracker) {
7255 objv_tracker->apply_write();
7256 }
7257 return 0;
7258 }
7259
7260 /**
7261 * Write/overwrite an object to the bucket storage.
7262 * bucket: the bucket to store the object in
7263 * obj: the object name/key
7264 * data: the object contents/value
7265 * offset: the offet to write to in the object
7266 * If this is -1, we will overwrite the whole object.
7267 * size: the amount of data to write (data must be this long)
7268 * attrs: all the given attrs are written to bucket storage for the given object
7269 * Returns: 0 on success, -ERR# otherwise.
7270 */
7271
7272 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7273 off_t ofs, bool exclusive,
7274 void **handle)
7275 {
7276 rgw_rados_ref ref;
7277 int r = get_raw_obj_ref(obj, &ref);
7278 if (r < 0) {
7279 return r;
7280 }
7281
7282 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7283 *handle = c;
7284
7285 ObjectWriteOperation op;
7286
7287 if (exclusive)
7288 op.create(true);
7289
7290 if (ofs == -1) {
7291 op.write_full(bl);
7292 } else {
7293 op.write(ofs, bl);
7294 }
7295 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7296 if (r < 0)
7297 return r;
7298
7299 return 0;
7300 }
7301
7302 int RGWRados::aio_wait(void *handle)
7303 {
7304 AioCompletion *c = (AioCompletion *)handle;
7305 c->wait_for_safe();
7306 int ret = c->get_return_value();
7307 c->release();
7308 return ret;
7309 }
7310
7311 bool RGWRados::aio_completed(void *handle)
7312 {
7313 AioCompletion *c = (AioCompletion *)handle;
7314 return c->is_safe();
7315 }
7316
7317 class RGWRadosPutObj : public RGWGetDataCB
7318 {
7319 CephContext* cct;
7320 rgw_obj obj;
7321 RGWPutObjDataProcessor *filter;
7322 boost::optional<RGWPutObj_Compress>& compressor;
7323 CompressorRef& plugin;
7324 RGWPutObjProcessor_Atomic *processor;
7325 RGWOpStateSingleOp *opstate;
7326 void (*progress_cb)(off_t, void *);
7327 void *progress_data;
7328 bufferlist extra_data_bl;
7329 uint64_t extra_data_left;
7330 uint64_t data_len;
7331 map<string, bufferlist> src_attrs;
7332 public:
7333 RGWRadosPutObj(CephContext* cct,
7334 CompressorRef& plugin,
7335 boost::optional<RGWPutObj_Compress>& compressor,
7336 RGWPutObjProcessor_Atomic *p,
7337 RGWOpStateSingleOp *_ops,
7338 void (*_progress_cb)(off_t, void *),
7339 void *_progress_data) :
7340 cct(cct),
7341 filter(p),
7342 compressor(compressor),
7343 plugin(plugin),
7344 processor(p),
7345 opstate(_ops),
7346 progress_cb(_progress_cb),
7347 progress_data(_progress_data),
7348 extra_data_left(0),
7349 data_len(0) {}
7350
7351 int process_attrs(void) {
7352 if (extra_data_bl.length()) {
7353 JSONParser jp;
7354 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7355 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7356 return -EIO;
7357 }
7358
7359 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7360
7361 src_attrs.erase(RGW_ATTR_COMPRESSION);
7362 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7363 }
7364
7365 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7366 //do not compress if object is encrypted
7367 compressor = boost::in_place(cct, plugin, filter);
7368 filter = &*compressor;
7369 }
7370 return 0;
7371 }
7372
7373 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7374 if (progress_cb) {
7375 progress_cb(ofs, progress_data);
7376 }
7377 if (extra_data_left) {
7378 size_t extra_len = bl.length();
7379 if (extra_len > extra_data_left)
7380 extra_len = extra_data_left;
7381
7382 bufferlist extra;
7383 bl.splice(0, extra_len, &extra);
7384 extra_data_bl.append(extra);
7385
7386 extra_data_left -= extra_len;
7387 if (extra_data_left == 0) {
7388 int res = process_attrs();
7389 if (res < 0)
7390 return res;
7391 }
7392 if (bl.length() == 0) {
7393 return 0;
7394 }
7395 ofs += extra_len;
7396 }
7397 // adjust ofs based on extra_data_len, so the result is a logical offset
7398 // into the object data
7399 assert(uint64_t(ofs) >= extra_data_len);
7400 ofs -= extra_data_len;
7401
7402 data_len += bl.length();
7403 bool again = false;
7404
7405 bool need_opstate = true;
7406
7407 do {
7408 void *handle = NULL;
7409 rgw_raw_obj obj;
7410 uint64_t size = bl.length();
7411 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7412 if (ret < 0)
7413 return ret;
7414
7415 if (need_opstate && opstate) {
7416 /* need to update opstate repository with new state. This is ratelimited, so we're not
7417 * really doing it every time
7418 */
7419 ret = opstate->renew_state();
7420 if (ret < 0) {
7421 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7422 int r = filter->throttle_data(handle, obj, size, false);
7423 if (r < 0) {
7424 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7425 }
7426 /* could not renew state! might have been marked as cancelled */
7427 return ret;
7428 }
7429 need_opstate = false;
7430 }
7431
7432 ret = filter->throttle_data(handle, obj, size, false);
7433 if (ret < 0)
7434 return ret;
7435 } while (again);
7436
7437 return 0;
7438 }
7439
7440 bufferlist& get_extra_data() { return extra_data_bl; }
7441
7442 map<string, bufferlist>& get_attrs() { return src_attrs; }
7443
7444 void set_extra_data_len(uint64_t len) override {
7445 extra_data_left = len;
7446 RGWGetDataCB::set_extra_data_len(len);
7447 }
7448
7449 uint64_t get_data_len() {
7450 return data_len;
7451 }
7452
7453 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7454 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7455 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7456 }
7457
7458 bool is_canceled() {
7459 return processor->is_canceled();
7460 }
7461 };
7462
7463 /*
7464 * prepare attrset depending on attrs_mod.
7465 */
7466 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7467 map<string, bufferlist>& attrs,
7468 RGWRados::AttrsMod attrs_mod)
7469 {
7470 switch (attrs_mod) {
7471 case RGWRados::ATTRSMOD_NONE:
7472 attrs = src_attrs;
7473 break;
7474 case RGWRados::ATTRSMOD_REPLACE:
7475 if (!attrs[RGW_ATTR_ETAG].length()) {
7476 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7477 }
7478 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7479 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7480 if (ttiter != src_attrs.end()) {
7481 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7482 }
7483 }
7484 break;
7485 case RGWRados::ATTRSMOD_MERGE:
7486 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7487 if (attrs.find(it->first) == attrs.end()) {
7488 attrs[it->first] = it->second;
7489 }
7490 }
7491 break;
7492 }
7493 }
7494
7495 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7496 {
7497 map<string, bufferlist> attrset;
7498
7499 real_time mtime;
7500 uint64_t obj_size;
7501 RGWObjectCtx rctx(this);
7502
7503 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7504 RGWRados::Object::Read read_op(&op_target);
7505
7506 read_op.params.attrs = &attrset;
7507 read_op.params.lastmod = &mtime;
7508 read_op.params.obj_size = &obj_size;
7509
7510 int ret = read_op.prepare();
7511 if (ret < 0)
7512 return ret;
7513
7514 attrset.erase(RGW_ATTR_ID_TAG);
7515 attrset.erase(RGW_ATTR_TAIL_TAG);
7516
7517 uint64_t max_chunk_size;
7518
7519 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7520 if (ret < 0) {
7521 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7522 return ret;
7523 }
7524
7525 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj,
7526 max_chunk_size, NULL, mtime, attrset,
7527 RGW_OBJ_CATEGORY_MAIN, 0, real_time(),
7528 (obj.key.instance.empty() ? NULL : &(obj.key.instance)),
7529 NULL, NULL);
7530 }
7531
7532 struct obj_time_weight {
7533 real_time mtime;
7534 uint32_t zone_short_id;
7535 uint64_t pg_ver;
7536 bool high_precision;
7537
7538 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7539
7540 bool compare_low_precision(const obj_time_weight& rhs) {
7541 struct timespec l = ceph::real_clock::to_timespec(mtime);
7542 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7543 l.tv_nsec = 0;
7544 r.tv_nsec = 0;
7545 if (l > r) {
7546 return false;
7547 }
7548 if (l < r) {
7549 return true;
7550 }
7551 if (zone_short_id != rhs.zone_short_id) {
7552 return (zone_short_id < rhs.zone_short_id);
7553 }
7554 return (pg_ver < rhs.pg_ver);
7555
7556 }
7557
7558 bool operator<(const obj_time_weight& rhs) {
7559 if (!high_precision || !rhs.high_precision) {
7560 return compare_low_precision(rhs);
7561 }
7562 if (mtime > rhs.mtime) {
7563 return false;
7564 }
7565 if (mtime < rhs.mtime) {
7566 return true;
7567 }
7568 if (zone_short_id != rhs.zone_short_id) {
7569 return (zone_short_id < rhs.zone_short_id);
7570 }
7571 return (pg_ver < rhs.pg_ver);
7572 }
7573
7574 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7575 mtime = _mtime;
7576 zone_short_id = _short_id;
7577 pg_ver = _pg_ver;
7578 }
7579
7580 void init(RGWObjState *state) {
7581 mtime = state->mtime;
7582 zone_short_id = state->zone_short_id;
7583 pg_ver = state->pg_ver;
7584 }
7585 };
7586
7587 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7588 out << o.mtime;
7589
7590 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7591 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7592 }
7593
7594 return out;
7595 }
7596
7597 class RGWGetExtraDataCB : public RGWGetDataCB {
7598 bufferlist extra_data;
7599 public:
7600 RGWGetExtraDataCB() {}
7601 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7602 if (extra_data.length() < extra_data_len) {
7603 off_t max = extra_data_len - extra_data.length();
7604 if (max > bl_len) {
7605 max = bl_len;
7606 }
7607 bl.splice(0, max, &extra_data);
7608 }
7609 return bl_len;
7610 }
7611
7612 bufferlist& get_extra_data() {
7613 return extra_data;
7614 }
7615 };
7616
7617 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7618 const rgw_user& user_id,
7619 const string& client_id,
7620 req_info *info,
7621 const string& source_zone,
7622 rgw_obj& src_obj,
7623 RGWBucketInfo& src_bucket_info,
7624 real_time *src_mtime,
7625 uint64_t *psize,
7626 const real_time *mod_ptr,
7627 const real_time *unmod_ptr,
7628 bool high_precision_time,
7629 const char *if_match,
7630 const char *if_nomatch,
7631 map<string, bufferlist> *pattrs,
7632 string *version_id,
7633 string *ptag,
7634 string *petag)
7635 {
7636 /* source is in a different zonegroup, copy from there */
7637
7638 RGWRESTStreamRWRequest *in_stream_req;
7639 string tag;
7640 map<string, bufferlist> src_attrs;
7641 append_rand_alpha(cct, tag, tag, 32);
7642 obj_time_weight set_mtime_weight;
7643 set_mtime_weight.high_precision = high_precision_time;
7644
7645 RGWRESTConn *conn;
7646 if (source_zone.empty()) {
7647 if (src_bucket_info.zonegroup.empty()) {
7648 /* source is in the master zonegroup */
7649 conn = rest_master_conn;
7650 } else {
7651 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7652 if (iter == zonegroup_conn_map.end()) {
7653 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7654 return -ENOENT;
7655 }
7656 conn = iter->second;
7657 }
7658 } else {
7659 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7660 if (iter == zone_conn_map.end()) {
7661 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7662 return -ENOENT;
7663 }
7664 conn = iter->second;
7665 }
7666
7667 RGWGetExtraDataCB cb;
7668 string etag;
7669 map<string, string> req_headers;
7670 real_time set_mtime;
7671
7672 const real_time *pmod = mod_ptr;
7673
7674 obj_time_weight dest_mtime_weight;
7675
7676 constexpr bool prepend_meta = true;
7677 constexpr bool get_op = true;
7678 constexpr bool rgwx_stat = true;
7679 constexpr bool sync_manifest = true;
7680 constexpr bool skip_decrypt = true;
7681 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7682 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7683 prepend_meta, get_op, rgwx_stat,
7684 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7685 if (ret < 0) {
7686 return ret;
7687 }
7688
7689 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7690 if (ret < 0) {
7691 return ret;
7692 }
7693
7694 bufferlist& extra_data_bl = cb.get_extra_data();
7695 if (extra_data_bl.length()) {
7696 JSONParser jp;
7697 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7698 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7699 return -EIO;
7700 }
7701
7702 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7703
7704 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7705 }
7706
7707 if (src_mtime) {
7708 *src_mtime = set_mtime;
7709 }
7710
7711 if (petag) {
7712 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7713 if (iter != src_attrs.end()) {
7714 bufferlist& etagbl = iter->second;
7715 *petag = etagbl.to_str();
7716 }
7717 }
7718
7719 if (pattrs) {
7720 *pattrs = src_attrs;
7721 }
7722
7723 return 0;
7724 }
7725
7726 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7727 const rgw_user& user_id,
7728 const string& client_id,
7729 const string& op_id,
7730 bool record_op_state,
7731 req_info *info,
7732 const string& source_zone,
7733 rgw_obj& dest_obj,
7734 rgw_obj& src_obj,
7735 RGWBucketInfo& dest_bucket_info,
7736 RGWBucketInfo& src_bucket_info,
7737 real_time *src_mtime,
7738 real_time *mtime,
7739 const real_time *mod_ptr,
7740 const real_time *unmod_ptr,
7741 bool high_precision_time,
7742 const char *if_match,
7743 const char *if_nomatch,
7744 AttrsMod attrs_mod,
7745 bool copy_if_newer,
7746 map<string, bufferlist>& attrs,
7747 RGWObjCategory category,
7748 uint64_t olh_epoch,
7749 real_time delete_at,
7750 string *version_id,
7751 string *ptag,
7752 ceph::buffer::list *petag,
7753 void (*progress_cb)(off_t, void *),
7754 void *progress_data,
7755 rgw_zone_set *zones_trace)
7756 {
7757 /* source is in a different zonegroup, copy from there */
7758
7759 RGWRESTStreamRWRequest *in_stream_req;
7760 string tag;
7761 int i;
7762 append_rand_alpha(cct, tag, tag, 32);
7763 obj_time_weight set_mtime_weight;
7764 set_mtime_weight.high_precision = high_precision_time;
7765
7766 RGWPutObjProcessor_Atomic processor(obj_ctx,
7767 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7768 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7769 if (version_id && *version_id != "null") {
7770 processor.set_version_id(*version_id);
7771 }
7772 processor.set_olh_epoch(olh_epoch);
7773 int ret = processor.prepare(this, NULL);
7774 if (ret < 0) {
7775 return ret;
7776 }
7777
7778 RGWRESTConn *conn;
7779 if (source_zone.empty()) {
7780 if (dest_bucket_info.zonegroup.empty()) {
7781 /* source is in the master zonegroup */
7782 conn = rest_master_conn;
7783 } else {
7784 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7785 if (iter == zonegroup_conn_map.end()) {
7786 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7787 return -ENOENT;
7788 }
7789 conn = iter->second;
7790 }
7791 } else {
7792 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7793 if (iter == zone_conn_map.end()) {
7794 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7795 return -ENOENT;
7796 }
7797 conn = iter->second;
7798 }
7799
7800 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7801
7802 RGWOpStateSingleOp *opstate = NULL;
7803
7804 if (record_op_state) {
7805 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7806
7807 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7808 if (ret < 0) {
7809 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7810 delete opstate;
7811 return ret;
7812 }
7813 }
7814
7815 boost::optional<RGWPutObj_Compress> compressor;
7816 CompressorRef plugin;
7817
7818 const auto& compression_type = zone_params.get_compression_type(
7819 dest_bucket_info.placement_rule);
7820 if (compression_type != "none") {
7821 plugin = Compressor::create(cct, compression_type);
7822 if (!plugin) {
7823 ldout(cct, 1) << "Cannot load plugin for compression type "
7824 << compression_type << dendl;
7825 }
7826 }
7827
7828 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7829
7830 string etag;
7831 map<string, string> req_headers;
7832 real_time set_mtime;
7833
7834 RGWObjState *dest_state = NULL;
7835
7836 const real_time *pmod = mod_ptr;
7837
7838 obj_time_weight dest_mtime_weight;
7839
7840 if (copy_if_newer) {
7841 /* need to get mtime for destination */
7842 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7843 if (ret < 0)
7844 goto set_err_state;
7845
7846 if (!real_clock::is_zero(dest_state->mtime)) {
7847 dest_mtime_weight.init(dest_state);
7848 pmod = &dest_mtime_weight.mtime;
7849 }
7850 }
7851
7852 static constexpr bool prepend_meta = true;
7853 static constexpr bool get_op = true;
7854 static constexpr bool rgwx_stat = false;
7855 static constexpr bool sync_manifest = true;
7856 static constexpr bool skip_decrypt = true;
7857 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7858 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7859 prepend_meta, get_op, rgwx_stat,
7860 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7861 if (ret < 0) {
7862 goto set_err_state;
7863 }
7864
7865 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7866 if (ret < 0) {
7867 goto set_err_state;
7868 }
7869 if (compressor && compressor->is_compressed()) {
7870 bufferlist tmp;
7871 RGWCompressionInfo cs_info;
7872 cs_info.compression_type = plugin->get_type_name();
7873 cs_info.orig_size = cb.get_data_len();
7874 cs_info.blocks = move(compressor->get_compression_blocks());
7875 ::encode(cs_info, tmp);
7876 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7877 }
7878
7879 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7880 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7881 } else {
7882 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7883 if (iter != cb.get_attrs().end()) {
7884 try {
7885 ::decode(delete_at, iter->second);
7886 } catch (buffer::error& err) {
7887 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7888 }
7889 }
7890 }
7891
7892 if (src_mtime) {
7893 *src_mtime = set_mtime;
7894 }
7895
7896 if (petag) {
7897 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7898 if (iter != cb.get_attrs().end()) {
7899 *petag = iter->second;
7900 }
7901 }
7902
7903 if (source_zone.empty()) {
7904 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7905 } else {
7906 attrs = cb.get_attrs();
7907 }
7908
7909 if (copy_if_newer) {
7910 uint64_t pg_ver = 0;
7911 auto i = attrs.find(RGW_ATTR_PG_VER);
7912 if (i != attrs.end() && i->second.length() > 0) {
7913 bufferlist::iterator iter = i->second.begin();
7914 try {
7915 ::decode(pg_ver, iter);
7916 } catch (buffer::error& err) {
7917 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7918 /* non critical error */
7919 }
7920 }
7921 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7922 }
7923
7924 #define MAX_COMPLETE_RETRY 100
7925 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7926 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7927 if (ret < 0) {
7928 goto set_err_state;
7929 }
7930 if (copy_if_newer && cb.is_canceled()) {
7931 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7932 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7933 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7934 if (ret < 0) {
7935 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7936 goto set_err_state;
7937 }
7938 dest_mtime_weight.init(dest_state);
7939 dest_mtime_weight.high_precision = high_precision_time;
7940 if (!dest_state->exists ||
7941 dest_mtime_weight < set_mtime_weight) {
7942 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7943 continue;
7944 } else {
7945 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7946 }
7947 }
7948 break;
7949 }
7950
7951 if (i == MAX_COMPLETE_RETRY) {
7952 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7953 ret = -EIO;
7954 goto set_err_state;
7955 }
7956
7957 if (opstate) {
7958 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7959 if (ret < 0) {
7960 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7961 }
7962 delete opstate;
7963 }
7964
7965 return 0;
7966 set_err_state:
7967 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7968 ret = 0;
7969 }
7970 if (opstate) {
7971 RGWOpState::OpState state;
7972 if (ret < 0) {
7973 state = RGWOpState::OPSTATE_ERROR;
7974 } else {
7975 state = RGWOpState::OPSTATE_COMPLETE;
7976 }
7977 int r = opstate->set_state(state);
7978 if (r < 0) {
7979 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7980 }
7981 delete opstate;
7982 }
7983 return ret;
7984 }
7985
7986
7987 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7988 map<string, bufferlist>& src_attrs,
7989 RGWRados::Object::Read& read_op,
7990 const rgw_user& user_id,
7991 rgw_obj& dest_obj,
7992 real_time *mtime)
7993 {
7994 string etag;
7995
7996 RGWRESTStreamWriteRequest *out_stream_req;
7997
7998 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
7999 if (ret < 0) {
8000 return ret;
8001 }
8002
8003 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
8004 if (ret < 0) {
8005 delete out_stream_req;
8006 return ret;
8007 }
8008
8009 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
8010 if (ret < 0)
8011 return ret;
8012
8013 return 0;
8014 }
8015
8016 /**
8017 * Copy an object.
8018 * dest_obj: the object to copy into
8019 * src_obj: the object to copy from
8020 * attrs: usage depends on attrs_mod parameter
8021 * attrs_mod: the modification mode of the attrs, may have the following values:
8022 * ATTRSMOD_NONE - the attributes of the source object will be
8023 * copied without modifications, attrs parameter is ignored;
8024 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
8025 * parameter, source object attributes are not copied;
8026 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
8027 * are overwritten by values contained in attrs parameter.
8028 * err: stores any errors resulting from the get of the original object
8029 * Returns: 0 on success, -ERR# otherwise.
8030 */
8031 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
8032 const rgw_user& user_id,
8033 const string& client_id,
8034 const string& op_id,
8035 req_info *info,
8036 const string& source_zone,
8037 rgw_obj& dest_obj,
8038 rgw_obj& src_obj,
8039 RGWBucketInfo& dest_bucket_info,
8040 RGWBucketInfo& src_bucket_info,
8041 real_time *src_mtime,
8042 real_time *mtime,
8043 const real_time *mod_ptr,
8044 const real_time *unmod_ptr,
8045 bool high_precision_time,
8046 const char *if_match,
8047 const char *if_nomatch,
8048 AttrsMod attrs_mod,
8049 bool copy_if_newer,
8050 map<string, bufferlist>& attrs,
8051 RGWObjCategory category,
8052 uint64_t olh_epoch,
8053 real_time delete_at,
8054 string *version_id,
8055 string *ptag,
8056 ceph::buffer::list *petag,
8057 void (*progress_cb)(off_t, void *),
8058 void *progress_data)
8059 {
8060 int ret;
8061 uint64_t obj_size;
8062 rgw_obj shadow_obj = dest_obj;
8063 string shadow_oid;
8064
8065 bool remote_src;
8066 bool remote_dest;
8067
8068 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
8069 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
8070
8071 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
8072 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
8073
8074 if (remote_src && remote_dest) {
8075 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
8076 return -EINVAL;
8077 }
8078
8079 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
8080
8081 if (remote_src || !source_zone.empty()) {
8082 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
8083 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
8084 unmod_ptr, high_precision_time,
8085 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
8086 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
8087 }
8088
8089 map<string, bufferlist> src_attrs;
8090 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8091 RGWRados::Object::Read read_op(&src_op_target);
8092
8093 read_op.conds.mod_ptr = mod_ptr;
8094 read_op.conds.unmod_ptr = unmod_ptr;
8095 read_op.conds.high_precision_time = high_precision_time;
8096 read_op.conds.if_match = if_match;
8097 read_op.conds.if_nomatch = if_nomatch;
8098 read_op.params.attrs = &src_attrs;
8099 read_op.params.lastmod = src_mtime;
8100 read_op.params.obj_size = &obj_size;
8101
8102 ret = read_op.prepare();
8103 if (ret < 0) {
8104 return ret;
8105 }
8106 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
8107 // Current implementation does not follow S3 spec and even
8108 // may result in data corruption silently when copying
8109 // multipart objects acorss pools. So reject COPY operations
8110 //on encrypted objects before it is fully functional.
8111 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
8112 << " has not been implemented." << dendl;
8113 return -ERR_NOT_IMPLEMENTED;
8114 }
8115
8116 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8117 src_attrs.erase(RGW_ATTR_DELETE_AT);
8118
8119 set_copy_attrs(src_attrs, attrs, attrs_mod);
8120 attrs.erase(RGW_ATTR_ID_TAG);
8121 attrs.erase(RGW_ATTR_PG_VER);
8122 attrs.erase(RGW_ATTR_SOURCE_ZONE);
8123 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8124 if (cmp != src_attrs.end())
8125 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8126
8127 RGWObjManifest manifest;
8128 RGWObjState *astate = NULL;
8129
8130 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8131 if (ret < 0) {
8132 return ret;
8133 }
8134
8135 vector<rgw_raw_obj> ref_objs;
8136
8137 if (remote_dest) {
8138 /* dest is in a different zonegroup, copy it there */
8139 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8140 }
8141 uint64_t max_chunk_size;
8142
8143 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8144 if (ret < 0) {
8145 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8146 return ret;
8147 }
8148
8149 rgw_pool src_pool;
8150 rgw_pool dest_pool;
8151 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8152 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8153 return -EIO;
8154 }
8155 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8156 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8157 return -EIO;
8158 }
8159
8160
8161 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8162 bool copy_first = false;
8163 if (astate->has_manifest) {
8164 if (!astate->manifest.has_tail()) {
8165 copy_data = true;
8166 } else {
8167 uint64_t head_size = astate->manifest.get_head_size();
8168
8169 if (head_size > 0) {
8170 if (head_size > max_chunk_size) {
8171 copy_data = true;
8172 } else {
8173 copy_first = true;
8174 }
8175 }
8176 }
8177 }
8178
8179 if (petag) {
8180 const auto iter = attrs.find(RGW_ATTR_ETAG);
8181 if (iter != attrs.end()) {
8182 *petag = iter->second;
8183 }
8184 }
8185
8186 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8187 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8188 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
8189 version_id, ptag, petag);
8190 }
8191
8192 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8193
8194 if (copy_first) { // we need to copy first chunk, not increase refcount
8195 ++miter;
8196 }
8197
8198 rgw_rados_ref ref;
8199 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8200 if (ret < 0) {
8201 return ret;
8202 }
8203
8204 bool versioned_dest = dest_bucket_info.versioning_enabled();
8205
8206 if (version_id && !version_id->empty()) {
8207 versioned_dest = true;
8208 dest_obj.key.set_instance(*version_id);
8209 } else if (versioned_dest) {
8210 gen_rand_obj_instance_name(&dest_obj);
8211 }
8212
8213 bufferlist first_chunk;
8214
8215 bool copy_itself = (dest_obj == src_obj);
8216 RGWObjManifest *pmanifest;
8217 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
8218
8219 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8220 RGWRados::Object::Write write_op(&dest_op_target);
8221
8222 string tag;
8223
8224 if (ptag) {
8225 tag = *ptag;
8226 }
8227
8228 if (tag.empty()) {
8229 append_rand_alpha(cct, tag, tag, 32);
8230 }
8231
8232 if (!copy_itself) {
8233 attrs.erase(RGW_ATTR_TAIL_TAG);
8234 manifest = astate->manifest;
8235 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8236 if (tail_placement.bucket.name.empty()) {
8237 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8238 }
8239 string ref_tag;
8240 for (; miter != astate->manifest.obj_end(); ++miter) {
8241 ObjectWriteOperation op;
8242 ref_tag = tag + '\0';
8243 cls_refcount_get(op, ref_tag, true);
8244 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8245 ref.ioctx.locator_set_key(loc.loc);
8246
8247 ret = ref.ioctx.operate(loc.oid, &op);
8248 if (ret < 0) {
8249 goto done_ret;
8250 }
8251
8252 ref_objs.push_back(loc);
8253 }
8254
8255 pmanifest = &manifest;
8256 } else {
8257 pmanifest = &astate->manifest;
8258 /* don't send the object's tail for garbage collection */
8259 astate->keep_tail = true;
8260 }
8261
8262 if (copy_first) {
8263 ret = read_op.read(0, max_chunk_size, first_chunk);
8264 if (ret < 0) {
8265 goto done_ret;
8266 }
8267
8268 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8269 } else {
8270 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8271 }
8272
8273 write_op.meta.data = &first_chunk;
8274 write_op.meta.manifest = pmanifest;
8275 write_op.meta.ptag = &tag;
8276 write_op.meta.owner = dest_bucket_info.owner;
8277 write_op.meta.mtime = mtime;
8278 write_op.meta.flags = PUT_OBJ_CREATE;
8279 write_op.meta.category = category;
8280 write_op.meta.olh_epoch = olh_epoch;
8281 write_op.meta.delete_at = delete_at;
8282 write_op.meta.modify_tail = !copy_itself;
8283
8284 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8285 if (ret < 0) {
8286 goto done_ret;
8287 }
8288
8289 return 0;
8290
8291 done_ret:
8292 if (!copy_itself) {
8293 vector<rgw_raw_obj>::iterator riter;
8294
8295 /* rollback reference */
8296 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8297 ObjectWriteOperation op;
8298 cls_refcount_put(op, tag, true);
8299
8300 ref.ioctx.locator_set_key(riter->loc);
8301
8302 int r = ref.ioctx.operate(riter->oid, &op);
8303 if (r < 0) {
8304 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8305 }
8306 }
8307 }
8308 return ret;
8309 }
8310
8311
8312 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8313 RGWBucketInfo& dest_bucket_info,
8314 RGWRados::Object::Read& read_op, off_t end,
8315 rgw_obj& dest_obj,
8316 rgw_obj& src_obj,
8317 uint64_t max_chunk_size,
8318 real_time *mtime,
8319 real_time set_mtime,
8320 map<string, bufferlist>& attrs,
8321 RGWObjCategory category,
8322 uint64_t olh_epoch,
8323 real_time delete_at,
8324 string *version_id,
8325 string *ptag,
8326 ceph::buffer::list *petag)
8327 {
8328 bufferlist first_chunk;
8329 RGWObjManifest manifest;
8330
8331 string tag;
8332 append_rand_alpha(cct, tag, tag, 32);
8333
8334 RGWPutObjProcessor_Atomic processor(obj_ctx,
8335 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
8336 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8337 if (version_id) {
8338 processor.set_version_id(*version_id);
8339 }
8340 processor.set_olh_epoch(olh_epoch);
8341 int ret = processor.prepare(this, NULL);
8342 if (ret < 0)
8343 return ret;
8344
8345 off_t ofs = 0;
8346
8347 do {
8348 bufferlist bl;
8349 ret = read_op.read(ofs, end, bl);
8350
8351 uint64_t read_len = ret;
8352 bool again;
8353
8354 do {
8355 void *handle;
8356 rgw_raw_obj obj;
8357
8358 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8359 if (ret < 0) {
8360 return ret;
8361 }
8362 ret = processor.throttle_data(handle, obj, read_len, false);
8363 if (ret < 0)
8364 return ret;
8365 } while (again);
8366
8367 ofs += read_len;
8368 } while (ofs <= end);
8369
8370 string etag;
8371 auto iter = attrs.find(RGW_ATTR_ETAG);
8372 if (iter != attrs.end()) {
8373 bufferlist& bl = iter->second;
8374 etag = string(bl.c_str(), bl.length());
8375 if (petag) {
8376 *petag = bl;
8377 }
8378 }
8379
8380 uint64_t accounted_size;
8381 {
8382 bool compressed{false};
8383 RGWCompressionInfo cs_info;
8384 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8385 if (ret < 0) {
8386 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8387 return ret;
8388 }
8389 // pass original size if compressed
8390 accounted_size = compressed ? cs_info.orig_size : ofs;
8391 }
8392
8393 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8394 }
8395
8396 bool RGWRados::is_meta_master()
8397 {
8398 if (!get_zonegroup().is_master_zonegroup()) {
8399 return false;
8400 }
8401
8402 return (get_zonegroup().master_zone == zone_public_config.id);
8403 }
8404
8405 /**
8406 * Check to see if the bucket metadata could be synced
8407 * bucket: the bucket to check
8408 * Returns false is the bucket is not synced
8409 */
8410 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8411 {
8412
8413 /* no current period */
8414 if (current_period.get_id().empty()) {
8415 return false;
8416 }
8417
8418 /* zonegroup is not master zonegroup */
8419 if (!get_zonegroup().is_master_zonegroup()) {
8420 return false;
8421 }
8422
8423 /* single zonegroup and a single zone */
8424 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
8425 return false;
8426 }
8427
8428 /* zone is not master */
8429 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8430 return false;
8431 }
8432
8433 return true;
8434 }
8435
8436 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8437 {
8438 std::map<string, rgw_bucket_dir_entry> ent_map;
8439 rgw_obj_index_key marker;
8440 string prefix;
8441 bool is_truncated;
8442
8443 do {
8444 #define NUM_ENTRIES 1000
8445 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
8446 &is_truncated, &marker);
8447 if (r < 0)
8448 return r;
8449
8450 string ns;
8451 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
8452 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
8453 rgw_obj_key obj;
8454
8455 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
8456 return -ENOTEMPTY;
8457 }
8458 } while (is_truncated);
8459 return 0;
8460 }
8461
8462 /**
8463 * Delete a bucket.
8464 * bucket: the name of the bucket to delete
8465 * Returns 0 on success, -ERR# otherwise.
8466 */
8467 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8468 {
8469 const rgw_bucket& bucket = bucket_info.bucket;
8470 librados::IoCtx index_ctx;
8471 map<int, string> bucket_objs;
8472 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8473 if (r < 0)
8474 return r;
8475
8476 if (check_empty) {
8477 r = check_bucket_empty(bucket_info);
8478 if (r < 0) {
8479 return r;
8480 }
8481 }
8482
8483 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8484 if (r < 0)
8485 return r;
8486
8487 /* if the bucket is not synced we can remove the meta file */
8488 if (!is_syncing_bucket_meta(bucket)) {
8489 RGWObjVersionTracker objv_tracker;
8490 string entry = bucket.get_key();
8491 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8492 if (r < 0) {
8493 return r;
8494 }
8495 /* remove bucket index objects*/
8496 map<int, string>::const_iterator biter;
8497 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8498 index_ctx.remove(biter->second);
8499 }
8500 }
8501 return 0;
8502 }
8503
8504 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8505 {
8506 RGWBucketInfo info;
8507 map<string, bufferlist> attrs;
8508 RGWObjectCtx obj_ctx(this);
8509 int r;
8510 if (bucket.bucket_id.empty()) {
8511 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8512 } else {
8513 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8514 }
8515 if (r < 0) {
8516 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8517 return r;
8518 }
8519
8520 info.owner = owner.get_id();
8521
8522 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8523 if (r < 0) {
8524 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8525 return r;
8526 }
8527
8528 return 0;
8529 }
8530
8531
8532 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8533 {
8534 int ret = 0;
8535
8536 vector<rgw_bucket>::iterator iter;
8537
8538 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8539 rgw_bucket& bucket = *iter;
8540 if (enabled)
8541 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8542 else
8543 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8544
8545 RGWBucketInfo info;
8546 map<string, bufferlist> attrs;
8547 RGWObjectCtx obj_ctx(this);
8548 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8549 if (r < 0) {
8550 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8551 ret = r;
8552 continue;
8553 }
8554 if (enabled) {
8555 info.flags &= ~BUCKET_SUSPENDED;
8556 } else {
8557 info.flags |= BUCKET_SUSPENDED;
8558 }
8559
8560 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8561 if (r < 0) {
8562 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8563 ret = r;
8564 continue;
8565 }
8566 }
8567 return ret;
8568 }
8569
8570 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8571 {
8572 RGWBucketInfo bucket_info;
8573 RGWObjectCtx obj_ctx(this);
8574 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8575 if (ret < 0) {
8576 return ret;
8577 }
8578
8579 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8580 return 0;
8581 }
8582
8583 int RGWRados::Object::complete_atomic_modification()
8584 {
8585 if (!state->has_manifest || state->keep_tail)
8586 return 0;
8587
8588 cls_rgw_obj_chain chain;
8589 store->update_gc_chain(obj, state->manifest, &chain);
8590
8591 if (chain.empty()) {
8592 return 0;
8593 }
8594
8595 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
8596 return store->gc->send_chain(chain, tag, false); // do it async
8597 }
8598
8599 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8600 {
8601 RGWObjManifest::obj_iterator iter;
8602 rgw_raw_obj raw_head;
8603 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8604 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8605 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8606 if (mobj == raw_head)
8607 continue;
8608 cls_rgw_obj_key key(mobj.oid);
8609 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8610 }
8611 }
8612
8613 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8614 {
8615 return gc->send_chain(chain, tag, sync);
8616 }
8617
8618 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8619 {
8620 const rgw_bucket& bucket = bucket_info.bucket;
8621 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8622 if (r < 0)
8623 return r;
8624
8625 if (bucket.bucket_id.empty()) {
8626 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8627 return -EIO;
8628 }
8629
8630 bucket_oid = dir_oid_prefix;
8631 bucket_oid.append(bucket.bucket_id);
8632
8633 return 0;
8634 }
8635
8636 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8637 string& bucket_oid_base) {
8638 const rgw_bucket& bucket = bucket_info.bucket;
8639 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8640 if (r < 0)
8641 return r;
8642
8643 if (bucket.bucket_id.empty()) {
8644 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8645 return -EIO;
8646 }
8647
8648 bucket_oid_base = dir_oid_prefix;
8649 bucket_oid_base.append(bucket.bucket_id);
8650
8651 return 0;
8652
8653 }
8654
8655 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8656 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8657 string bucket_oid_base;
8658 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8659 if (ret < 0) {
8660 return ret;
8661 }
8662
8663 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8664 if (bucket_instance_ids) {
8665 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8666 }
8667 return 0;
8668 }
8669
8670 template<typename T>
8671 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8672 map<int, string>& oids, map<int, T>& bucket_objs,
8673 int shard_id, map<int, string> *bucket_instance_ids)
8674 {
8675 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8676 if (ret < 0)
8677 return ret;
8678
8679 map<int, string>::const_iterator iter = oids.begin();
8680 for (; iter != oids.end(); ++iter) {
8681 bucket_objs[iter->first] = T();
8682 }
8683 return 0;
8684 }
8685
8686 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8687 const string& obj_key, string *bucket_obj, int *shard_id)
8688 {
8689 string bucket_oid_base;
8690 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8691 if (ret < 0)
8692 return ret;
8693
8694 RGWObjectCtx obj_ctx(this);
8695
8696 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8697 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8698 if (ret < 0) {
8699 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8700 return ret;
8701 }
8702 return 0;
8703 }
8704
8705 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8706 int shard_id, string *bucket_obj)
8707 {
8708 string bucket_oid_base;
8709 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8710 if (ret < 0)
8711 return ret;
8712
8713 RGWObjectCtx obj_ctx(this);
8714
8715 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8716 shard_id, bucket_obj);
8717 return 0;
8718 }
8719
8720 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8721 map<RGWObjCategory, RGWStorageStats>& stats)
8722 {
8723 for (const auto& pair : header.stats) {
8724 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8725 const rgw_bucket_category_stats& header_stats = pair.second;
8726
8727 RGWStorageStats& s = stats[category];
8728
8729 s.category = category;
8730 s.size += header_stats.total_size;
8731 s.size_rounded += header_stats.total_size_rounded;
8732 s.size_utilized += header_stats.actual_size;
8733 s.num_objects += header_stats.num_entries;
8734 }
8735 }
8736
8737 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8738 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8739 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8740 {
8741 librados::IoCtx index_ctx;
8742 // key - bucket index object id
8743 // value - bucket index check OP returned result with the given bucket index object (shard)
8744 map<int, string> oids;
8745 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8746
8747 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8748 if (ret < 0) {
8749 return ret;
8750 }
8751
8752 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8753 if (ret < 0) {
8754 return ret;
8755 }
8756
8757 // Aggregate results (from different shards if there is any)
8758 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8759 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8760 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8761 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8762 }
8763
8764 return 0;
8765 }
8766
8767 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8768 {
8769 librados::IoCtx index_ctx;
8770 map<int, string> bucket_objs;
8771
8772 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8773 if (r < 0) {
8774 return r;
8775 }
8776
8777 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8778 }
8779
8780 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8781 {
8782 librados::IoCtx index_ctx;
8783 map<int, string> bucket_objs;
8784
8785 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8786 if (r < 0) {
8787 return r;
8788 }
8789
8790 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8791 }
8792
8793 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8794 {
8795 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8796 std::string oid, key;
8797 get_obj_bucket_and_oid_loc(obj, oid, key);
8798 if (!rctx)
8799 return 0;
8800
8801 RGWObjState *state = NULL;
8802
8803 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8804 if (r < 0)
8805 return r;
8806
8807 if (!state->is_atomic) {
8808 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8809 return -EINVAL;
8810 }
8811
8812 string tag;
8813
8814 if (state->tail_tag.length() > 0) {
8815 tag = state->tail_tag.c_str();
8816 } else if (state->obj_tag.length() > 0) {
8817 tag = state->obj_tag.c_str();
8818 } else {
8819 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8820 return -EINVAL;
8821 }
8822
8823 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8824
8825 return gc->defer_chain(tag, false);
8826 }
8827
8828 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8829 {
8830 list<string> prefixes;
8831 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8832 cls_rgw_remove_obj(op, prefixes);
8833 }
8834
8835 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
8836 {
8837 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
8838 }
8839
8840 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
8841 {
8842 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
8843 }
8844
8845
8846 /**
8847 * Delete an object.
8848 * The object, bucket info and context come from the Object this op targets (target);
8849 * options are read from params and outcomes are written to result.
8850 * Returns: 0 on success, -ERR# otherwise.
 *
 * Two paths exist:
 *  - versioned (BUCKET_VERSIONED set in params.versioning_status, or
 *    params.marker_version_id non-empty): either a delete marker is recorded
 *    through set_olh(), or the requested instance is unlinked; result.version_id
 *    and result.delete_marker are filled in, and a data log entry is added when
 *    the bucket's datasync flag is enabled.
 *  - plain: the head object is removed under a prepared bucket index
 *    operation, after the unmod_since / expiration_time preconditions pass.
 * Precondition failures return -ERR_PRECONDITION_FAILED; a non-existing object
 * returns -ENOENT.
8851 */
8852 int RGWRados::Object::Delete::delete_obj()
8853 {
8854 RGWRados *store = target->get_store();
8855 rgw_obj& src_obj = target->get_obj();
8856 const string& instance = src_obj.key.instance;
// work on a copy so the target's own object is left untouched
8857 rgw_obj obj = src_obj;
8858
// the instance name "null" is handled as "no instance" on the local copy
8859 if (instance == "null") {
8860 obj.key.instance.clear();
8861 }
8862
8863 bool explicit_marker_version = (!params.marker_version_id.empty());
8864
// ---- versioned path ----
8865 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
// no instance requested (or an explicit marker version given): create a delete marker
8866 if (instance.empty() || explicit_marker_version) {
8867 rgw_obj marker = obj;
8868
// marker instance: the supplied id (unless it is "null"), else a random
// name when versioning is not suspended, else left as copied from obj
8869 if (!params.marker_version_id.empty()) {
8870 if (params.marker_version_id != "null") {
8871 marker.key.set_instance(params.marker_version_id);
8872 }
8873 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
8874 store->gen_rand_obj_instance_name(&marker);
8875 }
8876
8877 result.version_id = marker.key.instance;
8878 result.delete_marker = true;
8879
8880 struct rgw_bucket_dir_entry_meta meta;
8881
8882 meta.owner = params.obj_owner.get_id().to_str();
8883 meta.owner_display_name = params.obj_owner.get_display_name();
8884
// a zero params.mtime means "use the current time"
8885 if (real_clock::is_zero(params.mtime)) {
8886 meta.mtime = real_clock::now();
8887 } else {
8888 meta.mtime = params.mtime;
8889 }
8890
8891 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
8892 if (r < 0) {
8893 return r;
8894 }
8895 } else {
// a specific instance was requested: read its index entry, then unlink it
8896 rgw_bucket_dir_entry dirent;
8897
8898 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
8899 if (r < 0) {
8900 return r;
8901 }
// report whether the removed instance was itself a delete marker
8902 result.delete_marker = dirent.is_delete_marker();
8903 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
8904 if (r < 0) {
8905 return r;
8906 }
8907 result.version_id = instance;
8908 }
8909
8910 BucketShard *bs;
8911 int r = target->get_bucket_shard(&bs);
8912 if (r < 0) {
8913 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
8914 return r;
8915 }
8916
// record the change in the data log only when the datasync flag is enabled
8917 if (target->bucket_info.datasync_flag_enabled()) {
8918 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
8919 if (r < 0) {
8920 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
8921 return r;
8922 }
8923 }
8924
8925 return 0;
8926 }
8927
// ---- plain (non-versioned) path ----
8928 rgw_rados_ref ref;
8929 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
8930 if (r < 0) {
8931 return r;
8932 }
8933
8934 RGWObjState *state;
8935 r = target->get_state(&state, false);
8936 if (r < 0)
8937 return r;
8938
8939 ObjectWriteOperation op;
8940
// If-Unmodified-Since: checked here against the cached state, and again as
// part of the write operation via cls_obj_check_mtime()
8941 if (!real_clock::is_zero(params.unmod_since)) {
8942 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
8943 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
// without high precision, compare at whole-second granularity
8944 if (!params.high_precision_time) {
8945 ctime.tv_nsec = 0;
8946 unmod.tv_nsec = 0;
8947 }
8948
8949 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
8950 if (ctime > unmod) {
8951 return -ERR_PRECONDITION_FAILED;
8952 }
8953
8954 /* only delete object if mtime is less than or equal to params.unmod_since */
8955 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
8956 }
// remembered now for the quota update at the end
8957 uint64_t obj_size = state->size;
8958
// when an expiration time is given, it must equal the object's
// RGW_ATTR_DELETE_AT attribute; a missing attribute also fails the precondition
8959 if (!real_clock::is_zero(params.expiration_time)) {
8960 bufferlist bl;
8961 real_time delete_at;
8962
8963 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
8964 try {
8965 bufferlist::iterator iter = bl.begin();
8966 ::decode(delete_at, iter);
8967 } catch (buffer::error& err) {
8968 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
8969 return -EIO;
8970 }
8971
8972 if (params.expiration_time != delete_at) {
8973 return -ERR_PRECONDITION_FAILED;
8974 }
8975 } else {
8976 return -ERR_PRECONDITION_FAILED;
8977 }
8978 }
8979
// nothing to delete: drop the cached state and report -ENOENT
8980 if (!state->exists) {
8981 target->invalidate_state();
8982 return -ENOENT;
8983 }
8984
8985 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
8986 if (r < 0)
8987 return r;
8988
8989 RGWBucketInfo& bucket_info = target->get_bucket_info();
8990
8991 RGWRados::Bucket bop(store, bucket_info);
8992 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8993
8994 index_op.set_zones_trace(params.zones_trace);
8995 index_op.set_bilog_flags(params.bilog_flags);
8996
// the index operation is prepared first, then completed or cancelled below
// depending on the outcome of the head object removal
8997 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
8998 if (r < 0)
8999 return r;
9000
9001 store->remove_rgw_head_obj(op);
9002 r = ref.ioctx.operate(ref.oid, &op);
9003
9004 /* raced with another operation, object state is indeterminate */
9005 const bool need_invalidate = (r == -ECANCELED);
9006
9007 int64_t poolid = ref.ioctx.get_id();
9008 if (r >= 0) {
// removal succeeded: add a tombstone cache entry (when the cache exists),
// then complete the index operation
9009 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
9010 if (obj_tombstone_cache) {
9011 tombstone_entry entry{*state};
9012 obj_tombstone_cache->add(obj, entry);
9013 }
9014 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
9015
// called even if complete_del() failed; its own failure is only logged
9016 int ret = target->complete_atomic_modification();
9017 if (ret < 0) {
9018 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
9019 }
9020 /* other than that, no need to propagate error */
9021 } else {
// removal failed: cancel the prepared index operation; a cancel failure is only logged
9022 int ret = index_op.cancel();
9023 if (ret < 0) {
9024 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
9025 }
9026 }
9027
9028 if (need_invalidate) {
9029 target->invalidate_state();
9030 }
9031
// r is the operate() error, or the complete_del() result on the success path
9032 if (r < 0)
9033 return r;
9034
9035 /* update quota cache */
9036 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
9037
9038 return 0;
9039 }
9040
9041 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
9042 const RGWBucketInfo& bucket_info,
9043 const rgw_obj& obj,
9044 int versioning_status,
9045 uint16_t bilog_flags,
9046 const real_time& expiration_time,
9047 rgw_zone_set *zones_trace)
9048 {
9049 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
9050 RGWRados::Object::Delete del_op(&del_target);
9051
9052 del_op.params.bucket_owner = bucket_info.owner;
9053 del_op.params.versioning_status = versioning_status;
9054 del_op.params.bilog_flags = bilog_flags;
9055 del_op.params.expiration_time = expiration_time;
9056 del_op.params.zones_trace = zones_trace;
9057
9058 return del_op.delete_obj();
9059 }
9060
9061 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
9062 {
9063 rgw_rados_ref ref;
9064 int r = get_raw_obj_ref(obj, &ref);
9065 if (r < 0) {
9066 return r;
9067 }
9068
9069 ObjectWriteOperation op;
9070
9071 op.remove();
9072 r = ref.ioctx.operate(ref.oid, &op);
9073 if (r < 0)
9074 return r;
9075
9076 return 0;
9077 }
9078
9079 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
9080 {
9081 if (obj.empty()) {
9082 ldout(cct, 1) << "delete_system_obj got empty object name "
9083 << obj << ", returning EINVAL" << dendl;
9084 return -EINVAL;
9085 }
9086 rgw_rados_ref ref;
9087 int r = get_raw_obj_ref(obj, &ref);
9088 if (r < 0) {
9089 return r;
9090 }
9091
9092 ObjectWriteOperation op;
9093
9094 if (objv_tracker) {
9095 objv_tracker->prepare_op_for_write(&op);
9096 }
9097
9098 op.remove();
9099 r = ref.ioctx.operate(ref.oid, &op);
9100 if (r < 0)
9101 return r;
9102
9103 return 0;
9104 }
9105
9106 int RGWRados::delete_obj_index(const rgw_obj& obj)
9107 {
9108 std::string oid, key;
9109 get_obj_bucket_and_oid_loc(obj, oid, key);
9110
9111 RGWObjectCtx obj_ctx(this);
9112
9113 RGWBucketInfo bucket_info;
9114 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9115 if (ret < 0) {
9116 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9117 return ret;
9118 }
9119
9120 RGWRados::Bucket bop(this, bucket_info);
9121 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9122
9123 real_time removed_mtime;
9124 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9125
9126 return r;
9127 }
9128
9129 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9130 {
9131 string tag;
9132
9133 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9134 if (mi != manifest.obj_end()) {
9135 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9136 ++mi;
9137 tag = mi.get_location().get_raw_obj(store).oid;
9138 tag.append("_");
9139 }
9140
9141 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9142 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9143 MD5 hash;
9144 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9145
9146 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9147 if (iter != attrset.end()) {
9148 bufferlist& bl = iter->second;
9149 hash.Update((const byte *)bl.c_str(), bl.length());
9150 }
9151
9152 hash.Final(md5);
9153 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9154 tag.append(md5_str);
9155
9156 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9157
9158 tag_bl.append(tag.c_str(), tag.size() + 1);
9159 }
9160
9161 static bool is_olh(map<string, bufferlist>& attrs)
9162 {
9163 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9164 return (iter != attrs.end());
9165 }
9166
9167 static bool has_olh_tag(map<string, bufferlist>& attrs)
9168 {
9169 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9170 return (iter != attrs.end());
9171 }
9172
9173 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9174 RGWObjState *olh_state, RGWObjState **target_state)
9175 {
9176 assert(olh_state->is_olh);
9177
9178 rgw_obj target;
9179 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9180 if (r < 0) {
9181 return r;
9182 }
9183 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9184 if (r < 0) {
9185 return r;
9186 }
9187
9188 return 0;
9189 }
9190
9191 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9192 {
9193 if (obj.empty()) {
9194 return -EINVAL;
9195 }
9196
9197 RGWRawObjState *s = rctx->raw.get_state(obj);
9198 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9199 *state = s;
9200 if (s->has_attrs) {
9201 return 0;
9202 }
9203
9204 s->obj = obj;
9205
9206 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9207 if (r == -ENOENT) {
9208 s->exists = false;
9209 s->has_attrs = true;
9210 s->mtime = real_time();
9211 return 0;
9212 }
9213 if (r < 0)
9214 return r;
9215
9216 s->exists = true;
9217 s->has_attrs = true;
9218 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9219
9220 if (s->obj_tag.length())
9221 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9222 << s->obj_tag.c_str() << dendl;
9223 else
9224 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9225
9226 return 0;
9227 }
9228
9229 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9230 {
9231 int ret;
9232
9233 do {
9234 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9235 } while (ret == -EAGAIN);
9236
9237 return ret;
9238 }
9239
9240 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9241 RGWObjState **state, bool follow_olh, bool assume_noent)
9242 {
9243 if (obj.empty()) {
9244 return -EINVAL;
9245 }
9246
9247 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9248
9249 RGWObjState *s = rctx->obj.get_state(obj);
9250 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9251 *state = s;
9252 if (s->has_attrs) {
9253 if (s->is_olh && need_follow_olh) {
9254 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9255 }
9256 return 0;
9257 }
9258
9259 s->obj = obj;
9260
9261 rgw_raw_obj raw_obj;
9262 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9263
9264 int r = -ENOENT;
9265
9266 if (!assume_noent) {
9267 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9268 }
9269
9270 if (r == -ENOENT) {
9271 s->exists = false;
9272 s->has_attrs = true;
9273 tombstone_entry entry;
9274 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9275 s->mtime = entry.mtime;
9276 s->zone_short_id = entry.zone_short_id;
9277 s->pg_ver = entry.pg_ver;
9278 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9279 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9280 } else {
9281 s->mtime = real_time();
9282 }
9283 return 0;
9284 }
9285 if (r < 0)
9286 return r;
9287
9288 s->exists = true;
9289 s->has_attrs = true;
9290 s->accounted_size = s->size;
9291
9292 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9293 const bool compressed = (iter != s->attrset.end());
9294 if (compressed) {
9295 // use uncompressed size for accounted_size
9296 try {
9297 RGWCompressionInfo info;
9298 auto p = iter->second.begin();
9299 ::decode(info, p);
9300 s->accounted_size = info.orig_size;
9301 } catch (buffer::error&) {
9302 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9303 return -EIO;
9304 }
9305 }
9306
9307 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9308 if (iter != s->attrset.end()) {
9309 bufferlist bl = iter->second;
9310 bufferlist::iterator it = bl.begin();
9311 it.copy(bl.length(), s->shadow_obj);
9312 s->shadow_obj[bl.length()] = '\0';
9313 }
9314 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9315 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9316 if (ttiter != s->attrset.end()) {
9317 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9318 }
9319
9320 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9321 if (manifest_bl.length()) {
9322 bufferlist::iterator miter = manifest_bl.begin();
9323 try {
9324 ::decode(s->manifest, miter);
9325 s->has_manifest = true;
9326 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9327 broken due to old bugs */
9328 s->size = s->manifest.get_obj_size();
9329 if (!compressed)
9330 s->accounted_size = s->size;
9331 } catch (buffer::error& err) {
9332 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9333 return -EIO;
9334 }
9335 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9336 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9337 RGWObjManifest::obj_iterator mi;
9338 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9339 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9340 }
9341 }
9342
9343 if (!s->obj_tag.length()) {
9344 /*
9345 * Uh oh, something's wrong, object with manifest should have tag. Let's
9346 * create one out of the manifest, would be unique
9347 */
9348 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9349 s->fake_tag = true;
9350 }
9351 }
9352 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9353 if (aiter != s->attrset.end()) {
9354 bufferlist& pg_ver_bl = aiter->second;
9355 if (pg_ver_bl.length()) {
9356 bufferlist::iterator pgbl = pg_ver_bl.begin();
9357 try {
9358 ::decode(s->pg_ver, pgbl);
9359 } catch (buffer::error& err) {
9360 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9361 }
9362 }
9363 }
9364 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9365 if (aiter != s->attrset.end()) {
9366 bufferlist& zone_short_id_bl = aiter->second;
9367 if (zone_short_id_bl.length()) {
9368 bufferlist::iterator zbl = zone_short_id_bl.begin();
9369 try {
9370 ::decode(s->zone_short_id, zbl);
9371 } catch (buffer::error& err) {
9372 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9373 }
9374 }
9375 }
9376 if (s->obj_tag.length())
9377 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9378 else
9379 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9380
9381 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9382 * it exist, and not only if is_olh() returns true
9383 */
9384 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9385 if (iter != s->attrset.end()) {
9386 s->olh_tag = iter->second;
9387 }
9388
9389 if (is_olh(s->attrset)) {
9390 s->is_olh = true;
9391
9392 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9393
9394 if (need_follow_olh) {
9395 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9396 }
9397 }
9398
9399 return 0;
9400 }
9401
9402 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9403 bool follow_olh, bool assume_noent)
9404 {
9405 int ret;
9406
9407 do {
9408 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9409 } while (ret == -EAGAIN);
9410
9411 return ret;
9412 }
9413
9414 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9415 {
9416 RGWObjState *astate;
9417 int r = get_state(&astate, true);
9418 if (r < 0) {
9419 return r;
9420 }
9421
9422 *pmanifest = &astate->manifest;
9423
9424 return 0;
9425 }
9426
9427 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9428 {
9429 RGWObjState *state;
9430 int r = source->get_state(&state, true);
9431 if (r < 0)
9432 return r;
9433 if (!state->exists)
9434 return -ENOENT;
9435 if (!state->get_attr(name, dest))
9436 return -ENODATA;
9437
9438 return 0;
9439 }
9440
9441
9442 int RGWRados::Object::Stat::stat_async()
9443 {
9444 RGWObjectCtx& ctx = source->get_ctx();
9445 rgw_obj& obj = source->get_obj();
9446 RGWRados *store = source->get_store();
9447
9448 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9449 result.obj = obj;
9450 if (s->has_attrs) {
9451 state.ret = 0;
9452 result.size = s->size;
9453 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9454 result.attrs = s->attrset;
9455 result.has_manifest = s->has_manifest;
9456 result.manifest = s->manifest;
9457 return 0;
9458 }
9459
9460 string oid;
9461 string loc;
9462 get_obj_bucket_and_oid_loc(obj, oid, loc);
9463
9464 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9465 if (r < 0) {
9466 return r;
9467 }
9468
9469 librados::ObjectReadOperation op;
9470 op.stat2(&result.size, &result.mtime, NULL);
9471 op.getxattrs(&result.attrs, NULL);
9472 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9473 state.io_ctx.locator_set_key(loc);
9474 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9475 if (r < 0) {
9476 ldout(store->ctx(), 5) << __func__
9477 << ": ERROR: aio_operate() returned ret=" << r
9478 << dendl;
9479 return r;
9480 }
9481
9482 return 0;
9483 }
9484
9485
9486 int RGWRados::Object::Stat::wait()
9487 {
9488 if (!state.completion) {
9489 return state.ret;
9490 }
9491
9492 state.completion->wait_for_safe();
9493 state.ret = state.completion->get_return_value();
9494 state.completion->release();
9495
9496 if (state.ret != 0) {
9497 return state.ret;
9498 }
9499
9500 return finish();
9501 }
9502
9503 int RGWRados::Object::Stat::finish()
9504 {
9505 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9506 if (iter != result.attrs.end()) {
9507 bufferlist& bl = iter->second;
9508 bufferlist::iterator biter = bl.begin();
9509 try {
9510 ::decode(result.manifest, biter);
9511 } catch (buffer::error& err) {
9512 RGWRados *store = source->get_store();
9513 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9514 return -EIO;
9515 }
9516 result.has_manifest = true;
9517 }
9518
9519 return 0;
9520 }
9521
9522 /**
9523 * Get an attribute for a system object.
9524 * obj: the object to get attr
9525 * name: name of the attr to retrieve
9526 * dest: bufferlist to store the result in
9527 * Returns: 0 on success, -ERR# otherwise.
9528 */
9529 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9530 {
9531 rgw_rados_ref ref;
9532 int r = get_system_obj_ref(obj, &ref);
9533 if (r < 0) {
9534 return r;
9535 }
9536
9537 ObjectReadOperation op;
9538
9539 int rval;
9540 op.getxattr(name, &dest, &rval);
9541
9542 r = ref.ioctx.operate(ref.oid, &op, NULL);
9543 if (r < 0)
9544 return r;
9545
9546 return 0;
9547 }
9548
9549 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9550 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9551 ObjectOperation& op, RGWObjState **pstate)
9552 {
9553 if (!rctx)
9554 return 0;
9555
9556 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9557 if (r < 0)
9558 return r;
9559
9560 RGWObjState *state = *pstate;
9561
9562 if (!state->is_atomic) {
9563 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9564 return 0;
9565 }
9566
9567 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9568 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9569 } else {
9570 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9571 }
9572 return 0;
9573 }
9574
9575 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9576 {
9577 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9578 }
9579
9580 void RGWRados::Object::invalidate_state()
9581 {
9582 ctx.obj.invalidate(obj);
9583 }
9584
9585 void RGWRados::SystemObject::invalidate_state()
9586 {
9587 ctx.raw.invalidate(obj);
9588 }
9589
9590 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9591 const char *if_match, const char *if_nomatch, bool removal_op,
9592 bool modify_tail)
9593 {
9594 int r = get_state(&state, false);
9595 if (r < 0)
9596 return r;
9597
9598 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9599 if_match != NULL || if_nomatch != NULL) &&
9600 (!state->fake_tag);
9601
9602 if (!state->is_atomic) {
9603 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9604
9605 if (reset_obj) {
9606 op.create(false);
9607 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9608 }
9609
9610 return 0;
9611 }
9612
9613 if (need_guard) {
9614 /* first verify that the object wasn't replaced under */
9615 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9616 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9617 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9618 }
9619
9620 if (if_match) {
9621 if (strcmp(if_match, "*") == 0) {
9622 // test the object is existing
9623 if (!state->exists) {
9624 return -ERR_PRECONDITION_FAILED;
9625 }
9626 } else {
9627 bufferlist bl;
9628 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9629 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9630 return -ERR_PRECONDITION_FAILED;
9631 }
9632 }
9633 }
9634
9635 if (if_nomatch) {
9636 if (strcmp(if_nomatch, "*") == 0) {
9637 // test the object is NOT existing
9638 if (state->exists) {
9639 return -ERR_PRECONDITION_FAILED;
9640 }
9641 } else {
9642 bufferlist bl;
9643 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9644 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9645 return -ERR_PRECONDITION_FAILED;
9646 }
9647 }
9648 }
9649 }
9650
9651 if (reset_obj) {
9652 if (state->exists) {
9653 op.create(false);
9654 store->remove_rgw_head_obj(op);
9655 } else {
9656 op.create(true);
9657 }
9658 }
9659
9660 if (removal_op) {
9661 /* the object is being removed, no need to update its tag */
9662 return 0;
9663 }
9664
9665 if (ptag) {
9666 state->write_tag = *ptag;
9667 } else {
9668 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9669 }
9670 bufferlist bl;
9671 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9672
9673 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9674
9675 op.setxattr(RGW_ATTR_ID_TAG, bl);
9676 if (modify_tail) {
9677 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9678 }
9679
9680 return 0;
9681 }
9682
9683 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9684 RGWObjVersionTracker *objv_tracker)
9685 {
9686 map<string, bufferlist> attrs;
9687 attrs[name] = bl;
9688 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9689 }
9690
9691 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9692 map<string, bufferlist>& attrs,
9693 map<string, bufferlist>* rmattrs,
9694 RGWObjVersionTracker *objv_tracker)
9695 {
9696 rgw_rados_ref ref;
9697 int r = get_system_obj_ref(obj, &ref);
9698 if (r < 0) {
9699 return r;
9700 }
9701 ObjectWriteOperation op;
9702
9703 if (objv_tracker) {
9704 objv_tracker->prepare_op_for_write(&op);
9705 }
9706
9707 map<string, bufferlist>::iterator iter;
9708 if (rmattrs) {
9709 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9710 const string& name = iter->first;
9711 op.rmxattr(name.c_str());
9712 }
9713 }
9714
9715 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9716 const string& name = iter->first;
9717 bufferlist& bl = iter->second;
9718
9719 if (!bl.length())
9720 continue;
9721
9722 op.setxattr(name.c_str(), bl);
9723 }
9724
9725 if (!op.size())
9726 return 0;
9727
9728 bufferlist bl;
9729
9730 r = ref.ioctx.operate(ref.oid, &op);
9731 if (r < 0)
9732 return r;
9733
9734 return 0;
9735 }
9736
9737 /**
9738 * Set an attr on an object.
9739 * bucket: name of the bucket holding the object
9740 * obj: name of the object to set the attr on
9741 * name: the attr to set
9742 * bl: the contents of the attr
9743 * Returns: 0 on success, -ERR# otherwise.
9744 */
9745 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9746 {
9747 map<string, bufferlist> attrs;
9748 attrs[name] = bl;
9749 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9750 }
9751
9752 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9753 map<string, bufferlist>& attrs,
9754 map<string, bufferlist>* rmattrs)
9755 {
9756 rgw_rados_ref ref;
9757 int r = get_obj_head_ref(bucket_info, obj, &ref);
9758 if (r < 0) {
9759 return r;
9760 }
9761 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9762
9763 ObjectWriteOperation op;
9764 RGWObjState *state = NULL;
9765
9766 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9767 if (r < 0)
9768 return r;
9769
9770 map<string, bufferlist>::iterator iter;
9771 if (rmattrs) {
9772 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9773 const string& name = iter->first;
9774 op.rmxattr(name.c_str());
9775 }
9776 }
9777
9778 const rgw_bucket& bucket = obj.bucket;
9779
9780 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9781 const string& name = iter->first;
9782 bufferlist& bl = iter->second;
9783
9784 if (!bl.length())
9785 continue;
9786
9787 op.setxattr(name.c_str(), bl);
9788
9789 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9790 real_time ts;
9791 try {
9792 ::decode(ts, bl);
9793
9794 rgw_obj_index_key obj_key;
9795 obj.key.get_index_key(&obj_key);
9796
9797 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9798 } catch (buffer::error& err) {
9799 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9800 }
9801 }
9802 }
9803
9804 if (!op.size())
9805 return 0;
9806
9807 RGWObjectCtx obj_ctx(this);
9808
9809 bufferlist bl;
9810 RGWRados::Bucket bop(this, bucket_info);
9811 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9812
9813 if (state) {
9814 string tag;
9815 append_rand_alpha(cct, tag, tag, 32);
9816 state->write_tag = tag;
9817 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9818
9819 if (r < 0)
9820 return r;
9821
9822 bl.append(tag.c_str(), tag.size() + 1);
9823 op.setxattr(RGW_ATTR_ID_TAG, bl);
9824 }
9825
9826
9827 real_time mtime = real_clock::now();
9828 struct timespec mtime_ts = real_clock::to_timespec(mtime);
9829 op.mtime2(&mtime_ts);
9830 r = ref.ioctx.operate(ref.oid, &op);
9831 if (state) {
9832 if (r >= 0) {
9833 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9834 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9835 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9836 string etag(etag_bl.c_str(), etag_bl.length());
9837 string content_type(content_type_bl.c_str(), content_type_bl.length());
9838 uint64_t epoch = ref.ioctx.get_last_version();
9839 int64_t poolid = ref.ioctx.get_id();
9840 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9841 mtime, etag, content_type, &acl_bl,
9842 RGW_OBJ_CATEGORY_MAIN, NULL);
9843 } else {
9844 int ret = index_op.cancel();
9845 if (ret < 0) {
9846 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9847 }
9848 }
9849 }
9850 if (r < 0)
9851 return r;
9852
9853 if (state) {
9854 state->obj_tag.swap(bl);
9855 if (rmattrs) {
9856 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9857 state->attrset.erase(iter->first);
9858 }
9859 }
9860 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9861 state->attrset[iter->first] = iter->second;
9862 }
9863 }
9864
9865 return 0;
9866 }
9867
9868 int RGWRados::Object::Read::prepare()
9869 {
9870 RGWRados *store = source->get_store();
9871 CephContext *cct = store->ctx();
9872
9873 bufferlist etag;
9874
9875 map<string, bufferlist>::iterator iter;
9876
9877 RGWObjState *astate;
9878 int r = source->get_state(&astate, true);
9879 if (r < 0)
9880 return r;
9881
9882 if (!astate->exists) {
9883 return -ENOENT;
9884 }
9885
9886 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9887
9888 state.obj = astate->obj;
9889 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9890
9891 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9892 if (r < 0) {
9893 return r;
9894 }
9895 if (params.attrs) {
9896 *params.attrs = astate->attrset;
9897 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9898 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9899 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9900 }
9901 }
9902 }
9903
9904 /* Convert all times go GMT to make them compatible */
9905 if (conds.mod_ptr || conds.unmod_ptr) {
9906 obj_time_weight src_weight;
9907 src_weight.init(astate);
9908 src_weight.high_precision = conds.high_precision_time;
9909
9910 obj_time_weight dest_weight;
9911 dest_weight.high_precision = conds.high_precision_time;
9912
9913 if (conds.mod_ptr) {
9914 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9915 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9916 if (!(dest_weight < src_weight)) {
9917 return -ERR_NOT_MODIFIED;
9918 }
9919 }
9920
9921 if (conds.unmod_ptr) {
9922 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9923 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9924 if (dest_weight < src_weight) {
9925 return -ERR_PRECONDITION_FAILED;
9926 }
9927 }
9928 }
9929 if (conds.if_match || conds.if_nomatch) {
9930 r = get_attr(RGW_ATTR_ETAG, etag);
9931 if (r < 0)
9932 return r;
9933
9934 if (conds.if_match) {
9935 string if_match_str = rgw_string_unquote(conds.if_match);
9936 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9937 if (if_match_str.compare(etag.c_str()) != 0) {
9938 return -ERR_PRECONDITION_FAILED;
9939 }
9940 }
9941
9942 if (conds.if_nomatch) {
9943 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9944 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9945 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9946 return -ERR_NOT_MODIFIED;
9947 }
9948 }
9949 }
9950
9951 if (params.obj_size)
9952 *params.obj_size = astate->size;
9953 if (params.lastmod)
9954 *params.lastmod = astate->mtime;
9955
9956 return 0;
9957 }
9958
9959 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9960 {
9961 if (ofs < 0) {
9962 ofs += obj_size;
9963 if (ofs < 0)
9964 ofs = 0;
9965 end = obj_size - 1;
9966 } else if (end < 0) {
9967 end = obj_size - 1;
9968 }
9969
9970 if (obj_size > 0) {
9971 if (ofs >= (off_t)obj_size) {
9972 return -ERANGE;
9973 }
9974 if (end >= (off_t)obj_size) {
9975 end = obj_size - 1;
9976 }
9977 }
9978 return 0;
9979 }
9980
9981 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9982 {
9983 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9984 }
9985
9986 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9987 RGWRados::SystemObject::Read::GetObjState& state,
9988 rgw_raw_obj& obj,
9989 map<string, bufferlist> *attrs,
9990 real_time *lastmod,
9991 uint64_t *obj_size,
9992 RGWObjVersionTracker *objv_tracker)
9993 {
9994 RGWRawObjState *astate = NULL;
9995
9996 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
9997 if (r < 0)
9998 return r;
9999
10000 if (!astate->exists) {
10001 return -ENOENT;
10002 }
10003
10004 if (attrs) {
10005 *attrs = astate->attrset;
10006 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10007 map<string, bufferlist>::iterator iter;
10008 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
10009 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10010 }
10011 }
10012 }
10013
10014 if (obj_size)
10015 *obj_size = astate->size;
10016 if (lastmod)
10017 *lastmod = astate->mtime;
10018
10019 return 0;
10020 }
10021
10022
10023 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
10024 {
10025 RGWRados *store = target->get_store();
10026 BucketShard *bs;
10027 int r;
10028
10029 #define NUM_RESHARD_RETRIES 10
10030 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10031 int ret = get_bucket_shard(&bs);
10032 if (ret < 0) {
10033 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10034 return ret;
10035 }
10036 r = call(bs);
10037 if (r != -ERR_BUSY_RESHARDING) {
10038 break;
10039 }
10040 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10041 string new_bucket_id;
10042 r = store->block_while_resharding(bs, &new_bucket_id);
10043 if (r == -ERR_BUSY_RESHARDING) {
10044 continue;
10045 }
10046 if (r < 0) {
10047 return r;
10048 }
10049 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10050 i = 0; /* resharding is finished, make sure we can retry */
10051 r = target->update_bucket_id(new_bucket_id);
10052 if (r < 0) {
10053 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
10054 return r;
10055 }
10056 invalidate_bs();
10057 }
10058
10059 if (r < 0) {
10060 return r;
10061 }
10062
10063 if (pbs) {
10064 *pbs = bs;
10065 }
10066
10067 return 0;
10068 }
10069
10070 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
10071 {
10072 RGWRados *store = source->get_store();
10073 rgw_raw_obj& obj = source->get_obj();
10074
10075 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
10076 stat_params.lastmod, stat_params.obj_size, objv_tracker);
10077 }
10078
10079 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
10080 {
10081 if (blind) {
10082 return 0;
10083 }
10084 RGWRados *store = target->get_store();
10085
10086 if (write_tag && write_tag->length()) {
10087 optag = string(write_tag->c_str(), write_tag->length());
10088 } else {
10089 if (optag.empty()) {
10090 append_rand_alpha(store->ctx(), optag, optag, 32);
10091 }
10092 }
10093
10094 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10095 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10096 });
10097
10098 if (r < 0) {
10099 return r;
10100 }
10101 prepared = true;
10102
10103 return 0;
10104 }
10105
10106 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10107 uint64_t size, uint64_t accounted_size,
10108 ceph::real_time& ut, const string& etag,
10109 const string& content_type,
10110 bufferlist *acl_bl,
10111 RGWObjCategory category,
10112 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10113 {
10114 if (blind) {
10115 return 0;
10116 }
10117 RGWRados *store = target->get_store();
10118 BucketShard *bs;
10119
10120 int ret = get_bucket_shard(&bs);
10121 if (ret < 0) {
10122 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10123 return ret;
10124 }
10125
10126 rgw_bucket_dir_entry ent;
10127 obj.key.get_index_key(&ent.key);
10128 ent.meta.size = size;
10129 ent.meta.accounted_size = accounted_size;
10130 ent.meta.mtime = ut;
10131 ent.meta.etag = etag;
10132 if (user_data)
10133 ent.meta.user_data = *user_data;
10134
10135 ACLOwner owner;
10136 if (acl_bl && acl_bl->length()) {
10137 int ret = store->decode_policy(*acl_bl, &owner);
10138 if (ret < 0) {
10139 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10140 }
10141 }
10142 ent.meta.owner = owner.get_id().to_str();
10143 ent.meta.owner_display_name = owner.get_display_name();
10144 ent.meta.content_type = content_type;
10145
10146 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
10147
10148 if (target->bucket_info.datasync_flag_enabled()) {
10149 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10150 if (r < 0) {
10151 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10152 }
10153 }
10154
10155 return ret;
10156 }
10157
10158 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10159 real_time& removed_mtime,
10160 list<rgw_obj_index_key> *remove_objs)
10161 {
10162 if (blind) {
10163 return 0;
10164 }
10165 RGWRados *store = target->get_store();
10166 BucketShard *bs;
10167
10168 int ret = get_bucket_shard(&bs);
10169 if (ret < 0) {
10170 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10171 return ret;
10172 }
10173
10174 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
10175
10176 if (target->bucket_info.datasync_flag_enabled()) {
10177 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10178 if (r < 0) {
10179 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10180 }
10181 }
10182
10183 return ret;
10184 }
10185
10186
10187 int RGWRados::Bucket::UpdateIndex::cancel()
10188 {
10189 if (blind) {
10190 return 0;
10191 }
10192 RGWRados *store = target->get_store();
10193 BucketShard *bs;
10194
10195 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10196 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10197 });
10198
10199 /*
10200 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10201 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10202 * have no way to tell that they're all caught up
10203 */
10204 if (target->bucket_info.datasync_flag_enabled()) {
10205 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10206 if (r < 0) {
10207 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10208 }
10209 }
10210
10211 return ret;
10212 }
10213
10214 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10215 {
10216 RGWRados *store = source->get_store();
10217 CephContext *cct = store->ctx();
10218
10219 rgw_raw_obj read_obj;
10220 uint64_t read_ofs = ofs;
10221 uint64_t len, read_len;
10222 bool reading_from_head = true;
10223 ObjectReadOperation op;
10224
10225 bool merge_bl = false;
10226 bufferlist *pbl = &bl;
10227 bufferlist read_bl;
10228 uint64_t max_chunk_size;
10229
10230 RGWObjState *astate;
10231 int r = source->get_state(&astate, true);
10232 if (r < 0)
10233 return r;
10234
10235 if (end < 0)
10236 len = 0;
10237 else
10238 len = end - ofs + 1;
10239
10240 if (astate->has_manifest && astate->manifest.has_tail()) {
10241 /* now get the relevant object part */
10242 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10243
10244 uint64_t stripe_ofs = iter.get_stripe_ofs();
10245 read_obj = iter.get_location().get_raw_obj(store);
10246 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10247 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10248 reading_from_head = (read_obj == state.head_obj);
10249 } else {
10250 read_obj = state.head_obj;
10251 }
10252
10253 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10254 if (r < 0) {
10255 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10256 return r;
10257 }
10258
10259 if (len > max_chunk_size)
10260 len = max_chunk_size;
10261
10262
10263 state.io_ctx.locator_set_key(read_obj.loc);
10264
10265 read_len = len;
10266
10267 if (reading_from_head) {
10268 /* only when reading from the head object do we need to do the atomic test */
10269 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10270 if (r < 0)
10271 return r;
10272
10273 if (astate && astate->prefetch_data) {
10274 if (!ofs && astate->data.length() >= len) {
10275 bl = astate->data;
10276 return bl.length();
10277 }
10278
10279 if (ofs < astate->data.length()) {
10280 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10281 astate->data.copy(ofs, copy_len, bl);
10282 read_len -= copy_len;
10283 read_ofs += copy_len;
10284 if (!read_len)
10285 return bl.length();
10286
10287 merge_bl = true;
10288 pbl = &read_bl;
10289 }
10290 }
10291 }
10292
10293 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10294 op.read(read_ofs, read_len, pbl, NULL);
10295
10296 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10297 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10298
10299 if (r < 0) {
10300 return r;
10301 }
10302
10303 if (merge_bl) {
10304 bl.append(read_bl);
10305 }
10306
10307 return bl.length();
10308 }
10309
10310 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10311 {
10312 if (!has_ref) {
10313 int r = store->get_raw_obj_ref(obj, &ref);
10314 if (r < 0) {
10315 return r;
10316 }
10317 has_ref = true;
10318 }
10319 *pref = &ref;
10320 return 0;
10321
10322 }
10323
10324 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10325 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10326 bufferlist& bl, off_t ofs, off_t end,
10327 map<string, bufferlist> *attrs,
10328 rgw_cache_entry_info *cache_info,
10329 boost::optional<obj_version>)
10330 {
10331 uint64_t len;
10332 ObjectReadOperation op;
10333
10334 if (end < 0)
10335 len = 0;
10336 else
10337 len = end - ofs + 1;
10338
10339 if (objv_tracker) {
10340 objv_tracker->prepare_op_for_read(&op);
10341 }
10342
10343 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10344 op.read(ofs, len, &bl, NULL);
10345
10346 if (attrs) {
10347 op.getxattrs(attrs, NULL);
10348 }
10349
10350 rgw_rados_ref *ref;
10351 int r = read_state.get_ref(this, obj, &ref);
10352 if (r < 0) {
10353 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10354 return r;
10355 }
10356 r = ref->ioctx.operate(ref->oid, &op, NULL);
10357 if (r < 0) {
10358 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10359 return r;
10360 }
10361 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10362
10363 uint64_t op_ver = ref->ioctx.get_last_version();
10364
10365 if (read_state.last_ver > 0 &&
10366 read_state.last_ver != op_ver) {
10367 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10368 return -ECANCELED;
10369 }
10370
10371 read_state.last_ver = op_ver;
10372
10373 return bl.length();
10374 }
10375
10376 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl,
10377 RGWObjVersionTracker *objv_tracker,
10378 boost::optional<obj_version> refresh_version)
10379 {
10380 RGWRados *store = source->get_store();
10381 rgw_raw_obj& obj = source->get_obj();
10382
10383 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl,
10384 ofs, end, read_params.attrs,
10385 read_params.cache_info, refresh_version);
10386 }
10387
10388 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10389 {
10390 RGWRados *store = source->get_store();
10391 rgw_raw_obj& obj = source->get_obj();
10392
10393 return store->system_obj_get_attr(obj, name, dest);
10394 }
10395
10396 struct get_obj_data;
10397
10398 struct get_obj_aio_data {
10399 struct get_obj_data *op_data;
10400 off_t ofs;
10401 off_t len;
10402 };
10403
10404 struct get_obj_io {
10405 off_t len;
10406 bufferlist bl;
10407 };
10408
10409 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10410
10411 struct get_obj_data : public RefCountedObject {
10412 CephContext *cct;
10413 RGWRados *rados;
10414 RGWObjectCtx *ctx;
10415 IoCtx io_ctx;
10416 map<off_t, get_obj_io> io_map;
10417 map<off_t, librados::AioCompletion *> completion_map;
10418 uint64_t total_read;
10419 Mutex lock;
10420 Mutex data_lock;
10421 list<get_obj_aio_data> aio_data;
10422 RGWGetDataCB *client_cb;
10423 std::atomic<bool> cancelled = { false };
10424 std::atomic<int64_t> err_code = { 0 };
10425 Throttle throttle;
10426 list<bufferlist> read_list;
10427
10428 explicit get_obj_data(CephContext *_cct)
10429 : cct(_cct),
10430 rados(NULL), ctx(NULL),
10431 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10432 client_cb(NULL),
10433 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10434 ~get_obj_data() override { }
10435 void set_cancelled(int r) {
10436 cancelled = true;
10437 err_code = r;
10438 }
10439
10440 bool is_cancelled() {
10441 return cancelled;
10442 }
10443
10444 int get_err_code() {
10445 return err_code;
10446 }
10447
10448 int wait_next_io(bool *done) {
10449 lock.Lock();
10450 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10451 if (iter == completion_map.end()) {
10452 *done = true;
10453 lock.Unlock();
10454 return 0;
10455 }
10456 off_t cur_ofs = iter->first;
10457 librados::AioCompletion *c = iter->second;
10458 lock.Unlock();
10459
10460 c->wait_for_safe_and_cb();
10461 int r = c->get_return_value();
10462
10463 lock.Lock();
10464 completion_map.erase(cur_ofs);
10465
10466 if (completion_map.empty()) {
10467 *done = true;
10468 }
10469 lock.Unlock();
10470
10471 c->release();
10472
10473 return r;
10474 }
10475
10476 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10477 Mutex::Locker l(lock);
10478
10479 const auto& io_iter = io_map.insert(
10480 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10481
10482 assert(io_iter.second); // assert new insertion
10483
10484 get_obj_io& io = (io_iter.first)->second;
10485 *pbl = &io.bl;
10486
10487 struct get_obj_aio_data aio;
10488 aio.ofs = ofs;
10489 aio.len = len;
10490 aio.op_data = this;
10491
10492 aio_data.push_back(aio);
10493
10494 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10495
10496 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10497 completion_map[ofs] = c;
10498
10499 *pc = c;
10500
10501 /* we have a reference per IO, plus one reference for the calling function.
10502 * reference is dropped for each callback, plus when we're done iterating
10503 * over the parts */
10504 get();
10505 }
10506
10507 void cancel_io(off_t ofs) {
10508 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10509 lock.Lock();
10510 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10511 if (iter != completion_map.end()) {
10512 AioCompletion *c = iter->second;
10513 c->release();
10514 completion_map.erase(ofs);
10515 io_map.erase(ofs);
10516 }
10517 lock.Unlock();
10518
10519 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10520 * need IoCtx to live, as io callback may still be called
10521 */
10522 }
10523
10524 void cancel_all_io() {
10525 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10526 Mutex::Locker l(lock);
10527 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10528 iter != completion_map.end(); ++iter) {
10529 librados::AioCompletion *c = iter->second;
10530 c->release();
10531 }
10532 }
10533
10534 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10535 Mutex::Locker l(lock);
10536
10537 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10538
10539 if (liter == io_map.end() ||
10540 liter->first != ofs) {
10541 return 0;
10542 }
10543
10544 map<off_t, librados::AioCompletion *>::iterator aiter;
10545 aiter = completion_map.find(ofs);
10546 if (aiter == completion_map.end()) {
10547 /* completion map does not hold this io, it was cancelled */
10548 return 0;
10549 }
10550
10551 AioCompletion *completion = aiter->second;
10552 int r = completion->get_return_value();
10553 if (r < 0)
10554 return r;
10555
10556 for (; aiter != completion_map.end(); ++aiter) {
10557 completion = aiter->second;
10558 if (!completion->is_safe()) {
10559 /* reached a request that is not yet complete, stop */
10560 break;
10561 }
10562
10563 r = completion->get_return_value();
10564 if (r < 0) {
10565 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10566 return r;
10567 }
10568
10569 total_read += r;
10570
10571 map<off_t, get_obj_io>::iterator old_liter = liter++;
10572 bl_list.push_back(old_liter->second.bl);
10573 io_map.erase(old_liter);
10574 }
10575
10576 return 0;
10577 }
10578 };
10579
10580 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10581 {
10582 struct get_obj_data *d = (struct get_obj_data *)arg;
10583
10584 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10585 }
10586
10587 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10588 {
10589 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10590 struct get_obj_data *d = aio_data->op_data;
10591
10592 d->rados->get_obj_aio_completion_cb(cb, arg);
10593 }
10594
10595
10596 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10597 {
10598 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10599 struct get_obj_data *d = aio_data->op_data;
10600 off_t ofs = aio_data->ofs;
10601 off_t len = aio_data->len;
10602
10603 list<bufferlist> bl_list;
10604 list<bufferlist>::iterator iter;
10605 int r;
10606
10607 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10608 d->throttle.put(len);
10609
10610 r = rados_aio_get_return_value(c);
10611 if (r < 0) {
10612 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10613 d->set_cancelled(r);
10614 goto done;
10615 }
10616
10617 if (d->is_cancelled()) {
10618 goto done;
10619 }
10620
10621 d->data_lock.Lock();
10622
10623 r = d->get_complete_ios(ofs, bl_list);
10624 if (r < 0) {
10625 goto done_unlock;
10626 }
10627
10628 d->read_list.splice(d->read_list.end(), bl_list);
10629
10630 done_unlock:
10631 d->data_lock.Unlock();
10632 done:
10633 d->put();
10634 return;
10635 }
10636
10637 int RGWRados::flush_read_list(struct get_obj_data *d)
10638 {
10639 d->data_lock.Lock();
10640 list<bufferlist> l;
10641 l.swap(d->read_list);
10642 d->get();
10643 d->read_list.clear();
10644
10645 d->data_lock.Unlock();
10646
10647 int r = 0;
10648
10649 list<bufferlist>::iterator iter;
10650 for (iter = l.begin(); iter != l.end(); ++iter) {
10651 bufferlist& bl = *iter;
10652 r = d->client_cb->handle_data(bl, 0, bl.length());
10653 if (r < 0) {
10654 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10655 break;
10656 }
10657 }
10658
10659 d->data_lock.Lock();
10660 d->put();
10661 if (r < 0) {
10662 d->set_cancelled(r);
10663 }
10664 d->data_lock.Unlock();
10665 return r;
10666 }
10667
10668 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10669 const RGWBucketInfo& bucket_info,
10670 const rgw_obj& obj,
10671 const rgw_raw_obj& read_obj,
10672 off_t obj_ofs,
10673 off_t read_ofs, off_t len,
10674 bool is_head_obj, void *arg)
10675 {
10676 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10677 ObjectReadOperation op;
10678 struct get_obj_data *d = (struct get_obj_data *)arg;
10679 string oid, key;
10680 bufferlist *pbl;
10681 AioCompletion *c;
10682
10683 int r;
10684
10685 if (is_head_obj) {
10686 /* only when reading from the head object do we need to do the atomic test */
10687 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10688 if (r < 0)
10689 return r;
10690
10691 if (astate &&
10692 obj_ofs < astate->data.length()) {
10693 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10694
10695 d->data_lock.Lock();
10696 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10697 d->data_lock.Unlock();
10698 if (r < 0)
10699 return r;
10700
10701 d->lock.Lock();
10702 d->total_read += chunk_len;
10703 d->lock.Unlock();
10704
10705 len -= chunk_len;
10706 read_ofs += chunk_len;
10707 obj_ofs += chunk_len;
10708 if (!len)
10709 return 0;
10710 }
10711 }
10712
10713 d->throttle.get(len);
10714 if (d->is_cancelled()) {
10715 return d->get_err_code();
10716 }
10717
10718 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10719 * cleaning up
10720 */
10721 d->add_io(obj_ofs, len, &pbl, &c);
10722
10723 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10724 op.read(read_ofs, len, pbl, NULL);
10725
10726 librados::IoCtx io_ctx(d->io_ctx);
10727 io_ctx.locator_set_key(read_obj.loc);
10728
10729 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10730 if (r < 0) {
10731 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10732 goto done_err;
10733 }
10734
10735 // Flush data to client if there is any
10736 r = flush_read_list(d);
10737 if (r < 0)
10738 return r;
10739
10740 return 0;
10741
10742 done_err:
10743 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10744 d->set_cancelled(r);
10745 d->cancel_io(obj_ofs);
10746
10747 return r;
10748 }
10749
10750 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10751 {
10752 RGWRados *store = source->get_store();
10753 CephContext *cct = store->ctx();
10754
10755 struct get_obj_data *data = new get_obj_data(cct);
10756 bool done = false;
10757
10758 RGWObjectCtx& obj_ctx = source->get_ctx();
10759
10760 data->rados = store;
10761 data->io_ctx.dup(state.io_ctx);
10762 data->client_cb = cb;
10763
10764 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10765 if (r < 0) {
10766 data->cancel_all_io();
10767 goto done;
10768 }
10769
10770 while (!done) {
10771 r = data->wait_next_io(&done);
10772 if (r < 0) {
10773 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10774 data->cancel_all_io();
10775 break;
10776 }
10777 r = store->flush_read_list(data);
10778 if (r < 0) {
10779 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10780 data->cancel_all_io();
10781 break;
10782 }
10783 }
10784
10785 done:
10786 data->put();
10787 return r;
10788 }
10789
10790 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10791 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10792 off_t ofs, off_t end,
10793 uint64_t max_chunk_size,
10794 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10795 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10796 RGWObjState *, void *),
10797 void *arg)
10798 {
10799 rgw_raw_obj head_obj;
10800 rgw_raw_obj read_obj;
10801 uint64_t read_ofs = ofs;
10802 uint64_t len;
10803 bool reading_from_head = true;
10804 RGWObjState *astate = NULL;
10805
10806 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10807
10808 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10809 if (r < 0) {
10810 return r;
10811 }
10812
10813 if (end < 0)
10814 len = 0;
10815 else
10816 len = end - ofs + 1;
10817
10818 if (astate->has_manifest) {
10819 /* now get the relevant object stripe */
10820 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10821
10822 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10823
10824 for (; iter != obj_end && ofs <= end; ++iter) {
10825 off_t stripe_ofs = iter.get_stripe_ofs();
10826 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10827
10828 while (ofs < next_stripe_ofs && ofs <= end) {
10829 read_obj = iter.get_location().get_raw_obj(this);
10830 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10831 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10832
10833 if (read_len > max_chunk_size) {
10834 read_len = max_chunk_size;
10835 }
10836
10837 reading_from_head = (read_obj == head_obj);
10838 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10839 if (r < 0) {
10840 return r;
10841 }
10842
10843 len -= read_len;
10844 ofs += read_len;
10845 }
10846 }
10847 } else {
10848 while (ofs <= end) {
10849 read_obj = head_obj;
10850 uint64_t read_len = min(len, max_chunk_size);
10851
10852 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10853 if (r < 0) {
10854 return r;
10855 }
10856
10857 len -= read_len;
10858 ofs += read_len;
10859 }
10860 }
10861
10862 return 0;
10863 }
10864
10865 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10866 {
10867 rgw_rados_ref ref;
10868 int r = get_obj_head_ref(bucket_info, obj, &ref);
10869 if (r < 0) {
10870 return r;
10871 }
10872
10873 return ref.ioctx.operate(ref.oid, op);
10874 }
10875
10876 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10877 {
10878 rgw_rados_ref ref;
10879 int r = get_obj_head_ref(bucket_info, obj, &ref);
10880 if (r < 0) {
10881 return r;
10882 }
10883
10884 bufferlist outbl;
10885
10886 return ref.ioctx.operate(ref.oid, op, &outbl);
10887 }
10888
10889 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10890 {
10891 ObjectWriteOperation op;
10892
10893 assert(olh_obj.key.instance.empty());
10894
10895 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10896
10897 if (!state.exists) {
10898 op.create(true);
10899 } else {
10900 op.assert_exists();
10901 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
10902 op.mtime2(&mtime_ts);
10903 }
10904
10905 /*
10906 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10907 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10908 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10909 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10910 * log will reflect that.
10911 *
10912 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10913 * is used for object data instance, olh_tag for olh instance.
10914 */
10915 if (has_tag) {
10916 /* guard against racing writes */
10917 bucket_index_guard_olh_op(state, op);
10918 }
10919
10920 if (!has_tag) {
10921 /* obj tag */
10922 string obj_tag;
10923 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10924 if (ret < 0) {
10925 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10926 return ret;
10927 }
10928 bufferlist bl;
10929 bl.append(obj_tag.c_str(), obj_tag.size());
10930 op.setxattr(RGW_ATTR_ID_TAG, bl);
10931
10932 state.attrset[RGW_ATTR_ID_TAG] = bl;
10933 state.obj_tag = bl;
10934
10935 /* olh tag */
10936 string olh_tag;
10937 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10938 if (ret < 0) {
10939 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10940 return ret;
10941 }
10942 bufferlist olh_bl;
10943 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10944 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10945
10946 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10947 state.olh_tag = olh_bl;
10948 state.is_olh = true;
10949
10950 bufferlist verbl;
10951 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10952 }
10953
10954 bufferlist bl;
10955 RGWOLHPendingInfo pending_info;
10956 pending_info.time = real_clock::now();
10957 ::encode(pending_info, bl);
10958
10959 #define OLH_PENDING_TAG_LEN 32
10960 /* tag will start with current time epoch, this so that entries are sorted by time */
10961 char buf[32];
10962 utime_t ut(pending_info.time);
10963 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10964 *op_tag = buf;
10965
10966 string s;
10967 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10968 if (ret < 0) {
10969 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10970 return ret;
10971 }
10972 op_tag->append(s);
10973
10974 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10975 attr_name.append(*op_tag);
10976
10977 op.setxattr(attr_name.c_str(), bl);
10978
10979 ret = obj_operate(bucket_info, olh_obj, &op);
10980 if (ret < 0) {
10981 return ret;
10982 }
10983
10984 state.exists = true;
10985 state.attrset[attr_name] = bl;
10986
10987 return 0;
10988 }
10989
10990 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
10991 {
10992 int ret;
10993
10994 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
10995 if (ret == -EEXIST) {
10996 ret = -ECANCELED;
10997 }
10998
10999 return ret;
11000 }
11001
11002 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
11003 {
11004 rgw_obj obj;
11005 const rgw_obj *pobj = &obj_instance;
11006 int r;
11007
11008 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
11009 r = bs->init(pobj->bucket, *pobj);
11010 if (r < 0) {
11011 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
11012 return r;
11013 }
11014 r = call(bs);
11015 if (r != -ERR_BUSY_RESHARDING) {
11016 break;
11017 }
11018 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
11019 string new_bucket_id;
11020 r = block_while_resharding(bs, &new_bucket_id);
11021 if (r == -ERR_BUSY_RESHARDING) {
11022 continue;
11023 }
11024 if (r < 0) {
11025 return r;
11026 }
11027 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
11028 i = 0; /* resharding is finished, make sure we can retry */
11029
11030 obj = *pobj;
11031 obj.bucket.update_bucket_id(new_bucket_id);
11032 pobj = &obj;
11033 }
11034
11035 if (r < 0) {
11036 return r;
11037 }
11038
11039 return 0;
11040 }
11041
11042 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
11043 {
11044 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
11045
11046 return waiter->block_while_resharding(bs, new_bucket_id);
11047 }
11048
11049 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
11050 bool delete_marker,
11051 const string& op_tag,
11052 struct rgw_bucket_dir_entry_meta *meta,
11053 uint64_t olh_epoch,
11054 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
11055 {
11056 rgw_rados_ref ref;
11057 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11058 if (r < 0) {
11059 return r;
11060 }
11061
11062 rgw_zone_set zones_trace;
11063 if (_zones_trace) {
11064 zones_trace = *_zones_trace;
11065 } else {
11066 zones_trace.insert(get_zone().id);
11067 }
11068
11069 BucketShard bs(this);
11070
11071 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11072 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11073 librados::ObjectWriteOperation op;
11074 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11075 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
11076 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
11077 unmod_since, high_precision_time,
11078 get_zone().log_data, zones_trace);
11079 });
11080 if (r < 0) {
11081 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11082 return r;
11083 }
11084
11085 return 0;
11086 }
11087
11088 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
11089 {
11090 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
11091 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
11092 }
11093
11094 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
11095 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
11096 {
11097 rgw_rados_ref ref;
11098 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11099 if (r < 0) {
11100 return r;
11101 }
11102
11103 rgw_zone_set zones_trace;
11104 if (_zones_trace) {
11105 zones_trace = *_zones_trace;
11106 }
11107 zones_trace.insert(get_zone().id);
11108
11109 BucketShard bs(this);
11110
11111 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11112 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11113 librados::ObjectWriteOperation op;
11114 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11115 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11116 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
11117 });
11118 if (r < 0) {
11119 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11120 return r;
11121 }
11122
11123 return 0;
11124 }
11125
11126 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11127 const rgw_obj& obj_instance, uint64_t ver_marker,
11128 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11129 bool *is_truncated)
11130 {
11131 rgw_rados_ref ref;
11132 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11133 if (r < 0) {
11134 return r;
11135 }
11136
11137 BucketShard bs(this);
11138 int ret = bs.init(obj_instance.bucket, obj_instance);
11139 if (ret < 0) {
11140 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11141 return ret;
11142 }
11143
11144 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11145
11146 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11147
11148 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11149 ObjectReadOperation op;
11150 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11151 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11152 key, ver_marker, olh_tag, log, is_truncated);
11153 });
11154 if (ret < 0) {
11155 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
11156 return ret;
11157 }
11158
11159 return 0;
11160 }
11161
11162 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11163 {
11164 rgw_rados_ref ref;
11165 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11166 if (r < 0) {
11167 return r;
11168 }
11169
11170 BucketShard bs(this);
11171 int ret = bs.init(obj_instance.bucket, obj_instance);
11172 if (ret < 0) {
11173 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11174 return ret;
11175 }
11176
11177 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11178
11179 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11180
11181 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11182 ObjectWriteOperation op;
11183 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11184 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11185 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
11186 });
11187 if (ret < 0) {
11188 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
11189 return ret;
11190 }
11191
11192 return 0;
11193 }
11194
11195 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11196 {
11197 rgw_rados_ref ref;
11198 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11199 if (r < 0) {
11200 return r;
11201 }
11202
11203 BucketShard bs(this);
11204
11205 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11206
11207 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11208
11209 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11210 ObjectWriteOperation op;
11211 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11212 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11213 });
11214 if (ret < 0) {
11215 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11216 return ret;
11217 }
11218
11219 return 0;
11220 }
11221
11222 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11223 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
11224 uint64_t *plast_ver, rgw_zone_set* zones_trace)
11225 {
11226 if (log.empty()) {
11227 return 0;
11228 }
11229
11230 librados::ObjectWriteOperation op;
11231
11232 uint64_t last_ver = log.rbegin()->first;
11233 *plast_ver = last_ver;
11234
11235 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11236
11237 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11238 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11239
11240 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11241 op.mtime2(&mtime_ts);
11242
11243 bool need_to_link = false;
11244 cls_rgw_obj_key key;
11245 bool delete_marker = false;
11246 list<cls_rgw_obj_key> remove_instances;
11247 bool need_to_remove = false;
11248
11249 for (iter = log.begin(); iter != log.end(); ++iter) {
11250 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11251 for (; viter != iter->second.end(); ++viter) {
11252 rgw_bucket_olh_log_entry& entry = *viter;
11253
11254 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11255 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11256 << (entry.delete_marker ? "(delete)" : "") << dendl;
11257 switch (entry.op) {
11258 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11259 remove_instances.push_back(entry.key);
11260 break;
11261 case CLS_RGW_OLH_OP_LINK_OLH:
11262 need_to_link = true;
11263 need_to_remove = false;
11264 key = entry.key;
11265 delete_marker = entry.delete_marker;
11266 break;
11267 case CLS_RGW_OLH_OP_UNLINK_OLH:
11268 need_to_remove = true;
11269 need_to_link = false;
11270 break;
11271 default:
11272 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11273 return -EIO;
11274 }
11275 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11276 attr_name.append(entry.op_tag);
11277 op.rmxattr(attr_name.c_str());
11278 }
11279 }
11280
11281 rgw_rados_ref ref;
11282 int r = get_obj_head_ref(bucket_info, obj, &ref);
11283 if (r < 0) {
11284 return r;
11285 }
11286
11287 const rgw_bucket& bucket = obj.bucket;
11288
11289 if (need_to_link) {
11290 rgw_obj target(bucket, key);
11291 RGWOLHInfo info;
11292 info.target = target;
11293 info.removed = delete_marker;
11294 bufferlist bl;
11295 ::encode(info, bl);
11296 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11297 }
11298
11299 /* first remove object instances */
11300 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11301 liter != remove_instances.end(); ++liter) {
11302 cls_rgw_obj_key& key = *liter;
11303 rgw_obj obj_instance(bucket, key);
11304 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11305 if (ret < 0 && ret != -ENOENT) {
11306 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11307 return ret;
11308 }
11309 }
11310
11311 /* update olh object */
11312 r = ref.ioctx.operate(ref.oid, &op);
11313 if (r == -ECANCELED) {
11314 r = 0;
11315 }
11316 if (r < 0) {
11317 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11318 return r;
11319 }
11320
11321 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11322 if (r < 0) {
11323 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11324 return r;
11325 }
11326
11327 if (need_to_remove) {
11328 ObjectWriteOperation rm_op;
11329
11330 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11331 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11332 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11333 rm_op.remove();
11334
11335 r = ref.ioctx.operate(ref.oid, &rm_op);
11336 if (r == -ECANCELED) {
11337 return 0; /* someone else won this race */
11338 } else {
11339 /*
11340 * only clear if was successful, otherwise we might clobber pending operations on this object
11341 */
11342 r = bucket_index_clear_olh(bucket_info, state, obj);
11343 if (r < 0) {
11344 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11345 return r;
11346 }
11347 }
11348 }
11349
11350 return 0;
11351 }
11352
11353 /*
11354 * read olh log and apply it
11355 */
11356 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11357 {
11358 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11359 bool is_truncated;
11360 uint64_t ver_marker = 0;
11361
11362 do {
11363 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11364 if (ret < 0) {
11365 return ret;
11366 }
11367 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11368 if (ret < 0) {
11369 return ret;
11370 }
11371 } while (is_truncated);
11372
11373 return 0;
11374 }
11375
11376 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11377 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11378 {
11379 string op_tag;
11380
11381 rgw_obj olh_obj = target_obj;
11382 olh_obj.key.instance.clear();
11383
11384 RGWObjState *state = NULL;
11385
11386 int ret = 0;
11387 int i;
11388
11389 #define MAX_ECANCELED_RETRY 100
11390 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11391 if (ret == -ECANCELED) {
11392 obj_ctx.obj.invalidate(olh_obj);
11393 }
11394
11395 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11396 if (ret < 0) {
11397 return ret;
11398 }
11399
11400 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11401 if (ret < 0) {
11402 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11403 if (ret == -ECANCELED) {
11404 continue;
11405 }
11406 return ret;
11407 }
11408 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11409 if (ret < 0) {
11410 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11411 if (ret == -ECANCELED) {
11412 continue;
11413 }
11414 return ret;
11415 }
11416 break;
11417 }
11418
11419 if (i == MAX_ECANCELED_RETRY) {
11420 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11421 return -EIO;
11422 }
11423
11424 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11425 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11426 ret = 0;
11427 }
11428 if (ret < 0) {
11429 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11430 return ret;
11431 }
11432
11433 return 0;
11434 }
11435
11436 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11437 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11438 {
11439 string op_tag;
11440
11441 rgw_obj olh_obj = target_obj;
11442 olh_obj.key.instance.clear();
11443
11444 RGWObjState *state = NULL;
11445
11446 int ret = 0;
11447 int i;
11448
11449 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11450 if (ret == -ECANCELED) {
11451 obj_ctx.obj.invalidate(olh_obj);
11452 }
11453
11454 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11455 if (ret < 0)
11456 return ret;
11457
11458 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11459 if (ret < 0) {
11460 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11461 if (ret == -ECANCELED) {
11462 continue;
11463 }
11464 return ret;
11465 }
11466
11467 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11468
11469 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11470 if (ret < 0) {
11471 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11472 if (ret == -ECANCELED) {
11473 continue;
11474 }
11475 return ret;
11476 }
11477 break;
11478 }
11479
11480 if (i == MAX_ECANCELED_RETRY) {
11481 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11482 return -EIO;
11483 }
11484
11485 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11486 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11487 return 0;
11488 }
11489 if (ret < 0) {
11490 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11491 return ret;
11492 }
11493
11494 return 0;
11495 }
11496
11497 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11498 {
11499 #define OBJ_INSTANCE_LEN 32
11500 char buf[OBJ_INSTANCE_LEN + 1];
11501
11502 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11503 no underscore for instance name due to the way we encode the raw keys */
11504
11505 target_obj->key.set_instance(buf);
11506 }
11507
11508 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11509 map<string, bufferlist> *attrset)
11510 {
11511 attrset->clear();
11512 map<string, bufferlist>::iterator iter;
11513 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11514 iter != unfiltered_attrset.end(); ++iter) {
11515 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11516 break;
11517 (*attrset)[iter->first] = iter->second;
11518 }
11519 }
11520
11521 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11522 {
11523 map<string, bufferlist> unfiltered_attrset;
11524
11525 ObjectReadOperation op;
11526 op.getxattrs(&unfiltered_attrset, NULL);
11527
11528 bufferlist outbl;
11529 int r = obj_operate(bucket_info, obj, &op);
11530
11531 if (r < 0) {
11532 return r;
11533 }
11534 map<string, bufferlist> attrset;
11535
11536 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11537
11538 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11539 if (iter == attrset.end()) { /* not an olh */
11540 return -EINVAL;
11541 }
11542
11543 try {
11544 bufferlist::iterator biter = iter->second.begin();
11545 ::decode(*olh, biter);
11546 } catch (buffer::error& err) {
11547 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11548 return -EIO;
11549 }
11550
11551 return 0;
11552 }
11553
11554 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11555 map<string, bufferlist> *rm_pending_entries)
11556 {
11557 map<string, bufferlist>::iterator iter = pending_entries.begin();
11558
11559 real_time now = real_clock::now();
11560
11561 while (iter != pending_entries.end()) {
11562 bufferlist::iterator biter = iter->second.begin();
11563 RGWOLHPendingInfo pending_info;
11564 try {
11565 ::decode(pending_info, biter);
11566 } catch (buffer::error& err) {
11567 /* skipping bad entry, we could remove it but it might hide a bug */
11568 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11569 ++iter;
11570 continue;
11571 }
11572
11573 map<string, bufferlist>::iterator cur_iter = iter;
11574 ++iter;
11575 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11576 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11577 pending_entries.erase(cur_iter);
11578 } else {
11579 /* entries names are sorted by time (rounded to a second) */
11580 break;
11581 }
11582 }
11583 }
11584
11585 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11586 {
11587 ObjectWriteOperation op;
11588
11589 bucket_index_guard_olh_op(state, op);
11590
11591 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11592 op.rmxattr(iter->first.c_str());
11593 }
11594
11595 rgw_rados_ref ref;
11596 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11597 if (r < 0) {
11598 return r;
11599 }
11600
11601 /* update olh object */
11602 r = ref.ioctx.operate(ref.oid, &op);
11603 if (r == -ENOENT || r == -ECANCELED) {
11604 /* raced with some other change, shouldn't sweat about it */
11605 r = 0;
11606 }
11607 if (r < 0) {
11608 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11609 return r;
11610 }
11611
11612 return 0;
11613 }
11614
11615 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11616 {
11617 map<string, bufferlist> pending_entries;
11618 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11619
11620 map<string, bufferlist> rm_pending_entries;
11621 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11622
11623 if (!rm_pending_entries.empty()) {
11624 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11625 if (ret < 0) {
11626 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11627 return ret;
11628 }
11629 }
11630 if (!pending_entries.empty()) {
11631 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11632
11633 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11634 if (ret < 0) {
11635 return ret;
11636 }
11637 }
11638
11639 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11640 assert(iter != state->attrset.end());
11641 RGWOLHInfo olh;
11642 try {
11643 bufferlist::iterator biter = iter->second.begin();
11644 ::decode(olh, biter);
11645 } catch (buffer::error& err) {
11646 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11647 return -EIO;
11648 }
11649
11650 if (olh.removed) {
11651 return -ENOENT;
11652 }
11653
11654 *target = olh.target;
11655
11656 return 0;
11657 }
11658
11659 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11660 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11661 RGWObjVersionTracker *objv_tracker)
11662 {
11663 rgw_rados_ref ref;
11664 int r = get_raw_obj_ref(obj, &ref);
11665 if (r < 0) {
11666 return r;
11667 }
11668
11669 map<string, bufferlist> unfiltered_attrset;
11670 uint64_t size = 0;
11671 struct timespec mtime_ts;
11672
11673 ObjectReadOperation op;
11674 if (objv_tracker) {
11675 objv_tracker->prepare_op_for_read(&op);
11676 }
11677 if (attrs) {
11678 op.getxattrs(&unfiltered_attrset, NULL);
11679 }
11680 if (psize || pmtime) {
11681 op.stat2(&size, &mtime_ts, NULL);
11682 }
11683 if (first_chunk) {
11684 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11685 }
11686 bufferlist outbl;
11687 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11688
11689 if (epoch) {
11690 *epoch = ref.ioctx.get_last_version();
11691 }
11692
11693 if (r < 0)
11694 return r;
11695
11696 if (psize)
11697 *psize = size;
11698 if (pmtime)
11699 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11700 if (attrs) {
11701 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11702 }
11703
11704 return 0;
11705 }
11706
11707 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11708 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
11709 {
11710 map<string, rgw_bucket_dir_header> headers;
11711 map<int, string> bucket_instance_ids;
11712 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11713 if (r < 0) {
11714 return r;
11715 }
11716
11717 assert(headers.size() == bucket_instance_ids.size());
11718
11719 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11720 map<int, string>::iterator viter = bucket_instance_ids.begin();
11721 BucketIndexShardsManager ver_mgr;
11722 BucketIndexShardsManager master_ver_mgr;
11723 BucketIndexShardsManager marker_mgr;
11724 char buf[64];
11725 for(; iter != headers.end(); ++iter, ++viter) {
11726 accumulate_raw_stats(iter->second, stats);
11727 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11728 ver_mgr.add(viter->first, string(buf));
11729 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11730 master_ver_mgr.add(viter->first, string(buf));
11731 if (shard_id >= 0) {
11732 *max_marker = iter->second.max_marker;
11733 } else {
11734 marker_mgr.add(viter->first, iter->second.max_marker);
11735 }
11736 if (syncstopped != NULL)
11737 *syncstopped = iter->second.syncstopped;
11738 }
11739 ver_mgr.to_string(bucket_ver);
11740 master_ver_mgr.to_string(master_ver);
11741 if (shard_id < 0) {
11742 marker_mgr.to_string(max_marker);
11743 }
11744 return 0;
11745 }
11746
11747 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11748 map<int, string>& markers)
11749 {
11750 map<string, rgw_bucket_dir_header> headers;
11751 map<int, string> bucket_instance_ids;
11752 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11753 if (r < 0)
11754 return r;
11755
11756 assert(headers.size() == bucket_instance_ids.size());
11757
11758 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11759 map<int, string>::iterator viter = bucket_instance_ids.begin();
11760
11761 for(; iter != headers.end(); ++iter, ++viter) {
11762 if (shard_id >= 0) {
11763 markers[shard_id] = iter->second.max_marker;
11764 } else {
11765 markers[viter->first] = iter->second.max_marker;
11766 }
11767 }
11768 return 0;
11769 }
11770
11771 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11772 RGWGetBucketStats_CB *cb;
11773 uint32_t pendings;
11774 map<RGWObjCategory, RGWStorageStats> stats;
11775 int ret_code;
11776 bool should_cb;
11777 Mutex lock;
11778
11779 public:
11780 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11781 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11782 lock("RGWGetBucketStatsContext") {}
11783
11784 void handle_response(int r, rgw_bucket_dir_header& header) override {
11785 Mutex::Locker l(lock);
11786 if (should_cb) {
11787 if ( r >= 0) {
11788 accumulate_raw_stats(header, stats);
11789 } else {
11790 ret_code = r;
11791 }
11792
11793 // Are we all done?
11794 if (--pendings == 0) {
11795 if (!ret_code) {
11796 cb->set_response(&stats);
11797 }
11798 cb->handle_response(ret_code);
11799 cb->put();
11800 }
11801 }
11802 }
11803
11804 void unset_cb() {
11805 Mutex::Locker l(lock);
11806 should_cb = false;
11807 }
11808 };
11809
11810 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11811 {
11812 int num_aio = 0;
11813 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11814 assert(get_ctx);
11815 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11816 if (r < 0) {
11817 ctx->put();
11818 if (num_aio) {
11819 get_ctx->unset_cb();
11820 }
11821 }
11822 get_ctx->put();
11823 return r;
11824 }
11825
11826 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11827 RGWGetUserStats_CB *cb;
11828
11829 public:
11830 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11831 : cb(cb) {}
11832
11833 void handle_response(int r, cls_user_header& header) override {
11834 const cls_user_stats& hs = header.stats;
11835 if (r >= 0) {
11836 RGWStorageStats stats;
11837
11838 stats.size = hs.total_bytes;
11839 stats.size_rounded = hs.total_bytes_rounded;
11840 stats.num_objects = hs.total_entries;
11841
11842 cb->set_response(stats);
11843 }
11844
11845 cb->handle_response(r);
11846
11847 cb->put();
11848 }
11849 };
11850
11851 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11852 {
11853 string user_str = user.to_str();
11854
11855 cls_user_header header;
11856 int r = cls_user_get_header(user_str, &header);
11857 if (r < 0)
11858 return r;
11859
11860 const cls_user_stats& hs = header.stats;
11861
11862 stats.size = hs.total_bytes;
11863 stats.size_rounded = hs.total_bytes_rounded;
11864 stats.num_objects = hs.total_entries;
11865
11866 return 0;
11867 }
11868
11869 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11870 {
11871 string user_str = user.to_str();
11872
11873 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11874 int r = cls_user_get_header_async(user_str, get_ctx);
11875 if (r < 0) {
11876 ctx->put();
11877 delete get_ctx;
11878 return r;
11879 }
11880
11881 return 0;
11882 }
11883
11884 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11885 {
11886 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11887 }
11888
11889 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11890 {
11891 if (!bucket.oid.empty()) {
11892 obj.init(get_zone_params().domain_root, bucket.oid);
11893 } else {
11894 string oid;
11895 get_bucket_meta_oid(bucket, oid);
11896 obj.init(get_zone_params().domain_root, oid);
11897 }
11898 }
11899
11900 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11901 real_time *pmtime, map<string, bufferlist> *pattrs)
11902 {
11903 size_t pos = meta_key.find(':');
11904 if (pos == string::npos) {
11905 return -EINVAL;
11906 }
11907 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11908 rgw_bucket_instance_key_to_oid(oid);
11909
11910 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11911 }
11912
11913 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11914 real_time *pmtime, map<string, bufferlist> *pattrs)
11915 {
11916 string oid;
11917 if (bucket.oid.empty()) {
11918 get_bucket_meta_oid(bucket, oid);
11919 } else {
11920 oid = bucket.oid;
11921 }
11922
11923 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11924 }
11925
11926 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
11927 real_time *pmtime, map<string, bufferlist> *pattrs,
11928 rgw_cache_entry_info *cache_info,
11929 boost::optional<obj_version> refresh_version)
11930 {
11931 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11932
11933 bufferlist epbl;
11934
11935 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
11936 oid, epbl, &info.objv_tracker, pmtime, pattrs,
11937 cache_info, refresh_version);
11938 if (ret < 0) {
11939 return ret;
11940 }
11941
11942 bufferlist::iterator iter = epbl.begin();
11943 try {
11944 ::decode(info, iter);
11945 } catch (buffer::error& err) {
11946 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11947 return -EIO;
11948 }
11949 info.bucket.oid = oid;
11950 return 0;
11951 }
11952
11953 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11954 const string& tenant_name,
11955 const string& bucket_name,
11956 RGWBucketEntryPoint& entry_point,
11957 RGWObjVersionTracker *objv_tracker,
11958 real_time *pmtime,
11959 map<string, bufferlist> *pattrs,
11960 rgw_cache_entry_info *cache_info,
11961 boost::optional<obj_version> refresh_version)
11962 {
11963 bufferlist bl;
11964 string bucket_entry;
11965
11966 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11967 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
11968 bucket_entry, bl, objv_tracker, pmtime, pattrs,
11969 cache_info, refresh_version);
11970 if (ret < 0) {
11971 return ret;
11972 }
11973
11974 bufferlist::iterator iter = bl.begin();
11975 try {
11976 ::decode(entry_point, iter);
11977 } catch (buffer::error& err) {
11978 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11979 return -EIO;
11980 }
11981 return 0;
11982 }
11983
11984 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11985 const string& tenant_name,
11986 const string& bucket_name)
11987 {
11988 RGWBucketEntryPoint entry_point;
11989 real_time ep_mtime;
11990 RGWObjVersionTracker ot;
11991 map<string, bufferlist> attrs;
11992 RGWBucketInfo info;
11993
11994 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
11995
11996 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
11997 if (ret < 0) {
11998 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
11999 return ret;
12000 }
12001
12002 if (!entry_point.has_bucket_info) {
12003 /* already converted! */
12004 return 0;
12005 }
12006
12007 info = entry_point.old_bucket_info;
12008 info.bucket.oid = bucket_name;
12009 info.ep_objv = ot.read_version;
12010
12011 ot.generate_new_write_ver(cct);
12012
12013 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
12014 if (ret < 0) {
12015 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
12016 return ret;
12017 }
12018
12019 return 0;
12020 }
12021
12022 int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx,
12023 const string& tenant,
12024 const string& bucket_name,
12025 RGWBucketInfo& info,
12026 real_time *pmtime,
12027 map<string, bufferlist> *pattrs,
12028 boost::optional<obj_version> refresh_version)
12029 {
12030 bucket_info_entry e;
12031 string bucket_entry;
12032 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
12033
12034
12035 if (binfo_cache->find(bucket_entry, &e)) {
12036 if (refresh_version &&
12037 e.info.objv_tracker.read_version.compare(&(*refresh_version))) {
12038 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
12039 << "a failure that should be debugged. I am a nice machine, "
12040 << "so I will try to recover." << dendl;
12041 binfo_cache->invalidate(bucket_entry);
12042 }
12043 info = e.info;
12044 if (pattrs)
12045 *pattrs = e.attrs;
12046 if (pmtime)
12047 *pmtime = e.mtime;
12048 return 0;
12049 }
12050
12051 RGWBucketEntryPoint entry_point;
12052 real_time ep_mtime;
12053 RGWObjVersionTracker ot;
12054 rgw_cache_entry_info entry_cache_info;
12055 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
12056 entry_point, &ot, &ep_mtime, pattrs,
12057 &entry_cache_info, refresh_version);
12058 if (ret < 0) {
12059 /* only init these fields */
12060 info.bucket.tenant = tenant;
12061 info.bucket.name = bucket_name;
12062 return ret;
12063 }
12064
12065 if (entry_point.has_bucket_info) {
12066 info = entry_point.old_bucket_info;
12067 info.bucket.oid = bucket_name;
12068 info.bucket.tenant = tenant;
12069 info.ep_objv = ot.read_version;
12070 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
12071 return 0;
12072 }
12073
12074 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12075 * that we got
12076 */
12077 if (pattrs) {
12078 pattrs->clear();
12079 }
12080
12081 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
12082
12083
12084 /* read bucket instance info */
12085
12086 string oid;
12087 get_bucket_meta_oid(entry_point.bucket, oid);
12088
12089 rgw_cache_entry_info cache_info;
12090
12091 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
12092 &cache_info, refresh_version);
12093 e.info.ep_objv = ot.read_version;
12094 info = e.info;
12095 if (ret < 0) {
12096 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
12097 info.bucket.tenant = tenant;
12098 info.bucket.name = bucket_name;
12099 // XXX and why return anything in case of an error anyway?
12100 return ret;
12101 }
12102
12103 if (pmtime)
12104 *pmtime = e.mtime;
12105 if (pattrs)
12106 *pattrs = e.attrs;
12107
12108 list<rgw_cache_entry_info *> cache_info_entries;
12109 cache_info_entries.push_back(&entry_cache_info);
12110 cache_info_entries.push_back(&cache_info);
12111
12112
12113 /* chain to both bucket entry point and bucket instance */
12114 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
12115 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
12116 }
12117
12118 if (refresh_version &&
12119 refresh_version->compare(&info.objv_tracker.read_version)) {
12120 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
12121 << "have gone squirrelly. An administrator may have forced a "
12122 << "change; otherwise there is a problem somewhere." << dendl;
12123 }
12124
12125 return 0;
12126 }
12127
12128 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
12129 const string& tenant, const string& bucket_name,
12130 RGWBucketInfo& info,
12131 real_time *pmtime, map<string, bufferlist> *pattrs)
12132 {
12133 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
12134 pattrs, boost::none);
12135 }
12136
12137 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
12138 ceph::real_time *pmtime,
12139 map<string, bufferlist> *pattrs)
12140 {
12141 RGWObjectCtx obj_ctx(this);
12142
12143 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
12144 info, pmtime, pattrs, info.objv_tracker.read_version);
12145 }
12146
12147 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
12148 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
12149 map<string, bufferlist> *pattrs)
12150 {
12151 bufferlist epbl;
12152 ::encode(entry_point, epbl);
12153 string bucket_entry;
12154 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12155 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12156 }
12157
12158 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12159 real_time mtime, map<string, bufferlist> *pattrs)
12160 {
12161 info.has_instance_obj = true;
12162 bufferlist bl;
12163
12164 ::encode(info, bl);
12165
12166 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12167 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12168 if (ret == -EEXIST) {
12169 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12170 * bucket operation on this specific bucket (e.g., being synced from the master), but
12171 * since bucket instace meta object is unique for this specific bucket instace, we don't
12172 * need to return an error.
12173 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12174 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12175 * locally, while in the sync thread we sync the new bucket.
12176 */
12177 ret = 0;
12178 }
12179 return ret;
12180 }
12181
12182 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12183 map<string, bufferlist> *pattrs, bool create_entry_point)
12184 {
12185 bool create_head = !info.has_instance_obj || create_entry_point;
12186
12187 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12188 if (ret < 0) {
12189 return ret;
12190 }
12191
12192 if (!create_head)
12193 return 0; /* done! */
12194
12195 RGWBucketEntryPoint entry_point;
12196 entry_point.bucket = info.bucket;
12197 entry_point.owner = info.owner;
12198 entry_point.creation_time = info.creation_time;
12199 entry_point.linked = true;
12200 RGWObjVersionTracker ot;
12201 if (pep_objv && !pep_objv->tag.empty()) {
12202 ot.write_version = *pep_objv;
12203 } else {
12204 ot.generate_new_write_ver(cct);
12205 if (pep_objv) {
12206 *pep_objv = ot.write_version;
12207 }
12208 }
12209 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12210 if (ret < 0)
12211 return ret;
12212
12213 return 0;
12214 }
12215
12216 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12217 {
12218 rgw_rados_ref ref;
12219 int r = get_raw_obj_ref(obj, &ref);
12220 if (r < 0) {
12221 return r;
12222 }
12223
12224 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12225 if (r < 0)
12226 return r;
12227
12228 return 0;
12229
12230 }
12231
12232 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12233 std::map<string, bufferlist>& m)
12234 {
12235 rgw_rados_ref ref;
12236 int r = get_raw_obj_ref(obj, &ref);
12237 if (r < 0) {
12238 return r;
12239 }
12240
12241 #define MAX_OMAP_GET_ENTRIES 1024
12242 const int count = MAX_OMAP_GET_ENTRIES;
12243 string start_after;
12244
12245 while (true) {
12246 std::map<string, bufferlist> t;
12247 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12248 if (r < 0) {
12249 return r;
12250 }
12251 if (t.empty()) {
12252 break;
12253 }
12254 start_after = t.rbegin()->first;
12255 m.insert(t.begin(), t.end());
12256 }
12257 return 0;
12258 }
12259
12260 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12261 {
12262 rgw_rados_ref ref;
12263 int r = get_raw_obj_ref(obj, &ref);
12264 if (r < 0) {
12265 return r;
12266 }
12267 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12268
12269 map<string, bufferlist> m;
12270 m[key] = bl;
12271
12272 r = ref.ioctx.omap_set(ref.oid, m);
12273
12274 return r;
12275 }
12276
12277 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12278 {
12279 rgw_rados_ref ref;
12280 int r = get_raw_obj_ref(obj, &ref);
12281 if (r < 0) {
12282 return r;
12283 }
12284
12285 r = ref.ioctx.omap_set(ref.oid, m);
12286
12287 return r;
12288 }
12289
12290 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12291 {
12292 rgw_rados_ref ref;
12293 int r = get_raw_obj_ref(obj, &ref);
12294 if (r < 0) {
12295 return r;
12296 }
12297
12298 set<string> k;
12299 k.insert(key);
12300
12301 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12302 return r;
12303 }
12304
12305 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12306 {
12307 RGWObjectCtx obj_ctx(this);
12308
12309 map<string, RGWBucketEnt>::iterator iter;
12310 for (iter = m.begin(); iter != m.end(); ++iter) {
12311 RGWBucketEnt& ent = iter->second;
12312 rgw_bucket& bucket = ent.bucket;
12313 ent.count = 0;
12314 ent.size = 0;
12315 ent.size_rounded = 0;
12316
12317 map<string, rgw_bucket_dir_header> headers;
12318
12319 RGWBucketInfo bucket_info;
12320 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12321 if (ret < 0) {
12322 return ret;
12323 }
12324
12325 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12326 if (r < 0)
12327 return r;
12328
12329 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12330 for (; hiter != headers.end(); ++hiter) {
12331 RGWObjCategory category = main_category;
12332 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12333 if (iter != hiter->second.stats.end()) {
12334 struct rgw_bucket_category_stats& stats = iter->second;
12335 ent.count += stats.num_entries;
12336 ent.size += stats.total_size;
12337 ent.size_rounded += stats.total_size_rounded;
12338 }
12339 }
12340
12341 // fill in placement_rule from the bucket instance for use in swift's
12342 // per-storage policy statistics
12343 ent.placement_rule = std::move(bucket_info.placement_rule);
12344 }
12345
12346 return m.size();
12347 }
12348
12349 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12350 {
12351 rgw_rados_ref ref;
12352 int r = get_raw_obj_ref(obj, &ref);
12353 if (r < 0) {
12354 return r;
12355 }
12356 librados::Rados *rad = get_rados_handle();
12357 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12358
12359 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12360 completion->release();
12361 return r;
12362 }
12363
12364 int RGWRados::distribute(const string& key, bufferlist& bl)
12365 {
12366 /*
12367 * we were called before watch was initialized. This can only happen if we're updating some system
12368 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12369 * objects, they're currently only read on startup anyway.
12370 */
12371 if (!watch_initialized)
12372 return 0;
12373
12374 string notify_oid;
12375 pick_control_oid(key, notify_oid);
12376
12377 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12378 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12379 }
12380
12381 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12382 {
12383 librados::IoCtx& io_ctx = ctx.io_ctx;
12384 librados::NObjectIterator& iter = ctx.iter;
12385
12386 int r = open_pool_ctx(pool, io_ctx);
12387 if (r < 0)
12388 return r;
12389
12390 iter = io_ctx.nobjects_begin();
12391
12392 return 0;
12393 }
12394
12395 int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12396 {
12397 librados::IoCtx& io_ctx = ctx.io_ctx;
12398 librados::NObjectIterator& iter = ctx.iter;
12399
12400 int r = open_pool_ctx(pool, io_ctx);
12401 if (r < 0)
12402 return r;
12403
12404 librados::ObjectCursor oc;
12405 if (!oc.from_str(cursor)) {
12406 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12407 return -EINVAL;
12408 }
12409
12410 iter = io_ctx.nobjects_begin(oc);
12411
12412 return 0;
12413 }
12414
12415 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12416 {
12417 return ctx.iter.get_cursor().to_str();
12418 }
12419
12420 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12421 bool *is_truncated, RGWAccessListFilter *filter)
12422 {
12423 librados::IoCtx& io_ctx = ctx.io_ctx;
12424 librados::NObjectIterator& iter = ctx.iter;
12425
12426 if (iter == io_ctx.nobjects_end())
12427 return -ENOENT;
12428
12429 uint32_t i;
12430
12431 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12432 rgw_bucket_dir_entry e;
12433
12434 string oid = iter->get_oid();
12435 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12436
12437 // fill it in with initial values; we may correct later
12438 if (filter && !filter->filter(oid, oid))
12439 continue;
12440
12441 e.key = oid;
12442 objs.push_back(e);
12443 }
12444
12445 if (is_truncated)
12446 *is_truncated = (iter != io_ctx.nobjects_end());
12447
12448 return objs.size();
12449 }
12450 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12451 string prefix;
12452
12453 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12454 bool filter(string& name, string& key) override {
12455 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12456 }
12457 };
12458
12459 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
12460 {
12461 if (!ctx->initialized) {
12462 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
12463 if (r < 0) {
12464 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12465 return r;
12466 }
12467 ctx->initialized = true;
12468 }
12469 return 0;
12470 }
12471
12472 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12473 RGWListRawObjsCtx& ctx, list<string>& oids,
12474 bool *is_truncated)
12475 {
12476 if (!ctx.initialized) {
12477 return -EINVAL;
12478 }
12479 RGWAccessListFilterPrefix filter(prefix_filter);
12480 vector<rgw_bucket_dir_entry> objs;
12481 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12482 if (r < 0) {
12483 if(r != -ENOENT)
12484 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12485 return r;
12486 }
12487
12488 vector<rgw_bucket_dir_entry>::iterator iter;
12489 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12490 oids.push_back(iter->key.name);
12491 }
12492
12493 return oids.size();
12494 }
12495
12496 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12497 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12498 bool *is_truncated)
12499 {
12500 if (!ctx.initialized) {
12501 int r = list_raw_objects_init(pool, string(), &ctx);
12502 if (r < 0) {
12503 return r;
12504 }
12505 }
12506
12507 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12508 }
12509
12510 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12511 {
12512 return pool_iterate_get_cursor(ctx.iter_ctx);
12513 }
12514
12515 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12516 std::list<rgw_bi_log_entry>& result, bool *truncated)
12517 {
12518 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12519 result.clear();
12520
12521 librados::IoCtx index_ctx;
12522 map<int, string> oids;
12523 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12524 map<int, string> bucket_instance_ids;
12525 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12526 if (r < 0)
12527 return r;
12528
12529 BucketIndexShardsManager marker_mgr;
12530 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12531 // If there are multiple shards for the bucket index object, the marker
12532 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12533 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12534 // only contain one record, and the key is the bucket instance id.
12535 r = marker_mgr.from_string(marker, shard_id);
12536 if (r < 0)
12537 return r;
12538
12539 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12540 if (r < 0)
12541 return r;
12542
12543 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12544 map<int, list<rgw_bi_log_entry>::iterator> vends;
12545 if (truncated) {
12546 *truncated = false;
12547 }
12548 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12549 for (; miter != bi_log_lists.end(); ++miter) {
12550 int shard_id = miter->first;
12551 vcurrents[shard_id] = miter->second.entries.begin();
12552 vends[shard_id] = miter->second.entries.end();
12553 if (truncated) {
12554 *truncated = (*truncated || miter->second.truncated);
12555 }
12556 }
12557
12558 size_t total = 0;
12559 bool has_more = true;
12560 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12561 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12562 while (total < max && has_more) {
12563 has_more = false;
12564
12565 viter = vcurrents.begin();
12566 eiter = vends.begin();
12567
12568 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12569 assert (eiter != vends.end());
12570
12571 int shard_id = viter->first;
12572 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12573
12574 if (liter == eiter->second){
12575 continue;
12576 }
12577 rgw_bi_log_entry& entry = *(liter);
12578 if (has_shards) {
12579 char buf[16];
12580 snprintf(buf, sizeof(buf), "%d", shard_id);
12581 string tmp_id;
12582 build_bucket_index_marker(buf, entry.id, &tmp_id);
12583 entry.id.swap(tmp_id);
12584 }
12585 marker_mgr.add(shard_id, entry.id);
12586 result.push_back(entry);
12587 total++;
12588 has_more = true;
12589 ++liter;
12590 }
12591 }
12592
12593 if (truncated) {
12594 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12595 assert (eiter != vends.end());
12596 *truncated = (*truncated || (viter->second != eiter->second));
12597 }
12598 }
12599
12600 // Refresh marker, if there are multiple shards, the output will look like
12601 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12602 // if there is no sharding, the simply marker (without oid) is returned
12603 if (has_shards) {
12604 marker_mgr.to_string(&marker);
12605 } else {
12606 if (!result.empty()) {
12607 marker = result.rbegin()->id;
12608 }
12609 }
12610
12611 return 0;
12612 }
12613
12614 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12615 {
12616 librados::IoCtx index_ctx;
12617 map<int, string> bucket_objs;
12618
12619 BucketIndexShardsManager start_marker_mgr;
12620 BucketIndexShardsManager end_marker_mgr;
12621
12622 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12623 if (r < 0) {
12624 return r;
12625 }
12626
12627 r = start_marker_mgr.from_string(start_marker, shard_id);
12628 if (r < 0) {
12629 return r;
12630 }
12631
12632 r = end_marker_mgr.from_string(end_marker, shard_id);
12633 if (r < 0) {
12634 return r;
12635 }
12636
12637 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12638 cct->_conf->rgw_bucket_index_max_aio)();
12639
12640 return r;
12641 }
12642
12643 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12644 {
12645 librados::IoCtx index_ctx;
12646 map<int, string> bucket_objs;
12647 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12648 if (r < 0)
12649 return r;
12650
12651 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12652 }
12653
12654 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12655 {
12656 librados::IoCtx index_ctx;
12657 map<int, string> bucket_objs;
12658 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12659 if (r < 0)
12660 return r;
12661
12662 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12663 }
12664
12665 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12666 {
12667 rgw_rados_ref ref;
12668 int r = get_obj_head_ref(bucket_info, obj, &ref);
12669 if (r < 0) {
12670 return r;
12671 }
12672
12673 rgw_cls_bi_entry bi_entry;
12674 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12675 if (r < 0 && r != -ENOENT) {
12676 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12677 }
12678 if (r < 0) {
12679 return r;
12680 }
12681 bufferlist::iterator iter = bi_entry.data.begin();
12682 try {
12683 ::decode(*dirent, iter);
12684 } catch (buffer::error& err) {
12685 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12686 return -EIO;
12687 }
12688
12689 return 0;
12690 }
12691
12692 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12693 {
12694 BucketShard bs(this);
12695 int ret = bs.init(bucket, obj);
12696 if (ret < 0) {
12697 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12698 return ret;
12699 }
12700
12701 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12702
12703 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12704 if (ret < 0)
12705 return ret;
12706
12707 return 0;
12708 }
12709
12710 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12711 {
12712 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12713 }
12714
12715 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12716 {
12717 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12718 if (ret < 0)
12719 return ret;
12720
12721 return 0;
12722 }
12723
12724 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12725 {
12726 BucketShard bs(this);
12727 int ret = bs.init(bucket, obj);
12728 if (ret < 0) {
12729 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12730 return ret;
12731 }
12732
12733 return bi_put(bs, entry);
12734 }
12735
12736 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12737 {
12738 rgw_obj obj(bucket, obj_name);
12739 BucketShard bs(this);
12740 int ret = bs.init(bucket, obj);
12741 if (ret < 0) {
12742 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12743 return ret;
12744 }
12745
12746 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12747 if (ret == -ENOENT) {
12748 *is_truncated = false;
12749 }
12750 if (ret < 0)
12751 return ret;
12752
12753 return 0;
12754 }
12755
12756 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12757 {
12758 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12759 if (ret < 0)
12760 return ret;
12761
12762 return 0;
12763 }
12764
12765 int RGWRados::bi_remove(BucketShard& bs)
12766 {
12767 int ret = bs.index_ctx.remove(bs.bucket_obj);
12768 if (ret == -ENOENT) {
12769 ret = 0;
12770 }
12771 if (ret < 0) {
12772 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12773 return ret;
12774 }
12775
12776 return 0;
12777 }
12778
12779 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12780 {
12781 BucketShard bs(this);
12782 int ret = bs.init(bucket, shard_id);
12783 if (ret < 0) {
12784 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12785 return ret;
12786 }
12787
12788 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12789 }
12790
12791 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12792 {
12793 return gc_pool_ctx.operate(oid, op);
12794 }
12795
12796 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12797 {
12798 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12799 int r = gc_pool_ctx.aio_operate(oid, c, op);
12800 c->release();
12801 return r;
12802 }
12803
12804 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12805 {
12806 return gc_pool_ctx.operate(oid, op, pbl);
12807 }
12808
12809 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12810 {
12811 return gc->list(index, marker, max, expired_only, result, truncated);
12812 }
12813
12814 int RGWRados::process_gc()
12815 {
12816 return gc->process();
12817 }
12818
12819 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12820 {
12821 return lc->list_lc_progress(marker, max_entries, progress_map);
12822 }
12823
12824 int RGWRados::process_lc()
12825 {
12826 return lc->process();
12827 }
12828
12829 int RGWRados::process_expire_objects()
12830 {
12831 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12832 return 0;
12833 }
12834
12835 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12836 {
12837 bufferlist in;
12838 cls_rgw_bucket_init(op);
12839 return index_ctx.operate(oid, &op);
12840 }
12841
12842 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12843 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12844 {
12845 rgw_zone_set zones_trace;
12846 if (_zones_trace) {
12847 zones_trace = *_zones_trace;
12848 }
12849 else {
12850 zones_trace.insert(get_zone().id);
12851 }
12852
12853 ObjectWriteOperation o;
12854 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12855 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12856 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
12857 return bs.index_ctx.operate(bs.bucket_obj, &o);
12858 }
12859
12860 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
12861 int64_t pool, uint64_t epoch,
12862 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12863 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12864 {
12865 ObjectWriteOperation o;
12866 rgw_bucket_dir_entry_meta dir_meta;
12867 dir_meta = ent.meta;
12868 dir_meta.category = category;
12869
12870 rgw_bucket_entry_ver ver;
12871 ver.pool = pool;
12872 ver.epoch = epoch;
12873 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12874 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12875 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
12876 get_zone().log_data, bilog_flags, _zones_trace);
12877 complete_op_data *arg;
12878 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
12879 get_zone().log_data, bilog_flags, _zones_trace, &arg);
12880 librados::AioCompletion *completion = arg->rados_completion;
12881 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
12882 completion->release(); /* can't reference arg here, as it might have already been released */
12883 return ret;
12884 }
12885
12886 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
12887 int64_t pool, uint64_t epoch,
12888 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12889 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12890 {
12891 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
12892 }
12893
12894 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12895 int64_t pool, uint64_t epoch,
12896 rgw_obj& obj,
12897 real_time& removed_mtime,
12898 list<rgw_obj_index_key> *remove_objs,
12899 uint16_t bilog_flags,
12900 rgw_zone_set *zones_trace)
12901 {
12902 rgw_bucket_dir_entry ent;
12903 ent.meta.mtime = removed_mtime;
12904 obj.key.get_index_key(&ent.key);
12905 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
12906 }
12907
12908 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12909 {
12910 rgw_bucket_dir_entry ent;
12911 obj.key.get_index_key(&ent.key);
12912 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
12913 }
12914
12915 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12916 {
12917 librados::IoCtx index_ctx;
12918 map<int, string> bucket_objs;
12919 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12920 if (r < 0)
12921 return r;
12922
12923 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12924 }
12925
12926 int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12927 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12928 bool *is_truncated, rgw_obj_index_key *last_entry,
12929 bool (*force_check_filter)(const string& name))
12930 {
12931 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12932
12933 librados::IoCtx index_ctx;
12934 // key - oid (for different shards if there is any)
12935 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12936 map<int, string> oids;
12937 map<int, struct rgw_cls_list_ret> list_results;
12938 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12939 if (r < 0)
12940 return r;
12941
12942 cls_rgw_obj_key start_key(start.name, start.instance);
12943 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12944 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12945 if (r < 0)
12946 return r;
12947
12948 // Create a list of iterators that are used to iterate each shard
12949 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12950 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12951 vector<string> vnames(list_results.size());
12952 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12953 *is_truncated = false;
12954 for (; iter != list_results.end(); ++iter) {
12955 vcurrents.push_back(iter->second.dir.m.begin());
12956 vends.push_back(iter->second.dir.m.end());
12957 vnames.push_back(oids[iter->first]);
12958 *is_truncated = (*is_truncated || iter->second.is_truncated);
12959 }
12960
12961 // Create a map to track the next candidate entry from each shard, if the entry
12962 // from a specified shard is selected/erased, the next entry from that shard will
12963 // be inserted for next round selection
12964 map<string, size_t> candidates;
12965 for (size_t i = 0; i < vcurrents.size(); ++i) {
12966 if (vcurrents[i] != vends[i]) {
12967 candidates[vcurrents[i]->first] = i;
12968 }
12969 }
12970
12971 map<string, bufferlist> updates;
12972 uint32_t count = 0;
12973 while (count < num_entries && !candidates.empty()) {
12974 r = 0;
12975 // Select the next one
12976 int pos = candidates.begin()->second;
12977 const string& name = vcurrents[pos]->first;
12978 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12979
12980 bool force_check = force_check_filter &&
12981 force_check_filter(dirent.key.name);
12982 if ((!dirent.exists && !dirent.is_delete_marker()) ||
12983 !dirent.pending_map.empty() ||
12984 force_check) {
12985 /* there are uncommitted ops. We need to check the current state,
12986 * and if the tags are old we need to do cleanup as well. */
12987 librados::IoCtx sub_ctx;
12988 sub_ctx.dup(index_ctx);
12989 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
12990 if (r < 0 && r != -ENOENT) {
12991 return r;
12992 }
12993 }
12994 if (r >= 0) {
12995 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
12996 m[name] = std::move(dirent);
12997 ++count;
12998 }
12999
13000 // Refresh the candidates map
13001 candidates.erase(candidates.begin());
13002 ++vcurrents[pos];
13003 if (vcurrents[pos] != vends[pos]) {
13004 candidates[vcurrents[pos]->first] = pos;
13005 }
13006 }
13007
13008 // Suggest updates if there is any
13009 map<string, bufferlist>::iterator miter = updates.begin();
13010 for (; miter != updates.end(); ++miter) {
13011 if (miter->second.length()) {
13012 ObjectWriteOperation o;
13013 cls_rgw_suggest_changes(o, miter->second);
13014 // we don't care if we lose suggested updates, send them off blindly
13015 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13016 index_ctx.aio_operate(miter->first, c, &o);
13017 c->release();
13018 }
13019 }
13020
13021 // Check if all the returned entries are consumed or not
13022 for (size_t i = 0; i < vcurrents.size(); ++i) {
13023 if (vcurrents[i] != vends[i])
13024 *is_truncated = true;
13025 }
13026 if (!m.empty())
13027 *last_entry = m.rbegin()->first;
13028
13029 return 0;
13030 }
13031
13032 int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
13033 {
13034 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13035
13036 rgw_rados_ref ref;
13037 int r = get_raw_obj_ref(obj, &ref);
13038 if (r < 0) {
13039 return r;
13040 }
13041
13042 ObjectWriteOperation op;
13043 cls_rgw_usage_log_add(op, info);
13044
13045 r = ref.ioctx.operate(ref.oid, &op);
13046 return r;
13047 }
13048
13049 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
13050 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
13051 {
13052 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13053
13054 rgw_rados_ref ref;
13055 int r = get_raw_obj_ref(obj, &ref);
13056 if (r < 0) {
13057 return r;
13058 }
13059
13060 *is_truncated = false;
13061
13062 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
13063 max_entries, read_iter, usage, is_truncated);
13064
13065 return r;
13066 }
13067
13068 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
13069 {
13070 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13071
13072 rgw_rados_ref ref;
13073 int r = get_raw_obj_ref(obj, &ref);
13074 if (r < 0) {
13075 return r;
13076 }
13077
13078 r = cls_rgw_usage_log_trim(ref.ioctx, ref.oid, user, start_epoch, end_epoch);
13079 return r;
13080 }
13081
13082 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
13083 {
13084 librados::IoCtx index_ctx;
13085 string dir_oid;
13086
13087 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13088
13089 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
13090 if (r < 0)
13091 return r;
13092
13093 bufferlist updates;
13094
13095 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
13096 rgw_bucket_dir_entry entry;
13097 entry.key = *iter;
13098 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
13099 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13100 updates.append(CEPH_RGW_REMOVE | suggest_flag);
13101 ::encode(entry, updates);
13102 }
13103
13104 bufferlist out;
13105
13106 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
13107
13108 return r;
13109 }
13110
13111 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
13112 const RGWBucketInfo& bucket_info,
13113 rgw_bucket_dir_entry& list_state,
13114 rgw_bucket_dir_entry& object,
13115 bufferlist& suggested_updates)
13116 {
13117 const rgw_bucket& bucket = bucket_info.bucket;
13118 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13119
13120 std::string loc;
13121
13122 rgw_obj obj(bucket, list_state.key);
13123
13124 string oid;
13125 get_obj_bucket_and_oid_loc(obj, oid, loc);
13126
13127 if (loc != list_state.locator) {
13128 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
13129 }
13130
13131 io_ctx.locator_set_key(list_state.locator);
13132
13133 RGWObjState *astate = NULL;
13134 RGWObjectCtx rctx(this);
13135 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
13136 if (r < 0)
13137 return r;
13138
13139 list_state.pending_map.clear(); // we don't need this and it inflates size
13140 if (!astate->exists) {
13141 /* object doesn't exist right now -- hopefully because it's
13142 * marked as !exists and got deleted */
13143 if (list_state.exists) {
13144 /* FIXME: what should happen now? Work out if there are any
13145 * non-bad ways this could happen (there probably are, but annoying
13146 * to handle!) */
13147 }
13148 // encode a suggested removal of that key
13149 list_state.ver.epoch = io_ctx.get_last_version();
13150 list_state.ver.pool = io_ctx.get_id();
13151 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
13152 return -ENOENT;
13153 }
13154
13155 string etag;
13156 string content_type;
13157 ACLOwner owner;
13158
13159 object.meta.size = astate->size;
13160 object.meta.accounted_size = astate->accounted_size;
13161 object.meta.mtime = astate->mtime;
13162
13163 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
13164 if (iter != astate->attrset.end()) {
13165 etag = iter->second.c_str();
13166 }
13167 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
13168 if (iter != astate->attrset.end()) {
13169 content_type = iter->second.c_str();
13170 }
13171 iter = astate->attrset.find(RGW_ATTR_ACL);
13172 if (iter != astate->attrset.end()) {
13173 r = decode_policy(iter->second, &owner);
13174 if (r < 0) {
13175 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
13176 }
13177 }
13178
13179 if (astate->has_manifest) {
13180 RGWObjManifest::obj_iterator miter;
13181 RGWObjManifest& manifest = astate->manifest;
13182 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
13183 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
13184 rgw_obj loc;
13185 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
13186
13187 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
13188 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
13189 r = delete_obj_index(loc);
13190 if (r < 0) {
13191 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
13192 }
13193 }
13194 }
13195 }
13196
13197 object.meta.etag = etag;
13198 object.meta.content_type = content_type;
13199 object.meta.owner = owner.get_id().to_str();
13200 object.meta.owner_display_name = owner.get_display_name();
13201
13202 // encode suggested updates
13203 list_state.ver.pool = io_ctx.get_id();
13204 list_state.ver.epoch = astate->epoch;
13205 list_state.meta.size = object.meta.size;
13206 list_state.meta.accounted_size = object.meta.accounted_size;
13207 list_state.meta.mtime = object.meta.mtime;
13208 list_state.meta.category = main_category;
13209 list_state.meta.etag = etag;
13210 list_state.meta.content_type = content_type;
13211 if (astate->obj_tag.length() > 0)
13212 list_state.tag = astate->obj_tag.c_str();
13213 list_state.meta.owner = owner.get_id().to_str();
13214 list_state.meta.owner_display_name = owner.get_display_name();
13215
13216 list_state.exists = true;
13217 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
13218 return 0;
13219 }
13220
13221 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
13222 {
13223 librados::IoCtx index_ctx;
13224 map<int, string> oids;
13225 map<int, struct rgw_cls_list_ret> list_results;
13226 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13227 if (r < 0)
13228 return r;
13229
13230 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13231 if (r < 0)
13232 return r;
13233
13234 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13235 for(; iter != list_results.end(); ++iter) {
13236 headers[oids[iter->first]] = iter->second.dir.header;
13237 }
13238 return 0;
13239 }
13240
13241 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13242 {
13243 librados::IoCtx index_ctx;
13244 map<int, string> bucket_objs;
13245 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13246 if (r < 0)
13247 return r;
13248
13249 map<int, string>::iterator iter = bucket_objs.begin();
13250 for (; iter != bucket_objs.end(); ++iter) {
13251 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13252 if (r < 0) {
13253 ctx->put();
13254 break;
13255 } else {
13256 (*num_aio)++;
13257 }
13258 }
13259 return r;
13260 }
13261
13262 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13263 {
13264 string buckets_obj_id;
13265 rgw_get_buckets_obj(user_id, buckets_obj_id);
13266 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13267
13268 rgw_rados_ref ref;
13269 int r = get_raw_obj_ref(obj, &ref);
13270 if (r < 0) {
13271 return r;
13272 }
13273
13274 librados::ObjectReadOperation op;
13275 int rc;
13276 ::cls_user_get_header(op, header, &rc);
13277 bufferlist ibl;
13278 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13279 if (r < 0)
13280 return r;
13281 if (rc < 0)
13282 return rc;
13283
13284 return 0;
13285 }
13286
13287 int RGWRados::cls_user_reset_stats(const string& user_id)
13288 {
13289 string buckets_obj_id;
13290 rgw_get_buckets_obj(user_id, buckets_obj_id);
13291 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13292
13293 rgw_rados_ref ref;
13294 int r = get_raw_obj_ref(obj, &ref);
13295 if (r < 0) {
13296 return r;
13297 }
13298
13299 librados::ObjectWriteOperation op;
13300 ::cls_user_reset_stats(op);
13301 return ref.ioctx.operate(ref.oid, &op);
13302 }
13303
13304 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13305 {
13306 string buckets_obj_id;
13307 rgw_get_buckets_obj(user_id, buckets_obj_id);
13308 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13309
13310 rgw_rados_ref ref;
13311 int r = get_raw_obj_ref(obj, &ref);
13312 if (r < 0) {
13313 return r;
13314 }
13315
13316 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13317 if (r < 0)
13318 return r;
13319
13320 return 0;
13321 }
13322
13323 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13324 {
13325 map<string, struct rgw_bucket_dir_header> headers;
13326 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13327 if (r < 0) {
13328 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13329 return r;
13330 }
13331
13332 cls_user_bucket_entry entry;
13333
13334 bucket_info.bucket.convert(&entry.bucket);
13335
13336 for (const auto& hiter : headers) {
13337 for (const auto& iter : hiter.second.stats) {
13338 const struct rgw_bucket_category_stats& header_stats = iter.second;
13339 entry.size += header_stats.total_size;
13340 entry.size_rounded += header_stats.total_size_rounded;
13341 entry.count += header_stats.num_entries;
13342 }
13343 }
13344
13345 list<cls_user_bucket_entry> entries;
13346 entries.push_back(entry);
13347
13348 r = cls_user_update_buckets(user_obj, entries, false);
13349 if (r < 0) {
13350 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13351 return r;
13352 }
13353
13354 return 0;
13355 }
13356
13357 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13358 {
13359 map<string, struct rgw_bucket_dir_header> headers;
13360 RGWBucketInfo bucket_info;
13361 RGWObjectCtx obj_ctx(this);
13362 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13363 if (ret < 0) {
13364 return ret;
13365 }
13366
13367 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13368 if (ret < 0) {
13369 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13370 return ret;
13371 }
13372
13373 bucket.convert(&entry.bucket);
13374
13375 for (const auto& hiter : headers) {
13376 for (const auto& iter : hiter.second.stats) {
13377 const struct rgw_bucket_category_stats& header_stats = iter.second;
13378 entry.size += header_stats.total_size;
13379 entry.size_rounded += header_stats.total_size_rounded;
13380 entry.count += header_stats.num_entries;
13381 }
13382 }
13383
13384 return 0;
13385 }
13386
13387 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13388 const string& in_marker,
13389 const string& end_marker,
13390 const int max_entries,
13391 list<cls_user_bucket_entry>& entries,
13392 string * const out_marker,
13393 bool * const truncated)
13394 {
13395 rgw_rados_ref ref;
13396 int r = get_raw_obj_ref(obj, &ref);
13397 if (r < 0) {
13398 return r;
13399 }
13400
13401 librados::ObjectReadOperation op;
13402 int rc;
13403
13404 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13405 bufferlist ibl;
13406 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13407 if (r < 0)
13408 return r;
13409 if (rc < 0)
13410 return rc;
13411
13412 return 0;
13413 }
13414
13415 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13416 {
13417 rgw_rados_ref ref;
13418 int r = get_raw_obj_ref(obj, &ref);
13419 if (r < 0) {
13420 return r;
13421 }
13422
13423 librados::ObjectWriteOperation op;
13424 cls_user_set_buckets(op, entries, add);
13425 r = ref.ioctx.operate(ref.oid, &op);
13426 if (r < 0)
13427 return r;
13428
13429 return 0;
13430 }
13431
13432 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13433 {
13434 string buckets_obj_id;
13435 rgw_get_buckets_obj(user_id, buckets_obj_id);
13436 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13437 return cls_user_complete_stats_sync(obj);
13438 }
13439
13440 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13441 {
13442 rgw_rados_ref ref;
13443 int r = get_raw_obj_ref(obj, &ref);
13444 if (r < 0) {
13445 return r;
13446 }
13447
13448 librados::ObjectWriteOperation op;
13449 ::cls_user_complete_stats_sync(op);
13450 r = ref.ioctx.operate(ref.oid, &op);
13451 if (r < 0)
13452 return r;
13453
13454 return 0;
13455 }
13456
13457 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13458 {
13459 list<cls_user_bucket_entry> l;
13460 l.push_back(entry);
13461
13462 return cls_user_update_buckets(obj, l, true);
13463 }
13464
13465 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13466 {
13467 rgw_rados_ref ref;
13468 int r = get_system_obj_ref(obj, &ref);
13469 if (r < 0) {
13470 return r;
13471 }
13472
13473 librados::ObjectWriteOperation op;
13474 ::cls_user_remove_bucket(op, bucket);
13475 r = ref.ioctx.operate(ref.oid, &op);
13476 if (r < 0)
13477 return r;
13478
13479 return 0;
13480 }
13481
13482 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
13483 RGWQuotaInfo& bucket_quota)
13484 {
13485 if (!cct->_conf->rgw_dynamic_resharding) {
13486 return 0;
13487 }
13488
13489 bool need_resharding = false;
13490 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13491 uint32_t suggested_num_shards;
13492
13493 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13494 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13495 1, need_resharding, &suggested_num_shards);
13496 if (ret < 0) {
13497 return ret;
13498 }
13499
13500 if (need_resharding) {
13501 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13502 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13503 dendl;
13504 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13505 }
13506
13507 return ret;
13508 }
13509
13510 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13511 {
13512 RGWReshard reshard(this);
13513
13514 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13515
13516 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13517 if (new_num_shards <= num_source_shards) {
13518 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13519 return 0;
13520 }
13521
13522 cls_rgw_reshard_entry entry;
13523 entry.time = real_clock::now();
13524 entry.tenant = bucket_info.owner.tenant;
13525 entry.bucket_name = bucket_info.bucket.name;
13526 entry.bucket_id = bucket_info.bucket.bucket_id;
13527 entry.old_num_shards = num_source_shards;
13528 entry.new_num_shards = new_num_shards;
13529
13530 return reshard.add(entry);
13531 }
13532
13533 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13534 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13535 {
13536 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13537 }
13538
13539 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13540 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
13541 {
13542 if (!num_shards) {
13543 bucket_objects[0] = bucket_oid_base;
13544 } else {
13545 char buf[bucket_oid_base.size() + 32];
13546 if (shard_id < 0) {
13547 for (uint32_t i = 0; i < num_shards; ++i) {
13548 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13549 bucket_objects[i] = buf;
13550 }
13551 } else {
13552 if ((uint32_t)shard_id > num_shards) {
13553 return;
13554 }
13555 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13556 bucket_objects[shard_id] = buf;
13557 }
13558 }
13559 }
13560
13561 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13562 {
13563 const rgw_bucket& bucket = bucket_info.bucket;
13564 string plain_id = bucket.name + ":" + bucket.bucket_id;
13565 if (!bucket_info.num_shards) {
13566 (*result)[0] = plain_id;
13567 } else {
13568 char buf[16];
13569 if (shard_id < 0) {
13570 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13571 snprintf(buf, sizeof(buf), ":%d", i);
13572 (*result)[i] = plain_id + buf;
13573 }
13574 } else {
13575 if ((uint32_t)shard_id > bucket_info.num_shards) {
13576 return;
13577 }
13578 snprintf(buf, sizeof(buf), ":%d", shard_id);
13579 (*result)[shard_id] = plain_id + buf;
13580 }
13581 }
13582 }
13583
13584 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13585 int *shard_id)
13586 {
13587 int r = 0;
13588 switch (bucket_info.bucket_index_shard_hash_type) {
13589 case RGWBucketInfo::MOD:
13590 if (!bucket_info.num_shards) {
13591 if (shard_id) {
13592 *shard_id = -1;
13593 }
13594 } else {
13595 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13596 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13597 sid = rgw_shards_mod(sid2, bucket_info.num_shards);
13598 if (shard_id) {
13599 *shard_id = (int)sid;
13600 }
13601 }
13602 break;
13603 default:
13604 r = -ENOTSUP;
13605 }
13606 return r;
13607 }
13608
13609 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13610 int shard_id, string *bucket_obj)
13611 {
13612 if (!num_shards) {
13613 // By default with no sharding, we use the bucket oid as itself
13614 (*bucket_obj) = bucket_oid_base;
13615 } else {
13616 char buf[bucket_oid_base.size() + 32];
13617 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13618 (*bucket_obj) = buf;
13619 }
13620 }
13621
13622 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13623 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13624 {
13625 int r = 0;
13626 switch (hash_type) {
13627 case RGWBucketInfo::MOD:
13628 if (!num_shards) {
13629 // By default with no sharding, we use the bucket oid as itself
13630 (*bucket_obj) = bucket_oid_base;
13631 if (shard_id) {
13632 *shard_id = -1;
13633 }
13634 } else {
13635 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13636 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13637 sid = rgw_shards_mod(sid2, num_shards);
13638 char buf[bucket_oid_base.size() + 32];
13639 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13640 (*bucket_obj) = buf;
13641 if (shard_id) {
13642 *shard_id = (int)sid;
13643 }
13644 }
13645 break;
13646 default:
13647 r = -ENOTSUP;
13648 }
13649 return r;
13650 }
13651
13652 void RGWStateLog::oid_str(int shard, string& oid) {
13653 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13654 char buf[16];
13655 snprintf(buf, sizeof(buf), "%d", shard);
13656 oid += buf;
13657 }
13658
13659 int RGWStateLog::get_shard_num(const string& object) {
13660 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13661 return val % num_shards;
13662 }
13663
13664 string RGWStateLog::get_oid(const string& object) {
13665 int shard = get_shard_num(object);
13666 string oid;
13667 oid_str(shard, oid);
13668 return oid;
13669 }
13670
13671 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13672 rgw_pool pool;
13673 store->get_log_pool(pool);
13674 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13675 if (r < 0) {
13676 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13677 return r;
13678 }
13679 return 0;
13680 }
13681
13682 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13683 uint32_t state, bufferlist *bl, uint32_t *check_state)
13684 {
13685 if (client_id.empty() ||
13686 op_id.empty() ||
13687 object.empty()) {
13688 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13689 }
13690
13691 librados::IoCtx ioctx;
13692 int r = open_ioctx(ioctx);
13693 if (r < 0)
13694 return r;
13695
13696 string oid = get_oid(object);
13697
13698 librados::ObjectWriteOperation op;
13699 if (check_state) {
13700 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
13701 }
13702 utime_t ts = ceph_clock_now();
13703 bufferlist nobl;
13704 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
13705 r = ioctx.operate(oid, &op);
13706 if (r < 0) {
13707 return r;
13708 }
13709
13710 return 0;
13711 }
13712
13713 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
13714 {
13715 if (client_id.empty() ||
13716 op_id.empty() ||
13717 object.empty()) {
13718 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13719 }
13720
13721 librados::IoCtx ioctx;
13722 int r = open_ioctx(ioctx);
13723 if (r < 0)
13724 return r;
13725
13726 string oid = get_oid(object);
13727
13728 librados::ObjectWriteOperation op;
13729 cls_statelog_remove_by_object(op, object, op_id);
13730 r = ioctx.operate(oid, &op);
13731 if (r < 0) {
13732 return r;
13733 }
13734
13735 return 0;
13736 }
13737
13738 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
13739 void **handle)
13740 {
13741 list_state *state = new list_state;
13742 state->client_id = client_id;
13743 state->op_id = op_id;
13744 state->object = object;
13745 if (object.empty()) {
13746 state->cur_shard = 0;
13747 state->max_shard = num_shards - 1;
13748 } else {
13749 state->cur_shard = state->max_shard = get_shard_num(object);
13750 }
13751 *handle = (void *)state;
13752 }
13753
13754 int RGWStateLog::list_entries(void *handle, int max_entries,
13755 list<cls_statelog_entry>& entries,
13756 bool *done)
13757 {
13758 list_state *state = static_cast<list_state *>(handle);
13759
13760 librados::IoCtx ioctx;
13761 int r = open_ioctx(ioctx);
13762 if (r < 0)
13763 return r;
13764
13765 entries.clear();
13766
13767 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
13768 string oid;
13769 oid_str(state->cur_shard, oid);
13770
13771 librados::ObjectReadOperation op;
13772 list<cls_statelog_entry> ents;
13773 bool truncated;
13774 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
13775 max_entries, ents, &state->marker, &truncated);
13776 bufferlist ibl;
13777 r = ioctx.operate(oid, &op, &ibl);
13778 if (r == -ENOENT) {
13779 truncated = false;
13780 r = 0;
13781 }
13782 if (r < 0) {
13783 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
13784 return r;
13785 }
13786
13787 if (!truncated) {
13788 state->marker.clear();
13789 }
13790
13791 max_entries -= ents.size();
13792
13793 entries.splice(entries.end(), ents);
13794
13795 if (truncated)
13796 break;
13797 }
13798
13799 *done = (state->cur_shard > state->max_shard);
13800
13801 return 0;
13802 }
13803
13804 void RGWStateLog::finish_list_entries(void *handle)
13805 {
13806 list_state *state = static_cast<list_state *>(handle);
13807 delete state;
13808 }
13809
13810 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
13811 {
13812 f->open_object_section("statelog_entry");
13813 f->dump_string("client_id", entry.client_id);
13814 f->dump_string("op_id", entry.op_id);
13815 f->dump_string("object", entry.object);
13816 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
13817 if (!dump_entry_internal(entry, f)) {
13818 f->dump_int("state", entry.state);
13819 }
13820 f->close_section();
13821 }
13822
13823 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
13824 {
13825 }
13826
13827 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
13828 {
13829 string s;
13830 switch ((OpState)entry.state) {
13831 case OPSTATE_UNKNOWN:
13832 s = "unknown";
13833 break;
13834 case OPSTATE_IN_PROGRESS:
13835 s = "in-progress";
13836 break;
13837 case OPSTATE_COMPLETE:
13838 s = "complete";
13839 break;
13840 case OPSTATE_ERROR:
13841 s = "error";
13842 break;
13843 case OPSTATE_ABORT:
13844 s = "abort";
13845 break;
13846 case OPSTATE_CANCELLED:
13847 s = "cancelled";
13848 break;
13849 default:
13850 s = "invalid";
13851 }
13852 f->dump_string("state", s);
13853 return true;
13854 }
13855
13856 int RGWOpState::state_from_str(const string& s, OpState *state)
13857 {
13858 if (s == "unknown") {
13859 *state = OPSTATE_UNKNOWN;
13860 } else if (s == "in-progress") {
13861 *state = OPSTATE_IN_PROGRESS;
13862 } else if (s == "complete") {
13863 *state = OPSTATE_COMPLETE;
13864 } else if (s == "error") {
13865 *state = OPSTATE_ERROR;
13866 } else if (s == "abort") {
13867 *state = OPSTATE_ABORT;
13868 } else if (s == "cancelled") {
13869 *state = OPSTATE_CANCELLED;
13870 } else {
13871 return -EINVAL;
13872 }
13873
13874 return 0;
13875 }
13876
13877 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
13878 {
13879 uint32_t s = (uint32_t)state;
13880 return store_entry(client_id, op_id, object, s, NULL, NULL);
13881 }
13882
13883 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
13884 {
13885 uint32_t s = (uint32_t)state;
13886 return store_entry(client_id, op_id, object, s, NULL, &s);
13887 }
13888
13889 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
13890 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
13891 {
13892 cct = store->ctx();
13893 cur_state = RGWOpState::OPSTATE_UNKNOWN;
13894 }
13895
13896 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
13897 last_update = real_clock::now();
13898 cur_state = state;
13899 return os.set_state(client_id, op_id, object, state);
13900 }
13901
13902 int RGWOpStateSingleOp::renew_state() {
13903 real_time now = real_clock::now();
13904
13905 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13906
13907 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13908 return 0;
13909 }
13910
13911 last_update = now;
13912 return os.renew_state(client_id, op_id, object, cur_state);
13913 }
13914
13915
13916 uint64_t RGWRados::instance_id()
13917 {
13918 return get_rados_handle()->get_instance_id();
13919 }
13920
13921 uint64_t RGWRados::next_bucket_id()
13922 {
13923 Mutex::Locker l(bucket_id_lock);
13924 return ++max_bucket_id;
13925 }
13926
13927 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
13928 {
13929 int use_cache = cct->_conf->rgw_cache_enabled;
13930 RGWRados *store = NULL;
13931 if (!use_cache) {
13932 store = new RGWRados;
13933 } else {
13934 store = new RGWCache<RGWRados>;
13935 }
13936
13937 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
13938 delete store;
13939 return NULL;
13940 }
13941
13942 return store;
13943 }
13944
13945 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13946 {
13947 RGWRados *store = NULL;
13948 store = new RGWRados;
13949
13950 store->set_context(cct);
13951
13952 if (store->init_rados() < 0) {
13953 delete store;
13954 return NULL;
13955 }
13956
13957 return store;
13958 }
13959
13960 void RGWStoreManager::close_storage(RGWRados *store)
13961 {
13962 if (!store)
13963 return;
13964
13965 store->finalize();
13966
13967 delete store;
13968 }
13969
13970 librados::Rados* RGWRados::get_rados_handle()
13971 {
13972 if (rados.size() == 1) {
13973 return &rados[0];
13974 } else {
13975 handle_lock.get_read();
13976 pthread_t id = pthread_self();
13977 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13978
13979 if (it != rados_map.end()) {
13980 handle_lock.put_read();
13981 return &rados[it->second];
13982 } else {
13983 handle_lock.put_read();
13984 handle_lock.get_write();
13985 const uint32_t handle = next_rados_handle;
13986 rados_map[id] = handle;
13987 if (++next_rados_handle == rados.size()) {
13988 next_rados_handle = 0;
13989 }
13990 handle_lock.put_write();
13991 return &rados[handle];
13992 }
13993 }
13994 }
13995
13996 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
13997 {
13998 rgw_rados_ref ref;
13999 int ret = get_raw_obj_ref(obj, &ref);
14000 if (ret < 0) {
14001 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14002 return ret;
14003 }
14004
14005 ObjectWriteOperation op;
14006 list<string> prefixes;
14007 cls_rgw_remove_obj(op, prefixes);
14008
14009 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14010 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14011 if (ret < 0) {
14012 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14013 c->release();
14014 return ret;
14015 }
14016
14017 handles.push_back(c);
14018
14019 return 0;
14020 }
14021
14022 int RGWRados::delete_obj_aio(const rgw_obj& obj,
14023 RGWBucketInfo& bucket_info, RGWObjState *astate,
14024 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
14025 {
14026 rgw_rados_ref ref;
14027 int ret = get_obj_head_ref(bucket_info, obj, &ref);
14028 if (ret < 0) {
14029 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14030 return ret;
14031 }
14032
14033 if (keep_index_consistent) {
14034 RGWRados::Bucket bop(this, bucket_info);
14035 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
14036
14037 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
14038 if (ret < 0) {
14039 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
14040 return ret;
14041 }
14042 }
14043
14044 ObjectWriteOperation op;
14045 list<string> prefixes;
14046 cls_rgw_remove_obj(op, prefixes);
14047
14048 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14049 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14050 if (ret < 0) {
14051 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14052 c->release();
14053 return ret;
14054 }
14055
14056 handles.push_back(c);
14057
14058 if (keep_index_consistent) {
14059 ret = delete_obj_index(obj);
14060 if (ret < 0) {
14061 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
14062 return ret;
14063 }
14064 }
14065 return ret;
14066 }
14067
14068 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
14069 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
14070 if (value != attrs.end()) {
14071 bufferlist::iterator bliter = value->second.begin();
14072 try {
14073 ::decode(cs_info, bliter);
14074 } catch (buffer::error& err) {
14075 return -EIO;
14076 }
14077 if (cs_info.blocks.size() == 0) {
14078 return -EIO;
14079 }
14080 if (cs_info.compression_type != "none")
14081 need_decompress = true;
14082 else
14083 need_decompress = false;
14084 return 0;
14085 } else {
14086 need_decompress = false;
14087 return 0;
14088 }
14089 }
14090
14091 bool RGWRados::call(std::string command, cmdmap_t& cmdmap, std::string format,
14092 bufferlist& out)
14093 {
14094 if (command == "cache list") {
14095 boost::optional<std::string> filter;
14096 auto i = cmdmap.find("filter");
14097 if (i != cmdmap.cend()) {
14098 filter = boost::get<std::string>(i->second);
14099 }
14100 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
14101 if (f) {
14102 f->open_array_section("cache_entries");
14103 call_list(filter, f.get());
14104 f->close_section();
14105 f->flush(out);
14106 return true;
14107 } else {
14108 out.append("Unable to create Formatter.\n");
14109 return false;
14110 }
14111 } else if (command == "cache inspect") {
14112 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
14113 if (f) {
14114 const auto& target = boost::get<std::string>(cmdmap["target"]);
14115 if (call_inspect(target, f.get())) {
14116 f->flush(out);
14117 return true;
14118 } else {
14119 out.append(string("Unable to find entry ") + target + string(".\n"));
14120 return false;
14121 }
14122 } else {
14123 out.append("Unable to create Formatter.\n");
14124 return false;
14125 }
14126 } else if (command == "cache erase") {
14127 const auto& target = boost::get<std::string>(cmdmap["target"]);
14128 if (call_erase(target)) {
14129 return true;
14130 } else {
14131 out.append(string("Unable to find entry ") + target + string(".\n"));
14132 return false;
14133 }
14134 } else if (command == "cache zap") {
14135 call_zap();
14136 return true;
14137 }
14138 return false;
14139 }
14140
14141 void RGWRados::call_list(const boost::optional<std::string>&,
14142 ceph::Formatter*)
14143 {
14144 return;
14145 }
14146
14147 bool RGWRados::call_inspect(const std::string&, Formatter*)
14148 {
14149 return false;
14150 }
14151
14152 bool RGWRados::call_erase(const std::string&) {
14153 return false;
14154 }
14155
14156 void RGWRados::call_zap() {
14157 return;
14158 }