]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
update sources to 12.2.8
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <boost/algorithm/string.hpp>
9
10 #include <boost/format.hpp>
11 #include <boost/optional.hpp>
12 #include <boost/utility/in_place_factory.hpp>
13
14 #include "common/ceph_json.h"
15 #include "common/utf8.h"
16
17 #include "common/errno.h"
18 #include "common/Formatter.h"
19 #include "common/Throttle.h"
20 #include "common/Finisher.h"
21
22 #include "rgw_rados.h"
23 #include "rgw_cache.h"
24 #include "rgw_acl.h"
25 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26 #include "rgw_metadata.h"
27 #include "rgw_bucket.h"
28 #include "rgw_rest_conn.h"
29 #include "rgw_cr_rados.h"
30 #include "rgw_cr_rest.h"
31
32 #include "cls/rgw/cls_rgw_ops.h"
33 #include "cls/rgw/cls_rgw_types.h"
34 #include "cls/rgw/cls_rgw_client.h"
35 #include "cls/rgw/cls_rgw_const.h"
36 #include "cls/refcount/cls_refcount_client.h"
37 #include "cls/version/cls_version_client.h"
38 #include "cls/log/cls_log_client.h"
39 #include "cls/statelog/cls_statelog_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "osd/osd_types.h"
44
45 #include "rgw_tools.h"
46 #include "rgw_coroutine.h"
47 #include "rgw_compression.h"
48
49 #undef fork // fails to compile RGWPeriod::fork() below
50
51 #include "common/Clock.h"
52
53 #include "include/rados/librados.hpp"
54 using namespace librados;
55
56 #include <string>
57 #include <iostream>
58 #include <vector>
59 #include <atomic>
60 #include <list>
61 #include <map>
62 #include "auth/Crypto.h" // get_random_bytes()
63
64 #include "rgw_log.h"
65
66 #include "rgw_gc.h"
67 #include "rgw_lc.h"
68
69 #include "rgw_object_expirer_core.h"
70 #include "rgw_sync.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
74
75 #include "compressor/Compressor.h"
76
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_rgw
79
80 using namespace std;
81
// File-local string constants: object-name prefixes, a namespace name and
// default pool-name suffixes.
static string notify_oid_prefix = "notify";
static string *notify_oids = NULL;
static string shadow_ns = "shadow";
static string dir_oid_prefix = ".dir.";
static string default_storage_pool_suffix = "rgw.buckets.data";
static string default_bucket_index_pool_suffix = "rgw.buckets.index";
static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
static string avail_pools = ".pools.avail";

// Object names / name prefixes for zone, zonegroup (formerly "region"), realm
// and period metadata. Several of these are handed out by the
// get_*_oid*() accessors defined further down in this file.
static string zone_info_oid_prefix = "zone_info.";
static string zone_names_oid_prefix = "zone_names.";
static string region_info_oid_prefix = "region_info.";
static string zone_group_info_oid_prefix = "zonegroup_info.";
static string realm_names_oid_prefix = "realms_names.";
static string realm_info_oid_prefix = "realms.";
static string default_region_info_oid = "default.region";
static string default_zone_group_info_oid = "default.zonegroup";
static string period_info_oid_prefix = "periods.";
static string period_latest_epoch_info_oid = ".latest_epoch";
static string region_map_oid = "region_map";
static string zonegroup_map_oid = "zonegroup_map";
static string log_lock_name = "rgw_log_lock";
static string default_realm_info_oid = "default.realm";
// NOTE(review): these two are not declared static, unlike their neighbours;
// presumably they are declared extern in a header -- confirm.
const string default_zonegroup_name = "default";
const string default_zone_name = "default";
static string zonegroup_names_oid_prefix = "zonegroups_names.";
static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
#define RGW_USAGE_OBJ_PREFIX "usage."
#define FIRST_EPOCH 1
// Fallback root pool names; the get_pool() implementations in this file use
// them when the matching rgw_*_root_pool config option is empty.
static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116 #define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118 #define dout_subsys ceph_subsys_rgw
119
120
121 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123 {
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
133 *pool = placement.get_data_extra_pool();
134 }
135 }
136
137 return true;
138 }
139
140 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142 {
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146 }
147
148 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149 {
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156 }
157
158 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159 {
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166 }
167
168 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169 {
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
173 if (r == -ERANGE) {
174 dout(0)
175 << __func__
176 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
177 << " (this can be due to a pool or placement group misconfiguration, e.g."
178 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
179 << dendl;
180 }
181 if (r < 0 && r != -EEXIST) {
182 return r;
183 }
184
185 r = rados->ioctx_create(pool.name.c_str(), ioctx);
186 if (r < 0) {
187 return r;
188 }
189
190 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
191 if (r < 0 && r != -EOPNOTSUPP) {
192 return r;
193 }
194 } else if (r < 0) {
195 return r;
196 }
197 if (!pool.ns.empty()) {
198 ioctx.set_namespace(pool.ns);
199 }
200 return 0;
201 }
202
203 template<>
204 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
205 RWLock::WLocker wl(lock);
206 auto iter = objs_state.find(obj);
207 if (iter == objs_state.end()) {
208 return;
209 }
210 bool is_atomic = iter->second.is_atomic;
211 bool prefetch_data = iter->second.prefetch_data;
212
213 objs_state.erase(iter);
214
215 if (is_atomic || prefetch_data) {
216 auto& s = objs_state[obj];
217 s.is_atomic = is_atomic;
218 s.prefetch_data = prefetch_data;
219 }
220 }
221
222 template<>
223 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
224 RWLock::WLocker wl(lock);
225 auto iter = objs_state.find(obj);
226 if (iter == objs_state.end()) {
227 return;
228 }
229
230 objs_state.erase(iter);
231 }
232
233 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
234 encode_json("default_zonegroup", default_zonegroup, f);
235 }
236
237 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
238
239 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
240 /* backward compatability with region */
241 if (default_zonegroup.empty()) {
242 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
243 }
244 }
245
246 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
247 {
248 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
249 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
250 }
251
252 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
253 }
254
255 int RGWZoneGroup::create_default(bool old_format)
256 {
257 name = default_zonegroup_name;
258 is_master = true;
259
260 RGWZoneGroupPlacementTarget placement_target;
261 placement_target.name = "default-placement";
262 placement_targets[placement_target.name] = placement_target;
263 default_placement = "default-placement";
264
265 RGWZoneParams zone_params(default_zone_name);
266
267 int r = zone_params.init(cct, store, false);
268 if (r < 0) {
269 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
270 return r;
271 }
272
273 r = zone_params.create_default();
274 if (r < 0 && r != -EEXIST) {
275 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 } else if (r == -EEXIST) {
278 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
279 zone_params.clear_id();
280 r = zone_params.init(cct, store);
281 if (r < 0) {
282 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
283 return r;
284 }
285 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
286 << dendl;
287 }
288
289 RGWZone& default_zone = zones[zone_params.get_id()];
290 default_zone.name = zone_params.get_name();
291 default_zone.id = zone_params.get_id();
292 master_zone = default_zone.id;
293
294 r = create();
295 if (r < 0 && r != -EEXIST) {
296 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
297 return r;
298 }
299
300 if (r == -EEXIST) {
301 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
302 id.clear();
303 r = init(cct, store);
304 if (r < 0) {
305 return r;
306 }
307 }
308
309 if (old_format) {
310 name = id;
311 }
312
313 post_process_params();
314
315 return 0;
316 }
317
318 const string RGWZoneGroup::get_default_oid(bool old_region_format)
319 {
320 if (old_region_format) {
321 if (cct->_conf->rgw_default_region_info_oid.empty()) {
322 return default_region_info_oid;
323 }
324 return cct->_conf->rgw_default_region_info_oid;
325 }
326
327 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
328
329 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
330 default_oid = default_zone_group_info_oid;
331 }
332
333 default_oid += "." + realm_id;
334
335 return default_oid;
336 }
337
338 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
339 {
340 if (old_region_format) {
341 return region_info_oid_prefix;
342 }
343 return zone_group_info_oid_prefix;
344 }
345
346 const string& RGWZoneGroup::get_names_oid_prefix()
347 {
348 return zonegroup_names_oid_prefix;
349 }
350
351 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
352 return cct->_conf->rgw_zonegroup;
353 }
354
355 int RGWZoneGroup::equals(const string& other_zonegroup) const
356 {
357 if (is_master && other_zonegroup.empty())
358 return true;
359
360 return (id == other_zonegroup);
361 }
362
363 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
364 const list<string>& endpoints, const string *ptier_type,
365 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
366 {
367 auto& zone_id = zone_params.get_id();
368 auto& zone_name = zone_params.get_name();
369
370 // check for duplicate zone name on insert
371 if (!zones.count(zone_id)) {
372 for (const auto& zone : zones) {
373 if (zone.second.name == zone_name) {
374 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
375 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
376 return -EEXIST;
377 }
378 }
379 }
380
381 if (is_master) {
382 if (*is_master) {
383 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
384 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
385 }
386 master_zone = zone_params.get_id();
387 } else if (master_zone == zone_params.get_id()) {
388 master_zone.clear();
389 }
390 }
391
392 RGWZone& zone = zones[zone_params.get_id()];
393 zone.name = zone_params.get_name();
394 zone.id = zone_params.get_id();
395 if (!endpoints.empty()) {
396 zone.endpoints = endpoints;
397 }
398 if (read_only) {
399 zone.read_only = *read_only;
400 }
401 if (ptier_type) {
402 zone.tier_type = *ptier_type;
403 }
404
405 if (psync_from_all) {
406 zone.sync_from_all = *psync_from_all;
407 }
408
409 for (auto add : sync_from) {
410 zone.sync_from.insert(add);
411 }
412
413 for (auto rm : sync_from_rm) {
414 zone.sync_from.erase(rm);
415 }
416
417 post_process_params();
418
419 return update();
420 }
421
422
423 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
424 {
425 RGWZone& zone = zones[zone_params.get_id()];
426 zone.name = zone_params.get_name();
427
428 return update();
429 }
430
431 void RGWZoneGroup::post_process_params()
432 {
433 bool log_data = zones.size() > 1;
434
435 if (master_zone.empty()) {
436 map<string, RGWZone>::iterator iter = zones.begin();
437 if (iter != zones.end()) {
438 master_zone = iter->first;
439 }
440 }
441
442 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
443 RGWZone& zone = iter->second;
444 zone.log_data = log_data;
445
446 RGWZoneParams zone_params(zone.id, zone.name);
447 int ret = zone_params.init(cct, store);
448 if (ret < 0) {
449 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
450 continue;
451 }
452
453 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
454 iter != zone_params.placement_pools.end(); ++iter) {
455 const string& placement_name = iter->first;
456 if (placement_targets.find(placement_name) == placement_targets.end()) {
457 RGWZoneGroupPlacementTarget placement_target;
458 placement_target.name = placement_name;
459 placement_targets[placement_name] = placement_target;
460 }
461 }
462 }
463
464 if (default_placement.empty() && !placement_targets.empty()) {
465 default_placement = placement_targets.begin()->first;
466 }
467 }
468
469 int RGWZoneGroup::remove_zone(const std::string& zone_id)
470 {
471 map<string, RGWZone>::iterator iter = zones.find(zone_id);
472 if (iter == zones.end()) {
473 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
474 << name << dendl;
475 return -ENOENT;
476 }
477
478 zones.erase(iter);
479
480 post_process_params();
481
482 return update();
483 }
484
485 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
486 {
487 if (realm_id.empty()) {
488 /* try using default realm */
489 RGWRealm realm;
490 int ret = realm.init(cct, store);
491 // no default realm exist
492 if (ret < 0) {
493 return read_id(default_zonegroup_name, default_id);
494 }
495 realm_id = realm.get_id();
496 }
497
498 return RGWSystemMetaObj::read_default_id(default_id, old_format);
499 }
500
501 int RGWZoneGroup::set_as_default(bool exclusive)
502 {
503 if (realm_id.empty()) {
504 /* try using default realm */
505 RGWRealm realm;
506 int ret = realm.init(cct, store);
507 if (ret < 0) {
508 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
509 return -EINVAL;
510 }
511 realm_id = realm.get_id();
512 }
513
514 return RGWSystemMetaObj::set_as_default(exclusive);
515 }
516
517 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
518 {
519 cct = _cct;
520 store = _store;
521
522 if (!setup_obj)
523 return 0;
524
525 if (old_format && id.empty()) {
526 id = name;
527 }
528
529 if (id.empty()) {
530 int r;
531 if (name.empty()) {
532 name = get_predefined_name(cct);
533 }
534 if (name.empty()) {
535 r = use_default(old_format);
536 if (r < 0) {
537 return r;
538 }
539 } else if (!old_format) {
540 r = read_id(name, id);
541 if (r < 0) {
542 if (r != -ENOENT) {
543 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
544 }
545 return r;
546 }
547 }
548 }
549
550 return read_info(id, old_format);
551 }
552
553 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
554 {
555 auto pool = get_pool(cct);
556 bufferlist bl;
557 RGWObjectCtx obj_ctx(store);
558 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
559 if (ret < 0)
560 return ret;
561
562 try {
563 bufferlist::iterator iter = bl.begin();
564 ::decode(default_info, iter);
565 } catch (buffer::error& err) {
566 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
567 return -EIO;
568 }
569
570 return 0;
571 }
572
573 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
574 {
575 RGWDefaultSystemMetaObjInfo default_info;
576
577 int ret = read_default(default_info, get_default_oid(old_format));
578 if (ret < 0) {
579 return ret;
580 }
581
582 default_id = default_info.default_id;
583
584 return 0;
585 }
586
587 int RGWSystemMetaObj::use_default(bool old_format)
588 {
589 return read_default_id(id, old_format);
590 }
591
592 int RGWSystemMetaObj::set_as_default(bool exclusive)
593 {
594 string oid = get_default_oid();
595
596 rgw_pool pool(get_pool(cct));
597 bufferlist bl;
598
599 RGWDefaultSystemMetaObjInfo default_info;
600 default_info.default_id = id;
601
602 ::encode(default_info, bl);
603
604 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
605 exclusive, NULL, real_time(), NULL);
606 if (ret < 0)
607 return ret;
608
609 return 0;
610 }
611
612 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
613 {
614 rgw_pool pool(get_pool(cct));
615 bufferlist bl;
616
617 string oid = get_names_oid_prefix() + obj_name;
618
619 RGWObjectCtx obj_ctx(store);
620 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
621 if (ret < 0) {
622 return ret;
623 }
624
625 RGWNameToId nameToId;
626 try {
627 bufferlist::iterator iter = bl.begin();
628 ::decode(nameToId, iter);
629 } catch (buffer::error& err) {
630 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
631 return -EIO;
632 }
633 object_id = nameToId.obj_id;
634 return 0;
635 }
636
637 int RGWSystemMetaObj::delete_obj(bool old_format)
638 {
639 rgw_pool pool(get_pool(cct));
640
641 /* check to see if obj is the default */
642 RGWDefaultSystemMetaObjInfo default_info;
643 int ret = read_default(default_info, get_default_oid(old_format));
644 if (ret < 0 && ret != -ENOENT)
645 return ret;
646 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
647 string oid = get_default_oid(old_format);
648 rgw_raw_obj default_named_obj(pool, oid);
649 ret = store->delete_system_obj(default_named_obj);
650 if (ret < 0) {
651 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
652 return ret;
653 }
654 }
655 if (!old_format) {
656 string oid = get_names_oid_prefix() + name;
657 rgw_raw_obj object_name(pool, oid);
658 ret = store->delete_system_obj(object_name);
659 if (ret < 0) {
660 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
661 return ret;
662 }
663 }
664
665 string oid = get_info_oid_prefix(old_format);
666 if (old_format) {
667 oid += name;
668 } else {
669 oid += id;
670 }
671
672 rgw_raw_obj object_id(pool, oid);
673 ret = store->delete_system_obj(object_id);
674 if (ret < 0) {
675 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
676 }
677
678 return ret;
679 }
680
681 int RGWSystemMetaObj::store_name(bool exclusive)
682 {
683 rgw_pool pool(get_pool(cct));
684 string oid = get_names_oid_prefix() + name;
685
686 RGWNameToId nameToId;
687 nameToId.obj_id = id;
688
689 bufferlist bl;
690 ::encode(nameToId, bl);
691 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
692 }
693
694 int RGWSystemMetaObj::rename(const string& new_name)
695 {
696 string new_id;
697 int ret = read_id(new_name, new_id);
698 if (!ret) {
699 return -EEXIST;
700 }
701 if (ret < 0 && ret != -ENOENT) {
702 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 string old_name = name;
706 name = new_name;
707 ret = update();
708 if (ret < 0) {
709 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
710 return ret;
711 }
712 ret = store_name(true);
713 if (ret < 0) {
714 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
715 return ret;
716 }
717 /* delete old name */
718 rgw_pool pool(get_pool(cct));
719 string oid = get_names_oid_prefix() + old_name;
720 rgw_raw_obj old_name_obj(pool, oid);
721 ret = store->delete_system_obj(old_name_obj);
722 if (ret < 0) {
723 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
724 return ret;
725 }
726
727 return ret;
728 }
729
730 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
731 {
732 rgw_pool pool(get_pool(cct));
733
734 bufferlist bl;
735
736 string oid = get_info_oid_prefix(old_format) + obj_id;
737
738 RGWObjectCtx obj_ctx(store);
739 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
740 if (ret < 0) {
741 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
742 return ret;
743 }
744
745 try {
746 bufferlist::iterator iter = bl.begin();
747 ::decode(*this, iter);
748 } catch (buffer::error& err) {
749 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
750 return -EIO;
751 }
752
753 return 0;
754 }
755
756 int RGWSystemMetaObj::read()
757 {
758 int ret = read_id(name, id);
759 if (ret < 0) {
760 return ret;
761 }
762
763 return read_info(id);
764 }
765
766 int RGWSystemMetaObj::create(bool exclusive)
767 {
768 int ret;
769
770 /* check to see the name is not used */
771 ret = read_id(name, id);
772 if (exclusive && ret == 0) {
773 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
774 return -EEXIST;
775 } else if ( ret < 0 && ret != -ENOENT) {
776 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
777 return ret;
778 }
779
780 if (id.empty()) {
781 /* create unique id */
782 uuid_d new_uuid;
783 char uuid_str[37];
784 new_uuid.generate_random();
785 new_uuid.print(uuid_str);
786 id = uuid_str;
787 }
788
789 ret = store_info(exclusive);
790 if (ret < 0) {
791 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
792 return ret;
793 }
794
795 return store_name(exclusive);
796 }
797
798 int RGWSystemMetaObj::store_info(bool exclusive)
799 {
800 rgw_pool pool(get_pool(cct));
801
802 string oid = get_info_oid_prefix() + id;
803
804 bufferlist bl;
805 ::encode(*this, bl);
806 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
807 }
808
809 int RGWSystemMetaObj::write(bool exclusive)
810 {
811 int ret = store_info(exclusive);
812 if (ret < 0) {
813 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
814 return ret;
815 }
816 ret = store_name(exclusive);
817 if (ret < 0) {
818 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
819 return ret;
820 }
821 return 0;
822 }
823
824
825 const string& RGWRealm::get_predefined_name(CephContext *cct) {
826 return cct->_conf->rgw_realm;
827 }
828
829 int RGWRealm::create(bool exclusive)
830 {
831 int ret = RGWSystemMetaObj::create(exclusive);
832 if (ret < 0) {
833 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
834 return ret;
835 }
836 // create the control object for watch/notify
837 ret = create_control(exclusive);
838 if (ret < 0) {
839 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
840 return ret;
841 }
842 RGWPeriod period;
843 if (current_period.empty()) {
844 /* create new period for the realm */
845 ret = period.init(cct, store, id, name, false);
846 if (ret < 0 ) {
847 return ret;
848 }
849 ret = period.create(true);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
852 return ret;
853 }
854 } else {
855 period = RGWPeriod(current_period, 0);
856 int ret = period.init(cct, store, id, name);
857 if (ret < 0) {
858 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
859 return ret;
860 }
861 }
862 ret = set_current_period(period);
863 if (ret < 0) {
864 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
865 return ret;
866 }
867 // try to set as default. may race with another create, so pass exclusive=true
868 // so we don't override an existing default
869 ret = set_as_default(true);
870 if (ret < 0 && ret != -EEXIST) {
871 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
872 }
873
874 return 0;
875 }
876
877 int RGWRealm::delete_obj()
878 {
879 int ret = RGWSystemMetaObj::delete_obj();
880 if (ret < 0) {
881 return ret;
882 }
883 return delete_control();
884 }
885
886 int RGWRealm::create_control(bool exclusive)
887 {
888 auto pool = rgw_pool{get_pool(cct)};
889 auto oid = get_control_oid();
890 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
891 nullptr, real_time(), nullptr);
892 }
893
894 int RGWRealm::delete_control()
895 {
896 auto pool = rgw_pool{get_pool(cct)};
897 auto obj = rgw_raw_obj{pool, get_control_oid()};
898 return store->delete_system_obj(obj);
899 }
900
901 rgw_pool RGWRealm::get_pool(CephContext *cct)
902 {
903 if (cct->_conf->rgw_realm_root_pool.empty()) {
904 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
905 }
906 return rgw_pool(cct->_conf->rgw_realm_root_pool);
907 }
908
909 const string RGWRealm::get_default_oid(bool old_format)
910 {
911 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
912 return default_realm_info_oid;
913 }
914 return cct->_conf->rgw_default_realm_info_oid;
915 }
916
917 const string& RGWRealm::get_names_oid_prefix()
918 {
919 return realm_names_oid_prefix;
920 }
921
922 const string& RGWRealm::get_info_oid_prefix(bool old_format)
923 {
924 return realm_info_oid_prefix;
925 }
926
927 int RGWRealm::set_current_period(RGWPeriod& period)
928 {
929 // update realm epoch to match the period's
930 if (epoch > period.get_realm_epoch()) {
931 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
932 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
933 return -EINVAL;
934 }
935 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
936 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
937 << period.get_realm_epoch() << ", but different period id "
938 << period.get_id() << " != " << current_period << dendl;
939 return -EINVAL;
940 }
941
942 epoch = period.get_realm_epoch();
943 current_period = period.get_id();
944
945 int ret = update();
946 if (ret < 0) {
947 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
948 return ret;
949 }
950
951 ret = period.reflect();
952 if (ret < 0) {
953 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
954 return ret;
955 }
956
957 return 0;
958 }
959
960 string RGWRealm::get_control_oid()
961 {
962 return get_info_oid_prefix() + id + ".control";
963 }
964
965 int RGWRealm::notify_zone(bufferlist& bl)
966 {
967 // open a context on the realm's pool
968 rgw_pool pool{get_pool(cct)};
969 librados::IoCtx ctx;
970 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
971 if (r < 0) {
972 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
973 return r;
974 }
975 // send a notify on the realm object
976 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
977 if (r < 0) {
978 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
979 return r;
980 }
981 return 0;
982 }
983
984 int RGWRealm::notify_new_period(const RGWPeriod& period)
985 {
986 bufferlist bl;
987 // push the period to dependent zonegroups/zones
988 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
989 ::encode(period, bl);
990 // reload the gateway with the new period
991 ::encode(RGWRealmNotify::Reload, bl);
992
993 return notify_zone(bl);
994 }
995
996 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
997 {
998 if (realm_id.empty()) {
999 return "period_config.default";
1000 }
1001 return "period_config." + realm_id;
1002 }
1003
1004 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
1005 {
1006 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1007 if (pool_name.empty()) {
1008 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1009 }
1010 return {pool_name};
1011 }
1012
1013 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1014 {
1015 RGWObjectCtx obj_ctx(store);
1016 const auto& pool = get_pool(store->ctx());
1017 const auto& oid = get_oid(realm_id);
1018 bufferlist bl;
1019
1020 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1021 if (ret < 0) {
1022 return ret;
1023 }
1024 try {
1025 bufferlist::iterator iter = bl.begin();
1026 ::decode(*this, iter);
1027 } catch (buffer::error& err) {
1028 return -EIO;
1029 }
1030 return 0;
1031 }
1032
1033 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1034 {
1035 const auto& pool = get_pool(store->ctx());
1036 const auto& oid = get_oid(realm_id);
1037 bufferlist bl;
1038 ::encode(*this, bl);
1039 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1040 false, nullptr, real_time(), nullptr);
1041 }
1042
1043 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1044 const string& period_realm_name, bool setup_obj)
1045 {
1046 cct = _cct;
1047 store = _store;
1048 realm_id = period_realm_id;
1049 realm_name = period_realm_name;
1050
1051 if (!setup_obj)
1052 return 0;
1053
1054 return init(_cct, _store, setup_obj);
1055 }
1056
1057
1058 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1059 {
1060 cct = _cct;
1061 store = _store;
1062
1063 if (!setup_obj)
1064 return 0;
1065
1066 if (id.empty()) {
1067 RGWRealm realm(realm_id, realm_name);
1068 int ret = realm.init(cct, store);
1069 if (ret < 0) {
1070 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1071 cpp_strerror(-ret) << dendl;
1072 return ret;
1073 }
1074 id = realm.get_current_period();
1075 realm_id = realm.get_id();
1076 }
1077
1078 if (!epoch) {
1079 int ret = use_latest_epoch();
1080 if (ret < 0) {
1081 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1082 << " : " << cpp_strerror(-ret) << dendl;
1083 return ret;
1084 }
1085 }
1086
1087 return read_info();
1088 }
1089
1090
1091 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1092 map<string, RGWZoneGroup>::const_iterator iter;
1093 if (!zonegroup_id.empty()) {
1094 iter = period_map.zonegroups.find(zonegroup_id);
1095 } else {
1096 iter = period_map.zonegroups.find("default");
1097 }
1098 if (iter != period_map.zonegroups.end()) {
1099 zonegroup = iter->second;
1100 return 0;
1101 }
1102
1103 return -ENOENT;
1104 }
1105
1106 const string& RGWPeriod::get_latest_epoch_oid()
1107 {
1108 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1109 return period_latest_epoch_info_oid;
1110 }
1111 return cct->_conf->rgw_period_latest_epoch_info_oid;
1112 }
1113
1114 const string& RGWPeriod::get_info_oid_prefix()
1115 {
1116 return period_info_oid_prefix;
1117 }
1118
1119 const string RGWPeriod::get_period_oid_prefix()
1120 {
1121 return get_info_oid_prefix() + id;
1122 }
1123
1124 const string RGWPeriod::get_period_oid()
1125 {
1126 std::ostringstream oss;
1127 oss << get_period_oid_prefix();
1128 // skip the epoch for the staging period
1129 if (id != get_staging_id(realm_id))
1130 oss << "." << epoch;
1131 return oss.str();
1132 }
1133
1134 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1135 RGWObjVersionTracker *objv)
1136 {
1137 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1138
1139 rgw_pool pool(get_pool(cct));
1140 bufferlist bl;
1141 RGWObjectCtx obj_ctx(store);
1142 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
1143 if (ret < 0) {
1144 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1145 return ret;
1146 }
1147 try {
1148 bufferlist::iterator iter = bl.begin();
1149 ::decode(info, iter);
1150 } catch (buffer::error& err) {
1151 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1152 return -EIO;
1153 }
1154
1155 return 0;
1156 }
1157
1158 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1159 {
1160 RGWPeriodLatestEpochInfo info;
1161
1162 int ret = read_latest_epoch(info);
1163 if (ret < 0) {
1164 return ret;
1165 }
1166
1167 latest_epoch = info.epoch;
1168
1169 return 0;
1170 }
1171
1172 int RGWPeriod::use_latest_epoch()
1173 {
1174 RGWPeriodLatestEpochInfo info;
1175 int ret = read_latest_epoch(info);
1176 if (ret < 0) {
1177 return ret;
1178 }
1179
1180 epoch = info.epoch;
1181
1182 return 0;
1183 }
1184
1185 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1186 RGWObjVersionTracker *objv)
1187 {
1188 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1189
1190 rgw_pool pool(get_pool(cct));
1191 bufferlist bl;
1192
1193 RGWPeriodLatestEpochInfo info;
1194 info.epoch = epoch;
1195
1196 ::encode(info, bl);
1197
1198 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1199 exclusive, objv, real_time(), nullptr);
1200 }
1201
1202 int RGWPeriod::update_latest_epoch(epoch_t epoch)
1203 {
1204 static constexpr int MAX_RETRIES = 20;
1205
1206 for (int i = 0; i < MAX_RETRIES; i++) {
1207 RGWPeriodLatestEpochInfo info;
1208 RGWObjVersionTracker objv;
1209 bool exclusive = false;
1210
1211 // read existing epoch
1212 int r = read_latest_epoch(info, &objv);
1213 if (r == -ENOENT) {
1214 // use an exclusive create to set the epoch atomically
1215 exclusive = true;
1216 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1217 << " for period=" << id << dendl;
1218 } else if (r < 0) {
1219 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1220 return r;
1221 } else if (epoch <= info.epoch) {
1222 r = -EEXIST; // fail with EEXIST if epoch is not newer
1223 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1224 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1225 return r;
1226 } else {
1227 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1228 << " -> " << epoch << " on period=" << id << dendl;
1229 }
1230
1231 r = set_latest_epoch(epoch, exclusive, &objv);
1232 if (r == -EEXIST) {
1233 continue; // exclusive create raced with another update, retry
1234 } else if (r == -ECANCELED) {
1235 continue; // write raced with a conflicting version, retry
1236 }
1237 if (r < 0) {
1238 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1239 return r;
1240 }
1241 return 0; // return success
1242 }
1243
1244 return -ECANCELED; // fail after max retries
1245 }
1246
1247 int RGWPeriod::delete_obj()
1248 {
1249 rgw_pool pool(get_pool(cct));
1250
1251 // delete the object for each period epoch
1252 for (epoch_t e = 1; e <= epoch; e++) {
1253 RGWPeriod p{get_id(), e};
1254 rgw_raw_obj oid{pool, p.get_period_oid()};
1255 int ret = store->delete_system_obj(oid);
1256 if (ret < 0) {
1257 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1258 << ": " << cpp_strerror(-ret) << dendl;
1259 }
1260 }
1261
1262 // delete the .latest_epoch object
1263 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1264 int ret = store->delete_system_obj(oid);
1265 if (ret < 0) {
1266 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1267 << ": " << cpp_strerror(-ret) << dendl;
1268 }
1269 return ret;
1270 }
1271
1272 int RGWPeriod::read_info()
1273 {
1274 rgw_pool pool(get_pool(cct));
1275
1276 bufferlist bl;
1277
1278 RGWObjectCtx obj_ctx(store);
1279 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1280 if (ret < 0) {
1281 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1282 return ret;
1283 }
1284
1285 try {
1286 bufferlist::iterator iter = bl.begin();
1287 ::decode(*this, iter);
1288 } catch (buffer::error& err) {
1289 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1290 return -EIO;
1291 }
1292
1293 return 0;
1294 }
1295
1296 int RGWPeriod::create(bool exclusive)
1297 {
1298 int ret;
1299
1300 /* create unique id */
1301 uuid_d new_uuid;
1302 char uuid_str[37];
1303 new_uuid.generate_random();
1304 new_uuid.print(uuid_str);
1305 id = uuid_str;
1306
1307 epoch = FIRST_EPOCH;
1308
1309 period_map.id = id;
1310
1311 ret = store_info(exclusive);
1312 if (ret < 0) {
1313 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1314 return ret;
1315 }
1316
1317 ret = set_latest_epoch(epoch);
1318 if (ret < 0) {
1319 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1320 }
1321
1322 return ret;
1323 }
1324
1325 int RGWPeriod::store_info(bool exclusive)
1326 {
1327 rgw_pool pool(get_pool(cct));
1328
1329 string oid = get_period_oid();
1330 bufferlist bl;
1331 ::encode(*this, bl);
1332
1333 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1334 exclusive, NULL, real_time(), NULL);
1335 }
1336
1337 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1338 {
1339 if (cct->_conf->rgw_period_root_pool.empty()) {
1340 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1341 }
1342 return rgw_pool(cct->_conf->rgw_period_root_pool);
1343 }
1344
1345 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1346 {
1347 if (zonegroup.realm_id != realm_id) {
1348 return 0;
1349 }
1350 int ret = period_map.update(zonegroup, cct);
1351 if (ret < 0) {
1352 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1353 return ret;
1354 }
1355
1356 return store_info(false);
1357 }
1358
1359 int RGWPeriod::update()
1360 {
1361 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1362 list<string> zonegroups;
1363 int ret = store->list_zonegroups(zonegroups);
1364 if (ret < 0) {
1365 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1366 return ret;
1367 }
1368
1369 // clear zone short ids of removed zones. period_map.update() will add the
1370 // remaining zones back
1371 period_map.short_zone_ids.clear();
1372
1373 for (auto& iter : zonegroups) {
1374 RGWZoneGroup zg(string(), iter);
1375 ret = zg.init(cct, store);
1376 if (ret < 0) {
1377 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1378 continue;
1379 }
1380
1381 if (zg.realm_id != realm_id) {
1382 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1383 continue;
1384 }
1385
1386 if (zg.master_zone.empty()) {
1387 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1388 return -EINVAL;
1389 }
1390
1391 if (zg.is_master_zonegroup()) {
1392 master_zonegroup = zg.get_id();
1393 master_zone = zg.master_zone;
1394 }
1395
1396 int ret = period_map.update(zg, cct);
1397 if (ret < 0) {
1398 return ret;
1399 }
1400 }
1401
1402 ret = period_config.read(store, realm_id);
1403 if (ret < 0 && ret != -ENOENT) {
1404 ldout(cct, 0) << "ERROR: failed to read period config: "
1405 << cpp_strerror(ret) << dendl;
1406 return ret;
1407 }
1408 return 0;
1409 }
1410
1411 int RGWPeriod::reflect()
1412 {
1413 for (auto& iter : period_map.zonegroups) {
1414 RGWZoneGroup& zg = iter.second;
1415 zg.reinit_instance(cct, store);
1416 int r = zg.write(false);
1417 if (r < 0) {
1418 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1419 return r;
1420 }
1421 if (zg.is_master_zonegroup()) {
1422 // set master as default if no default exists
1423 r = zg.set_as_default(true);
1424 if (r == 0) {
1425 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1426 << " as the default" << dendl;
1427 }
1428 }
1429 }
1430
1431 int r = period_config.write(store, realm_id);
1432 if (r < 0) {
1433 ldout(cct, 0) << "ERROR: failed to store period config: "
1434 << cpp_strerror(-r) << dendl;
1435 return r;
1436 }
1437 return 0;
1438 }
1439
1440 void RGWPeriod::fork()
1441 {
1442 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1443 predecessor_uuid = id;
1444 id = get_staging_id(realm_id);
1445 period_map.reset();
1446 realm_epoch++;
1447 }
1448
1449 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1450 {
1451 // initialize a sync status manager to read the status
1452 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1453 int r = mgr.init();
1454 if (r < 0) {
1455 return r;
1456 }
1457 r = mgr.read_sync_status(sync_status);
1458 mgr.stop();
1459 return r;
1460 }
1461
1462 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1463 std::ostream& error_stream,
1464 bool force_if_stale)
1465 {
1466 rgw_meta_sync_status status;
1467 int r = read_sync_status(store, &status);
1468 if (r < 0) {
1469 ldout(cct, 0) << "period failed to read sync status: "
1470 << cpp_strerror(-r) << dendl;
1471 return r;
1472 }
1473
1474 std::vector<std::string> markers;
1475
1476 const auto current_epoch = current_period.get_realm_epoch();
1477 if (current_epoch != status.sync_info.realm_epoch) {
1478 // no sync status markers for the current period
1479 assert(current_epoch > status.sync_info.realm_epoch);
1480 const int behind = current_epoch - status.sync_info.realm_epoch;
1481 if (!force_if_stale && current_epoch > 1) {
1482 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1483 "the current master zone in metadata sync. If this zone is promoted "
1484 "to master, any metadata changes during that time are likely to "
1485 "be lost.\n"
1486 "Waiting for this zone to catch up on metadata sync (see "
1487 "'radosgw-admin sync status') is recommended.\n"
1488 "To promote this zone to master anyway, add the flag "
1489 "--yes-i-really-mean-it." << std::endl;
1490 return -EINVAL;
1491 }
1492 // empty sync status markers - other zones will skip this period during
1493 // incremental metadata sync
1494 markers.resize(status.sync_info.num_shards);
1495 } else {
1496 markers.reserve(status.sync_info.num_shards);
1497 for (auto& i : status.sync_markers) {
1498 auto& marker = i.second;
1499 // filter out markers from other periods
1500 if (marker.realm_epoch != current_epoch) {
1501 marker.marker.clear();
1502 }
1503 markers.emplace_back(std::move(marker.marker));
1504 }
1505 }
1506
1507 std::swap(sync_status, markers);
1508 return 0;
1509 }
1510
// Commit this (staging) period on top of current_period.
//
// Preconditions checked first, each failing with -EINVAL and an explanation
// on error_stream: the local zone must be this period's master zone, the
// predecessor must be current_period, and realm_epoch must be exactly one
// past current_period's.
//
// Two outcomes follow:
//  - master zone changed: the metadata sync status is captured (see
//    update_sync_status(); force_if_stale is forwarded to it), a period with
//    a new id is created and made the realm's current period.
//  - master zone unchanged: this period is stored as the next epoch of
//    current_period, recorded as the latest epoch and reflected into the
//    local objects.
// In both cases the realm is notified of the new period on success.
// Returns 0 on success or a negative error.
int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
                      std::ostream& error_stream, bool force_if_stale)
{
  ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
  // gateway must be in the master zone to commit
  if (master_zone != store->get_zone_params().get_id()) {
    error_stream << "Cannot commit period on zone "
        << store->get_zone_params().get_id() << ", it must be sent to "
        "the period's master zone " << master_zone << '.' << std::endl;
    return -EINVAL;
  }
  // period predecessor must match current period
  if (predecessor_uuid != current_period.get_id()) {
    error_stream << "Period predecessor " << predecessor_uuid
        << " does not match current period " << current_period.get_id()
        << ". Use 'period pull' to get the latest period from the master, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // realm epoch must be 1 greater than current period
  if (realm_epoch != current_period.get_realm_epoch() + 1) {
    error_stream << "Period's realm epoch " << realm_epoch
        << " does not come directly after current realm epoch "
        << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
        "latest realm and period from the master zone, reapply your changes, "
        "and try again." << std::endl;
    return -EINVAL;
  }
  // did the master zone change?
  if (master_zone != current_period.get_master_zone()) {
    // store the current metadata sync status in the period
    int r = update_sync_status(current_period, error_stream, force_if_stale);
    if (r < 0) {
      ldout(cct, 0) << "failed to update metadata sync status: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    // create an object with a new period id
    r = create(true);
    if (r < 0) {
      ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
      return r;
    }
    // set as current period
    r = realm.set_current_period(*this);
    if (r < 0) {
      ldout(cct, 0) << "failed to update realm's current period: "
          << cpp_strerror(-r) << dendl;
      return r;
    }
    ldout(cct, 4) << "Promoted to master zone and committed new period "
        << id << dendl;
    realm.notify_new_period(*this);
    return 0;
  }
  // period must be based on current epoch
  if (epoch != current_period.get_epoch()) {
    error_stream << "Period epoch " << epoch << " does not match "
        "predecessor epoch " << current_period.get_epoch()
        << ". Use 'period pull' to get the latest epoch from the master zone, "
        "reapply your changes, and try again." << std::endl;
    return -EINVAL;
  }
  // set period as next epoch
  // (from here on this object takes over the identity of current_period,
  // one epoch later)
  set_id(current_period.get_id());
  set_epoch(current_period.get_epoch() + 1);
  set_predecessor(current_period.get_predecessor());
  realm_epoch = current_period.get_realm_epoch();
  // write the period to rados
  int r = store_info(false);
  if (r < 0) {
    ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
    return r;
  }
  // set as latest epoch
  r = update_latest_epoch(epoch);
  if (r == -EEXIST) {
    // already have this epoch (or a more recent one)
    // NOTE: this returns success without calling reflect() or notifying
    return 0;
  }
  if (r < 0) {
    ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
    return r;
  }
  r = reflect();
  if (r < 0) {
    ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
    return r;
  }
  ldout(cct, 4) << "Committed new epoch " << epoch
      << " for period " << id << dendl;
  realm.notify_new_period(*this);
  return 0;
}
1605
1606 int RGWZoneParams::create_default(bool old_format)
1607 {
1608 name = default_zone_name;
1609
1610 int r = create();
1611 if (r < 0) {
1612 return r;
1613 }
1614
1615 if (old_format) {
1616 name = id;
1617 }
1618
1619 return r;
1620 }
1621
1622
1623 int get_zones_pool_set(CephContext* cct,
1624 RGWRados* store,
1625 const list<string>& zones,
1626 const string& my_zone_id,
1627 set<rgw_pool>& pool_names)
1628 {
1629 for(auto const& iter : zones) {
1630 RGWZoneParams zone(iter);
1631 int r = zone.init(cct, store);
1632 if (r < 0) {
1633 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1634 return r;
1635 }
1636 if (zone.get_id() != my_zone_id) {
1637 pool_names.insert(zone.domain_root);
1638 pool_names.insert(zone.metadata_heap);
1639 pool_names.insert(zone.control_pool);
1640 pool_names.insert(zone.gc_pool);
1641 pool_names.insert(zone.log_pool);
1642 pool_names.insert(zone.intent_log_pool);
1643 pool_names.insert(zone.usage_log_pool);
1644 pool_names.insert(zone.user_keys_pool);
1645 pool_names.insert(zone.user_email_pool);
1646 pool_names.insert(zone.user_swift_pool);
1647 pool_names.insert(zone.user_uid_pool);
1648 pool_names.insert(zone.roles_pool);
1649 pool_names.insert(zone.reshard_pool);
1650 for(auto& iter : zone.placement_pools) {
1651 pool_names.insert(iter.second.index_pool);
1652 pool_names.insert(iter.second.data_pool);
1653 pool_names.insert(iter.second.data_extra_pool);
1654 }
1655 }
1656 }
1657 return 0;
1658 }
1659
1660 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1661 const string& default_prefix,
1662 const string& default_suffix,
1663 const rgw_pool& suggested_pool)
1664 {
1665 string suggested_name = suggested_pool.to_str();
1666
1667 string prefix = default_prefix;
1668 string suffix = default_suffix;
1669
1670 if (!suggested_pool.empty()) {
1671 prefix = suggested_name.substr(0, suggested_name.find("."));
1672 suffix = suggested_name.substr(prefix.length());
1673 }
1674
1675 rgw_pool pool(prefix + suffix);
1676
1677 if (pools.find(pool) == pools.end()) {
1678 return pool;
1679 } else {
1680 while(true) {
1681 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1682 if (pools.find(pool) == pools.end()) {
1683 return pool;
1684 }
1685 }
1686 }
1687 }
1688
1689 int RGWZoneParams::fix_pool_names()
1690 {
1691
1692 list<string> zones;
1693 int r = store->list_zones(zones);
1694 if (r < 0) {
1695 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1696 }
1697
1698 set<rgw_pool> pools;
1699 r = get_zones_pool_set(cct, store, zones, id, pools);
1700 if (r < 0) {
1701 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1702 return r;
1703 }
1704
1705 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1706 if (!metadata_heap.name.empty()) {
1707 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1708 }
1709 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1710 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1711 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1712 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1713 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1714 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1715 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1716 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1717 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1718 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1719 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1720 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1721
1722 for(auto& iter : placement_pools) {
1723 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1724 iter.second.index_pool);
1725 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1726 iter.second.data_pool);
1727 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1728 iter.second.data_extra_pool);
1729 }
1730
1731 return 0;
1732 }
1733
1734 int RGWZoneParams::create(bool exclusive)
1735 {
1736 /* check for old pools config */
1737 rgw_raw_obj obj(domain_root, avail_pools);
1738 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1739 if (r < 0) {
1740 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1741 /* a new system, let's set new placement info */
1742 RGWZonePlacementInfo default_placement;
1743 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1744 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1745 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1746 placement_pools["default-placement"] = default_placement;
1747 }
1748
1749 r = fix_pool_names();
1750 if (r < 0) {
1751 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1752 return r;
1753 }
1754
1755 r = RGWSystemMetaObj::create(exclusive);
1756 if (r < 0) {
1757 return r;
1758 }
1759
1760 // try to set as default. may race with another create, so pass exclusive=true
1761 // so we don't override an existing default
1762 r = set_as_default(true);
1763 if (r < 0 && r != -EEXIST) {
1764 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1765 }
1766
1767 return 0;
1768 }
1769
1770 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1771 {
1772 if (cct->_conf->rgw_zone_root_pool.empty()) {
1773 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1774 }
1775
1776 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1777 }
1778
1779 const string RGWZoneParams::get_default_oid(bool old_format)
1780 {
1781 if (old_format) {
1782 return cct->_conf->rgw_default_zone_info_oid;
1783 }
1784
1785 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1786 }
1787
1788 const string& RGWZoneParams::get_names_oid_prefix()
1789 {
1790 return zone_names_oid_prefix;
1791 }
1792
1793 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1794 {
1795 return zone_info_oid_prefix;
1796 }
1797
1798 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1799 return cct->_conf->rgw_zone;
1800 }
1801
1802 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1803 {
1804 if (name.empty()) {
1805 name = cct->_conf->rgw_zone;
1806 }
1807
1808 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1809 }
1810
1811 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1812 {
1813 if (realm_id.empty()) {
1814 /* try using default realm */
1815 RGWRealm realm;
1816 int ret = realm.init(cct, store);
1817 //no default realm exist
1818 if (ret < 0) {
1819 return read_id(default_zone_name, default_id);
1820 }
1821 realm_id = realm.get_id();
1822 }
1823
1824 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1825 }
1826
1827
1828 int RGWZoneParams::set_as_default(bool exclusive)
1829 {
1830 if (realm_id.empty()) {
1831 /* try using default realm */
1832 RGWRealm realm;
1833 int ret = realm.init(cct, store);
1834 if (ret < 0) {
1835 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1836 return -EINVAL;
1837 }
1838 realm_id = realm.get_id();
1839 }
1840
1841 return RGWSystemMetaObj::set_as_default(exclusive);
1842 }
1843
1844 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1845 {
1846 static const std::string NONE{"none"};
1847 auto p = placement_pools.find(placement_rule);
1848 if (p == placement_pools.end()) {
1849 return NONE;
1850 }
1851 const auto& type = p->second.compression_type;
1852 return !type.empty() ? type : NONE;
1853 }
1854
// Serialize the period map into bl (struct version 2, compat version 1).
// Field order must stay in sync with RGWPeriodMap::decode().
void RGWPeriodMap::encode(bufferlist& bl) const {
  ENCODE_START(2, 1, bl);
  ::encode(id, bl);
  ::encode(zonegroups, bl);
  ::encode(master_zonegroup, bl);
  // short_zone_ids is only decoded when struct_v >= 2
  ::encode(short_zone_ids, bl);
  ENCODE_FINISH(bl);
}
1863
1864 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1865 DECODE_START(2, bl);
1866 ::decode(id, bl);
1867 ::decode(zonegroups, bl);
1868 ::decode(master_zonegroup, bl);
1869 if (struct_v >= 2) {
1870 ::decode(short_zone_ids, bl);
1871 }
1872 DECODE_FINISH(bl);
1873
1874 zonegroups_by_api.clear();
1875 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1876 iter != zonegroups.end(); ++iter) {
1877 RGWZoneGroup& zonegroup = iter->second;
1878 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1879 if (zonegroup.is_master_zonegroup()) {
1880 master_zonegroup = zonegroup.get_id();
1881 }
1882 }
1883 }
1884
1885 // run an MD5 hash on the zone_id and return the first 32 bits
1886 static uint32_t gen_short_zone_id(const std::string zone_id)
1887 {
1888 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1889 MD5 hash;
1890 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1891 hash.Final(md5);
1892
1893 uint32_t short_id;
1894 memcpy((char *)&short_id, md5, sizeof(short_id));
1895 return std::max(short_id, 1u);
1896 }
1897
1898 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1899 {
1900 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1901 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1902 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1903 return -EINVAL;
1904 }
1905 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1906 if (iter != zonegroups.end()) {
1907 RGWZoneGroup& old_zonegroup = iter->second;
1908 if (!old_zonegroup.api_name.empty()) {
1909 zonegroups_by_api.erase(old_zonegroup.api_name);
1910 }
1911 }
1912 zonegroups[zonegroup.get_id()] = zonegroup;
1913
1914 if (!zonegroup.api_name.empty()) {
1915 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1916 }
1917
1918 if (zonegroup.is_master_zonegroup()) {
1919 master_zonegroup = zonegroup.get_id();
1920 } else if (master_zonegroup == zonegroup.get_id()) {
1921 master_zonegroup = "";
1922 }
1923
1924 for (auto& i : zonegroup.zones) {
1925 auto& zone = i.second;
1926 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1927 continue;
1928 }
1929 // calculate the zone's short id
1930 uint32_t short_id = gen_short_zone_id(zone.id);
1931
1932 // search for an existing zone with the same short id
1933 for (auto& s : short_zone_ids) {
1934 if (s.second == short_id) {
1935 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1936 << ") generates the same short_zone_id " << short_id
1937 << " as existing zone id " << s.first << dendl;
1938 return -EEXIST;
1939 }
1940 }
1941
1942 short_zone_ids[zone.id] = short_id;
1943 }
1944
1945 return 0;
1946 }
1947
1948 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1949 {
1950 auto i = short_zone_ids.find(zone_id);
1951 if (i == short_zone_ids.end()) {
1952 return 0;
1953 }
1954 return i->second;
1955 }
1956
1957 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1958 {
1959
1960 RGWPeriod period;
1961 int ret = period.init(cct, store);
1962 if (ret < 0) {
1963 cerr << "failed to read current period info: " << cpp_strerror(ret);
1964 return ret;
1965 }
1966
1967 bucket_quota = period.get_config().bucket_quota;
1968 user_quota = period.get_config().user_quota;
1969 zonegroups = period.get_map().zonegroups;
1970 zonegroups_by_api = period.get_map().zonegroups_by_api;
1971 master_zonegroup = period.get_map().master_zonegroup;
1972
1973 return 0;
1974 }
1975
// Serialize the region map into bl (struct version 3, compat version 1).
// Field order must stay in sync with RGWRegionMap::decode().
void RGWRegionMap::encode(bufferlist& bl) const {
  ENCODE_START( 3, 1, bl);
  ::encode(regions, bl);
  ::encode(master_region, bl);
  ::encode(bucket_quota, bl);
  ::encode(user_quota, bl);
  ENCODE_FINISH(bl);
}
1984
// Deserialize the region map from bl. The quota fields are only decoded
// when the encoded struct version is new enough to carry them.
void RGWRegionMap::decode(bufferlist::iterator& bl) {
  DECODE_START(3, bl);
  ::decode(regions, bl);
  ::decode(master_region, bl);
  // bucket_quota is decoded from struct version 2 on
  if (struct_v >= 2)
    ::decode(bucket_quota, bl);
  // user_quota is decoded from struct version 3 on
  if (struct_v >= 3)
    ::decode(user_quota, bl);
  DECODE_FINISH(bl);
}
1995
// Serialize the zonegroup map into bl (struct version 3, compat version 1).
// zonegroups_by_api is not encoded here; RGWZoneGroupMap::decode() rebuilds
// it. Field order must stay in sync with decode().
void RGWZoneGroupMap::encode(bufferlist& bl) const {
  ENCODE_START( 3, 1, bl);
  ::encode(zonegroups, bl);
  ::encode(master_zonegroup, bl);
  ::encode(bucket_quota, bl);
  ::encode(user_quota, bl);
  ENCODE_FINISH(bl);
}
2004
2005 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
2006 DECODE_START(3, bl);
2007 ::decode(zonegroups, bl);
2008 ::decode(master_zonegroup, bl);
2009 if (struct_v >= 2)
2010 ::decode(bucket_quota, bl);
2011 if (struct_v >= 3)
2012 ::decode(user_quota, bl);
2013 DECODE_FINISH(bl);
2014
2015 zonegroups_by_api.clear();
2016 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2017 iter != zonegroups.end(); ++iter) {
2018 RGWZoneGroup& zonegroup = iter->second;
2019 zonegroups_by_api[zonegroup.api_name] = zonegroup;
2020 if (zonegroup.is_master_zonegroup()) {
2021 master_zonegroup = zonegroup.get_name();
2022 }
2023 }
2024 }
2025
2026 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2027 {
2028 obj_version *check_objv = version_for_check();
2029
2030 if (check_objv) {
2031 cls_version_check(*op, *check_objv, VER_COND_EQ);
2032 }
2033
2034 cls_version_read(*op, &read_version);
2035 }
2036
2037 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2038 {
2039 obj_version *check_objv = version_for_check();
2040 obj_version *modify_version = version_for_write();
2041
2042 if (check_objv) {
2043 cls_version_check(*op, *check_objv, VER_COND_EQ);
2044 }
2045
2046 if (modify_version) {
2047 cls_version_set(*op, *modify_version);
2048 } else {
2049 cls_version_inc(*op);
2050 }
2051 }
2052
// Advance the iterator to the next object (stripe) of the manifest.
//
// Explicit-object manifests simply step through manifest->objs. Rule-based
// manifests advance by one stripe; when the stripe offset runs past the
// current part (rule->part_size > 0), the iterator moves to the next part
// and, if that part starts at or beyond the next rule's start_ofs, to the
// next rule. ofs is clamped to the object size, which marks the end.
void RGWObjManifest::obj_iterator::operator++()
{
  if (manifest->explicit_objs) {
    ++explicit_iter;

    if (explicit_iter == manifest->objs.end()) {
      // past the last explicit object: park at end-of-object
      ofs = manifest->obj_size;
      return;
    }

    update_explicit_pos();

    update_location();
    return;
  }

  uint64_t obj_size = manifest->get_obj_size();
  uint64_t head_size = manifest->get_head_size();

  // already at the end: nothing to advance
  if (ofs == obj_size) {
    return;
  }

  // no rules: there is nothing to iterate over beyond the current position
  if (manifest->rules.empty()) {
    return;
  }

  /* are we still pointing at the head? */
  if (ofs < head_size) {
    // step from the head to the first stripe, using the first rule
    rule_iter = manifest->rules.begin();
    RGWObjManifestRule *rule = &rule_iter->second;
    ofs = MIN(head_size, obj_size);
    stripe_ofs = ofs;
    cur_stripe = 1;
    stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
    if (rule->part_size > 0) {
      stripe_size = MIN(stripe_size, rule->part_size);
    }
    update_location();
    return;
  }

  RGWObjManifestRule *rule = &rule_iter->second;

  // advance by one full stripe within the current rule
  stripe_ofs += rule->stripe_max_size;
  cur_stripe++;
  dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;

  if (rule->part_size > 0) {
    /* multi part, multi stripes object */

    dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;

    if (stripe_ofs >= part_ofs + rule->part_size) {
      /* moved to the next part */
      cur_stripe = 0;
      part_ofs += rule->part_size;
      stripe_ofs = part_ofs;

      bool last_rule = (next_rule_iter == manifest->rules.end());
      /* move to the next rule? */
      if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
        rule_iter = next_rule_iter;
        // NOTE(review): next_rule_iter has not changed since last_rule was
        // computed above, so this re-evaluation yields the same value (false
        // on this branch) and next_rule_iter is always advanced here.
        last_rule = (next_rule_iter == manifest->rules.end());
        if (!last_rule) {
          ++next_rule_iter;
        }
        cur_part_id = rule_iter->second.start_part_num;
      } else {
        cur_part_id++;
      }

      // rule_iter may have moved; refresh the cached rule pointer
      rule = &rule_iter->second;
    }

    // a stripe never extends past the end of its part
    stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
  }

  cur_override_prefix = rule->override_prefix;

  ofs = stripe_ofs;
  if (ofs > obj_size) {
    // ran past the object: clamp to the end position
    ofs = obj_size;
    stripe_ofs = ofs;
    stripe_size = 0;
  }

  dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
  update_location();
}
2143
2144 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2145 {
2146 manifest = _m;
2147
2148 manifest->set_tail_placement(placement_rule, _b);
2149 manifest->set_head(placement_rule, _obj, 0);
2150 last_ofs = 0;
2151
2152 if (manifest->get_prefix().empty()) {
2153 char buf[33];
2154 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2155
2156 string oid_prefix = ".";
2157 oid_prefix.append(buf);
2158 oid_prefix.append("_");
2159
2160 manifest->set_prefix(oid_prefix);
2161 }
2162
2163 bool found = manifest->get_rule(0, &rule);
2164 if (!found) {
2165 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2166 return -EIO;
2167 }
2168
2169 uint64_t head_size = manifest->get_head_size();
2170
2171 if (head_size > 0) {
2172 cur_stripe_size = head_size;
2173 } else {
2174 cur_stripe_size = rule.stripe_max_size;
2175 }
2176
2177 cur_part_id = rule.start_part_num;
2178
2179 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2180
2181 // Normal object which not generated through copy operation
2182 manifest->set_tail_instance(_obj.key.instance);
2183
2184 manifest->update_iterators();
2185
2186 return 0;
2187 }
2188
2189 int RGWObjManifest::generator::create_next(uint64_t ofs)
2190 {
2191 if (ofs < last_ofs) /* only going forward */
2192 return -EINVAL;
2193
2194 uint64_t max_head_size = manifest->get_max_head_size();
2195
2196 if (ofs < max_head_size) {
2197 manifest->set_head_size(ofs);
2198 }
2199
2200 if (ofs >= max_head_size) {
2201 manifest->set_head_size(max_head_size);
2202 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2203 cur_stripe_size = rule.stripe_max_size;
2204
2205 if (cur_part_id == 0 && max_head_size > 0) {
2206 cur_stripe++;
2207 }
2208 }
2209
2210 last_ofs = ofs;
2211 manifest->set_obj_size(ofs);
2212
2213 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2214
2215 manifest->update_iterators();
2216
2217 return 0;
2218 }
2219
2220 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2221 {
2222 return begin_iter;
2223 }
2224
2225 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2226 {
2227 return end_iter;
2228 }
2229
2230 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2231 {
2232 if (ofs > obj_size) {
2233 ofs = obj_size;
2234 }
2235 RGWObjManifest::obj_iterator iter(this);
2236 iter.seek(ofs);
2237 return iter;
2238 }
2239
2240 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2241 {
2242 if (explicit_objs || m.explicit_objs) {
2243 return append_explicit(m, zonegroup, zone_params);
2244 }
2245
2246 if (rules.empty()) {
2247 *this = m;
2248 return 0;
2249 }
2250
2251 string override_prefix;
2252
2253 if (prefix.empty()) {
2254 prefix = m.prefix;
2255 }
2256
2257 if (prefix != m.prefix) {
2258 override_prefix = m.prefix;
2259 }
2260
2261 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2262 if (miter == m.rules.end()) {
2263 return append_explicit(m, zonegroup, zone_params);
2264 }
2265
2266 for (; miter != m.rules.end(); ++miter) {
2267 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2268
2269 RGWObjManifestRule& rule = last_rule->second;
2270
2271 if (rule.part_size == 0) {
2272 rule.part_size = obj_size - rule.start_ofs;
2273 }
2274
2275 RGWObjManifestRule& next_rule = miter->second;
2276 if (!next_rule.part_size) {
2277 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2278 }
2279
2280 string rule_prefix = prefix;
2281 if (!rule.override_prefix.empty()) {
2282 rule_prefix = rule.override_prefix;
2283 }
2284
2285 string next_rule_prefix = m.prefix;
2286 if (!next_rule.override_prefix.empty()) {
2287 next_rule_prefix = next_rule.override_prefix;
2288 }
2289
2290 if (rule.part_size != next_rule.part_size ||
2291 rule.stripe_max_size != next_rule.stripe_max_size ||
2292 rule_prefix != next_rule_prefix) {
2293 if (next_rule_prefix != prefix) {
2294 append_rules(m, miter, &next_rule_prefix);
2295 } else {
2296 append_rules(m, miter, NULL);
2297 }
2298 break;
2299 }
2300
2301 uint64_t expected_part_num = rule.start_part_num + 1;
2302 if (rule.part_size > 0) {
2303 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2304 }
2305
2306 if (expected_part_num != next_rule.start_part_num) {
2307 append_rules(m, miter, NULL);
2308 break;
2309 }
2310 }
2311
2312 set_obj_size(obj_size + m.obj_size);
2313
2314 return 0;
2315 }
2316
2317 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2318 {
2319 return append(m, store->get_zonegroup(), store->get_zone_params());
2320 }
2321
2322 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2323 string *override_prefix)
2324 {
2325 for (; miter != m.rules.end(); ++miter) {
2326 RGWObjManifestRule rule = miter->second;
2327 rule.start_ofs += obj_size;
2328 if (override_prefix)
2329 rule.override_prefix = *override_prefix;
2330 rules[rule.start_ofs] = rule;
2331 }
2332 }
2333
2334 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2335 {
2336 if (explicit_objs) {
2337 return;
2338 }
2339 obj_iterator iter = obj_begin();
2340
2341 while (iter != obj_end()) {
2342 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2343 const rgw_obj_select& os = iter.get_location();
2344 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2345 part.loc_ofs = 0;
2346
2347 uint64_t ofs = iter.get_stripe_ofs();
2348
2349 if (ofs == 0) {
2350 part.loc = obj;
2351 } else {
2352 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2353 }
2354 ++iter;
2355 uint64_t next_ofs = iter.get_stripe_ofs();
2356
2357 part.size = next_ofs - ofs;
2358 }
2359
2360 explicit_objs = true;
2361 rules.clear();
2362 prefix.clear();
2363 }
2364
2365 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2366 {
2367 if (!explicit_objs) {
2368 convert_to_explicit(zonegroup, zone_params);
2369 }
2370 if (!m.explicit_objs) {
2371 m.convert_to_explicit(zonegroup, zone_params);
2372 }
2373 map<uint64_t, RGWObjManifestPart>::iterator iter;
2374 uint64_t base = obj_size;
2375 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2376 RGWObjManifestPart& part = iter->second;
2377 objs[base + iter->first] = part;
2378 }
2379 obj_size += m.obj_size;
2380
2381 return 0;
2382 }
2383
2384 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2385 {
2386 if (rules.empty()) {
2387 return false;
2388 }
2389
2390 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2391 if (iter != rules.begin()) {
2392 --iter;
2393 }
2394
2395 *rule = iter->second;
2396
2397 return true;
2398 }
2399
2400 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2401 {
2402 write_version.ver = 1;
2403 #define TAG_LEN 24
2404
2405 write_version.tag.clear();
2406 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2407 }
2408
2409 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2410 real_time *mtime, real_time set_mtime,
2411 map<string, bufferlist>& attrs, real_time delete_at,
2412 const char *if_match, const char *if_nomatch, const string *user_data,
2413 rgw_zone_set *zones_trace)
2414 {
2415 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2416 if (r < 0)
2417 return r;
2418
2419 is_complete = !canceled;
2420 return 0;
2421 }
2422
2423 CephContext *RGWPutObjProcessor::ctx()
2424 {
2425 return store->ctx();
2426 }
2427
2428 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2429 {
2430 drain_pending();
2431
2432 if (is_complete)
2433 return;
2434
2435 set<rgw_raw_obj>::iterator iter;
2436 bool need_to_remove_head = false;
2437 rgw_raw_obj raw_head;
2438
2439 if (!head_obj.empty()) {
2440 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2441 }
2442
2443 /**
2444 * We should delete the object in the "multipart" namespace to avoid race condition.
2445 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2446 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2447 * written by the second upload may be deleted by the first upload.
2448 * details is describled on #11749
2449 *
2450 * The above comment still stands, but instead of searching for a specific object in the multipart
2451 * namespace, we just make sure that we remove the object that is marked as the head object after
2452 * we remove all the other raw objects. Note that we use different call to remove the head object,
2453 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2454 */
2455 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2456 const rgw_raw_obj& obj = *iter;
2457 if (!head_obj.empty() && obj == raw_head) {
2458 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2459 need_to_remove_head = true;
2460 continue;
2461 }
2462
2463 int r = store->delete_raw_obj(obj);
2464 if (r < 0 && r != -ENOENT) {
2465 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2466 }
2467 }
2468
2469 if (need_to_remove_head) {
2470 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2471 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2472 if (r < 0 && r != -ENOENT) {
2473 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2474 }
2475 }
2476 }
2477
2478 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2479 {
2480 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2481 obj_len = abs_ofs + bl.length();
2482
2483 if (!(obj == last_written_obj)) {
2484 last_written_obj = obj;
2485 }
2486
2487 // For the first call pass -1 as the offset to
2488 // do a write_full.
2489 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2490 }
2491
2492 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2493 {
2494 struct put_obj_aio_info info;
2495 info = pending.front();
2496 pending.pop_front();
2497 pending_size -= info.size;
2498 return info;
2499 }
2500
2501 int RGWPutObjProcessor_Aio::wait_pending_front()
2502 {
2503 if (pending.empty()) {
2504 return 0;
2505 }
2506 struct put_obj_aio_info info = pop_pending();
2507 int ret = store->aio_wait(info.handle);
2508
2509 if (ret >= 0) {
2510 add_written_obj(info.obj);
2511 }
2512
2513 return ret;
2514 }
2515
2516 bool RGWPutObjProcessor_Aio::pending_has_completed()
2517 {
2518 if (pending.empty())
2519 return false;
2520
2521 struct put_obj_aio_info& info = pending.front();
2522 return store->aio_completed(info.handle);
2523 }
2524
2525 int RGWPutObjProcessor_Aio::drain_pending()
2526 {
2527 int ret = 0;
2528 while (!pending.empty()) {
2529 int r = wait_pending_front();
2530 if (r < 0)
2531 ret = r;
2532 }
2533 return ret;
2534 }
2535
2536 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2537 {
2538 bool _wait = need_to_wait;
2539
2540 if (handle) {
2541 struct put_obj_aio_info info;
2542 info.handle = handle;
2543 info.obj = obj;
2544 info.size = size;
2545 pending_size += size;
2546 pending.push_back(info);
2547 }
2548 size_t orig_size = pending_size;
2549
2550 /* first drain complete IOs */
2551 while (pending_has_completed()) {
2552 int r = wait_pending_front();
2553 if (r < 0)
2554 return r;
2555
2556 _wait = false;
2557 }
2558
2559 /* resize window in case messages are draining too fast */
2560 if (orig_size - pending_size >= window_size) {
2561 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2562 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2563 if (window_size > max_window_size) {
2564 window_size = max_window_size;
2565 }
2566 }
2567
2568 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2569 if (pending_size > window_size || _wait) {
2570 int r = wait_pending_front();
2571 if (r < 0)
2572 return r;
2573 }
2574 return 0;
2575 }
2576
2577 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2578 {
2579 if (ofs >= next_part_ofs) {
2580 int r = prepare_next_part(ofs);
2581 if (r < 0) {
2582 return r;
2583 }
2584 }
2585
2586 *pobj = cur_obj;
2587
2588 if (!bl.length()) {
2589 *phandle = nullptr;
2590 return 0;
2591 }
2592
2593 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2594 }
2595
2596 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2597 {
2598 RGWPutObjProcessor::prepare(store, oid_rand);
2599
2600 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2601
2602 return 0;
2603 }
2604
2605 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2606 {
2607 *phandle = NULL;
2608 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2609
2610 pending_data_bl.claim_append(bl);
2611 if (pending_data_bl.length() < max_write_size) {
2612 *again = false;
2613 return 0;
2614 }
2615
2616 pending_data_bl.splice(0, max_write_size, &bl);
2617
2618 /* do we have enough data pending accumulated that needs to be written? */
2619 *again = (pending_data_bl.length() >= max_chunk_size);
2620
2621 if (!data_ofs && !immutable_head()) {
2622 first_chunk.claim(bl);
2623 obj_len = (uint64_t)first_chunk.length();
2624 int r = prepare_next_part(obj_len);
2625 if (r < 0) {
2626 return r;
2627 }
2628 data_ofs = obj_len;
2629 return 0;
2630 }
2631 off_t write_ofs = data_ofs;
2632 data_ofs = write_ofs + bl.length();
2633 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2634 we could be racing with another upload, to the same
2635 object and cleanup can be messy */
2636 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2637 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2638 bl.clear();
2639 }
2640 return ret;
2641 }
2642
2643
2644 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2645 {
2646 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2647
2648 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2649 if (r < 0) {
2650 return r;
2651 }
2652
2653 return 0;
2654 }
2655
2656 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2657 {
2658 head_obj.init(bucket, obj_str);
2659
2660 int r = prepare_init(store, oid_rand);
2661 if (r < 0) {
2662 return r;
2663 }
2664
2665 if (!version_id.empty()) {
2666 head_obj.key.set_instance(version_id);
2667 } else if (versioned_object) {
2668 store->gen_rand_obj_instance_name(&head_obj);
2669 }
2670
2671 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2672
2673 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2674 if (r < 0) {
2675 return r;
2676 }
2677
2678 return 0;
2679 }
2680
2681 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2682
2683 int ret = manifest_gen.create_next(ofs);
2684 if (ret < 0) {
2685 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2686 return ret;
2687 }
2688 cur_part_ofs = ofs;
2689 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2690 cur_obj = manifest_gen.get_cur_obj(store);
2691
2692 return 0;
2693 }
2694
2695 int RGWPutObjProcessor_Atomic::complete_parts()
2696 {
2697 if (obj_len > (uint64_t)cur_part_ofs) {
2698 return prepare_next_part(obj_len);
2699 }
2700 return 0;
2701 }
2702
2703 int RGWPutObjProcessor_Atomic::complete_writing_data()
2704 {
2705 if (!data_ofs && !immutable_head()) {
2706 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2707 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2708 * clobber first_chunk
2709 */
2710 if (pending_data_bl.length() > 0) {
2711 first_chunk.claim(pending_data_bl);
2712 }
2713 obj_len = (uint64_t)first_chunk.length();
2714 }
2715 while (pending_data_bl.length()) {
2716 void *handle = nullptr;
2717 rgw_raw_obj obj;
2718 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2719 if (max_write_size > pending_data_bl.length()) {
2720 max_write_size = pending_data_bl.length();
2721 }
2722 bufferlist bl;
2723 pending_data_bl.splice(0, max_write_size, &bl);
2724 uint64_t write_len = bl.length();
2725 int r = write_data(bl, data_ofs, &handle, &obj, false);
2726 if (r < 0) {
2727 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2728 return r;
2729 }
2730 data_ofs += write_len;
2731 r = throttle_data(handle, obj, write_len, false);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2734 return r;
2735 }
2736
2737 if (data_ofs >= next_part_ofs) {
2738 r = prepare_next_part(data_ofs);
2739 if (r < 0) {
2740 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2741 return r;
2742 }
2743 }
2744 }
2745 int r = complete_parts();
2746 if (r < 0) {
2747 return r;
2748 }
2749
2750 r = drain_pending();
2751 if (r < 0)
2752 return r;
2753
2754 return 0;
2755 }
2756
2757 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2758 real_time *mtime, real_time set_mtime,
2759 map<string, bufferlist>& attrs,
2760 real_time delete_at,
2761 const char *if_match,
2762 const char *if_nomatch, const string *user_data,
2763 rgw_zone_set *zones_trace) {
2764 int r = complete_writing_data();
2765 if (r < 0)
2766 return r;
2767
2768 obj_ctx.obj.set_atomic(head_obj);
2769
2770 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2771
2772 /* some object types shouldn't be versioned, e.g., multipart parts */
2773 op_target.set_versioning_disabled(!versioned_object);
2774
2775 RGWRados::Object::Write obj_op(&op_target);
2776
2777 obj_op.meta.data = &first_chunk;
2778 obj_op.meta.manifest = &manifest;
2779 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2780 obj_op.meta.if_match = if_match;
2781 obj_op.meta.if_nomatch = if_nomatch;
2782 obj_op.meta.mtime = mtime;
2783 obj_op.meta.set_mtime = set_mtime;
2784 obj_op.meta.owner = bucket_info.owner;
2785 obj_op.meta.flags = PUT_OBJ_CREATE;
2786 obj_op.meta.olh_epoch = olh_epoch;
2787 obj_op.meta.delete_at = delete_at;
2788 obj_op.meta.user_data = user_data;
2789 obj_op.meta.zones_trace = zones_trace;
2790 obj_op.meta.modify_tail = true;
2791
2792 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2793 if (r < 0) {
2794 return r;
2795 }
2796
2797 canceled = obj_op.meta.canceled;
2798
2799 return 0;
2800 }
2801
2802 const char* RGWRados::admin_commands[4][3] = {
2803 { "cache list",
2804 "cache list name=filter,type=CephString,req=false",
2805 "cache list [filter_str]: list object cache, possibly matching substrings" },
2806 { "cache inspect",
2807 "cache inspect name=target,type=CephString,req=true",
2808 "cache inspect target: print cache element" },
2809 { "cache erase",
2810 "cache erase name=target,type=CephString,req=true",
2811 "cache erase target: erase element from cache" },
2812 { "cache zap",
2813 "cache zap",
2814 "cache zap: erase all elements from cache" }
2815 };
2816
2817
2818 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2819 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2820 if (r < 0)
2821 return r;
2822 return 0;
2823 }
2824
2825 int RGWRados::unwatch(uint64_t watch_handle)
2826 {
2827 int r = control_pool_ctx.unwatch2(watch_handle);
2828 if (r < 0) {
2829 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2830 return r;
2831 }
2832 r = rados[0].watch_flush();
2833 if (r < 0) {
2834 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2835 return r;
2836 }
2837 return 0;
2838 }
2839
2840 void RGWRados::add_watcher(int i)
2841 {
2842 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2843 Mutex::Locker l(watchers_lock);
2844 watchers_set.insert(i);
2845 if (watchers_set.size() == (size_t)num_watchers) {
2846 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2847 set_cache_enabled(true);
2848 }
2849 }
2850
2851 void RGWRados::remove_watcher(int i)
2852 {
2853 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2854 Mutex::Locker l(watchers_lock);
2855 size_t orig_size = watchers_set.size();
2856 watchers_set.erase(i);
2857 if (orig_size == (size_t)num_watchers &&
2858 watchers_set.size() < orig_size) { /* actually removed */
2859 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2860 set_cache_enabled(false);
2861 }
2862 }
2863
2864 class RGWWatcher : public librados::WatchCtx2 {
2865 RGWRados *rados;
2866 int index;
2867 string oid;
2868 uint64_t watch_handle;
2869
2870 class C_ReinitWatch : public Context {
2871 RGWWatcher *watcher;
2872 public:
2873 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2874 void finish(int r) override {
2875 watcher->reinit();
2876 }
2877 };
2878 public:
2879 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2880 void handle_notify(uint64_t notify_id,
2881 uint64_t cookie,
2882 uint64_t notifier_id,
2883 bufferlist& bl) override {
2884 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2885 << " notify_id " << notify_id
2886 << " cookie " << cookie
2887 << " notifier " << notifier_id
2888 << " bl.length()=" << bl.length() << dendl;
2889 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2890
2891 bufferlist reply_bl; // empty reply payload
2892 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2893 }
2894 void handle_error(uint64_t cookie, int err) override {
2895 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2896 << " err " << cpp_strerror(err) << dendl;
2897 rados->remove_watcher(index);
2898 rados->schedule_context(new C_ReinitWatch(this));
2899 }
2900
2901 void reinit() {
2902 int ret = unregister_watch();
2903 if (ret < 0) {
2904 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2905 return;
2906 }
2907 ret = register_watch();
2908 if (ret < 0) {
2909 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2910 return;
2911 }
2912 }
2913
2914 int unregister_watch() {
2915 int r = rados->unwatch(watch_handle);
2916 if (r < 0) {
2917 return r;
2918 }
2919 rados->remove_watcher(index);
2920 return 0;
2921 }
2922
2923 int register_watch() {
2924 int r = rados->watch(oid, &watch_handle, this);
2925 if (r < 0) {
2926 return r;
2927 }
2928 rados->add_watcher(index);
2929 return 0;
2930 }
2931 };
2932
2933 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2934 RGWRados *store;
2935 RGWHTTPManager http_manager;
2936
2937 public:
2938 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2939 http_manager(store->ctx(), completion_mgr) {
2940 http_manager.set_threaded();
2941 }
2942
2943 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2944 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2945 { "notify", NULL },
2946 { NULL, NULL } };
2947
2948 list<RGWCoroutinesStack *> stacks;
2949 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2950 RGWRESTConn *conn = iter->second;
2951 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2952 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2953
2954 stacks.push_back(stack);
2955 }
2956 return run(stacks);
2957 }
2958 };
2959
2960 class RGWDataNotifierManager : public RGWCoroutinesManager {
2961 RGWRados *store;
2962 RGWHTTPManager http_manager;
2963
2964 public:
2965 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2966 http_manager(store->ctx(), completion_mgr) {
2967 http_manager.set_threaded();
2968 }
2969
2970 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2971 rgw_http_param_pair pairs[] = { { "type", "data" },
2972 { "notify", NULL },
2973 { "source-zone", store->get_zone_params().get_id().c_str() },
2974 { NULL, NULL } };
2975
2976 list<RGWCoroutinesStack *> stacks;
2977 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2978 RGWRESTConn *conn = iter->second;
2979 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2980 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2981
2982 stacks.push_back(stack);
2983 }
2984 return run(stacks);
2985 }
2986 };
2987
2988 class RGWRadosThread {
2989 class Worker : public Thread {
2990 CephContext *cct;
2991 RGWRadosThread *processor;
2992 Mutex lock;
2993 Cond cond;
2994
2995 void wait() {
2996 Mutex::Locker l(lock);
2997 cond.Wait(lock);
2998 };
2999
3000 void wait_interval(const utime_t& wait_time) {
3001 Mutex::Locker l(lock);
3002 cond.WaitInterval(lock, wait_time);
3003 }
3004
3005 public:
3006 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
3007 void *entry() override;
3008 void signal() {
3009 Mutex::Locker l(lock);
3010 cond.Signal();
3011 }
3012 };
3013
3014 Worker *worker;
3015
3016 protected:
3017 CephContext *cct;
3018 RGWRados *store;
3019
3020 std::atomic<bool> down_flag = { false };
3021
3022 string thread_name;
3023
3024 virtual uint64_t interval_msec() = 0;
3025 virtual void stop_process() {}
3026 public:
3027 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3028 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3029 virtual ~RGWRadosThread() {
3030 stop();
3031 }
3032
3033 virtual int init() { return 0; }
3034 virtual int process() = 0;
3035
3036 bool going_down() { return down_flag; }
3037
3038 void start();
3039 void stop();
3040
3041 void signal() {
3042 if (worker) {
3043 worker->signal();
3044 }
3045 }
3046 };
3047
3048 void RGWRadosThread::start()
3049 {
3050 worker = new Worker(cct, this);
3051 worker->create(thread_name.c_str());
3052 }
3053
3054 void RGWRadosThread::stop()
3055 {
3056 down_flag = true;
3057 stop_process();
3058 if (worker) {
3059 worker->signal();
3060 worker->join();
3061 }
3062 delete worker;
3063 worker = NULL;
3064 }
3065
3066 void *RGWRadosThread::Worker::entry() {
3067 uint64_t msec = processor->interval_msec();
3068 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3069
3070 do {
3071 utime_t start = ceph_clock_now();
3072 int r = processor->process();
3073 if (r < 0) {
3074 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3075 }
3076
3077 if (processor->going_down())
3078 break;
3079
3080 utime_t end = ceph_clock_now();
3081 end -= start;
3082
3083 uint64_t cur_msec = processor->interval_msec();
3084 if (cur_msec != msec) { /* was it reconfigured? */
3085 msec = cur_msec;
3086 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3087 }
3088
3089 if (cur_msec > 0) {
3090 if (interval <= end)
3091 continue; // next round
3092
3093 utime_t wait_time = interval;
3094 wait_time -= end;
3095
3096 wait_interval(wait_time);
3097 } else {
3098 wait();
3099 }
3100 } while (!processor->going_down());
3101
3102 return NULL;
3103 }
3104
3105 class RGWMetaNotifier : public RGWRadosThread {
3106 RGWMetaNotifierManager notify_mgr;
3107 RGWMetadataLog *const log;
3108
3109 uint64_t interval_msec() override {
3110 return cct->_conf->rgw_md_notify_interval_msec;
3111 }
3112 void stop_process() override {
3113 notify_mgr.stop();
3114 }
3115 public:
3116 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3117 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3118
3119 int process() override;
3120 };
3121
3122 int RGWMetaNotifier::process()
3123 {
3124 set<int> shards;
3125
3126 log->read_clear_modified(shards);
3127
3128 if (shards.empty()) {
3129 return 0;
3130 }
3131
3132 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3133 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3134 }
3135
3136 notify_mgr.notify_all(store->zone_conn_map, shards);
3137
3138 return 0;
3139 }
3140
3141 class RGWDataNotifier : public RGWRadosThread {
3142 RGWDataNotifierManager notify_mgr;
3143
3144 uint64_t interval_msec() override {
3145 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
3146 }
3147 void stop_process() override {
3148 notify_mgr.stop();
3149 }
3150 public:
3151 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3152
3153 int process() override;
3154 };
3155
3156 int RGWDataNotifier::process()
3157 {
3158 if (!store->data_log) {
3159 return 0;
3160 }
3161
3162 map<int, set<string> > shards;
3163
3164 store->data_log->read_clear_modified(shards);
3165
3166 if (shards.empty()) {
3167 return 0;
3168 }
3169
3170 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3171 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3172 }
3173
3174 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3175
3176 return 0;
3177 }
3178
3179 class RGWSyncProcessorThread : public RGWRadosThread {
3180 public:
3181 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3182 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3183 ~RGWSyncProcessorThread() override {}
3184 int init() override = 0 ;
3185 int process() override = 0;
3186 };
3187
3188 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3189 {
3190 RGWMetaSyncStatusManager sync;
3191
3192 uint64_t interval_msec() override {
3193 return 0; /* no interval associated, it'll run once until stopped */
3194 }
3195 void stop_process() override {
3196 sync.stop();
3197 }
3198 public:
3199 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3200 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3201
3202 void wakeup_sync_shards(set<int>& shard_ids) {
3203 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3204 sync.wakeup(*iter);
3205 }
3206 }
3207 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3208
3209 int init() override {
3210 int ret = sync.init();
3211 if (ret < 0) {
3212 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3213 return ret;
3214 }
3215 return 0;
3216 }
3217
3218 int process() override {
3219 sync.run();
3220 return 0;
3221 }
3222 };
3223
3224 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3225 {
3226 RGWDataSyncStatusManager sync;
3227 bool initialized;
3228
3229 uint64_t interval_msec() override {
3230 if (initialized) {
3231 return 0; /* no interval associated, it'll run once until stopped */
3232 } else {
3233 #define DATA_SYNC_INIT_WAIT_SEC 20
3234 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3235 }
3236 }
3237 void stop_process() override {
3238 sync.stop();
3239 }
3240 public:
3241 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3242 const string& _source_zone,
3243 rgw::BucketChangeObserver *observer)
3244 : RGWSyncProcessorThread(_store, "data-sync"),
3245 sync(_store, async_rados, _source_zone, observer),
3246 initialized(false) {}
3247
3248 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3249 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3250 sync.wakeup(iter->first, iter->second);
3251 }
3252 }
3253 RGWDataSyncStatusManager* get_manager() { return &sync; }
3254
3255 int init() override {
3256 return 0;
3257 }
3258
3259 int process() override {
3260 while (!initialized) {
3261 if (going_down()) {
3262 return 0;
3263 }
3264 int ret = sync.init();
3265 if (ret >= 0) {
3266 initialized = true;
3267 break;
3268 }
3269 /* we'll be back! */
3270 return 0;
3271 }
3272 sync.run();
3273 return 0;
3274 }
3275 };
3276
3277 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3278 {
3279 RGWCoroutinesManager crs;
3280 RGWRados *store;
3281 rgw::BucketTrimManager *bucket_trim;
3282 RGWHTTPManager http;
3283 const utime_t trim_interval;
3284
3285 uint64_t interval_msec() override { return 0; }
3286 void stop_process() override { crs.stop(); }
3287 public:
3288 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
3289 int interval)
3290 : RGWSyncProcessorThread(store, "sync-log-trim"),
3291 crs(store->ctx(), store->get_cr_registry()), store(store),
3292 bucket_trim(bucket_trim),
3293 http(store->ctx(), crs.get_completion_mgr()),
3294 trim_interval(interval, 0)
3295 {}
3296
3297 int init() override {
3298 return http.set_threaded();
3299 }
3300 int process() override {
3301 list<RGWCoroutinesStack*> stacks;
3302 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3303 meta->call(create_meta_log_trim_cr(store, &http,
3304 cct->_conf->rgw_md_log_max_shards,
3305 trim_interval));
3306 stacks.push_back(meta);
3307
3308 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3309 data->call(create_data_log_trim_cr(store, &http,
3310 cct->_conf->rgw_data_log_num_shards,
3311 trim_interval));
3312 stacks.push_back(data);
3313
3314 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
3315 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
3316 stacks.push_back(bucket);
3317
3318 crs.run(stacks);
3319 return 0;
3320 }
3321 };
3322
3323 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3324 {
3325 Mutex::Locker l(meta_sync_thread_lock);
3326 if (meta_sync_processor_thread) {
3327 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3328 }
3329 }
3330
3331 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3332 {
3333 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3334 Mutex::Locker l(data_sync_thread_lock);
3335 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3336 if (iter == data_sync_processor_threads.end()) {
3337 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3338 return;
3339 }
3340
3341 RGWDataSyncProcessorThread *thread = iter->second;
3342 assert(thread);
3343 thread->wakeup_sync_shards(shard_ids);
3344 }
3345
3346 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3347 {
3348 Mutex::Locker l(meta_sync_thread_lock);
3349 if (meta_sync_processor_thread) {
3350 return meta_sync_processor_thread->get_manager();
3351 }
3352 return nullptr;
3353 }
3354
3355 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3356 {
3357 Mutex::Locker l(data_sync_thread_lock);
3358 auto thread = data_sync_processor_threads.find(source_zone);
3359 if (thread == data_sync_processor_threads.end()) {
3360 return nullptr;
3361 }
3362 return thread->second->get_manager();
3363 }
3364
3365 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3366 {
3367 IoCtx ioctx;
3368 int r = open_pool_ctx(pool, ioctx);
3369 if (r < 0) {
3370 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3371 return r;
3372 }
3373
3374 bool requires;
3375 r = ioctx.pool_requires_alignment2(&requires);
3376 if (r < 0) {
3377 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3378 << r << dendl;
3379 return r;
3380 }
3381
3382 if (!requires) {
3383 *alignment = 0;
3384 return 0;
3385 }
3386
3387 uint64_t align;
3388 r = ioctx.pool_required_alignment2(&align);
3389 if (r < 0) {
3390 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3391 << r << dendl;
3392 return r;
3393 }
3394 if (align != 0) {
3395 ldout(cct, 20) << "required alignment=" << align << dendl;
3396 }
3397 *alignment = align;
3398 return 0;
3399 }
3400
3401 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3402 {
3403 uint64_t alignment = 0;
3404 int r = get_required_alignment(pool, &alignment);
3405 if (r < 0) {
3406 return r;
3407 }
3408
3409 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3410
3411 if (alignment == 0) {
3412 *max_chunk_size = config_chunk_size;
3413 return 0;
3414 }
3415
3416 if (config_chunk_size <= alignment) {
3417 *max_chunk_size = alignment;
3418 return 0;
3419 }
3420
3421 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3422
3423 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3424
3425 return 0;
3426 }
3427
3428 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3429 {
3430 rgw_pool pool;
3431 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3432 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3433 return -EIO;
3434 }
3435 return get_max_chunk_size(pool, max_chunk_size);
3436 }
3437
3438 class RGWIndexCompletionManager;
3439
3440 struct complete_op_data {
3441 Mutex lock{"complete_op_data"};
3442 AioCompletion *rados_completion{nullptr};
3443 int manager_shard_id{-1};
3444 RGWIndexCompletionManager *manager{nullptr};
3445 rgw_obj obj;
3446 RGWModifyOp op;
3447 string tag;
3448 rgw_bucket_entry_ver ver;
3449 cls_rgw_obj_key key;
3450 rgw_bucket_dir_entry_meta dir_meta;
3451 list<cls_rgw_obj_key> remove_objs;
3452 bool log_op;
3453 uint16_t bilog_op;
3454 rgw_zone_set zones_trace;
3455
3456 bool stopped{false};
3457
3458 void stop() {
3459 Mutex::Locker l(lock);
3460 stopped = true;
3461 }
3462 };
3463
3464 class RGWIndexCompletionThread : public RGWRadosThread {
3465 RGWRados *store;
3466
3467 uint64_t interval_msec() override {
3468 return 0;
3469 }
3470
3471 list<complete_op_data *> completions;
3472
3473 Mutex completions_lock;
3474 public:
3475 RGWIndexCompletionThread(RGWRados *_store)
3476 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3477
3478 int process() override;
3479
3480 void add_completion(complete_op_data *completion) {
3481 {
3482 Mutex::Locker l(completions_lock);
3483 completions.push_back(completion);
3484 }
3485
3486 signal();
3487 }
3488 };
3489
3490 int RGWIndexCompletionThread::process()
3491 {
3492 list<complete_op_data *> comps;
3493
3494 {
3495 Mutex::Locker l(completions_lock);
3496 completions.swap(comps);
3497 }
3498
3499 for (auto c : comps) {
3500 std::unique_ptr<complete_op_data> up{c};
3501
3502 if (going_down()) {
3503 continue;
3504 }
3505 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3506
3507 RGWRados::BucketShard bs(store);
3508
3509 int r = bs.init(c->obj.bucket, c->obj);
3510 if (r < 0) {
3511 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3512 /* not much to do */
3513 continue;
3514 }
3515
3516 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3517 librados::ObjectWriteOperation o;
3518 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3519 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3520 c->log_op, c->bilog_op, &c->zones_trace);
3521
3522 return bs->index_ctx.operate(bs->bucket_obj, &o);
3523 });
3524 if (r < 0) {
3525 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3526 /* ignoring error, can't do anything about it */
3527 continue;
3528 }
3529 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3530 if (r < 0) {
3531 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3532 }
3533 }
3534
3535 return 0;
3536 }
3537
3538 class RGWIndexCompletionManager {
3539 RGWRados *store{nullptr};
3540 vector<Mutex *> locks;
3541 vector<set<complete_op_data *> > completions;
3542
3543 RGWIndexCompletionThread *completion_thread{nullptr};
3544
3545 int num_shards;
3546
3547 std::atomic<int> cur_shard {0};
3548
3549
3550 public:
3551 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3552 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3553
3554 for (int i = 0; i < num_shards; i++) {
3555 char buf[64];
3556 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3557 locks.push_back(new Mutex(buf));
3558 }
3559
3560 completions.resize(num_shards);
3561 }
3562 ~RGWIndexCompletionManager() {
3563 stop();
3564
3565 for (auto l : locks) {
3566 delete l;
3567 }
3568 }
3569
3570 int next_shard() {
3571 int result = cur_shard % num_shards;
3572 cur_shard++;
3573 return result;
3574 }
3575
3576 void create_completion(const rgw_obj& obj,
3577 RGWModifyOp op, string& tag,
3578 rgw_bucket_entry_ver& ver,
3579 const cls_rgw_obj_key& key,
3580 rgw_bucket_dir_entry_meta& dir_meta,
3581 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3582 uint16_t bilog_op,
3583 rgw_zone_set *zones_trace,
3584 complete_op_data **result);
3585 bool handle_completion(completion_t cb, complete_op_data *arg);
3586
3587 int start() {
3588 completion_thread = new RGWIndexCompletionThread(store);
3589 int ret = completion_thread->init();
3590 if (ret < 0) {
3591 return ret;
3592 }
3593 completion_thread->start();
3594 return 0;
3595 }
3596 void stop() {
3597 if (completion_thread) {
3598 completion_thread->stop();
3599 delete completion_thread;
3600 }
3601
3602 for (int i = 0; i < num_shards; ++i) {
3603 Mutex::Locker l(*locks[i]);
3604 for (auto c : completions[i]) {
3605 Mutex::Locker cl(c->lock);
3606 c->stop();
3607 }
3608 }
3609 completions.clear();
3610 }
3611 };
3612
3613 static void obj_complete_cb(completion_t cb, void *arg)
3614 {
3615 complete_op_data *completion = (complete_op_data *)arg;
3616 completion->lock.Lock();
3617 if (completion->stopped) {
3618 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3619 delete completion;
3620 return;
3621 }
3622 bool need_delete = completion->manager->handle_completion(cb, completion);
3623 completion->lock.Unlock();
3624 if (need_delete) {
3625 delete completion;
3626 }
3627 }
3628
3629
3630 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3631 RGWModifyOp op, string& tag,
3632 rgw_bucket_entry_ver& ver,
3633 const cls_rgw_obj_key& key,
3634 rgw_bucket_dir_entry_meta& dir_meta,
3635 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3636 uint16_t bilog_op,
3637 rgw_zone_set *zones_trace,
3638 complete_op_data **result)
3639 {
3640 complete_op_data *entry = new complete_op_data;
3641
3642 int shard_id = next_shard();
3643
3644 entry->manager_shard_id = shard_id;
3645 entry->manager = this;
3646 entry->obj = obj;
3647 entry->op = op;
3648 entry->tag = tag;
3649 entry->ver = ver;
3650 entry->key = key;
3651 entry->dir_meta = dir_meta;
3652 entry->log_op = log_op;
3653 entry->bilog_op = bilog_op;
3654
3655 if (remove_objs) {
3656 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3657 entry->remove_objs.push_back(*iter);
3658 }
3659 }
3660
3661 if (zones_trace) {
3662 entry->zones_trace = *zones_trace;
3663 } else {
3664 entry->zones_trace.insert(store->get_zone().id);
3665 }
3666
3667 *result = entry;
3668
3669 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3670
3671 Mutex::Locker l(*locks[shard_id]);
3672 completions[shard_id].insert(entry);
3673 }
3674
3675 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3676 {
3677 int shard_id = arg->manager_shard_id;
3678 {
3679 Mutex::Locker l(*locks[shard_id]);
3680
3681 auto& comps = completions[shard_id];
3682
3683 auto iter = comps.find(arg);
3684 if (iter == comps.end()) {
3685 return true;
3686 }
3687
3688 comps.erase(iter);
3689 }
3690
3691 int r = rados_aio_get_return_value(cb);
3692 if (r != -ERR_BUSY_RESHARDING) {
3693 return true;
3694 }
3695 completion_thread->add_completion(arg);
3696 return false;
3697 }
3698
3699 void RGWRados::finalize()
3700 {
3701 auto admin_socket = cct->get_admin_socket();
3702 for (auto cmd : admin_commands) {
3703 int r = admin_socket->unregister_command(cmd[0]);
3704 if (r < 0) {
3705 lderr(cct) << "ERROR: fail to unregister admin socket command (r=" << r
3706 << ")" << dendl;
3707 }
3708 }
3709
3710 if (run_sync_thread) {
3711 Mutex::Locker l(meta_sync_thread_lock);
3712 meta_sync_processor_thread->stop();
3713
3714 Mutex::Locker dl(data_sync_thread_lock);
3715 for (auto iter : data_sync_processor_threads) {
3716 RGWDataSyncProcessorThread *thread = iter.second;
3717 thread->stop();
3718 }
3719 if (sync_log_trimmer) {
3720 sync_log_trimmer->stop();
3721 }
3722 }
3723 if (async_rados) {
3724 async_rados->stop();
3725 }
3726 if (run_sync_thread) {
3727 delete meta_sync_processor_thread;
3728 meta_sync_processor_thread = NULL;
3729 Mutex::Locker dl(data_sync_thread_lock);
3730 for (auto iter : data_sync_processor_threads) {
3731 RGWDataSyncProcessorThread *thread = iter.second;
3732 delete thread;
3733 }
3734 data_sync_processor_threads.clear();
3735 delete sync_log_trimmer;
3736 sync_log_trimmer = nullptr;
3737 bucket_trim = boost::none;
3738 }
3739 if (finisher) {
3740 finisher->stop();
3741 }
3742 if (need_watch_notify()) {
3743 finalize_watch();
3744 }
3745 if (finisher) {
3746 /* delete finisher only after cleaning up watches, as watch error path might call
3747 * into finisher. We stop finisher before finalizing watch to make sure we don't
3748 * actually handle any racing work
3749 */
3750 delete finisher;
3751 }
3752 if (meta_notifier) {
3753 meta_notifier->stop();
3754 delete meta_notifier;
3755 }
3756 if (data_notifier) {
3757 data_notifier->stop();
3758 delete data_notifier;
3759 }
3760 delete data_log;
3761 if (async_rados) {
3762 delete async_rados;
3763 }
3764
3765 delete lc;
3766 lc = NULL;
3767
3768 delete gc;
3769 gc = NULL;
3770
3771 delete obj_expirer;
3772 obj_expirer = NULL;
3773
3774 delete rest_master_conn;
3775
3776 map<string, RGWRESTConn *>::iterator iter;
3777 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3778 RGWRESTConn *conn = iter->second;
3779 delete conn;
3780 }
3781
3782 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3783 RGWRESTConn *conn = iter->second;
3784 delete conn;
3785 }
3786 RGWQuotaHandler::free_handler(quota_handler);
3787 if (cr_registry) {
3788 cr_registry->put();
3789 }
3790 delete meta_mgr;
3791 delete binfo_cache;
3792 delete obj_tombstone_cache;
3793 delete sync_modules_manager;
3794
3795 if (reshard_wait.get()) {
3796 reshard_wait->stop();
3797 reshard_wait.reset();
3798 }
3799
3800 if (run_reshard_thread) {
3801 reshard->stop_processor();
3802 }
3803 delete reshard;
3804 delete index_completion_manager;
3805 }
3806
3807 /**
3808 * Initialize the RADOS instance and prepare to do other ops
3809 * Returns 0 on success, -ERR# on failure.
3810 */
3811 int RGWRados::init_rados()
3812 {
3813 int ret = 0;
3814 auto admin_socket = cct->get_admin_socket();
3815 for (auto cmd : admin_commands) {
3816 int r = admin_socket->register_command(cmd[0], cmd[1], this,
3817 cmd[2]);
3818 if (r < 0) {
3819 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
3820 << ")" << dendl;
3821 return r;
3822 }
3823 }
3824
3825 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3826
3827 for (auto& r : handles) {
3828 ret = r.init_with_context(cct);
3829 if (ret < 0) {
3830 return ret;
3831 }
3832 ret = r.connect();
3833 if (ret < 0) {
3834 return ret;
3835 }
3836 }
3837
3838 sync_modules_manager = new RGWSyncModulesManager();
3839
3840 rgw_register_sync_modules(sync_modules_manager);
3841
3842 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3843 new RGWCoroutinesManagerRegistry(cct)};
3844 ret = crs->hook_to_admin_command("cr dump");
3845 if (ret < 0) {
3846 return ret;
3847 }
3848
3849 meta_mgr = new RGWMetadataManager(cct, this);
3850 data_log = new RGWDataChangesLog(cct, this);
3851 cr_registry = crs.release();
3852
3853 std::swap(handles, rados);
3854 return ret;
3855 }
3856
3857
3858 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3859 {
3860 map<string,string> metadata = meta;
3861 metadata["num_handles"] = stringify(rados.size());
3862 metadata["zonegroup_id"] = zonegroup.get_id();
3863 metadata["zonegroup_name"] = zonegroup.get_name();
3864 metadata["zone_name"] = zone_name();
3865 metadata["zone_id"] = zone_id();;
3866 string name = cct->_conf->name.get_id();
3867 if (name.find("rgw.") == 0) {
3868 name = name.substr(4);
3869 }
3870 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3871 if (ret < 0) {
3872 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3873 return ret;
3874 }
3875
3876 return 0;
3877 }
3878
3879 /**
3880 * Add new connection to connections map
3881 * @param zonegroup_conn_map map which new connection will be added to
3882 * @param zonegroup zonegroup which new connection will connect to
3883 * @param new_connection pointer to new connection instance
3884 */
3885 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3886 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3887 {
3888 // Delete if connection is already exists
3889 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3890 if (iterZoneGroup != zonegroup_conn_map.end()) {
3891 delete iterZoneGroup->second;
3892 }
3893
3894 // Add new connection to connections map
3895 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3896 }
3897
3898 int RGWRados::convert_regionmap()
3899 {
3900 RGWZoneGroupMap zonegroupmap;
3901
3902 string pool_name = cct->_conf->rgw_zone_root_pool;
3903 if (pool_name.empty()) {
3904 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3905 }
3906 string oid = region_map_oid;
3907
3908 rgw_pool pool(pool_name);
3909 bufferlist bl;
3910 RGWObjectCtx obj_ctx(this);
3911 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3912 if (ret < 0 && ret != -ENOENT) {
3913 return ret;
3914 } else if (ret == -ENOENT) {
3915 return 0;
3916 }
3917
3918 try {
3919 bufferlist::iterator iter = bl.begin();
3920 ::decode(zonegroupmap, iter);
3921 } catch (buffer::error& err) {
3922 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3923 return -EIO;
3924 }
3925
3926 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3927 iter != zonegroupmap.zonegroups.end(); ++iter) {
3928 RGWZoneGroup& zonegroup = iter->second;
3929 ret = zonegroup.init(cct, this, false);
3930 ret = zonegroup.update();
3931 if (ret < 0 && ret != -ENOENT) {
3932 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3933 cpp_strerror(-ret) << dendl;
3934 return ret;
3935 } else if (ret == -ENOENT) {
3936 ret = zonegroup.create();
3937 if (ret < 0) {
3938 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3939 cpp_strerror(-ret) << dendl;
3940 return ret;
3941 }
3942 }
3943 }
3944
3945 current_period.set_user_quota(zonegroupmap.user_quota);
3946 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3947
3948 // remove the region_map so we don't try to convert again
3949 rgw_raw_obj obj(pool, oid);
3950 ret = delete_system_obj(obj);
3951 if (ret < 0) {
3952 ldout(cct, 0) << "Error could not remove " << obj
3953 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3954 return ret;
3955 }
3956
3957 return 0;
3958 }
3959
3960 /**
3961 * Replace all region configuration with zonegroup for
3962 * backward compatability
3963 * Returns 0 on success, -ERR# on failure.
3964 */
3965 int RGWRados::replace_region_with_zonegroup()
3966 {
3967 /* copy default region */
3968 /* convert default region to default zonegroup */
3969 string default_oid = cct->_conf->rgw_default_region_info_oid;
3970 if (default_oid.empty()) {
3971 default_oid = default_region_info_oid;
3972 }
3973
3974
3975 RGWZoneGroup default_zonegroup;
3976 rgw_pool pool{default_zonegroup.get_pool(cct)};
3977 string oid = "converted";
3978 bufferlist bl;
3979 RGWObjectCtx obj_ctx(this);
3980
3981 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3982 if (ret < 0 && ret != -ENOENT) {
3983 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3984 << dendl;
3985 return ret;
3986 } else if (ret != -ENOENT) {
3987 ldout(cct, 20) << "System already converted " << dendl;
3988 return 0;
3989 }
3990
3991 string default_region;
3992 ret = default_zonegroup.init(cct, this, false, true);
3993 if (ret < 0) {
3994 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3995 return ret;
3996 }
3997 ret = default_zonegroup.read_default_id(default_region, true);
3998 if (ret < 0 && ret != -ENOENT) {
3999 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4000 return ret;
4001 }
4002
4003 /* convert regions to zonegroups */
4004 list<string> regions;
4005 ret = list_regions(regions);
4006 if (ret < 0 && ret != -ENOENT) {
4007 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4008 return ret;
4009 } else if (ret == -ENOENT || regions.empty()) {
4010 RGWZoneParams zoneparams(default_zone_name);
4011 int ret = zoneparams.init(cct, this);
4012 if (ret < 0 && ret != -ENOENT) {
4013 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
4014 return ret;
4015 }
4016 /* update master zone */
4017 RGWZoneGroup default_zg(default_zonegroup_name);
4018 ret = default_zg.init(cct, this);
4019 if (ret < 0 && ret != -ENOENT) {
4020 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
4021 return ret;
4022 }
4023 if (ret != -ENOENT && default_zg.master_zone.empty()) {
4024 default_zg.master_zone = zoneparams.get_id();
4025 return default_zg.update();
4026 }
4027 return 0;
4028 }
4029
4030 string master_region, master_zone;
4031 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
4032 if (*iter != default_zonegroup_name){
4033 RGWZoneGroup region(*iter);
4034 int ret = region.init(cct, this, true, true);
4035 if (ret < 0) {
4036 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
4037 return ret;
4038 }
4039 if (region.is_master_zonegroup()) {
4040 master_region = region.get_id();
4041 master_zone = region.master_zone;
4042 }
4043 }
4044 }
4045
4046 /* create realm if there is none.
4047 The realm name will be the region and zone concatenated
4048 realm id will be mds of its name */
4049 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
4050 string new_realm_name = master_region + "." + master_zone;
4051 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
4052 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
4053 MD5 hash;
4054 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
4055 hash.Final(md5);
4056 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
4057 string new_realm_id(md5_str);
4058 RGWRealm new_realm(new_realm_id,new_realm_name);
4059 ret = new_realm.init(cct, this, false);
4060 if (ret < 0) {
4061 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4062 return ret;
4063 }
4064 ret = new_realm.create();
4065 if (ret < 0 && ret != -EEXIST) {
4066 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4067 return ret;
4068 }
4069 ret = new_realm.set_as_default();
4070 if (ret < 0) {
4071 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4072 return ret;
4073 }
4074 ret = realm.init(cct, this);
4075 if (ret < 0) {
4076 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4077 return ret;
4078 }
4079 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4080 if (ret < 0) {
4081 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4082 return ret;
4083 }
4084 }
4085
4086 list<string>::iterator iter;
4087 /* create zonegroups */
4088 for (iter = regions.begin(); iter != regions.end(); ++iter)
4089 {
4090 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4091 /* check to see if we don't have already a zonegroup with this name */
4092 RGWZoneGroup new_zonegroup(*iter);
4093 ret = new_zonegroup.init(cct , this);
4094 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4095 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4096 " skipping conversion " << dendl;
4097 continue;
4098 }
4099 RGWZoneGroup zonegroup(*iter);
4100 zonegroup.set_id(*iter);
4101 int ret = zonegroup.init(cct, this, true, true);
4102 if (ret < 0) {
4103 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4104 return ret;
4105 }
4106 zonegroup.realm_id = realm.get_id();
4107 /* fix default region master zone */
4108 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4109 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4110 zonegroup.master_zone = default_zone_name;
4111 }
4112 ret = zonegroup.update();
4113 if (ret < 0 && ret != -EEXIST) {
4114 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4115 << dendl;
4116 return ret;
4117 }
4118 ret = zonegroup.update_name();
4119 if (ret < 0 && ret != -EEXIST) {
4120 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4121 << dendl;
4122 return ret;
4123 }
4124 if (zonegroup.get_name() == default_region) {
4125 ret = zonegroup.set_as_default();
4126 if (ret < 0) {
4127 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4128 << dendl;
4129 return ret;
4130 }
4131 }
4132 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4133 ++iter) {
4134 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4135 RGWZoneParams zoneparams(iter->first, iter->first);
4136 zoneparams.set_id(iter->first);
4137 zoneparams.realm_id = realm.get_id();
4138 ret = zoneparams.init(cct, this);
4139 if (ret < 0 && ret != -ENOENT) {
4140 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4141 return ret;
4142 } else if (ret == -ENOENT) {
4143 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4144 continue;
4145 }
4146 zonegroup.realm_id = realm.get_id();
4147 ret = zoneparams.update();
4148 if (ret < 0 && ret != -EEXIST) {
4149 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4150 return ret;
4151 }
4152 ret = zoneparams.update_name();
4153 if (ret < 0 && ret != -EEXIST) {
4154 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4155 return ret;
4156 }
4157 }
4158
4159 if (!current_period.get_id().empty()) {
4160 ret = current_period.add_zonegroup(zonegroup);
4161 if (ret < 0) {
4162 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4163 return ret;
4164 }
4165 }
4166 }
4167
4168 if (!current_period.get_id().empty()) {
4169 ret = current_period.update();
4170 if (ret < 0) {
4171 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4172 return ret;
4173 }
4174 ret = current_period.store_info(false);
4175 if (ret < 0) {
4176 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4177 return ret;
4178 }
4179 ret = current_period.reflect();
4180 if (ret < 0) {
4181 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4182 return ret;
4183 }
4184 }
4185
4186 for (auto const& iter : regions) {
4187 RGWZoneGroup zonegroup(iter);
4188 int ret = zonegroup.init(cct, this, true, true);
4189 if (ret < 0) {
4190 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4191 return ret;
4192 }
4193 ret = zonegroup.delete_obj(true);
4194 if (ret < 0 && ret != -ENOENT) {
4195 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4196 << dendl;
4197 return ret;
4198 }
4199 }
4200
4201 /* mark as converted */
4202 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4203 true, NULL, real_time(), NULL);
4204 if (ret < 0 ) {
4205 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4206 << dendl;
4207 return ret;
4208 }
4209
4210 return 0;
4211 }
4212
4213 int RGWRados::init_zg_from_period(bool *initialized)
4214 {
4215 *initialized = false;
4216
4217 if (current_period.get_id().empty()) {
4218 return 0;
4219 }
4220
4221 int ret = zonegroup.init(cct, this);
4222 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4223 if (ret == -ENOENT) {
4224 return 0;
4225 }
4226 if (ret < 0) {
4227 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4228 return ret;
4229 }
4230 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4231
4232 map<string, RGWZoneGroup>::const_iterator iter =
4233 current_period.get_map().zonegroups.find(zonegroup.get_id());
4234
4235 if (iter != current_period.get_map().zonegroups.end()) {
4236 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4237 zonegroup = iter->second;
4238 ret = zonegroup.init(cct, this, false);
4239 if (ret < 0) {
4240 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4241 return ret;
4242 }
4243 ret = zone_params.init(cct, this);
4244 if (ret < 0 && ret != -ENOENT) {
4245 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4246 return ret;
4247 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4248 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4249 zone_params.set_name(default_zone_name);
4250 ret = zone_params.init(cct, this);
4251 if (ret < 0 && ret != -ENOENT) {
4252 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4253 return ret;
4254 }
4255 }
4256 }
4257 for (iter = current_period.get_map().zonegroups.begin();
4258 iter != current_period.get_map().zonegroups.end(); ++iter){
4259 const RGWZoneGroup& zg = iter->second;
4260 // use endpoints from the zonegroup's master zone
4261 auto master = zg.zones.find(zg.master_zone);
4262 if (master == zg.zones.end()) {
4263 // fix missing master zone for a single zone zonegroup
4264 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4265 master = zg.zones.begin();
4266 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4267 master->second.name << " id:" << master->second.id << " as master" << dendl;
4268 if (zonegroup.get_id() == zg.get_id()) {
4269 zonegroup.master_zone = master->second.id;
4270 ret = zonegroup.update();
4271 if (ret < 0) {
4272 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4273 return ret;
4274 }
4275 } else {
4276 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4277 ret = fixed_zg.init(cct, this);
4278 if (ret < 0) {
4279 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4280 return ret;
4281 }
4282 fixed_zg.master_zone = master->second.id;
4283 ret = fixed_zg.update();
4284 if (ret < 0) {
4285 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4286 return ret;
4287 }
4288 }
4289 } else {
4290 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4291 zg.master_zone << dendl;
4292 return -EINVAL;
4293 }
4294 }
4295 const auto& endpoints = master->second.endpoints;
4296 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4297 if (!current_period.get_master_zonegroup().empty() &&
4298 zg.get_id() == current_period.get_master_zonegroup()) {
4299 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4300 }
4301 }
4302
4303 *initialized = true;
4304
4305 return 0;
4306 }
4307
4308 int RGWRados::init_zg_from_local(bool *creating_defaults)
4309 {
4310 int ret = zonegroup.init(cct, this);
4311 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4312 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4313 return ret;
4314 } else if (ret == -ENOENT) {
4315 *creating_defaults = true;
4316 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4317 ret = zonegroup.create_default();
4318 if (ret < 0) {
4319 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4320 << dendl;
4321 return ret;
4322 }
4323 ret = zonegroup.init(cct, this);
4324 if (ret < 0) {
4325 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4326 << dendl;
4327 return ret;
4328 }
4329 }
4330 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4331 if (zonegroup.is_master_zonegroup()) {
4332 // use endpoints from the zonegroup's master zone
4333 auto master = zonegroup.zones.find(zonegroup.master_zone);
4334 if (master == zonegroup.zones.end()) {
4335 // fix missing master zone for a single zone zonegroup
4336 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4337 master = zonegroup.zones.begin();
4338 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4339 master->second.name << " id:" << master->second.id << " as master" << dendl;
4340 zonegroup.master_zone = master->second.id;
4341 ret = zonegroup.update();
4342 if (ret < 0) {
4343 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4344 return ret;
4345 }
4346 } else {
4347 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4348 "master_zone=" << zonegroup.master_zone << dendl;
4349 return -EINVAL;
4350 }
4351 }
4352 const auto& endpoints = master->second.endpoints;
4353 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4354 }
4355
4356 return 0;
4357 }
4358
4359
4360 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4361 {
4362 return target_zone.syncs_from(source_zone.name) &&
4363 sync_modules_manager->supports_data_export(source_zone.tier_type);
4364 }
4365
4366 /**
4367 * Initialize the RADOS instance and prepare to do other ops
4368 * Returns 0 on success, -ERR# on failure.
4369 */
4370 int RGWRados::init_complete()
4371 {
4372 int ret = realm.init(cct, this);
4373 if (ret < 0 && ret != -ENOENT) {
4374 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4375 return ret;
4376 } else if (ret != -ENOENT) {
4377 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4378 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4379 if (ret < 0 && ret != -ENOENT) {
4380 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4381 return ret;
4382 }
4383 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4384 }
4385
4386 ret = replace_region_with_zonegroup();
4387 if (ret < 0) {
4388 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4389 return ret;
4390 }
4391
4392 ret = convert_regionmap();
4393 if (ret < 0) {
4394 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4395 return ret;
4396 }
4397
4398 bool zg_initialized = false;
4399
4400 if (!current_period.get_id().empty()) {
4401 ret = init_zg_from_period(&zg_initialized);
4402 if (ret < 0) {
4403 return ret;
4404 }
4405 }
4406
4407 bool creating_defaults = false;
4408 bool using_local = (!zg_initialized);
4409 if (using_local) {
4410 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4411 ret = init_zg_from_local(&creating_defaults);
4412 if (ret < 0) {
4413 return ret;
4414 }
4415 // read period_config into current_period
4416 auto& period_config = current_period.get_config();
4417 ret = period_config.read(this, zonegroup.realm_id);
4418 if (ret < 0 && ret != -ENOENT) {
4419 ldout(cct, 0) << "ERROR: failed to read period config: "
4420 << cpp_strerror(ret) << dendl;
4421 return ret;
4422 }
4423 }
4424
4425 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4426 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4427 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4428 zone_params.set_name(default_zone_name);
4429 }
4430
4431 ret = zone_params.init(cct, this);
4432 if (ret < 0 && ret != -ENOENT) {
4433 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4434 return ret;
4435 }
4436 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4437 if (zone_iter == get_zonegroup().zones.end()) {
4438 if (using_local) {
4439 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4440 return -EINVAL;
4441 }
4442 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4443 ret = init_zg_from_local(&creating_defaults);
4444 if (ret < 0) {
4445 return ret;
4446 }
4447 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4448 }
4449 if (zone_iter != get_zonegroup().zones.end()) {
4450 zone_public_config = zone_iter->second;
4451 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4452 } else {
4453 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4454 return -EINVAL;
4455 }
4456
4457 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4458
4459 if (run_sync_thread) {
4460 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4461 if (ret < 0) {
4462 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4463 return ret;
4464 }
4465 }
4466
4467 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4468
4469 init_unique_trans_id_deps();
4470
4471 finisher = new Finisher(cct);
4472 finisher->start();
4473
4474 period_puller.reset(new RGWPeriodPuller(this));
4475 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4476 current_period));
4477
4478 if (need_watch_notify()) {
4479 ret = init_watch();
4480 if (ret < 0) {
4481 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4482 return ret;
4483 }
4484 }
4485
4486 /* first build all zones index */
4487 for (auto ziter : get_zonegroup().zones) {
4488 const string& id = ziter.first;
4489 RGWZone& z = ziter.second;
4490 zone_id_by_name[z.name] = id;
4491 zone_by_id[id] = z;
4492 }
4493
4494 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4495 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4496 }
4497 zone_public_config = zone_by_id[zone_id()];
4498 for (auto ziter : get_zonegroup().zones) {
4499 const string& id = ziter.first;
4500 RGWZone& z = ziter.second;
4501 if (id == zone_id()) {
4502 continue;
4503 }
4504 if (z.endpoints.empty()) {
4505 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4506 continue;
4507 }
4508 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4509 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4510 zone_conn_map[id] = conn;
4511 if (zone_syncs_from(zone_public_config, z) ||
4512 zone_syncs_from(z, zone_public_config)) {
4513 if (zone_syncs_from(zone_public_config, z)) {
4514 zone_data_sync_from_map[id] = conn;
4515 }
4516 if (zone_syncs_from(z, zone_public_config)) {
4517 zone_data_notify_to_map[id] = conn;
4518 }
4519 } else {
4520 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4521 }
4522 }
4523
4524 ret = open_root_pool_ctx();
4525 if (ret < 0)
4526 return ret;
4527
4528 ret = open_gc_pool_ctx();
4529 if (ret < 0)
4530 return ret;
4531
4532 ret = open_lc_pool_ctx();
4533 if (ret < 0)
4534 return ret;
4535
4536 ret = open_objexp_pool_ctx();
4537 if (ret < 0)
4538 return ret;
4539
4540 ret = open_reshard_pool_ctx();
4541 if (ret < 0)
4542 return ret;
4543
4544 pools_initialized = true;
4545
4546 gc = new RGWGC();
4547 gc->initialize(cct, this);
4548
4549 obj_expirer = new RGWObjectExpirer(this);
4550
4551 if (use_gc_thread) {
4552 gc->start_processor();
4553 obj_expirer->start_processor();
4554 }
4555
4556 /* no point of running sync thread if we don't have a master zone configured
4557 or there is no rest_master_conn */
4558 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4559 || current_period.get_id().empty()) {
4560 run_sync_thread = false;
4561 }
4562
4563 if (run_sync_thread) {
4564 // initialize the log period history
4565 meta_mgr->init_oldest_log_period();
4566 }
4567
4568 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4569 async_rados->start();
4570
4571 ret = meta_mgr->init(current_period.get_id());
4572 if (ret < 0) {
4573 lderr(cct) << "ERROR: failed to initialize metadata log: "
4574 << cpp_strerror(-ret) << dendl;
4575 return ret;
4576 }
4577
4578 if (is_meta_master()) {
4579 auto md_log = meta_mgr->get_log(current_period.get_id());
4580 meta_notifier = new RGWMetaNotifier(this, md_log);
4581 meta_notifier->start();
4582 }
4583
4584 if (run_sync_thread) {
4585 Mutex::Locker l(meta_sync_thread_lock);
4586 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4587 ret = meta_sync_processor_thread->init();
4588 if (ret < 0) {
4589 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4590 return ret;
4591 }
4592 meta_sync_processor_thread->start();
4593
4594 // configure the bucket trim manager
4595 rgw::BucketTrimConfig config;
4596 rgw::configure_bucket_trim(cct, config);
4597
4598 bucket_trim.emplace(this, config);
4599 ret = bucket_trim->init();
4600 if (ret < 0) {
4601 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
4602 return ret;
4603 }
4604
4605 Mutex::Locker dl(data_sync_thread_lock);
4606 for (auto iter : zone_data_sync_from_map) {
4607 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4608 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first,
4609 &*bucket_trim);
4610 ret = thread->init();
4611 if (ret < 0) {
4612 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4613 return ret;
4614 }
4615 thread->start();
4616 data_sync_processor_threads[iter.first] = thread;
4617 }
4618 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4619 if (interval > 0) {
4620 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
4621 ret = sync_log_trimmer->init();
4622 if (ret < 0) {
4623 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4624 return ret;
4625 }
4626 sync_log_trimmer->start();
4627 }
4628 }
4629 data_notifier = new RGWDataNotifier(this);
4630 data_notifier->start();
4631
4632 lc = new RGWLC();
4633 lc->initialize(cct, this);
4634
4635 if (use_lc_thread)
4636 lc->start_processor();
4637
4638 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4639
4640 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4641 get_zone().bucket_index_max_shards);
4642 if (bucket_index_max_shards > get_max_bucket_shards()) {
4643 bucket_index_max_shards = get_max_bucket_shards();
4644 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4645 << get_max_bucket_shards() << dendl;
4646 }
4647 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4648
4649 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4650 binfo_cache->init(this);
4651
4652 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4653
4654 if (need_tombstone_cache) {
4655 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4656 }
4657
4658 reshard_wait = std::make_shared<RGWReshardWait>(this);
4659
4660 reshard = new RGWReshard(this);
4661
4662 /* only the master zone in the zonegroup reshards buckets */
4663 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4664 if (run_reshard_thread) {
4665 reshard->start_processor();
4666 }
4667
4668 index_completion_manager = new RGWIndexCompletionManager(this);
4669 ret = index_completion_manager->start();
4670
4671 return ret;
4672 }
4673
4674 /**
4675 * Initialize the RADOS instance and prepare to do other ops
4676 * Returns 0 on success, -ERR# on failure.
4677 */
4678 int RGWRados::initialize()
4679 {
4680 int ret;
4681
4682 ret = init_rados();
4683 if (ret < 0)
4684 return ret;
4685
4686 return init_complete();
4687 }
4688
4689 void RGWRados::finalize_watch()
4690 {
4691 for (int i = 0; i < num_watchers; i++) {
4692 RGWWatcher *watcher = watchers[i];
4693 watcher->unregister_watch();
4694 delete watcher;
4695 }
4696
4697 delete[] notify_oids;
4698 delete[] watchers;
4699 }
4700
4701 void RGWRados::schedule_context(Context *c) {
4702 finisher->queue(c);
4703 }
4704
4705 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4706 {
4707 bool is_truncated;
4708 RGWListRawObjsCtx ctx;
4709 do {
4710 list<string> oids;
4711 int r = list_raw_objects(pool, prefix, 1000,
4712 ctx, oids, &is_truncated);
4713 if (r < 0) {
4714 return r;
4715 }
4716 list<string>::iterator iter;
4717 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4718 string& val = *iter;
4719 if (val.size() > prefix.size())
4720 result.push_back(val.substr(prefix.size()));
4721 }
4722 } while (is_truncated);
4723
4724 return 0;
4725 }
4726
4727 int RGWRados::list_regions(list<string>& regions)
4728 {
4729 RGWZoneGroup zonegroup;
4730
4731 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4732 }
4733
4734 int RGWRados::list_zonegroups(list<string>& zonegroups)
4735 {
4736 RGWZoneGroup zonegroup;
4737
4738 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4739 }
4740
4741 int RGWRados::list_zones(list<string>& zones)
4742 {
4743 RGWZoneParams zoneparams;
4744
4745 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4746 }
4747
4748 int RGWRados::list_realms(list<string>& realms)
4749 {
4750 RGWRealm realm(cct, this);
4751 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4752 }
4753
4754 int RGWRados::list_periods(list<string>& periods)
4755 {
4756 RGWPeriod period;
4757 list<string> raw_periods;
4758 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4759 if (ret < 0) {
4760 return ret;
4761 }
4762 for (const auto& oid : raw_periods) {
4763 size_t pos = oid.find(".");
4764 if (pos != std::string::npos) {
4765 periods.push_back(oid.substr(0, pos));
4766 } else {
4767 periods.push_back(oid);
4768 }
4769 }
4770 periods.sort(); // unique() only detects duplicates if they're adjacent
4771 periods.unique();
4772 return 0;
4773 }
4774
4775
4776 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4777 {
4778 int ret = 0;
4779 string period_id = current_period;
4780 while(!period_id.empty()) {
4781 RGWPeriod period(period_id);
4782 ret = period.init(cct, this);
4783 if (ret < 0) {
4784 return ret;
4785 }
4786 periods.push_back(period.get_id());
4787 period_id = period.get_predecessor();
4788 }
4789
4790 return ret;
4791 }
4792
4793 /**
4794 * Open the pool used as root for this gateway
4795 * Returns: 0 on success, -ERR# otherwise.
4796 */
4797 int RGWRados::open_root_pool_ctx()
4798 {
4799 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4800 }
4801
4802 int RGWRados::open_gc_pool_ctx()
4803 {
4804 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4805 }
4806
4807 int RGWRados::open_lc_pool_ctx()
4808 {
4809 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4810 }
4811
4812 int RGWRados::open_objexp_pool_ctx()
4813 {
4814 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4815 }
4816
4817 int RGWRados::open_reshard_pool_ctx()
4818 {
4819 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4820 }
4821
4822 int RGWRados::init_watch()
4823 {
4824 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4825 if (r < 0) {
4826 return r;
4827 }
4828
4829 num_watchers = cct->_conf->rgw_num_control_oids;
4830
4831 bool compat_oid = (num_watchers == 0);
4832
4833 if (num_watchers <= 0)
4834 num_watchers = 1;
4835
4836 notify_oids = new string[num_watchers];
4837 watchers = new RGWWatcher *[num_watchers];
4838
4839 for (int i=0; i < num_watchers; i++) {
4840 string& notify_oid = notify_oids[i];
4841 notify_oid = notify_oid_prefix;
4842 if (!compat_oid) {
4843 char buf[16];
4844 snprintf(buf, sizeof(buf), ".%d", i);
4845 notify_oid.append(buf);
4846 }
4847 r = control_pool_ctx.create(notify_oid, false);
4848 if (r < 0 && r != -EEXIST)
4849 return r;
4850
4851 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4852 watchers[i] = watcher;
4853
4854 r = watcher->register_watch();
4855 if (r < 0)
4856 return r;
4857 }
4858
4859 watch_initialized = true;
4860
4861 set_cache_enabled(true);
4862
4863 return 0;
4864 }
4865
4866 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4867 {
4868 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4869
4870 int i = r % num_watchers;
4871 char buf[16];
4872 snprintf(buf, sizeof(buf), ".%d", i);
4873
4874 notify_oid = notify_oid_prefix;
4875 notify_oid.append(buf);
4876 }
4877
4878 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4879 {
4880 constexpr bool create = true; // create the pool if it doesn't exist
4881 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
4882 }
4883
4884 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4885 string *marker) {
4886 if (marker) {
4887 *marker = shard_id_str;
4888 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4889 marker->append(shard_marker);
4890 }
4891 }
4892
4893 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4894 {
4895 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
4896
4897 if (!explicit_pool.empty()) {
4898 return open_pool_ctx(explicit_pool, index_ctx);
4899 }
4900
4901 const string *rule = &bucket_info.placement_rule;
4902 if (rule->empty()) {
4903 rule = &zonegroup.default_placement;
4904 }
4905 auto iter = zone_params.placement_pools.find(*rule);
4906 if (iter == zone_params.placement_pools.end()) {
4907 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4908 return -EINVAL;
4909 }
4910
4911 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4912 if (r < 0)
4913 return r;
4914
4915 return 0;
4916 }
4917
4918 /**
4919 * set up a bucket listing.
4920 * handle is filled in.
4921 * Returns 0 on success, -ERR# otherwise.
4922 */
4923 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4924 {
4925 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4926 *handle = (RGWAccessHandle)state;
4927 return 0;
4928 }
4929
4930 /**
4931 * get the next bucket in the listing.
4932 * obj is filled in,
4933 * handle is updated.
4934 * returns 0 on success, -ERR# otherwise.
4935 */
4936 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4937 {
4938 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4939
4940 do {
4941 if (*state == root_pool_ctx.nobjects_end()) {
4942 delete state;
4943 return -ENOENT;
4944 }
4945
4946 obj.key.name = (*state)->get_oid();
4947 if (obj.key.name[0] == '_') {
4948 obj.key.name = obj.key.name.substr(1);
4949 }
4950
4951 (*state)++;
4952 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4953
4954 return 0;
4955 }
4956
4957
4958 /**** logs ****/
4959
4960 struct log_list_state {
4961 string prefix;
4962 librados::IoCtx io_ctx;
4963 librados::NObjectIterator obit;
4964 };
4965
4966 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4967 {
4968 log_list_state *state = new log_list_state;
4969 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4970 if (r < 0) {
4971 delete state;
4972 return r;
4973 }
4974 state->prefix = prefix;
4975 state->obit = state->io_ctx.nobjects_begin();
4976 *handle = (RGWAccessHandle)state;
4977 return 0;
4978 }
4979
4980 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4981 {
4982 log_list_state *state = static_cast<log_list_state *>(handle);
4983 while (true) {
4984 if (state->obit == state->io_ctx.nobjects_end()) {
4985 delete state;
4986 return -ENOENT;
4987 }
4988 if (state->prefix.length() &&
4989 state->obit->get_oid().find(state->prefix) != 0) {
4990 state->obit++;
4991 continue;
4992 }
4993 *name = state->obit->get_oid();
4994 state->obit++;
4995 break;
4996 }
4997 return 0;
4998 }
4999
5000 int RGWRados::log_remove(const string& name)
5001 {
5002 librados::IoCtx io_ctx;
5003 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5004 if (r < 0)
5005 return r;
5006 return io_ctx.remove(name);
5007 }
5008
5009 struct log_show_state {
5010 librados::IoCtx io_ctx;
5011 bufferlist bl;
5012 bufferlist::iterator p;
5013 string name;
5014 uint64_t pos;
5015 bool eof;
5016 log_show_state() : pos(0), eof(false) {}
5017 };
5018
5019 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
5020 {
5021 log_show_state *state = new log_show_state;
5022 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5023 if (r < 0) {
5024 delete state;
5025 return r;
5026 }
5027 state->name = name;
5028 *handle = (RGWAccessHandle)state;
5029 return 0;
5030 }
5031
5032 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
5033 {
5034 log_show_state *state = static_cast<log_show_state *>(handle);
5035 off_t off = state->p.get_off();
5036
5037 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
5038 << " off " << off
5039 << " eof " << (int)state->eof
5040 << dendl;
5041 // read some?
5042 unsigned chunk = 1024*1024;
5043 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
5044 bufferlist more;
5045 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
5046 if (r < 0)
5047 return r;
5048 state->pos += r;
5049 bufferlist old;
5050 try {
5051 old.substr_of(state->bl, off, state->bl.length() - off);
5052 } catch (buffer::error& err) {
5053 return -EINVAL;
5054 }
5055 state->bl.clear();
5056 state->bl.claim(old);
5057 state->bl.claim_append(more);
5058 state->p = state->bl.begin();
5059 if ((unsigned)r < chunk)
5060 state->eof = true;
5061 ldout(cct, 10) << " read " << r << dendl;
5062 }
5063
5064 if (state->p.end())
5065 return 0; // end of file
5066 try {
5067 ::decode(*entry, state->p);
5068 }
5069 catch (const buffer::error &e) {
5070 return -EINVAL;
5071 }
5072 return 1;
5073 }
5074
5075 /**
5076 * usage_log_hash: get usage log key hash, based on name and index
5077 *
5078 * Get the usage object name. Since a user may have more than 1
5079 * object holding that info (multiple shards), we use index to
5080 * specify that shard number. Once index exceeds max shards it
5081 * wraps.
5082 * If name is not being set, results for all users will be returned
5083 * and index will wrap only after total shards number.
5084 *
5085 * @param cct [in] ceph context
5086 * @param name [in] user name
5087 * @param hash [out] hash value
5088 * @param index [in] shard index number
5089 */
5090 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5091 {
5092 uint32_t val = index;
5093
5094 if (!name.empty()) {
5095 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
5096 val %= max_user_shards;
5097 val += ceph_str_hash_linux(name.c_str(), name.size());
5098 }
5099 char buf[17];
5100 int max_shards = cct->_conf->rgw_usage_max_shards;
5101 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5102 hash = buf;
5103 }
5104
5105 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5106 {
5107 uint32_t index = 0;
5108
5109 map<string, rgw_usage_log_info> log_objs;
5110
5111 string hash;
5112 string last_user;
5113
5114 /* restructure usage map, zone by object hash */
5115 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5116 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5117 const rgw_user_bucket& ub = iter->first;
5118 RGWUsageBatch& info = iter->second;
5119
5120 if (ub.user.empty()) {
5121 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5122 continue;
5123 }
5124
5125 if (ub.user != last_user) {
5126 /* index *should* be random, but why waste extra cycles
5127 in most cases max user shards is not going to exceed 1,
5128 so just incrementing it */
5129 usage_log_hash(cct, ub.user, hash, index++);
5130 }
5131 last_user = ub.user;
5132 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5133
5134 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5135 v.push_back(miter->second);
5136 }
5137 }
5138
5139 map<string, rgw_usage_log_info>::iterator liter;
5140
5141 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5142 int r = cls_obj_usage_log_add(liter->first, liter->second);
5143 if (r < 0)
5144 return r;
5145 }
5146 return 0;
5147 }
5148
5149 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5150 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5151 {
5152 uint32_t num = max_entries;
5153 string hash, first_hash;
5154 string user_str = user.to_str();
5155 usage_log_hash(cct, user_str, first_hash, 0);
5156
5157 if (usage_iter.index) {
5158 usage_log_hash(cct, user_str, hash, usage_iter.index);
5159 } else {
5160 hash = first_hash;
5161 }
5162
5163 usage.clear();
5164
5165 do {
5166 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5167 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5168
5169 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5170 usage_iter.read_iter, ret_usage, is_truncated);
5171 if (ret == -ENOENT)
5172 goto next;
5173
5174 if (ret < 0)
5175 return ret;
5176
5177 num -= ret_usage.size();
5178
5179 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5180 usage[iter->first].aggregate(iter->second);
5181 }
5182
5183 next:
5184 if (!*is_truncated) {
5185 usage_iter.read_iter.clear();
5186 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5187 }
5188 } while (num && !*is_truncated && hash != first_hash);
5189 return 0;
5190 }
5191
5192 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5193 {
5194 uint32_t index = 0;
5195 string hash, first_hash;
5196 string user_str = user.to_str();
5197 usage_log_hash(cct, user_str, first_hash, index);
5198
5199 hash = first_hash;
5200 do {
5201 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5202
5203 if (ret < 0 && ret != -ENOENT)
5204 return ret;
5205
5206 usage_log_hash(cct, user_str, hash, ++index);
5207 } while (hash != first_hash);
5208
5209 return 0;
5210 }
5211
5212 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5213 {
5214 return rgw_shard_id(key, max_shards);
5215 }
5216
5217 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5218 {
5219 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5220 char buf[16];
5221 if (shard_id) {
5222 *shard_id = val % max_shards;
5223 }
5224 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5225 name = prefix + buf;
5226 }
5227
5228 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5229 {
5230 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5231 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5232 char buf[16];
5233 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5234 name = prefix + buf;
5235 }
5236
5237 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5238 {
5239 char buf[16];
5240 snprintf(buf, sizeof(buf), "%u", shard_id);
5241 name = prefix + buf;
5242
5243 }
5244
5245 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5246 {
5247 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5248 }
5249
5250 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5251 {
5252 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5253
5254 }
5255
5256 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5257 {
5258 librados::IoCtx io_ctx;
5259
5260 int r = time_log_add_init(io_ctx);
5261 if (r < 0) {
5262 return r;
5263 }
5264
5265 ObjectWriteOperation op;
5266 utime_t t(ut);
5267 cls_log_add(op, t, section, key, bl);
5268
5269 return io_ctx.operate(oid, &op);
5270 }
5271
5272 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5273 librados::AioCompletion *completion, bool monotonic_inc)
5274 {
5275 librados::IoCtx io_ctx;
5276
5277 int r = time_log_add_init(io_ctx);
5278 if (r < 0) {
5279 return r;
5280 }
5281
5282 ObjectWriteOperation op;
5283 cls_log_add(op, entries, monotonic_inc);
5284
5285 if (!completion) {
5286 r = io_ctx.operate(oid, &op);
5287 } else {
5288 r = io_ctx.aio_operate(oid, completion, &op);
5289 }
5290 return r;
5291 }
5292
5293 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5294 int max_entries, list<cls_log_entry>& entries,
5295 const string& marker,
5296 string *out_marker,
5297 bool *truncated)
5298 {
5299 librados::IoCtx io_ctx;
5300
5301 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5302 if (r < 0)
5303 return r;
5304 librados::ObjectReadOperation op;
5305
5306 utime_t st(start_time);
5307 utime_t et(end_time);
5308
5309 cls_log_list(op, st, et, marker, max_entries, entries,
5310 out_marker, truncated);
5311
5312 bufferlist obl;
5313
5314 int ret = io_ctx.operate(oid, &op, &obl);
5315 if (ret < 0)
5316 return ret;
5317
5318 return 0;
5319 }
5320
5321 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5322 {
5323 librados::IoCtx io_ctx;
5324
5325 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5326 if (r < 0)
5327 return r;
5328 librados::ObjectReadOperation op;
5329
5330 cls_log_info(op, header);
5331
5332 bufferlist obl;
5333
5334 int ret = io_ctx.operate(oid, &op, &obl);
5335 if (ret < 0)
5336 return ret;
5337
5338 return 0;
5339 }
5340
5341 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5342 {
5343 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5344 if (r < 0)
5345 return r;
5346
5347 librados::ObjectReadOperation op;
5348
5349 cls_log_info(op, header);
5350
5351 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5352 if (ret < 0)
5353 return ret;
5354
5355 return 0;
5356 }
5357
5358 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5359 const string& from_marker, const string& to_marker,
5360 librados::AioCompletion *completion)
5361 {
5362 librados::IoCtx io_ctx;
5363
5364 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5365 if (r < 0)
5366 return r;
5367
5368 utime_t st(start_time);
5369 utime_t et(end_time);
5370
5371 ObjectWriteOperation op;
5372 cls_log_trim(op, st, et, from_marker, to_marker);
5373
5374 if (!completion) {
5375 r = io_ctx.operate(oid, &op);
5376 } else {
5377 r = io_ctx.aio_operate(oid, completion, &op);
5378 }
5379 return r;
5380 }
5381
5382 string RGWRados::objexp_hint_get_shardname(int shard_num)
5383 {
5384 char buf[32];
5385 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5386
5387 string objname("obj_delete_at_hint.");
5388 return objname + buf;
5389 }
5390
5391 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5392 {
5393 string obj_key = key.name + key.instance;
5394 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5395 return rgw_bucket_shard_index(obj_key, num_shards);
5396 }
5397
5398 static string objexp_hint_get_keyext(const string& tenant_name,
5399 const string& bucket_name,
5400 const string& bucket_id,
5401 const rgw_obj_key& obj_key)
5402 {
5403 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5404 ":" + obj_key.name + ":" + obj_key.instance;
5405 }
5406
5407 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5408 const string& tenant_name,
5409 const string& bucket_name,
5410 const string& bucket_id,
5411 const rgw_obj_index_key& obj_key)
5412 {
5413 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5414 bucket_id, obj_key);
5415 objexp_hint_entry he = {
5416 .tenant = tenant_name,
5417 .bucket_name = bucket_name,
5418 .bucket_id = bucket_id,
5419 .obj_key = obj_key,
5420 .exp_time = delete_at };
5421 bufferlist hebl;
5422 ::encode(he, hebl);
5423 ObjectWriteOperation op;
5424 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5425
5426 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5427 return objexp_pool_ctx.operate(shard_name, &op);
5428 }
5429
5430 void RGWRados::objexp_get_shard(int shard_num,
5431 string& shard) /* out */
5432 {
5433 shard = objexp_hint_get_shardname(shard_num);
5434 }
5435
5436 int RGWRados::objexp_hint_list(const string& oid,
5437 const ceph::real_time& start_time,
5438 const ceph::real_time& end_time,
5439 const int max_entries,
5440 const string& marker,
5441 list<cls_timeindex_entry>& entries, /* out */
5442 string *out_marker, /* out */
5443 bool *truncated) /* out */
5444 {
5445 librados::ObjectReadOperation op;
5446 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5447 out_marker, truncated);
5448
5449 bufferlist obl;
5450 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5451
5452 if ((ret < 0 ) && (ret != -ENOENT)) {
5453 return ret;
5454 }
5455
5456 if ((ret == -ENOENT) && truncated) {
5457 *truncated = false;
5458 }
5459
5460 return 0;
5461 }
5462
5463 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5464 objexp_hint_entry& hint_entry) /* out */
5465 {
5466 try {
5467 bufferlist::iterator iter = ti_entry.value.begin();
5468 ::decode(hint_entry, iter);
5469 } catch (buffer::error& err) {
5470 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5471 }
5472
5473 return 0;
5474 }
5475
5476 int RGWRados::objexp_hint_trim(const string& oid,
5477 const ceph::real_time& start_time,
5478 const ceph::real_time& end_time,
5479 const string& from_marker,
5480 const string& to_marker)
5481 {
5482 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5483 from_marker, to_marker);
5484 if ((ret < 0 ) && (ret != -ENOENT)) {
5485 return ret;
5486 }
5487
5488 return 0;
5489 }
5490
5491 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5492 string& zone_id, string& owner_id) {
5493 librados::IoCtx io_ctx;
5494
5495 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5496 if (r < 0) {
5497 return r;
5498 }
5499 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5500 utime_t ut(msec / 1000, msec % 1000);
5501
5502 rados::cls::lock::Lock l(log_lock_name);
5503 l.set_duration(ut);
5504 l.set_cookie(owner_id);
5505 l.set_tag(zone_id);
5506 l.set_renew(true);
5507
5508 return l.lock_exclusive(&io_ctx, oid);
5509 }
5510
5511 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5512 librados::IoCtx io_ctx;
5513
5514 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5515 if (r < 0) {
5516 return r;
5517 }
5518
5519 rados::cls::lock::Lock l(log_lock_name);
5520 l.set_tag(zone_id);
5521 l.set_cookie(owner_id);
5522
5523 return l.unlock(&io_ctx, oid);
5524 }
5525
5526 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5527 {
5528 bufferlist::iterator i = bl.begin();
5529 RGWAccessControlPolicy policy(cct);
5530 try {
5531 policy.decode_owner(i);
5532 } catch (buffer::error& err) {
5533 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5534 return -EIO;
5535 }
5536 *owner = policy.get_owner();
5537 return 0;
5538 }
5539
5540 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5541 {
5542 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5543 if (aiter == attrset.end())
5544 return -EIO;
5545
5546 bufferlist& bl = aiter->second;
5547 bufferlist::iterator iter = bl.begin();
5548 try {
5549 policy->decode(iter);
5550 } catch (buffer::error& err) {
5551 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5552 return -EIO;
5553 }
5554 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5555 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5556 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5557 s3policy->to_xml(*_dout);
5558 *_dout << dendl;
5559 }
5560 return 0;
5561 }
5562
5563
5564 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5565 {
5566 rgw_bucket bucket = bucket_info.bucket;
5567 bucket.update_bucket_id(new_bucket_id);
5568
5569 RGWObjectCtx obj_ctx(store);
5570
5571 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5572 if (ret < 0) {
5573 return ret;
5574 }
5575
5576 return 0;
5577 }
5578
5579
5580 /**
5581 * Get ordered listing of the objects in a bucket.
5582 *
5583 * max: maximum number of results to return
5584 * bucket: bucket to list contents of
5585 * prefix: only return results that match this prefix
5586 * delim: do not include results that match this string.
5587 * Any skipped results will have the matching portion of their name
5588 * inserted in common_prefixes with a "true" mark.
5589 * marker: if filled in, begin the listing with this object.
5590 * end_marker: if filled in, end the listing with this object.
5591 * result: the objects are put in here.
5592 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5593 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5594 */
5595 int RGWRados::Bucket::List::list_objects_ordered(int64_t max,
5596 vector<rgw_bucket_dir_entry> *result,
5597 map<string, bool> *common_prefixes,
5598 bool *is_truncated)
5599 {
5600 RGWRados *store = target->get_store();
5601 CephContext *cct = store->ctx();
5602 int shard_id = target->get_shard_id();
5603
5604 int count = 0;
5605 bool truncated = true;
5606 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5607
5608 result->clear();
5609
5610 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5611 rgw_obj_index_key cur_marker;
5612 marker_obj.get_index_key(&cur_marker);
5613
5614 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5615 params.ns);
5616 rgw_obj_index_key cur_end_marker;
5617 end_marker_obj.get_index_key(&cur_end_marker);
5618 const bool cur_end_marker_valid = !params.end_marker.empty();
5619
5620 rgw_obj_key prefix_obj(params.prefix);
5621 prefix_obj.ns = params.ns;
5622 string cur_prefix = prefix_obj.get_index_key_name();
5623
5624 string bigger_than_delim;
5625
5626 if (!params.delim.empty()) {
5627 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(),
5628 params.delim.size());
5629 char buf[params.delim.size() + 16];
5630 int r = encode_utf8(val + 1, (unsigned char *)buf);
5631 if (r < 0) {
5632 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5633 return -EINVAL;
5634 }
5635 buf[r] = '\0';
5636
5637 bigger_than_delim = buf;
5638
5639 /* if marker points at a common prefix, fast forward it into its upperbound string */
5640 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5641 if (delim_pos >= 0) {
5642 string s = cur_marker.name.substr(0, delim_pos);
5643 s.append(bigger_than_delim);
5644 cur_marker = s;
5645 }
5646 }
5647
5648 string skip_after_delim;
5649 while (truncated && count <= max) {
5650 if (skip_after_delim > cur_marker.name) {
5651 cur_marker = skip_after_delim;
5652 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5653 }
5654 std::map<string, rgw_bucket_dir_entry> ent_map;
5655 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
5656 shard_id,
5657 cur_marker,
5658 cur_prefix,
5659 read_ahead + 1 - count,
5660 params.list_versions,
5661 ent_map,
5662 &truncated,
5663 &cur_marker);
5664 if (r < 0)
5665 return r;
5666
5667 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5668 rgw_bucket_dir_entry& entry = eiter->second;
5669 rgw_obj_index_key index_key = entry.key;
5670
5671 rgw_obj_key obj(index_key);
5672
5673 /* note that parse_raw_oid() here will not set the correct
5674 * object's instance, as rgw_obj_index_key encodes that
5675 * separately. We don't need to set the instance because it's
5676 * not needed for the checks here and we end up using the raw
5677 * entry for the return vector
5678 */
5679 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5680 if (!valid) {
5681 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5682 continue;
5683 }
5684 bool check_ns = (obj.ns == params.ns);
5685 if (!params.list_versions && !entry.is_visible()) {
5686 continue;
5687 }
5688
5689 if (params.enforce_ns && !check_ns) {
5690 if (!params.ns.empty()) {
5691 /* we've iterated past the namespace we're searching -- done now */
5692 truncated = false;
5693 goto done;
5694 }
5695
5696 /* we're not looking at the namespace this object is in, next! */
5697 continue;
5698 }
5699
5700 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5701 truncated = false;
5702 goto done;
5703 }
5704
5705 if (count < max) {
5706 params.marker = index_key;
5707 next_marker = index_key;
5708 }
5709
5710 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5711 continue;
5712
5713 if (params.prefix.size() &&
5714 (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5715 continue;
5716
5717 if (!params.delim.empty()) {
5718 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5719
5720 if (delim_pos >= 0) {
5721 string prefix_key = obj.name.substr(0, delim_pos + 1);
5722
5723 if (common_prefixes &&
5724 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5725 if (count >= max) {
5726 truncated = true;
5727 goto done;
5728 }
5729 next_marker = prefix_key;
5730 (*common_prefixes)[prefix_key] = true;
5731
5732 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5733
5734 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
5735 skip_after_delim.append(bigger_than_delim);
5736
5737 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5738
5739 count++;
5740 }
5741
5742 continue;
5743 }
5744 }
5745
5746 if (count >= max) {
5747 truncated = true;
5748 goto done;
5749 }
5750
5751 result->emplace_back(std::move(entry));
5752 count++;
5753 }
5754 }
5755
5756 done:
5757 if (is_truncated)
5758 *is_truncated = truncated;
5759
5760 return 0;
5761 } // list_objects_ordered
5762
5763
5764 /**
5765 * Get listing of the objects in a bucket and allow the results to be out
5766 * of order.
5767 *
5768 * Even though there are key differences with the ordered counterpart,
5769 * the parameters are the same to maintain some compatability.
5770 *
5771 * max: maximum number of results to return
5772 * bucket: bucket to list contents of
5773 * prefix: only return results that match this prefix
5774 * delim: should not be set; if it is we should have indicated an error
5775 * marker: if filled in, begin the listing with this object.
5776 * end_marker: if filled in, end the listing with this object.
5777 * result: the objects are put in here.
5778 * common_prefixes: this is never filled with an unordered list; the param
5779 * is maintained for compatibility
5780 * is_truncated: if number of objects in the bucket is bigger than max, then
5781 * truncated.
5782 */
5783 int RGWRados::Bucket::List::list_objects_unordered(int64_t max,
5784 vector<rgw_bucket_dir_entry> *result,
5785 map<string, bool> *common_prefixes,
5786 bool *is_truncated)
5787 {
5788 RGWRados *store = target->get_store();
5789 CephContext *cct = store->ctx();
5790 int shard_id = target->get_shard_id();
5791
5792 int count = 0;
5793 bool truncated = true;
5794
5795 // read a few extra in each call to cls_bucket_list_unordered in
5796 // case some are filtered out due to namespace matching, versioning,
5797 // filtering, etc.
5798 const int64_t max_read_ahead = 100;
5799 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
5800
5801 result->clear();
5802
5803 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5804 rgw_obj_index_key cur_marker;
5805 marker_obj.get_index_key(&cur_marker);
5806
5807 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5808 params.ns);
5809 rgw_obj_index_key cur_end_marker;
5810 end_marker_obj.get_index_key(&cur_end_marker);
5811 const bool cur_end_marker_valid = !params.end_marker.empty();
5812
5813 rgw_obj_key prefix_obj(params.prefix);
5814 prefix_obj.ns = params.ns;
5815 string cur_prefix = prefix_obj.get_index_key_name();
5816
5817 while (truncated && count <= max) {
5818 std::vector<rgw_bucket_dir_entry> ent_list;
5819 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
5820 shard_id,
5821 cur_marker,
5822 cur_prefix,
5823 read_ahead,
5824 params.list_versions,
5825 ent_list,
5826 &truncated,
5827 &cur_marker);
5828 if (r < 0)
5829 return r;
5830
5831 // NB: while regions of ent_list will be sorted, we have no
5832 // guarantee that all items will be sorted since they can cross
5833 // shard boundaries
5834
5835 for (auto& entry : ent_list) {
5836 rgw_obj_index_key index_key = entry.key;
5837 rgw_obj_key obj(index_key);
5838
5839 /* note that parse_raw_oid() here will not set the correct
5840 * object's instance, as rgw_obj_index_key encodes that
5841 * separately. We don't need to set the instance because it's
5842 * not needed for the checks here and we end up using the raw
5843 * entry for the return vector
5844 */
5845 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5846 if (!valid) {
5847 ldout(cct, 0) << "ERROR: could not parse object name: " <<
5848 obj.name << dendl;
5849 continue;
5850 }
5851
5852 if (!params.list_versions && !entry.is_visible()) {
5853 continue;
5854 }
5855
5856 if (params.enforce_ns && obj.ns != params.ns) {
5857 continue;
5858 }
5859
5860 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5861 // we're not guaranteed items will come in order, so we have
5862 // to loop through all
5863 continue;
5864 }
5865
5866 if (count < max) {
5867 params.marker = index_key;
5868 next_marker = index_key;
5869 }
5870
5871 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5872 continue;
5873
5874 if (params.prefix.size() &&
5875 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
5876 continue;
5877
5878 if (count >= max) {
5879 truncated = true;
5880 goto done;
5881 }
5882
5883 result->emplace_back(std::move(entry));
5884 count++;
5885 } // for (auto& entry : ent_list)
5886 } // while (truncated && count <= max)
5887
5888 done:
5889 if (is_truncated)
5890 *is_truncated = truncated;
5891
5892 return 0;
5893 } // list_objects_unordered
5894
5895
5896 /**
5897 * create a rados pool, associated meta info
5898 * returns 0 on success, -ERR# otherwise.
5899 */
5900 int RGWRados::create_pool(const rgw_pool& pool)
5901 {
5902 librados::IoCtx io_ctx;
5903 constexpr bool create = true;
5904 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
5905 }
5906
5907 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5908 {
5909 librados::IoCtx index_ctx; // context for new bucket
5910
5911 string dir_oid = dir_oid_prefix;
5912 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5913 if (r < 0) {
5914 return r;
5915 }
5916
5917 dir_oid.append(bucket_info.bucket.bucket_id);
5918
5919 map<int, string> bucket_objs;
5920 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5921
5922 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5923 }
5924
5925 void RGWRados::create_bucket_id(string *bucket_id)
5926 {
5927 uint64_t iid = instance_id();
5928 uint64_t bid = next_bucket_id();
5929 char buf[get_zone_params().get_id().size() + 48];
5930 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5931 *bucket_id = buf;
5932 }
5933
5934 int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5935 const string& zonegroup_id,
5936 const string& placement_rule,
5937 const string& swift_ver_location,
5938 const RGWQuotaInfo * pquota_info,
5939 map<std::string, bufferlist>& attrs,
5940 RGWBucketInfo& info,
5941 obj_version *pobjv,
5942 obj_version *pep_objv,
5943 real_time creation_time,
5944 rgw_bucket *pmaster_bucket,
5945 uint32_t *pmaster_num_shards,
5946 bool exclusive)
5947 {
5948 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5949 string selected_placement_rule_name;
5950 RGWZonePlacementInfo rule_info;
5951
5952 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5953 int ret = 0;
5954 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5955 &selected_placement_rule_name, &rule_info);
5956 if (ret < 0)
5957 return ret;
5958
5959 if (!pmaster_bucket) {
5960 create_bucket_id(&bucket.marker);
5961 bucket.bucket_id = bucket.marker;
5962 } else {
5963 bucket.marker = pmaster_bucket->marker;
5964 bucket.bucket_id = pmaster_bucket->bucket_id;
5965 }
5966
5967 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5968
5969 if (pobjv) {
5970 objv_tracker.write_version = *pobjv;
5971 } else {
5972 objv_tracker.generate_new_write_ver(cct);
5973 }
5974
5975 info.bucket = bucket;
5976 info.owner = owner.user_id;
5977 info.zonegroup = zonegroup_id;
5978 info.placement_rule = selected_placement_rule_name;
5979 info.index_type = rule_info.index_type;
5980 info.swift_ver_location = swift_ver_location;
5981 info.swift_versioning = (!swift_ver_location.empty());
5982 if (pmaster_num_shards) {
5983 info.num_shards = *pmaster_num_shards;
5984 } else {
5985 info.num_shards = bucket_index_max_shards;
5986 }
5987 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5988 info.requester_pays = false;
5989 if (real_clock::is_zero(creation_time)) {
5990 info.creation_time = ceph::real_clock::now();
5991 } else {
5992 info.creation_time = creation_time;
5993 }
5994 if (pquota_info) {
5995 info.quota = *pquota_info;
5996 }
5997
5998 int r = init_bucket_index(info, info.num_shards);
5999 if (r < 0) {
6000 return r;
6001 }
6002
6003 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
6004 if (ret == -EEXIST) {
6005 librados::IoCtx index_ctx;
6006 map<int, string> bucket_objs;
6007 int r = open_bucket_index(info, index_ctx, bucket_objs);
6008 if (r < 0)
6009 return r;
6010
6011 /* we need to reread the info and return it, caller will have a use for it */
6012 RGWObjVersionTracker instance_ver = info.objv_tracker;
6013 info.objv_tracker.clear();
6014 RGWObjectCtx obj_ctx(this);
6015 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
6016 if (r < 0) {
6017 if (r == -ENOENT) {
6018 continue;
6019 }
6020 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
6021 return r;
6022 }
6023
6024 /* only remove it if it's a different bucket instance */
6025 if (info.bucket.bucket_id != bucket.bucket_id) {
6026 /* remove bucket meta instance */
6027 string entry = bucket.get_key();
6028 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
6029 if (r < 0)
6030 return r;
6031
6032 map<int, string>::const_iterator biter;
6033 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
6034 // Do best effort removal
6035 index_ctx.remove(biter->second);
6036 }
6037 }
6038 /* ret == -ENOENT here */
6039 }
6040 return ret;
6041 }
6042
6043 /* this is highly unlikely */
6044 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
6045 return -ENOENT;
6046 }
6047
6048 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
6049 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6050
6051 {
6052 /* first check that zonegroup exists within current period. */
6053 RGWZoneGroup zonegroup;
6054 int ret = get_zonegroup(zonegroup_id, zonegroup);
6055 if (ret < 0) {
6056 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
6057 return ret;
6058 }
6059
6060 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
6061 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
6062
6063 if (!request_rule.empty()) {
6064 titer = zonegroup.placement_targets.find(request_rule);
6065 if (titer == zonegroup.placement_targets.end()) {
6066 ldout(cct, 0) << "could not find requested placement id " << request_rule
6067 << " within zonegroup " << dendl;
6068 return -ERR_INVALID_LOCATION_CONSTRAINT;
6069 }
6070 } else if (!user_info.default_placement.empty()) {
6071 titer = zonegroup.placement_targets.find(user_info.default_placement);
6072 if (titer == zonegroup.placement_targets.end()) {
6073 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
6074 << " within zonegroup " << dendl;
6075 return -ERR_INVALID_LOCATION_CONSTRAINT;
6076 }
6077 } else {
6078 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
6079 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
6080 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
6081 } else {
6082 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
6083 if (titer == zonegroup.placement_targets.end()) {
6084 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
6085 << " within zonegroup " << dendl;
6086 return -ERR_INVALID_LOCATION_CONSTRAINT;
6087 }
6088 }
6089 }
6090
6091 /* now check tag for the rule, whether user is permitted to use rule */
6092 const auto& target_rule = titer->second;
6093 if (!target_rule.user_permitted(user_info.placement_tags)) {
6094 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
6095 return -EPERM;
6096 }
6097
6098 if (pselected_rule_name)
6099 *pselected_rule_name = titer->first;
6100
6101 return select_bucket_location_by_rule(titer->first, rule_info);
6102 }
6103
6104 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
6105 {
6106 if (location_rule.empty()) {
6107 /* we can only reach here if we're trying to set a bucket location from a bucket
6108 * created on a different zone, using a legacy / default pool configuration
6109 */
6110 return select_legacy_bucket_placement(rule_info);
6111 }
6112
6113 /*
6114 * make sure that zone has this rule configured. We're
6115 * checking it for the local zone, because that's where this bucket object is going to
6116 * reside.
6117 */
6118 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
6119 if (piter == get_zone_params().placement_pools.end()) {
6120 /* couldn't find, means we cannot really place data for this bucket in this zone */
6121 if (get_zonegroup().equals(zonegroup.get_id())) {
6122 /* that's a configuration error, zone should have that rule, as we're within the requested
6123 * zonegroup */
6124 return -EINVAL;
6125 } else {
6126 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
6127 return 0;
6128 }
6129 }
6130
6131 RGWZonePlacementInfo& placement_info = piter->second;
6132
6133 if (rule_info) {
6134 *rule_info = placement_info;
6135 }
6136
6137 return 0;
6138 }
6139
6140 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
6141 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6142 {
6143 if (!get_zone_params().placement_pools.empty()) {
6144 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
6145 pselected_rule_name, rule_info);
6146 }
6147
6148 if (pselected_rule_name) {
6149 pselected_rule_name->clear();
6150 }
6151
6152 return select_legacy_bucket_placement(rule_info);
6153 }
6154
6155 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
6156 {
6157 bufferlist map_bl;
6158 map<string, bufferlist> m;
6159 string pool_name;
6160 bool write_map = false;
6161
6162 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6163
6164 RGWObjectCtx obj_ctx(this);
6165 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6166 if (ret < 0) {
6167 goto read_omap;
6168 }
6169
6170 try {
6171 bufferlist::iterator iter = map_bl.begin();
6172 ::decode(m, iter);
6173 } catch (buffer::error& err) {
6174 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6175 }
6176
6177 read_omap:
6178 if (m.empty()) {
6179 bufferlist header;
6180 ret = omap_get_all(obj, header, m);
6181
6182 write_map = true;
6183 }
6184
6185 if (ret < 0 || m.empty()) {
6186 vector<rgw_pool> pools;
6187 string s = string("default.") + default_storage_pool_suffix;
6188 pools.push_back(rgw_pool(s));
6189 vector<int> retcodes;
6190 bufferlist bl;
6191 ret = create_pools(pools, retcodes);
6192 if (ret < 0)
6193 return ret;
6194 ret = omap_set(obj, s, bl);
6195 if (ret < 0)
6196 return ret;
6197 m[s] = bl;
6198 }
6199
6200 if (write_map) {
6201 bufferlist new_bl;
6202 ::encode(m, new_bl);
6203 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6204 if (ret < 0) {
6205 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6206 }
6207 }
6208
6209 map<string, bufferlist>::iterator miter;
6210 if (m.size() > 1) {
6211 vector<string> v;
6212 for (miter = m.begin(); miter != m.end(); ++miter) {
6213 v.push_back(miter->first);
6214 }
6215
6216 uint32_t r;
6217 ret = get_random_bytes((char *)&r, sizeof(r));
6218 if (ret < 0)
6219 return ret;
6220
6221 int i = r % v.size();
6222 pool_name = v[i];
6223 } else {
6224 miter = m.begin();
6225 pool_name = miter->first;
6226 }
6227
6228 rule_info->data_pool = pool_name;
6229 rule_info->data_extra_pool = pool_name;
6230 rule_info->index_pool = pool_name;
6231 rule_info->index_type = RGWBIType_Normal;
6232
6233 return 0;
6234 }
6235
6236 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6237 {
6238 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6239 }
6240
6241 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6242 {
6243 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6244
6245 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6246 }
6247
6248 int RGWRados::update_placement_map()
6249 {
6250 bufferlist header;
6251 map<string, bufferlist> m;
6252 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6253 int ret = omap_get_all(obj, header, m);
6254 if (ret < 0)
6255 return ret;
6256
6257 bufferlist new_bl;
6258 ::encode(m, new_bl);
6259 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6260 if (ret < 0) {
6261 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6262 }
6263
6264 return ret;
6265 }
6266
6267 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6268 {
6269 librados::Rados *rad = get_rados_handle();
6270 int ret = rad->pool_lookup(new_pool.name.c_str());
6271 if (ret < 0) // DNE, or something
6272 return ret;
6273
6274 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6275 bufferlist empty_bl;
6276 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6277
6278 // don't care about return value
6279 update_placement_map();
6280
6281 return ret;
6282 }
6283
6284 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6285 {
6286 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6287 int ret = omap_del(obj, old_pool.to_str());
6288
6289 // don't care about return value
6290 update_placement_map();
6291
6292 return ret;
6293 }
6294
6295 int RGWRados::list_placement_set(set<rgw_pool>& names)
6296 {
6297 bufferlist header;
6298 map<string, bufferlist> m;
6299
6300 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6301 int ret = omap_get_all(obj, header, m);
6302 if (ret < 0)
6303 return ret;
6304
6305 names.clear();
6306 map<string, bufferlist>::iterator miter;
6307 for (miter = m.begin(); miter != m.end(); ++miter) {
6308 names.insert(rgw_pool(miter->first));
6309 }
6310
6311 return names.size();
6312 }
6313
6314 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6315 {
6316 vector<librados::PoolAsyncCompletion *> completions;
6317 vector<int> rets;
6318
6319 librados::Rados *rad = get_rados_handle();
6320 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6321 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6322 completions.push_back(c);
6323 rgw_pool& pool = *iter;
6324 int ret = rad->pool_create_async(pool.name.c_str(), c);
6325 rets.push_back(ret);
6326 }
6327
6328 vector<int>::iterator riter;
6329 vector<librados::PoolAsyncCompletion *>::iterator citer;
6330
6331 bool error = false;
6332 assert(rets.size() == completions.size());
6333 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6334 int r = *riter;
6335 PoolAsyncCompletion *c = *citer;
6336 if (r == 0) {
6337 c->wait();
6338 r = c->get_return_value();
6339 if (r < 0) {
6340 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6341 error = true;
6342 }
6343 }
6344 c->release();
6345 retcodes.push_back(r);
6346 }
6347 if (error) {
6348 return 0;
6349 }
6350
6351 std::vector<librados::IoCtx> io_ctxs;
6352 retcodes.clear();
6353 for (auto pool : pools) {
6354 io_ctxs.emplace_back();
6355 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6356 if (ret < 0) {
6357 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6358 error = true;
6359 }
6360 retcodes.push_back(ret);
6361 }
6362 if (error) {
6363 return 0;
6364 }
6365
6366 completions.clear();
6367 for (auto &io_ctx : io_ctxs) {
6368 librados::PoolAsyncCompletion *c =
6369 librados::Rados::pool_async_create_completion();
6370 completions.push_back(c);
6371 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6372 false, c);
6373 assert(ret == 0);
6374 }
6375
6376 retcodes.clear();
6377 for (auto c : completions) {
6378 c->wait();
6379 int ret = c->get_return_value();
6380 if (ret == -EOPNOTSUPP) {
6381 ret = 0;
6382 } else if (ret < 0) {
6383 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6384 << dendl;
6385 error = true;
6386 }
6387 c->release();
6388 retcodes.push_back(ret);
6389 }
6390 return 0;
6391 }
6392
6393 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6394 {
6395 string oid, key;
6396 get_obj_bucket_and_oid_loc(obj, oid, key);
6397
6398 rgw_pool pool;
6399 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6400 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6401 return -EIO;
6402 }
6403
6404 int r = open_pool_ctx(pool, *ioctx);
6405 if (r < 0) {
6406 return r;
6407 }
6408
6409 ioctx->locator_set_key(key);
6410
6411 return 0;
6412 }
6413
6414 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6415 {
6416 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6417
6418 rgw_pool pool;
6419 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6420 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6421 return -EIO;
6422 }
6423
6424 int r = open_pool_ctx(pool, ref->ioctx);
6425 if (r < 0) {
6426 return r;
6427 }
6428
6429 ref->ioctx.locator_set_key(ref->key);
6430
6431 return 0;
6432 }
6433
6434 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6435 {
6436 ref->oid = obj.oid;
6437 ref->key = obj.loc;
6438
6439 int r;
6440
6441 if (ref->oid.empty()) {
6442 ref->oid = obj.pool.to_str();
6443 ref->pool = get_zone_params().domain_root;
6444 } else {
6445 ref->pool = obj.pool;
6446 }
6447 r = open_pool_ctx(ref->pool, ref->ioctx);
6448 if (r < 0)
6449 return r;
6450
6451 ref->ioctx.locator_set_key(ref->key);
6452
6453 return 0;
6454 }
6455
6456 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6457 {
6458 return get_raw_obj_ref(obj, ref);
6459 }
6460
6461 /*
6462 * fixes an issue where head objects were supposed to have a locator created, but ended
6463 * up without one
6464 */
6465 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6466 {
6467 const rgw_bucket& bucket = bucket_info.bucket;
6468 string oid;
6469 string locator;
6470
6471 rgw_obj obj(bucket, key);
6472
6473 get_obj_bucket_and_oid_loc(obj, oid, locator);
6474
6475 if (locator.empty()) {
6476 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6477 return 0;
6478 }
6479
6480 librados::IoCtx ioctx;
6481
6482 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6483 if (ret < 0) {
6484 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6485 return ret;
6486 }
6487 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6488
6489 uint64_t size;
6490 bufferlist data;
6491
6492 struct timespec mtime_ts;
6493 map<string, bufferlist> attrs;
6494 librados::ObjectReadOperation op;
6495 op.getxattrs(&attrs, NULL);
6496 op.stat2(&size, &mtime_ts, NULL);
6497 #define HEAD_SIZE 512 * 1024
6498 op.read(0, HEAD_SIZE, &data, NULL);
6499
6500 ret = ioctx.operate(oid, &op, NULL);
6501 if (ret < 0) {
6502 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6503 return ret;
6504 }
6505
6506 if (size > HEAD_SIZE) {
6507 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6508 return -EIO;
6509 }
6510
6511 if (size != data.length()) {
6512 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6513 return -EIO;
6514 }
6515
6516 if (copy_obj) {
6517 librados::ObjectWriteOperation wop;
6518
6519 wop.mtime2(&mtime_ts);
6520
6521 map<string, bufferlist>::iterator iter;
6522 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6523 wop.setxattr(iter->first.c_str(), iter->second);
6524 }
6525
6526 wop.write(0, data);
6527
6528 ioctx.locator_set_key(locator);
6529 ioctx.operate(oid, &wop);
6530 }
6531
6532 if (remove_bad) {
6533 ioctx.locator_set_key(string());
6534
6535 ret = ioctx.remove(oid);
6536 if (ret < 0) {
6537 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6538 return ret;
6539 }
6540 }
6541
6542 return 0;
6543 }
6544
6545 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6546 const string& src_oid, const string& src_locator,
6547 librados::IoCtx& dst_ioctx,
6548 const string& dst_oid, const string& dst_locator)
6549 {
6550
6551 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6552 bool done = false;
6553 uint64_t chunk_size = COPY_BUF_SIZE;
6554 uint64_t ofs = 0;
6555 int ret = 0;
6556 real_time mtime;
6557 struct timespec mtime_ts;
6558 uint64_t size;
6559
6560 if (src_oid == dst_oid && src_locator == dst_locator) {
6561 return 0;
6562 }
6563
6564 src_ioctx.locator_set_key(src_locator);
6565 dst_ioctx.locator_set_key(dst_locator);
6566
6567 do {
6568 bufferlist data;
6569 ObjectReadOperation rop;
6570 ObjectWriteOperation wop;
6571
6572 if (ofs == 0) {
6573 rop.stat2(&size, &mtime_ts, NULL);
6574 mtime = real_clock::from_timespec(mtime_ts);
6575 }
6576 rop.read(ofs, chunk_size, &data, NULL);
6577 ret = src_ioctx.operate(src_oid, &rop, NULL);
6578 if (ret < 0) {
6579 goto done_err;
6580 }
6581
6582 if (data.length() == 0) {
6583 break;
6584 }
6585
6586 if (ofs == 0) {
6587 wop.create(true); /* make it exclusive */
6588 wop.mtime2(&mtime_ts);
6589 mtime = real_clock::from_timespec(mtime_ts);
6590 }
6591 wop.write(ofs, data);
6592 ret = dst_ioctx.operate(dst_oid, &wop);
6593 ofs += data.length();
6594 done = data.length() != chunk_size;
6595 } while (!done);
6596
6597 if (ofs != size) {
6598 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6599 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6600 ret = -EIO;
6601 goto done_err;
6602 }
6603
6604 src_ioctx.remove(src_oid);
6605
6606 return 0;
6607
6608 done_err:
6609 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6610 return ret;
6611 }
6612
6613 /*
6614 * fixes an issue where head objects were supposed to have a locator created, but ended
6615 * up without one
6616 */
6617 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6618 {
6619 const rgw_bucket& bucket = bucket_info.bucket;
6620 rgw_obj obj(bucket, key);
6621
6622 if (need_fix) {
6623 *need_fix = false;
6624 }
6625
6626 rgw_rados_ref ref;
6627 int r = get_obj_head_ref(bucket_info, obj, &ref);
6628 if (r < 0) {
6629 return r;
6630 }
6631
6632 RGWObjState *astate = NULL;
6633 RGWObjectCtx rctx(this);
6634 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6635 if (r < 0)
6636 return r;
6637
6638 if (astate->has_manifest) {
6639 RGWObjManifest::obj_iterator miter;
6640 RGWObjManifest& manifest = astate->manifest;
6641 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6642 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6643 rgw_obj loc;
6644 string oid;
6645 string locator;
6646
6647 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6648
6649 if (loc.key.ns.empty()) {
6650 /* continue, we're only interested in tail objects */
6651 continue;
6652 }
6653
6654 get_obj_bucket_and_oid_loc(loc, oid, locator);
6655 ref.ioctx.locator_set_key(locator);
6656
6657 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6658
6659 r = ref.ioctx.stat(oid, NULL, NULL);
6660 if (r != -ENOENT) {
6661 continue;
6662 }
6663
6664 string bad_loc;
6665 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6666
6667 /* create a new ioctx with the bad locator */
6668 librados::IoCtx src_ioctx;
6669 src_ioctx.dup(ref.ioctx);
6670 src_ioctx.locator_set_key(bad_loc);
6671
6672 r = src_ioctx.stat(oid, NULL, NULL);
6673 if (r != 0) {
6674 /* cannot find a broken part */
6675 continue;
6676 }
6677 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6678 if (need_fix) {
6679 *need_fix = true;
6680 }
6681 if (fix) {
6682 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6683 if (r < 0) {
6684 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6685 }
6686 }
6687 }
6688 }
6689
6690 return 0;
6691 }
6692
6693 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6694 {
6695 bucket = _bucket;
6696
6697 RGWObjectCtx obj_ctx(store);
6698
6699 RGWBucketInfo bucket_info;
6700 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6701 if (ret < 0) {
6702 return ret;
6703 }
6704
6705 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6706 if (ret < 0) {
6707 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6708 return ret;
6709 }
6710 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6711
6712 return 0;
6713 }
6714
6715 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6716 {
6717 bucket = _bucket;
6718 shard_id = sid;
6719
6720 RGWObjectCtx obj_ctx(store);
6721
6722 RGWBucketInfo bucket_info;
6723 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6724 if (ret < 0) {
6725 return ret;
6726 }
6727
6728 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6729 if (ret < 0) {
6730 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6731 return ret;
6732 }
6733 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6734
6735 return 0;
6736 }
6737
6738 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
6739 {
6740 bucket = bucket_info.bucket;
6741 shard_id = sid;
6742
6743 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6744 if (ret < 0) {
6745 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6746 return ret;
6747 }
6748 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6749
6750 return 0;
6751 }
6752
6753
6754 /* Execute @handler on last item in bucket listing for bucket specified
6755 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6756 * to objects matching these criterias. */
6757 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6758 const std::string& obj_prefix,
6759 const std::string& obj_delim,
6760 std::function<int(const rgw_bucket_dir_entry&)> handler)
6761 {
6762 RGWRados::Bucket target(this, bucket_info);
6763 RGWRados::Bucket::List list_op(&target);
6764
6765 list_op.params.prefix = obj_prefix;
6766 list_op.params.delim = obj_delim;
6767
6768 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6769 << ", obj_prefix=" << obj_prefix
6770 << ", obj_delim=" << obj_delim
6771 << dendl;
6772
6773 bool is_truncated = false;
6774
6775 boost::optional<rgw_bucket_dir_entry> last_entry;
6776 /* We need to rewind to the last object in a listing. */
6777 do {
6778 /* List bucket entries in chunks. */
6779 static constexpr int MAX_LIST_OBJS = 100;
6780 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6781
6782 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6783 &is_truncated);
6784 if (ret < 0) {
6785 return ret;
6786 } else if (!entries.empty()) {
6787 last_entry = entries.back();
6788 }
6789 } while (is_truncated);
6790
6791 if (last_entry) {
6792 return handler(*last_entry);
6793 }
6794
6795 /* Empty listing - no items we can run handler on. */
6796 return 0;
6797 }
6798
6799
6800 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6801 const rgw_user& user,
6802 RGWBucketInfo& bucket_info,
6803 rgw_obj& obj)
6804 {
6805 if (! swift_versioning_enabled(bucket_info)) {
6806 return 0;
6807 }
6808
6809 obj_ctx.obj.set_atomic(obj);
6810
6811 RGWObjState * state = nullptr;
6812 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6813 if (r < 0) {
6814 return r;
6815 }
6816
6817 if (!state->exists) {
6818 return 0;
6819 }
6820
6821 string client_id;
6822 string op_id;
6823
6824 const string& src_name = obj.get_oid();
6825 char buf[src_name.size() + 32];
6826 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6827 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6828 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6829
6830 RGWBucketInfo dest_bucket_info;
6831
6832 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6833 if (r < 0) {
6834 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6835 if (r == -ENOENT) {
6836 return -ERR_PRECONDITION_FAILED;
6837 }
6838 return r;
6839 }
6840
6841 if (dest_bucket_info.owner != bucket_info.owner) {
6842 return -ERR_PRECONDITION_FAILED;
6843 }
6844
6845 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6846 obj_ctx.obj.set_atomic(dest_obj);
6847
6848 string no_zone;
6849
6850 r = copy_obj(obj_ctx,
6851 user,
6852 client_id,
6853 op_id,
6854 NULL, /* req_info *info */
6855 no_zone,
6856 dest_obj,
6857 obj,
6858 dest_bucket_info,
6859 bucket_info,
6860 NULL, /* time_t *src_mtime */
6861 NULL, /* time_t *mtime */
6862 NULL, /* const time_t *mod_ptr */
6863 NULL, /* const time_t *unmod_ptr */
6864 false, /* bool high_precision_time */
6865 NULL, /* const char *if_match */
6866 NULL, /* const char *if_nomatch */
6867 RGWRados::ATTRSMOD_NONE,
6868 true, /* bool copy_if_newer */
6869 state->attrset,
6870 RGW_OBJ_CATEGORY_MAIN,
6871 0, /* uint64_t olh_epoch */
6872 real_time(), /* time_t delete_at */
6873 NULL, /* string *version_id */
6874 NULL, /* string *ptag */
6875 NULL, /* string *petag */
6876 NULL, /* void (*progress_cb)(off_t, void *) */
6877 NULL); /* void *progress_data */
6878 if (r == -ECANCELED || r == -ENOENT) {
6879 /* Has already been overwritten, meaning another rgw process already
6880 * copied it out */
6881 return 0;
6882 }
6883
6884 return r;
6885 }
6886
6887 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6888 const rgw_user& user,
6889 RGWBucketInfo& bucket_info,
6890 rgw_obj& obj,
6891 bool& restored) /* out */
6892 {
6893 if (! swift_versioning_enabled(bucket_info)) {
6894 return 0;
6895 }
6896
6897 /* Bucket info of the bucket that stores previous versions of our object. */
6898 RGWBucketInfo archive_binfo;
6899
6900 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6901 bucket_info.swift_ver_location, archive_binfo,
6902 nullptr, nullptr);
6903 if (ret < 0) {
6904 return ret;
6905 }
6906
6907 /* Abort the operation if the bucket storing our archive belongs to someone
6908 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6909 * into consideration. For we can live with that.
6910 *
6911 * TODO: delegate this check to un upper layer and compare with ACLs. */
6912 if (bucket_info.owner != archive_binfo.owner) {
6913 return -EPERM;
6914 }
6915
6916 /* This code will be executed on latest version of the object. */
6917 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6918 std::string no_client_id;
6919 std::string no_op_id;
6920 std::string no_zone;
6921
6922 /* We don't support object versioning of Swift API on those buckets that
6923 * are already versioned using the S3 mechanism. This affects also bucket
6924 * storing archived objects. Otherwise the delete operation would create
6925 * a deletion marker. */
6926 if (archive_binfo.versioned()) {
6927 restored = false;
6928 return -ERR_PRECONDITION_FAILED;
6929 }
6930
6931 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6932 * irrelevant and may be safely skipped. */
6933 std::map<std::string, ceph::bufferlist> no_attrs;
6934
6935 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6936 obj_ctx.obj.set_atomic(archive_obj);
6937 obj_ctx.obj.set_atomic(obj);
6938
6939 int ret = copy_obj(obj_ctx,
6940 user,
6941 no_client_id,
6942 no_op_id,
6943 nullptr, /* req_info *info */
6944 no_zone,
6945 obj, /* dest obj */
6946 archive_obj, /* src obj */
6947 bucket_info, /* dest bucket info */
6948 archive_binfo, /* src bucket info */
6949 nullptr, /* time_t *src_mtime */
6950 nullptr, /* time_t *mtime */
6951 nullptr, /* const time_t *mod_ptr */
6952 nullptr, /* const time_t *unmod_ptr */
6953 false, /* bool high_precision_time */
6954 nullptr, /* const char *if_match */
6955 nullptr, /* const char *if_nomatch */
6956 RGWRados::ATTRSMOD_NONE,
6957 true, /* bool copy_if_newer */
6958 no_attrs,
6959 RGW_OBJ_CATEGORY_MAIN,
6960 0, /* uint64_t olh_epoch */
6961 real_time(), /* time_t delete_at */
6962 nullptr, /* string *version_id */
6963 nullptr, /* string *ptag */
6964 nullptr, /* string *petag */
6965 nullptr, /* void (*progress_cb)(off_t, void *) */
6966 nullptr); /* void *progress_data */
6967 if (ret == -ECANCELED || ret == -ENOENT) {
6968 /* Has already been overwritten, meaning another rgw process already
6969 * copied it out */
6970 return 0;
6971 } else if (ret < 0) {
6972 return ret;
6973 } else {
6974 restored = true;
6975 }
6976
6977 /* Need to remove the archived copy. */
6978 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6979 archive_binfo.versioning_status());
6980
6981 return ret;
6982 };
6983
6984 const std::string& obj_name = obj.get_oid();
6985 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6986 % obj_name);
6987
6988 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6989 handler);
6990 }
6991
6992 /**
6993 * Write/overwrite an object to the bucket storage.
6994 * bucket: the bucket to store the object in
6995 * obj: the object name/key
6996 * data: the object contents/value
6997 * size: the amount of data to write (data must be this long)
6998 * accounted_size: original size of data before compression, encryption
6999 * mtime: if non-NULL, writes the given mtime to the bucket storage
7000 * attrs: all the given attrs are written to bucket storage for the given object
7001 * exclusive: create object exclusively
7002 * Returns: 0 on success, -ERR# otherwise.
7003 */
7004 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
7005 map<string, bufferlist>& attrs,
7006 bool assume_noent, bool modify_tail,
7007 void *_index_op)
7008 {
7009 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7010 RGWRados *store = target->get_store();
7011
7012 ObjectWriteOperation op;
7013
7014 RGWObjState *state;
7015 int r = target->get_state(&state, false, assume_noent);
7016 if (r < 0)
7017 return r;
7018
7019 rgw_obj& obj = target->get_obj();
7020
7021 if (obj.get_oid().empty()) {
7022 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
7023 return -EIO;
7024 }
7025
7026 rgw_rados_ref ref;
7027 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
7028 if (r < 0)
7029 return r;
7030
7031 bool is_olh = state->is_olh;
7032
7033 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
7034
7035 const string *ptag = meta.ptag;
7036 if (!ptag && !index_op->get_optag()->empty()) {
7037 ptag = index_op->get_optag();
7038 }
7039 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
7040 if (r < 0)
7041 return r;
7042
7043 if (real_clock::is_zero(meta.set_mtime)) {
7044 meta.set_mtime = real_clock::now();
7045 }
7046
7047 if (state->is_olh) {
7048 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
7049 }
7050
7051 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
7052 op.mtime2(&mtime_ts);
7053
7054 if (meta.data) {
7055 /* if we want to overwrite the data, we also want to overwrite the
7056 xattrs, so just remove the object */
7057 op.write_full(*meta.data);
7058 }
7059
7060 string etag;
7061 string content_type;
7062 bufferlist acl_bl;
7063
7064 map<string, bufferlist>::iterator iter;
7065 if (meta.rmattrs) {
7066 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
7067 const string& name = iter->first;
7068 op.rmxattr(name.c_str());
7069 }
7070 }
7071
7072 if (meta.manifest) {
7073 /* remove existing manifest attr */
7074 iter = attrs.find(RGW_ATTR_MANIFEST);
7075 if (iter != attrs.end())
7076 attrs.erase(iter);
7077
7078 bufferlist bl;
7079 ::encode(*meta.manifest, bl);
7080 op.setxattr(RGW_ATTR_MANIFEST, bl);
7081 }
7082
7083 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
7084 const string& name = iter->first;
7085 bufferlist& bl = iter->second;
7086
7087 if (!bl.length())
7088 continue;
7089
7090 op.setxattr(name.c_str(), bl);
7091
7092 if (name.compare(RGW_ATTR_ETAG) == 0) {
7093 etag = bl.c_str();
7094 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
7095 content_type = bl.c_str();
7096 } else if (name.compare(RGW_ATTR_ACL) == 0) {
7097 acl_bl = bl;
7098 }
7099 }
7100 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
7101 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
7102 }
7103
7104 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
7105 bufferlist bl;
7106 ::encode(store->get_zone_short_id(), bl);
7107 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
7108 }
7109
7110 if (!op.size())
7111 return 0;
7112
7113 uint64_t epoch;
7114 int64_t poolid;
7115 bool orig_exists;
7116 uint64_t orig_size;
7117
7118 if (!reset_obj) { //Multipart upload, it has immutable head.
7119 orig_exists = false;
7120 orig_size = 0;
7121 } else {
7122 orig_exists = state->exists;
7123 orig_size = state->accounted_size;
7124 }
7125
7126 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
7127
7128 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
7129
7130 if (versioned_op) {
7131 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
7132 }
7133
7134 if (!index_op->is_prepared()) {
7135 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
7136 if (r < 0)
7137 return r;
7138 }
7139
7140 r = ref.ioctx.operate(ref.oid, &op);
7141 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
7142 or -ENOENT if was removed, or -EEXIST if it did not exist
7143 before and now it does */
7144 if (r == -EEXIST && assume_noent) {
7145 target->invalidate_state();
7146 return r;
7147 }
7148 goto done_cancel;
7149 }
7150
7151 epoch = ref.ioctx.get_last_version();
7152 poolid = ref.ioctx.get_id();
7153
7154 r = target->complete_atomic_modification();
7155 if (r < 0) {
7156 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7157 }
7158
7159 r = index_op->complete(poolid, epoch, size, accounted_size,
7160 meta.set_mtime, etag, content_type, &acl_bl,
7161 meta.category, meta.remove_objs, meta.user_data);
7162 if (r < 0)
7163 goto done_cancel;
7164
7165 if (meta.mtime) {
7166 *meta.mtime = meta.set_mtime;
7167 }
7168
7169 /* note that index_op was using state so we couldn't invalidate it earlier */
7170 target->invalidate_state();
7171 state = NULL;
7172
7173 if (versioned_op) {
7174 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
7175 if (r < 0) {
7176 return r;
7177 }
7178 }
7179
7180 if (!real_clock::is_zero(meta.delete_at)) {
7181 rgw_obj_index_key obj_key;
7182 obj.key.get_index_key(&obj_key);
7183
7184 r = store->objexp_hint_add(meta.delete_at,
7185 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7186 if (r < 0) {
7187 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7188 /* ignoring error, nothing we can do at this point */
7189 }
7190 }
7191 meta.canceled = false;
7192
7193 /* update quota cache */
7194 if (meta.completeMultipart){
7195 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7196 0, orig_size);
7197 }
7198 else {
7199 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7200 accounted_size, orig_size);
7201 }
7202 return 0;
7203
7204 done_cancel:
7205 int ret = index_op->cancel();
7206 if (ret < 0) {
7207 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7208 }
7209
7210 meta.canceled = true;
7211
7212 /* we lost in a race. There are a few options:
7213 * - existing object was rewritten (ECANCELED)
7214 * - non existing object was created (EEXIST)
7215 * - object was removed (ENOENT)
7216 * should treat it as a success
7217 */
7218 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7219 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7220 r = 0;
7221 }
7222 } else {
7223 if (meta.if_match != NULL) {
7224 // only overwrite existing object
7225 if (strcmp(meta.if_match, "*") == 0) {
7226 if (r == -ENOENT) {
7227 r = -ERR_PRECONDITION_FAILED;
7228 } else if (r == -ECANCELED) {
7229 r = 0;
7230 }
7231 }
7232 }
7233
7234 if (meta.if_nomatch != NULL) {
7235 // only create a new object
7236 if (strcmp(meta.if_nomatch, "*") == 0) {
7237 if (r == -EEXIST) {
7238 r = -ERR_PRECONDITION_FAILED;
7239 } else if (r == -ENOENT) {
7240 r = 0;
7241 }
7242 }
7243 }
7244 }
7245
7246 return r;
7247 }
7248
7249 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7250 map<string, bufferlist>& attrs)
7251 {
7252 RGWBucketInfo& bucket_info = target->get_bucket_info();
7253
7254 RGWRados::Bucket bop(target->get_store(), bucket_info);
7255 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
7256 index_op.set_zones_trace(meta.zones_trace);
7257
7258 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7259 int r;
7260 if (assume_noent) {
7261 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7262 if (r == -EEXIST) {
7263 assume_noent = false;
7264 }
7265 }
7266 if (!assume_noent) {
7267 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7268 }
7269 return r;
7270 }
7271
7272 /** Write/overwrite a system object. */
7273 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7274 map<std::string, bufferlist>& attrs, int flags,
7275 bufferlist& data,
7276 RGWObjVersionTracker *objv_tracker,
7277 real_time set_mtime /* 0 for don't set */)
7278 {
7279 rgw_rados_ref ref;
7280 int r = get_system_obj_ref(obj, &ref);
7281 if (r < 0)
7282 return r;
7283
7284 ObjectWriteOperation op;
7285
7286 if (flags & PUT_OBJ_EXCL) {
7287 if (!(flags & PUT_OBJ_CREATE))
7288 return -EINVAL;
7289 op.create(true); // exclusive create
7290 } else {
7291 op.remove();
7292 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7293 op.create(false);
7294 }
7295
7296 if (objv_tracker) {
7297 objv_tracker->prepare_op_for_write(&op);
7298 }
7299
7300 if (real_clock::is_zero(set_mtime)) {
7301 set_mtime = real_clock::now();
7302 }
7303
7304 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7305 op.mtime2(&mtime_ts);
7306 op.write_full(data);
7307
7308 bufferlist acl_bl;
7309
7310 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7311 const string& name = iter->first;
7312 bufferlist& bl = iter->second;
7313
7314 if (!bl.length())
7315 continue;
7316
7317 op.setxattr(name.c_str(), bl);
7318 }
7319
7320 r = ref.ioctx.operate(ref.oid, &op);
7321 if (r < 0) {
7322 return r;
7323 }
7324
7325 if (objv_tracker) {
7326 objv_tracker->apply_write();
7327 }
7328
7329 if (mtime) {
7330 *mtime = set_mtime;
7331 }
7332
7333 return 0;
7334 }
7335
7336 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7337 off_t ofs, bool exclusive,
7338 RGWObjVersionTracker *objv_tracker)
7339 {
7340 rgw_rados_ref ref;
7341 int r = get_system_obj_ref(obj, &ref);
7342 if (r < 0) {
7343 return r;
7344 }
7345
7346 ObjectWriteOperation op;
7347
7348 if (exclusive)
7349 op.create(true);
7350
7351 if (objv_tracker) {
7352 objv_tracker->prepare_op_for_write(&op);
7353 }
7354 if (ofs == -1) {
7355 op.write_full(bl);
7356 } else {
7357 op.write(ofs, bl);
7358 }
7359 r = ref.ioctx.operate(ref.oid, &op);
7360 if (r < 0)
7361 return r;
7362
7363 if (objv_tracker) {
7364 objv_tracker->apply_write();
7365 }
7366 return 0;
7367 }
7368
7369 /**
7370 * Write/overwrite an object to the bucket storage.
7371 * bucket: the bucket to store the object in
7372 * obj: the object name/key
7373 * data: the object contents/value
7374 * offset: the offet to write to in the object
7375 * If this is -1, we will overwrite the whole object.
7376 * size: the amount of data to write (data must be this long)
7377 * attrs: all the given attrs are written to bucket storage for the given object
7378 * Returns: 0 on success, -ERR# otherwise.
7379 */
7380
7381 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7382 off_t ofs, bool exclusive,
7383 void **handle)
7384 {
7385 rgw_rados_ref ref;
7386 int r = get_raw_obj_ref(obj, &ref);
7387 if (r < 0) {
7388 return r;
7389 }
7390
7391 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7392 *handle = c;
7393
7394 ObjectWriteOperation op;
7395
7396 if (exclusive)
7397 op.create(true);
7398
7399 if (ofs == -1) {
7400 op.write_full(bl);
7401 } else {
7402 op.write(ofs, bl);
7403 }
7404 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7405 if (r < 0)
7406 return r;
7407
7408 return 0;
7409 }
7410
7411 int RGWRados::aio_wait(void *handle)
7412 {
7413 AioCompletion *c = (AioCompletion *)handle;
7414 c->wait_for_safe();
7415 int ret = c->get_return_value();
7416 c->release();
7417 return ret;
7418 }
7419
7420 bool RGWRados::aio_completed(void *handle)
7421 {
7422 AioCompletion *c = (AioCompletion *)handle;
7423 return c->is_safe();
7424 }
7425
7426 // PutObj filter that buffers data so we don't try to compress tiny blocks.
7427 // libcurl reads in 16k at a time, and we need at least 64k to get a good
7428 // compression ratio
7429 class RGWPutObj_Buffer : public RGWPutObj_Filter {
7430 const unsigned buffer_size;
7431 bufferlist buffer;
7432 public:
7433 RGWPutObj_Buffer(RGWPutObjDataProcessor* next, unsigned buffer_size)
7434 : RGWPutObj_Filter(next), buffer_size(buffer_size) {
7435 assert(ISP2(buffer_size)); // must be power of 2
7436 }
7437
7438 int handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj,
7439 bool *again) override {
7440 if (*again || !bl.length()) {
7441 // flush buffered data
7442 return RGWPutObj_Filter::handle_data(buffer, ofs, phandle, pobj, again);
7443 }
7444 // transform offset to the beginning of the buffer
7445 ofs = ofs - buffer.length();
7446 buffer.claim_append(bl);
7447 if (buffer.length() < buffer_size) {
7448 *again = false; // don't come back until there's more data
7449 return 0;
7450 }
7451 const auto count = P2ALIGN(buffer.length(), buffer_size);
7452 buffer.splice(0, count, &bl);
7453 return RGWPutObj_Filter::handle_data(bl, ofs, phandle, pobj, again);
7454 }
7455 };
7456
7457 class RGWRadosPutObj : public RGWGetDataCB
7458 {
7459 CephContext* cct;
7460 rgw_obj obj;
7461 RGWPutObjDataProcessor *filter;
7462 boost::optional<RGWPutObj_Compress>& compressor;
7463 boost::optional<RGWPutObj_Buffer> buffering;
7464 CompressorRef& plugin;
7465 RGWPutObjProcessor_Atomic *processor;
7466 RGWOpStateSingleOp *opstate;
7467 void (*progress_cb)(off_t, void *);
7468 void *progress_data;
7469 bufferlist extra_data_bl;
7470 uint64_t extra_data_left;
7471 uint64_t data_len;
7472 map<string, bufferlist> src_attrs;
7473 public:
7474 RGWRadosPutObj(CephContext* cct,
7475 CompressorRef& plugin,
7476 boost::optional<RGWPutObj_Compress>& compressor,
7477 RGWPutObjProcessor_Atomic *p,
7478 RGWOpStateSingleOp *_ops,
7479 void (*_progress_cb)(off_t, void *),
7480 void *_progress_data) :
7481 cct(cct),
7482 filter(p),
7483 compressor(compressor),
7484 plugin(plugin),
7485 processor(p),
7486 opstate(_ops),
7487 progress_cb(_progress_cb),
7488 progress_data(_progress_data),
7489 extra_data_left(0),
7490 data_len(0) {}
7491
7492 int process_attrs(void) {
7493 if (extra_data_bl.length()) {
7494 JSONParser jp;
7495 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7496 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7497 return -EIO;
7498 }
7499
7500 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7501
7502 src_attrs.erase(RGW_ATTR_COMPRESSION);
7503 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7504 }
7505
7506 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7507 //do not compress if object is encrypted
7508 compressor = boost::in_place(cct, plugin, filter);
7509 constexpr unsigned buffer_size = 512 * 1024;
7510 buffering = boost::in_place(&*compressor, buffer_size);
7511 filter = &*buffering;
7512 }
7513 return 0;
7514 }
7515
7516 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7517 if (progress_cb) {
7518 progress_cb(ofs, progress_data);
7519 }
7520 if (extra_data_left) {
7521 size_t extra_len = bl.length();
7522 if (extra_len > extra_data_left)
7523 extra_len = extra_data_left;
7524
7525 bufferlist extra;
7526 bl.splice(0, extra_len, &extra);
7527 extra_data_bl.append(extra);
7528
7529 extra_data_left -= extra_len;
7530 if (extra_data_left == 0) {
7531 int res = process_attrs();
7532 if (res < 0)
7533 return res;
7534 }
7535 if (bl.length() == 0) {
7536 return 0;
7537 }
7538 ofs += extra_len;
7539 }
7540 // adjust ofs based on extra_data_len, so the result is a logical offset
7541 // into the object data
7542 assert(uint64_t(ofs) >= extra_data_len);
7543 ofs -= extra_data_len;
7544
7545 data_len += bl.length();
7546 bool again = false;
7547
7548 bool need_opstate = true;
7549
7550 do {
7551 void *handle = NULL;
7552 rgw_raw_obj obj;
7553 uint64_t size = bl.length();
7554 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7555 if (ret < 0)
7556 return ret;
7557
7558 if (need_opstate && opstate) {
7559 /* need to update opstate repository with new state. This is ratelimited, so we're not
7560 * really doing it every time
7561 */
7562 ret = opstate->renew_state();
7563 if (ret < 0) {
7564 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7565 int r = filter->throttle_data(handle, obj, size, false);
7566 if (r < 0) {
7567 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7568 }
7569 /* could not renew state! might have been marked as cancelled */
7570 return ret;
7571 }
7572 need_opstate = false;
7573 }
7574
7575 ret = filter->throttle_data(handle, obj, size, false);
7576 if (ret < 0)
7577 return ret;
7578 } while (again);
7579
7580 return 0;
7581 }
7582
7583 int flush() {
7584 bufferlist bl;
7585 return put_data_and_throttle(filter, bl, 0, false);
7586 }
7587
7588 bufferlist& get_extra_data() { return extra_data_bl; }
7589
7590 map<string, bufferlist>& get_attrs() { return src_attrs; }
7591
7592 void set_extra_data_len(uint64_t len) override {
7593 extra_data_left = len;
7594 RGWGetDataCB::set_extra_data_len(len);
7595 }
7596
7597 uint64_t get_data_len() {
7598 return data_len;
7599 }
7600
7601 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7602 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7603 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7604 }
7605
7606 bool is_canceled() {
7607 return processor->is_canceled();
7608 }
7609 };
7610
7611 /*
7612 * prepare attrset depending on attrs_mod.
7613 */
7614 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7615 map<string, bufferlist>& attrs,
7616 RGWRados::AttrsMod attrs_mod)
7617 {
7618 switch (attrs_mod) {
7619 case RGWRados::ATTRSMOD_NONE:
7620 attrs = src_attrs;
7621 break;
7622 case RGWRados::ATTRSMOD_REPLACE:
7623 if (!attrs[RGW_ATTR_ETAG].length()) {
7624 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7625 }
7626 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7627 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7628 if (ttiter != src_attrs.end()) {
7629 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7630 }
7631 }
7632 break;
7633 case RGWRados::ATTRSMOD_MERGE:
7634 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7635 if (attrs.find(it->first) == attrs.end()) {
7636 attrs[it->first] = it->second;
7637 }
7638 }
7639 break;
7640 }
7641 }
7642
7643 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7644 {
7645 map<string, bufferlist> attrset;
7646
7647 real_time mtime;
7648 uint64_t obj_size;
7649 RGWObjectCtx rctx(this);
7650
7651 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7652 RGWRados::Object::Read read_op(&op_target);
7653
7654 read_op.params.attrs = &attrset;
7655 read_op.params.lastmod = &mtime;
7656 read_op.params.obj_size = &obj_size;
7657
7658 int ret = read_op.prepare();
7659 if (ret < 0)
7660 return ret;
7661
7662 attrset.erase(RGW_ATTR_ID_TAG);
7663 attrset.erase(RGW_ATTR_TAIL_TAG);
7664
7665 uint64_t max_chunk_size;
7666
7667 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7668 if (ret < 0) {
7669 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7670 return ret;
7671 }
7672
7673 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj,
7674 max_chunk_size, NULL, mtime, attrset,
7675 RGW_OBJ_CATEGORY_MAIN, 0, real_time(),
7676 (obj.key.instance.empty() ? NULL : &(obj.key.instance)),
7677 NULL, NULL);
7678 }
7679
7680 struct obj_time_weight {
7681 real_time mtime;
7682 uint32_t zone_short_id;
7683 uint64_t pg_ver;
7684 bool high_precision;
7685
7686 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7687
7688 bool compare_low_precision(const obj_time_weight& rhs) {
7689 struct timespec l = ceph::real_clock::to_timespec(mtime);
7690 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7691 l.tv_nsec = 0;
7692 r.tv_nsec = 0;
7693 if (l > r) {
7694 return false;
7695 }
7696 if (l < r) {
7697 return true;
7698 }
7699 if (zone_short_id != rhs.zone_short_id) {
7700 return (zone_short_id < rhs.zone_short_id);
7701 }
7702 return (pg_ver < rhs.pg_ver);
7703
7704 }
7705
7706 bool operator<(const obj_time_weight& rhs) {
7707 if (!high_precision || !rhs.high_precision) {
7708 return compare_low_precision(rhs);
7709 }
7710 if (mtime > rhs.mtime) {
7711 return false;
7712 }
7713 if (mtime < rhs.mtime) {
7714 return true;
7715 }
7716 if (zone_short_id != rhs.zone_short_id) {
7717 return (zone_short_id < rhs.zone_short_id);
7718 }
7719 return (pg_ver < rhs.pg_ver);
7720 }
7721
7722 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7723 mtime = _mtime;
7724 zone_short_id = _short_id;
7725 pg_ver = _pg_ver;
7726 }
7727
7728 void init(RGWObjState *state) {
7729 mtime = state->mtime;
7730 zone_short_id = state->zone_short_id;
7731 pg_ver = state->pg_ver;
7732 }
7733 };
7734
7735 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7736 out << o.mtime;
7737
7738 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7739 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7740 }
7741
7742 return out;
7743 }
7744
7745 class RGWGetExtraDataCB : public RGWGetDataCB {
7746 bufferlist extra_data;
7747 public:
7748 RGWGetExtraDataCB() {}
7749 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7750 if (extra_data.length() < extra_data_len) {
7751 off_t max = extra_data_len - extra_data.length();
7752 if (max > bl_len) {
7753 max = bl_len;
7754 }
7755 bl.splice(0, max, &extra_data);
7756 }
7757 return bl_len;
7758 }
7759
7760 bufferlist& get_extra_data() {
7761 return extra_data;
7762 }
7763 };
7764
7765 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7766 const rgw_user& user_id,
7767 const string& client_id,
7768 req_info *info,
7769 const string& source_zone,
7770 rgw_obj& src_obj,
7771 RGWBucketInfo& src_bucket_info,
7772 real_time *src_mtime,
7773 uint64_t *psize,
7774 const real_time *mod_ptr,
7775 const real_time *unmod_ptr,
7776 bool high_precision_time,
7777 const char *if_match,
7778 const char *if_nomatch,
7779 map<string, bufferlist> *pattrs,
7780 string *version_id,
7781 string *ptag,
7782 string *petag)
7783 {
7784 /* source is in a different zonegroup, copy from there */
7785
7786 RGWRESTStreamRWRequest *in_stream_req;
7787 string tag;
7788 map<string, bufferlist> src_attrs;
7789 append_rand_alpha(cct, tag, tag, 32);
7790 obj_time_weight set_mtime_weight;
7791 set_mtime_weight.high_precision = high_precision_time;
7792
7793 RGWRESTConn *conn;
7794 if (source_zone.empty()) {
7795 if (src_bucket_info.zonegroup.empty()) {
7796 /* source is in the master zonegroup */
7797 conn = rest_master_conn;
7798 } else {
7799 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7800 if (iter == zonegroup_conn_map.end()) {
7801 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7802 return -ENOENT;
7803 }
7804 conn = iter->second;
7805 }
7806 } else {
7807 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7808 if (iter == zone_conn_map.end()) {
7809 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7810 return -ENOENT;
7811 }
7812 conn = iter->second;
7813 }
7814
7815 RGWGetExtraDataCB cb;
7816 string etag;
7817 map<string, string> req_headers;
7818 real_time set_mtime;
7819
7820 const real_time *pmod = mod_ptr;
7821
7822 obj_time_weight dest_mtime_weight;
7823
7824 constexpr bool prepend_meta = true;
7825 constexpr bool get_op = true;
7826 constexpr bool rgwx_stat = true;
7827 constexpr bool sync_manifest = true;
7828 constexpr bool skip_decrypt = true;
7829 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7830 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7831 prepend_meta, get_op, rgwx_stat,
7832 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7833 if (ret < 0) {
7834 return ret;
7835 }
7836
7837 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7838 if (ret < 0) {
7839 return ret;
7840 }
7841
7842 bufferlist& extra_data_bl = cb.get_extra_data();
7843 if (extra_data_bl.length()) {
7844 JSONParser jp;
7845 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7846 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7847 return -EIO;
7848 }
7849
7850 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7851
7852 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7853 }
7854
7855 if (src_mtime) {
7856 *src_mtime = set_mtime;
7857 }
7858
7859 if (petag) {
7860 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7861 if (iter != src_attrs.end()) {
7862 bufferlist& etagbl = iter->second;
7863 *petag = etagbl.to_str();
7864 }
7865 }
7866
7867 if (pattrs) {
7868 *pattrs = src_attrs;
7869 }
7870
7871 return 0;
7872 }
7873
7874 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7875 const rgw_user& user_id,
7876 const string& client_id,
7877 const string& op_id,
7878 bool record_op_state,
7879 req_info *info,
7880 const string& source_zone,
7881 rgw_obj& dest_obj,
7882 rgw_obj& src_obj,
7883 RGWBucketInfo& dest_bucket_info,
7884 RGWBucketInfo& src_bucket_info,
7885 real_time *src_mtime,
7886 real_time *mtime,
7887 const real_time *mod_ptr,
7888 const real_time *unmod_ptr,
7889 bool high_precision_time,
7890 const char *if_match,
7891 const char *if_nomatch,
7892 AttrsMod attrs_mod,
7893 bool copy_if_newer,
7894 map<string, bufferlist>& attrs,
7895 RGWObjCategory category,
7896 uint64_t olh_epoch,
7897 real_time delete_at,
7898 string *version_id,
7899 string *ptag,
7900 ceph::buffer::list *petag,
7901 void (*progress_cb)(off_t, void *),
7902 void *progress_data,
7903 rgw_zone_set *zones_trace)
7904 {
7905 /* source is in a different zonegroup, copy from there */
7906
7907 RGWRESTStreamRWRequest *in_stream_req;
7908 string tag;
7909 int i;
7910 append_rand_alpha(cct, tag, tag, 32);
7911 obj_time_weight set_mtime_weight;
7912 set_mtime_weight.high_precision = high_precision_time;
7913
7914 RGWPutObjProcessor_Atomic processor(obj_ctx,
7915 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7916 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7917 if (version_id && *version_id != "null") {
7918 processor.set_version_id(*version_id);
7919 }
7920 processor.set_olh_epoch(olh_epoch);
7921 int ret = processor.prepare(this, NULL);
7922 if (ret < 0) {
7923 return ret;
7924 }
7925
7926 RGWRESTConn *conn;
7927 if (source_zone.empty()) {
7928 if (dest_bucket_info.zonegroup.empty()) {
7929 /* source is in the master zonegroup */
7930 conn = rest_master_conn;
7931 } else {
7932 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7933 if (iter == zonegroup_conn_map.end()) {
7934 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7935 return -ENOENT;
7936 }
7937 conn = iter->second;
7938 }
7939 } else {
7940 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7941 if (iter == zone_conn_map.end()) {
7942 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7943 return -ENOENT;
7944 }
7945 conn = iter->second;
7946 }
7947
7948 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7949
7950 RGWOpStateSingleOp *opstate = NULL;
7951
7952 if (record_op_state) {
7953 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7954
7955 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7956 if (ret < 0) {
7957 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7958 delete opstate;
7959 return ret;
7960 }
7961 }
7962
7963 boost::optional<RGWPutObj_Compress> compressor;
7964 CompressorRef plugin;
7965
7966 const auto& compression_type = zone_params.get_compression_type(
7967 dest_bucket_info.placement_rule);
7968 if (compression_type != "none") {
7969 plugin = Compressor::create(cct, compression_type);
7970 if (!plugin) {
7971 ldout(cct, 1) << "Cannot load plugin for compression type "
7972 << compression_type << dendl;
7973 }
7974 }
7975
7976 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7977
7978 string etag;
7979 map<string, string> req_headers;
7980 real_time set_mtime;
7981
7982 RGWObjState *dest_state = NULL;
7983
7984 const real_time *pmod = mod_ptr;
7985
7986 obj_time_weight dest_mtime_weight;
7987
7988 if (copy_if_newer) {
7989 /* need to get mtime for destination */
7990 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7991 if (ret < 0)
7992 goto set_err_state;
7993
7994 if (!real_clock::is_zero(dest_state->mtime)) {
7995 dest_mtime_weight.init(dest_state);
7996 pmod = &dest_mtime_weight.mtime;
7997 }
7998 }
7999
8000 static constexpr bool prepend_meta = true;
8001 static constexpr bool get_op = true;
8002 static constexpr bool rgwx_stat = false;
8003 static constexpr bool sync_manifest = true;
8004 static constexpr bool skip_decrypt = true;
8005 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
8006 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
8007 prepend_meta, get_op, rgwx_stat,
8008 sync_manifest, skip_decrypt, &cb, &in_stream_req);
8009 if (ret < 0) {
8010 goto set_err_state;
8011 }
8012
8013 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
8014 if (ret < 0) {
8015 goto set_err_state;
8016 }
8017 ret = cb.flush();
8018 if (ret < 0) {
8019 goto set_err_state;
8020 }
8021 if (compressor && compressor->is_compressed()) {
8022 bufferlist tmp;
8023 RGWCompressionInfo cs_info;
8024 cs_info.compression_type = plugin->get_type_name();
8025 cs_info.orig_size = cb.get_data_len();
8026 cs_info.blocks = move(compressor->get_compression_blocks());
8027 ::encode(cs_info, tmp);
8028 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
8029 }
8030
8031 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
8032 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
8033 } else {
8034 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
8035 if (iter != cb.get_attrs().end()) {
8036 try {
8037 ::decode(delete_at, iter->second);
8038 } catch (buffer::error& err) {
8039 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
8040 }
8041 }
8042 }
8043
8044 if (src_mtime) {
8045 *src_mtime = set_mtime;
8046 }
8047
8048 if (petag) {
8049 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
8050 if (iter != cb.get_attrs().end()) {
8051 *petag = iter->second;
8052 }
8053 }
8054
8055 if (source_zone.empty()) {
8056 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
8057 } else {
8058 attrs = cb.get_attrs();
8059 }
8060
8061 if (copy_if_newer) {
8062 uint64_t pg_ver = 0;
8063 auto i = attrs.find(RGW_ATTR_PG_VER);
8064 if (i != attrs.end() && i->second.length() > 0) {
8065 bufferlist::iterator iter = i->second.begin();
8066 try {
8067 ::decode(pg_ver, iter);
8068 } catch (buffer::error& err) {
8069 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
8070 /* non critical error */
8071 }
8072 }
8073 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
8074 }
8075
8076 #define MAX_COMPLETE_RETRY 100
8077 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
8078 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
8079 if (ret < 0) {
8080 goto set_err_state;
8081 }
8082 if (copy_if_newer && cb.is_canceled()) {
8083 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
8084 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
8085 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
8086 if (ret < 0) {
8087 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
8088 goto set_err_state;
8089 }
8090 dest_mtime_weight.init(dest_state);
8091 dest_mtime_weight.high_precision = high_precision_time;
8092 if (!dest_state->exists ||
8093 dest_mtime_weight < set_mtime_weight) {
8094 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8095 continue;
8096 } else {
8097 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
8098 }
8099 }
8100 break;
8101 }
8102
8103 if (i == MAX_COMPLETE_RETRY) {
8104 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
8105 ret = -EIO;
8106 goto set_err_state;
8107 }
8108
8109 if (opstate) {
8110 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
8111 if (ret < 0) {
8112 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
8113 }
8114 delete opstate;
8115 }
8116
8117 return 0;
8118 set_err_state:
8119 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
8120 ret = 0;
8121 }
8122 if (opstate) {
8123 RGWOpState::OpState state;
8124 if (ret < 0) {
8125 state = RGWOpState::OPSTATE_ERROR;
8126 } else {
8127 state = RGWOpState::OPSTATE_COMPLETE;
8128 }
8129 int r = opstate->set_state(state);
8130 if (r < 0) {
8131 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
8132 }
8133 delete opstate;
8134 }
8135 return ret;
8136 }
8137
8138
8139 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
8140 map<string, bufferlist>& src_attrs,
8141 RGWRados::Object::Read& read_op,
8142 const rgw_user& user_id,
8143 rgw_obj& dest_obj,
8144 real_time *mtime)
8145 {
8146 string etag;
8147
8148 RGWRESTStreamWriteRequest *out_stream_req;
8149
8150 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
8151 if (ret < 0) {
8152 return ret;
8153 }
8154
8155 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
8156 if (ret < 0) {
8157 delete out_stream_req;
8158 return ret;
8159 }
8160
8161 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
8162 if (ret < 0)
8163 return ret;
8164
8165 return 0;
8166 }
8167
8168 /**
8169 * Copy an object.
8170 * dest_obj: the object to copy into
8171 * src_obj: the object to copy from
8172 * attrs: usage depends on attrs_mod parameter
8173 * attrs_mod: the modification mode of the attrs, may have the following values:
8174 * ATTRSMOD_NONE - the attributes of the source object will be
8175 * copied without modifications, attrs parameter is ignored;
8176 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
8177 * parameter, source object attributes are not copied;
8178 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
8179 * are overwritten by values contained in attrs parameter.
8180 * err: stores any errors resulting from the get of the original object
8181 * Returns: 0 on success, -ERR# otherwise.
8182 */
8183 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
8184 const rgw_user& user_id,
8185 const string& client_id,
8186 const string& op_id,
8187 req_info *info,
8188 const string& source_zone,
8189 rgw_obj& dest_obj,
8190 rgw_obj& src_obj,
8191 RGWBucketInfo& dest_bucket_info,
8192 RGWBucketInfo& src_bucket_info,
8193 real_time *src_mtime,
8194 real_time *mtime,
8195 const real_time *mod_ptr,
8196 const real_time *unmod_ptr,
8197 bool high_precision_time,
8198 const char *if_match,
8199 const char *if_nomatch,
8200 AttrsMod attrs_mod,
8201 bool copy_if_newer,
8202 map<string, bufferlist>& attrs,
8203 RGWObjCategory category,
8204 uint64_t olh_epoch,
8205 real_time delete_at,
8206 string *version_id,
8207 string *ptag,
8208 ceph::buffer::list *petag,
8209 void (*progress_cb)(off_t, void *),
8210 void *progress_data)
8211 {
8212 int ret;
8213 uint64_t obj_size;
8214 rgw_obj shadow_obj = dest_obj;
8215 string shadow_oid;
8216
8217 bool remote_src;
8218 bool remote_dest;
8219
8220 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
8221 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
8222
8223 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
8224 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
8225
8226 if (remote_src && remote_dest) {
8227 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
8228 return -EINVAL;
8229 }
8230
8231 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
8232
8233 if (remote_src || !source_zone.empty()) {
8234 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
8235 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
8236 unmod_ptr, high_precision_time,
8237 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
8238 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
8239 }
8240
8241 map<string, bufferlist> src_attrs;
8242 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8243 RGWRados::Object::Read read_op(&src_op_target);
8244
8245 read_op.conds.mod_ptr = mod_ptr;
8246 read_op.conds.unmod_ptr = unmod_ptr;
8247 read_op.conds.high_precision_time = high_precision_time;
8248 read_op.conds.if_match = if_match;
8249 read_op.conds.if_nomatch = if_nomatch;
8250 read_op.params.attrs = &src_attrs;
8251 read_op.params.lastmod = src_mtime;
8252 read_op.params.obj_size = &obj_size;
8253
8254 ret = read_op.prepare();
8255 if (ret < 0) {
8256 return ret;
8257 }
8258 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
8259 // Current implementation does not follow S3 spec and even
8260 // may result in data corruption silently when copying
8261 // multipart objects acorss pools. So reject COPY operations
8262 //on encrypted objects before it is fully functional.
8263 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
8264 << " has not been implemented." << dendl;
8265 return -ERR_NOT_IMPLEMENTED;
8266 }
8267
8268 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8269 src_attrs.erase(RGW_ATTR_DELETE_AT);
8270
8271 set_copy_attrs(src_attrs, attrs, attrs_mod);
8272 attrs.erase(RGW_ATTR_ID_TAG);
8273 attrs.erase(RGW_ATTR_PG_VER);
8274 attrs.erase(RGW_ATTR_SOURCE_ZONE);
8275 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8276 if (cmp != src_attrs.end())
8277 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8278
8279 RGWObjManifest manifest;
8280 RGWObjState *astate = NULL;
8281
8282 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8283 if (ret < 0) {
8284 return ret;
8285 }
8286
8287 vector<rgw_raw_obj> ref_objs;
8288
8289 if (remote_dest) {
8290 /* dest is in a different zonegroup, copy it there */
8291 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8292 }
8293 uint64_t max_chunk_size;
8294
8295 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8296 if (ret < 0) {
8297 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8298 return ret;
8299 }
8300
8301 rgw_pool src_pool;
8302 rgw_pool dest_pool;
8303 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8304 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8305 return -EIO;
8306 }
8307 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8308 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8309 return -EIO;
8310 }
8311
8312
8313 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8314 bool copy_first = false;
8315 if (astate->has_manifest) {
8316 if (!astate->manifest.has_tail()) {
8317 copy_data = true;
8318 } else {
8319 uint64_t head_size = astate->manifest.get_head_size();
8320
8321 if (head_size > 0) {
8322 if (head_size > max_chunk_size) {
8323 copy_data = true;
8324 } else {
8325 copy_first = true;
8326 }
8327 }
8328 }
8329 }
8330
8331 if (petag) {
8332 const auto iter = attrs.find(RGW_ATTR_ETAG);
8333 if (iter != attrs.end()) {
8334 *petag = iter->second;
8335 }
8336 }
8337
8338 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8339 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8340 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
8341 version_id, ptag, petag);
8342 }
8343
8344 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8345
8346 if (copy_first) { // we need to copy first chunk, not increase refcount
8347 ++miter;
8348 }
8349
8350 rgw_rados_ref ref;
8351 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8352 if (ret < 0) {
8353 return ret;
8354 }
8355
8356 bool versioned_dest = dest_bucket_info.versioning_enabled();
8357
8358 if (version_id && !version_id->empty()) {
8359 versioned_dest = true;
8360 dest_obj.key.set_instance(*version_id);
8361 } else if (versioned_dest) {
8362 gen_rand_obj_instance_name(&dest_obj);
8363 }
8364
8365 bufferlist first_chunk;
8366
8367 bool copy_itself = (dest_obj == src_obj);
8368 RGWObjManifest *pmanifest;
8369 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
8370
8371 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8372 RGWRados::Object::Write write_op(&dest_op_target);
8373
8374 string tag;
8375
8376 if (ptag) {
8377 tag = *ptag;
8378 }
8379
8380 if (tag.empty()) {
8381 append_rand_alpha(cct, tag, tag, 32);
8382 }
8383
8384 if (!copy_itself) {
8385 attrs.erase(RGW_ATTR_TAIL_TAG);
8386 manifest = astate->manifest;
8387 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8388 if (tail_placement.bucket.name.empty()) {
8389 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8390 }
8391 string ref_tag;
8392 for (; miter != astate->manifest.obj_end(); ++miter) {
8393 ObjectWriteOperation op;
8394 ref_tag = tag + '\0';
8395 cls_refcount_get(op, ref_tag, true);
8396 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8397 ref.ioctx.locator_set_key(loc.loc);
8398
8399 ret = ref.ioctx.operate(loc.oid, &op);
8400 if (ret < 0) {
8401 goto done_ret;
8402 }
8403
8404 ref_objs.push_back(loc);
8405 }
8406
8407 pmanifest = &manifest;
8408 } else {
8409 pmanifest = &astate->manifest;
8410 /* don't send the object's tail for garbage collection */
8411 astate->keep_tail = true;
8412 }
8413
8414 if (copy_first) {
8415 ret = read_op.read(0, max_chunk_size, first_chunk);
8416 if (ret < 0) {
8417 goto done_ret;
8418 }
8419
8420 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8421 } else {
8422 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8423 }
8424
8425 write_op.meta.data = &first_chunk;
8426 write_op.meta.manifest = pmanifest;
8427 write_op.meta.ptag = &tag;
8428 write_op.meta.owner = dest_bucket_info.owner;
8429 write_op.meta.mtime = mtime;
8430 write_op.meta.flags = PUT_OBJ_CREATE;
8431 write_op.meta.category = category;
8432 write_op.meta.olh_epoch = olh_epoch;
8433 write_op.meta.delete_at = delete_at;
8434 write_op.meta.modify_tail = !copy_itself;
8435
8436 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8437 if (ret < 0) {
8438 goto done_ret;
8439 }
8440
8441 return 0;
8442
8443 done_ret:
8444 if (!copy_itself) {
8445 vector<rgw_raw_obj>::iterator riter;
8446
8447 /* rollback reference */
8448 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8449 ObjectWriteOperation op;
8450 cls_refcount_put(op, tag, true);
8451
8452 ref.ioctx.locator_set_key(riter->loc);
8453
8454 int r = ref.ioctx.operate(riter->oid, &op);
8455 if (r < 0) {
8456 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8457 }
8458 }
8459 }
8460 return ret;
8461 }
8462
8463
8464 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8465 RGWBucketInfo& dest_bucket_info,
8466 RGWRados::Object::Read& read_op, off_t end,
8467 rgw_obj& dest_obj,
8468 rgw_obj& src_obj,
8469 uint64_t max_chunk_size,
8470 real_time *mtime,
8471 real_time set_mtime,
8472 map<string, bufferlist>& attrs,
8473 RGWObjCategory category,
8474 uint64_t olh_epoch,
8475 real_time delete_at,
8476 string *version_id,
8477 string *ptag,
8478 ceph::buffer::list *petag)
8479 {
8480 bufferlist first_chunk;
8481 RGWObjManifest manifest;
8482
8483 string tag;
8484 append_rand_alpha(cct, tag, tag, 32);
8485
8486 RGWPutObjProcessor_Atomic processor(obj_ctx,
8487 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
8488 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8489 if (version_id) {
8490 processor.set_version_id(*version_id);
8491 }
8492 processor.set_olh_epoch(olh_epoch);
8493 int ret = processor.prepare(this, NULL);
8494 if (ret < 0)
8495 return ret;
8496
8497 off_t ofs = 0;
8498
8499 do {
8500 bufferlist bl;
8501 ret = read_op.read(ofs, end, bl);
8502
8503 uint64_t read_len = ret;
8504 bool again;
8505
8506 do {
8507 void *handle;
8508 rgw_raw_obj obj;
8509
8510 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8511 if (ret < 0) {
8512 return ret;
8513 }
8514 ret = processor.throttle_data(handle, obj, read_len, false);
8515 if (ret < 0)
8516 return ret;
8517 } while (again);
8518
8519 ofs += read_len;
8520 } while (ofs <= end);
8521
8522 string etag;
8523 auto iter = attrs.find(RGW_ATTR_ETAG);
8524 if (iter != attrs.end()) {
8525 bufferlist& bl = iter->second;
8526 etag = string(bl.c_str(), bl.length());
8527 if (petag) {
8528 *petag = bl;
8529 }
8530 }
8531
8532 uint64_t accounted_size;
8533 {
8534 bool compressed{false};
8535 RGWCompressionInfo cs_info;
8536 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8537 if (ret < 0) {
8538 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8539 return ret;
8540 }
8541 // pass original size if compressed
8542 accounted_size = compressed ? cs_info.orig_size : ofs;
8543 }
8544
8545 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8546 }
8547
8548 bool RGWRados::is_meta_master()
8549 {
8550 if (!get_zonegroup().is_master_zonegroup()) {
8551 return false;
8552 }
8553
8554 return (get_zonegroup().master_zone == zone_public_config.id);
8555 }
8556
8557 /**
8558 * Check to see if the bucket metadata could be synced
8559 * bucket: the bucket to check
8560 * Returns false is the bucket is not synced
8561 */
8562 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8563 {
8564
8565 /* no current period */
8566 if (current_period.get_id().empty()) {
8567 return false;
8568 }
8569
8570 /* zonegroup is not master zonegroup */
8571 if (!get_zonegroup().is_master_zonegroup()) {
8572 return false;
8573 }
8574
8575 /* single zonegroup and a single zone */
8576 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
8577 return false;
8578 }
8579
8580 /* zone is not master */
8581 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8582 return false;
8583 }
8584
8585 return true;
8586 }
8587
8588 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8589 {
8590 std::vector<rgw_bucket_dir_entry> ent_list;
8591 rgw_obj_index_key marker;
8592 string prefix;
8593 bool is_truncated;
8594
8595 do {
8596 constexpr uint NUM_ENTRIES = 1000u;
8597 int r = cls_bucket_list_unordered(bucket_info,
8598 RGW_NO_SHARD,
8599 marker,
8600 prefix,
8601 NUM_ENTRIES,
8602 true,
8603 ent_list,
8604 &is_truncated,
8605 &marker);
8606 if (r < 0)
8607 return r;
8608
8609 string ns;
8610 for (auto const& dirent : ent_list) {
8611 rgw_obj_key obj;
8612
8613 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
8614 return -ENOTEMPTY;
8615 }
8616 } while (is_truncated);
8617
8618 return 0;
8619 }
8620
8621 /**
8622 * Delete a bucket.
8623 * bucket: the name of the bucket to delete
8624 * Returns 0 on success, -ERR# otherwise.
8625 */
8626 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8627 {
8628 const rgw_bucket& bucket = bucket_info.bucket;
8629 librados::IoCtx index_ctx;
8630 map<int, string> bucket_objs;
8631 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8632 if (r < 0)
8633 return r;
8634
8635 if (check_empty) {
8636 r = check_bucket_empty(bucket_info);
8637 if (r < 0) {
8638 return r;
8639 }
8640 }
8641
8642 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8643 if (r < 0)
8644 return r;
8645
8646 /* if the bucket is not synced we can remove the meta file */
8647 if (!is_syncing_bucket_meta(bucket)) {
8648 RGWObjVersionTracker objv_tracker;
8649 string entry = bucket.get_key();
8650 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8651 if (r < 0) {
8652 return r;
8653 }
8654 /* remove bucket index objects*/
8655 map<int, string>::const_iterator biter;
8656 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8657 index_ctx.remove(biter->second);
8658 }
8659 }
8660 return 0;
8661 }
8662
8663 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8664 {
8665 RGWBucketInfo info;
8666 map<string, bufferlist> attrs;
8667 RGWObjectCtx obj_ctx(this);
8668 int r;
8669 if (bucket.bucket_id.empty()) {
8670 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8671 } else {
8672 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8673 }
8674 if (r < 0) {
8675 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8676 return r;
8677 }
8678
8679 info.owner = owner.get_id();
8680
8681 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8682 if (r < 0) {
8683 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8684 return r;
8685 }
8686
8687 return 0;
8688 }
8689
8690
8691 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8692 {
8693 int ret = 0;
8694
8695 vector<rgw_bucket>::iterator iter;
8696
8697 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8698 rgw_bucket& bucket = *iter;
8699 if (enabled)
8700 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8701 else
8702 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8703
8704 RGWBucketInfo info;
8705 map<string, bufferlist> attrs;
8706 RGWObjectCtx obj_ctx(this);
8707 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8708 if (r < 0) {
8709 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8710 ret = r;
8711 continue;
8712 }
8713 if (enabled) {
8714 info.flags &= ~BUCKET_SUSPENDED;
8715 } else {
8716 info.flags |= BUCKET_SUSPENDED;
8717 }
8718
8719 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8720 if (r < 0) {
8721 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8722 ret = r;
8723 continue;
8724 }
8725 }
8726 return ret;
8727 }
8728
8729 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8730 {
8731 RGWBucketInfo bucket_info;
8732 RGWObjectCtx obj_ctx(this);
8733 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8734 if (ret < 0) {
8735 return ret;
8736 }
8737
8738 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8739 return 0;
8740 }
8741
8742 int RGWRados::Object::complete_atomic_modification()
8743 {
8744 if (!state->has_manifest || state->keep_tail)
8745 return 0;
8746
8747 cls_rgw_obj_chain chain;
8748 store->update_gc_chain(obj, state->manifest, &chain);
8749
8750 if (chain.empty()) {
8751 return 0;
8752 }
8753
8754 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
8755 return store->gc->send_chain(chain, tag, false); // do it async
8756 }
8757
8758 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8759 {
8760 RGWObjManifest::obj_iterator iter;
8761 rgw_raw_obj raw_head;
8762 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8763 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8764 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8765 if (mobj == raw_head)
8766 continue;
8767 cls_rgw_obj_key key(mobj.oid);
8768 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8769 }
8770 }
8771
8772 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8773 {
8774 return gc->send_chain(chain, tag, sync);
8775 }
8776
8777 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8778 librados::IoCtx& index_ctx,
8779 string& bucket_oid)
8780 {
8781 const rgw_bucket& bucket = bucket_info.bucket;
8782 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8783 if (r < 0)
8784 return r;
8785
8786 if (bucket.bucket_id.empty()) {
8787 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8788 return -EIO;
8789 }
8790
8791 bucket_oid = dir_oid_prefix;
8792 bucket_oid.append(bucket.bucket_id);
8793
8794 return 0;
8795 }
8796
8797 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
8798 librados::IoCtx& index_ctx,
8799 string& bucket_oid_base) {
8800 const rgw_bucket& bucket = bucket_info.bucket;
8801 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8802 if (r < 0)
8803 return r;
8804
8805 if (bucket.bucket_id.empty()) {
8806 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8807 return -EIO;
8808 }
8809
8810 bucket_oid_base = dir_oid_prefix;
8811 bucket_oid_base.append(bucket.bucket_id);
8812
8813 return 0;
8814
8815 }
8816
8817 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
8818 librados::IoCtx& index_ctx,
8819 map<int, string>& bucket_objs,
8820 int shard_id,
8821 map<int, string> *bucket_instance_ids) {
8822 string bucket_oid_base;
8823 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8824 if (ret < 0) {
8825 return ret;
8826 }
8827
8828 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8829 if (bucket_instance_ids) {
8830 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8831 }
8832 return 0;
8833 }
8834
8835 template<typename T>
8836 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8837 map<int, string>& oids, map<int, T>& bucket_objs,
8838 int shard_id, map<int, string> *bucket_instance_ids)
8839 {
8840 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8841 if (ret < 0)
8842 return ret;
8843
8844 map<int, string>::const_iterator iter = oids.begin();
8845 for (; iter != oids.end(); ++iter) {
8846 bucket_objs[iter->first] = T();
8847 }
8848 return 0;
8849 }
8850
8851 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8852 const string& obj_key, string *bucket_obj, int *shard_id)
8853 {
8854 string bucket_oid_base;
8855 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8856 if (ret < 0)
8857 return ret;
8858
8859 RGWObjectCtx obj_ctx(this);
8860
8861 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8862 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8863 if (ret < 0) {
8864 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8865 return ret;
8866 }
8867 return 0;
8868 }
8869
8870 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8871 int shard_id, string *bucket_obj)
8872 {
8873 string bucket_oid_base;
8874 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8875 if (ret < 0)
8876 return ret;
8877
8878 RGWObjectCtx obj_ctx(this);
8879
8880 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8881 shard_id, bucket_obj);
8882 return 0;
8883 }
8884
8885 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8886 map<RGWObjCategory, RGWStorageStats>& stats)
8887 {
8888 for (const auto& pair : header.stats) {
8889 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8890 const rgw_bucket_category_stats& header_stats = pair.second;
8891
8892 RGWStorageStats& s = stats[category];
8893
8894 s.category = category;
8895 s.size += header_stats.total_size;
8896 s.size_rounded += header_stats.total_size_rounded;
8897 s.size_utilized += header_stats.actual_size;
8898 s.num_objects += header_stats.num_entries;
8899 }
8900 }
8901
8902 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8903 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8904 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8905 {
8906 librados::IoCtx index_ctx;
8907 // key - bucket index object id
8908 // value - bucket index check OP returned result with the given bucket index object (shard)
8909 map<int, string> oids;
8910 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8911
8912 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8913 if (ret < 0) {
8914 return ret;
8915 }
8916
8917 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8918 if (ret < 0) {
8919 return ret;
8920 }
8921
8922 // Aggregate results (from different shards if there is any)
8923 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8924 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8925 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8926 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8927 }
8928
8929 return 0;
8930 }
8931
8932 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8933 {
8934 librados::IoCtx index_ctx;
8935 map<int, string> bucket_objs;
8936
8937 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8938 if (r < 0) {
8939 return r;
8940 }
8941
8942 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8943 }
8944
8945 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8946 {
8947 librados::IoCtx index_ctx;
8948 map<int, string> bucket_objs;
8949
8950 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8951 if (r < 0) {
8952 return r;
8953 }
8954
8955 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8956 }
8957
8958 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8959 {
8960 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8961 std::string oid, key;
8962 get_obj_bucket_and_oid_loc(obj, oid, key);
8963 if (!rctx)
8964 return 0;
8965
8966 RGWObjState *state = NULL;
8967
8968 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8969 if (r < 0)
8970 return r;
8971
8972 if (!state->is_atomic) {
8973 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8974 return -EINVAL;
8975 }
8976
8977 string tag;
8978
8979 if (state->tail_tag.length() > 0) {
8980 tag = state->tail_tag.c_str();
8981 } else if (state->obj_tag.length() > 0) {
8982 tag = state->obj_tag.c_str();
8983 } else {
8984 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8985 return -EINVAL;
8986 }
8987
8988 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8989
8990 return gc->defer_chain(tag, false);
8991 }
8992
8993 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8994 {
8995 list<string> prefixes;
8996 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8997 cls_rgw_remove_obj(op, prefixes);
8998 }
8999
9000 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
9001 {
9002 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
9003 }
9004
9005 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
9006 {
9007 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
9008 }
9009
9010
9011 /**
9012 * Delete an object.
9013 * bucket: name of the bucket storing the object
9014 * obj: name of the object to delete
9015 * Returns: 0 on success, -ERR# otherwise.
9016 */
9017 int RGWRados::Object::Delete::delete_obj()
9018 {
9019 RGWRados *store = target->get_store();
9020 rgw_obj& src_obj = target->get_obj();
9021 const string& instance = src_obj.key.instance;
9022 rgw_obj obj = src_obj;
9023
9024 if (instance == "null") {
9025 obj.key.instance.clear();
9026 }
9027
9028 bool explicit_marker_version = (!params.marker_version_id.empty());
9029
9030 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
9031 if (instance.empty() || explicit_marker_version) {
9032 rgw_obj marker = obj;
9033
9034 if (!params.marker_version_id.empty()) {
9035 if (params.marker_version_id != "null") {
9036 marker.key.set_instance(params.marker_version_id);
9037 }
9038 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
9039 store->gen_rand_obj_instance_name(&marker);
9040 }
9041
9042 result.version_id = marker.key.instance;
9043 result.delete_marker = true;
9044
9045 struct rgw_bucket_dir_entry_meta meta;
9046
9047 meta.owner = params.obj_owner.get_id().to_str();
9048 meta.owner_display_name = params.obj_owner.get_display_name();
9049
9050 if (real_clock::is_zero(params.mtime)) {
9051 meta.mtime = real_clock::now();
9052 } else {
9053 meta.mtime = params.mtime;
9054 }
9055
9056 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
9057 if (r < 0) {
9058 return r;
9059 }
9060 } else {
9061 rgw_bucket_dir_entry dirent;
9062
9063 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
9064 if (r < 0) {
9065 return r;
9066 }
9067 result.delete_marker = dirent.is_delete_marker();
9068 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
9069 if (r < 0) {
9070 return r;
9071 }
9072 result.version_id = instance;
9073 }
9074
9075 BucketShard *bs;
9076 int r = target->get_bucket_shard(&bs);
9077 if (r < 0) {
9078 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
9079 return r;
9080 }
9081
9082 if (target->bucket_info.datasync_flag_enabled()) {
9083 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9084 if (r < 0) {
9085 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9086 return r;
9087 }
9088 }
9089
9090 return 0;
9091 }
9092
9093 rgw_rados_ref ref;
9094 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
9095 if (r < 0) {
9096 return r;
9097 }
9098
9099 RGWObjState *state;
9100 r = target->get_state(&state, false);
9101 if (r < 0)
9102 return r;
9103
9104 ObjectWriteOperation op;
9105
9106 if (!real_clock::is_zero(params.unmod_since)) {
9107 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
9108 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
9109 if (!params.high_precision_time) {
9110 ctime.tv_nsec = 0;
9111 unmod.tv_nsec = 0;
9112 }
9113
9114 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
9115 if (ctime > unmod) {
9116 return -ERR_PRECONDITION_FAILED;
9117 }
9118
9119 /* only delete object if mtime is less than or equal to params.unmod_since */
9120 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
9121 }
9122 uint64_t obj_size = state->size;
9123
9124 if (!real_clock::is_zero(params.expiration_time)) {
9125 bufferlist bl;
9126 real_time delete_at;
9127
9128 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
9129 try {
9130 bufferlist::iterator iter = bl.begin();
9131 ::decode(delete_at, iter);
9132 } catch (buffer::error& err) {
9133 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
9134 return -EIO;
9135 }
9136
9137 if (params.expiration_time != delete_at) {
9138 return -ERR_PRECONDITION_FAILED;
9139 }
9140 } else {
9141 return -ERR_PRECONDITION_FAILED;
9142 }
9143 }
9144
9145 if (!state->exists) {
9146 target->invalidate_state();
9147 return -ENOENT;
9148 }
9149
9150 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
9151 if (r < 0)
9152 return r;
9153
9154 RGWBucketInfo& bucket_info = target->get_bucket_info();
9155
9156 RGWRados::Bucket bop(store, bucket_info);
9157 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9158
9159 index_op.set_zones_trace(params.zones_trace);
9160 index_op.set_bilog_flags(params.bilog_flags);
9161
9162 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
9163 if (r < 0)
9164 return r;
9165
9166 store->remove_rgw_head_obj(op);
9167 r = ref.ioctx.operate(ref.oid, &op);
9168
9169 /* raced with another operation, object state is indeterminate */
9170 const bool need_invalidate = (r == -ECANCELED);
9171
9172 int64_t poolid = ref.ioctx.get_id();
9173 if (r >= 0) {
9174 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
9175 if (obj_tombstone_cache) {
9176 tombstone_entry entry{*state};
9177 obj_tombstone_cache->add(obj, entry);
9178 }
9179 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
9180
9181 int ret = target->complete_atomic_modification();
9182 if (ret < 0) {
9183 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
9184 }
9185 /* other than that, no need to propagate error */
9186 } else {
9187 int ret = index_op.cancel();
9188 if (ret < 0) {
9189 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
9190 }
9191 }
9192
9193 if (need_invalidate) {
9194 target->invalidate_state();
9195 }
9196
9197 if (r < 0)
9198 return r;
9199
9200 /* update quota cache */
9201 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
9202
9203 return 0;
9204 }
9205
9206 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
9207 const RGWBucketInfo& bucket_info,
9208 const rgw_obj& obj,
9209 int versioning_status,
9210 uint16_t bilog_flags,
9211 const real_time& expiration_time,
9212 rgw_zone_set *zones_trace)
9213 {
9214 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
9215 RGWRados::Object::Delete del_op(&del_target);
9216
9217 del_op.params.bucket_owner = bucket_info.owner;
9218 del_op.params.versioning_status = versioning_status;
9219 del_op.params.bilog_flags = bilog_flags;
9220 del_op.params.expiration_time = expiration_time;
9221 del_op.params.zones_trace = zones_trace;
9222
9223 return del_op.delete_obj();
9224 }
9225
9226 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
9227 {
9228 rgw_rados_ref ref;
9229 int r = get_raw_obj_ref(obj, &ref);
9230 if (r < 0) {
9231 return r;
9232 }
9233
9234 ObjectWriteOperation op;
9235
9236 op.remove();
9237 r = ref.ioctx.operate(ref.oid, &op);
9238 if (r < 0)
9239 return r;
9240
9241 return 0;
9242 }
9243
9244 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
9245 {
9246 if (obj.empty()) {
9247 ldout(cct, 1) << "delete_system_obj got empty object name "
9248 << obj << ", returning EINVAL" << dendl;
9249 return -EINVAL;
9250 }
9251 rgw_rados_ref ref;
9252 int r = get_raw_obj_ref(obj, &ref);
9253 if (r < 0) {
9254 return r;
9255 }
9256
9257 ObjectWriteOperation op;
9258
9259 if (objv_tracker) {
9260 objv_tracker->prepare_op_for_write(&op);
9261 }
9262
9263 op.remove();
9264 r = ref.ioctx.operate(ref.oid, &op);
9265 if (r < 0)
9266 return r;
9267
9268 return 0;
9269 }
9270
9271 int RGWRados::delete_obj_index(const rgw_obj& obj)
9272 {
9273 std::string oid, key;
9274 get_obj_bucket_and_oid_loc(obj, oid, key);
9275
9276 RGWObjectCtx obj_ctx(this);
9277
9278 RGWBucketInfo bucket_info;
9279 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9280 if (ret < 0) {
9281 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9282 return ret;
9283 }
9284
9285 RGWRados::Bucket bop(this, bucket_info);
9286 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9287
9288 real_time removed_mtime;
9289 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9290
9291 return r;
9292 }
9293
9294 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9295 {
9296 string tag;
9297
9298 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9299 if (mi != manifest.obj_end()) {
9300 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9301 ++mi;
9302 tag = mi.get_location().get_raw_obj(store).oid;
9303 tag.append("_");
9304 }
9305
9306 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9307 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9308 MD5 hash;
9309 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9310
9311 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9312 if (iter != attrset.end()) {
9313 bufferlist& bl = iter->second;
9314 hash.Update((const byte *)bl.c_str(), bl.length());
9315 }
9316
9317 hash.Final(md5);
9318 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9319 tag.append(md5_str);
9320
9321 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9322
9323 tag_bl.append(tag.c_str(), tag.size() + 1);
9324 }
9325
9326 static bool is_olh(map<string, bufferlist>& attrs)
9327 {
9328 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9329 return (iter != attrs.end());
9330 }
9331
9332 static bool has_olh_tag(map<string, bufferlist>& attrs)
9333 {
9334 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9335 return (iter != attrs.end());
9336 }
9337
9338 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9339 RGWObjState *olh_state, RGWObjState **target_state)
9340 {
9341 assert(olh_state->is_olh);
9342
9343 rgw_obj target;
9344 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9345 if (r < 0) {
9346 return r;
9347 }
9348 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9349 if (r < 0) {
9350 return r;
9351 }
9352
9353 return 0;
9354 }
9355
9356 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9357 {
9358 if (obj.empty()) {
9359 return -EINVAL;
9360 }
9361
9362 RGWRawObjState *s = rctx->raw.get_state(obj);
9363 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9364 *state = s;
9365 if (s->has_attrs) {
9366 return 0;
9367 }
9368
9369 s->obj = obj;
9370
9371 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9372 if (r == -ENOENT) {
9373 s->exists = false;
9374 s->has_attrs = true;
9375 s->mtime = real_time();
9376 return 0;
9377 }
9378 if (r < 0)
9379 return r;
9380
9381 s->exists = true;
9382 s->has_attrs = true;
9383 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9384
9385 if (s->obj_tag.length())
9386 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9387 << s->obj_tag.c_str() << dendl;
9388 else
9389 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9390
9391 return 0;
9392 }
9393
9394 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9395 {
9396 int ret;
9397
9398 do {
9399 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9400 } while (ret == -EAGAIN);
9401
9402 return ret;
9403 }
9404
9405 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9406 RGWObjState **state, bool follow_olh, bool assume_noent)
9407 {
9408 if (obj.empty()) {
9409 return -EINVAL;
9410 }
9411
9412 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9413
9414 RGWObjState *s = rctx->obj.get_state(obj);
9415 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9416 *state = s;
9417 if (s->has_attrs) {
9418 if (s->is_olh && need_follow_olh) {
9419 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9420 }
9421 return 0;
9422 }
9423
9424 s->obj = obj;
9425
9426 rgw_raw_obj raw_obj;
9427 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9428
9429 int r = -ENOENT;
9430
9431 if (!assume_noent) {
9432 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9433 }
9434
9435 if (r == -ENOENT) {
9436 s->exists = false;
9437 s->has_attrs = true;
9438 tombstone_entry entry;
9439 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9440 s->mtime = entry.mtime;
9441 s->zone_short_id = entry.zone_short_id;
9442 s->pg_ver = entry.pg_ver;
9443 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9444 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9445 } else {
9446 s->mtime = real_time();
9447 }
9448 return 0;
9449 }
9450 if (r < 0)
9451 return r;
9452
9453 s->exists = true;
9454 s->has_attrs = true;
9455 s->accounted_size = s->size;
9456
9457 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9458 const bool compressed = (iter != s->attrset.end());
9459 if (compressed) {
9460 // use uncompressed size for accounted_size
9461 try {
9462 RGWCompressionInfo info;
9463 auto p = iter->second.begin();
9464 ::decode(info, p);
9465 s->accounted_size = info.orig_size;
9466 } catch (buffer::error&) {
9467 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9468 return -EIO;
9469 }
9470 }
9471
9472 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9473 if (iter != s->attrset.end()) {
9474 bufferlist bl = iter->second;
9475 bufferlist::iterator it = bl.begin();
9476 it.copy(bl.length(), s->shadow_obj);
9477 s->shadow_obj[bl.length()] = '\0';
9478 }
9479 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9480 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9481 if (ttiter != s->attrset.end()) {
9482 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9483 }
9484
9485 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9486 if (manifest_bl.length()) {
9487 bufferlist::iterator miter = manifest_bl.begin();
9488 try {
9489 ::decode(s->manifest, miter);
9490 s->has_manifest = true;
9491 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9492 broken due to old bugs */
9493 s->size = s->manifest.get_obj_size();
9494 if (!compressed)
9495 s->accounted_size = s->size;
9496 } catch (buffer::error& err) {
9497 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9498 return -EIO;
9499 }
9500 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9501 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9502 RGWObjManifest::obj_iterator mi;
9503 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9504 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9505 }
9506 }
9507
9508 if (!s->obj_tag.length()) {
9509 /*
9510 * Uh oh, something's wrong, object with manifest should have tag. Let's
9511 * create one out of the manifest, would be unique
9512 */
9513 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9514 s->fake_tag = true;
9515 }
9516 }
9517 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9518 if (aiter != s->attrset.end()) {
9519 bufferlist& pg_ver_bl = aiter->second;
9520 if (pg_ver_bl.length()) {
9521 bufferlist::iterator pgbl = pg_ver_bl.begin();
9522 try {
9523 ::decode(s->pg_ver, pgbl);
9524 } catch (buffer::error& err) {
9525 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9526 }
9527 }
9528 }
9529 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9530 if (aiter != s->attrset.end()) {
9531 bufferlist& zone_short_id_bl = aiter->second;
9532 if (zone_short_id_bl.length()) {
9533 bufferlist::iterator zbl = zone_short_id_bl.begin();
9534 try {
9535 ::decode(s->zone_short_id, zbl);
9536 } catch (buffer::error& err) {
9537 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9538 }
9539 }
9540 }
9541 if (s->obj_tag.length())
9542 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9543 else
9544 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9545
9546 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9547 * it exist, and not only if is_olh() returns true
9548 */
9549 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9550 if (iter != s->attrset.end()) {
9551 s->olh_tag = iter->second;
9552 }
9553
9554 if (is_olh(s->attrset)) {
9555 s->is_olh = true;
9556
9557 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9558
9559 if (need_follow_olh) {
9560 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9561 }
9562 }
9563
9564 return 0;
9565 }
9566
9567 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9568 bool follow_olh, bool assume_noent)
9569 {
9570 int ret;
9571
9572 do {
9573 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9574 } while (ret == -EAGAIN);
9575
9576 return ret;
9577 }
9578
9579 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9580 {
9581 RGWObjState *astate;
9582 int r = get_state(&astate, true);
9583 if (r < 0) {
9584 return r;
9585 }
9586
9587 *pmanifest = &astate->manifest;
9588
9589 return 0;
9590 }
9591
9592 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9593 {
9594 RGWObjState *state;
9595 int r = source->get_state(&state, true);
9596 if (r < 0)
9597 return r;
9598 if (!state->exists)
9599 return -ENOENT;
9600 if (!state->get_attr(name, dest))
9601 return -ENODATA;
9602
9603 return 0;
9604 }
9605
9606
9607 int RGWRados::Object::Stat::stat_async()
9608 {
9609 RGWObjectCtx& ctx = source->get_ctx();
9610 rgw_obj& obj = source->get_obj();
9611 RGWRados *store = source->get_store();
9612
9613 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9614 result.obj = obj;
9615 if (s->has_attrs) {
9616 state.ret = 0;
9617 result.size = s->size;
9618 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9619 result.attrs = s->attrset;
9620 result.has_manifest = s->has_manifest;
9621 result.manifest = s->manifest;
9622 return 0;
9623 }
9624
9625 string oid;
9626 string loc;
9627 get_obj_bucket_and_oid_loc(obj, oid, loc);
9628
9629 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9630 if (r < 0) {
9631 return r;
9632 }
9633
9634 librados::ObjectReadOperation op;
9635 op.stat2(&result.size, &result.mtime, NULL);
9636 op.getxattrs(&result.attrs, NULL);
9637 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9638 state.io_ctx.locator_set_key(loc);
9639 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9640 if (r < 0) {
9641 ldout(store->ctx(), 5) << __func__
9642 << ": ERROR: aio_operate() returned ret=" << r
9643 << dendl;
9644 return r;
9645 }
9646
9647 return 0;
9648 }
9649
9650
9651 int RGWRados::Object::Stat::wait()
9652 {
9653 if (!state.completion) {
9654 return state.ret;
9655 }
9656
9657 state.completion->wait_for_safe();
9658 state.ret = state.completion->get_return_value();
9659 state.completion->release();
9660
9661 if (state.ret != 0) {
9662 return state.ret;
9663 }
9664
9665 return finish();
9666 }
9667
9668 int RGWRados::Object::Stat::finish()
9669 {
9670 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9671 if (iter != result.attrs.end()) {
9672 bufferlist& bl = iter->second;
9673 bufferlist::iterator biter = bl.begin();
9674 try {
9675 ::decode(result.manifest, biter);
9676 } catch (buffer::error& err) {
9677 RGWRados *store = source->get_store();
9678 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9679 return -EIO;
9680 }
9681 result.has_manifest = true;
9682 }
9683
9684 return 0;
9685 }
9686
9687 /**
9688 * Get an attribute for a system object.
9689 * obj: the object to get attr
9690 * name: name of the attr to retrieve
9691 * dest: bufferlist to store the result in
9692 * Returns: 0 on success, -ERR# otherwise.
9693 */
9694 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9695 {
9696 rgw_rados_ref ref;
9697 int r = get_system_obj_ref(obj, &ref);
9698 if (r < 0) {
9699 return r;
9700 }
9701
9702 ObjectReadOperation op;
9703
9704 int rval;
9705 op.getxattr(name, &dest, &rval);
9706
9707 r = ref.ioctx.operate(ref.oid, &op, NULL);
9708 if (r < 0)
9709 return r;
9710
9711 return 0;
9712 }
9713
9714 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9715 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9716 ObjectOperation& op, RGWObjState **pstate)
9717 {
9718 if (!rctx)
9719 return 0;
9720
9721 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9722 if (r < 0)
9723 return r;
9724
9725 RGWObjState *state = *pstate;
9726
9727 if (!state->is_atomic) {
9728 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9729 return 0;
9730 }
9731
9732 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9733 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9734 } else {
9735 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9736 }
9737 return 0;
9738 }
9739
9740 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9741 {
9742 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9743 }
9744
9745 void RGWRados::Object::invalidate_state()
9746 {
9747 ctx.obj.invalidate(obj);
9748 }
9749
9750 void RGWRados::SystemObject::invalidate_state()
9751 {
9752 ctx.raw.invalidate(obj);
9753 }
9754
9755 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9756 const char *if_match, const char *if_nomatch, bool removal_op,
9757 bool modify_tail)
9758 {
9759 int r = get_state(&state, false);
9760 if (r < 0)
9761 return r;
9762
9763 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9764 if_match != NULL || if_nomatch != NULL) &&
9765 (!state->fake_tag);
9766
9767 if (!state->is_atomic) {
9768 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9769
9770 if (reset_obj) {
9771 op.create(false);
9772 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9773 }
9774
9775 return 0;
9776 }
9777
9778 if (need_guard) {
9779 /* first verify that the object wasn't replaced under */
9780 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9781 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9782 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9783 }
9784
9785 if (if_match) {
9786 if (strcmp(if_match, "*") == 0) {
9787 // test the object is existing
9788 if (!state->exists) {
9789 return -ERR_PRECONDITION_FAILED;
9790 }
9791 } else {
9792 bufferlist bl;
9793 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9794 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9795 return -ERR_PRECONDITION_FAILED;
9796 }
9797 }
9798 }
9799
9800 if (if_nomatch) {
9801 if (strcmp(if_nomatch, "*") == 0) {
9802 // test the object is NOT existing
9803 if (state->exists) {
9804 return -ERR_PRECONDITION_FAILED;
9805 }
9806 } else {
9807 bufferlist bl;
9808 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9809 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9810 return -ERR_PRECONDITION_FAILED;
9811 }
9812 }
9813 }
9814 }
9815
9816 if (reset_obj) {
9817 if (state->exists) {
9818 op.create(false);
9819 store->remove_rgw_head_obj(op);
9820 } else {
9821 op.create(true);
9822 }
9823 }
9824
9825 if (removal_op) {
9826 /* the object is being removed, no need to update its tag */
9827 return 0;
9828 }
9829
9830 if (ptag) {
9831 state->write_tag = *ptag;
9832 } else {
9833 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9834 }
9835 bufferlist bl;
9836 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9837
9838 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9839
9840 op.setxattr(RGW_ATTR_ID_TAG, bl);
9841 if (modify_tail) {
9842 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9843 }
9844
9845 return 0;
9846 }
9847
9848 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9849 RGWObjVersionTracker *objv_tracker)
9850 {
9851 map<string, bufferlist> attrs;
9852 attrs[name] = bl;
9853 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9854 }
9855
9856 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9857 map<string, bufferlist>& attrs,
9858 map<string, bufferlist>* rmattrs,
9859 RGWObjVersionTracker *objv_tracker)
9860 {
9861 rgw_rados_ref ref;
9862 int r = get_system_obj_ref(obj, &ref);
9863 if (r < 0) {
9864 return r;
9865 }
9866 ObjectWriteOperation op;
9867
9868 if (objv_tracker) {
9869 objv_tracker->prepare_op_for_write(&op);
9870 }
9871
9872 map<string, bufferlist>::iterator iter;
9873 if (rmattrs) {
9874 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9875 const string& name = iter->first;
9876 op.rmxattr(name.c_str());
9877 }
9878 }
9879
9880 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9881 const string& name = iter->first;
9882 bufferlist& bl = iter->second;
9883
9884 if (!bl.length())
9885 continue;
9886
9887 op.setxattr(name.c_str(), bl);
9888 }
9889
9890 if (!op.size())
9891 return 0;
9892
9893 bufferlist bl;
9894
9895 r = ref.ioctx.operate(ref.oid, &op);
9896 if (r < 0)
9897 return r;
9898
9899 return 0;
9900 }
9901
9902 /**
9903 * Set an attr on an object.
9904 * bucket: name of the bucket holding the object
9905 * obj: name of the object to set the attr on
9906 * name: the attr to set
9907 * bl: the contents of the attr
9908 * Returns: 0 on success, -ERR# otherwise.
9909 */
9910 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9911 {
9912 map<string, bufferlist> attrs;
9913 attrs[name] = bl;
9914 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9915 }
9916
9917 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9918 map<string, bufferlist>& attrs,
9919 map<string, bufferlist>* rmattrs)
9920 {
9921 rgw_rados_ref ref;
9922 int r = get_obj_head_ref(bucket_info, obj, &ref);
9923 if (r < 0) {
9924 return r;
9925 }
9926 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9927
9928 ObjectWriteOperation op;
9929 RGWObjState *state = NULL;
9930
9931 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9932 if (r < 0)
9933 return r;
9934
9935 map<string, bufferlist>::iterator iter;
9936 if (rmattrs) {
9937 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9938 const string& name = iter->first;
9939 op.rmxattr(name.c_str());
9940 }
9941 }
9942
9943 const rgw_bucket& bucket = obj.bucket;
9944
9945 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9946 const string& name = iter->first;
9947 bufferlist& bl = iter->second;
9948
9949 if (!bl.length())
9950 continue;
9951
9952 op.setxattr(name.c_str(), bl);
9953
9954 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9955 real_time ts;
9956 try {
9957 ::decode(ts, bl);
9958
9959 rgw_obj_index_key obj_key;
9960 obj.key.get_index_key(&obj_key);
9961
9962 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9963 } catch (buffer::error& err) {
9964 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9965 }
9966 }
9967 }
9968
9969 if (!op.size())
9970 return 0;
9971
9972 RGWObjectCtx obj_ctx(this);
9973
9974 bufferlist bl;
9975 RGWRados::Bucket bop(this, bucket_info);
9976 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9977
9978 if (state) {
9979 string tag;
9980 append_rand_alpha(cct, tag, tag, 32);
9981 state->write_tag = tag;
9982 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9983
9984 if (r < 0)
9985 return r;
9986
9987 bl.append(tag.c_str(), tag.size() + 1);
9988 op.setxattr(RGW_ATTR_ID_TAG, bl);
9989 }
9990
9991
9992 real_time mtime = real_clock::now();
9993 struct timespec mtime_ts = real_clock::to_timespec(mtime);
9994 op.mtime2(&mtime_ts);
9995 r = ref.ioctx.operate(ref.oid, &op);
9996 if (state) {
9997 if (r >= 0) {
9998 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9999 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
10000 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
10001 string etag(etag_bl.c_str(), etag_bl.length());
10002 string content_type(content_type_bl.c_str(), content_type_bl.length());
10003 uint64_t epoch = ref.ioctx.get_last_version();
10004 int64_t poolid = ref.ioctx.get_id();
10005 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
10006 mtime, etag, content_type, &acl_bl,
10007 RGW_OBJ_CATEGORY_MAIN, NULL);
10008 } else {
10009 int ret = index_op.cancel();
10010 if (ret < 0) {
10011 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
10012 }
10013 }
10014 }
10015 if (r < 0)
10016 return r;
10017
10018 if (state) {
10019 state->obj_tag.swap(bl);
10020 if (rmattrs) {
10021 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
10022 state->attrset.erase(iter->first);
10023 }
10024 }
10025 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
10026 state->attrset[iter->first] = iter->second;
10027 }
10028 }
10029
10030 return 0;
10031 }
10032
10033 int RGWRados::Object::Read::prepare()
10034 {
10035 RGWRados *store = source->get_store();
10036 CephContext *cct = store->ctx();
10037
10038 bufferlist etag;
10039
10040 map<string, bufferlist>::iterator iter;
10041
10042 RGWObjState *astate;
10043 int r = source->get_state(&astate, true);
10044 if (r < 0)
10045 return r;
10046
10047 if (!astate->exists) {
10048 return -ENOENT;
10049 }
10050
10051 const RGWBucketInfo& bucket_info = source->get_bucket_info();
10052
10053 state.obj = astate->obj;
10054 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
10055
10056 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
10057 if (r < 0) {
10058 return r;
10059 }
10060 if (params.attrs) {
10061 *params.attrs = astate->attrset;
10062 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10063 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
10064 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10065 }
10066 }
10067 }
10068
10069 /* Convert all times go GMT to make them compatible */
10070 if (conds.mod_ptr || conds.unmod_ptr) {
10071 obj_time_weight src_weight;
10072 src_weight.init(astate);
10073 src_weight.high_precision = conds.high_precision_time;
10074
10075 obj_time_weight dest_weight;
10076 dest_weight.high_precision = conds.high_precision_time;
10077
10078 if (conds.mod_ptr) {
10079 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10080 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10081 if (!(dest_weight < src_weight)) {
10082 return -ERR_NOT_MODIFIED;
10083 }
10084 }
10085
10086 if (conds.unmod_ptr) {
10087 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
10088 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
10089 if (dest_weight < src_weight) {
10090 return -ERR_PRECONDITION_FAILED;
10091 }
10092 }
10093 }
10094 if (conds.if_match || conds.if_nomatch) {
10095 r = get_attr(RGW_ATTR_ETAG, etag);
10096 if (r < 0)
10097 return r;
10098
10099 if (conds.if_match) {
10100 string if_match_str = rgw_string_unquote(conds.if_match);
10101 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
10102 if (if_match_str.compare(etag.c_str()) != 0) {
10103 return -ERR_PRECONDITION_FAILED;
10104 }
10105 }
10106
10107 if (conds.if_nomatch) {
10108 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
10109 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
10110 if (if_nomatch_str.compare(etag.c_str()) == 0) {
10111 return -ERR_NOT_MODIFIED;
10112 }
10113 }
10114 }
10115
10116 if (params.obj_size)
10117 *params.obj_size = astate->size;
10118 if (params.lastmod)
10119 *params.lastmod = astate->mtime;
10120
10121 return 0;
10122 }
10123
10124 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
10125 {
10126 if (ofs < 0) {
10127 ofs += obj_size;
10128 if (ofs < 0)
10129 ofs = 0;
10130 end = obj_size - 1;
10131 } else if (end < 0) {
10132 end = obj_size - 1;
10133 }
10134
10135 if (obj_size > 0) {
10136 if (ofs >= (off_t)obj_size) {
10137 return -ERANGE;
10138 }
10139 if (end >= (off_t)obj_size) {
10140 end = obj_size - 1;
10141 }
10142 }
10143 return 0;
10144 }
10145
10146 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
10147 {
10148 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
10149 }
10150
10151 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
10152 RGWRados::SystemObject::Read::GetObjState& state,
10153 rgw_raw_obj& obj,
10154 map<string, bufferlist> *attrs,
10155 real_time *lastmod,
10156 uint64_t *obj_size,
10157 RGWObjVersionTracker *objv_tracker)
10158 {
10159 RGWRawObjState *astate = NULL;
10160
10161 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
10162 if (r < 0)
10163 return r;
10164
10165 if (!astate->exists) {
10166 return -ENOENT;
10167 }
10168
10169 if (attrs) {
10170 *attrs = astate->attrset;
10171 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10172 map<string, bufferlist>::iterator iter;
10173 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
10174 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10175 }
10176 }
10177 }
10178
10179 if (obj_size)
10180 *obj_size = astate->size;
10181 if (lastmod)
10182 *lastmod = astate->mtime;
10183
10184 return 0;
10185 }
10186
10187
10188 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
10189 {
10190 RGWRados *store = target->get_store();
10191 BucketShard *bs;
10192 int r;
10193
10194 #define NUM_RESHARD_RETRIES 10
10195 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10196 int ret = get_bucket_shard(&bs);
10197 if (ret < 0) {
10198 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10199 return ret;
10200 }
10201 r = call(bs);
10202 if (r != -ERR_BUSY_RESHARDING) {
10203 break;
10204 }
10205 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10206 string new_bucket_id;
10207 r = store->block_while_resharding(bs, &new_bucket_id);
10208 if (r == -ERR_BUSY_RESHARDING) {
10209 continue;
10210 }
10211 if (r < 0) {
10212 return r;
10213 }
10214 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10215 i = 0; /* resharding is finished, make sure we can retry */
10216 r = target->update_bucket_id(new_bucket_id);
10217 if (r < 0) {
10218 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
10219 return r;
10220 }
10221 invalidate_bs();
10222 }
10223
10224 if (r < 0) {
10225 return r;
10226 }
10227
10228 if (pbs) {
10229 *pbs = bs;
10230 }
10231
10232 return 0;
10233 }
10234
10235 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
10236 {
10237 RGWRados *store = source->get_store();
10238 rgw_raw_obj& obj = source->get_obj();
10239
10240 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
10241 stat_params.lastmod, stat_params.obj_size, objv_tracker);
10242 }
10243
10244 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
10245 {
10246 if (blind) {
10247 return 0;
10248 }
10249 RGWRados *store = target->get_store();
10250
10251 if (write_tag && write_tag->length()) {
10252 optag = string(write_tag->c_str(), write_tag->length());
10253 } else {
10254 if (optag.empty()) {
10255 append_rand_alpha(store->ctx(), optag, optag, 32);
10256 }
10257 }
10258
10259 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10260 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10261 });
10262
10263 if (r < 0) {
10264 return r;
10265 }
10266 prepared = true;
10267
10268 return 0;
10269 }
10270
10271 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10272 uint64_t size, uint64_t accounted_size,
10273 ceph::real_time& ut, const string& etag,
10274 const string& content_type,
10275 bufferlist *acl_bl,
10276 RGWObjCategory category,
10277 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10278 {
10279 if (blind) {
10280 return 0;
10281 }
10282 RGWRados *store = target->get_store();
10283 BucketShard *bs;
10284
10285 int ret = get_bucket_shard(&bs);
10286 if (ret < 0) {
10287 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10288 return ret;
10289 }
10290
10291 rgw_bucket_dir_entry ent;
10292 obj.key.get_index_key(&ent.key);
10293 ent.meta.size = size;
10294 ent.meta.accounted_size = accounted_size;
10295 ent.meta.mtime = ut;
10296 ent.meta.etag = etag;
10297 if (user_data)
10298 ent.meta.user_data = *user_data;
10299
10300 ACLOwner owner;
10301 if (acl_bl && acl_bl->length()) {
10302 int ret = store->decode_policy(*acl_bl, &owner);
10303 if (ret < 0) {
10304 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10305 }
10306 }
10307 ent.meta.owner = owner.get_id().to_str();
10308 ent.meta.owner_display_name = owner.get_display_name();
10309 ent.meta.content_type = content_type;
10310
10311 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
10312
10313 if (target->bucket_info.datasync_flag_enabled()) {
10314 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10315 if (r < 0) {
10316 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10317 }
10318 }
10319
10320 return ret;
10321 }
10322
10323 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10324 real_time& removed_mtime,
10325 list<rgw_obj_index_key> *remove_objs)
10326 {
10327 if (blind) {
10328 return 0;
10329 }
10330 RGWRados *store = target->get_store();
10331 BucketShard *bs;
10332
10333 int ret = get_bucket_shard(&bs);
10334 if (ret < 0) {
10335 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10336 return ret;
10337 }
10338
10339 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
10340
10341 if (target->bucket_info.datasync_flag_enabled()) {
10342 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10343 if (r < 0) {
10344 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10345 }
10346 }
10347
10348 return ret;
10349 }
10350
10351
10352 int RGWRados::Bucket::UpdateIndex::cancel()
10353 {
10354 if (blind) {
10355 return 0;
10356 }
10357 RGWRados *store = target->get_store();
10358 BucketShard *bs;
10359
10360 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10361 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10362 });
10363
10364 /*
10365 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10366 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10367 * have no way to tell that they're all caught up
10368 */
10369 if (target->bucket_info.datasync_flag_enabled()) {
10370 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10371 if (r < 0) {
10372 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10373 }
10374 }
10375
10376 return ret;
10377 }
10378
10379 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10380 {
10381 RGWRados *store = source->get_store();
10382 CephContext *cct = store->ctx();
10383
10384 rgw_raw_obj read_obj;
10385 uint64_t read_ofs = ofs;
10386 uint64_t len, read_len;
10387 bool reading_from_head = true;
10388 ObjectReadOperation op;
10389
10390 bool merge_bl = false;
10391 bufferlist *pbl = &bl;
10392 bufferlist read_bl;
10393 uint64_t max_chunk_size;
10394
10395 RGWObjState *astate;
10396 int r = source->get_state(&astate, true);
10397 if (r < 0)
10398 return r;
10399
10400 if (end < 0)
10401 len = 0;
10402 else
10403 len = end - ofs + 1;
10404
10405 if (astate->has_manifest && astate->manifest.has_tail()) {
10406 /* now get the relevant object part */
10407 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10408
10409 uint64_t stripe_ofs = iter.get_stripe_ofs();
10410 read_obj = iter.get_location().get_raw_obj(store);
10411 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10412 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10413 reading_from_head = (read_obj == state.head_obj);
10414 } else {
10415 read_obj = state.head_obj;
10416 }
10417
10418 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10419 if (r < 0) {
10420 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10421 return r;
10422 }
10423
10424 if (len > max_chunk_size)
10425 len = max_chunk_size;
10426
10427
10428 state.io_ctx.locator_set_key(read_obj.loc);
10429
10430 read_len = len;
10431
10432 if (reading_from_head) {
10433 /* only when reading from the head object do we need to do the atomic test */
10434 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10435 if (r < 0)
10436 return r;
10437
10438 if (astate && astate->prefetch_data) {
10439 if (!ofs && astate->data.length() >= len) {
10440 bl = astate->data;
10441 return bl.length();
10442 }
10443
10444 if (ofs < astate->data.length()) {
10445 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10446 astate->data.copy(ofs, copy_len, bl);
10447 read_len -= copy_len;
10448 read_ofs += copy_len;
10449 if (!read_len)
10450 return bl.length();
10451
10452 merge_bl = true;
10453 pbl = &read_bl;
10454 }
10455 }
10456 }
10457
10458 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10459 op.read(read_ofs, read_len, pbl, NULL);
10460
10461 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10462 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10463
10464 if (r < 0) {
10465 return r;
10466 }
10467
10468 if (merge_bl) {
10469 bl.append(read_bl);
10470 }
10471
10472 return bl.length();
10473 }
10474
10475 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10476 {
10477 if (!has_ref) {
10478 int r = store->get_raw_obj_ref(obj, &ref);
10479 if (r < 0) {
10480 return r;
10481 }
10482 has_ref = true;
10483 }
10484 *pref = &ref;
10485 return 0;
10486
10487 }
10488
10489 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10490 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10491 bufferlist& bl, off_t ofs, off_t end,
10492 map<string, bufferlist> *attrs,
10493 rgw_cache_entry_info *cache_info,
10494 boost::optional<obj_version>)
10495 {
10496 uint64_t len;
10497 ObjectReadOperation op;
10498
10499 if (end < 0)
10500 len = 0;
10501 else
10502 len = end - ofs + 1;
10503
10504 if (objv_tracker) {
10505 objv_tracker->prepare_op_for_read(&op);
10506 }
10507
10508 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10509 op.read(ofs, len, &bl, NULL);
10510
10511 if (attrs) {
10512 op.getxattrs(attrs, NULL);
10513 }
10514
10515 rgw_rados_ref *ref;
10516 int r = read_state.get_ref(this, obj, &ref);
10517 if (r < 0) {
10518 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10519 return r;
10520 }
10521 r = ref->ioctx.operate(ref->oid, &op, NULL);
10522 if (r < 0) {
10523 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10524 return r;
10525 }
10526 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10527
10528 uint64_t op_ver = ref->ioctx.get_last_version();
10529
10530 if (read_state.last_ver > 0 &&
10531 read_state.last_ver != op_ver) {
10532 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10533 return -ECANCELED;
10534 }
10535
10536 read_state.last_ver = op_ver;
10537
10538 return bl.length();
10539 }
10540
10541 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl,
10542 RGWObjVersionTracker *objv_tracker,
10543 boost::optional<obj_version> refresh_version)
10544 {
10545 RGWRados *store = source->get_store();
10546 rgw_raw_obj& obj = source->get_obj();
10547
10548 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl,
10549 ofs, end, read_params.attrs,
10550 read_params.cache_info, refresh_version);
10551 }
10552
10553 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10554 {
10555 RGWRados *store = source->get_store();
10556 rgw_raw_obj& obj = source->get_obj();
10557
10558 return store->system_obj_get_attr(obj, name, dest);
10559 }
10560
struct get_obj_data;

/* Bookkeeping record for one asynchronous read: the owning operation plus
 * the offset and length of the range being read. */
struct get_obj_aio_data {
  get_obj_data *op_data;  /* operation this read belongs to */
  off_t ofs;              /* start offset of the read */
  off_t len;              /* length of the read */
};
10568
10569 struct get_obj_io {
10570 off_t len;
10571 bufferlist bl;
10572 };
10573
10574 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10575
10576 struct get_obj_data : public RefCountedObject {
10577 CephContext *cct;
10578 RGWRados *rados;
10579 RGWObjectCtx *ctx;
10580 IoCtx io_ctx;
10581 map<off_t, get_obj_io> io_map;
10582 map<off_t, librados::AioCompletion *> completion_map;
10583 uint64_t total_read;
10584 Mutex lock;
10585 Mutex data_lock;
10586 list<get_obj_aio_data> aio_data;
10587 RGWGetDataCB *client_cb;
10588 std::atomic<bool> cancelled = { false };
10589 std::atomic<int64_t> err_code = { 0 };
10590 Throttle throttle;
10591 list<bufferlist> read_list;
10592
10593 explicit get_obj_data(CephContext *_cct)
10594 : cct(_cct),
10595 rados(NULL), ctx(NULL),
10596 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10597 client_cb(NULL),
10598 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10599 ~get_obj_data() override { }
10600 void set_cancelled(int r) {
10601 cancelled = true;
10602 err_code = r;
10603 }
10604
10605 bool is_cancelled() {
10606 return cancelled;
10607 }
10608
10609 int get_err_code() {
10610 return err_code;
10611 }
10612
10613 int wait_next_io(bool *done) {
10614 lock.Lock();
10615 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10616 if (iter == completion_map.end()) {
10617 *done = true;
10618 lock.Unlock();
10619 return 0;
10620 }
10621 off_t cur_ofs = iter->first;
10622 librados::AioCompletion *c = iter->second;
10623 lock.Unlock();
10624
10625 c->wait_for_safe_and_cb();
10626 int r = c->get_return_value();
10627
10628 lock.Lock();
10629 completion_map.erase(cur_ofs);
10630
10631 if (completion_map.empty()) {
10632 *done = true;
10633 }
10634 lock.Unlock();
10635
10636 c->release();
10637
10638 return r;
10639 }
10640
10641 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10642 Mutex::Locker l(lock);
10643
10644 const auto& io_iter = io_map.insert(
10645 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10646
10647 assert(io_iter.second); // assert new insertion
10648
10649 get_obj_io& io = (io_iter.first)->second;
10650 *pbl = &io.bl;
10651
10652 struct get_obj_aio_data aio;
10653 aio.ofs = ofs;
10654 aio.len = len;
10655 aio.op_data = this;
10656
10657 aio_data.push_back(aio);
10658
10659 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10660
10661 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10662 completion_map[ofs] = c;
10663
10664 *pc = c;
10665
10666 /* we have a reference per IO, plus one reference for the calling function.
10667 * reference is dropped for each callback, plus when we're done iterating
10668 * over the parts */
10669 get();
10670 }
10671
10672 void cancel_io(off_t ofs) {
10673 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10674 lock.Lock();
10675 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10676 if (iter != completion_map.end()) {
10677 AioCompletion *c = iter->second;
10678 c->release();
10679 completion_map.erase(ofs);
10680 io_map.erase(ofs);
10681 }
10682 lock.Unlock();
10683
10684 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10685 * need IoCtx to live, as io callback may still be called
10686 */
10687 }
10688
10689 void cancel_all_io() {
10690 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10691 Mutex::Locker l(lock);
10692 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10693 iter != completion_map.end(); ++iter) {
10694 librados::AioCompletion *c = iter->second;
10695 c->release();
10696 }
10697 }
10698
10699 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10700 Mutex::Locker l(lock);
10701
10702 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10703
10704 if (liter == io_map.end() ||
10705 liter->first != ofs) {
10706 return 0;
10707 }
10708
10709 map<off_t, librados::AioCompletion *>::iterator aiter;
10710 aiter = completion_map.find(ofs);
10711 if (aiter == completion_map.end()) {
10712 /* completion map does not hold this io, it was cancelled */
10713 return 0;
10714 }
10715
10716 AioCompletion *completion = aiter->second;
10717 int r = completion->get_return_value();
10718 if (r < 0)
10719 return r;
10720
10721 for (; aiter != completion_map.end(); ++aiter) {
10722 completion = aiter->second;
10723 if (!completion->is_safe()) {
10724 /* reached a request that is not yet complete, stop */
10725 break;
10726 }
10727
10728 r = completion->get_return_value();
10729 if (r < 0) {
10730 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10731 return r;
10732 }
10733
10734 total_read += r;
10735
10736 map<off_t, get_obj_io>::iterator old_liter = liter++;
10737 bl_list.push_back(old_liter->second.bl);
10738 io_map.erase(old_liter);
10739 }
10740
10741 return 0;
10742 }
10743 };
10744
10745 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10746 {
10747 struct get_obj_data *d = (struct get_obj_data *)arg;
10748
10749 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10750 }
10751
10752 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10753 {
10754 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10755 struct get_obj_data *d = aio_data->op_data;
10756
10757 d->rados->get_obj_aio_completion_cb(cb, arg);
10758 }
10759
10760
10761 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10762 {
10763 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10764 struct get_obj_data *d = aio_data->op_data;
10765 off_t ofs = aio_data->ofs;
10766 off_t len = aio_data->len;
10767
10768 list<bufferlist> bl_list;
10769 list<bufferlist>::iterator iter;
10770 int r;
10771
10772 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10773 d->throttle.put(len);
10774
10775 r = rados_aio_get_return_value(c);
10776 if (r < 0) {
10777 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10778 d->set_cancelled(r);
10779 goto done;
10780 }
10781
10782 if (d->is_cancelled()) {
10783 goto done;
10784 }
10785
10786 d->data_lock.Lock();
10787
10788 r = d->get_complete_ios(ofs, bl_list);
10789 if (r < 0) {
10790 goto done_unlock;
10791 }
10792
10793 d->read_list.splice(d->read_list.end(), bl_list);
10794
10795 done_unlock:
10796 d->data_lock.Unlock();
10797 done:
10798 d->put();
10799 return;
10800 }
10801
10802 int RGWRados::flush_read_list(struct get_obj_data *d)
10803 {
10804 d->data_lock.Lock();
10805 list<bufferlist> l;
10806 l.swap(d->read_list);
10807 d->get();
10808 d->read_list.clear();
10809
10810 d->data_lock.Unlock();
10811
10812 int r = 0;
10813
10814 list<bufferlist>::iterator iter;
10815 for (iter = l.begin(); iter != l.end(); ++iter) {
10816 bufferlist& bl = *iter;
10817 r = d->client_cb->handle_data(bl, 0, bl.length());
10818 if (r < 0) {
10819 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10820 break;
10821 }
10822 }
10823
10824 d->data_lock.Lock();
10825 d->put();
10826 if (r < 0) {
10827 d->set_cancelled(r);
10828 }
10829 d->data_lock.Unlock();
10830 return r;
10831 }
10832
10833 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10834 const RGWBucketInfo& bucket_info,
10835 const rgw_obj& obj,
10836 const rgw_raw_obj& read_obj,
10837 off_t obj_ofs,
10838 off_t read_ofs, off_t len,
10839 bool is_head_obj, void *arg)
10840 {
10841 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10842 ObjectReadOperation op;
10843 struct get_obj_data *d = (struct get_obj_data *)arg;
10844 string oid, key;
10845 bufferlist *pbl;
10846 AioCompletion *c;
10847
10848 int r;
10849
10850 if (is_head_obj) {
10851 /* only when reading from the head object do we need to do the atomic test */
10852 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10853 if (r < 0)
10854 return r;
10855
10856 if (astate &&
10857 obj_ofs < astate->data.length()) {
10858 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10859
10860 d->data_lock.Lock();
10861 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10862 d->data_lock.Unlock();
10863 if (r < 0)
10864 return r;
10865
10866 d->lock.Lock();
10867 d->total_read += chunk_len;
10868 d->lock.Unlock();
10869
10870 len -= chunk_len;
10871 read_ofs += chunk_len;
10872 obj_ofs += chunk_len;
10873 if (!len)
10874 return 0;
10875 }
10876 }
10877
10878 d->throttle.get(len);
10879 if (d->is_cancelled()) {
10880 return d->get_err_code();
10881 }
10882
10883 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10884 * cleaning up
10885 */
10886 d->add_io(obj_ofs, len, &pbl, &c);
10887
10888 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10889 op.read(read_ofs, len, pbl, NULL);
10890
10891 librados::IoCtx io_ctx(d->io_ctx);
10892 io_ctx.locator_set_key(read_obj.loc);
10893
10894 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10895 if (r < 0) {
10896 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10897 goto done_err;
10898 }
10899
10900 // Flush data to client if there is any
10901 r = flush_read_list(d);
10902 if (r < 0)
10903 return r;
10904
10905 return 0;
10906
10907 done_err:
10908 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10909 d->set_cancelled(r);
10910 d->cancel_io(obj_ofs);
10911
10912 return r;
10913 }
10914
10915 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10916 {
10917 RGWRados *store = source->get_store();
10918 CephContext *cct = store->ctx();
10919
10920 struct get_obj_data *data = new get_obj_data(cct);
10921 bool done = false;
10922
10923 RGWObjectCtx& obj_ctx = source->get_ctx();
10924
10925 data->rados = store;
10926 data->io_ctx.dup(state.io_ctx);
10927 data->client_cb = cb;
10928
10929 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10930 if (r < 0) {
10931 data->cancel_all_io();
10932 goto done;
10933 }
10934
10935 while (!done) {
10936 r = data->wait_next_io(&done);
10937 if (r < 0) {
10938 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10939 data->cancel_all_io();
10940 break;
10941 }
10942 r = store->flush_read_list(data);
10943 if (r < 0) {
10944 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10945 data->cancel_all_io();
10946 break;
10947 }
10948 }
10949
10950 done:
10951 data->put();
10952 return r;
10953 }
10954
10955 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10956 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10957 off_t ofs, off_t end,
10958 uint64_t max_chunk_size,
10959 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10960 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10961 RGWObjState *, void *),
10962 void *arg)
10963 {
10964 rgw_raw_obj head_obj;
10965 rgw_raw_obj read_obj;
10966 uint64_t read_ofs = ofs;
10967 uint64_t len;
10968 bool reading_from_head = true;
10969 RGWObjState *astate = NULL;
10970
10971 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10972
10973 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10974 if (r < 0) {
10975 return r;
10976 }
10977
10978 if (end < 0)
10979 len = 0;
10980 else
10981 len = end - ofs + 1;
10982
10983 if (astate->has_manifest) {
10984 /* now get the relevant object stripe */
10985 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10986
10987 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10988
10989 for (; iter != obj_end && ofs <= end; ++iter) {
10990 off_t stripe_ofs = iter.get_stripe_ofs();
10991 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10992
10993 while (ofs < next_stripe_ofs && ofs <= end) {
10994 read_obj = iter.get_location().get_raw_obj(this);
10995 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10996 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10997
10998 if (read_len > max_chunk_size) {
10999 read_len = max_chunk_size;
11000 }
11001
11002 reading_from_head = (read_obj == head_obj);
11003 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
11004 if (r < 0) {
11005 return r;
11006 }
11007
11008 len -= read_len;
11009 ofs += read_len;
11010 }
11011 }
11012 } else {
11013 while (ofs <= end) {
11014 read_obj = head_obj;
11015 uint64_t read_len = min(len, max_chunk_size);
11016
11017 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
11018 if (r < 0) {
11019 return r;
11020 }
11021
11022 len -= read_len;
11023 ofs += read_len;
11024 }
11025 }
11026
11027 return 0;
11028 }
11029
11030 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
11031 {
11032 rgw_rados_ref ref;
11033 int r = get_obj_head_ref(bucket_info, obj, &ref);
11034 if (r < 0) {
11035 return r;
11036 }
11037
11038 return ref.ioctx.operate(ref.oid, op);
11039 }
11040
11041 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
11042 {
11043 rgw_rados_ref ref;
11044 int r = get_obj_head_ref(bucket_info, obj, &ref);
11045 if (r < 0) {
11046 return r;
11047 }
11048
11049 bufferlist outbl;
11050
11051 return ref.ioctx.operate(ref.oid, op, &outbl);
11052 }
11053
11054 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
11055 {
11056 ObjectWriteOperation op;
11057
11058 assert(olh_obj.key.instance.empty());
11059
11060 bool has_tag = (state.exists && has_olh_tag(state.attrset));
11061
11062 if (!state.exists) {
11063 op.create(true);
11064 } else {
11065 op.assert_exists();
11066 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11067 op.mtime2(&mtime_ts);
11068 }
11069
11070 /*
11071 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
11072 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
11073 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
11074 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
11075 * log will reflect that.
11076 *
11077 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
11078 * is used for object data instance, olh_tag for olh instance.
11079 */
11080 if (has_tag) {
11081 /* guard against racing writes */
11082 bucket_index_guard_olh_op(state, op);
11083 }
11084
11085 if (!has_tag) {
11086 /* obj tag */
11087 string obj_tag;
11088 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
11089 if (ret < 0) {
11090 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11091 return ret;
11092 }
11093 bufferlist bl;
11094 bl.append(obj_tag.c_str(), obj_tag.size());
11095 op.setxattr(RGW_ATTR_ID_TAG, bl);
11096
11097 state.attrset[RGW_ATTR_ID_TAG] = bl;
11098 state.obj_tag = bl;
11099
11100 /* olh tag */
11101 string olh_tag;
11102 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
11103 if (ret < 0) {
11104 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11105 return ret;
11106 }
11107 bufferlist olh_bl;
11108 olh_bl.append(olh_tag.c_str(), olh_tag.size());
11109 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
11110
11111 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
11112 state.olh_tag = olh_bl;
11113 state.is_olh = true;
11114
11115 bufferlist verbl;
11116 op.setxattr(RGW_ATTR_OLH_VER, verbl);
11117 }
11118
11119 bufferlist bl;
11120 RGWOLHPendingInfo pending_info;
11121 pending_info.time = real_clock::now();
11122 ::encode(pending_info, bl);
11123
11124 #define OLH_PENDING_TAG_LEN 32
11125 /* tag will start with current time epoch, this so that entries are sorted by time */
11126 char buf[32];
11127 utime_t ut(pending_info.time);
11128 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
11129 *op_tag = buf;
11130
11131 string s;
11132 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
11133 if (ret < 0) {
11134 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
11135 return ret;
11136 }
11137 op_tag->append(s);
11138
11139 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11140 attr_name.append(*op_tag);
11141
11142 op.setxattr(attr_name.c_str(), bl);
11143
11144 ret = obj_operate(bucket_info, olh_obj, &op);
11145 if (ret < 0) {
11146 return ret;
11147 }
11148
11149 state.exists = true;
11150 state.attrset[attr_name] = bl;
11151
11152 return 0;
11153 }
11154
11155 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
11156 {
11157 int ret;
11158
11159 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
11160 if (ret == -EEXIST) {
11161 ret = -ECANCELED;
11162 }
11163
11164 return ret;
11165 }
11166
11167 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
11168 {
11169 rgw_obj obj;
11170 const rgw_obj *pobj = &obj_instance;
11171 int r;
11172
11173 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
11174 r = bs->init(pobj->bucket, *pobj);
11175 if (r < 0) {
11176 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
11177 return r;
11178 }
11179 r = call(bs);
11180 if (r != -ERR_BUSY_RESHARDING) {
11181 break;
11182 }
11183 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
11184 string new_bucket_id;
11185 r = block_while_resharding(bs, &new_bucket_id);
11186 if (r == -ERR_BUSY_RESHARDING) {
11187 continue;
11188 }
11189 if (r < 0) {
11190 return r;
11191 }
11192 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
11193 i = 0; /* resharding is finished, make sure we can retry */
11194
11195 obj = *pobj;
11196 obj.bucket.update_bucket_id(new_bucket_id);
11197 pobj = &obj;
11198 }
11199
11200 if (r < 0) {
11201 return r;
11202 }
11203
11204 return 0;
11205 }
11206
11207 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
11208 {
11209 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
11210
11211 return waiter->block_while_resharding(bs, new_bucket_id);
11212 }
11213
11214 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
11215 bool delete_marker,
11216 const string& op_tag,
11217 struct rgw_bucket_dir_entry_meta *meta,
11218 uint64_t olh_epoch,
11219 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
11220 {
11221 rgw_rados_ref ref;
11222 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11223 if (r < 0) {
11224 return r;
11225 }
11226
11227 rgw_zone_set zones_trace;
11228 if (_zones_trace) {
11229 zones_trace = *_zones_trace;
11230 }
11231 zones_trace.insert(get_zone().id);
11232
11233 BucketShard bs(this);
11234
11235 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11236 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11237 librados::ObjectWriteOperation op;
11238 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11239 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
11240 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
11241 unmod_since, high_precision_time,
11242 get_zone().log_data, zones_trace);
11243 });
11244 if (r < 0) {
11245 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11246 return r;
11247 }
11248
11249 return 0;
11250 }
11251
11252 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
11253 {
11254 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
11255 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
11256 }
11257
11258 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
11259 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
11260 {
11261 rgw_rados_ref ref;
11262 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11263 if (r < 0) {
11264 return r;
11265 }
11266
11267 rgw_zone_set zones_trace;
11268 if (_zones_trace) {
11269 zones_trace = *_zones_trace;
11270 }
11271 zones_trace.insert(get_zone().id);
11272
11273 BucketShard bs(this);
11274
11275 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11276 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11277 librados::ObjectWriteOperation op;
11278 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11279 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11280 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
11281 });
11282 if (r < 0) {
11283 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11284 return r;
11285 }
11286
11287 return 0;
11288 }
11289
11290 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11291 const rgw_obj& obj_instance, uint64_t ver_marker,
11292 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11293 bool *is_truncated)
11294 {
11295 rgw_rados_ref ref;
11296 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11297 if (r < 0) {
11298 return r;
11299 }
11300
11301 BucketShard bs(this);
11302 int ret = bs.init(obj_instance.bucket, obj_instance);
11303 if (ret < 0) {
11304 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11305 return ret;
11306 }
11307
11308 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11309
11310 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11311
11312 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11313 ObjectReadOperation op;
11314 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11315 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11316 key, ver_marker, olh_tag, log, is_truncated);
11317 });
11318 if (ret < 0) {
11319 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
11320 return ret;
11321 }
11322
11323 return 0;
11324 }
11325
11326 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11327 {
11328 rgw_rados_ref ref;
11329 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11330 if (r < 0) {
11331 return r;
11332 }
11333
11334 BucketShard bs(this);
11335 int ret = bs.init(obj_instance.bucket, obj_instance);
11336 if (ret < 0) {
11337 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11338 return ret;
11339 }
11340
11341 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11342
11343 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11344
11345 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11346 ObjectWriteOperation op;
11347 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11348 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11349 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
11350 });
11351 if (ret < 0) {
11352 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
11353 return ret;
11354 }
11355
11356 return 0;
11357 }
11358
11359 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11360 {
11361 rgw_rados_ref ref;
11362 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11363 if (r < 0) {
11364 return r;
11365 }
11366
11367 BucketShard bs(this);
11368
11369 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11370
11371 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11372
11373 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11374 ObjectWriteOperation op;
11375 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11376 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11377 });
11378 if (ret < 0) {
11379 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11380 return ret;
11381 }
11382
11383 return 0;
11384 }
11385
11386 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11387 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
11388 uint64_t *plast_ver, rgw_zone_set* zones_trace)
11389 {
11390 if (log.empty()) {
11391 return 0;
11392 }
11393
11394 librados::ObjectWriteOperation op;
11395
11396 uint64_t last_ver = log.rbegin()->first;
11397 *plast_ver = last_ver;
11398
11399 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11400
11401 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11402 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11403
11404 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11405 op.mtime2(&mtime_ts);
11406
11407 bool need_to_link = false;
11408 cls_rgw_obj_key key;
11409 bool delete_marker = false;
11410 list<cls_rgw_obj_key> remove_instances;
11411 bool need_to_remove = false;
11412
11413 for (iter = log.begin(); iter != log.end(); ++iter) {
11414 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11415 for (; viter != iter->second.end(); ++viter) {
11416 rgw_bucket_olh_log_entry& entry = *viter;
11417
11418 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11419 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11420 << (entry.delete_marker ? "(delete)" : "") << dendl;
11421 switch (entry.op) {
11422 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11423 remove_instances.push_back(entry.key);
11424 break;
11425 case CLS_RGW_OLH_OP_LINK_OLH:
11426 need_to_link = true;
11427 need_to_remove = false;
11428 key = entry.key;
11429 delete_marker = entry.delete_marker;
11430 break;
11431 case CLS_RGW_OLH_OP_UNLINK_OLH:
11432 need_to_remove = true;
11433 need_to_link = false;
11434 break;
11435 default:
11436 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11437 return -EIO;
11438 }
11439 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11440 attr_name.append(entry.op_tag);
11441 op.rmxattr(attr_name.c_str());
11442 }
11443 }
11444
11445 rgw_rados_ref ref;
11446 int r = get_obj_head_ref(bucket_info, obj, &ref);
11447 if (r < 0) {
11448 return r;
11449 }
11450
11451 const rgw_bucket& bucket = obj.bucket;
11452
11453 if (need_to_link) {
11454 rgw_obj target(bucket, key);
11455 RGWOLHInfo info;
11456 info.target = target;
11457 info.removed = delete_marker;
11458 bufferlist bl;
11459 ::encode(info, bl);
11460 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11461 }
11462
11463 /* first remove object instances */
11464 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11465 liter != remove_instances.end(); ++liter) {
11466 cls_rgw_obj_key& key = *liter;
11467 rgw_obj obj_instance(bucket, key);
11468 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11469 if (ret < 0 && ret != -ENOENT) {
11470 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11471 return ret;
11472 }
11473 }
11474
11475 /* update olh object */
11476 r = ref.ioctx.operate(ref.oid, &op);
11477 if (r == -ECANCELED) {
11478 r = 0;
11479 }
11480 if (r < 0) {
11481 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11482 return r;
11483 }
11484
11485 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11486 if (r < 0) {
11487 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11488 return r;
11489 }
11490
11491 if (need_to_remove) {
11492 ObjectWriteOperation rm_op;
11493
11494 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11495 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11496 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11497 rm_op.remove();
11498
11499 r = ref.ioctx.operate(ref.oid, &rm_op);
11500 if (r == -ECANCELED) {
11501 return 0; /* someone else won this race */
11502 } else {
11503 /*
11504 * only clear if was successful, otherwise we might clobber pending operations on this object
11505 */
11506 r = bucket_index_clear_olh(bucket_info, state, obj);
11507 if (r < 0) {
11508 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11509 return r;
11510 }
11511 }
11512 }
11513
11514 return 0;
11515 }
11516
11517 /*
11518 * read olh log and apply it
11519 */
11520 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11521 {
11522 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11523 bool is_truncated;
11524 uint64_t ver_marker = 0;
11525
11526 do {
11527 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11528 if (ret < 0) {
11529 return ret;
11530 }
11531 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11532 if (ret < 0) {
11533 return ret;
11534 }
11535 } while (is_truncated);
11536
11537 return 0;
11538 }
11539
11540 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11541 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11542 {
11543 string op_tag;
11544
11545 rgw_obj olh_obj = target_obj;
11546 olh_obj.key.instance.clear();
11547
11548 RGWObjState *state = NULL;
11549
11550 int ret = 0;
11551 int i;
11552
11553 #define MAX_ECANCELED_RETRY 100
11554 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11555 if (ret == -ECANCELED) {
11556 obj_ctx.obj.invalidate(olh_obj);
11557 }
11558
11559 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11560 if (ret < 0) {
11561 return ret;
11562 }
11563
11564 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11565 if (ret < 0) {
11566 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11567 if (ret == -ECANCELED) {
11568 continue;
11569 }
11570 return ret;
11571 }
11572 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11573 if (ret < 0) {
11574 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11575 if (ret == -ECANCELED) {
11576 continue;
11577 }
11578 return ret;
11579 }
11580 break;
11581 }
11582
11583 if (i == MAX_ECANCELED_RETRY) {
11584 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11585 return -EIO;
11586 }
11587
11588 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11589 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11590 ret = 0;
11591 }
11592 if (ret < 0) {
11593 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11594 return ret;
11595 }
11596
11597 return 0;
11598 }
11599
11600 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11601 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11602 {
11603 string op_tag;
11604
11605 rgw_obj olh_obj = target_obj;
11606 olh_obj.key.instance.clear();
11607
11608 RGWObjState *state = NULL;
11609
11610 int ret = 0;
11611 int i;
11612
11613 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11614 if (ret == -ECANCELED) {
11615 obj_ctx.obj.invalidate(olh_obj);
11616 }
11617
11618 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11619 if (ret < 0)
11620 return ret;
11621
11622 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11623 if (ret < 0) {
11624 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11625 if (ret == -ECANCELED) {
11626 continue;
11627 }
11628 return ret;
11629 }
11630
11631 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11632
11633 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11634 if (ret < 0) {
11635 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11636 if (ret == -ECANCELED) {
11637 continue;
11638 }
11639 return ret;
11640 }
11641 break;
11642 }
11643
11644 if (i == MAX_ECANCELED_RETRY) {
11645 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11646 return -EIO;
11647 }
11648
11649 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11650 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11651 return 0;
11652 }
11653 if (ret < 0) {
11654 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11655 return ret;
11656 }
11657
11658 return 0;
11659 }
11660
11661 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11662 {
11663 #define OBJ_INSTANCE_LEN 32
11664 char buf[OBJ_INSTANCE_LEN + 1];
11665
11666 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11667 no underscore for instance name due to the way we encode the raw keys */
11668
11669 target_obj->key.set_instance(buf);
11670 }
11671
11672 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11673 map<string, bufferlist> *attrset)
11674 {
11675 attrset->clear();
11676 map<string, bufferlist>::iterator iter;
11677 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11678 iter != unfiltered_attrset.end(); ++iter) {
11679 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11680 break;
11681 (*attrset)[iter->first] = iter->second;
11682 }
11683 }
11684
11685 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11686 {
11687 map<string, bufferlist> unfiltered_attrset;
11688
11689 ObjectReadOperation op;
11690 op.getxattrs(&unfiltered_attrset, NULL);
11691
11692 bufferlist outbl;
11693 int r = obj_operate(bucket_info, obj, &op);
11694
11695 if (r < 0) {
11696 return r;
11697 }
11698 map<string, bufferlist> attrset;
11699
11700 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11701
11702 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11703 if (iter == attrset.end()) { /* not an olh */
11704 return -EINVAL;
11705 }
11706
11707 try {
11708 bufferlist::iterator biter = iter->second.begin();
11709 ::decode(*olh, biter);
11710 } catch (buffer::error& err) {
11711 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11712 return -EIO;
11713 }
11714
11715 return 0;
11716 }
11717
11718 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11719 map<string, bufferlist> *rm_pending_entries)
11720 {
11721 map<string, bufferlist>::iterator iter = pending_entries.begin();
11722
11723 real_time now = real_clock::now();
11724
11725 while (iter != pending_entries.end()) {
11726 bufferlist::iterator biter = iter->second.begin();
11727 RGWOLHPendingInfo pending_info;
11728 try {
11729 ::decode(pending_info, biter);
11730 } catch (buffer::error& err) {
11731 /* skipping bad entry, we could remove it but it might hide a bug */
11732 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11733 ++iter;
11734 continue;
11735 }
11736
11737 map<string, bufferlist>::iterator cur_iter = iter;
11738 ++iter;
11739 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11740 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11741 pending_entries.erase(cur_iter);
11742 } else {
11743 /* entries names are sorted by time (rounded to a second) */
11744 break;
11745 }
11746 }
11747 }
11748
11749 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11750 {
11751 ObjectWriteOperation op;
11752
11753 bucket_index_guard_olh_op(state, op);
11754
11755 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11756 op.rmxattr(iter->first.c_str());
11757 }
11758
11759 rgw_rados_ref ref;
11760 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11761 if (r < 0) {
11762 return r;
11763 }
11764
11765 /* update olh object */
11766 r = ref.ioctx.operate(ref.oid, &op);
11767 if (r == -ENOENT || r == -ECANCELED) {
11768 /* raced with some other change, shouldn't sweat about it */
11769 r = 0;
11770 }
11771 if (r < 0) {
11772 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11773 return r;
11774 }
11775
11776 return 0;
11777 }
11778
11779 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11780 {
11781 map<string, bufferlist> pending_entries;
11782 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11783
11784 map<string, bufferlist> rm_pending_entries;
11785 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11786
11787 if (!rm_pending_entries.empty()) {
11788 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11789 if (ret < 0) {
11790 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11791 return ret;
11792 }
11793 }
11794 if (!pending_entries.empty()) {
11795 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11796
11797 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11798 if (ret < 0) {
11799 return ret;
11800 }
11801 }
11802
11803 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11804 assert(iter != state->attrset.end());
11805 RGWOLHInfo olh;
11806 try {
11807 bufferlist::iterator biter = iter->second.begin();
11808 ::decode(olh, biter);
11809 } catch (buffer::error& err) {
11810 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11811 return -EIO;
11812 }
11813
11814 if (olh.removed) {
11815 return -ENOENT;
11816 }
11817
11818 *target = olh.target;
11819
11820 return 0;
11821 }
11822
11823 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11824 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11825 RGWObjVersionTracker *objv_tracker)
11826 {
11827 rgw_rados_ref ref;
11828 int r = get_raw_obj_ref(obj, &ref);
11829 if (r < 0) {
11830 return r;
11831 }
11832
11833 map<string, bufferlist> unfiltered_attrset;
11834 uint64_t size = 0;
11835 struct timespec mtime_ts;
11836
11837 ObjectReadOperation op;
11838 if (objv_tracker) {
11839 objv_tracker->prepare_op_for_read(&op);
11840 }
11841 if (attrs) {
11842 op.getxattrs(&unfiltered_attrset, NULL);
11843 }
11844 if (psize || pmtime) {
11845 op.stat2(&size, &mtime_ts, NULL);
11846 }
11847 if (first_chunk) {
11848 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11849 }
11850 bufferlist outbl;
11851 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11852
11853 if (epoch) {
11854 *epoch = ref.ioctx.get_last_version();
11855 }
11856
11857 if (r < 0)
11858 return r;
11859
11860 if (psize)
11861 *psize = size;
11862 if (pmtime)
11863 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11864 if (attrs) {
11865 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11866 }
11867
11868 return 0;
11869 }
11870
11871 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11872 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
11873 {
11874 map<string, rgw_bucket_dir_header> headers;
11875 map<int, string> bucket_instance_ids;
11876 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11877 if (r < 0) {
11878 return r;
11879 }
11880
11881 assert(headers.size() == bucket_instance_ids.size());
11882
11883 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11884 map<int, string>::iterator viter = bucket_instance_ids.begin();
11885 BucketIndexShardsManager ver_mgr;
11886 BucketIndexShardsManager master_ver_mgr;
11887 BucketIndexShardsManager marker_mgr;
11888 char buf[64];
11889 for(; iter != headers.end(); ++iter, ++viter) {
11890 accumulate_raw_stats(iter->second, stats);
11891 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11892 ver_mgr.add(viter->first, string(buf));
11893 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11894 master_ver_mgr.add(viter->first, string(buf));
11895 if (shard_id >= 0) {
11896 *max_marker = iter->second.max_marker;
11897 } else {
11898 marker_mgr.add(viter->first, iter->second.max_marker);
11899 }
11900 if (syncstopped != NULL)
11901 *syncstopped = iter->second.syncstopped;
11902 }
11903 ver_mgr.to_string(bucket_ver);
11904 master_ver_mgr.to_string(master_ver);
11905 if (shard_id < 0) {
11906 marker_mgr.to_string(max_marker);
11907 }
11908 return 0;
11909 }
11910
11911 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11912 map<int, string>& markers)
11913 {
11914 map<string, rgw_bucket_dir_header> headers;
11915 map<int, string> bucket_instance_ids;
11916 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11917 if (r < 0)
11918 return r;
11919
11920 assert(headers.size() == bucket_instance_ids.size());
11921
11922 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11923 map<int, string>::iterator viter = bucket_instance_ids.begin();
11924
11925 for(; iter != headers.end(); ++iter, ++viter) {
11926 if (shard_id >= 0) {
11927 markers[shard_id] = iter->second.max_marker;
11928 } else {
11929 markers[viter->first] = iter->second.max_marker;
11930 }
11931 }
11932 return 0;
11933 }
11934
11935 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11936 RGWGetBucketStats_CB *cb;
11937 uint32_t pendings;
11938 map<RGWObjCategory, RGWStorageStats> stats;
11939 int ret_code;
11940 bool should_cb;
11941 Mutex lock;
11942
11943 public:
11944 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11945 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11946 lock("RGWGetBucketStatsContext") {}
11947
11948 void handle_response(int r, rgw_bucket_dir_header& header) override {
11949 Mutex::Locker l(lock);
11950 if (should_cb) {
11951 if ( r >= 0) {
11952 accumulate_raw_stats(header, stats);
11953 } else {
11954 ret_code = r;
11955 }
11956
11957 // Are we all done?
11958 if (--pendings == 0) {
11959 if (!ret_code) {
11960 cb->set_response(&stats);
11961 }
11962 cb->handle_response(ret_code);
11963 cb->put();
11964 }
11965 }
11966 }
11967
11968 void unset_cb() {
11969 Mutex::Locker l(lock);
11970 should_cb = false;
11971 }
11972 };
11973
11974 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11975 {
11976 int num_aio = 0;
11977 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11978 assert(get_ctx);
11979 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11980 if (r < 0) {
11981 ctx->put();
11982 if (num_aio) {
11983 get_ctx->unset_cb();
11984 }
11985 }
11986 get_ctx->put();
11987 return r;
11988 }
11989
11990 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11991 RGWGetUserStats_CB *cb;
11992
11993 public:
11994 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11995 : cb(cb) {}
11996
11997 void handle_response(int r, cls_user_header& header) override {
11998 const cls_user_stats& hs = header.stats;
11999 if (r >= 0) {
12000 RGWStorageStats stats;
12001
12002 stats.size = hs.total_bytes;
12003 stats.size_rounded = hs.total_bytes_rounded;
12004 stats.num_objects = hs.total_entries;
12005
12006 cb->set_response(stats);
12007 }
12008
12009 cb->handle_response(r);
12010
12011 cb->put();
12012 }
12013 };
12014
12015 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
12016 {
12017 string user_str = user.to_str();
12018
12019 cls_user_header header;
12020 int r = cls_user_get_header(user_str, &header);
12021 if (r < 0)
12022 return r;
12023
12024 const cls_user_stats& hs = header.stats;
12025
12026 stats.size = hs.total_bytes;
12027 stats.size_rounded = hs.total_bytes_rounded;
12028 stats.num_objects = hs.total_entries;
12029
12030 return 0;
12031 }
12032
12033 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
12034 {
12035 string user_str = user.to_str();
12036
12037 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
12038 int r = cls_user_get_header_async(user_str, get_ctx);
12039 if (r < 0) {
12040 ctx->put();
12041 delete get_ctx;
12042 return r;
12043 }
12044
12045 return 0;
12046 }
12047
12048 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
12049 {
12050 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
12051 }
12052
12053 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
12054 {
12055 if (!bucket.oid.empty()) {
12056 obj.init(get_zone_params().domain_root, bucket.oid);
12057 } else {
12058 string oid;
12059 get_bucket_meta_oid(bucket, oid);
12060 obj.init(get_zone_params().domain_root, oid);
12061 }
12062 }
12063
12064 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
12065 real_time *pmtime, map<string, bufferlist> *pattrs)
12066 {
12067 size_t pos = meta_key.find(':');
12068 if (pos == string::npos) {
12069 return -EINVAL;
12070 }
12071 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
12072 rgw_bucket_instance_key_to_oid(oid);
12073
12074 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12075 }
12076
12077 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
12078 real_time *pmtime, map<string, bufferlist> *pattrs)
12079 {
12080 string oid;
12081 if (bucket.oid.empty()) {
12082 get_bucket_meta_oid(bucket, oid);
12083 } else {
12084 oid = bucket.oid;
12085 }
12086
12087 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
12088 }
12089
12090 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
12091 real_time *pmtime, map<string, bufferlist> *pattrs,
12092 rgw_cache_entry_info *cache_info,
12093 boost::optional<obj_version> refresh_version)
12094 {
12095 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
12096
12097 bufferlist epbl;
12098
12099 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12100 oid, epbl, &info.objv_tracker, pmtime, pattrs,
12101 cache_info, refresh_version);
12102 if (ret < 0) {
12103 return ret;
12104 }
12105
12106 bufferlist::iterator iter = epbl.begin();
12107 try {
12108 ::decode(info, iter);
12109 } catch (buffer::error& err) {
12110 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12111 return -EIO;
12112 }
12113 info.bucket.oid = oid;
12114 return 0;
12115 }
12116
12117 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
12118 const string& tenant_name,
12119 const string& bucket_name,
12120 RGWBucketEntryPoint& entry_point,
12121 RGWObjVersionTracker *objv_tracker,
12122 real_time *pmtime,
12123 map<string, bufferlist> *pattrs,
12124 rgw_cache_entry_info *cache_info,
12125 boost::optional<obj_version> refresh_version)
12126 {
12127 bufferlist bl;
12128 string bucket_entry;
12129
12130 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12131 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
12132 bucket_entry, bl, objv_tracker, pmtime, pattrs,
12133 cache_info, refresh_version);
12134 if (ret < 0) {
12135 return ret;
12136 }
12137
12138 bufferlist::iterator iter = bl.begin();
12139 try {
12140 ::decode(entry_point, iter);
12141 } catch (buffer::error& err) {
12142 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
12143 return -EIO;
12144 }
12145 return 0;
12146 }
12147
12148 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
12149 const string& tenant_name,
12150 const string& bucket_name)
12151 {
12152 RGWBucketEntryPoint entry_point;
12153 real_time ep_mtime;
12154 RGWObjVersionTracker ot;
12155 map<string, bufferlist> attrs;
12156 RGWBucketInfo info;
12157
12158 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
12159
12160 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
12161 if (ret < 0) {
12162 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
12163 return ret;
12164 }
12165
12166 if (!entry_point.has_bucket_info) {
12167 /* already converted! */
12168 return 0;
12169 }
12170
12171 info = entry_point.old_bucket_info;
12172 info.bucket.oid = bucket_name;
12173 info.ep_objv = ot.read_version;
12174
12175 ot.generate_new_write_ver(cct);
12176
12177 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
12178 if (ret < 0) {
12179 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
12180 return ret;
12181 }
12182
12183 return 0;
12184 }
12185
12186 int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx,
12187 const string& tenant,
12188 const string& bucket_name,
12189 RGWBucketInfo& info,
12190 real_time *pmtime,
12191 map<string, bufferlist> *pattrs,
12192 boost::optional<obj_version> refresh_version)
12193 {
12194 bucket_info_entry e;
12195 string bucket_entry;
12196 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
12197
12198
12199 if (binfo_cache->find(bucket_entry, &e)) {
12200 if (refresh_version &&
12201 e.info.objv_tracker.read_version.compare(&(*refresh_version))) {
12202 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
12203 << "a failure that should be debugged. I am a nice machine, "
12204 << "so I will try to recover." << dendl;
12205 binfo_cache->invalidate(bucket_entry);
12206 }
12207 info = e.info;
12208 if (pattrs)
12209 *pattrs = e.attrs;
12210 if (pmtime)
12211 *pmtime = e.mtime;
12212 return 0;
12213 }
12214
12215 RGWBucketEntryPoint entry_point;
12216 real_time ep_mtime;
12217 RGWObjVersionTracker ot;
12218 rgw_cache_entry_info entry_cache_info;
12219 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
12220 entry_point, &ot, &ep_mtime, pattrs,
12221 &entry_cache_info, refresh_version);
12222 if (ret < 0) {
12223 /* only init these fields */
12224 info.bucket.tenant = tenant;
12225 info.bucket.name = bucket_name;
12226 return ret;
12227 }
12228
12229 if (entry_point.has_bucket_info) {
12230 info = entry_point.old_bucket_info;
12231 info.bucket.oid = bucket_name;
12232 info.bucket.tenant = tenant;
12233 info.ep_objv = ot.read_version;
12234 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
12235 return 0;
12236 }
12237
12238 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12239 * that we got
12240 */
12241 if (pattrs) {
12242 pattrs->clear();
12243 }
12244
12245 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
12246
12247
12248 /* read bucket instance info */
12249
12250 string oid;
12251 get_bucket_meta_oid(entry_point.bucket, oid);
12252
12253 rgw_cache_entry_info cache_info;
12254
12255 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
12256 &cache_info, refresh_version);
12257 e.info.ep_objv = ot.read_version;
12258 info = e.info;
12259 if (ret < 0) {
12260 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
12261 info.bucket.tenant = tenant;
12262 info.bucket.name = bucket_name;
12263 // XXX and why return anything in case of an error anyway?
12264 return ret;
12265 }
12266
12267 if (pmtime)
12268 *pmtime = e.mtime;
12269 if (pattrs)
12270 *pattrs = e.attrs;
12271
12272 list<rgw_cache_entry_info *> cache_info_entries;
12273 cache_info_entries.push_back(&entry_cache_info);
12274 cache_info_entries.push_back(&cache_info);
12275
12276
12277 /* chain to both bucket entry point and bucket instance */
12278 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
12279 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
12280 }
12281
12282 if (refresh_version &&
12283 refresh_version->compare(&info.objv_tracker.read_version)) {
12284 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
12285 << "have gone squirrelly. An administrator may have forced a "
12286 << "change; otherwise there is a problem somewhere." << dendl;
12287 }
12288
12289 return 0;
12290 }
12291
12292 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
12293 const string& tenant, const string& bucket_name,
12294 RGWBucketInfo& info,
12295 real_time *pmtime, map<string, bufferlist> *pattrs)
12296 {
12297 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
12298 pattrs, boost::none);
12299 }
12300
12301 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
12302 ceph::real_time *pmtime,
12303 map<string, bufferlist> *pattrs)
12304 {
12305 RGWObjectCtx obj_ctx(this);
12306
12307 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
12308 info, pmtime, pattrs, info.objv_tracker.read_version);
12309 }
12310
12311 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
12312 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
12313 map<string, bufferlist> *pattrs)
12314 {
12315 bufferlist epbl;
12316 ::encode(entry_point, epbl);
12317 string bucket_entry;
12318 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12319 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12320 }
12321
12322 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12323 real_time mtime, map<string, bufferlist> *pattrs)
12324 {
12325 info.has_instance_obj = true;
12326 bufferlist bl;
12327
12328 ::encode(info, bl);
12329
12330 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12331 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12332 if (ret == -EEXIST) {
12333 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12334 * bucket operation on this specific bucket (e.g., being synced from the master), but
12335 * since bucket instace meta object is unique for this specific bucket instace, we don't
12336 * need to return an error.
12337 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12338 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12339 * locally, while in the sync thread we sync the new bucket.
12340 */
12341 ret = 0;
12342 }
12343 return ret;
12344 }
12345
12346 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12347 map<string, bufferlist> *pattrs, bool create_entry_point)
12348 {
12349 bool create_head = !info.has_instance_obj || create_entry_point;
12350
12351 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12352 if (ret < 0) {
12353 return ret;
12354 }
12355
12356 if (!create_head)
12357 return 0; /* done! */
12358
12359 RGWBucketEntryPoint entry_point;
12360 entry_point.bucket = info.bucket;
12361 entry_point.owner = info.owner;
12362 entry_point.creation_time = info.creation_time;
12363 entry_point.linked = true;
12364 RGWObjVersionTracker ot;
12365 if (pep_objv && !pep_objv->tag.empty()) {
12366 ot.write_version = *pep_objv;
12367 } else {
12368 ot.generate_new_write_ver(cct);
12369 if (pep_objv) {
12370 *pep_objv = ot.write_version;
12371 }
12372 }
12373 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12374 if (ret < 0)
12375 return ret;
12376
12377 return 0;
12378 }
12379
12380 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12381 {
12382 rgw_rados_ref ref;
12383 int r = get_raw_obj_ref(obj, &ref);
12384 if (r < 0) {
12385 return r;
12386 }
12387
12388 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12389 if (r < 0)
12390 return r;
12391
12392 return 0;
12393
12394 }
12395
12396 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12397 std::map<string, bufferlist>& m)
12398 {
12399 rgw_rados_ref ref;
12400 int r = get_raw_obj_ref(obj, &ref);
12401 if (r < 0) {
12402 return r;
12403 }
12404
12405 #define MAX_OMAP_GET_ENTRIES 1024
12406 const int count = MAX_OMAP_GET_ENTRIES;
12407 string start_after;
12408
12409 while (true) {
12410 std::map<string, bufferlist> t;
12411 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12412 if (r < 0) {
12413 return r;
12414 }
12415 if (t.empty()) {
12416 break;
12417 }
12418 start_after = t.rbegin()->first;
12419 m.insert(t.begin(), t.end());
12420 }
12421 return 0;
12422 }
12423
12424 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12425 {
12426 rgw_rados_ref ref;
12427 int r = get_raw_obj_ref(obj, &ref);
12428 if (r < 0) {
12429 return r;
12430 }
12431 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12432
12433 map<string, bufferlist> m;
12434 m[key] = bl;
12435
12436 r = ref.ioctx.omap_set(ref.oid, m);
12437
12438 return r;
12439 }
12440
12441 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12442 {
12443 rgw_rados_ref ref;
12444 int r = get_raw_obj_ref(obj, &ref);
12445 if (r < 0) {
12446 return r;
12447 }
12448
12449 r = ref.ioctx.omap_set(ref.oid, m);
12450
12451 return r;
12452 }
12453
12454 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12455 {
12456 rgw_rados_ref ref;
12457 int r = get_raw_obj_ref(obj, &ref);
12458 if (r < 0) {
12459 return r;
12460 }
12461
12462 set<string> k;
12463 k.insert(key);
12464
12465 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12466 return r;
12467 }
12468
12469 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12470 {
12471 RGWObjectCtx obj_ctx(this);
12472
12473 map<string, RGWBucketEnt>::iterator iter;
12474 for (iter = m.begin(); iter != m.end(); ++iter) {
12475 RGWBucketEnt& ent = iter->second;
12476 rgw_bucket& bucket = ent.bucket;
12477 ent.count = 0;
12478 ent.size = 0;
12479 ent.size_rounded = 0;
12480
12481 map<string, rgw_bucket_dir_header> headers;
12482
12483 RGWBucketInfo bucket_info;
12484 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12485 if (ret < 0) {
12486 return ret;
12487 }
12488
12489 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12490 if (r < 0)
12491 return r;
12492
12493 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12494 for (; hiter != headers.end(); ++hiter) {
12495 RGWObjCategory category = main_category;
12496 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12497 if (iter != hiter->second.stats.end()) {
12498 struct rgw_bucket_category_stats& stats = iter->second;
12499 ent.count += stats.num_entries;
12500 ent.size += stats.total_size;
12501 ent.size_rounded += stats.total_size_rounded;
12502 }
12503 }
12504
12505 // fill in placement_rule from the bucket instance for use in swift's
12506 // per-storage policy statistics
12507 ent.placement_rule = std::move(bucket_info.placement_rule);
12508 }
12509
12510 return m.size();
12511 }
12512
12513 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12514 {
12515 rgw_rados_ref ref;
12516 int r = get_raw_obj_ref(obj, &ref);
12517 if (r < 0) {
12518 return r;
12519 }
12520 librados::Rados *rad = get_rados_handle();
12521 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12522
12523 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12524 completion->release();
12525 return r;
12526 }
12527
12528 int RGWRados::distribute(const string& key, bufferlist& bl)
12529 {
12530 /*
12531 * we were called before watch was initialized. This can only happen if we're updating some system
12532 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12533 * objects, they're currently only read on startup anyway.
12534 */
12535 if (!watch_initialized)
12536 return 0;
12537
12538 string notify_oid;
12539 pick_control_oid(key, notify_oid);
12540
12541 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12542 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12543 }
12544
12545 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12546 {
12547 librados::IoCtx& io_ctx = ctx.io_ctx;
12548 librados::NObjectIterator& iter = ctx.iter;
12549
12550 int r = open_pool_ctx(pool, io_ctx);
12551 if (r < 0)
12552 return r;
12553
12554 iter = io_ctx.nobjects_begin();
12555
12556 return 0;
12557 }
12558
12559 int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12560 {
12561 librados::IoCtx& io_ctx = ctx.io_ctx;
12562 librados::NObjectIterator& iter = ctx.iter;
12563
12564 int r = open_pool_ctx(pool, io_ctx);
12565 if (r < 0)
12566 return r;
12567
12568 librados::ObjectCursor oc;
12569 if (!oc.from_str(cursor)) {
12570 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12571 return -EINVAL;
12572 }
12573
12574 iter = io_ctx.nobjects_begin(oc);
12575
12576 return 0;
12577 }
12578
12579 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12580 {
12581 return ctx.iter.get_cursor().to_str();
12582 }
12583
12584 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12585 bool *is_truncated, RGWAccessListFilter *filter)
12586 {
12587 librados::IoCtx& io_ctx = ctx.io_ctx;
12588 librados::NObjectIterator& iter = ctx.iter;
12589
12590 if (iter == io_ctx.nobjects_end())
12591 return -ENOENT;
12592
12593 uint32_t i;
12594
12595 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12596 rgw_bucket_dir_entry e;
12597
12598 string oid = iter->get_oid();
12599 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12600
12601 // fill it in with initial values; we may correct later
12602 if (filter && !filter->filter(oid, oid))
12603 continue;
12604
12605 e.key = oid;
12606 objs.push_back(e);
12607 }
12608
12609 if (is_truncated)
12610 *is_truncated = (iter != io_ctx.nobjects_end());
12611
12612 return objs.size();
12613 }
12614 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12615 string prefix;
12616
12617 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12618 bool filter(string& name, string& key) override {
12619 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12620 }
12621 };
12622
12623 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
12624 {
12625 if (!ctx->initialized) {
12626 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
12627 if (r < 0) {
12628 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12629 return r;
12630 }
12631 ctx->initialized = true;
12632 }
12633 return 0;
12634 }
12635
12636 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12637 RGWListRawObjsCtx& ctx, list<string>& oids,
12638 bool *is_truncated)
12639 {
12640 if (!ctx.initialized) {
12641 return -EINVAL;
12642 }
12643 RGWAccessListFilterPrefix filter(prefix_filter);
12644 vector<rgw_bucket_dir_entry> objs;
12645 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12646 if (r < 0) {
12647 if(r != -ENOENT)
12648 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12649 return r;
12650 }
12651
12652 vector<rgw_bucket_dir_entry>::iterator iter;
12653 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12654 oids.push_back(iter->key.name);
12655 }
12656
12657 return oids.size();
12658 }
12659
12660 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12661 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12662 bool *is_truncated)
12663 {
12664 if (!ctx.initialized) {
12665 int r = list_raw_objects_init(pool, string(), &ctx);
12666 if (r < 0) {
12667 return r;
12668 }
12669 }
12670
12671 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12672 }
12673
12674 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12675 {
12676 return pool_iterate_get_cursor(ctx.iter_ctx);
12677 }
12678
/**
 * List bucket index log entries, merging the per-shard results.
 *
 * @param shard_id   shard to list; passed through to open_bucket_index()
 * @param marker     in: position to start from; out: refreshed position
 *                   (see the comment near the end for its format)
 * @param max        maximum number of entries to place in `result`
 * @param result     cleared on entry, then filled with the merged entries
 * @param truncated  optional; set to true when any shard reported truncation
 *                   or when fetched entries were left unconsumed
 * @return 0 on success, or the negative error from opening the index,
 *         parsing the marker, or issuing the list operation.
 */
int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
                                  std::list<rgw_bi_log_entry>& result, bool *truncated)
{
  ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
  result.clear();

  librados::IoCtx index_ctx;
  map<int, string> oids;
  map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
  map<int, string> bucket_instance_ids;
  int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
  if (r < 0)
    return r;

  BucketIndexShardsManager marker_mgr;
  bool has_shards = (oids.size() > 1 || shard_id >= 0);
  // If there are multiple shards for the bucket index object, the marker
  // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
  // {shard_marker_2}...', if there is no sharding, the bi_log_list should
  // only contain one record, and the key is the bucket instance id.
  r = marker_mgr.from_string(marker, shard_id);
  if (r < 0)
    return r;

  r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
  if (r < 0)
    return r;

  // Per-shard cursors into the returned entry lists: vcurrents holds the
  // current position and vends the matching end, both keyed by shard id.
  map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
  map<int, list<rgw_bi_log_entry>::iterator> vends;
  if (truncated) {
    *truncated = false;
  }
  map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
  for (; miter != bi_log_lists.end(); ++miter) {
    // NOTE: this local shadows the shard_id parameter
    int shard_id = miter->first;
    vcurrents[shard_id] = miter->second.entries.begin();
    vends[shard_id] = miter->second.entries.end();
    if (truncated) {
      *truncated = (*truncated || miter->second.truncated);
    }
  }

  // Take at most one entry from each shard per pass, repeating until `max`
  // entries were collected or a full pass yields nothing.
  size_t total = 0;
  bool has_more = true;
  map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
  map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
  while (total < max && has_more) {
    has_more = false;

    viter = vcurrents.begin();
    eiter = vends.begin();

    // vcurrents and vends have identical key sets, so they are walked in
    // lockstep
    for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
      assert (eiter != vends.end());

      int shard_id = viter->first;
      list<rgw_bi_log_entry>::iterator& liter = viter->second;

      // this shard's entries are exhausted
      if (liter == eiter->second){
        continue;
      }
      rgw_bi_log_entry& entry = *(liter);
      if (has_shards) {
        // rewrite the entry id via build_bucket_index_marker() so that it
        // carries the shard id
        char buf[16];
        snprintf(buf, sizeof(buf), "%d", shard_id);
        string tmp_id;
        build_bucket_index_marker(buf, entry.id, &tmp_id);
        entry.id.swap(tmp_id);
      }
      marker_mgr.add(shard_id, entry.id);
      result.push_back(entry);
      total++;
      has_more = true;
      ++liter;
    }
  }

  // Entries that were fetched but not consumed also mean the listing is
  // truncated.
  if (truncated) {
    for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
      assert (eiter != vends.end());
      *truncated = (*truncated || (viter->second != eiter->second));
    }
  }

  // Refresh marker, if there are multiple shards, the output will look like
  // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
  // if there is no sharding, the simply marker (without oid) is returned
  if (has_shards) {
    marker_mgr.to_string(&marker);
  } else {
    if (!result.empty()) {
      marker = result.rbegin()->id;
    }
  }

  return 0;
}
12777
12778 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12779 {
12780 librados::IoCtx index_ctx;
12781 map<int, string> bucket_objs;
12782
12783 BucketIndexShardsManager start_marker_mgr;
12784 BucketIndexShardsManager end_marker_mgr;
12785
12786 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12787 if (r < 0) {
12788 return r;
12789 }
12790
12791 r = start_marker_mgr.from_string(start_marker, shard_id);
12792 if (r < 0) {
12793 return r;
12794 }
12795
12796 r = end_marker_mgr.from_string(end_marker, shard_id);
12797 if (r < 0) {
12798 return r;
12799 }
12800
12801 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12802 cct->_conf->rgw_bucket_index_max_aio)();
12803
12804 return r;
12805 }
12806
12807 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12808 {
12809 librados::IoCtx index_ctx;
12810 map<int, string> bucket_objs;
12811 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12812 if (r < 0)
12813 return r;
12814
12815 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12816 }
12817
12818 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12819 {
12820 librados::IoCtx index_ctx;
12821 map<int, string> bucket_objs;
12822 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12823 if (r < 0)
12824 return r;
12825
12826 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12827 }
12828
12829 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12830 {
12831 rgw_rados_ref ref;
12832 int r = get_obj_head_ref(bucket_info, obj, &ref);
12833 if (r < 0) {
12834 return r;
12835 }
12836
12837 rgw_cls_bi_entry bi_entry;
12838 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12839 if (r < 0 && r != -ENOENT) {
12840 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12841 }
12842 if (r < 0) {
12843 return r;
12844 }
12845 bufferlist::iterator iter = bi_entry.data.begin();
12846 try {
12847 ::decode(*dirent, iter);
12848 } catch (buffer::error& err) {
12849 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12850 return -EIO;
12851 }
12852
12853 return 0;
12854 }
12855
12856 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12857 {
12858 BucketShard bs(this);
12859 int ret = bs.init(bucket, obj);
12860 if (ret < 0) {
12861 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12862 return ret;
12863 }
12864
12865 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12866
12867 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12868 if (ret < 0)
12869 return ret;
12870
12871 return 0;
12872 }
12873
12874 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12875 {
12876 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12877 }
12878
12879 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12880 {
12881 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12882 if (ret < 0)
12883 return ret;
12884
12885 return 0;
12886 }
12887
12888 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12889 {
12890 BucketShard bs(this);
12891 int ret = bs.init(bucket, obj);
12892 if (ret < 0) {
12893 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12894 return ret;
12895 }
12896
12897 return bi_put(bs, entry);
12898 }
12899
12900 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12901 {
12902 rgw_obj obj(bucket, obj_name);
12903 BucketShard bs(this);
12904 int ret = bs.init(bucket, obj);
12905 if (ret < 0) {
12906 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12907 return ret;
12908 }
12909
12910 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12911 if (ret == -ENOENT) {
12912 *is_truncated = false;
12913 }
12914 if (ret < 0)
12915 return ret;
12916
12917 return 0;
12918 }
12919
12920 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12921 {
12922 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12923 if (ret < 0)
12924 return ret;
12925
12926 return 0;
12927 }
12928
12929 int RGWRados::bi_remove(BucketShard& bs)
12930 {
12931 int ret = bs.index_ctx.remove(bs.bucket_obj);
12932 if (ret == -ENOENT) {
12933 ret = 0;
12934 }
12935 if (ret < 0) {
12936 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12937 return ret;
12938 }
12939
12940 return 0;
12941 }
12942
12943 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12944 {
12945 BucketShard bs(this);
12946 int ret = bs.init(bucket, shard_id);
12947 if (ret < 0) {
12948 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12949 return ret;
12950 }
12951
12952 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12953 }
12954
12955 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12956 {
12957 return gc_pool_ctx.operate(oid, op);
12958 }
12959
12960 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12961 {
12962 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12963 int r = gc_pool_ctx.aio_operate(oid, c, op);
12964 c->release();
12965 return r;
12966 }
12967
12968 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12969 {
12970 return gc_pool_ctx.operate(oid, op, pbl);
12971 }
12972
12973 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12974 {
12975 return gc->list(index, marker, max, expired_only, result, truncated);
12976 }
12977
12978 int RGWRados::process_gc()
12979 {
12980 return gc->process();
12981 }
12982
12983 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12984 {
12985 return lc->list_lc_progress(marker, max_entries, progress_map);
12986 }
12987
12988 int RGWRados::process_lc()
12989 {
12990 return lc->process();
12991 }
12992
12993 bool RGWRados::process_expire_objects()
12994 {
12995 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12996 }
12997
12998 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12999 {
13000 bufferlist in;
13001 cls_rgw_bucket_init(op);
13002 return index_ctx.operate(oid, &op);
13003 }
13004
13005 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
13006 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
13007 {
13008 rgw_zone_set zones_trace;
13009 if (_zones_trace) {
13010 zones_trace = *_zones_trace;
13011 }
13012 zones_trace.insert(get_zone().id);
13013
13014 ObjectWriteOperation o;
13015 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
13016 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13017 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
13018 return bs.index_ctx.operate(bs.bucket_obj, &o);
13019 }
13020
13021 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
13022 int64_t pool, uint64_t epoch,
13023 rgw_bucket_dir_entry& ent, RGWObjCategory category,
13024 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
13025 {
13026 ObjectWriteOperation o;
13027 rgw_bucket_dir_entry_meta dir_meta;
13028 dir_meta = ent.meta;
13029 dir_meta.category = category;
13030
13031 rgw_zone_set zones_trace;
13032 if (_zones_trace) {
13033 zones_trace = *_zones_trace;
13034 }
13035 zones_trace.insert(get_zone().id);
13036
13037 rgw_bucket_entry_ver ver;
13038 ver.pool = pool;
13039 ver.epoch = epoch;
13040 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
13041 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
13042 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
13043 get_zone().log_data, bilog_flags, &zones_trace);
13044 complete_op_data *arg;
13045 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
13046 get_zone().log_data, bilog_flags, &zones_trace, &arg);
13047 librados::AioCompletion *completion = arg->rados_completion;
13048 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
13049 completion->release(); /* can't reference arg here, as it might have already been released */
13050 return ret;
13051 }
13052
13053 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
13054 int64_t pool, uint64_t epoch,
13055 rgw_bucket_dir_entry& ent, RGWObjCategory category,
13056 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
13057 {
13058 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
13059 }
13060
13061 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
13062 int64_t pool, uint64_t epoch,
13063 rgw_obj& obj,
13064 real_time& removed_mtime,
13065 list<rgw_obj_index_key> *remove_objs,
13066 uint16_t bilog_flags,
13067 rgw_zone_set *zones_trace)
13068 {
13069 rgw_bucket_dir_entry ent;
13070 ent.meta.mtime = removed_mtime;
13071 obj.key.get_index_key(&ent.key);
13072 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
13073 }
13074
13075 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
13076 {
13077 rgw_bucket_dir_entry ent;
13078 obj.key.get_index_key(&ent.key);
13079 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
13080 }
13081
13082 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
13083 {
13084 librados::IoCtx index_ctx;
13085 map<int, string> bucket_objs;
13086 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
13087 if (r < 0)
13088 return r;
13089
13090 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
13091 }
13092
13093
13094 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
13095 int shard_id,
13096 rgw_obj_index_key& start,
13097 const string& prefix,
13098 uint32_t num_entries,
13099 bool list_versions,
13100 map<string, rgw_bucket_dir_entry>& m,
13101 bool *is_truncated,
13102 rgw_obj_index_key *last_entry,
13103 bool (*force_check_filter)(const string& name))
13104 {
13105 ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
13106 " start " << start.name << "[" << start.instance << "] num_entries " <<
13107 num_entries << dendl;
13108
13109 librados::IoCtx index_ctx;
13110 // key - oid (for different shards if there is any)
13111 // value - list result for the corresponding oid (shard), it is filled by
13112 // the AIO callback
13113 map<int, string> oids;
13114 map<int, struct rgw_cls_list_ret> list_results;
13115 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13116 if (r < 0)
13117 return r;
13118
13119 cls_rgw_obj_key start_key(start.name, start.instance);
13120 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
13121 list_versions, oids, list_results,
13122 cct->_conf->rgw_bucket_index_max_aio)();
13123 if (r < 0)
13124 return r;
13125
13126 // Create a list of iterators that are used to iterate each shard
13127 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
13128 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
13129 vector<string> vnames(list_results.size());
13130 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13131 *is_truncated = false;
13132 for (; iter != list_results.end(); ++iter) {
13133 vcurrents.push_back(iter->second.dir.m.begin());
13134 vends.push_back(iter->second.dir.m.end());
13135 vnames.push_back(oids[iter->first]);
13136 *is_truncated = (*is_truncated || iter->second.is_truncated);
13137 }
13138
13139 // Create a map to track the next candidate entry from each shard, if the entry
13140 // from a specified shard is selected/erased, the next entry from that shard will
13141 // be inserted for next round selection
13142 map<string, size_t> candidates;
13143 for (size_t i = 0; i < vcurrents.size(); ++i) {
13144 if (vcurrents[i] != vends[i]) {
13145 candidates[vcurrents[i]->first] = i;
13146 }
13147 }
13148
13149 map<string, bufferlist> updates;
13150 uint32_t count = 0;
13151 while (count < num_entries && !candidates.empty()) {
13152 r = 0;
13153 // Select the next one
13154 int pos = candidates.begin()->second;
13155 const string& name = vcurrents[pos]->first;
13156 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
13157
13158 bool force_check = force_check_filter &&
13159 force_check_filter(dirent.key.name);
13160 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13161 !dirent.pending_map.empty() ||
13162 force_check) {
13163 /* there are uncommitted ops. We need to check the current state,
13164 * and if the tags are old we need to do cleanup as well. */
13165 librados::IoCtx sub_ctx;
13166 sub_ctx.dup(index_ctx);
13167 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
13168 updates[vnames[pos]]);
13169 if (r < 0 && r != -ENOENT) {
13170 return r;
13171 }
13172 }
13173 if (r >= 0) {
13174 ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
13175 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
13176 m[name] = std::move(dirent);
13177 ++count;
13178 }
13179
13180 // Refresh the candidates map
13181 candidates.erase(candidates.begin());
13182 ++vcurrents[pos];
13183 if (vcurrents[pos] != vends[pos]) {
13184 candidates[vcurrents[pos]->first] = pos;
13185 }
13186 }
13187
13188 // Suggest updates if there is any
13189 map<string, bufferlist>::iterator miter = updates.begin();
13190 for (; miter != updates.end(); ++miter) {
13191 if (miter->second.length()) {
13192 ObjectWriteOperation o;
13193 cls_rgw_suggest_changes(o, miter->second);
13194 // we don't care if we lose suggested updates, send them off blindly
13195 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13196 index_ctx.aio_operate(miter->first, c, &o);
13197 c->release();
13198 }
13199 }
13200
13201 // Check if all the returned entries are consumed or not
13202 for (size_t i = 0; i < vcurrents.size(); ++i) {
13203 if (vcurrents[i] != vends[i]) {
13204 *is_truncated = true;
13205 break;
13206 }
13207 }
13208 if (!m.empty())
13209 *last_entry = m.rbegin()->first;
13210
13211 return 0;
13212 }
13213
13214
13215 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
13216 int shard_id,
13217 rgw_obj_index_key& start,
13218 const string& prefix,
13219 uint32_t num_entries,
13220 bool list_versions,
13221 std::vector<rgw_bucket_dir_entry>& ent_list,
13222 bool *is_truncated,
13223 rgw_obj_index_key *last_entry,
13224 bool (*force_check_filter)(const string& name)) {
13225 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
13226 " start " << start.name << "[" << start.instance <<
13227 "] num_entries " << num_entries << dendl;
13228
13229 *is_truncated = false;
13230 librados::IoCtx index_ctx;
13231
13232 rgw_obj_index_key my_start = start;
13233
13234 map<int, string> oids;
13235 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
13236 if (r < 0)
13237 return r;
13238 const uint32_t num_shards = oids.size();
13239
13240 uint32_t current_shard;
13241 if (shard_id >= 0) {
13242 current_shard = shard_id;
13243 } else if (my_start.empty()) {
13244 current_shard = 0u;
13245 } else {
13246 current_shard =
13247 rgw_bucket_shard_index(my_start.name, num_shards);
13248 }
13249
13250 uint32_t count = 0u;
13251 map<string, bufferlist> updates;
13252 std::string last_added_entry;
13253 while (count <= num_entries &&
13254 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
13255 current_shard < num_shards)) {
13256 // key - oid (for different shards if there is any)
13257 // value - list result for the corresponding oid (shard), it is filled by
13258 // the AIO callback
13259 map<int, struct rgw_cls_list_ret> list_results;
13260 r = CLSRGWIssueBucketList(index_ctx, my_start, prefix, num_entries,
13261 list_versions, oids, list_results,
13262 cct->_conf->rgw_bucket_index_max_aio)();
13263 if (r < 0)
13264 return r;
13265
13266 const std::string& oid = oids[current_shard];
13267 assert(list_results.find(current_shard) != list_results.end());
13268 auto& result = list_results[current_shard];
13269 for (auto& entry : result.dir.m) {
13270 rgw_bucket_dir_entry& dirent = entry.second;
13271
13272 bool force_check = force_check_filter &&
13273 force_check_filter(dirent.key.name);
13274 if ((!dirent.exists && !dirent.is_delete_marker()) ||
13275 !dirent.pending_map.empty() ||
13276 force_check) {
13277 /* there are uncommitted ops. We need to check the current state,
13278 * and if the tags are old we need to do cleanup as well. */
13279 librados::IoCtx sub_ctx;
13280 sub_ctx.dup(index_ctx);
13281 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
13282 if (r < 0 && r != -ENOENT) {
13283 return r;
13284 }
13285 }
13286
13287 // at this point either r >=0 or r == -ENOENT
13288 if (r >= 0) { // i.e., if r != -ENOENT
13289 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
13290 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
13291
13292 if (count < num_entries) {
13293 last_added_entry = entry.first;
13294 my_start = dirent.key;
13295 ent_list.emplace_back(std::move(dirent));
13296 ++count;
13297 } else {
13298 *is_truncated = true;
13299 goto check_updates;
13300 }
13301 } else { // r == -ENOENT
13302 // in the case of -ENOENT, make sure we're advancing marker
13303 // for possible next call to CLSRGWIssueBucketList
13304 my_start = dirent.key;
13305 }
13306 } // entry for loop
13307
13308 if (!result.is_truncated) {
13309 // if we reached the end of the shard read next shard
13310 ++current_shard;
13311 my_start = rgw_obj_index_key();
13312 }
13313 } // shard loop
13314
13315 check_updates:
13316 // suggest updates if there is any
13317 map<string, bufferlist>::iterator miter = updates.begin();
13318 for (; miter != updates.end(); ++miter) {
13319 if (miter->second.length()) {
13320 ObjectWriteOperation o;
13321 cls_rgw_suggest_changes(o, miter->second);
13322 // we don't care if we lose suggested updates, send them off blindly
13323 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13324 index_ctx.aio_operate(miter->first, c, &o);
13325 c->release();
13326 }
13327 }
13328
13329 if (last_entry && !ent_list.empty()) {
13330 *last_entry = last_added_entry;
13331 }
13332
13333 return 0;
13334 }
13335
13336
13337 int RGWRados::cls_obj_usage_log_add(const string& oid,
13338 rgw_usage_log_info& info)
13339 {
13340 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13341
13342 rgw_rados_ref ref;
13343 int r = get_raw_obj_ref(obj, &ref);
13344 if (r < 0) {
13345 return r;
13346 }
13347
13348 ObjectWriteOperation op;
13349 cls_rgw_usage_log_add(op, info);
13350
13351 r = ref.ioctx.operate(ref.oid, &op);
13352 return r;
13353 }
13354
13355 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
13356 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
13357 {
13358 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13359
13360 rgw_rados_ref ref;
13361 int r = get_raw_obj_ref(obj, &ref);
13362 if (r < 0) {
13363 return r;
13364 }
13365
13366 *is_truncated = false;
13367
13368 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
13369 max_entries, read_iter, usage, is_truncated);
13370
13371 return r;
13372 }
13373
13374 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
13375 {
13376 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13377
13378 rgw_rados_ref ref;
13379 int r = get_raw_obj_ref(obj, &ref);
13380 if (r < 0) {
13381 return r;
13382 }
13383
13384 r = cls_rgw_usage_log_trim(ref.ioctx, ref.oid, user, start_epoch, end_epoch);
13385 return r;
13386 }
13387
13388 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
13389 {
13390 librados::IoCtx index_ctx;
13391 string dir_oid;
13392
13393 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13394
13395 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
13396 if (r < 0)
13397 return r;
13398
13399 bufferlist updates;
13400
13401 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
13402 rgw_bucket_dir_entry entry;
13403 entry.key = *iter;
13404 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
13405 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13406 updates.append(CEPH_RGW_REMOVE | suggest_flag);
13407 ::encode(entry, updates);
13408 }
13409
13410 bufferlist out;
13411
13412 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
13413
13414 return r;
13415 }
13416
/**
 * Re-validate a listed bucket index entry against the object's current
 * state and encode a suggested index correction.
 *
 * @param io_ctx             taken by value; its locator key is set to
 *                           list_state.locator
 * @param list_state         the entry as listed; updated in place and used
 *                           as the payload of the suggestion
 * @param object             receives the refreshed metadata (callers in this
 *                           file pass the same entry for both)
 * @param suggested_updates  the encoded suggestion is appended here
 * @return 0 when the object exists (a CEPH_RGW_UPDATE suggestion was
 *         encoded), -ENOENT when it does not (a CEPH_RGW_REMOVE suggestion
 *         was encoded), or the negative error from get_obj_state().
 */
int RGWRados::check_disk_state(librados::IoCtx io_ctx,
                               const RGWBucketInfo& bucket_info,
                               rgw_bucket_dir_entry& list_state,
                               rgw_bucket_dir_entry& object,
                               bufferlist& suggested_updates)
{
  const rgw_bucket& bucket = bucket_info.bucket;
  uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);

  std::string loc;

  rgw_obj obj(bucket, list_state.key);

  string oid;
  get_obj_bucket_and_oid_loc(obj, oid, loc);

  // a mismatch is only reported; the listed locator is the one used below
  if (loc != list_state.locator) {
    ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
  }

  io_ctx.locator_set_key(list_state.locator);

  RGWObjState *astate = NULL;
  RGWObjectCtx rctx(this);
  int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
  if (r < 0)
    return r;

  list_state.pending_map.clear(); // we don't need this and it inflates size
  if (!astate->exists) {
      /* object doesn't exist right now -- hopefully because it's
       * marked as !exists and got deleted */
    if (list_state.exists) {
      /* FIXME: what should happen now? Work out if there are any
       * non-bad ways this could happen (there probably are, but annoying
       * to handle!) */
    }
    // encode a suggested removal of that key
    list_state.ver.epoch = io_ctx.get_last_version();
    list_state.ver.pool = io_ctx.get_id();
    cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
    return -ENOENT;
  }

  // The object exists: collect its current metadata. etag, content type and
  // owner stay default-constructed when the matching attr is absent.
  string etag;
  string content_type;
  ACLOwner owner;

  object.meta.size = astate->size;
  object.meta.accounted_size = astate->accounted_size;
  object.meta.mtime = astate->mtime;

  map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
  if (iter != astate->attrset.end()) {
    etag = iter->second.c_str();
  }
  iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
  if (iter != astate->attrset.end()) {
    content_type = iter->second.c_str();
  }
  iter = astate->attrset.find(RGW_ATTR_ACL);
  if (iter != astate->attrset.end()) {
    r = decode_policy(iter->second, &owner);
    if (r < 0) {
      // not fatal: processing continues with whatever `owner` holds
      dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
    }
  }

  if (astate->has_manifest) {
    // Walk the manifest and remove the index entries of any parts that live
    // in the multipart namespace; failures are only logged.
    RGWObjManifest::obj_iterator miter;
    RGWObjManifest& manifest = astate->manifest;
    for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
      const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
      // NOTE: this rgw_obj shadows the std::string `loc` declared above
      rgw_obj loc;
      rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);

      if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
        dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
        r = delete_obj_index(loc);
        if (r < 0) {
          dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
        }
      }
    }
  }

  object.meta.etag = etag;
  object.meta.content_type = content_type;
  object.meta.owner = owner.get_id().to_str();
  object.meta.owner_display_name = owner.get_display_name();

  // encode suggested updates
  list_state.ver.pool = io_ctx.get_id();
  list_state.ver.epoch = astate->epoch;
  list_state.meta.size = object.meta.size;
  list_state.meta.accounted_size = object.meta.accounted_size;
  list_state.meta.mtime = object.meta.mtime;
  list_state.meta.category = main_category;
  list_state.meta.etag = etag;
  list_state.meta.content_type = content_type;
  if (astate->obj_tag.length() > 0)
    list_state.tag = astate->obj_tag.c_str();
  list_state.meta.owner = owner.get_id().to_str();
  list_state.meta.owner_display_name = owner.get_display_name();

  list_state.exists = true;
  cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
  return 0;
}
13526
13527 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
13528 {
13529 librados::IoCtx index_ctx;
13530 map<int, string> oids;
13531 map<int, struct rgw_cls_list_ret> list_results;
13532 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13533 if (r < 0)
13534 return r;
13535
13536 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13537 if (r < 0)
13538 return r;
13539
13540 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13541 for(; iter != list_results.end(); ++iter) {
13542 headers[oids[iter->first]] = iter->second.dir.header;
13543 }
13544 return 0;
13545 }
13546
13547 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13548 {
13549 librados::IoCtx index_ctx;
13550 map<int, string> bucket_objs;
13551 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13552 if (r < 0)
13553 return r;
13554
13555 map<int, string>::iterator iter = bucket_objs.begin();
13556 for (; iter != bucket_objs.end(); ++iter) {
13557 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13558 if (r < 0) {
13559 ctx->put();
13560 break;
13561 } else {
13562 (*num_aio)++;
13563 }
13564 }
13565 return r;
13566 }
13567
13568 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13569 {
13570 string buckets_obj_id;
13571 rgw_get_buckets_obj(user_id, buckets_obj_id);
13572 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13573
13574 rgw_rados_ref ref;
13575 int r = get_raw_obj_ref(obj, &ref);
13576 if (r < 0) {
13577 return r;
13578 }
13579
13580 librados::ObjectReadOperation op;
13581 int rc;
13582 ::cls_user_get_header(op, header, &rc);
13583 bufferlist ibl;
13584 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13585 if (r < 0)
13586 return r;
13587 if (rc < 0)
13588 return rc;
13589
13590 return 0;
13591 }
13592
13593 int RGWRados::cls_user_reset_stats(const string& user_id)
13594 {
13595 string buckets_obj_id;
13596 rgw_get_buckets_obj(user_id, buckets_obj_id);
13597 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13598
13599 rgw_rados_ref ref;
13600 int r = get_raw_obj_ref(obj, &ref);
13601 if (r < 0) {
13602 return r;
13603 }
13604
13605 librados::ObjectWriteOperation op;
13606 ::cls_user_reset_stats(op);
13607 return ref.ioctx.operate(ref.oid, &op);
13608 }
13609
13610 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13611 {
13612 string buckets_obj_id;
13613 rgw_get_buckets_obj(user_id, buckets_obj_id);
13614 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13615
13616 rgw_rados_ref ref;
13617 int r = get_raw_obj_ref(obj, &ref);
13618 if (r < 0) {
13619 return r;
13620 }
13621
13622 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13623 if (r < 0)
13624 return r;
13625
13626 return 0;
13627 }
13628
13629 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13630 {
13631 map<string, struct rgw_bucket_dir_header> headers;
13632 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13633 if (r < 0) {
13634 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13635 return r;
13636 }
13637
13638 cls_user_bucket_entry entry;
13639
13640 bucket_info.bucket.convert(&entry.bucket);
13641
13642 for (const auto& hiter : headers) {
13643 for (const auto& iter : hiter.second.stats) {
13644 const struct rgw_bucket_category_stats& header_stats = iter.second;
13645 entry.size += header_stats.total_size;
13646 entry.size_rounded += header_stats.total_size_rounded;
13647 entry.count += header_stats.num_entries;
13648 }
13649 }
13650
13651 list<cls_user_bucket_entry> entries;
13652 entries.push_back(entry);
13653
13654 r = cls_user_update_buckets(user_obj, entries, false);
13655 if (r < 0) {
13656 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13657 return r;
13658 }
13659
13660 return 0;
13661 }
13662
13663 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13664 {
13665 map<string, struct rgw_bucket_dir_header> headers;
13666 RGWBucketInfo bucket_info;
13667 RGWObjectCtx obj_ctx(this);
13668 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13669 if (ret < 0) {
13670 return ret;
13671 }
13672
13673 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13674 if (ret < 0) {
13675 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13676 return ret;
13677 }
13678
13679 bucket.convert(&entry.bucket);
13680
13681 for (const auto& hiter : headers) {
13682 for (const auto& iter : hiter.second.stats) {
13683 const struct rgw_bucket_category_stats& header_stats = iter.second;
13684 entry.size += header_stats.total_size;
13685 entry.size_rounded += header_stats.total_size_rounded;
13686 entry.count += header_stats.num_entries;
13687 }
13688 }
13689
13690 return 0;
13691 }
13692
13693 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13694 const string& in_marker,
13695 const string& end_marker,
13696 const int max_entries,
13697 list<cls_user_bucket_entry>& entries,
13698 string * const out_marker,
13699 bool * const truncated)
13700 {
13701 rgw_rados_ref ref;
13702 int r = get_raw_obj_ref(obj, &ref);
13703 if (r < 0) {
13704 return r;
13705 }
13706
13707 librados::ObjectReadOperation op;
13708 int rc;
13709
13710 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13711 bufferlist ibl;
13712 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13713 if (r < 0)
13714 return r;
13715 if (rc < 0)
13716 return rc;
13717
13718 return 0;
13719 }
13720
13721 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13722 {
13723 rgw_rados_ref ref;
13724 int r = get_raw_obj_ref(obj, &ref);
13725 if (r < 0) {
13726 return r;
13727 }
13728
13729 librados::ObjectWriteOperation op;
13730 cls_user_set_buckets(op, entries, add);
13731 r = ref.ioctx.operate(ref.oid, &op);
13732 if (r < 0)
13733 return r;
13734
13735 return 0;
13736 }
13737
13738 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13739 {
13740 string buckets_obj_id;
13741 rgw_get_buckets_obj(user_id, buckets_obj_id);
13742 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13743 return cls_user_complete_stats_sync(obj);
13744 }
13745
13746 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13747 {
13748 rgw_rados_ref ref;
13749 int r = get_raw_obj_ref(obj, &ref);
13750 if (r < 0) {
13751 return r;
13752 }
13753
13754 librados::ObjectWriteOperation op;
13755 ::cls_user_complete_stats_sync(op);
13756 r = ref.ioctx.operate(ref.oid, &op);
13757 if (r < 0)
13758 return r;
13759
13760 return 0;
13761 }
13762
13763 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13764 {
13765 list<cls_user_bucket_entry> l;
13766 l.push_back(entry);
13767
13768 return cls_user_update_buckets(obj, l, true);
13769 }
13770
13771 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13772 {
13773 rgw_rados_ref ref;
13774 int r = get_system_obj_ref(obj, &ref);
13775 if (r < 0) {
13776 return r;
13777 }
13778
13779 librados::ObjectWriteOperation op;
13780 ::cls_user_remove_bucket(op, bucket);
13781 r = ref.ioctx.operate(ref.oid, &op);
13782 if (r < 0)
13783 return r;
13784
13785 return 0;
13786 }
13787
13788 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
13789 RGWQuotaInfo& bucket_quota)
13790 {
13791 if (!cct->_conf->rgw_dynamic_resharding) {
13792 return 0;
13793 }
13794
13795 bool need_resharding = false;
13796 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13797 uint32_t suggested_num_shards;
13798
13799 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13800 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13801 1, need_resharding, &suggested_num_shards);
13802 if (ret < 0) {
13803 return ret;
13804 }
13805
13806 if (need_resharding) {
13807 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13808 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13809 dendl;
13810 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13811 }
13812
13813 return ret;
13814 }
13815
13816 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13817 {
13818 RGWReshard reshard(this);
13819
13820 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13821
13822 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13823 if (new_num_shards <= num_source_shards) {
13824 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13825 return 0;
13826 }
13827
13828 cls_rgw_reshard_entry entry;
13829 entry.time = real_clock::now();
13830 entry.tenant = bucket_info.owner.tenant;
13831 entry.bucket_name = bucket_info.bucket.name;
13832 entry.bucket_id = bucket_info.bucket.bucket_id;
13833 entry.old_num_shards = num_source_shards;
13834 entry.new_num_shards = new_num_shards;
13835
13836 return reshard.add(entry);
13837 }
13838
13839 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13840 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13841 {
13842 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13843 }
13844
13845 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13846 uint32_t num_shards,
13847 map<int, string>& bucket_objects,
13848 int shard_id) {
13849 if (!num_shards) {
13850 bucket_objects[0] = bucket_oid_base;
13851 } else {
13852 char buf[bucket_oid_base.size() + 32];
13853 if (shard_id < 0) {
13854 for (uint32_t i = 0; i < num_shards; ++i) {
13855 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13856 bucket_objects[i] = buf;
13857 }
13858 } else {
13859 if ((uint32_t)shard_id > num_shards) {
13860 return;
13861 }
13862 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13863 bucket_objects[shard_id] = buf;
13864 }
13865 }
13866 }
13867
13868 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13869 {
13870 const rgw_bucket& bucket = bucket_info.bucket;
13871 string plain_id = bucket.name + ":" + bucket.bucket_id;
13872 if (!bucket_info.num_shards) {
13873 (*result)[0] = plain_id;
13874 } else {
13875 char buf[16];
13876 if (shard_id < 0) {
13877 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13878 snprintf(buf, sizeof(buf), ":%d", i);
13879 (*result)[i] = plain_id + buf;
13880 }
13881 } else {
13882 if ((uint32_t)shard_id > bucket_info.num_shards) {
13883 return;
13884 }
13885 snprintf(buf, sizeof(buf), ":%d", shard_id);
13886 (*result)[shard_id] = plain_id + buf;
13887 }
13888 }
13889 }
13890
13891 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13892 int *shard_id)
13893 {
13894 int r = 0;
13895 switch (bucket_info.bucket_index_shard_hash_type) {
13896 case RGWBucketInfo::MOD:
13897 if (!bucket_info.num_shards) {
13898 if (shard_id) {
13899 *shard_id = -1;
13900 }
13901 } else {
13902 uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
13903 if (shard_id) {
13904 *shard_id = (int)sid;
13905 }
13906 }
13907 break;
13908 default:
13909 r = -ENOTSUP;
13910 }
13911 return r;
13912 }
13913
13914 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13915 int shard_id, string *bucket_obj)
13916 {
13917 if (!num_shards) {
13918 // By default with no sharding, we use the bucket oid as itself
13919 (*bucket_obj) = bucket_oid_base;
13920 } else {
13921 char buf[bucket_oid_base.size() + 32];
13922 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13923 (*bucket_obj) = buf;
13924 }
13925 }
13926
13927 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13928 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13929 {
13930 int r = 0;
13931 switch (hash_type) {
13932 case RGWBucketInfo::MOD:
13933 if (!num_shards) {
13934 // By default with no sharding, we use the bucket oid as itself
13935 (*bucket_obj) = bucket_oid_base;
13936 if (shard_id) {
13937 *shard_id = -1;
13938 }
13939 } else {
13940 uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
13941 char buf[bucket_oid_base.size() + 32];
13942 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13943 (*bucket_obj) = buf;
13944 if (shard_id) {
13945 *shard_id = (int)sid;
13946 }
13947 }
13948 break;
13949 default:
13950 r = -ENOTSUP;
13951 }
13952 return r;
13953 }
13954
13955 void RGWStateLog::oid_str(int shard, string& oid) {
13956 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13957 char buf[16];
13958 snprintf(buf, sizeof(buf), "%d", shard);
13959 oid += buf;
13960 }
13961
13962 int RGWStateLog::get_shard_num(const string& object) {
13963 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13964 return val % num_shards;
13965 }
13966
13967 string RGWStateLog::get_oid(const string& object) {
13968 int shard = get_shard_num(object);
13969 string oid;
13970 oid_str(shard, oid);
13971 return oid;
13972 }
13973
13974 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13975 rgw_pool pool;
13976 store->get_log_pool(pool);
13977 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13978 if (r < 0) {
13979 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13980 return r;
13981 }
13982 return 0;
13983 }
13984
13985 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13986 uint32_t state, bufferlist *bl, uint32_t *check_state)
13987 {
13988 if (client_id.empty() ||
13989 op_id.empty() ||
13990 object.empty()) {
13991 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13992 }
13993
13994 librados::IoCtx ioctx;
13995 int r = open_ioctx(ioctx);
13996 if (r < 0)
13997 return r;
13998
13999 string oid = get_oid(object);
14000
14001 librados::ObjectWriteOperation op;
14002 if (check_state) {
14003 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
14004 }
14005 utime_t ts = ceph_clock_now();
14006 bufferlist nobl;
14007 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
14008 r = ioctx.operate(oid, &op);
14009 if (r < 0) {
14010 return r;
14011 }
14012
14013 return 0;
14014 }
14015
14016 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
14017 {
14018 if (client_id.empty() ||
14019 op_id.empty() ||
14020 object.empty()) {
14021 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
14022 }
14023
14024 librados::IoCtx ioctx;
14025 int r = open_ioctx(ioctx);
14026 if (r < 0)
14027 return r;
14028
14029 string oid = get_oid(object);
14030
14031 librados::ObjectWriteOperation op;
14032 cls_statelog_remove_by_object(op, object, op_id);
14033 r = ioctx.operate(oid, &op);
14034 if (r < 0) {
14035 return r;
14036 }
14037
14038 return 0;
14039 }
14040
14041 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
14042 void **handle)
14043 {
14044 list_state *state = new list_state;
14045 state->client_id = client_id;
14046 state->op_id = op_id;
14047 state->object = object;
14048 if (object.empty()) {
14049 state->cur_shard = 0;
14050 state->max_shard = num_shards - 1;
14051 } else {
14052 state->cur_shard = state->max_shard = get_shard_num(object);
14053 }
14054 *handle = (void *)state;
14055 }
14056
14057 int RGWStateLog::list_entries(void *handle, int max_entries,
14058 list<cls_statelog_entry>& entries,
14059 bool *done)
14060 {
14061 list_state *state = static_cast<list_state *>(handle);
14062
14063 librados::IoCtx ioctx;
14064 int r = open_ioctx(ioctx);
14065 if (r < 0)
14066 return r;
14067
14068 entries.clear();
14069
14070 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
14071 string oid;
14072 oid_str(state->cur_shard, oid);
14073
14074 librados::ObjectReadOperation op;
14075 list<cls_statelog_entry> ents;
14076 bool truncated;
14077 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
14078 max_entries, ents, &state->marker, &truncated);
14079 bufferlist ibl;
14080 r = ioctx.operate(oid, &op, &ibl);
14081 if (r == -ENOENT) {
14082 truncated = false;
14083 r = 0;
14084 }
14085 if (r < 0) {
14086 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
14087 return r;
14088 }
14089
14090 if (!truncated) {
14091 state->marker.clear();
14092 }
14093
14094 max_entries -= ents.size();
14095
14096 entries.splice(entries.end(), ents);
14097
14098 if (truncated)
14099 break;
14100 }
14101
14102 *done = (state->cur_shard > state->max_shard);
14103
14104 return 0;
14105 }
14106
14107 void RGWStateLog::finish_list_entries(void *handle)
14108 {
14109 list_state *state = static_cast<list_state *>(handle);
14110 delete state;
14111 }
14112
14113 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
14114 {
14115 f->open_object_section("statelog_entry");
14116 f->dump_string("client_id", entry.client_id);
14117 f->dump_string("op_id", entry.op_id);
14118 f->dump_string("object", entry.object);
14119 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
14120 if (!dump_entry_internal(entry, f)) {
14121 f->dump_int("state", entry.state);
14122 }
14123 f->close_section();
14124 }
14125
14126 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
14127 {
14128 }
14129
14130 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
14131 {
14132 string s;
14133 switch ((OpState)entry.state) {
14134 case OPSTATE_UNKNOWN:
14135 s = "unknown";
14136 break;
14137 case OPSTATE_IN_PROGRESS:
14138 s = "in-progress";
14139 break;
14140 case OPSTATE_COMPLETE:
14141 s = "complete";
14142 break;
14143 case OPSTATE_ERROR:
14144 s = "error";
14145 break;
14146 case OPSTATE_ABORT:
14147 s = "abort";
14148 break;
14149 case OPSTATE_CANCELLED:
14150 s = "cancelled";
14151 break;
14152 default:
14153 s = "invalid";
14154 }
14155 f->dump_string("state", s);
14156 return true;
14157 }
14158
14159 int RGWOpState::state_from_str(const string& s, OpState *state)
14160 {
14161 if (s == "unknown") {
14162 *state = OPSTATE_UNKNOWN;
14163 } else if (s == "in-progress") {
14164 *state = OPSTATE_IN_PROGRESS;
14165 } else if (s == "complete") {
14166 *state = OPSTATE_COMPLETE;
14167 } else if (s == "error") {
14168 *state = OPSTATE_ERROR;
14169 } else if (s == "abort") {
14170 *state = OPSTATE_ABORT;
14171 } else if (s == "cancelled") {
14172 *state = OPSTATE_CANCELLED;
14173 } else {
14174 return -EINVAL;
14175 }
14176
14177 return 0;
14178 }
14179
14180 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
14181 {
14182 uint32_t s = (uint32_t)state;
14183 return store_entry(client_id, op_id, object, s, NULL, NULL);
14184 }
14185
14186 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
14187 {
14188 uint32_t s = (uint32_t)state;
14189 return store_entry(client_id, op_id, object, s, NULL, &s);
14190 }
14191
14192 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
14193 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
14194 {
14195 cct = store->ctx();
14196 cur_state = RGWOpState::OPSTATE_UNKNOWN;
14197 }
14198
14199 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
14200 last_update = real_clock::now();
14201 cur_state = state;
14202 return os.set_state(client_id, op_id, object, state);
14203 }
14204
14205 int RGWOpStateSingleOp::renew_state() {
14206 real_time now = real_clock::now();
14207
14208 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
14209
14210 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
14211 return 0;
14212 }
14213
14214 last_update = now;
14215 return os.renew_state(client_id, op_id, object, cur_state);
14216 }
14217
14218
14219 uint64_t RGWRados::instance_id()
14220 {
14221 return get_rados_handle()->get_instance_id();
14222 }
14223
14224 uint64_t RGWRados::next_bucket_id()
14225 {
14226 Mutex::Locker l(bucket_id_lock);
14227 return ++max_bucket_id;
14228 }
14229
14230 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
14231 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
14232 {
14233 RGWRados *store = NULL;
14234 if (!use_cache) {
14235 store = new RGWRados;
14236 } else {
14237 store = new RGWCache<RGWRados>;
14238 }
14239
14240 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
14241 delete store;
14242 return NULL;
14243 }
14244
14245 return store;
14246 }
14247
14248 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
14249 {
14250 RGWRados *store = NULL;
14251 store = new RGWRados;
14252
14253 store->set_context(cct);
14254
14255 if (store->init_rados() < 0) {
14256 delete store;
14257 return NULL;
14258 }
14259
14260 return store;
14261 }
14262
14263 void RGWStoreManager::close_storage(RGWRados *store)
14264 {
14265 if (!store)
14266 return;
14267
14268 store->finalize();
14269
14270 delete store;
14271 }
14272
14273 librados::Rados* RGWRados::get_rados_handle()
14274 {
14275 if (rados.size() == 1) {
14276 return &rados[0];
14277 } else {
14278 handle_lock.get_read();
14279 pthread_t id = pthread_self();
14280 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
14281
14282 if (it != rados_map.end()) {
14283 handle_lock.put_read();
14284 return &rados[it->second];
14285 } else {
14286 handle_lock.put_read();
14287 handle_lock.get_write();
14288 const uint32_t handle = next_rados_handle;
14289 rados_map[id] = handle;
14290 if (++next_rados_handle == rados.size()) {
14291 next_rados_handle = 0;
14292 }
14293 handle_lock.put_write();
14294 return &rados[handle];
14295 }
14296 }
14297 }
14298
14299 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
14300 {
14301 rgw_rados_ref ref;
14302 int ret = get_raw_obj_ref(obj, &ref);
14303 if (ret < 0) {
14304 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14305 return ret;
14306 }
14307
14308 ObjectWriteOperation op;
14309 list<string> prefixes;
14310 cls_rgw_remove_obj(op, prefixes);
14311
14312 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14313 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14314 if (ret < 0) {
14315 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14316 c->release();
14317 return ret;
14318 }
14319
14320 handles.push_back(c);
14321
14322 return 0;
14323 }
14324
14325 int RGWRados::delete_obj_aio(const rgw_obj& obj,
14326 RGWBucketInfo& bucket_info, RGWObjState *astate,
14327 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
14328 {
14329 rgw_rados_ref ref;
14330 int ret = get_obj_head_ref(bucket_info, obj, &ref);
14331 if (ret < 0) {
14332 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14333 return ret;
14334 }
14335
14336 if (keep_index_consistent) {
14337 RGWRados::Bucket bop(this, bucket_info);
14338 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
14339
14340 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
14341 if (ret < 0) {
14342 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
14343 return ret;
14344 }
14345 }
14346
14347 ObjectWriteOperation op;
14348 list<string> prefixes;
14349 cls_rgw_remove_obj(op, prefixes);
14350
14351 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14352 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14353 if (ret < 0) {
14354 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14355 c->release();
14356 return ret;
14357 }
14358
14359 handles.push_back(c);
14360
14361 if (keep_index_consistent) {
14362 ret = delete_obj_index(obj);
14363 if (ret < 0) {
14364 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
14365 return ret;
14366 }
14367 }
14368 return ret;
14369 }
14370
14371 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
14372 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
14373 if (value != attrs.end()) {
14374 bufferlist::iterator bliter = value->second.begin();
14375 try {
14376 ::decode(cs_info, bliter);
14377 } catch (buffer::error& err) {
14378 return -EIO;
14379 }
14380 if (cs_info.blocks.size() == 0) {
14381 return -EIO;
14382 }
14383 if (cs_info.compression_type != "none")
14384 need_decompress = true;
14385 else
14386 need_decompress = false;
14387 return 0;
14388 } else {
14389 need_decompress = false;
14390 return 0;
14391 }
14392 }
14393
14394 bool RGWRados::call(std::string command, cmdmap_t& cmdmap, std::string format,
14395 bufferlist& out)
14396 {
14397 if (command == "cache list") {
14398 boost::optional<std::string> filter;
14399 auto i = cmdmap.find("filter");
14400 if (i != cmdmap.cend()) {
14401 filter = boost::get<std::string>(i->second);
14402 }
14403 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
14404 if (f) {
14405 f->open_array_section("cache_entries");
14406 call_list(filter, f.get());
14407 f->close_section();
14408 f->flush(out);
14409 return true;
14410 } else {
14411 out.append("Unable to create Formatter.\n");
14412 return false;
14413 }
14414 } else if (command == "cache inspect") {
14415 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
14416 if (f) {
14417 const auto& target = boost::get<std::string>(cmdmap["target"]);
14418 if (call_inspect(target, f.get())) {
14419 f->flush(out);
14420 return true;
14421 } else {
14422 out.append(string("Unable to find entry ") + target + string(".\n"));
14423 return false;
14424 }
14425 } else {
14426 out.append("Unable to create Formatter.\n");
14427 return false;
14428 }
14429 } else if (command == "cache erase") {
14430 const auto& target = boost::get<std::string>(cmdmap["target"]);
14431 if (call_erase(target)) {
14432 return true;
14433 } else {
14434 out.append(string("Unable to find entry ") + target + string(".\n"));
14435 return false;
14436 }
14437 } else if (command == "cache zap") {
14438 call_zap();
14439 return true;
14440 }
14441 return false;
14442 }
14443
14444 void RGWRados::call_list(const boost::optional<std::string>&,
14445 ceph::Formatter*)
14446 {
14447 return;
14448 }
14449
14450 bool RGWRados::call_inspect(const std::string&, Formatter*)
14451 {
14452 return false;
14453 }
14454
14455 bool RGWRados::call_erase(const std::string&) {
14456 return false;
14457 }
14458
14459 void RGWRados::call_zap() {
14460 return;
14461 }