]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
update sources to v12.2.3
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1
2 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
3 // vim: ts=8 sw=2 smarttab
4
5 #include "include/compat.h"
6 #include <errno.h>
7 #include <stdlib.h>
8 #include <sys/types.h>
9 #include <boost/algorithm/string.hpp>
10
11 #include <boost/format.hpp>
12 #include <boost/optional.hpp>
13 #include <boost/utility/in_place_factory.hpp>
14
15 #include "common/ceph_json.h"
16 #include "common/utf8.h"
17
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
21 #include "common/Finisher.h"
22
23 #include "rgw_rados.h"
24 #include "rgw_cache.h"
25 #include "rgw_acl.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_metadata.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32
33 #include "cls/rgw/cls_rgw_ops.h"
34 #include "cls/rgw/cls_rgw_types.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/statelog/cls_statelog_client.h"
41 #include "cls/timeindex/cls_timeindex_client.h"
42 #include "cls/lock/cls_lock_client.h"
43 #include "cls/user/cls_user_client.h"
44 #include "osd/osd_types.h"
45
46 #include "rgw_tools.h"
47 #include "rgw_coroutine.h"
48 #include "rgw_compression.h"
49
50 #undef fork // fails to compile RGWPeriod::fork() below
51
52 #include "common/Clock.h"
53
54 #include "include/rados/librados.hpp"
55 using namespace librados;
56
57 #include <string>
58 #include <iostream>
59 #include <vector>
60 #include <atomic>
61 #include <list>
62 #include <map>
63 #include "auth/Crypto.h" // get_random_bytes()
64
65 #include "rgw_log.h"
66
67 #include "rgw_gc.h"
68 #include "rgw_lc.h"
69
70 #include "rgw_object_expirer_core.h"
71 #include "rgw_sync.h"
72 #include "rgw_data_sync.h"
73 #include "rgw_realm_watcher.h"
74 #include "rgw_reshard.h"
75
76 #include "compressor/Compressor.h"
77
78 #define dout_context g_ceph_context
79 #define dout_subsys ceph_subsys_rgw
80
81 using namespace std;
82
// File-scope names used when building RADOS object ids and pool names.
static string notify_oid_prefix = "notify";
static string *notify_oids = NULL;
static string shadow_ns = "shadow";
static string dir_oid_prefix = ".dir.";
static string default_storage_pool_suffix = "rgw.buckets.data";
static string default_bucket_index_pool_suffix = "rgw.buckets.index";
static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
static string avail_pools = ".pools.avail";

// Object-id prefixes and default object ids for the system metadata objects
// (zone, zonegroup/region, realm, period). Several are returned by the
// get_*_oid_prefix() / get_default_oid() accessors defined in this file.
static string zone_info_oid_prefix = "zone_info.";
static string zone_names_oid_prefix = "zone_names.";
static string region_info_oid_prefix = "region_info.";
static string zone_group_info_oid_prefix = "zonegroup_info.";
static string realm_names_oid_prefix = "realms_names.";
static string realm_info_oid_prefix = "realms.";
static string default_region_info_oid = "default.region";
static string default_zone_group_info_oid = "default.zonegroup";
static string period_info_oid_prefix = "periods.";
static string period_latest_epoch_info_oid = ".latest_epoch";
static string region_map_oid = "region_map";
static string zonegroup_map_oid = "zonegroup_map";
static string log_lock_name = "rgw_log_lock";
static string default_realm_info_oid = "default.realm";
// Names given to the zonegroup and zone created by RGWZoneGroup::create_default().
const string default_zonegroup_name = "default";
const string default_zone_name = "default";
static string zonegroup_names_oid_prefix = "zonegroups_names.";
static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
#define RGW_USAGE_OBJ_PREFIX "usage."
#define FIRST_EPOCH 1
// Fallback root pools, used when the matching rgw_*_root_pool config option is empty.
static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
116
117 #define RGW_STATELOG_OBJ_PREFIX "statelog."
118
119 #define dout_subsys ceph_subsys_rgw
120
121
122 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
123 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
124 {
125 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
126 RGWZonePlacementInfo placement;
127 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
128 return false;
129 }
130
131 if (!obj.in_extra_data) {
132 *pool = placement.data_pool;
133 } else {
134 *pool = placement.get_data_extra_pool();
135 }
136 }
137
138 return true;
139 }
140
141 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
142 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
143 {
144 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
145
146 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
147 }
148
149 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
150 {
151 if (!is_raw) {
152 rgw_raw_obj r;
153 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
154 return r;
155 }
156 return raw_obj;
157 }
158
159 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
160 {
161 if (!is_raw) {
162 rgw_raw_obj r;
163 store->obj_to_raw(placement_rule, obj, &r);
164 return r;
165 }
166 return raw_obj;
167 }
168
169 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
170 {
171 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
172 if (r == -ENOENT && create) {
173 r = rados->pool_create(pool.name.c_str());
174 if (r < 0 && r != -EEXIST) {
175 return r;
176 }
177
178 r = rados->ioctx_create(pool.name.c_str(), ioctx);
179 if (r < 0) {
180 return r;
181 }
182
183 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
184 if (r < 0 && r != -EOPNOTSUPP) {
185 return r;
186 }
187 } else if (r < 0) {
188 return r;
189 }
190 if (!pool.ns.empty()) {
191 ioctx.set_namespace(pool.ns);
192 }
193 return 0;
194 }
195
196 template<>
197 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
198 RWLock::WLocker wl(lock);
199 auto iter = objs_state.find(obj);
200 if (iter == objs_state.end()) {
201 return;
202 }
203 bool is_atomic = iter->second.is_atomic;
204 bool prefetch_data = iter->second.prefetch_data;
205
206 objs_state.erase(iter);
207
208 if (is_atomic || prefetch_data) {
209 auto& s = objs_state[obj];
210 s.is_atomic = is_atomic;
211 s.prefetch_data = prefetch_data;
212 }
213 }
214
215 template<>
216 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
217 RWLock::WLocker wl(lock);
218 auto iter = objs_state.find(obj);
219 if (iter == objs_state.end()) {
220 return;
221 }
222
223 objs_state.erase(iter);
224 }
225
226 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
227 encode_json("default_zonegroup", default_zonegroup, f);
228 }
229
230 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
231
232 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
233 /* backward compatability with region */
234 if (default_zonegroup.empty()) {
235 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
236 }
237 }
238
239 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
240 {
241 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
242 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
243 }
244
245 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
246 }
247
248 int RGWZoneGroup::create_default(bool old_format)
249 {
250 name = default_zonegroup_name;
251 is_master = true;
252
253 RGWZoneGroupPlacementTarget placement_target;
254 placement_target.name = "default-placement";
255 placement_targets[placement_target.name] = placement_target;
256 default_placement = "default-placement";
257
258 RGWZoneParams zone_params(default_zone_name);
259
260 int r = zone_params.init(cct, store, false);
261 if (r < 0) {
262 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
263 return r;
264 }
265
266 r = zone_params.create_default();
267 if (r < 0 && r != -EEXIST) {
268 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
269 return r;
270 } else if (r == -EEXIST) {
271 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
272 zone_params.clear_id();
273 r = zone_params.init(cct, store);
274 if (r < 0) {
275 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 }
278 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
279 << dendl;
280 }
281
282 RGWZone& default_zone = zones[zone_params.get_id()];
283 default_zone.name = zone_params.get_name();
284 default_zone.id = zone_params.get_id();
285 master_zone = default_zone.id;
286
287 r = create();
288 if (r < 0 && r != -EEXIST) {
289 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
290 return r;
291 }
292
293 if (r == -EEXIST) {
294 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
295 id.clear();
296 r = init(cct, store);
297 if (r < 0) {
298 return r;
299 }
300 }
301
302 if (old_format) {
303 name = id;
304 }
305
306 post_process_params();
307
308 return 0;
309 }
310
311 const string RGWZoneGroup::get_default_oid(bool old_region_format)
312 {
313 if (old_region_format) {
314 if (cct->_conf->rgw_default_region_info_oid.empty()) {
315 return default_region_info_oid;
316 }
317 return cct->_conf->rgw_default_region_info_oid;
318 }
319
320 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
321
322 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
323 default_oid = default_zone_group_info_oid;
324 }
325
326 default_oid += "." + realm_id;
327
328 return default_oid;
329 }
330
331 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
332 {
333 if (old_region_format) {
334 return region_info_oid_prefix;
335 }
336 return zone_group_info_oid_prefix;
337 }
338
339 const string& RGWZoneGroup::get_names_oid_prefix()
340 {
341 return zonegroup_names_oid_prefix;
342 }
343
344 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
345 return cct->_conf->rgw_zonegroup;
346 }
347
348 int RGWZoneGroup::equals(const string& other_zonegroup) const
349 {
350 if (is_master && other_zonegroup.empty())
351 return true;
352
353 return (id == other_zonegroup);
354 }
355
356 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
357 const list<string>& endpoints, const string *ptier_type,
358 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
359 {
360 auto& zone_id = zone_params.get_id();
361 auto& zone_name = zone_params.get_name();
362
363 // check for duplicate zone name on insert
364 if (!zones.count(zone_id)) {
365 for (const auto& zone : zones) {
366 if (zone.second.name == zone_name) {
367 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
368 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
369 return -EEXIST;
370 }
371 }
372 }
373
374 if (is_master) {
375 if (*is_master) {
376 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
377 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
378 }
379 master_zone = zone_params.get_id();
380 } else if (master_zone == zone_params.get_id()) {
381 master_zone.clear();
382 }
383 }
384
385 RGWZone& zone = zones[zone_params.get_id()];
386 zone.name = zone_params.get_name();
387 zone.id = zone_params.get_id();
388 if (!endpoints.empty()) {
389 zone.endpoints = endpoints;
390 }
391 if (read_only) {
392 zone.read_only = *read_only;
393 }
394 if (ptier_type) {
395 zone.tier_type = *ptier_type;
396 }
397
398 if (psync_from_all) {
399 zone.sync_from_all = *psync_from_all;
400 }
401
402 for (auto add : sync_from) {
403 zone.sync_from.insert(add);
404 }
405
406 for (auto rm : sync_from_rm) {
407 zone.sync_from.erase(rm);
408 }
409
410 post_process_params();
411
412 return update();
413 }
414
415
416 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
417 {
418 RGWZone& zone = zones[zone_params.get_id()];
419 zone.name = zone_params.get_name();
420
421 return update();
422 }
423
424 void RGWZoneGroup::post_process_params()
425 {
426 bool log_data = zones.size() > 1;
427
428 if (master_zone.empty()) {
429 map<string, RGWZone>::iterator iter = zones.begin();
430 if (iter != zones.end()) {
431 master_zone = iter->first;
432 }
433 }
434
435 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
436 RGWZone& zone = iter->second;
437 zone.log_data = log_data;
438
439 RGWZoneParams zone_params(zone.id, zone.name);
440 int ret = zone_params.init(cct, store);
441 if (ret < 0) {
442 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
443 continue;
444 }
445
446 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
447 iter != zone_params.placement_pools.end(); ++iter) {
448 const string& placement_name = iter->first;
449 if (placement_targets.find(placement_name) == placement_targets.end()) {
450 RGWZoneGroupPlacementTarget placement_target;
451 placement_target.name = placement_name;
452 placement_targets[placement_name] = placement_target;
453 }
454 }
455 }
456
457 if (default_placement.empty() && !placement_targets.empty()) {
458 default_placement = placement_targets.begin()->first;
459 }
460 }
461
462 int RGWZoneGroup::remove_zone(const std::string& zone_id)
463 {
464 map<string, RGWZone>::iterator iter = zones.find(zone_id);
465 if (iter == zones.end()) {
466 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
467 << name << dendl;
468 return -ENOENT;
469 }
470
471 zones.erase(iter);
472
473 post_process_params();
474
475 return update();
476 }
477
478 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
479 {
480 if (realm_id.empty()) {
481 /* try using default realm */
482 RGWRealm realm;
483 int ret = realm.init(cct, store);
484 // no default realm exist
485 if (ret < 0) {
486 return read_id(default_zonegroup_name, default_id);
487 }
488 realm_id = realm.get_id();
489 }
490
491 return RGWSystemMetaObj::read_default_id(default_id, old_format);
492 }
493
494 int RGWZoneGroup::set_as_default(bool exclusive)
495 {
496 if (realm_id.empty()) {
497 /* try using default realm */
498 RGWRealm realm;
499 int ret = realm.init(cct, store);
500 if (ret < 0) {
501 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
502 return -EINVAL;
503 }
504 realm_id = realm.get_id();
505 }
506
507 return RGWSystemMetaObj::set_as_default(exclusive);
508 }
509
510 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
511 {
512 cct = _cct;
513 store = _store;
514
515 if (!setup_obj)
516 return 0;
517
518 if (old_format && id.empty()) {
519 id = name;
520 }
521
522 if (id.empty()) {
523 int r;
524 if (name.empty()) {
525 name = get_predefined_name(cct);
526 }
527 if (name.empty()) {
528 r = use_default(old_format);
529 if (r < 0) {
530 return r;
531 }
532 } else if (!old_format) {
533 r = read_id(name, id);
534 if (r < 0) {
535 if (r != -ENOENT) {
536 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
537 }
538 return r;
539 }
540 }
541 }
542
543 return read_info(id, old_format);
544 }
545
546 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
547 {
548 auto pool = get_pool(cct);
549 bufferlist bl;
550 RGWObjectCtx obj_ctx(store);
551 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
552 if (ret < 0)
553 return ret;
554
555 try {
556 bufferlist::iterator iter = bl.begin();
557 ::decode(default_info, iter);
558 } catch (buffer::error& err) {
559 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
560 return -EIO;
561 }
562
563 return 0;
564 }
565
566 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
567 {
568 RGWDefaultSystemMetaObjInfo default_info;
569
570 int ret = read_default(default_info, get_default_oid(old_format));
571 if (ret < 0) {
572 return ret;
573 }
574
575 default_id = default_info.default_id;
576
577 return 0;
578 }
579
580 int RGWSystemMetaObj::use_default(bool old_format)
581 {
582 return read_default_id(id, old_format);
583 }
584
585 int RGWSystemMetaObj::set_as_default(bool exclusive)
586 {
587 string oid = get_default_oid();
588
589 rgw_pool pool(get_pool(cct));
590 bufferlist bl;
591
592 RGWDefaultSystemMetaObjInfo default_info;
593 default_info.default_id = id;
594
595 ::encode(default_info, bl);
596
597 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
598 exclusive, NULL, real_time(), NULL);
599 if (ret < 0)
600 return ret;
601
602 return 0;
603 }
604
605 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
606 {
607 rgw_pool pool(get_pool(cct));
608 bufferlist bl;
609
610 string oid = get_names_oid_prefix() + obj_name;
611
612 RGWObjectCtx obj_ctx(store);
613 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
614 if (ret < 0) {
615 return ret;
616 }
617
618 RGWNameToId nameToId;
619 try {
620 bufferlist::iterator iter = bl.begin();
621 ::decode(nameToId, iter);
622 } catch (buffer::error& err) {
623 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
624 return -EIO;
625 }
626 object_id = nameToId.obj_id;
627 return 0;
628 }
629
630 int RGWSystemMetaObj::delete_obj(bool old_format)
631 {
632 rgw_pool pool(get_pool(cct));
633
634 /* check to see if obj is the default */
635 RGWDefaultSystemMetaObjInfo default_info;
636 int ret = read_default(default_info, get_default_oid(old_format));
637 if (ret < 0 && ret != -ENOENT)
638 return ret;
639 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
640 string oid = get_default_oid(old_format);
641 rgw_raw_obj default_named_obj(pool, oid);
642 ret = store->delete_system_obj(default_named_obj);
643 if (ret < 0) {
644 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
645 return ret;
646 }
647 }
648 if (!old_format) {
649 string oid = get_names_oid_prefix() + name;
650 rgw_raw_obj object_name(pool, oid);
651 ret = store->delete_system_obj(object_name);
652 if (ret < 0) {
653 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
654 return ret;
655 }
656 }
657
658 string oid = get_info_oid_prefix(old_format);
659 if (old_format) {
660 oid += name;
661 } else {
662 oid += id;
663 }
664
665 rgw_raw_obj object_id(pool, oid);
666 ret = store->delete_system_obj(object_id);
667 if (ret < 0) {
668 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
669 }
670
671 return ret;
672 }
673
674 int RGWSystemMetaObj::store_name(bool exclusive)
675 {
676 rgw_pool pool(get_pool(cct));
677 string oid = get_names_oid_prefix() + name;
678
679 RGWNameToId nameToId;
680 nameToId.obj_id = id;
681
682 bufferlist bl;
683 ::encode(nameToId, bl);
684 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
685 }
686
687 int RGWSystemMetaObj::rename(const string& new_name)
688 {
689 string new_id;
690 int ret = read_id(new_name, new_id);
691 if (!ret) {
692 return -EEXIST;
693 }
694 if (ret < 0 && ret != -ENOENT) {
695 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
696 return ret;
697 }
698 string old_name = name;
699 name = new_name;
700 ret = update();
701 if (ret < 0) {
702 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 ret = store_name(true);
706 if (ret < 0) {
707 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
708 return ret;
709 }
710 /* delete old name */
711 rgw_pool pool(get_pool(cct));
712 string oid = get_names_oid_prefix() + old_name;
713 rgw_raw_obj old_name_obj(pool, oid);
714 ret = store->delete_system_obj(old_name_obj);
715 if (ret < 0) {
716 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
717 return ret;
718 }
719
720 return ret;
721 }
722
723 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
724 {
725 rgw_pool pool(get_pool(cct));
726
727 bufferlist bl;
728
729 string oid = get_info_oid_prefix(old_format) + obj_id;
730
731 RGWObjectCtx obj_ctx(store);
732 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
733 if (ret < 0) {
734 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
735 return ret;
736 }
737
738 try {
739 bufferlist::iterator iter = bl.begin();
740 ::decode(*this, iter);
741 } catch (buffer::error& err) {
742 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
743 return -EIO;
744 }
745
746 return 0;
747 }
748
749 int RGWSystemMetaObj::read()
750 {
751 int ret = read_id(name, id);
752 if (ret < 0) {
753 return ret;
754 }
755
756 return read_info(id);
757 }
758
759 int RGWSystemMetaObj::create(bool exclusive)
760 {
761 int ret;
762
763 /* check to see the name is not used */
764 ret = read_id(name, id);
765 if (exclusive && ret == 0) {
766 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
767 return -EEXIST;
768 } else if ( ret < 0 && ret != -ENOENT) {
769 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
770 return ret;
771 }
772
773 if (id.empty()) {
774 /* create unique id */
775 uuid_d new_uuid;
776 char uuid_str[37];
777 new_uuid.generate_random();
778 new_uuid.print(uuid_str);
779 id = uuid_str;
780 }
781
782 ret = store_info(exclusive);
783 if (ret < 0) {
784 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
785 return ret;
786 }
787
788 return store_name(exclusive);
789 }
790
791 int RGWSystemMetaObj::store_info(bool exclusive)
792 {
793 rgw_pool pool(get_pool(cct));
794
795 string oid = get_info_oid_prefix() + id;
796
797 bufferlist bl;
798 ::encode(*this, bl);
799 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
800 }
801
802 int RGWSystemMetaObj::write(bool exclusive)
803 {
804 int ret = store_info(exclusive);
805 if (ret < 0) {
806 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
807 return ret;
808 }
809 ret = store_name(exclusive);
810 if (ret < 0) {
811 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
812 return ret;
813 }
814 return 0;
815 }
816
817
818 const string& RGWRealm::get_predefined_name(CephContext *cct) {
819 return cct->_conf->rgw_realm;
820 }
821
822 int RGWRealm::create(bool exclusive)
823 {
824 int ret = RGWSystemMetaObj::create(exclusive);
825 if (ret < 0) {
826 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
827 return ret;
828 }
829 // create the control object for watch/notify
830 ret = create_control(exclusive);
831 if (ret < 0) {
832 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
833 return ret;
834 }
835 RGWPeriod period;
836 if (current_period.empty()) {
837 /* create new period for the realm */
838 ret = period.init(cct, store, id, name, false);
839 if (ret < 0 ) {
840 return ret;
841 }
842 ret = period.create(true);
843 if (ret < 0) {
844 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
845 return ret;
846 }
847 } else {
848 period = RGWPeriod(current_period, 0);
849 int ret = period.init(cct, store, id, name);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
852 return ret;
853 }
854 }
855 ret = set_current_period(period);
856 if (ret < 0) {
857 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
858 return ret;
859 }
860 // try to set as default. may race with another create, so pass exclusive=true
861 // so we don't override an existing default
862 ret = set_as_default(true);
863 if (ret < 0 && ret != -EEXIST) {
864 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
865 }
866
867 return 0;
868 }
869
870 int RGWRealm::delete_obj()
871 {
872 int ret = RGWSystemMetaObj::delete_obj();
873 if (ret < 0) {
874 return ret;
875 }
876 return delete_control();
877 }
878
879 int RGWRealm::create_control(bool exclusive)
880 {
881 auto pool = rgw_pool{get_pool(cct)};
882 auto oid = get_control_oid();
883 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
884 nullptr, real_time(), nullptr);
885 }
886
887 int RGWRealm::delete_control()
888 {
889 auto pool = rgw_pool{get_pool(cct)};
890 auto obj = rgw_raw_obj{pool, get_control_oid()};
891 return store->delete_system_obj(obj);
892 }
893
894 rgw_pool RGWRealm::get_pool(CephContext *cct)
895 {
896 if (cct->_conf->rgw_realm_root_pool.empty()) {
897 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
898 }
899 return rgw_pool(cct->_conf->rgw_realm_root_pool);
900 }
901
902 const string RGWRealm::get_default_oid(bool old_format)
903 {
904 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
905 return default_realm_info_oid;
906 }
907 return cct->_conf->rgw_default_realm_info_oid;
908 }
909
910 const string& RGWRealm::get_names_oid_prefix()
911 {
912 return realm_names_oid_prefix;
913 }
914
915 const string& RGWRealm::get_info_oid_prefix(bool old_format)
916 {
917 return realm_info_oid_prefix;
918 }
919
920 int RGWRealm::set_current_period(RGWPeriod& period)
921 {
922 // update realm epoch to match the period's
923 if (epoch > period.get_realm_epoch()) {
924 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
925 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
926 return -EINVAL;
927 }
928 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
929 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
930 << period.get_realm_epoch() << ", but different period id "
931 << period.get_id() << " != " << current_period << dendl;
932 return -EINVAL;
933 }
934
935 epoch = period.get_realm_epoch();
936 current_period = period.get_id();
937
938 int ret = update();
939 if (ret < 0) {
940 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
941 return ret;
942 }
943
944 ret = period.reflect();
945 if (ret < 0) {
946 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
947 return ret;
948 }
949
950 return 0;
951 }
952
953 string RGWRealm::get_control_oid()
954 {
955 return get_info_oid_prefix() + id + ".control";
956 }
957
958 int RGWRealm::notify_zone(bufferlist& bl)
959 {
960 // open a context on the realm's pool
961 rgw_pool pool{get_pool(cct)};
962 librados::IoCtx ctx;
963 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
964 if (r < 0) {
965 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
966 return r;
967 }
968 // send a notify on the realm object
969 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
970 if (r < 0) {
971 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
972 return r;
973 }
974 return 0;
975 }
976
977 int RGWRealm::notify_new_period(const RGWPeriod& period)
978 {
979 bufferlist bl;
980 // push the period to dependent zonegroups/zones
981 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
982 ::encode(period, bl);
983 // reload the gateway with the new period
984 ::encode(RGWRealmNotify::Reload, bl);
985
986 return notify_zone(bl);
987 }
988
989 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
990 {
991 if (realm_id.empty()) {
992 return "period_config.default";
993 }
994 return "period_config." + realm_id;
995 }
996
997 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
998 {
999 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1000 if (pool_name.empty()) {
1001 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1002 }
1003 return {pool_name};
1004 }
1005
1006 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1007 {
1008 RGWObjectCtx obj_ctx(store);
1009 const auto& pool = get_pool(store->ctx());
1010 const auto& oid = get_oid(realm_id);
1011 bufferlist bl;
1012
1013 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1014 if (ret < 0) {
1015 return ret;
1016 }
1017 try {
1018 bufferlist::iterator iter = bl.begin();
1019 ::decode(*this, iter);
1020 } catch (buffer::error& err) {
1021 return -EIO;
1022 }
1023 return 0;
1024 }
1025
1026 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1027 {
1028 const auto& pool = get_pool(store->ctx());
1029 const auto& oid = get_oid(realm_id);
1030 bufferlist bl;
1031 ::encode(*this, bl);
1032 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1033 false, nullptr, real_time(), nullptr);
1034 }
1035
1036 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1037 const string& period_realm_name, bool setup_obj)
1038 {
1039 cct = _cct;
1040 store = _store;
1041 realm_id = period_realm_id;
1042 realm_name = period_realm_name;
1043
1044 if (!setup_obj)
1045 return 0;
1046
1047 return init(_cct, _store, setup_obj);
1048 }
1049
1050
1051 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1052 {
1053 cct = _cct;
1054 store = _store;
1055
1056 if (!setup_obj)
1057 return 0;
1058
1059 if (id.empty()) {
1060 RGWRealm realm(realm_id, realm_name);
1061 int ret = realm.init(cct, store);
1062 if (ret < 0) {
1063 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1064 cpp_strerror(-ret) << dendl;
1065 return ret;
1066 }
1067 id = realm.get_current_period();
1068 realm_id = realm.get_id();
1069 }
1070
1071 if (!epoch) {
1072 int ret = use_latest_epoch();
1073 if (ret < 0) {
1074 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1075 << " : " << cpp_strerror(-ret) << dendl;
1076 return ret;
1077 }
1078 }
1079
1080 return read_info();
1081 }
1082
1083
1084 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1085 map<string, RGWZoneGroup>::const_iterator iter;
1086 if (!zonegroup_id.empty()) {
1087 iter = period_map.zonegroups.find(zonegroup_id);
1088 } else {
1089 iter = period_map.zonegroups.find("default");
1090 }
1091 if (iter != period_map.zonegroups.end()) {
1092 zonegroup = iter->second;
1093 return 0;
1094 }
1095
1096 return -ENOENT;
1097 }
1098
1099 const string& RGWPeriod::get_latest_epoch_oid()
1100 {
1101 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1102 return period_latest_epoch_info_oid;
1103 }
1104 return cct->_conf->rgw_period_latest_epoch_info_oid;
1105 }
1106
1107 const string& RGWPeriod::get_info_oid_prefix()
1108 {
1109 return period_info_oid_prefix;
1110 }
1111
1112 const string RGWPeriod::get_period_oid_prefix()
1113 {
1114 return get_info_oid_prefix() + id;
1115 }
1116
1117 const string RGWPeriod::get_period_oid()
1118 {
1119 std::ostringstream oss;
1120 oss << get_period_oid_prefix();
1121 // skip the epoch for the staging period
1122 if (id != get_staging_id(realm_id))
1123 oss << "." << epoch;
1124 return oss.str();
1125 }
1126
1127 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1128 RGWObjVersionTracker *objv)
1129 {
1130 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1131
1132 rgw_pool pool(get_pool(cct));
1133 bufferlist bl;
1134 RGWObjectCtx obj_ctx(store);
1135 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
1136 if (ret < 0) {
1137 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1138 return ret;
1139 }
1140 try {
1141 bufferlist::iterator iter = bl.begin();
1142 ::decode(info, iter);
1143 } catch (buffer::error& err) {
1144 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1145 return -EIO;
1146 }
1147
1148 return 0;
1149 }
1150
1151 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1152 {
1153 RGWPeriodLatestEpochInfo info;
1154
1155 int ret = read_latest_epoch(info);
1156 if (ret < 0) {
1157 return ret;
1158 }
1159
1160 latest_epoch = info.epoch;
1161
1162 return 0;
1163 }
1164
1165 int RGWPeriod::use_latest_epoch()
1166 {
1167 RGWPeriodLatestEpochInfo info;
1168 int ret = read_latest_epoch(info);
1169 if (ret < 0) {
1170 return ret;
1171 }
1172
1173 epoch = info.epoch;
1174
1175 return 0;
1176 }
1177
1178 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1179 RGWObjVersionTracker *objv)
1180 {
1181 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1182
1183 rgw_pool pool(get_pool(cct));
1184 bufferlist bl;
1185
1186 RGWPeriodLatestEpochInfo info;
1187 info.epoch = epoch;
1188
1189 ::encode(info, bl);
1190
1191 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1192 exclusive, objv, real_time(), nullptr);
1193 }
1194
1195 int RGWPeriod::update_latest_epoch(epoch_t epoch)
1196 {
1197 static constexpr int MAX_RETRIES = 20;
1198
1199 for (int i = 0; i < MAX_RETRIES; i++) {
1200 RGWPeriodLatestEpochInfo info;
1201 RGWObjVersionTracker objv;
1202 bool exclusive = false;
1203
1204 // read existing epoch
1205 int r = read_latest_epoch(info, &objv);
1206 if (r == -ENOENT) {
1207 // use an exclusive create to set the epoch atomically
1208 exclusive = true;
1209 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1210 << " for period=" << id << dendl;
1211 } else if (r < 0) {
1212 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1213 return r;
1214 } else if (epoch <= info.epoch) {
1215 r = -EEXIST; // fail with EEXIST if epoch is not newer
1216 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1217 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1218 return r;
1219 } else {
1220 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1221 << " -> " << epoch << " on period=" << id << dendl;
1222 }
1223
1224 r = set_latest_epoch(epoch, exclusive, &objv);
1225 if (r == -EEXIST) {
1226 continue; // exclusive create raced with another update, retry
1227 } else if (r == -ECANCELED) {
1228 continue; // write raced with a conflicting version, retry
1229 }
1230 if (r < 0) {
1231 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1232 return r;
1233 }
1234 return 0; // return success
1235 }
1236
1237 return -ECANCELED; // fail after max retries
1238 }
1239
1240 int RGWPeriod::delete_obj()
1241 {
1242 rgw_pool pool(get_pool(cct));
1243
1244 // delete the object for each period epoch
1245 for (epoch_t e = 1; e <= epoch; e++) {
1246 RGWPeriod p{get_id(), e};
1247 rgw_raw_obj oid{pool, p.get_period_oid()};
1248 int ret = store->delete_system_obj(oid);
1249 if (ret < 0) {
1250 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1251 << ": " << cpp_strerror(-ret) << dendl;
1252 }
1253 }
1254
1255 // delete the .latest_epoch object
1256 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1257 int ret = store->delete_system_obj(oid);
1258 if (ret < 0) {
1259 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1260 << ": " << cpp_strerror(-ret) << dendl;
1261 }
1262 return ret;
1263 }
1264
1265 int RGWPeriod::read_info()
1266 {
1267 rgw_pool pool(get_pool(cct));
1268
1269 bufferlist bl;
1270
1271 RGWObjectCtx obj_ctx(store);
1272 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1273 if (ret < 0) {
1274 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1275 return ret;
1276 }
1277
1278 try {
1279 bufferlist::iterator iter = bl.begin();
1280 ::decode(*this, iter);
1281 } catch (buffer::error& err) {
1282 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1283 return -EIO;
1284 }
1285
1286 return 0;
1287 }
1288
1289 int RGWPeriod::create(bool exclusive)
1290 {
1291 int ret;
1292
1293 /* create unique id */
1294 uuid_d new_uuid;
1295 char uuid_str[37];
1296 new_uuid.generate_random();
1297 new_uuid.print(uuid_str);
1298 id = uuid_str;
1299
1300 epoch = FIRST_EPOCH;
1301
1302 period_map.id = id;
1303
1304 ret = store_info(exclusive);
1305 if (ret < 0) {
1306 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1307 return ret;
1308 }
1309
1310 ret = set_latest_epoch(epoch);
1311 if (ret < 0) {
1312 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1313 }
1314
1315 return ret;
1316 }
1317
1318 int RGWPeriod::store_info(bool exclusive)
1319 {
1320 rgw_pool pool(get_pool(cct));
1321
1322 string oid = get_period_oid();
1323 bufferlist bl;
1324 ::encode(*this, bl);
1325
1326 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1327 exclusive, NULL, real_time(), NULL);
1328 }
1329
1330 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1331 {
1332 if (cct->_conf->rgw_period_root_pool.empty()) {
1333 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1334 }
1335 return rgw_pool(cct->_conf->rgw_period_root_pool);
1336 }
1337
1338 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1339 {
1340 if (zonegroup.realm_id != realm_id) {
1341 return 0;
1342 }
1343 int ret = period_map.update(zonegroup, cct);
1344 if (ret < 0) {
1345 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1346 return ret;
1347 }
1348
1349 return store_info(false);
1350 }
1351
1352 int RGWPeriod::update()
1353 {
1354 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1355 list<string> zonegroups;
1356 int ret = store->list_zonegroups(zonegroups);
1357 if (ret < 0) {
1358 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1359 return ret;
1360 }
1361
1362 // clear zone short ids of removed zones. period_map.update() will add the
1363 // remaining zones back
1364 period_map.short_zone_ids.clear();
1365
1366 for (auto& iter : zonegroups) {
1367 RGWZoneGroup zg(string(), iter);
1368 ret = zg.init(cct, store);
1369 if (ret < 0) {
1370 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1371 continue;
1372 }
1373
1374 if (zg.realm_id != realm_id) {
1375 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1376 continue;
1377 }
1378
1379 if (zg.master_zone.empty()) {
1380 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1381 return -EINVAL;
1382 }
1383
1384 if (zg.is_master_zonegroup()) {
1385 master_zonegroup = zg.get_id();
1386 master_zone = zg.master_zone;
1387 }
1388
1389 int ret = period_map.update(zg, cct);
1390 if (ret < 0) {
1391 return ret;
1392 }
1393 }
1394
1395 ret = period_config.read(store, realm_id);
1396 if (ret < 0 && ret != -ENOENT) {
1397 ldout(cct, 0) << "ERROR: failed to read period config: "
1398 << cpp_strerror(ret) << dendl;
1399 return ret;
1400 }
1401 return 0;
1402 }
1403
1404 int RGWPeriod::reflect()
1405 {
1406 for (auto& iter : period_map.zonegroups) {
1407 RGWZoneGroup& zg = iter.second;
1408 zg.reinit_instance(cct, store);
1409 int r = zg.write(false);
1410 if (r < 0) {
1411 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1412 return r;
1413 }
1414 if (zg.is_master_zonegroup()) {
1415 // set master as default if no default exists
1416 r = zg.set_as_default(true);
1417 if (r == 0) {
1418 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1419 << " as the default" << dendl;
1420 }
1421 }
1422 }
1423
1424 int r = period_config.write(store, realm_id);
1425 if (r < 0) {
1426 ldout(cct, 0) << "ERROR: failed to store period config: "
1427 << cpp_strerror(-r) << dendl;
1428 return r;
1429 }
1430 return 0;
1431 }
1432
1433 void RGWPeriod::fork()
1434 {
1435 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1436 predecessor_uuid = id;
1437 id = get_staging_id(realm_id);
1438 period_map.reset();
1439 realm_epoch++;
1440 }
1441
1442 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1443 {
1444 // initialize a sync status manager to read the status
1445 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1446 int r = mgr.init();
1447 if (r < 0) {
1448 return r;
1449 }
1450 r = mgr.read_sync_status(sync_status);
1451 mgr.stop();
1452 return r;
1453 }
1454
1455 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1456 std::ostream& error_stream,
1457 bool force_if_stale)
1458 {
1459 rgw_meta_sync_status status;
1460 int r = read_sync_status(store, &status);
1461 if (r < 0) {
1462 ldout(cct, 0) << "period failed to read sync status: "
1463 << cpp_strerror(-r) << dendl;
1464 return r;
1465 }
1466
1467 std::vector<std::string> markers;
1468
1469 const auto current_epoch = current_period.get_realm_epoch();
1470 if (current_epoch != status.sync_info.realm_epoch) {
1471 // no sync status markers for the current period
1472 assert(current_epoch > status.sync_info.realm_epoch);
1473 const int behind = current_epoch - status.sync_info.realm_epoch;
1474 if (!force_if_stale && current_epoch > 1) {
1475 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1476 "the current master zone in metadata sync. If this zone is promoted "
1477 "to master, any metadata changes during that time are likely to "
1478 "be lost.\n"
1479 "Waiting for this zone to catch up on metadata sync (see "
1480 "'radosgw-admin sync status') is recommended.\n"
1481 "To promote this zone to master anyway, add the flag "
1482 "--yes-i-really-mean-it." << std::endl;
1483 return -EINVAL;
1484 }
1485 // empty sync status markers - other zones will skip this period during
1486 // incremental metadata sync
1487 markers.resize(status.sync_info.num_shards);
1488 } else {
1489 markers.reserve(status.sync_info.num_shards);
1490 for (auto& i : status.sync_markers) {
1491 auto& marker = i.second;
1492 // filter out markers from other periods
1493 if (marker.realm_epoch != current_epoch) {
1494 marker.marker.clear();
1495 }
1496 markers.emplace_back(std::move(marker.marker));
1497 }
1498 }
1499
1500 std::swap(sync_status, markers);
1501 return 0;
1502 }
1503
1504 int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1505 std::ostream& error_stream, bool force_if_stale)
1506 {
1507 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1508 // gateway must be in the master zone to commit
1509 if (master_zone != store->get_zone_params().get_id()) {
1510 error_stream << "Cannot commit period on zone "
1511 << store->get_zone_params().get_id() << ", it must be sent to "
1512 "the period's master zone " << master_zone << '.' << std::endl;
1513 return -EINVAL;
1514 }
1515 // period predecessor must match current period
1516 if (predecessor_uuid != current_period.get_id()) {
1517 error_stream << "Period predecessor " << predecessor_uuid
1518 << " does not match current period " << current_period.get_id()
1519 << ". Use 'period pull' to get the latest period from the master, "
1520 "reapply your changes, and try again." << std::endl;
1521 return -EINVAL;
1522 }
1523 // realm epoch must be 1 greater than current period
1524 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1525 error_stream << "Period's realm epoch " << realm_epoch
1526 << " does not come directly after current realm epoch "
1527 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1528 "latest realm and period from the master zone, reapply your changes, "
1529 "and try again." << std::endl;
1530 return -EINVAL;
1531 }
1532 // did the master zone change?
1533 if (master_zone != current_period.get_master_zone()) {
1534 // store the current metadata sync status in the period
1535 int r = update_sync_status(current_period, error_stream, force_if_stale);
1536 if (r < 0) {
1537 ldout(cct, 0) << "failed to update metadata sync status: "
1538 << cpp_strerror(-r) << dendl;
1539 return r;
1540 }
1541 // create an object with a new period id
1542 r = create(true);
1543 if (r < 0) {
1544 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1545 return r;
1546 }
1547 // set as current period
1548 r = realm.set_current_period(*this);
1549 if (r < 0) {
1550 ldout(cct, 0) << "failed to update realm's current period: "
1551 << cpp_strerror(-r) << dendl;
1552 return r;
1553 }
1554 ldout(cct, 4) << "Promoted to master zone and committed new period "
1555 << id << dendl;
1556 realm.notify_new_period(*this);
1557 return 0;
1558 }
1559 // period must be based on current epoch
1560 if (epoch != current_period.get_epoch()) {
1561 error_stream << "Period epoch " << epoch << " does not match "
1562 "predecessor epoch " << current_period.get_epoch()
1563 << ". Use 'period pull' to get the latest epoch from the master zone, "
1564 "reapply your changes, and try again." << std::endl;
1565 return -EINVAL;
1566 }
1567 // set period as next epoch
1568 set_id(current_period.get_id());
1569 set_epoch(current_period.get_epoch() + 1);
1570 set_predecessor(current_period.get_predecessor());
1571 realm_epoch = current_period.get_realm_epoch();
1572 // write the period to rados
1573 int r = store_info(false);
1574 if (r < 0) {
1575 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1576 return r;
1577 }
1578 // set as latest epoch
1579 r = update_latest_epoch(epoch);
1580 if (r == -EEXIST) {
1581 // already have this epoch (or a more recent one)
1582 return 0;
1583 }
1584 if (r < 0) {
1585 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1586 return r;
1587 }
1588 r = reflect();
1589 if (r < 0) {
1590 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1591 return r;
1592 }
1593 ldout(cct, 4) << "Committed new epoch " << epoch
1594 << " for period " << id << dendl;
1595 realm.notify_new_period(*this);
1596 return 0;
1597 }
1598
1599 int RGWZoneParams::create_default(bool old_format)
1600 {
1601 name = default_zone_name;
1602
1603 int r = create();
1604 if (r < 0) {
1605 return r;
1606 }
1607
1608 if (old_format) {
1609 name = id;
1610 }
1611
1612 return r;
1613 }
1614
1615
1616 int get_zones_pool_set(CephContext* cct,
1617 RGWRados* store,
1618 const list<string>& zones,
1619 const string& my_zone_id,
1620 set<rgw_pool>& pool_names)
1621 {
1622 for(auto const& iter : zones) {
1623 RGWZoneParams zone(iter);
1624 int r = zone.init(cct, store);
1625 if (r < 0) {
1626 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1627 return r;
1628 }
1629 if (zone.get_id() != my_zone_id) {
1630 pool_names.insert(zone.domain_root);
1631 pool_names.insert(zone.metadata_heap);
1632 pool_names.insert(zone.control_pool);
1633 pool_names.insert(zone.gc_pool);
1634 pool_names.insert(zone.log_pool);
1635 pool_names.insert(zone.intent_log_pool);
1636 pool_names.insert(zone.usage_log_pool);
1637 pool_names.insert(zone.user_keys_pool);
1638 pool_names.insert(zone.user_email_pool);
1639 pool_names.insert(zone.user_swift_pool);
1640 pool_names.insert(zone.user_uid_pool);
1641 pool_names.insert(zone.roles_pool);
1642 pool_names.insert(zone.reshard_pool);
1643 for(auto& iter : zone.placement_pools) {
1644 pool_names.insert(iter.second.index_pool);
1645 pool_names.insert(iter.second.data_pool);
1646 pool_names.insert(iter.second.data_extra_pool);
1647 }
1648 }
1649 }
1650 return 0;
1651 }
1652
1653 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1654 const string& default_prefix,
1655 const string& default_suffix,
1656 const rgw_pool& suggested_pool)
1657 {
1658 string suggested_name = suggested_pool.to_str();
1659
1660 string prefix = default_prefix;
1661 string suffix = default_suffix;
1662
1663 if (!suggested_pool.empty()) {
1664 prefix = suggested_name.substr(0, suggested_name.find("."));
1665 suffix = suggested_name.substr(prefix.length());
1666 }
1667
1668 rgw_pool pool(prefix + suffix);
1669
1670 if (pools.find(pool) == pools.end()) {
1671 return pool;
1672 } else {
1673 while(true) {
1674 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1675 if (pools.find(pool) == pools.end()) {
1676 return pool;
1677 }
1678 }
1679 }
1680 }
1681
1682 int RGWZoneParams::fix_pool_names()
1683 {
1684
1685 list<string> zones;
1686 int r = store->list_zones(zones);
1687 if (r < 0) {
1688 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1689 }
1690
1691 set<rgw_pool> pools;
1692 r = get_zones_pool_set(cct, store, zones, id, pools);
1693 if (r < 0) {
1694 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1695 return r;
1696 }
1697
1698 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1699 if (!metadata_heap.name.empty()) {
1700 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1701 }
1702 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1703 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1704 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1705 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1706 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1707 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1708 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1709 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1710 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1711 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1712 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1713 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1714
1715 for(auto& iter : placement_pools) {
1716 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1717 iter.second.index_pool);
1718 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1719 iter.second.data_pool);
1720 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1721 iter.second.data_extra_pool);
1722 }
1723
1724 return 0;
1725 }
1726
1727 int RGWZoneParams::create(bool exclusive)
1728 {
1729 /* check for old pools config */
1730 rgw_raw_obj obj(domain_root, avail_pools);
1731 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1732 if (r < 0) {
1733 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1734 /* a new system, let's set new placement info */
1735 RGWZonePlacementInfo default_placement;
1736 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1737 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1738 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1739 placement_pools["default-placement"] = default_placement;
1740 }
1741
1742 r = fix_pool_names();
1743 if (r < 0) {
1744 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1745 return r;
1746 }
1747
1748 r = RGWSystemMetaObj::create(exclusive);
1749 if (r < 0) {
1750 return r;
1751 }
1752
1753 // try to set as default. may race with another create, so pass exclusive=true
1754 // so we don't override an existing default
1755 r = set_as_default(true);
1756 if (r < 0 && r != -EEXIST) {
1757 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1758 }
1759
1760 return 0;
1761 }
1762
1763 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1764 {
1765 if (cct->_conf->rgw_zone_root_pool.empty()) {
1766 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1767 }
1768
1769 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1770 }
1771
1772 const string RGWZoneParams::get_default_oid(bool old_format)
1773 {
1774 if (old_format) {
1775 return cct->_conf->rgw_default_zone_info_oid;
1776 }
1777
1778 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1779 }
1780
1781 const string& RGWZoneParams::get_names_oid_prefix()
1782 {
1783 return zone_names_oid_prefix;
1784 }
1785
1786 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1787 {
1788 return zone_info_oid_prefix;
1789 }
1790
1791 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1792 return cct->_conf->rgw_zone;
1793 }
1794
1795 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1796 {
1797 if (name.empty()) {
1798 name = cct->_conf->rgw_zone;
1799 }
1800
1801 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1802 }
1803
1804 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1805 {
1806 if (realm_id.empty()) {
1807 /* try using default realm */
1808 RGWRealm realm;
1809 int ret = realm.init(cct, store);
1810 //no default realm exist
1811 if (ret < 0) {
1812 return read_id(default_zone_name, default_id);
1813 }
1814 realm_id = realm.get_id();
1815 }
1816
1817 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1818 }
1819
1820
1821 int RGWZoneParams::set_as_default(bool exclusive)
1822 {
1823 if (realm_id.empty()) {
1824 /* try using default realm */
1825 RGWRealm realm;
1826 int ret = realm.init(cct, store);
1827 if (ret < 0) {
1828 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1829 return -EINVAL;
1830 }
1831 realm_id = realm.get_id();
1832 }
1833
1834 return RGWSystemMetaObj::set_as_default(exclusive);
1835 }
1836
1837 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1838 {
1839 static const std::string NONE{"none"};
1840 auto p = placement_pools.find(placement_rule);
1841 if (p == placement_pools.end()) {
1842 return NONE;
1843 }
1844 const auto& type = p->second.compression_type;
1845 return !type.empty() ? type : NONE;
1846 }
1847
1848 void RGWPeriodMap::encode(bufferlist& bl) const {
1849 ENCODE_START(2, 1, bl);
1850 ::encode(id, bl);
1851 ::encode(zonegroups, bl);
1852 ::encode(master_zonegroup, bl);
1853 ::encode(short_zone_ids, bl);
1854 ENCODE_FINISH(bl);
1855 }
1856
1857 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1858 DECODE_START(2, bl);
1859 ::decode(id, bl);
1860 ::decode(zonegroups, bl);
1861 ::decode(master_zonegroup, bl);
1862 if (struct_v >= 2) {
1863 ::decode(short_zone_ids, bl);
1864 }
1865 DECODE_FINISH(bl);
1866
1867 zonegroups_by_api.clear();
1868 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1869 iter != zonegroups.end(); ++iter) {
1870 RGWZoneGroup& zonegroup = iter->second;
1871 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1872 if (zonegroup.is_master_zonegroup()) {
1873 master_zonegroup = zonegroup.get_id();
1874 }
1875 }
1876 }
1877
1878 // run an MD5 hash on the zone_id and return the first 32 bits
1879 static uint32_t gen_short_zone_id(const std::string zone_id)
1880 {
1881 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1882 MD5 hash;
1883 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1884 hash.Final(md5);
1885
1886 uint32_t short_id;
1887 memcpy((char *)&short_id, md5, sizeof(short_id));
1888 return std::max(short_id, 1u);
1889 }
1890
1891 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1892 {
1893 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1894 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1895 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1896 return -EINVAL;
1897 }
1898 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1899 if (iter != zonegroups.end()) {
1900 RGWZoneGroup& old_zonegroup = iter->second;
1901 if (!old_zonegroup.api_name.empty()) {
1902 zonegroups_by_api.erase(old_zonegroup.api_name);
1903 }
1904 }
1905 zonegroups[zonegroup.get_id()] = zonegroup;
1906
1907 if (!zonegroup.api_name.empty()) {
1908 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1909 }
1910
1911 if (zonegroup.is_master_zonegroup()) {
1912 master_zonegroup = zonegroup.get_id();
1913 } else if (master_zonegroup == zonegroup.get_id()) {
1914 master_zonegroup = "";
1915 }
1916
1917 for (auto& i : zonegroup.zones) {
1918 auto& zone = i.second;
1919 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1920 continue;
1921 }
1922 // calculate the zone's short id
1923 uint32_t short_id = gen_short_zone_id(zone.id);
1924
1925 // search for an existing zone with the same short id
1926 for (auto& s : short_zone_ids) {
1927 if (s.second == short_id) {
1928 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1929 << ") generates the same short_zone_id " << short_id
1930 << " as existing zone id " << s.first << dendl;
1931 return -EEXIST;
1932 }
1933 }
1934
1935 short_zone_ids[zone.id] = short_id;
1936 }
1937
1938 return 0;
1939 }
1940
1941 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1942 {
1943 auto i = short_zone_ids.find(zone_id);
1944 if (i == short_zone_ids.end()) {
1945 return 0;
1946 }
1947 return i->second;
1948 }
1949
1950 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1951 {
1952
1953 RGWPeriod period;
1954 int ret = period.init(cct, store);
1955 if (ret < 0) {
1956 cerr << "failed to read current period info: " << cpp_strerror(ret);
1957 return ret;
1958 }
1959
1960 bucket_quota = period.get_config().bucket_quota;
1961 user_quota = period.get_config().user_quota;
1962 zonegroups = period.get_map().zonegroups;
1963 zonegroups_by_api = period.get_map().zonegroups_by_api;
1964 master_zonegroup = period.get_map().master_zonegroup;
1965
1966 return 0;
1967 }
1968
1969 void RGWRegionMap::encode(bufferlist& bl) const {
1970 ENCODE_START( 3, 1, bl);
1971 ::encode(regions, bl);
1972 ::encode(master_region, bl);
1973 ::encode(bucket_quota, bl);
1974 ::encode(user_quota, bl);
1975 ENCODE_FINISH(bl);
1976 }
1977
1978 void RGWRegionMap::decode(bufferlist::iterator& bl) {
1979 DECODE_START(3, bl);
1980 ::decode(regions, bl);
1981 ::decode(master_region, bl);
1982 if (struct_v >= 2)
1983 ::decode(bucket_quota, bl);
1984 if (struct_v >= 3)
1985 ::decode(user_quota, bl);
1986 DECODE_FINISH(bl);
1987 }
1988
1989 void RGWZoneGroupMap::encode(bufferlist& bl) const {
1990 ENCODE_START( 3, 1, bl);
1991 ::encode(zonegroups, bl);
1992 ::encode(master_zonegroup, bl);
1993 ::encode(bucket_quota, bl);
1994 ::encode(user_quota, bl);
1995 ENCODE_FINISH(bl);
1996 }
1997
1998 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
1999 DECODE_START(3, bl);
2000 ::decode(zonegroups, bl);
2001 ::decode(master_zonegroup, bl);
2002 if (struct_v >= 2)
2003 ::decode(bucket_quota, bl);
2004 if (struct_v >= 3)
2005 ::decode(user_quota, bl);
2006 DECODE_FINISH(bl);
2007
2008 zonegroups_by_api.clear();
2009 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2010 iter != zonegroups.end(); ++iter) {
2011 RGWZoneGroup& zonegroup = iter->second;
2012 zonegroups_by_api[zonegroup.api_name] = zonegroup;
2013 if (zonegroup.is_master_zonegroup()) {
2014 master_zonegroup = zonegroup.get_name();
2015 }
2016 }
2017 }
2018
2019 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2020 {
2021 obj_version *check_objv = version_for_check();
2022
2023 if (check_objv) {
2024 cls_version_check(*op, *check_objv, VER_COND_EQ);
2025 }
2026
2027 cls_version_read(*op, &read_version);
2028 }
2029
2030 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2031 {
2032 obj_version *check_objv = version_for_check();
2033 obj_version *modify_version = version_for_write();
2034
2035 if (check_objv) {
2036 cls_version_check(*op, *check_objv, VER_COND_EQ);
2037 }
2038
2039 if (modify_version) {
2040 cls_version_set(*op, *modify_version);
2041 } else {
2042 cls_version_inc(*op);
2043 }
2044 }
2045
2046 void RGWObjManifest::obj_iterator::operator++()
2047 {
2048 if (manifest->explicit_objs) {
2049 ++explicit_iter;
2050
2051 if (explicit_iter == manifest->objs.end()) {
2052 ofs = manifest->obj_size;
2053 return;
2054 }
2055
2056 update_explicit_pos();
2057
2058 update_location();
2059 return;
2060 }
2061
2062 uint64_t obj_size = manifest->get_obj_size();
2063 uint64_t head_size = manifest->get_head_size();
2064
2065 if (ofs == obj_size) {
2066 return;
2067 }
2068
2069 if (manifest->rules.empty()) {
2070 return;
2071 }
2072
2073 /* are we still pointing at the head? */
2074 if (ofs < head_size) {
2075 rule_iter = manifest->rules.begin();
2076 RGWObjManifestRule *rule = &rule_iter->second;
2077 ofs = MIN(head_size, obj_size);
2078 stripe_ofs = ofs;
2079 cur_stripe = 1;
2080 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2081 if (rule->part_size > 0) {
2082 stripe_size = MIN(stripe_size, rule->part_size);
2083 }
2084 update_location();
2085 return;
2086 }
2087
2088 RGWObjManifestRule *rule = &rule_iter->second;
2089
2090 stripe_ofs += rule->stripe_max_size;
2091 cur_stripe++;
2092 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2093
2094 if (rule->part_size > 0) {
2095 /* multi part, multi stripes object */
2096
2097 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2098
2099 if (stripe_ofs >= part_ofs + rule->part_size) {
2100 /* moved to the next part */
2101 cur_stripe = 0;
2102 part_ofs += rule->part_size;
2103 stripe_ofs = part_ofs;
2104
2105 bool last_rule = (next_rule_iter == manifest->rules.end());
2106 /* move to the next rule? */
2107 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2108 rule_iter = next_rule_iter;
2109 last_rule = (next_rule_iter == manifest->rules.end());
2110 if (!last_rule) {
2111 ++next_rule_iter;
2112 }
2113 cur_part_id = rule_iter->second.start_part_num;
2114 } else {
2115 cur_part_id++;
2116 }
2117
2118 rule = &rule_iter->second;
2119 }
2120
2121 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2122 }
2123
2124 cur_override_prefix = rule->override_prefix;
2125
2126 ofs = stripe_ofs;
2127 if (ofs > obj_size) {
2128 ofs = obj_size;
2129 stripe_ofs = ofs;
2130 stripe_size = 0;
2131 }
2132
2133 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2134 update_location();
2135 }
2136
2137 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2138 {
2139 manifest = _m;
2140
2141 manifest->set_tail_placement(placement_rule, _b);
2142 manifest->set_head(placement_rule, _obj, 0);
2143 last_ofs = 0;
2144
2145 if (manifest->get_prefix().empty()) {
2146 char buf[33];
2147 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2148
2149 string oid_prefix = ".";
2150 oid_prefix.append(buf);
2151 oid_prefix.append("_");
2152
2153 manifest->set_prefix(oid_prefix);
2154 }
2155
2156 bool found = manifest->get_rule(0, &rule);
2157 if (!found) {
2158 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2159 return -EIO;
2160 }
2161
2162 uint64_t head_size = manifest->get_head_size();
2163
2164 if (head_size > 0) {
2165 cur_stripe_size = head_size;
2166 } else {
2167 cur_stripe_size = rule.stripe_max_size;
2168 }
2169
2170 cur_part_id = rule.start_part_num;
2171
2172 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2173
2174 // Normal object which not generated through copy operation
2175 manifest->set_tail_instance(_obj.key.instance);
2176
2177 manifest->update_iterators();
2178
2179 return 0;
2180 }
2181
2182 int RGWObjManifest::generator::create_next(uint64_t ofs)
2183 {
2184 if (ofs < last_ofs) /* only going forward */
2185 return -EINVAL;
2186
2187 uint64_t max_head_size = manifest->get_max_head_size();
2188
2189 if (ofs < max_head_size) {
2190 manifest->set_head_size(ofs);
2191 }
2192
2193 if (ofs >= max_head_size) {
2194 manifest->set_head_size(max_head_size);
2195 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2196 cur_stripe_size = rule.stripe_max_size;
2197
2198 if (cur_part_id == 0 && max_head_size > 0) {
2199 cur_stripe++;
2200 }
2201 }
2202
2203 last_ofs = ofs;
2204 manifest->set_obj_size(ofs);
2205
2206 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2207
2208 manifest->update_iterators();
2209
2210 return 0;
2211 }
2212
2213 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2214 {
2215 return begin_iter;
2216 }
2217
2218 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2219 {
2220 return end_iter;
2221 }
2222
2223 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2224 {
2225 if (ofs > obj_size) {
2226 ofs = obj_size;
2227 }
2228 RGWObjManifest::obj_iterator iter(this);
2229 iter.seek(ofs);
2230 return iter;
2231 }
2232
2233 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2234 {
2235 if (explicit_objs || m.explicit_objs) {
2236 return append_explicit(m, zonegroup, zone_params);
2237 }
2238
2239 if (rules.empty()) {
2240 *this = m;
2241 return 0;
2242 }
2243
2244 string override_prefix;
2245
2246 if (prefix.empty()) {
2247 prefix = m.prefix;
2248 }
2249
2250 if (prefix != m.prefix) {
2251 override_prefix = m.prefix;
2252 }
2253
2254 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2255 if (miter == m.rules.end()) {
2256 return append_explicit(m, zonegroup, zone_params);
2257 }
2258
2259 for (; miter != m.rules.end(); ++miter) {
2260 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2261
2262 RGWObjManifestRule& rule = last_rule->second;
2263
2264 if (rule.part_size == 0) {
2265 rule.part_size = obj_size - rule.start_ofs;
2266 }
2267
2268 RGWObjManifestRule& next_rule = miter->second;
2269 if (!next_rule.part_size) {
2270 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2271 }
2272
2273 string rule_prefix = prefix;
2274 if (!rule.override_prefix.empty()) {
2275 rule_prefix = rule.override_prefix;
2276 }
2277
2278 string next_rule_prefix = m.prefix;
2279 if (!next_rule.override_prefix.empty()) {
2280 next_rule_prefix = next_rule.override_prefix;
2281 }
2282
2283 if (rule.part_size != next_rule.part_size ||
2284 rule.stripe_max_size != next_rule.stripe_max_size ||
2285 rule_prefix != next_rule_prefix) {
2286 if (next_rule_prefix != prefix) {
2287 append_rules(m, miter, &next_rule_prefix);
2288 } else {
2289 append_rules(m, miter, NULL);
2290 }
2291 break;
2292 }
2293
2294 uint64_t expected_part_num = rule.start_part_num + 1;
2295 if (rule.part_size > 0) {
2296 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2297 }
2298
2299 if (expected_part_num != next_rule.start_part_num) {
2300 append_rules(m, miter, NULL);
2301 break;
2302 }
2303 }
2304
2305 set_obj_size(obj_size + m.obj_size);
2306
2307 return 0;
2308 }
2309
2310 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2311 {
2312 return append(m, store->get_zonegroup(), store->get_zone_params());
2313 }
2314
2315 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2316 string *override_prefix)
2317 {
2318 for (; miter != m.rules.end(); ++miter) {
2319 RGWObjManifestRule rule = miter->second;
2320 rule.start_ofs += obj_size;
2321 if (override_prefix)
2322 rule.override_prefix = *override_prefix;
2323 rules[rule.start_ofs] = rule;
2324 }
2325 }
2326
2327 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2328 {
2329 if (explicit_objs) {
2330 return;
2331 }
2332 obj_iterator iter = obj_begin();
2333
2334 while (iter != obj_end()) {
2335 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2336 const rgw_obj_select& os = iter.get_location();
2337 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2338 part.loc_ofs = 0;
2339
2340 uint64_t ofs = iter.get_stripe_ofs();
2341
2342 if (ofs == 0) {
2343 part.loc = obj;
2344 } else {
2345 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2346 }
2347 ++iter;
2348 uint64_t next_ofs = iter.get_stripe_ofs();
2349
2350 part.size = next_ofs - ofs;
2351 }
2352
2353 explicit_objs = true;
2354 rules.clear();
2355 prefix.clear();
2356 }
2357
2358 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2359 {
2360 if (!explicit_objs) {
2361 convert_to_explicit(zonegroup, zone_params);
2362 }
2363 if (!m.explicit_objs) {
2364 m.convert_to_explicit(zonegroup, zone_params);
2365 }
2366 map<uint64_t, RGWObjManifestPart>::iterator iter;
2367 uint64_t base = obj_size;
2368 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2369 RGWObjManifestPart& part = iter->second;
2370 objs[base + iter->first] = part;
2371 }
2372 obj_size += m.obj_size;
2373
2374 return 0;
2375 }
2376
2377 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2378 {
2379 if (rules.empty()) {
2380 return false;
2381 }
2382
2383 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2384 if (iter != rules.begin()) {
2385 --iter;
2386 }
2387
2388 *rule = iter->second;
2389
2390 return true;
2391 }
2392
2393 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2394 {
2395 write_version.ver = 1;
2396 #define TAG_LEN 24
2397
2398 write_version.tag.clear();
2399 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2400 }
2401
2402 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2403 real_time *mtime, real_time set_mtime,
2404 map<string, bufferlist>& attrs, real_time delete_at,
2405 const char *if_match, const char *if_nomatch, const string *user_data,
2406 rgw_zone_set *zones_trace)
2407 {
2408 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2409 if (r < 0)
2410 return r;
2411
2412 is_complete = !canceled;
2413 return 0;
2414 }
2415
2416 CephContext *RGWPutObjProcessor::ctx()
2417 {
2418 return store->ctx();
2419 }
2420
2421 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2422 {
2423 drain_pending();
2424
2425 if (is_complete)
2426 return;
2427
2428 set<rgw_raw_obj>::iterator iter;
2429 bool need_to_remove_head = false;
2430 rgw_raw_obj raw_head;
2431
2432 if (!head_obj.empty()) {
2433 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2434 }
2435
2436 /**
2437 * We should delete the object in the "multipart" namespace to avoid race condition.
2438 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2439 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2440 * written by the second upload may be deleted by the first upload.
2441 * details is describled on #11749
2442 *
2443 * The above comment still stands, but instead of searching for a specific object in the multipart
2444 * namespace, we just make sure that we remove the object that is marked as the head object after
2445 * we remove all the other raw objects. Note that we use different call to remove the head object,
2446 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2447 */
2448 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2449 const rgw_raw_obj& obj = *iter;
2450 if (!head_obj.empty() && obj == raw_head) {
2451 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2452 need_to_remove_head = true;
2453 continue;
2454 }
2455
2456 int r = store->delete_raw_obj(obj);
2457 if (r < 0 && r != -ENOENT) {
2458 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2459 }
2460 }
2461
2462 if (need_to_remove_head) {
2463 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2464 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2465 if (r < 0 && r != -ENOENT) {
2466 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2467 }
2468 }
2469 }
2470
2471 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2472 {
2473 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2474 obj_len = abs_ofs + bl.length();
2475
2476 if (!(obj == last_written_obj)) {
2477 last_written_obj = obj;
2478 }
2479
2480 // For the first call pass -1 as the offset to
2481 // do a write_full.
2482 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2483 }
2484
2485 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2486 {
2487 struct put_obj_aio_info info;
2488 info = pending.front();
2489 pending.pop_front();
2490 pending_size -= info.size;
2491 return info;
2492 }
2493
2494 int RGWPutObjProcessor_Aio::wait_pending_front()
2495 {
2496 if (pending.empty()) {
2497 return 0;
2498 }
2499 struct put_obj_aio_info info = pop_pending();
2500 int ret = store->aio_wait(info.handle);
2501
2502 if (ret >= 0) {
2503 add_written_obj(info.obj);
2504 }
2505
2506 return ret;
2507 }
2508
2509 bool RGWPutObjProcessor_Aio::pending_has_completed()
2510 {
2511 if (pending.empty())
2512 return false;
2513
2514 struct put_obj_aio_info& info = pending.front();
2515 return store->aio_completed(info.handle);
2516 }
2517
2518 int RGWPutObjProcessor_Aio::drain_pending()
2519 {
2520 int ret = 0;
2521 while (!pending.empty()) {
2522 int r = wait_pending_front();
2523 if (r < 0)
2524 ret = r;
2525 }
2526 return ret;
2527 }
2528
2529 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2530 {
2531 bool _wait = need_to_wait;
2532
2533 if (handle) {
2534 struct put_obj_aio_info info;
2535 info.handle = handle;
2536 info.obj = obj;
2537 info.size = size;
2538 pending_size += size;
2539 pending.push_back(info);
2540 }
2541 size_t orig_size = pending_size;
2542
2543 /* first drain complete IOs */
2544 while (pending_has_completed()) {
2545 int r = wait_pending_front();
2546 if (r < 0)
2547 return r;
2548
2549 _wait = false;
2550 }
2551
2552 /* resize window in case messages are draining too fast */
2553 if (orig_size - pending_size >= window_size) {
2554 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2555 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2556 if (window_size > max_window_size) {
2557 window_size = max_window_size;
2558 }
2559 }
2560
2561 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2562 if (pending_size > window_size || _wait) {
2563 int r = wait_pending_front();
2564 if (r < 0)
2565 return r;
2566 }
2567 return 0;
2568 }
2569
2570 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2571 {
2572 if (ofs >= next_part_ofs) {
2573 int r = prepare_next_part(ofs);
2574 if (r < 0) {
2575 return r;
2576 }
2577 }
2578
2579 *pobj = cur_obj;
2580
2581 if (!bl.length()) {
2582 *phandle = nullptr;
2583 return 0;
2584 }
2585
2586 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2587 }
2588
2589 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2590 {
2591 RGWPutObjProcessor::prepare(store, oid_rand);
2592
2593 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2594
2595 return 0;
2596 }
2597
2598 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2599 {
2600 *phandle = NULL;
2601 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2602
2603 pending_data_bl.claim_append(bl);
2604 if (pending_data_bl.length() < max_write_size) {
2605 *again = false;
2606 return 0;
2607 }
2608
2609 pending_data_bl.splice(0, max_write_size, &bl);
2610
2611 /* do we have enough data pending accumulated that needs to be written? */
2612 *again = (pending_data_bl.length() >= max_chunk_size);
2613
2614 if (!data_ofs && !immutable_head()) {
2615 first_chunk.claim(bl);
2616 obj_len = (uint64_t)first_chunk.length();
2617 int r = prepare_next_part(obj_len);
2618 if (r < 0) {
2619 return r;
2620 }
2621 data_ofs = obj_len;
2622 return 0;
2623 }
2624 off_t write_ofs = data_ofs;
2625 data_ofs = write_ofs + bl.length();
2626 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2627 we could be racing with another upload, to the same
2628 object and cleanup can be messy */
2629 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2630 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2631 bl.clear();
2632 }
2633 return ret;
2634 }
2635
2636
2637 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2638 {
2639 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2640
2641 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2642 if (r < 0) {
2643 return r;
2644 }
2645
2646 return 0;
2647 }
2648
2649 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2650 {
2651 head_obj.init(bucket, obj_str);
2652
2653 int r = prepare_init(store, oid_rand);
2654 if (r < 0) {
2655 return r;
2656 }
2657
2658 if (!version_id.empty()) {
2659 head_obj.key.set_instance(version_id);
2660 } else if (versioned_object) {
2661 store->gen_rand_obj_instance_name(&head_obj);
2662 }
2663
2664 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2665
2666 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2667 if (r < 0) {
2668 return r;
2669 }
2670
2671 return 0;
2672 }
2673
2674 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2675
2676 int ret = manifest_gen.create_next(ofs);
2677 if (ret < 0) {
2678 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2679 return ret;
2680 }
2681 cur_part_ofs = ofs;
2682 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2683 cur_obj = manifest_gen.get_cur_obj(store);
2684
2685 return 0;
2686 }
2687
2688 int RGWPutObjProcessor_Atomic::complete_parts()
2689 {
2690 if (obj_len > (uint64_t)cur_part_ofs) {
2691 return prepare_next_part(obj_len);
2692 }
2693 return 0;
2694 }
2695
2696 int RGWPutObjProcessor_Atomic::complete_writing_data()
2697 {
2698 if (!data_ofs && !immutable_head()) {
2699 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2700 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2701 * clobber first_chunk
2702 */
2703 if (pending_data_bl.length() > 0) {
2704 first_chunk.claim(pending_data_bl);
2705 }
2706 obj_len = (uint64_t)first_chunk.length();
2707 }
2708 while (pending_data_bl.length()) {
2709 void *handle = nullptr;
2710 rgw_raw_obj obj;
2711 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2712 if (max_write_size > pending_data_bl.length()) {
2713 max_write_size = pending_data_bl.length();
2714 }
2715 bufferlist bl;
2716 pending_data_bl.splice(0, max_write_size, &bl);
2717 uint64_t write_len = bl.length();
2718 int r = write_data(bl, data_ofs, &handle, &obj, false);
2719 if (r < 0) {
2720 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2721 return r;
2722 }
2723 data_ofs += write_len;
2724 r = throttle_data(handle, obj, write_len, false);
2725 if (r < 0) {
2726 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2727 return r;
2728 }
2729
2730 if (data_ofs >= next_part_ofs) {
2731 r = prepare_next_part(data_ofs);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2734 return r;
2735 }
2736 }
2737 }
2738 int r = complete_parts();
2739 if (r < 0) {
2740 return r;
2741 }
2742
2743 r = drain_pending();
2744 if (r < 0)
2745 return r;
2746
2747 return 0;
2748 }
2749
2750 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2751 real_time *mtime, real_time set_mtime,
2752 map<string, bufferlist>& attrs,
2753 real_time delete_at,
2754 const char *if_match,
2755 const char *if_nomatch, const string *user_data,
2756 rgw_zone_set *zones_trace) {
2757 int r = complete_writing_data();
2758 if (r < 0)
2759 return r;
2760
2761 obj_ctx.obj.set_atomic(head_obj);
2762
2763 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2764
2765 /* some object types shouldn't be versioned, e.g., multipart parts */
2766 op_target.set_versioning_disabled(!versioned_object);
2767
2768 RGWRados::Object::Write obj_op(&op_target);
2769
2770 obj_op.meta.data = &first_chunk;
2771 obj_op.meta.manifest = &manifest;
2772 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2773 obj_op.meta.if_match = if_match;
2774 obj_op.meta.if_nomatch = if_nomatch;
2775 obj_op.meta.mtime = mtime;
2776 obj_op.meta.set_mtime = set_mtime;
2777 obj_op.meta.owner = bucket_info.owner;
2778 obj_op.meta.flags = PUT_OBJ_CREATE;
2779 obj_op.meta.olh_epoch = olh_epoch;
2780 obj_op.meta.delete_at = delete_at;
2781 obj_op.meta.user_data = user_data;
2782 obj_op.meta.zones_trace = zones_trace;
2783 obj_op.meta.modify_tail = true;
2784
2785 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2786 if (r < 0) {
2787 return r;
2788 }
2789
2790 canceled = obj_op.meta.canceled;
2791
2792 return 0;
2793 }
2794
2795 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2796 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2797 if (r < 0)
2798 return r;
2799 return 0;
2800 }
2801
2802 int RGWRados::unwatch(uint64_t watch_handle)
2803 {
2804 int r = control_pool_ctx.unwatch2(watch_handle);
2805 if (r < 0) {
2806 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2807 return r;
2808 }
2809 r = rados[0].watch_flush();
2810 if (r < 0) {
2811 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2812 return r;
2813 }
2814 return 0;
2815 }
2816
2817 void RGWRados::add_watcher(int i)
2818 {
2819 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2820 Mutex::Locker l(watchers_lock);
2821 watchers_set.insert(i);
2822 if (watchers_set.size() == (size_t)num_watchers) {
2823 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2824 set_cache_enabled(true);
2825 }
2826 }
2827
2828 void RGWRados::remove_watcher(int i)
2829 {
2830 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2831 Mutex::Locker l(watchers_lock);
2832 size_t orig_size = watchers_set.size();
2833 watchers_set.erase(i);
2834 if (orig_size == (size_t)num_watchers &&
2835 watchers_set.size() < orig_size) { /* actually removed */
2836 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2837 set_cache_enabled(false);
2838 }
2839 }
2840
2841 class RGWWatcher : public librados::WatchCtx2 {
2842 RGWRados *rados;
2843 int index;
2844 string oid;
2845 uint64_t watch_handle;
2846
2847 class C_ReinitWatch : public Context {
2848 RGWWatcher *watcher;
2849 public:
2850 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2851 void finish(int r) override {
2852 watcher->reinit();
2853 }
2854 };
2855 public:
2856 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2857 void handle_notify(uint64_t notify_id,
2858 uint64_t cookie,
2859 uint64_t notifier_id,
2860 bufferlist& bl) override {
2861 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2862 << " notify_id " << notify_id
2863 << " cookie " << cookie
2864 << " notifier " << notifier_id
2865 << " bl.length()=" << bl.length() << dendl;
2866 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2867
2868 bufferlist reply_bl; // empty reply payload
2869 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2870 }
2871 void handle_error(uint64_t cookie, int err) override {
2872 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2873 << " err " << cpp_strerror(err) << dendl;
2874 rados->remove_watcher(index);
2875 rados->schedule_context(new C_ReinitWatch(this));
2876 }
2877
2878 void reinit() {
2879 int ret = unregister_watch();
2880 if (ret < 0) {
2881 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2882 return;
2883 }
2884 ret = register_watch();
2885 if (ret < 0) {
2886 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2887 return;
2888 }
2889 }
2890
2891 int unregister_watch() {
2892 int r = rados->unwatch(watch_handle);
2893 if (r < 0) {
2894 return r;
2895 }
2896 rados->remove_watcher(index);
2897 return 0;
2898 }
2899
2900 int register_watch() {
2901 int r = rados->watch(oid, &watch_handle, this);
2902 if (r < 0) {
2903 return r;
2904 }
2905 rados->add_watcher(index);
2906 return 0;
2907 }
2908 };
2909
2910 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2911 RGWRados *store;
2912 RGWHTTPManager http_manager;
2913
2914 public:
2915 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2916 http_manager(store->ctx(), completion_mgr) {
2917 http_manager.set_threaded();
2918 }
2919
2920 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2921 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2922 { "notify", NULL },
2923 { NULL, NULL } };
2924
2925 list<RGWCoroutinesStack *> stacks;
2926 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2927 RGWRESTConn *conn = iter->second;
2928 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2929 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2930
2931 stacks.push_back(stack);
2932 }
2933 return run(stacks);
2934 }
2935 };
2936
2937 class RGWDataNotifierManager : public RGWCoroutinesManager {
2938 RGWRados *store;
2939 RGWHTTPManager http_manager;
2940
2941 public:
2942 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2943 http_manager(store->ctx(), completion_mgr) {
2944 http_manager.set_threaded();
2945 }
2946
2947 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2948 rgw_http_param_pair pairs[] = { { "type", "data" },
2949 { "notify", NULL },
2950 { "source-zone", store->get_zone_params().get_id().c_str() },
2951 { NULL, NULL } };
2952
2953 list<RGWCoroutinesStack *> stacks;
2954 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2955 RGWRESTConn *conn = iter->second;
2956 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2957 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2958
2959 stacks.push_back(stack);
2960 }
2961 return run(stacks);
2962 }
2963 };
2964
2965 class RGWRadosThread {
2966 class Worker : public Thread {
2967 CephContext *cct;
2968 RGWRadosThread *processor;
2969 Mutex lock;
2970 Cond cond;
2971
2972 void wait() {
2973 Mutex::Locker l(lock);
2974 cond.Wait(lock);
2975 };
2976
2977 void wait_interval(const utime_t& wait_time) {
2978 Mutex::Locker l(lock);
2979 cond.WaitInterval(lock, wait_time);
2980 }
2981
2982 public:
2983 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
2984 void *entry() override;
2985 void signal() {
2986 Mutex::Locker l(lock);
2987 cond.Signal();
2988 }
2989 };
2990
2991 Worker *worker;
2992
2993 protected:
2994 CephContext *cct;
2995 RGWRados *store;
2996
2997 std::atomic<bool> down_flag = { false };
2998
2999 string thread_name;
3000
3001 virtual uint64_t interval_msec() = 0;
3002 virtual void stop_process() {}
3003 public:
3004 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3005 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3006 virtual ~RGWRadosThread() {
3007 stop();
3008 }
3009
3010 virtual int init() { return 0; }
3011 virtual int process() = 0;
3012
3013 bool going_down() { return down_flag; }
3014
3015 void start();
3016 void stop();
3017
3018 void signal() {
3019 if (worker) {
3020 worker->signal();
3021 }
3022 }
3023 };
3024
3025 void RGWRadosThread::start()
3026 {
3027 worker = new Worker(cct, this);
3028 worker->create(thread_name.c_str());
3029 }
3030
3031 void RGWRadosThread::stop()
3032 {
3033 down_flag = true;
3034 stop_process();
3035 if (worker) {
3036 worker->signal();
3037 worker->join();
3038 }
3039 delete worker;
3040 worker = NULL;
3041 }
3042
3043 void *RGWRadosThread::Worker::entry() {
3044 uint64_t msec = processor->interval_msec();
3045 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3046
3047 do {
3048 utime_t start = ceph_clock_now();
3049 int r = processor->process();
3050 if (r < 0) {
3051 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3052 }
3053
3054 if (processor->going_down())
3055 break;
3056
3057 utime_t end = ceph_clock_now();
3058 end -= start;
3059
3060 uint64_t cur_msec = processor->interval_msec();
3061 if (cur_msec != msec) { /* was it reconfigured? */
3062 msec = cur_msec;
3063 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3064 }
3065
3066 if (cur_msec > 0) {
3067 if (interval <= end)
3068 continue; // next round
3069
3070 utime_t wait_time = interval;
3071 wait_time -= end;
3072
3073 wait_interval(wait_time);
3074 } else {
3075 wait();
3076 }
3077 } while (!processor->going_down());
3078
3079 return NULL;
3080 }
3081
3082 class RGWMetaNotifier : public RGWRadosThread {
3083 RGWMetaNotifierManager notify_mgr;
3084 RGWMetadataLog *const log;
3085
3086 uint64_t interval_msec() override {
3087 return cct->_conf->rgw_md_notify_interval_msec;
3088 }
3089 public:
3090 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3091 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3092
3093 int process() override;
3094 };
3095
3096 int RGWMetaNotifier::process()
3097 {
3098 set<int> shards;
3099
3100 log->read_clear_modified(shards);
3101
3102 if (shards.empty()) {
3103 return 0;
3104 }
3105
3106 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3107 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3108 }
3109
3110 notify_mgr.notify_all(store->zone_conn_map, shards);
3111
3112 return 0;
3113 }
3114
3115 class RGWDataNotifier : public RGWRadosThread {
3116 RGWDataNotifierManager notify_mgr;
3117
3118 uint64_t interval_msec() override {
3119 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
3120 }
3121 public:
3122 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3123
3124 int process() override;
3125 };
3126
3127 int RGWDataNotifier::process()
3128 {
3129 if (!store->data_log) {
3130 return 0;
3131 }
3132
3133 map<int, set<string> > shards;
3134
3135 store->data_log->read_clear_modified(shards);
3136
3137 if (shards.empty()) {
3138 return 0;
3139 }
3140
3141 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3142 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3143 }
3144
3145 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3146
3147 return 0;
3148 }
3149
3150 class RGWSyncProcessorThread : public RGWRadosThread {
3151 public:
3152 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3153 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3154 ~RGWSyncProcessorThread() override {}
3155 int init() override = 0 ;
3156 int process() override = 0;
3157 };
3158
3159 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3160 {
3161 RGWMetaSyncStatusManager sync;
3162
3163 uint64_t interval_msec() override {
3164 return 0; /* no interval associated, it'll run once until stopped */
3165 }
3166 void stop_process() override {
3167 sync.stop();
3168 }
3169 public:
3170 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3171 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3172
3173 void wakeup_sync_shards(set<int>& shard_ids) {
3174 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3175 sync.wakeup(*iter);
3176 }
3177 }
3178 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3179
3180 int init() override {
3181 int ret = sync.init();
3182 if (ret < 0) {
3183 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3184 return ret;
3185 }
3186 return 0;
3187 }
3188
3189 int process() override {
3190 sync.run();
3191 return 0;
3192 }
3193 };
3194
3195 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3196 {
3197 RGWDataSyncStatusManager sync;
3198 bool initialized;
3199
3200 uint64_t interval_msec() override {
3201 if (initialized) {
3202 return 0; /* no interval associated, it'll run once until stopped */
3203 } else {
3204 #define DATA_SYNC_INIT_WAIT_SEC 20
3205 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3206 }
3207 }
3208 void stop_process() override {
3209 sync.stop();
3210 }
3211 public:
3212 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3213 const string& _source_zone,
3214 rgw::BucketChangeObserver *observer)
3215 : RGWSyncProcessorThread(_store, "data-sync"),
3216 sync(_store, async_rados, _source_zone, observer),
3217 initialized(false) {}
3218
3219 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3220 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3221 sync.wakeup(iter->first, iter->second);
3222 }
3223 }
3224 RGWDataSyncStatusManager* get_manager() { return &sync; }
3225
3226 int init() override {
3227 return 0;
3228 }
3229
3230 int process() override {
3231 while (!initialized) {
3232 if (going_down()) {
3233 return 0;
3234 }
3235 int ret = sync.init();
3236 if (ret >= 0) {
3237 initialized = true;
3238 break;
3239 }
3240 /* we'll be back! */
3241 return 0;
3242 }
3243 sync.run();
3244 return 0;
3245 }
3246 };
3247
3248 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3249 {
3250 RGWCoroutinesManager crs;
3251 RGWRados *store;
3252 rgw::BucketTrimManager *bucket_trim;
3253 RGWHTTPManager http;
3254 const utime_t trim_interval;
3255
3256 uint64_t interval_msec() override { return 0; }
3257 void stop_process() override { crs.stop(); }
3258 public:
3259 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
3260 int interval)
3261 : RGWSyncProcessorThread(store, "sync-log-trim"),
3262 crs(store->ctx(), store->get_cr_registry()), store(store),
3263 bucket_trim(bucket_trim),
3264 http(store->ctx(), crs.get_completion_mgr()),
3265 trim_interval(interval, 0)
3266 {}
3267
3268 int init() override {
3269 return http.set_threaded();
3270 }
3271 int process() override {
3272 list<RGWCoroutinesStack*> stacks;
3273 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3274 meta->call(create_meta_log_trim_cr(store, &http,
3275 cct->_conf->rgw_md_log_max_shards,
3276 trim_interval));
3277 stacks.push_back(meta);
3278
3279 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3280 data->call(create_data_log_trim_cr(store, &http,
3281 cct->_conf->rgw_data_log_num_shards,
3282 trim_interval));
3283 stacks.push_back(data);
3284
3285 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
3286 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
3287 stacks.push_back(bucket);
3288
3289 crs.run(stacks);
3290 return 0;
3291 }
3292 };
3293
3294 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3295 {
3296 Mutex::Locker l(meta_sync_thread_lock);
3297 if (meta_sync_processor_thread) {
3298 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3299 }
3300 }
3301
3302 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3303 {
3304 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3305 Mutex::Locker l(data_sync_thread_lock);
3306 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3307 if (iter == data_sync_processor_threads.end()) {
3308 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3309 return;
3310 }
3311
3312 RGWDataSyncProcessorThread *thread = iter->second;
3313 assert(thread);
3314 thread->wakeup_sync_shards(shard_ids);
3315 }
3316
3317 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3318 {
3319 Mutex::Locker l(meta_sync_thread_lock);
3320 if (meta_sync_processor_thread) {
3321 return meta_sync_processor_thread->get_manager();
3322 }
3323 return nullptr;
3324 }
3325
3326 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3327 {
3328 Mutex::Locker l(data_sync_thread_lock);
3329 auto thread = data_sync_processor_threads.find(source_zone);
3330 if (thread == data_sync_processor_threads.end()) {
3331 return nullptr;
3332 }
3333 return thread->second->get_manager();
3334 }
3335
3336 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3337 {
3338 IoCtx ioctx;
3339 int r = open_pool_ctx(pool, ioctx);
3340 if (r < 0) {
3341 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3342 return r;
3343 }
3344
3345 bool requires;
3346 r = ioctx.pool_requires_alignment2(&requires);
3347 if (r < 0) {
3348 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3349 << r << dendl;
3350 return r;
3351 }
3352
3353 if (!requires) {
3354 *alignment = 0;
3355 return 0;
3356 }
3357
3358 uint64_t align;
3359 r = ioctx.pool_required_alignment2(&align);
3360 if (r < 0) {
3361 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3362 << r << dendl;
3363 return r;
3364 }
3365 if (align != 0) {
3366 ldout(cct, 20) << "required alignment=" << align << dendl;
3367 }
3368 *alignment = align;
3369 return 0;
3370 }
3371
3372 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3373 {
3374 uint64_t alignment = 0;
3375 int r = get_required_alignment(pool, &alignment);
3376 if (r < 0) {
3377 return r;
3378 }
3379
3380 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3381
3382 if (alignment == 0) {
3383 *max_chunk_size = config_chunk_size;
3384 return 0;
3385 }
3386
3387 if (config_chunk_size <= alignment) {
3388 *max_chunk_size = alignment;
3389 return 0;
3390 }
3391
3392 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3393
3394 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3395
3396 return 0;
3397 }
3398
3399 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3400 {
3401 rgw_pool pool;
3402 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3403 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3404 return -EIO;
3405 }
3406 return get_max_chunk_size(pool, max_chunk_size);
3407 }
3408
3409 class RGWIndexCompletionManager;
3410
3411 struct complete_op_data {
3412 Mutex lock{"complete_op_data"};
3413 AioCompletion *rados_completion{nullptr};
3414 int manager_shard_id{-1};
3415 RGWIndexCompletionManager *manager{nullptr};
3416 rgw_obj obj;
3417 RGWModifyOp op;
3418 string tag;
3419 rgw_bucket_entry_ver ver;
3420 cls_rgw_obj_key key;
3421 rgw_bucket_dir_entry_meta dir_meta;
3422 list<cls_rgw_obj_key> remove_objs;
3423 bool log_op;
3424 uint16_t bilog_op;
3425 rgw_zone_set zones_trace;
3426
3427 bool stopped{false};
3428
3429 void stop() {
3430 Mutex::Locker l(lock);
3431 stopped = true;
3432 }
3433 };
3434
3435 class RGWIndexCompletionThread : public RGWRadosThread {
3436 RGWRados *store;
3437
3438 uint64_t interval_msec() override {
3439 return 0;
3440 }
3441
3442 list<complete_op_data *> completions;
3443
3444 Mutex completions_lock;
3445 public:
3446 RGWIndexCompletionThread(RGWRados *_store)
3447 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3448
3449 int process() override;
3450
3451 void add_completion(complete_op_data *completion) {
3452 {
3453 Mutex::Locker l(completions_lock);
3454 completions.push_back(completion);
3455 }
3456
3457 signal();
3458 }
3459 };
3460
3461 int RGWIndexCompletionThread::process()
3462 {
3463 list<complete_op_data *> comps;
3464
3465 {
3466 Mutex::Locker l(completions_lock);
3467 completions.swap(comps);
3468 }
3469
3470 for (auto c : comps) {
3471 std::unique_ptr<complete_op_data> up{c};
3472
3473 if (going_down()) {
3474 continue;
3475 }
3476 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3477
3478 RGWRados::BucketShard bs(store);
3479
3480 int r = bs.init(c->obj.bucket, c->obj);
3481 if (r < 0) {
3482 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3483 /* not much to do */
3484 continue;
3485 }
3486
3487 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3488 librados::ObjectWriteOperation o;
3489 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3490 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3491 c->log_op, c->bilog_op, &c->zones_trace);
3492
3493 return bs->index_ctx.operate(bs->bucket_obj, &o);
3494 });
3495 if (r < 0) {
3496 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3497 /* ignoring error, can't do anything about it */
3498 continue;
3499 }
3500 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3501 if (r < 0) {
3502 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3503 }
3504 }
3505
3506 return 0;
3507 }
3508
3509 class RGWIndexCompletionManager {
3510 RGWRados *store{nullptr};
3511 vector<Mutex *> locks;
3512 vector<set<complete_op_data *> > completions;
3513
3514 RGWIndexCompletionThread *completion_thread{nullptr};
3515
3516 int num_shards;
3517
3518 std::atomic<int> cur_shard {0};
3519
3520
3521 public:
3522 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3523 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3524
3525 for (int i = 0; i < num_shards; i++) {
3526 char buf[64];
3527 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3528 locks.push_back(new Mutex(buf));
3529 }
3530
3531 completions.resize(num_shards);
3532 }
3533 ~RGWIndexCompletionManager() {
3534 stop();
3535
3536 for (auto l : locks) {
3537 delete l;
3538 }
3539 }
3540
3541 int next_shard() {
3542 int result = cur_shard % num_shards;
3543 cur_shard++;
3544 return result;
3545 }
3546
3547 void create_completion(const rgw_obj& obj,
3548 RGWModifyOp op, string& tag,
3549 rgw_bucket_entry_ver& ver,
3550 const cls_rgw_obj_key& key,
3551 rgw_bucket_dir_entry_meta& dir_meta,
3552 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3553 uint16_t bilog_op,
3554 rgw_zone_set *zones_trace,
3555 complete_op_data **result);
3556 bool handle_completion(completion_t cb, complete_op_data *arg);
3557
3558 int start() {
3559 completion_thread = new RGWIndexCompletionThread(store);
3560 int ret = completion_thread->init();
3561 if (ret < 0) {
3562 return ret;
3563 }
3564 completion_thread->start();
3565 return 0;
3566 }
3567 void stop() {
3568 if (completion_thread) {
3569 completion_thread->stop();
3570 delete completion_thread;
3571 }
3572
3573 for (int i = 0; i < num_shards; ++i) {
3574 Mutex::Locker l(*locks[i]);
3575 for (auto c : completions[i]) {
3576 Mutex::Locker cl(c->lock);
3577 c->stop();
3578 }
3579 }
3580 completions.clear();
3581 }
3582 };
3583
3584 static void obj_complete_cb(completion_t cb, void *arg)
3585 {
3586 complete_op_data *completion = (complete_op_data *)arg;
3587 completion->lock.Lock();
3588 if (completion->stopped) {
3589 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3590 delete completion;
3591 return;
3592 }
3593 bool need_delete = completion->manager->handle_completion(cb, completion);
3594 completion->lock.Unlock();
3595 if (need_delete) {
3596 delete completion;
3597 }
3598 }
3599
3600
3601 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3602 RGWModifyOp op, string& tag,
3603 rgw_bucket_entry_ver& ver,
3604 const cls_rgw_obj_key& key,
3605 rgw_bucket_dir_entry_meta& dir_meta,
3606 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3607 uint16_t bilog_op,
3608 rgw_zone_set *zones_trace,
3609 complete_op_data **result)
3610 {
3611 complete_op_data *entry = new complete_op_data;
3612
3613 int shard_id = next_shard();
3614
3615 entry->manager_shard_id = shard_id;
3616 entry->manager = this;
3617 entry->obj = obj;
3618 entry->op = op;
3619 entry->tag = tag;
3620 entry->ver = ver;
3621 entry->key = key;
3622 entry->dir_meta = dir_meta;
3623 entry->log_op = log_op;
3624 entry->bilog_op = bilog_op;
3625
3626 if (remove_objs) {
3627 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3628 entry->remove_objs.push_back(*iter);
3629 }
3630 }
3631
3632 if (zones_trace) {
3633 entry->zones_trace = *zones_trace;
3634 } else {
3635 entry->zones_trace.insert(store->get_zone().id);
3636 }
3637
3638 *result = entry;
3639
3640 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3641
3642 Mutex::Locker l(*locks[shard_id]);
3643 completions[shard_id].insert(entry);
3644 }
3645
3646 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3647 {
3648 int shard_id = arg->manager_shard_id;
3649 {
3650 Mutex::Locker l(*locks[shard_id]);
3651
3652 auto& comps = completions[shard_id];
3653
3654 auto iter = comps.find(arg);
3655 if (iter == comps.end()) {
3656 return true;
3657 }
3658
3659 comps.erase(iter);
3660 }
3661
3662 int r = rados_aio_get_return_value(cb);
3663 if (r != -ERR_BUSY_RESHARDING) {
3664 return true;
3665 }
3666 completion_thread->add_completion(arg);
3667 return false;
3668 }
3669
3670 void RGWRados::finalize()
3671 {
3672 if (run_sync_thread) {
3673 Mutex::Locker l(meta_sync_thread_lock);
3674 meta_sync_processor_thread->stop();
3675
3676 Mutex::Locker dl(data_sync_thread_lock);
3677 for (auto iter : data_sync_processor_threads) {
3678 RGWDataSyncProcessorThread *thread = iter.second;
3679 thread->stop();
3680 }
3681 if (sync_log_trimmer) {
3682 sync_log_trimmer->stop();
3683 }
3684 }
3685 if (async_rados) {
3686 async_rados->stop();
3687 }
3688 if (run_sync_thread) {
3689 delete meta_sync_processor_thread;
3690 meta_sync_processor_thread = NULL;
3691 Mutex::Locker dl(data_sync_thread_lock);
3692 for (auto iter : data_sync_processor_threads) {
3693 RGWDataSyncProcessorThread *thread = iter.second;
3694 delete thread;
3695 }
3696 data_sync_processor_threads.clear();
3697 delete sync_log_trimmer;
3698 sync_log_trimmer = nullptr;
3699 bucket_trim = boost::none;
3700 }
3701 if (finisher) {
3702 finisher->stop();
3703 }
3704 if (need_watch_notify()) {
3705 finalize_watch();
3706 }
3707 if (finisher) {
3708 /* delete finisher only after cleaning up watches, as watch error path might call
3709 * into finisher. We stop finisher before finalizing watch to make sure we don't
3710 * actually handle any racing work
3711 */
3712 delete finisher;
3713 }
3714 if (meta_notifier) {
3715 meta_notifier->stop();
3716 delete meta_notifier;
3717 }
3718 if (data_notifier) {
3719 data_notifier->stop();
3720 delete data_notifier;
3721 }
3722 delete data_log;
3723 if (async_rados) {
3724 delete async_rados;
3725 }
3726
3727 delete lc;
3728 lc = NULL;
3729
3730 delete gc;
3731 gc = NULL;
3732
3733 delete obj_expirer;
3734 obj_expirer = NULL;
3735
3736 delete rest_master_conn;
3737
3738 map<string, RGWRESTConn *>::iterator iter;
3739 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3740 RGWRESTConn *conn = iter->second;
3741 delete conn;
3742 }
3743
3744 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3745 RGWRESTConn *conn = iter->second;
3746 delete conn;
3747 }
3748 RGWQuotaHandler::free_handler(quota_handler);
3749 if (cr_registry) {
3750 cr_registry->put();
3751 }
3752 delete meta_mgr;
3753 delete binfo_cache;
3754 delete obj_tombstone_cache;
3755 delete sync_modules_manager;
3756
3757 if (reshard_wait.get()) {
3758 reshard_wait->stop();
3759 reshard_wait.reset();
3760 }
3761
3762 if (run_reshard_thread) {
3763 reshard->stop_processor();
3764 }
3765 delete reshard;
3766 delete index_completion_manager;
3767 }
3768
3769 /**
3770 * Initialize the RADOS instance and prepare to do other ops
3771 * Returns 0 on success, -ERR# on failure.
3772 */
3773 int RGWRados::init_rados()
3774 {
3775 int ret = 0;
3776 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3777
3778 for (auto& r : handles) {
3779 ret = r.init_with_context(cct);
3780 if (ret < 0) {
3781 return ret;
3782 }
3783 ret = r.connect();
3784 if (ret < 0) {
3785 return ret;
3786 }
3787 }
3788
3789 sync_modules_manager = new RGWSyncModulesManager();
3790
3791 rgw_register_sync_modules(sync_modules_manager);
3792
3793 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3794 new RGWCoroutinesManagerRegistry(cct)};
3795 ret = crs->hook_to_admin_command("cr dump");
3796 if (ret < 0) {
3797 return ret;
3798 }
3799
3800 meta_mgr = new RGWMetadataManager(cct, this);
3801 data_log = new RGWDataChangesLog(cct, this);
3802 cr_registry = crs.release();
3803
3804 std::swap(handles, rados);
3805 return ret;
3806 }
3807
3808
3809 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3810 {
3811 map<string,string> metadata = meta;
3812 metadata["num_handles"] = stringify(rados.size());
3813 metadata["zonegroup_id"] = zonegroup.get_id();
3814 metadata["zonegroup_name"] = zonegroup.get_name();
3815 metadata["zone_name"] = zone_name();
3816 metadata["zone_id"] = zone_id();;
3817 string name = cct->_conf->name.get_id();
3818 if (name.find("rgw.") == 0) {
3819 name = name.substr(4);
3820 }
3821 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3822 if (ret < 0) {
3823 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3824 return ret;
3825 }
3826
3827 return 0;
3828 }
3829
3830 /**
3831 * Add new connection to connections map
3832 * @param zonegroup_conn_map map which new connection will be added to
3833 * @param zonegroup zonegroup which new connection will connect to
3834 * @param new_connection pointer to new connection instance
3835 */
3836 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3837 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3838 {
3839 // Delete if connection is already exists
3840 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3841 if (iterZoneGroup != zonegroup_conn_map.end()) {
3842 delete iterZoneGroup->second;
3843 }
3844
3845 // Add new connection to connections map
3846 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3847 }
3848
3849 int RGWRados::convert_regionmap()
3850 {
3851 RGWZoneGroupMap zonegroupmap;
3852
3853 string pool_name = cct->_conf->rgw_zone_root_pool;
3854 if (pool_name.empty()) {
3855 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3856 }
3857 string oid = region_map_oid;
3858
3859 rgw_pool pool(pool_name);
3860 bufferlist bl;
3861 RGWObjectCtx obj_ctx(this);
3862 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3863 if (ret < 0 && ret != -ENOENT) {
3864 return ret;
3865 } else if (ret == -ENOENT) {
3866 return 0;
3867 }
3868
3869 try {
3870 bufferlist::iterator iter = bl.begin();
3871 ::decode(zonegroupmap, iter);
3872 } catch (buffer::error& err) {
3873 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3874 return -EIO;
3875 }
3876
3877 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3878 iter != zonegroupmap.zonegroups.end(); ++iter) {
3879 RGWZoneGroup& zonegroup = iter->second;
3880 ret = zonegroup.init(cct, this, false);
3881 ret = zonegroup.update();
3882 if (ret < 0 && ret != -ENOENT) {
3883 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3884 cpp_strerror(-ret) << dendl;
3885 return ret;
3886 } else if (ret == -ENOENT) {
3887 ret = zonegroup.create();
3888 if (ret < 0) {
3889 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3890 cpp_strerror(-ret) << dendl;
3891 return ret;
3892 }
3893 }
3894 }
3895
3896 current_period.set_user_quota(zonegroupmap.user_quota);
3897 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3898
3899 // remove the region_map so we don't try to convert again
3900 rgw_raw_obj obj(pool, oid);
3901 ret = delete_system_obj(obj);
3902 if (ret < 0) {
3903 ldout(cct, 0) << "Error could not remove " << obj
3904 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3905 return ret;
3906 }
3907
3908 return 0;
3909 }
3910
3911 /**
3912 * Replace all region configuration with zonegroup for
3913 * backward compatability
3914 * Returns 0 on success, -ERR# on failure.
3915 */
3916 int RGWRados::replace_region_with_zonegroup()
3917 {
3918 /* copy default region */
3919 /* convert default region to default zonegroup */
3920 string default_oid = cct->_conf->rgw_default_region_info_oid;
3921 if (default_oid.empty()) {
3922 default_oid = default_region_info_oid;
3923 }
3924
3925
3926 RGWZoneGroup default_zonegroup;
3927 rgw_pool pool{default_zonegroup.get_pool(cct)};
3928 string oid = "converted";
3929 bufferlist bl;
3930 RGWObjectCtx obj_ctx(this);
3931
3932 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3933 if (ret < 0 && ret != -ENOENT) {
3934 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3935 << dendl;
3936 return ret;
3937 } else if (ret != -ENOENT) {
3938 ldout(cct, 20) << "System already converted " << dendl;
3939 return 0;
3940 }
3941
3942 string default_region;
3943 ret = default_zonegroup.init(cct, this, false, true);
3944 if (ret < 0) {
3945 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3946 return ret;
3947 }
3948 ret = default_zonegroup.read_default_id(default_region, true);
3949 if (ret < 0 && ret != -ENOENT) {
3950 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3951 return ret;
3952 }
3953
3954 /* convert regions to zonegroups */
3955 list<string> regions;
3956 ret = list_regions(regions);
3957 if (ret < 0 && ret != -ENOENT) {
3958 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3959 return ret;
3960 } else if (ret == -ENOENT || regions.empty()) {
3961 RGWZoneParams zoneparams(default_zone_name);
3962 int ret = zoneparams.init(cct, this);
3963 if (ret < 0 && ret != -ENOENT) {
3964 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
3965 return ret;
3966 }
3967 /* update master zone */
3968 RGWZoneGroup default_zg(default_zonegroup_name);
3969 ret = default_zg.init(cct, this);
3970 if (ret < 0 && ret != -ENOENT) {
3971 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
3972 return ret;
3973 }
3974 if (ret != -ENOENT && default_zg.master_zone.empty()) {
3975 default_zg.master_zone = zoneparams.get_id();
3976 return default_zg.update();
3977 }
3978 return 0;
3979 }
3980
3981 string master_region, master_zone;
3982 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
3983 if (*iter != default_zonegroup_name){
3984 RGWZoneGroup region(*iter);
3985 int ret = region.init(cct, this, true, true);
3986 if (ret < 0) {
3987 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
3988 return ret;
3989 }
3990 if (region.is_master_zonegroup()) {
3991 master_region = region.get_id();
3992 master_zone = region.master_zone;
3993 }
3994 }
3995 }
3996
3997 /* create realm if there is none.
3998 The realm name will be the region and zone concatenated
3999 realm id will be mds of its name */
4000 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
4001 string new_realm_name = master_region + "." + master_zone;
4002 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
4003 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
4004 MD5 hash;
4005 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
4006 hash.Final(md5);
4007 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
4008 string new_realm_id(md5_str);
4009 RGWRealm new_realm(new_realm_id,new_realm_name);
4010 ret = new_realm.init(cct, this, false);
4011 if (ret < 0) {
4012 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4013 return ret;
4014 }
4015 ret = new_realm.create();
4016 if (ret < 0 && ret != -EEXIST) {
4017 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4018 return ret;
4019 }
4020 ret = new_realm.set_as_default();
4021 if (ret < 0) {
4022 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4023 return ret;
4024 }
4025 ret = realm.init(cct, this);
4026 if (ret < 0) {
4027 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4028 return ret;
4029 }
4030 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4031 if (ret < 0) {
4032 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4033 return ret;
4034 }
4035 }
4036
4037 list<string>::iterator iter;
4038 /* create zonegroups */
4039 for (iter = regions.begin(); iter != regions.end(); ++iter)
4040 {
4041 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4042 /* check to see if we don't have already a zonegroup with this name */
4043 RGWZoneGroup new_zonegroup(*iter);
4044 ret = new_zonegroup.init(cct , this);
4045 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4046 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4047 " skipping conversion " << dendl;
4048 continue;
4049 }
4050 RGWZoneGroup zonegroup(*iter);
4051 zonegroup.set_id(*iter);
4052 int ret = zonegroup.init(cct, this, true, true);
4053 if (ret < 0) {
4054 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4055 return ret;
4056 }
4057 zonegroup.realm_id = realm.get_id();
4058 /* fix default region master zone */
4059 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4060 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4061 zonegroup.master_zone = default_zone_name;
4062 }
4063 ret = zonegroup.update();
4064 if (ret < 0 && ret != -EEXIST) {
4065 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4066 << dendl;
4067 return ret;
4068 }
4069 ret = zonegroup.update_name();
4070 if (ret < 0 && ret != -EEXIST) {
4071 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4072 << dendl;
4073 return ret;
4074 }
4075 if (zonegroup.get_name() == default_region) {
4076 ret = zonegroup.set_as_default();
4077 if (ret < 0) {
4078 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4079 << dendl;
4080 return ret;
4081 }
4082 }
4083 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4084 ++iter) {
4085 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4086 RGWZoneParams zoneparams(iter->first, iter->first);
4087 zoneparams.set_id(iter->first);
4088 zoneparams.realm_id = realm.get_id();
4089 ret = zoneparams.init(cct, this);
4090 if (ret < 0 && ret != -ENOENT) {
4091 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4092 return ret;
4093 } else if (ret == -ENOENT) {
4094 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4095 continue;
4096 }
4097 zonegroup.realm_id = realm.get_id();
4098 ret = zoneparams.update();
4099 if (ret < 0 && ret != -EEXIST) {
4100 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4101 return ret;
4102 }
4103 ret = zoneparams.update_name();
4104 if (ret < 0 && ret != -EEXIST) {
4105 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4106 return ret;
4107 }
4108 }
4109
4110 if (!current_period.get_id().empty()) {
4111 ret = current_period.add_zonegroup(zonegroup);
4112 if (ret < 0) {
4113 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4114 return ret;
4115 }
4116 }
4117 }
4118
4119 if (!current_period.get_id().empty()) {
4120 ret = current_period.update();
4121 if (ret < 0) {
4122 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4123 return ret;
4124 }
4125 ret = current_period.store_info(false);
4126 if (ret < 0) {
4127 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4128 return ret;
4129 }
4130 ret = current_period.reflect();
4131 if (ret < 0) {
4132 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4133 return ret;
4134 }
4135 }
4136
4137 for (auto const& iter : regions) {
4138 RGWZoneGroup zonegroup(iter);
4139 int ret = zonegroup.init(cct, this, true, true);
4140 if (ret < 0) {
4141 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4142 return ret;
4143 }
4144 ret = zonegroup.delete_obj(true);
4145 if (ret < 0 && ret != -ENOENT) {
4146 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4147 << dendl;
4148 return ret;
4149 }
4150 }
4151
4152 /* mark as converted */
4153 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4154 true, NULL, real_time(), NULL);
4155 if (ret < 0 ) {
4156 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4157 << dendl;
4158 return ret;
4159 }
4160
4161 return 0;
4162 }
4163
4164 int RGWRados::init_zg_from_period(bool *initialized)
4165 {
4166 *initialized = false;
4167
4168 if (current_period.get_id().empty()) {
4169 return 0;
4170 }
4171
4172 int ret = zonegroup.init(cct, this);
4173 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4174 if (ret == -ENOENT) {
4175 return 0;
4176 }
4177 if (ret < 0) {
4178 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4179 return ret;
4180 }
4181 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4182
4183 map<string, RGWZoneGroup>::const_iterator iter =
4184 current_period.get_map().zonegroups.find(zonegroup.get_id());
4185
4186 if (iter != current_period.get_map().zonegroups.end()) {
4187 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4188 zonegroup = iter->second;
4189 ret = zonegroup.init(cct, this, false);
4190 if (ret < 0) {
4191 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4192 return ret;
4193 }
4194 ret = zone_params.init(cct, this);
4195 if (ret < 0 && ret != -ENOENT) {
4196 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4197 return ret;
4198 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4199 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4200 zone_params.set_name(default_zone_name);
4201 ret = zone_params.init(cct, this);
4202 if (ret < 0 && ret != -ENOENT) {
4203 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4204 return ret;
4205 }
4206 }
4207 }
4208 for (iter = current_period.get_map().zonegroups.begin();
4209 iter != current_period.get_map().zonegroups.end(); ++iter){
4210 const RGWZoneGroup& zg = iter->second;
4211 // use endpoints from the zonegroup's master zone
4212 auto master = zg.zones.find(zg.master_zone);
4213 if (master == zg.zones.end()) {
4214 // fix missing master zone for a single zone zonegroup
4215 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4216 master = zg.zones.begin();
4217 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4218 master->second.name << " id:" << master->second.id << " as master" << dendl;
4219 if (zonegroup.get_id() == zg.get_id()) {
4220 zonegroup.master_zone = master->second.id;
4221 ret = zonegroup.update();
4222 if (ret < 0) {
4223 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4224 return ret;
4225 }
4226 } else {
4227 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4228 ret = fixed_zg.init(cct, this);
4229 if (ret < 0) {
4230 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4231 return ret;
4232 }
4233 fixed_zg.master_zone = master->second.id;
4234 ret = fixed_zg.update();
4235 if (ret < 0) {
4236 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4237 return ret;
4238 }
4239 }
4240 } else {
4241 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4242 zg.master_zone << dendl;
4243 return -EINVAL;
4244 }
4245 }
4246 const auto& endpoints = master->second.endpoints;
4247 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4248 if (!current_period.get_master_zonegroup().empty() &&
4249 zg.get_id() == current_period.get_master_zonegroup()) {
4250 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4251 }
4252 }
4253
4254 *initialized = true;
4255
4256 return 0;
4257 }
4258
4259 int RGWRados::init_zg_from_local(bool *creating_defaults)
4260 {
4261 int ret = zonegroup.init(cct, this);
4262 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4263 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4264 return ret;
4265 } else if (ret == -ENOENT) {
4266 *creating_defaults = true;
4267 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4268 ret = zonegroup.create_default();
4269 if (ret < 0) {
4270 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4271 << dendl;
4272 return ret;
4273 }
4274 ret = zonegroup.init(cct, this);
4275 if (ret < 0) {
4276 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4277 << dendl;
4278 return ret;
4279 }
4280 }
4281 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4282 if (zonegroup.is_master_zonegroup()) {
4283 // use endpoints from the zonegroup's master zone
4284 auto master = zonegroup.zones.find(zonegroup.master_zone);
4285 if (master == zonegroup.zones.end()) {
4286 // fix missing master zone for a single zone zonegroup
4287 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4288 master = zonegroup.zones.begin();
4289 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4290 master->second.name << " id:" << master->second.id << " as master" << dendl;
4291 zonegroup.master_zone = master->second.id;
4292 ret = zonegroup.update();
4293 if (ret < 0) {
4294 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4295 return ret;
4296 }
4297 } else {
4298 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4299 "master_zone=" << zonegroup.master_zone << dendl;
4300 return -EINVAL;
4301 }
4302 }
4303 const auto& endpoints = master->second.endpoints;
4304 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4305 }
4306
4307 return 0;
4308 }
4309
4310
4311 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4312 {
4313 return target_zone.syncs_from(source_zone.name) &&
4314 sync_modules_manager->supports_data_export(source_zone.tier_type);
4315 }
4316
4317 /**
4318 * Initialize the RADOS instance and prepare to do other ops
4319 * Returns 0 on success, -ERR# on failure.
4320 */
4321 int RGWRados::init_complete()
4322 {
4323 int ret = realm.init(cct, this);
4324 if (ret < 0 && ret != -ENOENT) {
4325 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4326 return ret;
4327 } else if (ret != -ENOENT) {
4328 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4329 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4330 if (ret < 0 && ret != -ENOENT) {
4331 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4332 return ret;
4333 }
4334 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4335 }
4336
4337 ret = replace_region_with_zonegroup();
4338 if (ret < 0) {
4339 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4340 return ret;
4341 }
4342
4343 ret = convert_regionmap();
4344 if (ret < 0) {
4345 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4346 return ret;
4347 }
4348
4349 bool zg_initialized = false;
4350
4351 if (!current_period.get_id().empty()) {
4352 ret = init_zg_from_period(&zg_initialized);
4353 if (ret < 0) {
4354 return ret;
4355 }
4356 }
4357
4358 bool creating_defaults = false;
4359 bool using_local = (!zg_initialized);
4360 if (using_local) {
4361 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4362 ret = init_zg_from_local(&creating_defaults);
4363 if (ret < 0) {
4364 return ret;
4365 }
4366 // read period_config into current_period
4367 auto& period_config = current_period.get_config();
4368 ret = period_config.read(this, zonegroup.realm_id);
4369 if (ret < 0 && ret != -ENOENT) {
4370 ldout(cct, 0) << "ERROR: failed to read period config: "
4371 << cpp_strerror(ret) << dendl;
4372 return ret;
4373 }
4374 }
4375
4376 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4377 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4378 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4379 zone_params.set_name(default_zone_name);
4380 }
4381
4382 ret = zone_params.init(cct, this);
4383 if (ret < 0 && ret != -ENOENT) {
4384 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4385 return ret;
4386 }
4387 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4388 if (zone_iter == get_zonegroup().zones.end()) {
4389 if (using_local) {
4390 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4391 return -EINVAL;
4392 }
4393 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4394 ret = init_zg_from_local(&creating_defaults);
4395 if (ret < 0) {
4396 return ret;
4397 }
4398 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4399 }
4400 if (zone_iter != get_zonegroup().zones.end()) {
4401 zone_public_config = zone_iter->second;
4402 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4403 } else {
4404 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4405 return -EINVAL;
4406 }
4407
4408 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4409
4410 if (run_sync_thread) {
4411 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4412 if (ret < 0) {
4413 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4414 return ret;
4415 }
4416 }
4417
4418 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4419
4420 init_unique_trans_id_deps();
4421
4422 finisher = new Finisher(cct);
4423 finisher->start();
4424
4425 period_puller.reset(new RGWPeriodPuller(this));
4426 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4427 current_period));
4428
4429 if (need_watch_notify()) {
4430 ret = init_watch();
4431 if (ret < 0) {
4432 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4433 return ret;
4434 }
4435 }
4436
4437 /* first build all zones index */
4438 for (auto ziter : get_zonegroup().zones) {
4439 const string& id = ziter.first;
4440 RGWZone& z = ziter.second;
4441 zone_id_by_name[z.name] = id;
4442 zone_by_id[id] = z;
4443 }
4444
4445 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4446 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4447 }
4448 zone_public_config = zone_by_id[zone_id()];
4449 for (auto ziter : get_zonegroup().zones) {
4450 const string& id = ziter.first;
4451 RGWZone& z = ziter.second;
4452 if (id == zone_id()) {
4453 continue;
4454 }
4455 if (z.endpoints.empty()) {
4456 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4457 continue;
4458 }
4459 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4460 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4461 zone_conn_map[id] = conn;
4462 if (zone_syncs_from(zone_public_config, z) ||
4463 zone_syncs_from(z, zone_public_config)) {
4464 if (zone_syncs_from(zone_public_config, z)) {
4465 zone_data_sync_from_map[id] = conn;
4466 }
4467 if (zone_syncs_from(z, zone_public_config)) {
4468 zone_data_notify_to_map[id] = conn;
4469 }
4470 } else {
4471 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4472 }
4473 }
4474
4475 ret = open_root_pool_ctx();
4476 if (ret < 0)
4477 return ret;
4478
4479 ret = open_gc_pool_ctx();
4480 if (ret < 0)
4481 return ret;
4482
4483 ret = open_lc_pool_ctx();
4484 if (ret < 0)
4485 return ret;
4486
4487 ret = open_objexp_pool_ctx();
4488 if (ret < 0)
4489 return ret;
4490
4491 ret = open_reshard_pool_ctx();
4492 if (ret < 0)
4493 return ret;
4494
4495 pools_initialized = true;
4496
4497 gc = new RGWGC();
4498 gc->initialize(cct, this);
4499
4500 obj_expirer = new RGWObjectExpirer(this);
4501
4502 if (use_gc_thread) {
4503 gc->start_processor();
4504 obj_expirer->start_processor();
4505 }
4506
4507 /* no point of running sync thread if we don't have a master zone configured
4508 or there is no rest_master_conn */
4509 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4510 || current_period.get_id().empty()) {
4511 run_sync_thread = false;
4512 }
4513
4514 if (run_sync_thread) {
4515 // initialize the log period history
4516 meta_mgr->init_oldest_log_period();
4517 }
4518
4519 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4520 async_rados->start();
4521
4522 ret = meta_mgr->init(current_period.get_id());
4523 if (ret < 0) {
4524 lderr(cct) << "ERROR: failed to initialize metadata log: "
4525 << cpp_strerror(-ret) << dendl;
4526 return ret;
4527 }
4528
4529 if (is_meta_master()) {
4530 auto md_log = meta_mgr->get_log(current_period.get_id());
4531 meta_notifier = new RGWMetaNotifier(this, md_log);
4532 meta_notifier->start();
4533 }
4534
4535 if (run_sync_thread) {
4536 Mutex::Locker l(meta_sync_thread_lock);
4537 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4538 ret = meta_sync_processor_thread->init();
4539 if (ret < 0) {
4540 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4541 return ret;
4542 }
4543 meta_sync_processor_thread->start();
4544
4545 // configure the bucket trim manager
4546 rgw::BucketTrimConfig config;
4547 rgw::configure_bucket_trim(cct, config);
4548
4549 bucket_trim.emplace(this, config);
4550 ret = bucket_trim->init();
4551 if (ret < 0) {
4552 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
4553 return ret;
4554 }
4555
4556 Mutex::Locker dl(data_sync_thread_lock);
4557 for (auto iter : zone_data_sync_from_map) {
4558 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4559 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first,
4560 &*bucket_trim);
4561 ret = thread->init();
4562 if (ret < 0) {
4563 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4564 return ret;
4565 }
4566 thread->start();
4567 data_sync_processor_threads[iter.first] = thread;
4568 }
4569 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4570 if (interval > 0) {
4571 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
4572 ret = sync_log_trimmer->init();
4573 if (ret < 0) {
4574 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4575 return ret;
4576 }
4577 sync_log_trimmer->start();
4578 }
4579 }
4580 data_notifier = new RGWDataNotifier(this);
4581 data_notifier->start();
4582
4583 lc = new RGWLC();
4584 lc->initialize(cct, this);
4585
4586 if (use_lc_thread)
4587 lc->start_processor();
4588
4589 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4590
4591 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4592 get_zone().bucket_index_max_shards);
4593 if (bucket_index_max_shards > get_max_bucket_shards()) {
4594 bucket_index_max_shards = get_max_bucket_shards();
4595 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4596 << get_max_bucket_shards() << dendl;
4597 }
4598 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4599
4600 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4601 binfo_cache->init(this);
4602
4603 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4604
4605 if (need_tombstone_cache) {
4606 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4607 }
4608
4609 reshard_wait = std::make_shared<RGWReshardWait>(this);
4610
4611 reshard = new RGWReshard(this);
4612
4613 /* only the master zone in the zonegroup reshards buckets */
4614 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4615 if (run_reshard_thread) {
4616 reshard->start_processor();
4617 }
4618
4619 index_completion_manager = new RGWIndexCompletionManager(this);
4620 ret = index_completion_manager->start();
4621
4622 return ret;
4623 }
4624
4625 /**
4626 * Initialize the RADOS instance and prepare to do other ops
4627 * Returns 0 on success, -ERR# on failure.
4628 */
4629 int RGWRados::initialize()
4630 {
4631 int ret;
4632
4633 ret = init_rados();
4634 if (ret < 0)
4635 return ret;
4636
4637 return init_complete();
4638 }
4639
4640 void RGWRados::finalize_watch()
4641 {
4642 for (int i = 0; i < num_watchers; i++) {
4643 RGWWatcher *watcher = watchers[i];
4644 watcher->unregister_watch();
4645 delete watcher;
4646 }
4647
4648 delete[] notify_oids;
4649 delete[] watchers;
4650 }
4651
4652 void RGWRados::schedule_context(Context *c) {
4653 finisher->queue(c);
4654 }
4655
4656 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4657 {
4658 bool is_truncated;
4659 RGWListRawObjsCtx ctx;
4660 do {
4661 list<string> oids;
4662 int r = list_raw_objects(pool, prefix, 1000,
4663 ctx, oids, &is_truncated);
4664 if (r < 0) {
4665 return r;
4666 }
4667 list<string>::iterator iter;
4668 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4669 string& val = *iter;
4670 if (val.size() > prefix.size())
4671 result.push_back(val.substr(prefix.size()));
4672 }
4673 } while (is_truncated);
4674
4675 return 0;
4676 }
4677
4678 int RGWRados::list_regions(list<string>& regions)
4679 {
4680 RGWZoneGroup zonegroup;
4681
4682 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4683 }
4684
4685 int RGWRados::list_zonegroups(list<string>& zonegroups)
4686 {
4687 RGWZoneGroup zonegroup;
4688
4689 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4690 }
4691
4692 int RGWRados::list_zones(list<string>& zones)
4693 {
4694 RGWZoneParams zoneparams;
4695
4696 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4697 }
4698
4699 int RGWRados::list_realms(list<string>& realms)
4700 {
4701 RGWRealm realm(cct, this);
4702 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4703 }
4704
4705 int RGWRados::list_periods(list<string>& periods)
4706 {
4707 RGWPeriod period;
4708 list<string> raw_periods;
4709 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4710 if (ret < 0) {
4711 return ret;
4712 }
4713 for (const auto& oid : raw_periods) {
4714 size_t pos = oid.find(".");
4715 if (pos != std::string::npos) {
4716 periods.push_back(oid.substr(0, pos));
4717 } else {
4718 periods.push_back(oid);
4719 }
4720 }
4721 periods.sort(); // unique() only detects duplicates if they're adjacent
4722 periods.unique();
4723 return 0;
4724 }
4725
4726
4727 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4728 {
4729 int ret = 0;
4730 string period_id = current_period;
4731 while(!period_id.empty()) {
4732 RGWPeriod period(period_id);
4733 ret = period.init(cct, this);
4734 if (ret < 0) {
4735 return ret;
4736 }
4737 periods.push_back(period.get_id());
4738 period_id = period.get_predecessor();
4739 }
4740
4741 return ret;
4742 }
4743
4744 /**
4745 * Open the pool used as root for this gateway
4746 * Returns: 0 on success, -ERR# otherwise.
4747 */
4748 int RGWRados::open_root_pool_ctx()
4749 {
4750 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4751 }
4752
4753 int RGWRados::open_gc_pool_ctx()
4754 {
4755 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4756 }
4757
4758 int RGWRados::open_lc_pool_ctx()
4759 {
4760 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4761 }
4762
4763 int RGWRados::open_objexp_pool_ctx()
4764 {
4765 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4766 }
4767
4768 int RGWRados::open_reshard_pool_ctx()
4769 {
4770 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4771 }
4772
4773 int RGWRados::init_watch()
4774 {
4775 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4776 if (r < 0) {
4777 return r;
4778 }
4779
4780 num_watchers = cct->_conf->rgw_num_control_oids;
4781
4782 bool compat_oid = (num_watchers == 0);
4783
4784 if (num_watchers <= 0)
4785 num_watchers = 1;
4786
4787 notify_oids = new string[num_watchers];
4788 watchers = new RGWWatcher *[num_watchers];
4789
4790 for (int i=0; i < num_watchers; i++) {
4791 string& notify_oid = notify_oids[i];
4792 notify_oid = notify_oid_prefix;
4793 if (!compat_oid) {
4794 char buf[16];
4795 snprintf(buf, sizeof(buf), ".%d", i);
4796 notify_oid.append(buf);
4797 }
4798 r = control_pool_ctx.create(notify_oid, false);
4799 if (r < 0 && r != -EEXIST)
4800 return r;
4801
4802 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4803 watchers[i] = watcher;
4804
4805 r = watcher->register_watch();
4806 if (r < 0)
4807 return r;
4808 }
4809
4810 watch_initialized = true;
4811
4812 set_cache_enabled(true);
4813
4814 return 0;
4815 }
4816
4817 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4818 {
4819 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4820
4821 int i = r % num_watchers;
4822 char buf[16];
4823 snprintf(buf, sizeof(buf), ".%d", i);
4824
4825 notify_oid = notify_oid_prefix;
4826 notify_oid.append(buf);
4827 }
4828
4829 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4830 {
4831 librados::Rados *rad = get_rados_handle();
4832 int r = rgw_init_ioctx(rad, pool, io_ctx);
4833 if (r != -ENOENT)
4834 return r;
4835
4836 if (!pools_initialized)
4837 return r;
4838
4839 r = rad->pool_create(pool.name.c_str());
4840 if (r < 0 && r != -EEXIST)
4841 return r;
4842
4843 r = rgw_init_ioctx(rad, pool, io_ctx);
4844 if (r < 0)
4845 return r;
4846
4847 r = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
4848 if (r < 0 && r != -EOPNOTSUPP)
4849 return r;
4850 return 0;
4851 }
4852
4853 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4854 string *marker) {
4855 if (marker) {
4856 *marker = shard_id_str;
4857 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4858 marker->append(shard_marker);
4859 }
4860 }
4861
4862 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4863 {
4864 const string *rule = &bucket_info.placement_rule;
4865 if (rule->empty()) {
4866 rule = &zonegroup.default_placement;
4867 }
4868 auto iter = zone_params.placement_pools.find(*rule);
4869 if (iter == zone_params.placement_pools.end()) {
4870 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4871 return -EINVAL;
4872 }
4873
4874 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4875 if (r < 0)
4876 return r;
4877
4878 return 0;
4879 }
4880
4881 /**
4882 * set up a bucket listing.
4883 * handle is filled in.
4884 * Returns 0 on success, -ERR# otherwise.
4885 */
4886 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4887 {
4888 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4889 *handle = (RGWAccessHandle)state;
4890 return 0;
4891 }
4892
4893 /**
4894 * get the next bucket in the listing.
4895 * obj is filled in,
4896 * handle is updated.
4897 * returns 0 on success, -ERR# otherwise.
4898 */
4899 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4900 {
4901 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4902
4903 do {
4904 if (*state == root_pool_ctx.nobjects_end()) {
4905 delete state;
4906 return -ENOENT;
4907 }
4908
4909 obj.key.name = (*state)->get_oid();
4910 if (obj.key.name[0] == '_') {
4911 obj.key.name = obj.key.name.substr(1);
4912 }
4913
4914 (*state)++;
4915 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4916
4917 return 0;
4918 }
4919
4920
4921 /**** logs ****/
4922
4923 struct log_list_state {
4924 string prefix;
4925 librados::IoCtx io_ctx;
4926 librados::NObjectIterator obit;
4927 };
4928
4929 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4930 {
4931 log_list_state *state = new log_list_state;
4932 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4933 if (r < 0) {
4934 delete state;
4935 return r;
4936 }
4937 state->prefix = prefix;
4938 state->obit = state->io_ctx.nobjects_begin();
4939 *handle = (RGWAccessHandle)state;
4940 return 0;
4941 }
4942
4943 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4944 {
4945 log_list_state *state = static_cast<log_list_state *>(handle);
4946 while (true) {
4947 if (state->obit == state->io_ctx.nobjects_end()) {
4948 delete state;
4949 return -ENOENT;
4950 }
4951 if (state->prefix.length() &&
4952 state->obit->get_oid().find(state->prefix) != 0) {
4953 state->obit++;
4954 continue;
4955 }
4956 *name = state->obit->get_oid();
4957 state->obit++;
4958 break;
4959 }
4960 return 0;
4961 }
4962
4963 int RGWRados::log_remove(const string& name)
4964 {
4965 librados::IoCtx io_ctx;
4966 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4967 if (r < 0)
4968 return r;
4969 return io_ctx.remove(name);
4970 }
4971
4972 struct log_show_state {
4973 librados::IoCtx io_ctx;
4974 bufferlist bl;
4975 bufferlist::iterator p;
4976 string name;
4977 uint64_t pos;
4978 bool eof;
4979 log_show_state() : pos(0), eof(false) {}
4980 };
4981
4982 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
4983 {
4984 log_show_state *state = new log_show_state;
4985 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4986 if (r < 0) {
4987 delete state;
4988 return r;
4989 }
4990 state->name = name;
4991 *handle = (RGWAccessHandle)state;
4992 return 0;
4993 }
4994
4995 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
4996 {
4997 log_show_state *state = static_cast<log_show_state *>(handle);
4998 off_t off = state->p.get_off();
4999
5000 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
5001 << " off " << off
5002 << " eof " << (int)state->eof
5003 << dendl;
5004 // read some?
5005 unsigned chunk = 1024*1024;
5006 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
5007 bufferlist more;
5008 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
5009 if (r < 0)
5010 return r;
5011 state->pos += r;
5012 bufferlist old;
5013 try {
5014 old.substr_of(state->bl, off, state->bl.length() - off);
5015 } catch (buffer::error& err) {
5016 return -EINVAL;
5017 }
5018 state->bl.clear();
5019 state->bl.claim(old);
5020 state->bl.claim_append(more);
5021 state->p = state->bl.begin();
5022 if ((unsigned)r < chunk)
5023 state->eof = true;
5024 ldout(cct, 10) << " read " << r << dendl;
5025 }
5026
5027 if (state->p.end())
5028 return 0; // end of file
5029 try {
5030 ::decode(*entry, state->p);
5031 }
5032 catch (const buffer::error &e) {
5033 return -EINVAL;
5034 }
5035 return 1;
5036 }
5037
5038 /**
5039 * usage_log_hash: get usage log key hash, based on name and index
5040 *
5041 * Get the usage object name. Since a user may have more than 1
5042 * object holding that info (multiple shards), we use index to
5043 * specify that shard number. Once index exceeds max shards it
5044 * wraps.
5045 * If name is not being set, results for all users will be returned
5046 * and index will wrap only after total shards number.
5047 *
5048 * @param cct [in] ceph context
5049 * @param name [in] user name
5050 * @param hash [out] hash value
5051 * @param index [in] shard index number
5052 */
5053 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5054 {
5055 uint32_t val = index;
5056
5057 if (!name.empty()) {
5058 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
5059 val %= max_user_shards;
5060 val += ceph_str_hash_linux(name.c_str(), name.size());
5061 }
5062 char buf[17];
5063 int max_shards = cct->_conf->rgw_usage_max_shards;
5064 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5065 hash = buf;
5066 }
5067
5068 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5069 {
5070 uint32_t index = 0;
5071
5072 map<string, rgw_usage_log_info> log_objs;
5073
5074 string hash;
5075 string last_user;
5076
5077 /* restructure usage map, zone by object hash */
5078 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5079 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5080 const rgw_user_bucket& ub = iter->first;
5081 RGWUsageBatch& info = iter->second;
5082
5083 if (ub.user.empty()) {
5084 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5085 continue;
5086 }
5087
5088 if (ub.user != last_user) {
5089 /* index *should* be random, but why waste extra cycles
5090 in most cases max user shards is not going to exceed 1,
5091 so just incrementing it */
5092 usage_log_hash(cct, ub.user, hash, index++);
5093 }
5094 last_user = ub.user;
5095 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5096
5097 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5098 v.push_back(miter->second);
5099 }
5100 }
5101
5102 map<string, rgw_usage_log_info>::iterator liter;
5103
5104 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5105 int r = cls_obj_usage_log_add(liter->first, liter->second);
5106 if (r < 0)
5107 return r;
5108 }
5109 return 0;
5110 }
5111
5112 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5113 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5114 {
5115 uint32_t num = max_entries;
5116 string hash, first_hash;
5117 string user_str = user.to_str();
5118 usage_log_hash(cct, user_str, first_hash, 0);
5119
5120 if (usage_iter.index) {
5121 usage_log_hash(cct, user_str, hash, usage_iter.index);
5122 } else {
5123 hash = first_hash;
5124 }
5125
5126 usage.clear();
5127
5128 do {
5129 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5130 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5131
5132 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5133 usage_iter.read_iter, ret_usage, is_truncated);
5134 if (ret == -ENOENT)
5135 goto next;
5136
5137 if (ret < 0)
5138 return ret;
5139
5140 num -= ret_usage.size();
5141
5142 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5143 usage[iter->first].aggregate(iter->second);
5144 }
5145
5146 next:
5147 if (!*is_truncated) {
5148 usage_iter.read_iter.clear();
5149 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5150 }
5151 } while (num && !*is_truncated && hash != first_hash);
5152 return 0;
5153 }
5154
5155 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5156 {
5157 uint32_t index = 0;
5158 string hash, first_hash;
5159 string user_str = user.to_str();
5160 usage_log_hash(cct, user_str, first_hash, index);
5161
5162 hash = first_hash;
5163 do {
5164 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5165
5166 if (ret < 0 && ret != -ENOENT)
5167 return ret;
5168
5169 usage_log_hash(cct, user_str, hash, ++index);
5170 } while (hash != first_hash);
5171
5172 return 0;
5173 }
5174
5175 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5176 {
5177 return rgw_shards_hash(key, max_shards);
5178 }
5179
5180 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5181 {
5182 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5183 char buf[16];
5184 if (shard_id) {
5185 *shard_id = val % max_shards;
5186 }
5187 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5188 name = prefix + buf;
5189 }
5190
5191 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5192 {
5193 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5194 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5195 char buf[16];
5196 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5197 name = prefix + buf;
5198 }
5199
5200 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5201 {
5202 char buf[16];
5203 snprintf(buf, sizeof(buf), "%u", shard_id);
5204 name = prefix + buf;
5205
5206 }
5207
5208 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5209 {
5210 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5211 }
5212
5213 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5214 {
5215 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5216
5217 }
5218
5219 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5220 {
5221 librados::IoCtx io_ctx;
5222
5223 int r = time_log_add_init(io_ctx);
5224 if (r < 0) {
5225 return r;
5226 }
5227
5228 ObjectWriteOperation op;
5229 utime_t t(ut);
5230 cls_log_add(op, t, section, key, bl);
5231
5232 return io_ctx.operate(oid, &op);
5233 }
5234
5235 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5236 librados::AioCompletion *completion, bool monotonic_inc)
5237 {
5238 librados::IoCtx io_ctx;
5239
5240 int r = time_log_add_init(io_ctx);
5241 if (r < 0) {
5242 return r;
5243 }
5244
5245 ObjectWriteOperation op;
5246 cls_log_add(op, entries, monotonic_inc);
5247
5248 if (!completion) {
5249 r = io_ctx.operate(oid, &op);
5250 } else {
5251 r = io_ctx.aio_operate(oid, completion, &op);
5252 }
5253 return r;
5254 }
5255
5256 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5257 int max_entries, list<cls_log_entry>& entries,
5258 const string& marker,
5259 string *out_marker,
5260 bool *truncated)
5261 {
5262 librados::IoCtx io_ctx;
5263
5264 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5265 if (r < 0)
5266 return r;
5267 librados::ObjectReadOperation op;
5268
5269 utime_t st(start_time);
5270 utime_t et(end_time);
5271
5272 cls_log_list(op, st, et, marker, max_entries, entries,
5273 out_marker, truncated);
5274
5275 bufferlist obl;
5276
5277 int ret = io_ctx.operate(oid, &op, &obl);
5278 if (ret < 0)
5279 return ret;
5280
5281 return 0;
5282 }
5283
5284 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5285 {
5286 librados::IoCtx io_ctx;
5287
5288 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5289 if (r < 0)
5290 return r;
5291 librados::ObjectReadOperation op;
5292
5293 cls_log_info(op, header);
5294
5295 bufferlist obl;
5296
5297 int ret = io_ctx.operate(oid, &op, &obl);
5298 if (ret < 0)
5299 return ret;
5300
5301 return 0;
5302 }
5303
5304 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5305 {
5306 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5307 if (r < 0)
5308 return r;
5309
5310 librados::ObjectReadOperation op;
5311
5312 cls_log_info(op, header);
5313
5314 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5315 if (ret < 0)
5316 return ret;
5317
5318 return 0;
5319 }
5320
5321 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5322 const string& from_marker, const string& to_marker,
5323 librados::AioCompletion *completion)
5324 {
5325 librados::IoCtx io_ctx;
5326
5327 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5328 if (r < 0)
5329 return r;
5330
5331 utime_t st(start_time);
5332 utime_t et(end_time);
5333
5334 ObjectWriteOperation op;
5335 cls_log_trim(op, st, et, from_marker, to_marker);
5336
5337 if (!completion) {
5338 r = io_ctx.operate(oid, &op);
5339 } else {
5340 r = io_ctx.aio_operate(oid, completion, &op);
5341 }
5342 return r;
5343 }
5344
5345 string RGWRados::objexp_hint_get_shardname(int shard_num)
5346 {
5347 char buf[32];
5348 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5349
5350 string objname("obj_delete_at_hint.");
5351 return objname + buf;
5352 }
5353
5354 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5355 {
5356 string obj_key = key.name + key.instance;
5357 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5358 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
5359 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
5360 sid = rgw_shards_mod(sid2, num_shards);
5361 return sid;
5362 }
5363
5364 static string objexp_hint_get_keyext(const string& tenant_name,
5365 const string& bucket_name,
5366 const string& bucket_id,
5367 const rgw_obj_key& obj_key)
5368 {
5369 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5370 ":" + obj_key.name + ":" + obj_key.instance;
5371 }
5372
5373 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5374 const string& tenant_name,
5375 const string& bucket_name,
5376 const string& bucket_id,
5377 const rgw_obj_index_key& obj_key)
5378 {
5379 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5380 bucket_id, obj_key);
5381 objexp_hint_entry he = {
5382 .tenant = tenant_name,
5383 .bucket_name = bucket_name,
5384 .bucket_id = bucket_id,
5385 .obj_key = obj_key,
5386 .exp_time = delete_at };
5387 bufferlist hebl;
5388 ::encode(he, hebl);
5389 ObjectWriteOperation op;
5390 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5391
5392 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5393 return objexp_pool_ctx.operate(shard_name, &op);
5394 }
5395
5396 void RGWRados::objexp_get_shard(int shard_num,
5397 string& shard) /* out */
5398 {
5399 shard = objexp_hint_get_shardname(shard_num);
5400 }
5401
5402 int RGWRados::objexp_hint_list(const string& oid,
5403 const ceph::real_time& start_time,
5404 const ceph::real_time& end_time,
5405 const int max_entries,
5406 const string& marker,
5407 list<cls_timeindex_entry>& entries, /* out */
5408 string *out_marker, /* out */
5409 bool *truncated) /* out */
5410 {
5411 librados::ObjectReadOperation op;
5412 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5413 out_marker, truncated);
5414
5415 bufferlist obl;
5416 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5417
5418 if ((ret < 0 ) && (ret != -ENOENT)) {
5419 return ret;
5420 }
5421
5422 if ((ret == -ENOENT) && truncated) {
5423 *truncated = false;
5424 }
5425
5426 return 0;
5427 }
5428
5429 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5430 objexp_hint_entry& hint_entry) /* out */
5431 {
5432 try {
5433 bufferlist::iterator iter = ti_entry.value.begin();
5434 ::decode(hint_entry, iter);
5435 } catch (buffer::error& err) {
5436 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5437 }
5438
5439 return 0;
5440 }
5441
5442 int RGWRados::objexp_hint_trim(const string& oid,
5443 const ceph::real_time& start_time,
5444 const ceph::real_time& end_time,
5445 const string& from_marker,
5446 const string& to_marker)
5447 {
5448 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5449 from_marker, to_marker);
5450 if ((ret < 0 ) && (ret != -ENOENT)) {
5451 return ret;
5452 }
5453
5454 return 0;
5455 }
5456
5457 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5458 string& zone_id, string& owner_id) {
5459 librados::IoCtx io_ctx;
5460
5461 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5462 if (r < 0) {
5463 return r;
5464 }
5465 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5466 utime_t ut(msec / 1000, msec % 1000);
5467
5468 rados::cls::lock::Lock l(log_lock_name);
5469 l.set_duration(ut);
5470 l.set_cookie(owner_id);
5471 l.set_tag(zone_id);
5472 l.set_renew(true);
5473
5474 return l.lock_exclusive(&io_ctx, oid);
5475 }
5476
5477 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5478 librados::IoCtx io_ctx;
5479
5480 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5481 if (r < 0) {
5482 return r;
5483 }
5484
5485 rados::cls::lock::Lock l(log_lock_name);
5486 l.set_tag(zone_id);
5487 l.set_cookie(owner_id);
5488
5489 return l.unlock(&io_ctx, oid);
5490 }
5491
5492 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5493 {
5494 bufferlist::iterator i = bl.begin();
5495 RGWAccessControlPolicy policy(cct);
5496 try {
5497 policy.decode_owner(i);
5498 } catch (buffer::error& err) {
5499 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5500 return -EIO;
5501 }
5502 *owner = policy.get_owner();
5503 return 0;
5504 }
5505
5506 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5507 {
5508 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5509 if (aiter == attrset.end())
5510 return -EIO;
5511
5512 bufferlist& bl = aiter->second;
5513 bufferlist::iterator iter = bl.begin();
5514 try {
5515 policy->decode(iter);
5516 } catch (buffer::error& err) {
5517 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5518 return -EIO;
5519 }
5520 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5521 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5522 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5523 s3policy->to_xml(*_dout);
5524 *_dout << dendl;
5525 }
5526 return 0;
5527 }
5528
5529
5530 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5531 {
5532 rgw_bucket bucket = bucket_info.bucket;
5533 bucket.update_bucket_id(new_bucket_id);
5534
5535 RGWObjectCtx obj_ctx(store);
5536
5537 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5538 if (ret < 0) {
5539 return ret;
5540 }
5541
5542 return 0;
5543 }
5544
5545 /**
5546 * get listing of the objects in a bucket.
5547 *
5548 * max: maximum number of results to return
5549 * bucket: bucket to list contents of (taken from `target`)
5550 * prefix: only return results that match this prefix (params.prefix)
5551 * delim: do not include results that match this string (params.delim).
5552 * Any skipped results will have the matching portion of their name
5553 * inserted in common_prefixes with a "true" mark.
5554 * marker: if filled in, begin the listing with this object (params.marker).
5555 * end_marker: if filled in, end the listing with this object (params.end_marker).
5556 * result: the objects are put in here; cleared on entry.
5557 * common_prefixes: if delim is filled in, any matching prefixes are placed here; may be null.
5558 * is_truncated: if number of objects in the bucket is bigger than max, then truncated; may be null.
 *
 * Besides the arguments, the listing is driven by params.ns,
 * params.enforce_ns, params.list_versions and params.filter.
 * params.marker and next_marker are updated while entries are consumed.
 *
 * Returns 0 on success, -EINVAL if the delimiter upper bound cannot be
 * encoded, or the negative error returned by cls_bucket_list().
5559 */
5560 int RGWRados::Bucket::List::list_objects(int64_t max,
5561 vector<rgw_bucket_dir_entry> *result,
5562 map<string, bool> *common_prefixes,
5563 bool *is_truncated)
5564 {
5565 RGWRados *store = target->get_store();
5566 CephContext *cct = store->ctx();
5567 int shard_id = target->get_shard_id();
5568
5569 int count = 0;
5570 bool truncated = true;
// request at least rgw_list_bucket_min_readahead entries per index call
// NOTE(review): the int64_t result of std::max is stored in an int
5571 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5572
5573 result->clear();
5574
// translate the start marker, end marker and prefix into index-key form
5575 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5576 rgw_obj_index_key cur_marker;
5577 marker_obj.get_index_key(&cur_marker);
5578
5579 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
5580 params.ns);
5581 rgw_obj_index_key cur_end_marker;
5582 end_marker_obj.get_index_key(&cur_end_marker);
5583 const bool cur_end_marker_valid = !params.end_marker.empty();
5584
5585 rgw_obj_key prefix_obj(params.prefix);
5586 prefix_obj.ns = params.ns;
5587 string cur_prefix = prefix_obj.get_index_key_name();
5588
// UTF-8 encoding of (decoded delimiter value + 1); appended to a common
// prefix to form a marker used to skip past the keys under that prefix
5589 string bigger_than_delim;
5590
5591 if (!params.delim.empty()) {
5592 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
5593 char buf[params.delim.size() + 16];
5594 int r = encode_utf8(val + 1, (unsigned char *)buf);
5595 if (r < 0) {
5596 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5597 return -EINVAL;
5598 }
5599 buf[r] = '\0';
5600
5601 bigger_than_delim = buf;
5602
5603 /* if marker points at a common prefix, fast forward it into its upperbound string */
5604 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5605 if (delim_pos >= 0) {
5606 string s = cur_marker.name.substr(0, delim_pos);
5607 s.append(bigger_than_delim);
5608 cur_marker = s;
5609 }
5610 }
5611
5612 string skip_after_delim;
// keep fetching batches while more entries may exist and the limit is not exceeded
5613 while (truncated && count <= max) {
// jump over the remainder of the last recorded common prefix, if that is further ahead
5614 if (skip_after_delim > cur_marker.name) {
5615 cur_marker = skip_after_delim;
5616 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5617 }
5618 std::map<string, rgw_bucket_dir_entry> ent_map;
5619 int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
5620 read_ahead + 1 - count, params.list_versions, ent_map,
5621 &truncated, &cur_marker);
5622 if (r < 0)
5623 return r;
5624
5625 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
5626 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5627 rgw_bucket_dir_entry& entry = eiter->second;
5628 rgw_obj_index_key index_key = entry.key;
5629
5630 rgw_obj_key obj(index_key);
5631
5632 /* note that parse_raw_oid() here will not set the correct object's instance, as
5633 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5634 * not needed for the checks here and we end up using the raw entry for the return vector
5635 */
5636 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5637 if (!valid) {
// NOTE(review): this logs obj.name after a failed parse rather than the raw index_key.name -- confirm intent
5638 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5639 continue;
5640 }
5641 bool check_ns = (obj.ns == params.ns);
5642 if (!params.list_versions && !entry.is_visible()) {
5643 continue;
5644 }
5645
5646 if (params.enforce_ns && !check_ns) {
5647 if (!params.ns.empty()) {
5648 /* we've iterated past the namespace we're searching -- done now */
5649 truncated = false;
5650 goto done;
5651 }
5652
5653 /* we're not looking at the namespace this object is in, next! */
5654 continue;
5655 }
5656
// stop once the end marker is reached
5657 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5658 truncated = false;
5659 goto done;
5660 }
5661
// the markers are only advanced while still below the requested maximum
5662 if (count < max) {
5663 params.marker = index_key;
5664 next_marker = index_key;
5665 }
5666
5667 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5668 continue;
5669
5670 if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5671 continue;
5672
5673 if (!params.delim.empty()) {
5674 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5675
5676 if (delim_pos >= 0) {
// the name contains the delimiter after the prefix: report it once as a common prefix instead of as an object
5677 string prefix_key = obj.name.substr(0, delim_pos + 1);
5678
5679 if (common_prefixes &&
5680 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5681 if (count >= max) {
5682 truncated = true;
5683 goto done;
5684 }
5685 next_marker = prefix_key;
5686 (*common_prefixes)[prefix_key] = true;
5687
5688 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
5689
5690 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
5691 skip_after_delim.append(bigger_than_delim);
5692
5693 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5694
5695 count++;
5696 }
5697
5698 continue;
5699 }
5700 }
5701
5702 if (count >= max) {
5703 truncated = true;
5704 goto done;
5705 }
5706
5707 result->emplace_back(std::move(entry));
5708 count++;
5709 }
5710
5711 // Either the back-end telling us truncated, or we don't consume all
5712 // items returned per the amount caller request
5713 truncated = (truncated || eiter != ent_map.end());
5714 }
5715
5716 done:
5717 if (is_truncated)
5718 *is_truncated = truncated;
5719
5720 return 0;
5721 }
5722
5723 /**
5724 * create a rados pool, associated meta info
5725 * returns 0 on success, -ERR# otherwise.
5726 */
5727 int RGWRados::create_pool(const rgw_pool& pool)
5728 {
5729 int ret = 0;
5730
5731 librados::Rados *rad = get_rados_handle();
5732 ret = rad->pool_create(pool.name.c_str(), 0);
5733 if (ret == -EEXIST)
5734 ret = 0;
5735 else if (ret == -ERANGE) {
5736 ldout(cct, 0)
5737 << __func__
5738 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
5739 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5740 << dendl;
5741 }
5742 if (ret < 0)
5743 return ret;
5744
5745 librados::IoCtx io_ctx;
5746 ret = rad->ioctx_create(pool.name.c_str(), io_ctx);
5747 if (ret < 0)
5748 return ret;
5749
5750 ret = io_ctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
5751 if (ret < 0 && ret != -EOPNOTSUPP)
5752 return ret;
5753 return 0;
5754 }
5755
5756 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5757 {
5758 librados::IoCtx index_ctx; // context for new bucket
5759
5760 string dir_oid = dir_oid_prefix;
5761 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5762 if (r < 0) {
5763 return r;
5764 }
5765
5766 dir_oid.append(bucket_info.bucket.bucket_id);
5767
5768 map<int, string> bucket_objs;
5769 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5770
5771 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5772 }
5773
5774 void RGWRados::create_bucket_id(string *bucket_id)
5775 {
5776 uint64_t iid = instance_id();
5777 uint64_t bid = next_bucket_id();
5778 char buf[get_zone_params().get_id().size() + 48];
5779 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5780 *bucket_id = buf;
5781 }
5782
/**
 * Create a bucket: select its placement, build its RGWBucketInfo, initialize
 * the bucket index and store the linked bucket info.
 *
 * owner:              user recorded as bucket owner
 * bucket:             bucket to create; its marker and bucket_id are set here
 * zonegroup_id:       zonegroup recorded in the bucket info
 * placement_rule:     requested placement rule, passed to select_bucket_placement()
 * swift_ver_location: swift versioning location; versioning is enabled when non-empty
 * pquota_info:        optional quota copied into the bucket info
 * attrs:              attributes stored with the bucket info
 * info:               (out) bucket info that is filled in; on -EEXIST it is
 *                     re-read from the existing bucket
 * pobjv:              optional write version; a new one is generated when null
 * pep_objv:           passed through to put_linked_bucket_info()
 * creation_time:      creation time to record; "now" is used when it is zero
 * pmaster_bucket:     when set, marker and bucket_id are copied from it
 *                     instead of generating a new id
 * pmaster_num_shards: when set, used as the number of index shards
 * exclusive:          passed through to put_linked_bucket_info()
 *
 * The attempt is repeated (at most MAX_CREATE_RETRIES times) when storing
 * the info reports -EEXIST but the existing bucket can no longer be read
 * (-ENOENT). Returns the result of put_linked_bucket_info() (including
 * -EEXIST), an error from one of the intermediate steps, or -ENOENT when
 * the retries are exhausted.
 */
5783 int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5784 const string& zonegroup_id,
5785 const string& placement_rule,
5786 const string& swift_ver_location,
5787 const RGWQuotaInfo * pquota_info,
5788 map<std::string, bufferlist>& attrs,
5789 RGWBucketInfo& info,
5790 obj_version *pobjv,
5791 obj_version *pep_objv,
5792 real_time creation_time,
5793 rgw_bucket *pmaster_bucket,
5794 uint32_t *pmaster_num_shards,
5795 bool exclusive)
5796 {
5797 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5798 string selected_placement_rule_name;
5799 RGWZonePlacementInfo rule_info;
5800
5801 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5802 int ret = 0;
5803 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5804 &selected_placement_rule_name, &rule_info);
5805 if (ret < 0)
5806 return ret;
5807
// generate a fresh id unless marker and id are dictated by pmaster_bucket
5808 if (!pmaster_bucket) {
5809 create_bucket_id(&bucket.marker);
5810 bucket.bucket_id = bucket.marker;
5811 } else {
5812 bucket.marker = pmaster_bucket->marker;
5813 bucket.bucket_id = pmaster_bucket->bucket_id;
5814 }
5815
5816 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5817
5818 if (pobjv) {
5819 objv_tracker.write_version = *pobjv;
5820 } else {
5821 objv_tracker.generate_new_write_ver(cct);
5822 }
5823
// fill in the bucket info that is about to be stored
5824 info.bucket = bucket;
5825 info.owner = owner.user_id;
5826 info.zonegroup = zonegroup_id;
5827 info.placement_rule = selected_placement_rule_name;
5828 info.index_type = rule_info.index_type;
5829 info.swift_ver_location = swift_ver_location;
5830 info.swift_versioning = (!swift_ver_location.empty());
5831 if (pmaster_num_shards) {
5832 info.num_shards = *pmaster_num_shards;
5833 } else {
5834 info.num_shards = bucket_index_max_shards;
5835 }
5836 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5837 info.requester_pays = false;
5838 if (real_clock::is_zero(creation_time)) {
5839 info.creation_time = ceph::real_clock::now();
5840 } else {
5841 info.creation_time = creation_time;
5842 }
5843 if (pquota_info) {
5844 info.quota = *pquota_info;
5845 }
5846
// the index objects are created before the bucket info is stored
5847 int r = init_bucket_index(info, info.num_shards);
5848 if (r < 0) {
5849 return r;
5850 }
5851
5852 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
5853 if (ret == -EEXIST) {
// open the index objects created above so they can be removed if another bucket instance won
5854 librados::IoCtx index_ctx;
5855 map<int, string> bucket_objs;
5856 int r = open_bucket_index(info, index_ctx, bucket_objs);
5857 if (r < 0)
5858 return r;
5859
5860 /* we need to reread the info and return it, caller will have a use for it */
5861 RGWObjVersionTracker instance_ver = info.objv_tracker;
5862 info.objv_tracker.clear();
5863 RGWObjectCtx obj_ctx(this);
5864 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
5865 if (r < 0) {
// the existing bucket could not be read any more: start over
5866 if (r == -ENOENT) {
5867 continue;
5868 }
5869 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
5870 return r;
5871 }
5872
5873 /* only remove it if it's a different bucket instance */
5874 if (info.bucket.bucket_id != bucket.bucket_id) {
5875 /* remove bucket meta instance */
5876 string entry = bucket.get_key();
5877 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
5878 if (r < 0)
5879 return r;
5880
5881 map<int, string>::const_iterator biter;
5882 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
5883 // Do best effort removal
5884 index_ctx.remove(biter->second);
5885 }
5886 }
5887 /* ret is still -EEXIST here (only r was reassigned above); it is returned to the caller */
5888 }
5889 return ret;
5890 }
5891
5892 /* this is highly unlikely */
5893 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
5894 return -ENOENT;
5895 }
5896
5897 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5898 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5899
5900 {
5901 /* first check that zonegroup exists within current period. */
5902 RGWZoneGroup zonegroup;
5903 int ret = get_zonegroup(zonegroup_id, zonegroup);
5904 if (ret < 0) {
5905 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5906 return ret;
5907 }
5908
5909 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5910 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
5911
5912 if (!request_rule.empty()) {
5913 titer = zonegroup.placement_targets.find(request_rule);
5914 if (titer == zonegroup.placement_targets.end()) {
5915 ldout(cct, 0) << "could not find requested placement id " << request_rule
5916 << " within zonegroup " << dendl;
5917 return -ERR_INVALID_LOCATION_CONSTRAINT;
5918 }
5919 } else if (!user_info.default_placement.empty()) {
5920 titer = zonegroup.placement_targets.find(user_info.default_placement);
5921 if (titer == zonegroup.placement_targets.end()) {
5922 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
5923 << " within zonegroup " << dendl;
5924 return -ERR_INVALID_LOCATION_CONSTRAINT;
5925 }
5926 } else {
5927 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
5928 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
5929 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
5930 } else {
5931 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
5932 if (titer == zonegroup.placement_targets.end()) {
5933 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
5934 << " within zonegroup " << dendl;
5935 return -ERR_INVALID_LOCATION_CONSTRAINT;
5936 }
5937 }
5938 }
5939
5940 /* now check tag for the rule, whether user is permitted to use rule */
5941 const auto& target_rule = titer->second;
5942 if (!target_rule.user_permitted(user_info.placement_tags)) {
5943 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
5944 return -EPERM;
5945 }
5946
5947 if (pselected_rule_name)
5948 *pselected_rule_name = titer->first;
5949
5950 return select_bucket_location_by_rule(titer->first, rule_info);
5951 }
5952
5953 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5954 {
5955 if (location_rule.empty()) {
5956 /* we can only reach here if we're trying to set a bucket location from a bucket
5957 * created on a different zone, using a legacy / default pool configuration
5958 */
5959 return select_legacy_bucket_placement(rule_info);
5960 }
5961
5962 /*
5963 * make sure that zone has this rule configured. We're
5964 * checking it for the local zone, because that's where this bucket object is going to
5965 * reside.
5966 */
5967 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
5968 if (piter == get_zone_params().placement_pools.end()) {
5969 /* couldn't find, means we cannot really place data for this bucket in this zone */
5970 if (get_zonegroup().equals(zonegroup.get_id())) {
5971 /* that's a configuration error, zone should have that rule, as we're within the requested
5972 * zonegroup */
5973 return -EINVAL;
5974 } else {
5975 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5976 return 0;
5977 }
5978 }
5979
5980 RGWZonePlacementInfo& placement_info = piter->second;
5981
5982 if (rule_info) {
5983 *rule_info = placement_info;
5984 }
5985
5986 return 0;
5987 }
5988
5989 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
5990 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5991 {
5992 if (!get_zone_params().placement_pools.empty()) {
5993 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
5994 pselected_rule_name, rule_info);
5995 }
5996
5997 if (pselected_rule_name) {
5998 pselected_rule_name->clear();
5999 }
6000
6001 return select_legacy_bucket_placement(rule_info);
6002 }
6003
6004 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
6005 {
6006 bufferlist map_bl;
6007 map<string, bufferlist> m;
6008 string pool_name;
6009 bool write_map = false;
6010
6011 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6012
6013 RGWObjectCtx obj_ctx(this);
6014 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6015 if (ret < 0) {
6016 goto read_omap;
6017 }
6018
6019 try {
6020 bufferlist::iterator iter = map_bl.begin();
6021 ::decode(m, iter);
6022 } catch (buffer::error& err) {
6023 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6024 }
6025
6026 read_omap:
6027 if (m.empty()) {
6028 bufferlist header;
6029 ret = omap_get_all(obj, header, m);
6030
6031 write_map = true;
6032 }
6033
6034 if (ret < 0 || m.empty()) {
6035 vector<rgw_pool> pools;
6036 string s = string("default.") + default_storage_pool_suffix;
6037 pools.push_back(rgw_pool(s));
6038 vector<int> retcodes;
6039 bufferlist bl;
6040 ret = create_pools(pools, retcodes);
6041 if (ret < 0)
6042 return ret;
6043 ret = omap_set(obj, s, bl);
6044 if (ret < 0)
6045 return ret;
6046 m[s] = bl;
6047 }
6048
6049 if (write_map) {
6050 bufferlist new_bl;
6051 ::encode(m, new_bl);
6052 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6053 if (ret < 0) {
6054 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6055 }
6056 }
6057
6058 map<string, bufferlist>::iterator miter;
6059 if (m.size() > 1) {
6060 vector<string> v;
6061 for (miter = m.begin(); miter != m.end(); ++miter) {
6062 v.push_back(miter->first);
6063 }
6064
6065 uint32_t r;
6066 ret = get_random_bytes((char *)&r, sizeof(r));
6067 if (ret < 0)
6068 return ret;
6069
6070 int i = r % v.size();
6071 pool_name = v[i];
6072 } else {
6073 miter = m.begin();
6074 pool_name = miter->first;
6075 }
6076
6077 rule_info->data_pool = pool_name;
6078 rule_info->data_extra_pool = pool_name;
6079 rule_info->index_pool = pool_name;
6080 rule_info->index_type = RGWBIType_Normal;
6081
6082 return 0;
6083 }
6084
6085 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6086 {
6087 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6088 }
6089
6090 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6091 {
6092 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6093
6094 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6095 }
6096
6097 int RGWRados::update_placement_map()
6098 {
6099 bufferlist header;
6100 map<string, bufferlist> m;
6101 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6102 int ret = omap_get_all(obj, header, m);
6103 if (ret < 0)
6104 return ret;
6105
6106 bufferlist new_bl;
6107 ::encode(m, new_bl);
6108 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6109 if (ret < 0) {
6110 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6111 }
6112
6113 return ret;
6114 }
6115
6116 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6117 {
6118 librados::Rados *rad = get_rados_handle();
6119 int ret = rad->pool_lookup(new_pool.name.c_str());
6120 if (ret < 0) // DNE, or something
6121 return ret;
6122
6123 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6124 bufferlist empty_bl;
6125 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6126
6127 // don't care about return value
6128 update_placement_map();
6129
6130 return ret;
6131 }
6132
6133 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6134 {
6135 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6136 int ret = omap_del(obj, old_pool.to_str());
6137
6138 // don't care about return value
6139 update_placement_map();
6140
6141 return ret;
6142 }
6143
6144 int RGWRados::list_placement_set(set<rgw_pool>& names)
6145 {
6146 bufferlist header;
6147 map<string, bufferlist> m;
6148
6149 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6150 int ret = omap_get_all(obj, header, m);
6151 if (ret < 0)
6152 return ret;
6153
6154 names.clear();
6155 map<string, bufferlist>::iterator miter;
6156 for (miter = m.begin(); miter != m.end(); ++miter) {
6157 names.insert(rgw_pool(miter->first));
6158 }
6159
6160 return names.size();
6161 }
6162
6163 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6164 {
6165 vector<librados::PoolAsyncCompletion *> completions;
6166 vector<int> rets;
6167
6168 librados::Rados *rad = get_rados_handle();
6169 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6170 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6171 completions.push_back(c);
6172 rgw_pool& pool = *iter;
6173 int ret = rad->pool_create_async(pool.name.c_str(), c);
6174 rets.push_back(ret);
6175 }
6176
6177 vector<int>::iterator riter;
6178 vector<librados::PoolAsyncCompletion *>::iterator citer;
6179
6180 bool error = false;
6181 assert(rets.size() == completions.size());
6182 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6183 int r = *riter;
6184 PoolAsyncCompletion *c = *citer;
6185 if (r == 0) {
6186 c->wait();
6187 r = c->get_return_value();
6188 if (r < 0) {
6189 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6190 error = true;
6191 }
6192 }
6193 c->release();
6194 retcodes.push_back(r);
6195 }
6196 if (error) {
6197 return 0;
6198 }
6199
6200 std::vector<librados::IoCtx> io_ctxs;
6201 retcodes.clear();
6202 for (auto pool : pools) {
6203 io_ctxs.emplace_back();
6204 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6205 if (ret < 0) {
6206 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6207 error = true;
6208 }
6209 retcodes.push_back(ret);
6210 }
6211 if (error) {
6212 return 0;
6213 }
6214
6215 completions.clear();
6216 for (auto &io_ctx : io_ctxs) {
6217 librados::PoolAsyncCompletion *c =
6218 librados::Rados::pool_async_create_completion();
6219 completions.push_back(c);
6220 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6221 false, c);
6222 assert(ret == 0);
6223 }
6224
6225 retcodes.clear();
6226 for (auto c : completions) {
6227 c->wait();
6228 int ret = c->get_return_value();
6229 if (ret == -EOPNOTSUPP) {
6230 ret = 0;
6231 } else if (ret < 0) {
6232 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6233 << dendl;
6234 error = true;
6235 }
6236 c->release();
6237 retcodes.push_back(ret);
6238 }
6239 return 0;
6240 }
6241
6242 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6243 {
6244 string oid, key;
6245 get_obj_bucket_and_oid_loc(obj, oid, key);
6246
6247 rgw_pool pool;
6248 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6249 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6250 return -EIO;
6251 }
6252
6253 int r = open_pool_ctx(pool, *ioctx);
6254 if (r < 0) {
6255 return r;
6256 }
6257
6258 ioctx->locator_set_key(key);
6259
6260 return 0;
6261 }
6262
6263 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6264 {
6265 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6266
6267 rgw_pool pool;
6268 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6269 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6270 return -EIO;
6271 }
6272
6273 int r = open_pool_ctx(pool, ref->ioctx);
6274 if (r < 0) {
6275 return r;
6276 }
6277
6278 ref->ioctx.locator_set_key(ref->key);
6279
6280 return 0;
6281 }
6282
6283 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6284 {
6285 ref->oid = obj.oid;
6286 ref->key = obj.loc;
6287
6288 int r;
6289
6290 if (ref->oid.empty()) {
6291 ref->oid = obj.pool.to_str();
6292 ref->pool = get_zone_params().domain_root;
6293 } else {
6294 ref->pool = obj.pool;
6295 }
6296 r = open_pool_ctx(ref->pool, ref->ioctx);
6297 if (r < 0)
6298 return r;
6299
6300 ref->ioctx.locator_set_key(ref->key);
6301
6302 return 0;
6303 }
6304
6305 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6306 {
6307 return get_raw_obj_ref(obj, ref);
6308 }
6309
6310 /*
6311 * fixes an issue where head objects were supposed to have a locator created, but ended
6312 * up without one
6313 */
6314 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6315 {
6316 const rgw_bucket& bucket = bucket_info.bucket;
6317 string oid;
6318 string locator;
6319
6320 rgw_obj obj(bucket, key);
6321
6322 get_obj_bucket_and_oid_loc(obj, oid, locator);
6323
6324 if (locator.empty()) {
6325 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6326 return 0;
6327 }
6328
6329 librados::IoCtx ioctx;
6330
6331 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6332 if (ret < 0) {
6333 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6334 return ret;
6335 }
6336 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6337
6338 uint64_t size;
6339 bufferlist data;
6340
6341 struct timespec mtime_ts;
6342 map<string, bufferlist> attrs;
6343 librados::ObjectReadOperation op;
6344 op.getxattrs(&attrs, NULL);
6345 op.stat2(&size, &mtime_ts, NULL);
6346 #define HEAD_SIZE 512 * 1024
6347 op.read(0, HEAD_SIZE, &data, NULL);
6348
6349 ret = ioctx.operate(oid, &op, NULL);
6350 if (ret < 0) {
6351 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6352 return ret;
6353 }
6354
6355 if (size > HEAD_SIZE) {
6356 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6357 return -EIO;
6358 }
6359
6360 if (size != data.length()) {
6361 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6362 return -EIO;
6363 }
6364
6365 if (copy_obj) {
6366 librados::ObjectWriteOperation wop;
6367
6368 wop.mtime2(&mtime_ts);
6369
6370 map<string, bufferlist>::iterator iter;
6371 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6372 wop.setxattr(iter->first.c_str(), iter->second);
6373 }
6374
6375 wop.write(0, data);
6376
6377 ioctx.locator_set_key(locator);
6378 ioctx.operate(oid, &wop);
6379 }
6380
6381 if (remove_bad) {
6382 ioctx.locator_set_key(string());
6383
6384 ret = ioctx.remove(oid);
6385 if (ret < 0) {
6386 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6387 return ret;
6388 }
6389 }
6390
6391 return 0;
6392 }
6393
6394 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6395 const string& src_oid, const string& src_locator,
6396 librados::IoCtx& dst_ioctx,
6397 const string& dst_oid, const string& dst_locator)
6398 {
6399
6400 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6401 bool done = false;
6402 uint64_t chunk_size = COPY_BUF_SIZE;
6403 uint64_t ofs = 0;
6404 int ret = 0;
6405 real_time mtime;
6406 struct timespec mtime_ts;
6407 uint64_t size;
6408
6409 if (src_oid == dst_oid && src_locator == dst_locator) {
6410 return 0;
6411 }
6412
6413 src_ioctx.locator_set_key(src_locator);
6414 dst_ioctx.locator_set_key(dst_locator);
6415
6416 do {
6417 bufferlist data;
6418 ObjectReadOperation rop;
6419 ObjectWriteOperation wop;
6420
6421 if (ofs == 0) {
6422 rop.stat2(&size, &mtime_ts, NULL);
6423 mtime = real_clock::from_timespec(mtime_ts);
6424 }
6425 rop.read(ofs, chunk_size, &data, NULL);
6426 ret = src_ioctx.operate(src_oid, &rop, NULL);
6427 if (ret < 0) {
6428 goto done_err;
6429 }
6430
6431 if (data.length() == 0) {
6432 break;
6433 }
6434
6435 if (ofs == 0) {
6436 wop.create(true); /* make it exclusive */
6437 wop.mtime2(&mtime_ts);
6438 mtime = real_clock::from_timespec(mtime_ts);
6439 }
6440 wop.write(ofs, data);
6441 ret = dst_ioctx.operate(dst_oid, &wop);
6442 ofs += data.length();
6443 done = data.length() != chunk_size;
6444 } while (!done);
6445
6446 if (ofs != size) {
6447 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6448 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6449 ret = -EIO;
6450 goto done_err;
6451 }
6452
6453 src_ioctx.remove(src_oid);
6454
6455 return 0;
6456
6457 done_err:
6458 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6459 return ret;
6460 }
6461
6462 /*
6463 * fixes an issue where head objects were supposed to have a locator created, but ended
6464 * up without one
6465 */
6466 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6467 {
6468 const rgw_bucket& bucket = bucket_info.bucket;
6469 rgw_obj obj(bucket, key);
6470
6471 if (need_fix) {
6472 *need_fix = false;
6473 }
6474
6475 rgw_rados_ref ref;
6476 int r = get_obj_head_ref(bucket_info, obj, &ref);
6477 if (r < 0) {
6478 return r;
6479 }
6480
6481 RGWObjState *astate = NULL;
6482 RGWObjectCtx rctx(this);
6483 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6484 if (r < 0)
6485 return r;
6486
6487 if (astate->has_manifest) {
6488 RGWObjManifest::obj_iterator miter;
6489 RGWObjManifest& manifest = astate->manifest;
6490 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6491 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6492 rgw_obj loc;
6493 string oid;
6494 string locator;
6495
6496 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6497
6498 if (loc.key.ns.empty()) {
6499 /* continue, we're only interested in tail objects */
6500 continue;
6501 }
6502
6503 get_obj_bucket_and_oid_loc(loc, oid, locator);
6504 ref.ioctx.locator_set_key(locator);
6505
6506 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6507
6508 r = ref.ioctx.stat(oid, NULL, NULL);
6509 if (r != -ENOENT) {
6510 continue;
6511 }
6512
6513 string bad_loc;
6514 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6515
6516 /* create a new ioctx with the bad locator */
6517 librados::IoCtx src_ioctx;
6518 src_ioctx.dup(ref.ioctx);
6519 src_ioctx.locator_set_key(bad_loc);
6520
6521 r = src_ioctx.stat(oid, NULL, NULL);
6522 if (r != 0) {
6523 /* cannot find a broken part */
6524 continue;
6525 }
6526 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6527 if (need_fix) {
6528 *need_fix = true;
6529 }
6530 if (fix) {
6531 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6532 if (r < 0) {
6533 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6534 }
6535 }
6536 }
6537 }
6538
6539 return 0;
6540 }
6541
6542 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6543 {
6544 bucket = _bucket;
6545
6546 RGWObjectCtx obj_ctx(store);
6547
6548 RGWBucketInfo bucket_info;
6549 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6550 if (ret < 0) {
6551 return ret;
6552 }
6553
6554 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6555 if (ret < 0) {
6556 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6557 return ret;
6558 }
6559 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6560
6561 return 0;
6562 }
6563
6564 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6565 {
6566 bucket = _bucket;
6567 shard_id = sid;
6568
6569 RGWObjectCtx obj_ctx(store);
6570
6571 RGWBucketInfo bucket_info;
6572 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6573 if (ret < 0) {
6574 return ret;
6575 }
6576
6577 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6578 if (ret < 0) {
6579 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6580 return ret;
6581 }
6582 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6583
6584 return 0;
6585 }
6586
6587 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
6588 {
6589 bucket = bucket_info.bucket;
6590 shard_id = sid;
6591
6592 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6593 if (ret < 0) {
6594 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6595 return ret;
6596 }
6597 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6598
6599 return 0;
6600 }
6601
6602
6603 /* Execute @handler on last item in bucket listing for bucket specified
6604 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6605 * to objects matching these criterias. */
6606 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6607 const std::string& obj_prefix,
6608 const std::string& obj_delim,
6609 std::function<int(const rgw_bucket_dir_entry&)> handler)
6610 {
6611 RGWRados::Bucket target(this, bucket_info);
6612 RGWRados::Bucket::List list_op(&target);
6613
6614 list_op.params.prefix = obj_prefix;
6615 list_op.params.delim = obj_delim;
6616
6617 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6618 << ", obj_prefix=" << obj_prefix
6619 << ", obj_delim=" << obj_delim
6620 << dendl;
6621
6622 bool is_truncated = false;
6623
6624 boost::optional<rgw_bucket_dir_entry> last_entry;
6625 /* We need to rewind to the last object in a listing. */
6626 do {
6627 /* List bucket entries in chunks. */
6628 static constexpr int MAX_LIST_OBJS = 100;
6629 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6630
6631 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6632 &is_truncated);
6633 if (ret < 0) {
6634 return ret;
6635 } else if (!entries.empty()) {
6636 last_entry = entries.back();
6637 }
6638 } while (is_truncated);
6639
6640 if (last_entry) {
6641 return handler(*last_entry);
6642 }
6643
6644 /* Empty listing - no items we can run handler on. */
6645 return 0;
6646 }
6647
6648
6649 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6650 const rgw_user& user,
6651 RGWBucketInfo& bucket_info,
6652 rgw_obj& obj)
6653 {
6654 if (! swift_versioning_enabled(bucket_info)) {
6655 return 0;
6656 }
6657
6658 obj_ctx.obj.set_atomic(obj);
6659
6660 RGWObjState * state = nullptr;
6661 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6662 if (r < 0) {
6663 return r;
6664 }
6665
6666 if (!state->exists) {
6667 return 0;
6668 }
6669
6670 string client_id;
6671 string op_id;
6672
6673 const string& src_name = obj.get_oid();
6674 char buf[src_name.size() + 32];
6675 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6676 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6677 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6678
6679 RGWBucketInfo dest_bucket_info;
6680
6681 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6682 if (r < 0) {
6683 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6684 if (r == -ENOENT) {
6685 return -ERR_PRECONDITION_FAILED;
6686 }
6687 return r;
6688 }
6689
6690 if (dest_bucket_info.owner != bucket_info.owner) {
6691 return -ERR_PRECONDITION_FAILED;
6692 }
6693
6694 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6695 obj_ctx.obj.set_atomic(dest_obj);
6696
6697 string no_zone;
6698
6699 r = copy_obj(obj_ctx,
6700 user,
6701 client_id,
6702 op_id,
6703 NULL, /* req_info *info */
6704 no_zone,
6705 dest_obj,
6706 obj,
6707 dest_bucket_info,
6708 bucket_info,
6709 NULL, /* time_t *src_mtime */
6710 NULL, /* time_t *mtime */
6711 NULL, /* const time_t *mod_ptr */
6712 NULL, /* const time_t *unmod_ptr */
6713 false, /* bool high_precision_time */
6714 NULL, /* const char *if_match */
6715 NULL, /* const char *if_nomatch */
6716 RGWRados::ATTRSMOD_NONE,
6717 true, /* bool copy_if_newer */
6718 state->attrset,
6719 RGW_OBJ_CATEGORY_MAIN,
6720 0, /* uint64_t olh_epoch */
6721 real_time(), /* time_t delete_at */
6722 NULL, /* string *version_id */
6723 NULL, /* string *ptag */
6724 NULL, /* string *petag */
6725 NULL, /* void (*progress_cb)(off_t, void *) */
6726 NULL); /* void *progress_data */
6727 if (r == -ECANCELED || r == -ENOENT) {
6728 /* Has already been overwritten, meaning another rgw process already
6729 * copied it out */
6730 return 0;
6731 }
6732
6733 return r;
6734 }
6735
6736 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6737 const rgw_user& user,
6738 RGWBucketInfo& bucket_info,
6739 rgw_obj& obj,
6740 bool& restored) /* out */
6741 {
6742 if (! swift_versioning_enabled(bucket_info)) {
6743 return 0;
6744 }
6745
6746 /* Bucket info of the bucket that stores previous versions of our object. */
6747 RGWBucketInfo archive_binfo;
6748
6749 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6750 bucket_info.swift_ver_location, archive_binfo,
6751 nullptr, nullptr);
6752 if (ret < 0) {
6753 return ret;
6754 }
6755
6756 /* Abort the operation if the bucket storing our archive belongs to someone
6757 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6758 * into consideration. For we can live with that.
6759 *
6760 * TODO: delegate this check to un upper layer and compare with ACLs. */
6761 if (bucket_info.owner != archive_binfo.owner) {
6762 return -EPERM;
6763 }
6764
6765 /* This code will be executed on latest version of the object. */
6766 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6767 std::string no_client_id;
6768 std::string no_op_id;
6769 std::string no_zone;
6770
6771 /* We don't support object versioning of Swift API on those buckets that
6772 * are already versioned using the S3 mechanism. This affects also bucket
6773 * storing archived objects. Otherwise the delete operation would create
6774 * a deletion marker. */
6775 if (archive_binfo.versioned()) {
6776 restored = false;
6777 return -ERR_PRECONDITION_FAILED;
6778 }
6779
6780 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6781 * irrelevant and may be safely skipped. */
6782 std::map<std::string, ceph::bufferlist> no_attrs;
6783
6784 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6785 obj_ctx.obj.set_atomic(archive_obj);
6786 obj_ctx.obj.set_atomic(obj);
6787
6788 int ret = copy_obj(obj_ctx,
6789 user,
6790 no_client_id,
6791 no_op_id,
6792 nullptr, /* req_info *info */
6793 no_zone,
6794 obj, /* dest obj */
6795 archive_obj, /* src obj */
6796 bucket_info, /* dest bucket info */
6797 archive_binfo, /* src bucket info */
6798 nullptr, /* time_t *src_mtime */
6799 nullptr, /* time_t *mtime */
6800 nullptr, /* const time_t *mod_ptr */
6801 nullptr, /* const time_t *unmod_ptr */
6802 false, /* bool high_precision_time */
6803 nullptr, /* const char *if_match */
6804 nullptr, /* const char *if_nomatch */
6805 RGWRados::ATTRSMOD_NONE,
6806 true, /* bool copy_if_newer */
6807 no_attrs,
6808 RGW_OBJ_CATEGORY_MAIN,
6809 0, /* uint64_t olh_epoch */
6810 real_time(), /* time_t delete_at */
6811 nullptr, /* string *version_id */
6812 nullptr, /* string *ptag */
6813 nullptr, /* string *petag */
6814 nullptr, /* void (*progress_cb)(off_t, void *) */
6815 nullptr); /* void *progress_data */
6816 if (ret == -ECANCELED || ret == -ENOENT) {
6817 /* Has already been overwritten, meaning another rgw process already
6818 * copied it out */
6819 return 0;
6820 } else if (ret < 0) {
6821 return ret;
6822 } else {
6823 restored = true;
6824 }
6825
6826 /* Need to remove the archived copy. */
6827 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6828 archive_binfo.versioning_status());
6829
6830 return ret;
6831 };
6832
6833 const std::string& obj_name = obj.get_oid();
6834 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6835 % obj_name);
6836
6837 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6838 handler);
6839 }
6840
6841 /**
6842 * Write/overwrite an object to the bucket storage.
6843 * bucket: the bucket to store the object in
6844 * obj: the object name/key
6845 * data: the object contents/value
6846 * size: the amount of data to write (data must be this long)
6847 * accounted_size: original size of data before compression, encryption
6848 * mtime: if non-NULL, writes the given mtime to the bucket storage
6849 * attrs: all the given attrs are written to bucket storage for the given object
6850 * exclusive: create object exclusively
6851 * Returns: 0 on success, -ERR# otherwise.
6852 */
6853 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6854 map<string, bufferlist>& attrs,
6855 bool assume_noent, bool modify_tail,
6856 void *_index_op)
6857 {
6858 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6859 RGWRados *store = target->get_store();
6860
6861 ObjectWriteOperation op;
6862
6863 RGWObjState *state;
6864 int r = target->get_state(&state, false, assume_noent);
6865 if (r < 0)
6866 return r;
6867
6868 rgw_obj& obj = target->get_obj();
6869
6870 if (obj.get_oid().empty()) {
6871 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6872 return -EIO;
6873 }
6874
6875 rgw_rados_ref ref;
6876 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6877 if (r < 0)
6878 return r;
6879
6880 bool is_olh = state->is_olh;
6881
6882 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6883
6884 const string *ptag = meta.ptag;
6885 if (!ptag && !index_op->get_optag()->empty()) {
6886 ptag = index_op->get_optag();
6887 }
6888 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
6889 if (r < 0)
6890 return r;
6891
6892 if (real_clock::is_zero(meta.set_mtime)) {
6893 meta.set_mtime = real_clock::now();
6894 }
6895
6896 if (state->is_olh) {
6897 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6898 }
6899
6900 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6901 op.mtime2(&mtime_ts);
6902
6903 if (meta.data) {
6904 /* if we want to overwrite the data, we also want to overwrite the
6905 xattrs, so just remove the object */
6906 op.write_full(*meta.data);
6907 }
6908
6909 string etag;
6910 string content_type;
6911 bufferlist acl_bl;
6912
6913 map<string, bufferlist>::iterator iter;
6914 if (meta.rmattrs) {
6915 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6916 const string& name = iter->first;
6917 op.rmxattr(name.c_str());
6918 }
6919 }
6920
6921 if (meta.manifest) {
6922 /* remove existing manifest attr */
6923 iter = attrs.find(RGW_ATTR_MANIFEST);
6924 if (iter != attrs.end())
6925 attrs.erase(iter);
6926
6927 bufferlist bl;
6928 ::encode(*meta.manifest, bl);
6929 op.setxattr(RGW_ATTR_MANIFEST, bl);
6930 }
6931
6932 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6933 const string& name = iter->first;
6934 bufferlist& bl = iter->second;
6935
6936 if (!bl.length())
6937 continue;
6938
6939 op.setxattr(name.c_str(), bl);
6940
6941 if (name.compare(RGW_ATTR_ETAG) == 0) {
6942 etag = bl.c_str();
6943 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6944 content_type = bl.c_str();
6945 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6946 acl_bl = bl;
6947 }
6948 }
6949 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6950 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6951 }
6952
6953 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6954 bufferlist bl;
6955 ::encode(store->get_zone_short_id(), bl);
6956 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6957 }
6958
6959 if (!op.size())
6960 return 0;
6961
6962 uint64_t epoch;
6963 int64_t poolid;
6964 bool orig_exists;
6965 uint64_t orig_size;
6966
6967 if (!reset_obj) { //Multipart upload, it has immutable head.
6968 orig_exists = false;
6969 orig_size = 0;
6970 } else {
6971 orig_exists = state->exists;
6972 orig_size = state->accounted_size;
6973 }
6974
6975 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
6976
6977 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
6978
6979 if (versioned_op) {
6980 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
6981 }
6982
6983 if (!index_op->is_prepared()) {
6984 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
6985 if (r < 0)
6986 return r;
6987 }
6988
6989 r = ref.ioctx.operate(ref.oid, &op);
6990 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
6991 or -ENOENT if was removed, or -EEXIST if it did not exist
6992 before and now it does */
6993 if (r == -EEXIST && assume_noent) {
6994 target->invalidate_state();
6995 return r;
6996 }
6997 goto done_cancel;
6998 }
6999
7000 epoch = ref.ioctx.get_last_version();
7001 poolid = ref.ioctx.get_id();
7002
7003 r = target->complete_atomic_modification();
7004 if (r < 0) {
7005 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7006 }
7007
7008 r = index_op->complete(poolid, epoch, size, accounted_size,
7009 meta.set_mtime, etag, content_type, &acl_bl,
7010 meta.category, meta.remove_objs, meta.user_data);
7011 if (r < 0)
7012 goto done_cancel;
7013
7014 if (meta.mtime) {
7015 *meta.mtime = meta.set_mtime;
7016 }
7017
7018 /* note that index_op was using state so we couldn't invalidate it earlier */
7019 target->invalidate_state();
7020 state = NULL;
7021
7022 if (versioned_op) {
7023 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
7024 if (r < 0) {
7025 return r;
7026 }
7027 }
7028
7029 if (!real_clock::is_zero(meta.delete_at)) {
7030 rgw_obj_index_key obj_key;
7031 obj.key.get_index_key(&obj_key);
7032
7033 r = store->objexp_hint_add(meta.delete_at,
7034 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7035 if (r < 0) {
7036 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7037 /* ignoring error, nothing we can do at this point */
7038 }
7039 }
7040 meta.canceled = false;
7041
7042 /* update quota cache */
7043 if (meta.completeMultipart){
7044 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7045 0, orig_size);
7046 }
7047 else {
7048 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7049 accounted_size, orig_size);
7050 }
7051 return 0;
7052
7053 done_cancel:
7054 int ret = index_op->cancel();
7055 if (ret < 0) {
7056 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7057 }
7058
7059 meta.canceled = true;
7060
7061 /* we lost in a race. There are a few options:
7062 * - existing object was rewritten (ECANCELED)
7063 * - non existing object was created (EEXIST)
7064 * - object was removed (ENOENT)
7065 * should treat it as a success
7066 */
7067 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7068 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7069 r = 0;
7070 }
7071 } else {
7072 if (meta.if_match != NULL) {
7073 // only overwrite existing object
7074 if (strcmp(meta.if_match, "*") == 0) {
7075 if (r == -ENOENT) {
7076 r = -ERR_PRECONDITION_FAILED;
7077 } else if (r == -ECANCELED) {
7078 r = 0;
7079 }
7080 }
7081 }
7082
7083 if (meta.if_nomatch != NULL) {
7084 // only create a new object
7085 if (strcmp(meta.if_nomatch, "*") == 0) {
7086 if (r == -EEXIST) {
7087 r = -ERR_PRECONDITION_FAILED;
7088 } else if (r == -ENOENT) {
7089 r = 0;
7090 }
7091 }
7092 }
7093 }
7094
7095 return r;
7096 }
7097
7098 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7099 map<string, bufferlist>& attrs)
7100 {
7101 RGWBucketInfo& bucket_info = target->get_bucket_info();
7102
7103 RGWRados::Bucket bop(target->get_store(), bucket_info);
7104 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
7105 index_op.set_zones_trace(meta.zones_trace);
7106
7107 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7108 int r;
7109 if (assume_noent) {
7110 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7111 if (r == -EEXIST) {
7112 assume_noent = false;
7113 }
7114 }
7115 if (!assume_noent) {
7116 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7117 }
7118 return r;
7119 }
7120
7121 /** Write/overwrite a system object. */
7122 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7123 map<std::string, bufferlist>& attrs, int flags,
7124 bufferlist& data,
7125 RGWObjVersionTracker *objv_tracker,
7126 real_time set_mtime /* 0 for don't set */)
7127 {
7128 rgw_rados_ref ref;
7129 int r = get_system_obj_ref(obj, &ref);
7130 if (r < 0)
7131 return r;
7132
7133 ObjectWriteOperation op;
7134
7135 if (flags & PUT_OBJ_EXCL) {
7136 if (!(flags & PUT_OBJ_CREATE))
7137 return -EINVAL;
7138 op.create(true); // exclusive create
7139 } else {
7140 op.remove();
7141 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7142 op.create(false);
7143 }
7144
7145 if (objv_tracker) {
7146 objv_tracker->prepare_op_for_write(&op);
7147 }
7148
7149 if (real_clock::is_zero(set_mtime)) {
7150 set_mtime = real_clock::now();
7151 }
7152
7153 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7154 op.mtime2(&mtime_ts);
7155 op.write_full(data);
7156
7157 bufferlist acl_bl;
7158
7159 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7160 const string& name = iter->first;
7161 bufferlist& bl = iter->second;
7162
7163 if (!bl.length())
7164 continue;
7165
7166 op.setxattr(name.c_str(), bl);
7167 }
7168
7169 r = ref.ioctx.operate(ref.oid, &op);
7170 if (r < 0) {
7171 return r;
7172 }
7173
7174 if (objv_tracker) {
7175 objv_tracker->apply_write();
7176 }
7177
7178 if (mtime) {
7179 *mtime = set_mtime;
7180 }
7181
7182 return 0;
7183 }
7184
7185 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7186 off_t ofs, bool exclusive,
7187 RGWObjVersionTracker *objv_tracker)
7188 {
7189 rgw_rados_ref ref;
7190 int r = get_system_obj_ref(obj, &ref);
7191 if (r < 0) {
7192 return r;
7193 }
7194
7195 ObjectWriteOperation op;
7196
7197 if (exclusive)
7198 op.create(true);
7199
7200 if (objv_tracker) {
7201 objv_tracker->prepare_op_for_write(&op);
7202 }
7203 if (ofs == -1) {
7204 op.write_full(bl);
7205 } else {
7206 op.write(ofs, bl);
7207 }
7208 r = ref.ioctx.operate(ref.oid, &op);
7209 if (r < 0)
7210 return r;
7211
7212 if (objv_tracker) {
7213 objv_tracker->apply_write();
7214 }
7215 return 0;
7216 }
7217
7218 /**
7219 * Write/overwrite an object to the bucket storage.
7220 * bucket: the bucket to store the object in
7221 * obj: the object name/key
7222 * data: the object contents/value
7223 * offset: the offet to write to in the object
7224 * If this is -1, we will overwrite the whole object.
7225 * size: the amount of data to write (data must be this long)
7226 * attrs: all the given attrs are written to bucket storage for the given object
7227 * Returns: 0 on success, -ERR# otherwise.
7228 */
7229
7230 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7231 off_t ofs, bool exclusive,
7232 void **handle)
7233 {
7234 rgw_rados_ref ref;
7235 int r = get_raw_obj_ref(obj, &ref);
7236 if (r < 0) {
7237 return r;
7238 }
7239
7240 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7241 *handle = c;
7242
7243 ObjectWriteOperation op;
7244
7245 if (exclusive)
7246 op.create(true);
7247
7248 if (ofs == -1) {
7249 op.write_full(bl);
7250 } else {
7251 op.write(ofs, bl);
7252 }
7253 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7254 if (r < 0)
7255 return r;
7256
7257 return 0;
7258 }
7259
7260 int RGWRados::aio_wait(void *handle)
7261 {
7262 AioCompletion *c = (AioCompletion *)handle;
7263 c->wait_for_safe();
7264 int ret = c->get_return_value();
7265 c->release();
7266 return ret;
7267 }
7268
7269 bool RGWRados::aio_completed(void *handle)
7270 {
7271 AioCompletion *c = (AioCompletion *)handle;
7272 return c->is_safe();
7273 }
7274
7275 class RGWRadosPutObj : public RGWGetDataCB
7276 {
7277 CephContext* cct;
7278 rgw_obj obj;
7279 RGWPutObjDataProcessor *filter;
7280 boost::optional<RGWPutObj_Compress>& compressor;
7281 CompressorRef& plugin;
7282 RGWPutObjProcessor_Atomic *processor;
7283 RGWOpStateSingleOp *opstate;
7284 void (*progress_cb)(off_t, void *);
7285 void *progress_data;
7286 bufferlist extra_data_bl;
7287 uint64_t extra_data_left;
7288 uint64_t data_len;
7289 map<string, bufferlist> src_attrs;
7290 public:
7291 RGWRadosPutObj(CephContext* cct,
7292 CompressorRef& plugin,
7293 boost::optional<RGWPutObj_Compress>& compressor,
7294 RGWPutObjProcessor_Atomic *p,
7295 RGWOpStateSingleOp *_ops,
7296 void (*_progress_cb)(off_t, void *),
7297 void *_progress_data) :
7298 cct(cct),
7299 filter(p),
7300 compressor(compressor),
7301 plugin(plugin),
7302 processor(p),
7303 opstate(_ops),
7304 progress_cb(_progress_cb),
7305 progress_data(_progress_data),
7306 extra_data_left(0),
7307 data_len(0) {}
7308
7309 int process_attrs(void) {
7310 if (extra_data_bl.length()) {
7311 JSONParser jp;
7312 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7313 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7314 return -EIO;
7315 }
7316
7317 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7318
7319 src_attrs.erase(RGW_ATTR_COMPRESSION);
7320 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7321 }
7322
7323 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7324 //do not compress if object is encrypted
7325 compressor = boost::in_place(cct, plugin, filter);
7326 filter = &*compressor;
7327 }
7328 return 0;
7329 }
7330
7331 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7332 if (progress_cb) {
7333 progress_cb(ofs, progress_data);
7334 }
7335 if (extra_data_left) {
7336 size_t extra_len = bl.length();
7337 if (extra_len > extra_data_left)
7338 extra_len = extra_data_left;
7339
7340 bufferlist extra;
7341 bl.splice(0, extra_len, &extra);
7342 extra_data_bl.append(extra);
7343
7344 extra_data_left -= extra_len;
7345 if (extra_data_left == 0) {
7346 int res = process_attrs();
7347 if (res < 0)
7348 return res;
7349 }
7350 if (bl.length() == 0) {
7351 return 0;
7352 }
7353 ofs += extra_len;
7354 }
7355 // adjust ofs based on extra_data_len, so the result is a logical offset
7356 // into the object data
7357 assert(uint64_t(ofs) >= extra_data_len);
7358 ofs -= extra_data_len;
7359
7360 data_len += bl.length();
7361 bool again = false;
7362
7363 bool need_opstate = true;
7364
7365 do {
7366 void *handle = NULL;
7367 rgw_raw_obj obj;
7368 uint64_t size = bl.length();
7369 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7370 if (ret < 0)
7371 return ret;
7372
7373 if (need_opstate && opstate) {
7374 /* need to update opstate repository with new state. This is ratelimited, so we're not
7375 * really doing it every time
7376 */
7377 ret = opstate->renew_state();
7378 if (ret < 0) {
7379 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7380 int r = filter->throttle_data(handle, obj, size, false);
7381 if (r < 0) {
7382 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7383 }
7384 /* could not renew state! might have been marked as cancelled */
7385 return ret;
7386 }
7387 need_opstate = false;
7388 }
7389
7390 ret = filter->throttle_data(handle, obj, size, false);
7391 if (ret < 0)
7392 return ret;
7393 } while (again);
7394
7395 return 0;
7396 }
7397
7398 bufferlist& get_extra_data() { return extra_data_bl; }
7399
7400 map<string, bufferlist>& get_attrs() { return src_attrs; }
7401
7402 void set_extra_data_len(uint64_t len) override {
7403 extra_data_left = len;
7404 RGWGetDataCB::set_extra_data_len(len);
7405 }
7406
7407 uint64_t get_data_len() {
7408 return data_len;
7409 }
7410
7411 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7412 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7413 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7414 }
7415
7416 bool is_canceled() {
7417 return processor->is_canceled();
7418 }
7419 };
7420
7421 /*
7422 * prepare attrset depending on attrs_mod.
7423 */
7424 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7425 map<string, bufferlist>& attrs,
7426 RGWRados::AttrsMod attrs_mod)
7427 {
7428 switch (attrs_mod) {
7429 case RGWRados::ATTRSMOD_NONE:
7430 attrs = src_attrs;
7431 break;
7432 case RGWRados::ATTRSMOD_REPLACE:
7433 if (!attrs[RGW_ATTR_ETAG].length()) {
7434 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7435 }
7436 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7437 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7438 if (ttiter != src_attrs.end()) {
7439 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7440 }
7441 }
7442 break;
7443 case RGWRados::ATTRSMOD_MERGE:
7444 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7445 if (attrs.find(it->first) == attrs.end()) {
7446 attrs[it->first] = it->second;
7447 }
7448 }
7449 break;
7450 }
7451 }
7452
7453 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7454 {
7455 map<string, bufferlist> attrset;
7456
7457 real_time mtime;
7458 uint64_t obj_size;
7459 RGWObjectCtx rctx(this);
7460
7461 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7462 RGWRados::Object::Read read_op(&op_target);
7463
7464 read_op.params.attrs = &attrset;
7465 read_op.params.lastmod = &mtime;
7466 read_op.params.obj_size = &obj_size;
7467
7468 int ret = read_op.prepare();
7469 if (ret < 0)
7470 return ret;
7471
7472 attrset.erase(RGW_ATTR_ID_TAG);
7473 attrset.erase(RGW_ATTR_TAIL_TAG);
7474
7475 uint64_t max_chunk_size;
7476
7477 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7478 if (ret < 0) {
7479 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7480 return ret;
7481 }
7482
7483 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj,
7484 max_chunk_size, NULL, mtime, attrset,
7485 RGW_OBJ_CATEGORY_MAIN, 0, real_time(),
7486 (obj.key.instance.empty() ? NULL : &(obj.key.instance)),
7487 NULL, NULL);
7488 }
7489
7490 struct obj_time_weight {
7491 real_time mtime;
7492 uint32_t zone_short_id;
7493 uint64_t pg_ver;
7494 bool high_precision;
7495
7496 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7497
7498 bool compare_low_precision(const obj_time_weight& rhs) {
7499 struct timespec l = ceph::real_clock::to_timespec(mtime);
7500 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7501 l.tv_nsec = 0;
7502 r.tv_nsec = 0;
7503 if (l > r) {
7504 return false;
7505 }
7506 if (l < r) {
7507 return true;
7508 }
7509 if (zone_short_id != rhs.zone_short_id) {
7510 return (zone_short_id < rhs.zone_short_id);
7511 }
7512 return (pg_ver < rhs.pg_ver);
7513
7514 }
7515
7516 bool operator<(const obj_time_weight& rhs) {
7517 if (!high_precision || !rhs.high_precision) {
7518 return compare_low_precision(rhs);
7519 }
7520 if (mtime > rhs.mtime) {
7521 return false;
7522 }
7523 if (mtime < rhs.mtime) {
7524 return true;
7525 }
7526 if (zone_short_id != rhs.zone_short_id) {
7527 return (zone_short_id < rhs.zone_short_id);
7528 }
7529 return (pg_ver < rhs.pg_ver);
7530 }
7531
7532 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7533 mtime = _mtime;
7534 zone_short_id = _short_id;
7535 pg_ver = _pg_ver;
7536 }
7537
7538 void init(RGWObjState *state) {
7539 mtime = state->mtime;
7540 zone_short_id = state->zone_short_id;
7541 pg_ver = state->pg_ver;
7542 }
7543 };
7544
7545 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7546 out << o.mtime;
7547
7548 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7549 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7550 }
7551
7552 return out;
7553 }
7554
7555 class RGWGetExtraDataCB : public RGWGetDataCB {
7556 bufferlist extra_data;
7557 public:
7558 RGWGetExtraDataCB() {}
7559 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7560 if (extra_data.length() < extra_data_len) {
7561 off_t max = extra_data_len - extra_data.length();
7562 if (max > bl_len) {
7563 max = bl_len;
7564 }
7565 bl.splice(0, max, &extra_data);
7566 }
7567 return bl_len;
7568 }
7569
7570 bufferlist& get_extra_data() {
7571 return extra_data;
7572 }
7573 };
7574
7575 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7576 const rgw_user& user_id,
7577 const string& client_id,
7578 req_info *info,
7579 const string& source_zone,
7580 rgw_obj& src_obj,
7581 RGWBucketInfo& src_bucket_info,
7582 real_time *src_mtime,
7583 uint64_t *psize,
7584 const real_time *mod_ptr,
7585 const real_time *unmod_ptr,
7586 bool high_precision_time,
7587 const char *if_match,
7588 const char *if_nomatch,
7589 map<string, bufferlist> *pattrs,
7590 string *version_id,
7591 string *ptag,
7592 string *petag)
7593 {
7594 /* source is in a different zonegroup, copy from there */
7595
7596 RGWRESTStreamRWRequest *in_stream_req;
7597 string tag;
7598 map<string, bufferlist> src_attrs;
7599 append_rand_alpha(cct, tag, tag, 32);
7600 obj_time_weight set_mtime_weight;
7601 set_mtime_weight.high_precision = high_precision_time;
7602
7603 RGWRESTConn *conn;
7604 if (source_zone.empty()) {
7605 if (src_bucket_info.zonegroup.empty()) {
7606 /* source is in the master zonegroup */
7607 conn = rest_master_conn;
7608 } else {
7609 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7610 if (iter == zonegroup_conn_map.end()) {
7611 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7612 return -ENOENT;
7613 }
7614 conn = iter->second;
7615 }
7616 } else {
7617 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7618 if (iter == zone_conn_map.end()) {
7619 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7620 return -ENOENT;
7621 }
7622 conn = iter->second;
7623 }
7624
7625 RGWGetExtraDataCB cb;
7626 string etag;
7627 map<string, string> req_headers;
7628 real_time set_mtime;
7629
7630 const real_time *pmod = mod_ptr;
7631
7632 obj_time_weight dest_mtime_weight;
7633
7634 constexpr bool prepend_meta = true;
7635 constexpr bool get_op = true;
7636 constexpr bool rgwx_stat = true;
7637 constexpr bool sync_manifest = true;
7638 constexpr bool skip_decrypt = true;
7639 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7640 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7641 prepend_meta, get_op, rgwx_stat,
7642 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7643 if (ret < 0) {
7644 return ret;
7645 }
7646
7647 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7648 if (ret < 0) {
7649 return ret;
7650 }
7651
7652 bufferlist& extra_data_bl = cb.get_extra_data();
7653 if (extra_data_bl.length()) {
7654 JSONParser jp;
7655 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7656 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7657 return -EIO;
7658 }
7659
7660 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7661
7662 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7663 }
7664
7665 if (src_mtime) {
7666 *src_mtime = set_mtime;
7667 }
7668
7669 if (petag) {
7670 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7671 if (iter != src_attrs.end()) {
7672 bufferlist& etagbl = iter->second;
7673 *petag = etagbl.to_str();
7674 }
7675 }
7676
7677 if (pattrs) {
7678 *pattrs = src_attrs;
7679 }
7680
7681 return 0;
7682 }
7683
7684 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7685 const rgw_user& user_id,
7686 const string& client_id,
7687 const string& op_id,
7688 bool record_op_state,
7689 req_info *info,
7690 const string& source_zone,
7691 rgw_obj& dest_obj,
7692 rgw_obj& src_obj,
7693 RGWBucketInfo& dest_bucket_info,
7694 RGWBucketInfo& src_bucket_info,
7695 real_time *src_mtime,
7696 real_time *mtime,
7697 const real_time *mod_ptr,
7698 const real_time *unmod_ptr,
7699 bool high_precision_time,
7700 const char *if_match,
7701 const char *if_nomatch,
7702 AttrsMod attrs_mod,
7703 bool copy_if_newer,
7704 map<string, bufferlist>& attrs,
7705 RGWObjCategory category,
7706 uint64_t olh_epoch,
7707 real_time delete_at,
7708 string *version_id,
7709 string *ptag,
7710 ceph::buffer::list *petag,
7711 void (*progress_cb)(off_t, void *),
7712 void *progress_data,
7713 rgw_zone_set *zones_trace)
7714 {
7715 /* source is in a different zonegroup, copy from there */
7716
7717 RGWRESTStreamRWRequest *in_stream_req;
7718 string tag;
7719 int i;
7720 append_rand_alpha(cct, tag, tag, 32);
7721 obj_time_weight set_mtime_weight;
7722 set_mtime_weight.high_precision = high_precision_time;
7723
7724 RGWPutObjProcessor_Atomic processor(obj_ctx,
7725 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7726 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7727 if (version_id && *version_id != "null") {
7728 processor.set_version_id(*version_id);
7729 }
7730 processor.set_olh_epoch(olh_epoch);
7731 int ret = processor.prepare(this, NULL);
7732 if (ret < 0) {
7733 return ret;
7734 }
7735
7736 RGWRESTConn *conn;
7737 if (source_zone.empty()) {
7738 if (dest_bucket_info.zonegroup.empty()) {
7739 /* source is in the master zonegroup */
7740 conn = rest_master_conn;
7741 } else {
7742 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7743 if (iter == zonegroup_conn_map.end()) {
7744 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7745 return -ENOENT;
7746 }
7747 conn = iter->second;
7748 }
7749 } else {
7750 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7751 if (iter == zone_conn_map.end()) {
7752 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7753 return -ENOENT;
7754 }
7755 conn = iter->second;
7756 }
7757
7758 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7759
7760 RGWOpStateSingleOp *opstate = NULL;
7761
7762 if (record_op_state) {
7763 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7764
7765 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7766 if (ret < 0) {
7767 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7768 delete opstate;
7769 return ret;
7770 }
7771 }
7772
7773 boost::optional<RGWPutObj_Compress> compressor;
7774 CompressorRef plugin;
7775
7776 const auto& compression_type = zone_params.get_compression_type(
7777 dest_bucket_info.placement_rule);
7778 if (compression_type != "none") {
7779 plugin = Compressor::create(cct, compression_type);
7780 if (!plugin) {
7781 ldout(cct, 1) << "Cannot load plugin for compression type "
7782 << compression_type << dendl;
7783 }
7784 }
7785
7786 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7787
7788 string etag;
7789 map<string, string> req_headers;
7790 real_time set_mtime;
7791
7792 RGWObjState *dest_state = NULL;
7793
7794 const real_time *pmod = mod_ptr;
7795
7796 obj_time_weight dest_mtime_weight;
7797
7798 if (copy_if_newer) {
7799 /* need to get mtime for destination */
7800 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7801 if (ret < 0)
7802 goto set_err_state;
7803
7804 if (!real_clock::is_zero(dest_state->mtime)) {
7805 dest_mtime_weight.init(dest_state);
7806 pmod = &dest_mtime_weight.mtime;
7807 }
7808 }
7809
7810 static constexpr bool prepend_meta = true;
7811 static constexpr bool get_op = true;
7812 static constexpr bool rgwx_stat = false;
7813 static constexpr bool sync_manifest = true;
7814 static constexpr bool skip_decrypt = true;
7815 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7816 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7817 prepend_meta, get_op, rgwx_stat,
7818 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7819 if (ret < 0) {
7820 goto set_err_state;
7821 }
7822
7823 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7824 if (ret < 0) {
7825 goto set_err_state;
7826 }
7827 if (compressor && compressor->is_compressed()) {
7828 bufferlist tmp;
7829 RGWCompressionInfo cs_info;
7830 cs_info.compression_type = plugin->get_type_name();
7831 cs_info.orig_size = cb.get_data_len();
7832 cs_info.blocks = move(compressor->get_compression_blocks());
7833 ::encode(cs_info, tmp);
7834 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7835 }
7836
7837 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7838 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7839 } else {
7840 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7841 if (iter != cb.get_attrs().end()) {
7842 try {
7843 ::decode(delete_at, iter->second);
7844 } catch (buffer::error& err) {
7845 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7846 }
7847 }
7848 }
7849
7850 if (src_mtime) {
7851 *src_mtime = set_mtime;
7852 }
7853
7854 if (petag) {
7855 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7856 if (iter != cb.get_attrs().end()) {
7857 *petag = iter->second;
7858 }
7859 }
7860
7861 if (source_zone.empty()) {
7862 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7863 } else {
7864 attrs = cb.get_attrs();
7865 }
7866
7867 if (copy_if_newer) {
7868 uint64_t pg_ver = 0;
7869 auto i = attrs.find(RGW_ATTR_PG_VER);
7870 if (i != attrs.end() && i->second.length() > 0) {
7871 bufferlist::iterator iter = i->second.begin();
7872 try {
7873 ::decode(pg_ver, iter);
7874 } catch (buffer::error& err) {
7875 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7876 /* non critical error */
7877 }
7878 }
7879 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7880 }
7881
7882 #define MAX_COMPLETE_RETRY 100
7883 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7884 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7885 if (ret < 0) {
7886 goto set_err_state;
7887 }
7888 if (copy_if_newer && cb.is_canceled()) {
7889 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7890 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7891 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7892 if (ret < 0) {
7893 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7894 goto set_err_state;
7895 }
7896 dest_mtime_weight.init(dest_state);
7897 dest_mtime_weight.high_precision = high_precision_time;
7898 if (!dest_state->exists ||
7899 dest_mtime_weight < set_mtime_weight) {
7900 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7901 continue;
7902 } else {
7903 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7904 }
7905 }
7906 break;
7907 }
7908
7909 if (i == MAX_COMPLETE_RETRY) {
7910 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7911 ret = -EIO;
7912 goto set_err_state;
7913 }
7914
7915 if (opstate) {
7916 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7917 if (ret < 0) {
7918 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7919 }
7920 delete opstate;
7921 }
7922
7923 return 0;
7924 set_err_state:
7925 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7926 ret = 0;
7927 }
7928 if (opstate) {
7929 RGWOpState::OpState state;
7930 if (ret < 0) {
7931 state = RGWOpState::OPSTATE_ERROR;
7932 } else {
7933 state = RGWOpState::OPSTATE_COMPLETE;
7934 }
7935 int r = opstate->set_state(state);
7936 if (r < 0) {
7937 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7938 }
7939 delete opstate;
7940 }
7941 return ret;
7942 }
7943
7944
7945 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7946 map<string, bufferlist>& src_attrs,
7947 RGWRados::Object::Read& read_op,
7948 const rgw_user& user_id,
7949 rgw_obj& dest_obj,
7950 real_time *mtime)
7951 {
7952 string etag;
7953
7954 RGWRESTStreamWriteRequest *out_stream_req;
7955
7956 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
7957 if (ret < 0) {
7958 return ret;
7959 }
7960
7961 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
7962 if (ret < 0) {
7963 delete out_stream_req;
7964 return ret;
7965 }
7966
7967 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
7968 if (ret < 0)
7969 return ret;
7970
7971 return 0;
7972 }
7973
7974 /**
7975 * Copy an object.
7976 * dest_obj: the object to copy into
7977 * src_obj: the object to copy from
7978 * attrs: usage depends on attrs_mod parameter
7979 * attrs_mod: the modification mode of the attrs, may have the following values:
7980 * ATTRSMOD_NONE - the attributes of the source object will be
7981 * copied without modifications, attrs parameter is ignored;
7982 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
7983 * parameter, source object attributes are not copied;
7984 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
7985 * are overwritten by values contained in attrs parameter.
7986 * err: stores any errors resulting from the get of the original object
7987 * Returns: 0 on success, -ERR# otherwise.
7988 */
7989 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
7990 const rgw_user& user_id,
7991 const string& client_id,
7992 const string& op_id,
7993 req_info *info,
7994 const string& source_zone,
7995 rgw_obj& dest_obj,
7996 rgw_obj& src_obj,
7997 RGWBucketInfo& dest_bucket_info,
7998 RGWBucketInfo& src_bucket_info,
7999 real_time *src_mtime,
8000 real_time *mtime,
8001 const real_time *mod_ptr,
8002 const real_time *unmod_ptr,
8003 bool high_precision_time,
8004 const char *if_match,
8005 const char *if_nomatch,
8006 AttrsMod attrs_mod,
8007 bool copy_if_newer,
8008 map<string, bufferlist>& attrs,
8009 RGWObjCategory category,
8010 uint64_t olh_epoch,
8011 real_time delete_at,
8012 string *version_id,
8013 string *ptag,
8014 ceph::buffer::list *petag,
8015 void (*progress_cb)(off_t, void *),
8016 void *progress_data)
8017 {
8018 int ret;
8019 uint64_t obj_size;
8020 rgw_obj shadow_obj = dest_obj;
8021 string shadow_oid;
8022
8023 bool remote_src;
8024 bool remote_dest;
8025
8026 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
8027 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
8028
8029 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
8030 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
8031
8032 if (remote_src && remote_dest) {
8033 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
8034 return -EINVAL;
8035 }
8036
8037 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
8038
8039 if (remote_src || !source_zone.empty()) {
8040 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
8041 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
8042 unmod_ptr, high_precision_time,
8043 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
8044 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
8045 }
8046
8047 map<string, bufferlist> src_attrs;
8048 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8049 RGWRados::Object::Read read_op(&src_op_target);
8050
8051 read_op.conds.mod_ptr = mod_ptr;
8052 read_op.conds.unmod_ptr = unmod_ptr;
8053 read_op.conds.high_precision_time = high_precision_time;
8054 read_op.conds.if_match = if_match;
8055 read_op.conds.if_nomatch = if_nomatch;
8056 read_op.params.attrs = &src_attrs;
8057 read_op.params.lastmod = src_mtime;
8058 read_op.params.obj_size = &obj_size;
8059
8060 ret = read_op.prepare();
8061 if (ret < 0) {
8062 return ret;
8063 }
8064
8065 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8066 src_attrs.erase(RGW_ATTR_DELETE_AT);
8067
8068 set_copy_attrs(src_attrs, attrs, attrs_mod);
8069 attrs.erase(RGW_ATTR_ID_TAG);
8070 attrs.erase(RGW_ATTR_PG_VER);
8071 attrs.erase(RGW_ATTR_SOURCE_ZONE);
8072 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8073 if (cmp != src_attrs.end())
8074 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8075
8076 RGWObjManifest manifest;
8077 RGWObjState *astate = NULL;
8078
8079 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8080 if (ret < 0) {
8081 return ret;
8082 }
8083
8084 vector<rgw_raw_obj> ref_objs;
8085
8086 if (remote_dest) {
8087 /* dest is in a different zonegroup, copy it there */
8088 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8089 }
8090 uint64_t max_chunk_size;
8091
8092 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8093 if (ret < 0) {
8094 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8095 return ret;
8096 }
8097
8098 rgw_pool src_pool;
8099 rgw_pool dest_pool;
8100 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8101 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8102 return -EIO;
8103 }
8104 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8105 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8106 return -EIO;
8107 }
8108
8109
8110 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8111 bool copy_first = false;
8112 if (astate->has_manifest) {
8113 if (!astate->manifest.has_tail()) {
8114 copy_data = true;
8115 } else {
8116 uint64_t head_size = astate->manifest.get_head_size();
8117
8118 if (head_size > 0) {
8119 if (head_size > max_chunk_size) {
8120 copy_data = true;
8121 } else {
8122 copy_first = true;
8123 }
8124 }
8125 }
8126 }
8127
8128 if (petag) {
8129 const auto iter = attrs.find(RGW_ATTR_ETAG);
8130 if (iter != attrs.end()) {
8131 *petag = iter->second;
8132 }
8133 }
8134
8135 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8136 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8137 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
8138 version_id, ptag, petag);
8139 }
8140
8141 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8142
8143 if (copy_first) { // we need to copy first chunk, not increase refcount
8144 ++miter;
8145 }
8146
8147 rgw_rados_ref ref;
8148 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8149 if (ret < 0) {
8150 return ret;
8151 }
8152
8153 bool versioned_dest = dest_bucket_info.versioning_enabled();
8154
8155 if (version_id && !version_id->empty()) {
8156 versioned_dest = true;
8157 dest_obj.key.set_instance(*version_id);
8158 } else if (versioned_dest) {
8159 gen_rand_obj_instance_name(&dest_obj);
8160 }
8161
8162 bufferlist first_chunk;
8163
8164 bool copy_itself = (dest_obj == src_obj);
8165 RGWObjManifest *pmanifest;
8166 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
8167
8168 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8169 RGWRados::Object::Write write_op(&dest_op_target);
8170
8171 string tag;
8172
8173 if (ptag) {
8174 tag = *ptag;
8175 }
8176
8177 if (tag.empty()) {
8178 append_rand_alpha(cct, tag, tag, 32);
8179 }
8180
8181 if (!copy_itself) {
8182 attrs.erase(RGW_ATTR_TAIL_TAG);
8183 manifest = astate->manifest;
8184 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8185 if (tail_placement.bucket.name.empty()) {
8186 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8187 }
8188 string ref_tag;
8189 for (; miter != astate->manifest.obj_end(); ++miter) {
8190 ObjectWriteOperation op;
8191 ref_tag = tag + '\0';
8192 cls_refcount_get(op, ref_tag, true);
8193 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8194 ref.ioctx.locator_set_key(loc.loc);
8195
8196 ret = ref.ioctx.operate(loc.oid, &op);
8197 if (ret < 0) {
8198 goto done_ret;
8199 }
8200
8201 ref_objs.push_back(loc);
8202 }
8203
8204 pmanifest = &manifest;
8205 } else {
8206 pmanifest = &astate->manifest;
8207 /* don't send the object's tail for garbage collection */
8208 astate->keep_tail = true;
8209 }
8210
8211 if (copy_first) {
8212 ret = read_op.read(0, max_chunk_size, first_chunk);
8213 if (ret < 0) {
8214 goto done_ret;
8215 }
8216
8217 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8218 } else {
8219 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8220 }
8221
8222 write_op.meta.data = &first_chunk;
8223 write_op.meta.manifest = pmanifest;
8224 write_op.meta.ptag = &tag;
8225 write_op.meta.owner = dest_bucket_info.owner;
8226 write_op.meta.mtime = mtime;
8227 write_op.meta.flags = PUT_OBJ_CREATE;
8228 write_op.meta.category = category;
8229 write_op.meta.olh_epoch = olh_epoch;
8230 write_op.meta.delete_at = delete_at;
8231 write_op.meta.modify_tail = !copy_itself;
8232
8233 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8234 if (ret < 0) {
8235 goto done_ret;
8236 }
8237
8238 return 0;
8239
8240 done_ret:
8241 if (!copy_itself) {
8242 vector<rgw_raw_obj>::iterator riter;
8243
8244 /* rollback reference */
8245 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8246 ObjectWriteOperation op;
8247 cls_refcount_put(op, tag, true);
8248
8249 ref.ioctx.locator_set_key(riter->loc);
8250
8251 int r = ref.ioctx.operate(riter->oid, &op);
8252 if (r < 0) {
8253 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8254 }
8255 }
8256 }
8257 return ret;
8258 }
8259
8260
8261 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8262 RGWBucketInfo& dest_bucket_info,
8263 RGWRados::Object::Read& read_op, off_t end,
8264 rgw_obj& dest_obj,
8265 rgw_obj& src_obj,
8266 uint64_t max_chunk_size,
8267 real_time *mtime,
8268 real_time set_mtime,
8269 map<string, bufferlist>& attrs,
8270 RGWObjCategory category,
8271 uint64_t olh_epoch,
8272 real_time delete_at,
8273 string *version_id,
8274 string *ptag,
8275 ceph::buffer::list *petag)
8276 {
8277 bufferlist first_chunk;
8278 RGWObjManifest manifest;
8279
8280 string tag;
8281 append_rand_alpha(cct, tag, tag, 32);
8282
8283 RGWPutObjProcessor_Atomic processor(obj_ctx,
8284 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
8285 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8286 if (version_id) {
8287 processor.set_version_id(*version_id);
8288 }
8289 processor.set_olh_epoch(olh_epoch);
8290 int ret = processor.prepare(this, NULL);
8291 if (ret < 0)
8292 return ret;
8293
8294 off_t ofs = 0;
8295
8296 do {
8297 bufferlist bl;
8298 ret = read_op.read(ofs, end, bl);
8299
8300 uint64_t read_len = ret;
8301 bool again;
8302
8303 do {
8304 void *handle;
8305 rgw_raw_obj obj;
8306
8307 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8308 if (ret < 0) {
8309 return ret;
8310 }
8311 ret = processor.throttle_data(handle, obj, read_len, false);
8312 if (ret < 0)
8313 return ret;
8314 } while (again);
8315
8316 ofs += read_len;
8317 } while (ofs <= end);
8318
8319 string etag;
8320 auto iter = attrs.find(RGW_ATTR_ETAG);
8321 if (iter != attrs.end()) {
8322 bufferlist& bl = iter->second;
8323 etag = string(bl.c_str(), bl.length());
8324 if (petag) {
8325 *petag = bl;
8326 }
8327 }
8328
8329 uint64_t accounted_size;
8330 {
8331 bool compressed{false};
8332 RGWCompressionInfo cs_info;
8333 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8334 if (ret < 0) {
8335 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8336 return ret;
8337 }
8338 // pass original size if compressed
8339 accounted_size = compressed ? cs_info.orig_size : ofs;
8340 }
8341
8342 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8343 }
8344
8345 bool RGWRados::is_meta_master()
8346 {
8347 if (!get_zonegroup().is_master_zonegroup()) {
8348 return false;
8349 }
8350
8351 return (get_zonegroup().master_zone == zone_public_config.id);
8352 }
8353
8354 /**
8355 * Check to see if the bucket metadata could be synced
8356 * bucket: the bucket to check
8357 * Returns false is the bucket is not synced
8358 */
8359 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8360 {
8361
8362 /* no current period */
8363 if (current_period.get_id().empty()) {
8364 return false;
8365 }
8366
8367 /* zonegroup is not master zonegroup */
8368 if (!get_zonegroup().is_master_zonegroup()) {
8369 return false;
8370 }
8371
8372 /* single zonegroup and a single zone */
8373 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
8374 return false;
8375 }
8376
8377 /* zone is not master */
8378 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8379 return false;
8380 }
8381
8382 return true;
8383 }
8384
8385 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8386 {
8387 std::map<string, rgw_bucket_dir_entry> ent_map;
8388 rgw_obj_index_key marker;
8389 string prefix;
8390 bool is_truncated;
8391
8392 do {
8393 #define NUM_ENTRIES 1000
8394 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
8395 &is_truncated, &marker);
8396 if (r < 0)
8397 return r;
8398
8399 string ns;
8400 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
8401 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
8402 rgw_obj_key obj;
8403
8404 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
8405 return -ENOTEMPTY;
8406 }
8407 } while (is_truncated);
8408 return 0;
8409 }
8410
8411 /**
8412 * Delete a bucket.
8413 * bucket: the name of the bucket to delete
8414 * Returns 0 on success, -ERR# otherwise.
8415 */
8416 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8417 {
8418 const rgw_bucket& bucket = bucket_info.bucket;
8419 librados::IoCtx index_ctx;
8420 map<int, string> bucket_objs;
8421 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8422 if (r < 0)
8423 return r;
8424
8425 if (check_empty) {
8426 r = check_bucket_empty(bucket_info);
8427 if (r < 0) {
8428 return r;
8429 }
8430 }
8431
8432 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8433 if (r < 0)
8434 return r;
8435
8436 /* if the bucket is not synced we can remove the meta file */
8437 if (!is_syncing_bucket_meta(bucket)) {
8438 RGWObjVersionTracker objv_tracker;
8439 string entry = bucket.get_key();
8440 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8441 if (r < 0) {
8442 return r;
8443 }
8444 /* remove bucket index objects*/
8445 map<int, string>::const_iterator biter;
8446 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8447 index_ctx.remove(biter->second);
8448 }
8449 }
8450 return 0;
8451 }
8452
8453 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8454 {
8455 RGWBucketInfo info;
8456 map<string, bufferlist> attrs;
8457 RGWObjectCtx obj_ctx(this);
8458 int r;
8459 if (bucket.bucket_id.empty()) {
8460 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8461 } else {
8462 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8463 }
8464 if (r < 0) {
8465 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8466 return r;
8467 }
8468
8469 info.owner = owner.get_id();
8470
8471 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8472 if (r < 0) {
8473 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8474 return r;
8475 }
8476
8477 return 0;
8478 }
8479
8480
8481 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8482 {
8483 int ret = 0;
8484
8485 vector<rgw_bucket>::iterator iter;
8486
8487 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8488 rgw_bucket& bucket = *iter;
8489 if (enabled)
8490 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8491 else
8492 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8493
8494 RGWBucketInfo info;
8495 map<string, bufferlist> attrs;
8496 RGWObjectCtx obj_ctx(this);
8497 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8498 if (r < 0) {
8499 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8500 ret = r;
8501 continue;
8502 }
8503 if (enabled) {
8504 info.flags &= ~BUCKET_SUSPENDED;
8505 } else {
8506 info.flags |= BUCKET_SUSPENDED;
8507 }
8508
8509 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8510 if (r < 0) {
8511 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8512 ret = r;
8513 continue;
8514 }
8515 }
8516 return ret;
8517 }
8518
8519 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8520 {
8521 RGWBucketInfo bucket_info;
8522 RGWObjectCtx obj_ctx(this);
8523 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8524 if (ret < 0) {
8525 return ret;
8526 }
8527
8528 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8529 return 0;
8530 }
8531
8532 int RGWRados::Object::complete_atomic_modification()
8533 {
8534 if (!state->has_manifest || state->keep_tail)
8535 return 0;
8536
8537 cls_rgw_obj_chain chain;
8538 store->update_gc_chain(obj, state->manifest, &chain);
8539
8540 if (chain.empty()) {
8541 return 0;
8542 }
8543
8544 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
8545 return store->gc->send_chain(chain, tag, false); // do it async
8546 }
8547
8548 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8549 {
8550 RGWObjManifest::obj_iterator iter;
8551 rgw_raw_obj raw_head;
8552 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8553 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8554 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8555 if (mobj == raw_head)
8556 continue;
8557 cls_rgw_obj_key key(mobj.oid);
8558 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8559 }
8560 }
8561
8562 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8563 {
8564 return gc->send_chain(chain, tag, sync);
8565 }
8566
8567 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8568 {
8569 const rgw_bucket& bucket = bucket_info.bucket;
8570 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8571 if (r < 0)
8572 return r;
8573
8574 if (bucket.bucket_id.empty()) {
8575 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8576 return -EIO;
8577 }
8578
8579 bucket_oid = dir_oid_prefix;
8580 bucket_oid.append(bucket.bucket_id);
8581
8582 return 0;
8583 }
8584
8585 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8586 string& bucket_oid_base) {
8587 const rgw_bucket& bucket = bucket_info.bucket;
8588 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8589 if (r < 0)
8590 return r;
8591
8592 if (bucket.bucket_id.empty()) {
8593 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8594 return -EIO;
8595 }
8596
8597 bucket_oid_base = dir_oid_prefix;
8598 bucket_oid_base.append(bucket.bucket_id);
8599
8600 return 0;
8601
8602 }
8603
8604 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8605 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8606 string bucket_oid_base;
8607 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8608 if (ret < 0) {
8609 return ret;
8610 }
8611
8612 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8613 if (bucket_instance_ids) {
8614 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8615 }
8616 return 0;
8617 }
8618
8619 template<typename T>
8620 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8621 map<int, string>& oids, map<int, T>& bucket_objs,
8622 int shard_id, map<int, string> *bucket_instance_ids)
8623 {
8624 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8625 if (ret < 0)
8626 return ret;
8627
8628 map<int, string>::const_iterator iter = oids.begin();
8629 for (; iter != oids.end(); ++iter) {
8630 bucket_objs[iter->first] = T();
8631 }
8632 return 0;
8633 }
8634
8635 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8636 const string& obj_key, string *bucket_obj, int *shard_id)
8637 {
8638 string bucket_oid_base;
8639 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8640 if (ret < 0)
8641 return ret;
8642
8643 RGWObjectCtx obj_ctx(this);
8644
8645 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8646 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8647 if (ret < 0) {
8648 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8649 return ret;
8650 }
8651 return 0;
8652 }
8653
8654 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8655 int shard_id, string *bucket_obj)
8656 {
8657 string bucket_oid_base;
8658 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8659 if (ret < 0)
8660 return ret;
8661
8662 RGWObjectCtx obj_ctx(this);
8663
8664 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8665 shard_id, bucket_obj);
8666 return 0;
8667 }
8668
8669 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8670 map<RGWObjCategory, RGWStorageStats>& stats)
8671 {
8672 for (const auto& pair : header.stats) {
8673 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8674 const rgw_bucket_category_stats& header_stats = pair.second;
8675
8676 RGWStorageStats& s = stats[category];
8677
8678 s.category = category;
8679 s.size += header_stats.total_size;
8680 s.size_rounded += header_stats.total_size_rounded;
8681 s.size_utilized += header_stats.actual_size;
8682 s.num_objects += header_stats.num_entries;
8683 }
8684 }
8685
8686 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8687 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8688 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8689 {
8690 librados::IoCtx index_ctx;
8691 // key - bucket index object id
8692 // value - bucket index check OP returned result with the given bucket index object (shard)
8693 map<int, string> oids;
8694 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8695
8696 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8697 if (ret < 0) {
8698 return ret;
8699 }
8700
8701 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8702 if (ret < 0) {
8703 return ret;
8704 }
8705
8706 // Aggregate results (from different shards if there is any)
8707 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8708 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8709 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8710 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8711 }
8712
8713 return 0;
8714 }
8715
8716 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8717 {
8718 librados::IoCtx index_ctx;
8719 map<int, string> bucket_objs;
8720
8721 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8722 if (r < 0) {
8723 return r;
8724 }
8725
8726 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8727 }
8728
8729 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8730 {
8731 librados::IoCtx index_ctx;
8732 map<int, string> bucket_objs;
8733
8734 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8735 if (r < 0) {
8736 return r;
8737 }
8738
8739 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8740 }
8741
8742 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8743 {
8744 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8745 std::string oid, key;
8746 get_obj_bucket_and_oid_loc(obj, oid, key);
8747 if (!rctx)
8748 return 0;
8749
8750 RGWObjState *state = NULL;
8751
8752 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8753 if (r < 0)
8754 return r;
8755
8756 if (!state->is_atomic) {
8757 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8758 return -EINVAL;
8759 }
8760
8761 string tag;
8762
8763 if (state->tail_tag.length() > 0) {
8764 tag = state->tail_tag.c_str();
8765 } else if (state->obj_tag.length() > 0) {
8766 tag = state->obj_tag.c_str();
8767 } else {
8768 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8769 return -EINVAL;
8770 }
8771
8772 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8773
8774 return gc->defer_chain(tag, false);
8775 }
8776
8777 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8778 {
8779 list<string> prefixes;
8780 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8781 cls_rgw_remove_obj(op, prefixes);
8782 }
8783
/**
 * Append the cls_rgw attrs-prefix check to an operation.
 * op: operation the check is added to
 * prefix: attr name prefix handed to the check
 * fail_if_exist: passed through unchanged to cls_rgw_obj_check_attrs_prefix()
 */
void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
{
  cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
}
8788
/**
 * Append the cls_rgw mtime check to an operation.
 * op: operation the check is added to
 * mtime: time the check compares against
 * high_precision_time, type: passed through unchanged to
 *                            cls_rgw_obj_check_mtime()
 */
void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
{
  cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
}
8793
8794
/**
 * Delete the object this operation targets.
 *
 * The object, bucket info and context come from `target`; the inputs come
 * from `params` and the outcome (version id, delete marker flag) is stored in
 * `result`.
 *
 * Two paths exist:
 *  - versioned bucket, or an explicit marker version id was given: either a
 *    delete marker is written through set_olh(), or a specific instance is
 *    unlinked; a data log entry is added when data sync is enabled.
 *  - otherwise: the head object is removed under the unmod_since /
 *    expiration_time preconditions, the bucket index is updated, the tail is
 *    handed to gc and the quota cache is adjusted.
 *
 * Returns: 0 on success, -ERR# otherwise.
 */
int RGWRados::Object::Delete::delete_obj()
{
  RGWRados *store = target->get_store();
  rgw_obj& src_obj = target->get_obj();
  const string& instance = src_obj.key.instance;
  rgw_obj obj = src_obj;

  /* the literal instance "null" is mapped to an empty instance on the working copy */
  if (instance == "null") {
    obj.key.instance.clear();
  }

  bool explicit_marker_version = (!params.marker_version_id.empty());

  if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
    if (instance.empty() || explicit_marker_version) {
      /* no instance requested (or marker version forced): record a delete marker */
      rgw_obj marker = obj;

      if (!params.marker_version_id.empty()) {
        if (params.marker_version_id != "null") {
          marker.key.set_instance(params.marker_version_id);
        }
      } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
        /* versioning not suspended: the marker gets a random instance name */
        store->gen_rand_obj_instance_name(&marker);
      }

      result.version_id = marker.key.instance;
      result.delete_marker = true;

      struct rgw_bucket_dir_entry_meta meta;

      meta.owner = params.obj_owner.get_id().to_str();
      meta.owner_display_name = params.obj_owner.get_display_name();

      /* fall back to the current time when the caller gave no mtime */
      if (real_clock::is_zero(params.mtime)) {
        meta.mtime = real_clock::now();
      } else {
        meta.mtime = params.mtime;
      }

      int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
      if (r < 0) {
        return r;
      }
    } else {
      /* a specific instance was requested: unlink exactly that instance */
      rgw_bucket_dir_entry dirent;

      int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
      if (r < 0) {
        return r;
      }
      /* report whether the removed instance was itself a delete marker */
      result.delete_marker = dirent.is_delete_marker();
      r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
      if (r < 0) {
        return r;
      }
      result.version_id = instance;
    }

    BucketShard *bs;
    int r = target->get_bucket_shard(&bs);
    if (r < 0) {
      ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
      return r;
    }

    if (target->bucket_info.datasync_flag_enabled()) {
      r = store->data_log->add_entry(bs->bucket, bs->shard_id);
      if (r < 0) {
        lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
        return r;
      }
    }

    /* the versioned path ends here; the code below is the plain removal */
    return 0;
  }

  rgw_rados_ref ref;
  int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
  if (r < 0) {
    return r;
  }

  RGWObjState *state;
  r = target->get_state(&state, false);
  if (r < 0)
    return r;

  ObjectWriteOperation op;

  if (!real_clock::is_zero(params.unmod_since)) {
    struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
    struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
    if (!params.high_precision_time) {
      /* compare at second granularity */
      ctime.tv_nsec = 0;
      unmod.tv_nsec = 0;
    }

    ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
    if (ctime > unmod) {
      return -ERR_PRECONDITION_FAILED;
    }

    /* only delete object if mtime is less than or equal to params.unmod_since */
    store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
  }
  /* remembered now, used for the quota update at the very end */
  uint64_t obj_size = state->size;

  if (!real_clock::is_zero(params.expiration_time)) {
    bufferlist bl;
    real_time delete_at;

    /* the object must carry a delete-at attr equal to the expected expiration */
    if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
      try {
        bufferlist::iterator iter = bl.begin();
        ::decode(delete_at, iter);
      } catch (buffer::error& err) {
        ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
        return -EIO;
      }

      if (params.expiration_time != delete_at) {
        return -ERR_PRECONDITION_FAILED;
      }
    } else {
      return -ERR_PRECONDITION_FAILED;
    }
  }

  if (!state->exists) {
    target->invalidate_state();
    return -ENOENT;
  }

  r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
  if (r < 0)
    return r;

  RGWBucketInfo& bucket_info = target->get_bucket_info();

  RGWRados::Bucket bop(store, bucket_info);
  RGWRados::Bucket::UpdateIndex index_op(&bop, obj);

  index_op.set_zones_trace(params.zones_trace);
  index_op.set_bilog_flags(params.bilog_flags);

  /* the index op is prepared first, then completed or cancelled below */
  r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
  if (r < 0)
    return r;

  store->remove_rgw_head_obj(op);
  r = ref.ioctx.operate(ref.oid, &op);
  bool need_invalidate = false;
  if (r == -ECANCELED) {
    /* raced with another operation, we can regard it as removed */
    need_invalidate = true;
    r = 0;
  }

  int64_t poolid = ref.ioctx.get_id();
  if (r >= 0) {
    tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
    if (obj_tombstone_cache) {
      /* record the removed object's state in the tombstone cache */
      tombstone_entry entry{*state};
      obj_tombstone_cache->add(obj, entry);
    }
    r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);

    /* a gc hand-off failure is only logged; r keeps the index result */
    int ret = target->complete_atomic_modification();
    if (ret < 0) {
      ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
    }
    /* other than that, no need to propagate error */
  } else {
    /* the removal failed: cancel the prepared index op */
    int ret = index_op.cancel();
    if (ret < 0) {
      ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
    }
  }

  if (need_invalidate) {
    target->invalidate_state();
  }

  if (r < 0)
    return r;

  /* update quota cache */
  store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);

  return 0;
}
8992
8993 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
8994 const RGWBucketInfo& bucket_info,
8995 const rgw_obj& obj,
8996 int versioning_status,
8997 uint16_t bilog_flags,
8998 const real_time& expiration_time,
8999 rgw_zone_set *zones_trace)
9000 {
9001 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
9002 RGWRados::Object::Delete del_op(&del_target);
9003
9004 del_op.params.bucket_owner = bucket_info.owner;
9005 del_op.params.versioning_status = versioning_status;
9006 del_op.params.bilog_flags = bilog_flags;
9007 del_op.params.expiration_time = expiration_time;
9008 del_op.params.zones_trace = zones_trace;
9009
9010 return del_op.delete_obj();
9011 }
9012
9013 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
9014 {
9015 rgw_rados_ref ref;
9016 int r = get_raw_obj_ref(obj, &ref);
9017 if (r < 0) {
9018 return r;
9019 }
9020
9021 ObjectWriteOperation op;
9022
9023 op.remove();
9024 r = ref.ioctx.operate(ref.oid, &op);
9025 if (r < 0)
9026 return r;
9027
9028 return 0;
9029 }
9030
9031 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
9032 {
9033 if (obj.empty()) {
9034 ldout(cct, 1) << "delete_system_obj got empty object name "
9035 << obj << ", returning EINVAL" << dendl;
9036 return -EINVAL;
9037 }
9038 rgw_rados_ref ref;
9039 int r = get_raw_obj_ref(obj, &ref);
9040 if (r < 0) {
9041 return r;
9042 }
9043
9044 ObjectWriteOperation op;
9045
9046 if (objv_tracker) {
9047 objv_tracker->prepare_op_for_write(&op);
9048 }
9049
9050 op.remove();
9051 r = ref.ioctx.operate(ref.oid, &op);
9052 if (r < 0)
9053 return r;
9054
9055 return 0;
9056 }
9057
9058 int RGWRados::delete_obj_index(const rgw_obj& obj)
9059 {
9060 std::string oid, key;
9061 get_obj_bucket_and_oid_loc(obj, oid, key);
9062
9063 RGWObjectCtx obj_ctx(this);
9064
9065 RGWBucketInfo bucket_info;
9066 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9067 if (ret < 0) {
9068 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9069 return ret;
9070 }
9071
9072 RGWRados::Bucket bop(this, bucket_info);
9073 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9074
9075 real_time removed_mtime;
9076 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9077
9078 return r;
9079 }
9080
9081 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9082 {
9083 string tag;
9084
9085 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9086 if (mi != manifest.obj_end()) {
9087 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9088 ++mi;
9089 tag = mi.get_location().get_raw_obj(store).oid;
9090 tag.append("_");
9091 }
9092
9093 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9094 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9095 MD5 hash;
9096 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9097
9098 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9099 if (iter != attrset.end()) {
9100 bufferlist& bl = iter->second;
9101 hash.Update((const byte *)bl.c_str(), bl.length());
9102 }
9103
9104 hash.Final(md5);
9105 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9106 tag.append(md5_str);
9107
9108 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9109
9110 tag_bl.append(tag.c_str(), tag.size() + 1);
9111 }
9112
9113 static bool is_olh(map<string, bufferlist>& attrs)
9114 {
9115 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9116 return (iter != attrs.end());
9117 }
9118
9119 static bool has_olh_tag(map<string, bufferlist>& attrs)
9120 {
9121 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9122 return (iter != attrs.end());
9123 }
9124
9125 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9126 RGWObjState *olh_state, RGWObjState **target_state)
9127 {
9128 assert(olh_state->is_olh);
9129
9130 rgw_obj target;
9131 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9132 if (r < 0) {
9133 return r;
9134 }
9135 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9136 if (r < 0) {
9137 return r;
9138 }
9139
9140 return 0;
9141 }
9142
9143 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9144 {
9145 if (obj.empty()) {
9146 return -EINVAL;
9147 }
9148
9149 RGWRawObjState *s = rctx->raw.get_state(obj);
9150 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9151 *state = s;
9152 if (s->has_attrs) {
9153 return 0;
9154 }
9155
9156 s->obj = obj;
9157
9158 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9159 if (r == -ENOENT) {
9160 s->exists = false;
9161 s->has_attrs = true;
9162 s->mtime = real_time();
9163 return 0;
9164 }
9165 if (r < 0)
9166 return r;
9167
9168 s->exists = true;
9169 s->has_attrs = true;
9170 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9171
9172 if (s->obj_tag.length())
9173 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9174 << s->obj_tag.c_str() << dendl;
9175 else
9176 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9177
9178 return 0;
9179 }
9180
9181 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9182 {
9183 int ret;
9184
9185 do {
9186 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9187 } while (ret == -EAGAIN);
9188
9189 return ret;
9190 }
9191
9192 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9193 RGWObjState **state, bool follow_olh, bool assume_noent)
9194 {
9195 if (obj.empty()) {
9196 return -EINVAL;
9197 }
9198
9199 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9200
9201 RGWObjState *s = rctx->obj.get_state(obj);
9202 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9203 *state = s;
9204 if (s->has_attrs) {
9205 if (s->is_olh && need_follow_olh) {
9206 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9207 }
9208 return 0;
9209 }
9210
9211 s->obj = obj;
9212
9213 rgw_raw_obj raw_obj;
9214 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9215
9216 int r = -ENOENT;
9217
9218 if (!assume_noent) {
9219 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9220 }
9221
9222 if (r == -ENOENT) {
9223 s->exists = false;
9224 s->has_attrs = true;
9225 tombstone_entry entry;
9226 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9227 s->mtime = entry.mtime;
9228 s->zone_short_id = entry.zone_short_id;
9229 s->pg_ver = entry.pg_ver;
9230 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9231 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9232 } else {
9233 s->mtime = real_time();
9234 }
9235 return 0;
9236 }
9237 if (r < 0)
9238 return r;
9239
9240 s->exists = true;
9241 s->has_attrs = true;
9242 s->accounted_size = s->size;
9243
9244 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9245 const bool compressed = (iter != s->attrset.end());
9246 if (compressed) {
9247 // use uncompressed size for accounted_size
9248 try {
9249 RGWCompressionInfo info;
9250 auto p = iter->second.begin();
9251 ::decode(info, p);
9252 s->accounted_size = info.orig_size;
9253 } catch (buffer::error&) {
9254 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9255 return -EIO;
9256 }
9257 }
9258
9259 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9260 if (iter != s->attrset.end()) {
9261 bufferlist bl = iter->second;
9262 bufferlist::iterator it = bl.begin();
9263 it.copy(bl.length(), s->shadow_obj);
9264 s->shadow_obj[bl.length()] = '\0';
9265 }
9266 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9267 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9268 if (ttiter != s->attrset.end()) {
9269 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9270 }
9271
9272 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9273 if (manifest_bl.length()) {
9274 bufferlist::iterator miter = manifest_bl.begin();
9275 try {
9276 ::decode(s->manifest, miter);
9277 s->has_manifest = true;
9278 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9279 broken due to old bugs */
9280 s->size = s->manifest.get_obj_size();
9281 if (!compressed)
9282 s->accounted_size = s->size;
9283 } catch (buffer::error& err) {
9284 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9285 return -EIO;
9286 }
9287 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9288 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9289 RGWObjManifest::obj_iterator mi;
9290 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9291 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9292 }
9293 }
9294
9295 if (!s->obj_tag.length()) {
9296 /*
9297 * Uh oh, something's wrong, object with manifest should have tag. Let's
9298 * create one out of the manifest, would be unique
9299 */
9300 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9301 s->fake_tag = true;
9302 }
9303 }
9304 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9305 if (aiter != s->attrset.end()) {
9306 bufferlist& pg_ver_bl = aiter->second;
9307 if (pg_ver_bl.length()) {
9308 bufferlist::iterator pgbl = pg_ver_bl.begin();
9309 try {
9310 ::decode(s->pg_ver, pgbl);
9311 } catch (buffer::error& err) {
9312 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9313 }
9314 }
9315 }
9316 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9317 if (aiter != s->attrset.end()) {
9318 bufferlist& zone_short_id_bl = aiter->second;
9319 if (zone_short_id_bl.length()) {
9320 bufferlist::iterator zbl = zone_short_id_bl.begin();
9321 try {
9322 ::decode(s->zone_short_id, zbl);
9323 } catch (buffer::error& err) {
9324 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9325 }
9326 }
9327 }
9328 if (s->obj_tag.length())
9329 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9330 else
9331 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9332
9333 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9334 * it exist, and not only if is_olh() returns true
9335 */
9336 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9337 if (iter != s->attrset.end()) {
9338 s->olh_tag = iter->second;
9339 }
9340
9341 if (is_olh(s->attrset)) {
9342 s->is_olh = true;
9343
9344 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9345
9346 if (need_follow_olh) {
9347 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9348 }
9349 }
9350
9351 return 0;
9352 }
9353
9354 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9355 bool follow_olh, bool assume_noent)
9356 {
9357 int ret;
9358
9359 do {
9360 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9361 } while (ret == -EAGAIN);
9362
9363 return ret;
9364 }
9365
9366 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9367 {
9368 RGWObjState *astate;
9369 int r = get_state(&astate, true);
9370 if (r < 0) {
9371 return r;
9372 }
9373
9374 *pmanifest = &astate->manifest;
9375
9376 return 0;
9377 }
9378
9379 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9380 {
9381 RGWObjState *state;
9382 int r = source->get_state(&state, true);
9383 if (r < 0)
9384 return r;
9385 if (!state->exists)
9386 return -ENOENT;
9387 if (!state->get_attr(name, dest))
9388 return -ENODATA;
9389
9390 return 0;
9391 }
9392
9393
9394 int RGWRados::Object::Stat::stat_async()
9395 {
9396 RGWObjectCtx& ctx = source->get_ctx();
9397 rgw_obj& obj = source->get_obj();
9398 RGWRados *store = source->get_store();
9399
9400 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9401 result.obj = obj;
9402 if (s->has_attrs) {
9403 state.ret = 0;
9404 result.size = s->size;
9405 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9406 result.attrs = s->attrset;
9407 result.has_manifest = s->has_manifest;
9408 result.manifest = s->manifest;
9409 return 0;
9410 }
9411
9412 string oid;
9413 string loc;
9414 get_obj_bucket_and_oid_loc(obj, oid, loc);
9415
9416 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9417 if (r < 0) {
9418 return r;
9419 }
9420
9421 librados::ObjectReadOperation op;
9422 op.stat2(&result.size, &result.mtime, NULL);
9423 op.getxattrs(&result.attrs, NULL);
9424 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9425 state.io_ctx.locator_set_key(loc);
9426 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9427 if (r < 0) {
9428 ldout(store->ctx(), 5) << __func__
9429 << ": ERROR: aio_operate() returned ret=" << r
9430 << dendl;
9431 return r;
9432 }
9433
9434 return 0;
9435 }
9436
9437
9438 int RGWRados::Object::Stat::wait()
9439 {
9440 if (!state.completion) {
9441 return state.ret;
9442 }
9443
9444 state.completion->wait_for_safe();
9445 state.ret = state.completion->get_return_value();
9446 state.completion->release();
9447
9448 if (state.ret != 0) {
9449 return state.ret;
9450 }
9451
9452 return finish();
9453 }
9454
9455 int RGWRados::Object::Stat::finish()
9456 {
9457 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9458 if (iter != result.attrs.end()) {
9459 bufferlist& bl = iter->second;
9460 bufferlist::iterator biter = bl.begin();
9461 try {
9462 ::decode(result.manifest, biter);
9463 } catch (buffer::error& err) {
9464 RGWRados *store = source->get_store();
9465 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9466 return -EIO;
9467 }
9468 result.has_manifest = true;
9469 }
9470
9471 return 0;
9472 }
9473
9474 /**
9475 * Get an attribute for a system object.
9476 * obj: the object to get attr
9477 * name: name of the attr to retrieve
9478 * dest: bufferlist to store the result in
9479 * Returns: 0 on success, -ERR# otherwise.
9480 */
9481 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9482 {
9483 rgw_rados_ref ref;
9484 int r = get_system_obj_ref(obj, &ref);
9485 if (r < 0) {
9486 return r;
9487 }
9488
9489 ObjectReadOperation op;
9490
9491 int rval;
9492 op.getxattr(name, &dest, &rval);
9493
9494 r = ref.ioctx.operate(ref.oid, &op, NULL);
9495 if (r < 0)
9496 return r;
9497
9498 return 0;
9499 }
9500
9501 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9502 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9503 ObjectOperation& op, RGWObjState **pstate)
9504 {
9505 if (!rctx)
9506 return 0;
9507
9508 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9509 if (r < 0)
9510 return r;
9511
9512 RGWObjState *state = *pstate;
9513
9514 if (!state->is_atomic) {
9515 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9516 return 0;
9517 }
9518
9519 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9520 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9521 } else {
9522 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9523 }
9524 return 0;
9525 }
9526
9527 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9528 {
9529 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9530 }
9531
9532 void RGWRados::Object::invalidate_state()
9533 {
9534 ctx.obj.invalidate(obj);
9535 }
9536
9537 void RGWRados::SystemObject::invalidate_state()
9538 {
9539 ctx.raw.invalidate(obj);
9540 }
9541
9542 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9543 const char *if_match, const char *if_nomatch, bool removal_op,
9544 bool modify_tail)
9545 {
9546 int r = get_state(&state, false);
9547 if (r < 0)
9548 return r;
9549
9550 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9551 if_match != NULL || if_nomatch != NULL) &&
9552 (!state->fake_tag);
9553
9554 if (!state->is_atomic) {
9555 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9556
9557 if (reset_obj) {
9558 op.create(false);
9559 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9560 }
9561
9562 return 0;
9563 }
9564
9565 if (need_guard) {
9566 /* first verify that the object wasn't replaced under */
9567 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9568 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9569 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9570 }
9571
9572 if (if_match) {
9573 if (strcmp(if_match, "*") == 0) {
9574 // test the object is existing
9575 if (!state->exists) {
9576 return -ERR_PRECONDITION_FAILED;
9577 }
9578 } else {
9579 bufferlist bl;
9580 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9581 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9582 return -ERR_PRECONDITION_FAILED;
9583 }
9584 }
9585 }
9586
9587 if (if_nomatch) {
9588 if (strcmp(if_nomatch, "*") == 0) {
9589 // test the object is NOT existing
9590 if (state->exists) {
9591 return -ERR_PRECONDITION_FAILED;
9592 }
9593 } else {
9594 bufferlist bl;
9595 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9596 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9597 return -ERR_PRECONDITION_FAILED;
9598 }
9599 }
9600 }
9601 }
9602
9603 if (reset_obj) {
9604 if (state->exists) {
9605 op.create(false);
9606 store->remove_rgw_head_obj(op);
9607 } else {
9608 op.create(true);
9609 }
9610 }
9611
9612 if (removal_op) {
9613 /* the object is being removed, no need to update its tag */
9614 return 0;
9615 }
9616
9617 if (ptag) {
9618 state->write_tag = *ptag;
9619 } else {
9620 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9621 }
9622 bufferlist bl;
9623 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9624
9625 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9626
9627 op.setxattr(RGW_ATTR_ID_TAG, bl);
9628 if (modify_tail) {
9629 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9630 }
9631
9632 return 0;
9633 }
9634
9635 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9636 RGWObjVersionTracker *objv_tracker)
9637 {
9638 map<string, bufferlist> attrs;
9639 attrs[name] = bl;
9640 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9641 }
9642
9643 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9644 map<string, bufferlist>& attrs,
9645 map<string, bufferlist>* rmattrs,
9646 RGWObjVersionTracker *objv_tracker)
9647 {
9648 rgw_rados_ref ref;
9649 int r = get_system_obj_ref(obj, &ref);
9650 if (r < 0) {
9651 return r;
9652 }
9653 ObjectWriteOperation op;
9654
9655 if (objv_tracker) {
9656 objv_tracker->prepare_op_for_write(&op);
9657 }
9658
9659 map<string, bufferlist>::iterator iter;
9660 if (rmattrs) {
9661 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9662 const string& name = iter->first;
9663 op.rmxattr(name.c_str());
9664 }
9665 }
9666
9667 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9668 const string& name = iter->first;
9669 bufferlist& bl = iter->second;
9670
9671 if (!bl.length())
9672 continue;
9673
9674 op.setxattr(name.c_str(), bl);
9675 }
9676
9677 if (!op.size())
9678 return 0;
9679
9680 bufferlist bl;
9681
9682 r = ref.ioctx.operate(ref.oid, &op);
9683 if (r < 0)
9684 return r;
9685
9686 return 0;
9687 }
9688
9689 /**
9690 * Set an attr on an object.
9691 * bucket: name of the bucket holding the object
9692 * obj: name of the object to set the attr on
9693 * name: the attr to set
9694 * bl: the contents of the attr
9695 * Returns: 0 on success, -ERR# otherwise.
9696 */
9697 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9698 {
9699 map<string, bufferlist> attrs;
9700 attrs[name] = bl;
9701 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9702 }
9703
9704 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9705 map<string, bufferlist>& attrs,
9706 map<string, bufferlist>* rmattrs)
9707 {
9708 rgw_rados_ref ref;
9709 int r = get_obj_head_ref(bucket_info, obj, &ref);
9710 if (r < 0) {
9711 return r;
9712 }
9713 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9714
9715 ObjectWriteOperation op;
9716 RGWObjState *state = NULL;
9717
9718 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9719 if (r < 0)
9720 return r;
9721
9722 map<string, bufferlist>::iterator iter;
9723 if (rmattrs) {
9724 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9725 const string& name = iter->first;
9726 op.rmxattr(name.c_str());
9727 }
9728 }
9729
9730 const rgw_bucket& bucket = obj.bucket;
9731
9732 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9733 const string& name = iter->first;
9734 bufferlist& bl = iter->second;
9735
9736 if (!bl.length())
9737 continue;
9738
9739 op.setxattr(name.c_str(), bl);
9740
9741 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9742 real_time ts;
9743 try {
9744 ::decode(ts, bl);
9745
9746 rgw_obj_index_key obj_key;
9747 obj.key.get_index_key(&obj_key);
9748
9749 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9750 } catch (buffer::error& err) {
9751 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9752 }
9753 }
9754 }
9755
9756 if (!op.size())
9757 return 0;
9758
9759 RGWObjectCtx obj_ctx(this);
9760
9761 bufferlist bl;
9762 RGWRados::Bucket bop(this, bucket_info);
9763 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9764
9765 if (state) {
9766 string tag;
9767 append_rand_alpha(cct, tag, tag, 32);
9768 state->write_tag = tag;
9769 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9770
9771 if (r < 0)
9772 return r;
9773
9774 bl.append(tag.c_str(), tag.size() + 1);
9775 op.setxattr(RGW_ATTR_ID_TAG, bl);
9776 }
9777
9778
9779 real_time mtime = real_clock::now();
9780 struct timespec mtime_ts = real_clock::to_timespec(mtime);
9781 op.mtime2(&mtime_ts);
9782 r = ref.ioctx.operate(ref.oid, &op);
9783 if (state) {
9784 if (r >= 0) {
9785 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9786 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9787 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9788 string etag(etag_bl.c_str(), etag_bl.length());
9789 string content_type(content_type_bl.c_str(), content_type_bl.length());
9790 uint64_t epoch = ref.ioctx.get_last_version();
9791 int64_t poolid = ref.ioctx.get_id();
9792 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9793 mtime, etag, content_type, &acl_bl,
9794 RGW_OBJ_CATEGORY_MAIN, NULL);
9795 } else {
9796 int ret = index_op.cancel();
9797 if (ret < 0) {
9798 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9799 }
9800 }
9801 }
9802 if (r < 0)
9803 return r;
9804
9805 if (state) {
9806 state->obj_tag.swap(bl);
9807 if (rmattrs) {
9808 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9809 state->attrset.erase(iter->first);
9810 }
9811 }
9812 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9813 state->attrset[iter->first] = iter->second;
9814 }
9815 }
9816
9817 return 0;
9818 }
9819
9820 int RGWRados::Object::Read::prepare()
9821 {
9822 RGWRados *store = source->get_store();
9823 CephContext *cct = store->ctx();
9824
9825 bufferlist etag;
9826
9827 map<string, bufferlist>::iterator iter;
9828
9829 RGWObjState *astate;
9830 int r = source->get_state(&astate, true);
9831 if (r < 0)
9832 return r;
9833
9834 if (!astate->exists) {
9835 return -ENOENT;
9836 }
9837
9838 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9839
9840 state.obj = astate->obj;
9841 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9842
9843 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9844 if (r < 0) {
9845 return r;
9846 }
9847 if (params.attrs) {
9848 *params.attrs = astate->attrset;
9849 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9850 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9851 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9852 }
9853 }
9854 }
9855
9856 /* Convert all times go GMT to make them compatible */
9857 if (conds.mod_ptr || conds.unmod_ptr) {
9858 obj_time_weight src_weight;
9859 src_weight.init(astate);
9860 src_weight.high_precision = conds.high_precision_time;
9861
9862 obj_time_weight dest_weight;
9863 dest_weight.high_precision = conds.high_precision_time;
9864
9865 if (conds.mod_ptr) {
9866 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9867 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9868 if (!(dest_weight < src_weight)) {
9869 return -ERR_NOT_MODIFIED;
9870 }
9871 }
9872
9873 if (conds.unmod_ptr) {
9874 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9875 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9876 if (dest_weight < src_weight) {
9877 return -ERR_PRECONDITION_FAILED;
9878 }
9879 }
9880 }
9881 if (conds.if_match || conds.if_nomatch) {
9882 r = get_attr(RGW_ATTR_ETAG, etag);
9883 if (r < 0)
9884 return r;
9885
9886 if (conds.if_match) {
9887 string if_match_str = rgw_string_unquote(conds.if_match);
9888 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9889 if (if_match_str.compare(etag.c_str()) != 0) {
9890 return -ERR_PRECONDITION_FAILED;
9891 }
9892 }
9893
9894 if (conds.if_nomatch) {
9895 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9896 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9897 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9898 return -ERR_NOT_MODIFIED;
9899 }
9900 }
9901 }
9902
9903 if (params.obj_size)
9904 *params.obj_size = astate->size;
9905 if (params.lastmod)
9906 *params.lastmod = astate->mtime;
9907
9908 return 0;
9909 }
9910
9911 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9912 {
9913 if (ofs < 0) {
9914 ofs += obj_size;
9915 if (ofs < 0)
9916 ofs = 0;
9917 end = obj_size - 1;
9918 } else if (end < 0) {
9919 end = obj_size - 1;
9920 }
9921
9922 if (obj_size > 0) {
9923 if (ofs >= (off_t)obj_size) {
9924 return -ERANGE;
9925 }
9926 if (end >= (off_t)obj_size) {
9927 end = obj_size - 1;
9928 }
9929 }
9930 return 0;
9931 }
9932
9933 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9934 {
9935 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9936 }
9937
9938 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9939 RGWRados::SystemObject::Read::GetObjState& state,
9940 rgw_raw_obj& obj,
9941 map<string, bufferlist> *attrs,
9942 real_time *lastmod,
9943 uint64_t *obj_size,
9944 RGWObjVersionTracker *objv_tracker)
9945 {
9946 RGWRawObjState *astate = NULL;
9947
9948 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
9949 if (r < 0)
9950 return r;
9951
9952 if (!astate->exists) {
9953 return -ENOENT;
9954 }
9955
9956 if (attrs) {
9957 *attrs = astate->attrset;
9958 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9959 map<string, bufferlist>::iterator iter;
9960 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
9961 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9962 }
9963 }
9964 }
9965
9966 if (obj_size)
9967 *obj_size = astate->size;
9968 if (lastmod)
9969 *lastmod = astate->mtime;
9970
9971 return 0;
9972 }
9973
9974
9975 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
9976 {
9977 RGWRados *store = target->get_store();
9978 BucketShard *bs;
9979 int r;
9980
9981 #define NUM_RESHARD_RETRIES 10
9982 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
9983 int ret = get_bucket_shard(&bs);
9984 if (ret < 0) {
9985 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9986 return ret;
9987 }
9988 r = call(bs);
9989 if (r != -ERR_BUSY_RESHARDING) {
9990 break;
9991 }
9992 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
9993 string new_bucket_id;
9994 r = store->block_while_resharding(bs, &new_bucket_id);
9995 if (r == -ERR_BUSY_RESHARDING) {
9996 continue;
9997 }
9998 if (r < 0) {
9999 return r;
10000 }
10001 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10002 i = 0; /* resharding is finished, make sure we can retry */
10003 r = target->update_bucket_id(new_bucket_id);
10004 if (r < 0) {
10005 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
10006 return r;
10007 }
10008 invalidate_bs();
10009 }
10010
10011 if (r < 0) {
10012 return r;
10013 }
10014
10015 if (pbs) {
10016 *pbs = bs;
10017 }
10018
10019 return 0;
10020 }
10021
10022 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
10023 {
10024 RGWRados *store = source->get_store();
10025 rgw_raw_obj& obj = source->get_obj();
10026
10027 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
10028 stat_params.lastmod, stat_params.obj_size, objv_tracker);
10029 }
10030
10031 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
10032 {
10033 if (blind) {
10034 return 0;
10035 }
10036 RGWRados *store = target->get_store();
10037
10038 if (write_tag && write_tag->length()) {
10039 optag = string(write_tag->c_str(), write_tag->length());
10040 } else {
10041 if (optag.empty()) {
10042 append_rand_alpha(store->ctx(), optag, optag, 32);
10043 }
10044 }
10045
10046 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10047 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10048 });
10049
10050 if (r < 0) {
10051 return r;
10052 }
10053 prepared = true;
10054
10055 return 0;
10056 }
10057
10058 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10059 uint64_t size, uint64_t accounted_size,
10060 ceph::real_time& ut, const string& etag,
10061 const string& content_type,
10062 bufferlist *acl_bl,
10063 RGWObjCategory category,
10064 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10065 {
10066 if (blind) {
10067 return 0;
10068 }
10069 RGWRados *store = target->get_store();
10070 BucketShard *bs;
10071
10072 int ret = get_bucket_shard(&bs);
10073 if (ret < 0) {
10074 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10075 return ret;
10076 }
10077
10078 rgw_bucket_dir_entry ent;
10079 obj.key.get_index_key(&ent.key);
10080 ent.meta.size = size;
10081 ent.meta.accounted_size = accounted_size;
10082 ent.meta.mtime = ut;
10083 ent.meta.etag = etag;
10084 if (user_data)
10085 ent.meta.user_data = *user_data;
10086
10087 ACLOwner owner;
10088 if (acl_bl && acl_bl->length()) {
10089 int ret = store->decode_policy(*acl_bl, &owner);
10090 if (ret < 0) {
10091 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10092 }
10093 }
10094 ent.meta.owner = owner.get_id().to_str();
10095 ent.meta.owner_display_name = owner.get_display_name();
10096 ent.meta.content_type = content_type;
10097
10098 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
10099
10100 if (target->bucket_info.datasync_flag_enabled()) {
10101 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10102 if (r < 0) {
10103 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10104 }
10105 }
10106
10107 return ret;
10108 }
10109
10110 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10111 real_time& removed_mtime,
10112 list<rgw_obj_index_key> *remove_objs)
10113 {
10114 if (blind) {
10115 return 0;
10116 }
10117 RGWRados *store = target->get_store();
10118 BucketShard *bs;
10119
10120 int ret = get_bucket_shard(&bs);
10121 if (ret < 0) {
10122 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10123 return ret;
10124 }
10125
10126 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
10127
10128 if (target->bucket_info.datasync_flag_enabled()) {
10129 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10130 if (r < 0) {
10131 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10132 }
10133 }
10134
10135 return ret;
10136 }
10137
10138
10139 int RGWRados::Bucket::UpdateIndex::cancel()
10140 {
10141 if (blind) {
10142 return 0;
10143 }
10144 RGWRados *store = target->get_store();
10145 BucketShard *bs;
10146
10147 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10148 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10149 });
10150
10151 /*
10152 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10153 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10154 * have no way to tell that they're all caught up
10155 */
10156 if (target->bucket_info.datasync_flag_enabled()) {
10157 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10158 if (r < 0) {
10159 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10160 }
10161 }
10162
10163 return ret;
10164 }
10165
10166 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10167 {
10168 RGWRados *store = source->get_store();
10169 CephContext *cct = store->ctx();
10170
10171 rgw_raw_obj read_obj;
10172 uint64_t read_ofs = ofs;
10173 uint64_t len, read_len;
10174 bool reading_from_head = true;
10175 ObjectReadOperation op;
10176
10177 bool merge_bl = false;
10178 bufferlist *pbl = &bl;
10179 bufferlist read_bl;
10180 uint64_t max_chunk_size;
10181
10182 RGWObjState *astate;
10183 int r = source->get_state(&astate, true);
10184 if (r < 0)
10185 return r;
10186
10187 if (end < 0)
10188 len = 0;
10189 else
10190 len = end - ofs + 1;
10191
10192 if (astate->has_manifest && astate->manifest.has_tail()) {
10193 /* now get the relevant object part */
10194 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10195
10196 uint64_t stripe_ofs = iter.get_stripe_ofs();
10197 read_obj = iter.get_location().get_raw_obj(store);
10198 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10199 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10200 reading_from_head = (read_obj == state.head_obj);
10201 } else {
10202 read_obj = state.head_obj;
10203 }
10204
10205 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10206 if (r < 0) {
10207 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10208 return r;
10209 }
10210
10211 if (len > max_chunk_size)
10212 len = max_chunk_size;
10213
10214
10215 state.io_ctx.locator_set_key(read_obj.loc);
10216
10217 read_len = len;
10218
10219 if (reading_from_head) {
10220 /* only when reading from the head object do we need to do the atomic test */
10221 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10222 if (r < 0)
10223 return r;
10224
10225 if (astate && astate->prefetch_data) {
10226 if (!ofs && astate->data.length() >= len) {
10227 bl = astate->data;
10228 return bl.length();
10229 }
10230
10231 if (ofs < astate->data.length()) {
10232 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10233 astate->data.copy(ofs, copy_len, bl);
10234 read_len -= copy_len;
10235 read_ofs += copy_len;
10236 if (!read_len)
10237 return bl.length();
10238
10239 merge_bl = true;
10240 pbl = &read_bl;
10241 }
10242 }
10243 }
10244
10245 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10246 op.read(read_ofs, read_len, pbl, NULL);
10247
10248 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10249 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10250
10251 if (r < 0) {
10252 return r;
10253 }
10254
10255 if (merge_bl) {
10256 bl.append(read_bl);
10257 }
10258
10259 return bl.length();
10260 }
10261
10262 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10263 {
10264 if (!has_ref) {
10265 int r = store->get_raw_obj_ref(obj, &ref);
10266 if (r < 0) {
10267 return r;
10268 }
10269 has_ref = true;
10270 }
10271 *pref = &ref;
10272 return 0;
10273
10274 }
10275
10276 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10277 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10278 bufferlist& bl, off_t ofs, off_t end,
10279 map<string, bufferlist> *attrs,
10280 rgw_cache_entry_info *cache_info,
10281 boost::optional<obj_version>)
10282 {
10283 uint64_t len;
10284 ObjectReadOperation op;
10285
10286 if (end < 0)
10287 len = 0;
10288 else
10289 len = end - ofs + 1;
10290
10291 if (objv_tracker) {
10292 objv_tracker->prepare_op_for_read(&op);
10293 }
10294
10295 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10296 op.read(ofs, len, &bl, NULL);
10297
10298 if (attrs) {
10299 op.getxattrs(attrs, NULL);
10300 }
10301
10302 rgw_rados_ref *ref;
10303 int r = read_state.get_ref(this, obj, &ref);
10304 if (r < 0) {
10305 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10306 return r;
10307 }
10308 r = ref->ioctx.operate(ref->oid, &op, NULL);
10309 if (r < 0) {
10310 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10311 return r;
10312 }
10313 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10314
10315 uint64_t op_ver = ref->ioctx.get_last_version();
10316
10317 if (read_state.last_ver > 0 &&
10318 read_state.last_ver != op_ver) {
10319 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10320 return -ECANCELED;
10321 }
10322
10323 read_state.last_ver = op_ver;
10324
10325 return bl.length();
10326 }
10327
10328 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl,
10329 RGWObjVersionTracker *objv_tracker,
10330 boost::optional<obj_version> refresh_version)
10331 {
10332 RGWRados *store = source->get_store();
10333 rgw_raw_obj& obj = source->get_obj();
10334
10335 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl,
10336 ofs, end, read_params.attrs,
10337 read_params.cache_info, refresh_version);
10338 }
10339
10340 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10341 {
10342 RGWRados *store = source->get_store();
10343 rgw_raw_obj& obj = source->get_obj();
10344
10345 return store->system_obj_get_attr(obj, name, dest);
10346 }
10347
10348 struct get_obj_data;
10349
10350 struct get_obj_aio_data {
10351 struct get_obj_data *op_data;
10352 off_t ofs;
10353 off_t len;
10354 };
10355
10356 struct get_obj_io {
10357 off_t len;
10358 bufferlist bl;
10359 };
10360
10361 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10362
10363 struct get_obj_data : public RefCountedObject {
10364 CephContext *cct;
10365 RGWRados *rados;
10366 RGWObjectCtx *ctx;
10367 IoCtx io_ctx;
10368 map<off_t, get_obj_io> io_map;
10369 map<off_t, librados::AioCompletion *> completion_map;
10370 uint64_t total_read;
10371 Mutex lock;
10372 Mutex data_lock;
10373 list<get_obj_aio_data> aio_data;
10374 RGWGetDataCB *client_cb;
10375 std::atomic<bool> cancelled = { false };
10376 std::atomic<int64_t> err_code = { 0 };
10377 Throttle throttle;
10378 list<bufferlist> read_list;
10379
10380 explicit get_obj_data(CephContext *_cct)
10381 : cct(_cct),
10382 rados(NULL), ctx(NULL),
10383 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10384 client_cb(NULL),
10385 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10386 ~get_obj_data() override { }
10387 void set_cancelled(int r) {
10388 cancelled = true;
10389 err_code = r;
10390 }
10391
10392 bool is_cancelled() {
10393 return cancelled;
10394 }
10395
10396 int get_err_code() {
10397 return err_code;
10398 }
10399
10400 int wait_next_io(bool *done) {
10401 lock.Lock();
10402 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10403 if (iter == completion_map.end()) {
10404 *done = true;
10405 lock.Unlock();
10406 return 0;
10407 }
10408 off_t cur_ofs = iter->first;
10409 librados::AioCompletion *c = iter->second;
10410 lock.Unlock();
10411
10412 c->wait_for_safe_and_cb();
10413 int r = c->get_return_value();
10414
10415 lock.Lock();
10416 completion_map.erase(cur_ofs);
10417
10418 if (completion_map.empty()) {
10419 *done = true;
10420 }
10421 lock.Unlock();
10422
10423 c->release();
10424
10425 return r;
10426 }
10427
10428 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10429 Mutex::Locker l(lock);
10430
10431 const auto& io_iter = io_map.insert(
10432 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10433
10434 assert(io_iter.second); // assert new insertion
10435
10436 get_obj_io& io = (io_iter.first)->second;
10437 *pbl = &io.bl;
10438
10439 struct get_obj_aio_data aio;
10440 aio.ofs = ofs;
10441 aio.len = len;
10442 aio.op_data = this;
10443
10444 aio_data.push_back(aio);
10445
10446 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10447
10448 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10449 completion_map[ofs] = c;
10450
10451 *pc = c;
10452
10453 /* we have a reference per IO, plus one reference for the calling function.
10454 * reference is dropped for each callback, plus when we're done iterating
10455 * over the parts */
10456 get();
10457 }
10458
10459 void cancel_io(off_t ofs) {
10460 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10461 lock.Lock();
10462 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10463 if (iter != completion_map.end()) {
10464 AioCompletion *c = iter->second;
10465 c->release();
10466 completion_map.erase(ofs);
10467 io_map.erase(ofs);
10468 }
10469 lock.Unlock();
10470
10471 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10472 * need IoCtx to live, as io callback may still be called
10473 */
10474 }
10475
10476 void cancel_all_io() {
10477 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10478 Mutex::Locker l(lock);
10479 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10480 iter != completion_map.end(); ++iter) {
10481 librados::AioCompletion *c = iter->second;
10482 c->release();
10483 }
10484 }
10485
10486 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10487 Mutex::Locker l(lock);
10488
10489 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10490
10491 if (liter == io_map.end() ||
10492 liter->first != ofs) {
10493 return 0;
10494 }
10495
10496 map<off_t, librados::AioCompletion *>::iterator aiter;
10497 aiter = completion_map.find(ofs);
10498 if (aiter == completion_map.end()) {
10499 /* completion map does not hold this io, it was cancelled */
10500 return 0;
10501 }
10502
10503 AioCompletion *completion = aiter->second;
10504 int r = completion->get_return_value();
10505 if (r < 0)
10506 return r;
10507
10508 for (; aiter != completion_map.end(); ++aiter) {
10509 completion = aiter->second;
10510 if (!completion->is_safe()) {
10511 /* reached a request that is not yet complete, stop */
10512 break;
10513 }
10514
10515 r = completion->get_return_value();
10516 if (r < 0) {
10517 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10518 return r;
10519 }
10520
10521 total_read += r;
10522
10523 map<off_t, get_obj_io>::iterator old_liter = liter++;
10524 bl_list.push_back(old_liter->second.bl);
10525 io_map.erase(old_liter);
10526 }
10527
10528 return 0;
10529 }
10530 };
10531
10532 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10533 {
10534 struct get_obj_data *d = (struct get_obj_data *)arg;
10535
10536 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10537 }
10538
10539 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10540 {
10541 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10542 struct get_obj_data *d = aio_data->op_data;
10543
10544 d->rados->get_obj_aio_completion_cb(cb, arg);
10545 }
10546
10547
10548 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10549 {
10550 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10551 struct get_obj_data *d = aio_data->op_data;
10552 off_t ofs = aio_data->ofs;
10553 off_t len = aio_data->len;
10554
10555 list<bufferlist> bl_list;
10556 list<bufferlist>::iterator iter;
10557 int r;
10558
10559 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10560 d->throttle.put(len);
10561
10562 r = rados_aio_get_return_value(c);
10563 if (r < 0) {
10564 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10565 d->set_cancelled(r);
10566 goto done;
10567 }
10568
10569 if (d->is_cancelled()) {
10570 goto done;
10571 }
10572
10573 d->data_lock.Lock();
10574
10575 r = d->get_complete_ios(ofs, bl_list);
10576 if (r < 0) {
10577 goto done_unlock;
10578 }
10579
10580 d->read_list.splice(d->read_list.end(), bl_list);
10581
10582 done_unlock:
10583 d->data_lock.Unlock();
10584 done:
10585 d->put();
10586 return;
10587 }
10588
10589 int RGWRados::flush_read_list(struct get_obj_data *d)
10590 {
10591 d->data_lock.Lock();
10592 list<bufferlist> l;
10593 l.swap(d->read_list);
10594 d->get();
10595 d->read_list.clear();
10596
10597 d->data_lock.Unlock();
10598
10599 int r = 0;
10600
10601 list<bufferlist>::iterator iter;
10602 for (iter = l.begin(); iter != l.end(); ++iter) {
10603 bufferlist& bl = *iter;
10604 r = d->client_cb->handle_data(bl, 0, bl.length());
10605 if (r < 0) {
10606 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10607 break;
10608 }
10609 }
10610
10611 d->data_lock.Lock();
10612 d->put();
10613 if (r < 0) {
10614 d->set_cancelled(r);
10615 }
10616 d->data_lock.Unlock();
10617 return r;
10618 }
10619
10620 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10621 const RGWBucketInfo& bucket_info,
10622 const rgw_obj& obj,
10623 const rgw_raw_obj& read_obj,
10624 off_t obj_ofs,
10625 off_t read_ofs, off_t len,
10626 bool is_head_obj, void *arg)
10627 {
10628 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10629 ObjectReadOperation op;
10630 struct get_obj_data *d = (struct get_obj_data *)arg;
10631 string oid, key;
10632 bufferlist *pbl;
10633 AioCompletion *c;
10634
10635 int r;
10636
10637 if (is_head_obj) {
10638 /* only when reading from the head object do we need to do the atomic test */
10639 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10640 if (r < 0)
10641 return r;
10642
10643 if (astate &&
10644 obj_ofs < astate->data.length()) {
10645 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10646
10647 d->data_lock.Lock();
10648 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10649 d->data_lock.Unlock();
10650 if (r < 0)
10651 return r;
10652
10653 d->lock.Lock();
10654 d->total_read += chunk_len;
10655 d->lock.Unlock();
10656
10657 len -= chunk_len;
10658 read_ofs += chunk_len;
10659 obj_ofs += chunk_len;
10660 if (!len)
10661 return 0;
10662 }
10663 }
10664
10665 d->throttle.get(len);
10666 if (d->is_cancelled()) {
10667 return d->get_err_code();
10668 }
10669
10670 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10671 * cleaning up
10672 */
10673 d->add_io(obj_ofs, len, &pbl, &c);
10674
10675 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10676 op.read(read_ofs, len, pbl, NULL);
10677
10678 librados::IoCtx io_ctx(d->io_ctx);
10679 io_ctx.locator_set_key(read_obj.loc);
10680
10681 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10682 if (r < 0) {
10683 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10684 goto done_err;
10685 }
10686
10687 // Flush data to client if there is any
10688 r = flush_read_list(d);
10689 if (r < 0)
10690 return r;
10691
10692 return 0;
10693
10694 done_err:
10695 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10696 d->set_cancelled(r);
10697 d->cancel_io(obj_ofs);
10698
10699 return r;
10700 }
10701
10702 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10703 {
10704 RGWRados *store = source->get_store();
10705 CephContext *cct = store->ctx();
10706
10707 struct get_obj_data *data = new get_obj_data(cct);
10708 bool done = false;
10709
10710 RGWObjectCtx& obj_ctx = source->get_ctx();
10711
10712 data->rados = store;
10713 data->io_ctx.dup(state.io_ctx);
10714 data->client_cb = cb;
10715
10716 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10717 if (r < 0) {
10718 data->cancel_all_io();
10719 goto done;
10720 }
10721
10722 while (!done) {
10723 r = data->wait_next_io(&done);
10724 if (r < 0) {
10725 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10726 data->cancel_all_io();
10727 break;
10728 }
10729 r = store->flush_read_list(data);
10730 if (r < 0) {
10731 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10732 data->cancel_all_io();
10733 break;
10734 }
10735 }
10736
10737 done:
10738 data->put();
10739 return r;
10740 }
10741
10742 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10743 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10744 off_t ofs, off_t end,
10745 uint64_t max_chunk_size,
10746 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10747 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10748 RGWObjState *, void *),
10749 void *arg)
10750 {
10751 rgw_raw_obj head_obj;
10752 rgw_raw_obj read_obj;
10753 uint64_t read_ofs = ofs;
10754 uint64_t len;
10755 bool reading_from_head = true;
10756 RGWObjState *astate = NULL;
10757
10758 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10759
10760 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10761 if (r < 0) {
10762 return r;
10763 }
10764
10765 if (end < 0)
10766 len = 0;
10767 else
10768 len = end - ofs + 1;
10769
10770 if (astate->has_manifest) {
10771 /* now get the relevant object stripe */
10772 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10773
10774 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10775
10776 for (; iter != obj_end && ofs <= end; ++iter) {
10777 off_t stripe_ofs = iter.get_stripe_ofs();
10778 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10779
10780 while (ofs < next_stripe_ofs && ofs <= end) {
10781 read_obj = iter.get_location().get_raw_obj(this);
10782 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10783 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10784
10785 if (read_len > max_chunk_size) {
10786 read_len = max_chunk_size;
10787 }
10788
10789 reading_from_head = (read_obj == head_obj);
10790 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10791 if (r < 0) {
10792 return r;
10793 }
10794
10795 len -= read_len;
10796 ofs += read_len;
10797 }
10798 }
10799 } else {
10800 while (ofs <= end) {
10801 read_obj = head_obj;
10802 uint64_t read_len = min(len, max_chunk_size);
10803
10804 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10805 if (r < 0) {
10806 return r;
10807 }
10808
10809 len -= read_len;
10810 ofs += read_len;
10811 }
10812 }
10813
10814 return 0;
10815 }
10816
10817 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10818 {
10819 rgw_rados_ref ref;
10820 int r = get_obj_head_ref(bucket_info, obj, &ref);
10821 if (r < 0) {
10822 return r;
10823 }
10824
10825 return ref.ioctx.operate(ref.oid, op);
10826 }
10827
10828 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10829 {
10830 rgw_rados_ref ref;
10831 int r = get_obj_head_ref(bucket_info, obj, &ref);
10832 if (r < 0) {
10833 return r;
10834 }
10835
10836 bufferlist outbl;
10837
10838 return ref.ioctx.operate(ref.oid, op, &outbl);
10839 }
10840
10841 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10842 {
10843 ObjectWriteOperation op;
10844
10845 assert(olh_obj.key.instance.empty());
10846
10847 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10848
10849 if (!state.exists) {
10850 op.create(true);
10851 } else {
10852 op.assert_exists();
10853 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
10854 op.mtime2(&mtime_ts);
10855 }
10856
10857 /*
10858 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10859 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10860 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10861 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10862 * log will reflect that.
10863 *
10864 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10865 * is used for object data instance, olh_tag for olh instance.
10866 */
10867 if (has_tag) {
10868 /* guard against racing writes */
10869 bucket_index_guard_olh_op(state, op);
10870 }
10871
10872 if (!has_tag) {
10873 /* obj tag */
10874 string obj_tag;
10875 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10876 if (ret < 0) {
10877 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10878 return ret;
10879 }
10880 bufferlist bl;
10881 bl.append(obj_tag.c_str(), obj_tag.size());
10882 op.setxattr(RGW_ATTR_ID_TAG, bl);
10883
10884 state.attrset[RGW_ATTR_ID_TAG] = bl;
10885 state.obj_tag = bl;
10886
10887 /* olh tag */
10888 string olh_tag;
10889 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10890 if (ret < 0) {
10891 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10892 return ret;
10893 }
10894 bufferlist olh_bl;
10895 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10896 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10897
10898 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10899 state.olh_tag = olh_bl;
10900 state.is_olh = true;
10901
10902 bufferlist verbl;
10903 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10904 }
10905
10906 bufferlist bl;
10907 RGWOLHPendingInfo pending_info;
10908 pending_info.time = real_clock::now();
10909 ::encode(pending_info, bl);
10910
10911 #define OLH_PENDING_TAG_LEN 32
10912 /* tag will start with current time epoch, this so that entries are sorted by time */
10913 char buf[32];
10914 utime_t ut(pending_info.time);
10915 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10916 *op_tag = buf;
10917
10918 string s;
10919 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10920 if (ret < 0) {
10921 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10922 return ret;
10923 }
10924 op_tag->append(s);
10925
10926 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10927 attr_name.append(*op_tag);
10928
10929 op.setxattr(attr_name.c_str(), bl);
10930
10931 ret = obj_operate(bucket_info, olh_obj, &op);
10932 if (ret < 0) {
10933 return ret;
10934 }
10935
10936 state.exists = true;
10937 state.attrset[attr_name] = bl;
10938
10939 return 0;
10940 }
10941
10942 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
10943 {
10944 int ret;
10945
10946 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
10947 if (ret == -EEXIST) {
10948 ret = -ECANCELED;
10949 }
10950
10951 return ret;
10952 }
10953
10954 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
10955 {
10956 rgw_obj obj;
10957 const rgw_obj *pobj = &obj_instance;
10958 int r;
10959
10960 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10961 r = bs->init(pobj->bucket, *pobj);
10962 if (r < 0) {
10963 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
10964 return r;
10965 }
10966 r = call(bs);
10967 if (r != -ERR_BUSY_RESHARDING) {
10968 break;
10969 }
10970 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10971 string new_bucket_id;
10972 r = block_while_resharding(bs, &new_bucket_id);
10973 if (r == -ERR_BUSY_RESHARDING) {
10974 continue;
10975 }
10976 if (r < 0) {
10977 return r;
10978 }
10979 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10980 i = 0; /* resharding is finished, make sure we can retry */
10981
10982 obj = *pobj;
10983 obj.bucket.update_bucket_id(new_bucket_id);
10984 pobj = &obj;
10985 }
10986
10987 if (r < 0) {
10988 return r;
10989 }
10990
10991 return 0;
10992 }
10993
10994 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
10995 {
10996 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
10997
10998 return waiter->block_while_resharding(bs, new_bucket_id);
10999 }
11000
11001 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
11002 bool delete_marker,
11003 const string& op_tag,
11004 struct rgw_bucket_dir_entry_meta *meta,
11005 uint64_t olh_epoch,
11006 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
11007 {
11008 rgw_rados_ref ref;
11009 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11010 if (r < 0) {
11011 return r;
11012 }
11013
11014 rgw_zone_set zones_trace;
11015 if (_zones_trace) {
11016 zones_trace = *_zones_trace;
11017 } else {
11018 zones_trace.insert(get_zone().id);
11019 }
11020
11021 BucketShard bs(this);
11022
11023 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11024 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11025 librados::ObjectWriteOperation op;
11026 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11027 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
11028 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
11029 unmod_since, high_precision_time,
11030 get_zone().log_data, zones_trace);
11031 });
11032 if (r < 0) {
11033 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11034 return r;
11035 }
11036
11037 return 0;
11038 }
11039
11040 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
11041 {
11042 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
11043 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
11044 }
11045
11046 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
11047 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
11048 {
11049 rgw_rados_ref ref;
11050 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11051 if (r < 0) {
11052 return r;
11053 }
11054
11055 rgw_zone_set zones_trace;
11056 if (_zones_trace) {
11057 zones_trace = *_zones_trace;
11058 }
11059 zones_trace.insert(get_zone().id);
11060
11061 BucketShard bs(this);
11062
11063 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11064 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11065 librados::ObjectWriteOperation op;
11066 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11067 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11068 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
11069 });
11070 if (r < 0) {
11071 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11072 return r;
11073 }
11074
11075 return 0;
11076 }
11077
11078 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11079 const rgw_obj& obj_instance, uint64_t ver_marker,
11080 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11081 bool *is_truncated)
11082 {
11083 rgw_rados_ref ref;
11084 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11085 if (r < 0) {
11086 return r;
11087 }
11088
11089 BucketShard bs(this);
11090 int ret = bs.init(obj_instance.bucket, obj_instance);
11091 if (ret < 0) {
11092 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11093 return ret;
11094 }
11095
11096 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11097
11098 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11099
11100 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11101 ObjectReadOperation op;
11102 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11103 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11104 key, ver_marker, olh_tag, log, is_truncated);
11105 });
11106 if (ret < 0) {
11107 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
11108 return ret;
11109 }
11110
11111 return 0;
11112 }
11113
11114 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11115 {
11116 rgw_rados_ref ref;
11117 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11118 if (r < 0) {
11119 return r;
11120 }
11121
11122 BucketShard bs(this);
11123 int ret = bs.init(obj_instance.bucket, obj_instance);
11124 if (ret < 0) {
11125 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11126 return ret;
11127 }
11128
11129 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11130
11131 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11132
11133 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11134 ObjectWriteOperation op;
11135 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11136 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11137 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
11138 });
11139 if (ret < 0) {
11140 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
11141 return ret;
11142 }
11143
11144 return 0;
11145 }
11146
11147 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11148 {
11149 rgw_rados_ref ref;
11150 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11151 if (r < 0) {
11152 return r;
11153 }
11154
11155 BucketShard bs(this);
11156
11157 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11158
11159 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11160
11161 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11162 ObjectWriteOperation op;
11163 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11164 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11165 });
11166 if (ret < 0) {
11167 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11168 return ret;
11169 }
11170
11171 return 0;
11172 }
11173
11174 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11175 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
11176 uint64_t *plast_ver, rgw_zone_set* zones_trace)
11177 {
11178 if (log.empty()) {
11179 return 0;
11180 }
11181
11182 librados::ObjectWriteOperation op;
11183
11184 uint64_t last_ver = log.rbegin()->first;
11185 *plast_ver = last_ver;
11186
11187 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11188
11189 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11190 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11191
11192 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11193 op.mtime2(&mtime_ts);
11194
11195 bool need_to_link = false;
11196 cls_rgw_obj_key key;
11197 bool delete_marker = false;
11198 list<cls_rgw_obj_key> remove_instances;
11199 bool need_to_remove = false;
11200
11201 for (iter = log.begin(); iter != log.end(); ++iter) {
11202 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11203 for (; viter != iter->second.end(); ++viter) {
11204 rgw_bucket_olh_log_entry& entry = *viter;
11205
11206 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11207 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11208 << (entry.delete_marker ? "(delete)" : "") << dendl;
11209 switch (entry.op) {
11210 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11211 remove_instances.push_back(entry.key);
11212 break;
11213 case CLS_RGW_OLH_OP_LINK_OLH:
11214 need_to_link = true;
11215 need_to_remove = false;
11216 key = entry.key;
11217 delete_marker = entry.delete_marker;
11218 break;
11219 case CLS_RGW_OLH_OP_UNLINK_OLH:
11220 need_to_remove = true;
11221 need_to_link = false;
11222 break;
11223 default:
11224 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11225 return -EIO;
11226 }
11227 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11228 attr_name.append(entry.op_tag);
11229 op.rmxattr(attr_name.c_str());
11230 }
11231 }
11232
11233 rgw_rados_ref ref;
11234 int r = get_obj_head_ref(bucket_info, obj, &ref);
11235 if (r < 0) {
11236 return r;
11237 }
11238
11239 const rgw_bucket& bucket = obj.bucket;
11240
11241 if (need_to_link) {
11242 rgw_obj target(bucket, key);
11243 RGWOLHInfo info;
11244 info.target = target;
11245 info.removed = delete_marker;
11246 bufferlist bl;
11247 ::encode(info, bl);
11248 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11249 }
11250
11251 /* first remove object instances */
11252 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11253 liter != remove_instances.end(); ++liter) {
11254 cls_rgw_obj_key& key = *liter;
11255 rgw_obj obj_instance(bucket, key);
11256 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11257 if (ret < 0 && ret != -ENOENT) {
11258 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11259 return ret;
11260 }
11261 }
11262
11263 /* update olh object */
11264 r = ref.ioctx.operate(ref.oid, &op);
11265 if (r == -ECANCELED) {
11266 r = 0;
11267 }
11268 if (r < 0) {
11269 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11270 return r;
11271 }
11272
11273 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11274 if (r < 0) {
11275 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11276 return r;
11277 }
11278
11279 if (need_to_remove) {
11280 ObjectWriteOperation rm_op;
11281
11282 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11283 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11284 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11285 rm_op.remove();
11286
11287 r = ref.ioctx.operate(ref.oid, &rm_op);
11288 if (r == -ECANCELED) {
11289 return 0; /* someone else won this race */
11290 } else {
11291 /*
11292 * only clear if was successful, otherwise we might clobber pending operations on this object
11293 */
11294 r = bucket_index_clear_olh(bucket_info, state, obj);
11295 if (r < 0) {
11296 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11297 return r;
11298 }
11299 }
11300 }
11301
11302 return 0;
11303 }
11304
11305 /*
11306 * read olh log and apply it
11307 */
11308 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11309 {
11310 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11311 bool is_truncated;
11312 uint64_t ver_marker = 0;
11313
11314 do {
11315 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11316 if (ret < 0) {
11317 return ret;
11318 }
11319 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11320 if (ret < 0) {
11321 return ret;
11322 }
11323 } while (is_truncated);
11324
11325 return 0;
11326 }
11327
11328 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11329 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11330 {
11331 string op_tag;
11332
11333 rgw_obj olh_obj = target_obj;
11334 olh_obj.key.instance.clear();
11335
11336 RGWObjState *state = NULL;
11337
11338 int ret = 0;
11339 int i;
11340
11341 #define MAX_ECANCELED_RETRY 100
11342 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11343 if (ret == -ECANCELED) {
11344 obj_ctx.obj.invalidate(olh_obj);
11345 }
11346
11347 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11348 if (ret < 0) {
11349 return ret;
11350 }
11351
11352 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11353 if (ret < 0) {
11354 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11355 if (ret == -ECANCELED) {
11356 continue;
11357 }
11358 return ret;
11359 }
11360 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11361 if (ret < 0) {
11362 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11363 if (ret == -ECANCELED) {
11364 continue;
11365 }
11366 return ret;
11367 }
11368 break;
11369 }
11370
11371 if (i == MAX_ECANCELED_RETRY) {
11372 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11373 return -EIO;
11374 }
11375
11376 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11377 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11378 ret = 0;
11379 }
11380 if (ret < 0) {
11381 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11382 return ret;
11383 }
11384
11385 return 0;
11386 }
11387
11388 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11389 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11390 {
11391 string op_tag;
11392
11393 rgw_obj olh_obj = target_obj;
11394 olh_obj.key.instance.clear();
11395
11396 RGWObjState *state = NULL;
11397
11398 int ret = 0;
11399 int i;
11400
11401 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11402 if (ret == -ECANCELED) {
11403 obj_ctx.obj.invalidate(olh_obj);
11404 }
11405
11406 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11407 if (ret < 0)
11408 return ret;
11409
11410 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11411 if (ret < 0) {
11412 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11413 if (ret == -ECANCELED) {
11414 continue;
11415 }
11416 return ret;
11417 }
11418
11419 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11420
11421 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11422 if (ret < 0) {
11423 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11424 if (ret == -ECANCELED) {
11425 continue;
11426 }
11427 return ret;
11428 }
11429 break;
11430 }
11431
11432 if (i == MAX_ECANCELED_RETRY) {
11433 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11434 return -EIO;
11435 }
11436
11437 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11438 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11439 return 0;
11440 }
11441 if (ret < 0) {
11442 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11443 return ret;
11444 }
11445
11446 return 0;
11447 }
11448
11449 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11450 {
11451 #define OBJ_INSTANCE_LEN 32
11452 char buf[OBJ_INSTANCE_LEN + 1];
11453
11454 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11455 no underscore for instance name due to the way we encode the raw keys */
11456
11457 target_obj->key.set_instance(buf);
11458 }
11459
11460 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11461 map<string, bufferlist> *attrset)
11462 {
11463 attrset->clear();
11464 map<string, bufferlist>::iterator iter;
11465 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11466 iter != unfiltered_attrset.end(); ++iter) {
11467 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11468 break;
11469 (*attrset)[iter->first] = iter->second;
11470 }
11471 }
11472
11473 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11474 {
11475 map<string, bufferlist> unfiltered_attrset;
11476
11477 ObjectReadOperation op;
11478 op.getxattrs(&unfiltered_attrset, NULL);
11479
11480 bufferlist outbl;
11481 int r = obj_operate(bucket_info, obj, &op);
11482
11483 if (r < 0) {
11484 return r;
11485 }
11486 map<string, bufferlist> attrset;
11487
11488 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11489
11490 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11491 if (iter == attrset.end()) { /* not an olh */
11492 return -EINVAL;
11493 }
11494
11495 try {
11496 bufferlist::iterator biter = iter->second.begin();
11497 ::decode(*olh, biter);
11498 } catch (buffer::error& err) {
11499 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11500 return -EIO;
11501 }
11502
11503 return 0;
11504 }
11505
11506 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11507 map<string, bufferlist> *rm_pending_entries)
11508 {
11509 map<string, bufferlist>::iterator iter = pending_entries.begin();
11510
11511 real_time now = real_clock::now();
11512
11513 while (iter != pending_entries.end()) {
11514 bufferlist::iterator biter = iter->second.begin();
11515 RGWOLHPendingInfo pending_info;
11516 try {
11517 ::decode(pending_info, biter);
11518 } catch (buffer::error& err) {
11519 /* skipping bad entry, we could remove it but it might hide a bug */
11520 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11521 ++iter;
11522 continue;
11523 }
11524
11525 map<string, bufferlist>::iterator cur_iter = iter;
11526 ++iter;
11527 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11528 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11529 pending_entries.erase(cur_iter);
11530 } else {
11531 /* entries names are sorted by time (rounded to a second) */
11532 break;
11533 }
11534 }
11535 }
11536
11537 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11538 {
11539 ObjectWriteOperation op;
11540
11541 bucket_index_guard_olh_op(state, op);
11542
11543 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11544 op.rmxattr(iter->first.c_str());
11545 }
11546
11547 rgw_rados_ref ref;
11548 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11549 if (r < 0) {
11550 return r;
11551 }
11552
11553 /* update olh object */
11554 r = ref.ioctx.operate(ref.oid, &op);
11555 if (r == -ENOENT || r == -ECANCELED) {
11556 /* raced with some other change, shouldn't sweat about it */
11557 r = 0;
11558 }
11559 if (r < 0) {
11560 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11561 return r;
11562 }
11563
11564 return 0;
11565 }
11566
11567 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11568 {
11569 map<string, bufferlist> pending_entries;
11570 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11571
11572 map<string, bufferlist> rm_pending_entries;
11573 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11574
11575 if (!rm_pending_entries.empty()) {
11576 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11577 if (ret < 0) {
11578 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11579 return ret;
11580 }
11581 }
11582 if (!pending_entries.empty()) {
11583 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11584
11585 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11586 if (ret < 0) {
11587 return ret;
11588 }
11589 }
11590
11591 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11592 assert(iter != state->attrset.end());
11593 RGWOLHInfo olh;
11594 try {
11595 bufferlist::iterator biter = iter->second.begin();
11596 ::decode(olh, biter);
11597 } catch (buffer::error& err) {
11598 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11599 return -EIO;
11600 }
11601
11602 if (olh.removed) {
11603 return -ENOENT;
11604 }
11605
11606 *target = olh.target;
11607
11608 return 0;
11609 }
11610
11611 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11612 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11613 RGWObjVersionTracker *objv_tracker)
11614 {
11615 rgw_rados_ref ref;
11616 int r = get_raw_obj_ref(obj, &ref);
11617 if (r < 0) {
11618 return r;
11619 }
11620
11621 map<string, bufferlist> unfiltered_attrset;
11622 uint64_t size = 0;
11623 struct timespec mtime_ts;
11624
11625 ObjectReadOperation op;
11626 if (objv_tracker) {
11627 objv_tracker->prepare_op_for_read(&op);
11628 }
11629 if (attrs) {
11630 op.getxattrs(&unfiltered_attrset, NULL);
11631 }
11632 if (psize || pmtime) {
11633 op.stat2(&size, &mtime_ts, NULL);
11634 }
11635 if (first_chunk) {
11636 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11637 }
11638 bufferlist outbl;
11639 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11640
11641 if (epoch) {
11642 *epoch = ref.ioctx.get_last_version();
11643 }
11644
11645 if (r < 0)
11646 return r;
11647
11648 if (psize)
11649 *psize = size;
11650 if (pmtime)
11651 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11652 if (attrs) {
11653 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11654 }
11655
11656 return 0;
11657 }
11658
11659 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11660 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
11661 {
11662 map<string, rgw_bucket_dir_header> headers;
11663 map<int, string> bucket_instance_ids;
11664 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11665 if (r < 0) {
11666 return r;
11667 }
11668
11669 assert(headers.size() == bucket_instance_ids.size());
11670
11671 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11672 map<int, string>::iterator viter = bucket_instance_ids.begin();
11673 BucketIndexShardsManager ver_mgr;
11674 BucketIndexShardsManager master_ver_mgr;
11675 BucketIndexShardsManager marker_mgr;
11676 char buf[64];
11677 for(; iter != headers.end(); ++iter, ++viter) {
11678 accumulate_raw_stats(iter->second, stats);
11679 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11680 ver_mgr.add(viter->first, string(buf));
11681 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11682 master_ver_mgr.add(viter->first, string(buf));
11683 if (shard_id >= 0) {
11684 *max_marker = iter->second.max_marker;
11685 } else {
11686 marker_mgr.add(viter->first, iter->second.max_marker);
11687 }
11688 if (syncstopped != NULL)
11689 *syncstopped = iter->second.syncstopped;
11690 }
11691 ver_mgr.to_string(bucket_ver);
11692 master_ver_mgr.to_string(master_ver);
11693 if (shard_id < 0) {
11694 marker_mgr.to_string(max_marker);
11695 }
11696 return 0;
11697 }
11698
11699 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11700 map<int, string>& markers)
11701 {
11702 map<string, rgw_bucket_dir_header> headers;
11703 map<int, string> bucket_instance_ids;
11704 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11705 if (r < 0)
11706 return r;
11707
11708 assert(headers.size() == bucket_instance_ids.size());
11709
11710 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11711 map<int, string>::iterator viter = bucket_instance_ids.begin();
11712
11713 for(; iter != headers.end(); ++iter, ++viter) {
11714 if (shard_id >= 0) {
11715 markers[shard_id] = iter->second.max_marker;
11716 } else {
11717 markers[viter->first] = iter->second.max_marker;
11718 }
11719 }
11720 return 0;
11721 }
11722
11723 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11724 RGWGetBucketStats_CB *cb;
11725 uint32_t pendings;
11726 map<RGWObjCategory, RGWStorageStats> stats;
11727 int ret_code;
11728 bool should_cb;
11729 Mutex lock;
11730
11731 public:
11732 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11733 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11734 lock("RGWGetBucketStatsContext") {}
11735
11736 void handle_response(int r, rgw_bucket_dir_header& header) override {
11737 Mutex::Locker l(lock);
11738 if (should_cb) {
11739 if ( r >= 0) {
11740 accumulate_raw_stats(header, stats);
11741 } else {
11742 ret_code = r;
11743 }
11744
11745 // Are we all done?
11746 if (--pendings == 0) {
11747 if (!ret_code) {
11748 cb->set_response(&stats);
11749 }
11750 cb->handle_response(ret_code);
11751 cb->put();
11752 }
11753 }
11754 }
11755
11756 void unset_cb() {
11757 Mutex::Locker l(lock);
11758 should_cb = false;
11759 }
11760 };
11761
11762 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11763 {
11764 int num_aio = 0;
11765 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11766 assert(get_ctx);
11767 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11768 if (r < 0) {
11769 ctx->put();
11770 if (num_aio) {
11771 get_ctx->unset_cb();
11772 }
11773 }
11774 get_ctx->put();
11775 return r;
11776 }
11777
11778 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11779 RGWGetUserStats_CB *cb;
11780
11781 public:
11782 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11783 : cb(cb) {}
11784
11785 void handle_response(int r, cls_user_header& header) override {
11786 const cls_user_stats& hs = header.stats;
11787 if (r >= 0) {
11788 RGWStorageStats stats;
11789
11790 stats.size = hs.total_bytes;
11791 stats.size_rounded = hs.total_bytes_rounded;
11792 stats.num_objects = hs.total_entries;
11793
11794 cb->set_response(stats);
11795 }
11796
11797 cb->handle_response(r);
11798
11799 cb->put();
11800 }
11801 };
11802
11803 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11804 {
11805 string user_str = user.to_str();
11806
11807 cls_user_header header;
11808 int r = cls_user_get_header(user_str, &header);
11809 if (r < 0)
11810 return r;
11811
11812 const cls_user_stats& hs = header.stats;
11813
11814 stats.size = hs.total_bytes;
11815 stats.size_rounded = hs.total_bytes_rounded;
11816 stats.num_objects = hs.total_entries;
11817
11818 return 0;
11819 }
11820
11821 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11822 {
11823 string user_str = user.to_str();
11824
11825 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11826 int r = cls_user_get_header_async(user_str, get_ctx);
11827 if (r < 0) {
11828 ctx->put();
11829 delete get_ctx;
11830 return r;
11831 }
11832
11833 return 0;
11834 }
11835
11836 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11837 {
11838 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11839 }
11840
11841 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11842 {
11843 if (!bucket.oid.empty()) {
11844 obj.init(get_zone_params().domain_root, bucket.oid);
11845 } else {
11846 string oid;
11847 get_bucket_meta_oid(bucket, oid);
11848 obj.init(get_zone_params().domain_root, oid);
11849 }
11850 }
11851
11852 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11853 real_time *pmtime, map<string, bufferlist> *pattrs)
11854 {
11855 size_t pos = meta_key.find(':');
11856 if (pos == string::npos) {
11857 return -EINVAL;
11858 }
11859 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11860 rgw_bucket_instance_key_to_oid(oid);
11861
11862 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11863 }
11864
11865 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11866 real_time *pmtime, map<string, bufferlist> *pattrs)
11867 {
11868 string oid;
11869 if (bucket.oid.empty()) {
11870 get_bucket_meta_oid(bucket, oid);
11871 } else {
11872 oid = bucket.oid;
11873 }
11874
11875 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11876 }
11877
11878 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
11879 real_time *pmtime, map<string, bufferlist> *pattrs,
11880 rgw_cache_entry_info *cache_info,
11881 boost::optional<obj_version> refresh_version)
11882 {
11883 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11884
11885 bufferlist epbl;
11886
11887 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
11888 oid, epbl, &info.objv_tracker, pmtime, pattrs,
11889 cache_info, refresh_version);
11890 if (ret < 0) {
11891 return ret;
11892 }
11893
11894 bufferlist::iterator iter = epbl.begin();
11895 try {
11896 ::decode(info, iter);
11897 } catch (buffer::error& err) {
11898 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11899 return -EIO;
11900 }
11901 info.bucket.oid = oid;
11902 return 0;
11903 }
11904
11905 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11906 const string& tenant_name,
11907 const string& bucket_name,
11908 RGWBucketEntryPoint& entry_point,
11909 RGWObjVersionTracker *objv_tracker,
11910 real_time *pmtime,
11911 map<string, bufferlist> *pattrs,
11912 rgw_cache_entry_info *cache_info,
11913 boost::optional<obj_version> refresh_version)
11914 {
11915 bufferlist bl;
11916 string bucket_entry;
11917
11918 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11919 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
11920 bucket_entry, bl, objv_tracker, pmtime, pattrs,
11921 cache_info, refresh_version);
11922 if (ret < 0) {
11923 return ret;
11924 }
11925
11926 bufferlist::iterator iter = bl.begin();
11927 try {
11928 ::decode(entry_point, iter);
11929 } catch (buffer::error& err) {
11930 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11931 return -EIO;
11932 }
11933 return 0;
11934 }
11935
11936 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11937 const string& tenant_name,
11938 const string& bucket_name)
11939 {
11940 RGWBucketEntryPoint entry_point;
11941 real_time ep_mtime;
11942 RGWObjVersionTracker ot;
11943 map<string, bufferlist> attrs;
11944 RGWBucketInfo info;
11945
11946 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
11947
11948 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
11949 if (ret < 0) {
11950 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
11951 return ret;
11952 }
11953
11954 if (!entry_point.has_bucket_info) {
11955 /* already converted! */
11956 return 0;
11957 }
11958
11959 info = entry_point.old_bucket_info;
11960 info.bucket.oid = bucket_name;
11961 info.ep_objv = ot.read_version;
11962
11963 ot.generate_new_write_ver(cct);
11964
11965 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
11966 if (ret < 0) {
11967 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
11968 return ret;
11969 }
11970
11971 return 0;
11972 }
11973
11974 int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx,
11975 const string& tenant,
11976 const string& bucket_name,
11977 RGWBucketInfo& info,
11978 real_time *pmtime,
11979 map<string, bufferlist> *pattrs,
11980 boost::optional<obj_version> refresh_version)
11981 {
11982 bucket_info_entry e;
11983 string bucket_entry;
11984 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
11985
11986
11987 if (binfo_cache->find(bucket_entry, &e)) {
11988 if (refresh_version &&
11989 e.info.objv_tracker.read_version.compare(&(*refresh_version))) {
11990 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
11991 << "a failure that should be debugged. I am a nice machine, "
11992 << "so I will try to recover." << dendl;
11993 binfo_cache->invalidate(bucket_entry);
11994 }
11995 info = e.info;
11996 if (pattrs)
11997 *pattrs = e.attrs;
11998 if (pmtime)
11999 *pmtime = e.mtime;
12000 return 0;
12001 }
12002
12003 RGWBucketEntryPoint entry_point;
12004 real_time ep_mtime;
12005 RGWObjVersionTracker ot;
12006 rgw_cache_entry_info entry_cache_info;
12007 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
12008 entry_point, &ot, &ep_mtime, pattrs,
12009 &entry_cache_info, refresh_version);
12010 if (ret < 0) {
12011 /* only init these fields */
12012 info.bucket.tenant = tenant;
12013 info.bucket.name = bucket_name;
12014 return ret;
12015 }
12016
12017 if (entry_point.has_bucket_info) {
12018 info = entry_point.old_bucket_info;
12019 info.bucket.oid = bucket_name;
12020 info.bucket.tenant = tenant;
12021 info.ep_objv = ot.read_version;
12022 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
12023 return 0;
12024 }
12025
12026 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12027 * that we got
12028 */
12029 if (pattrs) {
12030 pattrs->clear();
12031 }
12032
12033 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
12034
12035
12036 /* read bucket instance info */
12037
12038 string oid;
12039 get_bucket_meta_oid(entry_point.bucket, oid);
12040
12041 rgw_cache_entry_info cache_info;
12042
12043 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
12044 &cache_info, refresh_version);
12045 e.info.ep_objv = ot.read_version;
12046 info = e.info;
12047 if (ret < 0) {
12048 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
12049 info.bucket.tenant = tenant;
12050 info.bucket.name = bucket_name;
12051 // XXX and why return anything in case of an error anyway?
12052 return ret;
12053 }
12054
12055 if (pmtime)
12056 *pmtime = e.mtime;
12057 if (pattrs)
12058 *pattrs = e.attrs;
12059
12060 list<rgw_cache_entry_info *> cache_info_entries;
12061 cache_info_entries.push_back(&entry_cache_info);
12062 cache_info_entries.push_back(&cache_info);
12063
12064
12065 /* chain to both bucket entry point and bucket instance */
12066 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
12067 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
12068 }
12069
12070 if (refresh_version &&
12071 refresh_version->compare(&info.objv_tracker.read_version)) {
12072 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
12073 << "have gone squirrelly. An administrator may have forced a "
12074 << "change; otherwise there is a problem somewhere." << dendl;
12075 }
12076
12077 return 0;
12078 }
12079
12080 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
12081 const string& tenant, const string& bucket_name,
12082 RGWBucketInfo& info,
12083 real_time *pmtime, map<string, bufferlist> *pattrs)
12084 {
12085 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
12086 pattrs, boost::none);
12087 }
12088
12089 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
12090 ceph::real_time *pmtime,
12091 map<string, bufferlist> *pattrs)
12092 {
12093 RGWObjectCtx obj_ctx(this);
12094
12095 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
12096 info, pmtime, pattrs, info.objv_tracker.read_version);
12097 }
12098
12099 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
12100 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
12101 map<string, bufferlist> *pattrs)
12102 {
12103 bufferlist epbl;
12104 ::encode(entry_point, epbl);
12105 string bucket_entry;
12106 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12107 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12108 }
12109
12110 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12111 real_time mtime, map<string, bufferlist> *pattrs)
12112 {
12113 info.has_instance_obj = true;
12114 bufferlist bl;
12115
12116 ::encode(info, bl);
12117
12118 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12119 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12120 if (ret == -EEXIST) {
12121 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12122 * bucket operation on this specific bucket (e.g., being synced from the master), but
12123 * since bucket instace meta object is unique for this specific bucket instace, we don't
12124 * need to return an error.
12125 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12126 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12127 * locally, while in the sync thread we sync the new bucket.
12128 */
12129 ret = 0;
12130 }
12131 return ret;
12132 }
12133
12134 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12135 map<string, bufferlist> *pattrs, bool create_entry_point)
12136 {
12137 bool create_head = !info.has_instance_obj || create_entry_point;
12138
12139 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12140 if (ret < 0) {
12141 return ret;
12142 }
12143
12144 if (!create_head)
12145 return 0; /* done! */
12146
12147 RGWBucketEntryPoint entry_point;
12148 entry_point.bucket = info.bucket;
12149 entry_point.owner = info.owner;
12150 entry_point.creation_time = info.creation_time;
12151 entry_point.linked = true;
12152 RGWObjVersionTracker ot;
12153 if (pep_objv && !pep_objv->tag.empty()) {
12154 ot.write_version = *pep_objv;
12155 } else {
12156 ot.generate_new_write_ver(cct);
12157 if (pep_objv) {
12158 *pep_objv = ot.write_version;
12159 }
12160 }
12161 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12162 if (ret < 0)
12163 return ret;
12164
12165 return 0;
12166 }
12167
12168 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12169 {
12170 rgw_rados_ref ref;
12171 int r = get_raw_obj_ref(obj, &ref);
12172 if (r < 0) {
12173 return r;
12174 }
12175
12176 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12177 if (r < 0)
12178 return r;
12179
12180 return 0;
12181
12182 }
12183
12184 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12185 std::map<string, bufferlist>& m)
12186 {
12187 rgw_rados_ref ref;
12188 int r = get_raw_obj_ref(obj, &ref);
12189 if (r < 0) {
12190 return r;
12191 }
12192
12193 #define MAX_OMAP_GET_ENTRIES 1024
12194 const int count = MAX_OMAP_GET_ENTRIES;
12195 string start_after;
12196
12197 while (true) {
12198 std::map<string, bufferlist> t;
12199 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12200 if (r < 0) {
12201 return r;
12202 }
12203 if (t.empty()) {
12204 break;
12205 }
12206 start_after = t.rbegin()->first;
12207 m.insert(t.begin(), t.end());
12208 }
12209 return 0;
12210 }
12211
12212 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12213 {
12214 rgw_rados_ref ref;
12215 int r = get_raw_obj_ref(obj, &ref);
12216 if (r < 0) {
12217 return r;
12218 }
12219 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12220
12221 map<string, bufferlist> m;
12222 m[key] = bl;
12223
12224 r = ref.ioctx.omap_set(ref.oid, m);
12225
12226 return r;
12227 }
12228
12229 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12230 {
12231 rgw_rados_ref ref;
12232 int r = get_raw_obj_ref(obj, &ref);
12233 if (r < 0) {
12234 return r;
12235 }
12236
12237 r = ref.ioctx.omap_set(ref.oid, m);
12238
12239 return r;
12240 }
12241
12242 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12243 {
12244 rgw_rados_ref ref;
12245 int r = get_raw_obj_ref(obj, &ref);
12246 if (r < 0) {
12247 return r;
12248 }
12249
12250 set<string> k;
12251 k.insert(key);
12252
12253 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12254 return r;
12255 }
12256
12257 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12258 {
12259 RGWObjectCtx obj_ctx(this);
12260
12261 map<string, RGWBucketEnt>::iterator iter;
12262 for (iter = m.begin(); iter != m.end(); ++iter) {
12263 RGWBucketEnt& ent = iter->second;
12264 rgw_bucket& bucket = ent.bucket;
12265 ent.count = 0;
12266 ent.size = 0;
12267 ent.size_rounded = 0;
12268
12269 map<string, rgw_bucket_dir_header> headers;
12270
12271 RGWBucketInfo bucket_info;
12272 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12273 if (ret < 0) {
12274 return ret;
12275 }
12276
12277 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12278 if (r < 0)
12279 return r;
12280
12281 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12282 for (; hiter != headers.end(); ++hiter) {
12283 RGWObjCategory category = main_category;
12284 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12285 if (iter != hiter->second.stats.end()) {
12286 struct rgw_bucket_category_stats& stats = iter->second;
12287 ent.count += stats.num_entries;
12288 ent.size += stats.total_size;
12289 ent.size_rounded += stats.total_size_rounded;
12290 }
12291 }
12292
12293 // fill in placement_rule from the bucket instance for use in swift's
12294 // per-storage policy statistics
12295 ent.placement_rule = std::move(bucket_info.placement_rule);
12296 }
12297
12298 return m.size();
12299 }
12300
12301 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12302 {
12303 rgw_rados_ref ref;
12304 int r = get_raw_obj_ref(obj, &ref);
12305 if (r < 0) {
12306 return r;
12307 }
12308 librados::Rados *rad = get_rados_handle();
12309 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12310
12311 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12312 completion->release();
12313 return r;
12314 }
12315
12316 int RGWRados::distribute(const string& key, bufferlist& bl)
12317 {
12318 /*
12319 * we were called before watch was initialized. This can only happen if we're updating some system
12320 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12321 * objects, they're currently only read on startup anyway.
12322 */
12323 if (!watch_initialized)
12324 return 0;
12325
12326 string notify_oid;
12327 pick_control_oid(key, notify_oid);
12328
12329 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12330 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12331 }
12332
12333 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12334 {
12335 librados::IoCtx& io_ctx = ctx.io_ctx;
12336 librados::NObjectIterator& iter = ctx.iter;
12337
12338 int r = open_pool_ctx(pool, io_ctx);
12339 if (r < 0)
12340 return r;
12341
12342 iter = io_ctx.nobjects_begin();
12343
12344 return 0;
12345 }
12346
12347 int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12348 {
12349 librados::IoCtx& io_ctx = ctx.io_ctx;
12350 librados::NObjectIterator& iter = ctx.iter;
12351
12352 int r = open_pool_ctx(pool, io_ctx);
12353 if (r < 0)
12354 return r;
12355
12356 librados::ObjectCursor oc;
12357 if (!oc.from_str(cursor)) {
12358 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12359 return -EINVAL;
12360 }
12361
12362 iter = io_ctx.nobjects_begin(oc);
12363
12364 return 0;
12365 }
12366
12367 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12368 {
12369 return ctx.iter.get_cursor().to_str();
12370 }
12371
12372 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12373 bool *is_truncated, RGWAccessListFilter *filter)
12374 {
12375 librados::IoCtx& io_ctx = ctx.io_ctx;
12376 librados::NObjectIterator& iter = ctx.iter;
12377
12378 if (iter == io_ctx.nobjects_end())
12379 return -ENOENT;
12380
12381 uint32_t i;
12382
12383 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12384 rgw_bucket_dir_entry e;
12385
12386 string oid = iter->get_oid();
12387 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12388
12389 // fill it in with initial values; we may correct later
12390 if (filter && !filter->filter(oid, oid))
12391 continue;
12392
12393 e.key = oid;
12394 objs.push_back(e);
12395 }
12396
12397 if (is_truncated)
12398 *is_truncated = (iter != io_ctx.nobjects_end());
12399
12400 return objs.size();
12401 }
12402 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12403 string prefix;
12404
12405 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12406 bool filter(string& name, string& key) override {
12407 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12408 }
12409 };
12410
12411 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
12412 {
12413 if (!ctx->initialized) {
12414 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
12415 if (r < 0) {
12416 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12417 return r;
12418 }
12419 ctx->initialized = true;
12420 }
12421 return 0;
12422 }
12423
12424 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12425 RGWListRawObjsCtx& ctx, list<string>& oids,
12426 bool *is_truncated)
12427 {
12428 if (!ctx.initialized) {
12429 return -EINVAL;
12430 }
12431 RGWAccessListFilterPrefix filter(prefix_filter);
12432 vector<rgw_bucket_dir_entry> objs;
12433 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12434 if (r < 0) {
12435 if(r != -ENOENT)
12436 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12437 return r;
12438 }
12439
12440 vector<rgw_bucket_dir_entry>::iterator iter;
12441 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12442 oids.push_back(iter->key.name);
12443 }
12444
12445 return oids.size();
12446 }
12447
12448 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12449 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12450 bool *is_truncated)
12451 {
12452 if (!ctx.initialized) {
12453 int r = list_raw_objects_init(pool, string(), &ctx);
12454 if (r < 0) {
12455 return r;
12456 }
12457 }
12458
12459 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12460 }
12461
12462 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12463 {
12464 return pool_iterate_get_cursor(ctx.iter_ctx);
12465 }
12466
12467 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12468 std::list<rgw_bi_log_entry>& result, bool *truncated)
12469 {
12470 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12471 result.clear();
12472
12473 librados::IoCtx index_ctx;
12474 map<int, string> oids;
12475 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12476 map<int, string> bucket_instance_ids;
12477 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12478 if (r < 0)
12479 return r;
12480
12481 BucketIndexShardsManager marker_mgr;
12482 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12483 // If there are multiple shards for the bucket index object, the marker
12484 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12485 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12486 // only contain one record, and the key is the bucket instance id.
12487 r = marker_mgr.from_string(marker, shard_id);
12488 if (r < 0)
12489 return r;
12490
12491 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12492 if (r < 0)
12493 return r;
12494
12495 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12496 map<int, list<rgw_bi_log_entry>::iterator> vends;
12497 if (truncated) {
12498 *truncated = false;
12499 }
12500 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12501 for (; miter != bi_log_lists.end(); ++miter) {
12502 int shard_id = miter->first;
12503 vcurrents[shard_id] = miter->second.entries.begin();
12504 vends[shard_id] = miter->second.entries.end();
12505 if (truncated) {
12506 *truncated = (*truncated || miter->second.truncated);
12507 }
12508 }
12509
12510 size_t total = 0;
12511 bool has_more = true;
12512 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12513 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12514 while (total < max && has_more) {
12515 has_more = false;
12516
12517 viter = vcurrents.begin();
12518 eiter = vends.begin();
12519
12520 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12521 assert (eiter != vends.end());
12522
12523 int shard_id = viter->first;
12524 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12525
12526 if (liter == eiter->second){
12527 continue;
12528 }
12529 rgw_bi_log_entry& entry = *(liter);
12530 if (has_shards) {
12531 char buf[16];
12532 snprintf(buf, sizeof(buf), "%d", shard_id);
12533 string tmp_id;
12534 build_bucket_index_marker(buf, entry.id, &tmp_id);
12535 entry.id.swap(tmp_id);
12536 }
12537 marker_mgr.add(shard_id, entry.id);
12538 result.push_back(entry);
12539 total++;
12540 has_more = true;
12541 ++liter;
12542 }
12543 }
12544
12545 if (truncated) {
12546 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12547 assert (eiter != vends.end());
12548 *truncated = (*truncated || (viter->second != eiter->second));
12549 }
12550 }
12551
12552 // Refresh marker, if there are multiple shards, the output will look like
12553 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12554 // if there is no sharding, the simply marker (without oid) is returned
12555 if (has_shards) {
12556 marker_mgr.to_string(&marker);
12557 } else {
12558 if (!result.empty()) {
12559 marker = result.rbegin()->id;
12560 }
12561 }
12562
12563 return 0;
12564 }
12565
12566 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12567 {
12568 librados::IoCtx index_ctx;
12569 map<int, string> bucket_objs;
12570
12571 BucketIndexShardsManager start_marker_mgr;
12572 BucketIndexShardsManager end_marker_mgr;
12573
12574 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12575 if (r < 0) {
12576 return r;
12577 }
12578
12579 r = start_marker_mgr.from_string(start_marker, shard_id);
12580 if (r < 0) {
12581 return r;
12582 }
12583
12584 r = end_marker_mgr.from_string(end_marker, shard_id);
12585 if (r < 0) {
12586 return r;
12587 }
12588
12589 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12590 cct->_conf->rgw_bucket_index_max_aio)();
12591
12592 return r;
12593 }
12594
12595 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12596 {
12597 librados::IoCtx index_ctx;
12598 map<int, string> bucket_objs;
12599 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12600 if (r < 0)
12601 return r;
12602
12603 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12604 }
12605
12606 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12607 {
12608 librados::IoCtx index_ctx;
12609 map<int, string> bucket_objs;
12610 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12611 if (r < 0)
12612 return r;
12613
12614 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12615 }
12616
12617 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12618 {
12619 rgw_rados_ref ref;
12620 int r = get_obj_head_ref(bucket_info, obj, &ref);
12621 if (r < 0) {
12622 return r;
12623 }
12624
12625 rgw_cls_bi_entry bi_entry;
12626 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12627 if (r < 0 && r != -ENOENT) {
12628 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12629 }
12630 if (r < 0) {
12631 return r;
12632 }
12633 bufferlist::iterator iter = bi_entry.data.begin();
12634 try {
12635 ::decode(*dirent, iter);
12636 } catch (buffer::error& err) {
12637 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12638 return -EIO;
12639 }
12640
12641 return 0;
12642 }
12643
12644 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12645 {
12646 BucketShard bs(this);
12647 int ret = bs.init(bucket, obj);
12648 if (ret < 0) {
12649 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12650 return ret;
12651 }
12652
12653 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12654
12655 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12656 if (ret < 0)
12657 return ret;
12658
12659 return 0;
12660 }
12661
12662 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12663 {
12664 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12665 }
12666
12667 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12668 {
12669 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12670 if (ret < 0)
12671 return ret;
12672
12673 return 0;
12674 }
12675
12676 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12677 {
12678 BucketShard bs(this);
12679 int ret = bs.init(bucket, obj);
12680 if (ret < 0) {
12681 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12682 return ret;
12683 }
12684
12685 return bi_put(bs, entry);
12686 }
12687
12688 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12689 {
12690 rgw_obj obj(bucket, obj_name);
12691 BucketShard bs(this);
12692 int ret = bs.init(bucket, obj);
12693 if (ret < 0) {
12694 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12695 return ret;
12696 }
12697
12698 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12699 if (ret == -ENOENT) {
12700 *is_truncated = false;
12701 }
12702 if (ret < 0)
12703 return ret;
12704
12705 return 0;
12706 }
12707
12708 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12709 {
12710 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12711 if (ret < 0)
12712 return ret;
12713
12714 return 0;
12715 }
12716
12717 int RGWRados::bi_remove(BucketShard& bs)
12718 {
12719 int ret = bs.index_ctx.remove(bs.bucket_obj);
12720 if (ret == -ENOENT) {
12721 ret = 0;
12722 }
12723 if (ret < 0) {
12724 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12725 return ret;
12726 }
12727
12728 return 0;
12729 }
12730
12731 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12732 {
12733 BucketShard bs(this);
12734 int ret = bs.init(bucket, shard_id);
12735 if (ret < 0) {
12736 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12737 return ret;
12738 }
12739
12740 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12741 }
12742
12743 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12744 {
12745 return gc_pool_ctx.operate(oid, op);
12746 }
12747
12748 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12749 {
12750 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12751 int r = gc_pool_ctx.aio_operate(oid, c, op);
12752 c->release();
12753 return r;
12754 }
12755
12756 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12757 {
12758 return gc_pool_ctx.operate(oid, op, pbl);
12759 }
12760
12761 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12762 {
12763 return gc->list(index, marker, max, expired_only, result, truncated);
12764 }
12765
12766 int RGWRados::process_gc()
12767 {
12768 return gc->process();
12769 }
12770
12771 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12772 {
12773 return lc->list_lc_progress(marker, max_entries, progress_map);
12774 }
12775
12776 int RGWRados::process_lc()
12777 {
12778 return lc->process();
12779 }
12780
12781 int RGWRados::process_expire_objects()
12782 {
12783 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12784 return 0;
12785 }
12786
12787 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12788 {
12789 bufferlist in;
12790 cls_rgw_bucket_init(op);
12791 return index_ctx.operate(oid, &op);
12792 }
12793
12794 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12795 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12796 {
12797 rgw_zone_set zones_trace;
12798 if (_zones_trace) {
12799 zones_trace = *_zones_trace;
12800 }
12801 else {
12802 zones_trace.insert(get_zone().id);
12803 }
12804
12805 ObjectWriteOperation o;
12806 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12807 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12808 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
12809 return bs.index_ctx.operate(bs.bucket_obj, &o);
12810 }
12811
12812 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
12813 int64_t pool, uint64_t epoch,
12814 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12815 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12816 {
12817 ObjectWriteOperation o;
12818 rgw_bucket_dir_entry_meta dir_meta;
12819 dir_meta = ent.meta;
12820 dir_meta.category = category;
12821
12822 rgw_bucket_entry_ver ver;
12823 ver.pool = pool;
12824 ver.epoch = epoch;
12825 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12826 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12827 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
12828 get_zone().log_data, bilog_flags, _zones_trace);
12829 complete_op_data *arg;
12830 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
12831 get_zone().log_data, bilog_flags, _zones_trace, &arg);
12832 librados::AioCompletion *completion = arg->rados_completion;
12833 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
12834 completion->release(); /* can't reference arg here, as it might have already been released */
12835 return ret;
12836 }
12837
12838 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
12839 int64_t pool, uint64_t epoch,
12840 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12841 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12842 {
12843 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
12844 }
12845
12846 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12847 int64_t pool, uint64_t epoch,
12848 rgw_obj& obj,
12849 real_time& removed_mtime,
12850 list<rgw_obj_index_key> *remove_objs,
12851 uint16_t bilog_flags,
12852 rgw_zone_set *zones_trace)
12853 {
12854 rgw_bucket_dir_entry ent;
12855 ent.meta.mtime = removed_mtime;
12856 obj.key.get_index_key(&ent.key);
12857 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
12858 }
12859
12860 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12861 {
12862 rgw_bucket_dir_entry ent;
12863 obj.key.get_index_key(&ent.key);
12864 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
12865 }
12866
12867 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12868 {
12869 librados::IoCtx index_ctx;
12870 map<int, string> bucket_objs;
12871 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12872 if (r < 0)
12873 return r;
12874
12875 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12876 }
12877
12878 int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12879 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12880 bool *is_truncated, rgw_obj_index_key *last_entry,
12881 bool (*force_check_filter)(const string& name))
12882 {
12883 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12884
12885 librados::IoCtx index_ctx;
12886 // key - oid (for different shards if there is any)
12887 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12888 map<int, string> oids;
12889 map<int, struct rgw_cls_list_ret> list_results;
12890 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12891 if (r < 0)
12892 return r;
12893
12894 cls_rgw_obj_key start_key(start.name, start.instance);
12895 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12896 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12897 if (r < 0)
12898 return r;
12899
12900 // Create a list of iterators that are used to iterate each shard
12901 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12902 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12903 vector<string> vnames(list_results.size());
12904 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12905 *is_truncated = false;
12906 for (; iter != list_results.end(); ++iter) {
12907 vcurrents.push_back(iter->second.dir.m.begin());
12908 vends.push_back(iter->second.dir.m.end());
12909 vnames.push_back(oids[iter->first]);
12910 *is_truncated = (*is_truncated || iter->second.is_truncated);
12911 }
12912
12913 // Create a map to track the next candidate entry from each shard, if the entry
12914 // from a specified shard is selected/erased, the next entry from that shard will
12915 // be inserted for next round selection
12916 map<string, size_t> candidates;
12917 for (size_t i = 0; i < vcurrents.size(); ++i) {
12918 if (vcurrents[i] != vends[i]) {
12919 candidates[vcurrents[i]->first] = i;
12920 }
12921 }
12922
12923 map<string, bufferlist> updates;
12924 uint32_t count = 0;
12925 while (count < num_entries && !candidates.empty()) {
12926 r = 0;
12927 // Select the next one
12928 int pos = candidates.begin()->second;
12929 const string& name = vcurrents[pos]->first;
12930 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12931
12932 bool force_check = force_check_filter &&
12933 force_check_filter(dirent.key.name);
12934 if ((!dirent.exists && !dirent.is_delete_marker()) ||
12935 !dirent.pending_map.empty() ||
12936 force_check) {
12937 /* there are uncommitted ops. We need to check the current state,
12938 * and if the tags are old we need to do cleanup as well. */
12939 librados::IoCtx sub_ctx;
12940 sub_ctx.dup(index_ctx);
12941 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
12942 if (r < 0 && r != -ENOENT) {
12943 return r;
12944 }
12945 }
12946 if (r >= 0) {
12947 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
12948 m[name] = std::move(dirent);
12949 ++count;
12950 }
12951
12952 // Refresh the candidates map
12953 candidates.erase(candidates.begin());
12954 ++vcurrents[pos];
12955 if (vcurrents[pos] != vends[pos]) {
12956 candidates[vcurrents[pos]->first] = pos;
12957 }
12958 }
12959
12960 // Suggest updates if there is any
12961 map<string, bufferlist>::iterator miter = updates.begin();
12962 for (; miter != updates.end(); ++miter) {
12963 if (miter->second.length()) {
12964 ObjectWriteOperation o;
12965 cls_rgw_suggest_changes(o, miter->second);
12966 // we don't care if we lose suggested updates, send them off blindly
12967 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12968 index_ctx.aio_operate(miter->first, c, &o);
12969 c->release();
12970 }
12971 }
12972
12973 // Check if all the returned entries are consumed or not
12974 for (size_t i = 0; i < vcurrents.size(); ++i) {
12975 if (vcurrents[i] != vends[i])
12976 *is_truncated = true;
12977 }
12978 if (!m.empty())
12979 *last_entry = m.rbegin()->first;
12980
12981 return 0;
12982 }
12983
12984 int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
12985 {
12986 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12987
12988 rgw_rados_ref ref;
12989 int r = get_raw_obj_ref(obj, &ref);
12990 if (r < 0) {
12991 return r;
12992 }
12993
12994 ObjectWriteOperation op;
12995 cls_rgw_usage_log_add(op, info);
12996
12997 r = ref.ioctx.operate(ref.oid, &op);
12998 return r;
12999 }
13000
13001 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
13002 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
13003 {
13004 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13005
13006 rgw_rados_ref ref;
13007 int r = get_raw_obj_ref(obj, &ref);
13008 if (r < 0) {
13009 return r;
13010 }
13011
13012 *is_truncated = false;
13013
13014 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
13015 max_entries, read_iter, usage, is_truncated);
13016
13017 return r;
13018 }
13019
13020 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
13021 {
13022 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13023
13024 rgw_rados_ref ref;
13025 int r = get_raw_obj_ref(obj, &ref);
13026 if (r < 0) {
13027 return r;
13028 }
13029
13030 r = cls_rgw_usage_log_trim(ref.ioctx, ref.oid, user, start_epoch, end_epoch);
13031 return r;
13032 }
13033
13034 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
13035 {
13036 librados::IoCtx index_ctx;
13037 string dir_oid;
13038
13039 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13040
13041 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
13042 if (r < 0)
13043 return r;
13044
13045 bufferlist updates;
13046
13047 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
13048 rgw_bucket_dir_entry entry;
13049 entry.key = *iter;
13050 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
13051 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13052 updates.append(CEPH_RGW_REMOVE | suggest_flag);
13053 ::encode(entry, updates);
13054 }
13055
13056 bufferlist out;
13057
13058 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
13059
13060 return r;
13061 }
13062
13063 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
13064 const RGWBucketInfo& bucket_info,
13065 rgw_bucket_dir_entry& list_state,
13066 rgw_bucket_dir_entry& object,
13067 bufferlist& suggested_updates)
13068 {
13069 const rgw_bucket& bucket = bucket_info.bucket;
13070 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13071
13072 std::string loc;
13073
13074 rgw_obj obj(bucket, list_state.key);
13075
13076 string oid;
13077 get_obj_bucket_and_oid_loc(obj, oid, loc);
13078
13079 if (loc != list_state.locator) {
13080 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
13081 }
13082
13083 io_ctx.locator_set_key(list_state.locator);
13084
13085 RGWObjState *astate = NULL;
13086 RGWObjectCtx rctx(this);
13087 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
13088 if (r < 0)
13089 return r;
13090
13091 list_state.pending_map.clear(); // we don't need this and it inflates size
13092 if (!astate->exists) {
13093 /* object doesn't exist right now -- hopefully because it's
13094 * marked as !exists and got deleted */
13095 if (list_state.exists) {
13096 /* FIXME: what should happen now? Work out if there are any
13097 * non-bad ways this could happen (there probably are, but annoying
13098 * to handle!) */
13099 }
13100 // encode a suggested removal of that key
13101 list_state.ver.epoch = io_ctx.get_last_version();
13102 list_state.ver.pool = io_ctx.get_id();
13103 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
13104 return -ENOENT;
13105 }
13106
13107 string etag;
13108 string content_type;
13109 ACLOwner owner;
13110
13111 object.meta.size = astate->size;
13112 object.meta.accounted_size = astate->accounted_size;
13113 object.meta.mtime = astate->mtime;
13114
13115 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
13116 if (iter != astate->attrset.end()) {
13117 etag = iter->second.c_str();
13118 }
13119 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
13120 if (iter != astate->attrset.end()) {
13121 content_type = iter->second.c_str();
13122 }
13123 iter = astate->attrset.find(RGW_ATTR_ACL);
13124 if (iter != astate->attrset.end()) {
13125 r = decode_policy(iter->second, &owner);
13126 if (r < 0) {
13127 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
13128 }
13129 }
13130
13131 if (astate->has_manifest) {
13132 RGWObjManifest::obj_iterator miter;
13133 RGWObjManifest& manifest = astate->manifest;
13134 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
13135 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
13136 rgw_obj loc;
13137 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
13138
13139 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
13140 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
13141 r = delete_obj_index(loc);
13142 if (r < 0) {
13143 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
13144 }
13145 }
13146 }
13147 }
13148
13149 object.meta.etag = etag;
13150 object.meta.content_type = content_type;
13151 object.meta.owner = owner.get_id().to_str();
13152 object.meta.owner_display_name = owner.get_display_name();
13153
13154 // encode suggested updates
13155 list_state.ver.pool = io_ctx.get_id();
13156 list_state.ver.epoch = astate->epoch;
13157 list_state.meta.size = object.meta.size;
13158 list_state.meta.accounted_size = object.meta.accounted_size;
13159 list_state.meta.mtime = object.meta.mtime;
13160 list_state.meta.category = main_category;
13161 list_state.meta.etag = etag;
13162 list_state.meta.content_type = content_type;
13163 if (astate->obj_tag.length() > 0)
13164 list_state.tag = astate->obj_tag.c_str();
13165 list_state.meta.owner = owner.get_id().to_str();
13166 list_state.meta.owner_display_name = owner.get_display_name();
13167
13168 list_state.exists = true;
13169 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
13170 return 0;
13171 }
13172
13173 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
13174 {
13175 librados::IoCtx index_ctx;
13176 map<int, string> oids;
13177 map<int, struct rgw_cls_list_ret> list_results;
13178 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13179 if (r < 0)
13180 return r;
13181
13182 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13183 if (r < 0)
13184 return r;
13185
13186 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13187 for(; iter != list_results.end(); ++iter) {
13188 headers[oids[iter->first]] = iter->second.dir.header;
13189 }
13190 return 0;
13191 }
13192
13193 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13194 {
13195 librados::IoCtx index_ctx;
13196 map<int, string> bucket_objs;
13197 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13198 if (r < 0)
13199 return r;
13200
13201 map<int, string>::iterator iter = bucket_objs.begin();
13202 for (; iter != bucket_objs.end(); ++iter) {
13203 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13204 if (r < 0) {
13205 ctx->put();
13206 break;
13207 } else {
13208 (*num_aio)++;
13209 }
13210 }
13211 return r;
13212 }
13213
13214 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13215 {
13216 string buckets_obj_id;
13217 rgw_get_buckets_obj(user_id, buckets_obj_id);
13218 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13219
13220 rgw_rados_ref ref;
13221 int r = get_raw_obj_ref(obj, &ref);
13222 if (r < 0) {
13223 return r;
13224 }
13225
13226 librados::ObjectReadOperation op;
13227 int rc;
13228 ::cls_user_get_header(op, header, &rc);
13229 bufferlist ibl;
13230 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13231 if (r < 0)
13232 return r;
13233 if (rc < 0)
13234 return rc;
13235
13236 return 0;
13237 }
13238
13239 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13240 {
13241 string buckets_obj_id;
13242 rgw_get_buckets_obj(user_id, buckets_obj_id);
13243 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13244
13245 rgw_rados_ref ref;
13246 int r = get_raw_obj_ref(obj, &ref);
13247 if (r < 0) {
13248 return r;
13249 }
13250
13251 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13252 if (r < 0)
13253 return r;
13254
13255 return 0;
13256 }
13257
13258 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13259 {
13260 map<string, struct rgw_bucket_dir_header> headers;
13261 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13262 if (r < 0) {
13263 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13264 return r;
13265 }
13266
13267 cls_user_bucket_entry entry;
13268
13269 bucket_info.bucket.convert(&entry.bucket);
13270
13271 for (const auto& hiter : headers) {
13272 for (const auto& iter : hiter.second.stats) {
13273 const struct rgw_bucket_category_stats& header_stats = iter.second;
13274 entry.size += header_stats.total_size;
13275 entry.size_rounded += header_stats.total_size_rounded;
13276 entry.count += header_stats.num_entries;
13277 }
13278 }
13279
13280 list<cls_user_bucket_entry> entries;
13281 entries.push_back(entry);
13282
13283 r = cls_user_update_buckets(user_obj, entries, false);
13284 if (r < 0) {
13285 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13286 return r;
13287 }
13288
13289 return 0;
13290 }
13291
13292 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13293 {
13294 map<string, struct rgw_bucket_dir_header> headers;
13295 RGWBucketInfo bucket_info;
13296 RGWObjectCtx obj_ctx(this);
13297 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13298 if (ret < 0) {
13299 return ret;
13300 }
13301
13302 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13303 if (ret < 0) {
13304 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13305 return ret;
13306 }
13307
13308 bucket.convert(&entry.bucket);
13309
13310 for (const auto& hiter : headers) {
13311 for (const auto& iter : hiter.second.stats) {
13312 const struct rgw_bucket_category_stats& header_stats = iter.second;
13313 entry.size += header_stats.total_size;
13314 entry.size_rounded += header_stats.total_size_rounded;
13315 entry.count += header_stats.num_entries;
13316 }
13317 }
13318
13319 return 0;
13320 }
13321
13322 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13323 const string& in_marker,
13324 const string& end_marker,
13325 const int max_entries,
13326 list<cls_user_bucket_entry>& entries,
13327 string * const out_marker,
13328 bool * const truncated)
13329 {
13330 rgw_rados_ref ref;
13331 int r = get_raw_obj_ref(obj, &ref);
13332 if (r < 0) {
13333 return r;
13334 }
13335
13336 librados::ObjectReadOperation op;
13337 int rc;
13338
13339 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13340 bufferlist ibl;
13341 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13342 if (r < 0)
13343 return r;
13344 if (rc < 0)
13345 return rc;
13346
13347 return 0;
13348 }
13349
13350 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13351 {
13352 rgw_rados_ref ref;
13353 int r = get_raw_obj_ref(obj, &ref);
13354 if (r < 0) {
13355 return r;
13356 }
13357
13358 librados::ObjectWriteOperation op;
13359 cls_user_set_buckets(op, entries, add);
13360 r = ref.ioctx.operate(ref.oid, &op);
13361 if (r < 0)
13362 return r;
13363
13364 return 0;
13365 }
13366
13367 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13368 {
13369 string buckets_obj_id;
13370 rgw_get_buckets_obj(user_id, buckets_obj_id);
13371 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13372 return cls_user_complete_stats_sync(obj);
13373 }
13374
13375 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13376 {
13377 rgw_rados_ref ref;
13378 int r = get_raw_obj_ref(obj, &ref);
13379 if (r < 0) {
13380 return r;
13381 }
13382
13383 librados::ObjectWriteOperation op;
13384 ::cls_user_complete_stats_sync(op);
13385 r = ref.ioctx.operate(ref.oid, &op);
13386 if (r < 0)
13387 return r;
13388
13389 return 0;
13390 }
13391
13392 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13393 {
13394 list<cls_user_bucket_entry> l;
13395 l.push_back(entry);
13396
13397 return cls_user_update_buckets(obj, l, true);
13398 }
13399
13400 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13401 {
13402 rgw_rados_ref ref;
13403 int r = get_system_obj_ref(obj, &ref);
13404 if (r < 0) {
13405 return r;
13406 }
13407
13408 librados::ObjectWriteOperation op;
13409 ::cls_user_remove_bucket(op, bucket);
13410 r = ref.ioctx.operate(ref.oid, &op);
13411 if (r < 0)
13412 return r;
13413
13414 return 0;
13415 }
13416
13417 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
13418 RGWQuotaInfo& bucket_quota)
13419 {
13420 if (!cct->_conf->rgw_dynamic_resharding) {
13421 return 0;
13422 }
13423
13424 bool need_resharding = false;
13425 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13426 uint32_t suggested_num_shards;
13427
13428 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13429 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13430 1, need_resharding, &suggested_num_shards);
13431 if (ret < 0) {
13432 return ret;
13433 }
13434
13435 if (need_resharding) {
13436 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13437 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13438 dendl;
13439 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13440 }
13441
13442 return ret;
13443 }
13444
13445 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13446 {
13447 RGWReshard reshard(this);
13448
13449 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13450
13451 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13452 if (new_num_shards <= num_source_shards) {
13453 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13454 return 0;
13455 }
13456
13457 cls_rgw_reshard_entry entry;
13458 entry.time = real_clock::now();
13459 entry.tenant = bucket_info.owner.tenant;
13460 entry.bucket_name = bucket_info.bucket.name;
13461 entry.bucket_id = bucket_info.bucket.bucket_id;
13462 entry.old_num_shards = num_source_shards;
13463 entry.new_num_shards = new_num_shards;
13464
13465 return reshard.add(entry);
13466 }
13467
13468 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13469 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13470 {
13471 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13472 }
13473
13474 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13475 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
13476 {
13477 if (!num_shards) {
13478 bucket_objects[0] = bucket_oid_base;
13479 } else {
13480 char buf[bucket_oid_base.size() + 32];
13481 if (shard_id < 0) {
13482 for (uint32_t i = 0; i < num_shards; ++i) {
13483 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13484 bucket_objects[i] = buf;
13485 }
13486 } else {
13487 if ((uint32_t)shard_id > num_shards) {
13488 return;
13489 }
13490 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13491 bucket_objects[shard_id] = buf;
13492 }
13493 }
13494 }
13495
13496 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13497 {
13498 const rgw_bucket& bucket = bucket_info.bucket;
13499 string plain_id = bucket.name + ":" + bucket.bucket_id;
13500 if (!bucket_info.num_shards) {
13501 (*result)[0] = plain_id;
13502 } else {
13503 char buf[16];
13504 if (shard_id < 0) {
13505 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13506 snprintf(buf, sizeof(buf), ":%d", i);
13507 (*result)[i] = plain_id + buf;
13508 }
13509 } else {
13510 if ((uint32_t)shard_id > bucket_info.num_shards) {
13511 return;
13512 }
13513 snprintf(buf, sizeof(buf), ":%d", shard_id);
13514 (*result)[shard_id] = plain_id + buf;
13515 }
13516 }
13517 }
13518
13519 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13520 int *shard_id)
13521 {
13522 int r = 0;
13523 switch (bucket_info.bucket_index_shard_hash_type) {
13524 case RGWBucketInfo::MOD:
13525 if (!bucket_info.num_shards) {
13526 if (shard_id) {
13527 *shard_id = -1;
13528 }
13529 } else {
13530 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13531 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13532 sid = rgw_shards_mod(sid2, bucket_info.num_shards);
13533 if (shard_id) {
13534 *shard_id = (int)sid;
13535 }
13536 }
13537 break;
13538 default:
13539 r = -ENOTSUP;
13540 }
13541 return r;
13542 }
13543
13544 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13545 int shard_id, string *bucket_obj)
13546 {
13547 if (!num_shards) {
13548 // By default with no sharding, we use the bucket oid as itself
13549 (*bucket_obj) = bucket_oid_base;
13550 } else {
13551 char buf[bucket_oid_base.size() + 32];
13552 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13553 (*bucket_obj) = buf;
13554 }
13555 }
13556
13557 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13558 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13559 {
13560 int r = 0;
13561 switch (hash_type) {
13562 case RGWBucketInfo::MOD:
13563 if (!num_shards) {
13564 // By default with no sharding, we use the bucket oid as itself
13565 (*bucket_obj) = bucket_oid_base;
13566 if (shard_id) {
13567 *shard_id = -1;
13568 }
13569 } else {
13570 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13571 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13572 sid = rgw_shards_mod(sid2, num_shards);
13573 char buf[bucket_oid_base.size() + 32];
13574 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13575 (*bucket_obj) = buf;
13576 if (shard_id) {
13577 *shard_id = (int)sid;
13578 }
13579 }
13580 break;
13581 default:
13582 r = -ENOTSUP;
13583 }
13584 return r;
13585 }
13586
13587 void RGWStateLog::oid_str(int shard, string& oid) {
13588 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13589 char buf[16];
13590 snprintf(buf, sizeof(buf), "%d", shard);
13591 oid += buf;
13592 }
13593
13594 int RGWStateLog::get_shard_num(const string& object) {
13595 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13596 return val % num_shards;
13597 }
13598
13599 string RGWStateLog::get_oid(const string& object) {
13600 int shard = get_shard_num(object);
13601 string oid;
13602 oid_str(shard, oid);
13603 return oid;
13604 }
13605
13606 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13607 rgw_pool pool;
13608 store->get_log_pool(pool);
13609 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13610 if (r < 0) {
13611 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13612 return r;
13613 }
13614 return 0;
13615 }
13616
13617 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13618 uint32_t state, bufferlist *bl, uint32_t *check_state)
13619 {
13620 if (client_id.empty() ||
13621 op_id.empty() ||
13622 object.empty()) {
13623 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13624 }
13625
13626 librados::IoCtx ioctx;
13627 int r = open_ioctx(ioctx);
13628 if (r < 0)
13629 return r;
13630
13631 string oid = get_oid(object);
13632
13633 librados::ObjectWriteOperation op;
13634 if (check_state) {
13635 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
13636 }
13637 utime_t ts = ceph_clock_now();
13638 bufferlist nobl;
13639 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
13640 r = ioctx.operate(oid, &op);
13641 if (r < 0) {
13642 return r;
13643 }
13644
13645 return 0;
13646 }
13647
13648 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
13649 {
13650 if (client_id.empty() ||
13651 op_id.empty() ||
13652 object.empty()) {
13653 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13654 }
13655
13656 librados::IoCtx ioctx;
13657 int r = open_ioctx(ioctx);
13658 if (r < 0)
13659 return r;
13660
13661 string oid = get_oid(object);
13662
13663 librados::ObjectWriteOperation op;
13664 cls_statelog_remove_by_object(op, object, op_id);
13665 r = ioctx.operate(oid, &op);
13666 if (r < 0) {
13667 return r;
13668 }
13669
13670 return 0;
13671 }
13672
13673 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
13674 void **handle)
13675 {
13676 list_state *state = new list_state;
13677 state->client_id = client_id;
13678 state->op_id = op_id;
13679 state->object = object;
13680 if (object.empty()) {
13681 state->cur_shard = 0;
13682 state->max_shard = num_shards - 1;
13683 } else {
13684 state->cur_shard = state->max_shard = get_shard_num(object);
13685 }
13686 *handle = (void *)state;
13687 }
13688
13689 int RGWStateLog::list_entries(void *handle, int max_entries,
13690 list<cls_statelog_entry>& entries,
13691 bool *done)
13692 {
13693 list_state *state = static_cast<list_state *>(handle);
13694
13695 librados::IoCtx ioctx;
13696 int r = open_ioctx(ioctx);
13697 if (r < 0)
13698 return r;
13699
13700 entries.clear();
13701
13702 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
13703 string oid;
13704 oid_str(state->cur_shard, oid);
13705
13706 librados::ObjectReadOperation op;
13707 list<cls_statelog_entry> ents;
13708 bool truncated;
13709 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
13710 max_entries, ents, &state->marker, &truncated);
13711 bufferlist ibl;
13712 r = ioctx.operate(oid, &op, &ibl);
13713 if (r == -ENOENT) {
13714 truncated = false;
13715 r = 0;
13716 }
13717 if (r < 0) {
13718 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
13719 return r;
13720 }
13721
13722 if (!truncated) {
13723 state->marker.clear();
13724 }
13725
13726 max_entries -= ents.size();
13727
13728 entries.splice(entries.end(), ents);
13729
13730 if (truncated)
13731 break;
13732 }
13733
13734 *done = (state->cur_shard > state->max_shard);
13735
13736 return 0;
13737 }
13738
13739 void RGWStateLog::finish_list_entries(void *handle)
13740 {
13741 list_state *state = static_cast<list_state *>(handle);
13742 delete state;
13743 }
13744
13745 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
13746 {
13747 f->open_object_section("statelog_entry");
13748 f->dump_string("client_id", entry.client_id);
13749 f->dump_string("op_id", entry.op_id);
13750 f->dump_string("object", entry.object);
13751 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
13752 if (!dump_entry_internal(entry, f)) {
13753 f->dump_int("state", entry.state);
13754 }
13755 f->close_section();
13756 }
13757
13758 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
13759 {
13760 }
13761
13762 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
13763 {
13764 string s;
13765 switch ((OpState)entry.state) {
13766 case OPSTATE_UNKNOWN:
13767 s = "unknown";
13768 break;
13769 case OPSTATE_IN_PROGRESS:
13770 s = "in-progress";
13771 break;
13772 case OPSTATE_COMPLETE:
13773 s = "complete";
13774 break;
13775 case OPSTATE_ERROR:
13776 s = "error";
13777 break;
13778 case OPSTATE_ABORT:
13779 s = "abort";
13780 break;
13781 case OPSTATE_CANCELLED:
13782 s = "cancelled";
13783 break;
13784 default:
13785 s = "invalid";
13786 }
13787 f->dump_string("state", s);
13788 return true;
13789 }
13790
13791 int RGWOpState::state_from_str(const string& s, OpState *state)
13792 {
13793 if (s == "unknown") {
13794 *state = OPSTATE_UNKNOWN;
13795 } else if (s == "in-progress") {
13796 *state = OPSTATE_IN_PROGRESS;
13797 } else if (s == "complete") {
13798 *state = OPSTATE_COMPLETE;
13799 } else if (s == "error") {
13800 *state = OPSTATE_ERROR;
13801 } else if (s == "abort") {
13802 *state = OPSTATE_ABORT;
13803 } else if (s == "cancelled") {
13804 *state = OPSTATE_CANCELLED;
13805 } else {
13806 return -EINVAL;
13807 }
13808
13809 return 0;
13810 }
13811
13812 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
13813 {
13814 uint32_t s = (uint32_t)state;
13815 return store_entry(client_id, op_id, object, s, NULL, NULL);
13816 }
13817
13818 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
13819 {
13820 uint32_t s = (uint32_t)state;
13821 return store_entry(client_id, op_id, object, s, NULL, &s);
13822 }
13823
13824 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
13825 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
13826 {
13827 cct = store->ctx();
13828 cur_state = RGWOpState::OPSTATE_UNKNOWN;
13829 }
13830
13831 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
13832 last_update = real_clock::now();
13833 cur_state = state;
13834 return os.set_state(client_id, op_id, object, state);
13835 }
13836
13837 int RGWOpStateSingleOp::renew_state() {
13838 real_time now = real_clock::now();
13839
13840 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13841
13842 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13843 return 0;
13844 }
13845
13846 last_update = now;
13847 return os.renew_state(client_id, op_id, object, cur_state);
13848 }
13849
13850
13851 uint64_t RGWRados::instance_id()
13852 {
13853 return get_rados_handle()->get_instance_id();
13854 }
13855
13856 uint64_t RGWRados::next_bucket_id()
13857 {
13858 Mutex::Locker l(bucket_id_lock);
13859 return ++max_bucket_id;
13860 }
13861
13862 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
13863 {
13864 int use_cache = cct->_conf->rgw_cache_enabled;
13865 RGWRados *store = NULL;
13866 if (!use_cache) {
13867 store = new RGWRados;
13868 } else {
13869 store = new RGWCache<RGWRados>;
13870 }
13871
13872 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
13873 delete store;
13874 return NULL;
13875 }
13876
13877 return store;
13878 }
13879
13880 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13881 {
13882 RGWRados *store = NULL;
13883 store = new RGWRados;
13884
13885 store->set_context(cct);
13886
13887 if (store->init_rados() < 0) {
13888 delete store;
13889 return NULL;
13890 }
13891
13892 return store;
13893 }
13894
13895 void RGWStoreManager::close_storage(RGWRados *store)
13896 {
13897 if (!store)
13898 return;
13899
13900 store->finalize();
13901
13902 delete store;
13903 }
13904
13905 librados::Rados* RGWRados::get_rados_handle()
13906 {
13907 if (rados.size() == 1) {
13908 return &rados[0];
13909 } else {
13910 handle_lock.get_read();
13911 pthread_t id = pthread_self();
13912 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13913
13914 if (it != rados_map.end()) {
13915 handle_lock.put_read();
13916 return &rados[it->second];
13917 } else {
13918 handle_lock.put_read();
13919 handle_lock.get_write();
13920 const uint32_t handle = next_rados_handle;
13921 rados_map[id] = handle;
13922 if (++next_rados_handle == rados.size()) {
13923 next_rados_handle = 0;
13924 }
13925 handle_lock.put_write();
13926 return &rados[handle];
13927 }
13928 }
13929 }
13930
13931 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
13932 {
13933 rgw_rados_ref ref;
13934 int ret = get_raw_obj_ref(obj, &ref);
13935 if (ret < 0) {
13936 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13937 return ret;
13938 }
13939
13940 ObjectWriteOperation op;
13941 list<string> prefixes;
13942 cls_rgw_remove_obj(op, prefixes);
13943
13944 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13945 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13946 if (ret < 0) {
13947 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13948 c->release();
13949 return ret;
13950 }
13951
13952 handles.push_back(c);
13953
13954 return 0;
13955 }
13956
13957 int RGWRados::delete_obj_aio(const rgw_obj& obj,
13958 RGWBucketInfo& bucket_info, RGWObjState *astate,
13959 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
13960 {
13961 rgw_rados_ref ref;
13962 int ret = get_obj_head_ref(bucket_info, obj, &ref);
13963 if (ret < 0) {
13964 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13965 return ret;
13966 }
13967
13968 if (keep_index_consistent) {
13969 RGWRados::Bucket bop(this, bucket_info);
13970 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
13971
13972 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
13973 if (ret < 0) {
13974 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
13975 return ret;
13976 }
13977 }
13978
13979 ObjectWriteOperation op;
13980 list<string> prefixes;
13981 cls_rgw_remove_obj(op, prefixes);
13982
13983 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13984 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13985 if (ret < 0) {
13986 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13987 c->release();
13988 return ret;
13989 }
13990
13991 handles.push_back(c);
13992
13993 if (keep_index_consistent) {
13994 ret = delete_obj_index(obj);
13995 if (ret < 0) {
13996 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
13997 return ret;
13998 }
13999 }
14000 return ret;
14001 }
14002
14003 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
14004 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
14005 if (value != attrs.end()) {
14006 bufferlist::iterator bliter = value->second.begin();
14007 try {
14008 ::decode(cs_info, bliter);
14009 } catch (buffer::error& err) {
14010 return -EIO;
14011 }
14012 if (cs_info.blocks.size() == 0) {
14013 return -EIO;
14014 }
14015 if (cs_info.compression_type != "none")
14016 need_decompress = true;
14017 else
14018 need_decompress = false;
14019 return 0;
14020 } else {
14021 need_decompress = false;
14022 return 0;
14023 }
14024 }
14025