]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
d21e0a5631b7c362dbbfc7fa5b9b040860e5fe2c
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <boost/algorithm/string.hpp>
9
10 #include <boost/format.hpp>
11 #include <boost/optional.hpp>
12 #include <boost/utility/in_place_factory.hpp>
13
14 #include "common/ceph_json.h"
15 #include "common/utf8.h"
16
17 #include "common/errno.h"
18 #include "common/Formatter.h"
19 #include "common/Throttle.h"
20 #include "common/Finisher.h"
21
22 #include "rgw_rados.h"
23 #include "rgw_cache.h"
24 #include "rgw_acl.h"
25 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26 #include "rgw_metadata.h"
27 #include "rgw_bucket.h"
28 #include "rgw_rest_conn.h"
29 #include "rgw_cr_rados.h"
30 #include "rgw_cr_rest.h"
31
32 #include "cls/rgw/cls_rgw_ops.h"
33 #include "cls/rgw/cls_rgw_types.h"
34 #include "cls/rgw/cls_rgw_client.h"
35 #include "cls/rgw/cls_rgw_const.h"
36 #include "cls/refcount/cls_refcount_client.h"
37 #include "cls/version/cls_version_client.h"
38 #include "cls/log/cls_log_client.h"
39 #include "cls/statelog/cls_statelog_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43
44 #include "rgw_tools.h"
45 #include "rgw_coroutine.h"
46 #include "rgw_compression.h"
47
48 #undef fork // fails to compile RGWPeriod::fork() below
49
50 #include "common/Clock.h"
51
52 #include "include/rados/librados.hpp"
53 using namespace librados;
54
55 #include <string>
56 #include <iostream>
57 #include <vector>
58 #include <atomic>
59 #include <list>
60 #include <map>
61 #include "auth/Crypto.h" // get_random_bytes()
62
63 #include "rgw_log.h"
64
65 #include "rgw_gc.h"
66 #include "rgw_lc.h"
67
68 #include "rgw_object_expirer_core.h"
69 #include "rgw_sync.h"
70 #include "rgw_data_sync.h"
71 #include "rgw_realm_watcher.h"
72 #include "rgw_reshard.h"
73
74 #include "compressor/Compressor.h"
75
76 #define dout_context g_ceph_context
77 #define dout_subsys ceph_subsys_rgw
78
79 using namespace std;
80
81 static string notify_oid_prefix = "notify";
82 static string *notify_oids = NULL;
83 static string shadow_ns = "shadow";
84 static string dir_oid_prefix = ".dir.";
85 static string default_storage_pool_suffix = "rgw.buckets.data";
86 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
87 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
88 static string avail_pools = ".pools.avail";
89
90 static string zone_info_oid_prefix = "zone_info.";
91 static string zone_names_oid_prefix = "zone_names.";
92 static string region_info_oid_prefix = "region_info.";
93 static string zone_group_info_oid_prefix = "zonegroup_info.";
94 static string realm_names_oid_prefix = "realms_names.";
95 static string realm_info_oid_prefix = "realms.";
96 static string default_region_info_oid = "default.region";
97 static string default_zone_group_info_oid = "default.zonegroup";
98 static string period_info_oid_prefix = "periods.";
99 static string period_latest_epoch_info_oid = ".latest_epoch";
100 static string region_map_oid = "region_map";
101 static string zonegroup_map_oid = "zonegroup_map";
102 static string log_lock_name = "rgw_log_lock";
103 static string default_realm_info_oid = "default.realm";
104 const string default_zonegroup_name = "default";
105 const string default_zone_name = "default";
106 static string zonegroup_names_oid_prefix = "zonegroups_names.";
107 static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
108 #define RGW_USAGE_OBJ_PREFIX "usage."
109 #define FIRST_EPOCH 1
110 static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
111 static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
112 static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
113 static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
114
115 #define RGW_STATELOG_OBJ_PREFIX "statelog."
116
117 #define dout_subsys ceph_subsys_rgw
118
119
120 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
121 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
122 {
123 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
124 RGWZonePlacementInfo placement;
125 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
126 return false;
127 }
128
129 if (!obj.in_extra_data) {
130 *pool = placement.data_pool;
131 } else {
132 *pool = placement.get_data_extra_pool();
133 }
134 }
135
136 return true;
137 }
138
139 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
140 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
141 {
142 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
143
144 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
145 }
146
147 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
148 {
149 if (!is_raw) {
150 rgw_raw_obj r;
151 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
152 return r;
153 }
154 return raw_obj;
155 }
156
157 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
158 {
159 if (!is_raw) {
160 rgw_raw_obj r;
161 store->obj_to_raw(placement_rule, obj, &r);
162 return r;
163 }
164 return raw_obj;
165 }
166
167 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
168 {
169 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
170 if (r == -ENOENT && create) {
171 r = rados->pool_create(pool.name.c_str());
172 if (r < 0 && r != -EEXIST) {
173 return r;
174 }
175
176 r = rados->ioctx_create(pool.name.c_str(), ioctx);
177 }
178 if (r < 0) {
179 return r;
180 }
181 if (!pool.ns.empty()) {
182 ioctx.set_namespace(pool.ns);
183 }
184 return 0;
185 }
186
187 template<>
188 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
189 RWLock::WLocker wl(lock);
190 auto iter = objs_state.find(obj);
191 if (iter == objs_state.end()) {
192 return;
193 }
194 bool is_atomic = iter->second.is_atomic;
195 bool prefetch_data = iter->second.prefetch_data;
196
197 objs_state.erase(iter);
198
199 if (is_atomic || prefetch_data) {
200 auto& s = objs_state[obj];
201 s.is_atomic = is_atomic;
202 s.prefetch_data = prefetch_data;
203 }
204 }
205
206 template<>
207 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
208 RWLock::WLocker wl(lock);
209 auto iter = objs_state.find(obj);
210 if (iter == objs_state.end()) {
211 return;
212 }
213
214 objs_state.erase(iter);
215 }
216
217 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
218 encode_json("default_zonegroup", default_zonegroup, f);
219 }
220
221 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
222
223 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
224 /* backward compatability with region */
225 if (default_zonegroup.empty()) {
226 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
227 }
228 }
229
230 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
231 {
232 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
233 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
234 }
235
236 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
237 }
238
239 int RGWZoneGroup::create_default(bool old_format)
240 {
241 name = default_zonegroup_name;
242 is_master = true;
243
244 RGWZoneGroupPlacementTarget placement_target;
245 placement_target.name = "default-placement";
246 placement_targets[placement_target.name] = placement_target;
247 default_placement = "default-placement";
248
249 RGWZoneParams zone_params(default_zone_name);
250
251 int r = zone_params.init(cct, store, false);
252 if (r < 0) {
253 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
254 return r;
255 }
256
257 r = zone_params.create_default();
258 if (r < 0 && r != -EEXIST) {
259 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
260 return r;
261 } else if (r == -EEXIST) {
262 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
263 zone_params.clear_id();
264 r = zone_params.init(cct, store);
265 if (r < 0) {
266 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
267 return r;
268 }
269 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
270 << dendl;
271 }
272
273 RGWZone& default_zone = zones[zone_params.get_id()];
274 default_zone.name = zone_params.get_name();
275 default_zone.id = zone_params.get_id();
276 master_zone = default_zone.id;
277
278 r = create();
279 if (r < 0 && r != -EEXIST) {
280 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
281 return r;
282 }
283
284 if (r == -EEXIST) {
285 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
286 id.clear();
287 r = init(cct, store);
288 if (r < 0) {
289 return r;
290 }
291 }
292
293 if (old_format) {
294 name = id;
295 }
296
297 post_process_params();
298
299 return 0;
300 }
301
302 const string RGWZoneGroup::get_default_oid(bool old_region_format)
303 {
304 if (old_region_format) {
305 if (cct->_conf->rgw_default_region_info_oid.empty()) {
306 return default_region_info_oid;
307 }
308 return cct->_conf->rgw_default_region_info_oid;
309 }
310
311 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
312
313 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
314 default_oid = default_zone_group_info_oid;
315 }
316
317 default_oid += "." + realm_id;
318
319 return default_oid;
320 }
321
322 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
323 {
324 if (old_region_format) {
325 return region_info_oid_prefix;
326 }
327 return zone_group_info_oid_prefix;
328 }
329
330 const string& RGWZoneGroup::get_names_oid_prefix()
331 {
332 return zonegroup_names_oid_prefix;
333 }
334
335 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
336 return cct->_conf->rgw_zonegroup;
337 }
338
339 int RGWZoneGroup::equals(const string& other_zonegroup) const
340 {
341 if (is_master && other_zonegroup.empty())
342 return true;
343
344 return (id == other_zonegroup);
345 }
346
347 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
348 const list<string>& endpoints, const string *ptier_type,
349 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
350 {
351 auto& zone_id = zone_params.get_id();
352 auto& zone_name = zone_params.get_name();
353
354 // check for duplicate zone name on insert
355 if (!zones.count(zone_id)) {
356 for (const auto& zone : zones) {
357 if (zone.second.name == zone_name) {
358 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
359 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
360 return -EEXIST;
361 }
362 }
363 }
364
365 if (is_master) {
366 if (*is_master) {
367 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
368 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
369 }
370 master_zone = zone_params.get_id();
371 } else if (master_zone == zone_params.get_id()) {
372 master_zone.clear();
373 }
374 }
375
376 RGWZone& zone = zones[zone_params.get_id()];
377 zone.name = zone_params.get_name();
378 zone.id = zone_params.get_id();
379 if (!endpoints.empty()) {
380 zone.endpoints = endpoints;
381 }
382 if (read_only) {
383 zone.read_only = *read_only;
384 }
385 if (ptier_type) {
386 zone.tier_type = *ptier_type;
387 }
388
389 if (psync_from_all) {
390 zone.sync_from_all = *psync_from_all;
391 }
392
393 for (auto add : sync_from) {
394 zone.sync_from.insert(add);
395 }
396
397 for (auto rm : sync_from_rm) {
398 zone.sync_from.erase(rm);
399 }
400
401 post_process_params();
402
403 return update();
404 }
405
406
407 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
408 {
409 RGWZone& zone = zones[zone_params.get_id()];
410 zone.name = zone_params.get_name();
411
412 return update();
413 }
414
415 void RGWZoneGroup::post_process_params()
416 {
417 bool log_data = zones.size() > 1;
418
419 if (master_zone.empty()) {
420 map<string, RGWZone>::iterator iter = zones.begin();
421 if (iter != zones.end()) {
422 master_zone = iter->first;
423 }
424 }
425
426 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
427 RGWZone& zone = iter->second;
428 zone.log_data = log_data;
429 zone.log_meta = (is_master && zone.id == master_zone);
430
431 RGWZoneParams zone_params(zone.id, zone.name);
432 int ret = zone_params.init(cct, store);
433 if (ret < 0) {
434 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
435 continue;
436 }
437
438 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
439 iter != zone_params.placement_pools.end(); ++iter) {
440 const string& placement_name = iter->first;
441 if (placement_targets.find(placement_name) == placement_targets.end()) {
442 RGWZoneGroupPlacementTarget placement_target;
443 placement_target.name = placement_name;
444 placement_targets[placement_name] = placement_target;
445 }
446 }
447 }
448
449 if (default_placement.empty() && !placement_targets.empty()) {
450 default_placement = placement_targets.begin()->first;
451 }
452 }
453
454 int RGWZoneGroup::remove_zone(const std::string& zone_id)
455 {
456 map<string, RGWZone>::iterator iter = zones.find(zone_id);
457 if (iter == zones.end()) {
458 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
459 << name << dendl;
460 return -ENOENT;
461 }
462
463 zones.erase(iter);
464
465 post_process_params();
466
467 return update();
468 }
469
470 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
471 {
472 if (realm_id.empty()) {
473 /* try using default realm */
474 RGWRealm realm;
475 int ret = realm.init(cct, store);
476 if (ret < 0) {
477 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
478 return -ENOENT;
479 }
480 realm_id = realm.get_id();
481 }
482
483 return RGWSystemMetaObj::read_default_id(default_id, old_format);
484 }
485
486 int RGWZoneGroup::set_as_default(bool exclusive)
487 {
488 if (realm_id.empty()) {
489 /* try using default realm */
490 RGWRealm realm;
491 int ret = realm.init(cct, store);
492 if (ret < 0) {
493 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
494 return -EINVAL;
495 }
496 realm_id = realm.get_id();
497 }
498
499 return RGWSystemMetaObj::set_as_default(exclusive);
500 }
501
502 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
503 {
504 cct = _cct;
505 store = _store;
506
507 if (!setup_obj)
508 return 0;
509
510 if (old_format && id.empty()) {
511 id = name;
512 }
513
514 if (id.empty()) {
515 int r;
516 if (name.empty()) {
517 name = get_predefined_name(cct);
518 }
519 if (name.empty()) {
520 r = use_default(old_format);
521 if (r < 0) {
522 return r;
523 }
524 } else if (!old_format) {
525 r = read_id(name, id);
526 if (r < 0) {
527 if (r != -ENOENT) {
528 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
529 }
530 return r;
531 }
532 }
533 }
534
535 return read_info(id, old_format);
536 }
537
538 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
539 {
540 auto pool = get_pool(cct);
541 bufferlist bl;
542 RGWObjectCtx obj_ctx(store);
543 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
544 if (ret < 0)
545 return ret;
546
547 try {
548 bufferlist::iterator iter = bl.begin();
549 ::decode(default_info, iter);
550 } catch (buffer::error& err) {
551 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
552 return -EIO;
553 }
554
555 return 0;
556 }
557
558 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
559 {
560 RGWDefaultSystemMetaObjInfo default_info;
561
562 int ret = read_default(default_info, get_default_oid(old_format));
563 if (ret < 0) {
564 return ret;
565 }
566
567 default_id = default_info.default_id;
568
569 return 0;
570 }
571
572 int RGWSystemMetaObj::use_default(bool old_format)
573 {
574 return read_default_id(id, old_format);
575 }
576
577 int RGWSystemMetaObj::set_as_default(bool exclusive)
578 {
579 string oid = get_default_oid();
580
581 rgw_pool pool(get_pool(cct));
582 bufferlist bl;
583
584 RGWDefaultSystemMetaObjInfo default_info;
585 default_info.default_id = id;
586
587 ::encode(default_info, bl);
588
589 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
590 exclusive, NULL, real_time(), NULL);
591 if (ret < 0)
592 return ret;
593
594 return 0;
595 }
596
597 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
598 {
599 rgw_pool pool(get_pool(cct));
600 bufferlist bl;
601
602 string oid = get_names_oid_prefix() + obj_name;
603
604 RGWObjectCtx obj_ctx(store);
605 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
606 if (ret < 0) {
607 return ret;
608 }
609
610 RGWNameToId nameToId;
611 try {
612 bufferlist::iterator iter = bl.begin();
613 ::decode(nameToId, iter);
614 } catch (buffer::error& err) {
615 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
616 return -EIO;
617 }
618 object_id = nameToId.obj_id;
619 return 0;
620 }
621
622 int RGWSystemMetaObj::delete_obj(bool old_format)
623 {
624 rgw_pool pool(get_pool(cct));
625
626 /* check to see if obj is the default */
627 RGWDefaultSystemMetaObjInfo default_info;
628 int ret = read_default(default_info, get_default_oid(old_format));
629 if (ret < 0 && ret != -ENOENT)
630 return ret;
631 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
632 string oid = get_default_oid(old_format);
633 rgw_raw_obj default_named_obj(pool, oid);
634 ret = store->delete_system_obj(default_named_obj);
635 if (ret < 0) {
636 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
637 return ret;
638 }
639 }
640 if (!old_format) {
641 string oid = get_names_oid_prefix() + name;
642 rgw_raw_obj object_name(pool, oid);
643 ret = store->delete_system_obj(object_name);
644 if (ret < 0) {
645 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
646 return ret;
647 }
648 }
649
650 string oid = get_info_oid_prefix(old_format);
651 if (old_format) {
652 oid += name;
653 } else {
654 oid += id;
655 }
656
657 rgw_raw_obj object_id(pool, oid);
658 ret = store->delete_system_obj(object_id);
659 if (ret < 0) {
660 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
661 }
662
663 return ret;
664 }
665
666 int RGWSystemMetaObj::store_name(bool exclusive)
667 {
668 rgw_pool pool(get_pool(cct));
669 string oid = get_names_oid_prefix() + name;
670
671 RGWNameToId nameToId;
672 nameToId.obj_id = id;
673
674 bufferlist bl;
675 ::encode(nameToId, bl);
676 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
677 }
678
679 int RGWSystemMetaObj::rename(const string& new_name)
680 {
681 string new_id;
682 int ret = read_id(new_name, new_id);
683 if (!ret) {
684 return -EEXIST;
685 }
686 if (ret < 0 && ret != -ENOENT) {
687 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
688 return ret;
689 }
690 string old_name = name;
691 name = new_name;
692 ret = update();
693 if (ret < 0) {
694 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
695 return ret;
696 }
697 ret = store_name(true);
698 if (ret < 0) {
699 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
700 return ret;
701 }
702 /* delete old name */
703 rgw_pool pool(get_pool(cct));
704 string oid = get_names_oid_prefix() + old_name;
705 rgw_raw_obj old_name_obj(pool, oid);
706 ret = store->delete_system_obj(old_name_obj);
707 if (ret < 0) {
708 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
709 return ret;
710 }
711
712 return ret;
713 }
714
715 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
716 {
717 rgw_pool pool(get_pool(cct));
718
719 bufferlist bl;
720
721 string oid = get_info_oid_prefix(old_format) + obj_id;
722
723 RGWObjectCtx obj_ctx(store);
724 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
725 if (ret < 0) {
726 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
727 return ret;
728 }
729
730 try {
731 bufferlist::iterator iter = bl.begin();
732 ::decode(*this, iter);
733 } catch (buffer::error& err) {
734 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
735 return -EIO;
736 }
737
738 return 0;
739 }
740
741 int RGWSystemMetaObj::read()
742 {
743 int ret = read_id(name, id);
744 if (ret < 0) {
745 return ret;
746 }
747
748 return read_info(id);
749 }
750
751 int RGWSystemMetaObj::create(bool exclusive)
752 {
753 int ret;
754
755 /* check to see the name is not used */
756 ret = read_id(name, id);
757 if (exclusive && ret == 0) {
758 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
759 return -EEXIST;
760 } else if ( ret < 0 && ret != -ENOENT) {
761 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
762 return ret;
763 }
764
765 if (id.empty()) {
766 /* create unique id */
767 uuid_d new_uuid;
768 char uuid_str[37];
769 new_uuid.generate_random();
770 new_uuid.print(uuid_str);
771 id = uuid_str;
772 }
773
774 ret = store_info(exclusive);
775 if (ret < 0) {
776 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
777 return ret;
778 }
779
780 return store_name(exclusive);
781 }
782
783 int RGWSystemMetaObj::store_info(bool exclusive)
784 {
785 rgw_pool pool(get_pool(cct));
786
787 string oid = get_info_oid_prefix() + id;
788
789 bufferlist bl;
790 ::encode(*this, bl);
791 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
792 }
793
794 int RGWSystemMetaObj::write(bool exclusive)
795 {
796 int ret = store_info(exclusive);
797 if (ret < 0) {
798 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
799 return ret;
800 }
801 ret = store_name(exclusive);
802 if (ret < 0) {
803 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
804 return ret;
805 }
806 return 0;
807 }
808
809
810 const string& RGWRealm::get_predefined_name(CephContext *cct) {
811 return cct->_conf->rgw_realm;
812 }
813
814 int RGWRealm::create(bool exclusive)
815 {
816 int ret = RGWSystemMetaObj::create(exclusive);
817 if (ret < 0) {
818 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
819 return ret;
820 }
821 // create the control object for watch/notify
822 ret = create_control(exclusive);
823 if (ret < 0) {
824 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
825 return ret;
826 }
827 RGWPeriod period;
828 if (current_period.empty()) {
829 /* create new period for the realm */
830 ret = period.init(cct, store, id, name, false);
831 if (ret < 0 ) {
832 return ret;
833 }
834 ret = period.create(true);
835 if (ret < 0) {
836 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
837 return ret;
838 }
839 } else {
840 period = RGWPeriod(current_period, 0);
841 int ret = period.init(cct, store, id, name);
842 if (ret < 0) {
843 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
844 return ret;
845 }
846 }
847 ret = set_current_period(period);
848 if (ret < 0) {
849 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
850 return ret;
851 }
852 // try to set as default. may race with another create, so pass exclusive=true
853 // so we don't override an existing default
854 ret = set_as_default(true);
855 if (ret < 0 && ret != -EEXIST) {
856 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
857 }
858
859 return 0;
860 }
861
862 int RGWRealm::delete_obj()
863 {
864 int ret = RGWSystemMetaObj::delete_obj();
865 if (ret < 0) {
866 return ret;
867 }
868 return delete_control();
869 }
870
871 int RGWRealm::create_control(bool exclusive)
872 {
873 auto pool = rgw_pool{get_pool(cct)};
874 auto oid = get_control_oid();
875 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
876 nullptr, real_time(), nullptr);
877 }
878
879 int RGWRealm::delete_control()
880 {
881 auto pool = rgw_pool{get_pool(cct)};
882 auto obj = rgw_raw_obj{pool, get_control_oid()};
883 return store->delete_system_obj(obj);
884 }
885
886 rgw_pool RGWRealm::get_pool(CephContext *cct)
887 {
888 if (cct->_conf->rgw_realm_root_pool.empty()) {
889 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
890 }
891 return rgw_pool(cct->_conf->rgw_realm_root_pool);
892 }
893
894 const string RGWRealm::get_default_oid(bool old_format)
895 {
896 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
897 return default_realm_info_oid;
898 }
899 return cct->_conf->rgw_default_realm_info_oid;
900 }
901
902 const string& RGWRealm::get_names_oid_prefix()
903 {
904 return realm_names_oid_prefix;
905 }
906
907 const string& RGWRealm::get_info_oid_prefix(bool old_format)
908 {
909 return realm_info_oid_prefix;
910 }
911
912 int RGWRealm::set_current_period(RGWPeriod& period)
913 {
914 // update realm epoch to match the period's
915 if (epoch > period.get_realm_epoch()) {
916 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
917 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
918 return -EINVAL;
919 }
920 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
921 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
922 << period.get_realm_epoch() << ", but different period id "
923 << period.get_id() << " != " << current_period << dendl;
924 return -EINVAL;
925 }
926
927 epoch = period.get_realm_epoch();
928 current_period = period.get_id();
929
930 int ret = update();
931 if (ret < 0) {
932 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
933 return ret;
934 }
935
936 ret = period.reflect();
937 if (ret < 0) {
938 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
939 return ret;
940 }
941
942 return 0;
943 }
944
945 string RGWRealm::get_control_oid()
946 {
947 return get_info_oid_prefix() + id + ".control";
948 }
949
950 int RGWRealm::notify_zone(bufferlist& bl)
951 {
952 // open a context on the realm's pool
953 rgw_pool pool{get_pool(cct)};
954 librados::IoCtx ctx;
955 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
956 if (r < 0) {
957 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
958 return r;
959 }
960 // send a notify on the realm object
961 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
962 if (r < 0) {
963 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
964 return r;
965 }
966 return 0;
967 }
968
969 int RGWRealm::notify_new_period(const RGWPeriod& period)
970 {
971 bufferlist bl;
972 // push the period to dependent zonegroups/zones
973 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
974 ::encode(period, bl);
975 // reload the gateway with the new period
976 ::encode(RGWRealmNotify::Reload, bl);
977
978 return notify_zone(bl);
979 }
980
981 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
982 {
983 if (realm_id.empty()) {
984 return "period_config.default";
985 }
986 return "period_config." + realm_id;
987 }
988
989 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
990 {
991 const auto& pool_name = cct->_conf->rgw_period_root_pool;
992 if (pool_name.empty()) {
993 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
994 }
995 return {pool_name};
996 }
997
998 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
999 {
1000 RGWObjectCtx obj_ctx(store);
1001 const auto& pool = get_pool(store->ctx());
1002 const auto& oid = get_oid(realm_id);
1003 bufferlist bl;
1004
1005 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1006 if (ret < 0) {
1007 return ret;
1008 }
1009 try {
1010 bufferlist::iterator iter = bl.begin();
1011 ::decode(*this, iter);
1012 } catch (buffer::error& err) {
1013 return -EIO;
1014 }
1015 return 0;
1016 }
1017
1018 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1019 {
1020 const auto& pool = get_pool(store->ctx());
1021 const auto& oid = get_oid(realm_id);
1022 bufferlist bl;
1023 ::encode(*this, bl);
1024 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1025 false, nullptr, real_time(), nullptr);
1026 }
1027
1028 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1029 const string& period_realm_name, bool setup_obj)
1030 {
1031 cct = _cct;
1032 store = _store;
1033 realm_id = period_realm_id;
1034 realm_name = period_realm_name;
1035
1036 if (!setup_obj)
1037 return 0;
1038
1039 return init(_cct, _store, setup_obj);
1040 }
1041
1042
1043 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1044 {
1045 cct = _cct;
1046 store = _store;
1047
1048 if (!setup_obj)
1049 return 0;
1050
1051 if (id.empty()) {
1052 RGWRealm realm(realm_id, realm_name);
1053 int ret = realm.init(cct, store);
1054 if (ret < 0) {
1055 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1056 cpp_strerror(-ret) << dendl;
1057 return ret;
1058 }
1059 id = realm.get_current_period();
1060 realm_id = realm.get_id();
1061 }
1062
1063 if (!epoch) {
1064 int ret = use_latest_epoch();
1065 if (ret < 0) {
1066 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1067 << " : " << cpp_strerror(-ret) << dendl;
1068 return ret;
1069 }
1070 }
1071
1072 return read_info();
1073 }
1074
1075
1076 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1077 map<string, RGWZoneGroup>::const_iterator iter;
1078 if (!zonegroup_id.empty()) {
1079 iter = period_map.zonegroups.find(zonegroup_id);
1080 } else {
1081 iter = period_map.zonegroups.find("default");
1082 }
1083 if (iter != period_map.zonegroups.end()) {
1084 zonegroup = iter->second;
1085 return 0;
1086 }
1087
1088 return -ENOENT;
1089 }
1090
1091 bool RGWPeriod::is_single_zonegroup(CephContext *cct, RGWRados *store)
1092 {
1093 return (period_map.zonegroups.size() == 1);
1094 }
1095
1096 const string& RGWPeriod::get_latest_epoch_oid()
1097 {
1098 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1099 return period_latest_epoch_info_oid;
1100 }
1101 return cct->_conf->rgw_period_latest_epoch_info_oid;
1102 }
1103
1104 const string& RGWPeriod::get_info_oid_prefix()
1105 {
1106 return period_info_oid_prefix;
1107 }
1108
1109 const string RGWPeriod::get_period_oid_prefix()
1110 {
1111 return get_info_oid_prefix() + id;
1112 }
1113
1114 const string RGWPeriod::get_period_oid()
1115 {
1116 std::ostringstream oss;
1117 oss << get_period_oid_prefix();
1118 // skip the epoch for the staging period
1119 if (id != get_staging_id(realm_id))
1120 oss << "." << epoch;
1121 return oss.str();
1122 }
1123
1124 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info)
1125 {
1126 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1127
1128 rgw_pool pool(get_pool(cct));
1129 bufferlist bl;
1130 RGWObjectCtx obj_ctx(store);
1131 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
1132 if (ret < 0) {
1133 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1134 return ret;
1135 }
1136 try {
1137 bufferlist::iterator iter = bl.begin();
1138 ::decode(info, iter);
1139 } catch (buffer::error& err) {
1140 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1141 return -EIO;
1142 }
1143
1144 return 0;
1145 }
1146
1147 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1148 {
1149 RGWPeriodLatestEpochInfo info;
1150
1151 int ret = read_latest_epoch(info);
1152 if (ret < 0) {
1153 return ret;
1154 }
1155
1156 latest_epoch = info.epoch;
1157
1158 return 0;
1159 }
1160
1161 int RGWPeriod::use_latest_epoch()
1162 {
1163 RGWPeriodLatestEpochInfo info;
1164 int ret = read_latest_epoch(info);
1165 if (ret < 0) {
1166 return ret;
1167 }
1168
1169 epoch = info.epoch;
1170
1171 return 0;
1172 }
1173
1174 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive)
1175 {
1176 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1177
1178 rgw_pool pool(get_pool(cct));
1179 bufferlist bl;
1180
1181 RGWPeriodLatestEpochInfo info;
1182 info.epoch = epoch;
1183
1184 ::encode(info, bl);
1185
1186 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1187 exclusive, NULL, real_time(), NULL);
1188 }
1189
1190 int RGWPeriod::delete_obj()
1191 {
1192 rgw_pool pool(get_pool(cct));
1193
1194 // delete the object for each period epoch
1195 for (epoch_t e = 1; e <= epoch; e++) {
1196 RGWPeriod p{get_id(), e};
1197 rgw_raw_obj oid{pool, p.get_period_oid()};
1198 int ret = store->delete_system_obj(oid);
1199 if (ret < 0) {
1200 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1201 << ": " << cpp_strerror(-ret) << dendl;
1202 }
1203 }
1204
1205 // delete the .latest_epoch object
1206 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1207 int ret = store->delete_system_obj(oid);
1208 if (ret < 0) {
1209 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1210 << ": " << cpp_strerror(-ret) << dendl;
1211 }
1212 return ret;
1213 }
1214
1215 int RGWPeriod::read_info()
1216 {
1217 rgw_pool pool(get_pool(cct));
1218
1219 bufferlist bl;
1220
1221 RGWObjectCtx obj_ctx(store);
1222 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1223 if (ret < 0) {
1224 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1225 return ret;
1226 }
1227
1228 try {
1229 bufferlist::iterator iter = bl.begin();
1230 ::decode(*this, iter);
1231 } catch (buffer::error& err) {
1232 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1233 return -EIO;
1234 }
1235
1236 return 0;
1237 }
1238
1239 int RGWPeriod::create(bool exclusive)
1240 {
1241 int ret;
1242
1243 /* create unique id */
1244 uuid_d new_uuid;
1245 char uuid_str[37];
1246 new_uuid.generate_random();
1247 new_uuid.print(uuid_str);
1248 id = uuid_str;
1249
1250 epoch = FIRST_EPOCH;
1251
1252 period_map.id = id;
1253
1254 ret = store_info(exclusive);
1255 if (ret < 0) {
1256 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1257 }
1258
1259 ret = set_latest_epoch(epoch);
1260 if (ret < 0) {
1261 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1262 }
1263
1264 return ret;
1265 }
1266
1267 int RGWPeriod::store_info(bool exclusive)
1268 {
1269 epoch_t latest_epoch = FIRST_EPOCH - 1;
1270 int ret = get_latest_epoch(latest_epoch);
1271 if (ret < 0 && ret != -ENOENT) {
1272 ldout(cct, 0) << "ERROR: RGWPeriod::get_latest_epoch() returned " << cpp_strerror(-ret) << dendl;
1273 return ret;
1274 }
1275
1276 rgw_pool pool(get_pool(cct));
1277
1278 string oid = get_period_oid();
1279 bufferlist bl;
1280 ::encode(*this, bl);
1281 ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
1282 if (ret < 0) {
1283 ldout(cct, 0) << "ERROR: rgw_put_system_obj(" << pool << ":" << oid << "): " << cpp_strerror(-ret) << dendl;
1284 return ret;
1285 }
1286 if (latest_epoch < epoch) {
1287 ret = set_latest_epoch(epoch);
1288 if (ret < 0) {
1289 ldout(cct, 0) << "ERROR: RGWPeriod::set_latest_epoch() returned " << cpp_strerror(-ret) << dendl;
1290 return ret;
1291 }
1292 }
1293 return 0;
1294 }
1295
1296 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1297 {
1298 if (cct->_conf->rgw_period_root_pool.empty()) {
1299 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1300 }
1301 return rgw_pool(cct->_conf->rgw_period_root_pool);
1302 }
1303
1304 int RGWPeriod::use_next_epoch()
1305 {
1306 epoch_t latest_epoch;
1307 int ret = get_latest_epoch(latest_epoch);
1308 if (ret < 0) {
1309 return ret;
1310 }
1311 epoch = latest_epoch + 1;
1312 ret = read_info();
1313 if (ret < 0 && ret != -ENOENT) {
1314 return ret;
1315 }
1316 if (ret == -ENOENT) {
1317 ret = create();
1318 if (ret < 0) {
1319 ldout(cct, 0) << "Error creating new epoch " << epoch << dendl;
1320 return ret;
1321 }
1322 }
1323 return 0;
1324 }
1325
1326 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1327 {
1328 if (zonegroup.realm_id != realm_id) {
1329 return 0;
1330 }
1331 int ret = period_map.update(zonegroup, cct);
1332 if (ret < 0) {
1333 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1334 return ret;
1335 }
1336
1337 return store_info(false);
1338 }
1339
1340 int RGWPeriod::update()
1341 {
1342 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1343 list<string> zonegroups;
1344 int ret = store->list_zonegroups(zonegroups);
1345 if (ret < 0) {
1346 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1347 return ret;
1348 }
1349
1350 // clear zone short ids of removed zones. period_map.update() will add the
1351 // remaining zones back
1352 period_map.short_zone_ids.clear();
1353
1354 for (auto& iter : zonegroups) {
1355 RGWZoneGroup zg(string(), iter);
1356 ret = zg.init(cct, store);
1357 if (ret < 0) {
1358 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1359 continue;
1360 }
1361
1362 if (zg.realm_id != realm_id) {
1363 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1364 continue;
1365 }
1366
1367 if (zg.master_zone.empty()) {
1368 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1369 return -EINVAL;
1370 }
1371
1372 if (zg.is_master_zonegroup()) {
1373 master_zonegroup = zg.get_id();
1374 master_zone = zg.master_zone;
1375 }
1376
1377 int ret = period_map.update(zg, cct);
1378 if (ret < 0) {
1379 return ret;
1380 }
1381 }
1382
1383 ret = period_config.read(store, realm_id);
1384 if (ret < 0 && ret != -ENOENT) {
1385 ldout(cct, 0) << "ERROR: failed to read period config: "
1386 << cpp_strerror(ret) << dendl;
1387 return ret;
1388 }
1389 return 0;
1390 }
1391
1392 int RGWPeriod::reflect()
1393 {
1394 for (auto& iter : period_map.zonegroups) {
1395 RGWZoneGroup& zg = iter.second;
1396 zg.reinit_instance(cct, store);
1397 int r = zg.write(false);
1398 if (r < 0) {
1399 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1400 return r;
1401 }
1402 if (zg.is_master_zonegroup()) {
1403 // set master as default if no default exists
1404 r = zg.set_as_default(true);
1405 if (r == 0) {
1406 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1407 << " as the default" << dendl;
1408 }
1409 }
1410 }
1411
1412 int r = period_config.write(store, realm_id);
1413 if (r < 0) {
1414 ldout(cct, 0) << "ERROR: failed to store period config: "
1415 << cpp_strerror(-r) << dendl;
1416 return r;
1417 }
1418 return 0;
1419 }
1420
1421 void RGWPeriod::fork()
1422 {
1423 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1424 predecessor_uuid = id;
1425 id = get_staging_id(realm_id);
1426 period_map.reset();
1427 realm_epoch++;
1428 }
1429
1430 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1431 {
1432 // initialize a sync status manager to read the status
1433 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1434 int r = mgr.init();
1435 if (r < 0) {
1436 return r;
1437 }
1438 r = mgr.read_sync_status(sync_status);
1439 mgr.stop();
1440 return r;
1441 }
1442
1443 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1444 std::ostream& error_stream,
1445 bool force_if_stale)
1446 {
1447 rgw_meta_sync_status status;
1448 int r = read_sync_status(store, &status);
1449 if (r < 0) {
1450 ldout(cct, 0) << "period failed to read sync status: "
1451 << cpp_strerror(-r) << dendl;
1452 return r;
1453 }
1454
1455 std::vector<std::string> markers;
1456
1457 const auto current_epoch = current_period.get_realm_epoch();
1458 if (current_epoch != status.sync_info.realm_epoch) {
1459 // no sync status markers for the current period
1460 assert(current_epoch > status.sync_info.realm_epoch);
1461 const int behind = current_epoch - status.sync_info.realm_epoch;
1462 if (!force_if_stale && current_epoch > 1) {
1463 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1464 "the current master zone in metadata sync. If this zone is promoted "
1465 "to master, any metadata changes during that time are likely to "
1466 "be lost.\n"
1467 "Waiting for this zone to catch up on metadata sync (see "
1468 "'radosgw-admin sync status') is recommended.\n"
1469 "To promote this zone to master anyway, add the flag "
1470 "--yes-i-really-mean-it." << std::endl;
1471 return -EINVAL;
1472 }
1473 // empty sync status markers - other zones will skip this period during
1474 // incremental metadata sync
1475 markers.resize(status.sync_info.num_shards);
1476 } else {
1477 markers.reserve(status.sync_info.num_shards);
1478 for (auto& i : status.sync_markers) {
1479 auto& marker = i.second;
1480 // filter out markers from other periods
1481 if (marker.realm_epoch != current_epoch) {
1482 marker.marker.clear();
1483 }
1484 markers.emplace_back(std::move(marker.marker));
1485 }
1486 }
1487
1488 std::swap(sync_status, markers);
1489 return 0;
1490 }
1491
1492 int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1493 std::ostream& error_stream, bool force_if_stale)
1494 {
1495 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1496 // gateway must be in the master zone to commit
1497 if (master_zone != store->get_zone_params().get_id()) {
1498 error_stream << "Cannot commit period on zone "
1499 << store->get_zone_params().get_id() << ", it must be sent to "
1500 "the period's master zone " << master_zone << '.' << std::endl;
1501 return -EINVAL;
1502 }
1503 // period predecessor must match current period
1504 if (predecessor_uuid != current_period.get_id()) {
1505 error_stream << "Period predecessor " << predecessor_uuid
1506 << " does not match current period " << current_period.get_id()
1507 << ". Use 'period pull' to get the latest period from the master, "
1508 "reapply your changes, and try again." << std::endl;
1509 return -EINVAL;
1510 }
1511 // realm epoch must be 1 greater than current period
1512 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1513 error_stream << "Period's realm epoch " << realm_epoch
1514 << " does not come directly after current realm epoch "
1515 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1516 "latest realm and period from the master zone, reapply your changes, "
1517 "and try again." << std::endl;
1518 return -EINVAL;
1519 }
1520 // did the master zone change?
1521 if (master_zone != current_period.get_master_zone()) {
1522 // store the current metadata sync status in the period
1523 int r = update_sync_status(current_period, error_stream, force_if_stale);
1524 if (r < 0) {
1525 ldout(cct, 0) << "failed to update metadata sync status: "
1526 << cpp_strerror(-r) << dendl;
1527 return r;
1528 }
1529 // create an object with a new period id
1530 r = create(true);
1531 if (r < 0) {
1532 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1533 return r;
1534 }
1535 // set as current period
1536 r = realm.set_current_period(*this);
1537 if (r < 0) {
1538 ldout(cct, 0) << "failed to update realm's current period: "
1539 << cpp_strerror(-r) << dendl;
1540 return r;
1541 }
1542 ldout(cct, 4) << "Promoted to master zone and committed new period "
1543 << id << dendl;
1544 realm.notify_new_period(*this);
1545 return 0;
1546 }
1547 // period must be based on current epoch
1548 if (epoch != current_period.get_epoch()) {
1549 error_stream << "Period epoch " << epoch << " does not match "
1550 "predecessor epoch " << current_period.get_epoch()
1551 << ". Use 'period pull' to get the latest epoch from the master zone, "
1552 "reapply your changes, and try again." << std::endl;
1553 return -EINVAL;
1554 }
1555 // set period as next epoch
1556 set_id(current_period.get_id());
1557 set_epoch(current_period.get_epoch() + 1);
1558 set_predecessor(current_period.get_predecessor());
1559 realm_epoch = current_period.get_realm_epoch();
1560 // write the period to rados
1561 int r = store_info(false);
1562 if (r < 0) {
1563 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1564 return r;
1565 }
1566 // set as latest epoch
1567 r = set_latest_epoch(epoch);
1568 if (r < 0) {
1569 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1570 return r;
1571 }
1572 r = reflect();
1573 if (r < 0) {
1574 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1575 return r;
1576 }
1577 ldout(cct, 4) << "Committed new epoch " << epoch
1578 << " for period " << id << dendl;
1579 realm.notify_new_period(*this);
1580 return 0;
1581 }
1582
1583 int RGWZoneParams::create_default(bool old_format)
1584 {
1585 name = default_zone_name;
1586
1587 int r = create();
1588 if (r < 0) {
1589 return r;
1590 }
1591
1592 if (old_format) {
1593 name = id;
1594 }
1595
1596 return r;
1597 }
1598
1599
1600 int get_zones_pool_set(CephContext* cct,
1601 RGWRados* store,
1602 const list<string>& zones,
1603 const string& my_zone_id,
1604 set<rgw_pool>& pool_names)
1605 {
1606 for(auto const& iter : zones) {
1607 RGWZoneParams zone(iter);
1608 int r = zone.init(cct, store);
1609 if (r < 0) {
1610 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1611 return r;
1612 }
1613 if (zone.get_id() != my_zone_id) {
1614 pool_names.insert(zone.domain_root);
1615 pool_names.insert(zone.metadata_heap);
1616 pool_names.insert(zone.control_pool);
1617 pool_names.insert(zone.gc_pool);
1618 pool_names.insert(zone.log_pool);
1619 pool_names.insert(zone.intent_log_pool);
1620 pool_names.insert(zone.usage_log_pool);
1621 pool_names.insert(zone.user_keys_pool);
1622 pool_names.insert(zone.user_email_pool);
1623 pool_names.insert(zone.user_swift_pool);
1624 pool_names.insert(zone.user_uid_pool);
1625 pool_names.insert(zone.roles_pool);
1626 pool_names.insert(zone.reshard_pool);
1627 for(auto& iter : zone.placement_pools) {
1628 pool_names.insert(iter.second.index_pool);
1629 pool_names.insert(iter.second.data_pool);
1630 pool_names.insert(iter.second.data_extra_pool);
1631 }
1632 }
1633 }
1634 return 0;
1635 }
1636
1637 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1638 const string& default_prefix,
1639 const string& default_suffix,
1640 const rgw_pool& suggested_pool)
1641 {
1642 string suggested_name = suggested_pool.to_str();
1643
1644 string prefix = default_prefix;
1645 string suffix = default_suffix;
1646
1647 if (!suggested_pool.empty()) {
1648 prefix = suggested_name.substr(0, suggested_name.find("."));
1649 suffix = suggested_name.substr(prefix.length());
1650 }
1651
1652 rgw_pool pool(prefix + suffix);
1653
1654 if (pools.find(pool) == pools.end()) {
1655 return pool;
1656 } else {
1657 while(true) {
1658 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1659 if (pools.find(pool) == pools.end()) {
1660 return pool;
1661 }
1662 }
1663 }
1664 }
1665
1666 int RGWZoneParams::fix_pool_names()
1667 {
1668
1669 list<string> zones;
1670 int r = store->list_zones(zones);
1671 if (r < 0) {
1672 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1673 }
1674
1675 set<rgw_pool> pools;
1676 r = get_zones_pool_set(cct, store, zones, id, pools);
1677 if (r < 0) {
1678 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1679 return r;
1680 }
1681
1682 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1683 if (!metadata_heap.name.empty()) {
1684 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1685 }
1686 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1687 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1688 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1689 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1690 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1691 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1692 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1693 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1694 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1695 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1696 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1697 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1698
1699 for(auto& iter : placement_pools) {
1700 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1701 iter.second.index_pool);
1702 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1703 iter.second.data_pool);
1704 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1705 iter.second.data_extra_pool);
1706 }
1707
1708 return 0;
1709 }
1710
1711 int RGWZoneParams::create(bool exclusive)
1712 {
1713 /* check for old pools config */
1714 rgw_raw_obj obj(domain_root, avail_pools);
1715 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1716 if (r < 0) {
1717 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1718 /* a new system, let's set new placement info */
1719 RGWZonePlacementInfo default_placement;
1720 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1721 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1722 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1723 placement_pools["default-placement"] = default_placement;
1724 }
1725
1726 r = fix_pool_names();
1727 if (r < 0) {
1728 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1729 return r;
1730 }
1731
1732 r = RGWSystemMetaObj::create(exclusive);
1733 if (r < 0) {
1734 return r;
1735 }
1736
1737 // try to set as default. may race with another create, so pass exclusive=true
1738 // so we don't override an existing default
1739 r = set_as_default(true);
1740 if (r < 0 && r != -EEXIST) {
1741 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1742 }
1743
1744 return 0;
1745 }
1746
1747 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1748 {
1749 if (cct->_conf->rgw_zone_root_pool.empty()) {
1750 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1751 }
1752
1753 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1754 }
1755
1756 const string RGWZoneParams::get_default_oid(bool old_format)
1757 {
1758 if (old_format) {
1759 return cct->_conf->rgw_default_zone_info_oid;
1760 }
1761
1762 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1763 }
1764
1765 const string& RGWZoneParams::get_names_oid_prefix()
1766 {
1767 return zone_names_oid_prefix;
1768 }
1769
1770 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1771 {
1772 return zone_info_oid_prefix;
1773 }
1774
1775 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1776 return cct->_conf->rgw_zone;
1777 }
1778
1779 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1780 {
1781 if (name.empty()) {
1782 name = cct->_conf->rgw_zone;
1783 }
1784
1785 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1786 }
1787
1788 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1789 {
1790 if (realm_id.empty()) {
1791 /* try using default realm */
1792 RGWRealm realm;
1793 int ret = realm.init(cct, store);
1794 if (ret < 0) {
1795 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1796 return -ENOENT;
1797 }
1798 realm_id = realm.get_id();
1799 }
1800
1801 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1802 }
1803
1804
1805 int RGWZoneParams::set_as_default(bool exclusive)
1806 {
1807 if (realm_id.empty()) {
1808 /* try using default realm */
1809 RGWRealm realm;
1810 int ret = realm.init(cct, store);
1811 if (ret < 0) {
1812 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1813 return -EINVAL;
1814 }
1815 realm_id = realm.get_id();
1816 }
1817
1818 return RGWSystemMetaObj::set_as_default(exclusive);
1819 }
1820
1821 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1822 {
1823 static const std::string NONE{"none"};
1824 auto p = placement_pools.find(placement_rule);
1825 if (p == placement_pools.end()) {
1826 return NONE;
1827 }
1828 const auto& type = p->second.compression_type;
1829 return !type.empty() ? type : NONE;
1830 }
1831
1832 void RGWPeriodMap::encode(bufferlist& bl) const {
1833 ENCODE_START(2, 1, bl);
1834 ::encode(id, bl);
1835 ::encode(zonegroups, bl);
1836 ::encode(master_zonegroup, bl);
1837 ::encode(short_zone_ids, bl);
1838 ENCODE_FINISH(bl);
1839 }
1840
1841 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1842 DECODE_START(2, bl);
1843 ::decode(id, bl);
1844 ::decode(zonegroups, bl);
1845 ::decode(master_zonegroup, bl);
1846 if (struct_v >= 2) {
1847 ::decode(short_zone_ids, bl);
1848 }
1849 DECODE_FINISH(bl);
1850
1851 zonegroups_by_api.clear();
1852 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1853 iter != zonegroups.end(); ++iter) {
1854 RGWZoneGroup& zonegroup = iter->second;
1855 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1856 if (zonegroup.is_master_zonegroup()) {
1857 master_zonegroup = zonegroup.get_id();
1858 }
1859 }
1860 }
1861
1862 // run an MD5 hash on the zone_id and return the first 32 bits
1863 static uint32_t gen_short_zone_id(const std::string zone_id)
1864 {
1865 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1866 MD5 hash;
1867 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1868 hash.Final(md5);
1869
1870 uint32_t short_id;
1871 memcpy((char *)&short_id, md5, sizeof(short_id));
1872 return std::max(short_id, 1u);
1873 }
1874
1875 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1876 {
1877 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1878 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1879 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1880 return -EINVAL;
1881 }
1882 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1883 if (iter != zonegroups.end()) {
1884 RGWZoneGroup& old_zonegroup = iter->second;
1885 if (!old_zonegroup.api_name.empty()) {
1886 zonegroups_by_api.erase(old_zonegroup.api_name);
1887 }
1888 }
1889 zonegroups[zonegroup.get_id()] = zonegroup;
1890
1891 if (!zonegroup.api_name.empty()) {
1892 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1893 }
1894
1895 if (zonegroup.is_master_zonegroup()) {
1896 master_zonegroup = zonegroup.get_id();
1897 } else if (master_zonegroup == zonegroup.get_id()) {
1898 master_zonegroup = "";
1899 }
1900
1901 for (auto& i : zonegroup.zones) {
1902 auto& zone = i.second;
1903 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1904 continue;
1905 }
1906 // calculate the zone's short id
1907 uint32_t short_id = gen_short_zone_id(zone.id);
1908
1909 // search for an existing zone with the same short id
1910 for (auto& s : short_zone_ids) {
1911 if (s.second == short_id) {
1912 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1913 << ") generates the same short_zone_id " << short_id
1914 << " as existing zone id " << s.first << dendl;
1915 return -EEXIST;
1916 }
1917 }
1918
1919 short_zone_ids[zone.id] = short_id;
1920 }
1921
1922 return 0;
1923 }
1924
1925 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1926 {
1927 auto i = short_zone_ids.find(zone_id);
1928 if (i == short_zone_ids.end()) {
1929 return 0;
1930 }
1931 return i->second;
1932 }
1933
1934 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1935 {
1936
1937 RGWPeriod period;
1938 int ret = period.init(cct, store);
1939 if (ret < 0) {
1940 cerr << "failed to read current period info: " << cpp_strerror(ret);
1941 return ret;
1942 }
1943
1944 bucket_quota = period.get_config().bucket_quota;
1945 user_quota = period.get_config().user_quota;
1946 zonegroups = period.get_map().zonegroups;
1947 zonegroups_by_api = period.get_map().zonegroups_by_api;
1948 master_zonegroup = period.get_map().master_zonegroup;
1949
1950 return 0;
1951 }
1952
1953 void RGWRegionMap::encode(bufferlist& bl) const {
1954 ENCODE_START( 3, 1, bl);
1955 ::encode(regions, bl);
1956 ::encode(master_region, bl);
1957 ::encode(bucket_quota, bl);
1958 ::encode(user_quota, bl);
1959 ENCODE_FINISH(bl);
1960 }
1961
1962 void RGWRegionMap::decode(bufferlist::iterator& bl) {
1963 DECODE_START(3, bl);
1964 ::decode(regions, bl);
1965 ::decode(master_region, bl);
1966 if (struct_v >= 2)
1967 ::decode(bucket_quota, bl);
1968 if (struct_v >= 3)
1969 ::decode(user_quota, bl);
1970 DECODE_FINISH(bl);
1971 }
1972
1973 void RGWZoneGroupMap::encode(bufferlist& bl) const {
1974 ENCODE_START( 3, 1, bl);
1975 ::encode(zonegroups, bl);
1976 ::encode(master_zonegroup, bl);
1977 ::encode(bucket_quota, bl);
1978 ::encode(user_quota, bl);
1979 ENCODE_FINISH(bl);
1980 }
1981
1982 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
1983 DECODE_START(3, bl);
1984 ::decode(zonegroups, bl);
1985 ::decode(master_zonegroup, bl);
1986 if (struct_v >= 2)
1987 ::decode(bucket_quota, bl);
1988 if (struct_v >= 3)
1989 ::decode(user_quota, bl);
1990 DECODE_FINISH(bl);
1991
1992 zonegroups_by_api.clear();
1993 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1994 iter != zonegroups.end(); ++iter) {
1995 RGWZoneGroup& zonegroup = iter->second;
1996 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1997 if (zonegroup.is_master_zonegroup()) {
1998 master_zonegroup = zonegroup.get_name();
1999 }
2000 }
2001 }
2002
2003 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2004 {
2005 obj_version *check_objv = version_for_check();
2006
2007 if (check_objv) {
2008 cls_version_check(*op, *check_objv, VER_COND_EQ);
2009 }
2010
2011 cls_version_read(*op, &read_version);
2012 }
2013
2014 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2015 {
2016 obj_version *check_objv = version_for_check();
2017 obj_version *modify_version = version_for_write();
2018
2019 if (check_objv) {
2020 cls_version_check(*op, *check_objv, VER_COND_EQ);
2021 }
2022
2023 if (modify_version) {
2024 cls_version_set(*op, *modify_version);
2025 } else {
2026 cls_version_inc(*op);
2027 }
2028 }
2029
2030 void RGWObjManifest::obj_iterator::operator++()
2031 {
2032 if (manifest->explicit_objs) {
2033 ++explicit_iter;
2034
2035 if (explicit_iter == manifest->objs.end()) {
2036 ofs = manifest->obj_size;
2037 return;
2038 }
2039
2040 update_explicit_pos();
2041
2042 update_location();
2043 return;
2044 }
2045
2046 uint64_t obj_size = manifest->get_obj_size();
2047 uint64_t head_size = manifest->get_head_size();
2048
2049 if (ofs == obj_size) {
2050 return;
2051 }
2052
2053 if (manifest->rules.empty()) {
2054 return;
2055 }
2056
2057 /* are we still pointing at the head? */
2058 if (ofs < head_size) {
2059 rule_iter = manifest->rules.begin();
2060 RGWObjManifestRule *rule = &rule_iter->second;
2061 ofs = MIN(head_size, obj_size);
2062 stripe_ofs = ofs;
2063 cur_stripe = 1;
2064 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2065 if (rule->part_size > 0) {
2066 stripe_size = MIN(stripe_size, rule->part_size);
2067 }
2068 update_location();
2069 return;
2070 }
2071
2072 RGWObjManifestRule *rule = &rule_iter->second;
2073
2074 stripe_ofs += rule->stripe_max_size;
2075 cur_stripe++;
2076 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2077
2078 if (rule->part_size > 0) {
2079 /* multi part, multi stripes object */
2080
2081 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2082
2083 if (stripe_ofs >= part_ofs + rule->part_size) {
2084 /* moved to the next part */
2085 cur_stripe = 0;
2086 part_ofs += rule->part_size;
2087 stripe_ofs = part_ofs;
2088
2089 bool last_rule = (next_rule_iter == manifest->rules.end());
2090 /* move to the next rule? */
2091 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2092 rule_iter = next_rule_iter;
2093 last_rule = (next_rule_iter == manifest->rules.end());
2094 if (!last_rule) {
2095 ++next_rule_iter;
2096 }
2097 cur_part_id = rule_iter->second.start_part_num;
2098 } else {
2099 cur_part_id++;
2100 }
2101
2102 rule = &rule_iter->second;
2103 }
2104
2105 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2106 }
2107
2108 cur_override_prefix = rule->override_prefix;
2109
2110 ofs = stripe_ofs;
2111 if (ofs > obj_size) {
2112 ofs = obj_size;
2113 stripe_ofs = ofs;
2114 stripe_size = 0;
2115 }
2116
2117 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2118 update_location();
2119 }
2120
2121 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2122 {
2123 manifest = _m;
2124
2125 manifest->set_tail_placement(placement_rule, _b);
2126 manifest->set_head(placement_rule, _obj, 0);
2127 last_ofs = 0;
2128
2129 if (manifest->get_prefix().empty()) {
2130 char buf[33];
2131 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2132
2133 string oid_prefix = ".";
2134 oid_prefix.append(buf);
2135 oid_prefix.append("_");
2136
2137 manifest->set_prefix(oid_prefix);
2138 }
2139
2140 bool found = manifest->get_rule(0, &rule);
2141 if (!found) {
2142 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2143 return -EIO;
2144 }
2145
2146 uint64_t head_size = manifest->get_head_size();
2147
2148 if (head_size > 0) {
2149 cur_stripe_size = head_size;
2150 } else {
2151 cur_stripe_size = rule.stripe_max_size;
2152 }
2153
2154 cur_part_id = rule.start_part_num;
2155
2156 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2157
2158 // Normal object which not generated through copy operation
2159 manifest->set_tail_instance(_obj.key.instance);
2160
2161 manifest->update_iterators();
2162
2163 return 0;
2164 }
2165
2166 int RGWObjManifest::generator::create_next(uint64_t ofs)
2167 {
2168 if (ofs < last_ofs) /* only going forward */
2169 return -EINVAL;
2170
2171 uint64_t max_head_size = manifest->get_max_head_size();
2172
2173 if (ofs < max_head_size) {
2174 manifest->set_head_size(ofs);
2175 }
2176
2177 if (ofs >= max_head_size) {
2178 manifest->set_head_size(max_head_size);
2179 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2180 cur_stripe_size = rule.stripe_max_size;
2181
2182 if (cur_part_id == 0 && max_head_size > 0) {
2183 cur_stripe++;
2184 }
2185 }
2186
2187 last_ofs = ofs;
2188 manifest->set_obj_size(ofs);
2189
2190 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2191
2192 manifest->update_iterators();
2193
2194 return 0;
2195 }
2196
2197 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2198 {
2199 return begin_iter;
2200 }
2201
2202 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2203 {
2204 return end_iter;
2205 }
2206
2207 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2208 {
2209 if (ofs > obj_size) {
2210 ofs = obj_size;
2211 }
2212 RGWObjManifest::obj_iterator iter(this);
2213 iter.seek(ofs);
2214 return iter;
2215 }
2216
2217 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2218 {
2219 if (explicit_objs || m.explicit_objs) {
2220 return append_explicit(m, zonegroup, zone_params);
2221 }
2222
2223 if (rules.empty()) {
2224 *this = m;
2225 return 0;
2226 }
2227
2228 string override_prefix;
2229
2230 if (prefix.empty()) {
2231 prefix = m.prefix;
2232 }
2233
2234 if (prefix != m.prefix) {
2235 override_prefix = m.prefix;
2236 }
2237
2238 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2239 if (miter == m.rules.end()) {
2240 return append_explicit(m, zonegroup, zone_params);
2241 }
2242
2243 for (; miter != m.rules.end(); ++miter) {
2244 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2245
2246 RGWObjManifestRule& rule = last_rule->second;
2247
2248 if (rule.part_size == 0) {
2249 rule.part_size = obj_size - rule.start_ofs;
2250 }
2251
2252 RGWObjManifestRule& next_rule = miter->second;
2253 if (!next_rule.part_size) {
2254 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2255 }
2256
2257 string rule_prefix = prefix;
2258 if (!rule.override_prefix.empty()) {
2259 rule_prefix = rule.override_prefix;
2260 }
2261
2262 string next_rule_prefix = m.prefix;
2263 if (!next_rule.override_prefix.empty()) {
2264 next_rule_prefix = next_rule.override_prefix;
2265 }
2266
2267 if (rule.part_size != next_rule.part_size ||
2268 rule.stripe_max_size != next_rule.stripe_max_size ||
2269 rule_prefix != next_rule_prefix) {
2270 if (next_rule_prefix != prefix) {
2271 append_rules(m, miter, &next_rule_prefix);
2272 } else {
2273 append_rules(m, miter, NULL);
2274 }
2275 break;
2276 }
2277
2278 uint64_t expected_part_num = rule.start_part_num + 1;
2279 if (rule.part_size > 0) {
2280 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2281 }
2282
2283 if (expected_part_num != next_rule.start_part_num) {
2284 append_rules(m, miter, NULL);
2285 break;
2286 }
2287 }
2288
2289 set_obj_size(obj_size + m.obj_size);
2290
2291 return 0;
2292 }
2293
2294 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2295 {
2296 return append(m, store->get_zonegroup(), store->get_zone_params());
2297 }
2298
2299 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2300 string *override_prefix)
2301 {
2302 for (; miter != m.rules.end(); ++miter) {
2303 RGWObjManifestRule rule = miter->second;
2304 rule.start_ofs += obj_size;
2305 if (override_prefix)
2306 rule.override_prefix = *override_prefix;
2307 rules[rule.start_ofs] = rule;
2308 }
2309 }
2310
2311 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2312 {
2313 if (explicit_objs) {
2314 return;
2315 }
2316 obj_iterator iter = obj_begin();
2317
2318 while (iter != obj_end()) {
2319 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2320 const rgw_obj_select& os = iter.get_location();
2321 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2322 part.loc_ofs = 0;
2323
2324 uint64_t ofs = iter.get_stripe_ofs();
2325
2326 if (ofs == 0) {
2327 part.loc = obj;
2328 } else {
2329 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2330 }
2331 ++iter;
2332 uint64_t next_ofs = iter.get_stripe_ofs();
2333
2334 part.size = next_ofs - ofs;
2335 }
2336
2337 explicit_objs = true;
2338 rules.clear();
2339 prefix.clear();
2340 }
2341
2342 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2343 {
2344 if (!explicit_objs) {
2345 convert_to_explicit(zonegroup, zone_params);
2346 }
2347 if (!m.explicit_objs) {
2348 m.convert_to_explicit(zonegroup, zone_params);
2349 }
2350 map<uint64_t, RGWObjManifestPart>::iterator iter;
2351 uint64_t base = obj_size;
2352 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2353 RGWObjManifestPart& part = iter->second;
2354 objs[base + iter->first] = part;
2355 }
2356 obj_size += m.obj_size;
2357
2358 return 0;
2359 }
2360
2361 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2362 {
2363 if (rules.empty()) {
2364 return false;
2365 }
2366
2367 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2368 if (iter != rules.begin()) {
2369 --iter;
2370 }
2371
2372 *rule = iter->second;
2373
2374 return true;
2375 }
2376
2377 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2378 {
2379 write_version.ver = 1;
2380 #define TAG_LEN 24
2381
2382 write_version.tag.clear();
2383 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2384 }
2385
2386 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2387 real_time *mtime, real_time set_mtime,
2388 map<string, bufferlist>& attrs, real_time delete_at,
2389 const char *if_match, const char *if_nomatch, const string *user_data,
2390 rgw_zone_set *zones_trace)
2391 {
2392 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2393 if (r < 0)
2394 return r;
2395
2396 is_complete = !canceled;
2397 return 0;
2398 }
2399
2400 CephContext *RGWPutObjProcessor::ctx()
2401 {
2402 return store->ctx();
2403 }
2404
2405 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2406 {
2407 drain_pending();
2408
2409 if (is_complete)
2410 return;
2411
2412 set<rgw_raw_obj>::iterator iter;
2413 bool need_to_remove_head = false;
2414 rgw_raw_obj raw_head;
2415
2416 if (!head_obj.empty()) {
2417 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2418 }
2419
2420 /**
2421 * We should delete the object in the "multipart" namespace to avoid race condition.
2422 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2423 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2424 * written by the second upload may be deleted by the first upload.
2425 * details is describled on #11749
2426 *
2427 * The above comment still stands, but instead of searching for a specific object in the multipart
2428 * namespace, we just make sure that we remove the object that is marked as the head object after
2429 * we remove all the other raw objects. Note that we use different call to remove the head object,
2430 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2431 */
2432 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2433 const rgw_raw_obj& obj = *iter;
2434 if (!head_obj.empty() && obj == raw_head) {
2435 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2436 need_to_remove_head = true;
2437 continue;
2438 }
2439
2440 int r = store->delete_raw_obj(obj);
2441 if (r < 0 && r != -ENOENT) {
2442 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2443 }
2444 }
2445
2446 if (need_to_remove_head) {
2447 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2448 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2449 if (r < 0 && r != -ENOENT) {
2450 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2451 }
2452 }
2453 }
2454
2455 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2456 {
2457 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2458 obj_len = abs_ofs + bl.length();
2459
2460 if (!(obj == last_written_obj)) {
2461 last_written_obj = obj;
2462 }
2463
2464 // For the first call pass -1 as the offset to
2465 // do a write_full.
2466 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2467 }
2468
2469 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2470 {
2471 struct put_obj_aio_info info;
2472 info = pending.front();
2473 pending.pop_front();
2474 pending_size -= info.size;
2475 return info;
2476 }
2477
2478 int RGWPutObjProcessor_Aio::wait_pending_front()
2479 {
2480 if (pending.empty()) {
2481 return 0;
2482 }
2483 struct put_obj_aio_info info = pop_pending();
2484 int ret = store->aio_wait(info.handle);
2485
2486 if (ret >= 0) {
2487 add_written_obj(info.obj);
2488 }
2489
2490 return ret;
2491 }
2492
2493 bool RGWPutObjProcessor_Aio::pending_has_completed()
2494 {
2495 if (pending.empty())
2496 return false;
2497
2498 struct put_obj_aio_info& info = pending.front();
2499 return store->aio_completed(info.handle);
2500 }
2501
2502 int RGWPutObjProcessor_Aio::drain_pending()
2503 {
2504 int ret = 0;
2505 while (!pending.empty()) {
2506 int r = wait_pending_front();
2507 if (r < 0)
2508 ret = r;
2509 }
2510 return ret;
2511 }
2512
2513 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2514 {
2515 bool _wait = need_to_wait;
2516
2517 if (handle) {
2518 struct put_obj_aio_info info;
2519 info.handle = handle;
2520 info.obj = obj;
2521 info.size = size;
2522 pending_size += size;
2523 pending.push_back(info);
2524 }
2525 size_t orig_size = pending_size;
2526
2527 /* first drain complete IOs */
2528 while (pending_has_completed()) {
2529 int r = wait_pending_front();
2530 if (r < 0)
2531 return r;
2532
2533 _wait = false;
2534 }
2535
2536 /* resize window in case messages are draining too fast */
2537 if (orig_size - pending_size >= window_size) {
2538 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2539 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2540 if (window_size > max_window_size) {
2541 window_size = max_window_size;
2542 }
2543 }
2544
2545 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2546 if (pending_size > window_size || _wait) {
2547 int r = wait_pending_front();
2548 if (r < 0)
2549 return r;
2550 }
2551 return 0;
2552 }
2553
2554 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2555 {
2556 if (ofs >= next_part_ofs) {
2557 int r = prepare_next_part(ofs);
2558 if (r < 0) {
2559 return r;
2560 }
2561 }
2562
2563 *pobj = cur_obj;
2564
2565 if (!bl.length())
2566 return 0;
2567
2568 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2569 }
2570
2571 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2572 {
2573 RGWPutObjProcessor::prepare(store, oid_rand);
2574
2575 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2576
2577 return 0;
2578 }
2579
2580 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2581 {
2582 *phandle = NULL;
2583 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2584
2585 pending_data_bl.claim_append(bl);
2586 if (pending_data_bl.length() < max_write_size) {
2587 *again = false;
2588 return 0;
2589 }
2590
2591 pending_data_bl.splice(0, max_write_size, &bl);
2592
2593 /* do we have enough data pending accumulated that needs to be written? */
2594 *again = (pending_data_bl.length() >= max_chunk_size);
2595
2596 if (!data_ofs && !immutable_head()) {
2597 first_chunk.claim(bl);
2598 obj_len = (uint64_t)first_chunk.length();
2599 int r = prepare_next_part(obj_len);
2600 if (r < 0) {
2601 return r;
2602 }
2603 data_ofs = obj_len;
2604 return 0;
2605 }
2606 off_t write_ofs = data_ofs;
2607 data_ofs = write_ofs + bl.length();
2608 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2609 we could be racing with another upload, to the same
2610 object and cleanup can be messy */
2611 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2612 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2613 bl.clear();
2614 }
2615 return ret;
2616 }
2617
2618
2619 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2620 {
2621 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2622
2623 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2624 if (r < 0) {
2625 return r;
2626 }
2627
2628 return 0;
2629 }
2630
2631 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2632 {
2633 head_obj.init(bucket, obj_str);
2634
2635 int r = prepare_init(store, oid_rand);
2636 if (r < 0) {
2637 return r;
2638 }
2639
2640 if (!version_id.empty()) {
2641 head_obj.key.set_instance(version_id);
2642 } else if (versioned_object) {
2643 store->gen_rand_obj_instance_name(&head_obj);
2644 }
2645
2646 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2647
2648 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2649 if (r < 0) {
2650 return r;
2651 }
2652
2653 return 0;
2654 }
2655
2656 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2657
2658 int ret = manifest_gen.create_next(ofs);
2659 if (ret < 0) {
2660 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2661 return ret;
2662 }
2663 cur_part_ofs = ofs;
2664 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2665 cur_obj = manifest_gen.get_cur_obj(store);
2666
2667 return 0;
2668 }
2669
2670 int RGWPutObjProcessor_Atomic::complete_parts()
2671 {
2672 if (obj_len > (uint64_t)cur_part_ofs) {
2673 return prepare_next_part(obj_len);
2674 }
2675 return 0;
2676 }
2677
2678 int RGWPutObjProcessor_Atomic::complete_writing_data()
2679 {
2680 if (!data_ofs && !immutable_head()) {
2681 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2682 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2683 * clobber first_chunk
2684 */
2685 if (pending_data_bl.length() > 0) {
2686 first_chunk.claim(pending_data_bl);
2687 }
2688 obj_len = (uint64_t)first_chunk.length();
2689 }
2690 while (pending_data_bl.length()) {
2691 void *handle;
2692 rgw_raw_obj obj;
2693 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2694 if (max_write_size > pending_data_bl.length()) {
2695 max_write_size = pending_data_bl.length();
2696 }
2697 bufferlist bl;
2698 pending_data_bl.splice(0, max_write_size, &bl);
2699 uint64_t write_len = bl.length();
2700 int r = write_data(bl, data_ofs, &handle, &obj, false);
2701 if (r < 0) {
2702 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2703 return r;
2704 }
2705 data_ofs += write_len;
2706 r = throttle_data(handle, obj, write_len, false);
2707 if (r < 0) {
2708 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2709 return r;
2710 }
2711
2712 if (data_ofs >= next_part_ofs) {
2713 r = prepare_next_part(data_ofs);
2714 if (r < 0) {
2715 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2716 return r;
2717 }
2718 }
2719 }
2720 int r = complete_parts();
2721 if (r < 0) {
2722 return r;
2723 }
2724
2725 r = drain_pending();
2726 if (r < 0)
2727 return r;
2728
2729 return 0;
2730 }
2731
2732 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2733 real_time *mtime, real_time set_mtime,
2734 map<string, bufferlist>& attrs,
2735 real_time delete_at,
2736 const char *if_match,
2737 const char *if_nomatch, const string *user_data,
2738 rgw_zone_set *zones_trace) {
2739 int r = complete_writing_data();
2740 if (r < 0)
2741 return r;
2742
2743 obj_ctx.obj.set_atomic(head_obj);
2744
2745 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2746
2747 /* some object types shouldn't be versioned, e.g., multipart parts */
2748 op_target.set_versioning_disabled(!versioned_object);
2749
2750 RGWRados::Object::Write obj_op(&op_target);
2751
2752 obj_op.meta.data = &first_chunk;
2753 obj_op.meta.manifest = &manifest;
2754 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2755 obj_op.meta.if_match = if_match;
2756 obj_op.meta.if_nomatch = if_nomatch;
2757 obj_op.meta.mtime = mtime;
2758 obj_op.meta.set_mtime = set_mtime;
2759 obj_op.meta.owner = bucket_info.owner;
2760 obj_op.meta.flags = PUT_OBJ_CREATE;
2761 obj_op.meta.olh_epoch = olh_epoch;
2762 obj_op.meta.delete_at = delete_at;
2763 obj_op.meta.user_data = user_data;
2764 obj_op.meta.zones_trace = zones_trace;
2765
2766 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2767 if (r < 0) {
2768 return r;
2769 }
2770
2771 canceled = obj_op.meta.canceled;
2772
2773 return 0;
2774 }
2775
2776 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2777 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2778 if (r < 0)
2779 return r;
2780 return 0;
2781 }
2782
2783 int RGWRados::unwatch(uint64_t watch_handle)
2784 {
2785 int r = control_pool_ctx.unwatch2(watch_handle);
2786 if (r < 0) {
2787 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2788 return r;
2789 }
2790 r = rados[0].watch_flush();
2791 if (r < 0) {
2792 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2793 return r;
2794 }
2795 return 0;
2796 }
2797
2798 void RGWRados::add_watcher(int i)
2799 {
2800 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2801 Mutex::Locker l(watchers_lock);
2802 watchers_set.insert(i);
2803 if (watchers_set.size() == (size_t)num_watchers) {
2804 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2805 set_cache_enabled(true);
2806 }
2807 }
2808
2809 void RGWRados::remove_watcher(int i)
2810 {
2811 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2812 Mutex::Locker l(watchers_lock);
2813 size_t orig_size = watchers_set.size();
2814 watchers_set.erase(i);
2815 if (orig_size == (size_t)num_watchers &&
2816 watchers_set.size() < orig_size) { /* actually removed */
2817 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2818 set_cache_enabled(false);
2819 }
2820 }
2821
2822 class RGWWatcher : public librados::WatchCtx2 {
2823 RGWRados *rados;
2824 int index;
2825 string oid;
2826 uint64_t watch_handle;
2827
2828 class C_ReinitWatch : public Context {
2829 RGWWatcher *watcher;
2830 public:
2831 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2832 void finish(int r) override {
2833 watcher->reinit();
2834 }
2835 };
2836 public:
2837 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2838 void handle_notify(uint64_t notify_id,
2839 uint64_t cookie,
2840 uint64_t notifier_id,
2841 bufferlist& bl) override {
2842 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2843 << " notify_id " << notify_id
2844 << " cookie " << cookie
2845 << " notifier " << notifier_id
2846 << " bl.length()=" << bl.length() << dendl;
2847 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2848
2849 bufferlist reply_bl; // empty reply payload
2850 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2851 }
2852 void handle_error(uint64_t cookie, int err) override {
2853 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2854 << " err " << cpp_strerror(err) << dendl;
2855 rados->remove_watcher(index);
2856 rados->schedule_context(new C_ReinitWatch(this));
2857 }
2858
2859 void reinit() {
2860 int ret = unregister_watch();
2861 if (ret < 0) {
2862 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2863 return;
2864 }
2865 ret = register_watch();
2866 if (ret < 0) {
2867 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2868 return;
2869 }
2870 }
2871
2872 int unregister_watch() {
2873 int r = rados->unwatch(watch_handle);
2874 if (r < 0) {
2875 return r;
2876 }
2877 rados->remove_watcher(index);
2878 return 0;
2879 }
2880
2881 int register_watch() {
2882 int r = rados->watch(oid, &watch_handle, this);
2883 if (r < 0) {
2884 return r;
2885 }
2886 rados->add_watcher(index);
2887 return 0;
2888 }
2889 };
2890
2891 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2892 RGWRados *store;
2893 RGWHTTPManager http_manager;
2894
2895 public:
2896 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2897 http_manager(store->ctx(), completion_mgr) {
2898 http_manager.set_threaded();
2899 }
2900
2901 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2902 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2903 { "notify", NULL },
2904 { NULL, NULL } };
2905
2906 list<RGWCoroutinesStack *> stacks;
2907 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2908 RGWRESTConn *conn = iter->second;
2909 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2910 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2911
2912 stacks.push_back(stack);
2913 }
2914 return run(stacks);
2915 }
2916 };
2917
2918 class RGWDataNotifierManager : public RGWCoroutinesManager {
2919 RGWRados *store;
2920 RGWHTTPManager http_manager;
2921
2922 public:
2923 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2924 http_manager(store->ctx(), completion_mgr) {
2925 http_manager.set_threaded();
2926 }
2927
2928 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2929 rgw_http_param_pair pairs[] = { { "type", "data" },
2930 { "notify", NULL },
2931 { "source-zone", store->get_zone_params().get_id().c_str() },
2932 { NULL, NULL } };
2933
2934 list<RGWCoroutinesStack *> stacks;
2935 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2936 RGWRESTConn *conn = iter->second;
2937 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2938 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2939
2940 stacks.push_back(stack);
2941 }
2942 return run(stacks);
2943 }
2944 };
2945
2946 class RGWRadosThread {
2947 class Worker : public Thread {
2948 CephContext *cct;
2949 RGWRadosThread *processor;
2950 Mutex lock;
2951 Cond cond;
2952
2953 void wait() {
2954 Mutex::Locker l(lock);
2955 cond.Wait(lock);
2956 };
2957
2958 void wait_interval(const utime_t& wait_time) {
2959 Mutex::Locker l(lock);
2960 cond.WaitInterval(lock, wait_time);
2961 }
2962
2963 public:
2964 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
2965 void *entry() override;
2966 void signal() {
2967 Mutex::Locker l(lock);
2968 cond.Signal();
2969 }
2970 };
2971
2972 Worker *worker;
2973
2974 protected:
2975 CephContext *cct;
2976 RGWRados *store;
2977
2978 std::atomic<bool> down_flag = { false };
2979
2980 string thread_name;
2981
2982 virtual uint64_t interval_msec() = 0;
2983 virtual void stop_process() {}
2984 public:
2985 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
2986 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
2987 virtual ~RGWRadosThread() {
2988 stop();
2989 }
2990
2991 virtual int init() { return 0; }
2992 virtual int process() = 0;
2993
2994 bool going_down() { return down_flag; }
2995
2996 void start();
2997 void stop();
2998
2999 void signal() {
3000 if (worker) {
3001 worker->signal();
3002 }
3003 }
3004 };
3005
3006 void RGWRadosThread::start()
3007 {
3008 worker = new Worker(cct, this);
3009 worker->create(thread_name.c_str());
3010 }
3011
3012 void RGWRadosThread::stop()
3013 {
3014 down_flag = true;
3015 stop_process();
3016 if (worker) {
3017 worker->signal();
3018 worker->join();
3019 }
3020 delete worker;
3021 worker = NULL;
3022 }
3023
3024 void *RGWRadosThread::Worker::entry() {
3025 uint64_t msec = processor->interval_msec();
3026 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3027
3028 do {
3029 utime_t start = ceph_clock_now();
3030 int r = processor->process();
3031 if (r < 0) {
3032 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3033 }
3034
3035 if (processor->going_down())
3036 break;
3037
3038 utime_t end = ceph_clock_now();
3039 end -= start;
3040
3041 uint64_t cur_msec = processor->interval_msec();
3042 if (cur_msec != msec) { /* was it reconfigured? */
3043 msec = cur_msec;
3044 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3045 }
3046
3047 if (cur_msec > 0) {
3048 if (interval <= end)
3049 continue; // next round
3050
3051 utime_t wait_time = interval;
3052 wait_time -= end;
3053
3054 wait_interval(wait_time);
3055 } else {
3056 wait();
3057 }
3058 } while (!processor->going_down());
3059
3060 return NULL;
3061 }
3062
3063 class RGWMetaNotifier : public RGWRadosThread {
3064 RGWMetaNotifierManager notify_mgr;
3065 RGWMetadataLog *const log;
3066
3067 uint64_t interval_msec() override {
3068 return cct->_conf->rgw_md_notify_interval_msec;
3069 }
3070 public:
3071 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3072 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3073
3074 int process() override;
3075 };
3076
3077 int RGWMetaNotifier::process()
3078 {
3079 set<int> shards;
3080
3081 log->read_clear_modified(shards);
3082
3083 if (shards.empty()) {
3084 return 0;
3085 }
3086
3087 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3088 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3089 }
3090
3091 notify_mgr.notify_all(store->zone_conn_map, shards);
3092
3093 return 0;
3094 }
3095
3096 class RGWDataNotifier : public RGWRadosThread {
3097 RGWDataNotifierManager notify_mgr;
3098
3099 uint64_t interval_msec() override {
3100 return cct->_conf->rgw_md_notify_interval_msec;
3101 }
3102 public:
3103 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3104
3105 int process() override;
3106 };
3107
3108 int RGWDataNotifier::process()
3109 {
3110 if (!store->data_log) {
3111 return 0;
3112 }
3113
3114 map<int, set<string> > shards;
3115
3116 store->data_log->read_clear_modified(shards);
3117
3118 if (shards.empty()) {
3119 return 0;
3120 }
3121
3122 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3123 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3124 }
3125
3126 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3127
3128 return 0;
3129 }
3130
3131 class RGWSyncProcessorThread : public RGWRadosThread {
3132 public:
3133 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3134 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3135 ~RGWSyncProcessorThread() override {}
3136 int init() override = 0 ;
3137 int process() override = 0;
3138 };
3139
3140 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3141 {
3142 RGWMetaSyncStatusManager sync;
3143
3144 uint64_t interval_msec() override {
3145 return 0; /* no interval associated, it'll run once until stopped */
3146 }
3147 void stop_process() override {
3148 sync.stop();
3149 }
3150 public:
3151 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3152 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3153
3154 void wakeup_sync_shards(set<int>& shard_ids) {
3155 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3156 sync.wakeup(*iter);
3157 }
3158 }
3159 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3160
3161 int init() override {
3162 int ret = sync.init();
3163 if (ret < 0) {
3164 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3165 return ret;
3166 }
3167 return 0;
3168 }
3169
3170 int process() override {
3171 sync.run();
3172 return 0;
3173 }
3174 };
3175
3176 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3177 {
3178 RGWDataSyncStatusManager sync;
3179 bool initialized;
3180
3181 uint64_t interval_msec() override {
3182 if (initialized) {
3183 return 0; /* no interval associated, it'll run once until stopped */
3184 } else {
3185 #define DATA_SYNC_INIT_WAIT_SEC 20
3186 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3187 }
3188 }
3189 void stop_process() override {
3190 sync.stop();
3191 }
3192 public:
3193 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3194 const string& _source_zone)
3195 : RGWSyncProcessorThread(_store, "data-sync"), sync(_store, async_rados, _source_zone),
3196 initialized(false) {}
3197
3198 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3199 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3200 sync.wakeup(iter->first, iter->second);
3201 }
3202 }
3203 RGWDataSyncStatusManager* get_manager() { return &sync; }
3204
3205 int init() override {
3206 return 0;
3207 }
3208
3209 int process() override {
3210 while (!initialized) {
3211 if (going_down()) {
3212 return 0;
3213 }
3214 int ret = sync.init();
3215 if (ret >= 0) {
3216 initialized = true;
3217 break;
3218 }
3219 /* we'll be back! */
3220 return 0;
3221 }
3222 sync.run();
3223 return 0;
3224 }
3225 };
3226
3227 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3228 {
3229 RGWCoroutinesManager crs;
3230 RGWRados *store;
3231 RGWHTTPManager http;
3232 const utime_t trim_interval;
3233
3234 uint64_t interval_msec() override { return 0; }
3235 void stop_process() override { crs.stop(); }
3236 public:
3237 RGWSyncLogTrimThread(RGWRados *store, int interval)
3238 : RGWSyncProcessorThread(store, "sync-log-trim"),
3239 crs(store->ctx(), store->get_cr_registry()), store(store),
3240 http(store->ctx(), crs.get_completion_mgr()),
3241 trim_interval(interval, 0)
3242 {}
3243
3244 int init() override {
3245 return http.set_threaded();
3246 }
3247 int process() override {
3248 list<RGWCoroutinesStack*> stacks;
3249 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3250 meta->call(create_meta_log_trim_cr(store, &http,
3251 cct->_conf->rgw_md_log_max_shards,
3252 trim_interval));
3253 stacks.push_back(meta);
3254
3255 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3256 data->call(create_data_log_trim_cr(store, &http,
3257 cct->_conf->rgw_data_log_num_shards,
3258 trim_interval));
3259 stacks.push_back(data);
3260
3261 crs.run(stacks);
3262 return 0;
3263 }
3264 };
3265
3266 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3267 {
3268 Mutex::Locker l(meta_sync_thread_lock);
3269 if (meta_sync_processor_thread) {
3270 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3271 }
3272 }
3273
3274 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3275 {
3276 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3277 Mutex::Locker l(data_sync_thread_lock);
3278 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3279 if (iter == data_sync_processor_threads.end()) {
3280 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3281 return;
3282 }
3283
3284 RGWDataSyncProcessorThread *thread = iter->second;
3285 assert(thread);
3286 thread->wakeup_sync_shards(shard_ids);
3287 }
3288
3289 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3290 {
3291 Mutex::Locker l(meta_sync_thread_lock);
3292 if (meta_sync_processor_thread) {
3293 return meta_sync_processor_thread->get_manager();
3294 }
3295 return nullptr;
3296 }
3297
3298 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3299 {
3300 Mutex::Locker l(data_sync_thread_lock);
3301 auto thread = data_sync_processor_threads.find(source_zone);
3302 if (thread == data_sync_processor_threads.end()) {
3303 return nullptr;
3304 }
3305 return thread->second->get_manager();
3306 }
3307
3308 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3309 {
3310 IoCtx ioctx;
3311 int r = open_pool_ctx(pool, ioctx);
3312 if (r < 0) {
3313 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3314 return r;
3315 }
3316
3317 bool requires;
3318 r = ioctx.pool_requires_alignment2(&requires);
3319 if (r < 0) {
3320 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3321 << r << dendl;
3322 return r;
3323 }
3324
3325 if (!requires) {
3326 *alignment = 0;
3327 return 0;
3328 }
3329
3330 uint64_t align;
3331 r = ioctx.pool_required_alignment2(&align);
3332 if (r < 0) {
3333 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3334 << r << dendl;
3335 return r;
3336 }
3337 if (align != 0) {
3338 ldout(cct, 20) << "required alignment=" << align << dendl;
3339 }
3340 *alignment = align;
3341 return 0;
3342 }
3343
3344 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3345 {
3346 uint64_t alignment;
3347 int r = get_required_alignment(pool, &alignment);
3348 if (r < 0) {
3349 return r;
3350 }
3351
3352 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3353
3354 if (alignment == 0) {
3355 *max_chunk_size = config_chunk_size;
3356 return 0;
3357 }
3358
3359 if (config_chunk_size <= alignment) {
3360 *max_chunk_size = alignment;
3361 return 0;
3362 }
3363
3364 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3365
3366 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3367
3368 return 0;
3369 }
3370
3371 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3372 {
3373 rgw_pool pool;
3374 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3375 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3376 return -EIO;
3377 }
3378 return get_max_chunk_size(pool, max_chunk_size);
3379 }
3380
3381 class RGWIndexCompletionManager;
3382
3383 struct complete_op_data {
3384 Mutex lock{"complete_op_data"};
3385 AioCompletion *rados_completion{nullptr};
3386 int manager_shard_id{-1};
3387 RGWIndexCompletionManager *manager{nullptr};
3388 rgw_obj obj;
3389 RGWModifyOp op;
3390 string tag;
3391 rgw_bucket_entry_ver ver;
3392 cls_rgw_obj_key key;
3393 rgw_bucket_dir_entry_meta dir_meta;
3394 list<cls_rgw_obj_key> remove_objs;
3395 bool log_op;
3396 uint16_t bilog_op;
3397 rgw_zone_set zones_trace;
3398
3399 bool stopped{false};
3400
3401 void stop() {
3402 Mutex::Locker l(lock);
3403 stopped = true;
3404 }
3405 };
3406
3407 class RGWIndexCompletionThread : public RGWRadosThread {
3408 RGWRados *store;
3409
3410 uint64_t interval_msec() override {
3411 return 0;
3412 }
3413
3414 list<complete_op_data *> completions;
3415
3416 Mutex completions_lock;
3417 public:
3418 RGWIndexCompletionThread(RGWRados *_store)
3419 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3420
3421 int process() override;
3422
3423 void add_completion(complete_op_data *completion) {
3424 {
3425 Mutex::Locker l(completions_lock);
3426 completions.push_back(completion);
3427 }
3428
3429 signal();
3430 }
3431 };
3432
3433 int RGWIndexCompletionThread::process()
3434 {
3435 list<complete_op_data *> comps;
3436
3437 {
3438 Mutex::Locker l(completions_lock);
3439 completions.swap(comps);
3440 }
3441
3442 for (auto c : comps) {
3443 std::unique_ptr<complete_op_data> up{c};
3444
3445 if (going_down()) {
3446 continue;
3447 }
3448 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3449
3450 RGWRados::BucketShard bs(store);
3451
3452 int r = bs.init(c->obj.bucket, c->obj);
3453 if (r < 0) {
3454 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3455 /* not much to do */
3456 continue;
3457 }
3458
3459 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3460 librados::ObjectWriteOperation o;
3461 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3462 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3463 c->log_op, c->bilog_op, &c->zones_trace);
3464
3465 return bs->index_ctx.operate(bs->bucket_obj, &o);
3466 });
3467 if (r < 0) {
3468 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3469 /* ignoring error, can't do anything about it */
3470 continue;
3471 }
3472 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3473 if (r < 0) {
3474 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3475 }
3476 }
3477
3478 return 0;
3479 }
3480
3481 class RGWIndexCompletionManager {
3482 RGWRados *store{nullptr};
3483 vector<Mutex *> locks;
3484 vector<set<complete_op_data *> > completions;
3485
3486 RGWIndexCompletionThread *completion_thread{nullptr};
3487
3488 int num_shards;
3489
3490 std::atomic<int> cur_shard {0};
3491
3492
3493 public:
3494 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3495 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3496
3497 for (int i = 0; i < num_shards; i++) {
3498 char buf[64];
3499 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3500 locks.push_back(new Mutex(buf));
3501 }
3502
3503 completions.resize(num_shards);
3504 }
3505 ~RGWIndexCompletionManager() {
3506 stop();
3507
3508 for (auto l : locks) {
3509 delete l;
3510 }
3511 }
3512
3513 int next_shard() {
3514 int result = cur_shard % num_shards;
3515 cur_shard++;
3516 return result;
3517 }
3518
3519 void create_completion(const rgw_obj& obj,
3520 RGWModifyOp op, string& tag,
3521 rgw_bucket_entry_ver& ver,
3522 const cls_rgw_obj_key& key,
3523 rgw_bucket_dir_entry_meta& dir_meta,
3524 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3525 uint16_t bilog_op,
3526 rgw_zone_set *zones_trace,
3527 complete_op_data **result);
3528 bool handle_completion(completion_t cb, complete_op_data *arg);
3529
3530 int start() {
3531 completion_thread = new RGWIndexCompletionThread(store);
3532 int ret = completion_thread->init();
3533 if (ret < 0) {
3534 return ret;
3535 }
3536 completion_thread->start();
3537 return 0;
3538 }
3539 void stop() {
3540 if (completion_thread) {
3541 completion_thread->stop();
3542 delete completion_thread;
3543 }
3544
3545 for (int i = 0; i < num_shards; ++i) {
3546 Mutex::Locker l(*locks[i]);
3547 for (auto c : completions[i]) {
3548 Mutex::Locker cl(c->lock);
3549 c->stop();
3550 }
3551 }
3552 completions.clear();
3553 }
3554 };
3555
3556 static void obj_complete_cb(completion_t cb, void *arg)
3557 {
3558 complete_op_data *completion = (complete_op_data *)arg;
3559 completion->lock.Lock();
3560 if (completion->stopped) {
3561 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3562 delete completion;
3563 return;
3564 }
3565 bool need_delete = completion->manager->handle_completion(cb, completion);
3566 completion->lock.Unlock();
3567 if (need_delete) {
3568 delete completion;
3569 }
3570 }
3571
3572
3573 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3574 RGWModifyOp op, string& tag,
3575 rgw_bucket_entry_ver& ver,
3576 const cls_rgw_obj_key& key,
3577 rgw_bucket_dir_entry_meta& dir_meta,
3578 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3579 uint16_t bilog_op,
3580 rgw_zone_set *zones_trace,
3581 complete_op_data **result)
3582 {
3583 complete_op_data *entry = new complete_op_data;
3584
3585 int shard_id = next_shard();
3586
3587 entry->manager_shard_id = shard_id;
3588 entry->manager = this;
3589 entry->obj = obj;
3590 entry->op = op;
3591 entry->tag = tag;
3592 entry->ver = ver;
3593 entry->key = key;
3594 entry->dir_meta = dir_meta;
3595 entry->log_op = log_op;
3596 entry->bilog_op = bilog_op;
3597
3598 if (remove_objs) {
3599 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3600 entry->remove_objs.push_back(*iter);
3601 }
3602 }
3603
3604 if (zones_trace) {
3605 entry->zones_trace = *zones_trace;
3606 } else {
3607 entry->zones_trace.insert(store->get_zone().id);
3608 }
3609
3610 *result = entry;
3611
3612 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3613
3614 Mutex::Locker l(*locks[shard_id]);
3615 completions[shard_id].insert(entry);
3616 }
3617
3618 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3619 {
3620 int shard_id = arg->manager_shard_id;
3621 {
3622 Mutex::Locker l(*locks[shard_id]);
3623
3624 auto& comps = completions[shard_id];
3625
3626 auto iter = comps.find(arg);
3627 if (iter == comps.end()) {
3628 return true;
3629 }
3630
3631 comps.erase(iter);
3632 }
3633
3634 int r = rados_aio_get_return_value(cb);
3635 if (r != -ERR_BUSY_RESHARDING) {
3636 return true;
3637 }
3638 completion_thread->add_completion(arg);
3639 return false;
3640 }
3641
3642 void RGWRados::finalize()
3643 {
3644 if (run_sync_thread) {
3645 Mutex::Locker l(meta_sync_thread_lock);
3646 meta_sync_processor_thread->stop();
3647
3648 Mutex::Locker dl(data_sync_thread_lock);
3649 for (auto iter : data_sync_processor_threads) {
3650 RGWDataSyncProcessorThread *thread = iter.second;
3651 thread->stop();
3652 }
3653 if (sync_log_trimmer) {
3654 sync_log_trimmer->stop();
3655 }
3656 }
3657 if (async_rados) {
3658 async_rados->stop();
3659 }
3660 if (run_sync_thread) {
3661 delete meta_sync_processor_thread;
3662 meta_sync_processor_thread = NULL;
3663 Mutex::Locker dl(data_sync_thread_lock);
3664 for (auto iter : data_sync_processor_threads) {
3665 RGWDataSyncProcessorThread *thread = iter.second;
3666 delete thread;
3667 }
3668 data_sync_processor_threads.clear();
3669 delete sync_log_trimmer;
3670 sync_log_trimmer = nullptr;
3671 }
3672 if (finisher) {
3673 finisher->stop();
3674 }
3675 if (need_watch_notify()) {
3676 finalize_watch();
3677 }
3678 if (finisher) {
3679 /* delete finisher only after cleaning up watches, as watch error path might call
3680 * into finisher. We stop finisher before finalizing watch to make sure we don't
3681 * actually handle any racing work
3682 */
3683 delete finisher;
3684 }
3685 if (meta_notifier) {
3686 meta_notifier->stop();
3687 delete meta_notifier;
3688 }
3689 if (data_notifier) {
3690 data_notifier->stop();
3691 delete data_notifier;
3692 }
3693 delete data_log;
3694 if (async_rados) {
3695 delete async_rados;
3696 }
3697 if (use_gc_thread) {
3698 gc->stop_processor();
3699 obj_expirer->stop_processor();
3700 }
3701 delete gc;
3702 gc = NULL;
3703
3704 if (use_lc_thread) {
3705 lc->stop_processor();
3706 }
3707 delete lc;
3708 lc = NULL;
3709
3710 delete obj_expirer;
3711 obj_expirer = NULL;
3712
3713 delete rest_master_conn;
3714
3715 map<string, RGWRESTConn *>::iterator iter;
3716 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3717 RGWRESTConn *conn = iter->second;
3718 delete conn;
3719 }
3720
3721 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3722 RGWRESTConn *conn = iter->second;
3723 delete conn;
3724 }
3725 RGWQuotaHandler::free_handler(quota_handler);
3726 if (cr_registry) {
3727 cr_registry->put();
3728 }
3729 delete meta_mgr;
3730 delete binfo_cache;
3731 delete obj_tombstone_cache;
3732 delete sync_modules_manager;
3733
3734 if (reshard_wait.get()) {
3735 reshard_wait->stop();
3736 reshard_wait.reset();
3737 }
3738
3739 if (run_reshard_thread) {
3740 reshard->stop_processor();
3741 }
3742 delete reshard;
3743 delete index_completion_manager;
3744 }
3745
3746 /**
3747 * Initialize the RADOS instance and prepare to do other ops
3748 * Returns 0 on success, -ERR# on failure.
3749 */
3750 int RGWRados::init_rados()
3751 {
3752 int ret = 0;
3753 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3754
3755 for (auto& r : handles) {
3756 ret = r.init_with_context(cct);
3757 if (ret < 0) {
3758 return ret;
3759 }
3760
3761 ret = r.connect();
3762 if (ret < 0) {
3763 return ret;
3764 }
3765 }
3766
3767 sync_modules_manager = new RGWSyncModulesManager();
3768
3769 rgw_register_sync_modules(sync_modules_manager);
3770
3771 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3772 new RGWCoroutinesManagerRegistry(cct)};
3773 ret = crs->hook_to_admin_command("cr dump");
3774 if (ret < 0) {
3775 return ret;
3776 }
3777
3778 meta_mgr = new RGWMetadataManager(cct, this);
3779 data_log = new RGWDataChangesLog(cct, this);
3780 cr_registry = crs.release();
3781
3782 std::swap(handles, rados);
3783 return ret;
3784 }
3785
3786 /**
3787 * Add new connection to connections map
3788 * @param zonegroup_conn_map map which new connection will be added to
3789 * @param zonegroup zonegroup which new connection will connect to
3790 * @param new_connection pointer to new connection instance
3791 */
3792 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3793 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3794 {
3795 // Delete if connection is already exists
3796 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3797 if (iterZoneGroup != zonegroup_conn_map.end()) {
3798 delete iterZoneGroup->second;
3799 }
3800
3801 // Add new connection to connections map
3802 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3803 }
3804
3805 int RGWRados::convert_regionmap()
3806 {
3807 RGWZoneGroupMap zonegroupmap;
3808
3809 string pool_name = cct->_conf->rgw_zone_root_pool;
3810 if (pool_name.empty()) {
3811 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3812 }
3813 string oid = region_map_oid;
3814
3815 rgw_pool pool(pool_name);
3816 bufferlist bl;
3817 RGWObjectCtx obj_ctx(this);
3818 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3819 if (ret < 0 && ret != -ENOENT) {
3820 return ret;
3821 } else if (ret == -ENOENT) {
3822 return 0;
3823 }
3824
3825 try {
3826 bufferlist::iterator iter = bl.begin();
3827 ::decode(zonegroupmap, iter);
3828 } catch (buffer::error& err) {
3829 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3830 return -EIO;
3831 }
3832
3833 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3834 iter != zonegroupmap.zonegroups.end(); ++iter) {
3835 RGWZoneGroup& zonegroup = iter->second;
3836 ret = zonegroup.init(cct, this, false);
3837 ret = zonegroup.update();
3838 if (ret < 0 && ret != -ENOENT) {
3839 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3840 cpp_strerror(-ret) << dendl;
3841 return ret;
3842 } else if (ret == -ENOENT) {
3843 ret = zonegroup.create();
3844 if (ret < 0) {
3845 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3846 cpp_strerror(-ret) << dendl;
3847 return ret;
3848 }
3849 }
3850 }
3851
3852 current_period.set_user_quota(zonegroupmap.user_quota);
3853 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3854
3855 // remove the region_map so we don't try to convert again
3856 rgw_raw_obj obj(pool, oid);
3857 ret = delete_system_obj(obj);
3858 if (ret < 0) {
3859 ldout(cct, 0) << "Error could not remove " << obj
3860 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3861 return ret;
3862 }
3863
3864 return 0;
3865 }
3866
3867 /**
3868 * Replace all region configuration with zonegroup for
3869 * backward compatability
3870 * Returns 0 on success, -ERR# on failure.
3871 */
3872 int RGWRados::replace_region_with_zonegroup()
3873 {
3874 /* copy default region */
3875 /* convert default region to default zonegroup */
3876 string default_oid = cct->_conf->rgw_default_region_info_oid;
3877 if (default_oid.empty()) {
3878 default_oid = default_region_info_oid;
3879 }
3880
3881
3882 RGWZoneGroup default_zonegroup;
3883 rgw_pool pool{default_zonegroup.get_pool(cct)};
3884 string oid = "converted";
3885 bufferlist bl;
3886 RGWObjectCtx obj_ctx(this);
3887
3888 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3889 if (ret < 0 && ret != -ENOENT) {
3890 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3891 << dendl;
3892 return ret;
3893 } else if (ret != -ENOENT) {
3894 ldout(cct, 20) << "System already converted " << dendl;
3895 return 0;
3896 }
3897
3898 string default_region;
3899 ret = default_zonegroup.init(cct, this, false, true);
3900 if (ret < 0) {
3901 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3902 return ret;
3903 }
3904 ret = default_zonegroup.read_default_id(default_region, true);
3905 if (ret < 0 && ret != -ENOENT) {
3906 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3907 return ret;
3908 }
3909
3910 /* convert regions to zonegroups */
3911 list<string> regions;
3912 ret = list_regions(regions);
3913 if (ret < 0 && ret != -ENOENT) {
3914 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3915 return ret;
3916 } else if (ret == -ENOENT || regions.empty()) {
3917 RGWZoneParams zoneparams(default_zone_name);
3918 int ret = zoneparams.init(cct, this);
3919 if (ret < 0 && ret != -ENOENT) {
3920 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
3921 return ret;
3922 }
3923 /* update master zone */
3924 RGWZoneGroup default_zg(default_zonegroup_name);
3925 ret = default_zg.init(cct, this);
3926 if (ret < 0 && ret != -ENOENT) {
3927 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
3928 return ret;
3929 }
3930 if (ret != -ENOENT && default_zg.master_zone.empty()) {
3931 default_zg.master_zone = zoneparams.get_id();
3932 return default_zg.update();
3933 }
3934 return 0;
3935 }
3936
3937 string master_region, master_zone;
3938 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
3939 if (*iter != default_zonegroup_name){
3940 RGWZoneGroup region(*iter);
3941 int ret = region.init(cct, this, true, true);
3942 if (ret < 0) {
3943 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
3944 return ret;
3945 }
3946 if (region.is_master_zonegroup()) {
3947 master_region = region.get_id();
3948 master_zone = region.master_zone;
3949 }
3950 }
3951 }
3952
3953 /* create realm if there is none.
3954 The realm name will be the region and zone concatenated
3955 realm id will be mds of its name */
3956 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
3957 string new_realm_name = master_region + "." + master_zone;
3958 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
3959 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
3960 MD5 hash;
3961 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
3962 hash.Final(md5);
3963 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
3964 string new_realm_id(md5_str);
3965 RGWRealm new_realm(new_realm_id,new_realm_name);
3966 ret = new_realm.init(cct, this, false);
3967 if (ret < 0) {
3968 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
3969 return ret;
3970 }
3971 ret = new_realm.create();
3972 if (ret < 0 && ret != -EEXIST) {
3973 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
3974 return ret;
3975 }
3976 ret = new_realm.set_as_default();
3977 if (ret < 0) {
3978 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
3979 return ret;
3980 }
3981 ret = realm.init(cct, this);
3982 if (ret < 0) {
3983 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
3984 return ret;
3985 }
3986 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
3987 if (ret < 0) {
3988 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
3989 return ret;
3990 }
3991 }
3992
3993 list<string>::iterator iter;
3994 /* create zonegroups */
3995 for (iter = regions.begin(); iter != regions.end(); ++iter)
3996 {
3997 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
3998 /* check to see if we don't have already a zonegroup with this name */
3999 RGWZoneGroup new_zonegroup(*iter);
4000 ret = new_zonegroup.init(cct , this);
4001 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4002 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4003 " skipping conversion " << dendl;
4004 continue;
4005 }
4006 RGWZoneGroup zonegroup(*iter);
4007 zonegroup.set_id(*iter);
4008 int ret = zonegroup.init(cct, this, true, true);
4009 if (ret < 0) {
4010 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4011 return ret;
4012 }
4013 zonegroup.realm_id = realm.get_id();
4014 /* fix default region master zone */
4015 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4016 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4017 zonegroup.master_zone = default_zone_name;
4018 }
4019 ret = zonegroup.update();
4020 if (ret < 0 && ret != -EEXIST) {
4021 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4022 << dendl;
4023 return ret;
4024 }
4025 ret = zonegroup.update_name();
4026 if (ret < 0 && ret != -EEXIST) {
4027 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4028 << dendl;
4029 return ret;
4030 }
4031 if (zonegroup.get_name() == default_region) {
4032 ret = zonegroup.set_as_default();
4033 if (ret < 0) {
4034 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4035 << dendl;
4036 return ret;
4037 }
4038 }
4039 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4040 ++iter) {
4041 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4042 RGWZoneParams zoneparams(iter->first, iter->first);
4043 zoneparams.set_id(iter->first);
4044 zoneparams.realm_id = realm.get_id();
4045 ret = zoneparams.init(cct, this);
4046 if (ret < 0 && ret != -ENOENT) {
4047 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4048 return ret;
4049 } else if (ret == -ENOENT) {
4050 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4051 continue;
4052 }
4053 zonegroup.realm_id = realm.get_id();
4054 ret = zoneparams.update();
4055 if (ret < 0 && ret != -EEXIST) {
4056 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4057 return ret;
4058 }
4059 ret = zoneparams.update_name();
4060 if (ret < 0 && ret != -EEXIST) {
4061 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4062 return ret;
4063 }
4064 }
4065
4066 if (!current_period.get_id().empty()) {
4067 ret = current_period.add_zonegroup(zonegroup);
4068 if (ret < 0) {
4069 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4070 return ret;
4071 }
4072 }
4073 }
4074
4075 if (!current_period.get_id().empty()) {
4076 ret = current_period.update();
4077 if (ret < 0) {
4078 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4079 return ret;
4080 }
4081 ret = current_period.store_info(false);
4082 if (ret < 0) {
4083 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4084 return ret;
4085 }
4086 ret = current_period.reflect();
4087 if (ret < 0) {
4088 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4089 return ret;
4090 }
4091 }
4092
4093 for (auto const& iter : regions) {
4094 RGWZoneGroup zonegroup(iter);
4095 int ret = zonegroup.init(cct, this, true, true);
4096 if (ret < 0) {
4097 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4098 return ret;
4099 }
4100 ret = zonegroup.delete_obj(true);
4101 if (ret < 0 && ret != -ENOENT) {
4102 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4103 << dendl;
4104 return ret;
4105 }
4106 }
4107
4108 /* mark as converted */
4109 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4110 true, NULL, real_time(), NULL);
4111 if (ret < 0 ) {
4112 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4113 << dendl;
4114 return ret;
4115 }
4116
4117 return 0;
4118 }
4119
4120 int RGWRados::init_zg_from_period(bool *initialized)
4121 {
4122 *initialized = false;
4123
4124 if (current_period.get_id().empty()) {
4125 return 0;
4126 }
4127
4128 int ret = zonegroup.init(cct, this);
4129 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4130 if (ret == -ENOENT) {
4131 return 0;
4132 }
4133 if (ret < 0) {
4134 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4135 return ret;
4136 }
4137 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4138
4139 map<string, RGWZoneGroup>::const_iterator iter =
4140 current_period.get_map().zonegroups.find(zonegroup.get_id());
4141
4142 if (iter != current_period.get_map().zonegroups.end()) {
4143 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4144 zonegroup = iter->second;
4145 ret = zonegroup.init(cct, this, false);
4146 if (ret < 0) {
4147 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4148 return ret;
4149 }
4150 ret = zone_params.init(cct, this);
4151 if (ret < 0 && ret != -ENOENT) {
4152 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4153 return ret;
4154 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4155 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4156 zone_params.set_name(default_zone_name);
4157 ret = zone_params.init(cct, this);
4158 if (ret < 0 && ret != -ENOENT) {
4159 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4160 return ret;
4161 }
4162 }
4163 }
4164 for (iter = current_period.get_map().zonegroups.begin();
4165 iter != current_period.get_map().zonegroups.end(); ++iter){
4166 const RGWZoneGroup& zg = iter->second;
4167 // use endpoints from the zonegroup's master zone
4168 auto master = zg.zones.find(zg.master_zone);
4169 if (master == zg.zones.end()) {
4170 // fix missing master zone for a single zone zonegroup
4171 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4172 master = zg.zones.begin();
4173 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4174 master->second.name << " id:" << master->second.id << " as master" << dendl;
4175 if (zonegroup.get_id() == zg.get_id()) {
4176 zonegroup.master_zone = master->second.id;
4177 ret = zonegroup.update();
4178 if (ret < 0) {
4179 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4180 return ret;
4181 }
4182 } else {
4183 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4184 ret = fixed_zg.init(cct, this);
4185 if (ret < 0) {
4186 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4187 return ret;
4188 }
4189 fixed_zg.master_zone = master->second.id;
4190 ret = fixed_zg.update();
4191 if (ret < 0) {
4192 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4193 return ret;
4194 }
4195 }
4196 } else {
4197 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4198 zg.master_zone << dendl;
4199 return -EINVAL;
4200 }
4201 }
4202 const auto& endpoints = master->second.endpoints;
4203 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4204 if (!current_period.get_master_zonegroup().empty() &&
4205 zg.get_id() == current_period.get_master_zonegroup()) {
4206 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4207 }
4208 }
4209
4210 *initialized = true;
4211
4212 return 0;
4213 }
4214
4215 int RGWRados::init_zg_from_local(bool *creating_defaults)
4216 {
4217 int ret = zonegroup.init(cct, this);
4218 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4219 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4220 return ret;
4221 } else if (ret == -ENOENT) {
4222 *creating_defaults = true;
4223 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4224 ret = zonegroup.create_default();
4225 if (ret < 0) {
4226 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4227 << dendl;
4228 return ret;
4229 }
4230 ret = zonegroup.init(cct, this);
4231 if (ret < 0) {
4232 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4233 << dendl;
4234 return ret;
4235 }
4236 }
4237 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4238 if (zonegroup.is_master_zonegroup()) {
4239 // use endpoints from the zonegroup's master zone
4240 auto master = zonegroup.zones.find(zonegroup.master_zone);
4241 if (master == zonegroup.zones.end()) {
4242 // fix missing master zone for a single zone zonegroup
4243 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4244 master = zonegroup.zones.begin();
4245 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4246 master->second.name << " id:" << master->second.id << " as master" << dendl;
4247 zonegroup.master_zone = master->second.id;
4248 ret = zonegroup.update();
4249 if (ret < 0) {
4250 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4251 return ret;
4252 }
4253 } else {
4254 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4255 "master_zone=" << zonegroup.master_zone << dendl;
4256 return -EINVAL;
4257 }
4258 }
4259 const auto& endpoints = master->second.endpoints;
4260 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4261 }
4262
4263 return 0;
4264 }
4265
4266
4267 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4268 {
4269 return target_zone.syncs_from(source_zone.name) &&
4270 sync_modules_manager->supports_data_export(source_zone.tier_type);
4271 }
4272
4273 /**
4274 * Initialize the RADOS instance and prepare to do other ops
4275 * Returns 0 on success, -ERR# on failure.
4276 */
4277 int RGWRados::init_complete()
4278 {
4279 int ret = realm.init(cct, this);
4280 if (ret < 0 && ret != -ENOENT) {
4281 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4282 return ret;
4283 } else if (ret != -ENOENT) {
4284 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4285 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4286 if (ret < 0 && ret != -ENOENT) {
4287 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4288 return ret;
4289 }
4290 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4291 }
4292
4293 ret = replace_region_with_zonegroup();
4294 if (ret < 0) {
4295 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4296 return ret;
4297 }
4298
4299 ret = convert_regionmap();
4300 if (ret < 0) {
4301 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4302 return ret;
4303 }
4304
4305 bool zg_initialized = false;
4306
4307 if (!current_period.get_id().empty()) {
4308 ret = init_zg_from_period(&zg_initialized);
4309 if (ret < 0) {
4310 return ret;
4311 }
4312 }
4313
4314 bool creating_defaults = false;
4315 bool using_local = (!zg_initialized);
4316 if (using_local) {
4317 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4318 ret = init_zg_from_local(&creating_defaults);
4319 if (ret < 0) {
4320 return ret;
4321 }
4322 // read period_config into current_period
4323 auto& period_config = current_period.get_config();
4324 ret = period_config.read(this, zonegroup.realm_id);
4325 if (ret < 0 && ret != -ENOENT) {
4326 ldout(cct, 0) << "ERROR: failed to read period config: "
4327 << cpp_strerror(ret) << dendl;
4328 return ret;
4329 }
4330 }
4331
4332 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4333 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4334 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4335 zone_params.set_name(default_zone_name);
4336 }
4337
4338 ret = zone_params.init(cct, this);
4339 if (ret < 0 && ret != -ENOENT) {
4340 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4341 return ret;
4342 }
4343 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4344 if (zone_iter == get_zonegroup().zones.end()) {
4345 if (using_local) {
4346 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4347 return -EINVAL;
4348 }
4349 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4350 ret = init_zg_from_local(&creating_defaults);
4351 if (ret < 0) {
4352 return ret;
4353 }
4354 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4355 }
4356 if (zone_iter != get_zonegroup().zones.end()) {
4357 zone_public_config = zone_iter->second;
4358 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4359 } else {
4360 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4361 return -EINVAL;
4362 }
4363
4364 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4365
4366 if (run_sync_thread) {
4367 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4368 if (ret < 0) {
4369 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4370 return ret;
4371 }
4372 }
4373
4374 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4375
4376 init_unique_trans_id_deps();
4377
4378 finisher = new Finisher(cct);
4379 finisher->start();
4380
4381 period_puller.reset(new RGWPeriodPuller(this));
4382 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4383 current_period));
4384
4385 if (need_watch_notify()) {
4386 ret = init_watch();
4387 if (ret < 0) {
4388 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4389 return ret;
4390 }
4391 }
4392
4393 /* first build all zones index */
4394 for (auto ziter : get_zonegroup().zones) {
4395 const string& id = ziter.first;
4396 RGWZone& z = ziter.second;
4397 zone_id_by_name[z.name] = id;
4398 zone_by_id[id] = z;
4399 }
4400
4401 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4402 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4403 }
4404 zone_public_config = zone_by_id[zone_id()];
4405 for (auto ziter : get_zonegroup().zones) {
4406 const string& id = ziter.first;
4407 RGWZone& z = ziter.second;
4408 if (id == zone_id()) {
4409 continue;
4410 }
4411 if (z.endpoints.empty()) {
4412 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4413 continue;
4414 }
4415 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4416 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4417 zone_conn_map[id] = conn;
4418 if (zone_syncs_from(zone_public_config, z) ||
4419 zone_syncs_from(z, zone_public_config)) {
4420 if (zone_syncs_from(zone_public_config, z)) {
4421 zone_data_sync_from_map[id] = conn;
4422 }
4423 if (zone_syncs_from(z, zone_public_config)) {
4424 zone_data_notify_to_map[id] = conn;
4425 }
4426 } else {
4427 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4428 }
4429 }
4430
4431 ret = open_root_pool_ctx();
4432 if (ret < 0)
4433 return ret;
4434
4435 ret = open_gc_pool_ctx();
4436 if (ret < 0)
4437 return ret;
4438
4439 ret = open_lc_pool_ctx();
4440 if (ret < 0)
4441 return ret;
4442
4443 ret = open_objexp_pool_ctx();
4444 if (ret < 0)
4445 return ret;
4446
4447 ret = open_reshard_pool_ctx();
4448 if (ret < 0)
4449 return ret;
4450
4451 pools_initialized = true;
4452
4453 gc = new RGWGC();
4454 gc->initialize(cct, this);
4455
4456 obj_expirer = new RGWObjectExpirer(this);
4457
4458 if (use_gc_thread) {
4459 gc->start_processor();
4460 obj_expirer->start_processor();
4461 }
4462
4463 if (run_sync_thread) {
4464 // initialize the log period history. we want to do this any time we're not
4465 // running under radosgw-admin, so we check run_sync_thread here before
4466 // disabling it based on the zone/zonegroup setup
4467 meta_mgr->init_oldest_log_period();
4468 }
4469
4470 /* no point of running sync thread if we don't have a master zone configured
4471 or there is no rest_master_conn */
4472 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4473 || current_period.get_id().empty()) {
4474 run_sync_thread = false;
4475 }
4476
4477 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4478 async_rados->start();
4479
4480 ret = meta_mgr->init(current_period.get_id());
4481 if (ret < 0) {
4482 lderr(cct) << "ERROR: failed to initialize metadata log: "
4483 << cpp_strerror(-ret) << dendl;
4484 return ret;
4485 }
4486
4487 if (is_meta_master()) {
4488 auto md_log = meta_mgr->get_log(current_period.get_id());
4489 meta_notifier = new RGWMetaNotifier(this, md_log);
4490 meta_notifier->start();
4491 }
4492
4493 if (run_sync_thread) {
4494 Mutex::Locker l(meta_sync_thread_lock);
4495 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4496 ret = meta_sync_processor_thread->init();
4497 if (ret < 0) {
4498 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4499 return ret;
4500 }
4501 meta_sync_processor_thread->start();
4502
4503 Mutex::Locker dl(data_sync_thread_lock);
4504 for (auto iter : zone_data_sync_from_map) {
4505 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4506 RGWDataSyncProcessorThread *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first);
4507 ret = thread->init();
4508 if (ret < 0) {
4509 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4510 return ret;
4511 }
4512 thread->start();
4513 data_sync_processor_threads[iter.first] = thread;
4514 }
4515 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4516 if (interval > 0) {
4517 sync_log_trimmer = new RGWSyncLogTrimThread(this, interval);
4518 ret = sync_log_trimmer->init();
4519 if (ret < 0) {
4520 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4521 return ret;
4522 }
4523 sync_log_trimmer->start();
4524 }
4525 }
4526 data_notifier = new RGWDataNotifier(this);
4527 data_notifier->start();
4528
4529 lc = new RGWLC();
4530 lc->initialize(cct, this);
4531
4532 if (use_lc_thread)
4533 lc->start_processor();
4534
4535 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4536
4537 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4538 get_zone().bucket_index_max_shards);
4539 if (bucket_index_max_shards > get_max_bucket_shards()) {
4540 bucket_index_max_shards = get_max_bucket_shards();
4541 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4542 << get_max_bucket_shards() << dendl;
4543 }
4544 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4545
4546 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4547 binfo_cache->init(this);
4548
4549 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4550
4551 if (need_tombstone_cache) {
4552 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4553 }
4554
4555 reshard_wait = std::make_shared<RGWReshardWait>(this);
4556
4557 reshard = new RGWReshard(this);
4558
4559 /* only the master zone in the zonegroup reshards buckets */
4560 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4561 if (run_reshard_thread) {
4562 reshard->start_processor();
4563 }
4564
4565 index_completion_manager = new RGWIndexCompletionManager(this);
4566 ret = index_completion_manager->start();
4567
4568 return ret;
4569 }
4570
4571 /**
4572 * Initialize the RADOS instance and prepare to do other ops
4573 * Returns 0 on success, -ERR# on failure.
4574 */
4575 int RGWRados::initialize()
4576 {
4577 int ret;
4578
4579 ret = init_rados();
4580 if (ret < 0)
4581 return ret;
4582
4583 return init_complete();
4584 }
4585
4586 void RGWRados::finalize_watch()
4587 {
4588 for (int i = 0; i < num_watchers; i++) {
4589 RGWWatcher *watcher = watchers[i];
4590 watcher->unregister_watch();
4591 delete watcher;
4592 }
4593
4594 delete[] notify_oids;
4595 delete[] watchers;
4596 }
4597
4598 void RGWRados::schedule_context(Context *c) {
4599 finisher->queue(c);
4600 }
4601
4602 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4603 {
4604 bool is_truncated;
4605 RGWListRawObjsCtx ctx;
4606 do {
4607 list<string> oids;
4608 int r = list_raw_objects(pool, prefix, 1000,
4609 ctx, oids, &is_truncated);
4610 if (r < 0) {
4611 return r;
4612 }
4613 list<string>::iterator iter;
4614 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4615 string& val = *iter;
4616 if (val.size() > prefix.size())
4617 result.push_back(val.substr(prefix.size()));
4618 }
4619 } while (is_truncated);
4620
4621 return 0;
4622 }
4623
4624 int RGWRados::list_regions(list<string>& regions)
4625 {
4626 RGWZoneGroup zonegroup;
4627
4628 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4629 }
4630
4631 int RGWRados::list_zonegroups(list<string>& zonegroups)
4632 {
4633 RGWZoneGroup zonegroup;
4634
4635 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4636 }
4637
4638 int RGWRados::list_zones(list<string>& zones)
4639 {
4640 RGWZoneParams zoneparams;
4641
4642 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4643 }
4644
4645 int RGWRados::list_realms(list<string>& realms)
4646 {
4647 RGWRealm realm(cct, this);
4648 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4649 }
4650
4651 int RGWRados::list_periods(list<string>& periods)
4652 {
4653 RGWPeriod period;
4654 list<string> raw_periods;
4655 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4656 if (ret < 0) {
4657 return ret;
4658 }
4659 for (const auto& oid : raw_periods) {
4660 size_t pos = oid.find(".");
4661 if (pos != std::string::npos) {
4662 periods.push_back(oid.substr(0, pos));
4663 } else {
4664 periods.push_back(oid);
4665 }
4666 }
4667 periods.sort(); // unique() only detects duplicates if they're adjacent
4668 periods.unique();
4669 return 0;
4670 }
4671
4672
4673 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4674 {
4675 int ret = 0;
4676 string period_id = current_period;
4677 while(!period_id.empty()) {
4678 RGWPeriod period(period_id);
4679 ret = period.init(cct, this);
4680 if (ret < 0) {
4681 return ret;
4682 }
4683 periods.push_back(period.get_id());
4684 period_id = period.get_predecessor();
4685 }
4686
4687 return ret;
4688 }
4689
4690 /**
4691 * Open the pool used as root for this gateway
4692 * Returns: 0 on success, -ERR# otherwise.
4693 */
4694 int RGWRados::open_root_pool_ctx()
4695 {
4696 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4697 }
4698
4699 int RGWRados::open_gc_pool_ctx()
4700 {
4701 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4702 }
4703
4704 int RGWRados::open_lc_pool_ctx()
4705 {
4706 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4707 }
4708
4709 int RGWRados::open_objexp_pool_ctx()
4710 {
4711 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4712 }
4713
4714 int RGWRados::open_reshard_pool_ctx()
4715 {
4716 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4717 }
4718
4719 int RGWRados::init_watch()
4720 {
4721 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4722 if (r < 0) {
4723 return r;
4724 }
4725
4726 num_watchers = cct->_conf->rgw_num_control_oids;
4727
4728 bool compat_oid = (num_watchers == 0);
4729
4730 if (num_watchers <= 0)
4731 num_watchers = 1;
4732
4733 notify_oids = new string[num_watchers];
4734 watchers = new RGWWatcher *[num_watchers];
4735
4736 for (int i=0; i < num_watchers; i++) {
4737 string& notify_oid = notify_oids[i];
4738 notify_oid = notify_oid_prefix;
4739 if (!compat_oid) {
4740 char buf[16];
4741 snprintf(buf, sizeof(buf), ".%d", i);
4742 notify_oid.append(buf);
4743 }
4744 r = control_pool_ctx.create(notify_oid, false);
4745 if (r < 0 && r != -EEXIST)
4746 return r;
4747
4748 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4749 watchers[i] = watcher;
4750
4751 r = watcher->register_watch();
4752 if (r < 0)
4753 return r;
4754 }
4755
4756 watch_initialized = true;
4757
4758 set_cache_enabled(true);
4759
4760 return 0;
4761 }
4762
4763 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4764 {
4765 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4766
4767 int i = r % num_watchers;
4768 char buf[16];
4769 snprintf(buf, sizeof(buf), ".%d", i);
4770
4771 notify_oid = notify_oid_prefix;
4772 notify_oid.append(buf);
4773 }
4774
4775 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4776 {
4777 librados::Rados *rad = get_rados_handle();
4778 int r = rgw_init_ioctx(rad, pool, io_ctx);
4779 if (r != -ENOENT)
4780 return r;
4781
4782 if (!pools_initialized)
4783 return r;
4784
4785 r = rad->pool_create(pool.name.c_str());
4786 if (r < 0 && r != -EEXIST)
4787 return r;
4788
4789 return rgw_init_ioctx(rad, pool, io_ctx);
4790 }
4791
4792 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4793 string *marker) {
4794 if (marker) {
4795 *marker = shard_id_str;
4796 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4797 marker->append(shard_marker);
4798 }
4799 }
4800
4801 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4802 {
4803 const string *rule = &bucket_info.placement_rule;
4804 if (rule->empty()) {
4805 rule = &zonegroup.default_placement;
4806 }
4807 auto iter = zone_params.placement_pools.find(*rule);
4808 if (iter == zone_params.placement_pools.end()) {
4809 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4810 return -EINVAL;
4811 }
4812
4813 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4814 if (r < 0)
4815 return r;
4816
4817 return 0;
4818 }
4819
4820 /**
4821 * set up a bucket listing.
4822 * handle is filled in.
4823 * Returns 0 on success, -ERR# otherwise.
4824 */
4825 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4826 {
4827 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4828 *handle = (RGWAccessHandle)state;
4829 return 0;
4830 }
4831
4832 /**
4833 * get the next bucket in the listing.
4834 * obj is filled in,
4835 * handle is updated.
4836 * returns 0 on success, -ERR# otherwise.
4837 */
4838 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4839 {
4840 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4841
4842 do {
4843 if (*state == root_pool_ctx.nobjects_end()) {
4844 delete state;
4845 return -ENOENT;
4846 }
4847
4848 obj.key.name = (*state)->get_oid();
4849 if (obj.key.name[0] == '_') {
4850 obj.key.name = obj.key.name.substr(1);
4851 }
4852
4853 (*state)++;
4854 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4855
4856 return 0;
4857 }
4858
4859
4860 /**** logs ****/
4861
4862 struct log_list_state {
4863 string prefix;
4864 librados::IoCtx io_ctx;
4865 librados::NObjectIterator obit;
4866 };
4867
4868 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4869 {
4870 log_list_state *state = new log_list_state;
4871 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4872 if (r < 0) {
4873 delete state;
4874 return r;
4875 }
4876 state->prefix = prefix;
4877 state->obit = state->io_ctx.nobjects_begin();
4878 *handle = (RGWAccessHandle)state;
4879 return 0;
4880 }
4881
4882 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4883 {
4884 log_list_state *state = static_cast<log_list_state *>(handle);
4885 while (true) {
4886 if (state->obit == state->io_ctx.nobjects_end()) {
4887 delete state;
4888 return -ENOENT;
4889 }
4890 if (state->prefix.length() &&
4891 state->obit->get_oid().find(state->prefix) != 0) {
4892 state->obit++;
4893 continue;
4894 }
4895 *name = state->obit->get_oid();
4896 state->obit++;
4897 break;
4898 }
4899 return 0;
4900 }
4901
4902 int RGWRados::log_remove(const string& name)
4903 {
4904 librados::IoCtx io_ctx;
4905 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4906 if (r < 0)
4907 return r;
4908 return io_ctx.remove(name);
4909 }
4910
4911 struct log_show_state {
4912 librados::IoCtx io_ctx;
4913 bufferlist bl;
4914 bufferlist::iterator p;
4915 string name;
4916 uint64_t pos;
4917 bool eof;
4918 log_show_state() : pos(0), eof(false) {}
4919 };
4920
4921 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
4922 {
4923 log_show_state *state = new log_show_state;
4924 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4925 if (r < 0) {
4926 delete state;
4927 return r;
4928 }
4929 state->name = name;
4930 *handle = (RGWAccessHandle)state;
4931 return 0;
4932 }
4933
4934 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
4935 {
4936 log_show_state *state = static_cast<log_show_state *>(handle);
4937 off_t off = state->p.get_off();
4938
4939 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
4940 << " off " << off
4941 << " eof " << (int)state->eof
4942 << dendl;
4943 // read some?
4944 unsigned chunk = 1024*1024;
4945 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
4946 bufferlist more;
4947 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
4948 if (r < 0)
4949 return r;
4950 state->pos += r;
4951 bufferlist old;
4952 try {
4953 old.substr_of(state->bl, off, state->bl.length() - off);
4954 } catch (buffer::error& err) {
4955 return -EINVAL;
4956 }
4957 state->bl.clear();
4958 state->bl.claim(old);
4959 state->bl.claim_append(more);
4960 state->p = state->bl.begin();
4961 if ((unsigned)r < chunk)
4962 state->eof = true;
4963 ldout(cct, 10) << " read " << r << dendl;
4964 }
4965
4966 if (state->p.end())
4967 return 0; // end of file
4968 try {
4969 ::decode(*entry, state->p);
4970 }
4971 catch (const buffer::error &e) {
4972 return -EINVAL;
4973 }
4974 return 1;
4975 }
4976
4977 /**
4978 * usage_log_hash: get usage log key hash, based on name and index
4979 *
4980 * Get the usage object name. Since a user may have more than 1
4981 * object holding that info (multiple shards), we use index to
4982 * specify that shard number. Once index exceeds max shards it
4983 * wraps.
4984 * If name is not being set, results for all users will be returned
4985 * and index will wrap only after total shards number.
4986 *
4987 * @param cct [in] ceph context
4988 * @param name [in] user name
4989 * @param hash [out] hash value
4990 * @param index [in] shard index number
4991 */
4992 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
4993 {
4994 uint32_t val = index;
4995
4996 if (!name.empty()) {
4997 int max_user_shards = max(cct->_conf->rgw_usage_max_user_shards, 1);
4998 val %= max_user_shards;
4999 val += ceph_str_hash_linux(name.c_str(), name.size());
5000 }
5001 char buf[17];
5002 int max_shards = max(cct->_conf->rgw_usage_max_shards, 1);
5003 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5004 hash = buf;
5005 }
5006
5007 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5008 {
5009 uint32_t index = 0;
5010
5011 map<string, rgw_usage_log_info> log_objs;
5012
5013 string hash;
5014 string last_user;
5015
5016 /* restructure usage map, zone by object hash */
5017 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5018 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5019 const rgw_user_bucket& ub = iter->first;
5020 RGWUsageBatch& info = iter->second;
5021
5022 if (ub.user.empty()) {
5023 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5024 continue;
5025 }
5026
5027 if (ub.user != last_user) {
5028 /* index *should* be random, but why waste extra cycles
5029 in most cases max user shards is not going to exceed 1,
5030 so just incrementing it */
5031 usage_log_hash(cct, ub.user, hash, index++);
5032 }
5033 last_user = ub.user;
5034 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5035
5036 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5037 v.push_back(miter->second);
5038 }
5039 }
5040
5041 map<string, rgw_usage_log_info>::iterator liter;
5042
5043 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5044 int r = cls_obj_usage_log_add(liter->first, liter->second);
5045 if (r < 0)
5046 return r;
5047 }
5048 return 0;
5049 }
5050
5051 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5052 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5053 {
5054 uint32_t num = max_entries;
5055 string hash, first_hash;
5056 string user_str = user.to_str();
5057 usage_log_hash(cct, user_str, first_hash, 0);
5058
5059 if (usage_iter.index) {
5060 usage_log_hash(cct, user_str, hash, usage_iter.index);
5061 } else {
5062 hash = first_hash;
5063 }
5064
5065 usage.clear();
5066
5067 do {
5068 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5069 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5070
5071 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5072 usage_iter.read_iter, ret_usage, is_truncated);
5073 if (ret == -ENOENT)
5074 goto next;
5075
5076 if (ret < 0)
5077 return ret;
5078
5079 num -= ret_usage.size();
5080
5081 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5082 usage[iter->first].aggregate(iter->second);
5083 }
5084
5085 next:
5086 if (!*is_truncated) {
5087 usage_iter.read_iter.clear();
5088 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5089 }
5090 } while (num && !*is_truncated && hash != first_hash);
5091 return 0;
5092 }
5093
5094 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5095 {
5096 uint32_t index = 0;
5097 string hash, first_hash;
5098 string user_str = user.to_str();
5099 usage_log_hash(cct, user_str, first_hash, index);
5100
5101 hash = first_hash;
5102
5103 do {
5104 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5105 if (ret == -ENOENT)
5106 goto next;
5107
5108 if (ret < 0)
5109 return ret;
5110
5111 next:
5112 usage_log_hash(cct, user_str, hash, ++index);
5113 } while (hash != first_hash);
5114
5115 return 0;
5116 }
5117
5118 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5119 {
5120 return rgw_shards_hash(key, max_shards);
5121 }
5122
5123 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5124 {
5125 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5126 char buf[16];
5127 if (shard_id) {
5128 *shard_id = val % max_shards;
5129 }
5130 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5131 name = prefix + buf;
5132 }
5133
5134 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5135 {
5136 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5137 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5138 char buf[16];
5139 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5140 name = prefix + buf;
5141 }
5142
5143 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5144 {
5145 char buf[16];
5146 snprintf(buf, sizeof(buf), "%u", shard_id);
5147 name = prefix + buf;
5148
5149 }
5150
5151 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5152 {
5153 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5154 }
5155
5156 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5157 {
5158 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5159
5160 }
5161
5162 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5163 {
5164 librados::IoCtx io_ctx;
5165
5166 int r = time_log_add_init(io_ctx);
5167 if (r < 0) {
5168 return r;
5169 }
5170
5171 ObjectWriteOperation op;
5172 utime_t t(ut);
5173 cls_log_add(op, t, section, key, bl);
5174
5175 return io_ctx.operate(oid, &op);
5176 }
5177
5178 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5179 librados::AioCompletion *completion, bool monotonic_inc)
5180 {
5181 librados::IoCtx io_ctx;
5182
5183 int r = time_log_add_init(io_ctx);
5184 if (r < 0) {
5185 return r;
5186 }
5187
5188 ObjectWriteOperation op;
5189 cls_log_add(op, entries, monotonic_inc);
5190
5191 if (!completion) {
5192 r = io_ctx.operate(oid, &op);
5193 } else {
5194 r = io_ctx.aio_operate(oid, completion, &op);
5195 }
5196 return r;
5197 }
5198
5199 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5200 int max_entries, list<cls_log_entry>& entries,
5201 const string& marker,
5202 string *out_marker,
5203 bool *truncated)
5204 {
5205 librados::IoCtx io_ctx;
5206
5207 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5208 if (r < 0)
5209 return r;
5210 librados::ObjectReadOperation op;
5211
5212 utime_t st(start_time);
5213 utime_t et(end_time);
5214
5215 cls_log_list(op, st, et, marker, max_entries, entries,
5216 out_marker, truncated);
5217
5218 bufferlist obl;
5219
5220 int ret = io_ctx.operate(oid, &op, &obl);
5221 if (ret < 0)
5222 return ret;
5223
5224 return 0;
5225 }
5226
5227 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5228 {
5229 librados::IoCtx io_ctx;
5230
5231 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5232 if (r < 0)
5233 return r;
5234 librados::ObjectReadOperation op;
5235
5236 cls_log_info(op, header);
5237
5238 bufferlist obl;
5239
5240 int ret = io_ctx.operate(oid, &op, &obl);
5241 if (ret < 0)
5242 return ret;
5243
5244 return 0;
5245 }
5246
5247 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5248 {
5249 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5250 if (r < 0)
5251 return r;
5252
5253 librados::ObjectReadOperation op;
5254
5255 cls_log_info(op, header);
5256
5257 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5258 if (ret < 0)
5259 return ret;
5260
5261 return 0;
5262 }
5263
5264 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5265 const string& from_marker, const string& to_marker,
5266 librados::AioCompletion *completion)
5267 {
5268 librados::IoCtx io_ctx;
5269
5270 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5271 if (r < 0)
5272 return r;
5273
5274 utime_t st(start_time);
5275 utime_t et(end_time);
5276
5277 ObjectWriteOperation op;
5278 cls_log_trim(op, st, et, from_marker, to_marker);
5279
5280 if (!completion) {
5281 r = io_ctx.operate(oid, &op);
5282 } else {
5283 r = io_ctx.aio_operate(oid, completion, &op);
5284 }
5285 return r;
5286 }
5287
5288 string RGWRados::objexp_hint_get_shardname(int shard_num)
5289 {
5290 char buf[32];
5291 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5292
5293 string objname("obj_delete_at_hint.");
5294 return objname + buf;
5295 }
5296
5297 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5298 {
5299 string obj_key = key.name + key.instance;
5300 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5301 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
5302 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
5303 sid = rgw_shards_mod(sid2, num_shards);
5304 return sid;
5305 }
5306
5307 static string objexp_hint_get_keyext(const string& tenant_name,
5308 const string& bucket_name,
5309 const string& bucket_id,
5310 const rgw_obj_key& obj_key)
5311 {
5312 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5313 ":" + obj_key.name + ":" + obj_key.instance;
5314 }
5315
5316 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5317 const string& tenant_name,
5318 const string& bucket_name,
5319 const string& bucket_id,
5320 const rgw_obj_index_key& obj_key)
5321 {
5322 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5323 bucket_id, obj_key);
5324 objexp_hint_entry he = {
5325 .tenant = tenant_name,
5326 .bucket_name = bucket_name,
5327 .bucket_id = bucket_id,
5328 .obj_key = obj_key,
5329 .exp_time = delete_at };
5330 bufferlist hebl;
5331 ::encode(he, hebl);
5332 ObjectWriteOperation op;
5333 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5334
5335 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5336 return objexp_pool_ctx.operate(shard_name, &op);
5337 }
5338
5339 void RGWRados::objexp_get_shard(int shard_num,
5340 string& shard) /* out */
5341 {
5342 shard = objexp_hint_get_shardname(shard_num);
5343 }
5344
5345 int RGWRados::objexp_hint_list(const string& oid,
5346 const ceph::real_time& start_time,
5347 const ceph::real_time& end_time,
5348 const int max_entries,
5349 const string& marker,
5350 list<cls_timeindex_entry>& entries, /* out */
5351 string *out_marker, /* out */
5352 bool *truncated) /* out */
5353 {
5354 librados::ObjectReadOperation op;
5355 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5356 out_marker, truncated);
5357
5358 bufferlist obl;
5359 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5360
5361 if ((ret < 0 ) && (ret != -ENOENT)) {
5362 return ret;
5363 }
5364
5365 if ((ret == -ENOENT) && truncated) {
5366 *truncated = false;
5367 }
5368
5369 return 0;
5370 }
5371
5372 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5373 objexp_hint_entry& hint_entry) /* out */
5374 {
5375 try {
5376 bufferlist::iterator iter = ti_entry.value.begin();
5377 ::decode(hint_entry, iter);
5378 } catch (buffer::error& err) {
5379 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5380 }
5381
5382 return 0;
5383 }
5384
5385 int RGWRados::objexp_hint_trim(const string& oid,
5386 const ceph::real_time& start_time,
5387 const ceph::real_time& end_time,
5388 const string& from_marker,
5389 const string& to_marker)
5390 {
5391 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5392 from_marker, to_marker);
5393 if ((ret < 0 ) && (ret != -ENOENT)) {
5394 return ret;
5395 }
5396
5397 return 0;
5398 }
5399
5400 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5401 string& zone_id, string& owner_id) {
5402 librados::IoCtx io_ctx;
5403
5404 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5405 if (r < 0) {
5406 return r;
5407 }
5408 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5409 utime_t ut(msec / 1000, msec % 1000);
5410
5411 rados::cls::lock::Lock l(log_lock_name);
5412 l.set_duration(ut);
5413 l.set_cookie(owner_id);
5414 l.set_tag(zone_id);
5415 l.set_renew(true);
5416
5417 return l.lock_exclusive(&io_ctx, oid);
5418 }
5419
5420 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5421 librados::IoCtx io_ctx;
5422
5423 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5424 if (r < 0) {
5425 return r;
5426 }
5427
5428 rados::cls::lock::Lock l(log_lock_name);
5429 l.set_tag(zone_id);
5430 l.set_cookie(owner_id);
5431
5432 return l.unlock(&io_ctx, oid);
5433 }
5434
5435 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5436 {
5437 bufferlist::iterator i = bl.begin();
5438 RGWAccessControlPolicy policy(cct);
5439 try {
5440 policy.decode_owner(i);
5441 } catch (buffer::error& err) {
5442 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5443 return -EIO;
5444 }
5445 *owner = policy.get_owner();
5446 return 0;
5447 }
5448
5449 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5450 {
5451 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5452 if (aiter == attrset.end())
5453 return -EIO;
5454
5455 bufferlist& bl = aiter->second;
5456 bufferlist::iterator iter = bl.begin();
5457 try {
5458 policy->decode(iter);
5459 } catch (buffer::error& err) {
5460 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5461 return -EIO;
5462 }
5463 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5464 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5465 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5466 s3policy->to_xml(*_dout);
5467 *_dout << dendl;
5468 }
5469 return 0;
5470 }
5471
5472
5473 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5474 {
5475 rgw_bucket bucket = bucket_info.bucket;
5476 bucket.update_bucket_id(new_bucket_id);
5477
5478 RGWObjectCtx obj_ctx(store);
5479
5480 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5481 if (ret < 0) {
5482 return ret;
5483 }
5484
5485 return 0;
5486 }
5487
5488 /**
5489 * get listing of the objects in a bucket.
5490 *
5491 * max: maximum number of results to return
5492 * bucket: bucket to list contents of
5493 * prefix: only return results that match this prefix
5494 * delim: do not include results that match this string.
5495 * Any skipped results will have the matching portion of their name
5496 * inserted in common_prefixes with a "true" mark.
5497 * marker: if filled in, begin the listing with this object.
5498 * end_marker: if filled in, end the listing with this object.
5499 * result: the objects are put in here.
5500 * common_prefixes: if delim is filled in, any matching prefixes are placed here.
5501 * is_truncated: if number of objects in the bucket is bigger than max, then truncated.
5502 */
5503 int RGWRados::Bucket::List::list_objects(int max, vector<rgw_bucket_dir_entry> *result,
5504 map<string, bool> *common_prefixes,
5505 bool *is_truncated)
5506 {
5507 RGWRados *store = target->get_store();
5508 CephContext *cct = store->ctx();
5509 int shard_id = target->get_shard_id();
5510
5511 int count = 0;
5512 bool truncated = true;
5513 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
5514
5515 result->clear();
5516
5517 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
5518
5519 rgw_obj_key end_marker_obj;
5520 rgw_obj_index_key cur_end_marker;
5521 if (!params.ns.empty()) {
5522 end_marker_obj = rgw_obj_key(params.end_marker.name, params.end_marker.instance, params.ns);
5523 end_marker_obj.ns = params.ns;
5524 end_marker_obj.get_index_key(&cur_end_marker);
5525 }
5526 rgw_obj_index_key cur_marker;
5527 marker_obj.get_index_key(&cur_marker);
5528
5529 const bool cur_end_marker_valid = !params.end_marker.empty();
5530
5531 rgw_obj_key prefix_obj(params.prefix);
5532 prefix_obj.ns = params.ns;
5533 string cur_prefix = prefix_obj.get_index_key_name();
5534
5535 string bigger_than_delim;
5536
5537 if (!params.delim.empty()) {
5538 unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
5539 char buf[params.delim.size() + 16];
5540 int r = encode_utf8(val + 1, (unsigned char *)buf);
5541 if (r < 0) {
5542 ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
5543 return -EINVAL;
5544 }
5545 buf[r] = '\0';
5546
5547 bigger_than_delim = buf;
5548
5549 /* if marker points at a common prefix, fast forward it into its upperbound string */
5550 int delim_pos = cur_marker.name.find(params.delim, params.prefix.size());
5551 if (delim_pos >= 0) {
5552 string s = cur_marker.name.substr(0, delim_pos);
5553 s.append(bigger_than_delim);
5554 cur_marker = s;
5555 }
5556 }
5557
5558 string skip_after_delim;
5559 while (truncated && count <= max) {
5560 if (skip_after_delim > cur_marker.name) {
5561 cur_marker = skip_after_delim;
5562 ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
5563 }
5564 std::map<string, rgw_bucket_dir_entry> ent_map;
5565 int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
5566 read_ahead + 1 - count, params.list_versions, ent_map,
5567 &truncated, &cur_marker);
5568 if (r < 0)
5569 return r;
5570
5571 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
5572 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
5573 rgw_bucket_dir_entry& entry = eiter->second;
5574 rgw_obj_index_key index_key = entry.key;
5575
5576 rgw_obj_key obj(index_key);
5577
5578 /* note that parse_raw_oid() here will not set the correct object's instance, as
5579 * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
5580 * not needed for the checks here and we end up using the raw entry for the return vector
5581 */
5582 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
5583 if (!valid) {
5584 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
5585 continue;
5586 }
5587 bool check_ns = (obj.ns == params.ns);
5588 if (!params.list_versions && !entry.is_visible()) {
5589 continue;
5590 }
5591
5592 if (params.enforce_ns && !check_ns) {
5593 if (!params.ns.empty()) {
5594 /* we've iterated past the namespace we're searching -- done now */
5595 truncated = false;
5596 goto done;
5597 }
5598
5599 /* we're not looking at the namespace this object is in, next! */
5600 continue;
5601 }
5602
5603 if (cur_end_marker_valid && cur_end_marker <= index_key) {
5604 truncated = false;
5605 goto done;
5606 }
5607
5608 if (count < max) {
5609 params.marker = index_key;
5610 next_marker = index_key;
5611 }
5612
5613 if (params.filter && !params.filter->filter(obj.name, index_key.name))
5614 continue;
5615
5616 if (params.prefix.size() && (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
5617 continue;
5618
5619 if (!params.delim.empty()) {
5620 int delim_pos = obj.name.find(params.delim, params.prefix.size());
5621
5622 if (delim_pos >= 0) {
5623 string prefix_key = obj.name.substr(0, delim_pos + 1);
5624
5625 if (common_prefixes &&
5626 common_prefixes->find(prefix_key) == common_prefixes->end()) {
5627 if (count >= max) {
5628 truncated = true;
5629 goto done;
5630 }
5631 next_marker = prefix_key;
5632 (*common_prefixes)[prefix_key] = true;
5633
5634 skip_after_delim = obj.name.substr(0, delim_pos);
5635 skip_after_delim.append(bigger_than_delim);
5636
5637 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
5638
5639 count++;
5640 }
5641
5642 continue;
5643 }
5644 }
5645
5646 if (count >= max) {
5647 truncated = true;
5648 goto done;
5649 }
5650
5651 result->emplace_back(std::move(entry));
5652 count++;
5653 }
5654
5655 // Either the back-end telling us truncated, or we don't consume all
5656 // items returned per the amount caller request
5657 truncated = (truncated || eiter != ent_map.end());
5658 }
5659
5660 done:
5661 if (is_truncated)
5662 *is_truncated = truncated;
5663
5664 return 0;
5665 }
5666
5667 /**
5668 * create a rados pool, associated meta info
5669 * returns 0 on success, -ERR# otherwise.
5670 */
5671 int RGWRados::create_pool(const rgw_pool& pool)
5672 {
5673 int ret = 0;
5674
5675 librados::Rados *rad = get_rados_handle();
5676 ret = rad->pool_create(pool.name.c_str(), 0);
5677 if (ret == -EEXIST)
5678 ret = 0;
5679 else if (ret == -ERANGE) {
5680 ldout(cct, 0)
5681 << __func__
5682 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-ret)
5683 << " (this can be due to a pool or placement group misconfiguration, e.g., pg_num < pgp_num)"
5684 << dendl;
5685 }
5686 if (ret < 0)
5687 return ret;
5688
5689 return 0;
5690 }
5691
5692 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5693 {
5694 librados::IoCtx index_ctx; // context for new bucket
5695
5696 string dir_oid = dir_oid_prefix;
5697 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5698 if (r < 0) {
5699 return r;
5700 }
5701
5702 dir_oid.append(bucket_info.bucket.bucket_id);
5703
5704 map<int, string> bucket_objs;
5705 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5706
5707 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5708 }
5709
5710 void RGWRados::create_bucket_id(string *bucket_id)
5711 {
5712 uint64_t iid = instance_id();
5713 uint64_t bid = next_bucket_id();
5714 char buf[get_zone_params().get_id().size() + 48];
5715 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5716 *bucket_id = buf;
5717 }
5718
5719 /**
5720 * create a bucket with name bucket and the given list of attrs
5721 * returns 0 on success, -ERR# otherwise.
5722 */
5723 int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
5724 const string& zonegroup_id,
5725 const string& placement_rule,
5726 const string& swift_ver_location,
5727 const RGWQuotaInfo * pquota_info,
5728 map<std::string, bufferlist>& attrs,
5729 RGWBucketInfo& info,
5730 obj_version *pobjv,
5731 obj_version *pep_objv,
5732 real_time creation_time,
5733 rgw_bucket *pmaster_bucket,
5734 uint32_t *pmaster_num_shards,
5735 bool exclusive)
5736 {
5737 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
5738 string selected_placement_rule_name;
5739 RGWZonePlacementInfo rule_info;
5740
5741 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
5742 int ret = 0;
5743 ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
5744 &selected_placement_rule_name, &rule_info);
5745 if (ret < 0)
5746 return ret;
5747
5748 if (!pmaster_bucket) {
5749 create_bucket_id(&bucket.marker);
5750 bucket.bucket_id = bucket.marker;
5751 } else {
5752 bucket.marker = pmaster_bucket->marker;
5753 bucket.bucket_id = pmaster_bucket->bucket_id;
5754 }
5755
5756 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
5757
5758 if (pobjv) {
5759 objv_tracker.write_version = *pobjv;
5760 } else {
5761 objv_tracker.generate_new_write_ver(cct);
5762 }
5763
5764 info.bucket = bucket;
5765 info.owner = owner.user_id;
5766 info.zonegroup = zonegroup_id;
5767 info.placement_rule = selected_placement_rule_name;
5768 info.index_type = rule_info.index_type;
5769 info.swift_ver_location = swift_ver_location;
5770 info.swift_versioning = (!swift_ver_location.empty());
5771 if (pmaster_num_shards) {
5772 info.num_shards = *pmaster_num_shards;
5773 } else {
5774 info.num_shards = bucket_index_max_shards;
5775 }
5776 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
5777 info.requester_pays = false;
5778 if (real_clock::is_zero(creation_time)) {
5779 info.creation_time = ceph::real_clock::now();
5780 } else {
5781 info.creation_time = creation_time;
5782 }
5783 if (pquota_info) {
5784 info.quota = *pquota_info;
5785 }
5786
5787 int r = init_bucket_index(info, info.num_shards);
5788 if (r < 0) {
5789 return r;
5790 }
5791
5792 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
5793 if (ret == -EEXIST) {
5794 librados::IoCtx index_ctx;
5795 map<int, string> bucket_objs;
5796 int r = open_bucket_index(info, index_ctx, bucket_objs);
5797 if (r < 0)
5798 return r;
5799
5800 /* we need to reread the info and return it, caller will have a use for it */
5801 RGWObjVersionTracker instance_ver = info.objv_tracker;
5802 info.objv_tracker.clear();
5803 RGWObjectCtx obj_ctx(this);
5804 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
5805 if (r < 0) {
5806 if (r == -ENOENT) {
5807 continue;
5808 }
5809 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
5810 return r;
5811 }
5812
5813 /* only remove it if it's a different bucket instance */
5814 if (info.bucket.bucket_id != bucket.bucket_id) {
5815 /* remove bucket meta instance */
5816 string entry = bucket.get_key();
5817 r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
5818 if (r < 0)
5819 return r;
5820
5821 map<int, string>::const_iterator biter;
5822 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
5823 // Do best effort removal
5824 index_ctx.remove(biter->second);
5825 }
5826 }
5827 /* ret == -ENOENT here */
5828 }
5829 return ret;
5830 }
5831
5832 /* this is highly unlikely */
5833 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
5834 return -ENOENT;
5835 }
5836
5837 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5838 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5839
5840 {
5841 /* first check that rule exists within the specific zonegroup */
5842 RGWZoneGroup zonegroup;
5843 int ret = get_zonegroup(zonegroup_id, zonegroup);
5844 if (ret < 0) {
5845 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5846 return ret;
5847 }
5848
5849 /* now check that tag exists within zonegroup */
5850 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5851 string rule = request_rule;
5852 if (rule.empty()) {
5853 rule = user_info.default_placement;
5854 if (rule.empty())
5855 rule = zonegroup.default_placement;
5856 }
5857
5858 if (rule.empty()) {
5859 ldout(cct, 0) << "misconfiguration, should not have an empty placement rule name" << dendl;
5860 return -EIO;
5861 }
5862
5863 map<string, RGWZoneGroupPlacementTarget>::iterator titer = zonegroup.placement_targets.find(rule);
5864 if (titer == zonegroup.placement_targets.end()) {
5865 ldout(cct, 0) << "could not find placement rule " << rule << " within zonegroup " << dendl;
5866 return -EINVAL;
5867 }
5868
5869 /* now check tag for the rule, whether user is permitted to use rule */
5870 RGWZoneGroupPlacementTarget& target_rule = titer->second;
5871 if (!target_rule.user_permitted(user_info.placement_tags)) {
5872 ldout(cct, 0) << "user not permitted to use placement rule" << dendl;
5873 return -EPERM;
5874 }
5875
5876 if (pselected_rule_name)
5877 *pselected_rule_name = rule;
5878
5879 return select_bucket_location_by_rule(rule, rule_info);
5880 }
5881
5882 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5883 {
5884 if (location_rule.empty()) {
5885 /* we can only reach here if we're trying to set a bucket location from a bucket
5886 * created on a different zone, using a legacy / default pool configuration
5887 */
5888 return select_legacy_bucket_placement(rule_info);
5889 }
5890
5891 /*
5892 * make sure that zone has this rule configured. We're
5893 * checking it for the local zone, because that's where this bucket object is going to
5894 * reside.
5895 */
5896 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
5897 if (piter == get_zone_params().placement_pools.end()) {
5898 /* couldn't find, means we cannot really place data for this bucket in this zone */
5899 if (get_zonegroup().equals(zonegroup_id)) {
5900 /* that's a configuration error, zone should have that rule, as we're within the requested
5901 * zonegroup */
5902 return -EINVAL;
5903 } else {
5904 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5905 return 0;
5906 }
5907 }
5908
5909 RGWZonePlacementInfo& placement_info = piter->second;
5910
5911 if (rule_info) {
5912 *rule_info = placement_info;
5913 }
5914
5915 return 0;
5916 }
5917
5918 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
5919 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5920 {
5921 if (!get_zone_params().placement_pools.empty()) {
5922 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
5923 pselected_rule_name, rule_info);
5924 }
5925
5926 if (pselected_rule_name) {
5927 pselected_rule_name->clear();
5928 }
5929
5930 return select_legacy_bucket_placement(rule_info);
5931 }
5932
5933 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
5934 {
5935 bufferlist map_bl;
5936 map<string, bufferlist> m;
5937 string pool_name;
5938 bool write_map = false;
5939
5940 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
5941
5942 RGWObjectCtx obj_ctx(this);
5943 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
5944 if (ret < 0) {
5945 goto read_omap;
5946 }
5947
5948 try {
5949 bufferlist::iterator iter = map_bl.begin();
5950 ::decode(m, iter);
5951 } catch (buffer::error& err) {
5952 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5953 }
5954
5955 read_omap:
5956 if (m.empty()) {
5957 bufferlist header;
5958 ret = omap_get_all(obj, header, m);
5959
5960 write_map = true;
5961 }
5962
5963 if (ret < 0 || m.empty()) {
5964 vector<rgw_pool> pools;
5965 string s = string("default.") + default_storage_pool_suffix;
5966 pools.push_back(rgw_pool(s));
5967 vector<int> retcodes;
5968 bufferlist bl;
5969 ret = create_pools(pools, retcodes);
5970 if (ret < 0)
5971 return ret;
5972 ret = omap_set(obj, s, bl);
5973 if (ret < 0)
5974 return ret;
5975 m[s] = bl;
5976 }
5977
5978 if (write_map) {
5979 bufferlist new_bl;
5980 ::encode(m, new_bl);
5981 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
5982 if (ret < 0) {
5983 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
5984 }
5985 }
5986
5987 map<string, bufferlist>::iterator miter;
5988 if (m.size() > 1) {
5989 vector<string> v;
5990 for (miter = m.begin(); miter != m.end(); ++miter) {
5991 v.push_back(miter->first);
5992 }
5993
5994 uint32_t r;
5995 ret = get_random_bytes((char *)&r, sizeof(r));
5996 if (ret < 0)
5997 return ret;
5998
5999 int i = r % v.size();
6000 pool_name = v[i];
6001 } else {
6002 miter = m.begin();
6003 pool_name = miter->first;
6004 }
6005
6006 rule_info->data_pool = pool_name;
6007 rule_info->data_extra_pool = pool_name;
6008 rule_info->index_pool = pool_name;
6009 rule_info->index_type = RGWBIType_Normal;
6010
6011 return 0;
6012 }
6013
6014 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6015 {
6016 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6017 }
6018
6019 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6020 {
6021 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6022
6023 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6024 }
6025
6026 int RGWRados::update_placement_map()
6027 {
6028 bufferlist header;
6029 map<string, bufferlist> m;
6030 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6031 int ret = omap_get_all(obj, header, m);
6032 if (ret < 0)
6033 return ret;
6034
6035 bufferlist new_bl;
6036 ::encode(m, new_bl);
6037 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6038 if (ret < 0) {
6039 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6040 }
6041
6042 return ret;
6043 }
6044
6045 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6046 {
6047 librados::Rados *rad = get_rados_handle();
6048 int ret = rad->pool_lookup(new_pool.name.c_str());
6049 if (ret < 0) // DNE, or something
6050 return ret;
6051
6052 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6053 bufferlist empty_bl;
6054 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6055
6056 // don't care about return value
6057 update_placement_map();
6058
6059 return ret;
6060 }
6061
6062 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6063 {
6064 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6065 int ret = omap_del(obj, old_pool.to_str());
6066
6067 // don't care about return value
6068 update_placement_map();
6069
6070 return ret;
6071 }
6072
6073 int RGWRados::list_placement_set(set<rgw_pool>& names)
6074 {
6075 bufferlist header;
6076 map<string, bufferlist> m;
6077
6078 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6079 int ret = omap_get_all(obj, header, m);
6080 if (ret < 0)
6081 return ret;
6082
6083 names.clear();
6084 map<string, bufferlist>::iterator miter;
6085 for (miter = m.begin(); miter != m.end(); ++miter) {
6086 names.insert(rgw_pool(miter->first));
6087 }
6088
6089 return names.size();
6090 }
6091
6092 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6093 {
6094 vector<librados::PoolAsyncCompletion *> completions;
6095 vector<int> rets;
6096
6097 librados::Rados *rad = get_rados_handle();
6098 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6099 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6100 completions.push_back(c);
6101 rgw_pool& pool = *iter;
6102 int ret = rad->pool_create_async(pool.name.c_str(), c);
6103 rets.push_back(ret);
6104 }
6105
6106 vector<int>::iterator riter;
6107 vector<librados::PoolAsyncCompletion *>::iterator citer;
6108
6109 assert(rets.size() == completions.size());
6110 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6111 int r = *riter;
6112 PoolAsyncCompletion *c = *citer;
6113 if (r == 0) {
6114 c->wait();
6115 r = c->get_return_value();
6116 if (r < 0) {
6117 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6118 }
6119 }
6120 c->release();
6121 retcodes.push_back(r);
6122 }
6123 return 0;
6124 }
6125
6126 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6127 {
6128 string oid, key;
6129 get_obj_bucket_and_oid_loc(obj, oid, key);
6130
6131 rgw_pool pool;
6132 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6133 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6134 return -EIO;
6135 }
6136
6137 int r = open_pool_ctx(pool, *ioctx);
6138 if (r < 0) {
6139 return r;
6140 }
6141
6142 ioctx->locator_set_key(key);
6143
6144 return 0;
6145 }
6146
6147 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6148 {
6149 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6150
6151 rgw_pool pool;
6152 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6153 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6154 return -EIO;
6155 }
6156
6157 int r = open_pool_ctx(pool, ref->ioctx);
6158 if (r < 0) {
6159 return r;
6160 }
6161
6162 ref->ioctx.locator_set_key(ref->key);
6163
6164 return 0;
6165 }
6166
6167 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref, rgw_pool *pool)
6168 {
6169 ref->oid = obj.oid;
6170 ref->key = obj.loc;
6171
6172 int r;
6173
6174 if (ref->oid.empty()) {
6175 ref->oid = obj.pool.to_str();
6176 ref->pool = get_zone_params().domain_root;
6177 } else {
6178 ref->pool = obj.pool;
6179 }
6180 if (pool) {
6181 *pool = ref->pool;
6182 }
6183 r = open_pool_ctx(ref->pool, ref->ioctx);
6184 if (r < 0)
6185 return r;
6186
6187 ref->ioctx.locator_set_key(ref->key);
6188
6189 return 0;
6190 }
6191
6192 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref, rgw_pool *pool)
6193 {
6194 return get_raw_obj_ref(obj, ref, pool);
6195 }
6196
6197 /*
6198 * fixes an issue where head objects were supposed to have a locator created, but ended
6199 * up without one
6200 */
6201 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6202 {
6203 const rgw_bucket& bucket = bucket_info.bucket;
6204 string oid;
6205 string locator;
6206
6207 rgw_obj obj(bucket, key);
6208
6209 get_obj_bucket_and_oid_loc(obj, oid, locator);
6210
6211 if (locator.empty()) {
6212 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6213 return 0;
6214 }
6215
6216 librados::IoCtx ioctx;
6217
6218 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6219 if (ret < 0) {
6220 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6221 return ret;
6222 }
6223 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6224
6225 uint64_t size;
6226 bufferlist data;
6227
6228 struct timespec mtime_ts;
6229 map<string, bufferlist> attrs;
6230 librados::ObjectReadOperation op;
6231 op.getxattrs(&attrs, NULL);
6232 op.stat2(&size, &mtime_ts, NULL);
6233 #define HEAD_SIZE 512 * 1024
6234 op.read(0, HEAD_SIZE, &data, NULL);
6235
6236 ret = ioctx.operate(oid, &op, NULL);
6237 if (ret < 0) {
6238 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6239 return ret;
6240 }
6241
6242 if (size > HEAD_SIZE) {
6243 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6244 return -EIO;
6245 }
6246
6247 if (size != data.length()) {
6248 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6249 return -EIO;
6250 }
6251
6252 if (copy_obj) {
6253 librados::ObjectWriteOperation wop;
6254
6255 wop.mtime2(&mtime_ts);
6256
6257 map<string, bufferlist>::iterator iter;
6258 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6259 wop.setxattr(iter->first.c_str(), iter->second);
6260 }
6261
6262 wop.write(0, data);
6263
6264 ioctx.locator_set_key(locator);
6265 ioctx.operate(oid, &wop);
6266 }
6267
6268 if (remove_bad) {
6269 ioctx.locator_set_key(string());
6270
6271 ret = ioctx.remove(oid);
6272 if (ret < 0) {
6273 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6274 return ret;
6275 }
6276 }
6277
6278 return 0;
6279 }
6280
6281 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6282 const string& src_oid, const string& src_locator,
6283 librados::IoCtx& dst_ioctx,
6284 const string& dst_oid, const string& dst_locator)
6285 {
6286
6287 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6288 bool done = false;
6289 uint64_t chunk_size = COPY_BUF_SIZE;
6290 uint64_t ofs = 0;
6291 int ret = 0;
6292 real_time mtime;
6293 struct timespec mtime_ts;
6294 uint64_t size;
6295
6296 if (src_oid == dst_oid && src_locator == dst_locator) {
6297 return 0;
6298 }
6299
6300 src_ioctx.locator_set_key(src_locator);
6301 dst_ioctx.locator_set_key(dst_locator);
6302
6303 do {
6304 bufferlist data;
6305 ObjectReadOperation rop;
6306 ObjectWriteOperation wop;
6307
6308 if (ofs == 0) {
6309 rop.stat2(&size, &mtime_ts, NULL);
6310 mtime = real_clock::from_timespec(mtime_ts);
6311 }
6312 rop.read(ofs, chunk_size, &data, NULL);
6313 ret = src_ioctx.operate(src_oid, &rop, NULL);
6314 if (ret < 0) {
6315 goto done_err;
6316 }
6317
6318 if (data.length() == 0) {
6319 break;
6320 }
6321
6322 if (ofs == 0) {
6323 wop.create(true); /* make it exclusive */
6324 wop.mtime2(&mtime_ts);
6325 mtime = real_clock::from_timespec(mtime_ts);
6326 }
6327 wop.write(ofs, data);
6328 ret = dst_ioctx.operate(dst_oid, &wop);
6329 ofs += data.length();
6330 done = data.length() != chunk_size;
6331 } while (!done);
6332
6333 if (ofs != size) {
6334 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6335 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6336 ret = -EIO;
6337 goto done_err;
6338 }
6339
6340 src_ioctx.remove(src_oid);
6341
6342 return 0;
6343
6344 done_err:
6345 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6346 return ret;
6347 }
6348
6349 /*
6350 * fixes an issue where head objects were supposed to have a locator created, but ended
6351 * up without one
6352 */
6353 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6354 {
6355 const rgw_bucket& bucket = bucket_info.bucket;
6356 rgw_obj obj(bucket, key);
6357
6358 if (need_fix) {
6359 *need_fix = false;
6360 }
6361
6362 rgw_rados_ref ref;
6363 int r = get_obj_head_ref(bucket_info, obj, &ref);
6364 if (r < 0) {
6365 return r;
6366 }
6367
6368 RGWObjState *astate = NULL;
6369 RGWObjectCtx rctx(this);
6370 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6371 if (r < 0)
6372 return r;
6373
6374 if (astate->has_manifest) {
6375 RGWObjManifest::obj_iterator miter;
6376 RGWObjManifest& manifest = astate->manifest;
6377 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6378 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6379 rgw_obj loc;
6380 string oid;
6381 string locator;
6382
6383 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6384
6385 if (loc.key.ns.empty()) {
6386 /* continue, we're only interested in tail objects */
6387 continue;
6388 }
6389
6390 get_obj_bucket_and_oid_loc(loc, oid, locator);
6391 ref.ioctx.locator_set_key(locator);
6392
6393 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6394
6395 r = ref.ioctx.stat(oid, NULL, NULL);
6396 if (r != -ENOENT) {
6397 continue;
6398 }
6399
6400 string bad_loc;
6401 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6402
6403 /* create a new ioctx with the bad locator */
6404 librados::IoCtx src_ioctx;
6405 src_ioctx.dup(ref.ioctx);
6406 src_ioctx.locator_set_key(bad_loc);
6407
6408 r = src_ioctx.stat(oid, NULL, NULL);
6409 if (r != 0) {
6410 /* cannot find a broken part */
6411 continue;
6412 }
6413 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6414 if (need_fix) {
6415 *need_fix = true;
6416 }
6417 if (fix) {
6418 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6419 if (r < 0) {
6420 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6421 }
6422 }
6423 }
6424 }
6425
6426 return 0;
6427 }
6428
6429 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6430 {
6431 bucket = _bucket;
6432
6433 RGWObjectCtx obj_ctx(store);
6434
6435 RGWBucketInfo bucket_info;
6436 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6437 if (ret < 0) {
6438 return ret;
6439 }
6440
6441 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6442 if (ret < 0) {
6443 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6444 return ret;
6445 }
6446 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6447
6448 return 0;
6449 }
6450
6451 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6452 {
6453 bucket = _bucket;
6454 shard_id = sid;
6455
6456 RGWObjectCtx obj_ctx(store);
6457
6458 RGWBucketInfo bucket_info;
6459 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6460 if (ret < 0) {
6461 return ret;
6462 }
6463
6464 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6465 if (ret < 0) {
6466 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6467 return ret;
6468 }
6469 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6470
6471 return 0;
6472 }
6473
6474
6475 /* Execute @handler on last item in bucket listing for bucket specified
6476 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6477 * to objects matching these criterias. */
6478 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6479 const std::string& obj_prefix,
6480 const std::string& obj_delim,
6481 std::function<int(const rgw_bucket_dir_entry&)> handler)
6482 {
6483 RGWRados::Bucket target(this, bucket_info);
6484 RGWRados::Bucket::List list_op(&target);
6485
6486 list_op.params.prefix = obj_prefix;
6487 list_op.params.delim = obj_delim;
6488
6489 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6490 << ", obj_prefix=" << obj_prefix
6491 << ", obj_delim=" << obj_delim
6492 << dendl;
6493
6494 bool is_truncated = false;
6495
6496 boost::optional<rgw_bucket_dir_entry> last_entry;
6497 /* We need to rewind to the last object in a listing. */
6498 do {
6499 /* List bucket entries in chunks. */
6500 static constexpr int MAX_LIST_OBJS = 100;
6501 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6502
6503 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6504 &is_truncated);
6505 if (ret < 0) {
6506 return ret;
6507 } else if (!entries.empty()) {
6508 last_entry = entries.back();
6509 }
6510 } while (is_truncated);
6511
6512 if (last_entry) {
6513 return handler(*last_entry);
6514 }
6515
6516 /* Empty listing - no items we can run handler on. */
6517 return 0;
6518 }
6519
6520
6521 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6522 const rgw_user& user,
6523 RGWBucketInfo& bucket_info,
6524 rgw_obj& obj)
6525 {
6526 if (! swift_versioning_enabled(bucket_info)) {
6527 return 0;
6528 }
6529
6530 obj_ctx.obj.set_atomic(obj);
6531
6532 RGWObjState * state = nullptr;
6533 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6534 if (r < 0) {
6535 return r;
6536 }
6537
6538 if (!state->exists) {
6539 return 0;
6540 }
6541
6542 string client_id;
6543 string op_id;
6544
6545 const string& src_name = obj.get_oid();
6546 char buf[src_name.size() + 32];
6547 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6548 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6549 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6550
6551 RGWBucketInfo dest_bucket_info;
6552
6553 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6554 if (r < 0) {
6555 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6556 if (r == -ENOENT) {
6557 return -ERR_PRECONDITION_FAILED;
6558 }
6559 return r;
6560 }
6561
6562 if (dest_bucket_info.owner != bucket_info.owner) {
6563 return -ERR_PRECONDITION_FAILED;
6564 }
6565
6566 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6567 obj_ctx.obj.set_atomic(dest_obj);
6568
6569 string no_zone;
6570
6571 r = copy_obj(obj_ctx,
6572 user,
6573 client_id,
6574 op_id,
6575 NULL, /* req_info *info */
6576 no_zone,
6577 dest_obj,
6578 obj,
6579 dest_bucket_info,
6580 bucket_info,
6581 NULL, /* time_t *src_mtime */
6582 NULL, /* time_t *mtime */
6583 NULL, /* const time_t *mod_ptr */
6584 NULL, /* const time_t *unmod_ptr */
6585 false, /* bool high_precision_time */
6586 NULL, /* const char *if_match */
6587 NULL, /* const char *if_nomatch */
6588 RGWRados::ATTRSMOD_NONE,
6589 true, /* bool copy_if_newer */
6590 state->attrset,
6591 RGW_OBJ_CATEGORY_MAIN,
6592 0, /* uint64_t olh_epoch */
6593 real_time(), /* time_t delete_at */
6594 NULL, /* string *version_id */
6595 NULL, /* string *ptag */
6596 NULL, /* string *petag */
6597 NULL, /* void (*progress_cb)(off_t, void *) */
6598 NULL); /* void *progress_data */
6599 if (r == -ECANCELED || r == -ENOENT) {
6600 /* Has already been overwritten, meaning another rgw process already
6601 * copied it out */
6602 return 0;
6603 }
6604
6605 return r;
6606 }
6607
6608 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6609 const rgw_user& user,
6610 RGWBucketInfo& bucket_info,
6611 rgw_obj& obj,
6612 bool& restored) /* out */
6613 {
6614 if (! swift_versioning_enabled(bucket_info)) {
6615 return 0;
6616 }
6617
6618 /* Bucket info of the bucket that stores previous versions of our object. */
6619 RGWBucketInfo archive_binfo;
6620
6621 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6622 bucket_info.swift_ver_location, archive_binfo,
6623 nullptr, nullptr);
6624 if (ret < 0) {
6625 return ret;
6626 }
6627
6628 /* Abort the operation if the bucket storing our archive belongs to someone
6629 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6630 * into consideration. For we can live with that.
6631 *
6632 * TODO: delegate this check to un upper layer and compare with ACLs. */
6633 if (bucket_info.owner != archive_binfo.owner) {
6634 return -EPERM;
6635 }
6636
6637 /* This code will be executed on latest version of the object. */
6638 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6639 std::string no_client_id;
6640 std::string no_op_id;
6641 std::string no_zone;
6642
6643 /* We don't support object versioning of Swift API on those buckets that
6644 * are already versioned using the S3 mechanism. This affects also bucket
6645 * storing archived objects. Otherwise the delete operation would create
6646 * a deletion marker. */
6647 if (archive_binfo.versioned()) {
6648 restored = false;
6649 return -ERR_PRECONDITION_FAILED;
6650 }
6651
6652 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6653 * irrelevant and may be safely skipped. */
6654 std::map<std::string, ceph::bufferlist> no_attrs;
6655
6656 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6657 obj_ctx.obj.set_atomic(archive_obj);
6658 obj_ctx.obj.set_atomic(obj);
6659
6660 int ret = copy_obj(obj_ctx,
6661 user,
6662 no_client_id,
6663 no_op_id,
6664 nullptr, /* req_info *info */
6665 no_zone,
6666 obj, /* dest obj */
6667 archive_obj, /* src obj */
6668 bucket_info, /* dest bucket info */
6669 archive_binfo, /* src bucket info */
6670 nullptr, /* time_t *src_mtime */
6671 nullptr, /* time_t *mtime */
6672 nullptr, /* const time_t *mod_ptr */
6673 nullptr, /* const time_t *unmod_ptr */
6674 false, /* bool high_precision_time */
6675 nullptr, /* const char *if_match */
6676 nullptr, /* const char *if_nomatch */
6677 RGWRados::ATTRSMOD_NONE,
6678 true, /* bool copy_if_newer */
6679 no_attrs,
6680 RGW_OBJ_CATEGORY_MAIN,
6681 0, /* uint64_t olh_epoch */
6682 real_time(), /* time_t delete_at */
6683 nullptr, /* string *version_id */
6684 nullptr, /* string *ptag */
6685 nullptr, /* string *petag */
6686 nullptr, /* void (*progress_cb)(off_t, void *) */
6687 nullptr); /* void *progress_data */
6688 if (ret == -ECANCELED || ret == -ENOENT) {
6689 /* Has already been overwritten, meaning another rgw process already
6690 * copied it out */
6691 return 0;
6692 } else if (ret < 0) {
6693 return ret;
6694 } else {
6695 restored = true;
6696 }
6697
6698 /* Need to remove the archived copy. */
6699 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6700 archive_binfo.versioning_status());
6701
6702 return ret;
6703 };
6704
6705 const std::string& obj_name = obj.get_oid();
6706 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6707 % obj_name);
6708
6709 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6710 handler);
6711 }
6712
6713 /**
 * Write the head of the target object (xattrs and, when meta.data is set,
 * the payload) in one rados operation and record the change in the bucket
 * index.
 * size: object size handed to the bucket index entry
 * accounted_size: original size of data before compression, encryption
 * attrs: attrs to set on the head object; entries with an empty value are
 *        skipped, and RGW_ATTR_MANIFEST is erased from this map when
 *        meta.manifest is set
 * assume_noent: forwarded to get_state(); when set, an -EEXIST from the
 *        rados write invalidates the cached state and is returned as-is
 *        without cancelling the index op (write_meta() then retries with
 *        assume_noent = false)
 * _index_op: RGWRados::Bucket::UpdateIndex * passed as void *, used to
 *        prepare, complete or cancel the index update
 * Returns: 0 on success, -ERR# otherwise.
 */
6725 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6726 map<string, bufferlist>& attrs, bool assume_noent,
6727 void *_index_op)
6728 {
6729 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6730 rgw_pool pool;
6731 rgw_rados_ref ref;
6732 RGWRados *store = target->get_store();
6733
6734 ObjectWriteOperation op;
6735
6736 RGWObjState *state;
6737 int r = target->get_state(&state, false, assume_noent);
6738 if (r < 0)
6739 return r;
6740
6741 rgw_obj& obj = target->get_obj();
6742
6743 if (obj.get_oid().empty()) {
6744 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6745 return -EIO;
6746 }
6747
6748 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6749 if (r < 0)
6750 return r;
6751
/* remembered now because state is invalidated before the olh update below */
6752 bool is_olh = state->is_olh;
6753
6754 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6755
/* fall back to the index op's tag when the caller supplied none */
6756 const string *ptag = meta.ptag;
6757 if (!ptag && !index_op->get_optag()->empty()) {
6758 ptag = index_op->get_optag();
6759 }
6760 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false);
6761 if (r < 0)
6762 return r;
6763
/* a zero set_mtime means "use the current time" */
6764 if (real_clock::is_zero(meta.set_mtime)) {
6765 meta.set_mtime = real_clock::now();
6766 }
6767
6768 if (state->is_olh) {
6769 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6770 }
6771
6772 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6773 op.mtime2(&mtime_ts);
6774
6775 if (meta.data) {
6776 /* NOTE(review): the comment here used to say "just remove the object",
   but the code replaces the payload with write_full(); the xattrs are
   set separately further down */
6778 op.write_full(*meta.data);
6779 }
6780
/* values picked out of attrs below and handed to the index entry */
6781 string etag;
6782 string content_type;
6783 bufferlist acl_bl;
6784
6785 map<string, bufferlist>::iterator iter;
6786 if (meta.rmattrs) {
6787 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6788 const string& name = iter->first;
6789 op.rmxattr(name.c_str());
6790 }
6791 }
6792
6793 if (meta.manifest) {
6794 /* remove existing manifest attr */
6795 iter = attrs.find(RGW_ATTR_MANIFEST);
6796 if (iter != attrs.end())
6797 attrs.erase(iter);
6798
6799 bufferlist bl;
6800 ::encode(*meta.manifest, bl);
6801 op.setxattr(RGW_ATTR_MANIFEST, bl);
6802 }
6803
6804 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6805 const string& name = iter->first;
6806 bufferlist& bl = iter->second;
6807
/* attrs with an empty value are not written */
6808 if (!bl.length())
6809 continue;
6810
6811 op.setxattr(name.c_str(), bl);
6812
6813 if (name.compare(RGW_ATTR_ETAG) == 0) {
6814 etag = bl.c_str();
6815 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6816 content_type = bl.c_str();
6817 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6818 acl_bl = bl;
6819 }
6820 }
/* pg version and source zone are filled in only when the caller did not
   supply them in attrs */
6821 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6822 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6823 }
6824
6825 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6826 bufferlist bl;
6827 ::encode(store->get_zone_short_id(), bl);
6828 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6829 }
6830
/* nothing queued on the op: nothing to write */
6831 if (!op.size())
6832 return 0;
6833
6834 uint64_t epoch;
6835 int64_t poolid;
6836
/* captured before the write; used for the quota update at the end */
6837 bool orig_exists = state->exists;
6838 uint64_t orig_size = state->accounted_size;
6839
6840 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
6841
6842 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
6843
6844 if (versioned_op) {
6845 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
6846 }
6847
/* the index op may already be prepared, e.g. when write_meta() retries */
6848 if (!index_op->is_prepared()) {
6849 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
6850 if (r < 0)
6851 return r;
6852 }
6853
6854 r = ref.ioctx.operate(ref.oid, &op);
6855 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
                or -ENOENT if was removed, or -EEXIST if it did not exist
                before and now it does */
6858 if (r == -EEXIST && assume_noent) {
6859 target->invalidate_state();
6860 return r;
6861 }
6862 goto done_cancel;
6863 }
6864
6865 epoch = ref.ioctx.get_last_version();
6866 poolid = ref.ioctx.get_id();
6867
/* a failure here is only logged; r is overwritten by the index completion */
6868 r = target->complete_atomic_modification();
6869 if (r < 0) {
6870 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
6871 }
6872
6873 r = index_op->complete(poolid, epoch, size, accounted_size,
6874 meta.set_mtime, etag, content_type, &acl_bl,
6875 meta.category, meta.remove_objs, meta.user_data);
6876 if (r < 0)
6877 goto done_cancel;
6878
6879 if (meta.mtime) {
6880 *meta.mtime = meta.set_mtime;
6881 }
6882
6883 /* note that index_op was using state so we couldn't invalidate it earlier */
6884 target->invalidate_state();
6885 state = NULL;
6886
6887 if (versioned_op) {
6888 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
6889 if (r < 0) {
6890 return r;
6891 }
6892 }
6893
6894 if (!real_clock::is_zero(meta.delete_at)) {
6895 rgw_obj_index_key obj_key;
6896 obj.key.get_index_key(&obj_key);
6897
6898 r = store->objexp_hint_add(meta.delete_at,
6899 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
6900 if (r < 0) {
6901 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
6902 /* ignoring error, nothing we can do at this point */
6903 }
6904 }
6905 meta.canceled = false;
6906
6907 /* update quota cache */
6908 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
6909 accounted_size, orig_size);
6910 return 0;
6911
/* error path: cancel the index op, flag the write as canceled, then map r */
6912 done_cancel:
6913 int ret = index_op->cancel();
6914 if (ret < 0) {
6915 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
6916 }
6917
6918 meta.canceled = true;
6919
6920 /* we lost in a race. There are a few options:
   * - existing object was rewritten (ECANCELED)
   * - non existing object was created (EEXIST)
   * - object was removed (ENOENT)
   * should treat it as a success
   */
6926 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
6927 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
6928 r = 0;
6929 }
6930 } else {
6931 if (meta.if_match != NULL) {
6932 // only overwrite existing object
6933 if (strcmp(meta.if_match, "*") == 0) {
6934 if (r == -ENOENT) {
6935 r = -ERR_PRECONDITION_FAILED;
6936 } else if (r == -ECANCELED) {
6937 r = 0;
6938 }
6939 }
6940 }
6941
6942 if (meta.if_nomatch != NULL) {
6943 // only create a new object
6944 if (strcmp(meta.if_nomatch, "*") == 0) {
6945 if (r == -EEXIST) {
6946 r = -ERR_PRECONDITION_FAILED;
6947 } else if (r == -ENOENT) {
6948 r = 0;
6949 }
6950 }
6951 }
6952 }
6953
6954 return r;
6955 }
6956
6957 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
6958 map<string, bufferlist>& attrs)
6959 {
6960 RGWBucketInfo& bucket_info = target->get_bucket_info();
6961
6962 RGWRados::Bucket bop(target->get_store(), bucket_info);
6963 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
6964 index_op.set_zones_trace(meta.zones_trace);
6965
6966 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
6967 int r;
6968 if (assume_noent) {
6969 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
6970 if (r == -EEXIST) {
6971 assume_noent = false;
6972 }
6973 }
6974 if (!assume_noent) {
6975 r = _do_write_meta(size, accounted_size, attrs, assume_noent, (void *)&index_op);
6976 }
6977 return r;
6978 }
6979
6980 /** Write/overwrite a system object. */
6981 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
6982 map<std::string, bufferlist>& attrs, int flags,
6983 bufferlist& data,
6984 RGWObjVersionTracker *objv_tracker,
6985 real_time set_mtime /* 0 for don't set */)
6986 {
6987 rgw_pool pool;
6988 rgw_rados_ref ref;
6989 int r = get_system_obj_ref(obj, &ref, &pool);
6990 if (r < 0)
6991 return r;
6992
6993 ObjectWriteOperation op;
6994
6995 if (flags & PUT_OBJ_EXCL) {
6996 if (!(flags & PUT_OBJ_CREATE))
6997 return -EINVAL;
6998 op.create(true); // exclusive create
6999 } else {
7000 op.remove();
7001 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7002 op.create(false);
7003 }
7004
7005 if (objv_tracker) {
7006 objv_tracker->prepare_op_for_write(&op);
7007 }
7008
7009 if (real_clock::is_zero(set_mtime)) {
7010 set_mtime = real_clock::now();
7011 }
7012
7013 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7014 op.mtime2(&mtime_ts);
7015 op.write_full(data);
7016
7017 bufferlist acl_bl;
7018
7019 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7020 const string& name = iter->first;
7021 bufferlist& bl = iter->second;
7022
7023 if (!bl.length())
7024 continue;
7025
7026 op.setxattr(name.c_str(), bl);
7027 }
7028
7029 r = ref.ioctx.operate(ref.oid, &op);
7030 if (r < 0) {
7031 return r;
7032 }
7033
7034 if (objv_tracker) {
7035 objv_tracker->apply_write();
7036 }
7037
7038 if (mtime) {
7039 *mtime = set_mtime;
7040 }
7041
7042 return 0;
7043 }
7044
7045 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7046 off_t ofs, bool exclusive,
7047 RGWObjVersionTracker *objv_tracker)
7048 {
7049 rgw_rados_ref ref;
7050 rgw_pool pool;
7051 int r = get_system_obj_ref(obj, &ref, &pool);
7052 if (r < 0) {
7053 return r;
7054 }
7055
7056 ObjectWriteOperation op;
7057
7058 if (exclusive)
7059 op.create(true);
7060
7061 if (objv_tracker) {
7062 objv_tracker->prepare_op_for_write(&op);
7063 }
7064 if (ofs == -1) {
7065 op.write_full(bl);
7066 } else {
7067 op.write(ofs, bl);
7068 }
7069 r = ref.ioctx.operate(ref.oid, &op);
7070 if (r < 0)
7071 return r;
7072
7073 if (objv_tracker) {
7074 objv_tracker->apply_write();
7075 }
7076 return 0;
7077 }
7078
7079 /**
7080 * Write/overwrite an object to the bucket storage.
7081 * bucket: the bucket to store the object in
7082 * obj: the object name/key
7083 * data: the object contents/value
7084 * offset: the offet to write to in the object
7085 * If this is -1, we will overwrite the whole object.
7086 * size: the amount of data to write (data must be this long)
7087 * attrs: all the given attrs are written to bucket storage for the given object
7088 * Returns: 0 on success, -ERR# otherwise.
7089 */
7090
7091 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7092 off_t ofs, bool exclusive,
7093 void **handle)
7094 {
7095 rgw_rados_ref ref;
7096 int r = get_raw_obj_ref(obj, &ref);
7097 if (r < 0) {
7098 return r;
7099 }
7100
7101 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7102 *handle = c;
7103
7104 ObjectWriteOperation op;
7105
7106 if (exclusive)
7107 op.create(true);
7108
7109 if (ofs == -1) {
7110 op.write_full(bl);
7111 } else {
7112 op.write(ofs, bl);
7113 }
7114 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7115 if (r < 0)
7116 return r;
7117
7118 return 0;
7119 }
7120
7121 int RGWRados::aio_wait(void *handle)
7122 {
7123 AioCompletion *c = (AioCompletion *)handle;
7124 c->wait_for_safe();
7125 int ret = c->get_return_value();
7126 c->release();
7127 return ret;
7128 }
7129
7130 bool RGWRados::aio_completed(void *handle)
7131 {
7132 AioCompletion *c = (AioCompletion *)handle;
7133 return c->is_safe();
7134 }
7135
7136 class RGWRadosPutObj : public RGWGetDataCB
7137 {
7138 CephContext* cct;
7139 rgw_obj obj;
7140 RGWPutObjDataProcessor *filter;
7141 boost::optional<RGWPutObj_Compress>& compressor;
7142 CompressorRef& plugin;
7143 RGWPutObjProcessor_Atomic *processor;
7144 RGWOpStateSingleOp *opstate;
7145 void (*progress_cb)(off_t, void *);
7146 void *progress_data;
7147 bufferlist extra_data_bl;
7148 uint64_t extra_data_len;
7149 uint64_t data_len;
7150 map<string, bufferlist> src_attrs;
7151 public:
7152 RGWRadosPutObj(CephContext* cct,
7153 CompressorRef& plugin,
7154 boost::optional<RGWPutObj_Compress>& compressor,
7155 RGWPutObjProcessor_Atomic *p,
7156 RGWOpStateSingleOp *_ops,
7157 void (*_progress_cb)(off_t, void *),
7158 void *_progress_data) :
7159 cct(cct),
7160 filter(p),
7161 compressor(compressor),
7162 plugin(plugin),
7163 processor(p),
7164 opstate(_ops),
7165 progress_cb(_progress_cb),
7166 progress_data(_progress_data),
7167 extra_data_len(0),
7168 data_len(0) {}
7169
7170 int process_attrs(void) {
7171 if (extra_data_bl.length()) {
7172 JSONParser jp;
7173 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7174 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7175 return -EIO;
7176 }
7177
7178 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7179
7180 src_attrs.erase(RGW_ATTR_COMPRESSION);
7181 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7182 }
7183
7184 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7185 //do not compress if object is encrypted
7186 compressor = boost::in_place(cct, plugin, filter);
7187 filter = &*compressor;
7188 }
7189 return 0;
7190 }
7191
7192 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7193 if (progress_cb) {
7194 progress_cb(ofs, progress_data);
7195 }
7196 if (extra_data_len) {
7197 size_t extra_len = bl.length();
7198 if (extra_len > extra_data_len)
7199 extra_len = extra_data_len;
7200
7201 bufferlist extra;
7202 bl.splice(0, extra_len, &extra);
7203 extra_data_bl.append(extra);
7204
7205 extra_data_len -= extra_len;
7206 if (extra_data_len == 0) {
7207 int res = process_attrs();
7208 if (res < 0)
7209 return res;
7210 }
7211 if (bl.length() == 0) {
7212 return 0;
7213 }
7214 }
7215 data_len += bl.length();
7216 bool again = false;
7217
7218 bool need_opstate = true;
7219
7220 do {
7221 void *handle = NULL;
7222 rgw_raw_obj obj;
7223 uint64_t size = bl.length();
7224 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7225 if (ret < 0)
7226 return ret;
7227
7228 if (need_opstate && opstate) {
7229 /* need to update opstate repository with new state. This is ratelimited, so we're not
7230 * really doing it every time
7231 */
7232 ret = opstate->renew_state();
7233 if (ret < 0) {
7234 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7235 int r = filter->throttle_data(handle, obj, size, false);
7236 if (r < 0) {
7237 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7238 }
7239 /* could not renew state! might have been marked as cancelled */
7240 return ret;
7241 }
7242 need_opstate = false;
7243 }
7244
7245 ret = filter->throttle_data(handle, obj, size, false);
7246 if (ret < 0)
7247 return ret;
7248 } while (again);
7249
7250 return 0;
7251 }
7252
7253 bufferlist& get_extra_data() { return extra_data_bl; }
7254
7255 map<string, bufferlist>& get_attrs() { return src_attrs; }
7256
7257 void set_extra_data_len(uint64_t len) override {
7258 extra_data_len = len;
7259 }
7260
7261 uint64_t get_data_len() {
7262 return data_len;
7263 }
7264
7265 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7266 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7267 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7268 }
7269
7270 bool is_canceled() {
7271 return processor->is_canceled();
7272 }
7273 };
7274
7275 /*
7276 * prepare attrset depending on attrs_mod.
7277 */
7278 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7279 map<string, bufferlist>& attrs,
7280 RGWRados::AttrsMod attrs_mod)
7281 {
7282 switch (attrs_mod) {
7283 case RGWRados::ATTRSMOD_NONE:
7284 attrs = src_attrs;
7285 break;
7286 case RGWRados::ATTRSMOD_REPLACE:
7287 if (!attrs[RGW_ATTR_ETAG].length()) {
7288 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7289 }
7290 break;
7291 case RGWRados::ATTRSMOD_MERGE:
7292 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7293 if (attrs.find(it->first) == attrs.end()) {
7294 attrs[it->first] = it->second;
7295 }
7296 }
7297 break;
7298 }
7299 }
7300
7301 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7302 {
7303 map<string, bufferlist> attrset;
7304
7305 real_time mtime;
7306 uint64_t obj_size;
7307 RGWObjectCtx rctx(this);
7308
7309 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7310 RGWRados::Object::Read read_op(&op_target);
7311
7312 read_op.params.attrs = &attrset;
7313 read_op.params.lastmod = &mtime;
7314 read_op.params.obj_size = &obj_size;
7315
7316 int ret = read_op.prepare();
7317 if (ret < 0)
7318 return ret;
7319
7320 attrset.erase(RGW_ATTR_ID_TAG);
7321
7322 uint64_t max_chunk_size;
7323
7324 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7325 if (ret < 0) {
7326 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7327 return ret;
7328 }
7329
7330 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj, max_chunk_size, NULL, mtime, attrset,
7331 RGW_OBJ_CATEGORY_MAIN, 0, real_time(), NULL, NULL, NULL);
7332 }
7333
7334 struct obj_time_weight {
7335 real_time mtime;
7336 uint32_t zone_short_id;
7337 uint64_t pg_ver;
7338 bool high_precision;
7339
7340 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7341
7342 bool compare_low_precision(const obj_time_weight& rhs) {
7343 struct timespec l = ceph::real_clock::to_timespec(mtime);
7344 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7345 l.tv_nsec = 0;
7346 r.tv_nsec = 0;
7347 if (l > r) {
7348 return false;
7349 }
7350 if (l < r) {
7351 return true;
7352 }
7353 if (zone_short_id != rhs.zone_short_id) {
7354 return (zone_short_id < rhs.zone_short_id);
7355 }
7356 return (pg_ver < rhs.pg_ver);
7357
7358 }
7359
7360 bool operator<(const obj_time_weight& rhs) {
7361 if (!high_precision || !rhs.high_precision) {
7362 return compare_low_precision(rhs);
7363 }
7364 if (mtime > rhs.mtime) {
7365 return false;
7366 }
7367 if (mtime < rhs.mtime) {
7368 return true;
7369 }
7370 if (zone_short_id != rhs.zone_short_id) {
7371 return (zone_short_id < rhs.zone_short_id);
7372 }
7373 return (pg_ver < rhs.pg_ver);
7374 }
7375
7376 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7377 mtime = _mtime;
7378 zone_short_id = _short_id;
7379 pg_ver = _pg_ver;
7380 }
7381
7382 void init(RGWObjState *state) {
7383 mtime = state->mtime;
7384 zone_short_id = state->zone_short_id;
7385 pg_ver = state->pg_ver;
7386 }
7387 };
7388
7389 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7390 out << o.mtime;
7391
7392 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7393 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7394 }
7395
7396 return out;
7397 }
7398
7399 class RGWGetExtraDataCB : public RGWGetDataCB {
7400 bufferlist extra_data;
7401 public:
7402 RGWGetExtraDataCB() {}
7403 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7404 if (extra_data.length() < extra_data_len) {
7405 off_t max = extra_data_len - extra_data.length();
7406 if (max > bl_len) {
7407 max = bl_len;
7408 }
7409 bl.splice(0, max, &extra_data);
7410 }
7411 return bl_len;
7412 }
7413
7414 bufferlist& get_extra_data() {
7415 return extra_data;
7416 }
7417 };
7418
7419 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7420 const rgw_user& user_id,
7421 const string& client_id,
7422 req_info *info,
7423 const string& source_zone,
7424 rgw_obj& src_obj,
7425 RGWBucketInfo& src_bucket_info,
7426 real_time *src_mtime,
7427 uint64_t *psize,
7428 const real_time *mod_ptr,
7429 const real_time *unmod_ptr,
7430 bool high_precision_time,
7431 const char *if_match,
7432 const char *if_nomatch,
7433 map<string, bufferlist> *pattrs,
7434 string *version_id,
7435 string *ptag,
7436 string *petag)
7437 {
7438 /* source is in a different zonegroup, copy from there */
7439
7440 RGWRESTStreamRWRequest *in_stream_req;
7441 string tag;
7442 map<string, bufferlist> src_attrs;
7443 append_rand_alpha(cct, tag, tag, 32);
7444 obj_time_weight set_mtime_weight;
7445 set_mtime_weight.high_precision = high_precision_time;
7446
7447 RGWRESTConn *conn;
7448 if (source_zone.empty()) {
7449 if (src_bucket_info.zonegroup.empty()) {
7450 /* source is in the master zonegroup */
7451 conn = rest_master_conn;
7452 } else {
7453 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7454 if (iter == zonegroup_conn_map.end()) {
7455 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7456 return -ENOENT;
7457 }
7458 conn = iter->second;
7459 }
7460 } else {
7461 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7462 if (iter == zone_conn_map.end()) {
7463 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7464 return -ENOENT;
7465 }
7466 conn = iter->second;
7467 }
7468
7469 RGWGetExtraDataCB cb;
7470 string etag;
7471 map<string, string> req_headers;
7472 real_time set_mtime;
7473
7474 const real_time *pmod = mod_ptr;
7475
7476 obj_time_weight dest_mtime_weight;
7477
7478 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7479 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7480 true /* prepend_meta */, true /* GET */, true /* rgwx-stat */,
7481 true /* sync manifest */, &cb, &in_stream_req);
7482 if (ret < 0) {
7483 return ret;
7484 }
7485
7486 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7487 if (ret < 0) {
7488 return ret;
7489 }
7490
7491 bufferlist& extra_data_bl = cb.get_extra_data();
7492 if (extra_data_bl.length()) {
7493 JSONParser jp;
7494 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7495 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7496 return -EIO;
7497 }
7498
7499 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7500
7501 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7502 }
7503
7504 if (src_mtime) {
7505 *src_mtime = set_mtime;
7506 }
7507
7508 if (petag) {
7509 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7510 if (iter != src_attrs.end()) {
7511 bufferlist& etagbl = iter->second;
7512 *petag = etagbl.to_str();
7513 }
7514 }
7515
7516 if (pattrs) {
7517 *pattrs = src_attrs;
7518 }
7519
7520 return 0;
7521 }
7522
7523 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7524 const rgw_user& user_id,
7525 const string& client_id,
7526 const string& op_id,
7527 bool record_op_state,
7528 req_info *info,
7529 const string& source_zone,
7530 rgw_obj& dest_obj,
7531 rgw_obj& src_obj,
7532 RGWBucketInfo& dest_bucket_info,
7533 RGWBucketInfo& src_bucket_info,
7534 real_time *src_mtime,
7535 real_time *mtime,
7536 const real_time *mod_ptr,
7537 const real_time *unmod_ptr,
7538 bool high_precision_time,
7539 const char *if_match,
7540 const char *if_nomatch,
7541 AttrsMod attrs_mod,
7542 bool copy_if_newer,
7543 map<string, bufferlist>& attrs,
7544 RGWObjCategory category,
7545 uint64_t olh_epoch,
7546 real_time delete_at,
7547 string *version_id,
7548 string *ptag,
7549 ceph::buffer::list *petag,
7550 void (*progress_cb)(off_t, void *),
7551 void *progress_data,
7552 rgw_zone_set *zones_trace)
7553 {
7554 /* source is in a different zonegroup, copy from there */
7555
7556 RGWRESTStreamRWRequest *in_stream_req;
7557 string tag;
7558 int i;
7559 append_rand_alpha(cct, tag, tag, 32);
7560 obj_time_weight set_mtime_weight;
7561 set_mtime_weight.high_precision = high_precision_time;
7562
7563 RGWPutObjProcessor_Atomic processor(obj_ctx,
7564 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7565 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7566 if (version_id && *version_id != "null") {
7567 processor.set_version_id(*version_id);
7568 }
7569 processor.set_olh_epoch(olh_epoch);
7570 int ret = processor.prepare(this, NULL);
7571 if (ret < 0) {
7572 return ret;
7573 }
7574
7575 RGWRESTConn *conn;
7576 if (source_zone.empty()) {
7577 if (dest_bucket_info.zonegroup.empty()) {
7578 /* source is in the master zonegroup */
7579 conn = rest_master_conn;
7580 } else {
7581 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7582 if (iter == zonegroup_conn_map.end()) {
7583 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7584 return -ENOENT;
7585 }
7586 conn = iter->second;
7587 }
7588 } else {
7589 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7590 if (iter == zone_conn_map.end()) {
7591 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7592 return -ENOENT;
7593 }
7594 conn = iter->second;
7595 }
7596
7597 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7598
7599 RGWOpStateSingleOp *opstate = NULL;
7600
7601 if (record_op_state) {
7602 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7603
7604 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7605 if (ret < 0) {
7606 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7607 delete opstate;
7608 return ret;
7609 }
7610 }
7611
7612 boost::optional<RGWPutObj_Compress> compressor;
7613 CompressorRef plugin;
7614
7615 const auto& compression_type = zone_params.get_compression_type(
7616 dest_bucket_info.placement_rule);
7617 if (compression_type != "none") {
7618 plugin = Compressor::create(cct, compression_type);
7619 if (!plugin) {
7620 ldout(cct, 1) << "Cannot load plugin for compression type "
7621 << compression_type << dendl;
7622 }
7623 }
7624
7625 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7626
7627 string etag;
7628 map<string, string> req_headers;
7629 real_time set_mtime;
7630
7631 RGWObjState *dest_state = NULL;
7632
7633 const real_time *pmod = mod_ptr;
7634
7635 obj_time_weight dest_mtime_weight;
7636
7637 if (copy_if_newer) {
7638 /* need to get mtime for destination */
7639 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7640 if (ret < 0)
7641 goto set_err_state;
7642
7643 if (!real_clock::is_zero(dest_state->mtime)) {
7644 dest_mtime_weight.init(dest_state);
7645 pmod = &dest_mtime_weight.mtime;
7646 }
7647 }
7648
7649 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7650 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7651 true /* prepend_meta */, true /* GET */, false /* rgwx-stat */,
7652 true /* sync manifest */, &cb, &in_stream_req);
7653 if (ret < 0) {
7654 goto set_err_state;
7655 }
7656
7657 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7658 if (ret < 0) {
7659 goto set_err_state;
7660 }
7661 if (compressor && compressor->is_compressed()) {
7662 bufferlist tmp;
7663 RGWCompressionInfo cs_info;
7664 cs_info.compression_type = plugin->get_type_name();
7665 cs_info.orig_size = cb.get_data_len();
7666 cs_info.blocks = move(compressor->get_compression_blocks());
7667 ::encode(cs_info, tmp);
7668 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7669 }
7670
7671 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7672 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7673 } else {
7674 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7675 if (iter != cb.get_attrs().end()) {
7676 try {
7677 ::decode(delete_at, iter->second);
7678 } catch (buffer::error& err) {
7679 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7680 }
7681 }
7682 }
7683
7684 if (src_mtime) {
7685 *src_mtime = set_mtime;
7686 }
7687
7688 if (petag) {
7689 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7690 if (iter != cb.get_attrs().end()) {
7691 *petag = iter->second;
7692 }
7693 }
7694
7695 if (source_zone.empty()) {
7696 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7697 } else {
7698 attrs = cb.get_attrs();
7699 }
7700
7701 if (copy_if_newer) {
7702 uint64_t pg_ver = 0;
7703 auto i = attrs.find(RGW_ATTR_PG_VER);
7704 if (i != attrs.end() && i->second.length() > 0) {
7705 bufferlist::iterator iter = i->second.begin();
7706 try {
7707 ::decode(pg_ver, iter);
7708 } catch (buffer::error& err) {
7709 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7710 /* non critical error */
7711 }
7712 }
7713 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7714 }
7715
7716 #define MAX_COMPLETE_RETRY 100
7717 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7718 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7719 if (ret < 0) {
7720 goto set_err_state;
7721 }
7722 if (copy_if_newer && cb.is_canceled()) {
7723 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7724 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7725 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7726 if (ret < 0) {
7727 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7728 goto set_err_state;
7729 }
7730 dest_mtime_weight.init(dest_state);
7731 dest_mtime_weight.high_precision = high_precision_time;
7732 if (!dest_state->exists ||
7733 dest_mtime_weight < set_mtime_weight) {
7734 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7735 continue;
7736 } else {
7737 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7738 }
7739 }
7740 break;
7741 }
7742
7743 if (i == MAX_COMPLETE_RETRY) {
7744 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7745 ret = -EIO;
7746 goto set_err_state;
7747 }
7748
7749 if (opstate) {
7750 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7751 if (ret < 0) {
7752 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7753 }
7754 delete opstate;
7755 }
7756
7757 return 0;
7758 set_err_state:
7759 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7760 ret = 0;
7761 }
7762 if (opstate) {
7763 RGWOpState::OpState state;
7764 if (ret < 0) {
7765 state = RGWOpState::OPSTATE_ERROR;
7766 } else {
7767 state = RGWOpState::OPSTATE_COMPLETE;
7768 }
7769 int r = opstate->set_state(state);
7770 if (r < 0) {
7771 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7772 }
7773 delete opstate;
7774 }
7775 return ret;
7776 }
7777
7778
7779 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7780 map<string, bufferlist>& src_attrs,
7781 RGWRados::Object::Read& read_op,
7782 const rgw_user& user_id,
7783 rgw_obj& dest_obj,
7784 real_time *mtime)
7785 {
7786 string etag;
7787
7788 RGWRESTStreamWriteRequest *out_stream_req;
7789
7790 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
7791 if (ret < 0) {
7792 delete out_stream_req;
7793 return ret;
7794 }
7795
7796 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
7797 if (ret < 0)
7798 return ret;
7799
7800 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
7801 if (ret < 0)
7802 return ret;
7803
7804 return 0;
7805 }
7806
7807 /**
7808 * Copy an object.
7809 * dest_obj: the object to copy into
7810 * src_obj: the object to copy from
7811 * attrs: usage depends on attrs_mod parameter
7812 * attrs_mod: the modification mode of the attrs, may have the following values:
7813 * ATTRSMOD_NONE - the attributes of the source object will be
7814 * copied without modifications, attrs parameter is ignored;
7815 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
7816 * parameter, source object attributes are not copied;
7817 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
7818 * are overwritten by values contained in attrs parameter.
7819 * err: stores any errors resulting from the get of the original object
7820 * Returns: 0 on success, -ERR# otherwise.
7821 */
7822 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
7823 const rgw_user& user_id,
7824 const string& client_id,
7825 const string& op_id,
7826 req_info *info,
7827 const string& source_zone,
7828 rgw_obj& dest_obj,
7829 rgw_obj& src_obj,
7830 RGWBucketInfo& dest_bucket_info,
7831 RGWBucketInfo& src_bucket_info,
7832 real_time *src_mtime,
7833 real_time *mtime,
7834 const real_time *mod_ptr,
7835 const real_time *unmod_ptr,
7836 bool high_precision_time,
7837 const char *if_match,
7838 const char *if_nomatch,
7839 AttrsMod attrs_mod,
7840 bool copy_if_newer,
7841 map<string, bufferlist>& attrs,
7842 RGWObjCategory category,
7843 uint64_t olh_epoch,
7844 real_time delete_at,
7845 string *version_id,
7846 string *ptag,
7847 ceph::buffer::list *petag,
7848 void (*progress_cb)(off_t, void *),
7849 void *progress_data)
7850 {
7851 int ret;
7852 uint64_t obj_size;
7853 rgw_obj shadow_obj = dest_obj;
7854 string shadow_oid;
7855
7856 bool remote_src;
7857 bool remote_dest;
7858
7859 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
7860 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
7861
7862 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
7863 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
7864
7865 if (remote_src && remote_dest) {
7866 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7867 return -EINVAL;
7868 }
7869
7870 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
7871
7872 if (remote_src || !source_zone.empty()) {
7873 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
7874 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
7875 unmod_ptr, high_precision_time,
7876 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
7877 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
7878 }
7879
7880 map<string, bufferlist> src_attrs;
7881 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
7882 RGWRados::Object::Read read_op(&src_op_target);
7883
7884 read_op.conds.mod_ptr = mod_ptr;
7885 read_op.conds.unmod_ptr = unmod_ptr;
7886 read_op.conds.high_precision_time = high_precision_time;
7887 read_op.conds.if_match = if_match;
7888 read_op.conds.if_nomatch = if_nomatch;
7889 read_op.params.attrs = &src_attrs;
7890 read_op.params.lastmod = src_mtime;
7891 read_op.params.obj_size = &obj_size;
7892
7893 ret = read_op.prepare();
7894 if (ret < 0) {
7895 return ret;
7896 }
7897
7898 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
7899 src_attrs.erase(RGW_ATTR_DELETE_AT);
7900
7901 set_copy_attrs(src_attrs, attrs, attrs_mod);
7902 attrs.erase(RGW_ATTR_ID_TAG);
7903 attrs.erase(RGW_ATTR_PG_VER);
7904 attrs.erase(RGW_ATTR_SOURCE_ZONE);
7905 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
7906 if (cmp != src_attrs.end())
7907 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
7908
7909 RGWObjManifest manifest;
7910 RGWObjState *astate = NULL;
7911
7912 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
7913 if (ret < 0) {
7914 return ret;
7915 }
7916
7917 vector<rgw_raw_obj> ref_objs;
7918
7919 if (remote_dest) {
7920 /* dest is in a different zonegroup, copy it there */
7921 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
7922 }
7923 uint64_t max_chunk_size;
7924
7925 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
7926 if (ret < 0) {
7927 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
7928 return ret;
7929 }
7930
7931 rgw_pool src_pool;
7932 rgw_pool dest_pool;
7933 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
7934 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7935 return -EIO;
7936 }
7937 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
7938 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7939 return -EIO;
7940 }
7941
7942
7943 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
7944 bool copy_first = false;
7945 if (astate->has_manifest) {
7946 if (!astate->manifest.has_tail()) {
7947 copy_data = true;
7948 } else {
7949 uint64_t head_size = astate->manifest.get_head_size();
7950
7951 if (head_size > 0) {
7952 if (head_size > max_chunk_size) {
7953 copy_data = true;
7954 } else {
7955 copy_first = true;
7956 }
7957 }
7958 }
7959 }
7960
7961 if (petag) {
7962 const auto iter = attrs.find(RGW_ATTR_ETAG);
7963 if (iter != attrs.end()) {
7964 *petag = iter->second;
7965 }
7966 }
7967
7968 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
7969 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
7970 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
7971 version_id, ptag, petag);
7972 }
7973
7974 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
7975
7976 if (copy_first) { // we need to copy first chunk, not increase refcount
7977 ++miter;
7978 }
7979
7980 rgw_rados_ref ref;
7981 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
7982 if (ret < 0) {
7983 return ret;
7984 }
7985
7986 bool versioned_dest = dest_bucket_info.versioning_enabled();
7987
7988 if (version_id && !version_id->empty()) {
7989 versioned_dest = true;
7990 dest_obj.key.set_instance(*version_id);
7991 } else if (versioned_dest) {
7992 gen_rand_obj_instance_name(&dest_obj);
7993 }
7994
7995 bufferlist first_chunk;
7996
7997 bool copy_itself = (dest_obj == src_obj);
7998 RGWObjManifest *pmanifest;
7999 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
8000
8001 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8002 RGWRados::Object::Write write_op(&dest_op_target);
8003
8004 string tag;
8005
8006 if (ptag) {
8007 tag = *ptag;
8008 }
8009
8010 if (tag.empty()) {
8011 append_rand_alpha(cct, tag, tag, 32);
8012 }
8013
8014 if (!copy_itself) {
8015 manifest = astate->manifest;
8016 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8017 if (tail_placement.bucket.name.empty()) {
8018 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8019 }
8020 string oid, key;
8021 for (; miter != astate->manifest.obj_end(); ++miter) {
8022 ObjectWriteOperation op;
8023 cls_refcount_get(op, tag, true);
8024 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8025 ref.ioctx.locator_set_key(loc.loc);
8026
8027 ret = ref.ioctx.operate(loc.oid, &op);
8028 if (ret < 0) {
8029 goto done_ret;
8030 }
8031
8032 ref_objs.push_back(loc);
8033 }
8034
8035 pmanifest = &manifest;
8036 } else {
8037 pmanifest = &astate->manifest;
8038 /* don't send the object's tail for garbage collection */
8039 astate->keep_tail = true;
8040 }
8041
8042 if (copy_first) {
8043 ret = read_op.read(0, max_chunk_size, first_chunk);
8044 if (ret < 0) {
8045 goto done_ret;
8046 }
8047
8048 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8049 } else {
8050 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8051 }
8052
8053 write_op.meta.data = &first_chunk;
8054 write_op.meta.manifest = pmanifest;
8055 write_op.meta.ptag = &tag;
8056 write_op.meta.owner = dest_bucket_info.owner;
8057 write_op.meta.mtime = mtime;
8058 write_op.meta.flags = PUT_OBJ_CREATE;
8059 write_op.meta.category = category;
8060 write_op.meta.olh_epoch = olh_epoch;
8061 write_op.meta.delete_at = delete_at;
8062
8063 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8064 if (ret < 0) {
8065 goto done_ret;
8066 }
8067
8068 return 0;
8069
8070 done_ret:
8071 if (!copy_itself) {
8072 vector<rgw_raw_obj>::iterator riter;
8073
8074 string oid, key;
8075
8076 /* rollback reference */
8077 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8078 ObjectWriteOperation op;
8079 cls_refcount_put(op, tag, true);
8080
8081 ref.ioctx.locator_set_key(riter->loc);
8082
8083 int r = ref.ioctx.operate(riter->oid, &op);
8084 if (r < 0) {
8085 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8086 }
8087 }
8088 }
8089 return ret;
8090 }
8091
8092
8093 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8094 RGWBucketInfo& dest_bucket_info,
8095 RGWRados::Object::Read& read_op, off_t end,
8096 rgw_obj& dest_obj,
8097 rgw_obj& src_obj,
8098 uint64_t max_chunk_size,
8099 real_time *mtime,
8100 real_time set_mtime,
8101 map<string, bufferlist>& attrs,
8102 RGWObjCategory category,
8103 uint64_t olh_epoch,
8104 real_time delete_at,
8105 string *version_id,
8106 string *ptag,
8107 ceph::buffer::list *petag)
8108 {
8109 bufferlist first_chunk;
8110 RGWObjManifest manifest;
8111
8112 string tag;
8113 append_rand_alpha(cct, tag, tag, 32);
8114
8115 RGWPutObjProcessor_Atomic processor(obj_ctx,
8116 dest_bucket_info, dest_obj.bucket, dest_obj.get_oid(),
8117 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8118 if (version_id) {
8119 processor.set_version_id(*version_id);
8120 }
8121 processor.set_olh_epoch(olh_epoch);
8122 int ret = processor.prepare(this, NULL);
8123 if (ret < 0)
8124 return ret;
8125
8126 off_t ofs = 0;
8127
8128 do {
8129 bufferlist bl;
8130 ret = read_op.read(ofs, end, bl);
8131
8132 uint64_t read_len = ret;
8133 bool again;
8134
8135 do {
8136 void *handle;
8137 rgw_raw_obj obj;
8138
8139 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8140 if (ret < 0) {
8141 return ret;
8142 }
8143 ret = processor.throttle_data(handle, obj, read_len, false);
8144 if (ret < 0)
8145 return ret;
8146 } while (again);
8147
8148 ofs += read_len;
8149 } while (ofs <= end);
8150
8151 string etag;
8152 auto iter = attrs.find(RGW_ATTR_ETAG);
8153 if (iter != attrs.end()) {
8154 bufferlist& bl = iter->second;
8155 etag = string(bl.c_str(), bl.length());
8156 if (petag) {
8157 *petag = bl;
8158 }
8159 }
8160
8161 uint64_t accounted_size;
8162 {
8163 bool compressed{false};
8164 RGWCompressionInfo cs_info;
8165 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8166 if (ret < 0) {
8167 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8168 return ret;
8169 }
8170 // pass original size if compressed
8171 accounted_size = compressed ? cs_info.orig_size : ofs;
8172 }
8173
8174 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8175 }
8176
8177 bool RGWRados::is_meta_master()
8178 {
8179 if (!get_zonegroup().is_master_zonegroup()) {
8180 return false;
8181 }
8182
8183 return (get_zonegroup().master_zone == zone_public_config.id);
8184 }
8185
8186 /**
8187 * Check to see if the bucket metadata could be synced
8188 * bucket: the bucket to check
8189 * Returns false is the bucket is not synced
8190 */
8191 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8192 {
8193
8194 /* no current period */
8195 if (current_period.get_id().empty()) {
8196 return false;
8197 }
8198
8199 /* zonegroup is not master zonegroup */
8200 if (!get_zonegroup().is_master_zonegroup()) {
8201 return false;
8202 }
8203
8204 /* single zonegroup and a single zone */
8205 if (current_period.is_single_zonegroup(cct, this) && get_zonegroup().zones.size() == 1) {
8206 return false;
8207 }
8208
8209 /* zone is not master */
8210 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8211 return false;
8212 }
8213
8214 return true;
8215 }
8216
8217 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8218 {
8219 std::map<string, rgw_bucket_dir_entry> ent_map;
8220 rgw_obj_index_key marker;
8221 string prefix;
8222 bool is_truncated;
8223
8224 do {
8225 #define NUM_ENTRIES 1000
8226 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
8227 &is_truncated, &marker);
8228 if (r < 0)
8229 return r;
8230
8231 string ns;
8232 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
8233 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
8234 rgw_obj_key obj;
8235
8236 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
8237 return -ENOTEMPTY;
8238 }
8239 } while (is_truncated);
8240 return 0;
8241 }
8242
8243 /**
8244 * Delete a bucket.
8245 * bucket: the name of the bucket to delete
8246 * Returns 0 on success, -ERR# otherwise.
8247 */
8248 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8249 {
8250 const rgw_bucket& bucket = bucket_info.bucket;
8251 librados::IoCtx index_ctx;
8252 map<int, string> bucket_objs;
8253 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8254 if (r < 0)
8255 return r;
8256
8257 if (check_empty) {
8258 r = check_bucket_empty(bucket_info);
8259 if (r < 0) {
8260 return r;
8261 }
8262 }
8263
8264 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8265 if (r < 0)
8266 return r;
8267
8268 /* if the bucket is not synced we can remove the meta file */
8269 if (!is_syncing_bucket_meta(bucket)) {
8270 RGWObjVersionTracker objv_tracker;
8271 string entry = bucket.get_key();
8272 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8273 if (r < 0) {
8274 return r;
8275 }
8276 /* remove bucket index objects*/
8277 map<int, string>::const_iterator biter;
8278 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8279 index_ctx.remove(biter->second);
8280 }
8281 }
8282 return 0;
8283 }
8284
8285 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8286 {
8287 RGWBucketInfo info;
8288 map<string, bufferlist> attrs;
8289 RGWObjectCtx obj_ctx(this);
8290 int r;
8291 if (bucket.bucket_id.empty()) {
8292 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8293 } else {
8294 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8295 }
8296 if (r < 0) {
8297 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8298 return r;
8299 }
8300
8301 info.owner = owner.get_id();
8302
8303 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8304 if (r < 0) {
8305 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8306 return r;
8307 }
8308
8309 return 0;
8310 }
8311
8312
8313 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8314 {
8315 int ret = 0;
8316
8317 vector<rgw_bucket>::iterator iter;
8318
8319 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8320 rgw_bucket& bucket = *iter;
8321 if (enabled)
8322 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8323 else
8324 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8325
8326 RGWBucketInfo info;
8327 map<string, bufferlist> attrs;
8328 RGWObjectCtx obj_ctx(this);
8329 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8330 if (r < 0) {
8331 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8332 ret = r;
8333 continue;
8334 }
8335 if (enabled) {
8336 info.flags &= ~BUCKET_SUSPENDED;
8337 } else {
8338 info.flags |= BUCKET_SUSPENDED;
8339 }
8340
8341 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8342 if (r < 0) {
8343 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8344 ret = r;
8345 continue;
8346 }
8347 }
8348 return ret;
8349 }
8350
8351 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8352 {
8353 RGWBucketInfo bucket_info;
8354 RGWObjectCtx obj_ctx(this);
8355 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8356 if (ret < 0) {
8357 return ret;
8358 }
8359
8360 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8361 return 0;
8362 }
8363
8364 int RGWRados::Object::complete_atomic_modification()
8365 {
8366 if (!state->has_manifest || state->keep_tail)
8367 return 0;
8368
8369 cls_rgw_obj_chain chain;
8370 store->update_gc_chain(obj, state->manifest, &chain);
8371
8372 if (chain.empty()) {
8373 return 0;
8374 }
8375
8376 string tag = state->obj_tag.to_str();
8377 return store->gc->send_chain(chain, tag, false); // do it async
8378 }
8379
8380 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8381 {
8382 RGWObjManifest::obj_iterator iter;
8383 rgw_raw_obj raw_head;
8384 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8385 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8386 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8387 if (mobj == raw_head)
8388 continue;
8389 cls_rgw_obj_key key(mobj.oid);
8390 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8391 }
8392 }
8393
8394 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8395 {
8396 return gc->send_chain(chain, tag, sync);
8397 }
8398
8399 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8400 {
8401 const rgw_bucket& bucket = bucket_info.bucket;
8402 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8403 if (r < 0)
8404 return r;
8405
8406 if (bucket.bucket_id.empty()) {
8407 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8408 return -EIO;
8409 }
8410
8411 bucket_oid = dir_oid_prefix;
8412 bucket_oid.append(bucket.bucket_id);
8413
8414 return 0;
8415 }
8416
8417 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8418 string& bucket_oid_base) {
8419 const rgw_bucket& bucket = bucket_info.bucket;
8420 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8421 if (r < 0)
8422 return r;
8423
8424 if (bucket.bucket_id.empty()) {
8425 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8426 return -EIO;
8427 }
8428
8429 bucket_oid_base = dir_oid_prefix;
8430 bucket_oid_base.append(bucket.bucket_id);
8431
8432 return 0;
8433
8434 }
8435
8436 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8437 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8438 string bucket_oid_base;
8439 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8440 if (ret < 0) {
8441 return ret;
8442 }
8443
8444 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8445 if (bucket_instance_ids) {
8446 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8447 }
8448 return 0;
8449 }
8450
8451 template<typename T>
8452 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8453 map<int, string>& oids, map<int, T>& bucket_objs,
8454 int shard_id, map<int, string> *bucket_instance_ids)
8455 {
8456 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8457 if (ret < 0)
8458 return ret;
8459
8460 map<int, string>::const_iterator iter = oids.begin();
8461 for (; iter != oids.end(); ++iter) {
8462 bucket_objs[iter->first] = T();
8463 }
8464 return 0;
8465 }
8466
8467 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8468 const string& obj_key, string *bucket_obj, int *shard_id)
8469 {
8470 string bucket_oid_base;
8471 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8472 if (ret < 0)
8473 return ret;
8474
8475 RGWObjectCtx obj_ctx(this);
8476
8477 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8478 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8479 if (ret < 0) {
8480 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8481 return ret;
8482 }
8483 return 0;
8484 }
8485
8486 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8487 int shard_id, string *bucket_obj)
8488 {
8489 string bucket_oid_base;
8490 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8491 if (ret < 0)
8492 return ret;
8493
8494 RGWObjectCtx obj_ctx(this);
8495
8496 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8497 shard_id, bucket_obj);
8498 return 0;
8499 }
8500
8501 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8502 map<RGWObjCategory, RGWStorageStats>& stats)
8503 {
8504 for (const auto& pair : header.stats) {
8505 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8506 const rgw_bucket_category_stats& header_stats = pair.second;
8507
8508 RGWStorageStats& s = stats[category];
8509
8510 s.category = category;
8511 s.size += header_stats.total_size;
8512 s.size_rounded += header_stats.total_size_rounded;
8513 s.size_utilized += header_stats.actual_size;
8514 s.num_objects += header_stats.num_entries;
8515 }
8516 }
8517
8518 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8519 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8520 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8521 {
8522 librados::IoCtx index_ctx;
8523 // key - bucket index object id
8524 // value - bucket index check OP returned result with the given bucket index object (shard)
8525 map<int, string> oids;
8526 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8527
8528 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8529 if (ret < 0) {
8530 return ret;
8531 }
8532
8533 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8534 if (ret < 0) {
8535 return ret;
8536 }
8537
8538 // Aggregate results (from different shards if there is any)
8539 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8540 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8541 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8542 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8543 }
8544
8545 return 0;
8546 }
8547
8548 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8549 {
8550 librados::IoCtx index_ctx;
8551 map<int, string> bucket_objs;
8552
8553 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8554 if (r < 0) {
8555 return r;
8556 }
8557
8558 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8559 }
8560
8561 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8562 {
8563 librados::IoCtx index_ctx;
8564 map<int, string> bucket_objs;
8565
8566 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8567 if (r < 0) {
8568 return r;
8569 }
8570
8571 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8572 }
8573
8574 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8575 {
8576 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8577 std::string oid, key;
8578 get_obj_bucket_and_oid_loc(obj, oid, key);
8579 if (!rctx)
8580 return 0;
8581
8582 RGWObjState *state = NULL;
8583
8584 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8585 if (r < 0)
8586 return r;
8587
8588 if (!state->is_atomic) {
8589 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8590 return -EINVAL;
8591 }
8592
8593 if (state->obj_tag.length() == 0) {// check for backward compatibility
8594 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8595 return -EINVAL;
8596 }
8597
8598 string tag = state->obj_tag.c_str();
8599
8600 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8601
8602 return gc->defer_chain(tag, false);
8603 }
8604
8605 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8606 {
8607 list<string> prefixes;
8608 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8609 cls_rgw_remove_obj(op, prefixes);
8610 }
8611
8612 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
8613 {
8614 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
8615 }
8616
8617 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
8618 {
8619 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
8620 }
8621
8622
8623 /**
 * Delete the object this operation targets (target->get_obj()).
 *
 * Versioned path - taken when params.versioning_status has BUCKET_VERSIONED
 * set or params.marker_version_id is non-empty: either installs a delete
 * marker through set_olh(), or unlinks one specific instance through
 * unlink_obj_instance(); afterwards a data log entry is added for the
 * bucket shard. result.version_id and result.delete_marker are filled in.
 *
 * Plain path: evaluates the optional params.unmod_since and
 * params.expiration_time preconditions, removes the head object with a
 * guarded write operation, completes (or cancels) the bucket index update
 * and finally updates the quota cache.
 *
 * Returns: 0 on success, -ERR# otherwise.
 */
8629 int RGWRados::Object::Delete::delete_obj()
8630 {
8631 RGWRados *store = target->get_store();
8632 rgw_obj& src_obj = target->get_obj();
8633 const string& instance = src_obj.key.instance;
8634 rgw_obj obj = src_obj;
8635
// work on a copy whose instance is cleared when the requested instance is "null"
8636 if (instance == "null") {
8637 obj.key.instance.clear();
8638 }
8639
8640 bool explicit_marker_version = (!params.marker_version_id.empty());
8641
8642 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
// no instance requested (or a marker version was supplied): create a delete marker
8643 if (instance.empty() || explicit_marker_version) {
8644 rgw_obj marker = obj;
8645
// marker instance: the supplied version id (unless it is "null"), otherwise
// a random instance name unless versioning is suspended
8646 if (!params.marker_version_id.empty()) {
8647 if (params.marker_version_id != "null") {
8648 marker.key.set_instance(params.marker_version_id);
8649 }
8650 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
8651 store->gen_rand_obj_instance_name(&marker);
8652 }
8653
8654 result.version_id = marker.key.instance;
8655 result.delete_marker = true;
8656
8657 struct rgw_bucket_dir_entry_meta meta;
8658
8659 meta.owner = params.obj_owner.get_id().to_str();
8660 meta.owner_display_name = params.obj_owner.get_display_name();
8661
// fall back to the current time when the caller gave no mtime
8662 if (real_clock::is_zero(params.mtime)) {
8663 meta.mtime = real_clock::now();
8664 } else {
8665 meta.mtime = params.mtime;
8666 }
8667
8668 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
8669 if (r < 0) {
8670 return r;
8671 }
8672 } else {
// a specific instance was requested: unlink just that instance
8673 rgw_bucket_dir_entry dirent;
8674
8675 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
8676 if (r < 0) {
8677 return r;
8678 }
// report whether the instance being removed was itself a delete marker
8679 result.delete_marker = dirent.is_delete_marker();
8680 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
8681 if (r < 0) {
8682 return r;
8683 }
8684 result.version_id = instance;
8685 }
8686
8687 BucketShard *bs;
8688 int r = target->get_bucket_shard(&bs);
8689 if (r < 0) {
8690 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
8691 return r;
8692 }
8693
8694 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
8695 if (r < 0) {
8696 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
8697 return r;
8698 }
8699
8700 return 0;
8701 }
8702
// ---- plain path ----
8703 rgw_rados_ref ref;
8704 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
8705 if (r < 0) {
8706 return r;
8707 }
8708
8709 RGWObjState *state;
8710 r = target->get_state(&state, false);
8711 if (r < 0)
8712 return r;
8713
8714 ObjectWriteOperation op;
8715
8716 if (!real_clock::is_zero(params.unmod_since)) {
8717 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
8718 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
// without high precision time only whole seconds are compared
8719 if (!params.high_precision_time) {
8720 ctime.tv_nsec = 0;
8721 unmod.tv_nsec = 0;
8722 }
8723
8724 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
8725 if (ctime > unmod) {
8726 return -ERR_PRECONDITION_FAILED;
8727 }
8728
8729 /* only delete object if mtime is less than or equal to params.unmod_since */
8730 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
8731 }
// captured before the removal; used for the quota update at the end
8732 uint64_t obj_size = state->size;
8733
// when an expiration time is given, the object must carry a matching
// RGW_ATTR_DELETE_AT attribute, otherwise the precondition fails
8734 if (!real_clock::is_zero(params.expiration_time)) {
8735 bufferlist bl;
8736 real_time delete_at;
8737
8738 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
8739 try {
8740 bufferlist::iterator iter = bl.begin();
8741 ::decode(delete_at, iter);
8742 } catch (buffer::error& err) {
8743 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
8744 return -EIO;
8745 }
8746
8747 if (params.expiration_time != delete_at) {
8748 return -ERR_PRECONDITION_FAILED;
8749 }
8750 } else {
8751 return -ERR_PRECONDITION_FAILED;
8752 }
8753 }
8754
8755 if (!state->exists) {
8756 target->invalidate_state();
8757 return -ENOENT;
8758 }
8759
// removal_op=true: adds the guards to op without assigning a new id tag
8760 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true);
8761 if (r < 0)
8762 return r;
8763
8764 RGWBucketInfo& bucket_info = target->get_bucket_info();
8765
8766 RGWRados::Bucket bop(store, bucket_info);
8767 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8768
8769 index_op.set_zones_trace(params.zones_trace);
8770 index_op.set_bilog_flags(params.bilog_flags);
8771
8772
// index update is prepared first, then completed or cancelled depending on
// the outcome of the head object removal
8773 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
8774 if (r < 0)
8775 return r;
8776
8777 store->remove_rgw_head_obj(op);
8778 r = ref.ioctx.operate(ref.oid, &op);
8779 bool need_invalidate = false;
8780 if (r == -ECANCELED) {
8781 /* raced with another operation, we can regard it as removed */
8782 need_invalidate = true;
8783 r = 0;
8784 }
8785 bool removed = (r >= 0);
8786
8787 int64_t poolid = ref.ioctx.get_id();
8788 if (r >= 0) {
// remember the removed object's state in the tombstone cache, if one exists
8789 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
8790 if (obj_tombstone_cache) {
8791 tombstone_entry entry{*state};
8792 obj_tombstone_cache->add(obj, entry);
8793 }
8794 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
8795 } else {
// removal failed: roll back the prepared index operation (best effort, only logged)
8796 int ret = index_op.cancel();
8797 if (ret < 0) {
8798 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
8799 }
8800 }
8801 if (removed) {
8802 int ret = target->complete_atomic_modification();
8803 if (ret < 0) {
8804 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
8805 }
8806 /* other than that, no need to propagate error */
8807 }
8808
8809 if (need_invalidate) {
8810 target->invalidate_state();
8811 }
8812
// r now holds the result of complete_del() or the failed removal
8813 if (r < 0)
8814 return r;
8815
8816 /* update quota cache */
8817 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
8818
8819 return 0;
8820 }
8821
8822 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
8823 const RGWBucketInfo& bucket_info,
8824 const rgw_obj& obj,
8825 int versioning_status,
8826 uint16_t bilog_flags,
8827 const real_time& expiration_time,
8828 rgw_zone_set *zones_trace)
8829 {
8830 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
8831 RGWRados::Object::Delete del_op(&del_target);
8832
8833 del_op.params.bucket_owner = bucket_info.owner;
8834 del_op.params.versioning_status = versioning_status;
8835 del_op.params.bilog_flags = bilog_flags;
8836 del_op.params.expiration_time = expiration_time;
8837 del_op.params.zones_trace = zones_trace;
8838
8839 return del_op.delete_obj();
8840 }
8841
8842 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
8843 {
8844 rgw_rados_ref ref;
8845 rgw_pool pool;
8846 int r = get_raw_obj_ref(obj, &ref, &pool);
8847 if (r < 0) {
8848 return r;
8849 }
8850
8851 ObjectWriteOperation op;
8852
8853 op.remove();
8854 r = ref.ioctx.operate(ref.oid, &op);
8855 if (r < 0)
8856 return r;
8857
8858 return 0;
8859 }
8860
8861 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
8862 {
8863 if (obj.empty()) {
8864 ldout(cct, 1) << "delete_system_obj got empty object name "
8865 << obj << ", returning EINVAL" << dendl;
8866 return -EINVAL;
8867 }
8868 rgw_rados_ref ref;
8869 rgw_pool pool;
8870 int r = get_raw_obj_ref(obj, &ref, &pool);
8871 if (r < 0) {
8872 return r;
8873 }
8874
8875 ObjectWriteOperation op;
8876
8877 if (objv_tracker) {
8878 objv_tracker->prepare_op_for_write(&op);
8879 }
8880
8881 op.remove();
8882 r = ref.ioctx.operate(ref.oid, &op);
8883 if (r < 0)
8884 return r;
8885
8886 return 0;
8887 }
8888
8889 int RGWRados::delete_obj_index(const rgw_obj& obj)
8890 {
8891 std::string oid, key;
8892 get_obj_bucket_and_oid_loc(obj, oid, key);
8893
8894 RGWObjectCtx obj_ctx(this);
8895
8896 RGWBucketInfo bucket_info;
8897 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
8898 if (ret < 0) {
8899 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
8900 return ret;
8901 }
8902
8903 RGWRados::Bucket bop(this, bucket_info);
8904 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
8905
8906 real_time removed_mtime;
8907 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
8908
8909 return r;
8910 }
8911
8912 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
8913 {
8914 string tag;
8915
8916 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
8917 if (mi != manifest.obj_end()) {
8918 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
8919 ++mi;
8920 tag = mi.get_location().get_raw_obj(store).oid;
8921 tag.append("_");
8922 }
8923
8924 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
8925 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
8926 MD5 hash;
8927 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
8928
8929 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
8930 if (iter != attrset.end()) {
8931 bufferlist& bl = iter->second;
8932 hash.Update((const byte *)bl.c_str(), bl.length());
8933 }
8934
8935 hash.Final(md5);
8936 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
8937 tag.append(md5_str);
8938
8939 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
8940
8941 tag_bl.append(tag.c_str(), tag.size() + 1);
8942 }
8943
8944 static bool is_olh(map<string, bufferlist>& attrs)
8945 {
8946 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
8947 return (iter != attrs.end());
8948 }
8949
8950 static bool has_olh_tag(map<string, bufferlist>& attrs)
8951 {
8952 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
8953 return (iter != attrs.end());
8954 }
8955
8956 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8957 RGWObjState *olh_state, RGWObjState **target_state)
8958 {
8959 assert(olh_state->is_olh);
8960
8961 rgw_obj target;
8962 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
8963 if (r < 0) {
8964 return r;
8965 }
8966 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
8967 if (r < 0) {
8968 return r;
8969 }
8970
8971 return 0;
8972 }
8973
8974 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
8975 {
8976 if (obj.empty()) {
8977 return -EINVAL;
8978 }
8979
8980 RGWRawObjState *s = rctx->raw.get_state(obj);
8981 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
8982 *state = s;
8983 if (s->has_attrs) {
8984 return 0;
8985 }
8986
8987 s->obj = obj;
8988
8989 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
8990 if (r == -ENOENT) {
8991 s->exists = false;
8992 s->has_attrs = true;
8993 s->mtime = real_time();
8994 return 0;
8995 }
8996 if (r < 0)
8997 return r;
8998
8999 s->exists = true;
9000 s->has_attrs = true;
9001 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9002
9003 if (s->obj_tag.length())
9004 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9005 << s->obj_tag.c_str() << dendl;
9006 else
9007 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9008
9009 return 0;
9010 }
9011
9012 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9013 {
9014 int ret;
9015
9016 do {
9017 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9018 } while (ret == -EAGAIN);
9019
9020 return ret;
9021 }
9022
9023 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9024 RGWObjState **state, bool follow_olh, bool assume_noent)
9025 {
9026 if (obj.empty()) {
9027 return -EINVAL;
9028 }
9029
9030 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9031
9032 RGWObjState *s = rctx->obj.get_state(obj);
9033 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9034 *state = s;
9035 if (s->has_attrs) {
9036 if (s->is_olh && need_follow_olh) {
9037 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9038 }
9039 return 0;
9040 }
9041
9042 s->obj = obj;
9043
9044 rgw_raw_obj raw_obj;
9045 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9046
9047 int r = -ENOENT;
9048
9049 if (!assume_noent) {
9050 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9051 }
9052
9053 if (r == -ENOENT) {
9054 s->exists = false;
9055 s->has_attrs = true;
9056 tombstone_entry entry;
9057 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9058 s->mtime = entry.mtime;
9059 s->zone_short_id = entry.zone_short_id;
9060 s->pg_ver = entry.pg_ver;
9061 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9062 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9063 } else {
9064 s->mtime = real_time();
9065 }
9066 return 0;
9067 }
9068 if (r < 0)
9069 return r;
9070
9071 s->exists = true;
9072 s->has_attrs = true;
9073 s->accounted_size = s->size;
9074
9075 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9076 const bool compressed = (iter != s->attrset.end());
9077 if (compressed) {
9078 // use uncompressed size for accounted_size
9079 try {
9080 RGWCompressionInfo info;
9081 auto p = iter->second.begin();
9082 ::decode(info, p);
9083 s->accounted_size = info.orig_size;
9084 } catch (buffer::error&) {
9085 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9086 return -EIO;
9087 }
9088 }
9089
9090 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9091 if (iter != s->attrset.end()) {
9092 bufferlist bl = iter->second;
9093 bufferlist::iterator it = bl.begin();
9094 it.copy(bl.length(), s->shadow_obj);
9095 s->shadow_obj[bl.length()] = '\0';
9096 }
9097 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9098
9099 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9100 if (manifest_bl.length()) {
9101 bufferlist::iterator miter = manifest_bl.begin();
9102 try {
9103 ::decode(s->manifest, miter);
9104 s->has_manifest = true;
9105 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9106 broken due to old bugs */
9107 s->size = s->manifest.get_obj_size();
9108 if (!compressed)
9109 s->accounted_size = s->size;
9110 } catch (buffer::error& err) {
9111 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9112 return -EIO;
9113 }
9114 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9115 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9116 RGWObjManifest::obj_iterator mi;
9117 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9118 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9119 }
9120 }
9121
9122 if (!s->obj_tag.length()) {
9123 /*
9124 * Uh oh, something's wrong, object with manifest should have tag. Let's
9125 * create one out of the manifest, would be unique
9126 */
9127 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9128 s->fake_tag = true;
9129 }
9130 }
9131 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9132 if (aiter != s->attrset.end()) {
9133 bufferlist& pg_ver_bl = aiter->second;
9134 if (pg_ver_bl.length()) {
9135 bufferlist::iterator pgbl = pg_ver_bl.begin();
9136 try {
9137 ::decode(s->pg_ver, pgbl);
9138 } catch (buffer::error& err) {
9139 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9140 }
9141 }
9142 }
9143 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9144 if (aiter != s->attrset.end()) {
9145 bufferlist& zone_short_id_bl = aiter->second;
9146 if (zone_short_id_bl.length()) {
9147 bufferlist::iterator zbl = zone_short_id_bl.begin();
9148 try {
9149 ::decode(s->zone_short_id, zbl);
9150 } catch (buffer::error& err) {
9151 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9152 }
9153 }
9154 }
9155 if (s->obj_tag.length())
9156 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9157 else
9158 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9159
9160 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9161 * it exist, and not only if is_olh() returns true
9162 */
9163 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9164 if (iter != s->attrset.end()) {
9165 s->olh_tag = iter->second;
9166 }
9167
9168 if (is_olh(s->attrset)) {
9169 s->is_olh = true;
9170
9171 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9172
9173 if (need_follow_olh) {
9174 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9175 }
9176 }
9177
9178 return 0;
9179 }
9180
9181 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9182 bool follow_olh, bool assume_noent)
9183 {
9184 int ret;
9185
9186 do {
9187 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9188 } while (ret == -EAGAIN);
9189
9190 return ret;
9191 }
9192
9193 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9194 {
9195 RGWObjState *astate;
9196 int r = get_state(&astate, true);
9197 if (r < 0) {
9198 return r;
9199 }
9200
9201 *pmanifest = &astate->manifest;
9202
9203 return 0;
9204 }
9205
9206 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9207 {
9208 RGWObjState *state;
9209 int r = source->get_state(&state, true);
9210 if (r < 0)
9211 return r;
9212 if (!state->exists)
9213 return -ENOENT;
9214 if (!state->get_attr(name, dest))
9215 return -ENODATA;
9216
9217 return 0;
9218 }
9219
9220
9221 int RGWRados::Object::Stat::stat_async()
9222 {
9223 RGWObjectCtx& ctx = source->get_ctx();
9224 rgw_obj& obj = source->get_obj();
9225 RGWRados *store = source->get_store();
9226
9227 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9228 result.obj = obj;
9229 if (s->has_attrs) {
9230 state.ret = 0;
9231 result.size = s->size;
9232 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9233 result.attrs = s->attrset;
9234 result.has_manifest = s->has_manifest;
9235 result.manifest = s->manifest;
9236 return 0;
9237 }
9238
9239 string oid;
9240 string loc;
9241 get_obj_bucket_and_oid_loc(obj, oid, loc);
9242
9243 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9244 if (r < 0) {
9245 return r;
9246 }
9247
9248 librados::ObjectReadOperation op;
9249 op.stat2(&result.size, &result.mtime, NULL);
9250 op.getxattrs(&result.attrs, NULL);
9251 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9252 state.io_ctx.locator_set_key(loc);
9253 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9254 if (r < 0) {
9255 ldout(store->ctx(), 5) << __func__
9256 << ": ERROR: aio_operate() returned ret=" << r
9257 << dendl;
9258 return r;
9259 }
9260
9261 return 0;
9262 }
9263
9264
9265 int RGWRados::Object::Stat::wait()
9266 {
9267 if (!state.completion) {
9268 return state.ret;
9269 }
9270
9271 state.completion->wait_for_safe();
9272 state.ret = state.completion->get_return_value();
9273 state.completion->release();
9274
9275 if (state.ret != 0) {
9276 return state.ret;
9277 }
9278
9279 return finish();
9280 }
9281
9282 int RGWRados::Object::Stat::finish()
9283 {
9284 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9285 if (iter != result.attrs.end()) {
9286 bufferlist& bl = iter->second;
9287 bufferlist::iterator biter = bl.begin();
9288 try {
9289 ::decode(result.manifest, biter);
9290 } catch (buffer::error& err) {
9291 RGWRados *store = source->get_store();
9292 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9293 return -EIO;
9294 }
9295 result.has_manifest = true;
9296 }
9297
9298 return 0;
9299 }
9300
9301 /**
9302 * Get an attribute for a system object.
9303 * obj: the object to get attr
9304 * name: name of the attr to retrieve
9305 * dest: bufferlist to store the result in
9306 * Returns: 0 on success, -ERR# otherwise.
9307 */
9308 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9309 {
9310 rgw_rados_ref ref;
9311 rgw_pool pool;
9312 int r = get_system_obj_ref(obj, &ref, &pool);
9313 if (r < 0) {
9314 return r;
9315 }
9316
9317 ObjectReadOperation op;
9318
9319 int rval;
9320 op.getxattr(name, &dest, &rval);
9321
9322 r = ref.ioctx.operate(ref.oid, &op, NULL);
9323 if (r < 0)
9324 return r;
9325
9326 return 0;
9327 }
9328
9329 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9330 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9331 ObjectOperation& op, RGWObjState **pstate)
9332 {
9333 if (!rctx)
9334 return 0;
9335
9336 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9337 if (r < 0)
9338 return r;
9339
9340 RGWObjState *state = *pstate;
9341
9342 if (!state->is_atomic) {
9343 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9344 return 0;
9345 }
9346
9347 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9348 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9349 } else {
9350 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9351 }
9352 return 0;
9353 }
9354
9355 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9356 {
9357 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9358 }
9359
9360 void RGWRados::Object::invalidate_state()
9361 {
9362 ctx.obj.invalidate(obj);
9363 }
9364
9365 void RGWRados::SystemObject::invalidate_state()
9366 {
9367 ctx.raw.invalidate(obj);
9368 }
9369
/**
 * Prepare a write operation that modifies (or removes) this object atomically.
 *
 * op: write operation that receives the guards and the new id tag
 * reset_obj: when set, op also (re)creates the head object: create(false)
 *            plus remove_rgw_head_obj() if it exists (or the state is not
 *            atomic), create(true) otherwise
 * ptag: optional tag to use as the new write tag; otherwise random
 *       characters are appended to state->write_tag
 * if_match / if_nomatch: optional ETag preconditions; "*" tests for
 *            existence / non-existence of the object
 * removal_op: the object is being removed, so no new id tag is written
 *
 * Returns: 0 on success, -ERR_PRECONDITION_FAILED when an ETag precondition
 * does not hold, or the negative error from get_state().
 */
9370 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9371 const char *if_match, const char *if_nomatch, bool removal_op)
9372 {
9373 int r = get_state(&state, false);
9374 if (r < 0)
9375 return r;
9376
// guards are needed when there is something to compare against (manifest,
// tag or an explicit precondition) and the tag is not a fake one
9377 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9378 if_match != NULL || if_nomatch != NULL) &&
9379 (!state->fake_tag);
9380
// non-atomic state: no guards and no tag handling at all
9381 if (!state->is_atomic) {
9382 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9383
9384 if (reset_obj) {
9385 op.create(false);
9386 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9387 }
9388
9389 return 0;
9390 }
9391
9392 if (need_guard) {
9393 /* first verify that the object wasn't replaced under */
// skipped for if_nomatch == "*"
9394 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9395 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9396 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9397 }
9398
9399 if (if_match) {
9400 if (strcmp(if_match, "*") == 0) {
9401 // test the object is existing
9402 if (!state->exists) {
9403 return -ERR_PRECONDITION_FAILED;
9404 }
9405 } else {
// NOTE(review): strncmp is bounded by the stored ETag length, so this looks
// like it would accept an if_match value that merely starts with the stored
// ETag unless the attr includes a trailing NUL - confirm
9406 bufferlist bl;
9407 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9408 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9409 return -ERR_PRECONDITION_FAILED;
9410 }
9411 }
9412 }
9413
9414 if (if_nomatch) {
9415 if (strcmp(if_nomatch, "*") == 0) {
9416 // test the object is NOT existing
9417 if (state->exists) {
9418 return -ERR_PRECONDITION_FAILED;
9419 }
9420 } else {
// fails when the ETag attr is missing as well as when it matches
9421 bufferlist bl;
9422 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9423 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9424 return -ERR_PRECONDITION_FAILED;
9425 }
9426 }
9427 }
9428 }
9429
9430 if (reset_obj) {
9431 if (state->exists) {
9432 op.create(false);
9433 store->remove_rgw_head_obj(op);
9434 } else {
9435 op.create(true);
9436 }
9437 }
9438
9439 if (removal_op) {
9440 /* the object is being removed, no need to update its tag */
9441 return 0;
9442 }
9443
9444 if (ptag) {
9445 state->write_tag = *ptag;
9446 } else {
9447 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9448 }
// size() + 1: the tag is stored together with its terminating NUL
9449 bufferlist bl;
9450 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9451
9452 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9453
9454 op.setxattr(RGW_ATTR_ID_TAG, bl);
9455
9456 return 0;
9457 }
9458
9459 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9460 RGWObjVersionTracker *objv_tracker)
9461 {
9462 map<string, bufferlist> attrs;
9463 attrs[name] = bl;
9464 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9465 }
9466
9467 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9468 map<string, bufferlist>& attrs,
9469 map<string, bufferlist>* rmattrs,
9470 RGWObjVersionTracker *objv_tracker)
9471 {
9472 rgw_rados_ref ref;
9473 rgw_pool pool;
9474 int r = get_system_obj_ref(obj, &ref, &pool);
9475 if (r < 0) {
9476 return r;
9477 }
9478 ObjectWriteOperation op;
9479
9480 if (objv_tracker) {
9481 objv_tracker->prepare_op_for_write(&op);
9482 }
9483
9484 map<string, bufferlist>::iterator iter;
9485 if (rmattrs) {
9486 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9487 const string& name = iter->first;
9488 op.rmxattr(name.c_str());
9489 }
9490 }
9491
9492 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9493 const string& name = iter->first;
9494 bufferlist& bl = iter->second;
9495
9496 if (!bl.length())
9497 continue;
9498
9499 op.setxattr(name.c_str(), bl);
9500 }
9501
9502 if (!op.size())
9503 return 0;
9504
9505 bufferlist bl;
9506
9507 r = ref.ioctx.operate(ref.oid, &op);
9508 if (r < 0)
9509 return r;
9510
9511 return 0;
9512 }
9513
9514 /**
9515 * Set an attr on an object.
9516 * bucket: name of the bucket holding the object
9517 * obj: name of the object to set the attr on
9518 * name: the attr to set
9519 * bl: the contents of the attr
9520 * Returns: 0 on success, -ERR# otherwise.
9521 */
9522 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9523 {
9524 map<string, bufferlist> attrs;
9525 attrs[name] = bl;
9526 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9527 }
9528
9529 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9530 map<string, bufferlist>& attrs,
9531 map<string, bufferlist>* rmattrs)
9532 {
9533 rgw_rados_ref ref;
9534 int r = get_obj_head_ref(bucket_info, obj, &ref);
9535 if (r < 0) {
9536 return r;
9537 }
9538 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9539
9540 ObjectWriteOperation op;
9541 RGWObjState *state = NULL;
9542
9543 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9544 if (r < 0)
9545 return r;
9546
9547 map<string, bufferlist>::iterator iter;
9548 if (rmattrs) {
9549 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9550 const string& name = iter->first;
9551 op.rmxattr(name.c_str());
9552 }
9553 }
9554
9555 const rgw_bucket& bucket = obj.bucket;
9556
9557 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9558 const string& name = iter->first;
9559 bufferlist& bl = iter->second;
9560
9561 if (!bl.length())
9562 continue;
9563
9564 op.setxattr(name.c_str(), bl);
9565
9566 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9567 real_time ts;
9568 try {
9569 ::decode(ts, bl);
9570
9571 rgw_obj_index_key obj_key;
9572 obj.key.get_index_key(&obj_key);
9573
9574 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9575 } catch (buffer::error& err) {
9576 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9577 }
9578 }
9579 }
9580
9581 if (!op.size())
9582 return 0;
9583
9584 RGWObjectCtx obj_ctx(this);
9585
9586 bufferlist bl;
9587 RGWRados::Bucket bop(this, bucket_info);
9588 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9589
9590 if (state) {
9591 string tag;
9592 append_rand_alpha(cct, tag, tag, 32);
9593 state->write_tag = tag;
9594 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9595
9596 if (r < 0)
9597 return r;
9598
9599 bl.append(tag.c_str(), tag.size() + 1);
9600
9601 op.setxattr(RGW_ATTR_ID_TAG, bl);
9602 }
9603
9604 r = ref.ioctx.operate(ref.oid, &op);
9605 if (state) {
9606 if (r >= 0) {
9607 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9608 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9609 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9610 string etag(etag_bl.c_str(), etag_bl.length());
9611 string content_type(content_type_bl.c_str(), content_type_bl.length());
9612 uint64_t epoch = ref.ioctx.get_last_version();
9613 int64_t poolid = ref.ioctx.get_id();
9614 real_time mtime = real_clock::now();
9615 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9616 mtime, etag, content_type, &acl_bl,
9617 RGW_OBJ_CATEGORY_MAIN, NULL);
9618 } else {
9619 int ret = index_op.cancel();
9620 if (ret < 0) {
9621 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9622 }
9623 }
9624 }
9625 if (r < 0)
9626 return r;
9627
9628 if (state) {
9629 state->obj_tag.swap(bl);
9630 if (rmattrs) {
9631 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9632 state->attrset.erase(iter->first);
9633 }
9634 }
9635 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9636 state->attrset[iter->first] = iter->second;
9637 }
9638 }
9639
9640 return 0;
9641 }
9642
9643 int RGWRados::Object::Read::prepare()
9644 {
9645 RGWRados *store = source->get_store();
9646 CephContext *cct = store->ctx();
9647
9648 bufferlist etag;
9649
9650 map<string, bufferlist>::iterator iter;
9651
9652 RGWObjState *astate;
9653 int r = source->get_state(&astate, true);
9654 if (r < 0)
9655 return r;
9656
9657 if (!astate->exists) {
9658 return -ENOENT;
9659 }
9660
9661 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9662
9663 state.obj = astate->obj;
9664 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9665
9666 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9667 if (r < 0) {
9668 return r;
9669 }
9670 if (params.attrs) {
9671 *params.attrs = astate->attrset;
9672 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9673 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9674 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9675 }
9676 }
9677 }
9678
9679 /* Convert all times go GMT to make them compatible */
9680 if (conds.mod_ptr || conds.unmod_ptr) {
9681 obj_time_weight src_weight;
9682 src_weight.init(astate);
9683 src_weight.high_precision = conds.high_precision_time;
9684
9685 obj_time_weight dest_weight;
9686 dest_weight.high_precision = conds.high_precision_time;
9687
9688 if (conds.mod_ptr) {
9689 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9690 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9691 if (!(dest_weight < src_weight)) {
9692 return -ERR_NOT_MODIFIED;
9693 }
9694 }
9695
9696 if (conds.unmod_ptr) {
9697 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9698 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9699 if (dest_weight < src_weight) {
9700 return -ERR_PRECONDITION_FAILED;
9701 }
9702 }
9703 }
9704 if (conds.if_match || conds.if_nomatch) {
9705 r = get_attr(RGW_ATTR_ETAG, etag);
9706 if (r < 0)
9707 return r;
9708
9709 if (conds.if_match) {
9710 string if_match_str = rgw_string_unquote(conds.if_match);
9711 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9712 if (if_match_str.compare(etag.c_str()) != 0) {
9713 return -ERR_PRECONDITION_FAILED;
9714 }
9715 }
9716
9717 if (conds.if_nomatch) {
9718 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9719 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9720 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9721 return -ERR_NOT_MODIFIED;
9722 }
9723 }
9724 }
9725
9726 if (params.obj_size)
9727 *params.obj_size = astate->size;
9728 if (params.lastmod)
9729 *params.lastmod = astate->mtime;
9730
9731 return 0;
9732 }
9733
9734 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9735 {
9736 if (ofs < 0) {
9737 ofs += obj_size;
9738 if (ofs < 0)
9739 ofs = 0;
9740 end = obj_size - 1;
9741 } else if (end < 0) {
9742 end = obj_size - 1;
9743 }
9744
9745 if (obj_size > 0) {
9746 if (ofs >= (off_t)obj_size) {
9747 return -ERANGE;
9748 }
9749 if (end >= (off_t)obj_size) {
9750 end = obj_size - 1;
9751 }
9752 }
9753 return 0;
9754 }
9755
9756 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9757 {
9758 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9759 }
9760
9761 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9762 RGWRados::SystemObject::Read::GetObjState& state,
9763 rgw_raw_obj& obj,
9764 map<string, bufferlist> *attrs,
9765 real_time *lastmod,
9766 uint64_t *obj_size,
9767 RGWObjVersionTracker *objv_tracker)
9768 {
9769 RGWRawObjState *astate = NULL;
9770
9771 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
9772 if (r < 0)
9773 return r;
9774
9775 if (!astate->exists) {
9776 return -ENOENT;
9777 }
9778
9779 if (attrs) {
9780 *attrs = astate->attrset;
9781 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9782 map<string, bufferlist>::iterator iter;
9783 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
9784 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9785 }
9786 }
9787 }
9788
9789 if (obj_size)
9790 *obj_size = astate->size;
9791 if (lastmod)
9792 *lastmod = astate->mtime;
9793
9794 return 0;
9795 }
9796
9797
9798 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
9799 {
9800 RGWRados *store = target->get_store();
9801 BucketShard *bs;
9802 int r;
9803
9804 #define NUM_RESHARD_RETRIES 10
9805 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
9806 int ret = get_bucket_shard(&bs);
9807 if (ret < 0) {
9808 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9809 return ret;
9810 }
9811 r = call(bs);
9812 if (r != -ERR_BUSY_RESHARDING) {
9813 break;
9814 }
9815 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
9816 string new_bucket_id;
9817 r = store->block_while_resharding(bs, &new_bucket_id);
9818 if (r == -ERR_BUSY_RESHARDING) {
9819 continue;
9820 }
9821 if (r < 0) {
9822 return r;
9823 }
9824 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
9825 i = 0; /* resharding is finished, make sure we can retry */
9826 r = target->update_bucket_id(new_bucket_id);
9827 if (r < 0) {
9828 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
9829 return r;
9830 }
9831 invalidate_bs();
9832 }
9833
9834 if (r < 0) {
9835 return r;
9836 }
9837
9838 if (pbs) {
9839 *pbs = bs;
9840 }
9841
9842 return 0;
9843 }
9844
9845 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
9846 {
9847 RGWRados *store = source->get_store();
9848 rgw_raw_obj& obj = source->get_obj();
9849
9850 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
9851 stat_params.lastmod, stat_params.obj_size, objv_tracker);
9852 }
9853
9854 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
9855 {
9856 if (blind) {
9857 return 0;
9858 }
9859 RGWRados *store = target->get_store();
9860
9861 if (write_tag && write_tag->length()) {
9862 optag = string(write_tag->c_str(), write_tag->length());
9863 } else {
9864 if (optag.empty()) {
9865 append_rand_alpha(store->ctx(), optag, optag, 32);
9866 }
9867 }
9868
9869 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
9870 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
9871 });
9872
9873 if (r < 0) {
9874 return r;
9875 }
9876 prepared = true;
9877
9878 return 0;
9879 }
9880
9881 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
9882 uint64_t size, uint64_t accounted_size,
9883 ceph::real_time& ut, const string& etag,
9884 const string& content_type,
9885 bufferlist *acl_bl,
9886 RGWObjCategory category,
9887 list<rgw_obj_index_key> *remove_objs, const string *user_data)
9888 {
9889 if (blind) {
9890 return 0;
9891 }
9892 RGWRados *store = target->get_store();
9893 BucketShard *bs;
9894
9895 int ret = get_bucket_shard(&bs);
9896 if (ret < 0) {
9897 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9898 return ret;
9899 }
9900
9901 rgw_bucket_dir_entry ent;
9902 obj.key.get_index_key(&ent.key);
9903 ent.meta.size = size;
9904 ent.meta.accounted_size = accounted_size;
9905 ent.meta.mtime = ut;
9906 ent.meta.etag = etag;
9907 if (user_data)
9908 ent.meta.user_data = *user_data;
9909
9910 ACLOwner owner;
9911 if (acl_bl && acl_bl->length()) {
9912 int ret = store->decode_policy(*acl_bl, &owner);
9913 if (ret < 0) {
9914 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
9915 }
9916 }
9917 ent.meta.owner = owner.get_id().to_str();
9918 ent.meta.owner_display_name = owner.get_display_name();
9919 ent.meta.content_type = content_type;
9920
9921 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
9922
9923 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9924 if (r < 0) {
9925 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9926 }
9927
9928 return ret;
9929 }
9930
9931 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
9932 real_time& removed_mtime,
9933 list<rgw_obj_index_key> *remove_objs)
9934 {
9935 if (blind) {
9936 return 0;
9937 }
9938 RGWRados *store = target->get_store();
9939 BucketShard *bs;
9940
9941 int ret = get_bucket_shard(&bs);
9942 if (ret < 0) {
9943 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
9944 return ret;
9945 }
9946
9947 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
9948
9949 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9950 if (r < 0) {
9951 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9952 }
9953
9954 return ret;
9955 }
9956
9957
9958 int RGWRados::Bucket::UpdateIndex::cancel()
9959 {
9960 if (blind) {
9961 return 0;
9962 }
9963 RGWRados *store = target->get_store();
9964 BucketShard *bs;
9965
9966 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
9967 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
9968 });
9969
9970 /*
9971 * need to update data log anyhow, so that whoever follows needs to update its internal markers
9972 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
9973 * have no way to tell that they're all caught up
9974 */
9975 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
9976 if (r < 0) {
9977 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
9978 }
9979
9980 return ret;
9981 }
9982
9983 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
9984 {
9985 RGWRados *store = source->get_store();
9986 CephContext *cct = store->ctx();
9987
9988 std::string oid, key;
9989 rgw_raw_obj read_obj;
9990 uint64_t read_ofs = ofs;
9991 uint64_t len, read_len;
9992 bool reading_from_head = true;
9993 ObjectReadOperation op;
9994
9995 bool merge_bl = false;
9996 bufferlist *pbl = &bl;
9997 bufferlist read_bl;
9998 uint64_t max_chunk_size;
9999
10000 RGWObjState *astate;
10001 int r = source->get_state(&astate, true);
10002 if (r < 0)
10003 return r;
10004
10005 if (end < 0)
10006 len = 0;
10007 else
10008 len = end - ofs + 1;
10009
10010 if (astate->has_manifest && astate->manifest.has_tail()) {
10011 /* now get the relevant object part */
10012 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10013
10014 uint64_t stripe_ofs = iter.get_stripe_ofs();
10015 read_obj = iter.get_location().get_raw_obj(store);
10016 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10017 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10018 reading_from_head = (read_obj == state.head_obj);
10019 } else {
10020 read_obj = state.head_obj;
10021 }
10022
10023 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10024 if (r < 0) {
10025 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10026 return r;
10027 }
10028
10029 if (len > max_chunk_size)
10030 len = max_chunk_size;
10031
10032
10033 state.io_ctx.locator_set_key(read_obj.loc);
10034
10035 read_len = len;
10036
10037 if (reading_from_head) {
10038 /* only when reading from the head object do we need to do the atomic test */
10039 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10040 if (r < 0)
10041 return r;
10042
10043 if (astate && astate->prefetch_data) {
10044 if (!ofs && astate->data.length() >= len) {
10045 bl = astate->data;
10046 return bl.length();
10047 }
10048
10049 if (ofs < astate->data.length()) {
10050 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10051 astate->data.copy(ofs, copy_len, bl);
10052 read_len -= copy_len;
10053 read_ofs += copy_len;
10054 if (!read_len)
10055 return bl.length();
10056
10057 merge_bl = true;
10058 pbl = &read_bl;
10059 }
10060 }
10061 }
10062
10063 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10064 op.read(read_ofs, read_len, pbl, NULL);
10065
10066 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10067 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10068
10069 if (r < 0) {
10070 return r;
10071 }
10072
10073 if (merge_bl) {
10074 bl.append(read_bl);
10075 }
10076
10077 return bl.length();
10078 }
10079
10080 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10081 {
10082 if (!has_ref) {
10083 rgw_pool pool;
10084 int r = store->get_raw_obj_ref(obj, &ref, &pool);
10085 if (r < 0) {
10086 return r;
10087 }
10088 has_ref = true;
10089 }
10090 *pref = &ref;
10091 return 0;
10092
10093 }
10094
10095 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10096 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10097 bufferlist& bl, off_t ofs, off_t end,
10098 map<string, bufferlist> *attrs,
10099 rgw_cache_entry_info *cache_info)
10100 {
10101 uint64_t len;
10102 ObjectReadOperation op;
10103
10104 if (end < 0)
10105 len = 0;
10106 else
10107 len = end - ofs + 1;
10108
10109 if (objv_tracker) {
10110 objv_tracker->prepare_op_for_read(&op);
10111 }
10112
10113 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10114 op.read(ofs, len, &bl, NULL);
10115
10116 if (attrs) {
10117 op.getxattrs(attrs, NULL);
10118 }
10119
10120 rgw_rados_ref *ref;
10121 int r = read_state.get_ref(this, obj, &ref);
10122 if (r < 0) {
10123 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10124 return r;
10125 }
10126 r = ref->ioctx.operate(ref->oid, &op, NULL);
10127 if (r < 0) {
10128 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10129 return r;
10130 }
10131 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10132
10133 uint64_t op_ver = ref->ioctx.get_last_version();
10134
10135 if (read_state.last_ver > 0 &&
10136 read_state.last_ver != op_ver) {
10137 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10138 return -ECANCELED;
10139 }
10140
10141 read_state.last_ver = op_ver;
10142
10143 return bl.length();
10144 }
10145
10146 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl, RGWObjVersionTracker *objv_tracker)
10147 {
10148 RGWRados *store = source->get_store();
10149 rgw_raw_obj& obj = source->get_obj();
10150
10151 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl, ofs, end, read_params.attrs, read_params.cache_info);
10152 }
10153
10154 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10155 {
10156 RGWRados *store = source->get_store();
10157 rgw_raw_obj& obj = source->get_obj();
10158
10159 return store->system_obj_get_attr(obj, name, dest);
10160 }
10161
10162 struct get_obj_data;
10163
10164 struct get_obj_aio_data {
10165 struct get_obj_data *op_data;
10166 off_t ofs;
10167 off_t len;
10168 };
10169
10170 struct get_obj_io {
10171 off_t len;
10172 bufferlist bl;
10173 };
10174
10175 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10176
10177 struct get_obj_data : public RefCountedObject {
10178 CephContext *cct;
10179 RGWRados *rados;
10180 RGWObjectCtx *ctx;
10181 IoCtx io_ctx;
10182 map<off_t, get_obj_io> io_map;
10183 map<off_t, librados::AioCompletion *> completion_map;
10184 uint64_t total_read;
10185 Mutex lock;
10186 Mutex data_lock;
10187 list<get_obj_aio_data> aio_data;
10188 RGWGetDataCB *client_cb;
10189 std::atomic<bool> cancelled = { false };
10190 std::atomic<int64_t> err_code = { 0 };
10191 Throttle throttle;
10192 list<bufferlist> read_list;
10193
10194 explicit get_obj_data(CephContext *_cct)
10195 : cct(_cct),
10196 rados(NULL), ctx(NULL),
10197 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10198 client_cb(NULL),
10199 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10200 ~get_obj_data() override { }
10201 void set_cancelled(int r) {
10202 cancelled = true;
10203 err_code = r;
10204 }
10205
10206 bool is_cancelled() {
10207 return cancelled;
10208 }
10209
10210 int get_err_code() {
10211 return err_code;
10212 }
10213
10214 int wait_next_io(bool *done) {
10215 lock.Lock();
10216 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10217 if (iter == completion_map.end()) {
10218 *done = true;
10219 lock.Unlock();
10220 return 0;
10221 }
10222 off_t cur_ofs = iter->first;
10223 librados::AioCompletion *c = iter->second;
10224 lock.Unlock();
10225
10226 c->wait_for_safe_and_cb();
10227 int r = c->get_return_value();
10228
10229 lock.Lock();
10230 completion_map.erase(cur_ofs);
10231
10232 if (completion_map.empty()) {
10233 *done = true;
10234 }
10235 lock.Unlock();
10236
10237 c->release();
10238
10239 return r;
10240 }
10241
10242 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10243 Mutex::Locker l(lock);
10244
10245 const auto& io_iter = io_map.insert(
10246 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10247
10248 assert(io_iter.second); // assert new insertion
10249
10250 get_obj_io& io = (io_iter.first)->second;
10251 *pbl = &io.bl;
10252
10253 struct get_obj_aio_data aio;
10254 aio.ofs = ofs;
10255 aio.len = len;
10256 aio.op_data = this;
10257
10258 aio_data.push_back(aio);
10259
10260 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10261
10262 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10263 completion_map[ofs] = c;
10264
10265 *pc = c;
10266
10267 /* we have a reference per IO, plus one reference for the calling function.
10268 * reference is dropped for each callback, plus when we're done iterating
10269 * over the parts */
10270 get();
10271 }
10272
10273 void cancel_io(off_t ofs) {
10274 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10275 lock.Lock();
10276 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10277 if (iter != completion_map.end()) {
10278 AioCompletion *c = iter->second;
10279 c->release();
10280 completion_map.erase(ofs);
10281 io_map.erase(ofs);
10282 }
10283 lock.Unlock();
10284
10285 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10286 * need IoCtx to live, as io callback may still be called
10287 */
10288 }
10289
10290 void cancel_all_io() {
10291 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10292 Mutex::Locker l(lock);
10293 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10294 iter != completion_map.end(); ++iter) {
10295 librados::AioCompletion *c = iter->second;
10296 c->release();
10297 }
10298 }
10299
10300 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10301 Mutex::Locker l(lock);
10302
10303 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10304
10305 if (liter == io_map.end() ||
10306 liter->first != ofs) {
10307 return 0;
10308 }
10309
10310 map<off_t, librados::AioCompletion *>::iterator aiter;
10311 aiter = completion_map.find(ofs);
10312 if (aiter == completion_map.end()) {
10313 /* completion map does not hold this io, it was cancelled */
10314 return 0;
10315 }
10316
10317 AioCompletion *completion = aiter->second;
10318 int r = completion->get_return_value();
10319 if (r < 0)
10320 return r;
10321
10322 for (; aiter != completion_map.end(); ++aiter) {
10323 completion = aiter->second;
10324 if (!completion->is_safe()) {
10325 /* reached a request that is not yet complete, stop */
10326 break;
10327 }
10328
10329 r = completion->get_return_value();
10330 if (r < 0) {
10331 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10332 return r;
10333 }
10334
10335 total_read += r;
10336
10337 map<off_t, get_obj_io>::iterator old_liter = liter++;
10338 bl_list.push_back(old_liter->second.bl);
10339 io_map.erase(old_liter);
10340 }
10341
10342 return 0;
10343 }
10344 };
10345
10346 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10347 {
10348 struct get_obj_data *d = (struct get_obj_data *)arg;
10349
10350 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10351 }
10352
10353 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10354 {
10355 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10356 struct get_obj_data *d = aio_data->op_data;
10357
10358 d->rados->get_obj_aio_completion_cb(cb, arg);
10359 }
10360
10361
10362 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10363 {
10364 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10365 struct get_obj_data *d = aio_data->op_data;
10366 off_t ofs = aio_data->ofs;
10367 off_t len = aio_data->len;
10368
10369 list<bufferlist> bl_list;
10370 list<bufferlist>::iterator iter;
10371 int r;
10372
10373 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10374 d->throttle.put(len);
10375
10376 r = rados_aio_get_return_value(c);
10377 if (r < 0) {
10378 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10379 d->set_cancelled(r);
10380 goto done;
10381 }
10382
10383 if (d->is_cancelled()) {
10384 goto done;
10385 }
10386
10387 d->data_lock.Lock();
10388
10389 r = d->get_complete_ios(ofs, bl_list);
10390 if (r < 0) {
10391 goto done_unlock;
10392 }
10393
10394 d->read_list.splice(d->read_list.end(), bl_list);
10395
10396 done_unlock:
10397 d->data_lock.Unlock();
10398 done:
10399 d->put();
10400 return;
10401 }
10402
10403 int RGWRados::flush_read_list(struct get_obj_data *d)
10404 {
10405 d->data_lock.Lock();
10406 list<bufferlist> l;
10407 l.swap(d->read_list);
10408 d->get();
10409 d->read_list.clear();
10410
10411 d->data_lock.Unlock();
10412
10413 int r = 0;
10414
10415 list<bufferlist>::iterator iter;
10416 for (iter = l.begin(); iter != l.end(); ++iter) {
10417 bufferlist& bl = *iter;
10418 r = d->client_cb->handle_data(bl, 0, bl.length());
10419 if (r < 0) {
10420 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10421 break;
10422 }
10423 }
10424
10425 d->data_lock.Lock();
10426 d->put();
10427 if (r < 0) {
10428 d->set_cancelled(r);
10429 }
10430 d->data_lock.Unlock();
10431 return r;
10432 }
10433
10434 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10435 const RGWBucketInfo& bucket_info,
10436 const rgw_obj& obj,
10437 const rgw_raw_obj& read_obj,
10438 off_t obj_ofs,
10439 off_t read_ofs, off_t len,
10440 bool is_head_obj, void *arg)
10441 {
10442 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10443 ObjectReadOperation op;
10444 struct get_obj_data *d = (struct get_obj_data *)arg;
10445 string oid, key;
10446 bufferlist *pbl;
10447 AioCompletion *c;
10448
10449 int r;
10450
10451 if (is_head_obj) {
10452 /* only when reading from the head object do we need to do the atomic test */
10453 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10454 if (r < 0)
10455 return r;
10456
10457 if (astate &&
10458 obj_ofs < astate->data.length()) {
10459 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10460
10461 d->data_lock.Lock();
10462 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10463 d->data_lock.Unlock();
10464 if (r < 0)
10465 return r;
10466
10467 d->lock.Lock();
10468 d->total_read += chunk_len;
10469 d->lock.Unlock();
10470
10471 len -= chunk_len;
10472 read_ofs += chunk_len;
10473 obj_ofs += chunk_len;
10474 if (!len)
10475 return 0;
10476 }
10477 }
10478
10479 d->throttle.get(len);
10480 if (d->is_cancelled()) {
10481 return d->get_err_code();
10482 }
10483
10484 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10485 * cleaning up
10486 */
10487 d->add_io(obj_ofs, len, &pbl, &c);
10488
10489 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10490 op.read(read_ofs, len, pbl, NULL);
10491
10492 librados::IoCtx io_ctx(d->io_ctx);
10493 io_ctx.locator_set_key(read_obj.loc);
10494
10495 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10496 if (r < 0) {
10497 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10498 goto done_err;
10499 }
10500
10501 // Flush data to client if there is any
10502 r = flush_read_list(d);
10503 if (r < 0)
10504 return r;
10505
10506 return 0;
10507
10508 done_err:
10509 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10510 d->set_cancelled(r);
10511 d->cancel_io(obj_ofs);
10512
10513 return r;
10514 }
10515
10516 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10517 {
10518 RGWRados *store = source->get_store();
10519 CephContext *cct = store->ctx();
10520
10521 struct get_obj_data *data = new get_obj_data(cct);
10522 bool done = false;
10523
10524 RGWObjectCtx& obj_ctx = source->get_ctx();
10525
10526 data->rados = store;
10527 data->io_ctx.dup(state.io_ctx);
10528 data->client_cb = cb;
10529
10530 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10531 if (r < 0) {
10532 data->cancel_all_io();
10533 goto done;
10534 }
10535
10536 while (!done) {
10537 r = data->wait_next_io(&done);
10538 if (r < 0) {
10539 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10540 data->cancel_all_io();
10541 break;
10542 }
10543 r = store->flush_read_list(data);
10544 if (r < 0) {
10545 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10546 data->cancel_all_io();
10547 break;
10548 }
10549 }
10550
10551 done:
10552 data->put();
10553 return r;
10554 }
10555
10556 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10557 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10558 off_t ofs, off_t end,
10559 uint64_t max_chunk_size,
10560 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10561 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10562 RGWObjState *, void *),
10563 void *arg)
10564 {
10565 rgw_raw_obj head_obj;
10566 rgw_raw_obj read_obj;
10567 uint64_t read_ofs = ofs;
10568 uint64_t len;
10569 bool reading_from_head = true;
10570 RGWObjState *astate = NULL;
10571
10572 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10573
10574 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10575 if (r < 0) {
10576 return r;
10577 }
10578
10579 if (end < 0)
10580 len = 0;
10581 else
10582 len = end - ofs + 1;
10583
10584 if (astate->has_manifest) {
10585 /* now get the relevant object stripe */
10586 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10587
10588 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10589
10590 for (; iter != obj_end && ofs <= end; ++iter) {
10591 off_t stripe_ofs = iter.get_stripe_ofs();
10592 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10593
10594 while (ofs < next_stripe_ofs && ofs <= end) {
10595 read_obj = iter.get_location().get_raw_obj(this);
10596 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10597 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10598
10599 if (read_len > max_chunk_size) {
10600 read_len = max_chunk_size;
10601 }
10602
10603 reading_from_head = (read_obj == head_obj);
10604 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10605 if (r < 0) {
10606 return r;
10607 }
10608
10609 len -= read_len;
10610 ofs += read_len;
10611 }
10612 }
10613 } else {
10614 while (ofs <= end) {
10615 read_obj = head_obj;
10616 uint64_t read_len = min(len, max_chunk_size);
10617
10618 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10619 if (r < 0) {
10620 return r;
10621 }
10622
10623 len -= read_len;
10624 ofs += read_len;
10625 }
10626 }
10627
10628 return 0;
10629 }
10630
10631 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10632 {
10633 rgw_rados_ref ref;
10634 int r = get_obj_head_ref(bucket_info, obj, &ref);
10635 if (r < 0) {
10636 return r;
10637 }
10638
10639 return ref.ioctx.operate(ref.oid, op);
10640 }
10641
10642 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10643 {
10644 rgw_rados_ref ref;
10645 int r = get_obj_head_ref(bucket_info, obj, &ref);
10646 if (r < 0) {
10647 return r;
10648 }
10649
10650 bufferlist outbl;
10651
10652 return ref.ioctx.operate(ref.oid, op, &outbl);
10653 }
10654
10655 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10656 {
10657 ObjectWriteOperation op;
10658
10659 assert(olh_obj.key.instance.empty());
10660
10661 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10662
10663 if (!state.exists) {
10664 op.create(true);
10665 } else {
10666 op.assert_exists();
10667 }
10668
10669 /*
10670 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10671 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10672 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10673 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10674 * log will reflect that.
10675 *
10676 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10677 * is used for object data instance, olh_tag for olh instance.
10678 */
10679 if (has_tag) {
10680 /* guard against racing writes */
10681 bucket_index_guard_olh_op(state, op);
10682 }
10683
10684 if (!has_tag) {
10685 /* obj tag */
10686 string obj_tag;
10687 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10688 if (ret < 0) {
10689 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10690 return ret;
10691 }
10692 bufferlist bl;
10693 bl.append(obj_tag.c_str(), obj_tag.size());
10694 op.setxattr(RGW_ATTR_ID_TAG, bl);
10695
10696 state.attrset[RGW_ATTR_ID_TAG] = bl;
10697 state.obj_tag = bl;
10698
10699 /* olh tag */
10700 string olh_tag;
10701 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10702 if (ret < 0) {
10703 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10704 return ret;
10705 }
10706 bufferlist olh_bl;
10707 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10708 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10709
10710 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10711 state.olh_tag = olh_bl;
10712 state.is_olh = true;
10713
10714 bufferlist verbl;
10715 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10716 }
10717
10718 bufferlist bl;
10719 RGWOLHPendingInfo pending_info;
10720 pending_info.time = real_clock::now();
10721 ::encode(pending_info, bl);
10722
10723 #define OLH_PENDING_TAG_LEN 32
10724 /* tag will start with current time epoch, this so that entries are sorted by time */
10725 char buf[32];
10726 utime_t ut(pending_info.time);
10727 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10728 *op_tag = buf;
10729
10730 string s;
10731 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10732 if (ret < 0) {
10733 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10734 return ret;
10735 }
10736 op_tag->append(s);
10737
10738 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10739 attr_name.append(*op_tag);
10740
10741 op.setxattr(attr_name.c_str(), bl);
10742
10743 ret = obj_operate(bucket_info, olh_obj, &op);
10744 if (ret < 0) {
10745 return ret;
10746 }
10747
10748 state.exists = true;
10749 state.attrset[attr_name] = bl;
10750
10751 return 0;
10752 }
10753
10754 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
10755 {
10756 int ret;
10757
10758 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
10759 if (ret == -EEXIST) {
10760 ret = -ECANCELED;
10761 }
10762
10763 return ret;
10764 }
10765
10766 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
10767 {
10768 rgw_obj obj;
10769 const rgw_obj *pobj = &obj_instance;
10770 int r;
10771
10772 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10773 r = bs->init(pobj->bucket, *pobj);
10774 if (r < 0) {
10775 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
10776 return r;
10777 }
10778 r = call(bs);
10779 if (r != -ERR_BUSY_RESHARDING) {
10780 break;
10781 }
10782 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10783 string new_bucket_id;
10784 r = block_while_resharding(bs, &new_bucket_id);
10785 if (r == -ERR_BUSY_RESHARDING) {
10786 continue;
10787 }
10788 if (r < 0) {
10789 return r;
10790 }
10791 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10792 i = 0; /* resharding is finished, make sure we can retry */
10793
10794 obj = *pobj;
10795 obj.bucket.update_bucket_id(new_bucket_id);
10796 pobj = &obj;
10797 }
10798
10799 if (r < 0) {
10800 return r;
10801 }
10802
10803 return 0;
10804 }
10805
10806 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
10807 {
10808 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
10809
10810 return waiter->block_while_resharding(bs, new_bucket_id);
10811 }
10812
10813 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
10814 bool delete_marker,
10815 const string& op_tag,
10816 struct rgw_bucket_dir_entry_meta *meta,
10817 uint64_t olh_epoch,
10818 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
10819 {
10820 rgw_rados_ref ref;
10821 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10822 if (r < 0) {
10823 return r;
10824 }
10825
10826 rgw_zone_set zones_trace;
10827 if (_zones_trace) {
10828 zones_trace = *_zones_trace;
10829 } else {
10830 zones_trace.insert(get_zone().id);
10831 }
10832
10833 BucketShard bs(this);
10834
10835 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10836 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10837 librados::ObjectWriteOperation op;
10838 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10839 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
10840 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
10841 unmod_since, high_precision_time,
10842 get_zone().log_data, zones_trace);
10843 });
10844 if (r < 0) {
10845 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
10846 return r;
10847 }
10848
10849 return 0;
10850 }
10851
10852 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
10853 {
10854 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
10855 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
10856 }
10857
10858 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
10859 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
10860 {
10861 rgw_rados_ref ref;
10862 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10863 if (r < 0) {
10864 return r;
10865 }
10866
10867 rgw_zone_set zones_trace;
10868 if (_zones_trace) {
10869 zones_trace = *_zones_trace;
10870 }
10871 zones_trace.insert(get_zone().id);
10872
10873 BucketShard bs(this);
10874
10875 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
10876 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10877 librados::ObjectWriteOperation op;
10878 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10879 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
10880 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
10881 });
10882 if (r < 0) {
10883 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
10884 return r;
10885 }
10886
10887 return 0;
10888 }
10889
10890 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
10891 const rgw_obj& obj_instance, uint64_t ver_marker,
10892 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
10893 bool *is_truncated)
10894 {
10895 rgw_rados_ref ref;
10896 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10897 if (r < 0) {
10898 return r;
10899 }
10900
10901 BucketShard bs(this);
10902 int ret = bs.init(obj_instance.bucket, obj_instance);
10903 if (ret < 0) {
10904 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10905 return ret;
10906 }
10907
10908 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10909
10910 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10911
10912 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
10913 ObjectReadOperation op;
10914 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10915 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
10916 key, ver_marker, olh_tag, log, is_truncated);
10917 });
10918 if (ret < 0) {
10919 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
10920 return ret;
10921 }
10922
10923 return 0;
10924 }
10925
10926 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
10927 {
10928 rgw_rados_ref ref;
10929 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10930 if (r < 0) {
10931 return r;
10932 }
10933
10934 BucketShard bs(this);
10935 int ret = bs.init(obj_instance.bucket, obj_instance);
10936 if (ret < 0) {
10937 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
10938 return ret;
10939 }
10940
10941 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10942
10943 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10944
10945 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
10946 ObjectWriteOperation op;
10947 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10948 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
10949 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
10950 });
10951 if (ret < 0) {
10952 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
10953 return ret;
10954 }
10955
10956 return 0;
10957 }
10958
10959 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
10960 {
10961 rgw_rados_ref ref;
10962 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
10963 if (r < 0) {
10964 return r;
10965 }
10966
10967 BucketShard bs(this);
10968
10969 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
10970
10971 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
10972
10973 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
10974 ObjectWriteOperation op;
10975 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
10976 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
10977 });
10978 if (ret < 0) {
10979 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
10980 return ret;
10981 }
10982
10983 return 0;
10984 }
10985
10986 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10987 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
10988 uint64_t *plast_ver, rgw_zone_set* zones_trace)
10989 {
10990 if (log.empty()) {
10991 return 0;
10992 }
10993
10994 librados::ObjectWriteOperation op;
10995
10996 uint64_t last_ver = log.rbegin()->first;
10997 *plast_ver = last_ver;
10998
10999 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11000
11001 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11002 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11003
11004 bool need_to_link = false;
11005 cls_rgw_obj_key key;
11006 bool delete_marker = false;
11007 list<cls_rgw_obj_key> remove_instances;
11008 bool need_to_remove = false;
11009
11010 for (iter = log.begin(); iter != log.end(); ++iter) {
11011 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11012 for (; viter != iter->second.end(); ++viter) {
11013 rgw_bucket_olh_log_entry& entry = *viter;
11014
11015 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11016 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11017 << (entry.delete_marker ? "(delete)" : "") << dendl;
11018 switch (entry.op) {
11019 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11020 remove_instances.push_back(entry.key);
11021 break;
11022 case CLS_RGW_OLH_OP_LINK_OLH:
11023 need_to_link = true;
11024 need_to_remove = false;
11025 key = entry.key;
11026 delete_marker = entry.delete_marker;
11027 break;
11028 case CLS_RGW_OLH_OP_UNLINK_OLH:
11029 need_to_remove = true;
11030 need_to_link = false;
11031 break;
11032 default:
11033 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11034 return -EIO;
11035 }
11036 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11037 attr_name.append(entry.op_tag);
11038 op.rmxattr(attr_name.c_str());
11039 }
11040 }
11041
11042 rgw_rados_ref ref;
11043 int r = get_obj_head_ref(bucket_info, obj, &ref);
11044 if (r < 0) {
11045 return r;
11046 }
11047
11048 const rgw_bucket& bucket = obj.bucket;
11049
11050 if (need_to_link) {
11051 rgw_obj target(bucket, key);
11052 RGWOLHInfo info;
11053 info.target = target;
11054 info.removed = delete_marker;
11055 bufferlist bl;
11056 ::encode(info, bl);
11057 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11058 }
11059
11060 /* first remove object instances */
11061 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11062 liter != remove_instances.end(); ++liter) {
11063 cls_rgw_obj_key& key = *liter;
11064 rgw_obj obj_instance(bucket, key);
11065 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11066 if (ret < 0 && ret != -ENOENT) {
11067 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11068 return ret;
11069 }
11070 }
11071
11072 /* update olh object */
11073 r = ref.ioctx.operate(ref.oid, &op);
11074 if (r == -ECANCELED) {
11075 r = 0;
11076 }
11077 if (r < 0) {
11078 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11079 return r;
11080 }
11081
11082 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11083 if (r < 0) {
11084 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11085 return r;
11086 }
11087
11088 if (need_to_remove) {
11089 ObjectWriteOperation rm_op;
11090
11091 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11092 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11093 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11094 rm_op.remove();
11095
11096 r = ref.ioctx.operate(ref.oid, &rm_op);
11097 if (r == -ECANCELED) {
11098 return 0; /* someone else won this race */
11099 } else {
11100 /*
11101 * only clear if was successful, otherwise we might clobber pending operations on this object
11102 */
11103 r = bucket_index_clear_olh(bucket_info, state, obj);
11104 if (r < 0) {
11105 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11106 return r;
11107 }
11108 }
11109 }
11110
11111 return 0;
11112 }
11113
11114 /*
11115 * read olh log and apply it
11116 */
11117 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11118 {
11119 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11120 bool is_truncated;
11121 uint64_t ver_marker = 0;
11122
11123 do {
11124 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11125 if (ret < 0) {
11126 return ret;
11127 }
11128 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11129 if (ret < 0) {
11130 return ret;
11131 }
11132 } while (is_truncated);
11133
11134 return 0;
11135 }
11136
11137 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11138 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11139 {
11140 string op_tag;
11141
11142 rgw_obj olh_obj = target_obj;
11143 olh_obj.key.instance.clear();
11144
11145 RGWObjState *state = NULL;
11146
11147 int ret = 0;
11148 int i;
11149
11150 #define MAX_ECANCELED_RETRY 100
11151 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11152 if (ret == -ECANCELED) {
11153 obj_ctx.obj.invalidate(olh_obj);
11154 }
11155
11156 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11157 if (ret < 0) {
11158 return ret;
11159 }
11160
11161 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11162 if (ret < 0) {
11163 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11164 if (ret == -ECANCELED) {
11165 continue;
11166 }
11167 return ret;
11168 }
11169 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11170 if (ret < 0) {
11171 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11172 if (ret == -ECANCELED) {
11173 continue;
11174 }
11175 return ret;
11176 }
11177 break;
11178 }
11179
11180 if (i == MAX_ECANCELED_RETRY) {
11181 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11182 return -EIO;
11183 }
11184
11185 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11186 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11187 ret = 0;
11188 }
11189 if (ret < 0) {
11190 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11191 return ret;
11192 }
11193
11194 return 0;
11195 }
11196
11197 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11198 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11199 {
11200 string op_tag;
11201
11202 rgw_obj olh_obj = target_obj;
11203 olh_obj.key.instance.clear();
11204
11205 RGWObjState *state = NULL;
11206
11207 int ret = 0;
11208 int i;
11209
11210 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11211 if (ret == -ECANCELED) {
11212 obj_ctx.obj.invalidate(olh_obj);
11213 }
11214
11215 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11216 if (ret < 0)
11217 return ret;
11218
11219 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11220 if (ret < 0) {
11221 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11222 if (ret == -ECANCELED) {
11223 continue;
11224 }
11225 return ret;
11226 }
11227
11228 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11229
11230 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11231 if (ret < 0) {
11232 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11233 if (ret == -ECANCELED) {
11234 continue;
11235 }
11236 return ret;
11237 }
11238 break;
11239 }
11240
11241 if (i == MAX_ECANCELED_RETRY) {
11242 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11243 return -EIO;
11244 }
11245
11246 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11247 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11248 return 0;
11249 }
11250 if (ret < 0) {
11251 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11252 return ret;
11253 }
11254
11255 return 0;
11256 }
11257
11258 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11259 {
11260 #define OBJ_INSTANCE_LEN 32
11261 char buf[OBJ_INSTANCE_LEN + 1];
11262
11263 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11264 no underscore for instance name due to the way we encode the raw keys */
11265
11266 target_obj->key.set_instance(buf);
11267 }
11268
11269 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11270 map<string, bufferlist> *attrset)
11271 {
11272 attrset->clear();
11273 map<string, bufferlist>::iterator iter;
11274 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11275 iter != unfiltered_attrset.end(); ++iter) {
11276 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11277 break;
11278 (*attrset)[iter->first] = iter->second;
11279 }
11280 }
11281
11282 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11283 {
11284 map<string, bufferlist> unfiltered_attrset;
11285
11286 ObjectReadOperation op;
11287 op.getxattrs(&unfiltered_attrset, NULL);
11288
11289 bufferlist outbl;
11290 int r = obj_operate(bucket_info, obj, &op);
11291
11292 if (r < 0) {
11293 return r;
11294 }
11295 map<string, bufferlist> attrset;
11296
11297 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11298
11299 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11300 if (iter == attrset.end()) { /* not an olh */
11301 return -EINVAL;
11302 }
11303
11304 try {
11305 bufferlist::iterator biter = iter->second.begin();
11306 ::decode(*olh, biter);
11307 } catch (buffer::error& err) {
11308 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11309 return -EIO;
11310 }
11311
11312 return 0;
11313 }
11314
11315 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11316 map<string, bufferlist> *rm_pending_entries)
11317 {
11318 map<string, bufferlist>::iterator iter = pending_entries.begin();
11319
11320 real_time now = real_clock::now();
11321
11322 while (iter != pending_entries.end()) {
11323 bufferlist::iterator biter = iter->second.begin();
11324 RGWOLHPendingInfo pending_info;
11325 try {
11326 ::decode(pending_info, biter);
11327 } catch (buffer::error& err) {
11328 /* skipping bad entry, we could remove it but it might hide a bug */
11329 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11330 ++iter;
11331 continue;
11332 }
11333
11334 map<string, bufferlist>::iterator cur_iter = iter;
11335 ++iter;
11336 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11337 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11338 pending_entries.erase(cur_iter);
11339 } else {
11340 /* entries names are sorted by time (rounded to a second) */
11341 break;
11342 }
11343 }
11344 }
11345
11346 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11347 {
11348 ObjectWriteOperation op;
11349
11350 bucket_index_guard_olh_op(state, op);
11351
11352 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11353 op.rmxattr(iter->first.c_str());
11354 }
11355
11356 rgw_rados_ref ref;
11357 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11358 if (r < 0) {
11359 return r;
11360 }
11361
11362 /* update olh object */
11363 r = ref.ioctx.operate(ref.oid, &op);
11364 if (r == -ENOENT || r == -ECANCELED) {
11365 /* raced with some other change, shouldn't sweat about it */
11366 r = 0;
11367 }
11368 if (r < 0) {
11369 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11370 return r;
11371 }
11372
11373 return 0;
11374 }
11375
11376 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11377 {
11378 map<string, bufferlist> pending_entries;
11379 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11380
11381 map<string, bufferlist> rm_pending_entries;
11382 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11383
11384 if (!rm_pending_entries.empty()) {
11385 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11386 if (ret < 0) {
11387 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11388 return ret;
11389 }
11390 }
11391 if (!pending_entries.empty()) {
11392 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11393
11394 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11395 if (ret < 0) {
11396 return ret;
11397 }
11398 }
11399
11400 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11401 assert(iter != state->attrset.end());
11402 RGWOLHInfo olh;
11403 try {
11404 bufferlist::iterator biter = iter->second.begin();
11405 ::decode(olh, biter);
11406 } catch (buffer::error& err) {
11407 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11408 return -EIO;
11409 }
11410
11411 if (olh.removed) {
11412 return -ENOENT;
11413 }
11414
11415 *target = olh.target;
11416
11417 return 0;
11418 }
11419
11420 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11421 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11422 RGWObjVersionTracker *objv_tracker)
11423 {
11424 rgw_rados_ref ref;
11425 int r = get_raw_obj_ref(obj, &ref);
11426 if (r < 0) {
11427 return r;
11428 }
11429
11430 map<string, bufferlist> unfiltered_attrset;
11431 uint64_t size = 0;
11432 struct timespec mtime_ts;
11433
11434 ObjectReadOperation op;
11435 if (objv_tracker) {
11436 objv_tracker->prepare_op_for_read(&op);
11437 }
11438 if (attrs) {
11439 op.getxattrs(&unfiltered_attrset, NULL);
11440 }
11441 if (psize || pmtime) {
11442 op.stat2(&size, &mtime_ts, NULL);
11443 }
11444 if (first_chunk) {
11445 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11446 }
11447 bufferlist outbl;
11448 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11449
11450 if (epoch) {
11451 *epoch = ref.ioctx.get_last_version();
11452 }
11453
11454 if (r < 0)
11455 return r;
11456
11457 if (psize)
11458 *psize = size;
11459 if (pmtime)
11460 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11461 if (attrs) {
11462 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11463 }
11464
11465 return 0;
11466 }
11467
11468 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11469 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker)
11470 {
11471 map<string, rgw_bucket_dir_header> headers;
11472 map<int, string> bucket_instance_ids;
11473 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11474 if (r < 0) {
11475 return r;
11476 }
11477
11478 assert(headers.size() == bucket_instance_ids.size());
11479
11480 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11481 map<int, string>::iterator viter = bucket_instance_ids.begin();
11482 BucketIndexShardsManager ver_mgr;
11483 BucketIndexShardsManager master_ver_mgr;
11484 BucketIndexShardsManager marker_mgr;
11485 string shard_marker;
11486 char buf[64];
11487 for(; iter != headers.end(); ++iter, ++viter) {
11488 accumulate_raw_stats(iter->second, stats);
11489 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11490 ver_mgr.add(viter->first, string(buf));
11491 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11492 master_ver_mgr.add(viter->first, string(buf));
11493 if (shard_id >= 0) {
11494 *max_marker = iter->second.max_marker;
11495 } else {
11496 marker_mgr.add(viter->first, iter->second.max_marker);
11497 }
11498 }
11499 ver_mgr.to_string(bucket_ver);
11500 master_ver_mgr.to_string(master_ver);
11501 if (shard_id < 0) {
11502 marker_mgr.to_string(max_marker);
11503 }
11504 return 0;
11505 }
11506
11507 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11508 map<int, string>& markers)
11509 {
11510 map<string, rgw_bucket_dir_header> headers;
11511 map<int, string> bucket_instance_ids;
11512 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11513 if (r < 0)
11514 return r;
11515
11516 assert(headers.size() == bucket_instance_ids.size());
11517
11518 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11519 map<int, string>::iterator viter = bucket_instance_ids.begin();
11520
11521 for(; iter != headers.end(); ++iter, ++viter) {
11522 if (shard_id >= 0) {
11523 markers[shard_id] = iter->second.max_marker;
11524 } else {
11525 markers[viter->first] = iter->second.max_marker;
11526 }
11527 }
11528 return 0;
11529 }
11530
11531 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11532 RGWGetBucketStats_CB *cb;
11533 uint32_t pendings;
11534 map<RGWObjCategory, RGWStorageStats> stats;
11535 int ret_code;
11536 bool should_cb;
11537 Mutex lock;
11538
11539 public:
11540 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11541 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11542 lock("RGWGetBucketStatsContext") {}
11543
11544 void handle_response(int r, rgw_bucket_dir_header& header) override {
11545 Mutex::Locker l(lock);
11546 if (should_cb) {
11547 if ( r >= 0) {
11548 accumulate_raw_stats(header, stats);
11549 } else {
11550 ret_code = r;
11551 }
11552
11553 // Are we all done?
11554 if (--pendings == 0) {
11555 if (!ret_code) {
11556 cb->set_response(&stats);
11557 }
11558 cb->handle_response(ret_code);
11559 cb->put();
11560 }
11561 }
11562 }
11563
11564 void unset_cb() {
11565 Mutex::Locker l(lock);
11566 should_cb = false;
11567 }
11568 };
11569
11570 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11571 {
11572 int num_aio = 0;
11573 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards);
11574 assert(get_ctx);
11575 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11576 get_ctx->put();
11577 if (r < 0) {
11578 ctx->put();
11579 if (num_aio) {
11580 get_ctx->unset_cb();
11581 }
11582 }
11583 return r;
11584 }
11585
11586 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11587 RGWGetUserStats_CB *cb;
11588
11589 public:
11590 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11591 : cb(cb) {}
11592
11593 void handle_response(int r, cls_user_header& header) override {
11594 const cls_user_stats& hs = header.stats;
11595 if (r >= 0) {
11596 RGWStorageStats stats;
11597
11598 stats.size = hs.total_bytes;
11599 stats.size_rounded = hs.total_bytes_rounded;
11600 stats.num_objects = hs.total_entries;
11601
11602 cb->set_response(stats);
11603 }
11604
11605 cb->handle_response(r);
11606
11607 cb->put();
11608 }
11609 };
11610
11611 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11612 {
11613 string user_str = user.to_str();
11614
11615 cls_user_header header;
11616 int r = cls_user_get_header(user_str, &header);
11617 if (r < 0)
11618 return r;
11619
11620 const cls_user_stats& hs = header.stats;
11621
11622 stats.size = hs.total_bytes;
11623 stats.size_rounded = hs.total_bytes_rounded;
11624 stats.num_objects = hs.total_entries;
11625
11626 return 0;
11627 }
11628
11629 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11630 {
11631 string user_str = user.to_str();
11632
11633 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11634 int r = cls_user_get_header_async(user_str, get_ctx);
11635 if (r < 0) {
11636 ctx->put();
11637 delete get_ctx;
11638 return r;
11639 }
11640
11641 return 0;
11642 }
11643
11644 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11645 {
11646 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11647 }
11648
11649 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11650 {
11651 if (!bucket.oid.empty()) {
11652 obj.init(get_zone_params().domain_root, bucket.oid);
11653 } else {
11654 string oid;
11655 get_bucket_meta_oid(bucket, oid);
11656 obj.init(get_zone_params().domain_root, oid);
11657 }
11658 }
11659
11660 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11661 real_time *pmtime, map<string, bufferlist> *pattrs)
11662 {
11663 size_t pos = meta_key.find(':');
11664 if (pos == string::npos) {
11665 return -EINVAL;
11666 }
11667 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11668 rgw_bucket_instance_key_to_oid(oid);
11669
11670 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11671 }
11672
11673 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11674 real_time *pmtime, map<string, bufferlist> *pattrs)
11675 {
11676 string oid;
11677 if (bucket.oid.empty()) {
11678 get_bucket_meta_oid(bucket, oid);
11679 } else {
11680 oid = bucket.oid;
11681 }
11682
11683 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11684 }
11685
11686 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
11687 real_time *pmtime, map<string, bufferlist> *pattrs,
11688 rgw_cache_entry_info *cache_info)
11689 {
11690 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11691
11692 bufferlist epbl;
11693
11694 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, oid, epbl, &info.objv_tracker, pmtime, pattrs, cache_info);
11695 if (ret < 0) {
11696 return ret;
11697 }
11698
11699 bufferlist::iterator iter = epbl.begin();
11700 try {
11701 ::decode(info, iter);
11702 } catch (buffer::error& err) {
11703 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11704 return -EIO;
11705 }
11706 info.bucket.oid = oid;
11707 return 0;
11708 }
11709
11710 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11711 const string& tenant_name,
11712 const string& bucket_name,
11713 RGWBucketEntryPoint& entry_point,
11714 RGWObjVersionTracker *objv_tracker,
11715 real_time *pmtime,
11716 map<string, bufferlist> *pattrs,
11717 rgw_cache_entry_info *cache_info)
11718 {
11719 bufferlist bl;
11720 string bucket_entry;
11721
11722 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11723 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, bucket_entry, bl, objv_tracker, pmtime, pattrs, cache_info);
11724 if (ret < 0) {
11725 return ret;
11726 }
11727
11728 bufferlist::iterator iter = bl.begin();
11729 try {
11730 ::decode(entry_point, iter);
11731 } catch (buffer::error& err) {
11732 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11733 return -EIO;
11734 }
11735 return 0;
11736 }
11737
11738 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11739 const string& tenant_name,
11740 const string& bucket_name)
11741 {
11742 RGWBucketEntryPoint entry_point;
11743 real_time ep_mtime;
11744 RGWObjVersionTracker ot;
11745 map<string, bufferlist> attrs;
11746 RGWBucketInfo info;
11747
11748 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
11749
11750 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
11751 if (ret < 0) {
11752 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
11753 return ret;
11754 }
11755
11756 if (!entry_point.has_bucket_info) {
11757 /* already converted! */
11758 return 0;
11759 }
11760
11761 info = entry_point.old_bucket_info;
11762 info.bucket.oid = bucket_name;
11763 info.ep_objv = ot.read_version;
11764
11765 ot.generate_new_write_ver(cct);
11766
11767 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
11768 if (ret < 0) {
11769 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
11770 return ret;
11771 }
11772
11773 return 0;
11774 }
11775
11776 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
11777 const string& tenant, const string& bucket_name, RGWBucketInfo& info,
11778 real_time *pmtime, map<string, bufferlist> *pattrs)
11779 {
11780 bucket_info_entry e;
11781 string bucket_entry;
11782 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
11783
11784 if (binfo_cache->find(bucket_entry, &e)) {
11785 info = e.info;
11786 if (pattrs)
11787 *pattrs = e.attrs;
11788 if (pmtime)
11789 *pmtime = e.mtime;
11790 return 0;
11791 }
11792
11793 RGWBucketEntryPoint entry_point;
11794 real_time ep_mtime;
11795 RGWObjVersionTracker ot;
11796 rgw_cache_entry_info entry_cache_info;
11797 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name, entry_point, &ot, &ep_mtime, pattrs, &entry_cache_info);
11798 if (ret < 0) {
11799 /* only init these fields */
11800 info.bucket.tenant = tenant;
11801 info.bucket.name = bucket_name;
11802 return ret;
11803 }
11804
11805 if (entry_point.has_bucket_info) {
11806 info = entry_point.old_bucket_info;
11807 info.bucket.oid = bucket_name;
11808 info.bucket.tenant = tenant;
11809 info.ep_objv = ot.read_version;
11810 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
11811 return 0;
11812 }
11813
11814 /* data is in the bucket instance object, we need to get attributes from there, clear everything
11815 * that we got
11816 */
11817 if (pattrs) {
11818 pattrs->clear();
11819 }
11820
11821 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
11822
11823
11824 /* read bucket instance info */
11825
11826 string oid;
11827 get_bucket_meta_oid(entry_point.bucket, oid);
11828
11829 rgw_cache_entry_info cache_info;
11830
11831 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs, &cache_info);
11832 e.info.ep_objv = ot.read_version;
11833 info = e.info;
11834 if (ret < 0) {
11835 info.bucket.tenant = tenant;
11836 info.bucket.name = bucket_name;
11837 // XXX and why return anything in case of an error anyway?
11838 return ret;
11839 }
11840
11841 if (pmtime)
11842 *pmtime = e.mtime;
11843 if (pattrs)
11844 *pattrs = e.attrs;
11845
11846 list<rgw_cache_entry_info *> cache_info_entries;
11847 cache_info_entries.push_back(&entry_cache_info);
11848 cache_info_entries.push_back(&cache_info);
11849
11850
11851 /* chain to both bucket entry point and bucket instance */
11852 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
11853 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
11854 }
11855
11856 return 0;
11857 }
11858
11859 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
11860 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
11861 map<string, bufferlist> *pattrs)
11862 {
11863 bufferlist epbl;
11864 ::encode(entry_point, epbl);
11865 string bucket_entry;
11866 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11867 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
11868 }
11869
11870 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
11871 real_time mtime, map<string, bufferlist> *pattrs)
11872 {
11873 info.has_instance_obj = true;
11874 bufferlist bl;
11875
11876 ::encode(info, bl);
11877
11878 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
11879 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
11880 if (ret == -EEXIST) {
11881 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
11882 * bucket operation on this specific bucket (e.g., being synced from the master), but
11883 * since bucket instace meta object is unique for this specific bucket instace, we don't
11884 * need to return an error.
11885 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
11886 * master, creating a bucket, sending bucket creation to the master, we create the bucket
11887 * locally, while in the sync thread we sync the new bucket.
11888 */
11889 ret = 0;
11890 }
11891 return ret;
11892 }
11893
11894 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
11895 map<string, bufferlist> *pattrs, bool create_entry_point)
11896 {
11897 bool create_head = !info.has_instance_obj || create_entry_point;
11898
11899 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
11900 if (ret < 0) {
11901 return ret;
11902 }
11903
11904 if (!create_head)
11905 return 0; /* done! */
11906
11907 RGWBucketEntryPoint entry_point;
11908 entry_point.bucket = info.bucket;
11909 entry_point.owner = info.owner;
11910 entry_point.creation_time = info.creation_time;
11911 entry_point.linked = true;
11912 RGWObjVersionTracker ot;
11913 if (pep_objv && !pep_objv->tag.empty()) {
11914 ot.write_version = *pep_objv;
11915 } else {
11916 ot.generate_new_write_ver(cct);
11917 if (pep_objv) {
11918 *pep_objv = ot.write_version;
11919 }
11920 }
11921 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
11922 if (ret < 0)
11923 return ret;
11924
11925 return 0;
11926 }
11927
11928 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
11929 {
11930 rgw_rados_ref ref;
11931 int r = get_raw_obj_ref(obj, &ref);
11932 if (r < 0) {
11933 return r;
11934 }
11935
11936 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
11937 if (r < 0)
11938 return r;
11939
11940 return 0;
11941
11942 }
11943
11944 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
11945 std::map<string, bufferlist>& m)
11946 {
11947 rgw_rados_ref ref;
11948 int r = get_raw_obj_ref(obj, &ref);
11949 if (r < 0) {
11950 return r;
11951 }
11952
11953 #define MAX_OMAP_GET_ENTRIES 1024
11954 const int count = MAX_OMAP_GET_ENTRIES;
11955 string start_after;
11956
11957 while (true) {
11958 std::map<string, bufferlist> t;
11959 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
11960 if (r < 0) {
11961 return r;
11962 }
11963 if (t.empty()) {
11964 break;
11965 }
11966 start_after = t.rbegin()->first;
11967 m.insert(t.begin(), t.end());
11968 }
11969 return 0;
11970 }
11971
11972 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
11973 {
11974 rgw_rados_ref ref;
11975 int r = get_raw_obj_ref(obj, &ref);
11976 if (r < 0) {
11977 return r;
11978 }
11979 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
11980
11981 map<string, bufferlist> m;
11982 m[key] = bl;
11983
11984 r = ref.ioctx.omap_set(ref.oid, m);
11985
11986 return r;
11987 }
11988
11989 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
11990 {
11991 rgw_rados_ref ref;
11992 int r = get_raw_obj_ref(obj, &ref);
11993 if (r < 0) {
11994 return r;
11995 }
11996
11997 r = ref.ioctx.omap_set(ref.oid, m);
11998
11999 return r;
12000 }
12001
12002 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12003 {
12004 rgw_rados_ref ref;
12005 int r = get_raw_obj_ref(obj, &ref);
12006 if (r < 0) {
12007 return r;
12008 }
12009
12010 set<string> k;
12011 k.insert(key);
12012
12013 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12014 return r;
12015 }
12016
12017 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12018 {
12019 RGWObjectCtx obj_ctx(this);
12020
12021 map<string, RGWBucketEnt>::iterator iter;
12022 for (iter = m.begin(); iter != m.end(); ++iter) {
12023 RGWBucketEnt& ent = iter->second;
12024 rgw_bucket& bucket = ent.bucket;
12025 ent.count = 0;
12026 ent.size = 0;
12027 ent.size_rounded = 0;
12028
12029 map<string, rgw_bucket_dir_header> headers;
12030
12031 RGWBucketInfo bucket_info;
12032 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12033 if (ret < 0) {
12034 return ret;
12035 }
12036
12037 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12038 if (r < 0)
12039 return r;
12040
12041 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12042 for (; hiter != headers.end(); ++hiter) {
12043 RGWObjCategory category = main_category;
12044 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12045 if (iter != hiter->second.stats.end()) {
12046 struct rgw_bucket_category_stats& stats = iter->second;
12047 ent.count += stats.num_entries;
12048 ent.size += stats.total_size;
12049 ent.size_rounded += stats.total_size_rounded;
12050 }
12051 }
12052 }
12053
12054 return m.size();
12055 }
12056
12057 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12058 {
12059 rgw_rados_ref ref;
12060 int r = get_raw_obj_ref(obj, &ref);
12061 if (r < 0) {
12062 return r;
12063 }
12064 librados::Rados *rad = get_rados_handle();
12065 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12066
12067 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12068 completion->release();
12069 return r;
12070 }
12071
12072 int RGWRados::distribute(const string& key, bufferlist& bl)
12073 {
12074 /*
12075 * we were called before watch was initialized. This can only happen if we're updating some system
12076 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12077 * objects, they're currently only read on startup anyway.
12078 */
12079 if (!watch_initialized)
12080 return 0;
12081
12082 string notify_oid;
12083 pick_control_oid(key, notify_oid);
12084
12085 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12086 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12087 }
12088
12089 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12090 {
12091 librados::IoCtx& io_ctx = ctx.io_ctx;
12092 librados::NObjectIterator& iter = ctx.iter;
12093
12094 int r = open_pool_ctx(pool, io_ctx);
12095 if (r < 0)
12096 return r;
12097
12098 iter = io_ctx.nobjects_begin();
12099
12100 return 0;
12101 }
12102
12103 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12104 bool *is_truncated, RGWAccessListFilter *filter)
12105 {
12106 librados::IoCtx& io_ctx = ctx.io_ctx;
12107 librados::NObjectIterator& iter = ctx.iter;
12108
12109 if (iter == io_ctx.nobjects_end())
12110 return -ENOENT;
12111
12112 uint32_t i;
12113
12114 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12115 rgw_bucket_dir_entry e;
12116
12117 string oid = iter->get_oid();
12118 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12119
12120 // fill it in with initial values; we may correct later
12121 if (filter && !filter->filter(oid, oid))
12122 continue;
12123
12124 e.key = oid;
12125 objs.push_back(e);
12126 }
12127
12128 if (is_truncated)
12129 *is_truncated = (iter != io_ctx.nobjects_end());
12130
12131 return objs.size();
12132 }
12133 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12134 string prefix;
12135
12136 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12137 bool filter(string& name, string& key) override {
12138 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12139 }
12140 };
12141
12142 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12143 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12144 bool *is_truncated)
12145 {
12146 RGWAccessListFilterPrefix filter(prefix_filter);
12147
12148 if (!ctx.initialized) {
12149 int r = pool_iterate_begin(pool, ctx.iter_ctx);
12150 if (r < 0) {
12151 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12152 return r;
12153 }
12154 ctx.initialized = true;
12155 }
12156
12157 vector<rgw_bucket_dir_entry> objs;
12158 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12159 if (r < 0) {
12160 if(r != -ENOENT)
12161 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12162 return r;
12163 }
12164
12165 vector<rgw_bucket_dir_entry>::iterator iter;
12166 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12167 oids.push_back(iter->key.name);
12168 }
12169
12170 return oids.size();
12171 }
12172
12173 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12174 std::list<rgw_bi_log_entry>& result, bool *truncated)
12175 {
12176 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12177 result.clear();
12178
12179 librados::IoCtx index_ctx;
12180 map<int, string> oids;
12181 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12182 map<int, string> bucket_instance_ids;
12183 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12184 if (r < 0)
12185 return r;
12186
12187 BucketIndexShardsManager marker_mgr;
12188 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12189 // If there are multiple shards for the bucket index object, the marker
12190 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12191 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12192 // only contain one record, and the key is the bucket instance id.
12193 r = marker_mgr.from_string(marker, shard_id);
12194 if (r < 0)
12195 return r;
12196
12197 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12198 if (r < 0)
12199 return r;
12200
12201 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12202 map<int, list<rgw_bi_log_entry>::iterator> vends;
12203 if (truncated) {
12204 *truncated = false;
12205 }
12206 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12207 for (; miter != bi_log_lists.end(); ++miter) {
12208 int shard_id = miter->first;
12209 vcurrents[shard_id] = miter->second.entries.begin();
12210 vends[shard_id] = miter->second.entries.end();
12211 if (truncated) {
12212 *truncated = (*truncated || miter->second.truncated);
12213 }
12214 }
12215
12216 size_t total = 0;
12217 bool has_more = true;
12218 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12219 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12220 while (total < max && has_more) {
12221 has_more = false;
12222
12223 viter = vcurrents.begin();
12224 eiter = vends.begin();
12225
12226 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12227 assert (eiter != vends.end());
12228
12229 int shard_id = viter->first;
12230 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12231
12232 if (liter == eiter->second){
12233 continue;
12234 }
12235 rgw_bi_log_entry& entry = *(liter);
12236 if (has_shards) {
12237 char buf[16];
12238 snprintf(buf, sizeof(buf), "%d", shard_id);
12239 string tmp_id;
12240 build_bucket_index_marker(buf, entry.id, &tmp_id);
12241 entry.id.swap(tmp_id);
12242 }
12243 marker_mgr.add(shard_id, entry.id);
12244 result.push_back(entry);
12245 total++;
12246 has_more = true;
12247 ++liter;
12248 }
12249 }
12250
12251 if (truncated) {
12252 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12253 assert (eiter != vends.end());
12254 *truncated = (*truncated || (viter->second != eiter->second));
12255 }
12256 }
12257
12258 // Refresh marker, if there are multiple shards, the output will look like
12259 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12260 // if there is no sharding, the simply marker (without oid) is returned
12261 if (has_shards) {
12262 marker_mgr.to_string(&marker);
12263 } else {
12264 if (!result.empty()) {
12265 marker = result.rbegin()->id;
12266 }
12267 }
12268
12269 return 0;
12270 }
12271
12272 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12273 {
12274 librados::IoCtx index_ctx;
12275 map<int, string> bucket_objs;
12276
12277 BucketIndexShardsManager start_marker_mgr;
12278 BucketIndexShardsManager end_marker_mgr;
12279
12280 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12281 if (r < 0) {
12282 return r;
12283 }
12284
12285 r = start_marker_mgr.from_string(start_marker, shard_id);
12286 if (r < 0) {
12287 return r;
12288 }
12289
12290 r = end_marker_mgr.from_string(end_marker, shard_id);
12291 if (r < 0) {
12292 return r;
12293 }
12294
12295 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12296 cct->_conf->rgw_bucket_index_max_aio)();
12297
12298 return r;
12299 }
12300
12301 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12302 {
12303 rgw_rados_ref ref;
12304 int r = get_obj_head_ref(bucket_info, obj, &ref);
12305 if (r < 0) {
12306 return r;
12307 }
12308
12309 rgw_cls_bi_entry bi_entry;
12310 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12311 if (r < 0 && r != -ENOENT) {
12312 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12313 }
12314 if (r < 0) {
12315 return r;
12316 }
12317 bufferlist::iterator iter = bi_entry.data.begin();
12318 try {
12319 ::decode(*dirent, iter);
12320 } catch (buffer::error& err) {
12321 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12322 return -EIO;
12323 }
12324
12325 return 0;
12326 }
12327
12328 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12329 {
12330 BucketShard bs(this);
12331 int ret = bs.init(bucket, obj);
12332 if (ret < 0) {
12333 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12334 return ret;
12335 }
12336
12337 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12338
12339 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12340 if (ret < 0)
12341 return ret;
12342
12343 return 0;
12344 }
12345
12346 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12347 {
12348 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12349 }
12350
12351 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12352 {
12353 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12354 if (ret < 0)
12355 return ret;
12356
12357 return 0;
12358 }
12359
12360 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12361 {
12362 BucketShard bs(this);
12363 int ret = bs.init(bucket, obj);
12364 if (ret < 0) {
12365 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12366 return ret;
12367 }
12368
12369 return bi_put(bs, entry);
12370 }
12371
12372 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12373 {
12374 rgw_obj obj(bucket, obj_name);
12375 BucketShard bs(this);
12376 int ret = bs.init(bucket, obj);
12377 if (ret < 0) {
12378 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12379 return ret;
12380 }
12381
12382 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12383 if (ret == -ENOENT) {
12384 *is_truncated = false;
12385 }
12386 if (ret < 0)
12387 return ret;
12388
12389 return 0;
12390 }
12391
12392 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12393 {
12394 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12395 if (ret < 0)
12396 return ret;
12397
12398 return 0;
12399 }
12400
12401 int RGWRados::bi_remove(BucketShard& bs)
12402 {
12403 int ret = bs.index_ctx.remove(bs.bucket_obj);
12404 if (ret == -ENOENT) {
12405 ret = 0;
12406 }
12407 if (ret < 0) {
12408 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12409 return ret;
12410 }
12411
12412 return 0;
12413 }
12414
12415 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12416 {
12417 BucketShard bs(this);
12418 int ret = bs.init(bucket, shard_id);
12419 if (ret < 0) {
12420 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12421 return ret;
12422 }
12423
12424 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12425 }
12426
12427 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12428 {
12429 return gc_pool_ctx.operate(oid, op);
12430 }
12431
12432 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12433 {
12434 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12435 int r = gc_pool_ctx.aio_operate(oid, c, op);
12436 c->release();
12437 return r;
12438 }
12439
12440 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12441 {
12442 return gc_pool_ctx.operate(oid, op, pbl);
12443 }
12444
12445 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12446 {
12447 return gc->list(index, marker, max, expired_only, result, truncated);
12448 }
12449
12450 int RGWRados::process_gc()
12451 {
12452 return gc->process();
12453 }
12454
12455 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12456 {
12457 return lc->list_lc_progress(marker, max_entries, progress_map);
12458 }
12459
12460 int RGWRados::process_lc()
12461 {
12462 return lc->process();
12463 }
12464
12465 int RGWRados::process_expire_objects()
12466 {
12467 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12468 return 0;
12469 }
12470
12471 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12472 {
12473 bufferlist in;
12474 cls_rgw_bucket_init(op);
12475 return index_ctx.operate(oid, &op);
12476 }
12477
12478 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12479 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12480 {
12481 rgw_zone_set zones_trace;
12482 if (_zones_trace) {
12483 zones_trace = *_zones_trace;
12484 }
12485 else {
12486 zones_trace.insert(get_zone().id);
12487 }
12488
12489 ObjectWriteOperation o;
12490 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12491 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12492 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
12493 return bs.index_ctx.operate(bs.bucket_obj, &o);
12494 }
12495
12496 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
12497 int64_t pool, uint64_t epoch,
12498 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12499 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12500 {
12501 ObjectWriteOperation o;
12502 rgw_bucket_dir_entry_meta dir_meta;
12503 dir_meta = ent.meta;
12504 dir_meta.category = category;
12505
12506 rgw_bucket_entry_ver ver;
12507 ver.pool = pool;
12508 ver.epoch = epoch;
12509 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12510 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12511 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
12512 get_zone().log_data, bilog_flags, _zones_trace);
12513 complete_op_data *arg;
12514 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
12515 get_zone().log_data, bilog_flags, _zones_trace, &arg);
12516 librados::AioCompletion *completion = arg->rados_completion;
12517 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
12518 completion->release(); /* can't reference arg here, as it might have already been released */
12519 return ret;
12520 }
12521
12522 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
12523 int64_t pool, uint64_t epoch,
12524 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12525 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12526 {
12527 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
12528 }
12529
12530 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12531 int64_t pool, uint64_t epoch,
12532 rgw_obj& obj,
12533 real_time& removed_mtime,
12534 list<rgw_obj_index_key> *remove_objs,
12535 uint16_t bilog_flags,
12536 rgw_zone_set *zones_trace)
12537 {
12538 rgw_bucket_dir_entry ent;
12539 ent.meta.mtime = removed_mtime;
12540 obj.key.get_index_key(&ent.key);
12541 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
12542 }
12543
12544 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12545 {
12546 rgw_bucket_dir_entry ent;
12547 obj.key.get_index_key(&ent.key);
12548 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
12549 }
12550
12551 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12552 {
12553 librados::IoCtx index_ctx;
12554 map<int, string> bucket_objs;
12555 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12556 if (r < 0)
12557 return r;
12558
12559 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12560 }
12561
12562 int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12563 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12564 bool *is_truncated, rgw_obj_index_key *last_entry,
12565 bool (*force_check_filter)(const string& name))
12566 {
12567 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12568
12569 librados::IoCtx index_ctx;
12570 // key - oid (for different shards if there is any)
12571 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12572 map<int, string> oids;
12573 map<int, struct rgw_cls_list_ret> list_results;
12574 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12575 if (r < 0)
12576 return r;
12577
12578 cls_rgw_obj_key start_key(start.name, start.instance);
12579 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12580 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12581 if (r < 0)
12582 return r;
12583
12584 // Create a list of iterators that are used to iterate each shard
12585 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12586 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12587 vector<string> vnames(list_results.size());
12588 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12589 *is_truncated = false;
12590 for (; iter != list_results.end(); ++iter) {
12591 vcurrents.push_back(iter->second.dir.m.begin());
12592 vends.push_back(iter->second.dir.m.end());
12593 vnames.push_back(oids[iter->first]);
12594 *is_truncated = (*is_truncated || iter->second.is_truncated);
12595 }
12596
12597 // Create a map to track the next candidate entry from each shard, if the entry
12598 // from a specified shard is selected/erased, the next entry from that shard will
12599 // be inserted for next round selection
12600 map<string, size_t> candidates;
12601 for (size_t i = 0; i < vcurrents.size(); ++i) {
12602 if (vcurrents[i] != vends[i]) {
12603 candidates[vcurrents[i]->first] = i;
12604 }
12605 }
12606
12607 map<string, bufferlist> updates;
12608 uint32_t count = 0;
12609 while (count < num_entries && !candidates.empty()) {
12610 r = 0;
12611 // Select the next one
12612 int pos = candidates.begin()->second;
12613 const string& name = vcurrents[pos]->first;
12614 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12615
12616 bool force_check = force_check_filter && force_check_filter(dirent.key.name);
12617 if ((!dirent.exists && !dirent.is_delete_marker()) || !dirent.pending_map.empty() || force_check) {
12618 /* there are uncommitted ops. We need to check the current state,
12619 * and if the tags are old we need to do cleanup as well. */
12620 librados::IoCtx sub_ctx;
12621 sub_ctx.dup(index_ctx);
12622 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
12623 if (r < 0 && r != -ENOENT) {
12624 return r;
12625 }
12626 }
12627 if (r >= 0) {
12628 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
12629 m[name] = std::move(dirent);
12630 ++count;
12631 }
12632
12633 // Refresh the candidates map
12634 candidates.erase(candidates.begin());
12635 ++vcurrents[pos];
12636 if (vcurrents[pos] != vends[pos]) {
12637 candidates[vcurrents[pos]->first] = pos;
12638 }
12639 }
12640
12641 // Suggest updates if there is any
12642 map<string, bufferlist>::iterator miter = updates.begin();
12643 for (; miter != updates.end(); ++miter) {
12644 if (miter->second.length()) {
12645 ObjectWriteOperation o;
12646 cls_rgw_suggest_changes(o, miter->second);
12647 // we don't care if we lose suggested updates, send them off blindly
12648 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12649 index_ctx.aio_operate(miter->first, c, &o);
12650 c->release();
12651 }
12652 }
12653
12654 // Check if all the returned entries are consumed or not
12655 for (size_t i = 0; i < vcurrents.size(); ++i) {
12656 if (vcurrents[i] != vends[i])
12657 *is_truncated = true;
12658 }
12659 if (!m.empty())
12660 *last_entry = m.rbegin()->first;
12661
12662 return 0;
12663 }
12664
12665 int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
12666 {
12667 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12668
12669 rgw_rados_ref ref;
12670 rgw_pool pool;
12671 int r = get_raw_obj_ref(obj, &ref, &pool);
12672 if (r < 0) {
12673 return r;
12674 }
12675
12676 ObjectWriteOperation op;
12677 cls_rgw_usage_log_add(op, info);
12678
12679 r = ref.ioctx.operate(ref.oid, &op);
12680 return r;
12681 }
12682
12683 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
12684 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
12685 {
12686 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12687
12688 rgw_rados_ref ref;
12689 rgw_pool pool;
12690 int r = get_raw_obj_ref(obj, &ref, &pool);
12691 if (r < 0) {
12692 return r;
12693 }
12694
12695 *is_truncated = false;
12696
12697 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
12698 max_entries, read_iter, usage, is_truncated);
12699
12700 return r;
12701 }
12702
12703 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
12704 {
12705 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
12706
12707 rgw_rados_ref ref;
12708 rgw_pool pool;
12709 int r = get_raw_obj_ref(obj, &ref, &pool);
12710 if (r < 0) {
12711 return r;
12712 }
12713
12714 ObjectWriteOperation op;
12715 cls_rgw_usage_log_trim(op, user, start_epoch, end_epoch);
12716
12717 r = ref.ioctx.operate(ref.oid, &op);
12718 return r;
12719 }
12720
12721 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
12722 {
12723 librados::IoCtx index_ctx;
12724 string dir_oid;
12725
12726 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12727
12728 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
12729 if (r < 0)
12730 return r;
12731
12732 bufferlist updates;
12733
12734 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
12735 rgw_bucket_dir_entry entry;
12736 entry.key = *iter;
12737 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
12738 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
12739 updates.append(CEPH_RGW_REMOVE | suggest_flag);
12740 ::encode(entry, updates);
12741 }
12742
12743 bufferlist out;
12744
12745 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
12746
12747 return r;
12748 }
12749
12750 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
12751 const RGWBucketInfo& bucket_info,
12752 rgw_bucket_dir_entry& list_state,
12753 rgw_bucket_dir_entry& object,
12754 bufferlist& suggested_updates)
12755 {
12756 const rgw_bucket& bucket = bucket_info.bucket;
12757 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
12758
12759 std::string loc;
12760
12761 rgw_obj obj(bucket, list_state.key);
12762
12763 string oid;
12764 get_obj_bucket_and_oid_loc(obj, oid, loc);
12765
12766 if (loc != list_state.locator) {
12767 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
12768 }
12769
12770 io_ctx.locator_set_key(list_state.locator);
12771
12772 RGWObjState *astate = NULL;
12773 RGWObjectCtx rctx(this);
12774 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
12775 if (r < 0)
12776 return r;
12777
12778 list_state.pending_map.clear(); // we don't need this and it inflates size
12779 if (!astate->exists) {
12780 /* object doesn't exist right now -- hopefully because it's
12781 * marked as !exists and got deleted */
12782 if (list_state.exists) {
12783 /* FIXME: what should happen now? Work out if there are any
12784 * non-bad ways this could happen (there probably are, but annoying
12785 * to handle!) */
12786 }
12787 // encode a suggested removal of that key
12788 list_state.ver.epoch = io_ctx.get_last_version();
12789 list_state.ver.pool = io_ctx.get_id();
12790 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
12791 return -ENOENT;
12792 }
12793
12794 string etag;
12795 string content_type;
12796 ACLOwner owner;
12797
12798 object.meta.size = astate->size;
12799 object.meta.accounted_size = astate->accounted_size;
12800 object.meta.mtime = astate->mtime;
12801
12802 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
12803 if (iter != astate->attrset.end()) {
12804 etag = iter->second.c_str();
12805 }
12806 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
12807 if (iter != astate->attrset.end()) {
12808 content_type = iter->second.c_str();
12809 }
12810 iter = astate->attrset.find(RGW_ATTR_ACL);
12811 if (iter != astate->attrset.end()) {
12812 r = decode_policy(iter->second, &owner);
12813 if (r < 0) {
12814 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
12815 }
12816 }
12817
12818 if (astate->has_manifest) {
12819 RGWObjManifest::obj_iterator miter;
12820 RGWObjManifest& manifest = astate->manifest;
12821 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
12822 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
12823 rgw_obj loc;
12824 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
12825
12826 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
12827 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
12828 r = delete_obj_index(loc);
12829 if (r < 0) {
12830 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
12831 }
12832 }
12833 }
12834 }
12835
12836 object.meta.etag = etag;
12837 object.meta.content_type = content_type;
12838 object.meta.owner = owner.get_id().to_str();
12839 object.meta.owner_display_name = owner.get_display_name();
12840
12841 // encode suggested updates
12842 list_state.ver.pool = io_ctx.get_id();
12843 list_state.ver.epoch = astate->epoch;
12844 list_state.meta.size = object.meta.size;
12845 list_state.meta.accounted_size = object.meta.accounted_size;
12846 list_state.meta.mtime = object.meta.mtime;
12847 list_state.meta.category = main_category;
12848 list_state.meta.etag = etag;
12849 list_state.meta.content_type = content_type;
12850 if (astate->obj_tag.length() > 0)
12851 list_state.tag = astate->obj_tag.c_str();
12852 list_state.meta.owner = owner.get_id().to_str();
12853 list_state.meta.owner_display_name = owner.get_display_name();
12854
12855 list_state.exists = true;
12856 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
12857 return 0;
12858 }
12859
12860 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
12861 {
12862 librados::IoCtx index_ctx;
12863 map<int, string> oids;
12864 map<int, struct rgw_cls_list_ret> list_results;
12865 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
12866 if (r < 0)
12867 return r;
12868
12869 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12870 if (r < 0)
12871 return r;
12872
12873 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12874 for(; iter != list_results.end(); ++iter) {
12875 headers[oids[iter->first]] = iter->second.dir.header;
12876 }
12877 return 0;
12878 }
12879
12880 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
12881 {
12882 librados::IoCtx index_ctx;
12883 map<int, string> bucket_objs;
12884 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12885 if (r < 0)
12886 return r;
12887
12888 map<int, string>::iterator iter = bucket_objs.begin();
12889 for (; iter != bucket_objs.end(); ++iter) {
12890 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
12891 if (r < 0) {
12892 ctx->put();
12893 break;
12894 } else {
12895 (*num_aio)++;
12896 }
12897 }
12898 return r;
12899 }
12900
12901 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
12902 {
12903 string buckets_obj_id;
12904 rgw_get_buckets_obj(user_id, buckets_obj_id);
12905 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
12906
12907 rgw_rados_ref ref;
12908 rgw_pool pool;
12909 int r = get_raw_obj_ref(obj, &ref, &pool);
12910 if (r < 0) {
12911 return r;
12912 }
12913
12914 librados::ObjectReadOperation op;
12915 int rc;
12916 ::cls_user_get_header(op, header, &rc);
12917 bufferlist ibl;
12918 r = ref.ioctx.operate(ref.oid, &op, &ibl);
12919 if (r < 0)
12920 return r;
12921 if (rc < 0)
12922 return rc;
12923
12924 return 0;
12925 }
12926
12927 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
12928 {
12929 string buckets_obj_id;
12930 rgw_get_buckets_obj(user_id, buckets_obj_id);
12931 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
12932
12933 rgw_rados_ref ref;
12934 rgw_pool pool;
12935 int r = get_raw_obj_ref(obj, &ref, &pool);
12936 if (r < 0) {
12937 return r;
12938 }
12939
12940 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
12941 if (r < 0)
12942 return r;
12943
12944 return 0;
12945 }
12946
12947 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
12948 {
12949 map<string, struct rgw_bucket_dir_header> headers;
12950 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12951 if (r < 0) {
12952 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
12953 return r;
12954 }
12955
12956 cls_user_bucket_entry entry;
12957
12958 bucket_info.bucket.convert(&entry.bucket);
12959
12960 map<string, struct rgw_bucket_dir_header>::iterator hiter = headers.begin();
12961 for (; hiter != headers.end(); ++hiter) {
12962 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = hiter->second.stats.begin();
12963 for (; iter != hiter->second.stats.end(); ++iter) {
12964 struct rgw_bucket_category_stats& header_stats = iter->second;
12965 entry.size += header_stats.total_size;
12966 entry.size_rounded += header_stats.total_size_rounded;
12967 entry.count += header_stats.num_entries;
12968 }
12969 }
12970
12971 list<cls_user_bucket_entry> entries;
12972 entries.push_back(entry);
12973
12974 r = cls_user_update_buckets(user_obj, entries, false);
12975 if (r < 0) {
12976 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
12977 return r;
12978 }
12979
12980 return 0;
12981 }
12982
12983 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
12984 const string& in_marker,
12985 const string& end_marker,
12986 const int max_entries,
12987 list<cls_user_bucket_entry>& entries,
12988 string * const out_marker,
12989 bool * const truncated)
12990 {
12991 rgw_rados_ref ref;
12992 rgw_pool pool;
12993 int r = get_raw_obj_ref(obj, &ref, &pool);
12994 if (r < 0) {
12995 return r;
12996 }
12997
12998 librados::ObjectReadOperation op;
12999 int rc;
13000
13001 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13002 bufferlist ibl;
13003 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13004 if (r < 0)
13005 return r;
13006 if (rc < 0)
13007 return rc;
13008
13009 return 0;
13010 }
13011
13012 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13013 {
13014 rgw_rados_ref ref;
13015 rgw_pool pool;
13016 int r = get_raw_obj_ref(obj, &ref, &pool);
13017 if (r < 0) {
13018 return r;
13019 }
13020
13021 librados::ObjectWriteOperation op;
13022 cls_user_set_buckets(op, entries, add);
13023 r = ref.ioctx.operate(ref.oid, &op);
13024 if (r < 0)
13025 return r;
13026
13027 return 0;
13028 }
13029
13030 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13031 {
13032 string buckets_obj_id;
13033 rgw_get_buckets_obj(user_id, buckets_obj_id);
13034 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13035 return cls_user_complete_stats_sync(obj);
13036 }
13037
13038 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13039 {
13040 rgw_rados_ref ref;
13041 rgw_pool pool;
13042 int r = get_raw_obj_ref(obj, &ref, &pool);
13043 if (r < 0) {
13044 return r;
13045 }
13046
13047 librados::ObjectWriteOperation op;
13048 ::cls_user_complete_stats_sync(op);
13049 r = ref.ioctx.operate(ref.oid, &op);
13050 if (r < 0)
13051 return r;
13052
13053 return 0;
13054 }
13055
13056 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13057 {
13058 list<cls_user_bucket_entry> l;
13059 l.push_back(entry);
13060
13061 return cls_user_update_buckets(obj, l, true);
13062 }
13063
13064 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13065 {
13066 rgw_pool p;
13067 rgw_rados_ref ref;
13068 int r = get_system_obj_ref(obj, &ref, &p);
13069 if (r < 0) {
13070 return r;
13071 }
13072
13073 librados::ObjectWriteOperation op;
13074 ::cls_user_remove_bucket(op, bucket);
13075 r = ref.ioctx.operate(ref.oid, &op);
13076 if (r < 0)
13077 return r;
13078
13079 return 0;
13080 }
13081
13082 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, rgw_bucket& bucket,
13083 RGWQuotaInfo& bucket_quota)
13084 {
13085 if (!cct->_conf->rgw_dynamic_resharding) {
13086 return 0;
13087 }
13088
13089 bool need_resharding = false;
13090 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13091 uint32_t suggested_num_shards;
13092
13093 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13094 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13095 1, need_resharding, &suggested_num_shards);
13096 if (ret < 0) {
13097 return ret;
13098 }
13099
13100 if (need_resharding) {
13101 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13102 }
13103
13104 return ret;
13105 }
13106
13107 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13108 {
13109 RGWReshard reshard(this);
13110
13111 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13112
13113 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13114 if (new_num_shards <= num_source_shards) {
13115 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13116 return 0;
13117 }
13118
13119 cls_rgw_reshard_entry entry;
13120 entry.time = real_clock::now();
13121 entry.tenant = bucket_info.owner.tenant;
13122 entry.bucket_name = bucket_info.bucket.name;
13123 entry.bucket_id = bucket_info.bucket.bucket_id;
13124 entry.old_num_shards = num_source_shards;
13125 entry.new_num_shards = new_num_shards;
13126
13127 return reshard.add(entry);
13128 }
13129
13130 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13131 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13132 {
13133 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13134 }
13135
13136 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13137 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
13138 {
13139 if (!num_shards) {
13140 bucket_objects[0] = bucket_oid_base;
13141 } else {
13142 char buf[bucket_oid_base.size() + 32];
13143 if (shard_id < 0) {
13144 for (uint32_t i = 0; i < num_shards; ++i) {
13145 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13146 bucket_objects[i] = buf;
13147 }
13148 } else {
13149 if ((uint32_t)shard_id > num_shards) {
13150 return;
13151 }
13152 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13153 bucket_objects[shard_id] = buf;
13154 }
13155 }
13156 }
13157
13158 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13159 {
13160 const rgw_bucket& bucket = bucket_info.bucket;
13161 string plain_id = bucket.name + ":" + bucket.bucket_id;
13162 if (!bucket_info.num_shards) {
13163 (*result)[0] = plain_id;
13164 } else {
13165 char buf[16];
13166 if (shard_id < 0) {
13167 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13168 snprintf(buf, sizeof(buf), ":%d", i);
13169 (*result)[i] = plain_id + buf;
13170 }
13171 } else {
13172 if ((uint32_t)shard_id > bucket_info.num_shards) {
13173 return;
13174 }
13175 snprintf(buf, sizeof(buf), ":%d", shard_id);
13176 (*result)[shard_id] = plain_id + buf;
13177 }
13178 }
13179 }
13180
13181 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13182 int *shard_id)
13183 {
13184 int r = 0;
13185 switch (bucket_info.bucket_index_shard_hash_type) {
13186 case RGWBucketInfo::MOD:
13187 if (!bucket_info.num_shards) {
13188 if (shard_id) {
13189 *shard_id = -1;
13190 }
13191 } else {
13192 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13193 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13194 sid = rgw_shards_mod(sid2, bucket_info.num_shards);
13195 if (shard_id) {
13196 *shard_id = (int)sid;
13197 }
13198 }
13199 break;
13200 default:
13201 r = -ENOTSUP;
13202 }
13203 return r;
13204 }
13205
13206 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13207 int shard_id, string *bucket_obj)
13208 {
13209 if (!num_shards) {
13210 // By default with no sharding, we use the bucket oid as itself
13211 (*bucket_obj) = bucket_oid_base;
13212 } else {
13213 char buf[bucket_oid_base.size() + 32];
13214 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13215 (*bucket_obj) = buf;
13216 }
13217 }
13218
13219 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13220 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13221 {
13222 int r = 0;
13223 switch (hash_type) {
13224 case RGWBucketInfo::MOD:
13225 if (!num_shards) {
13226 // By default with no sharding, we use the bucket oid as itself
13227 (*bucket_obj) = bucket_oid_base;
13228 if (shard_id) {
13229 *shard_id = -1;
13230 }
13231 } else {
13232 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13233 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13234 sid = rgw_shards_mod(sid2, num_shards);
13235 char buf[bucket_oid_base.size() + 32];
13236 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13237 (*bucket_obj) = buf;
13238 if (shard_id) {
13239 *shard_id = (int)sid;
13240 }
13241 }
13242 break;
13243 default:
13244 r = -ENOTSUP;
13245 }
13246 return r;
13247 }
13248
13249 void RGWStateLog::oid_str(int shard, string& oid) {
13250 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13251 char buf[16];
13252 snprintf(buf, sizeof(buf), "%d", shard);
13253 oid += buf;
13254 }
13255
13256 int RGWStateLog::get_shard_num(const string& object) {
13257 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13258 return val % num_shards;
13259 }
13260
13261 string RGWStateLog::get_oid(const string& object) {
13262 int shard = get_shard_num(object);
13263 string oid;
13264 oid_str(shard, oid);
13265 return oid;
13266 }
13267
13268 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13269 rgw_pool pool;
13270 store->get_log_pool(pool);
13271 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13272 if (r < 0) {
13273 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13274 return r;
13275 }
13276 return 0;
13277 }
13278
13279 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13280 uint32_t state, bufferlist *bl, uint32_t *check_state)
13281 {
13282 if (client_id.empty() ||
13283 op_id.empty() ||
13284 object.empty()) {
13285 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13286 }
13287
13288 librados::IoCtx ioctx;
13289 int r = open_ioctx(ioctx);
13290 if (r < 0)
13291 return r;
13292
13293 string oid = get_oid(object);
13294
13295 librados::ObjectWriteOperation op;
13296 if (check_state) {
13297 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
13298 }
13299 utime_t ts = ceph_clock_now();
13300 bufferlist nobl;
13301 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
13302 r = ioctx.operate(oid, &op);
13303 if (r < 0) {
13304 return r;
13305 }
13306
13307 return 0;
13308 }
13309
13310 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
13311 {
13312 if (client_id.empty() ||
13313 op_id.empty() ||
13314 object.empty()) {
13315 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13316 }
13317
13318 librados::IoCtx ioctx;
13319 int r = open_ioctx(ioctx);
13320 if (r < 0)
13321 return r;
13322
13323 string oid = get_oid(object);
13324
13325 librados::ObjectWriteOperation op;
13326 cls_statelog_remove_by_object(op, object, op_id);
13327 r = ioctx.operate(oid, &op);
13328 if (r < 0) {
13329 return r;
13330 }
13331
13332 return 0;
13333 }
13334
13335 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
13336 void **handle)
13337 {
13338 list_state *state = new list_state;
13339 state->client_id = client_id;
13340 state->op_id = op_id;
13341 state->object = object;
13342 if (object.empty()) {
13343 state->cur_shard = 0;
13344 state->max_shard = num_shards - 1;
13345 } else {
13346 state->cur_shard = state->max_shard = get_shard_num(object);
13347 }
13348 *handle = (void *)state;
13349 }
13350
13351 int RGWStateLog::list_entries(void *handle, int max_entries,
13352 list<cls_statelog_entry>& entries,
13353 bool *done)
13354 {
13355 list_state *state = static_cast<list_state *>(handle);
13356
13357 librados::IoCtx ioctx;
13358 int r = open_ioctx(ioctx);
13359 if (r < 0)
13360 return r;
13361
13362 entries.clear();
13363
13364 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
13365 string oid;
13366 oid_str(state->cur_shard, oid);
13367
13368 librados::ObjectReadOperation op;
13369 list<cls_statelog_entry> ents;
13370 bool truncated;
13371 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
13372 max_entries, ents, &state->marker, &truncated);
13373 bufferlist ibl;
13374 r = ioctx.operate(oid, &op, &ibl);
13375 if (r == -ENOENT) {
13376 truncated = false;
13377 r = 0;
13378 }
13379 if (r < 0) {
13380 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
13381 return r;
13382 }
13383
13384 if (!truncated) {
13385 state->marker.clear();
13386 }
13387
13388 max_entries -= ents.size();
13389
13390 entries.splice(entries.end(), ents);
13391
13392 if (truncated)
13393 break;
13394 }
13395
13396 *done = (state->cur_shard > state->max_shard);
13397
13398 return 0;
13399 }
13400
13401 void RGWStateLog::finish_list_entries(void *handle)
13402 {
13403 list_state *state = static_cast<list_state *>(handle);
13404 delete state;
13405 }
13406
13407 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
13408 {
13409 f->open_object_section("statelog_entry");
13410 f->dump_string("client_id", entry.client_id);
13411 f->dump_string("op_id", entry.op_id);
13412 f->dump_string("object", entry.object);
13413 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
13414 if (!dump_entry_internal(entry, f)) {
13415 f->dump_int("state", entry.state);
13416 }
13417 f->close_section();
13418 }
13419
13420 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
13421 {
13422 }
13423
13424 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
13425 {
13426 string s;
13427 switch ((OpState)entry.state) {
13428 case OPSTATE_UNKNOWN:
13429 s = "unknown";
13430 break;
13431 case OPSTATE_IN_PROGRESS:
13432 s = "in-progress";
13433 break;
13434 case OPSTATE_COMPLETE:
13435 s = "complete";
13436 break;
13437 case OPSTATE_ERROR:
13438 s = "error";
13439 break;
13440 case OPSTATE_ABORT:
13441 s = "abort";
13442 break;
13443 case OPSTATE_CANCELLED:
13444 s = "cancelled";
13445 break;
13446 default:
13447 s = "invalid";
13448 }
13449 f->dump_string("state", s);
13450 return true;
13451 }
13452
13453 int RGWOpState::state_from_str(const string& s, OpState *state)
13454 {
13455 if (s == "unknown") {
13456 *state = OPSTATE_UNKNOWN;
13457 } else if (s == "in-progress") {
13458 *state = OPSTATE_IN_PROGRESS;
13459 } else if (s == "complete") {
13460 *state = OPSTATE_COMPLETE;
13461 } else if (s == "error") {
13462 *state = OPSTATE_ERROR;
13463 } else if (s == "abort") {
13464 *state = OPSTATE_ABORT;
13465 } else if (s == "cancelled") {
13466 *state = OPSTATE_CANCELLED;
13467 } else {
13468 return -EINVAL;
13469 }
13470
13471 return 0;
13472 }
13473
13474 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
13475 {
13476 uint32_t s = (uint32_t)state;
13477 return store_entry(client_id, op_id, object, s, NULL, NULL);
13478 }
13479
13480 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
13481 {
13482 uint32_t s = (uint32_t)state;
13483 return store_entry(client_id, op_id, object, s, NULL, &s);
13484 }
13485
13486 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
13487 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
13488 {
13489 cct = store->ctx();
13490 cur_state = RGWOpState::OPSTATE_UNKNOWN;
13491 }
13492
13493 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
13494 last_update = real_clock::now();
13495 cur_state = state;
13496 return os.set_state(client_id, op_id, object, state);
13497 }
13498
13499 int RGWOpStateSingleOp::renew_state() {
13500 real_time now = real_clock::now();
13501
13502 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13503
13504 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13505 return 0;
13506 }
13507
13508 last_update = now;
13509 return os.renew_state(client_id, op_id, object, cur_state);
13510 }
13511
13512
13513 uint64_t RGWRados::instance_id()
13514 {
13515 return get_rados_handle()->get_instance_id();
13516 }
13517
13518 uint64_t RGWRados::next_bucket_id()
13519 {
13520 Mutex::Locker l(bucket_id_lock);
13521 return ++max_bucket_id;
13522 }
13523
13524 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread, bool quota_threads, bool run_sync_thread, bool run_reshard_thread)
13525 {
13526 int use_cache = cct->_conf->rgw_cache_enabled;
13527 RGWRados *store = NULL;
13528 if (!use_cache) {
13529 store = new RGWRados;
13530 } else {
13531 store = new RGWCache<RGWRados>;
13532 }
13533
13534 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
13535 delete store;
13536 return NULL;
13537 }
13538
13539 return store;
13540 }
13541
13542 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13543 {
13544 RGWRados *store = NULL;
13545 store = new RGWRados;
13546
13547 store->set_context(cct);
13548
13549 if (store->init_rados() < 0) {
13550 delete store;
13551 return NULL;
13552 }
13553
13554 return store;
13555 }
13556
13557 void RGWStoreManager::close_storage(RGWRados *store)
13558 {
13559 if (!store)
13560 return;
13561
13562 store->finalize();
13563
13564 delete store;
13565 }
13566
13567 librados::Rados* RGWRados::get_rados_handle()
13568 {
13569 if (rados.size() == 1) {
13570 return &rados[0];
13571 } else {
13572 handle_lock.get_read();
13573 pthread_t id = pthread_self();
13574 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13575
13576 if (it != rados_map.end()) {
13577 handle_lock.put_read();
13578 return &rados[it->second];
13579 } else {
13580 handle_lock.put_read();
13581 handle_lock.get_write();
13582 const uint32_t handle = next_rados_handle;
13583 rados_map[id] = handle;
13584 if (++next_rados_handle == rados.size()) {
13585 next_rados_handle = 0;
13586 }
13587 handle_lock.put_write();
13588 return &rados[handle];
13589 }
13590 }
13591 }
13592
13593 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
13594 {
13595 rgw_rados_ref ref;
13596 int ret = get_raw_obj_ref(obj, &ref);
13597 if (ret < 0) {
13598 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13599 return ret;
13600 }
13601
13602 ObjectWriteOperation op;
13603 list<string> prefixes;
13604 cls_rgw_remove_obj(op, prefixes);
13605
13606 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13607 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13608 if (ret < 0) {
13609 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13610 c->release();
13611 return ret;
13612 }
13613
13614 handles.push_back(c);
13615
13616 return 0;
13617 }
13618
13619 int RGWRados::delete_obj_aio(const rgw_obj& obj,
13620 RGWBucketInfo& bucket_info, RGWObjState *astate,
13621 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
13622 {
13623 rgw_rados_ref ref;
13624 int ret = get_obj_head_ref(bucket_info, obj, &ref);
13625 if (ret < 0) {
13626 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
13627 return ret;
13628 }
13629
13630 if (keep_index_consistent) {
13631 RGWRados::Bucket bop(this, bucket_info);
13632 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
13633
13634 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
13635 if (ret < 0) {
13636 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
13637 return ret;
13638 }
13639 }
13640
13641 ObjectWriteOperation op;
13642 list<string> prefixes;
13643 cls_rgw_remove_obj(op, prefixes);
13644
13645 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13646 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
13647 if (ret < 0) {
13648 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
13649 c->release();
13650 return ret;
13651 }
13652
13653 handles.push_back(c);
13654
13655 if (keep_index_consistent) {
13656 ret = delete_obj_index(obj);
13657 if (ret < 0) {
13658 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
13659 return ret;
13660 }
13661 }
13662 return ret;
13663 }
13664
13665 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
13666 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
13667 if (value != attrs.end()) {
13668 bufferlist::iterator bliter = value->second.begin();
13669 try {
13670 ::decode(cs_info, bliter);
13671 } catch (buffer::error& err) {
13672 return -EIO;
13673 }
13674 if (cs_info.blocks.size() == 0) {
13675 return -EIO;
13676 }
13677 if (cs_info.compression_type != "none")
13678 need_decompress = true;
13679 else
13680 need_decompress = false;
13681 return 0;
13682 } else {
13683 need_decompress = false;
13684 return 0;
13685 }
13686 }
13687