]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
update sources to 12.2.7
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <boost/algorithm/string.hpp>
9
10 #include <boost/format.hpp>
11 #include <boost/optional.hpp>
12 #include <boost/utility/in_place_factory.hpp>
13
14 #include "common/ceph_json.h"
15 #include "common/utf8.h"
16
17 #include "common/errno.h"
18 #include "common/Formatter.h"
19 #include "common/Throttle.h"
20 #include "common/Finisher.h"
21
22 #include "rgw_rados.h"
23 #include "rgw_cache.h"
24 #include "rgw_acl.h"
25 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
26 #include "rgw_metadata.h"
27 #include "rgw_bucket.h"
28 #include "rgw_rest_conn.h"
29 #include "rgw_cr_rados.h"
30 #include "rgw_cr_rest.h"
31
32 #include "cls/rgw/cls_rgw_ops.h"
33 #include "cls/rgw/cls_rgw_types.h"
34 #include "cls/rgw/cls_rgw_client.h"
35 #include "cls/rgw/cls_rgw_const.h"
36 #include "cls/refcount/cls_refcount_client.h"
37 #include "cls/version/cls_version_client.h"
38 #include "cls/log/cls_log_client.h"
39 #include "cls/statelog/cls_statelog_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "osd/osd_types.h"
44
45 #include "rgw_tools.h"
46 #include "rgw_coroutine.h"
47 #include "rgw_compression.h"
48
49 #undef fork // fails to compile RGWPeriod::fork() below
50
51 #include "common/Clock.h"
52
53 #include "include/rados/librados.hpp"
54 using namespace librados;
55
56 #include <string>
57 #include <iostream>
58 #include <vector>
59 #include <atomic>
60 #include <list>
61 #include <map>
62 #include "auth/Crypto.h" // get_random_bytes()
63
64 #include "rgw_log.h"
65
66 #include "rgw_gc.h"
67 #include "rgw_lc.h"
68
69 #include "rgw_object_expirer_core.h"
70 #include "rgw_sync.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
74
75 #include "compressor/Compressor.h"
76
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_rgw
79
80 using namespace std;
81
82 static string notify_oid_prefix = "notify";
83 static string *notify_oids = NULL;
84 static string shadow_ns = "shadow";
85 static string dir_oid_prefix = ".dir.";
86 static string default_storage_pool_suffix = "rgw.buckets.data";
87 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
88 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
89 static string avail_pools = ".pools.avail";
90
91 static string zone_info_oid_prefix = "zone_info.";
92 static string zone_names_oid_prefix = "zone_names.";
93 static string region_info_oid_prefix = "region_info.";
94 static string zone_group_info_oid_prefix = "zonegroup_info.";
95 static string realm_names_oid_prefix = "realms_names.";
96 static string realm_info_oid_prefix = "realms.";
97 static string default_region_info_oid = "default.region";
98 static string default_zone_group_info_oid = "default.zonegroup";
99 static string period_info_oid_prefix = "periods.";
100 static string period_latest_epoch_info_oid = ".latest_epoch";
101 static string region_map_oid = "region_map";
102 static string zonegroup_map_oid = "zonegroup_map";
103 static string log_lock_name = "rgw_log_lock";
104 static string default_realm_info_oid = "default.realm";
105 const string default_zonegroup_name = "default";
106 const string default_zone_name = "default";
107 static string zonegroup_names_oid_prefix = "zonegroups_names.";
108 static RGWObjCategory main_category = RGW_OBJ_CATEGORY_MAIN;
109 #define RGW_USAGE_OBJ_PREFIX "usage."
110 #define FIRST_EPOCH 1
111 static string RGW_DEFAULT_ZONE_ROOT_POOL = "rgw.root";
112 static string RGW_DEFAULT_ZONEGROUP_ROOT_POOL = "rgw.root";
113 static string RGW_DEFAULT_REALM_ROOT_POOL = "rgw.root";
114 static string RGW_DEFAULT_PERIOD_ROOT_POOL = "rgw.root";
115
116 #define RGW_STATELOG_OBJ_PREFIX "statelog."
117
118 #define dout_subsys ceph_subsys_rgw
119
120
121 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
122 const string& placement_id, const rgw_obj& obj, rgw_pool *pool)
123 {
124 if (!zone_params.get_head_data_pool(placement_id, obj, pool)) {
125 RGWZonePlacementInfo placement;
126 if (!zone_params.get_placement(zonegroup.default_placement, &placement)) {
127 return false;
128 }
129
130 if (!obj.in_extra_data) {
131 *pool = placement.data_pool;
132 } else {
133 *pool = placement.get_data_extra_pool();
134 }
135 }
136
137 return true;
138 }
139
140 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
141 const string& placement_id, const rgw_obj& obj, rgw_raw_obj *raw_obj)
142 {
143 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
144
145 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_id, obj, &raw_obj->pool);
146 }
147
148 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
149 {
150 if (!is_raw) {
151 rgw_raw_obj r;
152 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
153 return r;
154 }
155 return raw_obj;
156 }
157
158 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
159 {
160 if (!is_raw) {
161 rgw_raw_obj r;
162 store->obj_to_raw(placement_rule, obj, &r);
163 return r;
164 }
165 return raw_obj;
166 }
167
168 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
169 {
170 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
171 if (r == -ENOENT && create) {
172 r = rados->pool_create(pool.name.c_str());
173 if (r == -ERANGE) {
174 dout(0)
175 << __func__
176 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
177 << " (this can be due to a pool or placement group misconfiguration, e.g."
178 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
179 << dendl;
180 }
181 if (r < 0 && r != -EEXIST) {
182 return r;
183 }
184
185 r = rados->ioctx_create(pool.name.c_str(), ioctx);
186 if (r < 0) {
187 return r;
188 }
189
190 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
191 if (r < 0 && r != -EOPNOTSUPP) {
192 return r;
193 }
194 } else if (r < 0) {
195 return r;
196 }
197 if (!pool.ns.empty()) {
198 ioctx.set_namespace(pool.ns);
199 }
200 return 0;
201 }
202
203 template<>
204 void RGWObjectCtxImpl<rgw_obj, RGWObjState>::invalidate(rgw_obj& obj) {
205 RWLock::WLocker wl(lock);
206 auto iter = objs_state.find(obj);
207 if (iter == objs_state.end()) {
208 return;
209 }
210 bool is_atomic = iter->second.is_atomic;
211 bool prefetch_data = iter->second.prefetch_data;
212
213 objs_state.erase(iter);
214
215 if (is_atomic || prefetch_data) {
216 auto& s = objs_state[obj];
217 s.is_atomic = is_atomic;
218 s.prefetch_data = prefetch_data;
219 }
220 }
221
222 template<>
223 void RGWObjectCtxImpl<rgw_raw_obj, RGWRawObjState>::invalidate(rgw_raw_obj& obj) {
224 RWLock::WLocker wl(lock);
225 auto iter = objs_state.find(obj);
226 if (iter == objs_state.end()) {
227 return;
228 }
229
230 objs_state.erase(iter);
231 }
232
233 void RGWDefaultZoneGroupInfo::dump(Formatter *f) const {
234 encode_json("default_zonegroup", default_zonegroup, f);
235 }
236
237 void RGWDefaultZoneGroupInfo::decode_json(JSONObj *obj) {
238
239 JSONDecoder::decode_json("default_zonegroup", default_zonegroup, obj);
240 /* backward compatability with region */
241 if (default_zonegroup.empty()) {
242 JSONDecoder::decode_json("default_region", default_zonegroup, obj);
243 }
244 }
245
246 rgw_pool RGWZoneGroup::get_pool(CephContext *cct_)
247 {
248 if (cct_->_conf->rgw_zonegroup_root_pool.empty()) {
249 return rgw_pool(RGW_DEFAULT_ZONEGROUP_ROOT_POOL);
250 }
251
252 return rgw_pool(cct_->_conf->rgw_zonegroup_root_pool);
253 }
254
255 int RGWZoneGroup::create_default(bool old_format)
256 {
257 name = default_zonegroup_name;
258 is_master = true;
259
260 RGWZoneGroupPlacementTarget placement_target;
261 placement_target.name = "default-placement";
262 placement_targets[placement_target.name] = placement_target;
263 default_placement = "default-placement";
264
265 RGWZoneParams zone_params(default_zone_name);
266
267 int r = zone_params.init(cct, store, false);
268 if (r < 0) {
269 ldout(cct, 0) << "create_default: error initializing zone params: " << cpp_strerror(-r) << dendl;
270 return r;
271 }
272
273 r = zone_params.create_default();
274 if (r < 0 && r != -EEXIST) {
275 ldout(cct, 0) << "create_default: error in create_default zone params: " << cpp_strerror(-r) << dendl;
276 return r;
277 } else if (r == -EEXIST) {
278 ldout(cct, 10) << "zone_params::create_default() returned -EEXIST, we raced with another default zone_params creation" << dendl;
279 zone_params.clear_id();
280 r = zone_params.init(cct, store);
281 if (r < 0) {
282 ldout(cct, 0) << "create_default: error in init existing zone params: " << cpp_strerror(-r) << dendl;
283 return r;
284 }
285 ldout(cct, 20) << "zone_params::create_default() " << zone_params.get_name() << " id " << zone_params.get_id()
286 << dendl;
287 }
288
289 RGWZone& default_zone = zones[zone_params.get_id()];
290 default_zone.name = zone_params.get_name();
291 default_zone.id = zone_params.get_id();
292 master_zone = default_zone.id;
293
294 r = create();
295 if (r < 0 && r != -EEXIST) {
296 ldout(cct, 0) << "error storing zone group info: " << cpp_strerror(-r) << dendl;
297 return r;
298 }
299
300 if (r == -EEXIST) {
301 ldout(cct, 10) << "create_default() returned -EEXIST, we raced with another zonegroup creation" << dendl;
302 id.clear();
303 r = init(cct, store);
304 if (r < 0) {
305 return r;
306 }
307 }
308
309 if (old_format) {
310 name = id;
311 }
312
313 post_process_params();
314
315 return 0;
316 }
317
318 const string RGWZoneGroup::get_default_oid(bool old_region_format)
319 {
320 if (old_region_format) {
321 if (cct->_conf->rgw_default_region_info_oid.empty()) {
322 return default_region_info_oid;
323 }
324 return cct->_conf->rgw_default_region_info_oid;
325 }
326
327 string default_oid = cct->_conf->rgw_default_zonegroup_info_oid;
328
329 if (cct->_conf->rgw_default_zonegroup_info_oid.empty()) {
330 default_oid = default_zone_group_info_oid;
331 }
332
333 default_oid += "." + realm_id;
334
335 return default_oid;
336 }
337
338 const string& RGWZoneGroup::get_info_oid_prefix(bool old_region_format)
339 {
340 if (old_region_format) {
341 return region_info_oid_prefix;
342 }
343 return zone_group_info_oid_prefix;
344 }
345
346 const string& RGWZoneGroup::get_names_oid_prefix()
347 {
348 return zonegroup_names_oid_prefix;
349 }
350
351 const string& RGWZoneGroup::get_predefined_name(CephContext *cct) {
352 return cct->_conf->rgw_zonegroup;
353 }
354
355 int RGWZoneGroup::equals(const string& other_zonegroup) const
356 {
357 if (is_master && other_zonegroup.empty())
358 return true;
359
360 return (id == other_zonegroup);
361 }
362
363 int RGWZoneGroup::add_zone(const RGWZoneParams& zone_params, bool *is_master, bool *read_only,
364 const list<string>& endpoints, const string *ptier_type,
365 bool *psync_from_all, list<string>& sync_from, list<string>& sync_from_rm)
366 {
367 auto& zone_id = zone_params.get_id();
368 auto& zone_name = zone_params.get_name();
369
370 // check for duplicate zone name on insert
371 if (!zones.count(zone_id)) {
372 for (const auto& zone : zones) {
373 if (zone.second.name == zone_name) {
374 ldout(cct, 0) << "ERROR: found existing zone name " << zone_name
375 << " (" << zone.first << ") in zonegroup " << get_name() << dendl;
376 return -EEXIST;
377 }
378 }
379 }
380
381 if (is_master) {
382 if (*is_master) {
383 if (!master_zone.empty() && master_zone != zone_params.get_id()) {
384 ldout(cct, 0) << "NOTICE: overriding master zone: " << master_zone << dendl;
385 }
386 master_zone = zone_params.get_id();
387 } else if (master_zone == zone_params.get_id()) {
388 master_zone.clear();
389 }
390 }
391
392 RGWZone& zone = zones[zone_params.get_id()];
393 zone.name = zone_params.get_name();
394 zone.id = zone_params.get_id();
395 if (!endpoints.empty()) {
396 zone.endpoints = endpoints;
397 }
398 if (read_only) {
399 zone.read_only = *read_only;
400 }
401 if (ptier_type) {
402 zone.tier_type = *ptier_type;
403 }
404
405 if (psync_from_all) {
406 zone.sync_from_all = *psync_from_all;
407 }
408
409 for (auto add : sync_from) {
410 zone.sync_from.insert(add);
411 }
412
413 for (auto rm : sync_from_rm) {
414 zone.sync_from.erase(rm);
415 }
416
417 post_process_params();
418
419 return update();
420 }
421
422
423 int RGWZoneGroup::rename_zone(const RGWZoneParams& zone_params)
424 {
425 RGWZone& zone = zones[zone_params.get_id()];
426 zone.name = zone_params.get_name();
427
428 return update();
429 }
430
431 void RGWZoneGroup::post_process_params()
432 {
433 bool log_data = zones.size() > 1;
434
435 if (master_zone.empty()) {
436 map<string, RGWZone>::iterator iter = zones.begin();
437 if (iter != zones.end()) {
438 master_zone = iter->first;
439 }
440 }
441
442 for (map<string, RGWZone>::iterator iter = zones.begin(); iter != zones.end(); ++iter) {
443 RGWZone& zone = iter->second;
444 zone.log_data = log_data;
445
446 RGWZoneParams zone_params(zone.id, zone.name);
447 int ret = zone_params.init(cct, store);
448 if (ret < 0) {
449 ldout(cct, 0) << "WARNING: could not read zone params for zone id=" << zone.id << " name=" << zone.name << dendl;
450 continue;
451 }
452
453 for (map<string, RGWZonePlacementInfo>::iterator iter = zone_params.placement_pools.begin();
454 iter != zone_params.placement_pools.end(); ++iter) {
455 const string& placement_name = iter->first;
456 if (placement_targets.find(placement_name) == placement_targets.end()) {
457 RGWZoneGroupPlacementTarget placement_target;
458 placement_target.name = placement_name;
459 placement_targets[placement_name] = placement_target;
460 }
461 }
462 }
463
464 if (default_placement.empty() && !placement_targets.empty()) {
465 default_placement = placement_targets.begin()->first;
466 }
467 }
468
469 int RGWZoneGroup::remove_zone(const std::string& zone_id)
470 {
471 map<string, RGWZone>::iterator iter = zones.find(zone_id);
472 if (iter == zones.end()) {
473 ldout(cct, 0) << "zone id " << zone_id << " is not a part of zonegroup "
474 << name << dendl;
475 return -ENOENT;
476 }
477
478 zones.erase(iter);
479
480 post_process_params();
481
482 return update();
483 }
484
485 int RGWZoneGroup::read_default_id(string& default_id, bool old_format)
486 {
487 if (realm_id.empty()) {
488 /* try using default realm */
489 RGWRealm realm;
490 int ret = realm.init(cct, store);
491 // no default realm exist
492 if (ret < 0) {
493 return read_id(default_zonegroup_name, default_id);
494 }
495 realm_id = realm.get_id();
496 }
497
498 return RGWSystemMetaObj::read_default_id(default_id, old_format);
499 }
500
501 int RGWZoneGroup::set_as_default(bool exclusive)
502 {
503 if (realm_id.empty()) {
504 /* try using default realm */
505 RGWRealm realm;
506 int ret = realm.init(cct, store);
507 if (ret < 0) {
508 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
509 return -EINVAL;
510 }
511 realm_id = realm.get_id();
512 }
513
514 return RGWSystemMetaObj::set_as_default(exclusive);
515 }
516
517 int RGWSystemMetaObj::init(CephContext *_cct, RGWRados *_store, bool setup_obj, bool old_format)
518 {
519 cct = _cct;
520 store = _store;
521
522 if (!setup_obj)
523 return 0;
524
525 if (old_format && id.empty()) {
526 id = name;
527 }
528
529 if (id.empty()) {
530 int r;
531 if (name.empty()) {
532 name = get_predefined_name(cct);
533 }
534 if (name.empty()) {
535 r = use_default(old_format);
536 if (r < 0) {
537 return r;
538 }
539 } else if (!old_format) {
540 r = read_id(name, id);
541 if (r < 0) {
542 if (r != -ENOENT) {
543 ldout(cct, 0) << "error in read_id for object name: " << name << " : " << cpp_strerror(-r) << dendl;
544 }
545 return r;
546 }
547 }
548 }
549
550 return read_info(id, old_format);
551 }
552
553 int RGWSystemMetaObj::read_default(RGWDefaultSystemMetaObjInfo& default_info, const string& oid)
554 {
555 auto pool = get_pool(cct);
556 bufferlist bl;
557 RGWObjectCtx obj_ctx(store);
558 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
559 if (ret < 0)
560 return ret;
561
562 try {
563 bufferlist::iterator iter = bl.begin();
564 ::decode(default_info, iter);
565 } catch (buffer::error& err) {
566 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
567 return -EIO;
568 }
569
570 return 0;
571 }
572
573 int RGWSystemMetaObj::read_default_id(string& default_id, bool old_format)
574 {
575 RGWDefaultSystemMetaObjInfo default_info;
576
577 int ret = read_default(default_info, get_default_oid(old_format));
578 if (ret < 0) {
579 return ret;
580 }
581
582 default_id = default_info.default_id;
583
584 return 0;
585 }
586
587 int RGWSystemMetaObj::use_default(bool old_format)
588 {
589 return read_default_id(id, old_format);
590 }
591
592 int RGWSystemMetaObj::set_as_default(bool exclusive)
593 {
594 string oid = get_default_oid();
595
596 rgw_pool pool(get_pool(cct));
597 bufferlist bl;
598
599 RGWDefaultSystemMetaObjInfo default_info;
600 default_info.default_id = id;
601
602 ::encode(default_info, bl);
603
604 int ret = rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
605 exclusive, NULL, real_time(), NULL);
606 if (ret < 0)
607 return ret;
608
609 return 0;
610 }
611
612 int RGWSystemMetaObj::read_id(const string& obj_name, string& object_id)
613 {
614 rgw_pool pool(get_pool(cct));
615 bufferlist bl;
616
617 string oid = get_names_oid_prefix() + obj_name;
618
619 RGWObjectCtx obj_ctx(store);
620 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
621 if (ret < 0) {
622 return ret;
623 }
624
625 RGWNameToId nameToId;
626 try {
627 bufferlist::iterator iter = bl.begin();
628 ::decode(nameToId, iter);
629 } catch (buffer::error& err) {
630 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
631 return -EIO;
632 }
633 object_id = nameToId.obj_id;
634 return 0;
635 }
636
637 int RGWSystemMetaObj::delete_obj(bool old_format)
638 {
639 rgw_pool pool(get_pool(cct));
640
641 /* check to see if obj is the default */
642 RGWDefaultSystemMetaObjInfo default_info;
643 int ret = read_default(default_info, get_default_oid(old_format));
644 if (ret < 0 && ret != -ENOENT)
645 return ret;
646 if (default_info.default_id == id || (old_format && default_info.default_id == name)) {
647 string oid = get_default_oid(old_format);
648 rgw_raw_obj default_named_obj(pool, oid);
649 ret = store->delete_system_obj(default_named_obj);
650 if (ret < 0) {
651 ldout(cct, 0) << "Error delete default obj name " << name << ": " << cpp_strerror(-ret) << dendl;
652 return ret;
653 }
654 }
655 if (!old_format) {
656 string oid = get_names_oid_prefix() + name;
657 rgw_raw_obj object_name(pool, oid);
658 ret = store->delete_system_obj(object_name);
659 if (ret < 0) {
660 ldout(cct, 0) << "Error delete obj name " << name << ": " << cpp_strerror(-ret) << dendl;
661 return ret;
662 }
663 }
664
665 string oid = get_info_oid_prefix(old_format);
666 if (old_format) {
667 oid += name;
668 } else {
669 oid += id;
670 }
671
672 rgw_raw_obj object_id(pool, oid);
673 ret = store->delete_system_obj(object_id);
674 if (ret < 0) {
675 ldout(cct, 0) << "Error delete object id " << id << ": " << cpp_strerror(-ret) << dendl;
676 }
677
678 return ret;
679 }
680
681 int RGWSystemMetaObj::store_name(bool exclusive)
682 {
683 rgw_pool pool(get_pool(cct));
684 string oid = get_names_oid_prefix() + name;
685
686 RGWNameToId nameToId;
687 nameToId.obj_id = id;
688
689 bufferlist bl;
690 ::encode(nameToId, bl);
691 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
692 }
693
694 int RGWSystemMetaObj::rename(const string& new_name)
695 {
696 string new_id;
697 int ret = read_id(new_name, new_id);
698 if (!ret) {
699 return -EEXIST;
700 }
701 if (ret < 0 && ret != -ENOENT) {
702 ldout(cct, 0) << "Error read_id " << new_name << ": " << cpp_strerror(-ret) << dendl;
703 return ret;
704 }
705 string old_name = name;
706 name = new_name;
707 ret = update();
708 if (ret < 0) {
709 ldout(cct, 0) << "Error storing new obj info " << new_name << ": " << cpp_strerror(-ret) << dendl;
710 return ret;
711 }
712 ret = store_name(true);
713 if (ret < 0) {
714 ldout(cct, 0) << "Error storing new name " << new_name << ": " << cpp_strerror(-ret) << dendl;
715 return ret;
716 }
717 /* delete old name */
718 rgw_pool pool(get_pool(cct));
719 string oid = get_names_oid_prefix() + old_name;
720 rgw_raw_obj old_name_obj(pool, oid);
721 ret = store->delete_system_obj(old_name_obj);
722 if (ret < 0) {
723 ldout(cct, 0) << "Error delete old obj name " << old_name << ": " << cpp_strerror(-ret) << dendl;
724 return ret;
725 }
726
727 return ret;
728 }
729
730 int RGWSystemMetaObj::read_info(const string& obj_id, bool old_format)
731 {
732 rgw_pool pool(get_pool(cct));
733
734 bufferlist bl;
735
736 string oid = get_info_oid_prefix(old_format) + obj_id;
737
738 RGWObjectCtx obj_ctx(store);
739 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, NULL, NULL);
740 if (ret < 0) {
741 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << oid << ": " << cpp_strerror(-ret) << dendl;
742 return ret;
743 }
744
745 try {
746 bufferlist::iterator iter = bl.begin();
747 ::decode(*this, iter);
748 } catch (buffer::error& err) {
749 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << oid << dendl;
750 return -EIO;
751 }
752
753 return 0;
754 }
755
756 int RGWSystemMetaObj::read()
757 {
758 int ret = read_id(name, id);
759 if (ret < 0) {
760 return ret;
761 }
762
763 return read_info(id);
764 }
765
766 int RGWSystemMetaObj::create(bool exclusive)
767 {
768 int ret;
769
770 /* check to see the name is not used */
771 ret = read_id(name, id);
772 if (exclusive && ret == 0) {
773 ldout(cct, 10) << "ERROR: name " << name << " already in use for obj id " << id << dendl;
774 return -EEXIST;
775 } else if ( ret < 0 && ret != -ENOENT) {
776 ldout(cct, 0) << "failed reading obj id " << id << ": " << cpp_strerror(-ret) << dendl;
777 return ret;
778 }
779
780 if (id.empty()) {
781 /* create unique id */
782 uuid_d new_uuid;
783 char uuid_str[37];
784 new_uuid.generate_random();
785 new_uuid.print(uuid_str);
786 id = uuid_str;
787 }
788
789 ret = store_info(exclusive);
790 if (ret < 0) {
791 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
792 return ret;
793 }
794
795 return store_name(exclusive);
796 }
797
798 int RGWSystemMetaObj::store_info(bool exclusive)
799 {
800 rgw_pool pool(get_pool(cct));
801
802 string oid = get_info_oid_prefix() + id;
803
804 bufferlist bl;
805 ::encode(*this, bl);
806 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(), exclusive, NULL, real_time(), NULL);
807 }
808
809 int RGWSystemMetaObj::write(bool exclusive)
810 {
811 int ret = store_info(exclusive);
812 if (ret < 0) {
813 ldout(cct, 20) << __func__ << "(): store_info() returned ret=" << ret << dendl;
814 return ret;
815 }
816 ret = store_name(exclusive);
817 if (ret < 0) {
818 ldout(cct, 20) << __func__ << "(): store_name() returned ret=" << ret << dendl;
819 return ret;
820 }
821 return 0;
822 }
823
824
825 const string& RGWRealm::get_predefined_name(CephContext *cct) {
826 return cct->_conf->rgw_realm;
827 }
828
829 int RGWRealm::create(bool exclusive)
830 {
831 int ret = RGWSystemMetaObj::create(exclusive);
832 if (ret < 0) {
833 ldout(cct, 0) << "ERROR creating new realm object " << name << ": " << cpp_strerror(-ret) << dendl;
834 return ret;
835 }
836 // create the control object for watch/notify
837 ret = create_control(exclusive);
838 if (ret < 0) {
839 ldout(cct, 0) << "ERROR creating control for new realm " << name << ": " << cpp_strerror(-ret) << dendl;
840 return ret;
841 }
842 RGWPeriod period;
843 if (current_period.empty()) {
844 /* create new period for the realm */
845 ret = period.init(cct, store, id, name, false);
846 if (ret < 0 ) {
847 return ret;
848 }
849 ret = period.create(true);
850 if (ret < 0) {
851 ldout(cct, 0) << "ERROR: creating new period for realm " << name << ": " << cpp_strerror(-ret) << dendl;
852 return ret;
853 }
854 } else {
855 period = RGWPeriod(current_period, 0);
856 int ret = period.init(cct, store, id, name);
857 if (ret < 0) {
858 ldout(cct, 0) << "ERROR: failed to init period " << current_period << dendl;
859 return ret;
860 }
861 }
862 ret = set_current_period(period);
863 if (ret < 0) {
864 ldout(cct, 0) << "ERROR: failed set current period " << current_period << dendl;
865 return ret;
866 }
867 // try to set as default. may race with another create, so pass exclusive=true
868 // so we don't override an existing default
869 ret = set_as_default(true);
870 if (ret < 0 && ret != -EEXIST) {
871 ldout(cct, 0) << "WARNING: failed to set realm as default realm, ret=" << ret << dendl;
872 }
873
874 return 0;
875 }
876
877 int RGWRealm::delete_obj()
878 {
879 int ret = RGWSystemMetaObj::delete_obj();
880 if (ret < 0) {
881 return ret;
882 }
883 return delete_control();
884 }
885
886 int RGWRealm::create_control(bool exclusive)
887 {
888 auto pool = rgw_pool{get_pool(cct)};
889 auto oid = get_control_oid();
890 return rgw_put_system_obj(store, pool, oid, nullptr, 0, exclusive,
891 nullptr, real_time(), nullptr);
892 }
893
894 int RGWRealm::delete_control()
895 {
896 auto pool = rgw_pool{get_pool(cct)};
897 auto obj = rgw_raw_obj{pool, get_control_oid()};
898 return store->delete_system_obj(obj);
899 }
900
901 rgw_pool RGWRealm::get_pool(CephContext *cct)
902 {
903 if (cct->_conf->rgw_realm_root_pool.empty()) {
904 return rgw_pool(RGW_DEFAULT_REALM_ROOT_POOL);
905 }
906 return rgw_pool(cct->_conf->rgw_realm_root_pool);
907 }
908
909 const string RGWRealm::get_default_oid(bool old_format)
910 {
911 if (cct->_conf->rgw_default_realm_info_oid.empty()) {
912 return default_realm_info_oid;
913 }
914 return cct->_conf->rgw_default_realm_info_oid;
915 }
916
917 const string& RGWRealm::get_names_oid_prefix()
918 {
919 return realm_names_oid_prefix;
920 }
921
922 const string& RGWRealm::get_info_oid_prefix(bool old_format)
923 {
924 return realm_info_oid_prefix;
925 }
926
927 int RGWRealm::set_current_period(RGWPeriod& period)
928 {
929 // update realm epoch to match the period's
930 if (epoch > period.get_realm_epoch()) {
931 ldout(cct, 0) << "ERROR: set_current_period with old realm epoch "
932 << period.get_realm_epoch() << ", current epoch=" << epoch << dendl;
933 return -EINVAL;
934 }
935 if (epoch == period.get_realm_epoch() && current_period != period.get_id()) {
936 ldout(cct, 0) << "ERROR: set_current_period with same realm epoch "
937 << period.get_realm_epoch() << ", but different period id "
938 << period.get_id() << " != " << current_period << dendl;
939 return -EINVAL;
940 }
941
942 epoch = period.get_realm_epoch();
943 current_period = period.get_id();
944
945 int ret = update();
946 if (ret < 0) {
947 ldout(cct, 0) << "ERROR: period update: " << cpp_strerror(-ret) << dendl;
948 return ret;
949 }
950
951 ret = period.reflect();
952 if (ret < 0) {
953 ldout(cct, 0) << "ERROR: period.reflect(): " << cpp_strerror(-ret) << dendl;
954 return ret;
955 }
956
957 return 0;
958 }
959
960 string RGWRealm::get_control_oid()
961 {
962 return get_info_oid_prefix() + id + ".control";
963 }
964
965 int RGWRealm::notify_zone(bufferlist& bl)
966 {
967 // open a context on the realm's pool
968 rgw_pool pool{get_pool(cct)};
969 librados::IoCtx ctx;
970 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ctx);
971 if (r < 0) {
972 ldout(cct, 0) << "Failed to open pool " << pool << dendl;
973 return r;
974 }
975 // send a notify on the realm object
976 r = ctx.notify2(get_control_oid(), bl, 0, nullptr);
977 if (r < 0) {
978 ldout(cct, 0) << "Realm notify failed with " << r << dendl;
979 return r;
980 }
981 return 0;
982 }
983
984 int RGWRealm::notify_new_period(const RGWPeriod& period)
985 {
986 bufferlist bl;
987 // push the period to dependent zonegroups/zones
988 ::encode(RGWRealmNotify::ZonesNeedPeriod, bl);
989 ::encode(period, bl);
990 // reload the gateway with the new period
991 ::encode(RGWRealmNotify::Reload, bl);
992
993 return notify_zone(bl);
994 }
995
996 std::string RGWPeriodConfig::get_oid(const std::string& realm_id)
997 {
998 if (realm_id.empty()) {
999 return "period_config.default";
1000 }
1001 return "period_config." + realm_id;
1002 }
1003
1004 rgw_pool RGWPeriodConfig::get_pool(CephContext *cct)
1005 {
1006 const auto& pool_name = cct->_conf->rgw_period_root_pool;
1007 if (pool_name.empty()) {
1008 return {RGW_DEFAULT_PERIOD_ROOT_POOL};
1009 }
1010 return {pool_name};
1011 }
1012
1013 int RGWPeriodConfig::read(RGWRados *store, const std::string& realm_id)
1014 {
1015 RGWObjectCtx obj_ctx(store);
1016 const auto& pool = get_pool(store->ctx());
1017 const auto& oid = get_oid(realm_id);
1018 bufferlist bl;
1019
1020 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, nullptr, nullptr);
1021 if (ret < 0) {
1022 return ret;
1023 }
1024 try {
1025 bufferlist::iterator iter = bl.begin();
1026 ::decode(*this, iter);
1027 } catch (buffer::error& err) {
1028 return -EIO;
1029 }
1030 return 0;
1031 }
1032
1033 int RGWPeriodConfig::write(RGWRados *store, const std::string& realm_id)
1034 {
1035 const auto& pool = get_pool(store->ctx());
1036 const auto& oid = get_oid(realm_id);
1037 bufferlist bl;
1038 ::encode(*this, bl);
1039 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1040 false, nullptr, real_time(), nullptr);
1041 }
1042
1043 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, const string& period_realm_id,
1044 const string& period_realm_name, bool setup_obj)
1045 {
1046 cct = _cct;
1047 store = _store;
1048 realm_id = period_realm_id;
1049 realm_name = period_realm_name;
1050
1051 if (!setup_obj)
1052 return 0;
1053
1054 return init(_cct, _store, setup_obj);
1055 }
1056
1057
1058 int RGWPeriod::init(CephContext *_cct, RGWRados *_store, bool setup_obj)
1059 {
1060 cct = _cct;
1061 store = _store;
1062
1063 if (!setup_obj)
1064 return 0;
1065
1066 if (id.empty()) {
1067 RGWRealm realm(realm_id, realm_name);
1068 int ret = realm.init(cct, store);
1069 if (ret < 0) {
1070 ldout(cct, 0) << "RGWPeriod::init failed to init realm " << realm_name << " id " << realm_id << " : " <<
1071 cpp_strerror(-ret) << dendl;
1072 return ret;
1073 }
1074 id = realm.get_current_period();
1075 realm_id = realm.get_id();
1076 }
1077
1078 if (!epoch) {
1079 int ret = use_latest_epoch();
1080 if (ret < 0) {
1081 ldout(cct, 0) << "failed to use_latest_epoch period id " << id << " realm " << realm_name << " id " << realm_id
1082 << " : " << cpp_strerror(-ret) << dendl;
1083 return ret;
1084 }
1085 }
1086
1087 return read_info();
1088 }
1089
1090
1091 int RGWPeriod::get_zonegroup(RGWZoneGroup& zonegroup, const string& zonegroup_id) {
1092 map<string, RGWZoneGroup>::const_iterator iter;
1093 if (!zonegroup_id.empty()) {
1094 iter = period_map.zonegroups.find(zonegroup_id);
1095 } else {
1096 iter = period_map.zonegroups.find("default");
1097 }
1098 if (iter != period_map.zonegroups.end()) {
1099 zonegroup = iter->second;
1100 return 0;
1101 }
1102
1103 return -ENOENT;
1104 }
1105
1106 const string& RGWPeriod::get_latest_epoch_oid()
1107 {
1108 if (cct->_conf->rgw_period_latest_epoch_info_oid.empty()) {
1109 return period_latest_epoch_info_oid;
1110 }
1111 return cct->_conf->rgw_period_latest_epoch_info_oid;
1112 }
1113
1114 const string& RGWPeriod::get_info_oid_prefix()
1115 {
1116 return period_info_oid_prefix;
1117 }
1118
1119 const string RGWPeriod::get_period_oid_prefix()
1120 {
1121 return get_info_oid_prefix() + id;
1122 }
1123
1124 const string RGWPeriod::get_period_oid()
1125 {
1126 std::ostringstream oss;
1127 oss << get_period_oid_prefix();
1128 // skip the epoch for the staging period
1129 if (id != get_staging_id(realm_id))
1130 oss << "." << epoch;
1131 return oss.str();
1132 }
1133
1134 int RGWPeriod::read_latest_epoch(RGWPeriodLatestEpochInfo& info,
1135 RGWObjVersionTracker *objv)
1136 {
1137 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1138
1139 rgw_pool pool(get_pool(cct));
1140 bufferlist bl;
1141 RGWObjectCtx obj_ctx(store);
1142 int ret = rgw_get_system_obj(store, obj_ctx, pool, oid, bl, objv, nullptr);
1143 if (ret < 0) {
1144 ldout(cct, 1) << "error read_lastest_epoch " << pool << ":" << oid << dendl;
1145 return ret;
1146 }
1147 try {
1148 bufferlist::iterator iter = bl.begin();
1149 ::decode(info, iter);
1150 } catch (buffer::error& err) {
1151 ldout(cct, 0) << "error decoding data from " << pool << ":" << oid << dendl;
1152 return -EIO;
1153 }
1154
1155 return 0;
1156 }
1157
1158 int RGWPeriod::get_latest_epoch(epoch_t& latest_epoch)
1159 {
1160 RGWPeriodLatestEpochInfo info;
1161
1162 int ret = read_latest_epoch(info);
1163 if (ret < 0) {
1164 return ret;
1165 }
1166
1167 latest_epoch = info.epoch;
1168
1169 return 0;
1170 }
1171
1172 int RGWPeriod::use_latest_epoch()
1173 {
1174 RGWPeriodLatestEpochInfo info;
1175 int ret = read_latest_epoch(info);
1176 if (ret < 0) {
1177 return ret;
1178 }
1179
1180 epoch = info.epoch;
1181
1182 return 0;
1183 }
1184
1185 int RGWPeriod::set_latest_epoch(epoch_t epoch, bool exclusive,
1186 RGWObjVersionTracker *objv)
1187 {
1188 string oid = get_period_oid_prefix() + get_latest_epoch_oid();
1189
1190 rgw_pool pool(get_pool(cct));
1191 bufferlist bl;
1192
1193 RGWPeriodLatestEpochInfo info;
1194 info.epoch = epoch;
1195
1196 ::encode(info, bl);
1197
1198 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1199 exclusive, objv, real_time(), nullptr);
1200 }
1201
1202 int RGWPeriod::update_latest_epoch(epoch_t epoch)
1203 {
1204 static constexpr int MAX_RETRIES = 20;
1205
1206 for (int i = 0; i < MAX_RETRIES; i++) {
1207 RGWPeriodLatestEpochInfo info;
1208 RGWObjVersionTracker objv;
1209 bool exclusive = false;
1210
1211 // read existing epoch
1212 int r = read_latest_epoch(info, &objv);
1213 if (r == -ENOENT) {
1214 // use an exclusive create to set the epoch atomically
1215 exclusive = true;
1216 ldout(cct, 20) << "creating initial latest_epoch=" << epoch
1217 << " for period=" << id << dendl;
1218 } else if (r < 0) {
1219 ldout(cct, 0) << "ERROR: failed to read latest_epoch" << dendl;
1220 return r;
1221 } else if (epoch <= info.epoch) {
1222 r = -EEXIST; // fail with EEXIST if epoch is not newer
1223 ldout(cct, 1) << "found existing latest_epoch " << info.epoch
1224 << " >= given epoch " << epoch << ", returning r=" << r << dendl;
1225 return r;
1226 } else {
1227 ldout(cct, 20) << "updating latest_epoch from " << info.epoch
1228 << " -> " << epoch << " on period=" << id << dendl;
1229 }
1230
1231 r = set_latest_epoch(epoch, exclusive, &objv);
1232 if (r == -EEXIST) {
1233 continue; // exclusive create raced with another update, retry
1234 } else if (r == -ECANCELED) {
1235 continue; // write raced with a conflicting version, retry
1236 }
1237 if (r < 0) {
1238 ldout(cct, 0) << "ERROR: failed to write latest_epoch" << dendl;
1239 return r;
1240 }
1241 return 0; // return success
1242 }
1243
1244 return -ECANCELED; // fail after max retries
1245 }
1246
1247 int RGWPeriod::delete_obj()
1248 {
1249 rgw_pool pool(get_pool(cct));
1250
1251 // delete the object for each period epoch
1252 for (epoch_t e = 1; e <= epoch; e++) {
1253 RGWPeriod p{get_id(), e};
1254 rgw_raw_obj oid{pool, p.get_period_oid()};
1255 int ret = store->delete_system_obj(oid);
1256 if (ret < 0) {
1257 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1258 << ": " << cpp_strerror(-ret) << dendl;
1259 }
1260 }
1261
1262 // delete the .latest_epoch object
1263 rgw_raw_obj oid{pool, get_period_oid_prefix() + get_latest_epoch_oid()};
1264 int ret = store->delete_system_obj(oid);
1265 if (ret < 0) {
1266 ldout(cct, 0) << "WARNING: failed to delete period object " << oid
1267 << ": " << cpp_strerror(-ret) << dendl;
1268 }
1269 return ret;
1270 }
1271
1272 int RGWPeriod::read_info()
1273 {
1274 rgw_pool pool(get_pool(cct));
1275
1276 bufferlist bl;
1277
1278 RGWObjectCtx obj_ctx(store);
1279 int ret = rgw_get_system_obj(store, obj_ctx, pool, get_period_oid(), bl, NULL, NULL);
1280 if (ret < 0) {
1281 ldout(cct, 0) << "failed reading obj info from " << pool << ":" << get_period_oid() << ": " << cpp_strerror(-ret) << dendl;
1282 return ret;
1283 }
1284
1285 try {
1286 bufferlist::iterator iter = bl.begin();
1287 ::decode(*this, iter);
1288 } catch (buffer::error& err) {
1289 ldout(cct, 0) << "ERROR: failed to decode obj from " << pool << ":" << get_period_oid() << dendl;
1290 return -EIO;
1291 }
1292
1293 return 0;
1294 }
1295
1296 int RGWPeriod::create(bool exclusive)
1297 {
1298 int ret;
1299
1300 /* create unique id */
1301 uuid_d new_uuid;
1302 char uuid_str[37];
1303 new_uuid.generate_random();
1304 new_uuid.print(uuid_str);
1305 id = uuid_str;
1306
1307 epoch = FIRST_EPOCH;
1308
1309 period_map.id = id;
1310
1311 ret = store_info(exclusive);
1312 if (ret < 0) {
1313 ldout(cct, 0) << "ERROR: storing info for " << id << ": " << cpp_strerror(-ret) << dendl;
1314 return ret;
1315 }
1316
1317 ret = set_latest_epoch(epoch);
1318 if (ret < 0) {
1319 ldout(cct, 0) << "ERROR: setting latest epoch " << id << ": " << cpp_strerror(-ret) << dendl;
1320 }
1321
1322 return ret;
1323 }
1324
1325 int RGWPeriod::store_info(bool exclusive)
1326 {
1327 rgw_pool pool(get_pool(cct));
1328
1329 string oid = get_period_oid();
1330 bufferlist bl;
1331 ::encode(*this, bl);
1332
1333 return rgw_put_system_obj(store, pool, oid, bl.c_str(), bl.length(),
1334 exclusive, NULL, real_time(), NULL);
1335 }
1336
1337 rgw_pool RGWPeriod::get_pool(CephContext *cct)
1338 {
1339 if (cct->_conf->rgw_period_root_pool.empty()) {
1340 return rgw_pool(RGW_DEFAULT_PERIOD_ROOT_POOL);
1341 }
1342 return rgw_pool(cct->_conf->rgw_period_root_pool);
1343 }
1344
1345 int RGWPeriod::add_zonegroup(const RGWZoneGroup& zonegroup)
1346 {
1347 if (zonegroup.realm_id != realm_id) {
1348 return 0;
1349 }
1350 int ret = period_map.update(zonegroup, cct);
1351 if (ret < 0) {
1352 ldout(cct, 0) << "ERROR: updating period map: " << cpp_strerror(-ret) << dendl;
1353 return ret;
1354 }
1355
1356 return store_info(false);
1357 }
1358
1359 int RGWPeriod::update()
1360 {
1361 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << get_id() << dendl;
1362 list<string> zonegroups;
1363 int ret = store->list_zonegroups(zonegroups);
1364 if (ret < 0) {
1365 ldout(cct, 0) << "ERROR: failed to list zonegroups: " << cpp_strerror(-ret) << dendl;
1366 return ret;
1367 }
1368
1369 // clear zone short ids of removed zones. period_map.update() will add the
1370 // remaining zones back
1371 period_map.short_zone_ids.clear();
1372
1373 for (auto& iter : zonegroups) {
1374 RGWZoneGroup zg(string(), iter);
1375 ret = zg.init(cct, store);
1376 if (ret < 0) {
1377 ldout(cct, 0) << "WARNING: zg.init() failed: " << cpp_strerror(-ret) << dendl;
1378 continue;
1379 }
1380
1381 if (zg.realm_id != realm_id) {
1382 ldout(cct, 20) << "skipping zonegroup " << zg.get_name() << " zone realm id " << zg.realm_id << ", not on our realm " << realm_id << dendl;
1383 continue;
1384 }
1385
1386 if (zg.master_zone.empty()) {
1387 ldout(cct, 0) << "ERROR: zonegroup " << zg.get_name() << " should have a master zone " << dendl;
1388 return -EINVAL;
1389 }
1390
1391 if (zg.is_master_zonegroup()) {
1392 master_zonegroup = zg.get_id();
1393 master_zone = zg.master_zone;
1394 }
1395
1396 int ret = period_map.update(zg, cct);
1397 if (ret < 0) {
1398 return ret;
1399 }
1400 }
1401
1402 ret = period_config.read(store, realm_id);
1403 if (ret < 0 && ret != -ENOENT) {
1404 ldout(cct, 0) << "ERROR: failed to read period config: "
1405 << cpp_strerror(ret) << dendl;
1406 return ret;
1407 }
1408 return 0;
1409 }
1410
1411 int RGWPeriod::reflect()
1412 {
1413 for (auto& iter : period_map.zonegroups) {
1414 RGWZoneGroup& zg = iter.second;
1415 zg.reinit_instance(cct, store);
1416 int r = zg.write(false);
1417 if (r < 0) {
1418 ldout(cct, 0) << "ERROR: failed to store zonegroup info for zonegroup=" << iter.first << ": " << cpp_strerror(-r) << dendl;
1419 return r;
1420 }
1421 if (zg.is_master_zonegroup()) {
1422 // set master as default if no default exists
1423 r = zg.set_as_default(true);
1424 if (r == 0) {
1425 ldout(cct, 1) << "Set the period's master zonegroup " << zg.get_id()
1426 << " as the default" << dendl;
1427 }
1428 }
1429 }
1430
1431 int r = period_config.write(store, realm_id);
1432 if (r < 0) {
1433 ldout(cct, 0) << "ERROR: failed to store period config: "
1434 << cpp_strerror(-r) << dendl;
1435 return r;
1436 }
1437 return 0;
1438 }
1439
1440 void RGWPeriod::fork()
1441 {
1442 ldout(cct, 20) << __func__ << " realm " << realm_id << " period " << id << dendl;
1443 predecessor_uuid = id;
1444 id = get_staging_id(realm_id);
1445 period_map.reset();
1446 realm_epoch++;
1447 }
1448
1449 static int read_sync_status(RGWRados *store, rgw_meta_sync_status *sync_status)
1450 {
1451 // initialize a sync status manager to read the status
1452 RGWMetaSyncStatusManager mgr(store, store->get_async_rados());
1453 int r = mgr.init();
1454 if (r < 0) {
1455 return r;
1456 }
1457 r = mgr.read_sync_status(sync_status);
1458 mgr.stop();
1459 return r;
1460 }
1461
1462 int RGWPeriod::update_sync_status(const RGWPeriod &current_period,
1463 std::ostream& error_stream,
1464 bool force_if_stale)
1465 {
1466 rgw_meta_sync_status status;
1467 int r = read_sync_status(store, &status);
1468 if (r < 0) {
1469 ldout(cct, 0) << "period failed to read sync status: "
1470 << cpp_strerror(-r) << dendl;
1471 return r;
1472 }
1473
1474 std::vector<std::string> markers;
1475
1476 const auto current_epoch = current_period.get_realm_epoch();
1477 if (current_epoch != status.sync_info.realm_epoch) {
1478 // no sync status markers for the current period
1479 assert(current_epoch > status.sync_info.realm_epoch);
1480 const int behind = current_epoch - status.sync_info.realm_epoch;
1481 if (!force_if_stale && current_epoch > 1) {
1482 error_stream << "ERROR: This zone is " << behind << " period(s) behind "
1483 "the current master zone in metadata sync. If this zone is promoted "
1484 "to master, any metadata changes during that time are likely to "
1485 "be lost.\n"
1486 "Waiting for this zone to catch up on metadata sync (see "
1487 "'radosgw-admin sync status') is recommended.\n"
1488 "To promote this zone to master anyway, add the flag "
1489 "--yes-i-really-mean-it." << std::endl;
1490 return -EINVAL;
1491 }
1492 // empty sync status markers - other zones will skip this period during
1493 // incremental metadata sync
1494 markers.resize(status.sync_info.num_shards);
1495 } else {
1496 markers.reserve(status.sync_info.num_shards);
1497 for (auto& i : status.sync_markers) {
1498 auto& marker = i.second;
1499 // filter out markers from other periods
1500 if (marker.realm_epoch != current_epoch) {
1501 marker.marker.clear();
1502 }
1503 markers.emplace_back(std::move(marker.marker));
1504 }
1505 }
1506
1507 std::swap(sync_status, markers);
1508 return 0;
1509 }
1510
1511 int RGWPeriod::commit(RGWRealm& realm, const RGWPeriod& current_period,
1512 std::ostream& error_stream, bool force_if_stale)
1513 {
1514 ldout(cct, 20) << __func__ << " realm " << realm.get_id() << " period " << current_period.get_id() << dendl;
1515 // gateway must be in the master zone to commit
1516 if (master_zone != store->get_zone_params().get_id()) {
1517 error_stream << "Cannot commit period on zone "
1518 << store->get_zone_params().get_id() << ", it must be sent to "
1519 "the period's master zone " << master_zone << '.' << std::endl;
1520 return -EINVAL;
1521 }
1522 // period predecessor must match current period
1523 if (predecessor_uuid != current_period.get_id()) {
1524 error_stream << "Period predecessor " << predecessor_uuid
1525 << " does not match current period " << current_period.get_id()
1526 << ". Use 'period pull' to get the latest period from the master, "
1527 "reapply your changes, and try again." << std::endl;
1528 return -EINVAL;
1529 }
1530 // realm epoch must be 1 greater than current period
1531 if (realm_epoch != current_period.get_realm_epoch() + 1) {
1532 error_stream << "Period's realm epoch " << realm_epoch
1533 << " does not come directly after current realm epoch "
1534 << current_period.get_realm_epoch() << ". Use 'realm pull' to get the "
1535 "latest realm and period from the master zone, reapply your changes, "
1536 "and try again." << std::endl;
1537 return -EINVAL;
1538 }
1539 // did the master zone change?
1540 if (master_zone != current_period.get_master_zone()) {
1541 // store the current metadata sync status in the period
1542 int r = update_sync_status(current_period, error_stream, force_if_stale);
1543 if (r < 0) {
1544 ldout(cct, 0) << "failed to update metadata sync status: "
1545 << cpp_strerror(-r) << dendl;
1546 return r;
1547 }
1548 // create an object with a new period id
1549 r = create(true);
1550 if (r < 0) {
1551 ldout(cct, 0) << "failed to create new period: " << cpp_strerror(-r) << dendl;
1552 return r;
1553 }
1554 // set as current period
1555 r = realm.set_current_period(*this);
1556 if (r < 0) {
1557 ldout(cct, 0) << "failed to update realm's current period: "
1558 << cpp_strerror(-r) << dendl;
1559 return r;
1560 }
1561 ldout(cct, 4) << "Promoted to master zone and committed new period "
1562 << id << dendl;
1563 realm.notify_new_period(*this);
1564 return 0;
1565 }
1566 // period must be based on current epoch
1567 if (epoch != current_period.get_epoch()) {
1568 error_stream << "Period epoch " << epoch << " does not match "
1569 "predecessor epoch " << current_period.get_epoch()
1570 << ". Use 'period pull' to get the latest epoch from the master zone, "
1571 "reapply your changes, and try again." << std::endl;
1572 return -EINVAL;
1573 }
1574 // set period as next epoch
1575 set_id(current_period.get_id());
1576 set_epoch(current_period.get_epoch() + 1);
1577 set_predecessor(current_period.get_predecessor());
1578 realm_epoch = current_period.get_realm_epoch();
1579 // write the period to rados
1580 int r = store_info(false);
1581 if (r < 0) {
1582 ldout(cct, 0) << "failed to store period: " << cpp_strerror(-r) << dendl;
1583 return r;
1584 }
1585 // set as latest epoch
1586 r = update_latest_epoch(epoch);
1587 if (r == -EEXIST) {
1588 // already have this epoch (or a more recent one)
1589 return 0;
1590 }
1591 if (r < 0) {
1592 ldout(cct, 0) << "failed to set latest epoch: " << cpp_strerror(-r) << dendl;
1593 return r;
1594 }
1595 r = reflect();
1596 if (r < 0) {
1597 ldout(cct, 0) << "failed to update local objects: " << cpp_strerror(-r) << dendl;
1598 return r;
1599 }
1600 ldout(cct, 4) << "Committed new epoch " << epoch
1601 << " for period " << id << dendl;
1602 realm.notify_new_period(*this);
1603 return 0;
1604 }
1605
1606 int RGWZoneParams::create_default(bool old_format)
1607 {
1608 name = default_zone_name;
1609
1610 int r = create();
1611 if (r < 0) {
1612 return r;
1613 }
1614
1615 if (old_format) {
1616 name = id;
1617 }
1618
1619 return r;
1620 }
1621
1622
1623 int get_zones_pool_set(CephContext* cct,
1624 RGWRados* store,
1625 const list<string>& zones,
1626 const string& my_zone_id,
1627 set<rgw_pool>& pool_names)
1628 {
1629 for(auto const& iter : zones) {
1630 RGWZoneParams zone(iter);
1631 int r = zone.init(cct, store);
1632 if (r < 0) {
1633 ldout(cct, 0) << "Error: init zone " << iter << ":" << cpp_strerror(-r) << dendl;
1634 return r;
1635 }
1636 if (zone.get_id() != my_zone_id) {
1637 pool_names.insert(zone.domain_root);
1638 pool_names.insert(zone.metadata_heap);
1639 pool_names.insert(zone.control_pool);
1640 pool_names.insert(zone.gc_pool);
1641 pool_names.insert(zone.log_pool);
1642 pool_names.insert(zone.intent_log_pool);
1643 pool_names.insert(zone.usage_log_pool);
1644 pool_names.insert(zone.user_keys_pool);
1645 pool_names.insert(zone.user_email_pool);
1646 pool_names.insert(zone.user_swift_pool);
1647 pool_names.insert(zone.user_uid_pool);
1648 pool_names.insert(zone.roles_pool);
1649 pool_names.insert(zone.reshard_pool);
1650 for(auto& iter : zone.placement_pools) {
1651 pool_names.insert(iter.second.index_pool);
1652 pool_names.insert(iter.second.data_pool);
1653 pool_names.insert(iter.second.data_extra_pool);
1654 }
1655 }
1656 }
1657 return 0;
1658 }
1659
1660 rgw_pool fix_zone_pool_dup(set<rgw_pool> pools,
1661 const string& default_prefix,
1662 const string& default_suffix,
1663 const rgw_pool& suggested_pool)
1664 {
1665 string suggested_name = suggested_pool.to_str();
1666
1667 string prefix = default_prefix;
1668 string suffix = default_suffix;
1669
1670 if (!suggested_pool.empty()) {
1671 prefix = suggested_name.substr(0, suggested_name.find("."));
1672 suffix = suggested_name.substr(prefix.length());
1673 }
1674
1675 rgw_pool pool(prefix + suffix);
1676
1677 if (pools.find(pool) == pools.end()) {
1678 return pool;
1679 } else {
1680 while(true) {
1681 pool = prefix + "_" + std::to_string(std::rand()) + suffix;
1682 if (pools.find(pool) == pools.end()) {
1683 return pool;
1684 }
1685 }
1686 }
1687 }
1688
1689 int RGWZoneParams::fix_pool_names()
1690 {
1691
1692 list<string> zones;
1693 int r = store->list_zones(zones);
1694 if (r < 0) {
1695 ldout(cct, 10) << "WARNING: store->list_zones() returned r=" << r << dendl;
1696 }
1697
1698 set<rgw_pool> pools;
1699 r = get_zones_pool_set(cct, store, zones, id, pools);
1700 if (r < 0) {
1701 ldout(cct, 0) << "Error: get_zones_pool_names" << r << dendl;
1702 return r;
1703 }
1704
1705 domain_root = fix_zone_pool_dup(pools, name, ".rgw.meta:root", domain_root);
1706 if (!metadata_heap.name.empty()) {
1707 metadata_heap = fix_zone_pool_dup(pools, name, ".rgw.meta:heap", metadata_heap);
1708 }
1709 control_pool = fix_zone_pool_dup(pools, name, ".rgw.control", control_pool);
1710 gc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:gc", gc_pool);
1711 lc_pool = fix_zone_pool_dup(pools, name ,".rgw.log:lc", lc_pool);
1712 log_pool = fix_zone_pool_dup(pools, name, ".rgw.log", log_pool);
1713 intent_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:intent", intent_log_pool);
1714 usage_log_pool = fix_zone_pool_dup(pools, name, ".rgw.log:usage", usage_log_pool);
1715 user_keys_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.keys", user_keys_pool);
1716 user_email_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.email", user_email_pool);
1717 user_swift_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.swift", user_swift_pool);
1718 user_uid_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:users.uid", user_uid_pool);
1719 roles_pool = fix_zone_pool_dup(pools, name, ".rgw.meta:roles", roles_pool);
1720 reshard_pool = fix_zone_pool_dup(pools, name, ".rgw.log:reshard", reshard_pool);
1721
1722 for(auto& iter : placement_pools) {
1723 iter.second.index_pool = fix_zone_pool_dup(pools, name, "." + default_bucket_index_pool_suffix,
1724 iter.second.index_pool);
1725 iter.second.data_pool = fix_zone_pool_dup(pools, name, "." + default_storage_pool_suffix,
1726 iter.second.data_pool);
1727 iter.second.data_extra_pool= fix_zone_pool_dup(pools, name, "." + default_storage_extra_pool_suffix,
1728 iter.second.data_extra_pool);
1729 }
1730
1731 return 0;
1732 }
1733
1734 int RGWZoneParams::create(bool exclusive)
1735 {
1736 /* check for old pools config */
1737 rgw_raw_obj obj(domain_root, avail_pools);
1738 int r = store->raw_obj_stat(obj, NULL, NULL, NULL, NULL, NULL, NULL);
1739 if (r < 0) {
1740 ldout(store->ctx(), 10) << "couldn't find old data placement pools config, setting up new ones for the zone" << dendl;
1741 /* a new system, let's set new placement info */
1742 RGWZonePlacementInfo default_placement;
1743 default_placement.index_pool = name + "." + default_bucket_index_pool_suffix;
1744 default_placement.data_pool = name + "." + default_storage_pool_suffix;
1745 default_placement.data_extra_pool = name + "." + default_storage_extra_pool_suffix;
1746 placement_pools["default-placement"] = default_placement;
1747 }
1748
1749 r = fix_pool_names();
1750 if (r < 0) {
1751 ldout(cct, 0) << "ERROR: fix_pool_names returned r=" << r << dendl;
1752 return r;
1753 }
1754
1755 r = RGWSystemMetaObj::create(exclusive);
1756 if (r < 0) {
1757 return r;
1758 }
1759
1760 // try to set as default. may race with another create, so pass exclusive=true
1761 // so we don't override an existing default
1762 r = set_as_default(true);
1763 if (r < 0 && r != -EEXIST) {
1764 ldout(cct, 10) << "WARNING: failed to set zone as default, r=" << r << dendl;
1765 }
1766
1767 return 0;
1768 }
1769
1770 rgw_pool RGWZoneParams::get_pool(CephContext *cct)
1771 {
1772 if (cct->_conf->rgw_zone_root_pool.empty()) {
1773 return rgw_pool(RGW_DEFAULT_ZONE_ROOT_POOL);
1774 }
1775
1776 return rgw_pool(cct->_conf->rgw_zone_root_pool);
1777 }
1778
1779 const string RGWZoneParams::get_default_oid(bool old_format)
1780 {
1781 if (old_format) {
1782 return cct->_conf->rgw_default_zone_info_oid;
1783 }
1784
1785 return cct->_conf->rgw_default_zone_info_oid + "." + realm_id;
1786 }
1787
1788 const string& RGWZoneParams::get_names_oid_prefix()
1789 {
1790 return zone_names_oid_prefix;
1791 }
1792
1793 const string& RGWZoneParams::get_info_oid_prefix(bool old_format)
1794 {
1795 return zone_info_oid_prefix;
1796 }
1797
1798 const string& RGWZoneParams::get_predefined_name(CephContext *cct) {
1799 return cct->_conf->rgw_zone;
1800 }
1801
1802 int RGWZoneParams::init(CephContext *cct, RGWRados *store, bool setup_obj, bool old_format)
1803 {
1804 if (name.empty()) {
1805 name = cct->_conf->rgw_zone;
1806 }
1807
1808 return RGWSystemMetaObj::init(cct, store, setup_obj, old_format);
1809 }
1810
1811 int RGWZoneParams::read_default_id(string& default_id, bool old_format)
1812 {
1813 if (realm_id.empty()) {
1814 /* try using default realm */
1815 RGWRealm realm;
1816 int ret = realm.init(cct, store);
1817 //no default realm exist
1818 if (ret < 0) {
1819 return read_id(default_zone_name, default_id);
1820 }
1821 realm_id = realm.get_id();
1822 }
1823
1824 return RGWSystemMetaObj::read_default_id(default_id, old_format);
1825 }
1826
1827
1828 int RGWZoneParams::set_as_default(bool exclusive)
1829 {
1830 if (realm_id.empty()) {
1831 /* try using default realm */
1832 RGWRealm realm;
1833 int ret = realm.init(cct, store);
1834 if (ret < 0) {
1835 ldout(cct, 10) << "could not read realm id: " << cpp_strerror(-ret) << dendl;
1836 return -EINVAL;
1837 }
1838 realm_id = realm.get_id();
1839 }
1840
1841 return RGWSystemMetaObj::set_as_default(exclusive);
1842 }
1843
1844 const string& RGWZoneParams::get_compression_type(const string& placement_rule) const
1845 {
1846 static const std::string NONE{"none"};
1847 auto p = placement_pools.find(placement_rule);
1848 if (p == placement_pools.end()) {
1849 return NONE;
1850 }
1851 const auto& type = p->second.compression_type;
1852 return !type.empty() ? type : NONE;
1853 }
1854
1855 void RGWPeriodMap::encode(bufferlist& bl) const {
1856 ENCODE_START(2, 1, bl);
1857 ::encode(id, bl);
1858 ::encode(zonegroups, bl);
1859 ::encode(master_zonegroup, bl);
1860 ::encode(short_zone_ids, bl);
1861 ENCODE_FINISH(bl);
1862 }
1863
1864 void RGWPeriodMap::decode(bufferlist::iterator& bl) {
1865 DECODE_START(2, bl);
1866 ::decode(id, bl);
1867 ::decode(zonegroups, bl);
1868 ::decode(master_zonegroup, bl);
1869 if (struct_v >= 2) {
1870 ::decode(short_zone_ids, bl);
1871 }
1872 DECODE_FINISH(bl);
1873
1874 zonegroups_by_api.clear();
1875 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
1876 iter != zonegroups.end(); ++iter) {
1877 RGWZoneGroup& zonegroup = iter->second;
1878 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1879 if (zonegroup.is_master_zonegroup()) {
1880 master_zonegroup = zonegroup.get_id();
1881 }
1882 }
1883 }
1884
1885 // run an MD5 hash on the zone_id and return the first 32 bits
1886 static uint32_t gen_short_zone_id(const std::string zone_id)
1887 {
1888 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
1889 MD5 hash;
1890 hash.Update((const byte *)zone_id.c_str(), zone_id.size());
1891 hash.Final(md5);
1892
1893 uint32_t short_id;
1894 memcpy((char *)&short_id, md5, sizeof(short_id));
1895 return std::max(short_id, 1u);
1896 }
1897
1898 int RGWPeriodMap::update(const RGWZoneGroup& zonegroup, CephContext *cct)
1899 {
1900 if (zonegroup.is_master_zonegroup() && (!master_zonegroup.empty() && zonegroup.get_id() != master_zonegroup)) {
1901 ldout(cct,0) << "Error updating periodmap, multiple master zonegroups configured "<< dendl;
1902 ldout(cct,0) << "master zonegroup: " << master_zonegroup << " and " << zonegroup.get_id() <<dendl;
1903 return -EINVAL;
1904 }
1905 map<string, RGWZoneGroup>::iterator iter = zonegroups.find(zonegroup.get_id());
1906 if (iter != zonegroups.end()) {
1907 RGWZoneGroup& old_zonegroup = iter->second;
1908 if (!old_zonegroup.api_name.empty()) {
1909 zonegroups_by_api.erase(old_zonegroup.api_name);
1910 }
1911 }
1912 zonegroups[zonegroup.get_id()] = zonegroup;
1913
1914 if (!zonegroup.api_name.empty()) {
1915 zonegroups_by_api[zonegroup.api_name] = zonegroup;
1916 }
1917
1918 if (zonegroup.is_master_zonegroup()) {
1919 master_zonegroup = zonegroup.get_id();
1920 } else if (master_zonegroup == zonegroup.get_id()) {
1921 master_zonegroup = "";
1922 }
1923
1924 for (auto& i : zonegroup.zones) {
1925 auto& zone = i.second;
1926 if (short_zone_ids.find(zone.id) != short_zone_ids.end()) {
1927 continue;
1928 }
1929 // calculate the zone's short id
1930 uint32_t short_id = gen_short_zone_id(zone.id);
1931
1932 // search for an existing zone with the same short id
1933 for (auto& s : short_zone_ids) {
1934 if (s.second == short_id) {
1935 ldout(cct, 0) << "New zone '" << zone.name << "' (" << zone.id
1936 << ") generates the same short_zone_id " << short_id
1937 << " as existing zone id " << s.first << dendl;
1938 return -EEXIST;
1939 }
1940 }
1941
1942 short_zone_ids[zone.id] = short_id;
1943 }
1944
1945 return 0;
1946 }
1947
1948 uint32_t RGWPeriodMap::get_zone_short_id(const string& zone_id) const
1949 {
1950 auto i = short_zone_ids.find(zone_id);
1951 if (i == short_zone_ids.end()) {
1952 return 0;
1953 }
1954 return i->second;
1955 }
1956
1957 int RGWZoneGroupMap::read(CephContext *cct, RGWRados *store)
1958 {
1959
1960 RGWPeriod period;
1961 int ret = period.init(cct, store);
1962 if (ret < 0) {
1963 cerr << "failed to read current period info: " << cpp_strerror(ret);
1964 return ret;
1965 }
1966
1967 bucket_quota = period.get_config().bucket_quota;
1968 user_quota = period.get_config().user_quota;
1969 zonegroups = period.get_map().zonegroups;
1970 zonegroups_by_api = period.get_map().zonegroups_by_api;
1971 master_zonegroup = period.get_map().master_zonegroup;
1972
1973 return 0;
1974 }
1975
1976 void RGWRegionMap::encode(bufferlist& bl) const {
1977 ENCODE_START( 3, 1, bl);
1978 ::encode(regions, bl);
1979 ::encode(master_region, bl);
1980 ::encode(bucket_quota, bl);
1981 ::encode(user_quota, bl);
1982 ENCODE_FINISH(bl);
1983 }
1984
1985 void RGWRegionMap::decode(bufferlist::iterator& bl) {
1986 DECODE_START(3, bl);
1987 ::decode(regions, bl);
1988 ::decode(master_region, bl);
1989 if (struct_v >= 2)
1990 ::decode(bucket_quota, bl);
1991 if (struct_v >= 3)
1992 ::decode(user_quota, bl);
1993 DECODE_FINISH(bl);
1994 }
1995
1996 void RGWZoneGroupMap::encode(bufferlist& bl) const {
1997 ENCODE_START( 3, 1, bl);
1998 ::encode(zonegroups, bl);
1999 ::encode(master_zonegroup, bl);
2000 ::encode(bucket_quota, bl);
2001 ::encode(user_quota, bl);
2002 ENCODE_FINISH(bl);
2003 }
2004
2005 void RGWZoneGroupMap::decode(bufferlist::iterator& bl) {
2006 DECODE_START(3, bl);
2007 ::decode(zonegroups, bl);
2008 ::decode(master_zonegroup, bl);
2009 if (struct_v >= 2)
2010 ::decode(bucket_quota, bl);
2011 if (struct_v >= 3)
2012 ::decode(user_quota, bl);
2013 DECODE_FINISH(bl);
2014
2015 zonegroups_by_api.clear();
2016 for (map<string, RGWZoneGroup>::iterator iter = zonegroups.begin();
2017 iter != zonegroups.end(); ++iter) {
2018 RGWZoneGroup& zonegroup = iter->second;
2019 zonegroups_by_api[zonegroup.api_name] = zonegroup;
2020 if (zonegroup.is_master_zonegroup()) {
2021 master_zonegroup = zonegroup.get_name();
2022 }
2023 }
2024 }
2025
2026 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
2027 {
2028 obj_version *check_objv = version_for_check();
2029
2030 if (check_objv) {
2031 cls_version_check(*op, *check_objv, VER_COND_EQ);
2032 }
2033
2034 cls_version_read(*op, &read_version);
2035 }
2036
2037 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
2038 {
2039 obj_version *check_objv = version_for_check();
2040 obj_version *modify_version = version_for_write();
2041
2042 if (check_objv) {
2043 cls_version_check(*op, *check_objv, VER_COND_EQ);
2044 }
2045
2046 if (modify_version) {
2047 cls_version_set(*op, *modify_version);
2048 } else {
2049 cls_version_inc(*op);
2050 }
2051 }
2052
2053 void RGWObjManifest::obj_iterator::operator++()
2054 {
2055 if (manifest->explicit_objs) {
2056 ++explicit_iter;
2057
2058 if (explicit_iter == manifest->objs.end()) {
2059 ofs = manifest->obj_size;
2060 return;
2061 }
2062
2063 update_explicit_pos();
2064
2065 update_location();
2066 return;
2067 }
2068
2069 uint64_t obj_size = manifest->get_obj_size();
2070 uint64_t head_size = manifest->get_head_size();
2071
2072 if (ofs == obj_size) {
2073 return;
2074 }
2075
2076 if (manifest->rules.empty()) {
2077 return;
2078 }
2079
2080 /* are we still pointing at the head? */
2081 if (ofs < head_size) {
2082 rule_iter = manifest->rules.begin();
2083 RGWObjManifestRule *rule = &rule_iter->second;
2084 ofs = MIN(head_size, obj_size);
2085 stripe_ofs = ofs;
2086 cur_stripe = 1;
2087 stripe_size = MIN(obj_size - ofs, rule->stripe_max_size);
2088 if (rule->part_size > 0) {
2089 stripe_size = MIN(stripe_size, rule->part_size);
2090 }
2091 update_location();
2092 return;
2093 }
2094
2095 RGWObjManifestRule *rule = &rule_iter->second;
2096
2097 stripe_ofs += rule->stripe_max_size;
2098 cur_stripe++;
2099 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
2100
2101 if (rule->part_size > 0) {
2102 /* multi part, multi stripes object */
2103
2104 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2105
2106 if (stripe_ofs >= part_ofs + rule->part_size) {
2107 /* moved to the next part */
2108 cur_stripe = 0;
2109 part_ofs += rule->part_size;
2110 stripe_ofs = part_ofs;
2111
2112 bool last_rule = (next_rule_iter == manifest->rules.end());
2113 /* move to the next rule? */
2114 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
2115 rule_iter = next_rule_iter;
2116 last_rule = (next_rule_iter == manifest->rules.end());
2117 if (!last_rule) {
2118 ++next_rule_iter;
2119 }
2120 cur_part_id = rule_iter->second.start_part_num;
2121 } else {
2122 cur_part_id++;
2123 }
2124
2125 rule = &rule_iter->second;
2126 }
2127
2128 stripe_size = MIN(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
2129 }
2130
2131 cur_override_prefix = rule->override_prefix;
2132
2133 ofs = stripe_ofs;
2134 if (ofs > obj_size) {
2135 ofs = obj_size;
2136 stripe_ofs = ofs;
2137 stripe_size = 0;
2138 }
2139
2140 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
2141 update_location();
2142 }
2143
2144 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m, const string& placement_rule, rgw_bucket& _b, rgw_obj& _obj)
2145 {
2146 manifest = _m;
2147
2148 manifest->set_tail_placement(placement_rule, _b);
2149 manifest->set_head(placement_rule, _obj, 0);
2150 last_ofs = 0;
2151
2152 if (manifest->get_prefix().empty()) {
2153 char buf[33];
2154 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
2155
2156 string oid_prefix = ".";
2157 oid_prefix.append(buf);
2158 oid_prefix.append("_");
2159
2160 manifest->set_prefix(oid_prefix);
2161 }
2162
2163 bool found = manifest->get_rule(0, &rule);
2164 if (!found) {
2165 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
2166 return -EIO;
2167 }
2168
2169 uint64_t head_size = manifest->get_head_size();
2170
2171 if (head_size > 0) {
2172 cur_stripe_size = head_size;
2173 } else {
2174 cur_stripe_size = rule.stripe_max_size;
2175 }
2176
2177 cur_part_id = rule.start_part_num;
2178
2179 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
2180
2181 // Normal object which not generated through copy operation
2182 manifest->set_tail_instance(_obj.key.instance);
2183
2184 manifest->update_iterators();
2185
2186 return 0;
2187 }
2188
2189 int RGWObjManifest::generator::create_next(uint64_t ofs)
2190 {
2191 if (ofs < last_ofs) /* only going forward */
2192 return -EINVAL;
2193
2194 uint64_t max_head_size = manifest->get_max_head_size();
2195
2196 if (ofs < max_head_size) {
2197 manifest->set_head_size(ofs);
2198 }
2199
2200 if (ofs >= max_head_size) {
2201 manifest->set_head_size(max_head_size);
2202 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
2203 cur_stripe_size = rule.stripe_max_size;
2204
2205 if (cur_part_id == 0 && max_head_size > 0) {
2206 cur_stripe++;
2207 }
2208 }
2209
2210 last_ofs = ofs;
2211 manifest->set_obj_size(ofs);
2212
2213 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
2214
2215 manifest->update_iterators();
2216
2217 return 0;
2218 }
2219
2220 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
2221 {
2222 return begin_iter;
2223 }
2224
2225 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
2226 {
2227 return end_iter;
2228 }
2229
2230 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
2231 {
2232 if (ofs > obj_size) {
2233 ofs = obj_size;
2234 }
2235 RGWObjManifest::obj_iterator iter(this);
2236 iter.seek(ofs);
2237 return iter;
2238 }
2239
2240 int RGWObjManifest::append(RGWObjManifest& m, RGWZoneGroup& zonegroup, RGWZoneParams& zone_params)
2241 {
2242 if (explicit_objs || m.explicit_objs) {
2243 return append_explicit(m, zonegroup, zone_params);
2244 }
2245
2246 if (rules.empty()) {
2247 *this = m;
2248 return 0;
2249 }
2250
2251 string override_prefix;
2252
2253 if (prefix.empty()) {
2254 prefix = m.prefix;
2255 }
2256
2257 if (prefix != m.prefix) {
2258 override_prefix = m.prefix;
2259 }
2260
2261 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
2262 if (miter == m.rules.end()) {
2263 return append_explicit(m, zonegroup, zone_params);
2264 }
2265
2266 for (; miter != m.rules.end(); ++miter) {
2267 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
2268
2269 RGWObjManifestRule& rule = last_rule->second;
2270
2271 if (rule.part_size == 0) {
2272 rule.part_size = obj_size - rule.start_ofs;
2273 }
2274
2275 RGWObjManifestRule& next_rule = miter->second;
2276 if (!next_rule.part_size) {
2277 next_rule.part_size = m.obj_size - next_rule.start_ofs;
2278 }
2279
2280 string rule_prefix = prefix;
2281 if (!rule.override_prefix.empty()) {
2282 rule_prefix = rule.override_prefix;
2283 }
2284
2285 string next_rule_prefix = m.prefix;
2286 if (!next_rule.override_prefix.empty()) {
2287 next_rule_prefix = next_rule.override_prefix;
2288 }
2289
2290 if (rule.part_size != next_rule.part_size ||
2291 rule.stripe_max_size != next_rule.stripe_max_size ||
2292 rule_prefix != next_rule_prefix) {
2293 if (next_rule_prefix != prefix) {
2294 append_rules(m, miter, &next_rule_prefix);
2295 } else {
2296 append_rules(m, miter, NULL);
2297 }
2298 break;
2299 }
2300
2301 uint64_t expected_part_num = rule.start_part_num + 1;
2302 if (rule.part_size > 0) {
2303 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
2304 }
2305
2306 if (expected_part_num != next_rule.start_part_num) {
2307 append_rules(m, miter, NULL);
2308 break;
2309 }
2310 }
2311
2312 set_obj_size(obj_size + m.obj_size);
2313
2314 return 0;
2315 }
2316
2317 int RGWObjManifest::append(RGWObjManifest& m, RGWRados *store)
2318 {
2319 return append(m, store->get_zonegroup(), store->get_zone_params());
2320 }
2321
2322 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
2323 string *override_prefix)
2324 {
2325 for (; miter != m.rules.end(); ++miter) {
2326 RGWObjManifestRule rule = miter->second;
2327 rule.start_ofs += obj_size;
2328 if (override_prefix)
2329 rule.override_prefix = *override_prefix;
2330 rules[rule.start_ofs] = rule;
2331 }
2332 }
2333
2334 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2335 {
2336 if (explicit_objs) {
2337 return;
2338 }
2339 obj_iterator iter = obj_begin();
2340
2341 while (iter != obj_end()) {
2342 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
2343 const rgw_obj_select& os = iter.get_location();
2344 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
2345 part.loc_ofs = 0;
2346
2347 uint64_t ofs = iter.get_stripe_ofs();
2348
2349 if (ofs == 0) {
2350 part.loc = obj;
2351 } else {
2352 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
2353 }
2354 ++iter;
2355 uint64_t next_ofs = iter.get_stripe_ofs();
2356
2357 part.size = next_ofs - ofs;
2358 }
2359
2360 explicit_objs = true;
2361 rules.clear();
2362 prefix.clear();
2363 }
2364
2365 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
2366 {
2367 if (!explicit_objs) {
2368 convert_to_explicit(zonegroup, zone_params);
2369 }
2370 if (!m.explicit_objs) {
2371 m.convert_to_explicit(zonegroup, zone_params);
2372 }
2373 map<uint64_t, RGWObjManifestPart>::iterator iter;
2374 uint64_t base = obj_size;
2375 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
2376 RGWObjManifestPart& part = iter->second;
2377 objs[base + iter->first] = part;
2378 }
2379 obj_size += m.obj_size;
2380
2381 return 0;
2382 }
2383
2384 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
2385 {
2386 if (rules.empty()) {
2387 return false;
2388 }
2389
2390 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
2391 if (iter != rules.begin()) {
2392 --iter;
2393 }
2394
2395 *rule = iter->second;
2396
2397 return true;
2398 }
2399
2400 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
2401 {
2402 write_version.ver = 1;
2403 #define TAG_LEN 24
2404
2405 write_version.tag.clear();
2406 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
2407 }
2408
2409 int RGWPutObjProcessor::complete(size_t accounted_size, const string& etag,
2410 real_time *mtime, real_time set_mtime,
2411 map<string, bufferlist>& attrs, real_time delete_at,
2412 const char *if_match, const char *if_nomatch, const string *user_data,
2413 rgw_zone_set *zones_trace)
2414 {
2415 int r = do_complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at, if_match, if_nomatch, user_data, zones_trace);
2416 if (r < 0)
2417 return r;
2418
2419 is_complete = !canceled;
2420 return 0;
2421 }
2422
2423 CephContext *RGWPutObjProcessor::ctx()
2424 {
2425 return store->ctx();
2426 }
2427
2428 RGWPutObjProcessor_Aio::~RGWPutObjProcessor_Aio()
2429 {
2430 drain_pending();
2431
2432 if (is_complete)
2433 return;
2434
2435 set<rgw_raw_obj>::iterator iter;
2436 bool need_to_remove_head = false;
2437 rgw_raw_obj raw_head;
2438
2439 if (!head_obj.empty()) {
2440 store->obj_to_raw(bucket_info.placement_rule, head_obj, &raw_head);
2441 }
2442
2443 /**
2444 * We should delete the object in the "multipart" namespace to avoid race condition.
2445 * Such race condition is caused by the fact that the multipart object is the gatekeeper of a multipart
2446 * upload, when it is deleted, a second upload would start with the same suffix("2/"), therefore, objects
2447 * written by the second upload may be deleted by the first upload.
2448 * details is describled on #11749
2449 *
2450 * The above comment still stands, but instead of searching for a specific object in the multipart
2451 * namespace, we just make sure that we remove the object that is marked as the head object after
2452 * we remove all the other raw objects. Note that we use different call to remove the head object,
2453 * as this one needs to go via the bucket index prepare/complete 2-phase commit scheme.
2454 */
2455 for (iter = written_objs.begin(); iter != written_objs.end(); ++iter) {
2456 const rgw_raw_obj& obj = *iter;
2457 if (!head_obj.empty() && obj == raw_head) {
2458 ldout(store->ctx(), 5) << "NOTE: we should not process the head object (" << obj << ") here" << dendl;
2459 need_to_remove_head = true;
2460 continue;
2461 }
2462
2463 int r = store->delete_raw_obj(obj);
2464 if (r < 0 && r != -ENOENT) {
2465 ldout(store->ctx(), 5) << "WARNING: failed to remove obj (" << obj << "), leaked" << dendl;
2466 }
2467 }
2468
2469 if (need_to_remove_head) {
2470 ldout(store->ctx(), 5) << "NOTE: we are going to process the head obj (" << raw_head << ")" << dendl;
2471 int r = store->delete_obj(obj_ctx, bucket_info, head_obj, 0, 0);
2472 if (r < 0 && r != -ENOENT) {
2473 ldout(store->ctx(), 0) << "WARNING: failed to remove obj (" << raw_head << "), leaked" << dendl;
2474 }
2475 }
2476 }
2477
2478 int RGWPutObjProcessor_Aio::handle_obj_data(rgw_raw_obj& obj, bufferlist& bl, off_t ofs, off_t abs_ofs, void **phandle, bool exclusive)
2479 {
2480 if ((uint64_t)abs_ofs + bl.length() > obj_len)
2481 obj_len = abs_ofs + bl.length();
2482
2483 if (!(obj == last_written_obj)) {
2484 last_written_obj = obj;
2485 }
2486
2487 // For the first call pass -1 as the offset to
2488 // do a write_full.
2489 return store->aio_put_obj_data(NULL, obj, bl, ((ofs != 0) ? ofs : -1), exclusive, phandle);
2490 }
2491
2492 struct put_obj_aio_info RGWPutObjProcessor_Aio::pop_pending()
2493 {
2494 struct put_obj_aio_info info;
2495 info = pending.front();
2496 pending.pop_front();
2497 pending_size -= info.size;
2498 return info;
2499 }
2500
2501 int RGWPutObjProcessor_Aio::wait_pending_front()
2502 {
2503 if (pending.empty()) {
2504 return 0;
2505 }
2506 struct put_obj_aio_info info = pop_pending();
2507 int ret = store->aio_wait(info.handle);
2508
2509 if (ret >= 0) {
2510 add_written_obj(info.obj);
2511 }
2512
2513 return ret;
2514 }
2515
2516 bool RGWPutObjProcessor_Aio::pending_has_completed()
2517 {
2518 if (pending.empty())
2519 return false;
2520
2521 struct put_obj_aio_info& info = pending.front();
2522 return store->aio_completed(info.handle);
2523 }
2524
2525 int RGWPutObjProcessor_Aio::drain_pending()
2526 {
2527 int ret = 0;
2528 while (!pending.empty()) {
2529 int r = wait_pending_front();
2530 if (r < 0)
2531 ret = r;
2532 }
2533 return ret;
2534 }
2535
2536 int RGWPutObjProcessor_Aio::throttle_data(void *handle, const rgw_raw_obj& obj, uint64_t size, bool need_to_wait)
2537 {
2538 bool _wait = need_to_wait;
2539
2540 if (handle) {
2541 struct put_obj_aio_info info;
2542 info.handle = handle;
2543 info.obj = obj;
2544 info.size = size;
2545 pending_size += size;
2546 pending.push_back(info);
2547 }
2548 size_t orig_size = pending_size;
2549
2550 /* first drain complete IOs */
2551 while (pending_has_completed()) {
2552 int r = wait_pending_front();
2553 if (r < 0)
2554 return r;
2555
2556 _wait = false;
2557 }
2558
2559 /* resize window in case messages are draining too fast */
2560 if (orig_size - pending_size >= window_size) {
2561 window_size += store->ctx()->_conf->rgw_max_chunk_size;
2562 uint64_t max_window_size = store->ctx()->_conf->rgw_put_obj_max_window_size;
2563 if (window_size > max_window_size) {
2564 window_size = max_window_size;
2565 }
2566 }
2567
2568 /* now throttle. Note that need_to_wait should only affect the first IO operation */
2569 if (pending_size > window_size || _wait) {
2570 int r = wait_pending_front();
2571 if (r < 0)
2572 return r;
2573 }
2574 return 0;
2575 }
2576
2577 int RGWPutObjProcessor_Atomic::write_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool exclusive)
2578 {
2579 if (ofs >= next_part_ofs) {
2580 int r = prepare_next_part(ofs);
2581 if (r < 0) {
2582 return r;
2583 }
2584 }
2585
2586 *pobj = cur_obj;
2587
2588 if (!bl.length()) {
2589 *phandle = nullptr;
2590 return 0;
2591 }
2592
2593 return RGWPutObjProcessor_Aio::handle_obj_data(cur_obj, bl, ofs - cur_part_ofs, ofs, phandle, exclusive);
2594 }
2595
2596 int RGWPutObjProcessor_Aio::prepare(RGWRados *store, string *oid_rand)
2597 {
2598 RGWPutObjProcessor::prepare(store, oid_rand);
2599
2600 window_size = store->ctx()->_conf->rgw_put_obj_min_window_size;
2601
2602 return 0;
2603 }
2604
2605 int RGWPutObjProcessor_Atomic::handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj, bool *again)
2606 {
2607 *phandle = NULL;
2608 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2609
2610 pending_data_bl.claim_append(bl);
2611 if (pending_data_bl.length() < max_write_size) {
2612 *again = false;
2613 return 0;
2614 }
2615
2616 pending_data_bl.splice(0, max_write_size, &bl);
2617
2618 /* do we have enough data pending accumulated that needs to be written? */
2619 *again = (pending_data_bl.length() >= max_chunk_size);
2620
2621 if (!data_ofs && !immutable_head()) {
2622 first_chunk.claim(bl);
2623 obj_len = (uint64_t)first_chunk.length();
2624 int r = prepare_next_part(obj_len);
2625 if (r < 0) {
2626 return r;
2627 }
2628 data_ofs = obj_len;
2629 return 0;
2630 }
2631 off_t write_ofs = data_ofs;
2632 data_ofs = write_ofs + bl.length();
2633 bool exclusive = (!write_ofs && immutable_head()); /* immutable head object, need to verify nothing exists there
2634 we could be racing with another upload, to the same
2635 object and cleanup can be messy */
2636 int ret = write_data(bl, write_ofs, phandle, pobj, exclusive);
2637 if (ret >= 0) { /* we might return, need to clear bl as it was already sent */
2638 bl.clear();
2639 }
2640 return ret;
2641 }
2642
2643
2644 int RGWPutObjProcessor_Atomic::prepare_init(RGWRados *store, string *oid_rand)
2645 {
2646 RGWPutObjProcessor_Aio::prepare(store, oid_rand);
2647
2648 int r = store->get_max_chunk_size(bucket_info.placement_rule, head_obj, &max_chunk_size);
2649 if (r < 0) {
2650 return r;
2651 }
2652
2653 return 0;
2654 }
2655
2656 int RGWPutObjProcessor_Atomic::prepare(RGWRados *store, string *oid_rand)
2657 {
2658 head_obj.init(bucket, obj_str);
2659
2660 int r = prepare_init(store, oid_rand);
2661 if (r < 0) {
2662 return r;
2663 }
2664
2665 if (!version_id.empty()) {
2666 head_obj.key.set_instance(version_id);
2667 } else if (versioned_object) {
2668 store->gen_rand_obj_instance_name(&head_obj);
2669 }
2670
2671 manifest.set_trivial_rule(max_chunk_size, store->ctx()->_conf->rgw_obj_stripe_size);
2672
2673 r = manifest_gen.create_begin(store->ctx(), &manifest, bucket_info.placement_rule, head_obj.bucket, head_obj);
2674 if (r < 0) {
2675 return r;
2676 }
2677
2678 return 0;
2679 }
2680
2681 int RGWPutObjProcessor_Atomic::prepare_next_part(off_t ofs) {
2682
2683 int ret = manifest_gen.create_next(ofs);
2684 if (ret < 0) {
2685 lderr(store->ctx()) << "ERROR: manifest_gen.create_next() returned ret=" << ret << dendl;
2686 return ret;
2687 }
2688 cur_part_ofs = ofs;
2689 next_part_ofs = ofs + manifest_gen.cur_stripe_max_size();
2690 cur_obj = manifest_gen.get_cur_obj(store);
2691
2692 return 0;
2693 }
2694
2695 int RGWPutObjProcessor_Atomic::complete_parts()
2696 {
2697 if (obj_len > (uint64_t)cur_part_ofs) {
2698 return prepare_next_part(obj_len);
2699 }
2700 return 0;
2701 }
2702
2703 int RGWPutObjProcessor_Atomic::complete_writing_data()
2704 {
2705 if (!data_ofs && !immutable_head()) {
2706 /* only claim if pending_data_bl() is not empty. This is needed because we might be called twice
2707 * (e.g., when a retry due to race happens). So a second call to first_chunk.claim() would
2708 * clobber first_chunk
2709 */
2710 if (pending_data_bl.length() > 0) {
2711 first_chunk.claim(pending_data_bl);
2712 }
2713 obj_len = (uint64_t)first_chunk.length();
2714 }
2715 while (pending_data_bl.length()) {
2716 void *handle = nullptr;
2717 rgw_raw_obj obj;
2718 uint64_t max_write_size = MIN(max_chunk_size, (uint64_t)next_part_ofs - data_ofs);
2719 if (max_write_size > pending_data_bl.length()) {
2720 max_write_size = pending_data_bl.length();
2721 }
2722 bufferlist bl;
2723 pending_data_bl.splice(0, max_write_size, &bl);
2724 uint64_t write_len = bl.length();
2725 int r = write_data(bl, data_ofs, &handle, &obj, false);
2726 if (r < 0) {
2727 ldout(store->ctx(), 0) << "ERROR: write_data() returned " << r << dendl;
2728 return r;
2729 }
2730 data_ofs += write_len;
2731 r = throttle_data(handle, obj, write_len, false);
2732 if (r < 0) {
2733 ldout(store->ctx(), 0) << "ERROR: throttle_data() returned " << r << dendl;
2734 return r;
2735 }
2736
2737 if (data_ofs >= next_part_ofs) {
2738 r = prepare_next_part(data_ofs);
2739 if (r < 0) {
2740 ldout(store->ctx(), 0) << "ERROR: prepare_next_part() returned " << r << dendl;
2741 return r;
2742 }
2743 }
2744 }
2745 int r = complete_parts();
2746 if (r < 0) {
2747 return r;
2748 }
2749
2750 r = drain_pending();
2751 if (r < 0)
2752 return r;
2753
2754 return 0;
2755 }
2756
2757 int RGWPutObjProcessor_Atomic::do_complete(size_t accounted_size, const string& etag,
2758 real_time *mtime, real_time set_mtime,
2759 map<string, bufferlist>& attrs,
2760 real_time delete_at,
2761 const char *if_match,
2762 const char *if_nomatch, const string *user_data,
2763 rgw_zone_set *zones_trace) {
2764 int r = complete_writing_data();
2765 if (r < 0)
2766 return r;
2767
2768 obj_ctx.obj.set_atomic(head_obj);
2769
2770 RGWRados::Object op_target(store, bucket_info, obj_ctx, head_obj);
2771
2772 /* some object types shouldn't be versioned, e.g., multipart parts */
2773 op_target.set_versioning_disabled(!versioned_object);
2774
2775 RGWRados::Object::Write obj_op(&op_target);
2776
2777 obj_op.meta.data = &first_chunk;
2778 obj_op.meta.manifest = &manifest;
2779 obj_op.meta.ptag = &unique_tag; /* use req_id as operation tag */
2780 obj_op.meta.if_match = if_match;
2781 obj_op.meta.if_nomatch = if_nomatch;
2782 obj_op.meta.mtime = mtime;
2783 obj_op.meta.set_mtime = set_mtime;
2784 obj_op.meta.owner = bucket_info.owner;
2785 obj_op.meta.flags = PUT_OBJ_CREATE;
2786 obj_op.meta.olh_epoch = olh_epoch;
2787 obj_op.meta.delete_at = delete_at;
2788 obj_op.meta.user_data = user_data;
2789 obj_op.meta.zones_trace = zones_trace;
2790 obj_op.meta.modify_tail = true;
2791
2792 r = obj_op.write_meta(obj_len, accounted_size, attrs);
2793 if (r < 0) {
2794 return r;
2795 }
2796
2797 canceled = obj_op.meta.canceled;
2798
2799 return 0;
2800 }
2801
2802 const char* RGWRados::admin_commands[4][3] = {
2803 { "cache list",
2804 "cache list name=filter,type=CephString,req=false",
2805 "cache list [filter_str]: list object cache, possibly matching substrings" },
2806 { "cache inspect",
2807 "cache inspect name=target,type=CephString,req=true",
2808 "cache inspect target: print cache element" },
2809 { "cache erase",
2810 "cache erase name=target,type=CephString,req=true",
2811 "cache erase target: erase element from cache" },
2812 { "cache zap",
2813 "cache zap",
2814 "cache zap: erase all elements from cache" }
2815 };
2816
2817
2818 int RGWRados::watch(const string& oid, uint64_t *watch_handle, librados::WatchCtx2 *ctx) {
2819 int r = control_pool_ctx.watch2(oid, watch_handle, ctx);
2820 if (r < 0)
2821 return r;
2822 return 0;
2823 }
2824
2825 int RGWRados::unwatch(uint64_t watch_handle)
2826 {
2827 int r = control_pool_ctx.unwatch2(watch_handle);
2828 if (r < 0) {
2829 ldout(cct, 0) << "ERROR: rados->unwatch2() returned r=" << r << dendl;
2830 return r;
2831 }
2832 r = rados[0].watch_flush();
2833 if (r < 0) {
2834 ldout(cct, 0) << "ERROR: rados->watch_flush() returned r=" << r << dendl;
2835 return r;
2836 }
2837 return 0;
2838 }
2839
2840 void RGWRados::add_watcher(int i)
2841 {
2842 ldout(cct, 20) << "add_watcher() i=" << i << dendl;
2843 Mutex::Locker l(watchers_lock);
2844 watchers_set.insert(i);
2845 if (watchers_set.size() == (size_t)num_watchers) {
2846 ldout(cct, 2) << "all " << num_watchers << " watchers are set, enabling cache" << dendl;
2847 set_cache_enabled(true);
2848 }
2849 }
2850
2851 void RGWRados::remove_watcher(int i)
2852 {
2853 ldout(cct, 20) << "remove_watcher() i=" << i << dendl;
2854 Mutex::Locker l(watchers_lock);
2855 size_t orig_size = watchers_set.size();
2856 watchers_set.erase(i);
2857 if (orig_size == (size_t)num_watchers &&
2858 watchers_set.size() < orig_size) { /* actually removed */
2859 ldout(cct, 2) << "removed watcher, disabling cache" << dendl;
2860 set_cache_enabled(false);
2861 }
2862 }
2863
2864 class RGWWatcher : public librados::WatchCtx2 {
2865 RGWRados *rados;
2866 int index;
2867 string oid;
2868 uint64_t watch_handle;
2869
2870 class C_ReinitWatch : public Context {
2871 RGWWatcher *watcher;
2872 public:
2873 explicit C_ReinitWatch(RGWWatcher *_watcher) : watcher(_watcher) {}
2874 void finish(int r) override {
2875 watcher->reinit();
2876 }
2877 };
2878 public:
2879 RGWWatcher(RGWRados *r, int i, const string& o) : rados(r), index(i), oid(o), watch_handle(0) {}
2880 void handle_notify(uint64_t notify_id,
2881 uint64_t cookie,
2882 uint64_t notifier_id,
2883 bufferlist& bl) override {
2884 ldout(rados->ctx(), 10) << "RGWWatcher::handle_notify() "
2885 << " notify_id " << notify_id
2886 << " cookie " << cookie
2887 << " notifier " << notifier_id
2888 << " bl.length()=" << bl.length() << dendl;
2889 rados->watch_cb(notify_id, cookie, notifier_id, bl);
2890
2891 bufferlist reply_bl; // empty reply payload
2892 rados->control_pool_ctx.notify_ack(oid, notify_id, cookie, reply_bl);
2893 }
2894 void handle_error(uint64_t cookie, int err) override {
2895 lderr(rados->ctx()) << "RGWWatcher::handle_error cookie " << cookie
2896 << " err " << cpp_strerror(err) << dendl;
2897 rados->remove_watcher(index);
2898 rados->schedule_context(new C_ReinitWatch(this));
2899 }
2900
2901 void reinit() {
2902 int ret = unregister_watch();
2903 if (ret < 0) {
2904 ldout(rados->ctx(), 0) << "ERROR: unregister_watch() returned ret=" << ret << dendl;
2905 return;
2906 }
2907 ret = register_watch();
2908 if (ret < 0) {
2909 ldout(rados->ctx(), 0) << "ERROR: register_watch() returned ret=" << ret << dendl;
2910 return;
2911 }
2912 }
2913
2914 int unregister_watch() {
2915 int r = rados->unwatch(watch_handle);
2916 if (r < 0) {
2917 return r;
2918 }
2919 rados->remove_watcher(index);
2920 return 0;
2921 }
2922
2923 int register_watch() {
2924 int r = rados->watch(oid, &watch_handle, this);
2925 if (r < 0) {
2926 return r;
2927 }
2928 rados->add_watcher(index);
2929 return 0;
2930 }
2931 };
2932
2933 class RGWMetaNotifierManager : public RGWCoroutinesManager {
2934 RGWRados *store;
2935 RGWHTTPManager http_manager;
2936
2937 public:
2938 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2939 http_manager(store->ctx(), completion_mgr) {
2940 http_manager.set_threaded();
2941 }
2942
2943 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
2944 rgw_http_param_pair pairs[] = { { "type", "metadata" },
2945 { "notify", NULL },
2946 { NULL, NULL } };
2947
2948 list<RGWCoroutinesStack *> stacks;
2949 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2950 RGWRESTConn *conn = iter->second;
2951 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2952 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2953
2954 stacks.push_back(stack);
2955 }
2956 return run(stacks);
2957 }
2958 };
2959
2960 class RGWDataNotifierManager : public RGWCoroutinesManager {
2961 RGWRados *store;
2962 RGWHTTPManager http_manager;
2963
2964 public:
2965 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
2966 http_manager(store->ctx(), completion_mgr) {
2967 http_manager.set_threaded();
2968 }
2969
2970 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
2971 rgw_http_param_pair pairs[] = { { "type", "data" },
2972 { "notify", NULL },
2973 { "source-zone", store->get_zone_params().get_id().c_str() },
2974 { NULL, NULL } };
2975
2976 list<RGWCoroutinesStack *> stacks;
2977 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
2978 RGWRESTConn *conn = iter->second;
2979 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
2980 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
2981
2982 stacks.push_back(stack);
2983 }
2984 return run(stacks);
2985 }
2986 };
2987
2988 class RGWRadosThread {
2989 class Worker : public Thread {
2990 CephContext *cct;
2991 RGWRadosThread *processor;
2992 Mutex lock;
2993 Cond cond;
2994
2995 void wait() {
2996 Mutex::Locker l(lock);
2997 cond.Wait(lock);
2998 };
2999
3000 void wait_interval(const utime_t& wait_time) {
3001 Mutex::Locker l(lock);
3002 cond.WaitInterval(lock, wait_time);
3003 }
3004
3005 public:
3006 Worker(CephContext *_cct, RGWRadosThread *_p) : cct(_cct), processor(_p), lock("RGWRadosThread::Worker") {}
3007 void *entry() override;
3008 void signal() {
3009 Mutex::Locker l(lock);
3010 cond.Signal();
3011 }
3012 };
3013
3014 Worker *worker;
3015
3016 protected:
3017 CephContext *cct;
3018 RGWRados *store;
3019
3020 std::atomic<bool> down_flag = { false };
3021
3022 string thread_name;
3023
3024 virtual uint64_t interval_msec() = 0;
3025 virtual void stop_process() {}
3026 public:
3027 RGWRadosThread(RGWRados *_store, const string& thread_name = "radosgw")
3028 : worker(NULL), cct(_store->ctx()), store(_store), thread_name(thread_name) {}
3029 virtual ~RGWRadosThread() {
3030 stop();
3031 }
3032
3033 virtual int init() { return 0; }
3034 virtual int process() = 0;
3035
3036 bool going_down() { return down_flag; }
3037
3038 void start();
3039 void stop();
3040
3041 void signal() {
3042 if (worker) {
3043 worker->signal();
3044 }
3045 }
3046 };
3047
3048 void RGWRadosThread::start()
3049 {
3050 worker = new Worker(cct, this);
3051 worker->create(thread_name.c_str());
3052 }
3053
3054 void RGWRadosThread::stop()
3055 {
3056 down_flag = true;
3057 stop_process();
3058 if (worker) {
3059 worker->signal();
3060 worker->join();
3061 }
3062 delete worker;
3063 worker = NULL;
3064 }
3065
3066 void *RGWRadosThread::Worker::entry() {
3067 uint64_t msec = processor->interval_msec();
3068 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3069
3070 do {
3071 utime_t start = ceph_clock_now();
3072 int r = processor->process();
3073 if (r < 0) {
3074 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
3075 }
3076
3077 if (processor->going_down())
3078 break;
3079
3080 utime_t end = ceph_clock_now();
3081 end -= start;
3082
3083 uint64_t cur_msec = processor->interval_msec();
3084 if (cur_msec != msec) { /* was it reconfigured? */
3085 msec = cur_msec;
3086 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
3087 }
3088
3089 if (cur_msec > 0) {
3090 if (interval <= end)
3091 continue; // next round
3092
3093 utime_t wait_time = interval;
3094 wait_time -= end;
3095
3096 wait_interval(wait_time);
3097 } else {
3098 wait();
3099 }
3100 } while (!processor->going_down());
3101
3102 return NULL;
3103 }
3104
3105 class RGWMetaNotifier : public RGWRadosThread {
3106 RGWMetaNotifierManager notify_mgr;
3107 RGWMetadataLog *const log;
3108
3109 uint64_t interval_msec() override {
3110 return cct->_conf->rgw_md_notify_interval_msec;
3111 }
3112 public:
3113 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
3114 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
3115
3116 int process() override;
3117 };
3118
3119 int RGWMetaNotifier::process()
3120 {
3121 set<int> shards;
3122
3123 log->read_clear_modified(shards);
3124
3125 if (shards.empty()) {
3126 return 0;
3127 }
3128
3129 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3130 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
3131 }
3132
3133 notify_mgr.notify_all(store->zone_conn_map, shards);
3134
3135 return 0;
3136 }
3137
3138 class RGWDataNotifier : public RGWRadosThread {
3139 RGWDataNotifierManager notify_mgr;
3140
3141 uint64_t interval_msec() override {
3142 return cct->_conf->get_val<int64_t>("rgw_data_notify_interval_msec");
3143 }
3144 public:
3145 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
3146
3147 int process() override;
3148 };
3149
3150 int RGWDataNotifier::process()
3151 {
3152 if (!store->data_log) {
3153 return 0;
3154 }
3155
3156 map<int, set<string> > shards;
3157
3158 store->data_log->read_clear_modified(shards);
3159
3160 if (shards.empty()) {
3161 return 0;
3162 }
3163
3164 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
3165 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
3166 }
3167
3168 notify_mgr.notify_all(store->zone_data_notify_to_map, shards);
3169
3170 return 0;
3171 }
3172
3173 class RGWSyncProcessorThread : public RGWRadosThread {
3174 public:
3175 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
3176 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
3177 ~RGWSyncProcessorThread() override {}
3178 int init() override = 0 ;
3179 int process() override = 0;
3180 };
3181
3182 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
3183 {
3184 RGWMetaSyncStatusManager sync;
3185
3186 uint64_t interval_msec() override {
3187 return 0; /* no interval associated, it'll run once until stopped */
3188 }
3189 void stop_process() override {
3190 sync.stop();
3191 }
3192 public:
3193 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
3194 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
3195
3196 void wakeup_sync_shards(set<int>& shard_ids) {
3197 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3198 sync.wakeup(*iter);
3199 }
3200 }
3201 RGWMetaSyncStatusManager* get_manager() { return &sync; }
3202
3203 int init() override {
3204 int ret = sync.init();
3205 if (ret < 0) {
3206 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
3207 return ret;
3208 }
3209 return 0;
3210 }
3211
3212 int process() override {
3213 sync.run();
3214 return 0;
3215 }
3216 };
3217
3218 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
3219 {
3220 RGWDataSyncStatusManager sync;
3221 bool initialized;
3222
3223 uint64_t interval_msec() override {
3224 if (initialized) {
3225 return 0; /* no interval associated, it'll run once until stopped */
3226 } else {
3227 #define DATA_SYNC_INIT_WAIT_SEC 20
3228 return DATA_SYNC_INIT_WAIT_SEC * 1000;
3229 }
3230 }
3231 void stop_process() override {
3232 sync.stop();
3233 }
3234 public:
3235 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
3236 const string& _source_zone,
3237 rgw::BucketChangeObserver *observer)
3238 : RGWSyncProcessorThread(_store, "data-sync"),
3239 sync(_store, async_rados, _source_zone, observer),
3240 initialized(false) {}
3241
3242 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
3243 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
3244 sync.wakeup(iter->first, iter->second);
3245 }
3246 }
3247 RGWDataSyncStatusManager* get_manager() { return &sync; }
3248
3249 int init() override {
3250 return 0;
3251 }
3252
3253 int process() override {
3254 while (!initialized) {
3255 if (going_down()) {
3256 return 0;
3257 }
3258 int ret = sync.init();
3259 if (ret >= 0) {
3260 initialized = true;
3261 break;
3262 }
3263 /* we'll be back! */
3264 return 0;
3265 }
3266 sync.run();
3267 return 0;
3268 }
3269 };
3270
3271 class RGWSyncLogTrimThread : public RGWSyncProcessorThread
3272 {
3273 RGWCoroutinesManager crs;
3274 RGWRados *store;
3275 rgw::BucketTrimManager *bucket_trim;
3276 RGWHTTPManager http;
3277 const utime_t trim_interval;
3278
3279 uint64_t interval_msec() override { return 0; }
3280 void stop_process() override { crs.stop(); }
3281 public:
3282 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
3283 int interval)
3284 : RGWSyncProcessorThread(store, "sync-log-trim"),
3285 crs(store->ctx(), store->get_cr_registry()), store(store),
3286 bucket_trim(bucket_trim),
3287 http(store->ctx(), crs.get_completion_mgr()),
3288 trim_interval(interval, 0)
3289 {}
3290
3291 int init() override {
3292 return http.set_threaded();
3293 }
3294 int process() override {
3295 list<RGWCoroutinesStack*> stacks;
3296 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
3297 meta->call(create_meta_log_trim_cr(store, &http,
3298 cct->_conf->rgw_md_log_max_shards,
3299 trim_interval));
3300 stacks.push_back(meta);
3301
3302 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
3303 data->call(create_data_log_trim_cr(store, &http,
3304 cct->_conf->rgw_data_log_num_shards,
3305 trim_interval));
3306 stacks.push_back(data);
3307
3308 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
3309 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
3310 stacks.push_back(bucket);
3311
3312 crs.run(stacks);
3313 return 0;
3314 }
3315 };
3316
3317 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
3318 {
3319 Mutex::Locker l(meta_sync_thread_lock);
3320 if (meta_sync_processor_thread) {
3321 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
3322 }
3323 }
3324
3325 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
3326 {
3327 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
3328 Mutex::Locker l(data_sync_thread_lock);
3329 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
3330 if (iter == data_sync_processor_threads.end()) {
3331 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
3332 return;
3333 }
3334
3335 RGWDataSyncProcessorThread *thread = iter->second;
3336 assert(thread);
3337 thread->wakeup_sync_shards(shard_ids);
3338 }
3339
3340 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
3341 {
3342 Mutex::Locker l(meta_sync_thread_lock);
3343 if (meta_sync_processor_thread) {
3344 return meta_sync_processor_thread->get_manager();
3345 }
3346 return nullptr;
3347 }
3348
3349 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
3350 {
3351 Mutex::Locker l(data_sync_thread_lock);
3352 auto thread = data_sync_processor_threads.find(source_zone);
3353 if (thread == data_sync_processor_threads.end()) {
3354 return nullptr;
3355 }
3356 return thread->second->get_manager();
3357 }
3358
3359 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
3360 {
3361 IoCtx ioctx;
3362 int r = open_pool_ctx(pool, ioctx);
3363 if (r < 0) {
3364 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
3365 return r;
3366 }
3367
3368 bool requires;
3369 r = ioctx.pool_requires_alignment2(&requires);
3370 if (r < 0) {
3371 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
3372 << r << dendl;
3373 return r;
3374 }
3375
3376 if (!requires) {
3377 *alignment = 0;
3378 return 0;
3379 }
3380
3381 uint64_t align;
3382 r = ioctx.pool_required_alignment2(&align);
3383 if (r < 0) {
3384 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
3385 << r << dendl;
3386 return r;
3387 }
3388 if (align != 0) {
3389 ldout(cct, 20) << "required alignment=" << align << dendl;
3390 }
3391 *alignment = align;
3392 return 0;
3393 }
3394
3395 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size)
3396 {
3397 uint64_t alignment = 0;
3398 int r = get_required_alignment(pool, &alignment);
3399 if (r < 0) {
3400 return r;
3401 }
3402
3403 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
3404
3405 if (alignment == 0) {
3406 *max_chunk_size = config_chunk_size;
3407 return 0;
3408 }
3409
3410 if (config_chunk_size <= alignment) {
3411 *max_chunk_size = alignment;
3412 return 0;
3413 }
3414
3415 *max_chunk_size = config_chunk_size - (config_chunk_size % alignment);
3416
3417 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
3418
3419 return 0;
3420 }
3421
3422 int RGWRados::get_max_chunk_size(const string& placement_rule, const rgw_obj& obj, uint64_t *max_chunk_size)
3423 {
3424 rgw_pool pool;
3425 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
3426 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
3427 return -EIO;
3428 }
3429 return get_max_chunk_size(pool, max_chunk_size);
3430 }
3431
3432 class RGWIndexCompletionManager;
3433
3434 struct complete_op_data {
3435 Mutex lock{"complete_op_data"};
3436 AioCompletion *rados_completion{nullptr};
3437 int manager_shard_id{-1};
3438 RGWIndexCompletionManager *manager{nullptr};
3439 rgw_obj obj;
3440 RGWModifyOp op;
3441 string tag;
3442 rgw_bucket_entry_ver ver;
3443 cls_rgw_obj_key key;
3444 rgw_bucket_dir_entry_meta dir_meta;
3445 list<cls_rgw_obj_key> remove_objs;
3446 bool log_op;
3447 uint16_t bilog_op;
3448 rgw_zone_set zones_trace;
3449
3450 bool stopped{false};
3451
3452 void stop() {
3453 Mutex::Locker l(lock);
3454 stopped = true;
3455 }
3456 };
3457
3458 class RGWIndexCompletionThread : public RGWRadosThread {
3459 RGWRados *store;
3460
3461 uint64_t interval_msec() override {
3462 return 0;
3463 }
3464
3465 list<complete_op_data *> completions;
3466
3467 Mutex completions_lock;
3468 public:
3469 RGWIndexCompletionThread(RGWRados *_store)
3470 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
3471
3472 int process() override;
3473
3474 void add_completion(complete_op_data *completion) {
3475 {
3476 Mutex::Locker l(completions_lock);
3477 completions.push_back(completion);
3478 }
3479
3480 signal();
3481 }
3482 };
3483
3484 int RGWIndexCompletionThread::process()
3485 {
3486 list<complete_op_data *> comps;
3487
3488 {
3489 Mutex::Locker l(completions_lock);
3490 completions.swap(comps);
3491 }
3492
3493 for (auto c : comps) {
3494 std::unique_ptr<complete_op_data> up{c};
3495
3496 if (going_down()) {
3497 continue;
3498 }
3499 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
3500
3501 RGWRados::BucketShard bs(store);
3502
3503 int r = bs.init(c->obj.bucket, c->obj);
3504 if (r < 0) {
3505 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
3506 /* not much to do */
3507 continue;
3508 }
3509
3510 r = store->guard_reshard(&bs, c->obj, [&](RGWRados::BucketShard *bs) -> int {
3511 librados::ObjectWriteOperation o;
3512 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
3513 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
3514 c->log_op, c->bilog_op, &c->zones_trace);
3515
3516 return bs->index_ctx.operate(bs->bucket_obj, &o);
3517 });
3518 if (r < 0) {
3519 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
3520 /* ignoring error, can't do anything about it */
3521 continue;
3522 }
3523 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
3524 if (r < 0) {
3525 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
3526 }
3527 }
3528
3529 return 0;
3530 }
3531
3532 class RGWIndexCompletionManager {
3533 RGWRados *store{nullptr};
3534 vector<Mutex *> locks;
3535 vector<set<complete_op_data *> > completions;
3536
3537 RGWIndexCompletionThread *completion_thread{nullptr};
3538
3539 int num_shards;
3540
3541 std::atomic<int> cur_shard {0};
3542
3543
3544 public:
3545 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
3546 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
3547
3548 for (int i = 0; i < num_shards; i++) {
3549 char buf[64];
3550 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
3551 locks.push_back(new Mutex(buf));
3552 }
3553
3554 completions.resize(num_shards);
3555 }
3556 ~RGWIndexCompletionManager() {
3557 stop();
3558
3559 for (auto l : locks) {
3560 delete l;
3561 }
3562 }
3563
3564 int next_shard() {
3565 int result = cur_shard % num_shards;
3566 cur_shard++;
3567 return result;
3568 }
3569
3570 void create_completion(const rgw_obj& obj,
3571 RGWModifyOp op, string& tag,
3572 rgw_bucket_entry_ver& ver,
3573 const cls_rgw_obj_key& key,
3574 rgw_bucket_dir_entry_meta& dir_meta,
3575 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3576 uint16_t bilog_op,
3577 rgw_zone_set *zones_trace,
3578 complete_op_data **result);
3579 bool handle_completion(completion_t cb, complete_op_data *arg);
3580
3581 int start() {
3582 completion_thread = new RGWIndexCompletionThread(store);
3583 int ret = completion_thread->init();
3584 if (ret < 0) {
3585 return ret;
3586 }
3587 completion_thread->start();
3588 return 0;
3589 }
3590 void stop() {
3591 if (completion_thread) {
3592 completion_thread->stop();
3593 delete completion_thread;
3594 }
3595
3596 for (int i = 0; i < num_shards; ++i) {
3597 Mutex::Locker l(*locks[i]);
3598 for (auto c : completions[i]) {
3599 Mutex::Locker cl(c->lock);
3600 c->stop();
3601 }
3602 }
3603 completions.clear();
3604 }
3605 };
3606
3607 static void obj_complete_cb(completion_t cb, void *arg)
3608 {
3609 complete_op_data *completion = (complete_op_data *)arg;
3610 completion->lock.Lock();
3611 if (completion->stopped) {
3612 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
3613 delete completion;
3614 return;
3615 }
3616 bool need_delete = completion->manager->handle_completion(cb, completion);
3617 completion->lock.Unlock();
3618 if (need_delete) {
3619 delete completion;
3620 }
3621 }
3622
3623
3624 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
3625 RGWModifyOp op, string& tag,
3626 rgw_bucket_entry_ver& ver,
3627 const cls_rgw_obj_key& key,
3628 rgw_bucket_dir_entry_meta& dir_meta,
3629 list<cls_rgw_obj_key> *remove_objs, bool log_op,
3630 uint16_t bilog_op,
3631 rgw_zone_set *zones_trace,
3632 complete_op_data **result)
3633 {
3634 complete_op_data *entry = new complete_op_data;
3635
3636 int shard_id = next_shard();
3637
3638 entry->manager_shard_id = shard_id;
3639 entry->manager = this;
3640 entry->obj = obj;
3641 entry->op = op;
3642 entry->tag = tag;
3643 entry->ver = ver;
3644 entry->key = key;
3645 entry->dir_meta = dir_meta;
3646 entry->log_op = log_op;
3647 entry->bilog_op = bilog_op;
3648
3649 if (remove_objs) {
3650 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
3651 entry->remove_objs.push_back(*iter);
3652 }
3653 }
3654
3655 if (zones_trace) {
3656 entry->zones_trace = *zones_trace;
3657 } else {
3658 entry->zones_trace.insert(store->get_zone().id);
3659 }
3660
3661 *result = entry;
3662
3663 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
3664
3665 Mutex::Locker l(*locks[shard_id]);
3666 completions[shard_id].insert(entry);
3667 }
3668
3669 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
3670 {
3671 int shard_id = arg->manager_shard_id;
3672 {
3673 Mutex::Locker l(*locks[shard_id]);
3674
3675 auto& comps = completions[shard_id];
3676
3677 auto iter = comps.find(arg);
3678 if (iter == comps.end()) {
3679 return true;
3680 }
3681
3682 comps.erase(iter);
3683 }
3684
3685 int r = rados_aio_get_return_value(cb);
3686 if (r != -ERR_BUSY_RESHARDING) {
3687 return true;
3688 }
3689 completion_thread->add_completion(arg);
3690 return false;
3691 }
3692
3693 void RGWRados::finalize()
3694 {
3695 auto admin_socket = cct->get_admin_socket();
3696 for (auto cmd : admin_commands) {
3697 int r = admin_socket->unregister_command(cmd[0]);
3698 if (r < 0) {
3699 lderr(cct) << "ERROR: fail to unregister admin socket command (r=" << r
3700 << ")" << dendl;
3701 }
3702 }
3703
3704 if (run_sync_thread) {
3705 Mutex::Locker l(meta_sync_thread_lock);
3706 meta_sync_processor_thread->stop();
3707
3708 Mutex::Locker dl(data_sync_thread_lock);
3709 for (auto iter : data_sync_processor_threads) {
3710 RGWDataSyncProcessorThread *thread = iter.second;
3711 thread->stop();
3712 }
3713 if (sync_log_trimmer) {
3714 sync_log_trimmer->stop();
3715 }
3716 }
3717 if (async_rados) {
3718 async_rados->stop();
3719 }
3720 if (run_sync_thread) {
3721 delete meta_sync_processor_thread;
3722 meta_sync_processor_thread = NULL;
3723 Mutex::Locker dl(data_sync_thread_lock);
3724 for (auto iter : data_sync_processor_threads) {
3725 RGWDataSyncProcessorThread *thread = iter.second;
3726 delete thread;
3727 }
3728 data_sync_processor_threads.clear();
3729 delete sync_log_trimmer;
3730 sync_log_trimmer = nullptr;
3731 bucket_trim = boost::none;
3732 }
3733 if (finisher) {
3734 finisher->stop();
3735 }
3736 if (need_watch_notify()) {
3737 finalize_watch();
3738 }
3739 if (finisher) {
3740 /* delete finisher only after cleaning up watches, as watch error path might call
3741 * into finisher. We stop finisher before finalizing watch to make sure we don't
3742 * actually handle any racing work
3743 */
3744 delete finisher;
3745 }
3746 if (meta_notifier) {
3747 meta_notifier->stop();
3748 delete meta_notifier;
3749 }
3750 if (data_notifier) {
3751 data_notifier->stop();
3752 delete data_notifier;
3753 }
3754 delete data_log;
3755 if (async_rados) {
3756 delete async_rados;
3757 }
3758
3759 delete lc;
3760 lc = NULL;
3761
3762 delete gc;
3763 gc = NULL;
3764
3765 delete obj_expirer;
3766 obj_expirer = NULL;
3767
3768 delete rest_master_conn;
3769
3770 map<string, RGWRESTConn *>::iterator iter;
3771 for (iter = zone_conn_map.begin(); iter != zone_conn_map.end(); ++iter) {
3772 RGWRESTConn *conn = iter->second;
3773 delete conn;
3774 }
3775
3776 for (iter = zonegroup_conn_map.begin(); iter != zonegroup_conn_map.end(); ++iter) {
3777 RGWRESTConn *conn = iter->second;
3778 delete conn;
3779 }
3780 RGWQuotaHandler::free_handler(quota_handler);
3781 if (cr_registry) {
3782 cr_registry->put();
3783 }
3784 delete meta_mgr;
3785 delete binfo_cache;
3786 delete obj_tombstone_cache;
3787 delete sync_modules_manager;
3788
3789 if (reshard_wait.get()) {
3790 reshard_wait->stop();
3791 reshard_wait.reset();
3792 }
3793
3794 if (run_reshard_thread) {
3795 reshard->stop_processor();
3796 }
3797 delete reshard;
3798 delete index_completion_manager;
3799 }
3800
3801 /**
3802 * Initialize the RADOS instance and prepare to do other ops
3803 * Returns 0 on success, -ERR# on failure.
3804 */
3805 int RGWRados::init_rados()
3806 {
3807 int ret = 0;
3808 auto admin_socket = cct->get_admin_socket();
3809 for (auto cmd : admin_commands) {
3810 int r = admin_socket->register_command(cmd[0], cmd[1], this,
3811 cmd[2]);
3812 if (r < 0) {
3813 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
3814 << ")" << dendl;
3815 return r;
3816 }
3817 }
3818
3819 auto handles = std::vector<librados::Rados>{cct->_conf->rgw_num_rados_handles};
3820
3821 for (auto& r : handles) {
3822 ret = r.init_with_context(cct);
3823 if (ret < 0) {
3824 return ret;
3825 }
3826 ret = r.connect();
3827 if (ret < 0) {
3828 return ret;
3829 }
3830 }
3831
3832 sync_modules_manager = new RGWSyncModulesManager();
3833
3834 rgw_register_sync_modules(sync_modules_manager);
3835
3836 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
3837 new RGWCoroutinesManagerRegistry(cct)};
3838 ret = crs->hook_to_admin_command("cr dump");
3839 if (ret < 0) {
3840 return ret;
3841 }
3842
3843 meta_mgr = new RGWMetadataManager(cct, this);
3844 data_log = new RGWDataChangesLog(cct, this);
3845 cr_registry = crs.release();
3846
3847 std::swap(handles, rados);
3848 return ret;
3849 }
3850
3851
3852 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
3853 {
3854 map<string,string> metadata = meta;
3855 metadata["num_handles"] = stringify(rados.size());
3856 metadata["zonegroup_id"] = zonegroup.get_id();
3857 metadata["zonegroup_name"] = zonegroup.get_name();
3858 metadata["zone_name"] = zone_name();
3859 metadata["zone_id"] = zone_id();;
3860 string name = cct->_conf->name.get_id();
3861 if (name.find("rgw.") == 0) {
3862 name = name.substr(4);
3863 }
3864 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
3865 if (ret < 0) {
3866 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
3867 return ret;
3868 }
3869
3870 return 0;
3871 }
3872
3873 /**
3874 * Add new connection to connections map
3875 * @param zonegroup_conn_map map which new connection will be added to
3876 * @param zonegroup zonegroup which new connection will connect to
3877 * @param new_connection pointer to new connection instance
3878 */
3879 static void add_new_connection_to_map(map<string, RGWRESTConn *> &zonegroup_conn_map,
3880 const RGWZoneGroup &zonegroup, RGWRESTConn *new_connection)
3881 {
3882 // Delete if connection is already exists
3883 map<string, RGWRESTConn *>::iterator iterZoneGroup = zonegroup_conn_map.find(zonegroup.get_id());
3884 if (iterZoneGroup != zonegroup_conn_map.end()) {
3885 delete iterZoneGroup->second;
3886 }
3887
3888 // Add new connection to connections map
3889 zonegroup_conn_map[zonegroup.get_id()] = new_connection;
3890 }
3891
3892 int RGWRados::convert_regionmap()
3893 {
3894 RGWZoneGroupMap zonegroupmap;
3895
3896 string pool_name = cct->_conf->rgw_zone_root_pool;
3897 if (pool_name.empty()) {
3898 pool_name = RGW_DEFAULT_ZONE_ROOT_POOL;
3899 }
3900 string oid = region_map_oid;
3901
3902 rgw_pool pool(pool_name);
3903 bufferlist bl;
3904 RGWObjectCtx obj_ctx(this);
3905 int ret = rgw_get_system_obj(this, obj_ctx, pool, oid, bl, NULL, NULL);
3906 if (ret < 0 && ret != -ENOENT) {
3907 return ret;
3908 } else if (ret == -ENOENT) {
3909 return 0;
3910 }
3911
3912 try {
3913 bufferlist::iterator iter = bl.begin();
3914 ::decode(zonegroupmap, iter);
3915 } catch (buffer::error& err) {
3916 ldout(cct, 0) << "error decoding regionmap from " << pool << ":" << oid << dendl;
3917 return -EIO;
3918 }
3919
3920 for (map<string, RGWZoneGroup>::iterator iter = zonegroupmap.zonegroups.begin();
3921 iter != zonegroupmap.zonegroups.end(); ++iter) {
3922 RGWZoneGroup& zonegroup = iter->second;
3923 ret = zonegroup.init(cct, this, false);
3924 ret = zonegroup.update();
3925 if (ret < 0 && ret != -ENOENT) {
3926 ldout(cct, 0) << "Error could not update zonegroup " << zonegroup.get_name() << ": " <<
3927 cpp_strerror(-ret) << dendl;
3928 return ret;
3929 } else if (ret == -ENOENT) {
3930 ret = zonegroup.create();
3931 if (ret < 0) {
3932 ldout(cct, 0) << "Error could not create " << zonegroup.get_name() << ": " <<
3933 cpp_strerror(-ret) << dendl;
3934 return ret;
3935 }
3936 }
3937 }
3938
3939 current_period.set_user_quota(zonegroupmap.user_quota);
3940 current_period.set_bucket_quota(zonegroupmap.bucket_quota);
3941
3942 // remove the region_map so we don't try to convert again
3943 rgw_raw_obj obj(pool, oid);
3944 ret = delete_system_obj(obj);
3945 if (ret < 0) {
3946 ldout(cct, 0) << "Error could not remove " << obj
3947 << " after upgrading to zonegroup map: " << cpp_strerror(ret) << dendl;
3948 return ret;
3949 }
3950
3951 return 0;
3952 }
3953
3954 /**
3955 * Replace all region configuration with zonegroup for
3956 * backward compatability
3957 * Returns 0 on success, -ERR# on failure.
3958 */
3959 int RGWRados::replace_region_with_zonegroup()
3960 {
3961 /* copy default region */
3962 /* convert default region to default zonegroup */
3963 string default_oid = cct->_conf->rgw_default_region_info_oid;
3964 if (default_oid.empty()) {
3965 default_oid = default_region_info_oid;
3966 }
3967
3968
3969 RGWZoneGroup default_zonegroup;
3970 rgw_pool pool{default_zonegroup.get_pool(cct)};
3971 string oid = "converted";
3972 bufferlist bl;
3973 RGWObjectCtx obj_ctx(this);
3974
3975 int ret = rgw_get_system_obj(this, obj_ctx, pool ,oid, bl, NULL, NULL);
3976 if (ret < 0 && ret != -ENOENT) {
3977 ldout(cct, 0) << __func__ << " failed to read converted: ret "<< ret << " " << cpp_strerror(-ret)
3978 << dendl;
3979 return ret;
3980 } else if (ret != -ENOENT) {
3981 ldout(cct, 20) << "System already converted " << dendl;
3982 return 0;
3983 }
3984
3985 string default_region;
3986 ret = default_zonegroup.init(cct, this, false, true);
3987 if (ret < 0) {
3988 ldout(cct, 0) << __func__ << " failed init default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3989 return ret;
3990 }
3991 ret = default_zonegroup.read_default_id(default_region, true);
3992 if (ret < 0 && ret != -ENOENT) {
3993 ldout(cct, 0) << __func__ << " failed reading old default region: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
3994 return ret;
3995 }
3996
3997 /* convert regions to zonegroups */
3998 list<string> regions;
3999 ret = list_regions(regions);
4000 if (ret < 0 && ret != -ENOENT) {
4001 ldout(cct, 0) << __func__ << " failed to list regions: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4002 return ret;
4003 } else if (ret == -ENOENT || regions.empty()) {
4004 RGWZoneParams zoneparams(default_zone_name);
4005 int ret = zoneparams.init(cct, this);
4006 if (ret < 0 && ret != -ENOENT) {
4007 ldout(cct, 0) << __func__ << ": error initializing default zone params: " << cpp_strerror(-ret) << dendl;
4008 return ret;
4009 }
4010 /* update master zone */
4011 RGWZoneGroup default_zg(default_zonegroup_name);
4012 ret = default_zg.init(cct, this);
4013 if (ret < 0 && ret != -ENOENT) {
4014 ldout(cct, 0) << __func__ << ": error in initializing default zonegroup: " << cpp_strerror(-ret) << dendl;
4015 return ret;
4016 }
4017 if (ret != -ENOENT && default_zg.master_zone.empty()) {
4018 default_zg.master_zone = zoneparams.get_id();
4019 return default_zg.update();
4020 }
4021 return 0;
4022 }
4023
4024 string master_region, master_zone;
4025 for (list<string>::iterator iter = regions.begin(); iter != regions.end(); ++iter) {
4026 if (*iter != default_zonegroup_name){
4027 RGWZoneGroup region(*iter);
4028 int ret = region.init(cct, this, true, true);
4029 if (ret < 0) {
4030 ldout(cct, 0) << __func__ << " failed init region "<< *iter << ": " << cpp_strerror(-ret) << dendl;
4031 return ret;
4032 }
4033 if (region.is_master_zonegroup()) {
4034 master_region = region.get_id();
4035 master_zone = region.master_zone;
4036 }
4037 }
4038 }
4039
4040 /* create realm if there is none.
4041 The realm name will be the region and zone concatenated
4042 realm id will be mds of its name */
4043 if (realm.get_id().empty() && !master_region.empty() && !master_zone.empty()) {
4044 string new_realm_name = master_region + "." + master_zone;
4045 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
4046 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
4047 MD5 hash;
4048 hash.Update((const byte *)new_realm_name.c_str(), new_realm_name.length());
4049 hash.Final(md5);
4050 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
4051 string new_realm_id(md5_str);
4052 RGWRealm new_realm(new_realm_id,new_realm_name);
4053 ret = new_realm.init(cct, this, false);
4054 if (ret < 0) {
4055 ldout(cct, 0) << __func__ << " Error initing new realm: " << cpp_strerror(-ret) << dendl;
4056 return ret;
4057 }
4058 ret = new_realm.create();
4059 if (ret < 0 && ret != -EEXIST) {
4060 ldout(cct, 0) << __func__ << " Error creating new realm: " << cpp_strerror(-ret) << dendl;
4061 return ret;
4062 }
4063 ret = new_realm.set_as_default();
4064 if (ret < 0) {
4065 ldout(cct, 0) << __func__ << " Error setting realm as default: " << cpp_strerror(-ret) << dendl;
4066 return ret;
4067 }
4068 ret = realm.init(cct, this);
4069 if (ret < 0) {
4070 ldout(cct, 0) << __func__ << " Error initing realm: " << cpp_strerror(-ret) << dendl;
4071 return ret;
4072 }
4073 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4074 if (ret < 0) {
4075 ldout(cct, 0) << __func__ << " Error initing current period: " << cpp_strerror(-ret) << dendl;
4076 return ret;
4077 }
4078 }
4079
4080 list<string>::iterator iter;
4081 /* create zonegroups */
4082 for (iter = regions.begin(); iter != regions.end(); ++iter)
4083 {
4084 ldout(cct, 0) << __func__ << "Converting " << *iter << dendl;
4085 /* check to see if we don't have already a zonegroup with this name */
4086 RGWZoneGroup new_zonegroup(*iter);
4087 ret = new_zonegroup.init(cct , this);
4088 if (ret == 0 && new_zonegroup.get_id() != *iter) {
4089 ldout(cct, 0) << __func__ << " zonegroup "<< *iter << " already exists id " << new_zonegroup.get_id () <<
4090 " skipping conversion " << dendl;
4091 continue;
4092 }
4093 RGWZoneGroup zonegroup(*iter);
4094 zonegroup.set_id(*iter);
4095 int ret = zonegroup.init(cct, this, true, true);
4096 if (ret < 0) {
4097 ldout(cct, 0) << __func__ << " failed init zonegroup: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4098 return ret;
4099 }
4100 zonegroup.realm_id = realm.get_id();
4101 /* fix default region master zone */
4102 if (*iter == default_zonegroup_name && zonegroup.master_zone.empty()) {
4103 ldout(cct, 0) << __func__ << " Setting default zone as master for default region" << dendl;
4104 zonegroup.master_zone = default_zone_name;
4105 }
4106 ret = zonegroup.update();
4107 if (ret < 0 && ret != -EEXIST) {
4108 ldout(cct, 0) << __func__ << " failed to update zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4109 << dendl;
4110 return ret;
4111 }
4112 ret = zonegroup.update_name();
4113 if (ret < 0 && ret != -EEXIST) {
4114 ldout(cct, 0) << __func__ << " failed to update_name for zonegroup " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4115 << dendl;
4116 return ret;
4117 }
4118 if (zonegroup.get_name() == default_region) {
4119 ret = zonegroup.set_as_default();
4120 if (ret < 0) {
4121 ldout(cct, 0) << __func__ << " failed to set_as_default " << *iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4122 << dendl;
4123 return ret;
4124 }
4125 }
4126 for (map<string, RGWZone>::const_iterator iter = zonegroup.zones.begin(); iter != zonegroup.zones.end();
4127 ++iter) {
4128 ldout(cct, 0) << __func__ << " Converting zone" << iter->first << dendl;
4129 RGWZoneParams zoneparams(iter->first, iter->first);
4130 zoneparams.set_id(iter->first);
4131 zoneparams.realm_id = realm.get_id();
4132 ret = zoneparams.init(cct, this);
4133 if (ret < 0 && ret != -ENOENT) {
4134 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4135 return ret;
4136 } else if (ret == -ENOENT) {
4137 ldout(cct, 0) << __func__ << " zone is part of another cluster " << iter->first << " skipping " << dendl;
4138 continue;
4139 }
4140 zonegroup.realm_id = realm.get_id();
4141 ret = zoneparams.update();
4142 if (ret < 0 && ret != -EEXIST) {
4143 ldout(cct, 0) << __func__ << " failed to update zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4144 return ret;
4145 }
4146 ret = zoneparams.update_name();
4147 if (ret < 0 && ret != -EEXIST) {
4148 ldout(cct, 0) << __func__ << " failed to init zoneparams " << iter->first << ": " << cpp_strerror(-ret) << dendl;
4149 return ret;
4150 }
4151 }
4152
4153 if (!current_period.get_id().empty()) {
4154 ret = current_period.add_zonegroup(zonegroup);
4155 if (ret < 0) {
4156 ldout(cct, 0) << __func__ << " failed to add zonegroup to current_period: " << cpp_strerror(-ret) << dendl;
4157 return ret;
4158 }
4159 }
4160 }
4161
4162 if (!current_period.get_id().empty()) {
4163 ret = current_period.update();
4164 if (ret < 0) {
4165 ldout(cct, 0) << __func__ << " failed to update new period: " << cpp_strerror(-ret) << dendl;
4166 return ret;
4167 }
4168 ret = current_period.store_info(false);
4169 if (ret < 0) {
4170 ldout(cct, 0) << __func__ << " failed to store new period: " << cpp_strerror(-ret) << dendl;
4171 return ret;
4172 }
4173 ret = current_period.reflect();
4174 if (ret < 0) {
4175 ldout(cct, 0) << __func__ << " failed to update local objects: " << cpp_strerror(-ret) << dendl;
4176 return ret;
4177 }
4178 }
4179
4180 for (auto const& iter : regions) {
4181 RGWZoneGroup zonegroup(iter);
4182 int ret = zonegroup.init(cct, this, true, true);
4183 if (ret < 0) {
4184 ldout(cct, 0) << __func__ << " failed init zonegroup" << iter << ": ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4185 return ret;
4186 }
4187 ret = zonegroup.delete_obj(true);
4188 if (ret < 0 && ret != -ENOENT) {
4189 ldout(cct, 0) << __func__ << " failed to delete region " << iter << ": ret "<< ret << " " << cpp_strerror(-ret)
4190 << dendl;
4191 return ret;
4192 }
4193 }
4194
4195 /* mark as converted */
4196 ret = rgw_put_system_obj(this, pool, oid, bl.c_str(), bl.length(),
4197 true, NULL, real_time(), NULL);
4198 if (ret < 0 ) {
4199 ldout(cct, 0) << __func__ << " failed to mark cluster as converted: ret "<< ret << " " << cpp_strerror(-ret)
4200 << dendl;
4201 return ret;
4202 }
4203
4204 return 0;
4205 }
4206
4207 int RGWRados::init_zg_from_period(bool *initialized)
4208 {
4209 *initialized = false;
4210
4211 if (current_period.get_id().empty()) {
4212 return 0;
4213 }
4214
4215 int ret = zonegroup.init(cct, this);
4216 ldout(cct, 20) << "period zonegroup init ret " << ret << dendl;
4217 if (ret == -ENOENT) {
4218 return 0;
4219 }
4220 if (ret < 0) {
4221 ldout(cct, 0) << "failed reading zonegroup info: " << cpp_strerror(-ret) << dendl;
4222 return ret;
4223 }
4224 ldout(cct, 20) << "period zonegroup name " << zonegroup.get_name() << dendl;
4225
4226 map<string, RGWZoneGroup>::const_iterator iter =
4227 current_period.get_map().zonegroups.find(zonegroup.get_id());
4228
4229 if (iter != current_period.get_map().zonegroups.end()) {
4230 ldout(cct, 20) << "using current period zonegroup " << zonegroup.get_name() << dendl;
4231 zonegroup = iter->second;
4232 ret = zonegroup.init(cct, this, false);
4233 if (ret < 0) {
4234 ldout(cct, 0) << "failed init zonegroup: " << " " << cpp_strerror(-ret) << dendl;
4235 return ret;
4236 }
4237 ret = zone_params.init(cct, this);
4238 if (ret < 0 && ret != -ENOENT) {
4239 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4240 return ret;
4241 } if (ret ==-ENOENT && zonegroup.get_name() == default_zonegroup_name) {
4242 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4243 zone_params.set_name(default_zone_name);
4244 ret = zone_params.init(cct, this);
4245 if (ret < 0 && ret != -ENOENT) {
4246 ldout(cct, 0) << "failed reading zone params info: " << " " << cpp_strerror(-ret) << dendl;
4247 return ret;
4248 }
4249 }
4250 }
4251 for (iter = current_period.get_map().zonegroups.begin();
4252 iter != current_period.get_map().zonegroups.end(); ++iter){
4253 const RGWZoneGroup& zg = iter->second;
4254 // use endpoints from the zonegroup's master zone
4255 auto master = zg.zones.find(zg.master_zone);
4256 if (master == zg.zones.end()) {
4257 // fix missing master zone for a single zone zonegroup
4258 if (zg.master_zone.empty() && zg.zones.size() == 1) {
4259 master = zg.zones.begin();
4260 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing master_zone, setting zone " <<
4261 master->second.name << " id:" << master->second.id << " as master" << dendl;
4262 if (zonegroup.get_id() == zg.get_id()) {
4263 zonegroup.master_zone = master->second.id;
4264 ret = zonegroup.update();
4265 if (ret < 0) {
4266 ldout(cct, 0) << "error updating zonegroup : " << cpp_strerror(-ret) << dendl;
4267 return ret;
4268 }
4269 } else {
4270 RGWZoneGroup fixed_zg(zg.get_id(),zg.get_name());
4271 ret = fixed_zg.init(cct, this);
4272 if (ret < 0) {
4273 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4274 return ret;
4275 }
4276 fixed_zg.master_zone = master->second.id;
4277 ret = fixed_zg.update();
4278 if (ret < 0) {
4279 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4280 return ret;
4281 }
4282 }
4283 } else {
4284 ldout(cct, 0) << "zonegroup " << zg.get_name() << " missing zone for master_zone=" <<
4285 zg.master_zone << dendl;
4286 return -EINVAL;
4287 }
4288 }
4289 const auto& endpoints = master->second.endpoints;
4290 add_new_connection_to_map(zonegroup_conn_map, zg, new RGWRESTConn(cct, this, zg.get_id(), endpoints));
4291 if (!current_period.get_master_zonegroup().empty() &&
4292 zg.get_id() == current_period.get_master_zonegroup()) {
4293 rest_master_conn = new RGWRESTConn(cct, this, zg.get_id(), endpoints);
4294 }
4295 }
4296
4297 *initialized = true;
4298
4299 return 0;
4300 }
4301
4302 int RGWRados::init_zg_from_local(bool *creating_defaults)
4303 {
4304 int ret = zonegroup.init(cct, this);
4305 if ( (ret < 0 && ret != -ENOENT) || (ret == -ENOENT && !cct->_conf->rgw_zonegroup.empty())) {
4306 ldout(cct, 0) << "failed reading zonegroup info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4307 return ret;
4308 } else if (ret == -ENOENT) {
4309 *creating_defaults = true;
4310 ldout(cct, 10) << "Creating default zonegroup " << dendl;
4311 ret = zonegroup.create_default();
4312 if (ret < 0) {
4313 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4314 << dendl;
4315 return ret;
4316 }
4317 ret = zonegroup.init(cct, this);
4318 if (ret < 0) {
4319 ldout(cct, 0) << "failure in zonegroup create_default: ret "<< ret << " " << cpp_strerror(-ret)
4320 << dendl;
4321 return ret;
4322 }
4323 }
4324 ldout(cct, 20) << "zonegroup " << zonegroup.get_name() << dendl;
4325 if (zonegroup.is_master_zonegroup()) {
4326 // use endpoints from the zonegroup's master zone
4327 auto master = zonegroup.zones.find(zonegroup.master_zone);
4328 if (master == zonegroup.zones.end()) {
4329 // fix missing master zone for a single zone zonegroup
4330 if (zonegroup.master_zone.empty() && zonegroup.zones.size() == 1) {
4331 master = zonegroup.zones.begin();
4332 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing master_zone, setting zone " <<
4333 master->second.name << " id:" << master->second.id << " as master" << dendl;
4334 zonegroup.master_zone = master->second.id;
4335 ret = zonegroup.update();
4336 if (ret < 0) {
4337 ldout(cct, 0) << "error initializing zonegroup : " << cpp_strerror(-ret) << dendl;
4338 return ret;
4339 }
4340 } else {
4341 ldout(cct, 0) << "zonegroup " << zonegroup.get_name() << " missing zone for "
4342 "master_zone=" << zonegroup.master_zone << dendl;
4343 return -EINVAL;
4344 }
4345 }
4346 const auto& endpoints = master->second.endpoints;
4347 rest_master_conn = new RGWRESTConn(cct, this, zonegroup.get_id(), endpoints);
4348 }
4349
4350 return 0;
4351 }
4352
4353
4354 bool RGWRados::zone_syncs_from(RGWZone& target_zone, RGWZone& source_zone)
4355 {
4356 return target_zone.syncs_from(source_zone.name) &&
4357 sync_modules_manager->supports_data_export(source_zone.tier_type);
4358 }
4359
4360 /**
4361 * Initialize the RADOS instance and prepare to do other ops
4362 * Returns 0 on success, -ERR# on failure.
4363 */
4364 int RGWRados::init_complete()
4365 {
4366 int ret = realm.init(cct, this);
4367 if (ret < 0 && ret != -ENOENT) {
4368 ldout(cct, 0) << "failed reading realm info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4369 return ret;
4370 } else if (ret != -ENOENT) {
4371 ldout(cct, 20) << "realm " << realm.get_name() << " " << realm.get_id() << dendl;
4372 ret = current_period.init(cct, this, realm.get_id(), realm.get_name());
4373 if (ret < 0 && ret != -ENOENT) {
4374 ldout(cct, 0) << "failed reading current period info: " << " " << cpp_strerror(-ret) << dendl;
4375 return ret;
4376 }
4377 ldout(cct, 20) << "current period " << current_period.get_id() << dendl;
4378 }
4379
4380 ret = replace_region_with_zonegroup();
4381 if (ret < 0) {
4382 lderr(cct) << "failed converting region to zonegroup : ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4383 return ret;
4384 }
4385
4386 ret = convert_regionmap();
4387 if (ret < 0) {
4388 lderr(cct) << "failed converting regionmap: " << cpp_strerror(-ret) << dendl;
4389 return ret;
4390 }
4391
4392 bool zg_initialized = false;
4393
4394 if (!current_period.get_id().empty()) {
4395 ret = init_zg_from_period(&zg_initialized);
4396 if (ret < 0) {
4397 return ret;
4398 }
4399 }
4400
4401 bool creating_defaults = false;
4402 bool using_local = (!zg_initialized);
4403 if (using_local) {
4404 ldout(cct, 10) << " cannot find current period zonegroup using local zonegroup" << dendl;
4405 ret = init_zg_from_local(&creating_defaults);
4406 if (ret < 0) {
4407 return ret;
4408 }
4409 // read period_config into current_period
4410 auto& period_config = current_period.get_config();
4411 ret = period_config.read(this, zonegroup.realm_id);
4412 if (ret < 0 && ret != -ENOENT) {
4413 ldout(cct, 0) << "ERROR: failed to read period config: "
4414 << cpp_strerror(ret) << dendl;
4415 return ret;
4416 }
4417 }
4418
4419 ldout(cct, 10) << "Cannot find current period zone using local zone" << dendl;
4420 if (creating_defaults && cct->_conf->rgw_zone.empty()) {
4421 ldout(cct, 10) << " Using default name "<< default_zone_name << dendl;
4422 zone_params.set_name(default_zone_name);
4423 }
4424
4425 ret = zone_params.init(cct, this);
4426 if (ret < 0 && ret != -ENOENT) {
4427 lderr(cct) << "failed reading zone info: ret "<< ret << " " << cpp_strerror(-ret) << dendl;
4428 return ret;
4429 }
4430 map<string, RGWZone>::iterator zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4431 if (zone_iter == get_zonegroup().zones.end()) {
4432 if (using_local) {
4433 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4434 return -EINVAL;
4435 }
4436 ldout(cct, 1) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << "), switching to local zonegroup configuration" << dendl;
4437 ret = init_zg_from_local(&creating_defaults);
4438 if (ret < 0) {
4439 return ret;
4440 }
4441 zone_iter = get_zonegroup().zones.find(zone_params.get_id());
4442 }
4443 if (zone_iter != get_zonegroup().zones.end()) {
4444 zone_public_config = zone_iter->second;
4445 ldout(cct, 20) << "zone " << zone_params.get_name() << dendl;
4446 } else {
4447 lderr(cct) << "Cannot find zone id=" << zone_params.get_id() << " (name=" << zone_params.get_name() << ")" << dendl;
4448 return -EINVAL;
4449 }
4450
4451 zone_short_id = current_period.get_map().get_zone_short_id(zone_params.get_id());
4452
4453 if (run_sync_thread) {
4454 ret = sync_modules_manager->create_instance(cct, zone_public_config.tier_type, zone_params.tier_config, &sync_module);
4455 if (ret < 0) {
4456 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
4457 return ret;
4458 }
4459 }
4460
4461 writeable_zone = (zone_public_config.tier_type.empty() || zone_public_config.tier_type == "rgw");
4462
4463 init_unique_trans_id_deps();
4464
4465 finisher = new Finisher(cct);
4466 finisher->start();
4467
4468 period_puller.reset(new RGWPeriodPuller(this));
4469 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
4470 current_period));
4471
4472 if (need_watch_notify()) {
4473 ret = init_watch();
4474 if (ret < 0) {
4475 lderr(cct) << "ERROR: failed to initialize watch: " << cpp_strerror(-ret) << dendl;
4476 return ret;
4477 }
4478 }
4479
4480 /* first build all zones index */
4481 for (auto ziter : get_zonegroup().zones) {
4482 const string& id = ziter.first;
4483 RGWZone& z = ziter.second;
4484 zone_id_by_name[z.name] = id;
4485 zone_by_id[id] = z;
4486 }
4487
4488 if (zone_by_id.find(zone_id()) == zone_by_id.end()) {
4489 ldout(cct, 0) << "WARNING: could not find zone config in zonegroup for local zone (" << zone_id() << "), will use defaults" << dendl;
4490 }
4491 zone_public_config = zone_by_id[zone_id()];
4492 for (auto ziter : get_zonegroup().zones) {
4493 const string& id = ziter.first;
4494 RGWZone& z = ziter.second;
4495 if (id == zone_id()) {
4496 continue;
4497 }
4498 if (z.endpoints.empty()) {
4499 ldout(cct, 0) << "WARNING: can't generate connection for zone " << z.id << " id " << z.name << ": no endpoints defined" << dendl;
4500 continue;
4501 }
4502 ldout(cct, 20) << "generating connection object for zone " << z.name << " id " << z.id << dendl;
4503 RGWRESTConn *conn = new RGWRESTConn(cct, this, z.id, z.endpoints);
4504 zone_conn_map[id] = conn;
4505 if (zone_syncs_from(zone_public_config, z) ||
4506 zone_syncs_from(z, zone_public_config)) {
4507 if (zone_syncs_from(zone_public_config, z)) {
4508 zone_data_sync_from_map[id] = conn;
4509 }
4510 if (zone_syncs_from(z, zone_public_config)) {
4511 zone_data_notify_to_map[id] = conn;
4512 }
4513 } else {
4514 ldout(cct, 20) << "NOTICE: not syncing to/from zone " << z.name << " id " << z.id << dendl;
4515 }
4516 }
4517
4518 ret = open_root_pool_ctx();
4519 if (ret < 0)
4520 return ret;
4521
4522 ret = open_gc_pool_ctx();
4523 if (ret < 0)
4524 return ret;
4525
4526 ret = open_lc_pool_ctx();
4527 if (ret < 0)
4528 return ret;
4529
4530 ret = open_objexp_pool_ctx();
4531 if (ret < 0)
4532 return ret;
4533
4534 ret = open_reshard_pool_ctx();
4535 if (ret < 0)
4536 return ret;
4537
4538 pools_initialized = true;
4539
4540 gc = new RGWGC();
4541 gc->initialize(cct, this);
4542
4543 obj_expirer = new RGWObjectExpirer(this);
4544
4545 if (use_gc_thread) {
4546 gc->start_processor();
4547 obj_expirer->start_processor();
4548 }
4549
4550 /* no point of running sync thread if we don't have a master zone configured
4551 or there is no rest_master_conn */
4552 if (get_zonegroup().master_zone.empty() || !rest_master_conn
4553 || current_period.get_id().empty()) {
4554 run_sync_thread = false;
4555 }
4556
4557 if (run_sync_thread) {
4558 // initialize the log period history
4559 meta_mgr->init_oldest_log_period();
4560 }
4561
4562 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
4563 async_rados->start();
4564
4565 ret = meta_mgr->init(current_period.get_id());
4566 if (ret < 0) {
4567 lderr(cct) << "ERROR: failed to initialize metadata log: "
4568 << cpp_strerror(-ret) << dendl;
4569 return ret;
4570 }
4571
4572 if (is_meta_master()) {
4573 auto md_log = meta_mgr->get_log(current_period.get_id());
4574 meta_notifier = new RGWMetaNotifier(this, md_log);
4575 meta_notifier->start();
4576 }
4577
4578 if (run_sync_thread) {
4579 Mutex::Locker l(meta_sync_thread_lock);
4580 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
4581 ret = meta_sync_processor_thread->init();
4582 if (ret < 0) {
4583 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
4584 return ret;
4585 }
4586 meta_sync_processor_thread->start();
4587
4588 // configure the bucket trim manager
4589 rgw::BucketTrimConfig config;
4590 rgw::configure_bucket_trim(cct, config);
4591
4592 bucket_trim.emplace(this, config);
4593 ret = bucket_trim->init();
4594 if (ret < 0) {
4595 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
4596 return ret;
4597 }
4598
4599 Mutex::Locker dl(data_sync_thread_lock);
4600 for (auto iter : zone_data_sync_from_map) {
4601 ldout(cct, 5) << "starting data sync thread for zone " << iter.first << dendl;
4602 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, iter.first,
4603 &*bucket_trim);
4604 ret = thread->init();
4605 if (ret < 0) {
4606 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
4607 return ret;
4608 }
4609 thread->start();
4610 data_sync_processor_threads[iter.first] = thread;
4611 }
4612 auto interval = cct->_conf->rgw_sync_log_trim_interval;
4613 if (interval > 0) {
4614 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
4615 ret = sync_log_trimmer->init();
4616 if (ret < 0) {
4617 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
4618 return ret;
4619 }
4620 sync_log_trimmer->start();
4621 }
4622 }
4623 data_notifier = new RGWDataNotifier(this);
4624 data_notifier->start();
4625
4626 lc = new RGWLC();
4627 lc->initialize(cct, this);
4628
4629 if (use_lc_thread)
4630 lc->start_processor();
4631
4632 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
4633
4634 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
4635 get_zone().bucket_index_max_shards);
4636 if (bucket_index_max_shards > get_max_bucket_shards()) {
4637 bucket_index_max_shards = get_max_bucket_shards();
4638 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
4639 << get_max_bucket_shards() << dendl;
4640 }
4641 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
4642
4643 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
4644 binfo_cache->init(this);
4645
4646 bool need_tombstone_cache = !zone_data_notify_to_map.empty(); /* have zones syncing from us */
4647
4648 if (need_tombstone_cache) {
4649 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
4650 }
4651
4652 reshard_wait = std::make_shared<RGWReshardWait>(this);
4653
4654 reshard = new RGWReshard(this);
4655
4656 /* only the master zone in the zonegroup reshards buckets */
4657 run_reshard_thread = run_reshard_thread && (get_zonegroup().master_zone == zone_public_config.id);
4658 if (run_reshard_thread) {
4659 reshard->start_processor();
4660 }
4661
4662 index_completion_manager = new RGWIndexCompletionManager(this);
4663 ret = index_completion_manager->start();
4664
4665 return ret;
4666 }
4667
4668 /**
4669 * Initialize the RADOS instance and prepare to do other ops
4670 * Returns 0 on success, -ERR# on failure.
4671 */
4672 int RGWRados::initialize()
4673 {
4674 int ret;
4675
4676 ret = init_rados();
4677 if (ret < 0)
4678 return ret;
4679
4680 return init_complete();
4681 }
4682
4683 void RGWRados::finalize_watch()
4684 {
4685 for (int i = 0; i < num_watchers; i++) {
4686 RGWWatcher *watcher = watchers[i];
4687 watcher->unregister_watch();
4688 delete watcher;
4689 }
4690
4691 delete[] notify_oids;
4692 delete[] watchers;
4693 }
4694
4695 void RGWRados::schedule_context(Context *c) {
4696 finisher->queue(c);
4697 }
4698
4699 int RGWRados::list_raw_prefixed_objs(const rgw_pool& pool, const string& prefix, list<string>& result)
4700 {
4701 bool is_truncated;
4702 RGWListRawObjsCtx ctx;
4703 do {
4704 list<string> oids;
4705 int r = list_raw_objects(pool, prefix, 1000,
4706 ctx, oids, &is_truncated);
4707 if (r < 0) {
4708 return r;
4709 }
4710 list<string>::iterator iter;
4711 for (iter = oids.begin(); iter != oids.end(); ++iter) {
4712 string& val = *iter;
4713 if (val.size() > prefix.size())
4714 result.push_back(val.substr(prefix.size()));
4715 }
4716 } while (is_truncated);
4717
4718 return 0;
4719 }
4720
4721 int RGWRados::list_regions(list<string>& regions)
4722 {
4723 RGWZoneGroup zonegroup;
4724
4725 return list_raw_prefixed_objs(zonegroup.get_pool(cct), region_info_oid_prefix, regions);
4726 }
4727
4728 int RGWRados::list_zonegroups(list<string>& zonegroups)
4729 {
4730 RGWZoneGroup zonegroup;
4731
4732 return list_raw_prefixed_objs(zonegroup.get_pool(cct), zonegroup_names_oid_prefix, zonegroups);
4733 }
4734
4735 int RGWRados::list_zones(list<string>& zones)
4736 {
4737 RGWZoneParams zoneparams;
4738
4739 return list_raw_prefixed_objs(zoneparams.get_pool(cct), zone_names_oid_prefix, zones);
4740 }
4741
4742 int RGWRados::list_realms(list<string>& realms)
4743 {
4744 RGWRealm realm(cct, this);
4745 return list_raw_prefixed_objs(realm.get_pool(cct), realm_names_oid_prefix, realms);
4746 }
4747
4748 int RGWRados::list_periods(list<string>& periods)
4749 {
4750 RGWPeriod period;
4751 list<string> raw_periods;
4752 int ret = list_raw_prefixed_objs(period.get_pool(cct), period.get_info_oid_prefix(), raw_periods);
4753 if (ret < 0) {
4754 return ret;
4755 }
4756 for (const auto& oid : raw_periods) {
4757 size_t pos = oid.find(".");
4758 if (pos != std::string::npos) {
4759 periods.push_back(oid.substr(0, pos));
4760 } else {
4761 periods.push_back(oid);
4762 }
4763 }
4764 periods.sort(); // unique() only detects duplicates if they're adjacent
4765 periods.unique();
4766 return 0;
4767 }
4768
4769
4770 int RGWRados::list_periods(const string& current_period, list<string>& periods)
4771 {
4772 int ret = 0;
4773 string period_id = current_period;
4774 while(!period_id.empty()) {
4775 RGWPeriod period(period_id);
4776 ret = period.init(cct, this);
4777 if (ret < 0) {
4778 return ret;
4779 }
4780 periods.push_back(period.get_id());
4781 period_id = period.get_predecessor();
4782 }
4783
4784 return ret;
4785 }
4786
4787 /**
4788 * Open the pool used as root for this gateway
4789 * Returns: 0 on success, -ERR# otherwise.
4790 */
4791 int RGWRados::open_root_pool_ctx()
4792 {
4793 return rgw_init_ioctx(get_rados_handle(), get_zone_params().domain_root, root_pool_ctx, true);
4794 }
4795
4796 int RGWRados::open_gc_pool_ctx()
4797 {
4798 return rgw_init_ioctx(get_rados_handle(), get_zone_params().gc_pool, gc_pool_ctx, true);
4799 }
4800
4801 int RGWRados::open_lc_pool_ctx()
4802 {
4803 return rgw_init_ioctx(get_rados_handle(), get_zone_params().lc_pool, lc_pool_ctx, true);
4804 }
4805
4806 int RGWRados::open_objexp_pool_ctx()
4807 {
4808 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, objexp_pool_ctx, true);
4809 }
4810
4811 int RGWRados::open_reshard_pool_ctx()
4812 {
4813 return rgw_init_ioctx(get_rados_handle(), get_zone_params().reshard_pool, reshard_pool_ctx, true);
4814 }
4815
4816 int RGWRados::init_watch()
4817 {
4818 int r = rgw_init_ioctx(&rados[0], get_zone_params().control_pool, control_pool_ctx, true);
4819 if (r < 0) {
4820 return r;
4821 }
4822
4823 num_watchers = cct->_conf->rgw_num_control_oids;
4824
4825 bool compat_oid = (num_watchers == 0);
4826
4827 if (num_watchers <= 0)
4828 num_watchers = 1;
4829
4830 notify_oids = new string[num_watchers];
4831 watchers = new RGWWatcher *[num_watchers];
4832
4833 for (int i=0; i < num_watchers; i++) {
4834 string& notify_oid = notify_oids[i];
4835 notify_oid = notify_oid_prefix;
4836 if (!compat_oid) {
4837 char buf[16];
4838 snprintf(buf, sizeof(buf), ".%d", i);
4839 notify_oid.append(buf);
4840 }
4841 r = control_pool_ctx.create(notify_oid, false);
4842 if (r < 0 && r != -EEXIST)
4843 return r;
4844
4845 RGWWatcher *watcher = new RGWWatcher(this, i, notify_oid);
4846 watchers[i] = watcher;
4847
4848 r = watcher->register_watch();
4849 if (r < 0)
4850 return r;
4851 }
4852
4853 watch_initialized = true;
4854
4855 set_cache_enabled(true);
4856
4857 return 0;
4858 }
4859
4860 void RGWRados::pick_control_oid(const string& key, string& notify_oid)
4861 {
4862 uint32_t r = ceph_str_hash_linux(key.c_str(), key.size());
4863
4864 int i = r % num_watchers;
4865 char buf[16];
4866 snprintf(buf, sizeof(buf), ".%d", i);
4867
4868 notify_oid = notify_oid_prefix;
4869 notify_oid.append(buf);
4870 }
4871
4872 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
4873 {
4874 constexpr bool create = true; // create the pool if it doesn't exist
4875 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
4876 }
4877
4878 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
4879 string *marker) {
4880 if (marker) {
4881 *marker = shard_id_str;
4882 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
4883 marker->append(shard_marker);
4884 }
4885 }
4886
4887 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
4888 {
4889 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
4890
4891 if (!explicit_pool.empty()) {
4892 return open_pool_ctx(explicit_pool, index_ctx);
4893 }
4894
4895 const string *rule = &bucket_info.placement_rule;
4896 if (rule->empty()) {
4897 rule = &zonegroup.default_placement;
4898 }
4899 auto iter = zone_params.placement_pools.find(*rule);
4900 if (iter == zone_params.placement_pools.end()) {
4901 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
4902 return -EINVAL;
4903 }
4904
4905 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
4906 if (r < 0)
4907 return r;
4908
4909 return 0;
4910 }
4911
4912 /**
4913 * set up a bucket listing.
4914 * handle is filled in.
4915 * Returns 0 on success, -ERR# otherwise.
4916 */
4917 int RGWRados::list_buckets_init(RGWAccessHandle *handle)
4918 {
4919 librados::NObjectIterator *state = new librados::NObjectIterator(root_pool_ctx.nobjects_begin());
4920 *handle = (RGWAccessHandle)state;
4921 return 0;
4922 }
4923
4924 /**
4925 * get the next bucket in the listing.
4926 * obj is filled in,
4927 * handle is updated.
4928 * returns 0 on success, -ERR# otherwise.
4929 */
4930 int RGWRados::list_buckets_next(rgw_bucket_dir_entry& obj, RGWAccessHandle *handle)
4931 {
4932 librados::NObjectIterator *state = (librados::NObjectIterator *)*handle;
4933
4934 do {
4935 if (*state == root_pool_ctx.nobjects_end()) {
4936 delete state;
4937 return -ENOENT;
4938 }
4939
4940 obj.key.name = (*state)->get_oid();
4941 if (obj.key.name[0] == '_') {
4942 obj.key.name = obj.key.name.substr(1);
4943 }
4944
4945 (*state)++;
4946 } while (obj.key.name[0] == '.'); /* skip all entries starting with '.' */
4947
4948 return 0;
4949 }
4950
4951
4952 /**** logs ****/
4953
4954 struct log_list_state {
4955 string prefix;
4956 librados::IoCtx io_ctx;
4957 librados::NObjectIterator obit;
4958 };
4959
4960 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
4961 {
4962 log_list_state *state = new log_list_state;
4963 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
4964 if (r < 0) {
4965 delete state;
4966 return r;
4967 }
4968 state->prefix = prefix;
4969 state->obit = state->io_ctx.nobjects_begin();
4970 *handle = (RGWAccessHandle)state;
4971 return 0;
4972 }
4973
4974 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
4975 {
4976 log_list_state *state = static_cast<log_list_state *>(handle);
4977 while (true) {
4978 if (state->obit == state->io_ctx.nobjects_end()) {
4979 delete state;
4980 return -ENOENT;
4981 }
4982 if (state->prefix.length() &&
4983 state->obit->get_oid().find(state->prefix) != 0) {
4984 state->obit++;
4985 continue;
4986 }
4987 *name = state->obit->get_oid();
4988 state->obit++;
4989 break;
4990 }
4991 return 0;
4992 }
4993
4994 int RGWRados::log_remove(const string& name)
4995 {
4996 librados::IoCtx io_ctx;
4997 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
4998 if (r < 0)
4999 return r;
5000 return io_ctx.remove(name);
5001 }
5002
5003 struct log_show_state {
5004 librados::IoCtx io_ctx;
5005 bufferlist bl;
5006 bufferlist::iterator p;
5007 string name;
5008 uint64_t pos;
5009 bool eof;
5010 log_show_state() : pos(0), eof(false) {}
5011 };
5012
5013 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
5014 {
5015 log_show_state *state = new log_show_state;
5016 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, state->io_ctx);
5017 if (r < 0) {
5018 delete state;
5019 return r;
5020 }
5021 state->name = name;
5022 *handle = (RGWAccessHandle)state;
5023 return 0;
5024 }
5025
5026 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
5027 {
5028 log_show_state *state = static_cast<log_show_state *>(handle);
5029 off_t off = state->p.get_off();
5030
5031 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
5032 << " off " << off
5033 << " eof " << (int)state->eof
5034 << dendl;
5035 // read some?
5036 unsigned chunk = 1024*1024;
5037 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
5038 bufferlist more;
5039 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
5040 if (r < 0)
5041 return r;
5042 state->pos += r;
5043 bufferlist old;
5044 try {
5045 old.substr_of(state->bl, off, state->bl.length() - off);
5046 } catch (buffer::error& err) {
5047 return -EINVAL;
5048 }
5049 state->bl.clear();
5050 state->bl.claim(old);
5051 state->bl.claim_append(more);
5052 state->p = state->bl.begin();
5053 if ((unsigned)r < chunk)
5054 state->eof = true;
5055 ldout(cct, 10) << " read " << r << dendl;
5056 }
5057
5058 if (state->p.end())
5059 return 0; // end of file
5060 try {
5061 ::decode(*entry, state->p);
5062 }
5063 catch (const buffer::error &e) {
5064 return -EINVAL;
5065 }
5066 return 1;
5067 }
5068
5069 /**
5070 * usage_log_hash: get usage log key hash, based on name and index
5071 *
5072 * Get the usage object name. Since a user may have more than 1
5073 * object holding that info (multiple shards), we use index to
5074 * specify that shard number. Once index exceeds max shards it
5075 * wraps.
5076 * If name is not being set, results for all users will be returned
5077 * and index will wrap only after total shards number.
5078 *
5079 * @param cct [in] ceph context
5080 * @param name [in] user name
5081 * @param hash [out] hash value
5082 * @param index [in] shard index number
5083 */
5084 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
5085 {
5086 uint32_t val = index;
5087
5088 if (!name.empty()) {
5089 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
5090 val %= max_user_shards;
5091 val += ceph_str_hash_linux(name.c_str(), name.size());
5092 }
5093 char buf[17];
5094 int max_shards = cct->_conf->rgw_usage_max_shards;
5095 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
5096 hash = buf;
5097 }
5098
5099 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
5100 {
5101 uint32_t index = 0;
5102
5103 map<string, rgw_usage_log_info> log_objs;
5104
5105 string hash;
5106 string last_user;
5107
5108 /* restructure usage map, zone by object hash */
5109 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
5110 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
5111 const rgw_user_bucket& ub = iter->first;
5112 RGWUsageBatch& info = iter->second;
5113
5114 if (ub.user.empty()) {
5115 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
5116 continue;
5117 }
5118
5119 if (ub.user != last_user) {
5120 /* index *should* be random, but why waste extra cycles
5121 in most cases max user shards is not going to exceed 1,
5122 so just incrementing it */
5123 usage_log_hash(cct, ub.user, hash, index++);
5124 }
5125 last_user = ub.user;
5126 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
5127
5128 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
5129 v.push_back(miter->second);
5130 }
5131 }
5132
5133 map<string, rgw_usage_log_info>::iterator liter;
5134
5135 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
5136 int r = cls_obj_usage_log_add(liter->first, liter->second);
5137 if (r < 0)
5138 return r;
5139 }
5140 return 0;
5141 }
5142
5143 int RGWRados::read_usage(const rgw_user& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
5144 bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage)
5145 {
5146 uint32_t num = max_entries;
5147 string hash, first_hash;
5148 string user_str = user.to_str();
5149 usage_log_hash(cct, user_str, first_hash, 0);
5150
5151 if (usage_iter.index) {
5152 usage_log_hash(cct, user_str, hash, usage_iter.index);
5153 } else {
5154 hash = first_hash;
5155 }
5156
5157 usage.clear();
5158
5159 do {
5160 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
5161 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
5162
5163 int ret = cls_obj_usage_log_read(hash, user_str, start_epoch, end_epoch, num,
5164 usage_iter.read_iter, ret_usage, is_truncated);
5165 if (ret == -ENOENT)
5166 goto next;
5167
5168 if (ret < 0)
5169 return ret;
5170
5171 num -= ret_usage.size();
5172
5173 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
5174 usage[iter->first].aggregate(iter->second);
5175 }
5176
5177 next:
5178 if (!*is_truncated) {
5179 usage_iter.read_iter.clear();
5180 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
5181 }
5182 } while (num && !*is_truncated && hash != first_hash);
5183 return 0;
5184 }
5185
5186 int RGWRados::trim_usage(rgw_user& user, uint64_t start_epoch, uint64_t end_epoch)
5187 {
5188 uint32_t index = 0;
5189 string hash, first_hash;
5190 string user_str = user.to_str();
5191 usage_log_hash(cct, user_str, first_hash, index);
5192
5193 hash = first_hash;
5194 do {
5195 int ret = cls_obj_usage_log_trim(hash, user_str, start_epoch, end_epoch);
5196
5197 if (ret < 0 && ret != -ENOENT)
5198 return ret;
5199
5200 usage_log_hash(cct, user_str, hash, ++index);
5201 } while (hash != first_hash);
5202
5203 return 0;
5204 }
5205
5206 int RGWRados::key_to_shard_id(const string& key, int max_shards)
5207 {
5208 return rgw_shards_hash(key, max_shards);
5209 }
5210
5211 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
5212 {
5213 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5214 char buf[16];
5215 if (shard_id) {
5216 *shard_id = val % max_shards;
5217 }
5218 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5219 name = prefix + buf;
5220 }
5221
5222 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
5223 {
5224 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
5225 val ^= ceph_str_hash_linux(section.c_str(), section.size());
5226 char buf[16];
5227 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
5228 name = prefix + buf;
5229 }
5230
5231 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
5232 {
5233 char buf[16];
5234 snprintf(buf, sizeof(buf), "%u", shard_id);
5235 name = prefix + buf;
5236
5237 }
5238
5239 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5240 {
5241 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
5242 }
5243
5244 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
5245 {
5246 return rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx, true);
5247
5248 }
5249
5250 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
5251 {
5252 librados::IoCtx io_ctx;
5253
5254 int r = time_log_add_init(io_ctx);
5255 if (r < 0) {
5256 return r;
5257 }
5258
5259 ObjectWriteOperation op;
5260 utime_t t(ut);
5261 cls_log_add(op, t, section, key, bl);
5262
5263 return io_ctx.operate(oid, &op);
5264 }
5265
5266 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
5267 librados::AioCompletion *completion, bool monotonic_inc)
5268 {
5269 librados::IoCtx io_ctx;
5270
5271 int r = time_log_add_init(io_ctx);
5272 if (r < 0) {
5273 return r;
5274 }
5275
5276 ObjectWriteOperation op;
5277 cls_log_add(op, entries, monotonic_inc);
5278
5279 if (!completion) {
5280 r = io_ctx.operate(oid, &op);
5281 } else {
5282 r = io_ctx.aio_operate(oid, completion, &op);
5283 }
5284 return r;
5285 }
5286
5287 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
5288 int max_entries, list<cls_log_entry>& entries,
5289 const string& marker,
5290 string *out_marker,
5291 bool *truncated)
5292 {
5293 librados::IoCtx io_ctx;
5294
5295 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5296 if (r < 0)
5297 return r;
5298 librados::ObjectReadOperation op;
5299
5300 utime_t st(start_time);
5301 utime_t et(end_time);
5302
5303 cls_log_list(op, st, et, marker, max_entries, entries,
5304 out_marker, truncated);
5305
5306 bufferlist obl;
5307
5308 int ret = io_ctx.operate(oid, &op, &obl);
5309 if (ret < 0)
5310 return ret;
5311
5312 return 0;
5313 }
5314
5315 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
5316 {
5317 librados::IoCtx io_ctx;
5318
5319 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5320 if (r < 0)
5321 return r;
5322 librados::ObjectReadOperation op;
5323
5324 cls_log_info(op, header);
5325
5326 bufferlist obl;
5327
5328 int ret = io_ctx.operate(oid, &op, &obl);
5329 if (ret < 0)
5330 return ret;
5331
5332 return 0;
5333 }
5334
5335 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
5336 {
5337 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5338 if (r < 0)
5339 return r;
5340
5341 librados::ObjectReadOperation op;
5342
5343 cls_log_info(op, header);
5344
5345 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
5346 if (ret < 0)
5347 return ret;
5348
5349 return 0;
5350 }
5351
5352 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
5353 const string& from_marker, const string& to_marker,
5354 librados::AioCompletion *completion)
5355 {
5356 librados::IoCtx io_ctx;
5357
5358 int r = rgw_init_ioctx(get_rados_handle(), get_zone_params().log_pool, io_ctx);
5359 if (r < 0)
5360 return r;
5361
5362 utime_t st(start_time);
5363 utime_t et(end_time);
5364
5365 ObjectWriteOperation op;
5366 cls_log_trim(op, st, et, from_marker, to_marker);
5367
5368 if (!completion) {
5369 r = io_ctx.operate(oid, &op);
5370 } else {
5371 r = io_ctx.aio_operate(oid, completion, &op);
5372 }
5373 return r;
5374 }
5375
5376 string RGWRados::objexp_hint_get_shardname(int shard_num)
5377 {
5378 char buf[32];
5379 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
5380
5381 string objname("obj_delete_at_hint.");
5382 return objname + buf;
5383 }
5384
5385 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
5386 {
5387 string obj_key = key.name + key.instance;
5388 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
5389 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
5390 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
5391 sid = rgw_shards_mod(sid2, num_shards);
5392 return sid;
5393 }
5394
5395 static string objexp_hint_get_keyext(const string& tenant_name,
5396 const string& bucket_name,
5397 const string& bucket_id,
5398 const rgw_obj_key& obj_key)
5399 {
5400 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
5401 ":" + obj_key.name + ":" + obj_key.instance;
5402 }
5403
5404 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
5405 const string& tenant_name,
5406 const string& bucket_name,
5407 const string& bucket_id,
5408 const rgw_obj_index_key& obj_key)
5409 {
5410 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
5411 bucket_id, obj_key);
5412 objexp_hint_entry he = {
5413 .tenant = tenant_name,
5414 .bucket_name = bucket_name,
5415 .bucket_id = bucket_id,
5416 .obj_key = obj_key,
5417 .exp_time = delete_at };
5418 bufferlist hebl;
5419 ::encode(he, hebl);
5420 ObjectWriteOperation op;
5421 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
5422
5423 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
5424 return objexp_pool_ctx.operate(shard_name, &op);
5425 }
5426
5427 void RGWRados::objexp_get_shard(int shard_num,
5428 string& shard) /* out */
5429 {
5430 shard = objexp_hint_get_shardname(shard_num);
5431 }
5432
5433 int RGWRados::objexp_hint_list(const string& oid,
5434 const ceph::real_time& start_time,
5435 const ceph::real_time& end_time,
5436 const int max_entries,
5437 const string& marker,
5438 list<cls_timeindex_entry>& entries, /* out */
5439 string *out_marker, /* out */
5440 bool *truncated) /* out */
5441 {
5442 librados::ObjectReadOperation op;
5443 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
5444 out_marker, truncated);
5445
5446 bufferlist obl;
5447 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
5448
5449 if ((ret < 0 ) && (ret != -ENOENT)) {
5450 return ret;
5451 }
5452
5453 if ((ret == -ENOENT) && truncated) {
5454 *truncated = false;
5455 }
5456
5457 return 0;
5458 }
5459
5460 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
5461 objexp_hint_entry& hint_entry) /* out */
5462 {
5463 try {
5464 bufferlist::iterator iter = ti_entry.value.begin();
5465 ::decode(hint_entry, iter);
5466 } catch (buffer::error& err) {
5467 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
5468 }
5469
5470 return 0;
5471 }
5472
5473 int RGWRados::objexp_hint_trim(const string& oid,
5474 const ceph::real_time& start_time,
5475 const ceph::real_time& end_time,
5476 const string& from_marker,
5477 const string& to_marker)
5478 {
5479 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
5480 from_marker, to_marker);
5481 if ((ret < 0 ) && (ret != -ENOENT)) {
5482 return ret;
5483 }
5484
5485 return 0;
5486 }
5487
5488 int RGWRados::lock_exclusive(rgw_pool& pool, const string& oid, timespan& duration,
5489 string& zone_id, string& owner_id) {
5490 librados::IoCtx io_ctx;
5491
5492 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5493 if (r < 0) {
5494 return r;
5495 }
5496 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
5497 utime_t ut(msec / 1000, msec % 1000);
5498
5499 rados::cls::lock::Lock l(log_lock_name);
5500 l.set_duration(ut);
5501 l.set_cookie(owner_id);
5502 l.set_tag(zone_id);
5503 l.set_renew(true);
5504
5505 return l.lock_exclusive(&io_ctx, oid);
5506 }
5507
5508 int RGWRados::unlock(rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
5509 librados::IoCtx io_ctx;
5510
5511 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
5512 if (r < 0) {
5513 return r;
5514 }
5515
5516 rados::cls::lock::Lock l(log_lock_name);
5517 l.set_tag(zone_id);
5518 l.set_cookie(owner_id);
5519
5520 return l.unlock(&io_ctx, oid);
5521 }
5522
5523 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
5524 {
5525 bufferlist::iterator i = bl.begin();
5526 RGWAccessControlPolicy policy(cct);
5527 try {
5528 policy.decode_owner(i);
5529 } catch (buffer::error& err) {
5530 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5531 return -EIO;
5532 }
5533 *owner = policy.get_owner();
5534 return 0;
5535 }
5536
5537 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
5538 {
5539 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
5540 if (aiter == attrset.end())
5541 return -EIO;
5542
5543 bufferlist& bl = aiter->second;
5544 bufferlist::iterator iter = bl.begin();
5545 try {
5546 policy->decode(iter);
5547 } catch (buffer::error& err) {
5548 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
5549 return -EIO;
5550 }
5551 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 15)) {
5552 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
5553 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
5554 s3policy->to_xml(*_dout);
5555 *_dout << dendl;
5556 }
5557 return 0;
5558 }
5559
5560
5561 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
5562 {
5563 rgw_bucket bucket = bucket_info.bucket;
5564 bucket.update_bucket_id(new_bucket_id);
5565
5566 RGWObjectCtx obj_ctx(store);
5567
5568 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
5569 if (ret < 0) {
5570 return ret;
5571 }
5572
5573 return 0;
5574 }
5575
/**
 * get listing of the objects in a bucket.
 *
 * Arguments:
 * max: maximum number of results (objects plus common prefixes) to return
 * result: the objects are put in here (cleared first).
 * common_prefixes: if params.delim is set, any matching prefixes are placed
 *                  here with a "true" mark. May be NULL, in which case
 *                  entries containing the delimiter are skipped.
 * is_truncated: (optional) set to whether the listing was cut short.
 *
 * Inputs taken from this->params:
 * prefix: only return results that match this prefix
 * delim: entries whose name contains this string after the prefix are not
 *        returned as objects; the part of the name up to and including the
 *        first character of the match is recorded in common_prefixes.
 * marker: if filled in, the listing starts from this position.
 * end_marker: if filled in, the listing stops at the first entry that is
 *             not smaller than this one.
 * ns / enforce_ns: namespace to list and whether entries of other
 *                  namespaces are rejected.
 * filter: optional filter applied to every entry.
 * list_versions: when false, entries that are not visible are skipped.
 *
 * params.marker and next_marker are updated while entries are consumed.
 *
 * returns 0 on success, a negative error code otherwise.
 */
int RGWRados::Bucket::List::list_objects(int64_t max,
                                         vector<rgw_bucket_dir_entry> *result,
                                         map<string, bool> *common_prefixes,
                                         bool *is_truncated)
{
  RGWRados *store = target->get_store();
  CephContext *cct = store->ctx();
  int shard_id = target->get_shard_id();

  int count = 0;
  bool truncated = true;
  /* never ask the index for fewer than the configured minimum readahead */
  int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);

  result->clear();

  /* translate the user visible markers and prefix into index keys */
  rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
  rgw_obj_index_key cur_marker;
  marker_obj.get_index_key(&cur_marker);

  rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
                             params.ns);
  rgw_obj_index_key cur_end_marker;
  end_marker_obj.get_index_key(&cur_end_marker);
  const bool cur_end_marker_valid = !params.end_marker.empty();

  rgw_obj_key prefix_obj(params.prefix);
  prefix_obj.ns = params.ns;
  string cur_prefix = prefix_obj.get_index_key_name();

  /* UTF-8 encoding of (code point decoded from the delimiter) + 1; appended
   * to a common prefix it gives a marker that sorts after every key under
   * that prefix.
   * NOTE(review): presumably only the first code point of a multi-character
   * delimiter is taken into account -- confirm against decode_utf8() */
  string bigger_than_delim;

  if (!params.delim.empty()) {
    unsigned long val = decode_utf8((unsigned char *)params.delim.c_str(), params.delim.size());
    char buf[params.delim.size() + 16];
    int r = encode_utf8(val + 1, (unsigned char *)buf);
    if (r < 0) {
      ldout(cct,0) << "ERROR: encode_utf8() failed" << dendl;
      return -EINVAL;
    }
    buf[r] = '\0';

    bigger_than_delim = buf;

    /* if marker points at a common prefix, fast forward it into its upperbound string */
    int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
    if (delim_pos >= 0) {
      string s = cur_marker.name.substr(0, delim_pos);
      s.append(bigger_than_delim);
      cur_marker = s;
    }
  }

  /* marker to jump to once a common prefix has been recorded */
  string skip_after_delim;
  while (truncated && count <= max) {
    if (skip_after_delim > cur_marker.name) {
      cur_marker = skip_after_delim;
      ldout(cct, 20) << "setting cur_marker=" << cur_marker.name << "[" << cur_marker.instance << "]" << dendl;
    }
    std::map<string, rgw_bucket_dir_entry> ent_map;
    /* fetch the next chunk; truncated and cur_marker are passed by pointer
     * and are expected to be updated by the call */
    int r = store->cls_bucket_list(target->get_bucket_info(), shard_id, cur_marker, cur_prefix,
                                   read_ahead + 1 - count, params.list_versions, ent_map,
                                   &truncated, &cur_marker);
    if (r < 0)
      return r;

    std::map<string, rgw_bucket_dir_entry>::iterator eiter;
    for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
      rgw_bucket_dir_entry& entry = eiter->second;
      rgw_obj_index_key index_key = entry.key;

      rgw_obj_key obj(index_key);

      /* note that parse_raw_oid() here will not set the correct object's instance, as
       * rgw_obj_index_key encodes that separately. We don't need to set the instance because it's
       * not needed for the checks here and we end up using the raw entry for the return vector
       */
      bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
      if (!valid) {
        ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
        continue;
      }
      bool check_ns = (obj.ns == params.ns);
      if (!params.list_versions && !entry.is_visible()) {
        continue;
      }

      if (params.enforce_ns && !check_ns) {
        if (!params.ns.empty()) {
          /* we've iterated past the namespace we're searching -- done now */
          truncated = false;
          goto done;
        }

        /* we're not looking at the namespace this object is in, next! */
        continue;
      }

      /* reached the end marker: the listing is complete */
      if (cur_end_marker_valid && cur_end_marker <= index_key) {
        truncated = false;
        goto done;
      }

      /* remember the position of the last entry looked at while there is
       * still room in the result */
      if (count < max) {
        params.marker = index_key;
        next_marker = index_key;
      }

      if (params.filter && !params.filter->filter(obj.name, index_key.name))
        continue;

      if (params.prefix.size() &&  (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
        continue;

      if (!params.delim.empty()) {
        int delim_pos = obj.name.find(params.delim, params.prefix.size());

        if (delim_pos >= 0) {
          /* name up to and including the character at delim_pos */
          string prefix_key = obj.name.substr(0, delim_pos + 1);

          if (common_prefixes &&
              common_prefixes->find(prefix_key) == common_prefixes->end()) {
            if (count >= max) {
              truncated = true;
              goto done;
            }
            next_marker = prefix_key;
            (*common_prefixes)[prefix_key] = true;

            int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());

            /* next index request should start after everything that lives
             * under this common prefix */
            skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
            skip_after_delim.append(bigger_than_delim);

            ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;

            count++;
          }

          /* entries under a common prefix are never returned as objects */
          continue;
        }
      }

      if (count >= max) {
        truncated = true;
        goto done;
      }

      result->emplace_back(std::move(entry));
      count++;
    }

    // Either the back-end telling us truncated, or we don't consume all
    // items returned per the amount caller request
    truncated = (truncated || eiter != ent_map.end());
  }

done:
  if (is_truncated)
    *is_truncated = truncated;

  return 0;
}
5753
5754 /**
5755 * create a rados pool, associated meta info
5756 * returns 0 on success, -ERR# otherwise.
5757 */
5758 int RGWRados::create_pool(const rgw_pool& pool)
5759 {
5760 librados::IoCtx io_ctx;
5761 constexpr bool create = true;
5762 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
5763 }
5764
5765 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
5766 {
5767 librados::IoCtx index_ctx; // context for new bucket
5768
5769 string dir_oid = dir_oid_prefix;
5770 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5771 if (r < 0) {
5772 return r;
5773 }
5774
5775 dir_oid.append(bucket_info.bucket.bucket_id);
5776
5777 map<int, string> bucket_objs;
5778 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
5779
5780 return CLSRGWIssueBucketIndexInit(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5781 }
5782
5783 void RGWRados::create_bucket_id(string *bucket_id)
5784 {
5785 uint64_t iid = instance_id();
5786 uint64_t bid = next_bucket_id();
5787 char buf[get_zone_params().get_id().size() + 48];
5788 snprintf(buf, sizeof(buf), "%s.%llu.%llu", get_zone_params().get_id().c_str(), (long long)iid, (long long)bid);
5789 *bucket_id = buf;
5790 }
5791
/**
 * Create a bucket: select its placement, initialise its index and write
 * its linked bucket info.
 *
 * owner: user that will own the bucket
 * bucket: bucket key; marker and bucket_id are filled in by this call
 * zonegroup_id: zonegroup recorded in the bucket info
 * placement_rule: requested placement rule, passed to
 *                 select_bucket_placement()
 * swift_ver_location: recorded in the info; swift versioning is enabled
 *                     when it is not empty
 * pquota_info: (optional) quota copied into the info
 * attrs: attributes written together with the bucket info
 * info: (out) bucket info that is built and written; on -EEXIST it is
 *       re-read so that it describes the existing bucket
 * pobjv: (optional) write version to use instead of generating a new one
 * pep_objv: forwarded to put_linked_bucket_info()
 * creation_time: used unless zero, in which case the current time is used
 * pmaster_bucket: (optional) when set, marker and bucket_id are copied
 *                 from it instead of being generated
 * pmaster_num_shards: (optional) shard count to use instead of
 *                     bucket_index_max_shards
 * exclusive: forwarded to put_linked_bucket_info()
 *
 * returns the result of put_linked_bucket_info() (including -EEXIST when
 * the bucket already exists), or an earlier negative error code; -ENOENT
 * when all retries were used up.
 */
int RGWRados::create_bucket(RGWUserInfo& owner, rgw_bucket& bucket,
                            const string& zonegroup_id,
                            const string& placement_rule,
                            const string& swift_ver_location,
                            const RGWQuotaInfo * pquota_info,
                            map<std::string, bufferlist>& attrs,
                            RGWBucketInfo& info,
                            obj_version *pobjv,
                            obj_version *pep_objv,
                            real_time creation_time,
                            rgw_bucket *pmaster_bucket,
                            uint32_t *pmaster_num_shards,
                            bool exclusive)
{
#define MAX_CREATE_RETRIES 20 /* need to bound retries */
  string selected_placement_rule_name;
  RGWZonePlacementInfo rule_info;

  /* an iteration is only repeated when the write hit -EEXIST and the
   * existing bucket then vanished before it could be re-read */
  for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
    int ret = 0;
    ret = select_bucket_placement(owner, zonegroup_id, placement_rule,
                                  &selected_placement_rule_name, &rule_info);
    if (ret < 0)
      return ret;

    if (!pmaster_bucket) {
      create_bucket_id(&bucket.marker);
      bucket.bucket_id = bucket.marker;
    } else {
      bucket.marker = pmaster_bucket->marker;
      bucket.bucket_id = pmaster_bucket->bucket_id;
    }

    RGWObjVersionTracker& objv_tracker = info.objv_tracker;

    if (pobjv) {
      objv_tracker.write_version = *pobjv;
    } else {
      objv_tracker.generate_new_write_ver(cct);
    }

    info.bucket = bucket;
    info.owner = owner.user_id;
    info.zonegroup = zonegroup_id;
    info.placement_rule = selected_placement_rule_name;
    info.index_type = rule_info.index_type;
    info.swift_ver_location = swift_ver_location;
    info.swift_versioning = (!swift_ver_location.empty());
    if (pmaster_num_shards) {
      info.num_shards = *pmaster_num_shards;
    } else {
      info.num_shards = bucket_index_max_shards;
    }
    info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
    info.requester_pays = false;
    if (real_clock::is_zero(creation_time)) {
      info.creation_time = ceph::real_clock::now();
    } else {
      info.creation_time = creation_time;
    }
    if (pquota_info) {
      info.quota = *pquota_info;
    }

    /* the index objects are created before the bucket info is written */
    int r = init_bucket_index(info, info.num_shards);
    if (r < 0) {
      return r;
    }

    ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
    if (ret == -EEXIST) {
      librados::IoCtx index_ctx;
      map<int, string> bucket_objs;
      int r = open_bucket_index(info, index_ctx, bucket_objs);
      if (r < 0)
        return r;

      /* we need to reread the info and return it, caller will have a use for it */
      RGWObjVersionTracker instance_ver = info.objv_tracker;
      info.objv_tracker.clear();
      RGWObjectCtx obj_ctx(this);
      r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
      if (r < 0) {
        if (r == -ENOENT) {
          /* the existing bucket is gone again, retry the creation */
          continue;
        }
        ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
        return r;
      }

      /* only remove it if it's a different bucket instance */
      if (info.bucket.bucket_id != bucket.bucket_id) {
        /* remove bucket meta instance */
        string entry = bucket.get_key();
        r = rgw_bucket_instance_remove_entry(this, entry, &instance_ver);
        if (r < 0)
          return r;

        map<int, string>::const_iterator biter;
        for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
          // Do best effort removal
          index_ctx.remove(biter->second);
        }
      }
      /* ret is still -EEXIST here: only r was reassigned in this branch */
    }
    return ret;
  }

  /* this is highly unlikely */
  ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
  return -ENOENT;
}
5905
5906 int RGWRados::select_new_bucket_location(RGWUserInfo& user_info, const string& zonegroup_id, const string& request_rule,
5907 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
5908
5909 {
5910 /* first check that zonegroup exists within current period. */
5911 RGWZoneGroup zonegroup;
5912 int ret = get_zonegroup(zonegroup_id, zonegroup);
5913 if (ret < 0) {
5914 ldout(cct, 0) << "could not find zonegroup " << zonegroup_id << " in current period" << dendl;
5915 return ret;
5916 }
5917
5918 /* find placement rule. Hierarchy: request rule > user default rule > zonegroup default rule */
5919 std::map<std::string, RGWZoneGroupPlacementTarget>::const_iterator titer;
5920
5921 if (!request_rule.empty()) {
5922 titer = zonegroup.placement_targets.find(request_rule);
5923 if (titer == zonegroup.placement_targets.end()) {
5924 ldout(cct, 0) << "could not find requested placement id " << request_rule
5925 << " within zonegroup " << dendl;
5926 return -ERR_INVALID_LOCATION_CONSTRAINT;
5927 }
5928 } else if (!user_info.default_placement.empty()) {
5929 titer = zonegroup.placement_targets.find(user_info.default_placement);
5930 if (titer == zonegroup.placement_targets.end()) {
5931 ldout(cct, 0) << "could not find user default placement id " << user_info.default_placement
5932 << " within zonegroup " << dendl;
5933 return -ERR_INVALID_LOCATION_CONSTRAINT;
5934 }
5935 } else {
5936 if (zonegroup.default_placement.empty()) { // zonegroup default rule as fallback, it should not be empty.
5937 ldout(cct, 0) << "misconfiguration, zonegroup default placement id should not be empty." << dendl;
5938 return -ERR_ZONEGROUP_DEFAULT_PLACEMENT_MISCONFIGURATION;
5939 } else {
5940 titer = zonegroup.placement_targets.find(zonegroup.default_placement);
5941 if (titer == zonegroup.placement_targets.end()) {
5942 ldout(cct, 0) << "could not find zonegroup default placement id " << zonegroup.default_placement
5943 << " within zonegroup " << dendl;
5944 return -ERR_INVALID_LOCATION_CONSTRAINT;
5945 }
5946 }
5947 }
5948
5949 /* now check tag for the rule, whether user is permitted to use rule */
5950 const auto& target_rule = titer->second;
5951 if (!target_rule.user_permitted(user_info.placement_tags)) {
5952 ldout(cct, 0) << "user not permitted to use placement rule " << titer->first << dendl;
5953 return -EPERM;
5954 }
5955
5956 if (pselected_rule_name)
5957 *pselected_rule_name = titer->first;
5958
5959 return select_bucket_location_by_rule(titer->first, rule_info);
5960 }
5961
5962 int RGWRados::select_bucket_location_by_rule(const string& location_rule, RGWZonePlacementInfo *rule_info)
5963 {
5964 if (location_rule.empty()) {
5965 /* we can only reach here if we're trying to set a bucket location from a bucket
5966 * created on a different zone, using a legacy / default pool configuration
5967 */
5968 return select_legacy_bucket_placement(rule_info);
5969 }
5970
5971 /*
5972 * make sure that zone has this rule configured. We're
5973 * checking it for the local zone, because that's where this bucket object is going to
5974 * reside.
5975 */
5976 map<string, RGWZonePlacementInfo>::iterator piter = get_zone_params().placement_pools.find(location_rule);
5977 if (piter == get_zone_params().placement_pools.end()) {
5978 /* couldn't find, means we cannot really place data for this bucket in this zone */
5979 if (get_zonegroup().equals(zonegroup.get_id())) {
5980 /* that's a configuration error, zone should have that rule, as we're within the requested
5981 * zonegroup */
5982 return -EINVAL;
5983 } else {
5984 /* oh, well, data is not going to be placed here, bucket object is just a placeholder */
5985 return 0;
5986 }
5987 }
5988
5989 RGWZonePlacementInfo& placement_info = piter->second;
5990
5991 if (rule_info) {
5992 *rule_info = placement_info;
5993 }
5994
5995 return 0;
5996 }
5997
5998 int RGWRados::select_bucket_placement(RGWUserInfo& user_info, const string& zonegroup_id, const string& placement_rule,
5999 string *pselected_rule_name, RGWZonePlacementInfo *rule_info)
6000 {
6001 if (!get_zone_params().placement_pools.empty()) {
6002 return select_new_bucket_location(user_info, zonegroup_id, placement_rule,
6003 pselected_rule_name, rule_info);
6004 }
6005
6006 if (pselected_rule_name) {
6007 pselected_rule_name->clear();
6008 }
6009
6010 return select_legacy_bucket_placement(rule_info);
6011 }
6012
6013 int RGWRados::select_legacy_bucket_placement(RGWZonePlacementInfo *rule_info)
6014 {
6015 bufferlist map_bl;
6016 map<string, bufferlist> m;
6017 string pool_name;
6018 bool write_map = false;
6019
6020 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6021
6022 RGWObjectCtx obj_ctx(this);
6023 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root, avail_pools, map_bl, NULL, NULL);
6024 if (ret < 0) {
6025 goto read_omap;
6026 }
6027
6028 try {
6029 bufferlist::iterator iter = map_bl.begin();
6030 ::decode(m, iter);
6031 } catch (buffer::error& err) {
6032 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
6033 }
6034
6035 read_omap:
6036 if (m.empty()) {
6037 bufferlist header;
6038 ret = omap_get_all(obj, header, m);
6039
6040 write_map = true;
6041 }
6042
6043 if (ret < 0 || m.empty()) {
6044 vector<rgw_pool> pools;
6045 string s = string("default.") + default_storage_pool_suffix;
6046 pools.push_back(rgw_pool(s));
6047 vector<int> retcodes;
6048 bufferlist bl;
6049 ret = create_pools(pools, retcodes);
6050 if (ret < 0)
6051 return ret;
6052 ret = omap_set(obj, s, bl);
6053 if (ret < 0)
6054 return ret;
6055 m[s] = bl;
6056 }
6057
6058 if (write_map) {
6059 bufferlist new_bl;
6060 ::encode(m, new_bl);
6061 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6062 if (ret < 0) {
6063 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6064 }
6065 }
6066
6067 map<string, bufferlist>::iterator miter;
6068 if (m.size() > 1) {
6069 vector<string> v;
6070 for (miter = m.begin(); miter != m.end(); ++miter) {
6071 v.push_back(miter->first);
6072 }
6073
6074 uint32_t r;
6075 ret = get_random_bytes((char *)&r, sizeof(r));
6076 if (ret < 0)
6077 return ret;
6078
6079 int i = r % v.size();
6080 pool_name = v[i];
6081 } else {
6082 miter = m.begin();
6083 pool_name = miter->first;
6084 }
6085
6086 rule_info->data_pool = pool_name;
6087 rule_info->data_extra_pool = pool_name;
6088 rule_info->index_pool = pool_name;
6089 rule_info->index_type = RGWBIType_Normal;
6090
6091 return 0;
6092 }
6093
6094 bool RGWRados::get_obj_data_pool(const string& placement_rule, const rgw_obj& obj, rgw_pool *pool)
6095 {
6096 return rgw_get_obj_data_pool(zonegroup, zone_params, placement_rule, obj, pool);
6097 }
6098
6099 bool RGWRados::obj_to_raw(const string& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
6100 {
6101 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
6102
6103 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
6104 }
6105
6106 int RGWRados::update_placement_map()
6107 {
6108 bufferlist header;
6109 map<string, bufferlist> m;
6110 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6111 int ret = omap_get_all(obj, header, m);
6112 if (ret < 0)
6113 return ret;
6114
6115 bufferlist new_bl;
6116 ::encode(m, new_bl);
6117 ret = put_system_obj_data(NULL, obj, new_bl, -1, false);
6118 if (ret < 0) {
6119 ldout(cct, 0) << "WARNING: could not save avail pools map info ret=" << ret << dendl;
6120 }
6121
6122 return ret;
6123 }
6124
6125 int RGWRados::add_bucket_placement(const rgw_pool& new_pool)
6126 {
6127 librados::Rados *rad = get_rados_handle();
6128 int ret = rad->pool_lookup(new_pool.name.c_str());
6129 if (ret < 0) // DNE, or something
6130 return ret;
6131
6132 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6133 bufferlist empty_bl;
6134 ret = omap_set(obj, new_pool.to_str(), empty_bl);
6135
6136 // don't care about return value
6137 update_placement_map();
6138
6139 return ret;
6140 }
6141
6142 int RGWRados::remove_bucket_placement(const rgw_pool& old_pool)
6143 {
6144 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6145 int ret = omap_del(obj, old_pool.to_str());
6146
6147 // don't care about return value
6148 update_placement_map();
6149
6150 return ret;
6151 }
6152
6153 int RGWRados::list_placement_set(set<rgw_pool>& names)
6154 {
6155 bufferlist header;
6156 map<string, bufferlist> m;
6157
6158 rgw_raw_obj obj(get_zone_params().domain_root, avail_pools);
6159 int ret = omap_get_all(obj, header, m);
6160 if (ret < 0)
6161 return ret;
6162
6163 names.clear();
6164 map<string, bufferlist>::iterator miter;
6165 for (miter = m.begin(); miter != m.end(); ++miter) {
6166 names.insert(rgw_pool(miter->first));
6167 }
6168
6169 return names.size();
6170 }
6171
6172 int RGWRados::create_pools(vector<rgw_pool>& pools, vector<int>& retcodes)
6173 {
6174 vector<librados::PoolAsyncCompletion *> completions;
6175 vector<int> rets;
6176
6177 librados::Rados *rad = get_rados_handle();
6178 for (auto iter = pools.begin(); iter != pools.end(); ++iter) {
6179 librados::PoolAsyncCompletion *c = librados::Rados::pool_async_create_completion();
6180 completions.push_back(c);
6181 rgw_pool& pool = *iter;
6182 int ret = rad->pool_create_async(pool.name.c_str(), c);
6183 rets.push_back(ret);
6184 }
6185
6186 vector<int>::iterator riter;
6187 vector<librados::PoolAsyncCompletion *>::iterator citer;
6188
6189 bool error = false;
6190 assert(rets.size() == completions.size());
6191 for (riter = rets.begin(), citer = completions.begin(); riter != rets.end(); ++riter, ++citer) {
6192 int r = *riter;
6193 PoolAsyncCompletion *c = *citer;
6194 if (r == 0) {
6195 c->wait();
6196 r = c->get_return_value();
6197 if (r < 0) {
6198 ldout(cct, 0) << "WARNING: async pool_create returned " << r << dendl;
6199 error = true;
6200 }
6201 }
6202 c->release();
6203 retcodes.push_back(r);
6204 }
6205 if (error) {
6206 return 0;
6207 }
6208
6209 std::vector<librados::IoCtx> io_ctxs;
6210 retcodes.clear();
6211 for (auto pool : pools) {
6212 io_ctxs.emplace_back();
6213 int ret = rad->ioctx_create(pool.name.c_str(), io_ctxs.back());
6214 if (ret < 0) {
6215 ldout(cct, 0) << "WARNING: ioctx_create returned " << ret << dendl;
6216 error = true;
6217 }
6218 retcodes.push_back(ret);
6219 }
6220 if (error) {
6221 return 0;
6222 }
6223
6224 completions.clear();
6225 for (auto &io_ctx : io_ctxs) {
6226 librados::PoolAsyncCompletion *c =
6227 librados::Rados::pool_async_create_completion();
6228 completions.push_back(c);
6229 int ret = io_ctx.application_enable_async(pg_pool_t::APPLICATION_NAME_RGW,
6230 false, c);
6231 assert(ret == 0);
6232 }
6233
6234 retcodes.clear();
6235 for (auto c : completions) {
6236 c->wait();
6237 int ret = c->get_return_value();
6238 if (ret == -EOPNOTSUPP) {
6239 ret = 0;
6240 } else if (ret < 0) {
6241 ldout(cct, 0) << "WARNING: async application_enable returned " << ret
6242 << dendl;
6243 error = true;
6244 }
6245 c->release();
6246 retcodes.push_back(ret);
6247 }
6248 return 0;
6249 }
6250
6251 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
6252 {
6253 string oid, key;
6254 get_obj_bucket_and_oid_loc(obj, oid, key);
6255
6256 rgw_pool pool;
6257 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6258 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6259 return -EIO;
6260 }
6261
6262 int r = open_pool_ctx(pool, *ioctx);
6263 if (r < 0) {
6264 return r;
6265 }
6266
6267 ioctx->locator_set_key(key);
6268
6269 return 0;
6270 }
6271
6272 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
6273 {
6274 get_obj_bucket_and_oid_loc(obj, ref->oid, ref->key);
6275
6276 rgw_pool pool;
6277 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
6278 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
6279 return -EIO;
6280 }
6281
6282 int r = open_pool_ctx(pool, ref->ioctx);
6283 if (r < 0) {
6284 return r;
6285 }
6286
6287 ref->ioctx.locator_set_key(ref->key);
6288
6289 return 0;
6290 }
6291
6292 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6293 {
6294 ref->oid = obj.oid;
6295 ref->key = obj.loc;
6296
6297 int r;
6298
6299 if (ref->oid.empty()) {
6300 ref->oid = obj.pool.to_str();
6301 ref->pool = get_zone_params().domain_root;
6302 } else {
6303 ref->pool = obj.pool;
6304 }
6305 r = open_pool_ctx(ref->pool, ref->ioctx);
6306 if (r < 0)
6307 return r;
6308
6309 ref->ioctx.locator_set_key(ref->key);
6310
6311 return 0;
6312 }
6313
6314 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
6315 {
6316 return get_raw_obj_ref(obj, ref);
6317 }
6318
6319 /*
6320 * fixes an issue where head objects were supposed to have a locator created, but ended
6321 * up without one
6322 */
6323 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
6324 {
6325 const rgw_bucket& bucket = bucket_info.bucket;
6326 string oid;
6327 string locator;
6328
6329 rgw_obj obj(bucket, key);
6330
6331 get_obj_bucket_and_oid_loc(obj, oid, locator);
6332
6333 if (locator.empty()) {
6334 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
6335 return 0;
6336 }
6337
6338 librados::IoCtx ioctx;
6339
6340 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
6341 if (ret < 0) {
6342 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
6343 return ret;
6344 }
6345 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
6346
6347 uint64_t size;
6348 bufferlist data;
6349
6350 struct timespec mtime_ts;
6351 map<string, bufferlist> attrs;
6352 librados::ObjectReadOperation op;
6353 op.getxattrs(&attrs, NULL);
6354 op.stat2(&size, &mtime_ts, NULL);
6355 #define HEAD_SIZE 512 * 1024
6356 op.read(0, HEAD_SIZE, &data, NULL);
6357
6358 ret = ioctx.operate(oid, &op, NULL);
6359 if (ret < 0) {
6360 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
6361 return ret;
6362 }
6363
6364 if (size > HEAD_SIZE) {
6365 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
6366 return -EIO;
6367 }
6368
6369 if (size != data.length()) {
6370 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
6371 return -EIO;
6372 }
6373
6374 if (copy_obj) {
6375 librados::ObjectWriteOperation wop;
6376
6377 wop.mtime2(&mtime_ts);
6378
6379 map<string, bufferlist>::iterator iter;
6380 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6381 wop.setxattr(iter->first.c_str(), iter->second);
6382 }
6383
6384 wop.write(0, data);
6385
6386 ioctx.locator_set_key(locator);
6387 ioctx.operate(oid, &wop);
6388 }
6389
6390 if (remove_bad) {
6391 ioctx.locator_set_key(string());
6392
6393 ret = ioctx.remove(oid);
6394 if (ret < 0) {
6395 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
6396 return ret;
6397 }
6398 }
6399
6400 return 0;
6401 }
6402
6403 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
6404 const string& src_oid, const string& src_locator,
6405 librados::IoCtx& dst_ioctx,
6406 const string& dst_oid, const string& dst_locator)
6407 {
6408
6409 #define COPY_BUF_SIZE (4 * 1024 * 1024)
6410 bool done = false;
6411 uint64_t chunk_size = COPY_BUF_SIZE;
6412 uint64_t ofs = 0;
6413 int ret = 0;
6414 real_time mtime;
6415 struct timespec mtime_ts;
6416 uint64_t size;
6417
6418 if (src_oid == dst_oid && src_locator == dst_locator) {
6419 return 0;
6420 }
6421
6422 src_ioctx.locator_set_key(src_locator);
6423 dst_ioctx.locator_set_key(dst_locator);
6424
6425 do {
6426 bufferlist data;
6427 ObjectReadOperation rop;
6428 ObjectWriteOperation wop;
6429
6430 if (ofs == 0) {
6431 rop.stat2(&size, &mtime_ts, NULL);
6432 mtime = real_clock::from_timespec(mtime_ts);
6433 }
6434 rop.read(ofs, chunk_size, &data, NULL);
6435 ret = src_ioctx.operate(src_oid, &rop, NULL);
6436 if (ret < 0) {
6437 goto done_err;
6438 }
6439
6440 if (data.length() == 0) {
6441 break;
6442 }
6443
6444 if (ofs == 0) {
6445 wop.create(true); /* make it exclusive */
6446 wop.mtime2(&mtime_ts);
6447 mtime = real_clock::from_timespec(mtime_ts);
6448 }
6449 wop.write(ofs, data);
6450 ret = dst_ioctx.operate(dst_oid, &wop);
6451 ofs += data.length();
6452 done = data.length() != chunk_size;
6453 } while (!done);
6454
6455 if (ofs != size) {
6456 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
6457 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
6458 ret = -EIO;
6459 goto done_err;
6460 }
6461
6462 src_ioctx.remove(src_oid);
6463
6464 return 0;
6465
6466 done_err:
6467 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
6468 return ret;
6469 }
6470
6471 /*
6472 * fixes an issue where head objects were supposed to have a locator created, but ended
6473 * up without one
6474 */
6475 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
6476 {
6477 const rgw_bucket& bucket = bucket_info.bucket;
6478 rgw_obj obj(bucket, key);
6479
6480 if (need_fix) {
6481 *need_fix = false;
6482 }
6483
6484 rgw_rados_ref ref;
6485 int r = get_obj_head_ref(bucket_info, obj, &ref);
6486 if (r < 0) {
6487 return r;
6488 }
6489
6490 RGWObjState *astate = NULL;
6491 RGWObjectCtx rctx(this);
6492 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
6493 if (r < 0)
6494 return r;
6495
6496 if (astate->has_manifest) {
6497 RGWObjManifest::obj_iterator miter;
6498 RGWObjManifest& manifest = astate->manifest;
6499 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
6500 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
6501 rgw_obj loc;
6502 string oid;
6503 string locator;
6504
6505 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
6506
6507 if (loc.key.ns.empty()) {
6508 /* continue, we're only interested in tail objects */
6509 continue;
6510 }
6511
6512 get_obj_bucket_and_oid_loc(loc, oid, locator);
6513 ref.ioctx.locator_set_key(locator);
6514
6515 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
6516
6517 r = ref.ioctx.stat(oid, NULL, NULL);
6518 if (r != -ENOENT) {
6519 continue;
6520 }
6521
6522 string bad_loc;
6523 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
6524
6525 /* create a new ioctx with the bad locator */
6526 librados::IoCtx src_ioctx;
6527 src_ioctx.dup(ref.ioctx);
6528 src_ioctx.locator_set_key(bad_loc);
6529
6530 r = src_ioctx.stat(oid, NULL, NULL);
6531 if (r != 0) {
6532 /* cannot find a broken part */
6533 continue;
6534 }
6535 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
6536 if (need_fix) {
6537 *need_fix = true;
6538 }
6539 if (fix) {
6540 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
6541 if (r < 0) {
6542 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
6543 }
6544 }
6545 }
6546 }
6547
6548 return 0;
6549 }
6550
6551 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, const rgw_obj& obj)
6552 {
6553 bucket = _bucket;
6554
6555 RGWObjectCtx obj_ctx(store);
6556
6557 RGWBucketInfo bucket_info;
6558 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6559 if (ret < 0) {
6560 return ret;
6561 }
6562
6563 ret = store->open_bucket_index_shard(bucket_info, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
6564 if (ret < 0) {
6565 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6566 return ret;
6567 }
6568 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6569
6570 return 0;
6571 }
6572
6573 int RGWRados::BucketShard::init(const rgw_bucket& _bucket, int sid)
6574 {
6575 bucket = _bucket;
6576 shard_id = sid;
6577
6578 RGWObjectCtx obj_ctx(store);
6579
6580 RGWBucketInfo bucket_info;
6581 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
6582 if (ret < 0) {
6583 return ret;
6584 }
6585
6586 ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6587 if (ret < 0) {
6588 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6589 return ret;
6590 }
6591 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6592
6593 return 0;
6594 }
6595
6596 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
6597 {
6598 bucket = bucket_info.bucket;
6599 shard_id = sid;
6600
6601 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
6602 if (ret < 0) {
6603 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
6604 return ret;
6605 }
6606 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
6607
6608 return 0;
6609 }
6610
6611
6612 /* Execute @handler on last item in bucket listing for bucket specified
6613 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
6614 * to objects matching these criterias. */
6615 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
6616 const std::string& obj_prefix,
6617 const std::string& obj_delim,
6618 std::function<int(const rgw_bucket_dir_entry&)> handler)
6619 {
6620 RGWRados::Bucket target(this, bucket_info);
6621 RGWRados::Bucket::List list_op(&target);
6622
6623 list_op.params.prefix = obj_prefix;
6624 list_op.params.delim = obj_delim;
6625
6626 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
6627 << ", obj_prefix=" << obj_prefix
6628 << ", obj_delim=" << obj_delim
6629 << dendl;
6630
6631 bool is_truncated = false;
6632
6633 boost::optional<rgw_bucket_dir_entry> last_entry;
6634 /* We need to rewind to the last object in a listing. */
6635 do {
6636 /* List bucket entries in chunks. */
6637 static constexpr int MAX_LIST_OBJS = 100;
6638 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
6639
6640 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
6641 &is_truncated);
6642 if (ret < 0) {
6643 return ret;
6644 } else if (!entries.empty()) {
6645 last_entry = entries.back();
6646 }
6647 } while (is_truncated);
6648
6649 if (last_entry) {
6650 return handler(*last_entry);
6651 }
6652
6653 /* Empty listing - no items we can run handler on. */
6654 return 0;
6655 }
6656
6657
6658 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
6659 const rgw_user& user,
6660 RGWBucketInfo& bucket_info,
6661 rgw_obj& obj)
6662 {
6663 if (! swift_versioning_enabled(bucket_info)) {
6664 return 0;
6665 }
6666
6667 obj_ctx.obj.set_atomic(obj);
6668
6669 RGWObjState * state = nullptr;
6670 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
6671 if (r < 0) {
6672 return r;
6673 }
6674
6675 if (!state->exists) {
6676 return 0;
6677 }
6678
6679 string client_id;
6680 string op_id;
6681
6682 const string& src_name = obj.get_oid();
6683 char buf[src_name.size() + 32];
6684 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
6685 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
6686 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
6687
6688 RGWBucketInfo dest_bucket_info;
6689
6690 r = get_bucket_info(obj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
6691 if (r < 0) {
6692 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
6693 if (r == -ENOENT) {
6694 return -ERR_PRECONDITION_FAILED;
6695 }
6696 return r;
6697 }
6698
6699 if (dest_bucket_info.owner != bucket_info.owner) {
6700 return -ERR_PRECONDITION_FAILED;
6701 }
6702
6703 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
6704 obj_ctx.obj.set_atomic(dest_obj);
6705
6706 string no_zone;
6707
6708 r = copy_obj(obj_ctx,
6709 user,
6710 client_id,
6711 op_id,
6712 NULL, /* req_info *info */
6713 no_zone,
6714 dest_obj,
6715 obj,
6716 dest_bucket_info,
6717 bucket_info,
6718 NULL, /* time_t *src_mtime */
6719 NULL, /* time_t *mtime */
6720 NULL, /* const time_t *mod_ptr */
6721 NULL, /* const time_t *unmod_ptr */
6722 false, /* bool high_precision_time */
6723 NULL, /* const char *if_match */
6724 NULL, /* const char *if_nomatch */
6725 RGWRados::ATTRSMOD_NONE,
6726 true, /* bool copy_if_newer */
6727 state->attrset,
6728 RGW_OBJ_CATEGORY_MAIN,
6729 0, /* uint64_t olh_epoch */
6730 real_time(), /* time_t delete_at */
6731 NULL, /* string *version_id */
6732 NULL, /* string *ptag */
6733 NULL, /* string *petag */
6734 NULL, /* void (*progress_cb)(off_t, void *) */
6735 NULL); /* void *progress_data */
6736 if (r == -ECANCELED || r == -ENOENT) {
6737 /* Has already been overwritten, meaning another rgw process already
6738 * copied it out */
6739 return 0;
6740 }
6741
6742 return r;
6743 }
6744
6745 int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
6746 const rgw_user& user,
6747 RGWBucketInfo& bucket_info,
6748 rgw_obj& obj,
6749 bool& restored) /* out */
6750 {
6751 if (! swift_versioning_enabled(bucket_info)) {
6752 return 0;
6753 }
6754
6755 /* Bucket info of the bucket that stores previous versions of our object. */
6756 RGWBucketInfo archive_binfo;
6757
6758 int ret = get_bucket_info(obj_ctx, bucket_info.bucket.tenant,
6759 bucket_info.swift_ver_location, archive_binfo,
6760 nullptr, nullptr);
6761 if (ret < 0) {
6762 return ret;
6763 }
6764
6765 /* Abort the operation if the bucket storing our archive belongs to someone
6766 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
6767 * into consideration. For we can live with that.
6768 *
6769 * TODO: delegate this check to un upper layer and compare with ACLs. */
6770 if (bucket_info.owner != archive_binfo.owner) {
6771 return -EPERM;
6772 }
6773
6774 /* This code will be executed on latest version of the object. */
6775 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
6776 std::string no_client_id;
6777 std::string no_op_id;
6778 std::string no_zone;
6779
6780 /* We don't support object versioning of Swift API on those buckets that
6781 * are already versioned using the S3 mechanism. This affects also bucket
6782 * storing archived objects. Otherwise the delete operation would create
6783 * a deletion marker. */
6784 if (archive_binfo.versioned()) {
6785 restored = false;
6786 return -ERR_PRECONDITION_FAILED;
6787 }
6788
6789 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
6790 * irrelevant and may be safely skipped. */
6791 std::map<std::string, ceph::bufferlist> no_attrs;
6792
6793 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
6794 obj_ctx.obj.set_atomic(archive_obj);
6795 obj_ctx.obj.set_atomic(obj);
6796
6797 int ret = copy_obj(obj_ctx,
6798 user,
6799 no_client_id,
6800 no_op_id,
6801 nullptr, /* req_info *info */
6802 no_zone,
6803 obj, /* dest obj */
6804 archive_obj, /* src obj */
6805 bucket_info, /* dest bucket info */
6806 archive_binfo, /* src bucket info */
6807 nullptr, /* time_t *src_mtime */
6808 nullptr, /* time_t *mtime */
6809 nullptr, /* const time_t *mod_ptr */
6810 nullptr, /* const time_t *unmod_ptr */
6811 false, /* bool high_precision_time */
6812 nullptr, /* const char *if_match */
6813 nullptr, /* const char *if_nomatch */
6814 RGWRados::ATTRSMOD_NONE,
6815 true, /* bool copy_if_newer */
6816 no_attrs,
6817 RGW_OBJ_CATEGORY_MAIN,
6818 0, /* uint64_t olh_epoch */
6819 real_time(), /* time_t delete_at */
6820 nullptr, /* string *version_id */
6821 nullptr, /* string *ptag */
6822 nullptr, /* string *petag */
6823 nullptr, /* void (*progress_cb)(off_t, void *) */
6824 nullptr); /* void *progress_data */
6825 if (ret == -ECANCELED || ret == -ENOENT) {
6826 /* Has already been overwritten, meaning another rgw process already
6827 * copied it out */
6828 return 0;
6829 } else if (ret < 0) {
6830 return ret;
6831 } else {
6832 restored = true;
6833 }
6834
6835 /* Need to remove the archived copy. */
6836 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
6837 archive_binfo.versioning_status());
6838
6839 return ret;
6840 };
6841
6842 const std::string& obj_name = obj.get_oid();
6843 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
6844 % obj_name);
6845
6846 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
6847 handler);
6848 }
6849
6850 /**
6851 * Write/overwrite an object to the bucket storage.
6852 * bucket: the bucket to store the object in
6853 * obj: the object name/key
6854 * data: the object contents/value
6855 * size: the amount of data to write (data must be this long)
6856 * accounted_size: original size of data before compression, encryption
6857 * mtime: if non-NULL, writes the given mtime to the bucket storage
6858 * attrs: all the given attrs are written to bucket storage for the given object
6859 * exclusive: create object exclusively
6860 * Returns: 0 on success, -ERR# otherwise.
6861 */
6862 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
6863 map<string, bufferlist>& attrs,
6864 bool assume_noent, bool modify_tail,
6865 void *_index_op)
6866 {
6867 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
6868 RGWRados *store = target->get_store();
6869
6870 ObjectWriteOperation op;
6871
6872 RGWObjState *state;
6873 int r = target->get_state(&state, false, assume_noent);
6874 if (r < 0)
6875 return r;
6876
6877 rgw_obj& obj = target->get_obj();
6878
6879 if (obj.get_oid().empty()) {
6880 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
6881 return -EIO;
6882 }
6883
6884 rgw_rados_ref ref;
6885 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
6886 if (r < 0)
6887 return r;
6888
6889 bool is_olh = state->is_olh;
6890
6891 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
6892
6893 const string *ptag = meta.ptag;
6894 if (!ptag && !index_op->get_optag()->empty()) {
6895 ptag = index_op->get_optag();
6896 }
6897 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
6898 if (r < 0)
6899 return r;
6900
6901 if (real_clock::is_zero(meta.set_mtime)) {
6902 meta.set_mtime = real_clock::now();
6903 }
6904
6905 if (state->is_olh) {
6906 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
6907 }
6908
6909 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
6910 op.mtime2(&mtime_ts);
6911
6912 if (meta.data) {
6913 /* if we want to overwrite the data, we also want to overwrite the
6914 xattrs, so just remove the object */
6915 op.write_full(*meta.data);
6916 }
6917
6918 string etag;
6919 string content_type;
6920 bufferlist acl_bl;
6921
6922 map<string, bufferlist>::iterator iter;
6923 if (meta.rmattrs) {
6924 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
6925 const string& name = iter->first;
6926 op.rmxattr(name.c_str());
6927 }
6928 }
6929
6930 if (meta.manifest) {
6931 /* remove existing manifest attr */
6932 iter = attrs.find(RGW_ATTR_MANIFEST);
6933 if (iter != attrs.end())
6934 attrs.erase(iter);
6935
6936 bufferlist bl;
6937 ::encode(*meta.manifest, bl);
6938 op.setxattr(RGW_ATTR_MANIFEST, bl);
6939 }
6940
6941 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6942 const string& name = iter->first;
6943 bufferlist& bl = iter->second;
6944
6945 if (!bl.length())
6946 continue;
6947
6948 op.setxattr(name.c_str(), bl);
6949
6950 if (name.compare(RGW_ATTR_ETAG) == 0) {
6951 etag = bl.c_str();
6952 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
6953 content_type = bl.c_str();
6954 } else if (name.compare(RGW_ATTR_ACL) == 0) {
6955 acl_bl = bl;
6956 }
6957 }
6958 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
6959 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
6960 }
6961
6962 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
6963 bufferlist bl;
6964 ::encode(store->get_zone_short_id(), bl);
6965 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
6966 }
6967
6968 if (!op.size())
6969 return 0;
6970
6971 uint64_t epoch;
6972 int64_t poolid;
6973 bool orig_exists;
6974 uint64_t orig_size;
6975
6976 if (!reset_obj) { //Multipart upload, it has immutable head.
6977 orig_exists = false;
6978 orig_size = 0;
6979 } else {
6980 orig_exists = state->exists;
6981 orig_size = state->accounted_size;
6982 }
6983
6984 bool versioned_target = (meta.olh_epoch > 0 || !obj.key.instance.empty());
6985
6986 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
6987
6988 if (versioned_op) {
6989 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
6990 }
6991
6992 if (!index_op->is_prepared()) {
6993 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
6994 if (r < 0)
6995 return r;
6996 }
6997
6998 r = ref.ioctx.operate(ref.oid, &op);
6999 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
7000 or -ENOENT if was removed, or -EEXIST if it did not exist
7001 before and now it does */
7002 if (r == -EEXIST && assume_noent) {
7003 target->invalidate_state();
7004 return r;
7005 }
7006 goto done_cancel;
7007 }
7008
7009 epoch = ref.ioctx.get_last_version();
7010 poolid = ref.ioctx.get_id();
7011
7012 r = target->complete_atomic_modification();
7013 if (r < 0) {
7014 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7015 }
7016
7017 r = index_op->complete(poolid, epoch, size, accounted_size,
7018 meta.set_mtime, etag, content_type, &acl_bl,
7019 meta.category, meta.remove_objs, meta.user_data);
7020 if (r < 0)
7021 goto done_cancel;
7022
7023 if (meta.mtime) {
7024 *meta.mtime = meta.set_mtime;
7025 }
7026
7027 /* note that index_op was using state so we couldn't invalidate it earlier */
7028 target->invalidate_state();
7029 state = NULL;
7030
7031 if (versioned_op) {
7032 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, meta.olh_epoch, real_time(), false, meta.zones_trace);
7033 if (r < 0) {
7034 return r;
7035 }
7036 }
7037
7038 if (!real_clock::is_zero(meta.delete_at)) {
7039 rgw_obj_index_key obj_key;
7040 obj.key.get_index_key(&obj_key);
7041
7042 r = store->objexp_hint_add(meta.delete_at,
7043 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
7044 if (r < 0) {
7045 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7046 /* ignoring error, nothing we can do at this point */
7047 }
7048 }
7049 meta.canceled = false;
7050
7051 /* update quota cache */
7052 if (meta.completeMultipart){
7053 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7054 0, orig_size);
7055 }
7056 else {
7057 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
7058 accounted_size, orig_size);
7059 }
7060 return 0;
7061
7062 done_cancel:
7063 int ret = index_op->cancel();
7064 if (ret < 0) {
7065 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7066 }
7067
7068 meta.canceled = true;
7069
7070 /* we lost in a race. There are a few options:
7071 * - existing object was rewritten (ECANCELED)
7072 * - non existing object was created (EEXIST)
7073 * - object was removed (ENOENT)
7074 * should treat it as a success
7075 */
7076 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
7077 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
7078 r = 0;
7079 }
7080 } else {
7081 if (meta.if_match != NULL) {
7082 // only overwrite existing object
7083 if (strcmp(meta.if_match, "*") == 0) {
7084 if (r == -ENOENT) {
7085 r = -ERR_PRECONDITION_FAILED;
7086 } else if (r == -ECANCELED) {
7087 r = 0;
7088 }
7089 }
7090 }
7091
7092 if (meta.if_nomatch != NULL) {
7093 // only create a new object
7094 if (strcmp(meta.if_nomatch, "*") == 0) {
7095 if (r == -EEXIST) {
7096 r = -ERR_PRECONDITION_FAILED;
7097 } else if (r == -ENOENT) {
7098 r = 0;
7099 }
7100 }
7101 }
7102 }
7103
7104 return r;
7105 }
7106
7107 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
7108 map<string, bufferlist>& attrs)
7109 {
7110 RGWBucketInfo& bucket_info = target->get_bucket_info();
7111
7112 RGWRados::Bucket bop(target->get_store(), bucket_info);
7113 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
7114 index_op.set_zones_trace(meta.zones_trace);
7115
7116 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
7117 int r;
7118 if (assume_noent) {
7119 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7120 if (r == -EEXIST) {
7121 assume_noent = false;
7122 }
7123 }
7124 if (!assume_noent) {
7125 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7126 }
7127 return r;
7128 }
7129
7130 /** Write/overwrite a system object. */
7131 int RGWRados::put_system_obj_impl(rgw_raw_obj& obj, uint64_t size, real_time *mtime,
7132 map<std::string, bufferlist>& attrs, int flags,
7133 bufferlist& data,
7134 RGWObjVersionTracker *objv_tracker,
7135 real_time set_mtime /* 0 for don't set */)
7136 {
7137 rgw_rados_ref ref;
7138 int r = get_system_obj_ref(obj, &ref);
7139 if (r < 0)
7140 return r;
7141
7142 ObjectWriteOperation op;
7143
7144 if (flags & PUT_OBJ_EXCL) {
7145 if (!(flags & PUT_OBJ_CREATE))
7146 return -EINVAL;
7147 op.create(true); // exclusive create
7148 } else {
7149 op.remove();
7150 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
7151 op.create(false);
7152 }
7153
7154 if (objv_tracker) {
7155 objv_tracker->prepare_op_for_write(&op);
7156 }
7157
7158 if (real_clock::is_zero(set_mtime)) {
7159 set_mtime = real_clock::now();
7160 }
7161
7162 struct timespec mtime_ts = real_clock::to_timespec(set_mtime);
7163 op.mtime2(&mtime_ts);
7164 op.write_full(data);
7165
7166 bufferlist acl_bl;
7167
7168 for (map<string, bufferlist>::iterator iter = attrs.begin(); iter != attrs.end(); ++iter) {
7169 const string& name = iter->first;
7170 bufferlist& bl = iter->second;
7171
7172 if (!bl.length())
7173 continue;
7174
7175 op.setxattr(name.c_str(), bl);
7176 }
7177
7178 r = ref.ioctx.operate(ref.oid, &op);
7179 if (r < 0) {
7180 return r;
7181 }
7182
7183 if (objv_tracker) {
7184 objv_tracker->apply_write();
7185 }
7186
7187 if (mtime) {
7188 *mtime = set_mtime;
7189 }
7190
7191 return 0;
7192 }
7193
7194 int RGWRados::put_system_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7195 off_t ofs, bool exclusive,
7196 RGWObjVersionTracker *objv_tracker)
7197 {
7198 rgw_rados_ref ref;
7199 int r = get_system_obj_ref(obj, &ref);
7200 if (r < 0) {
7201 return r;
7202 }
7203
7204 ObjectWriteOperation op;
7205
7206 if (exclusive)
7207 op.create(true);
7208
7209 if (objv_tracker) {
7210 objv_tracker->prepare_op_for_write(&op);
7211 }
7212 if (ofs == -1) {
7213 op.write_full(bl);
7214 } else {
7215 op.write(ofs, bl);
7216 }
7217 r = ref.ioctx.operate(ref.oid, &op);
7218 if (r < 0)
7219 return r;
7220
7221 if (objv_tracker) {
7222 objv_tracker->apply_write();
7223 }
7224 return 0;
7225 }
7226
7227 /**
7228 * Write/overwrite an object to the bucket storage.
7229 * bucket: the bucket to store the object in
7230 * obj: the object name/key
7231 * data: the object contents/value
7232 * offset: the offet to write to in the object
7233 * If this is -1, we will overwrite the whole object.
7234 * size: the amount of data to write (data must be this long)
7235 * attrs: all the given attrs are written to bucket storage for the given object
7236 * Returns: 0 on success, -ERR# otherwise.
7237 */
7238
7239 int RGWRados::aio_put_obj_data(void *ctx, rgw_raw_obj& obj, bufferlist& bl,
7240 off_t ofs, bool exclusive,
7241 void **handle)
7242 {
7243 rgw_rados_ref ref;
7244 int r = get_raw_obj_ref(obj, &ref);
7245 if (r < 0) {
7246 return r;
7247 }
7248
7249 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
7250 *handle = c;
7251
7252 ObjectWriteOperation op;
7253
7254 if (exclusive)
7255 op.create(true);
7256
7257 if (ofs == -1) {
7258 op.write_full(bl);
7259 } else {
7260 op.write(ofs, bl);
7261 }
7262 r = ref.ioctx.aio_operate(ref.oid, c, &op);
7263 if (r < 0)
7264 return r;
7265
7266 return 0;
7267 }
7268
7269 int RGWRados::aio_wait(void *handle)
7270 {
7271 AioCompletion *c = (AioCompletion *)handle;
7272 c->wait_for_safe();
7273 int ret = c->get_return_value();
7274 c->release();
7275 return ret;
7276 }
7277
7278 bool RGWRados::aio_completed(void *handle)
7279 {
7280 AioCompletion *c = (AioCompletion *)handle;
7281 return c->is_safe();
7282 }
7283
7284 // PutObj filter that buffers data so we don't try to compress tiny blocks.
7285 // libcurl reads in 16k at a time, and we need at least 64k to get a good
7286 // compression ratio
7287 class RGWPutObj_Buffer : public RGWPutObj_Filter {
7288 const unsigned buffer_size;
7289 bufferlist buffer;
7290 public:
7291 RGWPutObj_Buffer(RGWPutObjDataProcessor* next, unsigned buffer_size)
7292 : RGWPutObj_Filter(next), buffer_size(buffer_size) {
7293 assert(ISP2(buffer_size)); // must be power of 2
7294 }
7295
7296 int handle_data(bufferlist& bl, off_t ofs, void **phandle, rgw_raw_obj *pobj,
7297 bool *again) override {
7298 if (*again || !bl.length()) {
7299 // flush buffered data
7300 return RGWPutObj_Filter::handle_data(buffer, ofs, phandle, pobj, again);
7301 }
7302 // transform offset to the beginning of the buffer
7303 ofs = ofs - buffer.length();
7304 buffer.claim_append(bl);
7305 if (buffer.length() < buffer_size) {
7306 *again = false; // don't come back until there's more data
7307 return 0;
7308 }
7309 const auto count = P2ALIGN(buffer.length(), buffer_size);
7310 buffer.splice(0, count, &bl);
7311 return RGWPutObj_Filter::handle_data(bl, ofs, phandle, pobj, again);
7312 }
7313 };
7314
7315 class RGWRadosPutObj : public RGWGetDataCB
7316 {
7317 CephContext* cct;
7318 rgw_obj obj;
7319 RGWPutObjDataProcessor *filter;
7320 boost::optional<RGWPutObj_Compress>& compressor;
7321 boost::optional<RGWPutObj_Buffer> buffering;
7322 CompressorRef& plugin;
7323 RGWPutObjProcessor_Atomic *processor;
7324 RGWOpStateSingleOp *opstate;
7325 void (*progress_cb)(off_t, void *);
7326 void *progress_data;
7327 bufferlist extra_data_bl;
7328 uint64_t extra_data_left;
7329 uint64_t data_len;
7330 map<string, bufferlist> src_attrs;
7331 public:
7332 RGWRadosPutObj(CephContext* cct,
7333 CompressorRef& plugin,
7334 boost::optional<RGWPutObj_Compress>& compressor,
7335 RGWPutObjProcessor_Atomic *p,
7336 RGWOpStateSingleOp *_ops,
7337 void (*_progress_cb)(off_t, void *),
7338 void *_progress_data) :
7339 cct(cct),
7340 filter(p),
7341 compressor(compressor),
7342 plugin(plugin),
7343 processor(p),
7344 opstate(_ops),
7345 progress_cb(_progress_cb),
7346 progress_data(_progress_data),
7347 extra_data_left(0),
7348 data_len(0) {}
7349
7350 int process_attrs(void) {
7351 if (extra_data_bl.length()) {
7352 JSONParser jp;
7353 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7354 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7355 return -EIO;
7356 }
7357
7358 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7359
7360 src_attrs.erase(RGW_ATTR_COMPRESSION);
7361 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7362 }
7363
7364 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
7365 //do not compress if object is encrypted
7366 compressor = boost::in_place(cct, plugin, filter);
7367 constexpr unsigned buffer_size = 512 * 1024;
7368 buffering = boost::in_place(&*compressor, buffer_size);
7369 filter = &*buffering;
7370 }
7371 return 0;
7372 }
7373
7374 int handle_data(bufferlist& bl, off_t ofs, off_t len) override {
7375 if (progress_cb) {
7376 progress_cb(ofs, progress_data);
7377 }
7378 if (extra_data_left) {
7379 size_t extra_len = bl.length();
7380 if (extra_len > extra_data_left)
7381 extra_len = extra_data_left;
7382
7383 bufferlist extra;
7384 bl.splice(0, extra_len, &extra);
7385 extra_data_bl.append(extra);
7386
7387 extra_data_left -= extra_len;
7388 if (extra_data_left == 0) {
7389 int res = process_attrs();
7390 if (res < 0)
7391 return res;
7392 }
7393 if (bl.length() == 0) {
7394 return 0;
7395 }
7396 ofs += extra_len;
7397 }
7398 // adjust ofs based on extra_data_len, so the result is a logical offset
7399 // into the object data
7400 assert(uint64_t(ofs) >= extra_data_len);
7401 ofs -= extra_data_len;
7402
7403 data_len += bl.length();
7404 bool again = false;
7405
7406 bool need_opstate = true;
7407
7408 do {
7409 void *handle = NULL;
7410 rgw_raw_obj obj;
7411 uint64_t size = bl.length();
7412 int ret = filter->handle_data(bl, ofs, &handle, &obj, &again);
7413 if (ret < 0)
7414 return ret;
7415
7416 if (need_opstate && opstate) {
7417 /* need to update opstate repository with new state. This is ratelimited, so we're not
7418 * really doing it every time
7419 */
7420 ret = opstate->renew_state();
7421 if (ret < 0) {
7422 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): failed to renew op state ret=" << ret << dendl;
7423 int r = filter->throttle_data(handle, obj, size, false);
7424 if (r < 0) {
7425 ldout(cct, 0) << "ERROR: RGWRadosPutObj::handle_data(): processor->throttle_data() returned " << r << dendl;
7426 }
7427 /* could not renew state! might have been marked as cancelled */
7428 return ret;
7429 }
7430 need_opstate = false;
7431 }
7432
7433 ret = filter->throttle_data(handle, obj, size, false);
7434 if (ret < 0)
7435 return ret;
7436 } while (again);
7437
7438 return 0;
7439 }
7440
7441 int flush() {
7442 bufferlist bl;
7443 return put_data_and_throttle(filter, bl, 0, false);
7444 }
7445
7446 bufferlist& get_extra_data() { return extra_data_bl; }
7447
7448 map<string, bufferlist>& get_attrs() { return src_attrs; }
7449
7450 void set_extra_data_len(uint64_t len) override {
7451 extra_data_left = len;
7452 RGWGetDataCB::set_extra_data_len(len);
7453 }
7454
7455 uint64_t get_data_len() {
7456 return data_len;
7457 }
7458
7459 int complete(const string& etag, real_time *mtime, real_time set_mtime,
7460 map<string, bufferlist>& attrs, real_time delete_at, rgw_zone_set *zones_trace) {
7461 return processor->complete(data_len, etag, mtime, set_mtime, attrs, delete_at, NULL, NULL, NULL, zones_trace);
7462 }
7463
7464 bool is_canceled() {
7465 return processor->is_canceled();
7466 }
7467 };
7468
7469 /*
7470 * prepare attrset depending on attrs_mod.
7471 */
7472 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
7473 map<string, bufferlist>& attrs,
7474 RGWRados::AttrsMod attrs_mod)
7475 {
7476 switch (attrs_mod) {
7477 case RGWRados::ATTRSMOD_NONE:
7478 attrs = src_attrs;
7479 break;
7480 case RGWRados::ATTRSMOD_REPLACE:
7481 if (!attrs[RGW_ATTR_ETAG].length()) {
7482 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
7483 }
7484 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
7485 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
7486 if (ttiter != src_attrs.end()) {
7487 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
7488 }
7489 }
7490 break;
7491 case RGWRados::ATTRSMOD_MERGE:
7492 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
7493 if (attrs.find(it->first) == attrs.end()) {
7494 attrs[it->first] = it->second;
7495 }
7496 }
7497 break;
7498 }
7499 }
7500
7501 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw_obj& obj)
7502 {
7503 map<string, bufferlist> attrset;
7504
7505 real_time mtime;
7506 uint64_t obj_size;
7507 RGWObjectCtx rctx(this);
7508
7509 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
7510 RGWRados::Object::Read read_op(&op_target);
7511
7512 read_op.params.attrs = &attrset;
7513 read_op.params.lastmod = &mtime;
7514 read_op.params.obj_size = &obj_size;
7515
7516 int ret = read_op.prepare();
7517 if (ret < 0)
7518 return ret;
7519
7520 attrset.erase(RGW_ATTR_ID_TAG);
7521 attrset.erase(RGW_ATTR_TAIL_TAG);
7522
7523 uint64_t max_chunk_size;
7524
7525 ret = get_max_chunk_size(dest_bucket_info.placement_rule, obj, &max_chunk_size);
7526 if (ret < 0) {
7527 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << obj.bucket << dendl;
7528 return ret;
7529 }
7530
7531 return copy_obj_data(rctx, dest_bucket_info, read_op, obj_size - 1, obj, obj,
7532 max_chunk_size, NULL, mtime, attrset,
7533 RGW_OBJ_CATEGORY_MAIN, 0, real_time(),
7534 (obj.key.instance.empty() ? NULL : &(obj.key.instance)),
7535 NULL, NULL);
7536 }
7537
7538 struct obj_time_weight {
7539 real_time mtime;
7540 uint32_t zone_short_id;
7541 uint64_t pg_ver;
7542 bool high_precision;
7543
7544 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
7545
7546 bool compare_low_precision(const obj_time_weight& rhs) {
7547 struct timespec l = ceph::real_clock::to_timespec(mtime);
7548 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
7549 l.tv_nsec = 0;
7550 r.tv_nsec = 0;
7551 if (l > r) {
7552 return false;
7553 }
7554 if (l < r) {
7555 return true;
7556 }
7557 if (zone_short_id != rhs.zone_short_id) {
7558 return (zone_short_id < rhs.zone_short_id);
7559 }
7560 return (pg_ver < rhs.pg_ver);
7561
7562 }
7563
7564 bool operator<(const obj_time_weight& rhs) {
7565 if (!high_precision || !rhs.high_precision) {
7566 return compare_low_precision(rhs);
7567 }
7568 if (mtime > rhs.mtime) {
7569 return false;
7570 }
7571 if (mtime < rhs.mtime) {
7572 return true;
7573 }
7574 if (zone_short_id != rhs.zone_short_id) {
7575 return (zone_short_id < rhs.zone_short_id);
7576 }
7577 return (pg_ver < rhs.pg_ver);
7578 }
7579
7580 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
7581 mtime = _mtime;
7582 zone_short_id = _short_id;
7583 pg_ver = _pg_ver;
7584 }
7585
7586 void init(RGWObjState *state) {
7587 mtime = state->mtime;
7588 zone_short_id = state->zone_short_id;
7589 pg_ver = state->pg_ver;
7590 }
7591 };
7592
7593 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
7594 out << o.mtime;
7595
7596 if (o.zone_short_id != 0 || o.pg_ver != 0) {
7597 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
7598 }
7599
7600 return out;
7601 }
7602
7603 class RGWGetExtraDataCB : public RGWGetDataCB {
7604 bufferlist extra_data;
7605 public:
7606 RGWGetExtraDataCB() {}
7607 int handle_data(bufferlist& bl, off_t bl_ofs, off_t bl_len) override {
7608 if (extra_data.length() < extra_data_len) {
7609 off_t max = extra_data_len - extra_data.length();
7610 if (max > bl_len) {
7611 max = bl_len;
7612 }
7613 bl.splice(0, max, &extra_data);
7614 }
7615 return bl_len;
7616 }
7617
7618 bufferlist& get_extra_data() {
7619 return extra_data;
7620 }
7621 };
7622
7623 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
7624 const rgw_user& user_id,
7625 const string& client_id,
7626 req_info *info,
7627 const string& source_zone,
7628 rgw_obj& src_obj,
7629 RGWBucketInfo& src_bucket_info,
7630 real_time *src_mtime,
7631 uint64_t *psize,
7632 const real_time *mod_ptr,
7633 const real_time *unmod_ptr,
7634 bool high_precision_time,
7635 const char *if_match,
7636 const char *if_nomatch,
7637 map<string, bufferlist> *pattrs,
7638 string *version_id,
7639 string *ptag,
7640 string *petag)
7641 {
7642 /* source is in a different zonegroup, copy from there */
7643
7644 RGWRESTStreamRWRequest *in_stream_req;
7645 string tag;
7646 map<string, bufferlist> src_attrs;
7647 append_rand_alpha(cct, tag, tag, 32);
7648 obj_time_weight set_mtime_weight;
7649 set_mtime_weight.high_precision = high_precision_time;
7650
7651 RGWRESTConn *conn;
7652 if (source_zone.empty()) {
7653 if (src_bucket_info.zonegroup.empty()) {
7654 /* source is in the master zonegroup */
7655 conn = rest_master_conn;
7656 } else {
7657 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7658 if (iter == zonegroup_conn_map.end()) {
7659 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7660 return -ENOENT;
7661 }
7662 conn = iter->second;
7663 }
7664 } else {
7665 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7666 if (iter == zone_conn_map.end()) {
7667 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7668 return -ENOENT;
7669 }
7670 conn = iter->second;
7671 }
7672
7673 RGWGetExtraDataCB cb;
7674 string etag;
7675 map<string, string> req_headers;
7676 real_time set_mtime;
7677
7678 const real_time *pmod = mod_ptr;
7679
7680 obj_time_weight dest_mtime_weight;
7681
7682 constexpr bool prepend_meta = true;
7683 constexpr bool get_op = true;
7684 constexpr bool rgwx_stat = true;
7685 constexpr bool sync_manifest = true;
7686 constexpr bool skip_decrypt = true;
7687 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7688 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7689 prepend_meta, get_op, rgwx_stat,
7690 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7691 if (ret < 0) {
7692 return ret;
7693 }
7694
7695 ret = conn->complete_request(in_stream_req, etag, &set_mtime, psize, req_headers);
7696 if (ret < 0) {
7697 return ret;
7698 }
7699
7700 bufferlist& extra_data_bl = cb.get_extra_data();
7701 if (extra_data_bl.length()) {
7702 JSONParser jp;
7703 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
7704 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7705 return -EIO;
7706 }
7707
7708 JSONDecoder::decode_json("attrs", src_attrs, &jp);
7709
7710 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
7711 }
7712
7713 if (src_mtime) {
7714 *src_mtime = set_mtime;
7715 }
7716
7717 if (petag) {
7718 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
7719 if (iter != src_attrs.end()) {
7720 bufferlist& etagbl = iter->second;
7721 *petag = etagbl.to_str();
7722 }
7723 }
7724
7725 if (pattrs) {
7726 *pattrs = src_attrs;
7727 }
7728
7729 return 0;
7730 }
7731
7732 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
7733 const rgw_user& user_id,
7734 const string& client_id,
7735 const string& op_id,
7736 bool record_op_state,
7737 req_info *info,
7738 const string& source_zone,
7739 rgw_obj& dest_obj,
7740 rgw_obj& src_obj,
7741 RGWBucketInfo& dest_bucket_info,
7742 RGWBucketInfo& src_bucket_info,
7743 real_time *src_mtime,
7744 real_time *mtime,
7745 const real_time *mod_ptr,
7746 const real_time *unmod_ptr,
7747 bool high_precision_time,
7748 const char *if_match,
7749 const char *if_nomatch,
7750 AttrsMod attrs_mod,
7751 bool copy_if_newer,
7752 map<string, bufferlist>& attrs,
7753 RGWObjCategory category,
7754 uint64_t olh_epoch,
7755 real_time delete_at,
7756 string *version_id,
7757 string *ptag,
7758 ceph::buffer::list *petag,
7759 void (*progress_cb)(off_t, void *),
7760 void *progress_data,
7761 rgw_zone_set *zones_trace)
7762 {
7763 /* source is in a different zonegroup, copy from there */
7764
7765 RGWRESTStreamRWRequest *in_stream_req;
7766 string tag;
7767 int i;
7768 append_rand_alpha(cct, tag, tag, 32);
7769 obj_time_weight set_mtime_weight;
7770 set_mtime_weight.high_precision = high_precision_time;
7771
7772 RGWPutObjProcessor_Atomic processor(obj_ctx,
7773 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
7774 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
7775 if (version_id && *version_id != "null") {
7776 processor.set_version_id(*version_id);
7777 }
7778 processor.set_olh_epoch(olh_epoch);
7779 int ret = processor.prepare(this, NULL);
7780 if (ret < 0) {
7781 return ret;
7782 }
7783
7784 RGWRESTConn *conn;
7785 if (source_zone.empty()) {
7786 if (dest_bucket_info.zonegroup.empty()) {
7787 /* source is in the master zonegroup */
7788 conn = rest_master_conn;
7789 } else {
7790 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
7791 if (iter == zonegroup_conn_map.end()) {
7792 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7793 return -ENOENT;
7794 }
7795 conn = iter->second;
7796 }
7797 } else {
7798 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
7799 if (iter == zone_conn_map.end()) {
7800 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7801 return -ENOENT;
7802 }
7803 conn = iter->second;
7804 }
7805
7806 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
7807
7808 RGWOpStateSingleOp *opstate = NULL;
7809
7810 if (record_op_state) {
7811 opstate = new RGWOpStateSingleOp(this, client_id, op_id, obj_name);
7812
7813 ret = opstate->set_state(RGWOpState::OPSTATE_IN_PROGRESS);
7814 if (ret < 0) {
7815 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7816 delete opstate;
7817 return ret;
7818 }
7819 }
7820
7821 boost::optional<RGWPutObj_Compress> compressor;
7822 CompressorRef plugin;
7823
7824 const auto& compression_type = zone_params.get_compression_type(
7825 dest_bucket_info.placement_rule);
7826 if (compression_type != "none") {
7827 plugin = Compressor::create(cct, compression_type);
7828 if (!plugin) {
7829 ldout(cct, 1) << "Cannot load plugin for compression type "
7830 << compression_type << dendl;
7831 }
7832 }
7833
7834 RGWRadosPutObj cb(cct, plugin, compressor, &processor, opstate, progress_cb, progress_data);
7835
7836 string etag;
7837 map<string, string> req_headers;
7838 real_time set_mtime;
7839
7840 RGWObjState *dest_state = NULL;
7841
7842 const real_time *pmod = mod_ptr;
7843
7844 obj_time_weight dest_mtime_weight;
7845
7846 if (copy_if_newer) {
7847 /* need to get mtime for destination */
7848 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7849 if (ret < 0)
7850 goto set_err_state;
7851
7852 if (!real_clock::is_zero(dest_state->mtime)) {
7853 dest_mtime_weight.init(dest_state);
7854 pmod = &dest_mtime_weight.mtime;
7855 }
7856 }
7857
7858 static constexpr bool prepend_meta = true;
7859 static constexpr bool get_op = true;
7860 static constexpr bool rgwx_stat = false;
7861 static constexpr bool sync_manifest = true;
7862 static constexpr bool skip_decrypt = true;
7863 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
7864 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
7865 prepend_meta, get_op, rgwx_stat,
7866 sync_manifest, skip_decrypt, &cb, &in_stream_req);
7867 if (ret < 0) {
7868 goto set_err_state;
7869 }
7870
7871 ret = conn->complete_request(in_stream_req, etag, &set_mtime, nullptr, req_headers);
7872 if (ret < 0) {
7873 goto set_err_state;
7874 }
7875 ret = cb.flush();
7876 if (ret < 0) {
7877 goto set_err_state;
7878 }
7879 if (compressor && compressor->is_compressed()) {
7880 bufferlist tmp;
7881 RGWCompressionInfo cs_info;
7882 cs_info.compression_type = plugin->get_type_name();
7883 cs_info.orig_size = cb.get_data_len();
7884 cs_info.blocks = move(compressor->get_compression_blocks());
7885 ::encode(cs_info, tmp);
7886 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
7887 }
7888
7889 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
7890 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
7891 } else {
7892 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
7893 if (iter != cb.get_attrs().end()) {
7894 try {
7895 ::decode(delete_at, iter->second);
7896 } catch (buffer::error& err) {
7897 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7898 }
7899 }
7900 }
7901
7902 if (src_mtime) {
7903 *src_mtime = set_mtime;
7904 }
7905
7906 if (petag) {
7907 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
7908 if (iter != cb.get_attrs().end()) {
7909 *petag = iter->second;
7910 }
7911 }
7912
7913 if (source_zone.empty()) {
7914 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
7915 } else {
7916 attrs = cb.get_attrs();
7917 }
7918
7919 if (copy_if_newer) {
7920 uint64_t pg_ver = 0;
7921 auto i = attrs.find(RGW_ATTR_PG_VER);
7922 if (i != attrs.end() && i->second.length() > 0) {
7923 bufferlist::iterator iter = i->second.begin();
7924 try {
7925 ::decode(pg_ver, iter);
7926 } catch (buffer::error& err) {
7927 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7928 /* non critical error */
7929 }
7930 }
7931 set_mtime_weight.init(set_mtime, get_zone_short_id(), pg_ver);
7932 }
7933
7934 #define MAX_COMPLETE_RETRY 100
7935 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
7936 ret = cb.complete(etag, mtime, set_mtime, attrs, delete_at, zones_trace);
7937 if (ret < 0) {
7938 goto set_err_state;
7939 }
7940 if (copy_if_newer && cb.is_canceled()) {
7941 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
7942 obj_ctx.obj.invalidate(dest_obj); /* object was overwritten */
7943 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
7944 if (ret < 0) {
7945 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7946 goto set_err_state;
7947 }
7948 dest_mtime_weight.init(dest_state);
7949 dest_mtime_weight.high_precision = high_precision_time;
7950 if (!dest_state->exists ||
7951 dest_mtime_weight < set_mtime_weight) {
7952 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7953 continue;
7954 } else {
7955 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7956 }
7957 }
7958 break;
7959 }
7960
7961 if (i == MAX_COMPLETE_RETRY) {
7962 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7963 ret = -EIO;
7964 goto set_err_state;
7965 }
7966
7967 if (opstate) {
7968 ret = opstate->set_state(RGWOpState::OPSTATE_COMPLETE);
7969 if (ret < 0) {
7970 ldout(cct, 0) << "ERROR: failed to set opstate ret=" << ret << dendl;
7971 }
7972 delete opstate;
7973 }
7974
7975 return 0;
7976 set_err_state:
7977 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
7978 ret = 0;
7979 }
7980 if (opstate) {
7981 RGWOpState::OpState state;
7982 if (ret < 0) {
7983 state = RGWOpState::OPSTATE_ERROR;
7984 } else {
7985 state = RGWOpState::OPSTATE_COMPLETE;
7986 }
7987 int r = opstate->set_state(state);
7988 if (r < 0) {
7989 ldout(cct, 0) << "ERROR: failed to set opstate r=" << ret << dendl;
7990 }
7991 delete opstate;
7992 }
7993 return ret;
7994 }
7995
7996
7997 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
7998 map<string, bufferlist>& src_attrs,
7999 RGWRados::Object::Read& read_op,
8000 const rgw_user& user_id,
8001 rgw_obj& dest_obj,
8002 real_time *mtime)
8003 {
8004 string etag;
8005
8006 RGWRESTStreamWriteRequest *out_stream_req;
8007
8008 int ret = rest_master_conn->put_obj_init(user_id, dest_obj, astate->size, src_attrs, &out_stream_req);
8009 if (ret < 0) {
8010 return ret;
8011 }
8012
8013 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
8014 if (ret < 0) {
8015 delete out_stream_req;
8016 return ret;
8017 }
8018
8019 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
8020 if (ret < 0)
8021 return ret;
8022
8023 return 0;
8024 }
8025
8026 /**
8027 * Copy an object.
8028 * dest_obj: the object to copy into
8029 * src_obj: the object to copy from
8030 * attrs: usage depends on attrs_mod parameter
8031 * attrs_mod: the modification mode of the attrs, may have the following values:
8032 * ATTRSMOD_NONE - the attributes of the source object will be
8033 * copied without modifications, attrs parameter is ignored;
8034 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
8035 * parameter, source object attributes are not copied;
8036 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
8037 * are overwritten by values contained in attrs parameter.
8038 * err: stores any errors resulting from the get of the original object
8039 * Returns: 0 on success, -ERR# otherwise.
8040 */
8041 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
8042 const rgw_user& user_id,
8043 const string& client_id,
8044 const string& op_id,
8045 req_info *info,
8046 const string& source_zone,
8047 rgw_obj& dest_obj,
8048 rgw_obj& src_obj,
8049 RGWBucketInfo& dest_bucket_info,
8050 RGWBucketInfo& src_bucket_info,
8051 real_time *src_mtime,
8052 real_time *mtime,
8053 const real_time *mod_ptr,
8054 const real_time *unmod_ptr,
8055 bool high_precision_time,
8056 const char *if_match,
8057 const char *if_nomatch,
8058 AttrsMod attrs_mod,
8059 bool copy_if_newer,
8060 map<string, bufferlist>& attrs,
8061 RGWObjCategory category,
8062 uint64_t olh_epoch,
8063 real_time delete_at,
8064 string *version_id,
8065 string *ptag,
8066 ceph::buffer::list *petag,
8067 void (*progress_cb)(off_t, void *),
8068 void *progress_data)
8069 {
8070 int ret;
8071 uint64_t obj_size;
8072 rgw_obj shadow_obj = dest_obj;
8073 string shadow_oid;
8074
8075 bool remote_src;
8076 bool remote_dest;
8077
8078 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
8079 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
8080
8081 remote_dest = !get_zonegroup().equals(dest_bucket_info.zonegroup);
8082 remote_src = !get_zonegroup().equals(src_bucket_info.zonegroup);
8083
8084 if (remote_src && remote_dest) {
8085 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
8086 return -EINVAL;
8087 }
8088
8089 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
8090
8091 if (remote_src || !source_zone.empty()) {
8092 return fetch_remote_obj(obj_ctx, user_id, client_id, op_id, true, info, source_zone,
8093 dest_obj, src_obj, dest_bucket_info, src_bucket_info, src_mtime, mtime, mod_ptr,
8094 unmod_ptr, high_precision_time,
8095 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
8096 olh_epoch, delete_at, version_id, ptag, petag, progress_cb, progress_data);
8097 }
8098
8099 map<string, bufferlist> src_attrs;
8100 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
8101 RGWRados::Object::Read read_op(&src_op_target);
8102
8103 read_op.conds.mod_ptr = mod_ptr;
8104 read_op.conds.unmod_ptr = unmod_ptr;
8105 read_op.conds.high_precision_time = high_precision_time;
8106 read_op.conds.if_match = if_match;
8107 read_op.conds.if_nomatch = if_nomatch;
8108 read_op.params.attrs = &src_attrs;
8109 read_op.params.lastmod = src_mtime;
8110 read_op.params.obj_size = &obj_size;
8111
8112 ret = read_op.prepare();
8113 if (ret < 0) {
8114 return ret;
8115 }
8116 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
8117 // Current implementation does not follow S3 spec and even
8118 // may result in data corruption silently when copying
8119 // multipart objects acorss pools. So reject COPY operations
8120 //on encrypted objects before it is fully functional.
8121 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
8122 << " has not been implemented." << dendl;
8123 return -ERR_NOT_IMPLEMENTED;
8124 }
8125
8126 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
8127 src_attrs.erase(RGW_ATTR_DELETE_AT);
8128
8129 set_copy_attrs(src_attrs, attrs, attrs_mod);
8130 attrs.erase(RGW_ATTR_ID_TAG);
8131 attrs.erase(RGW_ATTR_PG_VER);
8132 attrs.erase(RGW_ATTR_SOURCE_ZONE);
8133 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
8134 if (cmp != src_attrs.end())
8135 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
8136
8137 RGWObjManifest manifest;
8138 RGWObjState *astate = NULL;
8139
8140 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
8141 if (ret < 0) {
8142 return ret;
8143 }
8144
8145 vector<rgw_raw_obj> ref_objs;
8146
8147 if (remote_dest) {
8148 /* dest is in a different zonegroup, copy it there */
8149 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
8150 }
8151 uint64_t max_chunk_size;
8152
8153 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
8154 if (ret < 0) {
8155 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
8156 return ret;
8157 }
8158
8159 rgw_pool src_pool;
8160 rgw_pool dest_pool;
8161 if (!get_obj_data_pool(src_bucket_info.placement_rule, src_obj, &src_pool)) {
8162 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
8163 return -EIO;
8164 }
8165 if (!get_obj_data_pool(dest_bucket_info.placement_rule, dest_obj, &dest_pool)) {
8166 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
8167 return -EIO;
8168 }
8169
8170
8171 bool copy_data = !astate->has_manifest || (src_pool != dest_pool);
8172 bool copy_first = false;
8173 if (astate->has_manifest) {
8174 if (!astate->manifest.has_tail()) {
8175 copy_data = true;
8176 } else {
8177 uint64_t head_size = astate->manifest.get_head_size();
8178
8179 if (head_size > 0) {
8180 if (head_size > max_chunk_size) {
8181 copy_data = true;
8182 } else {
8183 copy_first = true;
8184 }
8185 }
8186 }
8187 }
8188
8189 if (petag) {
8190 const auto iter = attrs.find(RGW_ATTR_ETAG);
8191 if (iter != attrs.end()) {
8192 *petag = iter->second;
8193 }
8194 }
8195
8196 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
8197 return copy_obj_data(obj_ctx, dest_bucket_info, read_op, obj_size - 1, dest_obj, src_obj,
8198 max_chunk_size, mtime, real_time(), attrs, category, olh_epoch, delete_at,
8199 version_id, ptag, petag);
8200 }
8201
8202 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
8203
8204 if (copy_first) { // we need to copy first chunk, not increase refcount
8205 ++miter;
8206 }
8207
8208 rgw_rados_ref ref;
8209 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
8210 if (ret < 0) {
8211 return ret;
8212 }
8213
8214 bool versioned_dest = dest_bucket_info.versioning_enabled();
8215
8216 if (version_id && !version_id->empty()) {
8217 versioned_dest = true;
8218 dest_obj.key.set_instance(*version_id);
8219 } else if (versioned_dest) {
8220 gen_rand_obj_instance_name(&dest_obj);
8221 }
8222
8223 bufferlist first_chunk;
8224
8225 bool copy_itself = (dest_obj == src_obj);
8226 RGWObjManifest *pmanifest;
8227 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
8228
8229 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
8230 RGWRados::Object::Write write_op(&dest_op_target);
8231
8232 string tag;
8233
8234 if (ptag) {
8235 tag = *ptag;
8236 }
8237
8238 if (tag.empty()) {
8239 append_rand_alpha(cct, tag, tag, 32);
8240 }
8241
8242 if (!copy_itself) {
8243 attrs.erase(RGW_ATTR_TAIL_TAG);
8244 manifest = astate->manifest;
8245 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
8246 if (tail_placement.bucket.name.empty()) {
8247 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
8248 }
8249 string ref_tag;
8250 for (; miter != astate->manifest.obj_end(); ++miter) {
8251 ObjectWriteOperation op;
8252 ref_tag = tag + '\0';
8253 cls_refcount_get(op, ref_tag, true);
8254 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
8255 ref.ioctx.locator_set_key(loc.loc);
8256
8257 ret = ref.ioctx.operate(loc.oid, &op);
8258 if (ret < 0) {
8259 goto done_ret;
8260 }
8261
8262 ref_objs.push_back(loc);
8263 }
8264
8265 pmanifest = &manifest;
8266 } else {
8267 pmanifest = &astate->manifest;
8268 /* don't send the object's tail for garbage collection */
8269 astate->keep_tail = true;
8270 }
8271
8272 if (copy_first) {
8273 ret = read_op.read(0, max_chunk_size, first_chunk);
8274 if (ret < 0) {
8275 goto done_ret;
8276 }
8277
8278 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
8279 } else {
8280 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
8281 }
8282
8283 write_op.meta.data = &first_chunk;
8284 write_op.meta.manifest = pmanifest;
8285 write_op.meta.ptag = &tag;
8286 write_op.meta.owner = dest_bucket_info.owner;
8287 write_op.meta.mtime = mtime;
8288 write_op.meta.flags = PUT_OBJ_CREATE;
8289 write_op.meta.category = category;
8290 write_op.meta.olh_epoch = olh_epoch;
8291 write_op.meta.delete_at = delete_at;
8292 write_op.meta.modify_tail = !copy_itself;
8293
8294 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
8295 if (ret < 0) {
8296 goto done_ret;
8297 }
8298
8299 return 0;
8300
8301 done_ret:
8302 if (!copy_itself) {
8303 vector<rgw_raw_obj>::iterator riter;
8304
8305 /* rollback reference */
8306 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
8307 ObjectWriteOperation op;
8308 cls_refcount_put(op, tag, true);
8309
8310 ref.ioctx.locator_set_key(riter->loc);
8311
8312 int r = ref.ioctx.operate(riter->oid, &op);
8313 if (r < 0) {
8314 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
8315 }
8316 }
8317 }
8318 return ret;
8319 }
8320
8321
8322 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
8323 RGWBucketInfo& dest_bucket_info,
8324 RGWRados::Object::Read& read_op, off_t end,
8325 rgw_obj& dest_obj,
8326 rgw_obj& src_obj,
8327 uint64_t max_chunk_size,
8328 real_time *mtime,
8329 real_time set_mtime,
8330 map<string, bufferlist>& attrs,
8331 RGWObjCategory category,
8332 uint64_t olh_epoch,
8333 real_time delete_at,
8334 string *version_id,
8335 string *ptag,
8336 ceph::buffer::list *petag)
8337 {
8338 bufferlist first_chunk;
8339 RGWObjManifest manifest;
8340
8341 string tag;
8342 append_rand_alpha(cct, tag, tag, 32);
8343
8344 RGWPutObjProcessor_Atomic processor(obj_ctx,
8345 dest_bucket_info, dest_obj.bucket, dest_obj.key.name,
8346 cct->_conf->rgw_obj_stripe_size, tag, dest_bucket_info.versioning_enabled());
8347 if (version_id) {
8348 processor.set_version_id(*version_id);
8349 }
8350 processor.set_olh_epoch(olh_epoch);
8351 int ret = processor.prepare(this, NULL);
8352 if (ret < 0)
8353 return ret;
8354
8355 off_t ofs = 0;
8356
8357 do {
8358 bufferlist bl;
8359 ret = read_op.read(ofs, end, bl);
8360
8361 uint64_t read_len = ret;
8362 bool again;
8363
8364 do {
8365 void *handle;
8366 rgw_raw_obj obj;
8367
8368 ret = processor.handle_data(bl, ofs, &handle, &obj, &again);
8369 if (ret < 0) {
8370 return ret;
8371 }
8372 ret = processor.throttle_data(handle, obj, read_len, false);
8373 if (ret < 0)
8374 return ret;
8375 } while (again);
8376
8377 ofs += read_len;
8378 } while (ofs <= end);
8379
8380 string etag;
8381 auto iter = attrs.find(RGW_ATTR_ETAG);
8382 if (iter != attrs.end()) {
8383 bufferlist& bl = iter->second;
8384 etag = string(bl.c_str(), bl.length());
8385 if (petag) {
8386 *petag = bl;
8387 }
8388 }
8389
8390 uint64_t accounted_size;
8391 {
8392 bool compressed{false};
8393 RGWCompressionInfo cs_info;
8394 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
8395 if (ret < 0) {
8396 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
8397 return ret;
8398 }
8399 // pass original size if compressed
8400 accounted_size = compressed ? cs_info.orig_size : ofs;
8401 }
8402
8403 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at);
8404 }
8405
8406 bool RGWRados::is_meta_master()
8407 {
8408 if (!get_zonegroup().is_master_zonegroup()) {
8409 return false;
8410 }
8411
8412 return (get_zonegroup().master_zone == zone_public_config.id);
8413 }
8414
8415 /**
8416 * Check to see if the bucket metadata could be synced
8417 * bucket: the bucket to check
8418 * Returns false is the bucket is not synced
8419 */
8420 bool RGWRados::is_syncing_bucket_meta(const rgw_bucket& bucket)
8421 {
8422
8423 /* no current period */
8424 if (current_period.get_id().empty()) {
8425 return false;
8426 }
8427
8428 /* zonegroup is not master zonegroup */
8429 if (!get_zonegroup().is_master_zonegroup()) {
8430 return false;
8431 }
8432
8433 /* single zonegroup and a single zone */
8434 if (current_period.is_single_zonegroup() && get_zonegroup().zones.size() == 1) {
8435 return false;
8436 }
8437
8438 /* zone is not master */
8439 if (get_zonegroup().master_zone.compare(zone_public_config.id) != 0) {
8440 return false;
8441 }
8442
8443 return true;
8444 }
8445
8446 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
8447 {
8448 std::map<string, rgw_bucket_dir_entry> ent_map;
8449 rgw_obj_index_key marker;
8450 string prefix;
8451 bool is_truncated;
8452
8453 do {
8454 #define NUM_ENTRIES 1000
8455 int r = cls_bucket_list(bucket_info, RGW_NO_SHARD, marker, prefix, NUM_ENTRIES, true, ent_map,
8456 &is_truncated, &marker);
8457 if (r < 0)
8458 return r;
8459
8460 string ns;
8461 std::map<string, rgw_bucket_dir_entry>::iterator eiter;
8462 for (eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
8463 rgw_obj_key obj;
8464
8465 if (rgw_obj_key::oid_to_key_in_ns(eiter->second.key.name, &obj, ns))
8466 return -ENOTEMPTY;
8467 }
8468 } while (is_truncated);
8469 return 0;
8470 }
8471
8472 /**
8473 * Delete a bucket.
8474 * bucket: the name of the bucket to delete
8475 * Returns 0 on success, -ERR# otherwise.
8476 */
8477 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
8478 {
8479 const rgw_bucket& bucket = bucket_info.bucket;
8480 librados::IoCtx index_ctx;
8481 map<int, string> bucket_objs;
8482 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8483 if (r < 0)
8484 return r;
8485
8486 if (check_empty) {
8487 r = check_bucket_empty(bucket_info);
8488 if (r < 0) {
8489 return r;
8490 }
8491 }
8492
8493 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
8494 if (r < 0)
8495 return r;
8496
8497 /* if the bucket is not synced we can remove the meta file */
8498 if (!is_syncing_bucket_meta(bucket)) {
8499 RGWObjVersionTracker objv_tracker;
8500 string entry = bucket.get_key();
8501 r= rgw_bucket_instance_remove_entry(this, entry, &objv_tracker);
8502 if (r < 0) {
8503 return r;
8504 }
8505 /* remove bucket index objects*/
8506 map<int, string>::const_iterator biter;
8507 for (biter = bucket_objs.begin(); biter != bucket_objs.end(); ++biter) {
8508 index_ctx.remove(biter->second);
8509 }
8510 }
8511 return 0;
8512 }
8513
8514 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
8515 {
8516 RGWBucketInfo info;
8517 map<string, bufferlist> attrs;
8518 RGWObjectCtx obj_ctx(this);
8519 int r;
8520 if (bucket.bucket_id.empty()) {
8521 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8522 } else {
8523 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
8524 }
8525 if (r < 0) {
8526 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8527 return r;
8528 }
8529
8530 info.owner = owner.get_id();
8531
8532 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8533 if (r < 0) {
8534 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
8535 return r;
8536 }
8537
8538 return 0;
8539 }
8540
8541
8542 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
8543 {
8544 int ret = 0;
8545
8546 vector<rgw_bucket>::iterator iter;
8547
8548 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
8549 rgw_bucket& bucket = *iter;
8550 if (enabled)
8551 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
8552 else
8553 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
8554
8555 RGWBucketInfo info;
8556 map<string, bufferlist> attrs;
8557 RGWObjectCtx obj_ctx(this);
8558 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
8559 if (r < 0) {
8560 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8561 ret = r;
8562 continue;
8563 }
8564 if (enabled) {
8565 info.flags &= ~BUCKET_SUSPENDED;
8566 } else {
8567 info.flags |= BUCKET_SUSPENDED;
8568 }
8569
8570 r = put_bucket_instance_info(info, false, real_time(), &attrs);
8571 if (r < 0) {
8572 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
8573 ret = r;
8574 continue;
8575 }
8576 }
8577 return ret;
8578 }
8579
8580 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
8581 {
8582 RGWBucketInfo bucket_info;
8583 RGWObjectCtx obj_ctx(this);
8584 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
8585 if (ret < 0) {
8586 return ret;
8587 }
8588
8589 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
8590 return 0;
8591 }
8592
8593 int RGWRados::Object::complete_atomic_modification()
8594 {
8595 if (!state->has_manifest || state->keep_tail)
8596 return 0;
8597
8598 cls_rgw_obj_chain chain;
8599 store->update_gc_chain(obj, state->manifest, &chain);
8600
8601 if (chain.empty()) {
8602 return 0;
8603 }
8604
8605 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
8606 return store->gc->send_chain(chain, tag, false); // do it async
8607 }
8608
8609 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
8610 {
8611 RGWObjManifest::obj_iterator iter;
8612 rgw_raw_obj raw_head;
8613 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
8614 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
8615 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
8616 if (mobj == raw_head)
8617 continue;
8618 cls_rgw_obj_key key(mobj.oid);
8619 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
8620 }
8621 }
8622
8623 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
8624 {
8625 return gc->send_chain(chain, tag, sync);
8626 }
8627
8628 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx, string& bucket_oid)
8629 {
8630 const rgw_bucket& bucket = bucket_info.bucket;
8631 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8632 if (r < 0)
8633 return r;
8634
8635 if (bucket.bucket_id.empty()) {
8636 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
8637 return -EIO;
8638 }
8639
8640 bucket_oid = dir_oid_prefix;
8641 bucket_oid.append(bucket.bucket_id);
8642
8643 return 0;
8644 }
8645
8646 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8647 string& bucket_oid_base) {
8648 const rgw_bucket& bucket = bucket_info.bucket;
8649 int r = open_bucket_index_ctx(bucket_info, index_ctx);
8650 if (r < 0)
8651 return r;
8652
8653 if (bucket.bucket_id.empty()) {
8654 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
8655 return -EIO;
8656 }
8657
8658 bucket_oid_base = dir_oid_prefix;
8659 bucket_oid_base.append(bucket.bucket_id);
8660
8661 return 0;
8662
8663 }
8664
8665 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8666 map<int, string>& bucket_objs, int shard_id, map<int, string> *bucket_instance_ids) {
8667 string bucket_oid_base;
8668 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8669 if (ret < 0) {
8670 return ret;
8671 }
8672
8673 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
8674 if (bucket_instance_ids) {
8675 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
8676 }
8677 return 0;
8678 }
8679
8680 template<typename T>
8681 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8682 map<int, string>& oids, map<int, T>& bucket_objs,
8683 int shard_id, map<int, string> *bucket_instance_ids)
8684 {
8685 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
8686 if (ret < 0)
8687 return ret;
8688
8689 map<int, string>::const_iterator iter = oids.begin();
8690 for (; iter != oids.end(); ++iter) {
8691 bucket_objs[iter->first] = T();
8692 }
8693 return 0;
8694 }
8695
8696 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8697 const string& obj_key, string *bucket_obj, int *shard_id)
8698 {
8699 string bucket_oid_base;
8700 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8701 if (ret < 0)
8702 return ret;
8703
8704 RGWObjectCtx obj_ctx(this);
8705
8706 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
8707 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
8708 if (ret < 0) {
8709 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
8710 return ret;
8711 }
8712 return 0;
8713 }
8714
8715 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
8716 int shard_id, string *bucket_obj)
8717 {
8718 string bucket_oid_base;
8719 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
8720 if (ret < 0)
8721 return ret;
8722
8723 RGWObjectCtx obj_ctx(this);
8724
8725 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
8726 shard_id, bucket_obj);
8727 return 0;
8728 }
8729
8730 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
8731 map<RGWObjCategory, RGWStorageStats>& stats)
8732 {
8733 for (const auto& pair : header.stats) {
8734 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
8735 const rgw_bucket_category_stats& header_stats = pair.second;
8736
8737 RGWStorageStats& s = stats[category];
8738
8739 s.category = category;
8740 s.size += header_stats.total_size;
8741 s.size_rounded += header_stats.total_size_rounded;
8742 s.size_utilized += header_stats.actual_size;
8743 s.num_objects += header_stats.num_entries;
8744 }
8745 }
8746
8747 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
8748 map<RGWObjCategory, RGWStorageStats> *existing_stats,
8749 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
8750 {
8751 librados::IoCtx index_ctx;
8752 // key - bucket index object id
8753 // value - bucket index check OP returned result with the given bucket index object (shard)
8754 map<int, string> oids;
8755 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
8756
8757 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
8758 if (ret < 0) {
8759 return ret;
8760 }
8761
8762 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
8763 if (ret < 0) {
8764 return ret;
8765 }
8766
8767 // Aggregate results (from different shards if there is any)
8768 map<int, struct rgw_cls_check_index_ret>::iterator iter;
8769 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
8770 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
8771 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
8772 }
8773
8774 return 0;
8775 }
8776
8777 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
8778 {
8779 librados::IoCtx index_ctx;
8780 map<int, string> bucket_objs;
8781
8782 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8783 if (r < 0) {
8784 return r;
8785 }
8786
8787 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8788 }
8789
8790 int RGWRados::bucket_set_reshard(RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
8791 {
8792 librados::IoCtx index_ctx;
8793 map<int, string> bucket_objs;
8794
8795 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
8796 if (r < 0) {
8797 return r;
8798 }
8799
8800 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
8801 }
8802
8803 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
8804 {
8805 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
8806 std::string oid, key;
8807 get_obj_bucket_and_oid_loc(obj, oid, key);
8808 if (!rctx)
8809 return 0;
8810
8811 RGWObjState *state = NULL;
8812
8813 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
8814 if (r < 0)
8815 return r;
8816
8817 if (!state->is_atomic) {
8818 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
8819 return -EINVAL;
8820 }
8821
8822 string tag;
8823
8824 if (state->tail_tag.length() > 0) {
8825 tag = state->tail_tag.c_str();
8826 } else if (state->obj_tag.length() > 0) {
8827 tag = state->obj_tag.c_str();
8828 } else {
8829 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
8830 return -EINVAL;
8831 }
8832
8833 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
8834
8835 return gc->defer_chain(tag, false);
8836 }
8837
8838 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
8839 {
8840 list<string> prefixes;
8841 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
8842 cls_rgw_remove_obj(op, prefixes);
8843 }
8844
8845 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
8846 {
8847 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
8848 }
8849
8850 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
8851 {
8852 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
8853 }
8854
8855
8856 /**
8857 * Delete an object.
8858 * bucket: name of the bucket storing the object
8859 * obj: name of the object to delete
8860 * Returns: 0 on success, -ERR# otherwise.
8861 */
8862 int RGWRados::Object::Delete::delete_obj()
8863 {
8864 RGWRados *store = target->get_store();
8865 rgw_obj& src_obj = target->get_obj();
8866 const string& instance = src_obj.key.instance;
8867 rgw_obj obj = src_obj;
8868
8869 if (instance == "null") {
8870 obj.key.instance.clear();
8871 }
8872
8873 bool explicit_marker_version = (!params.marker_version_id.empty());
8874
8875 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
8876 if (instance.empty() || explicit_marker_version) {
8877 rgw_obj marker = obj;
8878
8879 if (!params.marker_version_id.empty()) {
8880 if (params.marker_version_id != "null") {
8881 marker.key.set_instance(params.marker_version_id);
8882 }
8883 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
8884 store->gen_rand_obj_instance_name(&marker);
8885 }
8886
8887 result.version_id = marker.key.instance;
8888 result.delete_marker = true;
8889
8890 struct rgw_bucket_dir_entry_meta meta;
8891
8892 meta.owner = params.obj_owner.get_id().to_str();
8893 meta.owner_display_name = params.obj_owner.get_display_name();
8894
8895 if (real_clock::is_zero(params.mtime)) {
8896 meta.mtime = real_clock::now();
8897 } else {
8898 meta.mtime = params.mtime;
8899 }
8900
8901 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
8902 if (r < 0) {
8903 return r;
8904 }
8905 } else {
8906 rgw_bucket_dir_entry dirent;
8907
8908 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
8909 if (r < 0) {
8910 return r;
8911 }
8912 result.delete_marker = dirent.is_delete_marker();
8913 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
8914 if (r < 0) {
8915 return r;
8916 }
8917 result.version_id = instance;
8918 }
8919
8920 BucketShard *bs;
8921 int r = target->get_bucket_shard(&bs);
8922 if (r < 0) {
8923 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
8924 return r;
8925 }
8926
8927 if (target->bucket_info.datasync_flag_enabled()) {
8928 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
8929 if (r < 0) {
8930 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
8931 return r;
8932 }
8933 }
8934
8935 return 0;
8936 }
8937
8938 rgw_rados_ref ref;
8939 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
8940 if (r < 0) {
8941 return r;
8942 }
8943
8944 RGWObjState *state;
8945 r = target->get_state(&state, false);
8946 if (r < 0)
8947 return r;
8948
8949 ObjectWriteOperation op;
8950
8951 if (!real_clock::is_zero(params.unmod_since)) {
8952 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
8953 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
8954 if (!params.high_precision_time) {
8955 ctime.tv_nsec = 0;
8956 unmod.tv_nsec = 0;
8957 }
8958
8959 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
8960 if (ctime > unmod) {
8961 return -ERR_PRECONDITION_FAILED;
8962 }
8963
8964 /* only delete object if mtime is less than or equal to params.unmod_since */
8965 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
8966 }
8967 uint64_t obj_size = state->size;
8968
8969 if (!real_clock::is_zero(params.expiration_time)) {
8970 bufferlist bl;
8971 real_time delete_at;
8972
8973 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
8974 try {
8975 bufferlist::iterator iter = bl.begin();
8976 ::decode(delete_at, iter);
8977 } catch (buffer::error& err) {
8978 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
8979 return -EIO;
8980 }
8981
8982 if (params.expiration_time != delete_at) {
8983 return -ERR_PRECONDITION_FAILED;
8984 }
8985 } else {
8986 return -ERR_PRECONDITION_FAILED;
8987 }
8988 }
8989
8990 if (!state->exists) {
8991 target->invalidate_state();
8992 return -ENOENT;
8993 }
8994
8995 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
8996 if (r < 0)
8997 return r;
8998
8999 RGWBucketInfo& bucket_info = target->get_bucket_info();
9000
9001 RGWRados::Bucket bop(store, bucket_info);
9002 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9003
9004 index_op.set_zones_trace(params.zones_trace);
9005 index_op.set_bilog_flags(params.bilog_flags);
9006
9007 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
9008 if (r < 0)
9009 return r;
9010
9011 store->remove_rgw_head_obj(op);
9012 r = ref.ioctx.operate(ref.oid, &op);
9013
9014 /* raced with another operation, object state is indeterminate */
9015 const bool need_invalidate = (r == -ECANCELED);
9016
9017 int64_t poolid = ref.ioctx.get_id();
9018 if (r >= 0) {
9019 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
9020 if (obj_tombstone_cache) {
9021 tombstone_entry entry{*state};
9022 obj_tombstone_cache->add(obj, entry);
9023 }
9024 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
9025
9026 int ret = target->complete_atomic_modification();
9027 if (ret < 0) {
9028 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
9029 }
9030 /* other than that, no need to propagate error */
9031 } else {
9032 int ret = index_op.cancel();
9033 if (ret < 0) {
9034 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
9035 }
9036 }
9037
9038 if (need_invalidate) {
9039 target->invalidate_state();
9040 }
9041
9042 if (r < 0)
9043 return r;
9044
9045 /* update quota cache */
9046 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_size);
9047
9048 return 0;
9049 }
9050
9051 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
9052 const RGWBucketInfo& bucket_info,
9053 const rgw_obj& obj,
9054 int versioning_status,
9055 uint16_t bilog_flags,
9056 const real_time& expiration_time,
9057 rgw_zone_set *zones_trace)
9058 {
9059 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
9060 RGWRados::Object::Delete del_op(&del_target);
9061
9062 del_op.params.bucket_owner = bucket_info.owner;
9063 del_op.params.versioning_status = versioning_status;
9064 del_op.params.bilog_flags = bilog_flags;
9065 del_op.params.expiration_time = expiration_time;
9066 del_op.params.zones_trace = zones_trace;
9067
9068 return del_op.delete_obj();
9069 }
9070
9071 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
9072 {
9073 rgw_rados_ref ref;
9074 int r = get_raw_obj_ref(obj, &ref);
9075 if (r < 0) {
9076 return r;
9077 }
9078
9079 ObjectWriteOperation op;
9080
9081 op.remove();
9082 r = ref.ioctx.operate(ref.oid, &op);
9083 if (r < 0)
9084 return r;
9085
9086 return 0;
9087 }
9088
9089 int RGWRados::delete_system_obj(rgw_raw_obj& obj, RGWObjVersionTracker *objv_tracker)
9090 {
9091 if (obj.empty()) {
9092 ldout(cct, 1) << "delete_system_obj got empty object name "
9093 << obj << ", returning EINVAL" << dendl;
9094 return -EINVAL;
9095 }
9096 rgw_rados_ref ref;
9097 int r = get_raw_obj_ref(obj, &ref);
9098 if (r < 0) {
9099 return r;
9100 }
9101
9102 ObjectWriteOperation op;
9103
9104 if (objv_tracker) {
9105 objv_tracker->prepare_op_for_write(&op);
9106 }
9107
9108 op.remove();
9109 r = ref.ioctx.operate(ref.oid, &op);
9110 if (r < 0)
9111 return r;
9112
9113 return 0;
9114 }
9115
9116 int RGWRados::delete_obj_index(const rgw_obj& obj)
9117 {
9118 std::string oid, key;
9119 get_obj_bucket_and_oid_loc(obj, oid, key);
9120
9121 RGWObjectCtx obj_ctx(this);
9122
9123 RGWBucketInfo bucket_info;
9124 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
9125 if (ret < 0) {
9126 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
9127 return ret;
9128 }
9129
9130 RGWRados::Bucket bop(this, bucket_info);
9131 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9132
9133 real_time removed_mtime;
9134 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
9135
9136 return r;
9137 }
9138
9139 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
9140 {
9141 string tag;
9142
9143 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
9144 if (mi != manifest.obj_end()) {
9145 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
9146 ++mi;
9147 tag = mi.get_location().get_raw_obj(store).oid;
9148 tag.append("_");
9149 }
9150
9151 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
9152 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
9153 MD5 hash;
9154 hash.Update((const byte *)manifest_bl.c_str(), manifest_bl.length());
9155
9156 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
9157 if (iter != attrset.end()) {
9158 bufferlist& bl = iter->second;
9159 hash.Update((const byte *)bl.c_str(), bl.length());
9160 }
9161
9162 hash.Final(md5);
9163 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
9164 tag.append(md5_str);
9165
9166 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
9167
9168 tag_bl.append(tag.c_str(), tag.size() + 1);
9169 }
9170
9171 static bool is_olh(map<string, bufferlist>& attrs)
9172 {
9173 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
9174 return (iter != attrs.end());
9175 }
9176
9177 static bool has_olh_tag(map<string, bufferlist>& attrs)
9178 {
9179 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
9180 return (iter != attrs.end());
9181 }
9182
9183 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9184 RGWObjState *olh_state, RGWObjState **target_state)
9185 {
9186 assert(olh_state->is_olh);
9187
9188 rgw_obj target;
9189 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
9190 if (r < 0) {
9191 return r;
9192 }
9193 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
9194 if (r < 0) {
9195 return r;
9196 }
9197
9198 return 0;
9199 }
9200
9201 int RGWRados::get_system_obj_state_impl(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9202 {
9203 if (obj.empty()) {
9204 return -EINVAL;
9205 }
9206
9207 RGWRawObjState *s = rctx->raw.get_state(obj);
9208 ldout(cct, 20) << "get_system_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9209 *state = s;
9210 if (s->has_attrs) {
9211 return 0;
9212 }
9213
9214 s->obj = obj;
9215
9216 int r = raw_obj_stat(obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), objv_tracker);
9217 if (r == -ENOENT) {
9218 s->exists = false;
9219 s->has_attrs = true;
9220 s->mtime = real_time();
9221 return 0;
9222 }
9223 if (r < 0)
9224 return r;
9225
9226 s->exists = true;
9227 s->has_attrs = true;
9228 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9229
9230 if (s->obj_tag.length())
9231 ldout(cct, 20) << "get_system_obj_state: setting s->obj_tag to "
9232 << s->obj_tag.c_str() << dendl;
9233 else
9234 ldout(cct, 20) << "get_system_obj_state: s->obj_tag was set empty" << dendl;
9235
9236 return 0;
9237 }
9238
9239 int RGWRados::get_system_obj_state(RGWObjectCtx *rctx, rgw_raw_obj& obj, RGWRawObjState **state, RGWObjVersionTracker *objv_tracker)
9240 {
9241 int ret;
9242
9243 do {
9244 ret = get_system_obj_state_impl(rctx, obj, state, objv_tracker);
9245 } while (ret == -EAGAIN);
9246
9247 return ret;
9248 }
9249
9250 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9251 RGWObjState **state, bool follow_olh, bool assume_noent)
9252 {
9253 if (obj.empty()) {
9254 return -EINVAL;
9255 }
9256
9257 bool need_follow_olh = follow_olh && obj.key.instance.empty();
9258
9259 RGWObjState *s = rctx->obj.get_state(obj);
9260 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
9261 *state = s;
9262 if (s->has_attrs) {
9263 if (s->is_olh && need_follow_olh) {
9264 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9265 }
9266 return 0;
9267 }
9268
9269 s->obj = obj;
9270
9271 rgw_raw_obj raw_obj;
9272 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
9273
9274 int r = -ENOENT;
9275
9276 if (!assume_noent) {
9277 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
9278 }
9279
9280 if (r == -ENOENT) {
9281 s->exists = false;
9282 s->has_attrs = true;
9283 tombstone_entry entry;
9284 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
9285 s->mtime = entry.mtime;
9286 s->zone_short_id = entry.zone_short_id;
9287 s->pg_ver = entry.pg_ver;
9288 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
9289 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
9290 } else {
9291 s->mtime = real_time();
9292 }
9293 return 0;
9294 }
9295 if (r < 0)
9296 return r;
9297
9298 s->exists = true;
9299 s->has_attrs = true;
9300 s->accounted_size = s->size;
9301
9302 auto iter = s->attrset.find(RGW_ATTR_COMPRESSION);
9303 const bool compressed = (iter != s->attrset.end());
9304 if (compressed) {
9305 // use uncompressed size for accounted_size
9306 try {
9307 RGWCompressionInfo info;
9308 auto p = iter->second.begin();
9309 ::decode(info, p);
9310 s->accounted_size = info.orig_size;
9311 } catch (buffer::error&) {
9312 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
9313 return -EIO;
9314 }
9315 }
9316
9317 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
9318 if (iter != s->attrset.end()) {
9319 bufferlist bl = iter->second;
9320 bufferlist::iterator it = bl.begin();
9321 it.copy(bl.length(), s->shadow_obj);
9322 s->shadow_obj[bl.length()] = '\0';
9323 }
9324 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
9325 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
9326 if (ttiter != s->attrset.end()) {
9327 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
9328 }
9329
9330 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
9331 if (manifest_bl.length()) {
9332 bufferlist::iterator miter = manifest_bl.begin();
9333 try {
9334 ::decode(s->manifest, miter);
9335 s->has_manifest = true;
9336 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
9337 broken due to old bugs */
9338 s->size = s->manifest.get_obj_size();
9339 if (!compressed)
9340 s->accounted_size = s->size;
9341 } catch (buffer::error& err) {
9342 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
9343 return -EIO;
9344 }
9345 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
9346 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20) && s->manifest.has_explicit_objs()) {
9347 RGWObjManifest::obj_iterator mi;
9348 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
9349 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
9350 }
9351 }
9352
9353 if (!s->obj_tag.length()) {
9354 /*
9355 * Uh oh, something's wrong, object with manifest should have tag. Let's
9356 * create one out of the manifest, would be unique
9357 */
9358 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
9359 s->fake_tag = true;
9360 }
9361 }
9362 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
9363 if (aiter != s->attrset.end()) {
9364 bufferlist& pg_ver_bl = aiter->second;
9365 if (pg_ver_bl.length()) {
9366 bufferlist::iterator pgbl = pg_ver_bl.begin();
9367 try {
9368 ::decode(s->pg_ver, pgbl);
9369 } catch (buffer::error& err) {
9370 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9371 }
9372 }
9373 }
9374 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
9375 if (aiter != s->attrset.end()) {
9376 bufferlist& zone_short_id_bl = aiter->second;
9377 if (zone_short_id_bl.length()) {
9378 bufferlist::iterator zbl = zone_short_id_bl.begin();
9379 try {
9380 ::decode(s->zone_short_id, zbl);
9381 } catch (buffer::error& err) {
9382 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
9383 }
9384 }
9385 }
9386 if (s->obj_tag.length())
9387 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
9388 else
9389 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
9390
9391 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
9392 * it exist, and not only if is_olh() returns true
9393 */
9394 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
9395 if (iter != s->attrset.end()) {
9396 s->olh_tag = iter->second;
9397 }
9398
9399 if (is_olh(s->attrset)) {
9400 s->is_olh = true;
9401
9402 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
9403
9404 if (need_follow_olh) {
9405 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
9406 }
9407 }
9408
9409 return 0;
9410 }
9411
9412 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9413 bool follow_olh, bool assume_noent)
9414 {
9415 int ret;
9416
9417 do {
9418 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
9419 } while (ret == -EAGAIN);
9420
9421 return ret;
9422 }
9423
9424 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
9425 {
9426 RGWObjState *astate;
9427 int r = get_state(&astate, true);
9428 if (r < 0) {
9429 return r;
9430 }
9431
9432 *pmanifest = &astate->manifest;
9433
9434 return 0;
9435 }
9436
9437 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
9438 {
9439 RGWObjState *state;
9440 int r = source->get_state(&state, true);
9441 if (r < 0)
9442 return r;
9443 if (!state->exists)
9444 return -ENOENT;
9445 if (!state->get_attr(name, dest))
9446 return -ENODATA;
9447
9448 return 0;
9449 }
9450
9451
9452 int RGWRados::Object::Stat::stat_async()
9453 {
9454 RGWObjectCtx& ctx = source->get_ctx();
9455 rgw_obj& obj = source->get_obj();
9456 RGWRados *store = source->get_store();
9457
9458 RGWObjState *s = ctx.obj.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
9459 result.obj = obj;
9460 if (s->has_attrs) {
9461 state.ret = 0;
9462 result.size = s->size;
9463 result.mtime = ceph::real_clock::to_timespec(s->mtime);
9464 result.attrs = s->attrset;
9465 result.has_manifest = s->has_manifest;
9466 result.manifest = s->manifest;
9467 return 0;
9468 }
9469
9470 string oid;
9471 string loc;
9472 get_obj_bucket_and_oid_loc(obj, oid, loc);
9473
9474 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
9475 if (r < 0) {
9476 return r;
9477 }
9478
9479 librados::ObjectReadOperation op;
9480 op.stat2(&result.size, &result.mtime, NULL);
9481 op.getxattrs(&result.attrs, NULL);
9482 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9483 state.io_ctx.locator_set_key(loc);
9484 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
9485 if (r < 0) {
9486 ldout(store->ctx(), 5) << __func__
9487 << ": ERROR: aio_operate() returned ret=" << r
9488 << dendl;
9489 return r;
9490 }
9491
9492 return 0;
9493 }
9494
9495
9496 int RGWRados::Object::Stat::wait()
9497 {
9498 if (!state.completion) {
9499 return state.ret;
9500 }
9501
9502 state.completion->wait_for_safe();
9503 state.ret = state.completion->get_return_value();
9504 state.completion->release();
9505
9506 if (state.ret != 0) {
9507 return state.ret;
9508 }
9509
9510 return finish();
9511 }
9512
9513 int RGWRados::Object::Stat::finish()
9514 {
9515 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
9516 if (iter != result.attrs.end()) {
9517 bufferlist& bl = iter->second;
9518 bufferlist::iterator biter = bl.begin();
9519 try {
9520 ::decode(result.manifest, biter);
9521 } catch (buffer::error& err) {
9522 RGWRados *store = source->get_store();
9523 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
9524 return -EIO;
9525 }
9526 result.has_manifest = true;
9527 }
9528
9529 return 0;
9530 }
9531
9532 /**
9533 * Get an attribute for a system object.
9534 * obj: the object to get attr
9535 * name: name of the attr to retrieve
9536 * dest: bufferlist to store the result in
9537 * Returns: 0 on success, -ERR# otherwise.
9538 */
9539 int RGWRados::system_obj_get_attr(rgw_raw_obj& obj, const char *name, bufferlist& dest)
9540 {
9541 rgw_rados_ref ref;
9542 int r = get_system_obj_ref(obj, &ref);
9543 if (r < 0) {
9544 return r;
9545 }
9546
9547 ObjectReadOperation op;
9548
9549 int rval;
9550 op.getxattr(name, &dest, &rval);
9551
9552 r = ref.ioctx.operate(ref.oid, &op, NULL);
9553 if (r < 0)
9554 return r;
9555
9556 return 0;
9557 }
9558
9559 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
9560 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9561 ObjectOperation& op, RGWObjState **pstate)
9562 {
9563 if (!rctx)
9564 return 0;
9565
9566 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
9567 if (r < 0)
9568 return r;
9569
9570 RGWObjState *state = *pstate;
9571
9572 if (!state->is_atomic) {
9573 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not appending atomic test" << dendl;
9574 return 0;
9575 }
9576
9577 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
9578 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9579 } else {
9580 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
9581 }
9582 return 0;
9583 }
9584
9585 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
9586 {
9587 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
9588 }
9589
9590 void RGWRados::Object::invalidate_state()
9591 {
9592 ctx.obj.invalidate(obj);
9593 }
9594
9595 void RGWRados::SystemObject::invalidate_state()
9596 {
9597 ctx.raw.invalidate(obj);
9598 }
9599
9600 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
9601 const char *if_match, const char *if_nomatch, bool removal_op,
9602 bool modify_tail)
9603 {
9604 int r = get_state(&state, false);
9605 if (r < 0)
9606 return r;
9607
9608 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
9609 if_match != NULL || if_nomatch != NULL) &&
9610 (!state->fake_tag);
9611
9612 if (!state->is_atomic) {
9613 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
9614
9615 if (reset_obj) {
9616 op.create(false);
9617 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
9618 }
9619
9620 return 0;
9621 }
9622
9623 if (need_guard) {
9624 /* first verify that the object wasn't replaced under */
9625 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
9626 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
9627 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
9628 }
9629
9630 if (if_match) {
9631 if (strcmp(if_match, "*") == 0) {
9632 // test the object is existing
9633 if (!state->exists) {
9634 return -ERR_PRECONDITION_FAILED;
9635 }
9636 } else {
9637 bufferlist bl;
9638 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9639 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
9640 return -ERR_PRECONDITION_FAILED;
9641 }
9642 }
9643 }
9644
9645 if (if_nomatch) {
9646 if (strcmp(if_nomatch, "*") == 0) {
9647 // test the object is NOT existing
9648 if (state->exists) {
9649 return -ERR_PRECONDITION_FAILED;
9650 }
9651 } else {
9652 bufferlist bl;
9653 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
9654 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
9655 return -ERR_PRECONDITION_FAILED;
9656 }
9657 }
9658 }
9659 }
9660
9661 if (reset_obj) {
9662 if (state->exists) {
9663 op.create(false);
9664 store->remove_rgw_head_obj(op);
9665 } else {
9666 op.create(true);
9667 }
9668 }
9669
9670 if (removal_op) {
9671 /* the object is being removed, no need to update its tag */
9672 return 0;
9673 }
9674
9675 if (ptag) {
9676 state->write_tag = *ptag;
9677 } else {
9678 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
9679 }
9680 bufferlist bl;
9681 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
9682
9683 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
9684
9685 op.setxattr(RGW_ATTR_ID_TAG, bl);
9686 if (modify_tail) {
9687 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
9688 }
9689
9690 return 0;
9691 }
9692
9693 int RGWRados::system_obj_set_attr(void *ctx, rgw_raw_obj& obj, const char *name, bufferlist& bl,
9694 RGWObjVersionTracker *objv_tracker)
9695 {
9696 map<string, bufferlist> attrs;
9697 attrs[name] = bl;
9698 return system_obj_set_attrs(ctx, obj, attrs, NULL, objv_tracker);
9699 }
9700
9701 int RGWRados::system_obj_set_attrs(void *ctx, rgw_raw_obj& obj,
9702 map<string, bufferlist>& attrs,
9703 map<string, bufferlist>* rmattrs,
9704 RGWObjVersionTracker *objv_tracker)
9705 {
9706 rgw_rados_ref ref;
9707 int r = get_system_obj_ref(obj, &ref);
9708 if (r < 0) {
9709 return r;
9710 }
9711 ObjectWriteOperation op;
9712
9713 if (objv_tracker) {
9714 objv_tracker->prepare_op_for_write(&op);
9715 }
9716
9717 map<string, bufferlist>::iterator iter;
9718 if (rmattrs) {
9719 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9720 const string& name = iter->first;
9721 op.rmxattr(name.c_str());
9722 }
9723 }
9724
9725 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9726 const string& name = iter->first;
9727 bufferlist& bl = iter->second;
9728
9729 if (!bl.length())
9730 continue;
9731
9732 op.setxattr(name.c_str(), bl);
9733 }
9734
9735 if (!op.size())
9736 return 0;
9737
9738 bufferlist bl;
9739
9740 r = ref.ioctx.operate(ref.oid, &op);
9741 if (r < 0)
9742 return r;
9743
9744 return 0;
9745 }
9746
9747 /**
9748 * Set an attr on an object.
9749 * bucket: name of the bucket holding the object
9750 * obj: name of the object to set the attr on
9751 * name: the attr to set
9752 * bl: the contents of the attr
9753 * Returns: 0 on success, -ERR# otherwise.
9754 */
9755 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
9756 {
9757 map<string, bufferlist> attrs;
9758 attrs[name] = bl;
9759 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
9760 }
9761
9762 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
9763 map<string, bufferlist>& attrs,
9764 map<string, bufferlist>* rmattrs)
9765 {
9766 rgw_rados_ref ref;
9767 int r = get_obj_head_ref(bucket_info, obj, &ref);
9768 if (r < 0) {
9769 return r;
9770 }
9771 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
9772
9773 ObjectWriteOperation op;
9774 RGWObjState *state = NULL;
9775
9776 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
9777 if (r < 0)
9778 return r;
9779
9780 map<string, bufferlist>::iterator iter;
9781 if (rmattrs) {
9782 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9783 const string& name = iter->first;
9784 op.rmxattr(name.c_str());
9785 }
9786 }
9787
9788 const rgw_bucket& bucket = obj.bucket;
9789
9790 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9791 const string& name = iter->first;
9792 bufferlist& bl = iter->second;
9793
9794 if (!bl.length())
9795 continue;
9796
9797 op.setxattr(name.c_str(), bl);
9798
9799 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
9800 real_time ts;
9801 try {
9802 ::decode(ts, bl);
9803
9804 rgw_obj_index_key obj_key;
9805 obj.key.get_index_key(&obj_key);
9806
9807 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
9808 } catch (buffer::error& err) {
9809 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
9810 }
9811 }
9812 }
9813
9814 if (!op.size())
9815 return 0;
9816
9817 RGWObjectCtx obj_ctx(this);
9818
9819 bufferlist bl;
9820 RGWRados::Bucket bop(this, bucket_info);
9821 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9822
9823 if (state) {
9824 string tag;
9825 append_rand_alpha(cct, tag, tag, 32);
9826 state->write_tag = tag;
9827 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
9828
9829 if (r < 0)
9830 return r;
9831
9832 bl.append(tag.c_str(), tag.size() + 1);
9833 op.setxattr(RGW_ATTR_ID_TAG, bl);
9834 }
9835
9836
9837 real_time mtime = real_clock::now();
9838 struct timespec mtime_ts = real_clock::to_timespec(mtime);
9839 op.mtime2(&mtime_ts);
9840 r = ref.ioctx.operate(ref.oid, &op);
9841 if (state) {
9842 if (r >= 0) {
9843 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
9844 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
9845 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
9846 string etag(etag_bl.c_str(), etag_bl.length());
9847 string content_type(content_type_bl.c_str(), content_type_bl.length());
9848 uint64_t epoch = ref.ioctx.get_last_version();
9849 int64_t poolid = ref.ioctx.get_id();
9850 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
9851 mtime, etag, content_type, &acl_bl,
9852 RGW_OBJ_CATEGORY_MAIN, NULL);
9853 } else {
9854 int ret = index_op.cancel();
9855 if (ret < 0) {
9856 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
9857 }
9858 }
9859 }
9860 if (r < 0)
9861 return r;
9862
9863 if (state) {
9864 state->obj_tag.swap(bl);
9865 if (rmattrs) {
9866 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
9867 state->attrset.erase(iter->first);
9868 }
9869 }
9870 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
9871 state->attrset[iter->first] = iter->second;
9872 }
9873 }
9874
9875 return 0;
9876 }
9877
9878 int RGWRados::Object::Read::prepare()
9879 {
9880 RGWRados *store = source->get_store();
9881 CephContext *cct = store->ctx();
9882
9883 bufferlist etag;
9884
9885 map<string, bufferlist>::iterator iter;
9886
9887 RGWObjState *astate;
9888 int r = source->get_state(&astate, true);
9889 if (r < 0)
9890 return r;
9891
9892 if (!astate->exists) {
9893 return -ENOENT;
9894 }
9895
9896 const RGWBucketInfo& bucket_info = source->get_bucket_info();
9897
9898 state.obj = astate->obj;
9899 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
9900
9901 r = store->get_obj_head_ioctx(bucket_info, state.obj, &state.io_ctx);
9902 if (r < 0) {
9903 return r;
9904 }
9905 if (params.attrs) {
9906 *params.attrs = astate->attrset;
9907 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
9908 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
9909 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
9910 }
9911 }
9912 }
9913
9914 /* Convert all times go GMT to make them compatible */
9915 if (conds.mod_ptr || conds.unmod_ptr) {
9916 obj_time_weight src_weight;
9917 src_weight.init(astate);
9918 src_weight.high_precision = conds.high_precision_time;
9919
9920 obj_time_weight dest_weight;
9921 dest_weight.high_precision = conds.high_precision_time;
9922
9923 if (conds.mod_ptr) {
9924 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9925 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9926 if (!(dest_weight < src_weight)) {
9927 return -ERR_NOT_MODIFIED;
9928 }
9929 }
9930
9931 if (conds.unmod_ptr) {
9932 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
9933 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
9934 if (dest_weight < src_weight) {
9935 return -ERR_PRECONDITION_FAILED;
9936 }
9937 }
9938 }
9939 if (conds.if_match || conds.if_nomatch) {
9940 r = get_attr(RGW_ATTR_ETAG, etag);
9941 if (r < 0)
9942 return r;
9943
9944 if (conds.if_match) {
9945 string if_match_str = rgw_string_unquote(conds.if_match);
9946 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-Match: " << if_match_str << dendl;
9947 if (if_match_str.compare(etag.c_str()) != 0) {
9948 return -ERR_PRECONDITION_FAILED;
9949 }
9950 }
9951
9952 if (conds.if_nomatch) {
9953 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
9954 ldout(cct, 10) << "ETag: " << etag.c_str() << " " << " If-NoMatch: " << if_nomatch_str << dendl;
9955 if (if_nomatch_str.compare(etag.c_str()) == 0) {
9956 return -ERR_NOT_MODIFIED;
9957 }
9958 }
9959 }
9960
9961 if (params.obj_size)
9962 *params.obj_size = astate->size;
9963 if (params.lastmod)
9964 *params.lastmod = astate->mtime;
9965
9966 return 0;
9967 }
9968
9969 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
9970 {
9971 if (ofs < 0) {
9972 ofs += obj_size;
9973 if (ofs < 0)
9974 ofs = 0;
9975 end = obj_size - 1;
9976 } else if (end < 0) {
9977 end = obj_size - 1;
9978 }
9979
9980 if (obj_size > 0) {
9981 if (ofs >= (off_t)obj_size) {
9982 return -ERANGE;
9983 }
9984 if (end >= (off_t)obj_size) {
9985 end = obj_size - 1;
9986 }
9987 }
9988 return 0;
9989 }
9990
9991 int RGWRados::SystemObject::get_state(RGWRawObjState **pstate, RGWObjVersionTracker *objv_tracker)
9992 {
9993 return store->get_system_obj_state(&ctx, obj, pstate, objv_tracker);
9994 }
9995
9996 int RGWRados::stat_system_obj(RGWObjectCtx& obj_ctx,
9997 RGWRados::SystemObject::Read::GetObjState& state,
9998 rgw_raw_obj& obj,
9999 map<string, bufferlist> *attrs,
10000 real_time *lastmod,
10001 uint64_t *obj_size,
10002 RGWObjVersionTracker *objv_tracker)
10003 {
10004 RGWRawObjState *astate = NULL;
10005
10006 int r = get_system_obj_state(&obj_ctx, obj, &astate, objv_tracker);
10007 if (r < 0)
10008 return r;
10009
10010 if (!astate->exists) {
10011 return -ENOENT;
10012 }
10013
10014 if (attrs) {
10015 *attrs = astate->attrset;
10016 if (cct->_conf->subsys.should_gather(ceph_subsys_rgw, 20)) {
10017 map<string, bufferlist>::iterator iter;
10018 for (iter = attrs->begin(); iter != attrs->end(); ++iter) {
10019 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
10020 }
10021 }
10022 }
10023
10024 if (obj_size)
10025 *obj_size = astate->size;
10026 if (lastmod)
10027 *lastmod = astate->mtime;
10028
10029 return 0;
10030 }
10031
10032
10033 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
10034 {
10035 RGWRados *store = target->get_store();
10036 BucketShard *bs;
10037 int r;
10038
10039 #define NUM_RESHARD_RETRIES 10
10040 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
10041 int ret = get_bucket_shard(&bs);
10042 if (ret < 0) {
10043 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10044 return ret;
10045 }
10046 r = call(bs);
10047 if (r != -ERR_BUSY_RESHARDING) {
10048 break;
10049 }
10050 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
10051 string new_bucket_id;
10052 r = store->block_while_resharding(bs, &new_bucket_id);
10053 if (r == -ERR_BUSY_RESHARDING) {
10054 continue;
10055 }
10056 if (r < 0) {
10057 return r;
10058 }
10059 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
10060 i = 0; /* resharding is finished, make sure we can retry */
10061 r = target->update_bucket_id(new_bucket_id);
10062 if (r < 0) {
10063 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
10064 return r;
10065 }
10066 invalidate_bs();
10067 }
10068
10069 if (r < 0) {
10070 return r;
10071 }
10072
10073 if (pbs) {
10074 *pbs = bs;
10075 }
10076
10077 return 0;
10078 }
10079
10080 int RGWRados::SystemObject::Read::stat(RGWObjVersionTracker *objv_tracker)
10081 {
10082 RGWRados *store = source->get_store();
10083 rgw_raw_obj& obj = source->get_obj();
10084
10085 return store->stat_system_obj(source->get_ctx(), state, obj, stat_params.attrs,
10086 stat_params.lastmod, stat_params.obj_size, objv_tracker);
10087 }
10088
10089 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
10090 {
10091 if (blind) {
10092 return 0;
10093 }
10094 RGWRados *store = target->get_store();
10095
10096 if (write_tag && write_tag->length()) {
10097 optag = string(write_tag->c_str(), write_tag->length());
10098 } else {
10099 if (optag.empty()) {
10100 append_rand_alpha(store->ctx(), optag, optag, 32);
10101 }
10102 }
10103
10104 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
10105 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
10106 });
10107
10108 if (r < 0) {
10109 return r;
10110 }
10111 prepared = true;
10112
10113 return 0;
10114 }
10115
10116 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
10117 uint64_t size, uint64_t accounted_size,
10118 ceph::real_time& ut, const string& etag,
10119 const string& content_type,
10120 bufferlist *acl_bl,
10121 RGWObjCategory category,
10122 list<rgw_obj_index_key> *remove_objs, const string *user_data)
10123 {
10124 if (blind) {
10125 return 0;
10126 }
10127 RGWRados *store = target->get_store();
10128 BucketShard *bs;
10129
10130 int ret = get_bucket_shard(&bs);
10131 if (ret < 0) {
10132 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10133 return ret;
10134 }
10135
10136 rgw_bucket_dir_entry ent;
10137 obj.key.get_index_key(&ent.key);
10138 ent.meta.size = size;
10139 ent.meta.accounted_size = accounted_size;
10140 ent.meta.mtime = ut;
10141 ent.meta.etag = etag;
10142 if (user_data)
10143 ent.meta.user_data = *user_data;
10144
10145 ACLOwner owner;
10146 if (acl_bl && acl_bl->length()) {
10147 int ret = store->decode_policy(*acl_bl, &owner);
10148 if (ret < 0) {
10149 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
10150 }
10151 }
10152 ent.meta.owner = owner.get_id().to_str();
10153 ent.meta.owner_display_name = owner.get_display_name();
10154 ent.meta.content_type = content_type;
10155
10156 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
10157
10158 if (target->bucket_info.datasync_flag_enabled()) {
10159 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10160 if (r < 0) {
10161 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10162 }
10163 }
10164
10165 return ret;
10166 }
10167
10168 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
10169 real_time& removed_mtime,
10170 list<rgw_obj_index_key> *remove_objs)
10171 {
10172 if (blind) {
10173 return 0;
10174 }
10175 RGWRados *store = target->get_store();
10176 BucketShard *bs;
10177
10178 int ret = get_bucket_shard(&bs);
10179 if (ret < 0) {
10180 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
10181 return ret;
10182 }
10183
10184 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
10185
10186 if (target->bucket_info.datasync_flag_enabled()) {
10187 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10188 if (r < 0) {
10189 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10190 }
10191 }
10192
10193 return ret;
10194 }
10195
10196
10197 int RGWRados::Bucket::UpdateIndex::cancel()
10198 {
10199 if (blind) {
10200 return 0;
10201 }
10202 RGWRados *store = target->get_store();
10203 BucketShard *bs;
10204
10205 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
10206 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
10207 });
10208
10209 /*
10210 * need to update data log anyhow, so that whoever follows needs to update its internal markers
10211 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
10212 * have no way to tell that they're all caught up
10213 */
10214 if (target->bucket_info.datasync_flag_enabled()) {
10215 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
10216 if (r < 0) {
10217 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
10218 }
10219 }
10220
10221 return ret;
10222 }
10223
10224 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
10225 {
10226 RGWRados *store = source->get_store();
10227 CephContext *cct = store->ctx();
10228
10229 rgw_raw_obj read_obj;
10230 uint64_t read_ofs = ofs;
10231 uint64_t len, read_len;
10232 bool reading_from_head = true;
10233 ObjectReadOperation op;
10234
10235 bool merge_bl = false;
10236 bufferlist *pbl = &bl;
10237 bufferlist read_bl;
10238 uint64_t max_chunk_size;
10239
10240 RGWObjState *astate;
10241 int r = source->get_state(&astate, true);
10242 if (r < 0)
10243 return r;
10244
10245 if (end < 0)
10246 len = 0;
10247 else
10248 len = end - ofs + 1;
10249
10250 if (astate->has_manifest && astate->manifest.has_tail()) {
10251 /* now get the relevant object part */
10252 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10253
10254 uint64_t stripe_ofs = iter.get_stripe_ofs();
10255 read_obj = iter.get_location().get_raw_obj(store);
10256 len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10257 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10258 reading_from_head = (read_obj == state.head_obj);
10259 } else {
10260 read_obj = state.head_obj;
10261 }
10262
10263 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
10264 if (r < 0) {
10265 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
10266 return r;
10267 }
10268
10269 if (len > max_chunk_size)
10270 len = max_chunk_size;
10271
10272
10273 state.io_ctx.locator_set_key(read_obj.loc);
10274
10275 read_len = len;
10276
10277 if (reading_from_head) {
10278 /* only when reading from the head object do we need to do the atomic test */
10279 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
10280 if (r < 0)
10281 return r;
10282
10283 if (astate && astate->prefetch_data) {
10284 if (!ofs && astate->data.length() >= len) {
10285 bl = astate->data;
10286 return bl.length();
10287 }
10288
10289 if (ofs < astate->data.length()) {
10290 unsigned copy_len = min((uint64_t)astate->data.length() - ofs, len);
10291 astate->data.copy(ofs, copy_len, bl);
10292 read_len -= copy_len;
10293 read_ofs += copy_len;
10294 if (!read_len)
10295 return bl.length();
10296
10297 merge_bl = true;
10298 pbl = &read_bl;
10299 }
10300 }
10301 }
10302
10303 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
10304 op.read(read_ofs, read_len, pbl, NULL);
10305
10306 r = state.io_ctx.operate(read_obj.oid, &op, NULL);
10307 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10308
10309 if (r < 0) {
10310 return r;
10311 }
10312
10313 if (merge_bl) {
10314 bl.append(read_bl);
10315 }
10316
10317 return bl.length();
10318 }
10319
10320 int RGWRados::SystemObject::Read::GetObjState::get_ref(RGWRados *store, rgw_raw_obj& obj, rgw_rados_ref **pref)
10321 {
10322 if (!has_ref) {
10323 int r = store->get_raw_obj_ref(obj, &ref);
10324 if (r < 0) {
10325 return r;
10326 }
10327 has_ref = true;
10328 }
10329 *pref = &ref;
10330 return 0;
10331
10332 }
10333
10334 int RGWRados::get_system_obj(RGWObjectCtx& obj_ctx, RGWRados::SystemObject::Read::GetObjState& read_state,
10335 RGWObjVersionTracker *objv_tracker, rgw_raw_obj& obj,
10336 bufferlist& bl, off_t ofs, off_t end,
10337 map<string, bufferlist> *attrs,
10338 rgw_cache_entry_info *cache_info,
10339 boost::optional<obj_version>)
10340 {
10341 uint64_t len;
10342 ObjectReadOperation op;
10343
10344 if (end < 0)
10345 len = 0;
10346 else
10347 len = end - ofs + 1;
10348
10349 if (objv_tracker) {
10350 objv_tracker->prepare_op_for_read(&op);
10351 }
10352
10353 ldout(cct, 20) << "rados->read ofs=" << ofs << " len=" << len << dendl;
10354 op.read(ofs, len, &bl, NULL);
10355
10356 if (attrs) {
10357 op.getxattrs(attrs, NULL);
10358 }
10359
10360 rgw_rados_ref *ref;
10361 int r = read_state.get_ref(this, obj, &ref);
10362 if (r < 0) {
10363 ldout(cct, 20) << "read_state.get_ref() on obj=" << obj << " returned " << r << dendl;
10364 return r;
10365 }
10366 r = ref->ioctx.operate(ref->oid, &op, NULL);
10367 if (r < 0) {
10368 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10369 return r;
10370 }
10371 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
10372
10373 uint64_t op_ver = ref->ioctx.get_last_version();
10374
10375 if (read_state.last_ver > 0 &&
10376 read_state.last_ver != op_ver) {
10377 ldout(cct, 5) << "raced with an object write, abort" << dendl;
10378 return -ECANCELED;
10379 }
10380
10381 read_state.last_ver = op_ver;
10382
10383 return bl.length();
10384 }
10385
10386 int RGWRados::SystemObject::Read::read(int64_t ofs, int64_t end, bufferlist& bl,
10387 RGWObjVersionTracker *objv_tracker,
10388 boost::optional<obj_version> refresh_version)
10389 {
10390 RGWRados *store = source->get_store();
10391 rgw_raw_obj& obj = source->get_obj();
10392
10393 return store->get_system_obj(source->get_ctx(), state, objv_tracker, obj, bl,
10394 ofs, end, read_params.attrs,
10395 read_params.cache_info, refresh_version);
10396 }
10397
10398 int RGWRados::SystemObject::Read::get_attr(const char *name, bufferlist& dest)
10399 {
10400 RGWRados *store = source->get_store();
10401 rgw_raw_obj& obj = source->get_obj();
10402
10403 return store->system_obj_get_attr(obj, name, dest);
10404 }
10405
10406 struct get_obj_data;
10407
10408 struct get_obj_aio_data {
10409 struct get_obj_data *op_data;
10410 off_t ofs;
10411 off_t len;
10412 };
10413
10414 struct get_obj_io {
10415 off_t len;
10416 bufferlist bl;
10417 };
10418
10419 static void _get_obj_aio_completion_cb(completion_t cb, void *arg);
10420
10421 struct get_obj_data : public RefCountedObject {
10422 CephContext *cct;
10423 RGWRados *rados;
10424 RGWObjectCtx *ctx;
10425 IoCtx io_ctx;
10426 map<off_t, get_obj_io> io_map;
10427 map<off_t, librados::AioCompletion *> completion_map;
10428 uint64_t total_read;
10429 Mutex lock;
10430 Mutex data_lock;
10431 list<get_obj_aio_data> aio_data;
10432 RGWGetDataCB *client_cb;
10433 std::atomic<bool> cancelled = { false };
10434 std::atomic<int64_t> err_code = { 0 };
10435 Throttle throttle;
10436 list<bufferlist> read_list;
10437
10438 explicit get_obj_data(CephContext *_cct)
10439 : cct(_cct),
10440 rados(NULL), ctx(NULL),
10441 total_read(0), lock("get_obj_data"), data_lock("get_obj_data::data_lock"),
10442 client_cb(NULL),
10443 throttle(cct, "get_obj_data", cct->_conf->rgw_get_obj_window_size, false) {}
10444 ~get_obj_data() override { }
10445 void set_cancelled(int r) {
10446 cancelled = true;
10447 err_code = r;
10448 }
10449
10450 bool is_cancelled() {
10451 return cancelled;
10452 }
10453
10454 int get_err_code() {
10455 return err_code;
10456 }
10457
10458 int wait_next_io(bool *done) {
10459 lock.Lock();
10460 map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10461 if (iter == completion_map.end()) {
10462 *done = true;
10463 lock.Unlock();
10464 return 0;
10465 }
10466 off_t cur_ofs = iter->first;
10467 librados::AioCompletion *c = iter->second;
10468 lock.Unlock();
10469
10470 c->wait_for_safe_and_cb();
10471 int r = c->get_return_value();
10472
10473 lock.Lock();
10474 completion_map.erase(cur_ofs);
10475
10476 if (completion_map.empty()) {
10477 *done = true;
10478 }
10479 lock.Unlock();
10480
10481 c->release();
10482
10483 return r;
10484 }
10485
10486 void add_io(off_t ofs, off_t len, bufferlist **pbl, AioCompletion **pc) {
10487 Mutex::Locker l(lock);
10488
10489 const auto& io_iter = io_map.insert(
10490 map<off_t, get_obj_io>::value_type(ofs, get_obj_io()));
10491
10492 assert(io_iter.second); // assert new insertion
10493
10494 get_obj_io& io = (io_iter.first)->second;
10495 *pbl = &io.bl;
10496
10497 struct get_obj_aio_data aio;
10498 aio.ofs = ofs;
10499 aio.len = len;
10500 aio.op_data = this;
10501
10502 aio_data.push_back(aio);
10503
10504 struct get_obj_aio_data *paio_data = &aio_data.back(); /* last element */
10505
10506 librados::AioCompletion *c = librados::Rados::aio_create_completion((void *)paio_data, NULL, _get_obj_aio_completion_cb);
10507 completion_map[ofs] = c;
10508
10509 *pc = c;
10510
10511 /* we have a reference per IO, plus one reference for the calling function.
10512 * reference is dropped for each callback, plus when we're done iterating
10513 * over the parts */
10514 get();
10515 }
10516
10517 void cancel_io(off_t ofs) {
10518 ldout(cct, 20) << "get_obj_data::cancel_io() ofs=" << ofs << dendl;
10519 lock.Lock();
10520 map<off_t, AioCompletion *>::iterator iter = completion_map.find(ofs);
10521 if (iter != completion_map.end()) {
10522 AioCompletion *c = iter->second;
10523 c->release();
10524 completion_map.erase(ofs);
10525 io_map.erase(ofs);
10526 }
10527 lock.Unlock();
10528
10529 /* we don't drop a reference here -- e.g., not calling d->put(), because we still
10530 * need IoCtx to live, as io callback may still be called
10531 */
10532 }
10533
10534 void cancel_all_io() {
10535 ldout(cct, 20) << "get_obj_data::cancel_all_io()" << dendl;
10536 Mutex::Locker l(lock);
10537 for (map<off_t, librados::AioCompletion *>::iterator iter = completion_map.begin();
10538 iter != completion_map.end(); ++iter) {
10539 librados::AioCompletion *c = iter->second;
10540 c->release();
10541 }
10542 }
10543
10544 int get_complete_ios(off_t ofs, list<bufferlist>& bl_list) {
10545 Mutex::Locker l(lock);
10546
10547 map<off_t, get_obj_io>::iterator liter = io_map.begin();
10548
10549 if (liter == io_map.end() ||
10550 liter->first != ofs) {
10551 return 0;
10552 }
10553
10554 map<off_t, librados::AioCompletion *>::iterator aiter;
10555 aiter = completion_map.find(ofs);
10556 if (aiter == completion_map.end()) {
10557 /* completion map does not hold this io, it was cancelled */
10558 return 0;
10559 }
10560
10561 AioCompletion *completion = aiter->second;
10562 int r = completion->get_return_value();
10563 if (r < 0)
10564 return r;
10565
10566 for (; aiter != completion_map.end(); ++aiter) {
10567 completion = aiter->second;
10568 if (!completion->is_safe()) {
10569 /* reached a request that is not yet complete, stop */
10570 break;
10571 }
10572
10573 r = completion->get_return_value();
10574 if (r < 0) {
10575 set_cancelled(r); /* mark it as cancelled, so that we don't continue processing next operations */
10576 return r;
10577 }
10578
10579 total_read += r;
10580
10581 map<off_t, get_obj_io>::iterator old_liter = liter++;
10582 bl_list.push_back(old_liter->second.bl);
10583 io_map.erase(old_liter);
10584 }
10585
10586 return 0;
10587 }
10588 };
10589
10590 static int _get_obj_iterate_cb(const RGWBucketInfo& bucket_info, const rgw_obj& obj, const rgw_raw_obj& read_obj, off_t obj_ofs, off_t read_ofs, off_t len, bool is_head_obj, RGWObjState *astate, void *arg)
10591 {
10592 struct get_obj_data *d = (struct get_obj_data *)arg;
10593
10594 return d->rados->get_obj_iterate_cb(d->ctx, astate, bucket_info, obj, read_obj, obj_ofs, read_ofs, len, is_head_obj, arg);
10595 }
10596
10597 static void _get_obj_aio_completion_cb(completion_t cb, void *arg)
10598 {
10599 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10600 struct get_obj_data *d = aio_data->op_data;
10601
10602 d->rados->get_obj_aio_completion_cb(cb, arg);
10603 }
10604
10605
10606 void RGWRados::get_obj_aio_completion_cb(completion_t c, void *arg)
10607 {
10608 struct get_obj_aio_data *aio_data = (struct get_obj_aio_data *)arg;
10609 struct get_obj_data *d = aio_data->op_data;
10610 off_t ofs = aio_data->ofs;
10611 off_t len = aio_data->len;
10612
10613 list<bufferlist> bl_list;
10614 list<bufferlist>::iterator iter;
10615 int r;
10616
10617 ldout(cct, 20) << "get_obj_aio_completion_cb: io completion ofs=" << ofs << " len=" << len << dendl;
10618 d->throttle.put(len);
10619
10620 r = rados_aio_get_return_value(c);
10621 if (r < 0) {
10622 ldout(cct, 0) << "ERROR: got unexpected error when trying to read object: " << r << dendl;
10623 d->set_cancelled(r);
10624 goto done;
10625 }
10626
10627 if (d->is_cancelled()) {
10628 goto done;
10629 }
10630
10631 d->data_lock.Lock();
10632
10633 r = d->get_complete_ios(ofs, bl_list);
10634 if (r < 0) {
10635 goto done_unlock;
10636 }
10637
10638 d->read_list.splice(d->read_list.end(), bl_list);
10639
10640 done_unlock:
10641 d->data_lock.Unlock();
10642 done:
10643 d->put();
10644 return;
10645 }
10646
10647 int RGWRados::flush_read_list(struct get_obj_data *d)
10648 {
10649 d->data_lock.Lock();
10650 list<bufferlist> l;
10651 l.swap(d->read_list);
10652 d->get();
10653 d->read_list.clear();
10654
10655 d->data_lock.Unlock();
10656
10657 int r = 0;
10658
10659 list<bufferlist>::iterator iter;
10660 for (iter = l.begin(); iter != l.end(); ++iter) {
10661 bufferlist& bl = *iter;
10662 r = d->client_cb->handle_data(bl, 0, bl.length());
10663 if (r < 0) {
10664 dout(0) << "ERROR: flush_read_list(): d->client_cb->handle_data() returned " << r << dendl;
10665 break;
10666 }
10667 }
10668
10669 d->data_lock.Lock();
10670 d->put();
10671 if (r < 0) {
10672 d->set_cancelled(r);
10673 }
10674 d->data_lock.Unlock();
10675 return r;
10676 }
10677
10678 int RGWRados::get_obj_iterate_cb(RGWObjectCtx *ctx, RGWObjState *astate,
10679 const RGWBucketInfo& bucket_info,
10680 const rgw_obj& obj,
10681 const rgw_raw_obj& read_obj,
10682 off_t obj_ofs,
10683 off_t read_ofs, off_t len,
10684 bool is_head_obj, void *arg)
10685 {
10686 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
10687 ObjectReadOperation op;
10688 struct get_obj_data *d = (struct get_obj_data *)arg;
10689 string oid, key;
10690 bufferlist *pbl;
10691 AioCompletion *c;
10692
10693 int r;
10694
10695 if (is_head_obj) {
10696 /* only when reading from the head object do we need to do the atomic test */
10697 r = append_atomic_test(rctx, bucket_info, obj, op, &astate);
10698 if (r < 0)
10699 return r;
10700
10701 if (astate &&
10702 obj_ofs < astate->data.length()) {
10703 unsigned chunk_len = min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
10704
10705 d->data_lock.Lock();
10706 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
10707 d->data_lock.Unlock();
10708 if (r < 0)
10709 return r;
10710
10711 d->lock.Lock();
10712 d->total_read += chunk_len;
10713 d->lock.Unlock();
10714
10715 len -= chunk_len;
10716 read_ofs += chunk_len;
10717 obj_ofs += chunk_len;
10718 if (!len)
10719 return 0;
10720 }
10721 }
10722
10723 d->throttle.get(len);
10724 if (d->is_cancelled()) {
10725 return d->get_err_code();
10726 }
10727
10728 /* add io after we check that we're not cancelled, otherwise we're going to have trouble
10729 * cleaning up
10730 */
10731 d->add_io(obj_ofs, len, &pbl, &c);
10732
10733 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
10734 op.read(read_ofs, len, pbl, NULL);
10735
10736 librados::IoCtx io_ctx(d->io_ctx);
10737 io_ctx.locator_set_key(read_obj.loc);
10738
10739 r = io_ctx.aio_operate(read_obj.oid, c, &op, NULL);
10740 if (r < 0) {
10741 ldout(cct, 0) << "rados->aio_operate r=" << r << dendl;
10742 goto done_err;
10743 }
10744
10745 // Flush data to client if there is any
10746 r = flush_read_list(d);
10747 if (r < 0)
10748 return r;
10749
10750 return 0;
10751
10752 done_err:
10753 ldout(cct, 20) << "cancelling io r=" << r << " obj_ofs=" << obj_ofs << dendl;
10754 d->set_cancelled(r);
10755 d->cancel_io(obj_ofs);
10756
10757 return r;
10758 }
10759
10760 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
10761 {
10762 RGWRados *store = source->get_store();
10763 CephContext *cct = store->ctx();
10764
10765 struct get_obj_data *data = new get_obj_data(cct);
10766 bool done = false;
10767
10768 RGWObjectCtx& obj_ctx = source->get_ctx();
10769
10770 data->rados = store;
10771 data->io_ctx.dup(state.io_ctx);
10772 data->client_cb = cb;
10773
10774 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj, ofs, end, cct->_conf->rgw_get_obj_max_req_size, _get_obj_iterate_cb, (void *)data);
10775 if (r < 0) {
10776 data->cancel_all_io();
10777 goto done;
10778 }
10779
10780 while (!done) {
10781 r = data->wait_next_io(&done);
10782 if (r < 0) {
10783 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10784 data->cancel_all_io();
10785 break;
10786 }
10787 r = store->flush_read_list(data);
10788 if (r < 0) {
10789 dout(10) << "get_obj_iterate() r=" << r << ", canceling all io" << dendl;
10790 data->cancel_all_io();
10791 break;
10792 }
10793 }
10794
10795 done:
10796 data->put();
10797 return r;
10798 }
10799
10800 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
10801 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
10802 off_t ofs, off_t end,
10803 uint64_t max_chunk_size,
10804 int (*iterate_obj_cb)(const RGWBucketInfo&, const rgw_obj& obj,
10805 const rgw_raw_obj&, off_t, off_t, off_t, bool,
10806 RGWObjState *, void *),
10807 void *arg)
10808 {
10809 rgw_raw_obj head_obj;
10810 rgw_raw_obj read_obj;
10811 uint64_t read_ofs = ofs;
10812 uint64_t len;
10813 bool reading_from_head = true;
10814 RGWObjState *astate = NULL;
10815
10816 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
10817
10818 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
10819 if (r < 0) {
10820 return r;
10821 }
10822
10823 if (end < 0)
10824 len = 0;
10825 else
10826 len = end - ofs + 1;
10827
10828 if (astate->has_manifest) {
10829 /* now get the relevant object stripe */
10830 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
10831
10832 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
10833
10834 for (; iter != obj_end && ofs <= end; ++iter) {
10835 off_t stripe_ofs = iter.get_stripe_ofs();
10836 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
10837
10838 while (ofs < next_stripe_ofs && ofs <= end) {
10839 read_obj = iter.get_location().get_raw_obj(this);
10840 uint64_t read_len = min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
10841 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
10842
10843 if (read_len > max_chunk_size) {
10844 read_len = max_chunk_size;
10845 }
10846
10847 reading_from_head = (read_obj == head_obj);
10848 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
10849 if (r < 0) {
10850 return r;
10851 }
10852
10853 len -= read_len;
10854 ofs += read_len;
10855 }
10856 }
10857 } else {
10858 while (ofs <= end) {
10859 read_obj = head_obj;
10860 uint64_t read_len = min(len, max_chunk_size);
10861
10862 r = iterate_obj_cb(bucket_info, obj, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
10863 if (r < 0) {
10864 return r;
10865 }
10866
10867 len -= read_len;
10868 ofs += read_len;
10869 }
10870 }
10871
10872 return 0;
10873 }
10874
10875 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
10876 {
10877 rgw_rados_ref ref;
10878 int r = get_obj_head_ref(bucket_info, obj, &ref);
10879 if (r < 0) {
10880 return r;
10881 }
10882
10883 return ref.ioctx.operate(ref.oid, op);
10884 }
10885
10886 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
10887 {
10888 rgw_rados_ref ref;
10889 int r = get_obj_head_ref(bucket_info, obj, &ref);
10890 if (r < 0) {
10891 return r;
10892 }
10893
10894 bufferlist outbl;
10895
10896 return ref.ioctx.operate(ref.oid, op, &outbl);
10897 }
10898
10899 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
10900 {
10901 ObjectWriteOperation op;
10902
10903 assert(olh_obj.key.instance.empty());
10904
10905 bool has_tag = (state.exists && has_olh_tag(state.attrset));
10906
10907 if (!state.exists) {
10908 op.create(true);
10909 } else {
10910 op.assert_exists();
10911 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
10912 op.mtime2(&mtime_ts);
10913 }
10914
10915 /*
10916 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
10917 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
10918 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
10919 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
10920 * log will reflect that.
10921 *
10922 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
10923 * is used for object data instance, olh_tag for olh instance.
10924 */
10925 if (has_tag) {
10926 /* guard against racing writes */
10927 bucket_index_guard_olh_op(state, op);
10928 }
10929
10930 if (!has_tag) {
10931 /* obj tag */
10932 string obj_tag;
10933 int ret = gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
10934 if (ret < 0) {
10935 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10936 return ret;
10937 }
10938 bufferlist bl;
10939 bl.append(obj_tag.c_str(), obj_tag.size());
10940 op.setxattr(RGW_ATTR_ID_TAG, bl);
10941
10942 state.attrset[RGW_ATTR_ID_TAG] = bl;
10943 state.obj_tag = bl;
10944
10945 /* olh tag */
10946 string olh_tag;
10947 ret = gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
10948 if (ret < 0) {
10949 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10950 return ret;
10951 }
10952 bufferlist olh_bl;
10953 olh_bl.append(olh_tag.c_str(), olh_tag.size());
10954 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
10955
10956 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
10957 state.olh_tag = olh_bl;
10958 state.is_olh = true;
10959
10960 bufferlist verbl;
10961 op.setxattr(RGW_ATTR_OLH_VER, verbl);
10962 }
10963
10964 bufferlist bl;
10965 RGWOLHPendingInfo pending_info;
10966 pending_info.time = real_clock::now();
10967 ::encode(pending_info, bl);
10968
10969 #define OLH_PENDING_TAG_LEN 32
10970 /* tag will start with current time epoch, this so that entries are sorted by time */
10971 char buf[32];
10972 utime_t ut(pending_info.time);
10973 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
10974 *op_tag = buf;
10975
10976 string s;
10977 int ret = gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
10978 if (ret < 0) {
10979 ldout(cct, 0) << "ERROR: gen_rand_alphanumeric_lower() returned ret=" << ret << dendl;
10980 return ret;
10981 }
10982 op_tag->append(s);
10983
10984 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
10985 attr_name.append(*op_tag);
10986
10987 op.setxattr(attr_name.c_str(), bl);
10988
10989 ret = obj_operate(bucket_info, olh_obj, &op);
10990 if (ret < 0) {
10991 return ret;
10992 }
10993
10994 state.exists = true;
10995 state.attrset[attr_name] = bl;
10996
10997 return 0;
10998 }
10999
11000 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
11001 {
11002 int ret;
11003
11004 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
11005 if (ret == -EEXIST) {
11006 ret = -ECANCELED;
11007 }
11008
11009 return ret;
11010 }
11011
11012 int RGWRados::guard_reshard(BucketShard *bs, const rgw_obj& obj_instance, std::function<int(BucketShard *)> call)
11013 {
11014 rgw_obj obj;
11015 const rgw_obj *pobj = &obj_instance;
11016 int r;
11017
11018 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
11019 r = bs->init(pobj->bucket, *pobj);
11020 if (r < 0) {
11021 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
11022 return r;
11023 }
11024 r = call(bs);
11025 if (r != -ERR_BUSY_RESHARDING) {
11026 break;
11027 }
11028 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
11029 string new_bucket_id;
11030 r = block_while_resharding(bs, &new_bucket_id);
11031 if (r == -ERR_BUSY_RESHARDING) {
11032 continue;
11033 }
11034 if (r < 0) {
11035 return r;
11036 }
11037 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
11038 i = 0; /* resharding is finished, make sure we can retry */
11039
11040 obj = *pobj;
11041 obj.bucket.update_bucket_id(new_bucket_id);
11042 pobj = &obj;
11043 }
11044
11045 if (r < 0) {
11046 return r;
11047 }
11048
11049 return 0;
11050 }
11051
11052 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs, string *new_bucket_id)
11053 {
11054 std::shared_ptr<RGWReshardWait> waiter = reshard_wait;
11055
11056 return waiter->block_while_resharding(bs, new_bucket_id);
11057 }
11058
11059 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
11060 bool delete_marker,
11061 const string& op_tag,
11062 struct rgw_bucket_dir_entry_meta *meta,
11063 uint64_t olh_epoch,
11064 real_time unmod_since, bool high_precision_time, rgw_zone_set *_zones_trace)
11065 {
11066 rgw_rados_ref ref;
11067 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11068 if (r < 0) {
11069 return r;
11070 }
11071
11072 rgw_zone_set zones_trace;
11073 if (_zones_trace) {
11074 zones_trace = *_zones_trace;
11075 } else {
11076 zones_trace.insert(get_zone().id);
11077 }
11078
11079 BucketShard bs(this);
11080
11081 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11082 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11083 librados::ObjectWriteOperation op;
11084 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11085 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
11086 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
11087 unmod_since, high_precision_time,
11088 get_zone().log_data, zones_trace);
11089 });
11090 if (r < 0) {
11091 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11092 return r;
11093 }
11094
11095 return 0;
11096 }
11097
11098 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
11099 {
11100 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
11101 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
11102 }
11103
11104 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
11105 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
11106 {
11107 rgw_rados_ref ref;
11108 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11109 if (r < 0) {
11110 return r;
11111 }
11112
11113 rgw_zone_set zones_trace;
11114 if (_zones_trace) {
11115 zones_trace = *_zones_trace;
11116 }
11117 zones_trace.insert(get_zone().id);
11118
11119 BucketShard bs(this);
11120
11121 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
11122 r = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11123 librados::ObjectWriteOperation op;
11124 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11125 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11126 olh_tag, olh_epoch, get_zone().log_data, zones_trace);
11127 });
11128 if (r < 0) {
11129 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
11130 return r;
11131 }
11132
11133 return 0;
11134 }
11135
11136 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
11137 const rgw_obj& obj_instance, uint64_t ver_marker,
11138 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
11139 bool *is_truncated)
11140 {
11141 rgw_rados_ref ref;
11142 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11143 if (r < 0) {
11144 return r;
11145 }
11146
11147 BucketShard bs(this);
11148 int ret = bs.init(obj_instance.bucket, obj_instance);
11149 if (ret < 0) {
11150 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11151 return ret;
11152 }
11153
11154 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11155
11156 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11157
11158 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *bs) -> int {
11159 ObjectReadOperation op;
11160 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11161 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
11162 key, ver_marker, olh_tag, log, is_truncated);
11163 });
11164 if (ret < 0) {
11165 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
11166 return ret;
11167 }
11168
11169 return 0;
11170 }
11171
11172 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
11173 {
11174 rgw_rados_ref ref;
11175 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11176 if (r < 0) {
11177 return r;
11178 }
11179
11180 BucketShard bs(this);
11181 int ret = bs.init(obj_instance.bucket, obj_instance);
11182 if (ret < 0) {
11183 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
11184 return ret;
11185 }
11186
11187 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11188
11189 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11190
11191 ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11192 ObjectWriteOperation op;
11193 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11194 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
11195 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
11196 });
11197 if (ret < 0) {
11198 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
11199 return ret;
11200 }
11201
11202 return 0;
11203 }
11204
11205 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
11206 {
11207 rgw_rados_ref ref;
11208 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
11209 if (r < 0) {
11210 return r;
11211 }
11212
11213 BucketShard bs(this);
11214
11215 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
11216
11217 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
11218
11219 int ret = guard_reshard(&bs, obj_instance, [&](BucketShard *pbs) -> int {
11220 ObjectWriteOperation op;
11221 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
11222 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
11223 });
11224 if (ret < 0) {
11225 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
11226 return ret;
11227 }
11228
11229 return 0;
11230 }
11231
/*
 * Apply a batch of olh log entries (as read from the bucket index) to the
 * olh object.
 *
 * state       - cached state of the olh object (its mtime is re-applied)
 * obj         - the olh object
 * olh_tag     - olh id tag the object must still carry for the update
 * log         - entries to apply, keyed by olh version
 * plast_ver   - receives the highest version found in log
 * zones_trace - passed through to delete_obj() for removed instances
 *
 * Steps, in order: build one write op that removes the pending xattr of
 * every log entry (and sets the olh info if the last relevant entry was a
 * link), delete the instances scheduled for removal, run the write op,
 * trim the index olh log up to the last version, and finally, if the last
 * relevant entry was an unlink, try to remove the olh object and clear its
 * index entry.
 *
 * Returns 0 on success (including an empty log, in which case *plast_ver
 * is left untouched) or a negative error.
 */
11232 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11233 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
11234 uint64_t *plast_ver, rgw_zone_set* zones_trace)
11235 {
11236 if (log.empty()) {
11237 return 0;
11238 }
11239
11240 librados::ObjectWriteOperation op;
11241
11242 uint64_t last_ver = log.rbegin()->first;
11243 *plast_ver = last_ver;
11244
11245 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
11246
/* guards: olh tag must match and the stored olh version is compared against last_ver */
11247 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11248 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11249
11250 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
11251 op.mtime2(&mtime_ts);
11252
11253 bool need_to_link = false;
11254 cls_rgw_obj_key key;
11255 bool delete_marker = false;
11256 list<cls_rgw_obj_key> remove_instances;
11257 bool need_to_remove = false;
11258
/* replay the log in version order; a later link/unlink overrides an earlier one */
11259 for (iter = log.begin(); iter != log.end(); ++iter) {
11260 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
11261 for (; viter != iter->second.end(); ++viter) {
11262 rgw_bucket_olh_log_entry& entry = *viter;
11263
11264 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
11265 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
11266 << (entry.delete_marker ? "(delete)" : "") << dendl;
11267 switch (entry.op) {
11268 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
11269 remove_instances.push_back(entry.key);
11270 break;
11271 case CLS_RGW_OLH_OP_LINK_OLH:
11272 need_to_link = true;
11273 need_to_remove = false;
11274 key = entry.key;
11275 delete_marker = entry.delete_marker;
11276 break;
11277 case CLS_RGW_OLH_OP_UNLINK_OLH:
11278 need_to_remove = true;
11279 need_to_link = false;
11280 break;
11281 default:
11282 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
11283 return -EIO;
11284 }
/* every applied entry drops its pending-op xattr from the olh object */
11285 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
11286 attr_name.append(entry.op_tag);
11287 op.rmxattr(attr_name.c_str());
11288 }
11289 }
11290
11291 rgw_rados_ref ref;
11292 int r = get_obj_head_ref(bucket_info, obj, &ref);
11293 if (r < 0) {
11294 return r;
11295 }
11296
11297 const rgw_bucket& bucket = obj.bucket;
11298
11299 if (need_to_link) {
/* point the olh at the instance named by the last link entry */
11300 rgw_obj target(bucket, key);
11301 RGWOLHInfo info;
11302 info.target = target;
11303 info.removed = delete_marker;
11304 bufferlist bl;
11305 ::encode(info, bl);
11306 op.setxattr(RGW_ATTR_OLH_INFO, bl);
11307 }
11308
11309 /* first remove object instances */
11310 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
11311 liter != remove_instances.end(); ++liter) {
11312 cls_rgw_obj_key& key = *liter;
11313 rgw_obj obj_instance(bucket, key);
11314 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
11315 if (ret < 0 && ret != -ENOENT) {
11316 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
11317 return ret;
11318 }
11319 }
11320
11321 /* update olh object */
11322 r = ref.ioctx.operate(ref.oid, &op);
/* -ECANCELED is not treated as a failure here; presumably one of the cmpxattr guards did not match — TODO confirm */
11323 if (r == -ECANCELED) {
11324 r = 0;
11325 }
11326 if (r < 0) {
11327 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11328 return r;
11329 }
11330
11331 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
11332 if (r < 0) {
11333 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
11334 return r;
11335 }
11336
11337 if (need_to_remove) {
11338 ObjectWriteOperation rm_op;
11339
11340 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
11341 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
11342 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
11343 rm_op.remove();
11344
11345 r = ref.ioctx.operate(ref.oid, &rm_op);
11346 if (r == -ECANCELED) {
11347 return 0; /* someone else won this race */
11348 } else {
11349 /*
11350 * only clear if was successful, otherwise we might clobber pending operations on this object
11351 */
/* NOTE(review): this branch is taken for every result other than -ECANCELED, including other negative errors from the remove — confirm whether that matches the intent stated above */
11352 r = bucket_index_clear_olh(bucket_info, state, obj);
11353 if (r < 0) {
11354 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
11355 return r;
11356 }
11357 }
11358 }
11359
11360 return 0;
11361 }
11362
11363 /*
11364 * read olh log and apply it
11365 */
11366 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
11367 {
11368 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
11369 bool is_truncated;
11370 uint64_t ver_marker = 0;
11371
11372 do {
11373 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
11374 if (ret < 0) {
11375 return ret;
11376 }
11377 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
11378 if (ret < 0) {
11379 return ret;
11380 }
11381 } while (is_truncated);
11382
11383 return 0;
11384 }
11385
11386 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
11387 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time, rgw_zone_set *zones_trace)
11388 {
11389 string op_tag;
11390
11391 rgw_obj olh_obj = target_obj;
11392 olh_obj.key.instance.clear();
11393
11394 RGWObjState *state = NULL;
11395
11396 int ret = 0;
11397 int i;
11398
11399 #define MAX_ECANCELED_RETRY 100
11400 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11401 if (ret == -ECANCELED) {
11402 obj_ctx.obj.invalidate(olh_obj);
11403 }
11404
11405 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11406 if (ret < 0) {
11407 return ret;
11408 }
11409
11410 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11411 if (ret < 0) {
11412 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11413 if (ret == -ECANCELED) {
11414 continue;
11415 }
11416 return ret;
11417 }
11418 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker, op_tag, meta, olh_epoch, unmod_since, high_precision_time, zones_trace);
11419 if (ret < 0) {
11420 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
11421 if (ret == -ECANCELED) {
11422 continue;
11423 }
11424 return ret;
11425 }
11426 break;
11427 }
11428
11429 if (i == MAX_ECANCELED_RETRY) {
11430 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11431 return -EIO;
11432 }
11433
11434 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11435 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11436 ret = 0;
11437 }
11438 if (ret < 0) {
11439 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11440 return ret;
11441 }
11442
11443 return 0;
11444 }
11445
11446 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
11447 uint64_t olh_epoch, rgw_zone_set *zones_trace)
11448 {
11449 string op_tag;
11450
11451 rgw_obj olh_obj = target_obj;
11452 olh_obj.key.instance.clear();
11453
11454 RGWObjState *state = NULL;
11455
11456 int ret = 0;
11457 int i;
11458
11459 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
11460 if (ret == -ECANCELED) {
11461 obj_ctx.obj.invalidate(olh_obj);
11462 }
11463
11464 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
11465 if (ret < 0)
11466 return ret;
11467
11468 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
11469 if (ret < 0) {
11470 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
11471 if (ret == -ECANCELED) {
11472 continue;
11473 }
11474 return ret;
11475 }
11476
11477 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
11478
11479 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
11480 if (ret < 0) {
11481 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
11482 if (ret == -ECANCELED) {
11483 continue;
11484 }
11485 return ret;
11486 }
11487 break;
11488 }
11489
11490 if (i == MAX_ECANCELED_RETRY) {
11491 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
11492 return -EIO;
11493 }
11494
11495 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
11496 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
11497 return 0;
11498 }
11499 if (ret < 0) {
11500 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
11501 return ret;
11502 }
11503
11504 return 0;
11505 }
11506
11507 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
11508 {
11509 #define OBJ_INSTANCE_LEN 32
11510 char buf[OBJ_INSTANCE_LEN + 1];
11511
11512 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
11513 no underscore for instance name due to the way we encode the raw keys */
11514
11515 target_obj->key.set_instance(buf);
11516 }
11517
11518 static void filter_attrset(map<string, bufferlist>& unfiltered_attrset, const string& check_prefix,
11519 map<string, bufferlist> *attrset)
11520 {
11521 attrset->clear();
11522 map<string, bufferlist>::iterator iter;
11523 for (iter = unfiltered_attrset.lower_bound(check_prefix);
11524 iter != unfiltered_attrset.end(); ++iter) {
11525 if (!boost::algorithm::starts_with(iter->first, check_prefix))
11526 break;
11527 (*attrset)[iter->first] = iter->second;
11528 }
11529 }
11530
11531 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
11532 {
11533 map<string, bufferlist> unfiltered_attrset;
11534
11535 ObjectReadOperation op;
11536 op.getxattrs(&unfiltered_attrset, NULL);
11537
11538 bufferlist outbl;
11539 int r = obj_operate(bucket_info, obj, &op);
11540
11541 if (r < 0) {
11542 return r;
11543 }
11544 map<string, bufferlist> attrset;
11545
11546 filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
11547
11548 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
11549 if (iter == attrset.end()) { /* not an olh */
11550 return -EINVAL;
11551 }
11552
11553 try {
11554 bufferlist::iterator biter = iter->second.begin();
11555 ::decode(*olh, biter);
11556 } catch (buffer::error& err) {
11557 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11558 return -EIO;
11559 }
11560
11561 return 0;
11562 }
11563
11564 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
11565 map<string, bufferlist> *rm_pending_entries)
11566 {
11567 map<string, bufferlist>::iterator iter = pending_entries.begin();
11568
11569 real_time now = real_clock::now();
11570
11571 while (iter != pending_entries.end()) {
11572 bufferlist::iterator biter = iter->second.begin();
11573 RGWOLHPendingInfo pending_info;
11574 try {
11575 ::decode(pending_info, biter);
11576 } catch (buffer::error& err) {
11577 /* skipping bad entry, we could remove it but it might hide a bug */
11578 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
11579 ++iter;
11580 continue;
11581 }
11582
11583 map<string, bufferlist>::iterator cur_iter = iter;
11584 ++iter;
11585 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
11586 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
11587 pending_entries.erase(cur_iter);
11588 } else {
11589 /* entries names are sorted by time (rounded to a second) */
11590 break;
11591 }
11592 }
11593 }
11594
11595 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
11596 {
11597 ObjectWriteOperation op;
11598
11599 bucket_index_guard_olh_op(state, op);
11600
11601 for (map<string, bufferlist>::iterator iter = pending_attrs.begin(); iter != pending_attrs.end(); ++iter) {
11602 op.rmxattr(iter->first.c_str());
11603 }
11604
11605 rgw_rados_ref ref;
11606 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
11607 if (r < 0) {
11608 return r;
11609 }
11610
11611 /* update olh object */
11612 r = ref.ioctx.operate(ref.oid, &op);
11613 if (r == -ENOENT || r == -ECANCELED) {
11614 /* raced with some other change, shouldn't sweat about it */
11615 r = 0;
11616 }
11617 if (r < 0) {
11618 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
11619 return r;
11620 }
11621
11622 return 0;
11623 }
11624
11625 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
11626 {
11627 map<string, bufferlist> pending_entries;
11628 filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
11629
11630 map<string, bufferlist> rm_pending_entries;
11631 check_pending_olh_entries(pending_entries, &rm_pending_entries);
11632
11633 if (!rm_pending_entries.empty()) {
11634 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
11635 if (ret < 0) {
11636 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
11637 return ret;
11638 }
11639 }
11640 if (!pending_entries.empty()) {
11641 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
11642
11643 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
11644 if (ret < 0) {
11645 return ret;
11646 }
11647 }
11648
11649 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11650 assert(iter != state->attrset.end());
11651 RGWOLHInfo olh;
11652 try {
11653 bufferlist::iterator biter = iter->second.begin();
11654 ::decode(olh, biter);
11655 } catch (buffer::error& err) {
11656 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
11657 return -EIO;
11658 }
11659
11660 if (olh.removed) {
11661 return -ENOENT;
11662 }
11663
11664 *target = olh.target;
11665
11666 return 0;
11667 }
11668
11669 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
11670 map<string, bufferlist> *attrs, bufferlist *first_chunk,
11671 RGWObjVersionTracker *objv_tracker)
11672 {
11673 rgw_rados_ref ref;
11674 int r = get_raw_obj_ref(obj, &ref);
11675 if (r < 0) {
11676 return r;
11677 }
11678
11679 map<string, bufferlist> unfiltered_attrset;
11680 uint64_t size = 0;
11681 struct timespec mtime_ts;
11682
11683 ObjectReadOperation op;
11684 if (objv_tracker) {
11685 objv_tracker->prepare_op_for_read(&op);
11686 }
11687 if (attrs) {
11688 op.getxattrs(&unfiltered_attrset, NULL);
11689 }
11690 if (psize || pmtime) {
11691 op.stat2(&size, &mtime_ts, NULL);
11692 }
11693 if (first_chunk) {
11694 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
11695 }
11696 bufferlist outbl;
11697 r = ref.ioctx.operate(ref.oid, &op, &outbl);
11698
11699 if (epoch) {
11700 *epoch = ref.ioctx.get_last_version();
11701 }
11702
11703 if (r < 0)
11704 return r;
11705
11706 if (psize)
11707 *psize = size;
11708 if (pmtime)
11709 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
11710 if (attrs) {
11711 filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
11712 }
11713
11714 return 0;
11715 }
11716
11717 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
11718 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
11719 {
11720 map<string, rgw_bucket_dir_header> headers;
11721 map<int, string> bucket_instance_ids;
11722 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11723 if (r < 0) {
11724 return r;
11725 }
11726
11727 assert(headers.size() == bucket_instance_ids.size());
11728
11729 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11730 map<int, string>::iterator viter = bucket_instance_ids.begin();
11731 BucketIndexShardsManager ver_mgr;
11732 BucketIndexShardsManager master_ver_mgr;
11733 BucketIndexShardsManager marker_mgr;
11734 char buf[64];
11735 for(; iter != headers.end(); ++iter, ++viter) {
11736 accumulate_raw_stats(iter->second, stats);
11737 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.ver);
11738 ver_mgr.add(viter->first, string(buf));
11739 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->second.master_ver);
11740 master_ver_mgr.add(viter->first, string(buf));
11741 if (shard_id >= 0) {
11742 *max_marker = iter->second.max_marker;
11743 } else {
11744 marker_mgr.add(viter->first, iter->second.max_marker);
11745 }
11746 if (syncstopped != NULL)
11747 *syncstopped = iter->second.syncstopped;
11748 }
11749 ver_mgr.to_string(bucket_ver);
11750 master_ver_mgr.to_string(master_ver);
11751 if (shard_id < 0) {
11752 marker_mgr.to_string(max_marker);
11753 }
11754 return 0;
11755 }
11756
11757 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
11758 map<int, string>& markers)
11759 {
11760 map<string, rgw_bucket_dir_header> headers;
11761 map<int, string> bucket_instance_ids;
11762 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
11763 if (r < 0)
11764 return r;
11765
11766 assert(headers.size() == bucket_instance_ids.size());
11767
11768 map<string, rgw_bucket_dir_header>::iterator iter = headers.begin();
11769 map<int, string>::iterator viter = bucket_instance_ids.begin();
11770
11771 for(; iter != headers.end(); ++iter, ++viter) {
11772 if (shard_id >= 0) {
11773 markers[shard_id] = iter->second.max_marker;
11774 } else {
11775 markers[viter->first] = iter->second.max_marker;
11776 }
11777 }
11778 return 0;
11779 }
11780
11781 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
11782 RGWGetBucketStats_CB *cb;
11783 uint32_t pendings;
11784 map<RGWObjCategory, RGWStorageStats> stats;
11785 int ret_code;
11786 bool should_cb;
11787 Mutex lock;
11788
11789 public:
11790 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
11791 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
11792 lock("RGWGetBucketStatsContext") {}
11793
11794 void handle_response(int r, rgw_bucket_dir_header& header) override {
11795 Mutex::Locker l(lock);
11796 if (should_cb) {
11797 if ( r >= 0) {
11798 accumulate_raw_stats(header, stats);
11799 } else {
11800 ret_code = r;
11801 }
11802
11803 // Are we all done?
11804 if (--pendings == 0) {
11805 if (!ret_code) {
11806 cb->set_response(&stats);
11807 }
11808 cb->handle_response(ret_code);
11809 cb->put();
11810 }
11811 }
11812 }
11813
11814 void unset_cb() {
11815 Mutex::Locker l(lock);
11816 should_cb = false;
11817 }
11818 };
11819
11820 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
11821 {
11822 int num_aio = 0;
11823 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11824 assert(get_ctx);
11825 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
11826 if (r < 0) {
11827 ctx->put();
11828 if (num_aio) {
11829 get_ctx->unset_cb();
11830 }
11831 }
11832 get_ctx->put();
11833 return r;
11834 }
11835
11836 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
11837 RGWGetUserStats_CB *cb;
11838
11839 public:
11840 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
11841 : cb(cb) {}
11842
11843 void handle_response(int r, cls_user_header& header) override {
11844 const cls_user_stats& hs = header.stats;
11845 if (r >= 0) {
11846 RGWStorageStats stats;
11847
11848 stats.size = hs.total_bytes;
11849 stats.size_rounded = hs.total_bytes_rounded;
11850 stats.num_objects = hs.total_entries;
11851
11852 cb->set_response(stats);
11853 }
11854
11855 cb->handle_response(r);
11856
11857 cb->put();
11858 }
11859 };
11860
11861 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
11862 {
11863 string user_str = user.to_str();
11864
11865 cls_user_header header;
11866 int r = cls_user_get_header(user_str, &header);
11867 if (r < 0)
11868 return r;
11869
11870 const cls_user_stats& hs = header.stats;
11871
11872 stats.size = hs.total_bytes;
11873 stats.size_rounded = hs.total_bytes_rounded;
11874 stats.num_objects = hs.total_entries;
11875
11876 return 0;
11877 }
11878
11879 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
11880 {
11881 string user_str = user.to_str();
11882
11883 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
11884 int r = cls_user_get_header_async(user_str, get_ctx);
11885 if (r < 0) {
11886 ctx->put();
11887 delete get_ctx;
11888 return r;
11889 }
11890
11891 return 0;
11892 }
11893
11894 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
11895 {
11896 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
11897 }
11898
11899 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
11900 {
11901 if (!bucket.oid.empty()) {
11902 obj.init(get_zone_params().domain_root, bucket.oid);
11903 } else {
11904 string oid;
11905 get_bucket_meta_oid(bucket, oid);
11906 obj.init(get_zone_params().domain_root, oid);
11907 }
11908 }
11909
11910 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
11911 real_time *pmtime, map<string, bufferlist> *pattrs)
11912 {
11913 size_t pos = meta_key.find(':');
11914 if (pos == string::npos) {
11915 return -EINVAL;
11916 }
11917 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
11918 rgw_bucket_instance_key_to_oid(oid);
11919
11920 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11921 }
11922
11923 int RGWRados::get_bucket_instance_info(RGWObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
11924 real_time *pmtime, map<string, bufferlist> *pattrs)
11925 {
11926 string oid;
11927 if (bucket.oid.empty()) {
11928 get_bucket_meta_oid(bucket, oid);
11929 } else {
11930 oid = bucket.oid;
11931 }
11932
11933 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
11934 }
11935
11936 int RGWRados::get_bucket_instance_from_oid(RGWObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
11937 real_time *pmtime, map<string, bufferlist> *pattrs,
11938 rgw_cache_entry_info *cache_info,
11939 boost::optional<obj_version> refresh_version)
11940 {
11941 ldout(cct, 20) << "reading from " << get_zone_params().domain_root << ":" << oid << dendl;
11942
11943 bufferlist epbl;
11944
11945 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
11946 oid, epbl, &info.objv_tracker, pmtime, pattrs,
11947 cache_info, refresh_version);
11948 if (ret < 0) {
11949 return ret;
11950 }
11951
11952 bufferlist::iterator iter = epbl.begin();
11953 try {
11954 ::decode(info, iter);
11955 } catch (buffer::error& err) {
11956 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11957 return -EIO;
11958 }
11959 info.bucket.oid = oid;
11960 return 0;
11961 }
11962
11963 int RGWRados::get_bucket_entrypoint_info(RGWObjectCtx& obj_ctx,
11964 const string& tenant_name,
11965 const string& bucket_name,
11966 RGWBucketEntryPoint& entry_point,
11967 RGWObjVersionTracker *objv_tracker,
11968 real_time *pmtime,
11969 map<string, bufferlist> *pattrs,
11970 rgw_cache_entry_info *cache_info,
11971 boost::optional<obj_version> refresh_version)
11972 {
11973 bufferlist bl;
11974 string bucket_entry;
11975
11976 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11977 int ret = rgw_get_system_obj(this, obj_ctx, get_zone_params().domain_root,
11978 bucket_entry, bl, objv_tracker, pmtime, pattrs,
11979 cache_info, refresh_version);
11980 if (ret < 0) {
11981 return ret;
11982 }
11983
11984 bufferlist::iterator iter = bl.begin();
11985 try {
11986 ::decode(entry_point, iter);
11987 } catch (buffer::error& err) {
11988 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
11989 return -EIO;
11990 }
11991 return 0;
11992 }
11993
11994 int RGWRados::convert_old_bucket_info(RGWObjectCtx& obj_ctx,
11995 const string& tenant_name,
11996 const string& bucket_name)
11997 {
11998 RGWBucketEntryPoint entry_point;
11999 real_time ep_mtime;
12000 RGWObjVersionTracker ot;
12001 map<string, bufferlist> attrs;
12002 RGWBucketInfo info;
12003
12004 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
12005
12006 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
12007 if (ret < 0) {
12008 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
12009 return ret;
12010 }
12011
12012 if (!entry_point.has_bucket_info) {
12013 /* already converted! */
12014 return 0;
12015 }
12016
12017 info = entry_point.old_bucket_info;
12018 info.bucket.oid = bucket_name;
12019 info.ep_objv = ot.read_version;
12020
12021 ot.generate_new_write_ver(cct);
12022
12023 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
12024 if (ret < 0) {
12025 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
12026 return ret;
12027 }
12028
12029 return 0;
12030 }
12031
12032 int RGWRados::_get_bucket_info(RGWObjectCtx& obj_ctx,
12033 const string& tenant,
12034 const string& bucket_name,
12035 RGWBucketInfo& info,
12036 real_time *pmtime,
12037 map<string, bufferlist> *pattrs,
12038 boost::optional<obj_version> refresh_version)
12039 {
12040 bucket_info_entry e;
12041 string bucket_entry;
12042 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
12043
12044
12045 if (binfo_cache->find(bucket_entry, &e)) {
12046 if (refresh_version &&
12047 e.info.objv_tracker.read_version.compare(&(*refresh_version))) {
12048 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
12049 << "a failure that should be debugged. I am a nice machine, "
12050 << "so I will try to recover." << dendl;
12051 binfo_cache->invalidate(bucket_entry);
12052 }
12053 info = e.info;
12054 if (pattrs)
12055 *pattrs = e.attrs;
12056 if (pmtime)
12057 *pmtime = e.mtime;
12058 return 0;
12059 }
12060
12061 RGWBucketEntryPoint entry_point;
12062 real_time ep_mtime;
12063 RGWObjVersionTracker ot;
12064 rgw_cache_entry_info entry_cache_info;
12065 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
12066 entry_point, &ot, &ep_mtime, pattrs,
12067 &entry_cache_info, refresh_version);
12068 if (ret < 0) {
12069 /* only init these fields */
12070 info.bucket.tenant = tenant;
12071 info.bucket.name = bucket_name;
12072 return ret;
12073 }
12074
12075 if (entry_point.has_bucket_info) {
12076 info = entry_point.old_bucket_info;
12077 info.bucket.oid = bucket_name;
12078 info.bucket.tenant = tenant;
12079 info.ep_objv = ot.read_version;
12080 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
12081 return 0;
12082 }
12083
12084 /* data is in the bucket instance object, we need to get attributes from there, clear everything
12085 * that we got
12086 */
12087 if (pattrs) {
12088 pattrs->clear();
12089 }
12090
12091 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
12092
12093
12094 /* read bucket instance info */
12095
12096 string oid;
12097 get_bucket_meta_oid(entry_point.bucket, oid);
12098
12099 rgw_cache_entry_info cache_info;
12100
12101 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
12102 &cache_info, refresh_version);
12103 e.info.ep_objv = ot.read_version;
12104 info = e.info;
12105 if (ret < 0) {
12106 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
12107 info.bucket.tenant = tenant;
12108 info.bucket.name = bucket_name;
12109 // XXX and why return anything in case of an error anyway?
12110 return ret;
12111 }
12112
12113 if (pmtime)
12114 *pmtime = e.mtime;
12115 if (pattrs)
12116 *pattrs = e.attrs;
12117
12118 list<rgw_cache_entry_info *> cache_info_entries;
12119 cache_info_entries.push_back(&entry_cache_info);
12120 cache_info_entries.push_back(&cache_info);
12121
12122
12123 /* chain to both bucket entry point and bucket instance */
12124 if (!binfo_cache->put(this, bucket_entry, &e, cache_info_entries)) {
12125 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
12126 }
12127
12128 if (refresh_version &&
12129 refresh_version->compare(&info.objv_tracker.read_version)) {
12130 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
12131 << "have gone squirrelly. An administrator may have forced a "
12132 << "change; otherwise there is a problem somewhere." << dendl;
12133 }
12134
12135 return 0;
12136 }
12137
12138 int RGWRados::get_bucket_info(RGWObjectCtx& obj_ctx,
12139 const string& tenant, const string& bucket_name,
12140 RGWBucketInfo& info,
12141 real_time *pmtime, map<string, bufferlist> *pattrs)
12142 {
12143 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
12144 pattrs, boost::none);
12145 }
12146
12147 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
12148 ceph::real_time *pmtime,
12149 map<string, bufferlist> *pattrs)
12150 {
12151 RGWObjectCtx obj_ctx(this);
12152
12153 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
12154 info, pmtime, pattrs, info.objv_tracker.read_version);
12155 }
12156
12157 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
12158 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
12159 map<string, bufferlist> *pattrs)
12160 {
12161 bufferlist epbl;
12162 ::encode(entry_point, epbl);
12163 string bucket_entry;
12164 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
12165 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
12166 }
12167
12168 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
12169 real_time mtime, map<string, bufferlist> *pattrs)
12170 {
12171 info.has_instance_obj = true;
12172 bufferlist bl;
12173
12174 ::encode(info, bl);
12175
12176 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
12177 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
12178 if (ret == -EEXIST) {
12179 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
12180 * bucket operation on this specific bucket (e.g., being synced from the master), but
12181 * since bucket instace meta object is unique for this specific bucket instace, we don't
12182 * need to return an error.
12183 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
12184 * master, creating a bucket, sending bucket creation to the master, we create the bucket
12185 * locally, while in the sync thread we sync the new bucket.
12186 */
12187 ret = 0;
12188 }
12189 return ret;
12190 }
12191
12192 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
12193 map<string, bufferlist> *pattrs, bool create_entry_point)
12194 {
12195 bool create_head = !info.has_instance_obj || create_entry_point;
12196
12197 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
12198 if (ret < 0) {
12199 return ret;
12200 }
12201
12202 if (!create_head)
12203 return 0; /* done! */
12204
12205 RGWBucketEntryPoint entry_point;
12206 entry_point.bucket = info.bucket;
12207 entry_point.owner = info.owner;
12208 entry_point.creation_time = info.creation_time;
12209 entry_point.linked = true;
12210 RGWObjVersionTracker ot;
12211 if (pep_objv && !pep_objv->tag.empty()) {
12212 ot.write_version = *pep_objv;
12213 } else {
12214 ot.generate_new_write_ver(cct);
12215 if (pep_objv) {
12216 *pep_objv = ot.write_version;
12217 }
12218 }
12219 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
12220 if (ret < 0)
12221 return ret;
12222
12223 return 0;
12224 }
12225
12226 int RGWRados::omap_get_vals(rgw_raw_obj& obj, bufferlist& header, const string& marker, uint64_t count, std::map<string, bufferlist>& m)
12227 {
12228 rgw_rados_ref ref;
12229 int r = get_raw_obj_ref(obj, &ref);
12230 if (r < 0) {
12231 return r;
12232 }
12233
12234 r = ref.ioctx.omap_get_vals(ref.oid, marker, count, &m);
12235 if (r < 0)
12236 return r;
12237
12238 return 0;
12239
12240 }
12241
12242 int RGWRados::omap_get_all(rgw_raw_obj& obj, bufferlist& header,
12243 std::map<string, bufferlist>& m)
12244 {
12245 rgw_rados_ref ref;
12246 int r = get_raw_obj_ref(obj, &ref);
12247 if (r < 0) {
12248 return r;
12249 }
12250
12251 #define MAX_OMAP_GET_ENTRIES 1024
12252 const int count = MAX_OMAP_GET_ENTRIES;
12253 string start_after;
12254
12255 while (true) {
12256 std::map<string, bufferlist> t;
12257 r = ref.ioctx.omap_get_vals(ref.oid, start_after, count, &t);
12258 if (r < 0) {
12259 return r;
12260 }
12261 if (t.empty()) {
12262 break;
12263 }
12264 start_after = t.rbegin()->first;
12265 m.insert(t.begin(), t.end());
12266 }
12267 return 0;
12268 }
12269
12270 int RGWRados::omap_set(rgw_raw_obj& obj, const std::string& key, bufferlist& bl)
12271 {
12272 rgw_rados_ref ref;
12273 int r = get_raw_obj_ref(obj, &ref);
12274 if (r < 0) {
12275 return r;
12276 }
12277 ldout(cct, 15) << "omap_set obj=" << obj << " key=" << key << dendl;
12278
12279 map<string, bufferlist> m;
12280 m[key] = bl;
12281
12282 r = ref.ioctx.omap_set(ref.oid, m);
12283
12284 return r;
12285 }
12286
12287 int RGWRados::omap_set(rgw_raw_obj& obj, std::map<std::string, bufferlist>& m)
12288 {
12289 rgw_rados_ref ref;
12290 int r = get_raw_obj_ref(obj, &ref);
12291 if (r < 0) {
12292 return r;
12293 }
12294
12295 r = ref.ioctx.omap_set(ref.oid, m);
12296
12297 return r;
12298 }
12299
12300 int RGWRados::omap_del(rgw_raw_obj& obj, const std::string& key)
12301 {
12302 rgw_rados_ref ref;
12303 int r = get_raw_obj_ref(obj, &ref);
12304 if (r < 0) {
12305 return r;
12306 }
12307
12308 set<string> k;
12309 k.insert(key);
12310
12311 r = ref.ioctx.omap_rm_keys(ref.oid, k);
12312 return r;
12313 }
12314
12315 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
12316 {
12317 RGWObjectCtx obj_ctx(this);
12318
12319 map<string, RGWBucketEnt>::iterator iter;
12320 for (iter = m.begin(); iter != m.end(); ++iter) {
12321 RGWBucketEnt& ent = iter->second;
12322 rgw_bucket& bucket = ent.bucket;
12323 ent.count = 0;
12324 ent.size = 0;
12325 ent.size_rounded = 0;
12326
12327 map<string, rgw_bucket_dir_header> headers;
12328
12329 RGWBucketInfo bucket_info;
12330 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
12331 if (ret < 0) {
12332 return ret;
12333 }
12334
12335 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
12336 if (r < 0)
12337 return r;
12338
12339 map<string, rgw_bucket_dir_header>::iterator hiter = headers.begin();
12340 for (; hiter != headers.end(); ++hiter) {
12341 RGWObjCategory category = main_category;
12342 map<uint8_t, struct rgw_bucket_category_stats>::iterator iter = (hiter->second.stats).find((uint8_t)category);
12343 if (iter != hiter->second.stats.end()) {
12344 struct rgw_bucket_category_stats& stats = iter->second;
12345 ent.count += stats.num_entries;
12346 ent.size += stats.total_size;
12347 ent.size_rounded += stats.total_size_rounded;
12348 }
12349 }
12350
12351 // fill in placement_rule from the bucket instance for use in swift's
12352 // per-storage policy statistics
12353 ent.placement_rule = std::move(bucket_info.placement_rule);
12354 }
12355
12356 return m.size();
12357 }
12358
12359 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
12360 {
12361 rgw_rados_ref ref;
12362 int r = get_raw_obj_ref(obj, &ref);
12363 if (r < 0) {
12364 return r;
12365 }
12366 librados::Rados *rad = get_rados_handle();
12367 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
12368
12369 r = ref.ioctx.aio_append(ref.oid, completion, bl, size);
12370 completion->release();
12371 return r;
12372 }
12373
12374 int RGWRados::distribute(const string& key, bufferlist& bl)
12375 {
12376 /*
12377 * we were called before watch was initialized. This can only happen if we're updating some system
12378 * config object (e.g., zone info) during init. Don't try to distribute the cache info for these
12379 * objects, they're currently only read on startup anyway.
12380 */
12381 if (!watch_initialized)
12382 return 0;
12383
12384 string notify_oid;
12385 pick_control_oid(key, notify_oid);
12386
12387 ldout(cct, 10) << "distributing notification oid=" << notify_oid << " bl.length()=" << bl.length() << dendl;
12388 return control_pool_ctx.notify2(notify_oid, bl, 0, NULL);
12389 }
12390
12391 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
12392 {
12393 librados::IoCtx& io_ctx = ctx.io_ctx;
12394 librados::NObjectIterator& iter = ctx.iter;
12395
12396 int r = open_pool_ctx(pool, io_ctx);
12397 if (r < 0)
12398 return r;
12399
12400 iter = io_ctx.nobjects_begin();
12401
12402 return 0;
12403 }
12404
12405 int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
12406 {
12407 librados::IoCtx& io_ctx = ctx.io_ctx;
12408 librados::NObjectIterator& iter = ctx.iter;
12409
12410 int r = open_pool_ctx(pool, io_ctx);
12411 if (r < 0)
12412 return r;
12413
12414 librados::ObjectCursor oc;
12415 if (!oc.from_str(cursor)) {
12416 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
12417 return -EINVAL;
12418 }
12419
12420 iter = io_ctx.nobjects_begin(oc);
12421
12422 return 0;
12423 }
12424
12425 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
12426 {
12427 return ctx.iter.get_cursor().to_str();
12428 }
12429
12430 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
12431 bool *is_truncated, RGWAccessListFilter *filter)
12432 {
12433 librados::IoCtx& io_ctx = ctx.io_ctx;
12434 librados::NObjectIterator& iter = ctx.iter;
12435
12436 if (iter == io_ctx.nobjects_end())
12437 return -ENOENT;
12438
12439 uint32_t i;
12440
12441 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
12442 rgw_bucket_dir_entry e;
12443
12444 string oid = iter->get_oid();
12445 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
12446
12447 // fill it in with initial values; we may correct later
12448 if (filter && !filter->filter(oid, oid))
12449 continue;
12450
12451 e.key = oid;
12452 objs.push_back(e);
12453 }
12454
12455 if (is_truncated)
12456 *is_truncated = (iter != io_ctx.nobjects_end());
12457
12458 return objs.size();
12459 }
12460 struct RGWAccessListFilterPrefix : public RGWAccessListFilter {
12461 string prefix;
12462
12463 explicit RGWAccessListFilterPrefix(const string& _prefix) : prefix(_prefix) {}
12464 bool filter(string& name, string& key) override {
12465 return (prefix.compare(key.substr(0, prefix.size())) == 0);
12466 }
12467 };
12468
12469 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
12470 {
12471 if (!ctx->initialized) {
12472 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
12473 if (r < 0) {
12474 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
12475 return r;
12476 }
12477 ctx->initialized = true;
12478 }
12479 return 0;
12480 }
12481
12482 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
12483 RGWListRawObjsCtx& ctx, list<string>& oids,
12484 bool *is_truncated)
12485 {
12486 if (!ctx.initialized) {
12487 return -EINVAL;
12488 }
12489 RGWAccessListFilterPrefix filter(prefix_filter);
12490 vector<rgw_bucket_dir_entry> objs;
12491 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
12492 if (r < 0) {
12493 if(r != -ENOENT)
12494 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
12495 return r;
12496 }
12497
12498 vector<rgw_bucket_dir_entry>::iterator iter;
12499 for (iter = objs.begin(); iter != objs.end(); ++iter) {
12500 oids.push_back(iter->key.name);
12501 }
12502
12503 return oids.size();
12504 }
12505
12506 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
12507 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
12508 bool *is_truncated)
12509 {
12510 if (!ctx.initialized) {
12511 int r = list_raw_objects_init(pool, string(), &ctx);
12512 if (r < 0) {
12513 return r;
12514 }
12515 }
12516
12517 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
12518 }
12519
12520 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
12521 {
12522 return pool_iterate_get_cursor(ctx.iter_ctx);
12523 }
12524
12525 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
12526 std::list<rgw_bi_log_entry>& result, bool *truncated)
12527 {
12528 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
12529 result.clear();
12530
12531 librados::IoCtx index_ctx;
12532 map<int, string> oids;
12533 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
12534 map<int, string> bucket_instance_ids;
12535 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id, &bucket_instance_ids);
12536 if (r < 0)
12537 return r;
12538
12539 BucketIndexShardsManager marker_mgr;
12540 bool has_shards = (oids.size() > 1 || shard_id >= 0);
12541 // If there are multiple shards for the bucket index object, the marker
12542 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
12543 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
12544 // only contain one record, and the key is the bucket instance id.
12545 r = marker_mgr.from_string(marker, shard_id);
12546 if (r < 0)
12547 return r;
12548
12549 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
12550 if (r < 0)
12551 return r;
12552
12553 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
12554 map<int, list<rgw_bi_log_entry>::iterator> vends;
12555 if (truncated) {
12556 *truncated = false;
12557 }
12558 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
12559 for (; miter != bi_log_lists.end(); ++miter) {
12560 int shard_id = miter->first;
12561 vcurrents[shard_id] = miter->second.entries.begin();
12562 vends[shard_id] = miter->second.entries.end();
12563 if (truncated) {
12564 *truncated = (*truncated || miter->second.truncated);
12565 }
12566 }
12567
12568 size_t total = 0;
12569 bool has_more = true;
12570 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
12571 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
12572 while (total < max && has_more) {
12573 has_more = false;
12574
12575 viter = vcurrents.begin();
12576 eiter = vends.begin();
12577
12578 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
12579 assert (eiter != vends.end());
12580
12581 int shard_id = viter->first;
12582 list<rgw_bi_log_entry>::iterator& liter = viter->second;
12583
12584 if (liter == eiter->second){
12585 continue;
12586 }
12587 rgw_bi_log_entry& entry = *(liter);
12588 if (has_shards) {
12589 char buf[16];
12590 snprintf(buf, sizeof(buf), "%d", shard_id);
12591 string tmp_id;
12592 build_bucket_index_marker(buf, entry.id, &tmp_id);
12593 entry.id.swap(tmp_id);
12594 }
12595 marker_mgr.add(shard_id, entry.id);
12596 result.push_back(entry);
12597 total++;
12598 has_more = true;
12599 ++liter;
12600 }
12601 }
12602
12603 if (truncated) {
12604 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
12605 assert (eiter != vends.end());
12606 *truncated = (*truncated || (viter->second != eiter->second));
12607 }
12608 }
12609
12610 // Refresh marker, if there are multiple shards, the output will look like
12611 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
12612 // if there is no sharding, the simply marker (without oid) is returned
12613 if (has_shards) {
12614 marker_mgr.to_string(&marker);
12615 } else {
12616 if (!result.empty()) {
12617 marker = result.rbegin()->id;
12618 }
12619 }
12620
12621 return 0;
12622 }
12623
12624 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
12625 {
12626 librados::IoCtx index_ctx;
12627 map<int, string> bucket_objs;
12628
12629 BucketIndexShardsManager start_marker_mgr;
12630 BucketIndexShardsManager end_marker_mgr;
12631
12632 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12633 if (r < 0) {
12634 return r;
12635 }
12636
12637 r = start_marker_mgr.from_string(start_marker, shard_id);
12638 if (r < 0) {
12639 return r;
12640 }
12641
12642 r = end_marker_mgr.from_string(end_marker, shard_id);
12643 if (r < 0) {
12644 return r;
12645 }
12646
12647 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
12648 cct->_conf->rgw_bucket_index_max_aio)();
12649
12650 return r;
12651 }
12652
12653 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12654 {
12655 librados::IoCtx index_ctx;
12656 map<int, string> bucket_objs;
12657 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12658 if (r < 0)
12659 return r;
12660
12661 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12662 }
12663
12664 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
12665 {
12666 librados::IoCtx index_ctx;
12667 map<int, string> bucket_objs;
12668 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
12669 if (r < 0)
12670 return r;
12671
12672 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
12673 }
12674
12675 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, rgw_obj& obj, rgw_bucket_dir_entry *dirent)
12676 {
12677 rgw_rados_ref ref;
12678 int r = get_obj_head_ref(bucket_info, obj, &ref);
12679 if (r < 0) {
12680 return r;
12681 }
12682
12683 rgw_cls_bi_entry bi_entry;
12684 r = bi_get(obj.bucket, obj, InstanceIdx, &bi_entry);
12685 if (r < 0 && r != -ENOENT) {
12686 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
12687 }
12688 if (r < 0) {
12689 return r;
12690 }
12691 bufferlist::iterator iter = bi_entry.data.begin();
12692 try {
12693 ::decode(*dirent, iter);
12694 } catch (buffer::error& err) {
12695 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
12696 return -EIO;
12697 }
12698
12699 return 0;
12700 }
12701
12702 int RGWRados::bi_get(rgw_bucket& bucket, rgw_obj& obj, BIIndexType index_type, rgw_cls_bi_entry *entry)
12703 {
12704 BucketShard bs(this);
12705 int ret = bs.init(bucket, obj);
12706 if (ret < 0) {
12707 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12708 return ret;
12709 }
12710
12711 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12712
12713 ret = cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
12714 if (ret < 0)
12715 return ret;
12716
12717 return 0;
12718 }
12719
12720 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
12721 {
12722 cls_rgw_bi_put(op, bs.bucket_obj, entry);
12723 }
12724
12725 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
12726 {
12727 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
12728 if (ret < 0)
12729 return ret;
12730
12731 return 0;
12732 }
12733
12734 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
12735 {
12736 BucketShard bs(this);
12737 int ret = bs.init(bucket, obj);
12738 if (ret < 0) {
12739 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12740 return ret;
12741 }
12742
12743 return bi_put(bs, entry);
12744 }
12745
12746 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12747 {
12748 rgw_obj obj(bucket, obj_name);
12749 BucketShard bs(this);
12750 int ret = bs.init(bucket, obj);
12751 if (ret < 0) {
12752 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12753 return ret;
12754 }
12755
12756 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
12757 if (ret == -ENOENT) {
12758 *is_truncated = false;
12759 }
12760 if (ret < 0)
12761 return ret;
12762
12763 return 0;
12764 }
12765
12766 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12767 {
12768 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
12769 if (ret < 0)
12770 return ret;
12771
12772 return 0;
12773 }
12774
12775 int RGWRados::bi_remove(BucketShard& bs)
12776 {
12777 int ret = bs.index_ctx.remove(bs.bucket_obj);
12778 if (ret == -ENOENT) {
12779 ret = 0;
12780 }
12781 if (ret < 0) {
12782 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
12783 return ret;
12784 }
12785
12786 return 0;
12787 }
12788
12789 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
12790 {
12791 BucketShard bs(this);
12792 int ret = bs.init(bucket, shard_id);
12793 if (ret < 0) {
12794 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
12795 return ret;
12796 }
12797
12798 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
12799 }
12800
12801 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
12802 {
12803 return gc_pool_ctx.operate(oid, op);
12804 }
12805
12806 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op)
12807 {
12808 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
12809 int r = gc_pool_ctx.aio_operate(oid, c, op);
12810 c->release();
12811 return r;
12812 }
12813
12814 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
12815 {
12816 return gc_pool_ctx.operate(oid, op, pbl);
12817 }
12818
12819 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
12820 {
12821 return gc->list(index, marker, max, expired_only, result, truncated);
12822 }
12823
12824 int RGWRados::process_gc()
12825 {
12826 return gc->process();
12827 }
12828
12829 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
12830 {
12831 return lc->list_lc_progress(marker, max_entries, progress_map);
12832 }
12833
12834 int RGWRados::process_lc()
12835 {
12836 return lc->process();
12837 }
12838
12839 int RGWRados::process_expire_objects()
12840 {
12841 obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
12842 return 0;
12843 }
12844
12845 int RGWRados::cls_rgw_init_index(librados::IoCtx& index_ctx, librados::ObjectWriteOperation& op, string& oid)
12846 {
12847 bufferlist in;
12848 cls_rgw_bucket_init(op);
12849 return index_ctx.operate(oid, &op);
12850 }
12851
12852 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
12853 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12854 {
12855 rgw_zone_set zones_trace;
12856 if (_zones_trace) {
12857 zones_trace = *_zones_trace;
12858 }
12859 else {
12860 zones_trace.insert(get_zone().id);
12861 }
12862
12863 ObjectWriteOperation o;
12864 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
12865 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12866 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), get_zone().log_data, bilog_flags, zones_trace);
12867 return bs.index_ctx.operate(bs.bucket_obj, &o);
12868 }
12869
12870 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
12871 int64_t pool, uint64_t epoch,
12872 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12873 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
12874 {
12875 ObjectWriteOperation o;
12876 rgw_bucket_dir_entry_meta dir_meta;
12877 dir_meta = ent.meta;
12878 dir_meta.category = category;
12879
12880 rgw_bucket_entry_ver ver;
12881 ver.pool = pool;
12882 ver.epoch = epoch;
12883 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
12884 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
12885 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
12886 get_zone().log_data, bilog_flags, _zones_trace);
12887 complete_op_data *arg;
12888 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
12889 get_zone().log_data, bilog_flags, _zones_trace, &arg);
12890 librados::AioCompletion *completion = arg->rados_completion;
12891 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
12892 completion->release(); /* can't reference arg here, as it might have already been released */
12893 return ret;
12894 }
12895
12896 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
12897 int64_t pool, uint64_t epoch,
12898 rgw_bucket_dir_entry& ent, RGWObjCategory category,
12899 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12900 {
12901 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
12902 }
12903
12904 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
12905 int64_t pool, uint64_t epoch,
12906 rgw_obj& obj,
12907 real_time& removed_mtime,
12908 list<rgw_obj_index_key> *remove_objs,
12909 uint16_t bilog_flags,
12910 rgw_zone_set *zones_trace)
12911 {
12912 rgw_bucket_dir_entry ent;
12913 ent.meta.mtime = removed_mtime;
12914 obj.key.get_index_key(&ent.key);
12915 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch, ent, RGW_OBJ_CATEGORY_NONE, remove_objs, bilog_flags, zones_trace);
12916 }
12917
12918 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
12919 {
12920 rgw_bucket_dir_entry ent;
12921 obj.key.get_index_key(&ent.key);
12922 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag, -1 /* pool id */, 0, ent, RGW_OBJ_CATEGORY_NONE, NULL, bilog_flags, zones_trace);
12923 }
12924
12925 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
12926 {
12927 librados::IoCtx index_ctx;
12928 map<int, string> bucket_objs;
12929 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
12930 if (r < 0)
12931 return r;
12932
12933 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
12934 }
12935
12936 int RGWRados::cls_bucket_list(RGWBucketInfo& bucket_info, int shard_id, rgw_obj_index_key& start, const string& prefix,
12937 uint32_t num_entries, bool list_versions, map<string, rgw_bucket_dir_entry>& m,
12938 bool *is_truncated, rgw_obj_index_key *last_entry,
12939 bool (*force_check_filter)(const string& name))
12940 {
12941 ldout(cct, 10) << "cls_bucket_list " << bucket_info.bucket << " start " << start.name << "[" << start.instance << "] num_entries " << num_entries << dendl;
12942
12943 librados::IoCtx index_ctx;
12944 // key - oid (for different shards if there is any)
12945 // value - list result for the corresponding oid (shard), it is filled by the AIO callback
12946 map<int, string> oids;
12947 map<int, struct rgw_cls_list_ret> list_results;
12948 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
12949 if (r < 0)
12950 return r;
12951
12952 cls_rgw_obj_key start_key(start.name, start.instance);
12953 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries, list_versions,
12954 oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
12955 if (r < 0)
12956 return r;
12957
12958 // Create a list of iterators that are used to iterate each shard
12959 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents(list_results.size());
12960 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends(list_results.size());
12961 vector<string> vnames(list_results.size());
12962 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
12963 *is_truncated = false;
12964 for (; iter != list_results.end(); ++iter) {
12965 vcurrents.push_back(iter->second.dir.m.begin());
12966 vends.push_back(iter->second.dir.m.end());
12967 vnames.push_back(oids[iter->first]);
12968 *is_truncated = (*is_truncated || iter->second.is_truncated);
12969 }
12970
12971 // Create a map to track the next candidate entry from each shard, if the entry
12972 // from a specified shard is selected/erased, the next entry from that shard will
12973 // be inserted for next round selection
12974 map<string, size_t> candidates;
12975 for (size_t i = 0; i < vcurrents.size(); ++i) {
12976 if (vcurrents[i] != vends[i]) {
12977 candidates[vcurrents[i]->first] = i;
12978 }
12979 }
12980
12981 map<string, bufferlist> updates;
12982 uint32_t count = 0;
12983 while (count < num_entries && !candidates.empty()) {
12984 r = 0;
12985 // Select the next one
12986 int pos = candidates.begin()->second;
12987 const string& name = vcurrents[pos]->first;
12988 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
12989
12990 bool force_check = force_check_filter &&
12991 force_check_filter(dirent.key.name);
12992 if ((!dirent.exists && !dirent.is_delete_marker()) ||
12993 !dirent.pending_map.empty() ||
12994 force_check) {
12995 /* there are uncommitted ops. We need to check the current state,
12996 * and if the tags are old we need to do cleanup as well. */
12997 librados::IoCtx sub_ctx;
12998 sub_ctx.dup(index_ctx);
12999 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[vnames[pos]]);
13000 if (r < 0 && r != -ENOENT) {
13001 return r;
13002 }
13003 }
13004 if (r >= 0) {
13005 ldout(cct, 10) << "RGWRados::cls_bucket_list: got " << dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
13006 m[name] = std::move(dirent);
13007 ++count;
13008 }
13009
13010 // Refresh the candidates map
13011 candidates.erase(candidates.begin());
13012 ++vcurrents[pos];
13013 if (vcurrents[pos] != vends[pos]) {
13014 candidates[vcurrents[pos]->first] = pos;
13015 }
13016 }
13017
13018 // Suggest updates if there is any
13019 map<string, bufferlist>::iterator miter = updates.begin();
13020 for (; miter != updates.end(); ++miter) {
13021 if (miter->second.length()) {
13022 ObjectWriteOperation o;
13023 cls_rgw_suggest_changes(o, miter->second);
13024 // we don't care if we lose suggested updates, send them off blindly
13025 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
13026 index_ctx.aio_operate(miter->first, c, &o);
13027 c->release();
13028 }
13029 }
13030
13031 // Check if all the returned entries are consumed or not
13032 for (size_t i = 0; i < vcurrents.size(); ++i) {
13033 if (vcurrents[i] != vends[i])
13034 *is_truncated = true;
13035 }
13036 if (!m.empty())
13037 *last_entry = m.rbegin()->first;
13038
13039 return 0;
13040 }
13041
13042 int RGWRados::cls_obj_usage_log_add(const string& oid, rgw_usage_log_info& info)
13043 {
13044 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13045
13046 rgw_rados_ref ref;
13047 int r = get_raw_obj_ref(obj, &ref);
13048 if (r < 0) {
13049 return r;
13050 }
13051
13052 ObjectWriteOperation op;
13053 cls_rgw_usage_log_add(op, info);
13054
13055 r = ref.ioctx.operate(ref.oid, &op);
13056 return r;
13057 }
13058
13059 int RGWRados::cls_obj_usage_log_read(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
13060 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage, bool *is_truncated)
13061 {
13062 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13063
13064 rgw_rados_ref ref;
13065 int r = get_raw_obj_ref(obj, &ref);
13066 if (r < 0) {
13067 return r;
13068 }
13069
13070 *is_truncated = false;
13071
13072 r = cls_rgw_usage_log_read(ref.ioctx, ref.oid, user, start_epoch, end_epoch,
13073 max_entries, read_iter, usage, is_truncated);
13074
13075 return r;
13076 }
13077
13078 int RGWRados::cls_obj_usage_log_trim(string& oid, string& user, uint64_t start_epoch, uint64_t end_epoch)
13079 {
13080 rgw_raw_obj obj(get_zone_params().usage_log_pool, oid);
13081
13082 rgw_rados_ref ref;
13083 int r = get_raw_obj_ref(obj, &ref);
13084 if (r < 0) {
13085 return r;
13086 }
13087
13088 r = cls_rgw_usage_log_trim(ref.ioctx, ref.oid, user, start_epoch, end_epoch);
13089 return r;
13090 }
13091
13092 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
13093 {
13094 librados::IoCtx index_ctx;
13095 string dir_oid;
13096
13097 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13098
13099 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
13100 if (r < 0)
13101 return r;
13102
13103 bufferlist updates;
13104
13105 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
13106 rgw_bucket_dir_entry entry;
13107 entry.key = *iter;
13108 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
13109 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
13110 updates.append(CEPH_RGW_REMOVE | suggest_flag);
13111 ::encode(entry, updates);
13112 }
13113
13114 bufferlist out;
13115
13116 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
13117
13118 return r;
13119 }
13120
13121 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
13122 const RGWBucketInfo& bucket_info,
13123 rgw_bucket_dir_entry& list_state,
13124 rgw_bucket_dir_entry& object,
13125 bufferlist& suggested_updates)
13126 {
13127 const rgw_bucket& bucket = bucket_info.bucket;
13128 uint8_t suggest_flag = (get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
13129
13130 std::string loc;
13131
13132 rgw_obj obj(bucket, list_state.key);
13133
13134 string oid;
13135 get_obj_bucket_and_oid_loc(obj, oid, loc);
13136
13137 if (loc != list_state.locator) {
13138 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
13139 }
13140
13141 io_ctx.locator_set_key(list_state.locator);
13142
13143 RGWObjState *astate = NULL;
13144 RGWObjectCtx rctx(this);
13145 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
13146 if (r < 0)
13147 return r;
13148
13149 list_state.pending_map.clear(); // we don't need this and it inflates size
13150 if (!astate->exists) {
13151 /* object doesn't exist right now -- hopefully because it's
13152 * marked as !exists and got deleted */
13153 if (list_state.exists) {
13154 /* FIXME: what should happen now? Work out if there are any
13155 * non-bad ways this could happen (there probably are, but annoying
13156 * to handle!) */
13157 }
13158 // encode a suggested removal of that key
13159 list_state.ver.epoch = io_ctx.get_last_version();
13160 list_state.ver.pool = io_ctx.get_id();
13161 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
13162 return -ENOENT;
13163 }
13164
13165 string etag;
13166 string content_type;
13167 ACLOwner owner;
13168
13169 object.meta.size = astate->size;
13170 object.meta.accounted_size = astate->accounted_size;
13171 object.meta.mtime = astate->mtime;
13172
13173 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
13174 if (iter != astate->attrset.end()) {
13175 etag = iter->second.c_str();
13176 }
13177 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
13178 if (iter != astate->attrset.end()) {
13179 content_type = iter->second.c_str();
13180 }
13181 iter = astate->attrset.find(RGW_ATTR_ACL);
13182 if (iter != astate->attrset.end()) {
13183 r = decode_policy(iter->second, &owner);
13184 if (r < 0) {
13185 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
13186 }
13187 }
13188
13189 if (astate->has_manifest) {
13190 RGWObjManifest::obj_iterator miter;
13191 RGWObjManifest& manifest = astate->manifest;
13192 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
13193 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
13194 rgw_obj loc;
13195 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
13196
13197 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
13198 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
13199 r = delete_obj_index(loc);
13200 if (r < 0) {
13201 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
13202 }
13203 }
13204 }
13205 }
13206
13207 object.meta.etag = etag;
13208 object.meta.content_type = content_type;
13209 object.meta.owner = owner.get_id().to_str();
13210 object.meta.owner_display_name = owner.get_display_name();
13211
13212 // encode suggested updates
13213 list_state.ver.pool = io_ctx.get_id();
13214 list_state.ver.epoch = astate->epoch;
13215 list_state.meta.size = object.meta.size;
13216 list_state.meta.accounted_size = object.meta.accounted_size;
13217 list_state.meta.mtime = object.meta.mtime;
13218 list_state.meta.category = main_category;
13219 list_state.meta.etag = etag;
13220 list_state.meta.content_type = content_type;
13221 if (astate->obj_tag.length() > 0)
13222 list_state.tag = astate->obj_tag.c_str();
13223 list_state.meta.owner = owner.get_id().to_str();
13224 list_state.meta.owner_display_name = owner.get_display_name();
13225
13226 list_state.exists = true;
13227 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
13228 return 0;
13229 }
13230
13231 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, map<string, struct rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
13232 {
13233 librados::IoCtx index_ctx;
13234 map<int, string> oids;
13235 map<int, struct rgw_cls_list_ret> list_results;
13236 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
13237 if (r < 0)
13238 return r;
13239
13240 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
13241 if (r < 0)
13242 return r;
13243
13244 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
13245 for(; iter != list_results.end(); ++iter) {
13246 headers[oids[iter->first]] = iter->second.dir.header;
13247 }
13248 return 0;
13249 }
13250
13251 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
13252 {
13253 librados::IoCtx index_ctx;
13254 map<int, string> bucket_objs;
13255 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
13256 if (r < 0)
13257 return r;
13258
13259 map<int, string>::iterator iter = bucket_objs.begin();
13260 for (; iter != bucket_objs.end(); ++iter) {
13261 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
13262 if (r < 0) {
13263 ctx->put();
13264 break;
13265 } else {
13266 (*num_aio)++;
13267 }
13268 }
13269 return r;
13270 }
13271
13272 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
13273 {
13274 string buckets_obj_id;
13275 rgw_get_buckets_obj(user_id, buckets_obj_id);
13276 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13277
13278 rgw_rados_ref ref;
13279 int r = get_raw_obj_ref(obj, &ref);
13280 if (r < 0) {
13281 return r;
13282 }
13283
13284 librados::ObjectReadOperation op;
13285 int rc;
13286 ::cls_user_get_header(op, header, &rc);
13287 bufferlist ibl;
13288 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13289 if (r < 0)
13290 return r;
13291 if (rc < 0)
13292 return rc;
13293
13294 return 0;
13295 }
13296
13297 int RGWRados::cls_user_reset_stats(const string& user_id)
13298 {
13299 string buckets_obj_id;
13300 rgw_get_buckets_obj(user_id, buckets_obj_id);
13301 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13302
13303 rgw_rados_ref ref;
13304 int r = get_raw_obj_ref(obj, &ref);
13305 if (r < 0) {
13306 return r;
13307 }
13308
13309 librados::ObjectWriteOperation op;
13310 ::cls_user_reset_stats(op);
13311 return ref.ioctx.operate(ref.oid, &op);
13312 }
13313
13314 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
13315 {
13316 string buckets_obj_id;
13317 rgw_get_buckets_obj(user_id, buckets_obj_id);
13318 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13319
13320 rgw_rados_ref ref;
13321 int r = get_raw_obj_ref(obj, &ref);
13322 if (r < 0) {
13323 return r;
13324 }
13325
13326 r = ::cls_user_get_header_async(ref.ioctx, ref.oid, ctx);
13327 if (r < 0)
13328 return r;
13329
13330 return 0;
13331 }
13332
13333 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj, const RGWBucketInfo& bucket_info)
13334 {
13335 map<string, struct rgw_bucket_dir_header> headers;
13336 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13337 if (r < 0) {
13338 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
13339 return r;
13340 }
13341
13342 cls_user_bucket_entry entry;
13343
13344 bucket_info.bucket.convert(&entry.bucket);
13345
13346 for (const auto& hiter : headers) {
13347 for (const auto& iter : hiter.second.stats) {
13348 const struct rgw_bucket_category_stats& header_stats = iter.second;
13349 entry.size += header_stats.total_size;
13350 entry.size_rounded += header_stats.total_size_rounded;
13351 entry.count += header_stats.num_entries;
13352 }
13353 }
13354
13355 list<cls_user_bucket_entry> entries;
13356 entries.push_back(entry);
13357
13358 r = cls_user_update_buckets(user_obj, entries, false);
13359 if (r < 0) {
13360 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
13361 return r;
13362 }
13363
13364 return 0;
13365 }
13366
13367 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
13368 {
13369 map<string, struct rgw_bucket_dir_header> headers;
13370 RGWBucketInfo bucket_info;
13371 RGWObjectCtx obj_ctx(this);
13372 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
13373 if (ret < 0) {
13374 return ret;
13375 }
13376
13377 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
13378 if (ret < 0) {
13379 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
13380 return ret;
13381 }
13382
13383 bucket.convert(&entry.bucket);
13384
13385 for (const auto& hiter : headers) {
13386 for (const auto& iter : hiter.second.stats) {
13387 const struct rgw_bucket_category_stats& header_stats = iter.second;
13388 entry.size += header_stats.total_size;
13389 entry.size_rounded += header_stats.total_size_rounded;
13390 entry.count += header_stats.num_entries;
13391 }
13392 }
13393
13394 return 0;
13395 }
13396
13397 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
13398 const string& in_marker,
13399 const string& end_marker,
13400 const int max_entries,
13401 list<cls_user_bucket_entry>& entries,
13402 string * const out_marker,
13403 bool * const truncated)
13404 {
13405 rgw_rados_ref ref;
13406 int r = get_raw_obj_ref(obj, &ref);
13407 if (r < 0) {
13408 return r;
13409 }
13410
13411 librados::ObjectReadOperation op;
13412 int rc;
13413
13414 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
13415 bufferlist ibl;
13416 r = ref.ioctx.operate(ref.oid, &op, &ibl);
13417 if (r < 0)
13418 return r;
13419 if (rc < 0)
13420 return rc;
13421
13422 return 0;
13423 }
13424
13425 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
13426 {
13427 rgw_rados_ref ref;
13428 int r = get_raw_obj_ref(obj, &ref);
13429 if (r < 0) {
13430 return r;
13431 }
13432
13433 librados::ObjectWriteOperation op;
13434 cls_user_set_buckets(op, entries, add);
13435 r = ref.ioctx.operate(ref.oid, &op);
13436 if (r < 0)
13437 return r;
13438
13439 return 0;
13440 }
13441
13442 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
13443 {
13444 string buckets_obj_id;
13445 rgw_get_buckets_obj(user_id, buckets_obj_id);
13446 rgw_raw_obj obj(get_zone_params().user_uid_pool, buckets_obj_id);
13447 return cls_user_complete_stats_sync(obj);
13448 }
13449
13450 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
13451 {
13452 rgw_rados_ref ref;
13453 int r = get_raw_obj_ref(obj, &ref);
13454 if (r < 0) {
13455 return r;
13456 }
13457
13458 librados::ObjectWriteOperation op;
13459 ::cls_user_complete_stats_sync(op);
13460 r = ref.ioctx.operate(ref.oid, &op);
13461 if (r < 0)
13462 return r;
13463
13464 return 0;
13465 }
13466
13467 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
13468 {
13469 list<cls_user_bucket_entry> l;
13470 l.push_back(entry);
13471
13472 return cls_user_update_buckets(obj, l, true);
13473 }
13474
13475 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
13476 {
13477 rgw_rados_ref ref;
13478 int r = get_system_obj_ref(obj, &ref);
13479 if (r < 0) {
13480 return r;
13481 }
13482
13483 librados::ObjectWriteOperation op;
13484 ::cls_user_remove_bucket(op, bucket);
13485 r = ref.ioctx.operate(ref.oid, &op);
13486 if (r < 0)
13487 return r;
13488
13489 return 0;
13490 }
13491
13492 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
13493 RGWQuotaInfo& bucket_quota)
13494 {
13495 if (!cct->_conf->rgw_dynamic_resharding) {
13496 return 0;
13497 }
13498
13499 bool need_resharding = false;
13500 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13501 uint32_t suggested_num_shards;
13502
13503 int ret = quota_handler->check_bucket_shards((uint64_t)cct->_conf->rgw_max_objs_per_shard,
13504 num_source_shards, bucket_info.owner, bucket, bucket_quota,
13505 1, need_resharding, &suggested_num_shards);
13506 if (ret < 0) {
13507 return ret;
13508 }
13509
13510 if (need_resharding) {
13511 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
13512 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
13513 dendl;
13514 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
13515 }
13516
13517 return ret;
13518 }
13519
13520 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
13521 {
13522 RGWReshard reshard(this);
13523
13524 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
13525
13526 new_num_shards = min(new_num_shards, get_max_bucket_shards());
13527 if (new_num_shards <= num_source_shards) {
13528 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
13529 return 0;
13530 }
13531
13532 cls_rgw_reshard_entry entry;
13533 entry.time = real_clock::now();
13534 entry.tenant = bucket_info.owner.tenant;
13535 entry.bucket_name = bucket_info.bucket.name;
13536 entry.bucket_id = bucket_info.bucket.bucket_id;
13537 entry.old_num_shards = num_source_shards;
13538 entry.new_num_shards = new_num_shards;
13539
13540 return reshard.add(entry);
13541 }
13542
13543 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
13544 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size)
13545 {
13546 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
13547 }
13548
13549 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
13550 uint32_t num_shards, map<int, string>& bucket_objects, int shard_id)
13551 {
13552 if (!num_shards) {
13553 bucket_objects[0] = bucket_oid_base;
13554 } else {
13555 char buf[bucket_oid_base.size() + 32];
13556 if (shard_id < 0) {
13557 for (uint32_t i = 0; i < num_shards; ++i) {
13558 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
13559 bucket_objects[i] = buf;
13560 }
13561 } else {
13562 if ((uint32_t)shard_id > num_shards) {
13563 return;
13564 }
13565 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13566 bucket_objects[shard_id] = buf;
13567 }
13568 }
13569 }
13570
13571 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
13572 {
13573 const rgw_bucket& bucket = bucket_info.bucket;
13574 string plain_id = bucket.name + ":" + bucket.bucket_id;
13575 if (!bucket_info.num_shards) {
13576 (*result)[0] = plain_id;
13577 } else {
13578 char buf[16];
13579 if (shard_id < 0) {
13580 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
13581 snprintf(buf, sizeof(buf), ":%d", i);
13582 (*result)[i] = plain_id + buf;
13583 }
13584 } else {
13585 if ((uint32_t)shard_id > bucket_info.num_shards) {
13586 return;
13587 }
13588 snprintf(buf, sizeof(buf), ":%d", shard_id);
13589 (*result)[shard_id] = plain_id + buf;
13590 }
13591 }
13592 }
13593
13594 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
13595 int *shard_id)
13596 {
13597 int r = 0;
13598 switch (bucket_info.bucket_index_shard_hash_type) {
13599 case RGWBucketInfo::MOD:
13600 if (!bucket_info.num_shards) {
13601 if (shard_id) {
13602 *shard_id = -1;
13603 }
13604 } else {
13605 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13606 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13607 sid = rgw_shards_mod(sid2, bucket_info.num_shards);
13608 if (shard_id) {
13609 *shard_id = (int)sid;
13610 }
13611 }
13612 break;
13613 default:
13614 r = -ENOTSUP;
13615 }
13616 return r;
13617 }
13618
13619 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
13620 int shard_id, string *bucket_obj)
13621 {
13622 if (!num_shards) {
13623 // By default with no sharding, we use the bucket oid as itself
13624 (*bucket_obj) = bucket_oid_base;
13625 } else {
13626 char buf[bucket_oid_base.size() + 32];
13627 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
13628 (*bucket_obj) = buf;
13629 }
13630 }
13631
13632 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
13633 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
13634 {
13635 int r = 0;
13636 switch (hash_type) {
13637 case RGWBucketInfo::MOD:
13638 if (!num_shards) {
13639 // By default with no sharding, we use the bucket oid as itself
13640 (*bucket_obj) = bucket_oid_base;
13641 if (shard_id) {
13642 *shard_id = -1;
13643 }
13644 } else {
13645 uint32_t sid = ceph_str_hash_linux(obj_key.c_str(), obj_key.size());
13646 uint32_t sid2 = sid ^ ((sid & 0xFF) << 24);
13647 sid = rgw_shards_mod(sid2, num_shards);
13648 char buf[bucket_oid_base.size() + 32];
13649 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
13650 (*bucket_obj) = buf;
13651 if (shard_id) {
13652 *shard_id = (int)sid;
13653 }
13654 }
13655 break;
13656 default:
13657 r = -ENOTSUP;
13658 }
13659 return r;
13660 }
13661
13662 void RGWStateLog::oid_str(int shard, string& oid) {
13663 oid = RGW_STATELOG_OBJ_PREFIX + module_name + ".";
13664 char buf[16];
13665 snprintf(buf, sizeof(buf), "%d", shard);
13666 oid += buf;
13667 }
13668
13669 int RGWStateLog::get_shard_num(const string& object) {
13670 uint32_t val = ceph_str_hash_linux(object.c_str(), object.length());
13671 return val % num_shards;
13672 }
13673
13674 string RGWStateLog::get_oid(const string& object) {
13675 int shard = get_shard_num(object);
13676 string oid;
13677 oid_str(shard, oid);
13678 return oid;
13679 }
13680
13681 int RGWStateLog::open_ioctx(librados::IoCtx& ioctx) {
13682 rgw_pool pool;
13683 store->get_log_pool(pool);
13684 int r = rgw_init_ioctx(store->get_rados_handle(), pool, ioctx);
13685 if (r < 0) {
13686 lderr(store->ctx()) << "ERROR: could not open rados pool" << dendl;
13687 return r;
13688 }
13689 return 0;
13690 }
13691
13692 int RGWStateLog::store_entry(const string& client_id, const string& op_id, const string& object,
13693 uint32_t state, bufferlist *bl, uint32_t *check_state)
13694 {
13695 if (client_id.empty() ||
13696 op_id.empty() ||
13697 object.empty()) {
13698 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13699 }
13700
13701 librados::IoCtx ioctx;
13702 int r = open_ioctx(ioctx);
13703 if (r < 0)
13704 return r;
13705
13706 string oid = get_oid(object);
13707
13708 librados::ObjectWriteOperation op;
13709 if (check_state) {
13710 cls_statelog_check_state(op, client_id, op_id, object, *check_state);
13711 }
13712 utime_t ts = ceph_clock_now();
13713 bufferlist nobl;
13714 cls_statelog_add(op, client_id, op_id, object, ts, state, (bl ? *bl : nobl));
13715 r = ioctx.operate(oid, &op);
13716 if (r < 0) {
13717 return r;
13718 }
13719
13720 return 0;
13721 }
13722
13723 int RGWStateLog::remove_entry(const string& client_id, const string& op_id, const string& object)
13724 {
13725 if (client_id.empty() ||
13726 op_id.empty() ||
13727 object.empty()) {
13728 ldout(store->ctx(), 0) << "client_id / op_id / object is empty" << dendl;
13729 }
13730
13731 librados::IoCtx ioctx;
13732 int r = open_ioctx(ioctx);
13733 if (r < 0)
13734 return r;
13735
13736 string oid = get_oid(object);
13737
13738 librados::ObjectWriteOperation op;
13739 cls_statelog_remove_by_object(op, object, op_id);
13740 r = ioctx.operate(oid, &op);
13741 if (r < 0) {
13742 return r;
13743 }
13744
13745 return 0;
13746 }
13747
13748 void RGWStateLog::init_list_entries(const string& client_id, const string& op_id, const string& object,
13749 void **handle)
13750 {
13751 list_state *state = new list_state;
13752 state->client_id = client_id;
13753 state->op_id = op_id;
13754 state->object = object;
13755 if (object.empty()) {
13756 state->cur_shard = 0;
13757 state->max_shard = num_shards - 1;
13758 } else {
13759 state->cur_shard = state->max_shard = get_shard_num(object);
13760 }
13761 *handle = (void *)state;
13762 }
13763
13764 int RGWStateLog::list_entries(void *handle, int max_entries,
13765 list<cls_statelog_entry>& entries,
13766 bool *done)
13767 {
13768 list_state *state = static_cast<list_state *>(handle);
13769
13770 librados::IoCtx ioctx;
13771 int r = open_ioctx(ioctx);
13772 if (r < 0)
13773 return r;
13774
13775 entries.clear();
13776
13777 for (; state->cur_shard <= state->max_shard && max_entries > 0; ++state->cur_shard) {
13778 string oid;
13779 oid_str(state->cur_shard, oid);
13780
13781 librados::ObjectReadOperation op;
13782 list<cls_statelog_entry> ents;
13783 bool truncated;
13784 cls_statelog_list(op, state->client_id, state->op_id, state->object, state->marker,
13785 max_entries, ents, &state->marker, &truncated);
13786 bufferlist ibl;
13787 r = ioctx.operate(oid, &op, &ibl);
13788 if (r == -ENOENT) {
13789 truncated = false;
13790 r = 0;
13791 }
13792 if (r < 0) {
13793 ldout(store->ctx(), 0) << "cls_statelog_list returned " << r << dendl;
13794 return r;
13795 }
13796
13797 if (!truncated) {
13798 state->marker.clear();
13799 }
13800
13801 max_entries -= ents.size();
13802
13803 entries.splice(entries.end(), ents);
13804
13805 if (truncated)
13806 break;
13807 }
13808
13809 *done = (state->cur_shard > state->max_shard);
13810
13811 return 0;
13812 }
13813
13814 void RGWStateLog::finish_list_entries(void *handle)
13815 {
13816 list_state *state = static_cast<list_state *>(handle);
13817 delete state;
13818 }
13819
13820 void RGWStateLog::dump_entry(const cls_statelog_entry& entry, Formatter *f)
13821 {
13822 f->open_object_section("statelog_entry");
13823 f->dump_string("client_id", entry.client_id);
13824 f->dump_string("op_id", entry.op_id);
13825 f->dump_string("object", entry.object);
13826 entry.timestamp.gmtime_nsec(f->dump_stream("timestamp"));
13827 if (!dump_entry_internal(entry, f)) {
13828 f->dump_int("state", entry.state);
13829 }
13830 f->close_section();
13831 }
13832
13833 RGWOpState::RGWOpState(RGWRados *_store) : RGWStateLog(_store, _store->ctx()->_conf->rgw_num_zone_opstate_shards, string("obj_opstate"))
13834 {
13835 }
13836
13837 bool RGWOpState::dump_entry_internal(const cls_statelog_entry& entry, Formatter *f)
13838 {
13839 string s;
13840 switch ((OpState)entry.state) {
13841 case OPSTATE_UNKNOWN:
13842 s = "unknown";
13843 break;
13844 case OPSTATE_IN_PROGRESS:
13845 s = "in-progress";
13846 break;
13847 case OPSTATE_COMPLETE:
13848 s = "complete";
13849 break;
13850 case OPSTATE_ERROR:
13851 s = "error";
13852 break;
13853 case OPSTATE_ABORT:
13854 s = "abort";
13855 break;
13856 case OPSTATE_CANCELLED:
13857 s = "cancelled";
13858 break;
13859 default:
13860 s = "invalid";
13861 }
13862 f->dump_string("state", s);
13863 return true;
13864 }
13865
13866 int RGWOpState::state_from_str(const string& s, OpState *state)
13867 {
13868 if (s == "unknown") {
13869 *state = OPSTATE_UNKNOWN;
13870 } else if (s == "in-progress") {
13871 *state = OPSTATE_IN_PROGRESS;
13872 } else if (s == "complete") {
13873 *state = OPSTATE_COMPLETE;
13874 } else if (s == "error") {
13875 *state = OPSTATE_ERROR;
13876 } else if (s == "abort") {
13877 *state = OPSTATE_ABORT;
13878 } else if (s == "cancelled") {
13879 *state = OPSTATE_CANCELLED;
13880 } else {
13881 return -EINVAL;
13882 }
13883
13884 return 0;
13885 }
13886
13887 int RGWOpState::set_state(const string& client_id, const string& op_id, const string& object, OpState state)
13888 {
13889 uint32_t s = (uint32_t)state;
13890 return store_entry(client_id, op_id, object, s, NULL, NULL);
13891 }
13892
13893 int RGWOpState::renew_state(const string& client_id, const string& op_id, const string& object, OpState state)
13894 {
13895 uint32_t s = (uint32_t)state;
13896 return store_entry(client_id, op_id, object, s, NULL, &s);
13897 }
13898
13899 RGWOpStateSingleOp::RGWOpStateSingleOp(RGWRados *store, const string& cid, const string& oid,
13900 const string& obj) : os(store), client_id(cid), op_id(oid), object(obj)
13901 {
13902 cct = store->ctx();
13903 cur_state = RGWOpState::OPSTATE_UNKNOWN;
13904 }
13905
13906 int RGWOpStateSingleOp::set_state(RGWOpState::OpState state) {
13907 last_update = real_clock::now();
13908 cur_state = state;
13909 return os.set_state(client_id, op_id, object, state);
13910 }
13911
13912 int RGWOpStateSingleOp::renew_state() {
13913 real_time now = real_clock::now();
13914
13915 int rate_limit_sec = cct->_conf->rgw_opstate_ratelimit_sec;
13916
13917 if (rate_limit_sec && now - last_update < make_timespan(rate_limit_sec)) {
13918 return 0;
13919 }
13920
13921 last_update = now;
13922 return os.renew_state(client_id, op_id, object, cur_state);
13923 }
13924
13925
13926 uint64_t RGWRados::instance_id()
13927 {
13928 return get_rados_handle()->get_instance_id();
13929 }
13930
13931 uint64_t RGWRados::next_bucket_id()
13932 {
13933 Mutex::Locker l(bucket_id_lock);
13934 return ++max_bucket_id;
13935 }
13936
13937 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
13938 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
13939 {
13940 RGWRados *store = NULL;
13941 if (!use_cache) {
13942 store = new RGWRados;
13943 } else {
13944 store = new RGWCache<RGWRados>;
13945 }
13946
13947 if (store->initialize(cct, use_gc_thread, use_lc_thread, quota_threads, run_sync_thread, run_reshard_thread) < 0) {
13948 delete store;
13949 return NULL;
13950 }
13951
13952 return store;
13953 }
13954
13955 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
13956 {
13957 RGWRados *store = NULL;
13958 store = new RGWRados;
13959
13960 store->set_context(cct);
13961
13962 if (store->init_rados() < 0) {
13963 delete store;
13964 return NULL;
13965 }
13966
13967 return store;
13968 }
13969
13970 void RGWStoreManager::close_storage(RGWRados *store)
13971 {
13972 if (!store)
13973 return;
13974
13975 store->finalize();
13976
13977 delete store;
13978 }
13979
13980 librados::Rados* RGWRados::get_rados_handle()
13981 {
13982 if (rados.size() == 1) {
13983 return &rados[0];
13984 } else {
13985 handle_lock.get_read();
13986 pthread_t id = pthread_self();
13987 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
13988
13989 if (it != rados_map.end()) {
13990 handle_lock.put_read();
13991 return &rados[it->second];
13992 } else {
13993 handle_lock.put_read();
13994 handle_lock.get_write();
13995 const uint32_t handle = next_rados_handle;
13996 rados_map[id] = handle;
13997 if (++next_rados_handle == rados.size()) {
13998 next_rados_handle = 0;
13999 }
14000 handle_lock.put_write();
14001 return &rados[handle];
14002 }
14003 }
14004 }
14005
14006 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
14007 {
14008 rgw_rados_ref ref;
14009 int ret = get_raw_obj_ref(obj, &ref);
14010 if (ret < 0) {
14011 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14012 return ret;
14013 }
14014
14015 ObjectWriteOperation op;
14016 list<string> prefixes;
14017 cls_rgw_remove_obj(op, prefixes);
14018
14019 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14020 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14021 if (ret < 0) {
14022 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14023 c->release();
14024 return ret;
14025 }
14026
14027 handles.push_back(c);
14028
14029 return 0;
14030 }
14031
14032 int RGWRados::delete_obj_aio(const rgw_obj& obj,
14033 RGWBucketInfo& bucket_info, RGWObjState *astate,
14034 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
14035 {
14036 rgw_rados_ref ref;
14037 int ret = get_obj_head_ref(bucket_info, obj, &ref);
14038 if (ret < 0) {
14039 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
14040 return ret;
14041 }
14042
14043 if (keep_index_consistent) {
14044 RGWRados::Bucket bop(this, bucket_info);
14045 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
14046
14047 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
14048 if (ret < 0) {
14049 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
14050 return ret;
14051 }
14052 }
14053
14054 ObjectWriteOperation op;
14055 list<string> prefixes;
14056 cls_rgw_remove_obj(op, prefixes);
14057
14058 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
14059 ret = ref.ioctx.aio_operate(ref.oid, c, &op);
14060 if (ret < 0) {
14061 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
14062 c->release();
14063 return ret;
14064 }
14065
14066 handles.push_back(c);
14067
14068 if (keep_index_consistent) {
14069 ret = delete_obj_index(obj);
14070 if (ret < 0) {
14071 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
14072 return ret;
14073 }
14074 }
14075 return ret;
14076 }
14077
14078 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
14079 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
14080 if (value != attrs.end()) {
14081 bufferlist::iterator bliter = value->second.begin();
14082 try {
14083 ::decode(cs_info, bliter);
14084 } catch (buffer::error& err) {
14085 return -EIO;
14086 }
14087 if (cs_info.blocks.size() == 0) {
14088 return -EIO;
14089 }
14090 if (cs_info.compression_type != "none")
14091 need_decompress = true;
14092 else
14093 need_decompress = false;
14094 return 0;
14095 } else {
14096 need_decompress = false;
14097 return 0;
14098 }
14099 }
14100
14101 bool RGWRados::call(std::string command, cmdmap_t& cmdmap, std::string format,
14102 bufferlist& out)
14103 {
14104 if (command == "cache list") {
14105 boost::optional<std::string> filter;
14106 auto i = cmdmap.find("filter");
14107 if (i != cmdmap.cend()) {
14108 filter = boost::get<std::string>(i->second);
14109 }
14110 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
14111 if (f) {
14112 f->open_array_section("cache_entries");
14113 call_list(filter, f.get());
14114 f->close_section();
14115 f->flush(out);
14116 return true;
14117 } else {
14118 out.append("Unable to create Formatter.\n");
14119 return false;
14120 }
14121 } else if (command == "cache inspect") {
14122 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
14123 if (f) {
14124 const auto& target = boost::get<std::string>(cmdmap["target"]);
14125 if (call_inspect(target, f.get())) {
14126 f->flush(out);
14127 return true;
14128 } else {
14129 out.append(string("Unable to find entry ") + target + string(".\n"));
14130 return false;
14131 }
14132 } else {
14133 out.append("Unable to create Formatter.\n");
14134 return false;
14135 }
14136 } else if (command == "cache erase") {
14137 const auto& target = boost::get<std::string>(cmdmap["target"]);
14138 if (call_erase(target)) {
14139 return true;
14140 } else {
14141 out.append(string("Unable to find entry ") + target + string(".\n"));
14142 return false;
14143 }
14144 } else if (command == "cache zap") {
14145 call_zap();
14146 return true;
14147 }
14148 return false;
14149 }
14150
14151 void RGWRados::call_list(const boost::optional<std::string>&,
14152 ceph::Formatter*)
14153 {
14154 return;
14155 }
14156
14157 bool RGWRados::call_inspect(const std::string&, Formatter*)
14158 {
14159 return false;
14160 }
14161
14162 bool RGWRados::call_erase(const std::string&) {
14163 return false;
14164 }
14165
14166 void RGWRados::call_zap() {
14167 return;
14168 }