]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
use the buster suite for getting the source package for now
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
9f95a23c 2// vim: ts=8 sw=2 smarttab ft=cpp
7c673cae 3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
9f95a23c
TL
8#include <sstream>
9
7c673cae 10#include <boost/algorithm/string.hpp>
11fdf7f2 11#include <string_view>
7c673cae 12
11fdf7f2 13#include <boost/container/flat_set.hpp>
7c673cae
FG
14#include <boost/format.hpp>
15#include <boost/optional.hpp>
16#include <boost/utility/in_place_factory.hpp>
17
18#include "common/ceph_json.h"
7c673cae
FG
19
20#include "common/errno.h"
21#include "common/Formatter.h"
22#include "common/Throttle.h"
7c673cae 23
9f95a23c 24#include "rgw_sal.h"
11fdf7f2 25#include "rgw_zone.h"
7c673cae
FG
26#include "rgw_cache.h"
27#include "rgw_acl.h"
28#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
11fdf7f2 29#include "rgw_aio_throttle.h"
7c673cae
FG
30#include "rgw_bucket.h"
31#include "rgw_rest_conn.h"
32#include "rgw_cr_rados.h"
33#include "rgw_cr_rest.h"
f67539c2 34#include "rgw_datalog.h"
11fdf7f2 35#include "rgw_putobj_processor.h"
7c673cae
FG
36
37#include "cls/rgw/cls_rgw_ops.h"
7c673cae
FG
38#include "cls/rgw/cls_rgw_client.h"
39#include "cls/rgw/cls_rgw_const.h"
40#include "cls/refcount/cls_refcount_client.h"
41#include "cls/version/cls_version_client.h"
c07f9fc5 42#include "osd/osd_types.h"
7c673cae
FG
43
44#include "rgw_tools.h"
45#include "rgw_coroutine.h"
46#include "rgw_compression.h"
adb31ebb 47#include "rgw_etag_verifier.h"
9f95a23c 48#include "rgw_worker.h"
f67539c2 49#include "rgw_notify.h"
7c673cae 50
7c673cae
FG
51#undef fork // fails to compile RGWPeriod::fork() below
52
53#include "common/Clock.h"
54
7c673cae
FG
55using namespace librados;
56
#include <atomic>
#include <iostream>
#include <list>
#include <map>
#include <shared_mutex>
#include <string>
#include <vector>
11fdf7f2 63#include "include/random.h"
7c673cae
FG
64
65#include "rgw_gc.h"
66#include "rgw_lc.h"
67
68#include "rgw_object_expirer_core.h"
69#include "rgw_sync.h"
81eedcae 70#include "rgw_sync_counters.h"
11fdf7f2 71#include "rgw_sync_trace.h"
9f95a23c
TL
72#include "rgw_trim_datalog.h"
73#include "rgw_trim_mdlog.h"
7c673cae
FG
74#include "rgw_data_sync.h"
75#include "rgw_realm_watcher.h"
31f18b77 76#include "rgw_reshard.h"
7c673cae 77
11fdf7f2
TL
78#include "services/svc_zone.h"
79#include "services/svc_zone_utils.h"
80#include "services/svc_quota.h"
81#include "services/svc_sync_modules.h"
82#include "services/svc_sys_obj.h"
83#include "services/svc_sys_obj_cache.h"
9f95a23c
TL
84#include "services/svc_bucket.h"
85#include "services/svc_mdlog.h"
11fdf7f2 86
7c673cae
FG
87#include "compressor/Compressor.h"
88
11fdf7f2
TL
89#ifdef WITH_LTTNG
90#define TRACEPOINT_DEFINE
91#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
92#include "tracing/rgw_rados.h"
93#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
94#undef TRACEPOINT_DEFINE
95#else
96#define tracepoint(...)
97#endif
98
7c673cae
FG
99#define dout_context g_ceph_context
100#define dout_subsys ceph_subsys_rgw
101
7c673cae 102
7c673cae 103static string shadow_ns = "shadow";
7c673cae
FG
104static string default_bucket_index_pool_suffix = "rgw.buckets.index";
105static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
11fdf7f2 106
11fdf7f2 107static RGWObjCategory main_category = RGWObjCategory::Main;
7c673cae 108#define RGW_USAGE_OBJ_PREFIX "usage."
7c673cae
FG
109
110#define dout_subsys ceph_subsys_rgw
111
112
113static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
114 const rgw_placement_rule& head_placement_rule,
115 const rgw_obj& obj, rgw_pool *pool)
7c673cae 116{
11fdf7f2 117 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
7c673cae 118 RGWZonePlacementInfo placement;
11fdf7f2 119 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
7c673cae
FG
120 return false;
121 }
122
123 if (!obj.in_extra_data) {
11fdf7f2 124 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
7c673cae 125 } else {
31f18b77 126 *pool = placement.get_data_extra_pool();
7c673cae
FG
127 }
128 }
129
130 return true;
131}
132
133static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
134 const rgw_placement_rule& head_placement_rule,
135 const rgw_obj& obj, rgw_raw_obj *raw_obj)
7c673cae
FG
136{
137 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
138
11fdf7f2 139 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
7c673cae
FG
140}
141
142rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
143{
144 if (!is_raw) {
145 rgw_raw_obj r;
146 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
147 return r;
148 }
149 return raw_obj;
150}
151
f67539c2 152rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RGWStore* store) const
7c673cae
FG
153{
154 if (!is_raw) {
155 rgw_raw_obj r;
f67539c2 156 store->get_raw_obj(placement_rule, obj, &r);
7c673cae
FG
157 return r;
158 }
159 return raw_obj;
160}
161
11fdf7f2
TL
162void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
163{
164 obj_version *check_objv = version_for_check();
7c673cae 165
11fdf7f2
TL
166 if (check_objv) {
167 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae
FG
168 }
169
11fdf7f2 170 cls_version_read(*op, &read_version);
7c673cae
FG
171}
172
11fdf7f2
TL
173void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
174{
175 obj_version *check_objv = version_for_check();
176 obj_version *modify_version = version_for_write();
7c673cae 177
11fdf7f2
TL
178 if (check_objv) {
179 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae 180 }
7c673cae 181
11fdf7f2
TL
182 if (modify_version) {
183 cls_version_set(*op, *modify_version);
184 } else {
185 cls_version_inc(*op);
7c673cae 186 }
7c673cae
FG
187}
188
f91f0fd5
TL
189void RGWObjVersionTracker::apply_write()
190{
191 const bool checked = (read_version.ver != 0);
192 const bool incremented = (write_version.ver == 0);
193
194 if (checked && incremented) {
195 // apply cls_version_inc() so our next operation can recheck it
196 ++read_version.ver;
197 } else {
198 read_version = write_version;
199 }
200 write_version = obj_version();
201}
202
9f95a23c 203RGWObjState::RGWObjState() {
7c673cae
FG
204}
205
9f95a23c 206RGWObjState::~RGWObjState() {
7c673cae
FG
207}
208
9f95a23c
TL
209RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
210 is_atomic = rhs.is_atomic;
211 has_attrs = rhs.has_attrs;
212 exists = rhs.exists;
213 size = rhs.size;
214 accounted_size = rhs.accounted_size;
215 mtime = rhs.mtime;
216 epoch = rhs.epoch;
217 if (rhs.obj_tag.length()) {
218 obj_tag = rhs.obj_tag;
7c673cae 219 }
9f95a23c
TL
220 if (rhs.tail_tag.length()) {
221 tail_tag = rhs.tail_tag;
7c673cae 222 }
9f95a23c
TL
223 write_tag = rhs.write_tag;
224 fake_tag = rhs.fake_tag;
225 manifest = rhs.manifest;
226 shadow_obj = rhs.shadow_obj;
227 has_data = rhs.has_data;
228 if (rhs.data.length()) {
229 data = rhs.data;
7c673cae 230 }
9f95a23c
TL
231 prefetch_data = rhs.prefetch_data;
232 keep_tail = rhs.keep_tail;
233 is_olh = rhs.is_olh;
234 objv_tracker = rhs.objv_tracker;
235 pg_ver = rhs.pg_ver;
7c673cae
FG
236}
237
9f95a23c
TL
238RGWObjState *RGWObjectCtx::get_state(const rgw_obj& obj) {
239 RGWObjState *result;
240 typename std::map<rgw_obj, RGWObjState>::iterator iter;
241 lock.lock_shared();
242 assert (!obj.empty());
243 iter = objs_state.find(obj);
244 if (iter != objs_state.end()) {
245 result = &iter->second;
246 lock.unlock_shared();
247 } else {
248 lock.unlock_shared();
249 lock.lock();
250 result = &objs_state[obj];
251 lock.unlock();
224ce89b 252 }
9f95a23c 253 return result;
7c673cae
FG
254}
255
9f95a23c
TL
256void RGWObjectCtx::set_atomic(rgw_obj& obj) {
257 std::unique_lock wl{lock};
258 assert (!obj.empty());
259 objs_state[obj].is_atomic = true;
7c673cae 260}
9f95a23c
TL
261void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
262 std::unique_lock wl{lock};
263 assert (!obj.empty());
264 objs_state[obj].prefetch_data = true;
7c673cae
FG
265}
266
9f95a23c
TL
267void RGWObjectCtx::invalidate(const rgw_obj& obj) {
268 std::unique_lock wl{lock};
269 auto iter = objs_state.find(obj);
270 if (iter == objs_state.end()) {
11fdf7f2 271 return;
7c673cae 272 }
9f95a23c
TL
273 bool is_atomic = iter->second.is_atomic;
274 bool prefetch_data = iter->second.prefetch_data;
7c673cae 275
9f95a23c 276 objs_state.erase(iter);
7c673cae 277
9f95a23c
TL
278 if (is_atomic || prefetch_data) {
279 auto& state = objs_state[obj];
280 state.is_atomic = is_atomic;
281 state.prefetch_data = prefetch_data;
11fdf7f2 282 }
7c673cae
FG
283}
284
11fdf7f2 285void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
7c673cae 286{
11fdf7f2
TL
287 write_version.ver = 1;
288#define TAG_LEN 24
7c673cae 289
11fdf7f2
TL
290 write_version.tag.clear();
291 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
7c673cae
FG
292}
293
7c673cae
FG
294class RGWMetaNotifierManager : public RGWCoroutinesManager {
295 RGWRados *store;
296 RGWHTTPManager http_manager;
297
298public:
299 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
300 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 301 http_manager.start();
7c673cae
FG
302 }
303
b3b6e05e 304 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
7c673cae
FG
305 rgw_http_param_pair pairs[] = { { "type", "metadata" },
306 { "notify", NULL },
307 { NULL, NULL } };
308
309 list<RGWCoroutinesStack *> stacks;
9f95a23c 310 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
311 RGWRESTConn *conn = iter->second;
312 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
313 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
314
315 stacks.push_back(stack);
316 }
b3b6e05e 317 return run(dpp, stacks);
7c673cae
FG
318 }
319};
320
321class RGWDataNotifierManager : public RGWCoroutinesManager {
322 RGWRados *store;
323 RGWHTTPManager http_manager;
324
325public:
326 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
327 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 328 http_manager.start();
7c673cae
FG
329 }
330
b3b6e05e 331 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
f67539c2 332 bc::flat_map<int, bc::flat_set<string> >& shards) {
7c673cae
FG
333 rgw_http_param_pair pairs[] = { { "type", "data" },
334 { "notify", NULL },
11fdf7f2 335 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
7c673cae
FG
336 { NULL, NULL } };
337
338 list<RGWCoroutinesStack *> stacks;
9f95a23c 339 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
340 RGWRESTConn *conn = iter->second;
341 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
f67539c2 342 stack->call(new RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
7c673cae
FG
343
344 stacks.push_back(stack);
345 }
b3b6e05e 346 return run(dpp, stacks);
7c673cae
FG
347 }
348};
349
11fdf7f2
TL
350/* class RGWRadosThread */
351
7c673cae
FG
352void RGWRadosThread::start()
353{
354 worker = new Worker(cct, this);
355 worker->create(thread_name.c_str());
356}
357
358void RGWRadosThread::stop()
359{
360 down_flag = true;
361 stop_process();
362 if (worker) {
31f18b77 363 worker->signal();
7c673cae
FG
364 worker->join();
365 }
366 delete worker;
367 worker = NULL;
368}
369
370void *RGWRadosThread::Worker::entry() {
371 uint64_t msec = processor->interval_msec();
9f95a23c 372 auto interval = std::chrono::milliseconds(msec);
7c673cae
FG
373
374 do {
9f95a23c 375 auto start = ceph::real_clock::now();
b3b6e05e 376 int r = processor->process(this);
7c673cae 377 if (r < 0) {
b3b6e05e 378 ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
7c673cae
FG
379 }
380
381 if (processor->going_down())
382 break;
383
9f95a23c 384 auto end = ceph::real_clock::now() - start;
7c673cae
FG
385
386 uint64_t cur_msec = processor->interval_msec();
387 if (cur_msec != msec) { /* was it reconfigured? */
388 msec = cur_msec;
9f95a23c 389 interval = std::chrono::milliseconds(msec);
7c673cae
FG
390 }
391
392 if (cur_msec > 0) {
393 if (interval <= end)
394 continue; // next round
395
9f95a23c 396 auto wait_time = interval - end;
31f18b77 397 wait_interval(wait_time);
7c673cae 398 } else {
31f18b77 399 wait();
7c673cae
FG
400 }
401 } while (!processor->going_down());
402
403 return NULL;
404}
405
406class RGWMetaNotifier : public RGWRadosThread {
407 RGWMetaNotifierManager notify_mgr;
408 RGWMetadataLog *const log;
409
410 uint64_t interval_msec() override {
411 return cct->_conf->rgw_md_notify_interval_msec;
412 }
1adf2230
AA
413 void stop_process() override {
414 notify_mgr.stop();
415 }
7c673cae
FG
416public:
417 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
418 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
419
b3b6e05e 420 int process(const DoutPrefixProvider *dpp) override;
7c673cae
FG
421};
422
b3b6e05e 423int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
7c673cae
FG
424{
425 set<int> shards;
426
427 log->read_clear_modified(shards);
428
429 if (shards.empty()) {
430 return 0;
431 }
432
433 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
b3b6e05e 434 ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
7c673cae
FG
435 }
436
b3b6e05e 437 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
7c673cae
FG
438
439 return 0;
440}
441
442class RGWDataNotifier : public RGWRadosThread {
443 RGWDataNotifierManager notify_mgr;
444
445 uint64_t interval_msec() override {
11fdf7f2 446 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 447 }
1adf2230
AA
448 void stop_process() override {
449 notify_mgr.stop();
450 }
7c673cae
FG
451public:
452 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
453
b3b6e05e 454 int process(const DoutPrefixProvider *dpp) override;
7c673cae
FG
455};
456
b3b6e05e 457int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
7c673cae 458{
f67539c2 459 auto data_log = store->svc.datalog_rados;
9f95a23c 460 if (!data_log) {
7c673cae
FG
461 return 0;
462 }
463
f67539c2 464 auto shards = data_log->read_clear_modified();
7c673cae
FG
465
466 if (shards.empty()) {
467 return 0;
468 }
469
f67539c2 470 for (const auto& [shard_id, keys] : shards) {
b3b6e05e 471 ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
f67539c2 472 << shard_id << ": " << keys << dendl;
7c673cae
FG
473 }
474
b3b6e05e 475 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
7c673cae
FG
476
477 return 0;
478}
479
480class RGWSyncProcessorThread : public RGWRadosThread {
481public:
482 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
483 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
484 ~RGWSyncProcessorThread() override {}
b3b6e05e
TL
485 int init(const DoutPrefixProvider *dpp) override = 0 ;
486 int process(const DoutPrefixProvider *dpp) override = 0;
7c673cae
FG
487};
488
489class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
490{
491 RGWMetaSyncStatusManager sync;
492
493 uint64_t interval_msec() override {
494 return 0; /* no interval associated, it'll run once until stopped */
495 }
496 void stop_process() override {
497 sync.stop();
498 }
499public:
9f95a23c
TL
500 RGWMetaSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados)
501 : RGWSyncProcessorThread(_store->getRados(), "meta-sync"), sync(_store, async_rados) {}
7c673cae
FG
502
503 void wakeup_sync_shards(set<int>& shard_ids) {
504 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
505 sync.wakeup(*iter);
506 }
507 }
508 RGWMetaSyncStatusManager* get_manager() { return &sync; }
509
b3b6e05e
TL
510 int init(const DoutPrefixProvider *dpp) override {
511 int ret = sync.init(dpp);
7c673cae 512 if (ret < 0) {
b3b6e05e 513 ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
7c673cae
FG
514 return ret;
515 }
516 return 0;
517 }
518
b3b6e05e
TL
519 int process(const DoutPrefixProvider *dpp) override {
520 sync.run(dpp, null_yield);
7c673cae
FG
521 return 0;
522 }
523};
524
525class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
526{
81eedcae 527 PerfCountersRef counters;
7c673cae
FG
528 RGWDataSyncStatusManager sync;
529 bool initialized;
530
531 uint64_t interval_msec() override {
532 if (initialized) {
533 return 0; /* no interval associated, it'll run once until stopped */
534 } else {
535#define DATA_SYNC_INIT_WAIT_SEC 20
536 return DATA_SYNC_INIT_WAIT_SEC * 1000;
537 }
538 }
539 void stop_process() override {
540 sync.stop();
541 }
542public:
9f95a23c 543 RGWDataSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados,
81eedcae 544 const RGWZone* source_zone)
9f95a23c 545 : RGWSyncProcessorThread(_store->getRados(), "data-sync"),
81eedcae
TL
546 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
547 sync(_store, async_rados, source_zone->id, counters.get()),
7c673cae
FG
548 initialized(false) {}
549
550 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
551 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
552 sync.wakeup(iter->first, iter->second);
553 }
554 }
555 RGWDataSyncStatusManager* get_manager() { return &sync; }
556
b3b6e05e 557 int init(const DoutPrefixProvider *dpp) override {
7c673cae
FG
558 return 0;
559 }
560
b3b6e05e 561 int process(const DoutPrefixProvider *dpp) override {
7c673cae
FG
562 while (!initialized) {
563 if (going_down()) {
564 return 0;
565 }
b3b6e05e 566 int ret = sync.init(dpp);
7c673cae
FG
567 if (ret >= 0) {
568 initialized = true;
569 break;
570 }
571 /* we'll be back! */
572 return 0;
573 }
b3b6e05e 574 sync.run(dpp);
7c673cae
FG
575 return 0;
576 }
577};
578
11fdf7f2 579class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
7c673cae
FG
580{
581 RGWCoroutinesManager crs;
9f95a23c 582 rgw::sal::RGWRadosStore *store;
b32b8144 583 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
584 RGWHTTPManager http;
585 const utime_t trim_interval;
586
587 uint64_t interval_msec() override { return 0; }
588 void stop_process() override { crs.stop(); }
589public:
9f95a23c 590 RGWSyncLogTrimThread(rgw::sal::RGWRadosStore *store, rgw::BucketTrimManager *bucket_trim,
b32b8144 591 int interval)
9f95a23c
TL
592 : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
593 crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
b32b8144 594 bucket_trim(bucket_trim),
7c673cae
FG
595 http(store->ctx(), crs.get_completion_mgr()),
596 trim_interval(interval, 0)
597 {}
598
b3b6e05e 599 int init(const DoutPrefixProvider *dpp) override {
11fdf7f2 600 return http.start();
7c673cae 601 }
b3b6e05e 602 int process(const DoutPrefixProvider *dpp) override {
7c673cae
FG
603 list<RGWCoroutinesStack*> stacks;
604 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
11fdf7f2 605 meta->call(create_meta_log_trim_cr(this, store, &http,
7c673cae
FG
606 cct->_conf->rgw_md_log_max_shards,
607 trim_interval));
608 stacks.push_back(meta);
609
9f95a23c
TL
610 if (store->svc()->zone->sync_module_exports_data()) {
611 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
b3b6e05e 612 data->call(create_data_log_trim_cr(this, store, &http,
9f95a23c
TL
613 cct->_conf->rgw_data_log_num_shards,
614 trim_interval));
615 stacks.push_back(data);
7c673cae 616
9f95a23c
TL
617 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
618 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
619 stacks.push_back(bucket);
620 }
b32b8144 621
b3b6e05e 622 crs.run(dpp, stacks);
7c673cae
FG
623 return 0;
624 }
11fdf7f2
TL
625
626 // implements DoutPrefixProvider
627 CephContext *get_cct() const override { return store->ctx(); }
9f95a23c 628 unsigned get_subsys() const override
11fdf7f2
TL
629 {
630 return dout_subsys;
631 }
632
9f95a23c 633 std::ostream& gen_prefix(std::ostream& out) const override
11fdf7f2
TL
634 {
635 return out << "sync log trim: ";
636 }
637
7c673cae
FG
638};
639
640void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
641{
9f95a23c 642 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
643 if (meta_sync_processor_thread) {
644 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
645 }
646}
647
9f95a23c 648void RGWRados::wakeup_data_sync_shards(const rgw_zone_id& source_zone, map<int, set<string> >& shard_ids)
7c673cae
FG
649{
650 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
9f95a23c
TL
651 std::lock_guard l{data_sync_thread_lock};
652 auto iter = data_sync_processor_threads.find(source_zone);
7c673cae
FG
653 if (iter == data_sync_processor_threads.end()) {
654 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
655 return;
656 }
657
658 RGWDataSyncProcessorThread *thread = iter->second;
11fdf7f2 659 ceph_assert(thread);
7c673cae
FG
660 thread->wakeup_sync_shards(shard_ids);
661}
662
663RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
664{
9f95a23c 665 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
666 if (meta_sync_processor_thread) {
667 return meta_sync_processor_thread->get_manager();
668 }
669 return nullptr;
670}
671
9f95a23c 672RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
7c673cae 673{
9f95a23c 674 std::lock_guard l{data_sync_thread_lock};
7c673cae
FG
675 auto thread = data_sync_processor_threads.find(source_zone);
676 if (thread == data_sync_processor_threads.end()) {
677 return nullptr;
678 }
679 return thread->second->get_manager();
680}
681
b3b6e05e 682int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
7c673cae
FG
683{
684 IoCtx ioctx;
b3b6e05e 685 int r = open_pool_ctx(dpp, pool, ioctx, false);
7c673cae
FG
686 if (r < 0) {
687 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
688 return r;
689 }
690
691 bool requires;
692 r = ioctx.pool_requires_alignment2(&requires);
693 if (r < 0) {
694 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
695 << r << dendl;
696 return r;
697 }
698
699 if (!requires) {
700 *alignment = 0;
701 return 0;
702 }
703
704 uint64_t align;
705 r = ioctx.pool_required_alignment2(&align);
706 if (r < 0) {
707 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
708 << r << dendl;
709 return r;
710 }
711 if (align != 0) {
712 ldout(cct, 20) << "required alignment=" << align << dendl;
713 }
714 *alignment = align;
715 return 0;
716}
717
11fdf7f2
TL
718void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
719{
720 if (alignment == 0) {
721 *max_size = size;
722 return;
723 }
724
725 if (size <= alignment) {
726 *max_size = alignment;
727 return;
728 }
729
730 *max_size = size - (size % alignment);
731}
732
b3b6e05e 733int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
7c673cae 734{
11fdf7f2 735 uint64_t alignment;
b3b6e05e 736 int r = get_required_alignment(dpp, pool, &alignment);
7c673cae
FG
737 if (r < 0) {
738 return r;
739 }
740
11fdf7f2
TL
741 if (palignment) {
742 *palignment = alignment;
7c673cae
FG
743 }
744
11fdf7f2 745 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
7c673cae 746
11fdf7f2 747 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
7c673cae 748
b3b6e05e 749 ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
7c673cae
FG
750
751 return 0;
752}
753
11fdf7f2 754int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
b3b6e05e 755 uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
7c673cae
FG
756{
757 rgw_pool pool;
758 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
b3b6e05e 759 ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
7c673cae
FG
760 return -EIO;
761 }
b3b6e05e 762 return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
7c673cae
FG
763}
764
31f18b77
FG
765class RGWIndexCompletionManager;
766
767struct complete_op_data {
9f95a23c 768 ceph::mutex lock = ceph::make_mutex("complete_op_data");
31f18b77
FG
769 AioCompletion *rados_completion{nullptr};
770 int manager_shard_id{-1};
771 RGWIndexCompletionManager *manager{nullptr};
772 rgw_obj obj;
773 RGWModifyOp op;
774 string tag;
775 rgw_bucket_entry_ver ver;
776 cls_rgw_obj_key key;
777 rgw_bucket_dir_entry_meta dir_meta;
778 list<cls_rgw_obj_key> remove_objs;
779 bool log_op;
780 uint16_t bilog_op;
781 rgw_zone_set zones_trace;
782
783 bool stopped{false};
784
785 void stop() {
9f95a23c 786 std::lock_guard l{lock};
31f18b77
FG
787 stopped = true;
788 }
789};
790
b3b6e05e 791class RGWIndexCompletionThread : public RGWRadosThread, public DoutPrefixProvider {
31f18b77
FG
792 RGWRados *store;
793
794 uint64_t interval_msec() override {
795 return 0;
796 }
797
798 list<complete_op_data *> completions;
799
9f95a23c
TL
800 ceph::mutex completions_lock =
801 ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
31f18b77
FG
802public:
803 RGWIndexCompletionThread(RGWRados *_store)
9f95a23c 804 : RGWRadosThread(_store, "index-complete"), store(_store) {}
31f18b77 805
b3b6e05e 806 int process(const DoutPrefixProvider *dpp) override;
31f18b77
FG
807
808 void add_completion(complete_op_data *completion) {
809 {
9f95a23c 810 std::lock_guard l{completions_lock};
31f18b77
FG
811 completions.push_back(completion);
812 }
813
814 signal();
815 }
b3b6e05e
TL
816
817 CephContext *get_cct() const override { return store->ctx(); }
818 unsigned get_subsys() const { return dout_subsys; }
819 std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw index completion thread: "; }
31f18b77
FG
820};
821
b3b6e05e 822int RGWIndexCompletionThread::process(const DoutPrefixProvider *dpp)
31f18b77
FG
823{
824 list<complete_op_data *> comps;
825
826 {
9f95a23c 827 std::lock_guard l{completions_lock};
31f18b77
FG
828 completions.swap(comps);
829 }
830
831 for (auto c : comps) {
832 std::unique_ptr<complete_op_data> up{c};
833
834 if (going_down()) {
835 continue;
836 }
b3b6e05e 837 ldpp_dout(this, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
31f18b77
FG
838
839 RGWRados::BucketShard bs(store);
f64942e4 840 RGWBucketInfo bucket_info;
31f18b77 841
b3b6e05e 842 int r = bs.init(c->obj.bucket, c->obj, &bucket_info, this);
31f18b77 843 if (r < 0) {
b3b6e05e 844 ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
31f18b77
FG
845 /* not much to do */
846 continue;
847 }
848
b3b6e05e 849 r = store->guard_reshard(this, &bs, c->obj, bucket_info,
f64942e4
AA
850 [&](RGWRados::BucketShard *bs) -> int {
851 librados::ObjectWriteOperation o;
852 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
853 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
854 c->log_op, c->bilog_op, &c->zones_trace);
b3b6e05e 855 return bs->bucket_obj.operate(this, &o, null_yield);
31f18b77
FG
856 });
857 if (r < 0) {
b3b6e05e 858 ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
31f18b77
FG
859 /* ignoring error, can't do anything about it */
860 continue;
861 }
b3b6e05e 862 r = store->svc.datalog_rados->add_entry(this, bucket_info, bs.shard_id);
31f18b77 863 if (r < 0) {
b3b6e05e 864 ldpp_dout(this, -1) << "ERROR: failed writing data log" << dendl;
31f18b77
FG
865 }
866 }
867
868 return 0;
869}
870
871class RGWIndexCompletionManager {
872 RGWRados *store{nullptr};
9f95a23c 873 ceph::containers::tiny_vector<ceph::mutex> locks;
31f18b77
FG
874 vector<set<complete_op_data *> > completions;
875
876 RGWIndexCompletionThread *completion_thread{nullptr};
877
878 int num_shards;
879
880 std::atomic<int> cur_shard {0};
881
882
883public:
9f95a23c
TL
884 RGWIndexCompletionManager(RGWRados *_store) :
885 store(_store),
886 locks{ceph::make_lock_container<ceph::mutex>(
887 store->ctx()->_conf->rgw_thread_pool_size,
888 [](const size_t i) {
889 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
890 std::to_string(i));
891 })}
892 {
31f18b77 893 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
31f18b77
FG
894 completions.resize(num_shards);
895 }
896 ~RGWIndexCompletionManager() {
897 stop();
31f18b77
FG
898 }
899
900 int next_shard() {
901 int result = cur_shard % num_shards;
902 cur_shard++;
903 return result;
904 }
905
906 void create_completion(const rgw_obj& obj,
907 RGWModifyOp op, string& tag,
908 rgw_bucket_entry_ver& ver,
909 const cls_rgw_obj_key& key,
910 rgw_bucket_dir_entry_meta& dir_meta,
911 list<cls_rgw_obj_key> *remove_objs, bool log_op,
912 uint16_t bilog_op,
913 rgw_zone_set *zones_trace,
914 complete_op_data **result);
915 bool handle_completion(completion_t cb, complete_op_data *arg);
916
b3b6e05e 917 int start(const DoutPrefixProvider *dpp) {
31f18b77 918 completion_thread = new RGWIndexCompletionThread(store);
b3b6e05e 919 int ret = completion_thread->init(dpp);
31f18b77
FG
920 if (ret < 0) {
921 return ret;
922 }
923 completion_thread->start();
924 return 0;
925 }
926 void stop() {
927 if (completion_thread) {
928 completion_thread->stop();
929 delete completion_thread;
930 }
931
932 for (int i = 0; i < num_shards; ++i) {
9f95a23c 933 std::lock_guard l{locks[i]};
31f18b77 934 for (auto c : completions[i]) {
31f18b77
FG
935 c->stop();
936 }
937 }
938 completions.clear();
939 }
940};
941
942static void obj_complete_cb(completion_t cb, void *arg)
943{
944 complete_op_data *completion = (complete_op_data *)arg;
9f95a23c 945 completion->lock.lock();
31f18b77 946 if (completion->stopped) {
9f95a23c 947 completion->lock.unlock(); /* can drop lock, no one else is referencing us */
31f18b77
FG
948 delete completion;
949 return;
950 }
951 bool need_delete = completion->manager->handle_completion(cb, completion);
9f95a23c 952 completion->lock.unlock();
31f18b77
FG
953 if (need_delete) {
954 delete completion;
955 }
956}
957
958
959void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
960 RGWModifyOp op, string& tag,
961 rgw_bucket_entry_ver& ver,
962 const cls_rgw_obj_key& key,
963 rgw_bucket_dir_entry_meta& dir_meta,
964 list<cls_rgw_obj_key> *remove_objs, bool log_op,
965 uint16_t bilog_op,
966 rgw_zone_set *zones_trace,
967 complete_op_data **result)
968{
969 complete_op_data *entry = new complete_op_data;
970
971 int shard_id = next_shard();
972
973 entry->manager_shard_id = shard_id;
974 entry->manager = this;
975 entry->obj = obj;
976 entry->op = op;
977 entry->tag = tag;
978 entry->ver = ver;
979 entry->key = key;
980 entry->dir_meta = dir_meta;
981 entry->log_op = log_op;
982 entry->bilog_op = bilog_op;
983
984 if (remove_objs) {
985 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
986 entry->remove_objs.push_back(*iter);
987 }
988 }
989
990 if (zones_trace) {
991 entry->zones_trace = *zones_trace;
992 } else {
9f95a23c 993 entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
31f18b77
FG
994 }
995
996 *result = entry;
997
9f95a23c 998 entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
31f18b77 999
9f95a23c 1000 std::lock_guard l{locks[shard_id]};
31f18b77
FG
1001 completions[shard_id].insert(entry);
1002}
1003
1004bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
1005{
1006 int shard_id = arg->manager_shard_id;
1007 {
9f95a23c 1008 std::lock_guard l{locks[shard_id]};
31f18b77
FG
1009
1010 auto& comps = completions[shard_id];
1011
1012 auto iter = comps.find(arg);
1013 if (iter == comps.end()) {
1014 return true;
1015 }
1016
1017 comps.erase(iter);
1018 }
1019
1020 int r = rados_aio_get_return_value(cb);
1021 if (r != -ERR_BUSY_RESHARDING) {
1022 return true;
1023 }
1024 completion_thread->add_completion(arg);
1025 return false;
1026}
1027
7c673cae
FG
1028void RGWRados::finalize()
1029{
1030 if (run_sync_thread) {
9f95a23c 1031 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
1032 meta_sync_processor_thread->stop();
1033
9f95a23c 1034 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1035 for (auto iter : data_sync_processor_threads) {
1036 RGWDataSyncProcessorThread *thread = iter.second;
1037 thread->stop();
1038 }
1039 if (sync_log_trimmer) {
1040 sync_log_trimmer->stop();
1041 }
1042 }
7c673cae
FG
1043 if (run_sync_thread) {
1044 delete meta_sync_processor_thread;
1045 meta_sync_processor_thread = NULL;
9f95a23c 1046 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1047 for (auto iter : data_sync_processor_threads) {
1048 RGWDataSyncProcessorThread *thread = iter.second;
1049 delete thread;
1050 }
1051 data_sync_processor_threads.clear();
1052 delete sync_log_trimmer;
1053 sync_log_trimmer = nullptr;
b32b8144 1054 bucket_trim = boost::none;
7c673cae 1055 }
7c673cae
FG
1056 if (meta_notifier) {
1057 meta_notifier->stop();
1058 delete meta_notifier;
1059 }
1060 if (data_notifier) {
1061 data_notifier->stop();
1062 delete data_notifier;
1063 }
11fdf7f2 1064 delete sync_tracer;
11fdf7f2
TL
1065
1066 delete lc;
1067 lc = NULL;
7c673cae 1068
11fdf7f2
TL
1069 delete gc;
1070 gc = NULL;
7c673cae 1071
11fdf7f2
TL
1072 delete obj_expirer;
1073 obj_expirer = NULL;
7c673cae 1074
11fdf7f2
TL
1075 RGWQuotaHandler::free_handler(quota_handler);
1076 if (cr_registry) {
1077 cr_registry->put();
7c673cae
FG
1078 }
1079
11fdf7f2 1080 svc.shutdown();
7c673cae 1081
11fdf7f2
TL
1082 delete binfo_cache;
1083 delete obj_tombstone_cache;
7c673cae 1084
11fdf7f2
TL
1085 if (reshard_wait.get()) {
1086 reshard_wait->stop();
1087 reshard_wait.reset();
7c673cae
FG
1088 }
1089
11fdf7f2
TL
1090 if (run_reshard_thread) {
1091 reshard->stop_processor();
7c673cae 1092 }
11fdf7f2
TL
1093 delete reshard;
1094 delete index_completion_manager;
f67539c2
TL
1095
1096 rgw::notify::shutdown();
11fdf7f2
TL
1097}
1098
1099/**
1100 * Initialize the RADOS instance and prepare to do other ops
1101 * Returns 0 on success, -ERR# on failure.
1102 */
1103int RGWRados::init_rados()
1104{
1105 int ret = 0;
7c673cae 1106
494da23a
TL
1107 ret = rados.init_with_context(cct);
1108 if (ret < 0) {
1109 return ret;
1110 }
1111 ret = rados.connect();
1112 if (ret < 0) {
1113 return ret;
7c673cae 1114 }
11fdf7f2
TL
1115
1116 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1117 new RGWCoroutinesManagerRegistry(cct)};
1118 ret = crs->hook_to_admin_command("cr dump");
1119 if (ret < 0) {
1120 return ret;
7c673cae
FG
1121 }
1122
11fdf7f2 1123 cr_registry = crs.release();
11fdf7f2 1124 return ret;
7c673cae
FG
1125}
1126
11fdf7f2 1127int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
7c673cae 1128{
f67539c2
TL
1129 string name = cct->_conf->name.get_id();
1130 if (name.compare(0, 4, "rgw.") == 0) {
1131 name = name.substr(4);
1132 }
11fdf7f2 1133 map<string,string> metadata = meta;
494da23a 1134 metadata["num_handles"] = "1"s;
11fdf7f2
TL
1135 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1136 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1137 metadata["zone_name"] = svc.zone->zone_name();
9f95a23c 1138 metadata["zone_id"] = svc.zone->zone_id().id;
f67539c2
TL
1139 metadata["id"] = name;
1140 int ret = rados.service_daemon_register(
1141 daemon_type,
1142 stringify(rados.get_instance_id()),
1143 metadata);
11fdf7f2
TL
1144 if (ret < 0) {
1145 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1146 return ret;
7c673cae
FG
1147 }
1148
1149 return 0;
1150}
1151
11fdf7f2 1152int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
7c673cae 1153{
494da23a 1154 int ret = rados.service_daemon_update_status(move(status));
11fdf7f2
TL
1155 if (ret < 0) {
1156 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1157 return ret;
1158 }
1159
1160 return 0;
7c673cae
FG
1161}
1162
1163/**
1164 * Initialize the RADOS instance and prepare to do other ops
1165 * Returns 0 on success, -ERR# on failure.
1166 */
b3b6e05e 1167int RGWRados::init_complete(const DoutPrefixProvider *dpp)
7c673cae 1168{
11fdf7f2 1169 int ret;
7c673cae 1170
11fdf7f2
TL
1171 /*
1172 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1173 */
9f95a23c 1174 sync_module = svc.sync_modules->get_sync_module();
7c673cae 1175
b3b6e05e 1176 ret = open_root_pool_ctx(dpp);
7c673cae
FG
1177 if (ret < 0)
1178 return ret;
1179
b3b6e05e 1180 ret = open_gc_pool_ctx(dpp);
7c673cae
FG
1181 if (ret < 0)
1182 return ret;
1183
b3b6e05e 1184 ret = open_lc_pool_ctx(dpp);
7c673cae
FG
1185 if (ret < 0)
1186 return ret;
1187
b3b6e05e 1188 ret = open_objexp_pool_ctx(dpp);
7c673cae
FG
1189 if (ret < 0)
1190 return ret;
1191
b3b6e05e 1192 ret = open_reshard_pool_ctx(dpp);
31f18b77
FG
1193 if (ret < 0)
1194 return ret;
1195
b3b6e05e 1196 ret = open_notif_pool_ctx(dpp);
f67539c2
TL
1197 if (ret < 0)
1198 return ret;
1199
7c673cae
FG
1200 pools_initialized = true;
1201
1202 gc = new RGWGC();
1203 gc->initialize(cct, this);
1204
9f95a23c 1205 obj_expirer = new RGWObjectExpirer(this->store);
7c673cae
FG
1206
1207 if (use_gc_thread) {
1208 gc->start_processor();
1209 obj_expirer->start_processor();
1210 }
1211
11fdf7f2
TL
1212 auto& current_period = svc.zone->get_current_period();
1213 auto& zonegroup = svc.zone->get_zonegroup();
1214 auto& zone_params = svc.zone->get_zone_params();
1215 auto& zone = svc.zone->get_zone();
1216
7c673cae
FG
1217 /* no point of running sync thread if we don't have a master zone configured
1218 or there is no rest_master_conn */
9f95a23c 1219 if (!svc.zone->need_to_sync()) {
7c673cae
FG
1220 run_sync_thread = false;
1221 }
1222
11fdf7f2 1223 if (svc.zone->is_meta_master()) {
9f95a23c 1224 auto md_log = svc.mdlog->get_log(current_period.get_id());
7c673cae
FG
1225 meta_notifier = new RGWMetaNotifier(this, md_log);
1226 meta_notifier->start();
1227 }
1228
11fdf7f2
TL
1229 /* init it anyway, might run sync through radosgw-admin explicitly */
1230 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1231 sync_tracer->init(this);
1232 ret = sync_tracer->hook_to_admin_command();
1233 if (ret < 0) {
1234 return ret;
1235 }
1236
7c673cae 1237 if (run_sync_thread) {
11fdf7f2
TL
1238 for (const auto &pt: zonegroup.placement_targets) {
1239 if (zone_params.placement_pools.find(pt.second.name)
1240 == zone_params.placement_pools.end()){
b3b6e05e 1241 ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
11fdf7f2
TL
1242 << pt.second.name << " present in zonegroup" << dendl;
1243 }
1244 }
9f95a23c
TL
1245 auto async_processor = svc.rados->get_async_processor();
1246 std::lock_guard l{meta_sync_thread_lock};
1247 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->store, async_processor);
b3b6e05e 1248 ret = meta_sync_processor_thread->init(dpp);
7c673cae 1249 if (ret < 0) {
b3b6e05e 1250 ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
7c673cae
FG
1251 return ret;
1252 }
1253 meta_sync_processor_thread->start();
1254
b32b8144
FG
1255 // configure the bucket trim manager
1256 rgw::BucketTrimConfig config;
1257 rgw::configure_bucket_trim(cct, config);
1258
9f95a23c 1259 bucket_trim.emplace(this->store, config);
b32b8144
FG
1260 ret = bucket_trim->init();
1261 if (ret < 0) {
b3b6e05e 1262 ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
b32b8144
FG
1263 return ret;
1264 }
9f95a23c 1265 svc.datalog_rados->set_observer(&*bucket_trim);
b32b8144 1266
9f95a23c 1267 std::lock_guard dl{data_sync_thread_lock};
81eedcae 1268 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
b3b6e05e 1269 ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
9f95a23c 1270 auto *thread = new RGWDataSyncProcessorThread(this->store, svc.rados->get_async_processor(), source_zone);
b3b6e05e 1271 ret = thread->init(dpp);
7c673cae 1272 if (ret < 0) {
b3b6e05e 1273 ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
7c673cae
FG
1274 return ret;
1275 }
1276 thread->start();
9f95a23c 1277 data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
7c673cae
FG
1278 }
1279 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1280 if (interval > 0) {
9f95a23c 1281 sync_log_trimmer = new RGWSyncLogTrimThread(this->store, &*bucket_trim, interval);
b3b6e05e 1282 ret = sync_log_trimmer->init(dpp);
7c673cae 1283 if (ret < 0) {
b3b6e05e 1284 ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
7c673cae
FG
1285 return ret;
1286 }
1287 sync_log_trimmer->start();
1288 }
1289 }
1290 data_notifier = new RGWDataNotifier(this);
1291 data_notifier->start();
1292
92f5a8d4
TL
1293 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1294 binfo_cache->init(svc.cache);
1295
7c673cae 1296 lc = new RGWLC();
9f95a23c 1297 lc->initialize(cct, this->store);
31f18b77 1298
7c673cae
FG
1299 if (use_lc_thread)
1300 lc->start_processor();
31f18b77 1301
b3b6e05e 1302 quota_handler = RGWQuotaHandler::generate_handler(dpp, this->store, quota_threads);
7c673cae
FG
1303
1304 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
11fdf7f2 1305 zone.bucket_index_max_shards);
31f18b77
FG
1306 if (bucket_index_max_shards > get_max_bucket_shards()) {
1307 bucket_index_max_shards = get_max_bucket_shards();
b3b6e05e 1308 ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 1309 << get_max_bucket_shards() << dendl;
7c673cae 1310 }
b3b6e05e 1311 ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
7c673cae 1312
11fdf7f2 1313 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
7c673cae
FG
1314
1315 if (need_tombstone_cache) {
1316 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1317 }
1318
11fdf7f2 1319 reshard_wait = std::make_shared<RGWReshardWait>();
31f18b77 1320
9f95a23c 1321 reshard = new RGWReshard(this->store);
31f18b77
FG
1322
1323 /* only the master zone in the zonegroup reshards buckets */
11fdf7f2 1324 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
31f18b77
FG
1325 if (run_reshard_thread) {
1326 reshard->start_processor();
1327 }
1328
1329 index_completion_manager = new RGWIndexCompletionManager(this);
b3b6e05e 1330 ret = index_completion_manager->start(dpp);
f67539c2
TL
1331 if (ret < 0) {
1332 return ret;
1333 }
b3b6e05e 1334 ret = rgw::notify::init(cct, store, dpp);
f67539c2 1335 if (ret < 0 ) {
b3b6e05e 1336 ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
f67539c2 1337 }
31f18b77 1338
7c673cae
FG
1339 return ret;
1340}
1341
b3b6e05e 1342int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
11fdf7f2
TL
1343{
1344 if (raw) {
b3b6e05e 1345 return svc.init_raw(cct, use_cache, null_yield, dpp);
11fdf7f2
TL
1346 }
1347
b3b6e05e 1348 return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
9f95a23c
TL
1349}
1350
b3b6e05e 1351int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
9f95a23c 1352{
b3b6e05e 1353 return ctl.init(&svc, dpp);
11fdf7f2
TL
1354}
1355
7c673cae
FG
1356/**
1357 * Initialize the RADOS instance and prepare to do other ops
1358 * Returns 0 on success, -ERR# on failure.
1359 */
b3b6e05e 1360int RGWRados::initialize(const DoutPrefixProvider *dpp)
7c673cae
FG
1361{
1362 int ret;
1363
11fdf7f2
TL
1364 inject_notify_timeout_probability =
1365 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1366 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
7c673cae 1367
b3b6e05e 1368 ret = init_svc(false, dpp);
7c673cae 1369 if (ret < 0) {
b3b6e05e 1370 ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
7c673cae
FG
1371 return ret;
1372 }
7c673cae 1373
b3b6e05e 1374 ret = init_ctl(dpp);
9f95a23c 1375 if (ret < 0) {
b3b6e05e 1376 ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
9f95a23c
TL
1377 return ret;
1378 }
1379
11fdf7f2 1380 host_id = svc.zone_utils->gen_host_id();
7c673cae 1381
11fdf7f2
TL
1382 ret = init_rados();
1383 if (ret < 0)
1384 return ret;
1385
b3b6e05e 1386 return init_complete(dpp);
7c673cae
FG
1387}
1388
1389/**
1390 * Open the pool used as root for this gateway
1391 * Returns: 0 on success, -ERR# otherwise.
1392 */
b3b6e05e 1393int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1394{
b3b6e05e 1395 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
7c673cae
FG
1396}
1397
b3b6e05e 1398int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1399{
b3b6e05e 1400 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
7c673cae
FG
1401}
1402
b3b6e05e 1403int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1404{
b3b6e05e 1405 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
7c673cae
FG
1406}
1407
b3b6e05e 1408int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1409{
b3b6e05e 1410 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
7c673cae
FG
1411}
1412
b3b6e05e 1413int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
31f18b77 1414{
b3b6e05e 1415 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
7c673cae
FG
1416}
1417
b3b6e05e 1418int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
f67539c2 1419{
b3b6e05e 1420 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
f67539c2
TL
1421}
1422
b3b6e05e 1423int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
494da23a 1424 bool mostly_omap)
7c673cae 1425{
28e407b8 1426 constexpr bool create = true; // create the pool if it doesn't exist
b3b6e05e 1427 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
7c673cae
FG
1428}
1429
7c673cae
FG
1430/**** logs ****/
1431
1432struct log_list_state {
1433 string prefix;
1434 librados::IoCtx io_ctx;
1435 librados::NObjectIterator obit;
1436};
1437
b3b6e05e 1438int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
7c673cae
FG
1439{
1440 log_list_state *state = new log_list_state;
b3b6e05e 1441 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1442 if (r < 0) {
1443 delete state;
1444 return r;
1445 }
1446 state->prefix = prefix;
1447 state->obit = state->io_ctx.nobjects_begin();
1448 *handle = (RGWAccessHandle)state;
1449 return 0;
1450}
1451
1452int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1453{
1454 log_list_state *state = static_cast<log_list_state *>(handle);
1455 while (true) {
1456 if (state->obit == state->io_ctx.nobjects_end()) {
1457 delete state;
1458 return -ENOENT;
1459 }
1460 if (state->prefix.length() &&
1461 state->obit->get_oid().find(state->prefix) != 0) {
1462 state->obit++;
1463 continue;
1464 }
1465 *name = state->obit->get_oid();
1466 state->obit++;
1467 break;
1468 }
1469 return 0;
1470}
1471
b3b6e05e 1472int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
7c673cae
FG
1473{
1474 librados::IoCtx io_ctx;
b3b6e05e 1475 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
1476 if (r < 0)
1477 return r;
1478 return io_ctx.remove(name);
1479}
1480
1481struct log_show_state {
1482 librados::IoCtx io_ctx;
1483 bufferlist bl;
11fdf7f2 1484 bufferlist::const_iterator p;
7c673cae
FG
1485 string name;
1486 uint64_t pos;
1487 bool eof;
1488 log_show_state() : pos(0), eof(false) {}
1489};
1490
b3b6e05e 1491int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
7c673cae
FG
1492{
1493 log_show_state *state = new log_show_state;
b3b6e05e 1494 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1495 if (r < 0) {
1496 delete state;
1497 return r;
1498 }
1499 state->name = name;
1500 *handle = (RGWAccessHandle)state;
1501 return 0;
1502}
1503
1504int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
1505{
1506 log_show_state *state = static_cast<log_show_state *>(handle);
1507 off_t off = state->p.get_off();
1508
1509 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
1510 << " off " << off
1511 << " eof " << (int)state->eof
1512 << dendl;
1513 // read some?
1514 unsigned chunk = 1024*1024;
1515 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1516 bufferlist more;
1517 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1518 if (r < 0)
1519 return r;
1520 state->pos += r;
1521 bufferlist old;
1522 try {
1523 old.substr_of(state->bl, off, state->bl.length() - off);
1524 } catch (buffer::error& err) {
1525 return -EINVAL;
1526 }
f67539c2 1527 state->bl = std::move(old);
7c673cae 1528 state->bl.claim_append(more);
11fdf7f2 1529 state->p = state->bl.cbegin();
7c673cae
FG
1530 if ((unsigned)r < chunk)
1531 state->eof = true;
1532 ldout(cct, 10) << " read " << r << dendl;
1533 }
1534
1535 if (state->p.end())
1536 return 0; // end of file
1537 try {
11fdf7f2 1538 decode(*entry, state->p);
7c673cae
FG
1539 }
1540 catch (const buffer::error &e) {
1541 return -EINVAL;
1542 }
1543 return 1;
1544}
1545
1546/**
1547 * usage_log_hash: get usage log key hash, based on name and index
1548 *
1549 * Get the usage object name. Since a user may have more than 1
1550 * object holding that info (multiple shards), we use index to
1551 * specify that shard number. Once index exceeds max shards it
1552 * wraps.
1553 * If name is not being set, results for all users will be returned
1554 * and index will wrap only after total shards number.
1555 *
1556 * @param cct [in] ceph context
1557 * @param name [in] user name
1558 * @param hash [out] hash value
1559 * @param index [in] shard index number
1560 */
1561static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1562{
1563 uint32_t val = index;
1564
1565 if (!name.empty()) {
c07f9fc5 1566 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
1567 val %= max_user_shards;
1568 val += ceph_str_hash_linux(name.c_str(), name.size());
1569 }
1570 char buf[17];
c07f9fc5 1571 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
1572 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1573 hash = buf;
1574}
1575
b3b6e05e 1576int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
7c673cae
FG
1577{
1578 uint32_t index = 0;
1579
1580 map<string, rgw_usage_log_info> log_objs;
1581
1582 string hash;
1583 string last_user;
1584
1585 /* restructure usage map, zone by object hash */
1586 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1587 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1588 const rgw_user_bucket& ub = iter->first;
1589 RGWUsageBatch& info = iter->second;
1590
1591 if (ub.user.empty()) {
b3b6e05e 1592 ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
7c673cae
FG
1593 continue;
1594 }
1595
1596 if (ub.user != last_user) {
1597 /* index *should* be random, but why waste extra cycles
1598 in most cases max user shards is not going to exceed 1,
1599 so just incrementing it */
1600 usage_log_hash(cct, ub.user, hash, index++);
1601 }
1602 last_user = ub.user;
1603 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1604
1605 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1606 v.push_back(miter->second);
1607 }
1608 }
1609
1610 map<string, rgw_usage_log_info>::iterator liter;
1611
1612 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
b3b6e05e 1613 int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
7c673cae
FG
1614 if (r < 0)
1615 return r;
1616 }
1617 return 0;
1618}
1619
b3b6e05e 1620int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
11fdf7f2
TL
1621 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1622 rgw_usage_log_entry>& usage)
7c673cae
FG
1623{
1624 uint32_t num = max_entries;
1625 string hash, first_hash;
1626 string user_str = user.to_str();
1627 usage_log_hash(cct, user_str, first_hash, 0);
1628
1629 if (usage_iter.index) {
1630 usage_log_hash(cct, user_str, hash, usage_iter.index);
1631 } else {
1632 hash = first_hash;
1633 }
1634
1635 usage.clear();
1636
1637 do {
1638 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1639 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1640
b3b6e05e 1641 int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
7c673cae
FG
1642 usage_iter.read_iter, ret_usage, is_truncated);
1643 if (ret == -ENOENT)
1644 goto next;
1645
1646 if (ret < 0)
1647 return ret;
1648
1649 num -= ret_usage.size();
1650
1651 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1652 usage[iter->first].aggregate(iter->second);
1653 }
1654
1655next:
1656 if (!*is_truncated) {
1657 usage_iter.read_iter.clear();
1658 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1659 }
1660 } while (num && !*is_truncated && hash != first_hash);
1661 return 0;
1662}
1663
b3b6e05e 1664int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
7c673cae
FG
1665{
1666 uint32_t index = 0;
1667 string hash, first_hash;
1668 string user_str = user.to_str();
1669 usage_log_hash(cct, user_str, first_hash, index);
1670
1671 hash = first_hash;
7c673cae 1672 do {
b3b6e05e 1673 int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
7c673cae 1674
b32b8144 1675 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
1676 return ret;
1677
7c673cae
FG
1678 usage_log_hash(cct, user_str, hash, ++index);
1679 } while (hash != first_hash);
1680
1681 return 0;
1682}
1683
11fdf7f2 1684
b3b6e05e 1685int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
11fdf7f2
TL
1686{
1687 auto max_shards = cct->_conf->rgw_usage_max_shards;
1688 int ret=0;
1689 for (unsigned i=0; i < max_shards; i++){
1690 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
b3b6e05e 1691 ret = cls_obj_usage_log_clear(dpp, oid);
11fdf7f2 1692 if (ret < 0){
b3b6e05e 1693 ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
11fdf7f2
TL
1694 return ret;
1695 }
1696 }
1697 return ret;
1698}
1699
9f95a23c 1700int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
7c673cae 1701{
9f95a23c
TL
1702 auto i = bl.cbegin();
1703 RGWAccessControlPolicy policy(cct);
1704 try {
1705 policy.decode_owner(i);
1706 } catch (buffer::error& err) {
1707 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1708 return -EIO;
7c673cae 1709 }
9f95a23c
TL
1710 *owner = policy.get_owner();
1711 return 0;
7c673cae
FG
1712}
1713
b3b6e05e 1714int rgw_policy_from_attrset(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
7c673cae 1715{
9f95a23c
TL
1716 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
1717 if (aiter == attrset.end())
1718 return -EIO;
7c673cae 1719
9f95a23c
TL
1720 bufferlist& bl = aiter->second;
1721 auto iter = bl.cbegin();
1722 try {
1723 policy->decode(iter);
1724 } catch (buffer::error& err) {
b3b6e05e 1725 ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
9f95a23c
TL
1726 return -EIO;
1727 }
1728 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
1729 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
b3b6e05e 1730 ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy";
9f95a23c
TL
1731 s3policy->to_xml(*_dout);
1732 *_dout << dendl;
1733 }
1734 return 0;
7c673cae
FG
1735}
1736
7c673cae 1737
b3b6e05e 1738int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
7c673cae 1739{
9f95a23c
TL
1740 rgw_bucket bucket = bucket_info.bucket;
1741 bucket.update_bucket_id(new_bucket_id);
7c673cae 1742
9f95a23c 1743 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 1744
9f95a23c 1745 bucket_info.objv_tracker.clear();
b3b6e05e 1746 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
9f95a23c
TL
1747 if (ret < 0) {
1748 return ret;
7c673cae
FG
1749 }
1750
9f95a23c 1751 return 0;
eafe8130
TL
1752}
1753
1754
1adf2230
AA
1755/**
1756 * Get ordered listing of the objects in a bucket.
7c673cae 1757 *
9f95a23c 1758 * max_p: maximum number of results to return
7c673cae
FG
1759 * bucket: bucket to list contents of
1760 * prefix: only return results that match this prefix
1761 * delim: do not include results that match this string.
1762 * Any skipped results will have the matching portion of their name
1763 * inserted in common_prefixes with a "true" mark.
1764 * marker: if filled in, begin the listing with this object.
1765 * end_marker: if filled in, end the listing with this object.
1766 * result: the objects are put in here.
11fdf7f2
TL
1767 * common_prefixes: if delim is filled in, any matching prefixes are
1768 * placed here.
1769 * is_truncated: if number of objects in the bucket is bigger than
1770 * max, then truncated.
7c673cae 1771 */
11fdf7f2 1772int RGWRados::Bucket::List::list_objects_ordered(
b3b6e05e 1773 const DoutPrefixProvider *dpp,
eafe8130 1774 int64_t max_p,
11fdf7f2
TL
1775 vector<rgw_bucket_dir_entry> *result,
1776 map<string, bool> *common_prefixes,
9f95a23c
TL
1777 bool *is_truncated,
1778 optional_yield y)
7c673cae
FG
1779{
1780 RGWRados *store = target->get_store();
1781 CephContext *cct = store->ctx();
1782 int shard_id = target->get_shard_id();
1783
1784 int count = 0;
1785 bool truncated = true;
9f95a23c 1786 bool cls_filtered = false;
eafe8130
TL
1787 const int64_t max = // protect against memory issues and negative vals
1788 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
1789 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
7c673cae
FG
1790
1791 result->clear();
1792
9f95a23c
TL
1793 // use a local marker; either the marker will have a previous entry
1794 // or it will be empty; either way it's OK to copy
1795 rgw_obj_key marker_obj(params.marker.name,
1796 params.marker.instance,
f91f0fd5 1797 params.ns.empty() ? params.marker.ns : params.ns);
7c673cae
FG
1798 rgw_obj_index_key cur_marker;
1799 marker_obj.get_index_key(&cur_marker);
1800
9f95a23c
TL
1801 rgw_obj_key end_marker_obj(params.end_marker.name,
1802 params.end_marker.instance,
f91f0fd5 1803 params.ns.empty() ? params.end_marker.ns : params.ns);
3efd9988
FG
1804 rgw_obj_index_key cur_end_marker;
1805 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
1806 const bool cur_end_marker_valid = !params.end_marker.empty();
1807
1808 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 1809 prefix_obj.set_ns(params.ns);
7c673cae 1810 string cur_prefix = prefix_obj.get_index_key_name();
11fdf7f2 1811 string after_delim_s; /* needed in !params.delim.empty() AND later */
7c673cae
FG
1812
1813 if (!params.delim.empty()) {
9f95a23c 1814 after_delim_s = cls_rgw_after_delim(params.delim);
11fdf7f2
TL
1815 /* if marker points at a common prefix, fast forward it into its
1816 * upper bound string */
224ce89b 1817 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
1818 if (delim_pos >= 0) {
1819 string s = cur_marker.name.substr(0, delim_pos);
11fdf7f2 1820 s.append(after_delim_s);
7c673cae
FG
1821 cur_marker = s;
1822 }
1823 }
1adf2230 1824
9f95a23c 1825 rgw_obj_index_key prev_marker;
f6b5b4d7 1826 for (uint16_t attempt = 1; /* empty */; ++attempt) {
b3b6e05e 1827 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
f6b5b4d7 1828 " starting attempt " << attempt << dendl;
9f95a23c
TL
1829
1830 if (attempt > 1 && !(prev_marker < cur_marker)) {
1831 // we've failed to make forward progress
b3b6e05e 1832 ldpp_dout(dpp, 0) << "RGWRados::Bucket::List::" << __func__ <<
9f95a23c
TL
1833 ": ERROR marker failed to make forward progress; attempt=" << attempt <<
1834 ", prev_marker=" << prev_marker <<
1835 ", cur_marker=" << cur_marker << dendl;
1836 break;
1837 }
1838 prev_marker = cur_marker;
1839
1840 ent_map_t ent_map;
1841 ent_map.reserve(read_ahead);
b3b6e05e
TL
1842 int r = store->cls_bucket_list_ordered(dpp,
1843 target->get_bucket_info(),
1adf2230
AA
1844 shard_id,
1845 cur_marker,
1846 cur_prefix,
9f95a23c 1847 params.delim,
1adf2230
AA
1848 read_ahead + 1 - count,
1849 params.list_versions,
9f95a23c 1850 attempt,
1adf2230
AA
1851 ent_map,
1852 &truncated,
9f95a23c
TL
1853 &cls_filtered,
1854 &cur_marker,
1855 y);
1856 if (r < 0) {
7c673cae 1857 return r;
9f95a23c 1858 }
7c673cae 1859
1adf2230 1860 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
1861 rgw_bucket_dir_entry& entry = eiter->second;
1862 rgw_obj_index_key index_key = entry.key;
7c673cae
FG
1863 rgw_obj_key obj(index_key);
1864
b3b6e05e 1865 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
9f95a23c
TL
1866 " considering entry " << entry.key << dendl;
1867
1adf2230
AA
1868 /* note that parse_raw_oid() here will not set the correct
1869 * object's instance, as rgw_obj_index_key encodes that
1870 * separately. We don't need to set the instance because it's
1871 * not needed for the checks here and we end up using the raw
1872 * entry for the return vector
7c673cae
FG
1873 */
1874 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
1875 if (!valid) {
b3b6e05e 1876 ldpp_dout(dpp, 0) << "ERROR: could not parse object name: " <<
9f95a23c 1877 obj.name << dendl;
7c673cae
FG
1878 continue;
1879 }
11fdf7f2 1880
9f95a23c 1881 bool matched_ns = (obj.ns == params.ns);
7c673cae
FG
1882 if (!params.list_versions && !entry.is_visible()) {
1883 continue;
1884 }
1885
9f95a23c 1886 if (params.enforce_ns && !matched_ns) {
7c673cae
FG
1887 if (!params.ns.empty()) {
1888 /* we've iterated past the namespace we're searching -- done now */
1889 truncated = false;
1890 goto done;
1891 }
1892
1893 /* we're not looking at the namespace this object is in, next! */
1894 continue;
1895 }
1896
1897 if (cur_end_marker_valid && cur_end_marker <= index_key) {
1898 truncated = false;
1899 goto done;
1900 }
1901
1902 if (count < max) {
9f95a23c
TL
1903 params.marker = index_key;
1904 next_marker = index_key;
7c673cae
FG
1905 }
1906
9f95a23c
TL
1907 if (params.filter &&
1908 ! params.filter->filter(obj.name, index_key.name)) {
7c673cae 1909 continue;
9f95a23c 1910 }
7c673cae 1911
1adf2230 1912 if (params.prefix.size() &&
9f95a23c 1913 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
7c673cae 1914 continue;
9f95a23c 1915 }
7c673cae
FG
1916
1917 if (!params.delim.empty()) {
9f95a23c
TL
1918 const int delim_pos = obj.name.find(params.delim, params.prefix.size());
1919 if (delim_pos >= 0) {
1920 // run either the code where delimiter filtering is done a)
1921 // in the OSD/CLS or b) here.
1922 if (cls_filtered) {
1923 // NOTE: this condition is for the newer versions of the
1924 // OSD that does filtering on the CLS side
1925
1926 // should only find one delimiter at the end if it finds any
1927 // after the prefix
1928 if (delim_pos !=
1929 int(obj.name.length() - params.delim.length())) {
b3b6e05e 1930 ldpp_dout(dpp, 0) <<
9f95a23c
TL
1931 "WARNING: found delimiter in place other than the end of "
1932 "the prefix; obj.name=" << obj.name <<
1933 ", prefix=" << params.prefix << dendl;
1934 }
1935 if (common_prefixes) {
1936 if (count >= max) {
1937 truncated = true;
1938 goto done;
1939 }
1940
1941 (*common_prefixes)[obj.name] = true;
1942 count++;
1943 }
1944
1945 continue;
1946 } else {
1947 // NOTE: this condition is for older versions of the OSD
1948 // that do not filter on the CLS side, so the following code
1949 // must do the filtering; once we reach version 16 of ceph,
1950 // this code can be removed along with the conditional that
1951 // can lead this way
1952
1953 /* extract key -with trailing delimiter- for CommonPrefix */
1954 string prefix_key =
1955 obj.name.substr(0, delim_pos + params.delim.length());
1956
1957 if (common_prefixes &&
1958 common_prefixes->find(prefix_key) == common_prefixes->end()) {
1959 if (count >= max) {
1960 truncated = true;
1961 goto done;
1962 }
1963 next_marker = prefix_key;
1964 (*common_prefixes)[prefix_key] = true;
1965
1966 count++;
1967 }
1968
1969 continue;
1970 } // if we're running an older OSD version
1971 } // if a delimiter was found after prefix
1972 } // if a delimiter was passed in
7c673cae
FG
1973
1974 if (count >= max) {
1975 truncated = true;
1976 goto done;
1977 }
1978
b3b6e05e 1979 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
9f95a23c
TL
1980 " adding entry " << entry.key << " to result" << dendl;
1981
7c673cae
FG
1982 result->emplace_back(std::move(entry));
1983 count++;
9f95a23c
TL
1984 } // eiter for loop
1985
1986 // NOTE: the following conditional is needed by older versions of
1987 // the OSD that don't do delimiter filtering on the CLS side; once
1988 // we reach version 16 of ceph, the following conditional and the
1989 // code within can be removed
1990 if (!cls_filtered && !params.delim.empty()) {
1991 int marker_delim_pos =
1992 cur_marker.name.find(params.delim, cur_prefix.size());
eafe8130 1993 if (marker_delim_pos >= 0) {
9f95a23c
TL
1994 std::string skip_after_delim =
1995 cur_marker.name.substr(0, marker_delim_pos);
eafe8130
TL
1996 skip_after_delim.append(after_delim_s);
1997
b3b6e05e 1998 ldpp_dout(dpp, 20) << "skip_after_delim=" << skip_after_delim << dendl;
eafe8130
TL
1999
2000 if (skip_after_delim > cur_marker.name) {
2001 cur_marker = skip_after_delim;
b3b6e05e 2002 ldpp_dout(dpp, 20) << "setting cur_marker="
eafe8130
TL
2003 << cur_marker.name
2004 << "[" << cur_marker.instance << "]"
2005 << dendl;
2006 }
2007 }
9f95a23c
TL
2008 } // if older osd didn't do delimiter filtering
2009
b3b6e05e 2010 ldpp_dout(dpp, 20) << "RGWRados::Bucket::List::" << __func__ <<
9f95a23c
TL
2011 " INFO end of outer loop, truncated=" << truncated <<
2012 ", count=" << count << ", attempt=" << attempt << dendl;
2013
2014 if (!truncated || count >= (max + 1) / 2) {
2015 // if we finished listing, or if we're returning at least half the
2016 // requested entries, that's enough; S3 and swift protocols allow
2017 // returning fewer than max entries
2018 break;
2019 } else if (attempt > 8 && count >= 1) {
2020 // if we've made at least 8 attempts and we have some, but very
2021 // few, results, return with what we have
2022 break;
eafe8130 2023 }
f6b5b4d7 2024 } // for (uint16_t attempt...
7c673cae
FG
2025
2026done:
9f95a23c
TL
2027
2028 if (is_truncated) {
7c673cae 2029 *is_truncated = truncated;
9f95a23c 2030 }
7c673cae
FG
2031
2032 return 0;
1adf2230
AA
2033} // list_objects_ordered
2034
2035
2036/**
2037 * Get listing of the objects in a bucket and allow the results to be out
2038 * of order.
2039 *
2040 * Even though there are key differences with the ordered counterpart,
2041 * the parameters are the same to maintain some compatability.
2042 *
2043 * max: maximum number of results to return
2044 * bucket: bucket to list contents of
2045 * prefix: only return results that match this prefix
2046 * delim: should not be set; if it is we should have indicated an error
2047 * marker: if filled in, begin the listing with this object.
2048 * end_marker: if filled in, end the listing with this object.
2049 * result: the objects are put in here.
2050 * common_prefixes: this is never filled with an unordered list; the param
2051 * is maintained for compatibility
2052 * is_truncated: if number of objects in the bucket is bigger than max, then
2053 * truncated.
2054 */
b3b6e05e
TL
2055int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
2056 int64_t max_p,
1adf2230
AA
2057 vector<rgw_bucket_dir_entry> *result,
2058 map<string, bool> *common_prefixes,
9f95a23c
TL
2059 bool *is_truncated,
2060 optional_yield y)
1adf2230
AA
2061{
2062 RGWRados *store = target->get_store();
1adf2230
AA
2063 int shard_id = target->get_shard_id();
2064
2065 int count = 0;
2066 bool truncated = true;
2067
eafe8130
TL
2068 const int64_t max = // protect against memory issues and negative vals
2069 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2070
1adf2230
AA
2071 // read a few extra in each call to cls_bucket_list_unordered in
2072 // case some are filtered out due to namespace matching, versioning,
2073 // filtering, etc.
2074 const int64_t max_read_ahead = 100;
2075 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2076
2077 result->clear();
2078
9f95a23c
TL
2079 // use a local marker; either the marker will have a previous entry
2080 // or it will be empty; either way it's OK to copy
11fdf7f2
TL
2081 rgw_obj_key marker_obj(params.marker.name,
2082 params.marker.instance,
f91f0fd5 2083 params.ns.empty() ? params.marker.ns : params.ns);
1adf2230
AA
2084 rgw_obj_index_key cur_marker;
2085 marker_obj.get_index_key(&cur_marker);
2086
11fdf7f2
TL
2087 rgw_obj_key end_marker_obj(params.end_marker.name,
2088 params.end_marker.instance,
f91f0fd5 2089 params.ns.empty() ? params.end_marker.ns : params.ns);
1adf2230
AA
2090 rgw_obj_index_key cur_end_marker;
2091 end_marker_obj.get_index_key(&cur_end_marker);
2092 const bool cur_end_marker_valid = !params.end_marker.empty();
2093
2094 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 2095 prefix_obj.set_ns(params.ns);
1adf2230
AA
2096 string cur_prefix = prefix_obj.get_index_key_name();
2097
2098 while (truncated && count <= max) {
2099 std::vector<rgw_bucket_dir_entry> ent_list;
9f95a23c
TL
2100 ent_list.reserve(read_ahead);
2101
b3b6e05e
TL
2102 int r = store->cls_bucket_list_unordered(dpp,
2103 target->get_bucket_info(),
1adf2230
AA
2104 shard_id,
2105 cur_marker,
2106 cur_prefix,
2107 read_ahead,
2108 params.list_versions,
2109 ent_list,
2110 &truncated,
9f95a23c
TL
2111 &cur_marker,
2112 y);
1adf2230
AA
2113 if (r < 0)
2114 return r;
2115
2116 // NB: while regions of ent_list will be sorted, we have no
2117 // guarantee that all items will be sorted since they can cross
2118 // shard boundaries
2119
2120 for (auto& entry : ent_list) {
2121 rgw_obj_index_key index_key = entry.key;
2122 rgw_obj_key obj(index_key);
2123
9f95a23c
TL
2124 if (count < max) {
2125 params.marker.set(index_key);
2126 next_marker.set(index_key);
2127 }
2128
1adf2230
AA
2129 /* note that parse_raw_oid() here will not set the correct
2130 * object's instance, as rgw_obj_index_key encodes that
2131 * separately. We don't need to set the instance because it's
2132 * not needed for the checks here and we end up using the raw
2133 * entry for the return vector
2134 */
2135 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2136 if (!valid) {
b3b6e05e 2137 ldpp_dout(dpp, 0) << "ERROR: could not parse object name: " <<
1adf2230
AA
2138 obj.name << dendl;
2139 continue;
2140 }
2141
2142 if (!params.list_versions && !entry.is_visible()) {
2143 continue;
2144 }
2145
2146 if (params.enforce_ns && obj.ns != params.ns) {
2147 continue;
2148 }
2149
2150 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2151 // we're not guaranteed items will come in order, so we have
2152 // to loop through all
2153 continue;
2154 }
2155
1adf2230
AA
2156 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2157 continue;
2158
2159 if (params.prefix.size() &&
2160 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
2161 continue;
2162
2163 if (count >= max) {
2164 truncated = true;
2165 goto done;
2166 }
2167
2168 result->emplace_back(std::move(entry));
2169 count++;
2170 } // for (auto& entry : ent_list)
2171 } // while (truncated && count <= max)
2172
2173done:
2174 if (is_truncated)
2175 *is_truncated = truncated;
2176
2177 return 0;
2178} // list_objects_unordered
2179
7c673cae
FG
2180
2181/**
2182 * create a rados pool, associated meta info
2183 * returns 0 on success, -ERR# otherwise.
2184 */
b3b6e05e 2185int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
7c673cae 2186{
c07f9fc5 2187 librados::IoCtx io_ctx;
28e407b8 2188 constexpr bool create = true;
b3b6e05e 2189 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
2190}
2191
9f95a23c 2192void RGWRados::create_bucket_id(string *bucket_id)
7c673cae 2193{
9f95a23c
TL
2194 uint64_t iid = instance_id();
2195 uint64_t bid = next_bucket_id();
2196 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2197 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2198 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2199 *bucket_id = buf;
2200}
7c673cae 2201
11fdf7f2 2202int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
7c673cae 2203 const string& zonegroup_id,
11fdf7f2 2204 const rgw_placement_rule& placement_rule,
7c673cae
FG
2205 const string& swift_ver_location,
2206 const RGWQuotaInfo * pquota_info,
2207 map<std::string, bufferlist>& attrs,
2208 RGWBucketInfo& info,
2209 obj_version *pobjv,
2210 obj_version *pep_objv,
2211 real_time creation_time,
2212 rgw_bucket *pmaster_bucket,
2213 uint32_t *pmaster_num_shards,
f67539c2 2214 optional_yield y,
b3b6e05e 2215 const DoutPrefixProvider *dpp,
7c673cae
FG
2216 bool exclusive)
2217{
2218#define MAX_CREATE_RETRIES 20 /* need to bound retries */
11fdf7f2 2219 rgw_placement_rule selected_placement_rule;
7c673cae
FG
2220 RGWZonePlacementInfo rule_info;
2221
2222 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2223 int ret = 0;
b3b6e05e 2224 ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
f67539c2 2225 &selected_placement_rule, &rule_info, y);
7c673cae
FG
2226 if (ret < 0)
2227 return ret;
2228
2229 if (!pmaster_bucket) {
2230 create_bucket_id(&bucket.marker);
2231 bucket.bucket_id = bucket.marker;
2232 } else {
2233 bucket.marker = pmaster_bucket->marker;
2234 bucket.bucket_id = pmaster_bucket->bucket_id;
2235 }
2236
2237 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2238
9f95a23c
TL
2239 objv_tracker.read_version.clear();
2240
7c673cae
FG
2241 if (pobjv) {
2242 objv_tracker.write_version = *pobjv;
2243 } else {
2244 objv_tracker.generate_new_write_ver(cct);
2245 }
2246
2247 info.bucket = bucket;
2248 info.owner = owner.user_id;
2249 info.zonegroup = zonegroup_id;
11fdf7f2 2250 info.placement_rule = selected_placement_rule;
7c673cae
FG
2251 info.swift_ver_location = swift_ver_location;
2252 info.swift_versioning = (!swift_ver_location.empty());
f67539c2
TL
2253
2254 init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
2255 pmaster_num_shards ?
2256 std::optional{*pmaster_num_shards} :
2257 std::nullopt,
2258 rule_info.index_type);
2259
7c673cae
FG
2260 info.requester_pays = false;
2261 if (real_clock::is_zero(creation_time)) {
2262 info.creation_time = ceph::real_clock::now();
2263 } else {
2264 info.creation_time = creation_time;
2265 }
2266 if (pquota_info) {
2267 info.quota = *pquota_info;
2268 }
2269
b3b6e05e 2270 int r = svc.bi->init_index(dpp, info);
11fdf7f2
TL
2271 if (r < 0) {
2272 return r;
2273 }
7c673cae 2274
b3b6e05e 2275 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
9f95a23c
TL
2276 if (ret == -ECANCELED) {
2277 ret = -EEXIST;
2278 }
11fdf7f2 2279 if (ret == -EEXIST) {
11fdf7f2 2280 /* we need to reread the info and return it, caller will have a use for it */
9f95a23c
TL
2281 RGWBucketInfo orig_info;
2282 r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
11fdf7f2
TL
2283 if (r < 0) {
2284 if (r == -ENOENT) {
2285 continue;
2286 }
b3b6e05e 2287 ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
11fdf7f2
TL
2288 return r;
2289 }
7c673cae 2290
11fdf7f2 2291 /* only remove it if it's a different bucket instance */
9f95a23c 2292 if (orig_info.bucket.bucket_id != bucket.bucket_id) {
b3b6e05e 2293 int r = svc.bi->clean_index(dpp, info);
9f95a23c 2294 if (r < 0) {
b3b6e05e 2295 ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
9f95a23c 2296 }
b3b6e05e 2297 r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
9f95a23c 2298 if (r < 0) {
b3b6e05e 2299 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
9f95a23c
TL
2300 /* continue anyway */
2301 }
11fdf7f2 2302 }
9f95a23c
TL
2303
2304 info = std::move(orig_info);
2305 /* ret == -EEXIST here */
11fdf7f2 2306 }
7c673cae 2307 return ret;
7c673cae
FG
2308 }
2309
11fdf7f2 2310 /* this is highly unlikely */
b3b6e05e 2311 ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
11fdf7f2 2312 return -ENOENT;
7c673cae
FG
2313}
2314
11fdf7f2 2315bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
7c673cae 2316{
11fdf7f2
TL
2317 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2318}
c07f9fc5 2319
11fdf7f2
TL
2320bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2321{
2322 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
c07f9fc5 2323
11fdf7f2 2324 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
7c673cae
FG
2325}
2326
b3b6e05e 2327int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
7c673cae
FG
2328{
2329 string oid, key;
2330 get_obj_bucket_and_oid_loc(obj, oid, key);
2331
2332 rgw_pool pool;
2333 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
b3b6e05e 2334 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
7c673cae
FG
2335 return -EIO;
2336 }
2337
b3b6e05e 2338 int r = open_pool_ctx(dpp, pool, *ioctx, false);
7c673cae
FG
2339 if (r < 0) {
2340 return r;
2341 }
2342
2343 ioctx->locator_set_key(key);
2344
2345 return 0;
2346}
2347
b3b6e05e 2348int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
7c673cae 2349{
11fdf7f2 2350 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
7c673cae
FG
2351
2352 rgw_pool pool;
2353 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
b3b6e05e 2354 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
7c673cae
FG
2355 return -EIO;
2356 }
2357
9f95a23c
TL
2358 ref->pool = svc.rados->pool(pool);
2359
b3b6e05e 2360 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
9f95a23c 2361 .set_mostly_omap(false));
7c673cae 2362 if (r < 0) {
b3b6e05e 2363 ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
7c673cae
FG
2364 return r;
2365 }
2366
9f95a23c 2367 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2368
2369 return 0;
2370}
2371
b3b6e05e 2372int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2373{
11fdf7f2 2374 ref->obj = obj;
7c673cae 2375
11fdf7f2
TL
2376 if (ref->obj.oid.empty()) {
2377 ref->obj.oid = obj.pool.to_str();
2378 ref->obj.pool = svc.zone->get_zone_params().domain_root;
7c673cae 2379 }
9f95a23c 2380 ref->pool = svc.rados->pool(obj.pool);
b3b6e05e 2381 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
9f95a23c
TL
2382 .set_mostly_omap(false));
2383 if (r < 0) {
b3b6e05e 2384 ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
7c673cae 2385 return r;
9f95a23c 2386 }
7c673cae 2387
9f95a23c 2388 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2389
2390 return 0;
2391}
2392
b3b6e05e 2393int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2394{
b3b6e05e 2395 return get_raw_obj_ref(dpp, obj, ref);
7c673cae
FG
2396}
2397
2398/*
2399 * fixes an issue where head objects were supposed to have a locator created, but ended
2400 * up without one
2401 */
b3b6e05e 2402int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
7c673cae
FG
2403{
2404 const rgw_bucket& bucket = bucket_info.bucket;
2405 string oid;
2406 string locator;
2407
2408 rgw_obj obj(bucket, key);
2409
2410 get_obj_bucket_and_oid_loc(obj, oid, locator);
2411
2412 if (locator.empty()) {
b3b6e05e 2413 ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
7c673cae
FG
2414 return 0;
2415 }
2416
2417 librados::IoCtx ioctx;
2418
b3b6e05e 2419 int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
7c673cae
FG
2420 if (ret < 0) {
2421 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2422 return ret;
2423 }
2424 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2425
2426 uint64_t size;
2427 bufferlist data;
2428
2429 struct timespec mtime_ts;
2430 map<string, bufferlist> attrs;
2431 librados::ObjectReadOperation op;
2432 op.getxattrs(&attrs, NULL);
2433 op.stat2(&size, &mtime_ts, NULL);
2434#define HEAD_SIZE 512 * 1024
2435 op.read(0, HEAD_SIZE, &data, NULL);
2436
b3b6e05e 2437 ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
7c673cae 2438 if (ret < 0) {
b3b6e05e 2439 ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
7c673cae
FG
2440 return ret;
2441 }
2442
2443 if (size > HEAD_SIZE) {
b3b6e05e 2444 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
7c673cae
FG
2445 return -EIO;
2446 }
2447
2448 if (size != data.length()) {
b3b6e05e 2449 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
7c673cae
FG
2450 return -EIO;
2451 }
2452
2453 if (copy_obj) {
2454 librados::ObjectWriteOperation wop;
2455
2456 wop.mtime2(&mtime_ts);
2457
2458 map<string, bufferlist>::iterator iter;
2459 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
2460 wop.setxattr(iter->first.c_str(), iter->second);
2461 }
2462
2463 wop.write(0, data);
2464
2465 ioctx.locator_set_key(locator);
b3b6e05e 2466 rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
7c673cae
FG
2467 }
2468
2469 if (remove_bad) {
2470 ioctx.locator_set_key(string());
2471
2472 ret = ioctx.remove(oid);
2473 if (ret < 0) {
b3b6e05e 2474 ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
7c673cae
FG
2475 return ret;
2476 }
2477 }
2478
2479 return 0;
2480}
2481
b3b6e05e
TL
2482int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
2483 librados::IoCtx& src_ioctx,
7c673cae
FG
2484 const string& src_oid, const string& src_locator,
2485 librados::IoCtx& dst_ioctx,
2486 const string& dst_oid, const string& dst_locator)
2487{
2488
2489#define COPY_BUF_SIZE (4 * 1024 * 1024)
2490 bool done = false;
2491 uint64_t chunk_size = COPY_BUF_SIZE;
2492 uint64_t ofs = 0;
2493 int ret = 0;
2494 real_time mtime;
2495 struct timespec mtime_ts;
2496 uint64_t size;
2497
2498 if (src_oid == dst_oid && src_locator == dst_locator) {
2499 return 0;
2500 }
2501
2502 src_ioctx.locator_set_key(src_locator);
2503 dst_ioctx.locator_set_key(dst_locator);
2504
2505 do {
2506 bufferlist data;
2507 ObjectReadOperation rop;
2508 ObjectWriteOperation wop;
2509
2510 if (ofs == 0) {
2511 rop.stat2(&size, &mtime_ts, NULL);
2512 mtime = real_clock::from_timespec(mtime_ts);
2513 }
2514 rop.read(ofs, chunk_size, &data, NULL);
b3b6e05e 2515 ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
7c673cae
FG
2516 if (ret < 0) {
2517 goto done_err;
2518 }
2519
2520 if (data.length() == 0) {
2521 break;
2522 }
2523
2524 if (ofs == 0) {
2525 wop.create(true); /* make it exclusive */
2526 wop.mtime2(&mtime_ts);
2527 mtime = real_clock::from_timespec(mtime_ts);
2528 }
2529 wop.write(ofs, data);
b3b6e05e 2530 ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
11fdf7f2
TL
2531 if (ret < 0) {
2532 goto done_err;
2533 }
7c673cae
FG
2534 ofs += data.length();
2535 done = data.length() != chunk_size;
2536 } while (!done);
2537
2538 if (ofs != size) {
b3b6e05e 2539 ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
7c673cae
FG
2540 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
2541 ret = -EIO;
2542 goto done_err;
2543 }
2544
2545 src_ioctx.remove(src_oid);
2546
2547 return 0;
2548
2549done_err:
11fdf7f2 2550 // TODO: clean up dst_oid if we created it
b3b6e05e 2551 ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
7c673cae
FG
2552 return ret;
2553}
2554
2555/*
2556 * fixes an issue where head objects were supposed to have a locator created, but ended
2557 * up without one
2558 */
b3b6e05e 2559int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y)
7c673cae
FG
2560{
2561 const rgw_bucket& bucket = bucket_info.bucket;
2562 rgw_obj obj(bucket, key);
2563
2564 if (need_fix) {
2565 *need_fix = false;
2566 }
2567
2568 rgw_rados_ref ref;
b3b6e05e 2569 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
2570 if (r < 0) {
2571 return r;
2572 }
2573
2574 RGWObjState *astate = NULL;
9f95a23c 2575 RGWObjectCtx rctx(this->store);
b3b6e05e 2576 r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
2577 if (r < 0)
2578 return r;
2579
9f95a23c 2580 if (astate->manifest) {
7c673cae 2581 RGWObjManifest::obj_iterator miter;
9f95a23c 2582 RGWObjManifest& manifest = *astate->manifest;
b3b6e05e 2583 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
f67539c2 2584 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store);
7c673cae
FG
2585 rgw_obj loc;
2586 string oid;
2587 string locator;
2588
9f95a23c 2589 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
7c673cae
FG
2590
2591 if (loc.key.ns.empty()) {
2592 /* continue, we're only interested in tail objects */
2593 continue;
2594 }
2595
9f95a23c
TL
2596 auto& ioctx = ref.pool.ioctx();
2597
7c673cae 2598 get_obj_bucket_and_oid_loc(loc, oid, locator);
9f95a23c 2599 ref.pool.ioctx().locator_set_key(locator);
7c673cae 2600
b3b6e05e 2601 ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
7c673cae 2602
9f95a23c 2603 r = ioctx.stat(oid, NULL, NULL);
7c673cae
FG
2604 if (r != -ENOENT) {
2605 continue;
2606 }
2607
2608 string bad_loc;
2609 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
2610
2611 /* create a new ioctx with the bad locator */
2612 librados::IoCtx src_ioctx;
9f95a23c 2613 src_ioctx.dup(ioctx);
7c673cae
FG
2614 src_ioctx.locator_set_key(bad_loc);
2615
2616 r = src_ioctx.stat(oid, NULL, NULL);
2617 if (r != 0) {
2618 /* cannot find a broken part */
2619 continue;
2620 }
b3b6e05e 2621 ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
7c673cae
FG
2622 if (need_fix) {
2623 *need_fix = true;
2624 }
2625 if (fix) {
b3b6e05e 2626 r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
7c673cae 2627 if (r < 0) {
b3b6e05e 2628 ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
7c673cae
FG
2629 }
2630 }
2631 }
2632 }
2633
2634 return 0;
2635}
2636
f64942e4
AA
2637int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2638 const rgw_obj& obj,
b3b6e05e
TL
2639 RGWBucketInfo* bucket_info_out,
2640 const DoutPrefixProvider *dpp)
7c673cae
FG
2641{
2642 bucket = _bucket;
2643
11fdf7f2 2644 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
2645
2646 RGWBucketInfo bucket_info;
f64942e4
AA
2647 RGWBucketInfo* bucket_info_p =
2648 bucket_info_out ? bucket_info_out : &bucket_info;
2649
b3b6e05e 2650 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
7c673cae
FG
2651 if (ret < 0) {
2652 return ret;
2653 }
2654
9f95a23c
TL
2655 string oid;
2656
b3b6e05e 2657 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae 2658 if (ret < 0) {
b3b6e05e 2659 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
7c673cae
FG
2660 return ret;
2661 }
b3b6e05e 2662 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2663
2664 return 0;
2665}
2666
f64942e4 2667int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
f67539c2 2668 int sid, const rgw::bucket_index_layout_generation& idx_layout,
b3b6e05e
TL
2669 RGWBucketInfo* bucket_info_out,
2670 const DoutPrefixProvider *dpp)
7c673cae
FG
2671{
2672 bucket = _bucket;
2673 shard_id = sid;
2674
11fdf7f2 2675 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 2676
f67539c2 2677
7c673cae 2678 RGWBucketInfo bucket_info;
f64942e4
AA
2679 RGWBucketInfo* bucket_info_p =
2680 bucket_info_out ? bucket_info_out : &bucket_info;
b3b6e05e 2681 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
7c673cae
FG
2682 if (ret < 0) {
2683 return ret;
2684 }
2685
9f95a23c
TL
2686 string oid;
2687
b3b6e05e 2688 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, shard_id, idx_layout, &bucket_obj);
7c673cae 2689 if (ret < 0) {
b3b6e05e 2690 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
7c673cae
FG
2691 return ret;
2692 }
b3b6e05e 2693 ldpp_dout(dpp, 20) << " bucket index oid: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2694
2695 return 0;
2696}
2697
b3b6e05e 2698int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
a8e16298
TL
2699 const rgw_obj& obj)
2700{
2701 bucket = bucket_info.bucket;
2702
b3b6e05e 2703 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
9f95a23c
TL
2704 obj.get_hash_object(),
2705 &bucket_obj,
2706 &shard_id);
a8e16298 2707 if (ret < 0) {
b3b6e05e 2708 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
a8e16298
TL
2709 return ret;
2710 }
b3b6e05e 2711 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
a8e16298
TL
2712
2713 return 0;
2714}
2715
b3b6e05e 2716int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int sid)
b32b8144
FG
2717{
2718 bucket = bucket_info.bucket;
2719 shard_id = sid;
2720
b3b6e05e 2721 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id, idx_layout, &bucket_obj);
b32b8144 2722 if (ret < 0) {
b3b6e05e 2723 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
b32b8144
FG
2724 return ret;
2725 }
b3b6e05e 2726 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
b32b8144
FG
2727
2728 return 0;
2729}
2730
7c673cae
FG
2731
2732/* Execute @handler on last item in bucket listing for bucket specified
2733 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2734 * to objects matching these criterias. */
b3b6e05e
TL
2735int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
2736 RGWBucketInfo& bucket_info,
7c673cae
FG
2737 const std::string& obj_prefix,
2738 const std::string& obj_delim,
2739 std::function<int(const rgw_bucket_dir_entry&)> handler)
2740{
2741 RGWRados::Bucket target(this, bucket_info);
2742 RGWRados::Bucket::List list_op(&target);
2743
2744 list_op.params.prefix = obj_prefix;
2745 list_op.params.delim = obj_delim;
2746
b3b6e05e 2747 ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
7c673cae
FG
2748 << ", obj_prefix=" << obj_prefix
2749 << ", obj_delim=" << obj_delim
2750 << dendl;
2751
2752 bool is_truncated = false;
2753
2754 boost::optional<rgw_bucket_dir_entry> last_entry;
2755 /* We need to rewind to the last object in a listing. */
2756 do {
2757 /* List bucket entries in chunks. */
2758 static constexpr int MAX_LIST_OBJS = 100;
2759 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
2760
b3b6e05e 2761 int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
9f95a23c 2762 &is_truncated, null_yield);
7c673cae
FG
2763 if (ret < 0) {
2764 return ret;
2765 } else if (!entries.empty()) {
2766 last_entry = entries.back();
2767 }
2768 } while (is_truncated);
2769
2770 if (last_entry) {
2771 return handler(*last_entry);
2772 }
2773
2774 /* Empty listing - no items we can run handler on. */
2775 return 0;
2776}
2777
f67539c2
TL
2778bool RGWRados::swift_versioning_enabled(rgw::sal::RGWBucket* bucket) const
2779{
2780 return bucket->get_info().has_swift_versioning() &&
2781 bucket->get_info().swift_ver_location.size();
2782}
7c673cae
FG
2783
2784int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
2785 const rgw_user& user,
f67539c2
TL
2786 rgw::sal::RGWBucket* bucket,
2787 rgw::sal::RGWObject* obj,
9f95a23c
TL
2788 const DoutPrefixProvider *dpp,
2789 optional_yield y)
7c673cae 2790{
f67539c2 2791 if (! swift_versioning_enabled(bucket)) {
7c673cae
FG
2792 return 0;
2793 }
2794
f67539c2 2795 obj->set_atomic(&obj_ctx);
7c673cae
FG
2796
2797 RGWObjState * state = nullptr;
b3b6e05e 2798 int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj->get_obj(), &state, false, y);
7c673cae
FG
2799 if (r < 0) {
2800 return r;
2801 }
2802
2803 if (!state->exists) {
2804 return 0;
2805 }
2806
f67539c2 2807 const string& src_name = obj->get_oid();
7c673cae
FG
2808 char buf[src_name.size() + 32];
2809 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
2810 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
2811 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
2812
2813 RGWBucketInfo dest_bucket_info;
2814
f67539c2 2815 r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
7c673cae 2816 if (r < 0) {
b3b6e05e 2817 ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
7c673cae
FG
2818 if (r == -ENOENT) {
2819 return -ERR_PRECONDITION_FAILED;
2820 }
2821 return r;
2822 }
2823
f67539c2 2824 if (dest_bucket_info.owner != bucket->get_info().owner) {
7c673cae
FG
2825 return -ERR_PRECONDITION_FAILED;
2826 }
2827
f67539c2
TL
2828 rgw::sal::RGWRadosBucket dest_bucket(store, dest_bucket_info);
2829 rgw::sal::RGWRadosObject dest_obj(store, rgw_obj_key(buf), &dest_bucket);
11fdf7f2
TL
2830
2831 if (dest_bucket_info.versioning_enabled()){
f67539c2 2832 dest_obj.gen_rand_obj_instance_name();
11fdf7f2
TL
2833 }
2834
f67539c2 2835 dest_obj.set_atomic(&obj_ctx);
7c673cae 2836
9f95a23c 2837 rgw_zone_id no_zone;
7c673cae
FG
2838
2839 r = copy_obj(obj_ctx,
2840 user,
7c673cae
FG
2841 NULL, /* req_info *info */
2842 no_zone,
f67539c2 2843 &dest_obj,
7c673cae 2844 obj,
f67539c2
TL
2845 &dest_bucket,
2846 bucket,
2847 bucket->get_placement_rule(),
7c673cae
FG
2848 NULL, /* time_t *src_mtime */
2849 NULL, /* time_t *mtime */
2850 NULL, /* const time_t *mod_ptr */
2851 NULL, /* const time_t *unmod_ptr */
2852 false, /* bool high_precision_time */
2853 NULL, /* const char *if_match */
2854 NULL, /* const char *if_nomatch */
2855 RGWRados::ATTRSMOD_NONE,
2856 true, /* bool copy_if_newer */
2857 state->attrset,
11fdf7f2 2858 RGWObjCategory::Main,
7c673cae
FG
2859 0, /* uint64_t olh_epoch */
2860 real_time(), /* time_t delete_at */
2861 NULL, /* string *version_id */
2862 NULL, /* string *ptag */
2863 NULL, /* string *petag */
7c673cae 2864 NULL, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2865 NULL, /* void *progress_data */
2866 dpp,
2867 null_yield);
7c673cae
FG
2868 if (r == -ECANCELED || r == -ENOENT) {
2869 /* Has already been overwritten, meaning another rgw process already
2870 * copied it out */
2871 return 0;
2872 }
2873
2874 return r;
2875}
2876
9f95a23c 2877int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
7c673cae 2878 const rgw_user& user,
f67539c2
TL
2879 rgw::sal::RGWBucket* bucket,
2880 rgw::sal::RGWObject* obj,
9f95a23c
TL
2881 bool& restored, /* out */
2882 const DoutPrefixProvider *dpp)
7c673cae 2883{
f67539c2 2884 if (! swift_versioning_enabled(bucket)) {
7c673cae
FG
2885 return 0;
2886 }
2887
2888 /* Bucket info of the bucket that stores previous versions of our object. */
2889 RGWBucketInfo archive_binfo;
2890
f67539c2
TL
2891 int ret = get_bucket_info(&svc, bucket->get_tenant(),
2892 bucket->get_info().swift_ver_location,
2893 archive_binfo, nullptr, null_yield, nullptr);
7c673cae
FG
2894 if (ret < 0) {
2895 return ret;
2896 }
2897
2898 /* Abort the operation if the bucket storing our archive belongs to someone
2899 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
2900 * into consideration. For we can live with that.
2901 *
2902 * TODO: delegate this check to un upper layer and compare with ACLs. */
f67539c2 2903 if (bucket->get_info().owner != archive_binfo.owner) {
7c673cae
FG
2904 return -EPERM;
2905 }
2906
2907 /* This code will be executed on latest version of the object. */
2908 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
9f95a23c 2909 rgw_zone_id no_zone;
7c673cae
FG
2910
2911 /* We don't support object versioning of Swift API on those buckets that
2912 * are already versioned using the S3 mechanism. This affects also bucket
2913 * storing archived objects. Otherwise the delete operation would create
2914 * a deletion marker. */
2915 if (archive_binfo.versioned()) {
2916 restored = false;
2917 return -ERR_PRECONDITION_FAILED;
2918 }
2919
2920 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
2921 * irrelevant and may be safely skipped. */
2922 std::map<std::string, ceph::bufferlist> no_attrs;
2923
f67539c2
TL
2924 rgw::sal::RGWRadosBucket archive_bucket(store, archive_binfo);
2925 rgw::sal::RGWRadosObject archive_obj(store, entry.key, &archive_bucket);
11fdf7f2 2926
f67539c2
TL
2927 if (bucket->versioning_enabled()){
2928 obj->gen_rand_obj_instance_name();
11fdf7f2
TL
2929 }
2930
f67539c2
TL
2931 archive_obj.set_atomic(&obj_ctx);
2932 obj->set_atomic(&obj_ctx);
7c673cae
FG
2933
2934 int ret = copy_obj(obj_ctx,
2935 user,
7c673cae
FG
2936 nullptr, /* req_info *info */
2937 no_zone,
2938 obj, /* dest obj */
f67539c2
TL
2939 &archive_obj, /* src obj */
2940 bucket, /* dest bucket info */
2941 &archive_bucket, /* src bucket info */
2942 bucket->get_placement_rule(), /* placement_rule */
7c673cae
FG
2943 nullptr, /* time_t *src_mtime */
2944 nullptr, /* time_t *mtime */
2945 nullptr, /* const time_t *mod_ptr */
2946 nullptr, /* const time_t *unmod_ptr */
2947 false, /* bool high_precision_time */
2948 nullptr, /* const char *if_match */
2949 nullptr, /* const char *if_nomatch */
2950 RGWRados::ATTRSMOD_NONE,
2951 true, /* bool copy_if_newer */
2952 no_attrs,
11fdf7f2 2953 RGWObjCategory::Main,
7c673cae
FG
2954 0, /* uint64_t olh_epoch */
2955 real_time(), /* time_t delete_at */
2956 nullptr, /* string *version_id */
2957 nullptr, /* string *ptag */
2958 nullptr, /* string *petag */
7c673cae 2959 nullptr, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2960 nullptr, /* void *progress_data */
2961 dpp,
2962 null_yield);
7c673cae
FG
2963 if (ret == -ECANCELED || ret == -ENOENT) {
2964 /* Has already been overwritten, meaning another rgw process already
2965 * copied it out */
2966 return 0;
2967 } else if (ret < 0) {
2968 return ret;
2969 } else {
2970 restored = true;
2971 }
2972
2973 /* Need to remove the archived copy. */
b3b6e05e 2974 ret = delete_obj(dpp, obj_ctx, archive_binfo, archive_obj.get_obj(),
7c673cae
FG
2975 archive_binfo.versioning_status());
2976
2977 return ret;
2978 };
2979
f67539c2 2980 const std::string& obj_name = obj->get_oid();
7c673cae
FG
2981 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
2982 % obj_name);
2983
b3b6e05e 2984 return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
7c673cae
FG
2985 handler);
2986}
2987
b3b6e05e
TL
/*
 * Write the head object's metadata (and optionally its data) in one librados
 * write operation and record the change in the bucket index.
 *
 * Visible steps, in order:
 *  - read the current object state and resolve the head object reference;
 *  - queue guards/xattrs/mtime/(optional) write_full on `op`;
 *  - prepare the bucket index op (unless already prepared), run `op`,
 *    then complete the index op;
 *  - optionally link the OLH, add an expiration hint, update quota stats.
 *
 * size / accounted_size: passed to index_op->complete(); accounted_size is
 *                        also what is fed to the quota handler.
 * attrs:                 xattrs to set; entries with empty values are skipped.
 *                        RGW_ATTR_MANIFEST is erased from it when
 *                        meta.manifest is set.
 * assume_noent:          forwarded to get_state(); when the write then fails
 *                        with -EEXIST the state is invalidated and -EEXIST is
 *                        returned without cancelling the index op.
 * modify_tail:           forwarded to prepare_atomic_modification().
 * _index_op:             really a RGWRados::Bucket::UpdateIndex *.
 *
 * Returns 0 on success (or when there was nothing to write), a negative
 * error otherwise. On the done_cancel path meta.canceled is set to true and
 * some race errors are mapped to 0 or -ERR_PRECONDITION_FAILED, see below.
 */
2988int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
2989 uint64_t size, uint64_t accounted_size,
181888fb
FG
2990 map<string, bufferlist>& attrs,
2991 bool assume_noent, bool modify_tail,
9f95a23c 2992 void *_index_op, optional_yield y)
7c673cae
FG
2993{
2994 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
2995 RGWRados *store = target->get_store();
2996
2997 ObjectWriteOperation op;
11fdf7f2
TL
2998#ifdef WITH_LTTNG
2999 const struct req_state* s = get_req_state();
3000 string req_id;
3001 if (!s) {
3002 // fake req_id
3003 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
3004 } else {
3005 req_id = s->req_id;
3006 }
3007#endif
7c673cae
FG
3008
3009 RGWObjState *state;
b3b6e05e 3010 int r = target->get_state(dpp, &state, false, y, assume_noent);
7c673cae
FG
3011 if (r < 0)
3012 return r;
3013
3014 rgw_obj& obj = target->get_obj();
3015
3016 if (obj.get_oid().empty()) {
b3b6e05e 3017 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
7c673cae
FG
3018 return -EIO;
3019 }
3020
224ce89b 3021 rgw_rados_ref ref;
b3b6e05e 3022 r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
7c673cae
FG
3023 if (r < 0)
3024 return r;
3025
3026 bool is_olh = state->is_olh;
3027
// reset_obj is true whenever the PUT_OBJ_CREATE bit is set in meta.flags.
3028 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
3029
// Fall back to the index op's tag when the caller did not provide one.
3030 const string *ptag = meta.ptag;
3031 if (!ptag && !index_op->get_optag()->empty()) {
3032 ptag = index_op->get_optag();
3033 }
b3b6e05e 3034 r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
7c673cae
FG
3035 if (r < 0)
3036 return r;
3037
3038 if (real_clock::is_zero(meta.set_mtime)) {
3039 meta.set_mtime = real_clock::now();
3040 }
3041
// Bucket has an object-lock rule and this is a plain create (flags equal
// PUT_OBJ_CREATE exactly): if the caller supplied no retention attr, derive
// one from the bucket rule and the mtime being set.
eafe8130
TL
3042 if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
3043 auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
3044 if (iter == attrs.end()) {
3045 real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
3046 string mode = target->bucket_info.obj_lock.get_mode();
3047 RGWObjectRetention obj_retention(mode, lock_until_date);
3048 bufferlist bl;
3049 obj_retention.encode(bl);
3050 op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
3051 }
3052 }
3053
7c673cae
FG
3054 if (state->is_olh) {
3055 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3056 }
3057
3058 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3059 op.mtime2(&mtime_ts);
3060
// NOTE(review): the comment below talks about removing the object, but the
// code issues write_full() -- confirm which one is intended.
3061 if (meta.data) {
3062 /* if we want to overwrite the data, we also want to overwrite the
3063 xattrs, so just remove the object */
3064 op.write_full(*meta.data);
3065 }
3066
// Values picked up from attrs/manifest below and later handed to
// index_op->complete().
3067 string etag;
3068 string content_type;
3069 bufferlist acl_bl;
11fdf7f2 3070 string storage_class;
7c673cae
FG
3071
3072 map<string, bufferlist>::iterator iter;
3073 if (meta.rmattrs) {
3074 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3075 const string& name = iter->first;
3076 op.rmxattr(name.c_str());
3077 }
3078 }
3079
3080 if (meta.manifest) {
11fdf7f2
TL
3081 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3082
7c673cae
FG
3083 /* remove existing manifest attr */
3084 iter = attrs.find(RGW_ATTR_MANIFEST);
3085 if (iter != attrs.end())
3086 attrs.erase(iter);
3087
3088 bufferlist bl;
11fdf7f2 3089 encode(*meta.manifest, bl);
7c673cae
FG
3090 op.setxattr(RGW_ATTR_MANIFEST, bl);
3091 }
3092
// Queue every non-empty attr as an xattr; remember etag, content type and
// ACL on the way.
3093 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3094 const string& name = iter->first;
3095 bufferlist& bl = iter->second;
3096
3097 if (!bl.length())
3098 continue;
3099
3100 op.setxattr(name.c_str(), bl);
3101
3102 if (name.compare(RGW_ATTR_ETAG) == 0) {
11fdf7f2 3103 etag = rgw_bl_str(bl);
7c673cae 3104 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
11fdf7f2 3105 content_type = rgw_bl_str(bl);
7c673cae
FG
3106 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3107 acl_bl = bl;
3108 }
3109 }
3110 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3111 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3112 }
3113
3114 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3115 bufferlist bl;
11fdf7f2 3116 encode(store->svc.zone->get_zone_short_id(), bl);
7c673cae
FG
3117 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3118 }
3119
11fdf7f2
TL
3120 if (!storage_class.empty()) {
3121 bufferlist bl;
3122 bl.append(storage_class);
3123 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3124 }
3125
// Nothing was queued on the op: nothing to send.
7c673cae
FG
3126 if (!op.size())
3127 return 0;
3128
3129 uint64_t epoch;
3130 int64_t poolid;
224ce89b
WB
3131 bool orig_exists;
3132 uint64_t orig_size;
3133
// orig_exists/orig_size feed the quota update at the end of the function.
3134 if (!reset_obj) { //Multipart upload, it has immutable head.
3135 orig_exists = false;
3136 orig_size = 0;
3137 } else {
3138 orig_exists = state->exists;
3139 orig_size = state->accounted_size;
3140 }
7c673cae 3141
91327a77
AA
3142 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3143 !obj.key.instance.empty();
7c673cae
FG
3144
3145 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3146
3147 if (versioned_op) {
3148 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3149 }
3150
// The index op may already be prepared when write_meta() calls us a second
// time with the same index_op.
3151 if (!index_op->is_prepared()) {
11fdf7f2 3152 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
b3b6e05e 3153 r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
11fdf7f2 3154 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
7c673cae
FG
3155 if (r < 0)
3156 return r;
3157 }
3158
9f95a23c
TL
3159 auto& ioctx = ref.pool.ioctx();
3160
11fdf7f2 3161 tracepoint(rgw_rados, operate_enter, req_id.c_str());
b3b6e05e 3162 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
11fdf7f2 3163 tracepoint(rgw_rados, operate_exit, req_id.c_str());
7c673cae
FG
3164 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3165 or -ENOENT if was removed, or -EEXIST if it did not exist
3166 before and now it does */
3167 if (r == -EEXIST && assume_noent) {
3168 target->invalidate_state();
3169 return r;
3170 }
3171 goto done_cancel;
3172 }
3173
9f95a23c
TL
3174 epoch = ioctx.get_last_version();
3175 poolid = ioctx.get_id();
7c673cae 3176
// A failure here is only logged; r is overwritten by complete() below.
b3b6e05e 3177 r = target->complete_atomic_modification(dpp);
7c673cae 3178 if (r < 0) {
b3b6e05e 3179 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7c673cae
FG
3180 }
3181
11fdf7f2 3182 tracepoint(rgw_rados, complete_enter, req_id.c_str());
b3b6e05e 3183 r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
11fdf7f2
TL
3184 meta.set_mtime, etag, content_type,
3185 storage_class, &acl_bl,
3186 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3187 tracepoint(rgw_rados, complete_exit, req_id.c_str());
7c673cae
FG
3188 if (r < 0)
3189 goto done_cancel;
3190
3191 if (meta.mtime) {
3192 *meta.mtime = meta.set_mtime;
3193 }
3194
3195 /* note that index_op was using state so we couldn't invalidate it earlier */
3196 target->invalidate_state();
3197 state = NULL;
3198
91327a77 3199 if (versioned_op && meta.olh_epoch) {
b3b6e05e 3200 r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
7c673cae
FG
3201 if (r < 0) {
3202 return r;
3203 }
3204 }
3205
3206 if (!real_clock::is_zero(meta.delete_at)) {
3207 rgw_obj_index_key obj_key;
3208 obj.key.get_index_key(&obj_key);
3209
b3b6e05e 3210 r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
9f95a23c 3211 obj.bucket.bucket_id, obj_key);
7c673cae 3212 if (r < 0) {
b3b6e05e 3213 ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7c673cae
FG
3214 /* ignoring error, nothing we can do at this point */
3215 }
3216 }
3217 meta.canceled = false;
3218
3219 /* update quota cache */
3efd9988
FG
3220 if (meta.completeMultipart){
3221 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3222 0, orig_size);
3223 }
3224 else {
3225 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3226 accounted_size, orig_size);
3227 }
7c673cae
FG
3228 return 0;
3229
// Failure path after the index op was prepared: cancel it (a cancel failure
// is only logged) and translate r according to the if_match/if_nomatch
// conditions of the request.
3230done_cancel:
b3b6e05e 3231 int ret = index_op->cancel(dpp);
7c673cae 3232 if (ret < 0) {
b3b6e05e 3233 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7c673cae
FG
3234 }
3235
3236 meta.canceled = true;
3237
3238 /* we lost in a race. There are a few options:
3239 * - existing object was rewritten (ECANCELED)
3240 * - non existing object was created (EEXIST)
3241 * - object was removed (ENOENT)
3242 * should treat it as a success
3243 */
3244 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3245 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3246 r = 0;
3247 }
3248 } else {
3249 if (meta.if_match != NULL) {
3250 // only overwrite existing object
3251 if (strcmp(meta.if_match, "*") == 0) {
3252 if (r == -ENOENT) {
3253 r = -ERR_PRECONDITION_FAILED;
3254 } else if (r == -ECANCELED) {
3255 r = 0;
3256 }
3257 }
3258 }
3259
3260 if (meta.if_nomatch != NULL) {
3261 // only create a new object
3262 if (strcmp(meta.if_nomatch, "*") == 0) {
3263 if (r == -EEXIST) {
3264 r = -ERR_PRECONDITION_FAILED;
3265 } else if (r == -ENOENT) {
3266 r = 0;
3267 }
3268 }
3269 }
3270 }
3271
3272 return r;
3273}
3274
b3b6e05e 3275int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
9f95a23c 3276 map<string, bufferlist>& attrs, optional_yield y)
7c673cae
FG
3277{
3278 RGWBucketInfo& bucket_info = target->get_bucket_info();
3279
3280 RGWRados::Bucket bop(target->get_store(), bucket_info);
3281 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
3282 index_op.set_zones_trace(meta.zones_trace);
3283
7c673cae
FG
3284 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3285 int r;
3286 if (assume_noent) {
b3b6e05e 3287 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3288 if (r == -EEXIST) {
3289 assume_noent = false;
3290 }
3291 }
3292 if (!assume_noent) {
b3b6e05e 3293 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3294 }
3295 return r;
3296}
3297
11fdf7f2 3298class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
7c673cae 3299{
b3b6e05e 3300 const DoutPrefixProvider *dpp;
7c673cae
FG
3301 CephContext* cct;
3302 rgw_obj obj;
11fdf7f2 3303 rgw::putobj::DataProcessor *filter;
7c673cae 3304 boost::optional<RGWPutObj_Compress>& compressor;
adb31ebb
TL
3305 bool try_etag_verify;
3306 rgw::putobj::etag_verifier_ptr etag_verifier;
11fdf7f2 3307 boost::optional<rgw::putobj::ChunkProcessor> buffering;
7c673cae 3308 CompressorRef& plugin;
11fdf7f2 3309 rgw::putobj::ObjectProcessor *processor;
7c673cae
FG
3310 void (*progress_cb)(off_t, void *);
3311 void *progress_data;
adb31ebb
TL
3312 bufferlist extra_data_bl, manifest_bl;
3313 std::optional<RGWCompressionInfo> compression_info;
11fdf7f2
TL
3314 uint64_t extra_data_left{0};
3315 bool need_to_process_attrs{true};
3316 uint64_t data_len{0};
7c673cae 3317 map<string, bufferlist> src_attrs;
11fdf7f2
TL
3318 uint64_t ofs{0};
3319 uint64_t lofs{0}; /* logical ofs */
9f95a23c 3320 std::function<int(map<string, bufferlist>&)> attrs_handler;
7c673cae 3321public:
b3b6e05e
TL
3322 RGWRadosPutObj(const DoutPrefixProvider *dpp,
3323 CephContext* cct,
7c673cae
FG
3324 CompressorRef& plugin,
3325 boost::optional<RGWPutObj_Compress>& compressor,
11fdf7f2 3326 rgw::putobj::ObjectProcessor *p,
7c673cae 3327 void (*_progress_cb)(off_t, void *),
11fdf7f2 3328 void *_progress_data,
9f95a23c 3329 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
b3b6e05e 3330 dpp(dpp),
7c673cae
FG
3331 cct(cct),
3332 filter(p),
3333 compressor(compressor),
adb31ebb 3334 try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
7c673cae
FG
3335 plugin(plugin),
3336 processor(p),
7c673cae
FG
3337 progress_cb(_progress_cb),
3338 progress_data(_progress_data),
11fdf7f2 3339 attrs_handler(_attrs_handler) {}
7c673cae
FG
3340
3341 int process_attrs(void) {
3342 if (extra_data_bl.length()) {
3343 JSONParser jp;
3344 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
b3b6e05e 3345 ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7c673cae
FG
3346 return -EIO;
3347 }
3348
3349 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3350
adb31ebb
TL
3351 auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
3352 if (iter != src_attrs.end()) {
3353 const bufferlist bl = std::move(iter->second);
3354 src_attrs.erase(iter); // don't preserve source compression info
3355
3356 if (try_etag_verify) {
3357 // if we're trying to verify etags, we need to convert compressed
3358 // ranges in the manifest back into logical multipart part offsets
3359 RGWCompressionInfo info;
3360 bool compressed = false;
3361 int r = rgw_compression_info_from_attr(bl, compressed, info);
3362 if (r < 0) {
b3b6e05e 3363 ldpp_dout(dpp, 4) << "failed to decode compression info, "
adb31ebb
TL
3364 "disabling etag verification" << dendl;
3365 try_etag_verify = false;
3366 } else if (compressed) {
3367 compression_info = std::move(info);
3368 }
3369 }
3370 }
3371 /* We need the manifest to recompute the ETag for verification */
3372 iter = src_attrs.find(RGW_ATTR_MANIFEST);
3373 if (iter != src_attrs.end()) {
3374 manifest_bl = std::move(iter->second);
3375 src_attrs.erase(iter);
3376 }
a8e16298
TL
3377
3378 // filter out olh attributes
adb31ebb 3379 iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
a8e16298
TL
3380 while (iter != src_attrs.end()) {
3381 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3382 break;
3383 }
3384 iter = src_attrs.erase(iter);
3385 }
7c673cae
FG
3386 }
3387
11fdf7f2
TL
3388 int ret = attrs_handler(src_attrs);
3389 if (ret < 0) {
3390 return ret;
3391 }
3392
7c673cae
FG
3393 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3394 //do not compress if object is encrypted
3395 compressor = boost::in_place(cct, plugin, filter);
11fdf7f2
TL
3396 // add a filter that buffers data so we don't try to compress tiny blocks.
3397 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3398 // compression ratio
28e407b8
AA
3399 constexpr unsigned buffer_size = 512 * 1024;
3400 buffering = boost::in_place(&*compressor, buffer_size);
3401 filter = &*buffering;
7c673cae 3402 }
11fdf7f2 3403
adb31ebb
TL
3404 /*
3405 * Presently we don't support ETag based verification if encryption is
3406 * requested. We can enable simultaneous support once we have a mechanism
3407 * to know the sequence in which the filters must be applied.
3408 */
3409 if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
b3b6e05e 3410 ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
adb31ebb
TL
3411 compression_info,
3412 etag_verifier);
3413 if (ret < 0) {
b3b6e05e 3414 ldpp_dout(dpp, 4) << "failed to initial etag verifier, "
adb31ebb
TL
3415 "disabling etag verification" << dendl;
3416 } else {
3417 filter = etag_verifier.get();
3418 }
3419 }
3420
11fdf7f2
TL
3421 need_to_process_attrs = false;
3422
7c673cae
FG
3423 return 0;
3424 }
3425
11fdf7f2 3426 int handle_data(bufferlist& bl, bool *pause) override {
7c673cae 3427 if (progress_cb) {
11fdf7f2 3428 progress_cb(data_len, progress_data);
7c673cae 3429 }
b32b8144 3430 if (extra_data_left) {
11fdf7f2 3431 uint64_t extra_len = bl.length();
b32b8144
FG
3432 if (extra_len > extra_data_left)
3433 extra_len = extra_data_left;
7c673cae
FG
3434
3435 bufferlist extra;
3436 bl.splice(0, extra_len, &extra);
3437 extra_data_bl.append(extra);
3438
b32b8144
FG
3439 extra_data_left -= extra_len;
3440 if (extra_data_left == 0) {
7c673cae
FG
3441 int res = process_attrs();
3442 if (res < 0)
3443 return res;
3444 }
11fdf7f2 3445 ofs += extra_len;
7c673cae
FG
3446 if (bl.length() == 0) {
3447 return 0;
3448 }
3449 }
11fdf7f2
TL
3450 if (need_to_process_attrs) {
3451 /* need to call process_attrs() even if we don't get any attrs,
3452 * need it to call attrs_handler().
3453 */
3454 int res = process_attrs();
3455 if (res < 0) {
3456 return res;
3457 }
3458 }
7c673cae 3459
11fdf7f2 3460 ceph_assert(uint64_t(ofs) >= extra_data_len);
7c673cae 3461
11fdf7f2
TL
3462 uint64_t size = bl.length();
3463 ofs += size;
7c673cae 3464
11fdf7f2
TL
3465 const uint64_t lofs = data_len;
3466 data_len += size;
7c673cae 3467
11fdf7f2 3468 return filter->process(std::move(bl), lofs);
7c673cae
FG
3469 }
3470
28e407b8 3471 int flush() {
11fdf7f2 3472 return filter->process({}, data_len);
28e407b8
AA
3473 }
3474
7c673cae
FG
3475 bufferlist& get_extra_data() { return extra_data_bl; }
3476
3477 map<string, bufferlist>& get_attrs() { return src_attrs; }
3478
3479 void set_extra_data_len(uint64_t len) override {
b32b8144 3480 extra_data_left = len;
11fdf7f2 3481 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
7c673cae
FG
3482 }
3483
3484 uint64_t get_data_len() {
3485 return data_len;
3486 }
adb31ebb
TL
3487
3488 std::string get_verifier_etag() {
3489 if (etag_verifier) {
3490 etag_verifier->calculate_etag();
3491 return etag_verifier->get_calculated_etag();
3492 } else {
3493 return "";
3494 }
3495 }
7c673cae
FG
3496};
3497
3498/*
3499 * prepare attrset depending on attrs_mod.
3500 */
3501static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3502 map<string, bufferlist>& attrs,
3503 RGWRados::AttrsMod attrs_mod)
3504{
3505 switch (attrs_mod) {
3506 case RGWRados::ATTRSMOD_NONE:
3507 attrs = src_attrs;
3508 break;
3509 case RGWRados::ATTRSMOD_REPLACE:
3510 if (!attrs[RGW_ATTR_ETAG].length()) {
3511 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3512 }
181888fb
FG
3513 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3514 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3515 if (ttiter != src_attrs.end()) {
3516 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3517 }
3518 }
7c673cae
FG
3519 break;
3520 case RGWRados::ATTRSMOD_MERGE:
3521 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3522 if (attrs.find(it->first) == attrs.end()) {
3523 attrs[it->first] = it->second;
3524 }
3525 }
3526 break;
3527 }
3528}
3529
f67539c2 3530int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, rgw::sal::RGWObject* obj, const DoutPrefixProvider *dpp, optional_yield y)
7c673cae 3531{
9f95a23c 3532 RGWObjectCtx rctx(this->store);
f67539c2 3533 rgw::sal::RGWRadosBucket bucket(store, dest_bucket_info);
7c673cae 3534
f67539c2 3535 return obj->copy_obj_data(rctx, &bucket, obj, 0, NULL, dpp, y);
7c673cae
FG
3536}
3537
3538struct obj_time_weight {
3539 real_time mtime;
3540 uint32_t zone_short_id;
3541 uint64_t pg_ver;
3542 bool high_precision;
3543
3544 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3545
3546 bool compare_low_precision(const obj_time_weight& rhs) {
3547 struct timespec l = ceph::real_clock::to_timespec(mtime);
3548 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
3549 l.tv_nsec = 0;
3550 r.tv_nsec = 0;
3551 if (l > r) {
3552 return false;
3553 }
3554 if (l < r) {
3555 return true;
3556 }
11fdf7f2
TL
3557 if (!zone_short_id || !rhs.zone_short_id) {
3558 /* don't compare zone ids, if one wasn't provided */
3559 return false;
3560 }
7c673cae
FG
3561 if (zone_short_id != rhs.zone_short_id) {
3562 return (zone_short_id < rhs.zone_short_id);
3563 }
3564 return (pg_ver < rhs.pg_ver);
3565
3566 }
3567
3568 bool operator<(const obj_time_weight& rhs) {
3569 if (!high_precision || !rhs.high_precision) {
3570 return compare_low_precision(rhs);
3571 }
3572 if (mtime > rhs.mtime) {
3573 return false;
3574 }
3575 if (mtime < rhs.mtime) {
3576 return true;
3577 }
11fdf7f2
TL
3578 if (!zone_short_id || !rhs.zone_short_id) {
3579 /* don't compare zone ids, if one wasn't provided */
3580 return false;
3581 }
7c673cae
FG
3582 if (zone_short_id != rhs.zone_short_id) {
3583 return (zone_short_id < rhs.zone_short_id);
3584 }
3585 return (pg_ver < rhs.pg_ver);
3586 }
3587
3588 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
3589 mtime = _mtime;
3590 zone_short_id = _short_id;
3591 pg_ver = _pg_ver;
3592 }
3593
3594 void init(RGWObjState *state) {
3595 mtime = state->mtime;
3596 zone_short_id = state->zone_short_id;
3597 pg_ver = state->pg_ver;
3598 }
3599};
3600
3601inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
3602 out << o.mtime;
3603
3604 if (o.zone_short_id != 0 || o.pg_ver != 0) {
3605 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
3606 }
3607
3608 return out;
3609}
3610
11fdf7f2 3611class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
7c673cae
FG
3612 bufferlist extra_data;
3613public:
3614 RGWGetExtraDataCB() {}
11fdf7f2
TL
3615 int handle_data(bufferlist& bl, bool *pause) override {
3616 int bl_len = (int)bl.length();
7c673cae
FG
3617 if (extra_data.length() < extra_data_len) {
3618 off_t max = extra_data_len - extra_data.length();
3619 if (max > bl_len) {
3620 max = bl_len;
3621 }
3622 bl.splice(0, max, &extra_data);
3623 }
3624 return bl_len;
3625 }
3626
3627 bufferlist& get_extra_data() {
3628 return extra_data;
3629 }
3630};
3631
b3b6e05e
TL
3632int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
3633 RGWObjectCtx& obj_ctx,
7c673cae 3634 const rgw_user& user_id,
7c673cae 3635 req_info *info,
9f95a23c 3636 const rgw_zone_id& source_zone,
f67539c2 3637 rgw::sal::RGWObject* src_obj,
9f95a23c 3638 const RGWBucketInfo *src_bucket_info,
7c673cae
FG
3639 real_time *src_mtime,
3640 uint64_t *psize,
3641 const real_time *mod_ptr,
3642 const real_time *unmod_ptr,
3643 bool high_precision_time,
3644 const char *if_match,
3645 const char *if_nomatch,
3646 map<string, bufferlist> *pattrs,
11fdf7f2 3647 map<string, string> *pheaders,
7c673cae
FG
3648 string *version_id,
3649 string *ptag,
3650 string *petag)
3651{
3652 /* source is in a different zonegroup, copy from there */
3653
3654 RGWRESTStreamRWRequest *in_stream_req;
3655 string tag;
3656 map<string, bufferlist> src_attrs;
3657 append_rand_alpha(cct, tag, tag, 32);
3658 obj_time_weight set_mtime_weight;
3659 set_mtime_weight.high_precision = high_precision_time;
3660
3661 RGWRESTConn *conn;
3662 if (source_zone.empty()) {
9f95a23c 3663 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
7c673cae 3664 /* source is in the master zonegroup */
11fdf7f2 3665 conn = svc.zone->get_master_conn();
7c673cae 3666 } else {
11fdf7f2 3667 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
9f95a23c 3668 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
7c673cae
FG
3669 if (iter == zonegroup_conn_map.end()) {
3670 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
3671 return -ENOENT;
3672 }
3673 conn = iter->second;
3674 }
3675 } else {
11fdf7f2 3676 auto& zone_conn_map = svc.zone->get_zone_conn_map();
9f95a23c 3677 auto iter = zone_conn_map.find(source_zone);
7c673cae
FG
3678 if (iter == zone_conn_map.end()) {
3679 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
3680 return -ENOENT;
3681 }
3682 conn = iter->second;
3683 }
3684
3685 RGWGetExtraDataCB cb;
7c673cae
FG
3686 map<string, string> req_headers;
3687 real_time set_mtime;
3688
3689 const real_time *pmod = mod_ptr;
3690
3691 obj_time_weight dest_mtime_weight;
3692
181888fb
FG
3693 constexpr bool prepend_meta = true;
3694 constexpr bool get_op = true;
3695 constexpr bool rgwx_stat = true;
3696 constexpr bool sync_manifest = true;
3697 constexpr bool skip_decrypt = true;
b3b6e05e 3698 int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
7c673cae 3699 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3700 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3701 sync_manifest, skip_decrypt,
3702 true, &cb, &in_stream_req);
7c673cae
FG
3703 if (ret < 0) {
3704 return ret;
3705 }
3706
f67539c2
TL
3707 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
3708 nullptr, pheaders, null_yield);
7c673cae
FG
3709 if (ret < 0) {
3710 return ret;
3711 }
3712
3713 bufferlist& extra_data_bl = cb.get_extra_data();
3714 if (extra_data_bl.length()) {
3715 JSONParser jp;
3716 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3717 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3718 return -EIO;
3719 }
3720
3721 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3722
3723 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3724 }
3725
3726 if (src_mtime) {
3727 *src_mtime = set_mtime;
3728 }
3729
3730 if (petag) {
3731 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
3732 if (iter != src_attrs.end()) {
3733 bufferlist& etagbl = iter->second;
3734 *petag = etagbl.to_str();
11fdf7f2
TL
3735 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
3736 *petag = petag->substr(0, petag->size() - 1);
3737 }
7c673cae
FG
3738 }
3739 }
3740
3741 if (pattrs) {
11fdf7f2 3742 *pattrs = std::move(src_attrs);
7c673cae
FG
3743 }
3744
3745 return 0;
3746}
3747
9f95a23c
TL
3748int RGWFetchObjFilter_Default::filter(CephContext *cct,
3749 const rgw_obj_key& source_key,
3750 const RGWBucketInfo& dest_bucket_info,
3751 std::optional<rgw_placement_rule> dest_placement_rule,
3752 const map<string, bufferlist>& obj_attrs,
3753 std::optional<rgw_user> *poverride_owner,
3754 const rgw_placement_rule **prule)
3755{
3756 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
3757 if (!ptail_rule) {
3758 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
3759 if (iter != obj_attrs.end()) {
3760 dest_rule.storage_class = iter->second.to_str();
3761 dest_rule.inherit_from(dest_bucket_info.placement_rule);
3762 ptail_rule = &dest_rule;
3763 } else {
3764 ptail_rule = &dest_bucket_info.placement_rule;
3765 }
3766 }
3767 *prule = ptail_rule;
3768 return 0;
3769}
3770
7c673cae
FG
3771int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
3772 const rgw_user& user_id,
7c673cae 3773 req_info *info,
9f95a23c 3774 const rgw_zone_id& source_zone,
f67539c2
TL
3775 rgw::sal::RGWObject* dest_obj,
3776 rgw::sal::RGWObject* src_obj,
3777 rgw::sal::RGWBucket* dest_bucket,
3778 rgw::sal::RGWBucket* src_bucket,
11fdf7f2 3779 std::optional<rgw_placement_rule> dest_placement_rule,
7c673cae
FG
3780 real_time *src_mtime,
3781 real_time *mtime,
3782 const real_time *mod_ptr,
3783 const real_time *unmod_ptr,
3784 bool high_precision_time,
3785 const char *if_match,
3786 const char *if_nomatch,
3787 AttrsMod attrs_mod,
3788 bool copy_if_newer,
f67539c2 3789 rgw::sal::RGWAttrs& attrs,
7c673cae 3790 RGWObjCategory category,
11fdf7f2 3791 std::optional<uint64_t> olh_epoch,
7c673cae 3792 real_time delete_at,
7c673cae 3793 string *ptag,
11fdf7f2 3794 string *petag,
7c673cae 3795 void (*progress_cb)(off_t, void *),
31f18b77 3796 void *progress_data,
9f95a23c
TL
3797 const DoutPrefixProvider *dpp,
3798 RGWFetchObjFilter *filter,
81eedcae
TL
3799 rgw_zone_set *zones_trace,
3800 std::optional<uint64_t>* bytes_transferred)
7c673cae
FG
3801{
3802 /* source is in a different zonegroup, copy from there */
3803
3804 RGWRESTStreamRWRequest *in_stream_req;
3805 string tag;
3806 int i;
3807 append_rand_alpha(cct, tag, tag, 32);
3808 obj_time_weight set_mtime_weight;
3809 set_mtime_weight.high_precision = high_precision_time;
11fdf7f2 3810 int ret;
7c673cae 3811
9f95a23c 3812 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 3813 using namespace rgw::putobj;
f67539c2
TL
3814 AtomicObjectProcessor processor(&aio, this->store, dest_bucket, nullptr, user_id,
3815 obj_ctx, dest_obj->clone(), olh_epoch,
3816 tag, dpp, null_yield);
7c673cae 3817 RGWRESTConn *conn;
11fdf7f2
TL
3818 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3819 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
7c673cae 3820 if (source_zone.empty()) {
f67539c2 3821 if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
7c673cae 3822 /* source is in the master zonegroup */
11fdf7f2 3823 conn = svc.zone->get_master_conn();
7c673cae 3824 } else {
f67539c2 3825 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
7c673cae 3826 if (iter == zonegroup_conn_map.end()) {
b3b6e05e 3827 ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7c673cae
FG
3828 return -ENOENT;
3829 }
3830 conn = iter->second;
3831 }
3832 } else {
9f95a23c 3833 auto iter = zone_conn_map.find(source_zone);
7c673cae 3834 if (iter == zone_conn_map.end()) {
b3b6e05e 3835 ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
11fdf7f2 3836 return -ENOENT;
7c673cae 3837 }
11fdf7f2 3838 conn = iter->second;
7c673cae
FG
3839 }
3840
3841 boost::optional<RGWPutObj_Compress> compressor;
3842 CompressorRef plugin;
3843
9f95a23c
TL
3844 RGWFetchObjFilter_Default source_filter;
3845 if (!filter) {
3846 filter = &source_filter;
3847 }
3848
3849 std::optional<rgw_user> override_owner;
3850
b3b6e05e 3851 RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
9f95a23c
TL
3852 [&](map<string, bufferlist>& obj_attrs) {
3853 const rgw_placement_rule *ptail_rule;
3854
3855 int ret = filter->filter(cct,
f67539c2
TL
3856 src_obj->get_key(),
3857 dest_bucket->get_info(),
9f95a23c
TL
3858 dest_placement_rule,
3859 obj_attrs,
3860 &override_owner,
3861 &ptail_rule);
3862 if (ret < 0) {
b3b6e05e 3863 ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
9f95a23c 3864 return ret;
11fdf7f2 3865 }
9f95a23c
TL
3866
3867 processor.set_tail_placement(*ptail_rule);
3868
11fdf7f2
TL
3869 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
3870 if (compression_type != "none") {
3871 plugin = Compressor::create(cct, compression_type);
3872 if (!plugin) {
b3b6e05e 3873 ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
11fdf7f2
TL
3874 << compression_type << dendl;
3875 }
3876 }
3877
9f95a23c 3878 ret = processor.prepare(null_yield);
11fdf7f2
TL
3879 if (ret < 0) {
3880 return ret;
3881 }
3882 return 0;
3883 });
7c673cae
FG
3884
3885 string etag;
7c673cae 3886 real_time set_mtime;
81eedcae 3887 uint64_t expected_size = 0;
7c673cae
FG
3888
3889 RGWObjState *dest_state = NULL;
3890
3891 const real_time *pmod = mod_ptr;
3892
3893 obj_time_weight dest_mtime_weight;
3894
3895 if (copy_if_newer) {
3896 /* need to get mtime for destination */
b3b6e05e 3897 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
7c673cae
FG
3898 if (ret < 0)
3899 goto set_err_state;
3900
3901 if (!real_clock::is_zero(dest_state->mtime)) {
3902 dest_mtime_weight.init(dest_state);
3903 pmod = &dest_mtime_weight.mtime;
3904 }
3905 }
3906
181888fb
FG
3907 static constexpr bool prepend_meta = true;
3908 static constexpr bool get_op = true;
3909 static constexpr bool rgwx_stat = false;
3910 static constexpr bool sync_manifest = true;
3911 static constexpr bool skip_decrypt = true;
b3b6e05e 3912 ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
7c673cae 3913 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3914 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3915 sync_manifest, skip_decrypt,
3916 true,
3917 &cb, &in_stream_req);
7c673cae
FG
3918 if (ret < 0) {
3919 goto set_err_state;
3920 }
3921
81eedcae 3922 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
f67539c2 3923 &expected_size, nullptr, nullptr, null_yield);
7c673cae
FG
3924 if (ret < 0) {
3925 goto set_err_state;
3926 }
28e407b8
AA
3927 ret = cb.flush();
3928 if (ret < 0) {
3929 goto set_err_state;
3930 }
81eedcae
TL
3931 if (cb.get_data_len() != expected_size) {
3932 ret = -EIO;
b3b6e05e 3933 ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
81eedcae
TL
3934 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
3935 goto set_err_state;
3936 }
7c673cae
FG
3937 if (compressor && compressor->is_compressed()) {
3938 bufferlist tmp;
3939 RGWCompressionInfo cs_info;
3940 cs_info.compression_type = plugin->get_type_name();
3941 cs_info.orig_size = cb.get_data_len();
f67539c2 3942 cs_info.compressor_message = compressor->get_compressor_message();
7c673cae 3943 cs_info.blocks = move(compressor->get_compression_blocks());
11fdf7f2 3944 encode(cs_info, tmp);
7c673cae
FG
3945 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
3946 }
3947
9f95a23c
TL
3948 if (override_owner) {
3949 processor.set_owner(*override_owner);
3950
3951 auto& obj_attrs = cb.get_attrs();
3952
3953 RGWUserInfo owner_info;
b3b6e05e
TL
3954 if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
3955 ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
9f95a23c
TL
3956 return -EINVAL;
3957 }
3958
3959 RGWAccessControlPolicy acl;
3960
3961 auto aiter = obj_attrs.find(RGW_ATTR_ACL);
3962 if (aiter == obj_attrs.end()) {
b3b6e05e 3963 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
9f95a23c
TL
3964 acl.create_default(owner_info.user_id, owner_info.display_name);
3965 } else {
3966 auto iter = aiter->second.cbegin();
3967 try {
3968 acl.decode(iter);
3969 } catch (buffer::error& err) {
b3b6e05e 3970 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
9f95a23c
TL
3971 return -EIO;
3972 }
3973 }
3974
3975 ACLOwner new_owner;
3976 new_owner.set_id(*override_owner);
3977 new_owner.set_name(owner_info.display_name);
3978
3979 acl.set_owner(new_owner);
3980
3981 bufferlist bl;
3982 acl.encode(bl);
3983 obj_attrs[RGW_ATTR_ACL] = std::move(bl);
3984 }
3985
7c673cae
FG
3986 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
3987 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
3988 } else {
3989 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
3990 if (iter != cb.get_attrs().end()) {
3991 try {
11fdf7f2 3992 decode(delete_at, iter->second);
7c673cae 3993 } catch (buffer::error& err) {
b3b6e05e 3994 ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7c673cae
FG
3995 }
3996 }
3997 }
3998
3999 if (src_mtime) {
4000 *src_mtime = set_mtime;
4001 }
4002
4003 if (petag) {
4004 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
4005 if (iter != cb.get_attrs().end()) {
11fdf7f2 4006 *petag = iter->second.to_str();
7c673cae
FG
4007 }
4008 }
4009
11fdf7f2
TL
4010 //erase the append attr
4011 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
4012
7c673cae
FG
4013 if (source_zone.empty()) {
4014 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
4015 } else {
4016 attrs = cb.get_attrs();
4017 }
4018
4019 if (copy_if_newer) {
4020 uint64_t pg_ver = 0;
4021 auto i = attrs.find(RGW_ATTR_PG_VER);
4022 if (i != attrs.end() && i->second.length() > 0) {
11fdf7f2 4023 auto iter = i->second.cbegin();
7c673cae 4024 try {
11fdf7f2 4025 decode(pg_ver, iter);
7c673cae 4026 } catch (buffer::error& err) {
b3b6e05e 4027 ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7c673cae
FG
4028 /* non critical error */
4029 }
4030 }
11fdf7f2 4031 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
7c673cae
FG
4032 }
4033
adb31ebb
TL
4034 /* Perform ETag verification is we have computed the object's MD5 sum at our end */
4035 if (const auto& verifier_etag = cb.get_verifier_etag();
4036 !verifier_etag.empty()) {
4037 string trimmed_etag = etag;
4038
4039 /* Remove the leading and trailing double quotes from etag */
4040 trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
4041 trimmed_etag.end());
4042
4043 if (verifier_etag != trimmed_etag) {
4044 ret = -EIO;
b3b6e05e 4045 ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
adb31ebb
TL
4046 << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
4047 goto set_err_state;
4048 }
4049 }
4050
7c673cae
FG
4051#define MAX_COMPLETE_RETRY 100
4052 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
11fdf7f2
TL
4053 bool canceled = false;
4054 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
4055 attrs, delete_at, nullptr, nullptr, nullptr,
9f95a23c 4056 zones_trace, &canceled, null_yield);
7c673cae
FG
4057 if (ret < 0) {
4058 goto set_err_state;
4059 }
adb31ebb 4060
11fdf7f2 4061 if (copy_if_newer && canceled) {
b3b6e05e 4062 ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
f67539c2 4063 obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
b3b6e05e 4064 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
7c673cae 4065 if (ret < 0) {
b3b6e05e 4066 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7c673cae
FG
4067 goto set_err_state;
4068 }
4069 dest_mtime_weight.init(dest_state);
4070 dest_mtime_weight.high_precision = high_precision_time;
4071 if (!dest_state->exists ||
4072 dest_mtime_weight < set_mtime_weight) {
b3b6e05e 4073 ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7c673cae
FG
4074 continue;
4075 } else {
b3b6e05e 4076 ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7c673cae
FG
4077 }
4078 }
4079 break;
4080 }
4081
4082 if (i == MAX_COMPLETE_RETRY) {
b3b6e05e 4083 ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7c673cae
FG
4084 ret = -EIO;
4085 goto set_err_state;
4086 }
4087
81eedcae
TL
4088 if (bytes_transferred) {
4089 *bytes_transferred = cb.get_data_len();
4090 }
7c673cae
FG
4091 return 0;
4092set_err_state:
4093 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
4094 // we may have already fetched during sync of OP_ADD, but were waiting
4095 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4096 if (olh_epoch && *olh_epoch > 0) {
4097 constexpr bool log_data_change = true;
b3b6e05e 4098 ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), false, nullptr,
9f95a23c 4099 *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
91327a77
AA
4100 } else {
4101 // we already have the latest copy
4102 ret = 0;
4103 }
7c673cae 4104 }
7c673cae
FG
4105 return ret;
4106}
4107
4108
b3b6e05e
TL
4109int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
4110 RGWObjState *astate,
7c673cae
FG
4111 map<string, bufferlist>& src_attrs,
4112 RGWRados::Object::Read& read_op,
4113 const rgw_user& user_id,
f67539c2 4114 rgw::sal::RGWObject* dest_obj,
7c673cae
FG
4115 real_time *mtime)
4116{
4117 string etag;
4118
11fdf7f2 4119 RGWRESTStreamS3PutObj *out_stream_req;
7c673cae 4120
11fdf7f2
TL
4121 auto rest_master_conn = svc.zone->get_master_conn();
4122
b3b6e05e 4123 int ret = rest_master_conn->put_obj_async(dpp, user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
7c673cae 4124 if (ret < 0) {
7c673cae
FG
4125 return ret;
4126 }
4127
b3b6e05e 4128 ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
224ce89b
WB
4129 if (ret < 0) {
4130 delete out_stream_req;
7c673cae 4131 return ret;
224ce89b 4132 }
7c673cae 4133
f67539c2 4134 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
7c673cae
FG
4135 if (ret < 0)
4136 return ret;
4137
4138 return 0;
4139}
4140
4141/**
4142 * Copy an object.
4143 * dest_obj: the object to copy into
4144 * src_obj: the object to copy from
4145 * attrs: usage depends on attrs_mod parameter
4146 * attrs_mod: the modification mode of the attrs, may have the following values:
4147 * ATTRSMOD_NONE - the attributes of the source object will be
4148 * copied without modifications, attrs parameter is ignored;
4149 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4150 * parameter, source object attributes are not copied;
4151 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4152 * are overwritten by values contained in attrs parameter.
4153 * err: stores any errors resulting from the get of the original object
4154 * Returns: 0 on success, -ERR# otherwise.
4155 */
4156int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4157 const rgw_user& user_id,
7c673cae 4158 req_info *info,
9f95a23c 4159 const rgw_zone_id& source_zone,
f67539c2
TL
4160 rgw::sal::RGWObject* dest_obj,
4161 rgw::sal::RGWObject* src_obj,
4162 rgw::sal::RGWBucket* dest_bucket,
4163 rgw::sal::RGWBucket* src_bucket,
11fdf7f2 4164 const rgw_placement_rule& dest_placement,
7c673cae
FG
4165 real_time *src_mtime,
4166 real_time *mtime,
4167 const real_time *mod_ptr,
4168 const real_time *unmod_ptr,
4169 bool high_precision_time,
4170 const char *if_match,
4171 const char *if_nomatch,
4172 AttrsMod attrs_mod,
4173 bool copy_if_newer,
f67539c2 4174 rgw::sal::RGWAttrs& attrs,
7c673cae
FG
4175 RGWObjCategory category,
4176 uint64_t olh_epoch,
4177 real_time delete_at,
4178 string *version_id,
4179 string *ptag,
11fdf7f2 4180 string *petag,
7c673cae 4181 void (*progress_cb)(off_t, void *),
9f95a23c
TL
4182 void *progress_data,
4183 const DoutPrefixProvider *dpp,
4184 optional_yield y)
7c673cae
FG
4185{
4186 int ret;
4187 uint64_t obj_size;
f67539c2 4188 rgw_obj shadow_obj = dest_obj->get_obj();
7c673cae
FG
4189 string shadow_oid;
4190
4191 bool remote_src;
4192 bool remote_dest;
4193
f67539c2
TL
4194 append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
4195 shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
7c673cae 4196
11fdf7f2
TL
4197 auto& zonegroup = svc.zone->get_zonegroup();
4198
f67539c2
TL
4199 remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
4200 remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
7c673cae
FG
4201
4202 if (remote_src && remote_dest) {
9f95a23c 4203 ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7c673cae
FG
4204 return -EINVAL;
4205 }
4206
f67539c2 4207 ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
7c673cae
FG
4208
4209 if (remote_src || !source_zone.empty()) {
11fdf7f2 4210 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
f67539c2 4211 dest_obj, src_obj, dest_bucket, src_bucket,
11fdf7f2 4212 dest_placement, src_mtime, mtime, mod_ptr,
7c673cae
FG
4213 unmod_ptr, high_precision_time,
4214 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
9f95a23c
TL
4215 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
4216 nullptr /* filter */);
7c673cae
FG
4217 }
4218
4219 map<string, bufferlist> src_attrs;
f67539c2 4220 RGWRados::Object src_op_target(this, src_bucket->get_info(), obj_ctx, src_obj->get_obj());
7c673cae
FG
4221 RGWRados::Object::Read read_op(&src_op_target);
4222
4223 read_op.conds.mod_ptr = mod_ptr;
4224 read_op.conds.unmod_ptr = unmod_ptr;
4225 read_op.conds.high_precision_time = high_precision_time;
4226 read_op.conds.if_match = if_match;
4227 read_op.conds.if_nomatch = if_nomatch;
4228 read_op.params.attrs = &src_attrs;
4229 read_op.params.lastmod = src_mtime;
4230 read_op.params.obj_size = &obj_size;
7c673cae 4231
b3b6e05e 4232 ret = read_op.prepare(y, dpp);
7c673cae
FG
4233 if (ret < 0) {
4234 return ret;
4235 }
94b18763
FG
4236 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4237 // Current implementation does not follow S3 spec and even
4238 // may result in data corruption silently when copying
4239 // multipart objects acorss pools. So reject COPY operations
4240 //on encrypted objects before it is fully functional.
9f95a23c 4241 ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
94b18763
FG
4242 << " has not been implemented." << dendl;
4243 return -ERR_NOT_IMPLEMENTED;
4244 }
7c673cae
FG
4245
4246 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4247 src_attrs.erase(RGW_ATTR_DELETE_AT);
4248
4249 set_copy_attrs(src_attrs, attrs, attrs_mod);
4250 attrs.erase(RGW_ATTR_ID_TAG);
4251 attrs.erase(RGW_ATTR_PG_VER);
4252 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4253 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4254 if (cmp != src_attrs.end())
4255 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4256
4257 RGWObjManifest manifest;
4258 RGWObjState *astate = NULL;
4259
b3b6e05e 4260 ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj->get_obj(), &astate, y);
7c673cae
FG
4261 if (ret < 0) {
4262 return ret;
4263 }
4264
4265 vector<rgw_raw_obj> ref_objs;
4266
4267 if (remote_dest) {
4268 /* dest is in a different zonegroup, copy it there */
b3b6e05e 4269 return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
7c673cae
FG
4270 }
4271 uint64_t max_chunk_size;
4272
b3b6e05e 4273 ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
7c673cae 4274 if (ret < 0) {
f67539c2 4275 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
7c673cae
FG
4276 return ret;
4277 }
4278
4279 rgw_pool src_pool;
4280 rgw_pool dest_pool;
11fdf7f2
TL
4281
4282 const rgw_placement_rule *src_rule{nullptr};
4283
9f95a23c
TL
4284 if (astate->manifest) {
4285 src_rule = &astate->manifest->get_tail_placement().placement_rule;
4286 ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
11fdf7f2
TL
4287 }
4288
4289 if (!src_rule || src_rule->empty()) {
f67539c2 4290 src_rule = &src_bucket->get_placement_rule();
11fdf7f2
TL
4291 }
4292
f67539c2 4293 if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
9f95a23c 4294 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7c673cae
FG
4295 return -EIO;
4296 }
11fdf7f2 4297
f67539c2 4298 if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
9f95a23c 4299 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7c673cae
FG
4300 return -EIO;
4301 }
4302
9f95a23c 4303 ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
11fdf7f2
TL
4304 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4305
9f95a23c 4306 bool copy_data = (!astate->manifest) ||
11fdf7f2
TL
4307 (*src_rule != dest_placement) ||
4308 (src_pool != dest_pool);
7c673cae 4309
7c673cae 4310 bool copy_first = false;
9f95a23c
TL
4311 if (astate->manifest) {
4312 if (!astate->manifest->has_tail()) {
7c673cae
FG
4313 copy_data = true;
4314 } else {
9f95a23c 4315 uint64_t head_size = astate->manifest->get_head_size();
7c673cae
FG
4316
4317 if (head_size > 0) {
4318 if (head_size > max_chunk_size) {
4319 copy_data = true;
4320 } else {
4321 copy_first = true;
4322 }
4323 }
4324 }
4325 }
4326
4327 if (petag) {
4328 const auto iter = attrs.find(RGW_ATTR_ETAG);
4329 if (iter != attrs.end()) {
11fdf7f2 4330 *petag = iter->second.to_str();
7c673cae
FG
4331 }
4332 }
4333
4334 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
11fdf7f2 4335 attrs.erase(RGW_ATTR_TAIL_TAG);
f67539c2 4336 return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
9f95a23c 4337 mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
7c673cae
FG
4338 }
4339
b3b6e05e 4340 RGWObjManifest::obj_iterator miter = astate->manifest->obj_begin(dpp);
7c673cae
FG
4341
4342 if (copy_first) { // we need to copy first chunk, not increase refcount
4343 ++miter;
4344 }
4345
4346 rgw_rados_ref ref;
b3b6e05e 4347 ret = get_raw_obj_ref(dpp, miter.get_location().get_raw_obj(store), &ref);
7c673cae
FG
4348 if (ret < 0) {
4349 return ret;
4350 }
4351
7c673cae
FG
4352 bufferlist first_chunk;
4353
4354 bool copy_itself = (dest_obj == src_obj);
4355 RGWObjManifest *pmanifest;
9f95a23c 4356 ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae 4357
f67539c2 4358 RGWRados::Object dest_op_target(this, dest_bucket->get_info(), obj_ctx, dest_obj->get_obj());
7c673cae
FG
4359 RGWRados::Object::Write write_op(&dest_op_target);
4360
4361 string tag;
4362
4363 if (ptag) {
4364 tag = *ptag;
4365 }
4366
4367 if (tag.empty()) {
4368 append_rand_alpha(cct, tag, tag, 32);
4369 }
4370
4371 if (!copy_itself) {
181888fb 4372 attrs.erase(RGW_ATTR_TAIL_TAG);
9f95a23c 4373 manifest = *astate->manifest;
7c673cae
FG
4374 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4375 if (tail_placement.bucket.name.empty()) {
f67539c2 4376 manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
7c673cae 4377 }
3efd9988 4378 string ref_tag;
b3b6e05e 4379 for (; miter != astate->manifest->obj_end(dpp); ++miter) {
7c673cae 4380 ObjectWriteOperation op;
3efd9988
FG
4381 ref_tag = tag + '\0';
4382 cls_refcount_get(op, ref_tag, true);
f67539c2 4383 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store);
7c673cae 4384
9f95a23c
TL
4385 auto& ioctx = ref.pool.ioctx();
4386 ioctx.locator_set_key(loc.loc);
4387
b3b6e05e 4388 ret = rgw_rados_operate(dpp, ioctx, loc.oid, &op, null_yield);
7c673cae
FG
4389 if (ret < 0) {
4390 goto done_ret;
4391 }
4392
4393 ref_objs.push_back(loc);
4394 }
4395
4396 pmanifest = &manifest;
4397 } else {
9f95a23c 4398 pmanifest = &(*astate->manifest);
7c673cae
FG
4399 /* don't send the object's tail for garbage collection */
4400 astate->keep_tail = true;
4401 }
4402
4403 if (copy_first) {
b3b6e05e 4404 ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
7c673cae
FG
4405 if (ret < 0) {
4406 goto done_ret;
4407 }
4408
f67539c2 4409 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
7c673cae 4410 } else {
f67539c2 4411 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
7c673cae
FG
4412 }
4413
4414 write_op.meta.data = &first_chunk;
4415 write_op.meta.manifest = pmanifest;
4416 write_op.meta.ptag = &tag;
f67539c2 4417 write_op.meta.owner = dest_bucket->get_info().owner;
7c673cae
FG
4418 write_op.meta.mtime = mtime;
4419 write_op.meta.flags = PUT_OBJ_CREATE;
4420 write_op.meta.category = category;
4421 write_op.meta.olh_epoch = olh_epoch;
4422 write_op.meta.delete_at = delete_at;
181888fb 4423 write_op.meta.modify_tail = !copy_itself;
7c673cae 4424
b3b6e05e 4425 ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
7c673cae
FG
4426 if (ret < 0) {
4427 goto done_ret;
4428 }
4429
4430 return 0;
4431
4432done_ret:
4433 if (!copy_itself) {
4434 vector<rgw_raw_obj>::iterator riter;
4435
7c673cae 4436 /* rollback reference */
92f5a8d4 4437 string ref_tag = tag + '\0';
7c673cae
FG
4438 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4439 ObjectWriteOperation op;
92f5a8d4 4440 cls_refcount_put(op, ref_tag, true);
7c673cae 4441
9f95a23c 4442 ref.pool.ioctx().locator_set_key(riter->loc);
7c673cae 4443
b3b6e05e 4444 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), riter->oid, &op, null_yield);
7c673cae 4445 if (r < 0) {
9f95a23c 4446 ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
7c673cae
FG
4447 }
4448 }
4449 }
4450 return ret;
4451}
4452
4453
4454int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
f67539c2 4455 rgw::sal::RGWBucket* bucket,
11fdf7f2 4456 const rgw_placement_rule& dest_placement,
7c673cae 4457 RGWRados::Object::Read& read_op, off_t end,
f67539c2 4458 rgw::sal::RGWObject* dest_obj,
7c673cae
FG
4459 real_time *mtime,
4460 real_time set_mtime,
f67539c2 4461 rgw::sal::RGWAttrs& attrs,
7c673cae
FG
4462 uint64_t olh_epoch,
4463 real_time delete_at,
9f95a23c
TL
4464 string *petag,
4465 const DoutPrefixProvider *dpp,
4466 optional_yield y)
7c673cae 4467{
7c673cae
FG
4468 string tag;
4469 append_rand_alpha(cct, tag, tag, 32);
4470
9f95a23c 4471 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 4472 using namespace rgw::putobj;
9f95a23c
TL
4473 // do not change the null_yield in the initialization of this AtomicObjectProcessor
4474 // it causes crashes in the ragweed tests
f67539c2
TL
4475 AtomicObjectProcessor processor(&aio, this->store, bucket, &dest_placement,
4476 bucket->get_info().owner, obj_ctx,
4477 dest_obj->clone(), olh_epoch, tag,
4478 dpp, null_yield);
9f95a23c 4479 int ret = processor.prepare(y);
7c673cae
FG
4480 if (ret < 0)
4481 return ret;
4482
4483 off_t ofs = 0;
4484
4485 do {
4486 bufferlist bl;
b3b6e05e 4487 ret = read_op.read(ofs, end, bl, y, dpp);
11fdf7f2 4488 if (ret < 0) {
9f95a23c 4489 ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
11fdf7f2
TL
4490 return ret;
4491 }
7c673cae
FG
4492
4493 uint64_t read_len = ret;
11fdf7f2
TL
4494 ret = processor.process(std::move(bl), ofs);
4495 if (ret < 0) {
4496 return ret;
4497 }
7c673cae
FG
4498
4499 ofs += read_len;
4500 } while (ofs <= end);
4501
11fdf7f2
TL
4502 // flush
4503 ret = processor.process({}, ofs);
4504 if (ret < 0) {
4505 return ret;
4506 }
4507
7c673cae
FG
4508 string etag;
4509 auto iter = attrs.find(RGW_ATTR_ETAG);
4510 if (iter != attrs.end()) {
4511 bufferlist& bl = iter->second;
11fdf7f2 4512 etag = bl.to_str();
7c673cae 4513 if (petag) {
11fdf7f2 4514 *petag = etag;
7c673cae
FG
4515 }
4516 }
4517
4518 uint64_t accounted_size;
4519 {
4520 bool compressed{false};
4521 RGWCompressionInfo cs_info;
4522 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4523 if (ret < 0) {
9f95a23c 4524 ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
7c673cae
FG
4525 return ret;
4526 }
4527 // pass original size if compressed
4528 accounted_size = compressed ? cs_info.orig_size : ofs;
4529 }
4530
11fdf7f2 4531 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
9f95a23c 4532 nullptr, nullptr, nullptr, nullptr, nullptr, y);
7c673cae
FG
4533}
4534
11fdf7f2 4535int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
f67539c2
TL
4536 rgw::sal::RGWBucket* bucket,
4537 rgw::sal::RGWObject& obj,
11fdf7f2
TL
4538 const rgw_placement_rule& placement_rule,
4539 const real_time& mtime,
9f95a23c
TL
4540 uint64_t olh_epoch,
4541 const DoutPrefixProvider *dpp,
4542 optional_yield y)
7c673cae 4543{
f67539c2 4544 rgw::sal::RGWAttrs attrs;
11fdf7f2
TL
4545 real_time read_mtime;
4546 uint64_t obj_size;
7c673cae 4547
f67539c2
TL
4548 obj.set_atomic(&obj_ctx);
4549 RGWRados::Object op_target(this, bucket->get_info(), obj_ctx, obj.get_obj());
11fdf7f2 4550 RGWRados::Object::Read read_op(&op_target);
7c673cae 4551
11fdf7f2
TL
4552 read_op.params.attrs = &attrs;
4553 read_op.params.lastmod = &read_mtime;
4554 read_op.params.obj_size = &obj_size;
7c673cae 4555
b3b6e05e 4556 int ret = read_op.prepare(y, dpp);
11fdf7f2
TL
4557 if (ret < 0) {
4558 return ret;
7c673cae
FG
4559 }
4560
11fdf7f2
TL
4561 if (read_mtime != mtime) {
4562 /* raced */
4563 return -ECANCELED;
7c673cae
FG
4564 }
4565
9f95a23c
TL
4566 attrs.erase(RGW_ATTR_ID_TAG);
4567 attrs.erase(RGW_ATTR_TAIL_TAG);
4568
11fdf7f2 4569 ret = copy_obj_data(obj_ctx,
f67539c2 4570 bucket,
11fdf7f2
TL
4571 placement_rule,
4572 read_op,
4573 obj_size - 1,
f67539c2 4574 &obj,
11fdf7f2
TL
4575 nullptr /* pmtime */,
4576 mtime,
4577 attrs,
4578 olh_epoch,
4579 real_time(),
9f95a23c
TL
4580 nullptr /* petag */,
4581 dpp,
4582 y);
11fdf7f2
TL
4583 if (ret < 0) {
4584 return ret;
7c673cae
FG
4585 }
4586
11fdf7f2 4587 return 0;
7c673cae
FG
4588}
4589
b3b6e05e 4590int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
7c673cae 4591{
9f95a23c
TL
4592 constexpr uint NUM_ENTRIES = 1000u;
4593
7c673cae
FG
4594 rgw_obj_index_key marker;
4595 string prefix;
4596 bool is_truncated;
4597
4598 do {
9f95a23c
TL
4599 std::vector<rgw_bucket_dir_entry> ent_list;
4600 ent_list.reserve(NUM_ENTRIES);
4601
b3b6e05e
TL
4602 int r = cls_bucket_list_unordered(dpp,
4603 bucket_info,
1adf2230
AA
4604 RGW_NO_SHARD,
4605 marker,
4606 prefix,
4607 NUM_ENTRIES,
4608 true,
4609 ent_list,
4610 &is_truncated,
9f95a23c
TL
4611 &marker,
4612 y);
4613 if (r < 0) {
7c673cae 4614 return r;
9f95a23c 4615 }
7c673cae
FG
4616
4617 string ns;
1adf2230 4618 for (auto const& dirent : ent_list) {
7c673cae
FG
4619 rgw_obj_key obj;
4620
9f95a23c 4621 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
7c673cae 4622 return -ENOTEMPTY;
9f95a23c 4623 }
7c673cae
FG
4624 }
4625 } while (is_truncated);
1adf2230 4626
7c673cae
FG
4627 return 0;
4628}
4629
4630/**
4631 * Delete a bucket.
4632 * bucket: the name of the bucket to delete
4633 * Returns 0 on success, -ERR# otherwise.
4634 */
b3b6e05e 4635int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
7c673cae
FG
4636{
4637 const rgw_bucket& bucket = bucket_info.bucket;
9f95a23c 4638 RGWSI_RADOS::Pool index_pool;
7c673cae 4639 map<int, string> bucket_objs;
b3b6e05e 4640 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
4641 if (r < 0)
4642 return r;
4643
4644 if (check_empty) {
b3b6e05e 4645 r = check_bucket_empty(dpp, bucket_info, y);
7c673cae
FG
4646 if (r < 0) {
4647 return r;
4648 }
4649 }
9f95a23c
TL
4650
4651 bool remove_ep = true;
4652
4653 if (objv_tracker.read_version.empty()) {
4654 RGWBucketEntryPoint ep;
4655 r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
4656 &ep,
4657 null_yield,
b3b6e05e 4658 dpp,
9f95a23c
TL
4659 RGWBucketCtl::Bucket::GetParams()
4660 .set_objv_tracker(&objv_tracker));
4661 if (r < 0 ||
4662 (!bucket_info.bucket.bucket_id.empty() &&
4663 ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
4664 if (r != -ENOENT) {
b3b6e05e 4665 ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
9f95a23c
TL
4666 /* we have no idea what caused the error, will not try to remove it */
4667 }
4668 /*
4669 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4670 * requested
4671 */
4672 remove_ep = false;
4673 }
4674 }
4675
4676 if (remove_ep) {
b3b6e05e 4677 r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
9f95a23c
TL
4678 RGWBucketCtl::Bucket::RemoveParams()
4679 .set_objv_tracker(&objv_tracker));
4680 if (r < 0)
4681 return r;
4682 }
7c673cae
FG
4683
4684 /* if the bucket is not synced we can remove the meta file */
11fdf7f2 4685 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
7c673cae 4686 RGWObjVersionTracker objv_tracker;
b3b6e05e 4687 r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
7c673cae
FG
4688 if (r < 0) {
4689 return r;
4690 }
f64942e4
AA
4691
4692 /* remove bucket index objects asynchronously by best effort */
9f95a23c 4693 (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
f64942e4
AA
4694 bucket_objs,
4695 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 4696 }
f64942e4 4697
7c673cae
FG
4698 return 0;
4699}
4700
b3b6e05e 4701int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
7c673cae
FG
4702{
4703 RGWBucketInfo info;
4704 map<string, bufferlist> attrs;
31f18b77 4705 int r;
9f95a23c
TL
4706 auto obj_ctx = svc.sysobj->init_obj_ctx();
4707
31f18b77 4708 if (bucket.bucket_id.empty()) {
b3b6e05e 4709 r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
31f18b77 4710 } else {
b3b6e05e 4711 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs, null_yield, dpp);
31f18b77 4712 }
7c673cae 4713 if (r < 0) {
b3b6e05e 4714 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7c673cae
FG
4715 return r;
4716 }
4717
4718 info.owner = owner.get_id();
4719
b3b6e05e 4720 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
7c673cae 4721 if (r < 0) {
b3b6e05e 4722 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7c673cae
FG
4723 return r;
4724 }
4725
4726 return 0;
4727}
4728
4729
b3b6e05e 4730int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
7c673cae
FG
4731{
4732 int ret = 0;
4733
4734 vector<rgw_bucket>::iterator iter;
4735
4736 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
4737 rgw_bucket& bucket = *iter;
b3b6e05e
TL
4738 if (enabled) {
4739 ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
4740 } else {
4741 ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
4742 }
7c673cae
FG
4743
4744 RGWBucketInfo info;
4745 map<string, bufferlist> attrs;
b3b6e05e 4746 int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
7c673cae 4747 if (r < 0) {
b3b6e05e 4748 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
7c673cae
FG
4749 ret = r;
4750 continue;
4751 }
4752 if (enabled) {
4753 info.flags &= ~BUCKET_SUSPENDED;
4754 } else {
4755 info.flags |= BUCKET_SUSPENDED;
4756 }
4757
b3b6e05e 4758 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
7c673cae 4759 if (r < 0) {
b3b6e05e 4760 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
7c673cae
FG
4761 ret = r;
4762 continue;
4763 }
4764 }
4765 return ret;
4766}
4767
b3b6e05e 4768int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
7c673cae
FG
4769{
4770 RGWBucketInfo bucket_info;
b3b6e05e 4771 int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
7c673cae
FG
4772 if (ret < 0) {
4773 return ret;
4774 }
4775
4776 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
4777 return 0;
4778}
4779
b3b6e05e 4780int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
7c673cae 4781{
9f95a23c 4782 if ((!state->manifest)|| state->keep_tail)
7c673cae
FG
4783 return 0;
4784
4785 cls_rgw_obj_chain chain;
b3b6e05e 4786 store->update_gc_chain(dpp, obj, *state->manifest, &chain);
7c673cae
FG
4787
4788 if (chain.empty()) {
4789 return 0;
4790 }
4791
181888fb 4792 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
9f95a23c
TL
4793 auto ret = store->gc->send_chain(chain, tag); // do it synchronously
4794 if (ret < 0) {
4795 //Delete objects inline if send chain to gc fails
b3b6e05e 4796 store->delete_objs_inline(dpp, chain, tag);
9f95a23c
TL
4797 }
4798 return 0;
7c673cae
FG
4799}
4800
b3b6e05e 4801void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
7c673cae
FG
4802{
4803 RGWObjManifest::obj_iterator iter;
4804 rgw_raw_obj raw_head;
4805 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
b3b6e05e 4806 for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
f67539c2 4807 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(store);
7c673cae
FG
4808 if (mobj == raw_head)
4809 continue;
4810 cls_rgw_obj_key key(mobj.oid);
4811 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
4812 }
4813}
4814
9f95a23c 4815int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4816{
f67539c2
TL
4817 if (chain.empty()) {
4818 return 0;
4819 }
4820
9f95a23c 4821 return gc->send_chain(chain, tag);
7c673cae
FG
4822}
4823
b3b6e05e 4824void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4825{
9f95a23c
TL
4826 string last_pool;
4827 std::unique_ptr<IoCtx> ctx(new IoCtx);
4828 int ret = 0;
4829 for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
4830 cls_rgw_obj& obj = *liter;
4831 if (obj.pool != last_pool) {
4832 ctx.reset(new IoCtx);
b3b6e05e 4833 ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
9f95a23c
TL
4834 if (ret < 0) {
4835 last_pool = "";
b3b6e05e 4836 ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
9f95a23c
TL
4837 obj.pool << dendl;
4838 continue;
4839 }
4840 last_pool = obj.pool;
4841 }
4842 ctx->locator_set_key(obj.loc);
4843 const string& oid = obj.key.name; /* just stored raw oid there */
b3b6e05e 4844 ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
9f95a23c
TL
4845 ":" << obj.key.name << dendl;
4846 ObjectWriteOperation op;
4847 cls_refcount_put(op, tag, true);
4848 ret = ctx->operate(oid, &op);
4849 if (ret < 0) {
b3b6e05e 4850 ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
9f95a23c 4851 }
7c673cae 4852 }
7c673cae
FG
4853}
4854
4855static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
4856 map<RGWObjCategory, RGWStorageStats>& stats)
4857{
4858 for (const auto& pair : header.stats) {
4859 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
4860 const rgw_bucket_category_stats& header_stats = pair.second;
4861
4862 RGWStorageStats& s = stats[category];
4863
4864 s.category = category;
4865 s.size += header_stats.total_size;
4866 s.size_rounded += header_stats.total_size_rounded;
4867 s.size_utilized += header_stats.actual_size;
4868 s.num_objects += header_stats.num_entries;
4869 }
4870}
4871
b3b6e05e 4872int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
7c673cae
FG
4873 map<RGWObjCategory, RGWStorageStats> *existing_stats,
4874 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
4875{
9f95a23c 4876 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
4877 // key - bucket index object id
4878 // value - bucket index check OP returned result with the given bucket index object (shard)
4879 map<int, string> oids;
4880 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
31f18b77 4881
b3b6e05e 4882 int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &oids, nullptr);
31f18b77
FG
4883 if (ret < 0) {
4884 return ret;
4885 }
7c673cae 4886
9f95a23c
TL
4887 for (auto& iter : oids) {
4888 bucket_objs_ret[iter.first] = rgw_cls_check_index_ret();
4889 }
4890
4891 ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77
FG
4892 if (ret < 0) {
4893 return ret;
4894 }
7c673cae
FG
4895
4896 // Aggregate results (from different shards if there is any)
4897 map<int, struct rgw_cls_check_index_ret>::iterator iter;
4898 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
4899 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
4900 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
4901 }
4902
4903 return 0;
4904}
4905
b3b6e05e 4906int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
7c673cae 4907{
9f95a23c 4908 RGWSI_RADOS::Pool index_pool;
7c673cae 4909 map<int, string> bucket_objs;
31f18b77 4910
b3b6e05e 4911 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77 4912 if (r < 0) {
7c673cae 4913 return r;
31f18b77 4914 }
7c673cae 4915
9f95a23c 4916 return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
4917}
4918
b3b6e05e 4919int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77 4920{
9f95a23c 4921 RGWSI_RADOS::Pool index_pool;
31f18b77
FG
4922 map<int, string> bucket_objs;
4923
b3b6e05e 4924 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77
FG
4925 if (r < 0) {
4926 return r;
4927 }
4928
9f95a23c 4929 return CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77 4930}
7c673cae 4931
b3b6e05e 4932int RGWRados::defer_gc(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
7c673cae
FG
4933{
4934 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
4935 std::string oid, key;
4936 get_obj_bucket_and_oid_loc(obj, oid, key);
4937 if (!rctx)
4938 return 0;
4939
4940 RGWObjState *state = NULL;
4941
b3b6e05e 4942 int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, false, y);
7c673cae
FG
4943 if (r < 0)
4944 return r;
4945
4946 if (!state->is_atomic) {
b3b6e05e 4947 ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
7c673cae
FG
4948 return -EINVAL;
4949 }
4950
181888fb
FG
4951 string tag;
4952
4953 if (state->tail_tag.length() > 0) {
4954 tag = state->tail_tag.c_str();
4955 } else if (state->obj_tag.length() > 0) {
4956 tag = state->obj_tag.c_str();
4957 } else {
b3b6e05e 4958 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
7c673cae
FG
4959 return -EINVAL;
4960 }
4961
b3b6e05e 4962 ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
7c673cae 4963
9f95a23c 4964 cls_rgw_obj_chain chain;
b3b6e05e 4965 update_gc_chain(dpp, state->obj, *state->manifest, &chain);
9f95a23c 4966 return gc->async_defer_chain(tag, chain);
7c673cae
FG
4967}
4968
4969void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
4970{
4971 list<string> prefixes;
4972 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
4973 cls_rgw_remove_obj(op, prefixes);
4974}
4975
4976void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
4977{
4978 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
4979}
4980
4981void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
4982{
4983 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
4984}
4985
9f95a23c
TL
4986struct tombstone_entry {
4987 ceph::real_time mtime;
4988 uint32_t zone_short_id;
4989 uint64_t pg_ver;
4990
4991 tombstone_entry() = default;
4992 explicit tombstone_entry(const RGWObjState& state)
4993 : mtime(state.mtime), zone_short_id(state.zone_short_id),
4994 pg_ver(state.pg_ver) {}
4995};
7c673cae
FG
4996
4997/**
4998 * Delete an object.
4999 * bucket: name of the bucket storing the object
5000 * obj: name of the object to delete
5001 * Returns: 0 on success, -ERR# otherwise.
5002 */
b3b6e05e 5003int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
5004{
5005 RGWRados *store = target->get_store();
5006 rgw_obj& src_obj = target->get_obj();
5007 const string& instance = src_obj.key.instance;
5008 rgw_obj obj = src_obj;
5009
5010 if (instance == "null") {
5011 obj.key.instance.clear();
5012 }
5013
5014 bool explicit_marker_version = (!params.marker_version_id.empty());
5015
5016 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
5017 if (instance.empty() || explicit_marker_version) {
5018 rgw_obj marker = obj;
5019
5020 if (!params.marker_version_id.empty()) {
5021 if (params.marker_version_id != "null") {
5022 marker.key.set_instance(params.marker_version_id);
5023 }
5024 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
5025 store->gen_rand_obj_instance_name(&marker);
5026 }
5027
5028 result.version_id = marker.key.instance;
91327a77
AA
5029 if (result.version_id.empty())
5030 result.version_id = "null";
7c673cae
FG
5031 result.delete_marker = true;
5032
5033 struct rgw_bucket_dir_entry_meta meta;
5034
5035 meta.owner = params.obj_owner.get_id().to_str();
5036 meta.owner_display_name = params.obj_owner.get_display_name();
5037
5038 if (real_clock::is_zero(params.mtime)) {
5039 meta.mtime = real_clock::now();
5040 } else {
5041 meta.mtime = params.mtime;
5042 }
5043
b3b6e05e 5044 int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
7c673cae
FG
5045 if (r < 0) {
5046 return r;
5047 }
5048 } else {
5049 rgw_bucket_dir_entry dirent;
5050
b3b6e05e 5051 int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
7c673cae
FG
5052 if (r < 0) {
5053 return r;
5054 }
5055 result.delete_marker = dirent.is_delete_marker();
b3b6e05e 5056 r = store->unlink_obj_instance(dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
7c673cae
FG
5057 if (r < 0) {
5058 return r;
5059 }
5060 result.version_id = instance;
5061 }
5062
5063 BucketShard *bs;
b3b6e05e 5064 int r = target->get_bucket_shard(&bs, dpp);
7c673cae 5065 if (r < 0) {
b3b6e05e 5066 ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
7c673cae
FG
5067 return r;
5068 }
5069
b3b6e05e 5070 r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 5071 if (r < 0) {
b3b6e05e 5072 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
9f95a23c 5073 return r;
7c673cae
FG
5074 }
5075
5076 return 0;
5077 }
5078
5079 rgw_rados_ref ref;
b3b6e05e 5080 int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
7c673cae
FG
5081 if (r < 0) {
5082 return r;
5083 }
5084
5085 RGWObjState *state;
b3b6e05e 5086 r = target->get_state(dpp, &state, false, y);
7c673cae
FG
5087 if (r < 0)
5088 return r;
5089
5090 ObjectWriteOperation op;
5091
5092 if (!real_clock::is_zero(params.unmod_since)) {
5093 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
5094 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
5095 if (!params.high_precision_time) {
5096 ctime.tv_nsec = 0;
5097 unmod.tv_nsec = 0;
5098 }
5099
b3b6e05e 5100 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
7c673cae
FG
5101 if (ctime > unmod) {
5102 return -ERR_PRECONDITION_FAILED;
5103 }
5104
5105 /* only delete object if mtime is less than or equal to params.unmod_since */
5106 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
5107 }
11fdf7f2 5108 uint64_t obj_accounted_size = state->accounted_size;
7c673cae 5109
9f95a23c
TL
5110 if(params.abortmp) {
5111 obj_accounted_size = params.parts_accounted_size;
5112 }
5113
7c673cae
FG
5114 if (!real_clock::is_zero(params.expiration_time)) {
5115 bufferlist bl;
5116 real_time delete_at;
5117
5118 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5119 try {
11fdf7f2
TL
5120 auto iter = bl.cbegin();
5121 decode(delete_at, iter);
7c673cae 5122 } catch (buffer::error& err) {
b3b6e05e 5123 ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
7c673cae
FG
5124 return -EIO;
5125 }
5126
5127 if (params.expiration_time != delete_at) {
5128 return -ERR_PRECONDITION_FAILED;
5129 }
5130 } else {
5131 return -ERR_PRECONDITION_FAILED;
5132 }
5133 }
5134
5135 if (!state->exists) {
5136 target->invalidate_state();
5137 return -ENOENT;
5138 }
5139
b3b6e05e 5140 r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
7c673cae
FG
5141 if (r < 0)
5142 return r;
5143
5144 RGWBucketInfo& bucket_info = target->get_bucket_info();
5145
5146 RGWRados::Bucket bop(store, bucket_info);
5147 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
5148
5149 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
5150 index_op.set_bilog_flags(params.bilog_flags);
5151
b3b6e05e 5152 r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
7c673cae
FG
5153 if (r < 0)
5154 return r;
5155
5156 store->remove_rgw_head_obj(op);
9f95a23c
TL
5157
5158 auto& ioctx = ref.pool.ioctx();
b3b6e05e 5159 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
94b18763
FG
5160
5161 /* raced with another operation, object state is indeterminate */
5162 const bool need_invalidate = (r == -ECANCELED);
7c673cae 5163
9f95a23c 5164 int64_t poolid = ioctx.get_id();
7c673cae
FG
5165 if (r >= 0) {
5166 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5167 if (obj_tombstone_cache) {
5168 tombstone_entry entry{*state};
5169 obj_tombstone_cache->add(obj, entry);
5170 }
b3b6e05e 5171 r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 5172
b3b6e05e 5173 int ret = target->complete_atomic_modification(dpp);
7c673cae 5174 if (ret < 0) {
b3b6e05e 5175 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
7c673cae
FG
5176 }
5177 /* other than that, no need to propagate error */
224ce89b 5178 } else {
b3b6e05e 5179 int ret = index_op.cancel(dpp);
224ce89b 5180 if (ret < 0) {
b3b6e05e 5181 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
224ce89b 5182 }
7c673cae
FG
5183 }
5184
5185 if (need_invalidate) {
5186 target->invalidate_state();
5187 }
5188
5189 if (r < 0)
5190 return r;
5191
5192 /* update quota cache */
11fdf7f2 5193 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
7c673cae
FG
5194
5195 return 0;
5196}
5197
b3b6e05e
TL
5198int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
5199 RGWObjectCtx& obj_ctx,
7c673cae
FG
5200 const RGWBucketInfo& bucket_info,
5201 const rgw_obj& obj,
b3b6e05e 5202 int versioning_status, // versioning flags in enum RGWBucketFlags
7c673cae 5203 uint16_t bilog_flags,
31f18b77
FG
5204 const real_time& expiration_time,
5205 rgw_zone_set *zones_trace)
7c673cae
FG
5206{
5207 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5208 RGWRados::Object::Delete del_op(&del_target);
5209
5210 del_op.params.bucket_owner = bucket_info.owner;
5211 del_op.params.versioning_status = versioning_status;
5212 del_op.params.bilog_flags = bilog_flags;
5213 del_op.params.expiration_time = expiration_time;
31f18b77 5214 del_op.params.zones_trace = zones_trace;
7c673cae 5215
b3b6e05e 5216 return del_op.delete_obj(null_yield, dpp);
7c673cae
FG
5217}
5218
b3b6e05e 5219int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
7c673cae
FG
5220{
5221 rgw_rados_ref ref;
b3b6e05e 5222 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
5223 if (r < 0) {
5224 return r;
5225 }
5226
5227 ObjectWriteOperation op;
5228
5229 op.remove();
b3b6e05e 5230 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
5231 if (r < 0)
5232 return r;
5233
5234 return 0;
5235}
5236
b3b6e05e 5237int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
7c673cae
FG
5238{
5239 std::string oid, key;
5240 get_obj_bucket_and_oid_loc(obj, oid, key);
5241
11fdf7f2 5242 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5243
5244 RGWBucketInfo bucket_info;
b3b6e05e 5245 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
7c673cae 5246 if (ret < 0) {
b3b6e05e 5247 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
7c673cae
FG
5248 return ret;
5249 }
5250
5251 RGWRados::Bucket bop(this, bucket_info);
5252 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5253
b3b6e05e 5254 return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
7c673cae
FG
5255}
5256
b3b6e05e 5257static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::RGWStore* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
7c673cae
FG
5258{
5259 string tag;
5260
b3b6e05e
TL
5261 RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
5262 if (mi != manifest.obj_end(dpp)) {
7c673cae
FG
5263 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5264 ++mi;
5265 tag = mi.get_location().get_raw_obj(store).oid;
5266 tag.append("_");
5267 }
5268
5269 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5270 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5271 MD5 hash;
11fdf7f2 5272 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
7c673cae
FG
5273
5274 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5275 if (iter != attrset.end()) {
5276 bufferlist& bl = iter->second;
11fdf7f2 5277 hash.Update((const unsigned char *)bl.c_str(), bl.length());
7c673cae
FG
5278 }
5279
5280 hash.Final(md5);
5281 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5282 tag.append(md5_str);
5283
5284 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
5285
5286 tag_bl.append(tag.c_str(), tag.size() + 1);
5287}
5288
5289static bool is_olh(map<string, bufferlist>& attrs)
5290{
5291 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5292 return (iter != attrs.end());
5293}
5294
5295static bool has_olh_tag(map<string, bufferlist>& attrs)
5296{
5297 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5298 return (iter != attrs.end());
5299}
5300
b3b6e05e 5301int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5302 RGWObjState *olh_state, RGWObjState **target_state, optional_yield y)
7c673cae 5303{
11fdf7f2 5304 ceph_assert(olh_state->is_olh);
7c673cae
FG
5305
5306 rgw_obj target;
b3b6e05e 5307 int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
7c673cae
FG
5308 if (r < 0) {
5309 return r;
5310 }
b3b6e05e 5311 r = get_obj_state(dpp, &obj_ctx, bucket_info, target, target_state, false, y);
7c673cae
FG
5312 if (r < 0) {
5313 return r;
5314 }
5315
5316 return 0;
5317}
5318
b3b6e05e 5319int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5320 RGWObjState **state, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5321{
5322 if (obj.empty()) {
5323 return -EINVAL;
5324 }
5325
5326 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5327
11fdf7f2 5328 RGWObjState *s = rctx->get_state(obj);
b3b6e05e 5329 ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
7c673cae
FG
5330 *state = s;
5331 if (s->has_attrs) {
5332 if (s->is_olh && need_follow_olh) {
b3b6e05e 5333 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
7c673cae
FG
5334 }
5335 return 0;
5336 }
5337
5338 s->obj = obj;
5339
5340 rgw_raw_obj raw_obj;
5341 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5342
5343 int r = -ENOENT;
5344
5345 if (!assume_noent) {
b3b6e05e 5346 r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
7c673cae
FG
5347 }
5348
5349 if (r == -ENOENT) {
5350 s->exists = false;
5351 s->has_attrs = true;
5352 tombstone_entry entry;
5353 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5354 s->mtime = entry.mtime;
5355 s->zone_short_id = entry.zone_short_id;
5356 s->pg_ver = entry.pg_ver;
b3b6e05e 5357 ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
7c673cae
FG
5358 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5359 } else {
5360 s->mtime = real_time();
5361 }
5362 return 0;
5363 }
5364 if (r < 0)
5365 return r;
5366
5367 s->exists = true;
5368 s->has_attrs = true;
5369 s->accounted_size = s->size;
5370
11fdf7f2
TL
5371 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5372 if (iter != s->attrset.end()) {
5373 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5374 bufferlist& bletag = iter->second;
5375 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5376 bufferlist newbl;
5377 bletag.splice(0, bletag.length() - 1, &newbl);
f67539c2 5378 bletag = std::move(newbl);
11fdf7f2
TL
5379 }
5380 }
5381
5382 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
5383 const bool compressed = (iter != s->attrset.end());
5384 if (compressed) {
7c673cae
FG
5385 // use uncompressed size for accounted_size
5386 try {
5387 RGWCompressionInfo info;
11fdf7f2
TL
5388 auto p = iter->second.cbegin();
5389 decode(info, p);
31f18b77 5390 s->accounted_size = info.orig_size;
7c673cae 5391 } catch (buffer::error&) {
b3b6e05e 5392 ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
7c673cae
FG
5393 return -EIO;
5394 }
5395 }
5396
5397 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5398 if (iter != s->attrset.end()) {
5399 bufferlist bl = iter->second;
5400 bufferlist::iterator it = bl.begin();
5401 it.copy(bl.length(), s->shadow_obj);
5402 s->shadow_obj[bl.length()] = '\0';
5403 }
5404 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
5405 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5406 if (ttiter != s->attrset.end()) {
5407 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5408 }
7c673cae
FG
5409
5410 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5411 if (manifest_bl.length()) {
11fdf7f2 5412 auto miter = manifest_bl.cbegin();
7c673cae 5413 try {
9f95a23c
TL
5414 s->manifest.emplace();
5415 decode(*s->manifest, miter);
5416 s->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
7c673cae 5417 broken due to old bugs */
9f95a23c 5418 s->size = s->manifest->get_obj_size();
31f18b77
FG
5419 if (!compressed)
5420 s->accounted_size = s->size;
7c673cae 5421 } catch (buffer::error& err) {
b3b6e05e 5422 ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
7c673cae
FG
5423 return -EIO;
5424 }
b3b6e05e 5425 ldpp_dout(dpp, 10) << "manifest: total_size = " << s->manifest->get_obj_size() << dendl;
11fdf7f2 5426 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
9f95a23c 5427 s->manifest->has_explicit_objs()) {
7c673cae 5428 RGWObjManifest::obj_iterator mi;
b3b6e05e
TL
5429 for (mi = s->manifest->obj_begin(dpp); mi != s->manifest->obj_end(dpp); ++mi) {
5430 ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(store) << dendl;
7c673cae
FG
5431 }
5432 }
5433
5434 if (!s->obj_tag.length()) {
5435 /*
5436 * Uh oh, something's wrong, object with manifest should have tag. Let's
5437 * create one out of the manifest, would be unique
5438 */
b3b6e05e 5439 generate_fake_tag(dpp, store, s->attrset, *s->manifest, manifest_bl, s->obj_tag);
7c673cae
FG
5440 s->fake_tag = true;
5441 }
5442 }
5443 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5444 if (aiter != s->attrset.end()) {
5445 bufferlist& pg_ver_bl = aiter->second;
5446 if (pg_ver_bl.length()) {
11fdf7f2 5447 auto pgbl = pg_ver_bl.cbegin();
7c673cae 5448 try {
11fdf7f2 5449 decode(s->pg_ver, pgbl);
7c673cae 5450 } catch (buffer::error& err) {
b3b6e05e 5451 ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
7c673cae
FG
5452 }
5453 }
5454 }
5455 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5456 if (aiter != s->attrset.end()) {
5457 bufferlist& zone_short_id_bl = aiter->second;
5458 if (zone_short_id_bl.length()) {
11fdf7f2 5459 auto zbl = zone_short_id_bl.cbegin();
7c673cae 5460 try {
11fdf7f2 5461 decode(s->zone_short_id, zbl);
7c673cae 5462 } catch (buffer::error& err) {
b3b6e05e 5463 ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
7c673cae
FG
5464 }
5465 }
5466 }
b3b6e05e
TL
5467 if (s->obj_tag.length()) {
5468 ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
5469 } else {
5470 ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5471 }
7c673cae
FG
5472
5473 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5474 * it exist, and not only if is_olh() returns true
5475 */
5476 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5477 if (iter != s->attrset.end()) {
5478 s->olh_tag = iter->second;
5479 }
5480
5481 if (is_olh(s->attrset)) {
5482 s->is_olh = true;
5483
b3b6e05e 5484 ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
7c673cae
FG
5485
5486 if (need_follow_olh) {
b3b6e05e 5487 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
9f95a23c 5488 } else if (obj.key.have_null_instance() && !s->manifest) {
11fdf7f2
TL
5489 // read null version, and the head object only have olh info
5490 s->exists = false;
5491 return -ENOENT;
7c673cae
FG
5492 }
5493 }
5494
5495 return 0;
5496}
5497
b3b6e05e 5498int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9f95a23c 5499 bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5500{
5501 int ret;
5502
5503 do {
b3b6e05e 5504 ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, follow_olh, y, assume_noent);
7c673cae
FG
5505 } while (ret == -EAGAIN);
5506
5507 return ret;
5508}
5509
b3b6e05e 5510int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
7c673cae
FG
5511{
5512 RGWObjState *astate;
b3b6e05e 5513 int r = get_state(dpp, &astate, true, y);
7c673cae
FG
5514 if (r < 0) {
5515 return r;
5516 }
5517
9f95a23c 5518 *pmanifest = &(*astate->manifest);
7c673cae
FG
5519
5520 return 0;
5521}
5522
b3b6e05e 5523int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
7c673cae
FG
5524{
5525 RGWObjState *state;
b3b6e05e 5526 int r = source->get_state(dpp, &state, true, y);
7c673cae
FG
5527 if (r < 0)
5528 return r;
5529 if (!state->exists)
5530 return -ENOENT;
5531 if (!state->get_attr(name, dest))
5532 return -ENODATA;
5533
5534 return 0;
5535}
5536
b3b6e05e 5537int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
7c673cae
FG
5538{
5539 RGWObjectCtx& ctx = source->get_ctx();
5540 rgw_obj& obj = source->get_obj();
5541 RGWRados *store = source->get_store();
5542
11fdf7f2 5543 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
7c673cae
FG
5544 result.obj = obj;
5545 if (s->has_attrs) {
5546 state.ret = 0;
5547 result.size = s->size;
5548 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5549 result.attrs = s->attrset;
7c673cae
FG
5550 result.manifest = s->manifest;
5551 return 0;
5552 }
5553
5554 string oid;
5555 string loc;
5556 get_obj_bucket_and_oid_loc(obj, oid, loc);
5557
b3b6e05e 5558 int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
7c673cae
FG
5559 if (r < 0) {
5560 return r;
5561 }
5562
5563 librados::ObjectReadOperation op;
5564 op.stat2(&result.size, &result.mtime, NULL);
5565 op.getxattrs(&result.attrs, NULL);
9f95a23c 5566 state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
7c673cae
FG
5567 state.io_ctx.locator_set_key(loc);
5568 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5569 if (r < 0) {
b3b6e05e 5570 ldpp_dout(dpp, 5) << __func__
7c673cae
FG
5571 << ": ERROR: aio_operate() returned ret=" << r
5572 << dendl;
5573 return r;
5574 }
5575
5576 return 0;
5577}
5578
5579
5580int RGWRados::Object::Stat::wait()
5581{
5582 if (!state.completion) {
5583 return state.ret;
5584 }
5585
9f95a23c 5586 state.completion->wait_for_complete();
7c673cae
FG
5587 state.ret = state.completion->get_return_value();
5588 state.completion->release();
5589
5590 if (state.ret != 0) {
5591 return state.ret;
5592 }
5593
5594 return finish();
5595}
5596
5597int RGWRados::Object::Stat::finish()
5598{
5599 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5600 if (iter != result.attrs.end()) {
5601 bufferlist& bl = iter->second;
11fdf7f2 5602 auto biter = bl.cbegin();
7c673cae 5603 try {
9f95a23c
TL
5604 result.manifest.emplace();
5605 decode(*result.manifest, biter);
7c673cae
FG
5606 } catch (buffer::error& err) {
5607 RGWRados *store = source->get_store();
5608 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
5609 return -EIO;
5610 }
7c673cae
FG
5611 }
5612
5613 return 0;
5614}
5615
b3b6e05e 5616int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
7c673cae 5617 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5618 ObjectOperation& op, RGWObjState **pstate, optional_yield y)
7c673cae
FG
5619{
5620 if (!rctx)
5621 return 0;
5622
b3b6e05e 5623 int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, false, y);
7c673cae
FG
5624 if (r < 0)
5625 return r;
5626
b3b6e05e 5627 return append_atomic_test(dpp, *pstate, op);
11fdf7f2 5628}
7c673cae 5629
b3b6e05e
TL
5630int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
5631 const RGWObjState* state,
11fdf7f2
TL
5632 librados::ObjectOperation& op)
5633{
7c673cae 5634 if (!state->is_atomic) {
b3b6e05e 5635 ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
7c673cae
FG
5636 return 0;
5637 }
5638
5639 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
5640 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5641 } else {
b3b6e05e 5642 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
7c673cae
FG
5643 }
5644 return 0;
5645}
5646
b3b6e05e 5647int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae 5648{
b3b6e05e 5649 return store->get_obj_state(dpp, &ctx, bucket_info, obj, pstate, follow_olh, y, assume_noent);
7c673cae
FG
5650}
5651
5652void RGWRados::Object::invalidate_state()
5653{
11fdf7f2 5654 ctx.invalidate(obj);
7c673cae
FG
5655}
5656
b3b6e05e
TL
5657int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
5658 ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb 5659 const char *if_match, const char *if_nomatch, bool removal_op,
9f95a23c 5660 bool modify_tail, optional_yield y)
7c673cae 5661{
b3b6e05e 5662 int r = get_state(dpp, &state, false, y);
7c673cae
FG
5663 if (r < 0)
5664 return r;
5665
9f95a23c 5666 bool need_guard = ((state->manifest) || (state->obj_tag.length() != 0) ||
7c673cae
FG
5667 if_match != NULL || if_nomatch != NULL) &&
5668 (!state->fake_tag);
5669
5670 if (!state->is_atomic) {
b3b6e05e 5671 ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
7c673cae
FG
5672
5673 if (reset_obj) {
5674 op.create(false);
5675 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
5676 }
5677
5678 return 0;
5679 }
5680
5681 if (need_guard) {
5682 /* first verify that the object wasn't replaced under */
5683 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
5684 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5685 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5686 }
5687
5688 if (if_match) {
5689 if (strcmp(if_match, "*") == 0) {
5690 // test the object is existing
5691 if (!state->exists) {
5692 return -ERR_PRECONDITION_FAILED;
5693 }
5694 } else {
5695 bufferlist bl;
5696 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5697 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
5698 return -ERR_PRECONDITION_FAILED;
5699 }
5700 }
5701 }
5702
5703 if (if_nomatch) {
5704 if (strcmp(if_nomatch, "*") == 0) {
5705 // test the object is NOT existing
5706 if (state->exists) {
5707 return -ERR_PRECONDITION_FAILED;
5708 }
5709 } else {
5710 bufferlist bl;
5711 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5712 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
5713 return -ERR_PRECONDITION_FAILED;
5714 }
5715 }
5716 }
5717 }
5718
5719 if (reset_obj) {
5720 if (state->exists) {
5721 op.create(false);
5722 store->remove_rgw_head_obj(op);
5723 } else {
5724 op.create(true);
5725 }
5726 }
5727
5728 if (removal_op) {
5729 /* the object is being removed, no need to update its tag */
5730 return 0;
5731 }
5732
5733 if (ptag) {
5734 state->write_tag = *ptag;
5735 } else {
5736 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
5737 }
5738 bufferlist bl;
5739 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
5740
b3b6e05e 5741 ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
7c673cae
FG
5742
5743 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
5744 if (modify_tail) {
5745 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
5746 }
7c673cae
FG
5747
5748 return 0;
5749}
5750
7c673cae
FG
5751/**
5752 * Set an attr on an object.
5753 * bucket: name of the bucket holding the object
5754 * obj: name of the object to set the attr on
5755 * name: the attr to set
5756 * bl: the contents of the attr
5757 * Returns: 0 on success, -ERR# otherwise.
5758 */
b3b6e05e 5759int RGWRados::set_attr(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
7c673cae
FG
5760{
5761 map<string, bufferlist> attrs;
5762 attrs[name] = bl;
b3b6e05e 5763 return set_attrs(dpp, ctx, bucket_info, obj, attrs, NULL, null_yield);
7c673cae
FG
5764}
5765
b3b6e05e 5766int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
7c673cae 5767 map<string, bufferlist>& attrs,
9f95a23c
TL
5768 map<string, bufferlist>* rmattrs,
5769 optional_yield y)
7c673cae 5770{
494da23a
TL
5771 rgw_obj obj = src_obj;
5772 if (obj.key.instance == "null") {
5773 obj.key.instance.clear();
5774 }
5775
7c673cae 5776 rgw_rados_ref ref;
b3b6e05e 5777 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
5778 if (r < 0) {
5779 return r;
5780 }
5781 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5782
5783 ObjectWriteOperation op;
5784 RGWObjState *state = NULL;
5785
b3b6e05e 5786 r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, y);
7c673cae
FG
5787 if (r < 0)
5788 return r;
5789
494da23a 5790 // ensure null version object exist
9f95a23c 5791 if (src_obj.key.instance == "null" && !state->manifest) {
494da23a
TL
5792 return -ENOENT;
5793 }
5794
7c673cae
FG
5795 map<string, bufferlist>::iterator iter;
5796 if (rmattrs) {
5797 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5798 const string& name = iter->first;
5799 op.rmxattr(name.c_str());
5800 }
5801 }
5802
5803 const rgw_bucket& bucket = obj.bucket;
5804
5805 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5806 const string& name = iter->first;
5807 bufferlist& bl = iter->second;
5808
5809 if (!bl.length())
5810 continue;
5811
5812 op.setxattr(name.c_str(), bl);
5813
5814 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
5815 real_time ts;
5816 try {
11fdf7f2 5817 decode(ts, bl);
7c673cae
FG
5818
5819 rgw_obj_index_key obj_key;
5820 obj.key.get_index_key(&obj_key);
5821
b3b6e05e 5822 obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
7c673cae 5823 } catch (buffer::error& err) {
b3b6e05e 5824 ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
7c673cae
FG
5825 }
5826 }
5827 }
5828
5829 if (!op.size())
5830 return 0;
5831
9f95a23c 5832 RGWObjectCtx obj_ctx(this->store);
7c673cae
FG
5833
5834 bufferlist bl;
5835 RGWRados::Bucket bop(this, bucket_info);
5836 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5837
5838 if (state) {
5839 string tag;
5840 append_rand_alpha(cct, tag, tag, 32);
5841 state->write_tag = tag;
b3b6e05e 5842 r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
7c673cae
FG
5843
5844 if (r < 0)
5845 return r;
5846
5847 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
5848 op.setxattr(RGW_ATTR_ID_TAG, bl);
5849 }
5850
3efd9988
FG
5851
5852 real_time mtime = real_clock::now();
5853 struct timespec mtime_ts = real_clock::to_timespec(mtime);
5854 op.mtime2(&mtime_ts);
9f95a23c 5855 auto& ioctx = ref.pool.ioctx();
b3b6e05e 5856 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
7c673cae
FG
5857 if (state) {
5858 if (r >= 0) {
5859 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
5860 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
5861 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
11fdf7f2
TL
5862 string etag = rgw_bl_str(etag_bl);
5863 string content_type = rgw_bl_str(content_type_bl);
5864 string storage_class;
5865 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
5866 if (iter != attrs.end()) {
5867 storage_class = rgw_bl_str(iter->second);
5868 }
9f95a23c
TL
5869 uint64_t epoch = ioctx.get_last_version();
5870 int64_t poolid = ioctx.get_id();
b3b6e05e 5871 r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
11fdf7f2
TL
5872 mtime, etag, content_type, storage_class, &acl_bl,
5873 RGWObjCategory::Main, NULL);
7c673cae 5874 } else {
b3b6e05e 5875 int ret = index_op.cancel(dpp);
7c673cae 5876 if (ret < 0) {
b3b6e05e 5877 ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
7c673cae
FG
5878 }
5879 }
5880 }
5881 if (r < 0)
5882 return r;
5883
5884 if (state) {
5885 state->obj_tag.swap(bl);
5886 if (rmattrs) {
5887 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5888 state->attrset.erase(iter->first);
5889 }
5890 }
92f5a8d4 5891
7c673cae
FG
5892 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5893 state->attrset[iter->first] = iter->second;
5894 }
92f5a8d4
TL
5895
5896 auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
5897 if (iter != state->attrset.end()) {
5898 iter->second = state->obj_tag;
5899 }
7c673cae
FG
5900 }
5901
5902 return 0;
5903}
5904
b3b6e05e 5905int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
5906{
5907 RGWRados *store = source->get_store();
5908 CephContext *cct = store->ctx();
5909
5910 bufferlist etag;
5911
5912 map<string, bufferlist>::iterator iter;
5913
5914 RGWObjState *astate;
b3b6e05e 5915 int r = source->get_state(dpp, &astate, true, y);
7c673cae
FG
5916 if (r < 0)
5917 return r;
5918
5919 if (!astate->exists) {
5920 return -ENOENT;
5921 }
5922
5923 const RGWBucketInfo& bucket_info = source->get_bucket_info();
5924
5925 state.obj = astate->obj;
5926 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
5927
11fdf7f2
TL
5928 state.cur_pool = state.head_obj.pool;
5929 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
5930
b3b6e05e 5931 r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
7c673cae
FG
5932 if (r < 0) {
5933 return r;
5934 }
eafe8130
TL
5935 if (params.target_obj) {
5936 *params.target_obj = state.obj;
5937 }
7c673cae
FG
5938 if (params.attrs) {
5939 *params.attrs = astate->attrset;
11fdf7f2 5940 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
7c673cae 5941 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
b3b6e05e 5942 ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
7c673cae
FG
5943 }
5944 }
5945 }
5946
5947 /* Convert all times go GMT to make them compatible */
5948 if (conds.mod_ptr || conds.unmod_ptr) {
5949 obj_time_weight src_weight;
5950 src_weight.init(astate);
5951 src_weight.high_precision = conds.high_precision_time;
5952
5953 obj_time_weight dest_weight;
5954 dest_weight.high_precision = conds.high_precision_time;
5955
9f95a23c 5956 if (conds.mod_ptr && !conds.if_nomatch) {
7c673cae 5957 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
b3b6e05e 5958 ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
7c673cae
FG
5959 if (!(dest_weight < src_weight)) {
5960 return -ERR_NOT_MODIFIED;
5961 }
5962 }
5963
9f95a23c 5964 if (conds.unmod_ptr && !conds.if_match) {
7c673cae 5965 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
b3b6e05e 5966 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
7c673cae
FG
5967 if (dest_weight < src_weight) {
5968 return -ERR_PRECONDITION_FAILED;
5969 }
5970 }
5971 }
5972 if (conds.if_match || conds.if_nomatch) {
b3b6e05e 5973 r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
7c673cae
FG
5974 if (r < 0)
5975 return r;
5976
5977 if (conds.if_match) {
5978 string if_match_str = rgw_string_unquote(conds.if_match);
b3b6e05e 5979 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
11fdf7f2 5980 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
7c673cae
FG
5981 return -ERR_PRECONDITION_FAILED;
5982 }
5983 }
5984
5985 if (conds.if_nomatch) {
5986 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
b3b6e05e 5987 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
11fdf7f2 5988 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
7c673cae
FG
5989 return -ERR_NOT_MODIFIED;
5990 }
5991 }
5992 }
5993
5994 if (params.obj_size)
5995 *params.obj_size = astate->size;
5996 if (params.lastmod)
5997 *params.lastmod = astate->mtime;
5998
5999 return 0;
6000}
6001
6002int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
6003{
6004 if (ofs < 0) {
6005 ofs += obj_size;
11fdf7f2
TL
6006 if (ofs < 0)
6007 ofs = 0;
6008 end = obj_size - 1;
6009 } else if (end < 0) {
6010 end = obj_size - 1;
7c673cae
FG
6011 }
6012
11fdf7f2
TL
6013 if (obj_size > 0) {
6014 if (ofs >= (off_t)obj_size) {
6015 return -ERANGE;
6016 }
6017 if (end >= (off_t)obj_size) {
6018 end = obj_size - 1;
7c673cae
FG
6019 }
6020 }
7c673cae
FG
6021 return 0;
6022}
6023
b3b6e05e 6024int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, BucketShard **pbs, std::function<int(BucketShard *)> call)
31f18b77
FG
6025{
6026 RGWRados *store = target->get_store();
6027 BucketShard *bs;
6028 int r;
6029
6030#define NUM_RESHARD_RETRIES 10
6031 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
b3b6e05e 6032 int ret = get_bucket_shard(&bs, dpp);
31f18b77 6033 if (ret < 0) {
b3b6e05e 6034 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
31f18b77
FG
6035 return ret;
6036 }
6037 r = call(bs);
6038 if (r != -ERR_BUSY_RESHARDING) {
6039 break;
6040 }
b3b6e05e 6041 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
31f18b77 6042 string new_bucket_id;
11fdf7f2 6043 r = store->block_while_resharding(bs, &new_bucket_id,
b3b6e05e 6044 target->bucket_info, null_yield, dpp);
31f18b77
FG
6045 if (r == -ERR_BUSY_RESHARDING) {
6046 continue;
6047 }
6048 if (r < 0) {
6049 return r;
6050 }
b3b6e05e 6051 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
31f18b77 6052 i = 0; /* resharding is finished, make sure we can retry */
b3b6e05e 6053 r = target->update_bucket_id(new_bucket_id, dpp);
31f18b77 6054 if (r < 0) {
b3b6e05e 6055 ldpp_dout(dpp, 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
31f18b77
FG
6056 return r;
6057 }
6058 invalidate_bs();
81eedcae 6059 } // for loop
31f18b77
FG
6060
6061 if (r < 0) {
6062 return r;
6063 }
6064
6065 if (pbs) {
6066 *pbs = bs;
6067 }
6068
6069 return 0;
6070}
6071
b3b6e05e 6072int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
7c673cae
FG
6073{
6074 if (blind) {
6075 return 0;
6076 }
6077 RGWRados *store = target->get_store();
7c673cae
FG
6078
6079 if (write_tag && write_tag->length()) {
6080 optag = string(write_tag->c_str(), write_tag->length());
6081 } else {
6082 if (optag.empty()) {
6083 append_rand_alpha(store->ctx(), optag, optag, 32);
6084 }
6085 }
6086
b3b6e05e
TL
6087 int r = guard_reshard(dpp, nullptr, [&](BucketShard *bs) -> int {
6088 return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
f64942e4 6089 });
31f18b77 6090
7c673cae
FG
6091 if (r < 0) {
6092 return r;
6093 }
6094 prepared = true;
31f18b77 6095
7c673cae
FG
6096 return 0;
6097}
6098
b3b6e05e 6099int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
7c673cae
FG
6100 uint64_t size, uint64_t accounted_size,
6101 ceph::real_time& ut, const string& etag,
11fdf7f2 6102 const string& content_type, const string& storage_class,
7c673cae
FG
6103 bufferlist *acl_bl,
6104 RGWObjCategory category,
11fdf7f2
TL
6105 list<rgw_obj_index_key> *remove_objs, const string *user_data,
6106 bool appendable)
7c673cae
FG
6107{
6108 if (blind) {
6109 return 0;
6110 }
6111 RGWRados *store = target->get_store();
6112 BucketShard *bs;
31f18b77 6113
b3b6e05e 6114 int ret = get_bucket_shard(&bs, dpp);
7c673cae 6115 if (ret < 0) {
b3b6e05e 6116 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
7c673cae
FG
6117 return ret;
6118 }
6119
6120 rgw_bucket_dir_entry ent;
6121 obj.key.get_index_key(&ent.key);
6122 ent.meta.size = size;
6123 ent.meta.accounted_size = accounted_size;
6124 ent.meta.mtime = ut;
6125 ent.meta.etag = etag;
11fdf7f2 6126 ent.meta.storage_class = storage_class;
7c673cae
FG
6127 if (user_data)
6128 ent.meta.user_data = *user_data;
6129
6130 ACLOwner owner;
6131 if (acl_bl && acl_bl->length()) {
6132 int ret = store->decode_policy(*acl_bl, &owner);
6133 if (ret < 0) {
b3b6e05e 6134 ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
7c673cae
FG
6135 }
6136 }
6137 ent.meta.owner = owner.get_id().to_str();
6138 ent.meta.owner_display_name = owner.get_display_name();
6139 ent.meta.content_type = content_type;
11fdf7f2 6140 ent.meta.appendable = appendable;
7c673cae 6141
31f18b77 6142 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 6143
b3b6e05e 6144 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6145 if (r < 0) {
b3b6e05e 6146 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6147 }
6148
6149 return ret;
6150}
6151
b3b6e05e
TL
6152int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
6153 int64_t poolid, uint64_t epoch,
7c673cae
FG
6154 real_time& removed_mtime,
6155 list<rgw_obj_index_key> *remove_objs)
6156{
6157 if (blind) {
6158 return 0;
6159 }
6160 RGWRados *store = target->get_store();
6161 BucketShard *bs;
31f18b77 6162
b3b6e05e 6163 int ret = get_bucket_shard(&bs, dpp);
7c673cae 6164 if (ret < 0) {
b3b6e05e 6165 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
7c673cae
FG
6166 return ret;
6167 }
6168
31f18b77 6169 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 6170
b3b6e05e 6171 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6172 if (r < 0) {
b3b6e05e 6173 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6174 }
6175
6176 return ret;
6177}
6178
6179
b3b6e05e 6180int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp)
7c673cae
FG
6181{
6182 if (blind) {
6183 return 0;
6184 }
6185 RGWRados *store = target->get_store();
6186 BucketShard *bs;
7c673cae 6187
b3b6e05e 6188 int ret = guard_reshard(dpp, &bs, [&](BucketShard *bs) -> int {
f64942e4
AA
6189 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6190 });
7c673cae
FG
6191
6192 /*
6193 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6194 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6195 * have no way to tell that they're all caught up
6196 */
b3b6e05e 6197 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6198 if (r < 0) {
b3b6e05e 6199 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6200 }
6201
6202 return ret;
6203}
6204
b3b6e05e 6205int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
6206{
6207 RGWRados *store = source->get_store();
7c673cae 6208
7c673cae
FG
6209 rgw_raw_obj read_obj;
6210 uint64_t read_ofs = ofs;
6211 uint64_t len, read_len;
6212 bool reading_from_head = true;
6213 ObjectReadOperation op;
6214
6215 bool merge_bl = false;
6216 bufferlist *pbl = &bl;
6217 bufferlist read_bl;
6218 uint64_t max_chunk_size;
6219
6220 RGWObjState *astate;
b3b6e05e 6221 int r = source->get_state(dpp, &astate, true, y);
7c673cae
FG
6222 if (r < 0)
6223 return r;
6224
11fdf7f2
TL
6225 if (astate->size == 0) {
6226 end = 0;
6227 } else if (end >= (int64_t)astate->size) {
6228 end = astate->size - 1;
6229 }
6230
7c673cae
FG
6231 if (end < 0)
6232 len = 0;
6233 else
6234 len = end - ofs + 1;
6235
9f95a23c 6236 if (astate->manifest && astate->manifest->has_tail()) {
7c673cae 6237 /* now get the relevant object part */
b3b6e05e 6238 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
7c673cae
FG
6239
6240 uint64_t stripe_ofs = iter.get_stripe_ofs();
f67539c2 6241 read_obj = iter.get_location().get_raw_obj(store->store);
11fdf7f2 6242 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6243 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6244 reading_from_head = (read_obj == state.head_obj);
6245 } else {
6246 read_obj = state.head_obj;
6247 }
6248
b3b6e05e 6249 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
7c673cae 6250 if (r < 0) {
b3b6e05e 6251 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
7c673cae
FG
6252 return r;
6253 }
6254
6255 if (len > max_chunk_size)
6256 len = max_chunk_size;
6257
6258
7c673cae
FG
6259 read_len = len;
6260
6261 if (reading_from_head) {
6262 /* only when reading from the head object do we need to do the atomic test */
b3b6e05e 6263 r = store->append_atomic_test(dpp, &source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, y);
7c673cae
FG
6264 if (r < 0)
6265 return r;
6266
6267 if (astate && astate->prefetch_data) {
6268 if (!ofs && astate->data.length() >= len) {
6269 bl = astate->data;
6270 return bl.length();
6271 }
6272
6273 if (ofs < astate->data.length()) {
11fdf7f2 6274 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
9f95a23c 6275 astate->data.begin(ofs).copy(copy_len, bl);
7c673cae
FG
6276 read_len -= copy_len;
6277 read_ofs += copy_len;
6278 if (!read_len)
6279 return bl.length();
6280
6281 merge_bl = true;
6282 pbl = &read_bl;
6283 }
6284 }
6285 }
6286
b3b6e05e 6287 ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
7c673cae
FG
6288 op.read(read_ofs, read_len, pbl, NULL);
6289
11fdf7f2
TL
6290 if (state.cur_pool != read_obj.pool) {
6291 auto iter = state.io_ctxs.find(read_obj.pool);
6292 if (iter == state.io_ctxs.end()) {
6293 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
b3b6e05e 6294 r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
11fdf7f2 6295 if (r < 0) {
b3b6e05e 6296 ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
11fdf7f2
TL
6297 return r;
6298 }
6299 } else {
6300 state.cur_ioctx = &iter->second;
7c673cae 6301 }
11fdf7f2 6302 state.cur_pool = read_obj.pool;
7c673cae
FG
6303 }
6304
11fdf7f2 6305 state.cur_ioctx->locator_set_key(read_obj.loc);
7c673cae 6306
11fdf7f2 6307 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
b3b6e05e 6308 ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
7c673cae 6309
7c673cae 6310 if (r < 0) {
7c673cae
FG
6311 return r;
6312 }
7c673cae 6313
11fdf7f2
TL
6314 if (merge_bl) {
6315 bl.append(read_bl);
7c673cae
FG
6316 }
6317
7c673cae
FG
6318 return bl.length();
6319}
6320
11fdf7f2
TL
6321struct get_obj_data {
6322 RGWRados* store;
6323 RGWGetDataCB* client_cb;
6324 rgw::Aio* aio;
6325 uint64_t offset; // next offset to write to client
6326 rgw::AioResultList completed; // completed read results, sorted by offset
9f95a23c 6327 optional_yield yield;
7c673cae 6328
9f95a23c
TL
6329 get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio,
6330 uint64_t offset, optional_yield yield)
6331 : store(store), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
7c673cae 6332
11fdf7f2
TL
6333 int flush(rgw::AioResultList&& results) {
6334 int r = rgw::check_for_errors(results);
6335 if (r < 0) {
6336 return r;
7c673cae 6337 }
7c673cae 6338
11fdf7f2
TL
6339 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6340 results.sort(cmp); // merge() requires results to be sorted first
6341 completed.merge(results, cmp); // merge results in sorted order
7c673cae 6342
11fdf7f2
TL
6343 while (!completed.empty() && completed.front().id == offset) {
6344 auto bl = std::move(completed.front().data);
6345 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
7c673cae 6346
11fdf7f2
TL
6347 offset += bl.length();
6348 int r = client_cb->handle_data(bl, 0, bl.length());
6349 if (r < 0) {
6350 return r;
6351 }
7c673cae 6352 }
11fdf7f2 6353 return 0;
7c673cae
FG
6354 }
6355
11fdf7f2
TL
6356 void cancel() {
6357 // wait for all completions to drain and ignore the results
6358 aio->drain();
7c673cae
FG
6359 }
6360
11fdf7f2
TL
6361 int drain() {
6362 auto c = aio->wait();
6363 while (!c.empty()) {
6364 int r = flush(std::move(c));
7c673cae 6365 if (r < 0) {
11fdf7f2 6366 cancel();
7c673cae
FG
6367 return r;
6368 }
11fdf7f2 6369 c = aio->wait();
7c673cae 6370 }
11fdf7f2 6371 return flush(std::move(c));
7c673cae
FG
6372 }
6373};
6374
b3b6e05e
TL
6375static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
6376 const rgw_raw_obj& read_obj, off_t obj_ofs,
11fdf7f2
TL
6377 off_t read_ofs, off_t len, bool is_head_obj,
6378 RGWObjState *astate, void *arg)
7c673cae
FG
6379{
6380 struct get_obj_data *d = (struct get_obj_data *)arg;
6381
b3b6e05e 6382 return d->store->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
11fdf7f2 6383 is_head_obj, astate, arg);
7c673cae
FG
6384}
6385
b3b6e05e
TL
6386int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
6387 const rgw_raw_obj& read_obj, off_t obj_ofs,
11fdf7f2
TL
6388 off_t read_ofs, off_t len, bool is_head_obj,
6389 RGWObjState *astate, void *arg)
7c673cae 6390{
7c673cae
FG
6391 ObjectReadOperation op;
6392 struct get_obj_data *d = (struct get_obj_data *)arg;
6393 string oid, key;
7c673cae
FG
6394
6395 if (is_head_obj) {
6396 /* only when reading from the head object do we need to do the atomic test */
b3b6e05e 6397 int r = append_atomic_test(dpp, astate, op);
7c673cae
FG
6398 if (r < 0)
6399 return r;
6400
6401 if (astate &&
6402 obj_ofs < astate->data.length()) {
11fdf7f2 6403 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
7c673cae 6404
7c673cae 6405 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
7c673cae
FG
6406 if (r < 0)
6407 return r;
6408
7c673cae 6409 len -= chunk_len;
11fdf7f2 6410 d->offset += chunk_len;
7c673cae
FG
6411 read_ofs += chunk_len;
6412 obj_ofs += chunk_len;
6413 if (!len)
6414 return 0;
6415 }
6416 }
6417
11fdf7f2 6418 auto obj = d->store->svc.rados->obj(read_obj);
b3b6e05e 6419 int r = obj.open(dpp);
7c673cae 6420 if (r < 0) {
b3b6e05e 6421 ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
11fdf7f2 6422 return r;
7c673cae
FG
6423 }
6424
b3b6e05e 6425 ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
11fdf7f2 6426 op.read(read_ofs, len, nullptr, nullptr);
7c673cae 6427
11fdf7f2
TL
6428 const uint64_t cost = len;
6429 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
7c673cae 6430
9f95a23c 6431 auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
7c673cae 6432
11fdf7f2 6433 return d->flush(std::move(completed));
7c673cae
FG
6434}
6435
b3b6e05e 6436int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
9f95a23c 6437 optional_yield y)
7c673cae
FG
6438{
6439 RGWRados *store = source->get_store();
6440 CephContext *cct = store->ctx();
7c673cae 6441 RGWObjectCtx& obj_ctx = source->get_ctx();
11fdf7f2
TL
6442 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6443 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
7c673cae 6444
9f95a23c
TL
6445 auto aio = rgw::make_throttle(window_size, y);
6446 get_obj_data data(store, cb, &*aio, ofs, y);
7c673cae 6447
b3b6e05e 6448 int r = store->iterate_obj(dpp, obj_ctx, source->get_bucket_info(), state.obj,
9f95a23c 6449 ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
7c673cae 6450 if (r < 0) {
b3b6e05e 6451 ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
11fdf7f2
TL
6452 data.cancel(); // drain completions without writing back to client
6453 return r;
7c673cae
FG
6454 }
6455
11fdf7f2 6456 return data.drain();
7c673cae
FG
6457}
6458
b3b6e05e 6459int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
7c673cae 6460 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11fdf7f2 6461 off_t ofs, off_t end, uint64_t max_chunk_size,
9f95a23c 6462 iterate_obj_cb cb, void *arg, optional_yield y)
7c673cae
FG
6463{
6464 rgw_raw_obj head_obj;
6465 rgw_raw_obj read_obj;
6466 uint64_t read_ofs = ofs;
6467 uint64_t len;
6468 bool reading_from_head = true;
6469 RGWObjState *astate = NULL;
6470
6471 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6472
b3b6e05e 6473 int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
6474 if (r < 0) {
6475 return r;
6476 }
6477
6478 if (end < 0)
6479 len = 0;
6480 else
6481 len = end - ofs + 1;
6482
9f95a23c 6483 if (astate->manifest) {
7c673cae 6484 /* now get the relevant object stripe */
b3b6e05e 6485 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
7c673cae 6486
b3b6e05e 6487 RGWObjManifest::obj_iterator obj_end = astate->manifest->obj_end(dpp);
7c673cae
FG
6488
6489 for (; iter != obj_end && ofs <= end; ++iter) {
6490 off_t stripe_ofs = iter.get_stripe_ofs();
6491 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6492
6493 while (ofs < next_stripe_ofs && ofs <= end) {
f67539c2 6494 read_obj = iter.get_location().get_raw_obj(store);
11fdf7f2 6495 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6496 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6497
6498 if (read_len > max_chunk_size) {
6499 read_len = max_chunk_size;
6500 }
6501
6502 reading_from_head = (read_obj == head_obj);
b3b6e05e 6503 r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6504 if (r < 0) {
6505 return r;
6506 }
6507
6508 len -= read_len;
6509 ofs += read_len;
6510 }
6511 }
6512 } else {
6513 while (ofs <= end) {
6514 read_obj = head_obj;
11fdf7f2 6515 uint64_t read_len = std::min(len, max_chunk_size);
7c673cae 6516
b3b6e05e 6517 r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6518 if (r < 0) {
6519 return r;
6520 }
6521
6522 len -= read_len;
6523 ofs += read_len;
6524 }
6525 }
6526
6527 return 0;
6528}
6529
b3b6e05e 6530int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
7c673cae
FG
6531{
6532 rgw_rados_ref ref;
b3b6e05e 6533 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
6534 if (r < 0) {
6535 return r;
6536 }
6537
b3b6e05e 6538 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
7c673cae
FG
6539}
6540
b3b6e05e 6541int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
7c673cae
FG
6542{
6543 rgw_rados_ref ref;
b3b6e05e 6544 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
6545 if (r < 0) {
6546 return r;
6547 }
6548
6549 bufferlist outbl;
6550
b3b6e05e 6551 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
7c673cae
FG
6552}
6553
b3b6e05e 6554int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
7c673cae
FG
6555{
6556 ObjectWriteOperation op;
6557
11fdf7f2 6558 ceph_assert(olh_obj.key.instance.empty());
7c673cae
FG
6559
6560 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6561
6562 if (!state.exists) {
6563 op.create(true);
6564 } else {
6565 op.assert_exists();
b32b8144
FG
6566 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6567 op.mtime2(&mtime_ts);
7c673cae
FG
6568 }
6569
6570 /*
6571 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6572 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6573 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6574 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6575 * log will reflect that.
6576 *
6577 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6578 * is used for object data instance, olh_tag for olh instance.
6579 */
6580 if (has_tag) {
6581 /* guard against racing writes */
b3b6e05e 6582 bucket_index_guard_olh_op(dpp, state, op);
7c673cae
FG
6583 }
6584
6585 if (!has_tag) {
6586 /* obj tag */
9f95a23c 6587 string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6588
7c673cae
FG
6589 bufferlist bl;
6590 bl.append(obj_tag.c_str(), obj_tag.size());
6591 op.setxattr(RGW_ATTR_ID_TAG, bl);
6592
6593 state.attrset[RGW_ATTR_ID_TAG] = bl;
6594 state.obj_tag = bl;
6595
6596 /* olh tag */
9f95a23c 6597 string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6598
7c673cae
FG
6599 bufferlist olh_bl;
6600 olh_bl.append(olh_tag.c_str(), olh_tag.size());
6601 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
6602
6603 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
6604 state.olh_tag = olh_bl;
6605 state.is_olh = true;
6606
6607 bufferlist verbl;
6608 op.setxattr(RGW_ATTR_OLH_VER, verbl);
6609 }
6610
6611 bufferlist bl;
6612 RGWOLHPendingInfo pending_info;
6613 pending_info.time = real_clock::now();
11fdf7f2 6614 encode(pending_info, bl);
7c673cae
FG
6615
6616#define OLH_PENDING_TAG_LEN 32
6617 /* tag will start with current time epoch, this so that entries are sorted by time */
6618 char buf[32];
6619 utime_t ut(pending_info.time);
6620 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
6621 *op_tag = buf;
6622
9f95a23c 6623 string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
11fdf7f2 6624
7c673cae
FG
6625 op_tag->append(s);
6626
6627 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
6628 attr_name.append(*op_tag);
6629
6630 op.setxattr(attr_name.c_str(), bl);
6631
b3b6e05e 6632 int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
7c673cae
FG
6633 if (ret < 0) {
6634 return ret;
6635 }
6636
6637 state.exists = true;
6638 state.attrset[attr_name] = bl;
6639
6640 return 0;
6641}
6642
b3b6e05e 6643int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
7c673cae
FG
6644{
6645 int ret;
6646
b3b6e05e 6647 ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
7c673cae
FG
6648 if (ret == -EEXIST) {
6649 ret = -ECANCELED;
6650 }
6651
6652 return ret;
6653}
6654
b3b6e05e
TL
6655int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
6656 BucketShard *bs,
f64942e4
AA
6657 const rgw_obj& obj_instance,
6658 const RGWBucketInfo& bucket_info,
6659 std::function<int(BucketShard *)> call)
31f18b77
FG
6660{
6661 rgw_obj obj;
6662 const rgw_obj *pobj = &obj_instance;
6663 int r;
6664
6665 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
b3b6e05e 6666 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
31f18b77 6667 if (r < 0) {
b3b6e05e 6668 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
31f18b77
FG
6669 return r;
6670 }
6671 r = call(bs);
6672 if (r != -ERR_BUSY_RESHARDING) {
6673 break;
6674 }
b3b6e05e 6675 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
31f18b77 6676 string new_bucket_id;
b3b6e05e 6677 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield, dpp);
31f18b77
FG
6678 if (r == -ERR_BUSY_RESHARDING) {
6679 continue;
6680 }
6681 if (r < 0) {
6682 return r;
6683 }
b3b6e05e 6684 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
31f18b77
FG
6685 i = 0; /* resharding is finished, make sure we can retry */
6686
6687 obj = *pobj;
6688 obj.bucket.update_bucket_id(new_bucket_id);
6689 pobj = &obj;
81eedcae 6690 } // for loop
31f18b77
FG
6691
6692 if (r < 0) {
6693 return r;
6694 }
6695
6696 return 0;
6697}
6698
f64942e4
AA
6699int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
6700 string *new_bucket_id,
11fdf7f2 6701 const RGWBucketInfo& bucket_info,
b3b6e05e
TL
6702 optional_yield y,
6703 const DoutPrefixProvider *dpp)
31f18b77 6704{
11fdf7f2
TL
6705 int ret = 0;
6706 cls_rgw_bucket_instance_entry entry;
6707
81eedcae
TL
6708 // since we want to run this recovery code from two distinct places,
6709 // let's just put it in a lambda so we can easily re-use; if the
6710 // lambda successfully fetches a new bucket id, it sets
6711 // new_bucket_id and returns 0, otherwise it returns a negative
6712 // error code
6713 auto fetch_new_bucket_id =
b3b6e05e 6714 [this, &bucket_info, dpp](const std::string& log_tag,
9f95a23c 6715 std::string* new_bucket_id) -> int {
81eedcae 6716 RGWBucketInfo fresh_bucket_info = bucket_info;
b3b6e05e 6717 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr, dpp);
81eedcae 6718 if (ret < 0) {
b3b6e05e 6719 ldpp_dout(dpp, 0) << __func__ <<
81eedcae
TL
6720 " ERROR: failed to refresh bucket info after reshard at " <<
6721 log_tag << ": " << cpp_strerror(-ret) << dendl;
6722 return ret;
6723 }
6724 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
6725 return 0;
6726 };
6727
6728 constexpr int num_retries = 10;
6729 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
9f95a23c
TL
6730 auto& ref = bs->bucket_obj.get_ref();
6731 ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
81eedcae
TL
6732 if (ret == -ENOENT) {
6733 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
6734 } else if (ret < 0) {
b3b6e05e 6735 ldpp_dout(dpp, 0) << __func__ <<
81eedcae
TL
6736 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
6737 dendl;
11fdf7f2
TL
6738 return ret;
6739 }
81eedcae 6740
11fdf7f2 6741 if (!entry.resharding_in_progress()) {
81eedcae
TL
6742 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
6743 new_bucket_id);
11fdf7f2 6744 }
31f18b77 6745
b3b6e05e 6746 ldpp_dout(dpp, 20) << "NOTICE: reshard still in progress; " <<
81eedcae
TL
6747 (i < num_retries ? "retrying" : "too many retries") << dendl;
6748
6749 if (i == num_retries) {
11fdf7f2
TL
6750 break;
6751 }
6752
6753 // If bucket is erroneously marked as resharding (e.g., crash or
6754 // other error) then fix it. If we can take the bucket reshard
6755 // lock then it means no other resharding should be taking place,
6756 // and we're free to clear the flags.
6757 {
6758 // since we expect to do this rarely, we'll do our work in a
6759 // block and erase our work after each try
6760
9f95a23c 6761 RGWObjectCtx obj_ctx(this->store);
11fdf7f2
TL
6762 const rgw_bucket& b = bs->bucket;
6763 std::string bucket_id = b.get_key();
9f95a23c 6764 RGWBucketReshardLock reshard_lock(this->store, bucket_info, true);
11fdf7f2
TL
6765 ret = reshard_lock.lock();
6766 if (ret < 0) {
b3b6e05e 6767 ldpp_dout(dpp, 20) << __func__ <<
11fdf7f2
TL
6768 " INFO: failed to take reshard lock for bucket " <<
6769 bucket_id << "; expected if resharding underway" << dendl;
6770 } else {
b3b6e05e 6771 ldpp_dout(dpp, 10) << __func__ <<
11fdf7f2
TL
6772 " INFO: was able to take reshard lock for bucket " <<
6773 bucket_id << dendl;
b3b6e05e 6774 ret = RGWBucketReshard::clear_resharding(dpp, this->store, bucket_info);
11fdf7f2
TL
6775 if (ret < 0) {
6776 reshard_lock.unlock();
b3b6e05e 6777 ldpp_dout(dpp, 0) << __func__ <<
11fdf7f2
TL
6778 " ERROR: failed to clear resharding flags for bucket " <<
6779 bucket_id << dendl;
6780 } else {
6781 reshard_lock.unlock();
b3b6e05e 6782 ldpp_dout(dpp, 5) << __func__ <<
11fdf7f2
TL
6783 " INFO: apparently successfully cleared resharding flags for "
6784 "bucket " << bucket_id << dendl;
6785 continue; // if we apparently succeed immediately test again
6786 } // if clear resharding succeeded
6787 } // if taking of lock succeeded
6788 } // block to encapsulate recovery from incomplete reshard
6789
6790 ret = reshard_wait->wait(y);
6791 if (ret < 0) {
b3b6e05e 6792 ldpp_dout(dpp, 0) << __func__ <<
81eedcae 6793 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2
TL
6794 return ret;
6795 }
81eedcae
TL
6796 } // for loop
6797
b3b6e05e 6798 ldpp_dout(dpp, 0) << __func__ <<
81eedcae 6799 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2 6800 return -ERR_BUSY_RESHARDING;
31f18b77
FG
6801}
6802
b3b6e05e 6803int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
7c673cae
FG
6804 bool delete_marker,
6805 const string& op_tag,
6806 struct rgw_bucket_dir_entry_meta *meta,
6807 uint64_t olh_epoch,
91327a77
AA
6808 real_time unmod_since, bool high_precision_time,
6809 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
6810{
6811 rgw_rados_ref ref;
b3b6e05e 6812 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
6813 if (r < 0) {
6814 return r;
6815 }
6816
31f18b77
FG
6817 rgw_zone_set zones_trace;
6818 if (_zones_trace) {
6819 zones_trace = *_zones_trace;
7c673cae 6820 }
9f95a23c 6821 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
7c673cae 6822
31f18b77
FG
6823 BucketShard bs(this);
6824
b3b6e05e 6825 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 6826 [&](BucketShard *bs) -> int {
9f95a23c
TL
6827 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
6828 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6829 librados::ObjectWriteOperation op;
6830 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6831 cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
6832 delete_marker, op_tag, meta, olh_epoch,
6833 unmod_since, high_precision_time,
6834 svc.zone->get_zone().log_data, zones_trace);
b3b6e05e 6835 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
6836 });
6837 if (r < 0) {
b3b6e05e 6838 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
31f18b77 6839 return r;
7c673cae
FG
6840 }
6841
b3b6e05e 6842 r = svc.datalog_rados->add_entry(dpp, bucket_info, bs.shard_id);
9f95a23c 6843 if (r < 0) {
b3b6e05e 6844 ldpp_dout(dpp, 0) << "ERROR: failed writing data log" << dendl;
91327a77
AA
6845 }
6846
7c673cae
FG
6847 return 0;
6848}
6849
b3b6e05e 6850void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
7c673cae 6851{
b3b6e05e 6852 ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
7c673cae
FG
6853 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
6854}
6855
b3b6e05e 6856int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 6857 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
6858{
6859 rgw_rados_ref ref;
b3b6e05e 6860 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
6861 if (r < 0) {
6862 return r;
6863 }
6864
31f18b77
FG
6865 rgw_zone_set zones_trace;
6866 if (_zones_trace) {
6867 zones_trace = *_zones_trace;
7c673cae 6868 }
9f95a23c 6869 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
31f18b77
FG
6870
6871 BucketShard bs(this);
7c673cae
FG
6872
6873 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
b3b6e05e 6874 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 6875 [&](BucketShard *bs) -> int {
9f95a23c 6876 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6877 librados::ObjectWriteOperation op;
6878 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6879 cls_rgw_bucket_unlink_instance(op, key, op_tag,
6880 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
b3b6e05e 6881 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
6882 });
6883 if (r < 0) {
b3b6e05e 6884 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
31f18b77 6885 return r;
7c673cae
FG
6886 }
6887
6888 return 0;
6889}
6890
b3b6e05e
TL
6891int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
6892 const RGWBucketInfo& bucket_info, RGWObjState& state,
7c673cae
FG
6893 const rgw_obj& obj_instance, uint64_t ver_marker,
6894 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
6895 bool *is_truncated)
6896{
6897 rgw_rados_ref ref;
b3b6e05e 6898 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
6899 if (r < 0) {
6900 return r;
6901 }
6902
6903 BucketShard bs(this);
f64942e4 6904 int ret =
b3b6e05e 6905 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 6906 if (ret < 0) {
b3b6e05e 6907 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
6908 return ret;
6909 }
6910
6911 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6912
6913 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6914
b3b6e05e 6915 ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 6916 [&](BucketShard *bs) -> int {
9f95a23c 6917 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6918 ObjectReadOperation op;
6919 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6920
6921 rgw_cls_read_olh_log_ret log_ret;
6922 int op_ret = 0;
6923 cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
6924 bufferlist outbl;
b3b6e05e 6925 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
9f95a23c
TL
6926 if (r < 0) {
6927 return r;
6928 }
6929 if (op_ret < 0) {
6930 return op_ret;
6931 }
6932
6933 *log = std::move(log_ret.log);
6934 *is_truncated = log_ret.is_truncated;
6935 return r;
f64942e4 6936 });
31f18b77 6937 if (ret < 0) {
b3b6e05e 6938 ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 6939 return ret;
31f18b77 6940 }
7c673cae
FG
6941
6942 return 0;
6943}
6944
a8e16298
TL
6945// a multisite sync bug resulted in the OLH head attributes being overwritten by
6946// the attributes from another zone, causing link_olh() to fail endlessly due to
6947// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
6948// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
b3b6e05e 6949int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
a8e16298
TL
6950 const rgw_obj& obj)
6951{
6952 // fetch the current olh entry from the bucket index
6953 rgw_bucket_olh_entry olh;
b3b6e05e 6954 int r = bi_get_olh(dpp, bucket_info, obj, &olh);
a8e16298 6955 if (r < 0) {
b3b6e05e 6956 ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
a8e16298
TL
6957 return r;
6958 }
11fdf7f2 6959 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
a8e16298
TL
6960 return 0;
6961 }
6962
b3b6e05e 6963 ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
a8e16298
TL
6964 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
6965
6966 // rewrite OLH_ID_TAG and OLH_INFO from current olh
6967 ObjectWriteOperation op;
6968 // assert this is the same olh tag we think we're fixing
b3b6e05e 6969 bucket_index_guard_olh_op(dpp, *state, op);
a8e16298
TL
6970 // preserve existing mtime
6971 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
6972 op.mtime2(&mtime_ts);
6973 {
6974 bufferlist bl;
6975 bl.append(olh.tag.c_str(), olh.tag.size());
6976 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
6977 }
6978 {
6979 RGWOLHInfo info;
6980 info.target = rgw_obj(bucket_info.bucket, olh.key);
6981 info.removed = olh.delete_marker;
6982 bufferlist bl;
6983 encode(info, bl);
6984 op.setxattr(RGW_ATTR_OLH_INFO, bl);
6985 }
6986 rgw_rados_ref ref;
b3b6e05e 6987 r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
a8e16298
TL
6988 if (r < 0) {
6989 return r;
6990 }
b3b6e05e 6991 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
a8e16298 6992 if (r < 0) {
b3b6e05e 6993 ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
a8e16298
TL
6994 << cpp_strerror(r) << dendl;
6995 return r;
6996 }
6997 return 0;
6998}
6999
b3b6e05e 7000int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
7c673cae
FG
7001{
7002 rgw_rados_ref ref;
b3b6e05e 7003 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7004 if (r < 0) {
7005 return r;
7006 }
7007
7008 BucketShard bs(this);
f64942e4 7009 int ret =
b3b6e05e 7010 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 7011 if (ret < 0) {
b3b6e05e 7012 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
7013 return ret;
7014 }
7015
7016 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7017
7018 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7019
b3b6e05e 7020 ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4
AA
7021 [&](BucketShard *pbs) -> int {
7022 ObjectWriteOperation op;
7023 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7024 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
b3b6e05e 7025 return pbs->bucket_obj.operate(dpp, &op, null_yield);
31f18b77
FG
7026 });
7027 if (ret < 0) {
b3b6e05e 7028 ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 7029 return ret;
31f18b77 7030 }
7c673cae
FG
7031
7032 return 0;
7033}
7034
b3b6e05e 7035int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
7c673cae
FG
7036{
7037 rgw_rados_ref ref;
b3b6e05e 7038 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7039 if (r < 0) {
7040 return r;
7041 }
7042
7043 BucketShard bs(this);
7c673cae
FG
7044
7045 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7046
7047 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7048
b3b6e05e 7049 int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4
AA
7050 [&](BucketShard *pbs) -> int {
7051 ObjectWriteOperation op;
9f95a23c 7052 auto& ref = pbs->bucket_obj.get_ref();
f64942e4 7053 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c 7054 cls_rgw_clear_olh(op, key, olh_tag);
b3b6e05e 7055 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77 7056 });
7c673cae 7057 if (ret < 0) {
b3b6e05e 7058 ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
7c673cae
FG
7059 return ret;
7060 }
7061
7062 return 0;
7063}
7064
92f5a8d4
TL
7065static int decode_olh_info(CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
7066{
7067 try {
7068 auto biter = bl.cbegin();
7069 decode(*olh, biter);
7070 return 0;
7071 } catch (buffer::error& err) {
7072 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
7073 return -EIO;
7074 }
7075}
7076
b3b6e05e 7077int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7c673cae 7078 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
31f18b77 7079 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7c673cae
FG
7080{
7081 if (log.empty()) {
7082 return 0;
7083 }
7084
7085 librados::ObjectWriteOperation op;
7086
7087 uint64_t last_ver = log.rbegin()->first;
7088 *plast_ver = last_ver;
7089
7090 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
7091
7092 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
92f5a8d4 7093 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
7c673cae 7094
a8e16298
TL
7095 bufferlist ver_bl;
7096 string last_ver_s = to_string(last_ver);
7097 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
7098 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
7099
b32b8144
FG
7100 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
7101 op.mtime2(&mtime_ts);
7102
7c673cae 7103 bool need_to_link = false;
92f5a8d4 7104 uint64_t link_epoch = 0;
7c673cae
FG
7105 cls_rgw_obj_key key;
7106 bool delete_marker = false;
7107 list<cls_rgw_obj_key> remove_instances;
7108 bool need_to_remove = false;
7109
92f5a8d4
TL
7110 // decode current epoch and instance
7111 auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
7112 if (olh_ver != state.attrset.end()) {
7113 std::string str = olh_ver->second.to_str();
7114 std::string err;
7115 link_epoch = strict_strtoll(str.c_str(), 10, &err);
7116 }
7117 auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
7118 if (olh_info != state.attrset.end()) {
7119 RGWOLHInfo info;
7120 int r = decode_olh_info(cct, olh_info->second, &info);
7121 if (r < 0) {
7122 return r;
7123 }
7124 info.target.key.get_index_key(&key);
7125 delete_marker = info.removed;
7126 }
7127
7c673cae
FG
7128 for (iter = log.begin(); iter != log.end(); ++iter) {
7129 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7130 for (; viter != iter->second.end(); ++viter) {
7131 rgw_bucket_olh_log_entry& entry = *viter;
7132
b3b6e05e 7133 ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
7c673cae
FG
7134 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7135 << (entry.delete_marker ? "(delete)" : "") << dendl;
7136 switch (entry.op) {
7137 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7138 remove_instances.push_back(entry.key);
7139 break;
7140 case CLS_RGW_OLH_OP_LINK_OLH:
92f5a8d4
TL
7141 // only overwrite a link of the same epoch if its key sorts before
7142 if (link_epoch < iter->first || key.instance.empty() ||
7143 key.instance > entry.key.instance) {
b3b6e05e 7144 ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
92f5a8d4
TL
7145 << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7146 need_to_link = true;
7147 need_to_remove = false;
7148 key = entry.key;
7149 delete_marker = entry.delete_marker;
7150 } else {
b3b6e05e 7151 ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
92f5a8d4
TL
7152 << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7153 }
7c673cae
FG
7154 break;
7155 case CLS_RGW_OLH_OP_UNLINK_OLH:
7156 need_to_remove = true;
7157 need_to_link = false;
7158 break;
7159 default:
b3b6e05e 7160 ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7c673cae
FG
7161 return -EIO;
7162 }
7163 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7164 attr_name.append(entry.op_tag);
7165 op.rmxattr(attr_name.c_str());
7166 }
7167 }
7168
7169 rgw_rados_ref ref;
b3b6e05e 7170 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
7171 if (r < 0) {
7172 return r;
7173 }
7174
7175 const rgw_bucket& bucket = obj.bucket;
7176
7177 if (need_to_link) {
7178 rgw_obj target(bucket, key);
7179 RGWOLHInfo info;
7180 info.target = target;
7181 info.removed = delete_marker;
7182 bufferlist bl;
11fdf7f2 7183 encode(info, bl);
7c673cae
FG
7184 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7185 }
7186
7187 /* first remove object instances */
7188 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7189 liter != remove_instances.end(); ++liter) {
7190 cls_rgw_obj_key& key = *liter;
7191 rgw_obj obj_instance(bucket, key);
b3b6e05e 7192 int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae 7193 if (ret < 0 && ret != -ENOENT) {
b3b6e05e 7194 ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7c673cae
FG
7195 return ret;
7196 }
7197 }
7198
7199 /* update olh object */
b3b6e05e 7200 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
7201 if (r == -ECANCELED) {
7202 r = 0;
7203 }
7204 if (r < 0) {
b3b6e05e 7205 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7c673cae
FG
7206 return r;
7207 }
7208
b3b6e05e 7209 r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj, last_ver);
7c673cae 7210 if (r < 0) {
b3b6e05e 7211 ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7c673cae
FG
7212 return r;
7213 }
7214
7215 if (need_to_remove) {
7216 ObjectWriteOperation rm_op;
7217
7218 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
a8e16298 7219 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7c673cae
FG
7220 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7221 rm_op.remove();
7222
b3b6e05e 7223 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
7c673cae
FG
7224 if (r == -ECANCELED) {
7225 return 0; /* someone else won this race */
7226 } else {
7227 /*
7228 * only clear if was successful, otherwise we might clobber pending operations on this object
7229 */
b3b6e05e 7230 r = bucket_index_clear_olh(dpp, bucket_info, state, obj);
7c673cae 7231 if (r < 0) {
b3b6e05e 7232 ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7c673cae
FG
7233 return r;
7234 }
7235 }
7236 }
7237
7238 return 0;
7239}
7240
7241/*
7242 * read olh log and apply it
7243 */
b3b6e05e 7244int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
7245{
7246 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7247 bool is_truncated;
7248 uint64_t ver_marker = 0;
7249
7250 do {
b3b6e05e 7251 int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7c673cae
FG
7252 if (ret < 0) {
7253 return ret;
7254 }
b3b6e05e 7255 ret = apply_olh_log(dpp, obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
7256 if (ret < 0) {
7257 return ret;
7258 }
7259 } while (is_truncated);
7260
7261 return 0;
7262}
7263
b3b6e05e 7264int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77 7265 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
9f95a23c 7266 optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
7267{
7268 string op_tag;
7269
7270 rgw_obj olh_obj = target_obj;
7271 olh_obj.key.instance.clear();
7272
7273 RGWObjState *state = NULL;
7274
7275 int ret = 0;
7276 int i;
31f18b77 7277
7c673cae
FG
7278#define MAX_ECANCELED_RETRY 100
7279 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7280 if (ret == -ECANCELED) {
11fdf7f2 7281 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7282 }
7283
b3b6e05e 7284 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7285 if (ret < 0) {
7286 return ret;
7287 }
7288
b3b6e05e 7289 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7c673cae 7290 if (ret < 0) {
b3b6e05e 7291 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7c673cae
FG
7292 if (ret == -ECANCELED) {
7293 continue;
7294 }
7295 return ret;
7296 }
b3b6e05e 7297 ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj, delete_marker,
91327a77
AA
7298 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7299 zones_trace, log_data_change);
7c673cae 7300 if (ret < 0) {
b3b6e05e 7301 ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7c673cae 7302 if (ret == -ECANCELED) {
a8e16298
TL
7303 // the bucket index rejected the link_olh() due to olh tag mismatch;
7304 // attempt to reconstruct olh head attributes based on the bucket index
b3b6e05e 7305 int r2 = repair_olh(dpp, state, bucket_info, olh_obj);
a8e16298
TL
7306 if (r2 < 0 && r2 != -ECANCELED) {
7307 return r2;
7308 }
7c673cae
FG
7309 continue;
7310 }
7311 return ret;
7312 }
7313 break;
7314 }
7315
7316 if (i == MAX_ECANCELED_RETRY) {
b3b6e05e 7317 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7c673cae
FG
7318 return -EIO;
7319 }
7320
b3b6e05e 7321 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7c673cae
FG
7322 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7323 ret = 0;
7324 }
7325 if (ret < 0) {
b3b6e05e 7326 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7327 return ret;
7328 }
7329
7330 return 0;
7331}
7332
b3b6e05e 7333int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
9f95a23c 7334 uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
7c673cae
FG
7335{
7336 string op_tag;
7337
7338 rgw_obj olh_obj = target_obj;
7339 olh_obj.key.instance.clear();
7340
7341 RGWObjState *state = NULL;
7342
7343 int ret = 0;
7344 int i;
7345
7346 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7347 if (ret == -ECANCELED) {
11fdf7f2 7348 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7349 }
7350
b3b6e05e 7351 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7352 if (ret < 0)
7353 return ret;
7354
b3b6e05e 7355 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7c673cae 7356 if (ret < 0) {
b3b6e05e 7357 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7358 if (ret == -ECANCELED) {
7359 continue;
7360 }
7361 return ret;
7362 }
7363
7364 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7365
b3b6e05e 7366 ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae 7367 if (ret < 0) {
b3b6e05e 7368 ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7369 if (ret == -ECANCELED) {
7370 continue;
7371 }
7372 return ret;
7373 }
7374 break;
7375 }
7376
7377 if (i == MAX_ECANCELED_RETRY) {
b3b6e05e 7378 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7c673cae
FG
7379 return -EIO;
7380 }
7381
b3b6e05e 7382 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
7383 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7384 return 0;
7385 }
7386 if (ret < 0) {
b3b6e05e 7387 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7388 return ret;
7389 }
7390
7391 return 0;
7392}
7393
11fdf7f2 7394void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7c673cae
FG
7395{
7396#define OBJ_INSTANCE_LEN 32
7397 char buf[OBJ_INSTANCE_LEN + 1];
7398
7399 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7400 no underscore for instance name due to the way we encode the raw keys */
7401
11fdf7f2 7402 target_key->set_instance(buf);
7c673cae
FG
7403}
7404
11fdf7f2 7405void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7c673cae 7406{
11fdf7f2 7407 gen_rand_obj_instance_name(&target_obj->key);
7c673cae
FG
7408}
7409
b3b6e05e 7410int RGWRados::get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7c673cae 7411{
92f5a8d4 7412 map<string, bufferlist> attrset;
7c673cae
FG
7413
7414 ObjectReadOperation op;
92f5a8d4 7415 op.getxattrs(&attrset, NULL);
7c673cae 7416
b3b6e05e 7417 int r = obj_operate(dpp, bucket_info, obj, &op);
7c673cae
FG
7418 if (r < 0) {
7419 return r;
7420 }
7c673cae 7421
92f5a8d4 7422 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7c673cae
FG
7423 if (iter == attrset.end()) { /* not an olh */
7424 return -EINVAL;
7425 }
7426
92f5a8d4 7427 return decode_olh_info(cct, iter->second, olh);
7c673cae
FG
7428}
7429
7430void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
7431 map<string, bufferlist> *rm_pending_entries)
7432{
7433 map<string, bufferlist>::iterator iter = pending_entries.begin();
7434
7435 real_time now = real_clock::now();
7436
7437 while (iter != pending_entries.end()) {
11fdf7f2 7438 auto biter = iter->second.cbegin();
7c673cae
FG
7439 RGWOLHPendingInfo pending_info;
7440 try {
11fdf7f2 7441 decode(pending_info, biter);
7c673cae
FG
7442 } catch (buffer::error& err) {
7443 /* skipping bad entry, we could remove it but it might hide a bug */
7444 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7445 ++iter;
7446 continue;
7447 }
7448
7449 map<string, bufferlist>::iterator cur_iter = iter;
7450 ++iter;
7451 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7452 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7453 pending_entries.erase(cur_iter);
7454 } else {
7455 /* entries names are sorted by time (rounded to a second) */
7456 break;
7457 }
7458 }
7459}
7460
b3b6e05e 7461int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7c673cae 7462{
7c673cae 7463 rgw_rados_ref ref;
b3b6e05e 7464 int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
7c673cae
FG
7465 if (r < 0) {
7466 return r;
7467 }
7468
81eedcae
TL
7469 // trim no more than 1000 entries per osd op
7470 constexpr int max_entries = 1000;
7c673cae 7471
81eedcae
TL
7472 auto i = pending_attrs.begin();
7473 while (i != pending_attrs.end()) {
7474 ObjectWriteOperation op;
b3b6e05e 7475 bucket_index_guard_olh_op(dpp, state, op);
81eedcae
TL
7476
7477 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7478 op.rmxattr(i->first.c_str());
7479 }
7480
b3b6e05e 7481 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
81eedcae
TL
7482 if (r == -ENOENT || r == -ECANCELED) {
7483 /* raced with some other change, shouldn't sweat about it */
7484 return 0;
7485 }
7486 if (r < 0) {
b3b6e05e 7487 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
81eedcae
TL
7488 return r;
7489 }
7490 }
7c673cae
FG
7491 return 0;
7492}
7493
b3b6e05e 7494int RGWRados::follow_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7c673cae
FG
7495{
7496 map<string, bufferlist> pending_entries;
11fdf7f2 7497 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7c673cae
FG
7498
7499 map<string, bufferlist> rm_pending_entries;
7500 check_pending_olh_entries(pending_entries, &rm_pending_entries);
7501
7502 if (!rm_pending_entries.empty()) {
b3b6e05e 7503 int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj, rm_pending_entries);
7c673cae 7504 if (ret < 0) {
b3b6e05e 7505 ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7c673cae
FG
7506 return ret;
7507 }
7508 }
7509 if (!pending_entries.empty()) {
b3b6e05e 7510 ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7c673cae 7511
b3b6e05e 7512 int ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7c673cae
FG
7513 if (ret < 0) {
7514 return ret;
7515 }
7516 }
7517
92f5a8d4
TL
7518 auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7519 if (iter == state->attrset.end()) {
7520 return -EINVAL;
7521 }
7522
7c673cae 7523 RGWOLHInfo olh;
92f5a8d4
TL
7524 int ret = decode_olh_info(cct, iter->second, &olh);
7525 if (ret < 0) {
7526 return ret;
7c673cae
FG
7527 }
7528
7529 if (olh.removed) {
7530 return -ENOENT;
7531 }
7532
7533 *target = olh.target;
7534
7535 return 0;
7536}
7537
b3b6e05e
TL
7538int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
7539 rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7c673cae 7540 map<string, bufferlist> *attrs, bufferlist *first_chunk,
9f95a23c 7541 RGWObjVersionTracker *objv_tracker, optional_yield y)
7c673cae
FG
7542{
7543 rgw_rados_ref ref;
b3b6e05e 7544 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
7545 if (r < 0) {
7546 return r;
7547 }
7548
7549 map<string, bufferlist> unfiltered_attrset;
7550 uint64_t size = 0;
7551 struct timespec mtime_ts;
7552
7553 ObjectReadOperation op;
7554 if (objv_tracker) {
7555 objv_tracker->prepare_op_for_read(&op);
7556 }
7557 if (attrs) {
7558 op.getxattrs(&unfiltered_attrset, NULL);
7559 }
7560 if (psize || pmtime) {
7561 op.stat2(&size, &mtime_ts, NULL);
7562 }
7563 if (first_chunk) {
7564 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7565 }
7566 bufferlist outbl;
b3b6e05e 7567 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
7c673cae
FG
7568
7569 if (epoch) {
9f95a23c 7570 *epoch = ref.pool.ioctx().get_last_version();
7c673cae
FG
7571 }
7572
7573 if (r < 0)
7574 return r;
7575
7576 if (psize)
7577 *psize = size;
7578 if (pmtime)
7579 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7580 if (attrs) {
11fdf7f2 7581 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7c673cae
FG
7582 }
7583
7584 return 0;
7585}
7586
b3b6e05e 7587int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 7588 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae 7589{
a8e16298 7590 vector<rgw_bucket_dir_header> headers;
7c673cae 7591 map<int, string> bucket_instance_ids;
b3b6e05e 7592 int r = cls_bucket_head(dpp, bucket_info, shard_id, headers, &bucket_instance_ids);
7c673cae
FG
7593 if (r < 0) {
7594 return r;
7595 }
7596
11fdf7f2 7597 ceph_assert(headers.size() == bucket_instance_ids.size());
7c673cae 7598
a8e16298 7599 auto iter = headers.begin();
7c673cae
FG
7600 map<int, string>::iterator viter = bucket_instance_ids.begin();
7601 BucketIndexShardsManager ver_mgr;
7602 BucketIndexShardsManager master_ver_mgr;
7603 BucketIndexShardsManager marker_mgr;
7c673cae
FG
7604 char buf[64];
7605 for(; iter != headers.end(); ++iter, ++viter) {
a8e16298
TL
7606 accumulate_raw_stats(*iter, stats);
7607 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7c673cae 7608 ver_mgr.add(viter->first, string(buf));
a8e16298 7609 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7c673cae
FG
7610 master_ver_mgr.add(viter->first, string(buf));
7611 if (shard_id >= 0) {
a8e16298 7612 *max_marker = iter->max_marker;
7c673cae 7613 } else {
a8e16298 7614 marker_mgr.add(viter->first, iter->max_marker);
7c673cae 7615 }
c07f9fc5 7616 if (syncstopped != NULL)
a8e16298 7617 *syncstopped = iter->syncstopped;
7c673cae
FG
7618 }
7619 ver_mgr.to_string(bucket_ver);
7620 master_ver_mgr.to_string(master_ver);
7621 if (shard_id < 0) {
7622 marker_mgr.to_string(max_marker);
7623 }
7624 return 0;
7625}
7626
7c673cae
FG
7627class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
7628 RGWGetBucketStats_CB *cb;
7629 uint32_t pendings;
7630 map<RGWObjCategory, RGWStorageStats> stats;
7631 int ret_code;
7632 bool should_cb;
9f95a23c 7633 ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
7c673cae
FG
7634
7635public:
7636 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
9f95a23c
TL
7637 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
7638 {}
7c673cae
FG
7639
7640 void handle_response(int r, rgw_bucket_dir_header& header) override {
9f95a23c 7641 std::lock_guard l{lock};
7c673cae
FG
7642 if (should_cb) {
7643 if ( r >= 0) {
7644 accumulate_raw_stats(header, stats);
7645 } else {
7646 ret_code = r;
7647 }
7648
7649 // Are we all done?
7650 if (--pendings == 0) {
7651 if (!ret_code) {
7652 cb->set_response(&stats);
7653 }
7654 cb->handle_response(ret_code);
7655 cb->put();
7656 }
7657 }
7658 }
7659
7660 void unset_cb() {
9f95a23c 7661 std::lock_guard l{lock};
7c673cae
FG
7662 should_cb = false;
7663 }
7664};
7665
b3b6e05e 7666int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
7c673cae
FG
7667{
7668 int num_aio = 0;
f67539c2 7669 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
11fdf7f2 7670 ceph_assert(get_ctx);
b3b6e05e 7671 int r = cls_bucket_head_async(dpp, bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
7672 if (r < 0) {
7673 ctx->put();
7674 if (num_aio) {
7675 get_ctx->unset_cb();
7676 }
7677 }
c07f9fc5 7678 get_ctx->put();
7c673cae
FG
7679 return r;
7680}
7681
e306af50
TL
7682int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx,
7683 const string& meta_key,
7684 RGWBucketInfo& info,
7685 real_time *pmtime,
7686 map<string, bufferlist> *pattrs,
b3b6e05e
TL
7687 optional_yield y,
7688 const DoutPrefixProvider *dpp)
9f95a23c
TL
7689{
7690 rgw_bucket bucket;
7691 rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
7c673cae 7692
b3b6e05e 7693 return get_bucket_instance_info(obj_ctx, bucket, info, pmtime, pattrs, y, dpp);
9f95a23c 7694}
7c673cae 7695
11fdf7f2 7696int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
b3b6e05e
TL
7697 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
7698 const DoutPrefixProvider *dpp)
7c673cae 7699{
9f95a23c
TL
7700 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7701 return ctl.bucket->read_bucket_instance_info(bucket, &info,
7702 y,
b3b6e05e 7703 dpp,
9f95a23c
TL
7704 RGWBucketCtl::BucketInstance::GetParams()
7705 .set_mtime(pmtime)
7706 .set_attrs(pattrs)
7707 .set_bectx_params(bectx_params));
7c673cae
FG
7708}
7709
9f95a23c 7710int RGWRados::get_bucket_info(RGWServices *svc,
b32b8144
FG
7711 const string& tenant, const string& bucket_name,
7712 RGWBucketInfo& info,
9f95a23c 7713 real_time *pmtime,
b3b6e05e
TL
7714 optional_yield y,
7715 const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
b32b8144 7716{
9f95a23c
TL
7717 auto obj_ctx = svc->sysobj->init_obj_ctx();
7718 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7719 rgw_bucket bucket;
7720 bucket.tenant = tenant;
7721 bucket.name = bucket_name;
b3b6e05e 7722 return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
9f95a23c
TL
7723 RGWBucketCtl::BucketInstance::GetParams()
7724 .set_mtime(pmtime)
7725 .set_attrs(pattrs)
7726 .set_bectx_params(bectx_params));
b32b8144
FG
7727}
7728
7729int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
7730 ceph::real_time *pmtime,
b3b6e05e 7731 const DoutPrefixProvider *dpp,
b32b8144
FG
7732 map<string, bufferlist> *pattrs)
7733{
9f95a23c
TL
7734 rgw_bucket bucket = info.bucket;
7735 bucket.bucket_id.clear();
b32b8144 7736
9f95a23c 7737 auto rv = info.objv_tracker.read_version;
b32b8144 7738
b3b6e05e 7739 return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
9f95a23c
TL
7740 RGWBucketCtl::BucketInstance::GetParams()
7741 .set_mtime(pmtime)
7742 .set_attrs(pattrs)
7743 .set_refresh_version(rv));
7c673cae
FG
7744}
7745
7746int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
b3b6e05e
TL
7747 real_time mtime, map<string, bufferlist> *pattrs,
7748 const DoutPrefixProvider *dpp)
7c673cae 7749{
b3b6e05e 7750 return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
9f95a23c
TL
7751 RGWBucketCtl::BucketInstance::PutParams()
7752 .set_exclusive(exclusive)
7753 .set_mtime(mtime)
7754 .set_attrs(pattrs));
7c673cae
FG
7755}
7756
7757int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
b3b6e05e
TL
7758 map<string, bufferlist> *pattrs, bool create_entry_point,
7759 const DoutPrefixProvider *dpp)
7c673cae
FG
7760{
7761 bool create_head = !info.has_instance_obj || create_entry_point;
7762
b3b6e05e 7763 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
7c673cae
FG
7764 if (ret < 0) {
7765 return ret;
7766 }
7767
7768 if (!create_head)
7769 return 0; /* done! */
7770
7771 RGWBucketEntryPoint entry_point;
7772 entry_point.bucket = info.bucket;
7773 entry_point.owner = info.owner;
7774 entry_point.creation_time = info.creation_time;
7775 entry_point.linked = true;
7776 RGWObjVersionTracker ot;
7777 if (pep_objv && !pep_objv->tag.empty()) {
7778 ot.write_version = *pep_objv;
7779 } else {
7780 ot.generate_new_write_ver(cct);
7781 if (pep_objv) {
7782 *pep_objv = ot.write_version;
7783 }
7784 }
b3b6e05e 7785 ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
9f95a23c
TL
7786 .set_exclusive(exclusive)
7787 .set_objv_tracker(&ot)
7788 .set_mtime(mtime));
7c673cae
FG
7789 if (ret < 0)
7790 return ret;
7791
7792 return 0;
7793}
7794
b3b6e05e 7795int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
7c673cae 7796{
11fdf7f2 7797 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
7798
7799 map<string, RGWBucketEnt>::iterator iter;
7800 for (iter = m.begin(); iter != m.end(); ++iter) {
7801 RGWBucketEnt& ent = iter->second;
7802 rgw_bucket& bucket = ent.bucket;
7803 ent.count = 0;
7804 ent.size = 0;
7805 ent.size_rounded = 0;
7806
a8e16298 7807 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7808
7809 RGWBucketInfo bucket_info;
b3b6e05e 7810 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL, null_yield, dpp);
7c673cae
FG
7811 if (ret < 0) {
7812 return ret;
7813 }
7814
b3b6e05e 7815 int r = cls_bucket_head(dpp, bucket_info, RGW_NO_SHARD, headers);
7c673cae
FG
7816 if (r < 0)
7817 return r;
7818
a8e16298 7819 auto hiter = headers.begin();
7c673cae
FG
7820 for (; hiter != headers.end(); ++hiter) {
7821 RGWObjCategory category = main_category;
11fdf7f2 7822 auto iter = (hiter->stats).find(category);
a8e16298 7823 if (iter != hiter->stats.end()) {
7c673cae
FG
7824 struct rgw_bucket_category_stats& stats = iter->second;
7825 ent.count += stats.num_entries;
7826 ent.size += stats.total_size;
7827 ent.size_rounded += stats.total_size_rounded;
7828 }
7829 }
3efd9988
FG
7830
7831 // fill in placement_rule from the bucket instance for use in swift's
7832 // per-storage policy statistics
7833 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
7834 }
7835
7836 return m.size();
7837}
7838
b3b6e05e 7839int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
7c673cae
FG
7840{
7841 rgw_rados_ref ref;
b3b6e05e 7842 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
7843 if (r < 0) {
7844 return r;
7845 }
7846 librados::Rados *rad = get_rados_handle();
9f95a23c 7847 librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
7c673cae 7848
9f95a23c 7849 r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
7c673cae
FG
7850 completion->release();
7851 return r;
7852}
7853
b3b6e05e 7854int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
7c673cae
FG
7855{
7856 librados::IoCtx& io_ctx = ctx.io_ctx;
7857 librados::NObjectIterator& iter = ctx.iter;
7858
b3b6e05e 7859 int r = open_pool_ctx(dpp, pool, io_ctx, false);
7c673cae
FG
7860 if (r < 0)
7861 return r;
7862
7863 iter = io_ctx.nobjects_begin();
7864
7865 return 0;
7866}
7867
b3b6e05e 7868int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
181888fb
FG
7869{
7870 librados::IoCtx& io_ctx = ctx.io_ctx;
7871 librados::NObjectIterator& iter = ctx.iter;
7872
b3b6e05e 7873 int r = open_pool_ctx(dpp, pool, io_ctx, false);
181888fb
FG
7874 if (r < 0)
7875 return r;
7876
7877 librados::ObjectCursor oc;
7878 if (!oc.from_str(cursor)) {
b3b6e05e 7879 ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
181888fb
FG
7880 return -EINVAL;
7881 }
7882
f64942e4
AA
7883 try {
7884 iter = io_ctx.nobjects_begin(oc);
7885 return 0;
7886 } catch (const std::system_error& e) {
7887 r = -e.code().value();
b3b6e05e 7888 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
f64942e4
AA
7889 << ", returning " << r << dendl;
7890 return r;
7891 } catch (const std::exception& e) {
b3b6e05e 7892 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
f64942e4
AA
7893 << ", returning -5" << dendl;
7894 return -EIO;
7895 }
181888fb
FG
7896}
7897
7898string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
7899{
7900 return ctx.iter.get_cursor().to_str();
7901}
7902
f64942e4
AA
7903static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
7904 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
7905 bool *is_truncated, RGWAccessListFilter *filter)
7906{
7907 librados::IoCtx& io_ctx = ctx.io_ctx;
7908 librados::NObjectIterator& iter = ctx.iter;
7909
7910 if (iter == io_ctx.nobjects_end())
7911 return -ENOENT;
7912
7913 uint32_t i;
7914
7915 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
7916 rgw_bucket_dir_entry e;
7917
7918 string oid = iter->get_oid();
7919 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
7920
7921 // fill it in with initial values; we may correct later
7922 if (filter && !filter->filter(oid, oid))
7923 continue;
7924
7925 e.key = oid;
7926 objs.push_back(e);
7927 }
7928
7929 if (is_truncated)
7930 *is_truncated = (iter != io_ctx.nobjects_end());
7931
7932 return objs.size();
7933}
7c673cae 7934
f64942e4
AA
7935int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
7936 bool *is_truncated, RGWAccessListFilter *filter)
7937{
7938 // catch exceptions from NObjectIterator::operator++()
7939 try {
7940 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
7941 } catch (const std::system_error& e) {
7942 int r = -e.code().value();
7943 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7944 << ", returning " << r << dendl;
7945 return r;
7946 } catch (const std::exception& e) {
7947 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7948 << ", returning -5" << dendl;
7949 return -EIO;
7950 }
7951}
7952
b3b6e05e 7953int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 7954{
181888fb 7955 if (!ctx->initialized) {
b3b6e05e 7956 int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
7c673cae 7957 if (r < 0) {
b3b6e05e 7958 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
7c673cae
FG
7959 return r;
7960 }
181888fb 7961 ctx->initialized = true;
7c673cae 7962 }
181888fb
FG
7963 return 0;
7964}
7c673cae 7965
b3b6e05e 7966int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
181888fb
FG
7967 RGWListRawObjsCtx& ctx, list<string>& oids,
7968 bool *is_truncated)
7969{
7970 if (!ctx.initialized) {
7971 return -EINVAL;
7972 }
7973 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae
FG
7974 vector<rgw_bucket_dir_entry> objs;
7975 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
7976 if (r < 0) {
7977 if(r != -ENOENT)
b3b6e05e 7978 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
7c673cae
FG
7979 return r;
7980 }
7981
7982 vector<rgw_bucket_dir_entry>::iterator iter;
7983 for (iter = objs.begin(); iter != objs.end(); ++iter) {
7984 oids.push_back(iter->key.name);
7985 }
7986
7987 return oids.size();
7988}
7989
b3b6e05e 7990int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
181888fb
FG
7991 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
7992 bool *is_truncated)
7993{
7994 if (!ctx.initialized) {
b3b6e05e 7995 int r = list_raw_objects_init(dpp, pool, string(), &ctx);
181888fb
FG
7996 if (r < 0) {
7997 return r;
7998 }
7999 }
8000
b3b6e05e 8001 return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
181888fb
FG
8002}
8003
8004string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
8005{
8006 return pool_iterate_get_cursor(ctx.iter_ctx);
8007}
8008
b3b6e05e 8009int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298 8010 rgw_bucket_dir_entry *dirent)
7c673cae 8011{
a8e16298 8012 rgw_cls_bi_entry bi_entry;
b3b6e05e 8013 int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
a8e16298 8014 if (r < 0 && r != -ENOENT) {
b3b6e05e 8015 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
a8e16298 8016 }
7c673cae
FG
8017 if (r < 0) {
8018 return r;
8019 }
11fdf7f2 8020 auto iter = bi_entry.data.cbegin();
a8e16298 8021 try {
11fdf7f2 8022 decode(*dirent, iter);
a8e16298 8023 } catch (buffer::error& err) {
b3b6e05e 8024 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
a8e16298
TL
8025 return -EIO;
8026 }
8027
8028 return 0;
8029}
7c673cae 8030
b3b6e05e 8031int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298
TL
8032 rgw_bucket_olh_entry *olh)
8033{
7c673cae 8034 rgw_cls_bi_entry bi_entry;
b3b6e05e 8035 int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
7c673cae 8036 if (r < 0 && r != -ENOENT) {
b3b6e05e 8037 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7c673cae
FG
8038 }
8039 if (r < 0) {
8040 return r;
8041 }
11fdf7f2 8042 auto iter = bi_entry.data.cbegin();
7c673cae 8043 try {
a8e16298 8044 decode(*olh, iter);
7c673cae 8045 } catch (buffer::error& err) {
b3b6e05e 8046 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7c673cae
FG
8047 return -EIO;
8048 }
8049
8050 return 0;
8051}
8052
b3b6e05e 8053int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298 8054 BIIndexType index_type, rgw_cls_bi_entry *entry)
7c673cae
FG
8055{
8056 BucketShard bs(this);
b3b6e05e 8057 int ret = bs.init(dpp, bucket_info, obj);
7c673cae 8058 if (ret < 0) {
b3b6e05e 8059 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8060 return ret;
8061 }
8062
8063 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
9f95a23c
TL
8064
8065 auto& ref = bs.bucket_obj.get_ref();
7c673cae 8066
9f95a23c 8067 return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
7c673cae
FG
8068}
8069
8070void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
8071{
9f95a23c
TL
8072 auto& ref = bs.bucket_obj.get_ref();
8073 cls_rgw_bi_put(op, ref.obj.oid, entry);
7c673cae
FG
8074}
8075
8076int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
8077{
9f95a23c
TL
8078 auto& ref = bs.bucket_obj.get_ref();
8079 int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
7c673cae
FG
8080 if (ret < 0)
8081 return ret;
8082
8083 return 0;
8084}
8085
b3b6e05e 8086int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
7c673cae
FG
8087{
8088 BucketShard bs(this);
b3b6e05e 8089 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8090 if (ret < 0) {
b3b6e05e 8091 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8092 return ret;
8093 }
8094
8095 return bi_put(bs, entry);
8096}
8097
b3b6e05e 8098int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae
FG
8099{
8100 rgw_obj obj(bucket, obj_name);
8101 BucketShard bs(this);
b3b6e05e 8102 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8103 if (ret < 0) {
b3b6e05e 8104 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8105 return ret;
8106 }
8107
9f95a23c
TL
8108 auto& ref = bs.bucket_obj.get_ref();
8109 ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name, marker, max, entries, is_truncated);
31f18b77
FG
8110 if (ret == -ENOENT) {
8111 *is_truncated = false;
8112 }
7c673cae
FG
8113 if (ret < 0)
8114 return ret;
8115
8116 return 0;
8117}
8118
8119int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8120{
9f95a23c
TL
8121 auto& ref = bs.bucket_obj.get_ref();
8122 int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, filter_obj, marker, max, entries, is_truncated);
7c673cae
FG
8123 if (ret < 0)
8124 return ret;
8125
8126 return 0;
8127}
8128
8129int RGWRados::bi_remove(BucketShard& bs)
8130{
9f95a23c
TL
8131 auto& ref = bs.bucket_obj.get_ref();
8132 int ret = ref.pool.ioctx().remove(ref.obj.oid);
7c673cae
FG
8133 if (ret == -ENOENT) {
8134 ret = 0;
8135 }
8136 if (ret < 0) {
8137 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
8138 return ret;
8139 }
8140
8141 return 0;
8142}
8143
b3b6e05e 8144int RGWRados::bi_list(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae
FG
8145{
8146 BucketShard bs(this);
b3b6e05e 8147 int ret = bs.init(bucket_info.bucket, shard_id, bucket_info.layout.current_index, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8148 if (ret < 0) {
b3b6e05e 8149 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8150 return ret;
8151 }
8152
8153 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8154}
8155
b3b6e05e 8156int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
7c673cae 8157{
b3b6e05e 8158 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
7c673cae
FG
8159}
8160
9f95a23c
TL
8161int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
8162 librados::ObjectWriteOperation *op)
7c673cae 8163{
9f95a23c 8164 return gc_pool_ctx.aio_operate(oid, c, op);
7c673cae
FG
8165}
8166
b3b6e05e 8167int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
7c673cae 8168{
b3b6e05e 8169 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
7c673cae
FG
8170}
8171
9f95a23c 8172int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
7c673cae 8173{
9f95a23c 8174 return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
7c673cae
FG
8175}
8176
11fdf7f2 8177int RGWRados::process_gc(bool expired_only)
7c673cae 8178{
11fdf7f2 8179 return gc->process(expired_only);
7c673cae
FG
8180}
8181
f6b5b4d7 8182int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
f67539c2 8183 vector<rgw::sal::Lifecycle::LCEntry>& progress_map,
f6b5b4d7 8184 int& index)
7c673cae 8185{
f6b5b4d7 8186 return lc->list_lc_progress(marker, max_entries, progress_map, index);
7c673cae
FG
8187}
8188
8189int RGWRados::process_lc()
8190{
f6b5b4d7
TL
8191 RGWLC lc;
8192 lc.initialize(cct, this->store);
8193 RGWLC::LCWorker worker(&lc, cct, &lc, 0);
8194 auto ret = lc.process(&worker, true /* once */);
8195 lc.stop_processor(); // sets down_flag, but returns immediately
8196 return ret;
7c673cae
FG
8197}
8198
b3b6e05e 8199bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
7c673cae 8200{
b3b6e05e 8201 return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
7c673cae
FG
8202}
8203
b3b6e05e 8204int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
9f95a23c 8205 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
7c673cae 8206{
31f18b77
FG
8207 rgw_zone_set zones_trace;
8208 if (_zones_trace) {
8209 zones_trace = *_zones_trace;
8210 }
9f95a23c 8211 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8212
7c673cae
FG
8213 ObjectWriteOperation o;
8214 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77 8215 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
11fdf7f2 8216 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
b3b6e05e 8217 return bs.bucket_obj.operate(dpp, &o, y);
7c673cae
FG
8218}
8219
31f18b77 8220int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
8221 int64_t pool, uint64_t epoch,
8222 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8223 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 8224{
7c673cae
FG
8225 ObjectWriteOperation o;
8226 rgw_bucket_dir_entry_meta dir_meta;
8227 dir_meta = ent.meta;
8228 dir_meta.category = category;
8229
1adf2230
AA
8230 rgw_zone_set zones_trace;
8231 if (_zones_trace) {
8232 zones_trace = *_zones_trace;
8233 }
9f95a23c 8234 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8235
7c673cae
FG
8236 rgw_bucket_entry_ver ver;
8237 ver.pool = pool;
8238 ver.epoch = epoch;
8239 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
8240 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
8241 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8242 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
8243 complete_op_data *arg;
8244 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8245 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77 8246 librados::AioCompletion *completion = arg->rados_completion;
9f95a23c 8247 int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
31f18b77 8248 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
8249 return ret;
8250}
8251
31f18b77 8252int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
8253 int64_t pool, uint64_t epoch,
8254 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8255 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 8256{
31f18b77 8257 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
8258}
8259
8260int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
8261 int64_t pool, uint64_t epoch,
8262 rgw_obj& obj,
8263 real_time& removed_mtime,
8264 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
8265 uint16_t bilog_flags,
8266 rgw_zone_set *zones_trace)
7c673cae
FG
8267{
8268 rgw_bucket_dir_entry ent;
8269 ent.meta.mtime = removed_mtime;
8270 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8271 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
8272 ent, RGWObjCategory::None, remove_objs,
8273 bilog_flags, zones_trace);
7c673cae
FG
8274}
8275
31f18b77 8276int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
8277{
8278 rgw_bucket_dir_entry ent;
8279 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8280 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
8281 -1 /* pool id */, 0, ent,
8282 RGWObjCategory::None, NULL, bilog_flags,
8283 zones_trace);
7c673cae
FG
8284}
8285
b3b6e05e 8286int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
7c673cae 8287{
9f95a23c 8288 RGWSI_RADOS::Pool index_pool;
7c673cae 8289 map<int, string> bucket_objs;
b3b6e05e 8290 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
8291 if (r < 0)
8292 return r;
8293
9f95a23c
TL
8294 return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
8295}
8296
8297
8298uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
8299 uint32_t num_shards)
8300{
8301 // We want to minimize the chances that when num_shards >>
8302 // num_entries that we return much fewer than num_entries to the
8303 // client. Given all the overhead of making a cls call to the osd,
8304 // returning a few entries is not much more work than returning one
8305 // entry. This minimum might be better tuned based on future
8306 // experiments where num_shards >> num_entries. (Note: ">>" should
8307 // be interpreted as "much greater than".)
8308 constexpr uint32_t min_read = 8;
8309
8310 // The following is based on _"Balls into Bins" -- A Simple and
8311 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8312 // cases when num_shards >> num_entries (it almost serves as a
8313 // ceiling calculation). We also assume alpha is 1.0 and extract it
8314 // from the calculation. Future work could involve memoizing some of
8315 // the transcendental functions to minimize repeatedly re-calling
8316 // them with the same parameters, which we expect to be the case the
8317 // majority of the time.
8318 uint32_t calc_read =
8319 1 +
8320 static_cast<uint32_t>((num_entries / num_shards) +
8321 sqrt((2 * num_entries) *
8322 log(num_shards) / num_shards));
8323
8324 return std::max(min_read, calc_read);
7c673cae
FG
8325}
8326
1adf2230 8327
b3b6e05e
TL
8328int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
8329 RGWBucketInfo& bucket_info,
9f95a23c
TL
8330 const int shard_id,
8331 const rgw_obj_index_key& start_after,
1adf2230 8332 const string& prefix,
9f95a23c
TL
8333 const string& delimiter,
8334 const uint32_t num_entries,
8335 const bool list_versions,
8336 const uint16_t expansion_factor,
8337 ent_map_t& m,
8338 bool* is_truncated,
8339 bool* cls_filtered,
1adf2230 8340 rgw_obj_index_key *last_entry,
9f95a23c
TL
8341 optional_yield y,
8342 check_filter_t force_check_filter)
7c673cae 8343{
9f95a23c
TL
8344 /* expansion_factor allows the number of entries to read to grow
8345 * exponentially; this is used when earlier reads are producing too
8346 * few results, perhaps due to filtering or to a series of
8347 * namespaced entries */
8348
b3b6e05e 8349 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket <<
9f95a23c
TL
8350 " start_after=\"" << start_after.name <<
8351 "[" << start_after.instance <<
8352 "]\", prefix=\"" << prefix <<
8353 "\" num_entries=" << num_entries <<
8354 ", list_versions=" << list_versions <<
8355 ", expansion_factor=" << expansion_factor << dendl;
7c673cae 8356
9f95a23c
TL
8357 m.clear();
8358
8359 RGWSI_RADOS::Pool index_pool;
7c673cae 8360 // key - oid (for different shards if there is any)
1adf2230
AA
8361 // value - list result for the corresponding oid (shard), it is filled by
8362 // the AIO callback
9f95a23c 8363 map<int, string> shard_oids;
b3b6e05e 8364 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id,
9f95a23c
TL
8365 &index_pool, &shard_oids,
8366 nullptr);
8367 if (r < 0) {
7c673cae 8368 return r;
9f95a23c
TL
8369 }
8370
8371 const uint32_t shard_count = shard_oids.size();
8372 uint32_t num_entries_per_shard;
8373 if (expansion_factor == 0) {
8374 num_entries_per_shard =
8375 calc_ordered_bucket_list_per_shard(num_entries, shard_count);
8376 } else if (expansion_factor <= 11) {
8377 // we'll max out the exponential multiplication factor at 1024 (2<<10)
8378 num_entries_per_shard =
8379 std::min(num_entries,
8380 (uint32_t(1 << (expansion_factor - 1)) *
8381 calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
8382 } else {
8383 num_entries_per_shard = num_entries;
8384 }
8385
b3b6e05e 8386 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ <<
9f95a23c
TL
8387 " request from each of " << shard_count <<
8388 " shard(s) for " << num_entries_per_shard << " entries to get " <<
8389 num_entries << " total entries" << dendl;
7c673cae 8390
9f95a23c
TL
8391 auto& ioctx = index_pool.ioctx();
8392 map<int, rgw_cls_list_ret> shard_list_results;
8393 cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
8394 r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
8395 num_entries_per_shard,
8396 list_versions, shard_oids, shard_list_results,
1adf2230 8397 cct->_conf->rgw_bucket_index_max_aio)();
9f95a23c 8398 if (r < 0) {
7c673cae 8399 return r;
9f95a23c 8400 }
7c673cae 8401
9f95a23c
TL
8402 // to manage the iterators through each shard's list results
8403 struct ShardTracker {
8404 const size_t shard_idx;
8405 rgw_cls_list_ret& result;
8406 const std::string& oid_name;
8407 RGWRados::ent_map_t::iterator cursor;
8408 RGWRados::ent_map_t::iterator end;
8409
8410 // manages an iterator through a shard and provides other
8411 // accessors
8412 ShardTracker(size_t _shard_idx,
8413 rgw_cls_list_ret& _result,
8414 const std::string& _oid_name):
8415 shard_idx(_shard_idx),
8416 result(_result),
8417 oid_name(_oid_name),
8418 cursor(_result.dir.m.begin()),
8419 end(_result.dir.m.end())
8420 {}
8421
8422 inline const std::string& entry_name() const {
8423 return cursor->first;
8424 }
8425 rgw_bucket_dir_entry& dir_entry() const {
8426 return cursor->second;
8427 }
8428 inline bool is_truncated() const {
8429 return result.is_truncated;
8430 }
8431 inline ShardTracker& advance() {
8432 ++cursor;
8433 // return a self-reference to allow for chaining of calls, such
8434 // as x.advance().at_end()
8435 return *this;
8436 }
8437 inline bool at_end() const {
8438 return cursor == end;
8439 }
8440 }; // ShardTracker
8441
8442 // add the next unique candidate, or return false if we reach the end
f67539c2 8443 auto next_candidate = [] (CephContext *cct, ShardTracker& t,
9f95a23c
TL
8444 std::map<std::string, size_t>& candidates,
8445 size_t tracker_idx) {
8446 while (!t.at_end()) {
8447 if (candidates.emplace(t.entry_name(), tracker_idx).second) {
8448 return;
8449 }
8450 t.advance(); // skip duplicate common prefixes
8451 }
8452 };
8453
8454 // one tracker per shard requested (may not be all shards)
8455 std::vector<ShardTracker> results_trackers;
8456 results_trackers.reserve(shard_list_results.size());
8457 for (auto& r : shard_list_results) {
8458 results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
8459
8460 // if any *one* shard's result is trucated, the entire result is
8461 // truncated
8462 *is_truncated = *is_truncated || r.second.is_truncated;
8463
8464 // unless *all* are shards are cls_filtered, the entire result is
8465 // not filtered
8466 *cls_filtered = *cls_filtered && r.second.cls_filtered;
7c673cae
FG
8467 }
8468
9f95a23c
TL
8469 // create a map to track the next candidate entry from ShardTracker
8470 // (key=candidate, value=index into results_trackers); as we consume
8471 // entries from shards, we replace them with the next entries in the
8472 // shards until we run out
7c673cae 8473 map<string, size_t> candidates;
9f95a23c
TL
8474 size_t tracker_idx = 0;
8475 for (auto& t : results_trackers) {
8476 // it's important that the values in the map refer to the index
8477 // into the results_trackers vector, which may not be the same
8478 // as the shard number (i.e., when not all shards are requested)
f67539c2 8479 next_candidate(cct, t, candidates, tracker_idx);
9f95a23c 8480 ++tracker_idx;
7c673cae
FG
8481 }
8482
9f95a23c
TL
8483 rgw_bucket_dir_entry*
8484 last_entry_visited = nullptr; // to set last_entry (marker)
7c673cae
FG
8485 map<string, bufferlist> updates;
8486 uint32_t count = 0;
8487 while (count < num_entries && !candidates.empty()) {
8488 r = 0;
9f95a23c
TL
8489 // select the next entry in lexical order (first key in map);
8490 // again tracker_idx is not necessarily shard number, but is index
8491 // into results_trackers vector
8492 tracker_idx = candidates.begin()->second;
8493 auto& tracker = results_trackers.at(tracker_idx);
e306af50 8494
9f95a23c
TL
8495 const string& name = tracker.entry_name();
8496 rgw_bucket_dir_entry& dirent = tracker.dir_entry();
8497
b3b6e05e 8498 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ << " currently processing " <<
9f95a23c
TL
8499 dirent.key << " from shard " << tracker.shard_idx << dendl;
8500
8501 const bool force_check =
8502 force_check_filter && force_check_filter(dirent.key.name);
8503
8504 if ((!dirent.exists &&
8505 !dirent.is_delete_marker() &&
8506 !dirent.is_common_prefix()) ||
3efd9988
FG
8507 !dirent.pending_map.empty() ||
8508 force_check) {
9f95a23c
TL
8509 /* there are uncommitted ops. We need to check the current
8510 * state, and if the tags are old we need to do clean-up as
8511 * well. */
7c673cae 8512 librados::IoCtx sub_ctx;
9f95a23c 8513 sub_ctx.dup(ioctx);
b3b6e05e 8514 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
9f95a23c 8515 updates[tracker.oid_name], y);
7c673cae 8516 if (r < 0 && r != -ENOENT) {
9f95a23c 8517 return r;
7c673cae 8518 }
eafe8130 8519 } else {
9f95a23c 8520 r = 0;
7c673cae 8521 }
9f95a23c 8522
7c673cae 8523 if (r >= 0) {
b3b6e05e 8524 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ << ": got " <<
1adf2230 8525 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
7c673cae 8526 m[name] = std::move(dirent);
e306af50 8527 last_entry_visited = &(m[name]);
7c673cae 8528 ++count;
9f95a23c 8529 } else {
b3b6e05e 8530 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ << ": skipping " <<
9f95a23c 8531 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
e306af50 8532 last_entry_visited = &tracker.dir_entry();
7c673cae
FG
8533 }
8534
9f95a23c 8535 // refresh the candidates map
7c673cae 8536 candidates.erase(candidates.begin());
9f95a23c
TL
8537 tracker.advance();
8538
f67539c2 8539 next_candidate(cct, tracker, candidates, tracker_idx);
9f95a23c
TL
8540
8541 if (tracker.at_end() && tracker.is_truncated()) {
8542 // once we exhaust one shard that is truncated, we need to stop,
8543 // as we cannot be certain that one of the next entries needs to
8544 // come from that shard; S3 and swift protocols allow returning
8545 // fewer than what was requested
8546 break;
7c673cae 8547 }
9f95a23c 8548 } // while we haven't provided requested # of result entries
7c673cae 8549
9f95a23c
TL
8550 // suggest updates if there are any
8551 for (auto& miter : updates) {
8552 if (miter.second.length()) {
7c673cae 8553 ObjectWriteOperation o;
9f95a23c 8554 cls_rgw_suggest_changes(o, miter.second);
7c673cae 8555 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8556 AioCompletion *c =
8557 librados::Rados::aio_create_completion(nullptr, nullptr);
8558 ioctx.aio_operate(miter.first, c, &o);
1adf2230 8559 c->release();
7c673cae 8560 }
9f95a23c 8561 } // updates loop
7c673cae 8562
9f95a23c
TL
8563 // determine truncation by checking if all the returned entries are
8564 // consumed or not
8565 *is_truncated = false;
8566 for (const auto& t : results_trackers) {
8567 if (!t.at_end() || t.is_truncated()) {
7c673cae 8568 *is_truncated = true;
1adf2230
AA
8569 break;
8570 }
7c673cae 8571 }
92f5a8d4 8572
b3b6e05e 8573 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ <<
9f95a23c
TL
8574 ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
8575 dendl;
8576
8577 if (*is_truncated && count < num_entries) {
b3b6e05e 8578 ldpp_dout(dpp, 10) << "RGWRados::" << __func__ <<
9f95a23c
TL
8579 ": INFO requested " << num_entries << " entries but returning " <<
8580 count << ", which is truncated" << dendl;
8581 }
8582
8583 if (last_entry_visited != nullptr && last_entry) {
e306af50 8584 *last_entry = last_entry_visited->key;
b3b6e05e 8585 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ <<
9f95a23c
TL
8586 ": returning, last_entry=" << *last_entry << dendl;
8587 } else {
b3b6e05e 8588 ldpp_dout(dpp, 20) << "RGWRados::" << __func__ <<
9f95a23c
TL
8589 ": returning, last_entry NOT SET" << dendl;
8590 }
7c673cae
FG
8591
8592 return 0;
8593}
8594
1adf2230 8595
b3b6e05e
TL
8596int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
8597 RGWBucketInfo& bucket_info,
1adf2230 8598 int shard_id,
9f95a23c 8599 const rgw_obj_index_key& start_after,
1adf2230
AA
8600 const string& prefix,
8601 uint32_t num_entries,
8602 bool list_versions,
8603 std::vector<rgw_bucket_dir_entry>& ent_list,
8604 bool *is_truncated,
8605 rgw_obj_index_key *last_entry,
9f95a23c
TL
8606 optional_yield y,
8607 check_filter_t force_check_filter) {
b3b6e05e 8608 ldpp_dout(dpp, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
9f95a23c 8609 " start_after " << start_after.name << "[" << start_after.instance <<
1adf2230
AA
8610 "] num_entries " << num_entries << dendl;
8611
9f95a23c 8612 ent_list.clear();
11fdf7f2
TL
8613 static MultipartMetaFilter multipart_meta_filter;
8614
1adf2230 8615 *is_truncated = false;
9f95a23c 8616 RGWSI_RADOS::Pool index_pool;
1adf2230 8617
1adf2230 8618 map<int, string> oids;
b3b6e05e 8619 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, nullptr);
1adf2230
AA
8620 if (r < 0)
8621 return r;
9f95a23c
TL
8622
8623 auto& ioctx = index_pool.ioctx();
8624
1adf2230
AA
8625 const uint32_t num_shards = oids.size();
8626
9f95a23c 8627 rgw_obj_index_key marker = start_after;
1adf2230
AA
8628 uint32_t current_shard;
8629 if (shard_id >= 0) {
8630 current_shard = shard_id;
9f95a23c 8631 } else if (start_after.empty()) {
1adf2230
AA
8632 current_shard = 0u;
8633 } else {
9f95a23c
TL
8634 // at this point we have a marker (start_after) that has something
8635 // in it, so we need to get to the bucket shard index, so we can
11fdf7f2
TL
8636 // start reading from there
8637
8638 std::string key;
8639 // test whether object name is a multipart meta name
9f95a23c 8640 if(! multipart_meta_filter.filter(start_after.name, key)) {
11fdf7f2
TL
8641 // if multipart_meta_filter fails, must be "regular" (i.e.,
8642 // unadorned) and the name is the key
9f95a23c 8643 key = start_after.name;
11fdf7f2
TL
8644 }
8645
8646 // now convert the key (oid) to an rgw_obj_key since that will
8647 // separate out the namespace, name, and instance
8648 rgw_obj_key obj_key;
8649 bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key);
8650 if (!parsed) {
b3b6e05e 8651 ldpp_dout(dpp, 0) <<
11fdf7f2 8652 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9f95a23c 8653 "start marker: '" << start_after << "'" << dendl;
11fdf7f2
TL
8654 return -EINVAL;
8655 } else if (obj_key.name.empty()) {
8656 // if the name is empty that means the object name came in with
8657 // a namespace only, and therefore we need to start our scan at
8658 // the first bucket index shard
8659 current_shard = 0u;
8660 } else {
8661 // so now we have the key used to compute the bucket index shard
8662 // and can extract the specific shard from it
9f95a23c 8663 current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
11fdf7f2 8664 }
1adf2230
AA
8665 }
8666
8667 uint32_t count = 0u;
8668 map<string, bufferlist> updates;
11fdf7f2 8669 rgw_obj_index_key last_added_entry;
1adf2230
AA
8670 while (count <= num_entries &&
8671 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
8672 current_shard < num_shards)) {
81eedcae
TL
8673 const std::string& oid = oids[current_shard];
8674 rgw_cls_list_ret result;
8675
8676 librados::ObjectReadOperation op;
9f95a23c
TL
8677 string empty_delimiter;
8678 cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
8679 num_entries,
81eedcae 8680 list_versions, &result);
b3b6e05e 8681 r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
1adf2230
AA
8682 if (r < 0)
8683 return r;
8684
1adf2230
AA
8685 for (auto& entry : result.dir.m) {
8686 rgw_bucket_dir_entry& dirent = entry.second;
8687
8688 bool force_check = force_check_filter &&
8689 force_check_filter(dirent.key.name);
8690 if ((!dirent.exists && !dirent.is_delete_marker()) ||
8691 !dirent.pending_map.empty() ||
8692 force_check) {
8693 /* there are uncommitted ops. We need to check the current state,
8694 * and if the tags are old we need to do cleanup as well. */
8695 librados::IoCtx sub_ctx;
9f95a23c 8696 sub_ctx.dup(ioctx);
b3b6e05e 8697 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
1adf2230
AA
8698 if (r < 0 && r != -ENOENT) {
8699 return r;
8700 }
eafe8130
TL
8701 } else {
8702 r = 0;
1adf2230
AA
8703 }
8704
8705 // at this point either r >=0 or r == -ENOENT
8706 if (r >= 0) { // i.e., if r != -ENOENT
b3b6e05e 8707 ldpp_dout(dpp, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
1adf2230
AA
8708 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8709
8710 if (count < num_entries) {
11fdf7f2 8711 marker = last_added_entry = dirent.key; // double assign
1adf2230
AA
8712 ent_list.emplace_back(std::move(dirent));
8713 ++count;
8714 } else {
8715 *is_truncated = true;
8716 goto check_updates;
8717 }
8718 } else { // r == -ENOENT
8719 // in the case of -ENOENT, make sure we're advancing marker
8720 // for possible next call to CLSRGWIssueBucketList
11fdf7f2 8721 marker = dirent.key;
1adf2230
AA
8722 }
8723 } // entry for loop
8724
8725 if (!result.is_truncated) {
8726 // if we reached the end of the shard read next shard
8727 ++current_shard;
11fdf7f2 8728 marker = rgw_obj_index_key();
1adf2230
AA
8729 }
8730 } // shard loop
8731
8732check_updates:
11fdf7f2 8733
1adf2230
AA
8734 // suggest updates if there is any
8735 map<string, bufferlist>::iterator miter = updates.begin();
8736 for (; miter != updates.end(); ++miter) {
8737 if (miter->second.length()) {
8738 ObjectWriteOperation o;
8739 cls_rgw_suggest_changes(o, miter->second);
8740 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8741 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
8742 ioctx.aio_operate(miter->first, c, &o);
1adf2230
AA
8743 c->release();
8744 }
8745 }
8746
8747 if (last_entry && !ent_list.empty()) {
8748 *last_entry = last_added_entry;
8749 }
8750
8751 return 0;
11fdf7f2 8752} // RGWRados::cls_bucket_list_unordered
1adf2230
AA
8753
8754
b3b6e05e 8755int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
1adf2230 8756 rgw_usage_log_info& info)
7c673cae 8757{
11fdf7f2 8758 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8759
8760 rgw_rados_ref ref;
b3b6e05e 8761 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
8762 if (r < 0) {
8763 return r;
8764 }
8765
8766 ObjectWriteOperation op;
8767 cls_rgw_usage_log_add(op, info);
8768
b3b6e05e 8769 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
8770 return r;
8771}
8772
b3b6e05e 8773int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
11fdf7f2
TL
8774 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
8775 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
8776 bool *is_truncated)
7c673cae 8777{
11fdf7f2 8778 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8779
8780 rgw_rados_ref ref;
b3b6e05e 8781 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
8782 if (r < 0) {
8783 return r;
8784 }
8785
8786 *is_truncated = false;
8787
9f95a23c 8788 r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
7c673cae
FG
8789 max_entries, read_iter, usage, is_truncated);
8790
8791 return r;
8792}
8793
b3b6e05e 8794static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
9f95a23c
TL
8795{
8796 bool done = false;
8797 do {
8798 librados::ObjectWriteOperation op;
8799 cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
b3b6e05e 8800 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
9f95a23c
TL
8801 if (r == -ENODATA)
8802 done = true;
8803 else if (r < 0)
8804 return r;
8805 } while (!done);
8806
8807 return 0;
8808}
8809
b3b6e05e 8810int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
11fdf7f2 8811 uint64_t start_epoch, uint64_t end_epoch)
7c673cae 8812{
11fdf7f2 8813 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8814
8815 rgw_rados_ref ref;
b3b6e05e 8816 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
8817 if (r < 0) {
8818 return r;
8819 }
8820
b3b6e05e 8821 r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
11fdf7f2
TL
8822 return r;
8823}
8824
b3b6e05e 8825int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
11fdf7f2
TL
8826{
8827 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8828
8829 rgw_rados_ref ref;
b3b6e05e 8830 int r = get_raw_obj_ref(dpp, obj, &ref);
11fdf7f2
TL
8831 if (r < 0) {
8832 return r;
8833 }
8834 librados::ObjectWriteOperation op;
8835 cls_rgw_usage_log_clear(op);
b3b6e05e 8836 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
8837 return r;
8838}
8839
11fdf7f2 8840
b3b6e05e 8841int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
7c673cae 8842{
9f95a23c 8843 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
8844 string dir_oid;
8845
11fdf7f2 8846 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae 8847
b3b6e05e 8848 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, &index_pool, &dir_oid);
7c673cae
FG
8849 if (r < 0)
8850 return r;
8851
8852 bufferlist updates;
8853
8854 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
8855 rgw_bucket_dir_entry entry;
8856 entry.key = *iter;
b3b6e05e 8857 ldpp_dout(dpp, 2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
7c673cae
FG
8858 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
8859 updates.append(CEPH_RGW_REMOVE | suggest_flag);
11fdf7f2 8860 encode(entry, updates);
7c673cae
FG
8861 }
8862
8863 bufferlist out;
8864
9f95a23c 8865 r = index_pool.ioctx().exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
7c673cae
FG
8866
8867 return r;
8868}
8869
b3b6e05e
TL
8870int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
8871 librados::IoCtx io_ctx,
7c673cae
FG
8872 const RGWBucketInfo& bucket_info,
8873 rgw_bucket_dir_entry& list_state,
8874 rgw_bucket_dir_entry& object,
9f95a23c
TL
8875 bufferlist& suggested_updates,
8876 optional_yield y)
7c673cae
FG
8877{
8878 const rgw_bucket& bucket = bucket_info.bucket;
11fdf7f2 8879 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae
FG
8880
8881 std::string loc;
8882
8883 rgw_obj obj(bucket, list_state.key);
8884
8885 string oid;
8886 get_obj_bucket_and_oid_loc(obj, oid, loc);
8887
8888 if (loc != list_state.locator) {
b3b6e05e 8889 ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
7c673cae
FG
8890 }
8891
8892 io_ctx.locator_set_key(list_state.locator);
8893
8894 RGWObjState *astate = NULL;
9f95a23c 8895 RGWObjectCtx rctx(this->store);
b3b6e05e 8896 int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
8897 if (r < 0)
8898 return r;
8899
8900 list_state.pending_map.clear(); // we don't need this and it inflates size
9f95a23c 8901 if (!list_state.is_delete_marker() && !astate->exists) {
7c673cae
FG
8902 /* object doesn't exist right now -- hopefully because it's
8903 * marked as !exists and got deleted */
8904 if (list_state.exists) {
8905 /* FIXME: what should happen now? Work out if there are any
8906 * non-bad ways this could happen (there probably are, but annoying
8907 * to handle!) */
8908 }
8909 // encode a suggested removal of that key
8910 list_state.ver.epoch = io_ctx.get_last_version();
8911 list_state.ver.pool = io_ctx.get_id();
8912 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
8913 return -ENOENT;
8914 }
8915
8916 string etag;
8917 string content_type;
8918 ACLOwner owner;
8919
8920 object.meta.size = astate->size;
8921 object.meta.accounted_size = astate->accounted_size;
8922 object.meta.mtime = astate->mtime;
8923
8924 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
8925 if (iter != astate->attrset.end()) {
11fdf7f2 8926 etag = rgw_bl_str(iter->second);
7c673cae
FG
8927 }
8928 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
8929 if (iter != astate->attrset.end()) {
11fdf7f2 8930 content_type = rgw_bl_str(iter->second);
7c673cae
FG
8931 }
8932 iter = astate->attrset.find(RGW_ATTR_ACL);
8933 if (iter != astate->attrset.end()) {
8934 r = decode_policy(iter->second, &owner);
8935 if (r < 0) {
b3b6e05e 8936 ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
7c673cae
FG
8937 }
8938 }
8939
9f95a23c 8940 if (astate->manifest) {
7c673cae 8941 RGWObjManifest::obj_iterator miter;
9f95a23c 8942 RGWObjManifest& manifest = *astate->manifest;
b3b6e05e 8943 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
f67539c2 8944 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(store);
7c673cae 8945 rgw_obj loc;
9f95a23c 8946 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
7c673cae
FG
8947
8948 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
b3b6e05e
TL
8949 ldpp_dout(dpp, 0) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
8950 r = delete_obj_index(loc, astate->mtime, dpp);
7c673cae 8951 if (r < 0) {
b3b6e05e 8952 ldpp_dout(dpp, 0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
7c673cae
FG
8953 }
8954 }
8955 }
8956 }
8957
8958 object.meta.etag = etag;
8959 object.meta.content_type = content_type;
8960 object.meta.owner = owner.get_id().to_str();
8961 object.meta.owner_display_name = owner.get_display_name();
8962
8963 // encode suggested updates
8964 list_state.ver.pool = io_ctx.get_id();
8965 list_state.ver.epoch = astate->epoch;
8966 list_state.meta.size = object.meta.size;
8967 list_state.meta.accounted_size = object.meta.accounted_size;
8968 list_state.meta.mtime = object.meta.mtime;
8969 list_state.meta.category = main_category;
8970 list_state.meta.etag = etag;
8971 list_state.meta.content_type = content_type;
8972 if (astate->obj_tag.length() > 0)
8973 list_state.tag = astate->obj_tag.c_str();
8974 list_state.meta.owner = owner.get_id().to_str();
8975 list_state.meta.owner_display_name = owner.get_display_name();
8976
8977 list_state.exists = true;
8978 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
8979 return 0;
8980}
8981
b3b6e05e 8982int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
7c673cae 8983{
9f95a23c 8984 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
8985 map<int, string> oids;
8986 map<int, struct rgw_cls_list_ret> list_results;
b3b6e05e 8987 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, bucket_instance_ids);
9f95a23c 8988 if (r < 0) {
b3b6e05e 8989 ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
9f95a23c 8990 << r << dendl;
7c673cae 8991 return r;
9f95a23c 8992 }
7c673cae 8993
9f95a23c
TL
8994 r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
8995 if (r < 0) {
b3b6e05e 8996 ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
9f95a23c 8997 << r << dendl;
7c673cae 8998 return r;
9f95a23c 8999 }
7c673cae
FG
9000
9001 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9002 for(; iter != list_results.end(); ++iter) {
a8e16298 9003 headers.push_back(std::move(iter->second.dir.header));
7c673cae
FG
9004 }
9005 return 0;
9006}
9007
b3b6e05e 9008int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
7c673cae 9009{
9f95a23c 9010 RGWSI_RADOS::Pool index_pool;
7c673cae 9011 map<int, string> bucket_objs;
b3b6e05e 9012 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
9013 if (r < 0)
9014 return r;
9015
9016 map<int, string>::iterator iter = bucket_objs.begin();
9017 for (; iter != bucket_objs.end(); ++iter) {
9f95a23c 9018 r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
7c673cae
FG
9019 if (r < 0) {
9020 ctx->put();
9021 break;
9022 } else {
9023 (*num_aio)++;
9024 }
9025 }
9026 return r;
9027}
9028
9f95a23c
TL
9029int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
9030 const rgw_bucket& bucket,
b3b6e05e
TL
9031 uint64_t num_objs,
9032 const DoutPrefixProvider *dpp)
31f18b77 9033{
11fdf7f2 9034 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
31f18b77
FG
9035 return 0;
9036 }
9037
9038 bool need_resharding = false;
9f95a23c 9039 uint32_t num_source_shards =
f67539c2 9040 (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
9f95a23c
TL
9041 const uint32_t max_dynamic_shards =
9042 uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
9043
9044 if (num_source_shards >= max_dynamic_shards) {
9045 return 0;
9046 }
31f18b77 9047
9f95a23c 9048 uint32_t suggested_num_shards = 0;
11fdf7f2
TL
9049 const uint64_t max_objs_per_shard =
9050 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9f95a23c
TL
9051
9052 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
9053 num_objs, need_resharding, &suggested_num_shards);
9054 if (! need_resharding) {
9055 return 0;
31f18b77
FG
9056 }
9057
9f95a23c
TL
9058 const uint32_t final_num_shards =
9059 RGWBucketReshard::get_preferred_shards(suggested_num_shards,
9060 max_dynamic_shards);
9061 // final verification, so we don't reduce number of shards
9062 if (final_num_shards <= num_source_shards) {
9063 return 0;
31f18b77
FG
9064 }
9065
b3b6e05e 9066 ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
f67539c2 9067 " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
9f95a23c
TL
9068 "; new num shards " << final_num_shards << " (suggested " <<
9069 suggested_num_shards << ")" << dendl;
9070
b3b6e05e 9071 return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
31f18b77
FG
9072}
9073
b3b6e05e 9074int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
31f18b77 9075{
b3b6e05e 9076 RGWReshard reshard(this->store, dpp);
31f18b77 9077
f67539c2 9078 uint32_t num_source_shards = (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
31f18b77 9079
11fdf7f2 9080 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
31f18b77 9081 if (new_num_shards <= num_source_shards) {
b3b6e05e 9082 ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
31f18b77
FG
9083 return 0;
9084 }
9085
9086 cls_rgw_reshard_entry entry;
9087 entry.time = real_clock::now();
9088 entry.tenant = bucket_info.owner.tenant;
9089 entry.bucket_name = bucket_info.bucket.name;
9090 entry.bucket_id = bucket_info.bucket.bucket_id;
9091 entry.old_num_shards = num_source_shards;
9092 entry.new_num_shards = new_num_shards;
9093
b3b6e05e 9094 return reshard.add(dpp, entry);
31f18b77
FG
9095}
9096
7c673cae 9097int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
f67539c2
TL
9098 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
9099 uint64_t obj_size, optional_yield y,
9100 bool check_size_only)
7c673cae 9101{
11fdf7f2
TL
9102 // if we only check size, then num_objs will set to 0
9103 if(check_size_only)
f67539c2 9104 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size, y);
11fdf7f2 9105
f67539c2 9106 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size, y);
7c673cae
FG
9107}
9108
f67539c2 9109int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
11fdf7f2 9110 int *shard_id)
7c673cae 9111{
11fdf7f2 9112 int r = 0;
f67539c2
TL
9113 switch (layout.hash_type) {
9114 case rgw::BucketHashType::Mod:
9115 if (!layout.num_shards) {
11fdf7f2
TL
9116 if (shard_id) {
9117 *shard_id = -1;
9118 }
9119 } else {
f67539c2 9120 uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
11fdf7f2
TL
9121 if (shard_id) {
9122 *shard_id = (int)sid;
9123 }
9124 }
9125 break;
9126 default:
9127 r = -ENOTSUP;
7c673cae 9128 }
11fdf7f2 9129 return r;
7c673cae
FG
9130}
9131
7c673cae
FG
9132uint64_t RGWRados::instance_id()
9133{
9134 return get_rados_handle()->get_instance_id();
9135}
9136
9137uint64_t RGWRados::next_bucket_id()
9138{
9f95a23c 9139 std::lock_guard l{bucket_id_lock};
7c673cae
FG
9140 return ++max_bucket_id;
9141}
9142
7c673cae
FG
9143librados::Rados* RGWRados::get_rados_handle()
9144{
494da23a 9145 return &rados;
7c673cae
FG
9146}
9147
b3b6e05e 9148int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
7c673cae
FG
9149{
9150 rgw_rados_ref ref;
b3b6e05e 9151 int ret = get_raw_obj_ref(dpp, obj, &ref);
7c673cae 9152 if (ret < 0) {
b3b6e05e 9153 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
7c673cae
FG
9154 return ret;
9155 }
9156
9157 ObjectWriteOperation op;
9158 list<string> prefixes;
9159 cls_rgw_remove_obj(op, prefixes);
9160
9f95a23c
TL
9161 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9162 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae 9163 if (ret < 0) {
b3b6e05e 9164 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
7c673cae
FG
9165 c->release();
9166 return ret;
9167 }
9168
9169 handles.push_back(c);
9170
9171 return 0;
9172}
9173
b3b6e05e 9174int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
7c673cae 9175 RGWBucketInfo& bucket_info, RGWObjState *astate,
9f95a23c
TL
9176 list<librados::AioCompletion *>& handles, bool keep_index_consistent,
9177 optional_yield y)
7c673cae
FG
9178{
9179 rgw_rados_ref ref;
b3b6e05e 9180 int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae 9181 if (ret < 0) {
b3b6e05e 9182 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
7c673cae
FG
9183 return ret;
9184 }
9185
9186 if (keep_index_consistent) {
9187 RGWRados::Bucket bop(this, bucket_info);
9188 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9189
b3b6e05e 9190 ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
7c673cae 9191 if (ret < 0) {
b3b6e05e 9192 ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
7c673cae
FG
9193 return ret;
9194 }
9195 }
9196
9197 ObjectWriteOperation op;
9198 list<string> prefixes;
9199 cls_rgw_remove_obj(op, prefixes);
9200
9f95a23c
TL
9201 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9202 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae 9203 if (ret < 0) {
b3b6e05e 9204 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
7c673cae
FG
9205 c->release();
9206 return ret;
9207 }
9208
9209 handles.push_back(c);
9210
9211 if (keep_index_consistent) {
b3b6e05e 9212 ret = delete_obj_index(obj, astate->mtime, dpp);
7c673cae 9213 if (ret < 0) {
b3b6e05e 9214 ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
7c673cae
FG
9215 return ret;
9216 }
9217 }
9218 return ret;
9219}