]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
buildsys: change download over to reef release
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
9f95a23c 2// vim: ts=8 sw=2 smarttab ft=cpp
7c673cae 3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
9f95a23c
TL
8#include <sstream>
9
7c673cae 10#include <boost/algorithm/string.hpp>
11fdf7f2 11#include <string_view>
7c673cae 12
11fdf7f2 13#include <boost/container/flat_set.hpp>
7c673cae
FG
14#include <boost/format.hpp>
15#include <boost/optional.hpp>
16#include <boost/utility/in_place_factory.hpp>
17
18#include "common/ceph_json.h"
7c673cae
FG
19
20#include "common/errno.h"
21#include "common/Formatter.h"
22#include "common/Throttle.h"
7c673cae 23
9f95a23c 24#include "rgw_sal.h"
11fdf7f2 25#include "rgw_zone.h"
7c673cae
FG
26#include "rgw_cache.h"
27#include "rgw_acl.h"
28#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
11fdf7f2 29#include "rgw_aio_throttle.h"
7c673cae
FG
30#include "rgw_bucket.h"
31#include "rgw_rest_conn.h"
32#include "rgw_cr_rados.h"
33#include "rgw_cr_rest.h"
f67539c2 34#include "rgw_datalog.h"
11fdf7f2 35#include "rgw_putobj_processor.h"
7c673cae
FG
36
37#include "cls/rgw/cls_rgw_ops.h"
7c673cae
FG
38#include "cls/rgw/cls_rgw_client.h"
39#include "cls/rgw/cls_rgw_const.h"
40#include "cls/refcount/cls_refcount_client.h"
41#include "cls/version/cls_version_client.h"
c07f9fc5 42#include "osd/osd_types.h"
7c673cae
FG
43
44#include "rgw_tools.h"
45#include "rgw_coroutine.h"
46#include "rgw_compression.h"
adb31ebb 47#include "rgw_etag_verifier.h"
9f95a23c 48#include "rgw_worker.h"
f67539c2 49#include "rgw_notify.h"
7c673cae 50
7c673cae
FG
51#undef fork // fails to compile RGWPeriod::fork() below
52
53#include "common/Clock.h"
54
7c673cae
FG
55using namespace librados;
56
57#include <string>
58#include <iostream>
59#include <vector>
60#include <atomic>
61#include <list>
62#include <map>
11fdf7f2 63#include "include/random.h"
7c673cae
FG
64
65#include "rgw_gc.h"
66#include "rgw_lc.h"
67
68#include "rgw_object_expirer_core.h"
69#include "rgw_sync.h"
81eedcae 70#include "rgw_sync_counters.h"
11fdf7f2 71#include "rgw_sync_trace.h"
9f95a23c
TL
72#include "rgw_trim_datalog.h"
73#include "rgw_trim_mdlog.h"
7c673cae
FG
74#include "rgw_data_sync.h"
75#include "rgw_realm_watcher.h"
31f18b77 76#include "rgw_reshard.h"
7c673cae 77
11fdf7f2
TL
78#include "services/svc_zone.h"
79#include "services/svc_zone_utils.h"
80#include "services/svc_quota.h"
81#include "services/svc_sync_modules.h"
82#include "services/svc_sys_obj.h"
83#include "services/svc_sys_obj_cache.h"
9f95a23c
TL
84#include "services/svc_bucket.h"
85#include "services/svc_mdlog.h"
11fdf7f2 86
7c673cae
FG
87#include "compressor/Compressor.h"
88
20effc67
TL
89#include "rgw_d3n_datacache.h"
90
11fdf7f2
TL
91#ifdef WITH_LTTNG
92#define TRACEPOINT_DEFINE
93#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
94#include "tracing/rgw_rados.h"
95#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
96#undef TRACEPOINT_DEFINE
97#else
98#define tracepoint(...)
99#endif
100
7c673cae
FG
101#define dout_context g_ceph_context
102#define dout_subsys ceph_subsys_rgw
103
7c673cae 104
7c673cae 105static string shadow_ns = "shadow";
7c673cae
FG
106static string default_bucket_index_pool_suffix = "rgw.buckets.index";
107static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
11fdf7f2 108
11fdf7f2 109static RGWObjCategory main_category = RGWObjCategory::Main;
7c673cae 110#define RGW_USAGE_OBJ_PREFIX "usage."
7c673cae 111
7c673cae 112
20effc67 113// returns true on success, false on failure
7c673cae 114static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
115 const rgw_placement_rule& head_placement_rule,
116 const rgw_obj& obj, rgw_pool *pool)
7c673cae 117{
11fdf7f2 118 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
7c673cae 119 RGWZonePlacementInfo placement;
11fdf7f2 120 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
7c673cae
FG
121 return false;
122 }
123
124 if (!obj.in_extra_data) {
11fdf7f2 125 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
7c673cae 126 } else {
31f18b77 127 *pool = placement.get_data_extra_pool();
7c673cae
FG
128 }
129 }
130
131 return true;
132}
133
134static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
135 const rgw_placement_rule& head_placement_rule,
136 const rgw_obj& obj, rgw_raw_obj *raw_obj)
7c673cae
FG
137{
138 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
139
11fdf7f2 140 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
7c673cae
FG
141}
142
143rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
144{
145 if (!is_raw) {
146 rgw_raw_obj r;
147 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
148 return r;
149 }
150 return raw_obj;
151}
152
20effc67 153rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* store) const
7c673cae
FG
154{
155 if (!is_raw) {
156 rgw_raw_obj r;
f67539c2 157 store->get_raw_obj(placement_rule, obj, &r);
7c673cae
FG
158 return r;
159 }
160 return raw_obj;
161}
162
11fdf7f2
TL
163void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
164{
165 obj_version *check_objv = version_for_check();
7c673cae 166
11fdf7f2
TL
167 if (check_objv) {
168 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae
FG
169 }
170
11fdf7f2 171 cls_version_read(*op, &read_version);
7c673cae
FG
172}
173
11fdf7f2
TL
174void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
175{
176 obj_version *check_objv = version_for_check();
177 obj_version *modify_version = version_for_write();
7c673cae 178
11fdf7f2
TL
179 if (check_objv) {
180 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae 181 }
7c673cae 182
11fdf7f2
TL
183 if (modify_version) {
184 cls_version_set(*op, *modify_version);
185 } else {
186 cls_version_inc(*op);
7c673cae 187 }
7c673cae
FG
188}
189
f91f0fd5
TL
190void RGWObjVersionTracker::apply_write()
191{
192 const bool checked = (read_version.ver != 0);
193 const bool incremented = (write_version.ver == 0);
194
195 if (checked && incremented) {
196 // apply cls_version_inc() so our next operation can recheck it
197 ++read_version.ver;
198 } else {
199 read_version = write_version;
200 }
201 write_version = obj_version();
202}
203
9f95a23c 204RGWObjState::RGWObjState() {
7c673cae
FG
205}
206
9f95a23c 207RGWObjState::~RGWObjState() {
7c673cae
FG
208}
209
9f95a23c
TL
210RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
211 is_atomic = rhs.is_atomic;
212 has_attrs = rhs.has_attrs;
213 exists = rhs.exists;
214 size = rhs.size;
215 accounted_size = rhs.accounted_size;
216 mtime = rhs.mtime;
217 epoch = rhs.epoch;
218 if (rhs.obj_tag.length()) {
219 obj_tag = rhs.obj_tag;
7c673cae 220 }
9f95a23c
TL
221 if (rhs.tail_tag.length()) {
222 tail_tag = rhs.tail_tag;
7c673cae 223 }
9f95a23c
TL
224 write_tag = rhs.write_tag;
225 fake_tag = rhs.fake_tag;
226 manifest = rhs.manifest;
227 shadow_obj = rhs.shadow_obj;
228 has_data = rhs.has_data;
229 if (rhs.data.length()) {
230 data = rhs.data;
7c673cae 231 }
9f95a23c
TL
232 prefetch_data = rhs.prefetch_data;
233 keep_tail = rhs.keep_tail;
234 is_olh = rhs.is_olh;
235 objv_tracker = rhs.objv_tracker;
236 pg_ver = rhs.pg_ver;
20effc67 237 compressed = rhs.compressed;
7c673cae
FG
238}
239
9f95a23c
TL
240RGWObjState *RGWObjectCtx::get_state(const rgw_obj& obj) {
241 RGWObjState *result;
242 typename std::map<rgw_obj, RGWObjState>::iterator iter;
243 lock.lock_shared();
244 assert (!obj.empty());
245 iter = objs_state.find(obj);
246 if (iter != objs_state.end()) {
247 result = &iter->second;
248 lock.unlock_shared();
249 } else {
250 lock.unlock_shared();
251 lock.lock();
252 result = &objs_state[obj];
253 lock.unlock();
224ce89b 254 }
9f95a23c 255 return result;
7c673cae
FG
256}
257
20effc67
TL
258void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
259 std::unique_lock wl{lock};
260 assert (!obj.empty());
261 objs_state[obj].compressed = true;
262}
263
9f95a23c
TL
264void RGWObjectCtx::set_atomic(rgw_obj& obj) {
265 std::unique_lock wl{lock};
266 assert (!obj.empty());
267 objs_state[obj].is_atomic = true;
7c673cae 268}
9f95a23c
TL
269void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
270 std::unique_lock wl{lock};
271 assert (!obj.empty());
272 objs_state[obj].prefetch_data = true;
7c673cae
FG
273}
274
9f95a23c
TL
275void RGWObjectCtx::invalidate(const rgw_obj& obj) {
276 std::unique_lock wl{lock};
277 auto iter = objs_state.find(obj);
278 if (iter == objs_state.end()) {
11fdf7f2 279 return;
7c673cae 280 }
9f95a23c
TL
281 bool is_atomic = iter->second.is_atomic;
282 bool prefetch_data = iter->second.prefetch_data;
20effc67 283 bool compressed = iter->second.compressed;
7c673cae 284
9f95a23c 285 objs_state.erase(iter);
7c673cae 286
20effc67 287 if (is_atomic || prefetch_data || compressed) {
9f95a23c
TL
288 auto& state = objs_state[obj];
289 state.is_atomic = is_atomic;
290 state.prefetch_data = prefetch_data;
20effc67 291 state.compressed = compressed;
11fdf7f2 292 }
7c673cae
FG
293}
294
11fdf7f2 295void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
7c673cae 296{
11fdf7f2
TL
297 write_version.ver = 1;
298#define TAG_LEN 24
7c673cae 299
11fdf7f2
TL
300 write_version.tag.clear();
301 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
7c673cae
FG
302}
303
7c673cae 304class RGWMetaNotifierManager : public RGWCoroutinesManager {
20effc67 305 RGWRados* store;
7c673cae
FG
306 RGWHTTPManager http_manager;
307
308public:
309 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
310 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 311 http_manager.start();
7c673cae
FG
312 }
313
b3b6e05e 314 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
7c673cae
FG
315 rgw_http_param_pair pairs[] = { { "type", "metadata" },
316 { "notify", NULL },
317 { NULL, NULL } };
318
319 list<RGWCoroutinesStack *> stacks;
9f95a23c 320 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
321 RGWRESTConn *conn = iter->second;
322 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
323 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
324
325 stacks.push_back(stack);
326 }
b3b6e05e 327 return run(dpp, stacks);
7c673cae
FG
328 }
329};
330
331class RGWDataNotifierManager : public RGWCoroutinesManager {
20effc67 332 RGWRados* store;
7c673cae
FG
333 RGWHTTPManager http_manager;
334
335public:
336 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
337 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 338 http_manager.start();
7c673cae
FG
339 }
340
b3b6e05e 341 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
f67539c2 342 bc::flat_map<int, bc::flat_set<string> >& shards) {
7c673cae
FG
343 rgw_http_param_pair pairs[] = { { "type", "data" },
344 { "notify", NULL },
11fdf7f2 345 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
7c673cae
FG
346 { NULL, NULL } };
347
348 list<RGWCoroutinesStack *> stacks;
9f95a23c 349 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
350 RGWRESTConn *conn = iter->second;
351 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
f67539c2 352 stack->call(new RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
7c673cae
FG
353
354 stacks.push_back(stack);
355 }
b3b6e05e 356 return run(dpp, stacks);
7c673cae
FG
357 }
358};
359
11fdf7f2
TL
360/* class RGWRadosThread */
361
7c673cae
FG
362void RGWRadosThread::start()
363{
364 worker = new Worker(cct, this);
365 worker->create(thread_name.c_str());
366}
367
368void RGWRadosThread::stop()
369{
370 down_flag = true;
371 stop_process();
372 if (worker) {
31f18b77 373 worker->signal();
7c673cae
FG
374 worker->join();
375 }
376 delete worker;
377 worker = NULL;
378}
379
380void *RGWRadosThread::Worker::entry() {
381 uint64_t msec = processor->interval_msec();
9f95a23c 382 auto interval = std::chrono::milliseconds(msec);
7c673cae
FG
383
384 do {
9f95a23c 385 auto start = ceph::real_clock::now();
b3b6e05e 386 int r = processor->process(this);
7c673cae 387 if (r < 0) {
b3b6e05e 388 ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
7c673cae
FG
389 }
390
391 if (processor->going_down())
392 break;
393
9f95a23c 394 auto end = ceph::real_clock::now() - start;
7c673cae
FG
395
396 uint64_t cur_msec = processor->interval_msec();
397 if (cur_msec != msec) { /* was it reconfigured? */
398 msec = cur_msec;
9f95a23c 399 interval = std::chrono::milliseconds(msec);
7c673cae
FG
400 }
401
402 if (cur_msec > 0) {
403 if (interval <= end)
404 continue; // next round
405
9f95a23c 406 auto wait_time = interval - end;
31f18b77 407 wait_interval(wait_time);
7c673cae 408 } else {
31f18b77 409 wait();
7c673cae
FG
410 }
411 } while (!processor->going_down());
412
413 return NULL;
414}
415
416class RGWMetaNotifier : public RGWRadosThread {
417 RGWMetaNotifierManager notify_mgr;
418 RGWMetadataLog *const log;
419
420 uint64_t interval_msec() override {
421 return cct->_conf->rgw_md_notify_interval_msec;
422 }
1adf2230
AA
423 void stop_process() override {
424 notify_mgr.stop();
425 }
7c673cae
FG
426public:
427 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
428 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
429
b3b6e05e 430 int process(const DoutPrefixProvider *dpp) override;
7c673cae
FG
431};
432
b3b6e05e 433int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
7c673cae
FG
434{
435 set<int> shards;
436
437 log->read_clear_modified(shards);
438
439 if (shards.empty()) {
440 return 0;
441 }
442
443 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
b3b6e05e 444 ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
7c673cae
FG
445 }
446
b3b6e05e 447 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
7c673cae
FG
448
449 return 0;
450}
451
452class RGWDataNotifier : public RGWRadosThread {
453 RGWDataNotifierManager notify_mgr;
454
455 uint64_t interval_msec() override {
11fdf7f2 456 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 457 }
1adf2230
AA
458 void stop_process() override {
459 notify_mgr.stop();
460 }
7c673cae
FG
461public:
462 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
463
b3b6e05e 464 int process(const DoutPrefixProvider *dpp) override;
7c673cae
FG
465};
466
b3b6e05e 467int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
7c673cae 468{
f67539c2 469 auto data_log = store->svc.datalog_rados;
9f95a23c 470 if (!data_log) {
7c673cae
FG
471 return 0;
472 }
473
f67539c2 474 auto shards = data_log->read_clear_modified();
7c673cae
FG
475
476 if (shards.empty()) {
477 return 0;
478 }
479
f67539c2 480 for (const auto& [shard_id, keys] : shards) {
b3b6e05e 481 ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
f67539c2 482 << shard_id << ": " << keys << dendl;
7c673cae
FG
483 }
484
b3b6e05e 485 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
7c673cae
FG
486
487 return 0;
488}
489
490class RGWSyncProcessorThread : public RGWRadosThread {
491public:
492 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
493 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
494 ~RGWSyncProcessorThread() override {}
b3b6e05e
TL
495 int init(const DoutPrefixProvider *dpp) override = 0 ;
496 int process(const DoutPrefixProvider *dpp) override = 0;
7c673cae
FG
497};
498
499class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
500{
501 RGWMetaSyncStatusManager sync;
502
503 uint64_t interval_msec() override {
504 return 0; /* no interval associated, it'll run once until stopped */
505 }
506 void stop_process() override {
507 sync.stop();
508 }
509public:
20effc67 510 RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados)
9f95a23c 511 : RGWSyncProcessorThread(_store->getRados(), "meta-sync"), sync(_store, async_rados) {}
7c673cae
FG
512
513 void wakeup_sync_shards(set<int>& shard_ids) {
514 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
515 sync.wakeup(*iter);
516 }
517 }
518 RGWMetaSyncStatusManager* get_manager() { return &sync; }
519
b3b6e05e
TL
520 int init(const DoutPrefixProvider *dpp) override {
521 int ret = sync.init(dpp);
7c673cae 522 if (ret < 0) {
b3b6e05e 523 ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
7c673cae
FG
524 return ret;
525 }
526 return 0;
527 }
528
b3b6e05e
TL
529 int process(const DoutPrefixProvider *dpp) override {
530 sync.run(dpp, null_yield);
7c673cae
FG
531 return 0;
532 }
533};
534
535class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
536{
81eedcae 537 PerfCountersRef counters;
7c673cae
FG
538 RGWDataSyncStatusManager sync;
539 bool initialized;
540
541 uint64_t interval_msec() override {
542 if (initialized) {
543 return 0; /* no interval associated, it'll run once until stopped */
544 } else {
545#define DATA_SYNC_INIT_WAIT_SEC 20
546 return DATA_SYNC_INIT_WAIT_SEC * 1000;
547 }
548 }
549 void stop_process() override {
550 sync.stop();
551 }
552public:
20effc67 553 RGWDataSyncProcessorThread(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados,
81eedcae 554 const RGWZone* source_zone)
9f95a23c 555 : RGWSyncProcessorThread(_store->getRados(), "data-sync"),
81eedcae
TL
556 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
557 sync(_store, async_rados, source_zone->id, counters.get()),
7c673cae
FG
558 initialized(false) {}
559
560 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
561 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
562 sync.wakeup(iter->first, iter->second);
563 }
564 }
565 RGWDataSyncStatusManager* get_manager() { return &sync; }
566
b3b6e05e 567 int init(const DoutPrefixProvider *dpp) override {
7c673cae
FG
568 return 0;
569 }
570
b3b6e05e 571 int process(const DoutPrefixProvider *dpp) override {
7c673cae
FG
572 while (!initialized) {
573 if (going_down()) {
574 return 0;
575 }
b3b6e05e 576 int ret = sync.init(dpp);
7c673cae
FG
577 if (ret >= 0) {
578 initialized = true;
579 break;
580 }
581 /* we'll be back! */
582 return 0;
583 }
b3b6e05e 584 sync.run(dpp);
7c673cae
FG
585 return 0;
586 }
587};
588
11fdf7f2 589class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
7c673cae
FG
590{
591 RGWCoroutinesManager crs;
20effc67 592 rgw::sal::RadosStore* store;
b32b8144 593 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
594 RGWHTTPManager http;
595 const utime_t trim_interval;
596
597 uint64_t interval_msec() override { return 0; }
598 void stop_process() override { crs.stop(); }
599public:
20effc67 600 RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
b32b8144 601 int interval)
9f95a23c
TL
602 : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
603 crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
b32b8144 604 bucket_trim(bucket_trim),
7c673cae
FG
605 http(store->ctx(), crs.get_completion_mgr()),
606 trim_interval(interval, 0)
607 {}
608
b3b6e05e 609 int init(const DoutPrefixProvider *dpp) override {
11fdf7f2 610 return http.start();
7c673cae 611 }
b3b6e05e 612 int process(const DoutPrefixProvider *dpp) override {
7c673cae 613 list<RGWCoroutinesStack*> stacks;
20effc67
TL
614 auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
615 cct->_conf->rgw_md_log_max_shards,
616 trim_interval);
617 if (!metatrimcr) {
618 ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
619 return -EINVAL;
620 }
7c673cae 621 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
20effc67
TL
622 meta->call(metatrimcr);
623
7c673cae
FG
624 stacks.push_back(meta);
625
9f95a23c
TL
626 if (store->svc()->zone->sync_module_exports_data()) {
627 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
20effc67 628 data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
9f95a23c
TL
629 cct->_conf->rgw_data_log_num_shards,
630 trim_interval));
631 stacks.push_back(data);
7c673cae 632
9f95a23c
TL
633 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
634 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
635 stacks.push_back(bucket);
636 }
b32b8144 637
b3b6e05e 638 crs.run(dpp, stacks);
7c673cae
FG
639 return 0;
640 }
11fdf7f2
TL
641
642 // implements DoutPrefixProvider
643 CephContext *get_cct() const override { return store->ctx(); }
9f95a23c 644 unsigned get_subsys() const override
11fdf7f2
TL
645 {
646 return dout_subsys;
647 }
648
9f95a23c 649 std::ostream& gen_prefix(std::ostream& out) const override
11fdf7f2
TL
650 {
651 return out << "sync log trim: ";
652 }
653
7c673cae
FG
654};
655
656void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
657{
9f95a23c 658 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
659 if (meta_sync_processor_thread) {
660 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
661 }
662}
663
20effc67 664void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, map<int, set<string> >& shard_ids)
7c673cae 665{
20effc67 666 ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
9f95a23c
TL
667 std::lock_guard l{data_sync_thread_lock};
668 auto iter = data_sync_processor_threads.find(source_zone);
7c673cae 669 if (iter == data_sync_processor_threads.end()) {
20effc67 670 ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
7c673cae
FG
671 return;
672 }
673
674 RGWDataSyncProcessorThread *thread = iter->second;
11fdf7f2 675 ceph_assert(thread);
7c673cae
FG
676 thread->wakeup_sync_shards(shard_ids);
677}
678
679RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
680{
9f95a23c 681 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
682 if (meta_sync_processor_thread) {
683 return meta_sync_processor_thread->get_manager();
684 }
685 return nullptr;
686}
687
9f95a23c 688RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
7c673cae 689{
9f95a23c 690 std::lock_guard l{data_sync_thread_lock};
7c673cae
FG
691 auto thread = data_sync_processor_threads.find(source_zone);
692 if (thread == data_sync_processor_threads.end()) {
693 return nullptr;
694 }
695 return thread->second->get_manager();
696}
697
b3b6e05e 698int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
7c673cae
FG
699{
700 IoCtx ioctx;
b3b6e05e 701 int r = open_pool_ctx(dpp, pool, ioctx, false);
7c673cae 702 if (r < 0) {
20effc67 703 ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
7c673cae
FG
704 return r;
705 }
706
707 bool requires;
708 r = ioctx.pool_requires_alignment2(&requires);
709 if (r < 0) {
20effc67 710 ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
7c673cae
FG
711 << r << dendl;
712 return r;
713 }
714
715 if (!requires) {
716 *alignment = 0;
717 return 0;
718 }
719
720 uint64_t align;
721 r = ioctx.pool_required_alignment2(&align);
722 if (r < 0) {
20effc67 723 ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
7c673cae
FG
724 << r << dendl;
725 return r;
726 }
727 if (align != 0) {
20effc67 728 ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
7c673cae
FG
729 }
730 *alignment = align;
731 return 0;
732}
733
11fdf7f2
TL
734void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
735{
736 if (alignment == 0) {
737 *max_size = size;
738 return;
739 }
740
741 if (size <= alignment) {
742 *max_size = alignment;
743 return;
744 }
745
746 *max_size = size - (size % alignment);
747}
748
b3b6e05e 749int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
7c673cae 750{
11fdf7f2 751 uint64_t alignment;
b3b6e05e 752 int r = get_required_alignment(dpp, pool, &alignment);
7c673cae
FG
753 if (r < 0) {
754 return r;
755 }
756
11fdf7f2
TL
757 if (palignment) {
758 *palignment = alignment;
7c673cae
FG
759 }
760
11fdf7f2 761 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
7c673cae 762
11fdf7f2 763 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
7c673cae 764
b3b6e05e 765 ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
7c673cae
FG
766
767 return 0;
768}
769
11fdf7f2 770int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
b3b6e05e 771 uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
7c673cae
FG
772{
773 rgw_pool pool;
774 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
b3b6e05e 775 ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
7c673cae
FG
776 return -EIO;
777 }
b3b6e05e 778 return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
7c673cae
FG
779}
780
31f18b77
FG
781class RGWIndexCompletionManager;
782
783struct complete_op_data {
9f95a23c 784 ceph::mutex lock = ceph::make_mutex("complete_op_data");
31f18b77
FG
785 AioCompletion *rados_completion{nullptr};
786 int manager_shard_id{-1};
787 RGWIndexCompletionManager *manager{nullptr};
788 rgw_obj obj;
789 RGWModifyOp op;
790 string tag;
791 rgw_bucket_entry_ver ver;
792 cls_rgw_obj_key key;
793 rgw_bucket_dir_entry_meta dir_meta;
794 list<cls_rgw_obj_key> remove_objs;
795 bool log_op;
796 uint16_t bilog_op;
797 rgw_zone_set zones_trace;
798
799 bool stopped{false};
800
801 void stop() {
9f95a23c 802 std::lock_guard l{lock};
31f18b77
FG
803 stopped = true;
804 }
805};
806
39ae355f
TL
807class RGWIndexCompletionManager {
808 RGWRados* const store;
809 const int num_shards;
810 ceph::containers::tiny_vector<ceph::mutex> locks;
811 std::vector<set<complete_op_data*>> completions;
812 std::vector<complete_op_data*> retry_completions;
31f18b77 813
39ae355f
TL
814 std::condition_variable cond;
815 std::mutex retry_completions_lock;
816 bool _stop{false};
817 std::thread retry_thread;
31f18b77 818
39ae355f 819 std::atomic<int> cur_shard {0};
31f18b77 820
39ae355f
TL
821 void process();
822
823 void add_completion(complete_op_data *completion);
824
825 void stop() {
826 if (retry_thread.joinable()) {
827 _stop = true;
828 cond.notify_all();
829 retry_thread.join();
31f18b77
FG
830 }
831
39ae355f
TL
832 for (int i = 0; i < num_shards; ++i) {
833 std::lock_guard l{locks[i]};
834 for (auto c : completions[i]) {
835 c->stop();
836 }
31f18b77 837 }
39ae355f
TL
838 completions.clear();
839 }
840
841 int next_shard() {
842 int result = cur_shard % num_shards;
843 cur_shard++;
844 return result;
31f18b77 845 }
31f18b77
FG
846
847public:
9f95a23c
TL
848 RGWIndexCompletionManager(RGWRados *_store) :
849 store(_store),
39ae355f 850 num_shards(store->ctx()->_conf->rgw_thread_pool_size),
9f95a23c 851 locks{ceph::make_lock_container<ceph::mutex>(
39ae355f 852 num_shards,
9f95a23c
TL
853 [](const size_t i) {
854 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
855 std::to_string(i));
39ae355f
TL
856 })},
857 completions(num_shards),
858 retry_thread(&RGWIndexCompletionManager::process, this)
859 {}
860
31f18b77
FG
861 ~RGWIndexCompletionManager() {
862 stop();
31f18b77
FG
863 }
864
31f18b77
FG
865 void create_completion(const rgw_obj& obj,
866 RGWModifyOp op, string& tag,
867 rgw_bucket_entry_ver& ver,
868 const cls_rgw_obj_key& key,
869 rgw_bucket_dir_entry_meta& dir_meta,
870 list<cls_rgw_obj_key> *remove_objs, bool log_op,
871 uint16_t bilog_op,
872 rgw_zone_set *zones_trace,
873 complete_op_data **result);
31f18b77 874
39ae355f 875 bool handle_completion(completion_t cb, complete_op_data *arg);
31f18b77 876
39ae355f
TL
877 CephContext* ctx() {
878 return store->ctx();
31f18b77
FG
879 }
880};
881
882static void obj_complete_cb(completion_t cb, void *arg)
883{
39ae355f 884 complete_op_data *completion = reinterpret_cast<complete_op_data*>(arg);
9f95a23c 885 completion->lock.lock();
31f18b77 886 if (completion->stopped) {
9f95a23c 887 completion->lock.unlock(); /* can drop lock, no one else is referencing us */
31f18b77
FG
888 delete completion;
889 return;
890 }
891 bool need_delete = completion->manager->handle_completion(cb, completion);
9f95a23c 892 completion->lock.unlock();
31f18b77
FG
893 if (need_delete) {
894 delete completion;
895 }
896}
897
39ae355f
TL
898void RGWIndexCompletionManager::process()
899{
900 DoutPrefix dpp(store->ctx(), dout_subsys, "rgw index completion thread: ");
901 while(!_stop) {
902 std::vector<complete_op_data*> comps;
903
904 {
905 std::unique_lock l{retry_completions_lock};
906 cond.wait(l, [this](){return _stop || !retry_completions.empty();});
907 if (_stop) {
908 return;
909 }
910 retry_completions.swap(comps);
911 }
912
913 for (auto c : comps) {
914 std::unique_ptr<complete_op_data> up{c};
915
916 ldpp_dout(&dpp, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
917
918 RGWRados::BucketShard bs(store);
919 RGWBucketInfo bucket_info;
920
921 int r = bs.init(c->obj.bucket, c->obj, &bucket_info, &dpp);
922 if (r < 0) {
923 ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
924 /* not much to do */
925 continue;
926 }
927
928 r = store->guard_reshard(&dpp, &bs, c->obj, bucket_info,
929 [&](RGWRados::BucketShard *bs) -> int {
930 librados::ObjectWriteOperation o;
931 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
932 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
933 c->log_op, c->bilog_op, &c->zones_trace);
934 return bs->bucket_obj.operate(&dpp, &o, null_yield);
935 });
936 if (r < 0) {
937 ldpp_dout(&dpp, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
938 /* ignoring error, can't do anything about it */
939 continue;
940 }
941
942 r = store->svc.datalog_rados->add_entry(&dpp, bucket_info, bs.shard_id);
943 if (r < 0) {
944 ldpp_dout(&dpp, -1) << "ERROR: failed writing data log" << dendl;
945 }
946 }
947 }
948}
31f18b77
FG
949
950void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
951 RGWModifyOp op, string& tag,
952 rgw_bucket_entry_ver& ver,
953 const cls_rgw_obj_key& key,
954 rgw_bucket_dir_entry_meta& dir_meta,
955 list<cls_rgw_obj_key> *remove_objs, bool log_op,
956 uint16_t bilog_op,
957 rgw_zone_set *zones_trace,
958 complete_op_data **result)
959{
960 complete_op_data *entry = new complete_op_data;
961
962 int shard_id = next_shard();
963
964 entry->manager_shard_id = shard_id;
965 entry->manager = this;
966 entry->obj = obj;
967 entry->op = op;
968 entry->tag = tag;
969 entry->ver = ver;
970 entry->key = key;
971 entry->dir_meta = dir_meta;
972 entry->log_op = log_op;
973 entry->bilog_op = bilog_op;
974
975 if (remove_objs) {
976 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
977 entry->remove_objs.push_back(*iter);
978 }
979 }
980
981 if (zones_trace) {
982 entry->zones_trace = *zones_trace;
983 } else {
9f95a23c 984 entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
31f18b77
FG
985 }
986
987 *result = entry;
988
9f95a23c 989 entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
31f18b77 990
9f95a23c 991 std::lock_guard l{locks[shard_id]};
39ae355f
TL
992 const auto ok = completions[shard_id].insert(entry).second;
993 ceph_assert(ok);
994}
995
996void RGWIndexCompletionManager::add_completion(complete_op_data *completion) {
997 {
998 std::lock_guard l{retry_completions_lock};
999 retry_completions.push_back(completion);
1000 }
1001 cond.notify_all();
31f18b77
FG
1002}
1003
1004bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
1005{
1006 int shard_id = arg->manager_shard_id;
1007 {
9f95a23c 1008 std::lock_guard l{locks[shard_id]};
31f18b77
FG
1009
1010 auto& comps = completions[shard_id];
1011
1012 auto iter = comps.find(arg);
1013 if (iter == comps.end()) {
39ae355f 1014 ldout(arg->manager->ctx(), 0) << __func__ << "(): cannot find completion for obj=" << arg->key << dendl;
31f18b77
FG
1015 return true;
1016 }
1017
1018 comps.erase(iter);
1019 }
1020
1021 int r = rados_aio_get_return_value(cb);
1022 if (r != -ERR_BUSY_RESHARDING) {
39ae355f
TL
1023 ldout(arg->manager->ctx(), 20) << __func__ << "(): completion " <<
1024 (r == 0 ? "ok" : "failed with " + to_string(r)) <<
1025 " for obj=" << arg->key << dendl;
31f18b77
FG
1026 return true;
1027 }
39ae355f
TL
1028 add_completion(arg);
1029 ldout(arg->manager->ctx(), 20) << __func__ << "(): async completion added for obj=" << arg->key << dendl;
31f18b77
FG
1030 return false;
1031}
1032
7c673cae
FG
1033void RGWRados::finalize()
1034{
1035 if (run_sync_thread) {
9f95a23c 1036 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
1037 meta_sync_processor_thread->stop();
1038
9f95a23c 1039 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1040 for (auto iter : data_sync_processor_threads) {
1041 RGWDataSyncProcessorThread *thread = iter.second;
1042 thread->stop();
1043 }
1044 if (sync_log_trimmer) {
1045 sync_log_trimmer->stop();
1046 }
1047 }
7c673cae
FG
1048 if (run_sync_thread) {
1049 delete meta_sync_processor_thread;
1050 meta_sync_processor_thread = NULL;
9f95a23c 1051 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1052 for (auto iter : data_sync_processor_threads) {
1053 RGWDataSyncProcessorThread *thread = iter.second;
1054 delete thread;
1055 }
1056 data_sync_processor_threads.clear();
1057 delete sync_log_trimmer;
1058 sync_log_trimmer = nullptr;
b32b8144 1059 bucket_trim = boost::none;
7c673cae 1060 }
7c673cae
FG
1061 if (meta_notifier) {
1062 meta_notifier->stop();
1063 delete meta_notifier;
1064 }
1065 if (data_notifier) {
1066 data_notifier->stop();
1067 delete data_notifier;
1068 }
11fdf7f2 1069 delete sync_tracer;
11fdf7f2
TL
1070
1071 delete lc;
1072 lc = NULL;
7c673cae 1073
11fdf7f2
TL
1074 delete gc;
1075 gc = NULL;
7c673cae 1076
11fdf7f2
TL
1077 delete obj_expirer;
1078 obj_expirer = NULL;
7c673cae 1079
11fdf7f2
TL
1080 RGWQuotaHandler::free_handler(quota_handler);
1081 if (cr_registry) {
1082 cr_registry->put();
7c673cae
FG
1083 }
1084
11fdf7f2 1085 svc.shutdown();
7c673cae 1086
11fdf7f2
TL
1087 delete binfo_cache;
1088 delete obj_tombstone_cache;
20effc67
TL
1089 if (d3n_data_cache)
1090 delete d3n_data_cache;
7c673cae 1091
11fdf7f2
TL
1092 if (reshard_wait.get()) {
1093 reshard_wait->stop();
1094 reshard_wait.reset();
7c673cae
FG
1095 }
1096
11fdf7f2
TL
1097 if (run_reshard_thread) {
1098 reshard->stop_processor();
7c673cae 1099 }
11fdf7f2
TL
1100 delete reshard;
1101 delete index_completion_manager;
f67539c2
TL
1102
1103 rgw::notify::shutdown();
11fdf7f2
TL
1104}
1105
1106/**
1107 * Initialize the RADOS instance and prepare to do other ops
1108 * Returns 0 on success, -ERR# on failure.
1109 */
1110int RGWRados::init_rados()
1111{
1112 int ret = 0;
7c673cae 1113
494da23a
TL
1114 ret = rados.init_with_context(cct);
1115 if (ret < 0) {
1116 return ret;
1117 }
1118 ret = rados.connect();
1119 if (ret < 0) {
1120 return ret;
7c673cae 1121 }
11fdf7f2
TL
1122
1123 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1124 new RGWCoroutinesManagerRegistry(cct)};
1125 ret = crs->hook_to_admin_command("cr dump");
1126 if (ret < 0) {
1127 return ret;
7c673cae
FG
1128 }
1129
11fdf7f2 1130 cr_registry = crs.release();
20effc67
TL
1131
1132 if (use_datacache) {
1133 d3n_data_cache = new D3nDataCache();
1134 d3n_data_cache->init(cct);
1135 }
1136
11fdf7f2 1137 return ret;
7c673cae
FG
1138}
1139
20effc67 1140int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
7c673cae 1141{
f67539c2
TL
1142 string name = cct->_conf->name.get_id();
1143 if (name.compare(0, 4, "rgw.") == 0) {
1144 name = name.substr(4);
1145 }
11fdf7f2 1146 map<string,string> metadata = meta;
494da23a 1147 metadata["num_handles"] = "1"s;
11fdf7f2
TL
1148 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1149 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1150 metadata["zone_name"] = svc.zone->zone_name();
9f95a23c 1151 metadata["zone_id"] = svc.zone->zone_id().id;
522d829b
TL
1152 metadata["realm_name"] = svc.zone->get_realm().get_name();
1153 metadata["realm_id"] = svc.zone->get_realm().get_id();
f67539c2
TL
1154 metadata["id"] = name;
1155 int ret = rados.service_daemon_register(
1156 daemon_type,
1157 stringify(rados.get_instance_id()),
1158 metadata);
11fdf7f2 1159 if (ret < 0) {
20effc67 1160 ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
11fdf7f2 1161 return ret;
7c673cae
FG
1162 }
1163
1164 return 0;
1165}
1166
20effc67 1167int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
7c673cae 1168{
494da23a 1169 int ret = rados.service_daemon_update_status(move(status));
11fdf7f2 1170 if (ret < 0) {
20effc67 1171 ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
11fdf7f2
TL
1172 return ret;
1173 }
1174
1175 return 0;
7c673cae
FG
1176}
1177
1178/**
1179 * Initialize the RADOS instance and prepare to do other ops
1180 * Returns 0 on success, -ERR# on failure.
1181 */
b3b6e05e 1182int RGWRados::init_complete(const DoutPrefixProvider *dpp)
7c673cae 1183{
11fdf7f2 1184 int ret;
7c673cae 1185
11fdf7f2
TL
1186 /*
1187 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1188 */
9f95a23c 1189 sync_module = svc.sync_modules->get_sync_module();
7c673cae 1190
b3b6e05e 1191 ret = open_root_pool_ctx(dpp);
7c673cae
FG
1192 if (ret < 0)
1193 return ret;
1194
b3b6e05e 1195 ret = open_gc_pool_ctx(dpp);
7c673cae
FG
1196 if (ret < 0)
1197 return ret;
1198
b3b6e05e 1199 ret = open_lc_pool_ctx(dpp);
7c673cae
FG
1200 if (ret < 0)
1201 return ret;
1202
b3b6e05e 1203 ret = open_objexp_pool_ctx(dpp);
7c673cae
FG
1204 if (ret < 0)
1205 return ret;
1206
b3b6e05e 1207 ret = open_reshard_pool_ctx(dpp);
31f18b77
FG
1208 if (ret < 0)
1209 return ret;
1210
b3b6e05e 1211 ret = open_notif_pool_ctx(dpp);
f67539c2
TL
1212 if (ret < 0)
1213 return ret;
1214
7c673cae
FG
1215 pools_initialized = true;
1216
522d829b
TL
1217 if (use_gc) {
1218 gc = new RGWGC();
1219 gc->initialize(cct, this);
1220 } else {
1221 ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
1222 }
7c673cae 1223
9f95a23c 1224 obj_expirer = new RGWObjectExpirer(this->store);
7c673cae 1225
522d829b 1226 if (use_gc_thread && use_gc) {
7c673cae
FG
1227 gc->start_processor();
1228 obj_expirer->start_processor();
1229 }
1230
11fdf7f2
TL
1231 auto& current_period = svc.zone->get_current_period();
1232 auto& zonegroup = svc.zone->get_zonegroup();
1233 auto& zone_params = svc.zone->get_zone_params();
1234 auto& zone = svc.zone->get_zone();
1235
7c673cae
FG
1236 /* no point of running sync thread if we don't have a master zone configured
1237 or there is no rest_master_conn */
9f95a23c 1238 if (!svc.zone->need_to_sync()) {
7c673cae
FG
1239 run_sync_thread = false;
1240 }
1241
11fdf7f2 1242 if (svc.zone->is_meta_master()) {
9f95a23c 1243 auto md_log = svc.mdlog->get_log(current_period.get_id());
7c673cae
FG
1244 meta_notifier = new RGWMetaNotifier(this, md_log);
1245 meta_notifier->start();
1246 }
1247
11fdf7f2
TL
1248 /* init it anyway, might run sync through radosgw-admin explicitly */
1249 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1250 sync_tracer->init(this);
1251 ret = sync_tracer->hook_to_admin_command();
1252 if (ret < 0) {
1253 return ret;
1254 }
1255
7c673cae 1256 if (run_sync_thread) {
11fdf7f2
TL
1257 for (const auto &pt: zonegroup.placement_targets) {
1258 if (zone_params.placement_pools.find(pt.second.name)
1259 == zone_params.placement_pools.end()){
b3b6e05e 1260 ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
11fdf7f2
TL
1261 << pt.second.name << " present in zonegroup" << dendl;
1262 }
1263 }
9f95a23c
TL
1264 auto async_processor = svc.rados->get_async_processor();
1265 std::lock_guard l{meta_sync_thread_lock};
1266 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->store, async_processor);
b3b6e05e 1267 ret = meta_sync_processor_thread->init(dpp);
7c673cae 1268 if (ret < 0) {
b3b6e05e 1269 ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
7c673cae
FG
1270 return ret;
1271 }
1272 meta_sync_processor_thread->start();
1273
b32b8144
FG
1274 // configure the bucket trim manager
1275 rgw::BucketTrimConfig config;
1276 rgw::configure_bucket_trim(cct, config);
1277
9f95a23c 1278 bucket_trim.emplace(this->store, config);
b32b8144
FG
1279 ret = bucket_trim->init();
1280 if (ret < 0) {
b3b6e05e 1281 ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
b32b8144
FG
1282 return ret;
1283 }
9f95a23c 1284 svc.datalog_rados->set_observer(&*bucket_trim);
b32b8144 1285
9f95a23c 1286 std::lock_guard dl{data_sync_thread_lock};
81eedcae 1287 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
b3b6e05e 1288 ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
9f95a23c 1289 auto *thread = new RGWDataSyncProcessorThread(this->store, svc.rados->get_async_processor(), source_zone);
b3b6e05e 1290 ret = thread->init(dpp);
7c673cae 1291 if (ret < 0) {
b3b6e05e 1292 ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
7c673cae
FG
1293 return ret;
1294 }
1295 thread->start();
9f95a23c 1296 data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
7c673cae
FG
1297 }
1298 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1299 if (interval > 0) {
9f95a23c 1300 sync_log_trimmer = new RGWSyncLogTrimThread(this->store, &*bucket_trim, interval);
b3b6e05e 1301 ret = sync_log_trimmer->init(dpp);
7c673cae 1302 if (ret < 0) {
b3b6e05e 1303 ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
7c673cae
FG
1304 return ret;
1305 }
1306 sync_log_trimmer->start();
1307 }
1308 }
20effc67
TL
1309 if (cct->_conf->rgw_data_notify_interval_msec) {
1310 data_notifier = new RGWDataNotifier(this);
1311 data_notifier->start();
1312 }
7c673cae 1313
92f5a8d4
TL
1314 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1315 binfo_cache->init(svc.cache);
1316
7c673cae 1317 lc = new RGWLC();
9f95a23c 1318 lc->initialize(cct, this->store);
31f18b77 1319
7c673cae
FG
1320 if (use_lc_thread)
1321 lc->start_processor();
31f18b77 1322
b3b6e05e 1323 quota_handler = RGWQuotaHandler::generate_handler(dpp, this->store, quota_threads);
7c673cae
FG
1324
1325 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
11fdf7f2 1326 zone.bucket_index_max_shards);
31f18b77
FG
1327 if (bucket_index_max_shards > get_max_bucket_shards()) {
1328 bucket_index_max_shards = get_max_bucket_shards();
b3b6e05e 1329 ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 1330 << get_max_bucket_shards() << dendl;
7c673cae 1331 }
b3b6e05e 1332 ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
7c673cae 1333
11fdf7f2 1334 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
7c673cae
FG
1335
1336 if (need_tombstone_cache) {
1337 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1338 }
1339
11fdf7f2 1340 reshard_wait = std::make_shared<RGWReshardWait>();
31f18b77 1341
9f95a23c 1342 reshard = new RGWReshard(this->store);
31f18b77
FG
1343
1344 /* only the master zone in the zonegroup reshards buckets */
11fdf7f2 1345 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
31f18b77
FG
1346 if (run_reshard_thread) {
1347 reshard->start_processor();
1348 }
1349
1350 index_completion_manager = new RGWIndexCompletionManager(this);
b3b6e05e 1351 ret = rgw::notify::init(cct, store, dpp);
f67539c2 1352 if (ret < 0 ) {
b3b6e05e 1353 ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
f67539c2 1354 }
31f18b77 1355
7c673cae
FG
1356 return ret;
1357}
1358
b3b6e05e 1359int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
11fdf7f2
TL
1360{
1361 if (raw) {
b3b6e05e 1362 return svc.init_raw(cct, use_cache, null_yield, dpp);
11fdf7f2
TL
1363 }
1364
b3b6e05e 1365 return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
9f95a23c
TL
1366}
1367
b3b6e05e 1368int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
9f95a23c 1369{
39ae355f 1370 return ctl.init(&svc, store, dpp);
11fdf7f2
TL
1371}
1372
7c673cae
FG
1373/**
1374 * Initialize the RADOS instance and prepare to do other ops
1375 * Returns 0 on success, -ERR# on failure.
1376 */
b3b6e05e 1377int RGWRados::initialize(const DoutPrefixProvider *dpp)
7c673cae
FG
1378{
1379 int ret;
1380
11fdf7f2
TL
1381 inject_notify_timeout_probability =
1382 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1383 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
7c673cae 1384
b3b6e05e 1385 ret = init_svc(false, dpp);
7c673cae 1386 if (ret < 0) {
b3b6e05e 1387 ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
7c673cae
FG
1388 return ret;
1389 }
7c673cae 1390
b3b6e05e 1391 ret = init_ctl(dpp);
9f95a23c 1392 if (ret < 0) {
b3b6e05e 1393 ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
9f95a23c
TL
1394 return ret;
1395 }
1396
11fdf7f2 1397 host_id = svc.zone_utils->gen_host_id();
7c673cae 1398
11fdf7f2
TL
1399 ret = init_rados();
1400 if (ret < 0)
1401 return ret;
1402
b3b6e05e 1403 return init_complete(dpp);
7c673cae
FG
1404}
1405
1406/**
1407 * Open the pool used as root for this gateway
1408 * Returns: 0 on success, -ERR# otherwise.
1409 */
b3b6e05e 1410int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1411{
b3b6e05e 1412 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
7c673cae
FG
1413}
1414
b3b6e05e 1415int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1416{
b3b6e05e 1417 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
7c673cae
FG
1418}
1419
b3b6e05e 1420int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1421{
b3b6e05e 1422 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
7c673cae
FG
1423}
1424
b3b6e05e 1425int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1426{
b3b6e05e 1427 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
7c673cae
FG
1428}
1429
b3b6e05e 1430int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
31f18b77 1431{
b3b6e05e 1432 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
7c673cae
FG
1433}
1434
b3b6e05e 1435int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
f67539c2 1436{
b3b6e05e 1437 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
f67539c2
TL
1438}
1439
b3b6e05e 1440int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
494da23a 1441 bool mostly_omap)
7c673cae 1442{
28e407b8 1443 constexpr bool create = true; // create the pool if it doesn't exist
b3b6e05e 1444 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
7c673cae
FG
1445}
1446
7c673cae
FG
1447/**** logs ****/
1448
1449struct log_list_state {
1450 string prefix;
1451 librados::IoCtx io_ctx;
1452 librados::NObjectIterator obit;
1453};
1454
b3b6e05e 1455int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
7c673cae
FG
1456{
1457 log_list_state *state = new log_list_state;
b3b6e05e 1458 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1459 if (r < 0) {
1460 delete state;
1461 return r;
1462 }
1463 state->prefix = prefix;
1464 state->obit = state->io_ctx.nobjects_begin();
1465 *handle = (RGWAccessHandle)state;
1466 return 0;
1467}
1468
1469int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1470{
1471 log_list_state *state = static_cast<log_list_state *>(handle);
1472 while (true) {
1473 if (state->obit == state->io_ctx.nobjects_end()) {
1474 delete state;
1475 return -ENOENT;
1476 }
1477 if (state->prefix.length() &&
1478 state->obit->get_oid().find(state->prefix) != 0) {
1479 state->obit++;
1480 continue;
1481 }
1482 *name = state->obit->get_oid();
1483 state->obit++;
1484 break;
1485 }
1486 return 0;
1487}
1488
b3b6e05e 1489int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
7c673cae
FG
1490{
1491 librados::IoCtx io_ctx;
b3b6e05e 1492 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
1493 if (r < 0)
1494 return r;
1495 return io_ctx.remove(name);
1496}
1497
1498struct log_show_state {
1499 librados::IoCtx io_ctx;
1500 bufferlist bl;
11fdf7f2 1501 bufferlist::const_iterator p;
7c673cae
FG
1502 string name;
1503 uint64_t pos;
1504 bool eof;
1505 log_show_state() : pos(0), eof(false) {}
1506};
1507
b3b6e05e 1508int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
7c673cae
FG
1509{
1510 log_show_state *state = new log_show_state;
b3b6e05e 1511 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1512 if (r < 0) {
1513 delete state;
1514 return r;
1515 }
1516 state->name = name;
1517 *handle = (RGWAccessHandle)state;
1518 return 0;
1519}
1520
20effc67 1521int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
7c673cae
FG
1522{
1523 log_show_state *state = static_cast<log_show_state *>(handle);
1524 off_t off = state->p.get_off();
1525
20effc67 1526 ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
7c673cae
FG
1527 << " off " << off
1528 << " eof " << (int)state->eof
1529 << dendl;
1530 // read some?
1531 unsigned chunk = 1024*1024;
1532 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1533 bufferlist more;
1534 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1535 if (r < 0)
1536 return r;
1537 state->pos += r;
1538 bufferlist old;
1539 try {
1540 old.substr_of(state->bl, off, state->bl.length() - off);
1541 } catch (buffer::error& err) {
1542 return -EINVAL;
1543 }
f67539c2 1544 state->bl = std::move(old);
7c673cae 1545 state->bl.claim_append(more);
11fdf7f2 1546 state->p = state->bl.cbegin();
7c673cae
FG
1547 if ((unsigned)r < chunk)
1548 state->eof = true;
20effc67 1549 ldpp_dout(dpp, 10) << " read " << r << dendl;
7c673cae
FG
1550 }
1551
1552 if (state->p.end())
1553 return 0; // end of file
1554 try {
11fdf7f2 1555 decode(*entry, state->p);
7c673cae
FG
1556 }
1557 catch (const buffer::error &e) {
1558 return -EINVAL;
1559 }
1560 return 1;
1561}
1562
1563/**
1564 * usage_log_hash: get usage log key hash, based on name and index
1565 *
1566 * Get the usage object name. Since a user may have more than 1
1567 * object holding that info (multiple shards), we use index to
1568 * specify that shard number. Once index exceeds max shards it
1569 * wraps.
1570 * If name is not being set, results for all users will be returned
1571 * and index will wrap only after total shards number.
1572 *
1573 * @param cct [in] ceph context
1574 * @param name [in] user name
1575 * @param hash [out] hash value
1576 * @param index [in] shard index number
1577 */
1578static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1579{
1580 uint32_t val = index;
1581
1582 if (!name.empty()) {
c07f9fc5 1583 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
1584 val %= max_user_shards;
1585 val += ceph_str_hash_linux(name.c_str(), name.size());
1586 }
1587 char buf[17];
c07f9fc5 1588 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
1589 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1590 hash = buf;
1591}
1592
b3b6e05e 1593int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
7c673cae
FG
1594{
1595 uint32_t index = 0;
1596
1597 map<string, rgw_usage_log_info> log_objs;
1598
1599 string hash;
1600 string last_user;
1601
1602 /* restructure usage map, zone by object hash */
1603 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1604 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1605 const rgw_user_bucket& ub = iter->first;
1606 RGWUsageBatch& info = iter->second;
1607
1608 if (ub.user.empty()) {
b3b6e05e 1609 ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
7c673cae
FG
1610 continue;
1611 }
1612
1613 if (ub.user != last_user) {
1614 /* index *should* be random, but why waste extra cycles
1615 in most cases max user shards is not going to exceed 1,
1616 so just incrementing it */
1617 usage_log_hash(cct, ub.user, hash, index++);
1618 }
1619 last_user = ub.user;
1620 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1621
1622 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1623 v.push_back(miter->second);
1624 }
1625 }
1626
1627 map<string, rgw_usage_log_info>::iterator liter;
1628
1629 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
b3b6e05e 1630 int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
7c673cae
FG
1631 if (r < 0)
1632 return r;
1633 }
1634 return 0;
1635}
1636
b3b6e05e 1637int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
11fdf7f2
TL
1638 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1639 rgw_usage_log_entry>& usage)
7c673cae
FG
1640{
1641 uint32_t num = max_entries;
1642 string hash, first_hash;
1643 string user_str = user.to_str();
1644 usage_log_hash(cct, user_str, first_hash, 0);
1645
1646 if (usage_iter.index) {
1647 usage_log_hash(cct, user_str, hash, usage_iter.index);
1648 } else {
1649 hash = first_hash;
1650 }
1651
1652 usage.clear();
1653
1654 do {
1655 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1656 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1657
b3b6e05e 1658 int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
7c673cae
FG
1659 usage_iter.read_iter, ret_usage, is_truncated);
1660 if (ret == -ENOENT)
1661 goto next;
1662
1663 if (ret < 0)
1664 return ret;
1665
1666 num -= ret_usage.size();
1667
1668 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1669 usage[iter->first].aggregate(iter->second);
1670 }
1671
1672next:
1673 if (!*is_truncated) {
1674 usage_iter.read_iter.clear();
1675 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1676 }
1677 } while (num && !*is_truncated && hash != first_hash);
1678 return 0;
1679}
1680
b3b6e05e 1681int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
7c673cae
FG
1682{
1683 uint32_t index = 0;
1684 string hash, first_hash;
1685 string user_str = user.to_str();
1686 usage_log_hash(cct, user_str, first_hash, index);
1687
1688 hash = first_hash;
7c673cae 1689 do {
b3b6e05e 1690 int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
7c673cae 1691
b32b8144 1692 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
1693 return ret;
1694
7c673cae
FG
1695 usage_log_hash(cct, user_str, hash, ++index);
1696 } while (hash != first_hash);
1697
1698 return 0;
1699}
1700
11fdf7f2 1701
b3b6e05e 1702int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
11fdf7f2
TL
1703{
1704 auto max_shards = cct->_conf->rgw_usage_max_shards;
1705 int ret=0;
1706 for (unsigned i=0; i < max_shards; i++){
1707 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
b3b6e05e 1708 ret = cls_obj_usage_log_clear(dpp, oid);
11fdf7f2 1709 if (ret < 0){
b3b6e05e 1710 ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
11fdf7f2
TL
1711 return ret;
1712 }
1713 }
1714 return ret;
1715}
1716
20effc67 1717int RGWRados::decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner)
7c673cae 1718{
9f95a23c
TL
1719 auto i = bl.cbegin();
1720 RGWAccessControlPolicy policy(cct);
1721 try {
1722 policy.decode_owner(i);
1723 } catch (buffer::error& err) {
20effc67 1724 ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
9f95a23c 1725 return -EIO;
7c673cae 1726 }
9f95a23c
TL
1727 *owner = policy.get_owner();
1728 return 0;
7c673cae
FG
1729}
1730
b3b6e05e 1731int rgw_policy_from_attrset(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
7c673cae 1732{
9f95a23c
TL
1733 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
1734 if (aiter == attrset.end())
1735 return -EIO;
7c673cae 1736
9f95a23c
TL
1737 bufferlist& bl = aiter->second;
1738 auto iter = bl.cbegin();
1739 try {
1740 policy->decode(iter);
1741 } catch (buffer::error& err) {
b3b6e05e 1742 ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
9f95a23c
TL
1743 return -EIO;
1744 }
1745 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
1746 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
b3b6e05e 1747 ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy";
9f95a23c
TL
1748 s3policy->to_xml(*_dout);
1749 *_dout << dendl;
1750 }
1751 return 0;
7c673cae
FG
1752}
1753
7c673cae 1754
b3b6e05e 1755int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
7c673cae 1756{
9f95a23c
TL
1757 rgw_bucket bucket = bucket_info.bucket;
1758 bucket.update_bucket_id(new_bucket_id);
7c673cae 1759
9f95a23c 1760 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 1761
9f95a23c 1762 bucket_info.objv_tracker.clear();
b3b6e05e 1763 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
9f95a23c
TL
1764 if (ret < 0) {
1765 return ret;
7c673cae
FG
1766 }
1767
9f95a23c 1768 return 0;
eafe8130
TL
1769}
1770
1771
1adf2230
AA
1772/**
1773 * Get ordered listing of the objects in a bucket.
7c673cae 1774 *
9f95a23c 1775 * max_p: maximum number of results to return
7c673cae
FG
1776 * bucket: bucket to list contents of
1777 * prefix: only return results that match this prefix
1778 * delim: do not include results that match this string.
1779 * Any skipped results will have the matching portion of their name
1780 * inserted in common_prefixes with a "true" mark.
1781 * marker: if filled in, begin the listing with this object.
1782 * end_marker: if filled in, end the listing with this object.
1783 * result: the objects are put in here.
11fdf7f2
TL
1784 * common_prefixes: if delim is filled in, any matching prefixes are
1785 * placed here.
1786 * is_truncated: if number of objects in the bucket is bigger than
1787 * max, then truncated.
7c673cae 1788 */
11fdf7f2 1789int RGWRados::Bucket::List::list_objects_ordered(
b3b6e05e 1790 const DoutPrefixProvider *dpp,
eafe8130 1791 int64_t max_p,
20effc67
TL
1792 std::vector<rgw_bucket_dir_entry> *result,
1793 std::map<std::string, bool> *common_prefixes,
9f95a23c
TL
1794 bool *is_truncated,
1795 optional_yield y)
7c673cae
FG
1796{
1797 RGWRados *store = target->get_store();
1798 CephContext *cct = store->ctx();
1799 int shard_id = target->get_shard_id();
1800
1801 int count = 0;
1802 bool truncated = true;
9f95a23c 1803 bool cls_filtered = false;
eafe8130
TL
1804 const int64_t max = // protect against memory issues and negative vals
1805 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
1806 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
7c673cae
FG
1807
1808 result->clear();
1809
9f95a23c
TL
1810 // use a local marker; either the marker will have a previous entry
1811 // or it will be empty; either way it's OK to copy
1812 rgw_obj_key marker_obj(params.marker.name,
1813 params.marker.instance,
f91f0fd5 1814 params.ns.empty() ? params.marker.ns : params.ns);
7c673cae
FG
1815 rgw_obj_index_key cur_marker;
1816 marker_obj.get_index_key(&cur_marker);
1817
9f95a23c
TL
1818 rgw_obj_key end_marker_obj(params.end_marker.name,
1819 params.end_marker.instance,
f91f0fd5 1820 params.ns.empty() ? params.end_marker.ns : params.ns);
3efd9988
FG
1821 rgw_obj_index_key cur_end_marker;
1822 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
1823 const bool cur_end_marker_valid = !params.end_marker.empty();
1824
1825 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 1826 prefix_obj.set_ns(params.ns);
20effc67
TL
1827 std::string cur_prefix = prefix_obj.get_index_key_name();
1828 std::string after_delim_s; /* needed in !params.delim.empty() AND later */
7c673cae
FG
1829
1830 if (!params.delim.empty()) {
9f95a23c 1831 after_delim_s = cls_rgw_after_delim(params.delim);
11fdf7f2
TL
1832 /* if marker points at a common prefix, fast forward it into its
1833 * upper bound string */
224ce89b 1834 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
1835 if (delim_pos >= 0) {
1836 string s = cur_marker.name.substr(0, delim_pos);
11fdf7f2 1837 s.append(after_delim_s);
7c673cae
FG
1838 cur_marker = s;
1839 }
1840 }
1adf2230 1841
20effc67
TL
1842 // we'll stop after this many attempts as long we return at least
1843 // one entry; but we will also go beyond this number of attempts
1844 // until we return at least one entry
1845 constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;
1846
9f95a23c 1847 rgw_obj_index_key prev_marker;
f6b5b4d7 1848 for (uint16_t attempt = 1; /* empty */; ++attempt) {
20effc67
TL
1849 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1850 ": starting attempt " << attempt << dendl;
9f95a23c
TL
1851
1852 if (attempt > 1 && !(prev_marker < cur_marker)) {
1853 // we've failed to make forward progress
20effc67
TL
1854 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
1855 " marker failed to make forward progress; attempt=" << attempt <<
9f95a23c
TL
1856 ", prev_marker=" << prev_marker <<
1857 ", cur_marker=" << cur_marker << dendl;
1858 break;
1859 }
1860 prev_marker = cur_marker;
1861
1862 ent_map_t ent_map;
1863 ent_map.reserve(read_ahead);
b3b6e05e
TL
1864 int r = store->cls_bucket_list_ordered(dpp,
1865 target->get_bucket_info(),
1adf2230
AA
1866 shard_id,
1867 cur_marker,
1868 cur_prefix,
9f95a23c 1869 params.delim,
1adf2230
AA
1870 read_ahead + 1 - count,
1871 params.list_versions,
9f95a23c 1872 attempt,
1adf2230
AA
1873 ent_map,
1874 &truncated,
9f95a23c
TL
1875 &cls_filtered,
1876 &cur_marker,
20effc67
TL
1877 y,
1878 params.force_check_filter);
9f95a23c 1879 if (r < 0) {
7c673cae 1880 return r;
9f95a23c 1881 }
7c673cae 1882
1adf2230 1883 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
1884 rgw_bucket_dir_entry& entry = eiter->second;
1885 rgw_obj_index_key index_key = entry.key;
7c673cae
FG
1886 rgw_obj_key obj(index_key);
1887
20effc67
TL
1888 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1889 ": considering entry " << entry.key << dendl;
9f95a23c 1890
1adf2230
AA
1891 /* note that parse_raw_oid() here will not set the correct
1892 * object's instance, as rgw_obj_index_key encodes that
1893 * separately. We don't need to set the instance because it's
1894 * not needed for the checks here and we end up using the raw
1895 * entry for the return vector
7c673cae
FG
1896 */
1897 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
1898 if (!valid) {
20effc67
TL
1899 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
1900 " could not parse object name: " << obj.name << dendl;
7c673cae
FG
1901 continue;
1902 }
11fdf7f2 1903
9f95a23c 1904 bool matched_ns = (obj.ns == params.ns);
7c673cae 1905 if (!params.list_versions && !entry.is_visible()) {
20effc67
TL
1906 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1907 ": skipping not visible entry \"" << entry.key << "\"" << dendl;
7c673cae
FG
1908 continue;
1909 }
1910
9f95a23c 1911 if (params.enforce_ns && !matched_ns) {
7c673cae
FG
1912 if (!params.ns.empty()) {
1913 /* we've iterated past the namespace we're searching -- done now */
1914 truncated = false;
20effc67
TL
1915 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1916 ": finished due to getting past requested namespace \"" <<
1917 params.ns << "\"" << dendl;
7c673cae
FG
1918 goto done;
1919 }
1920
20effc67
TL
1921 /* we're skipping past namespaced objects */
1922 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1923 ": skipping past namespaced objects, including \"" << entry.key <<
1924 "\"" << dendl;
7c673cae
FG
1925 continue;
1926 }
1927
1928 if (cur_end_marker_valid && cur_end_marker <= index_key) {
1929 truncated = false;
20effc67
TL
1930 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1931 ": finished due to gitting end marker of \"" << cur_end_marker <<
1932 "\" with \"" << entry.key << "\"" << dendl;
7c673cae
FG
1933 goto done;
1934 }
1935
1936 if (count < max) {
9f95a23c
TL
1937 params.marker = index_key;
1938 next_marker = index_key;
7c673cae
FG
1939 }
1940
20effc67
TL
1941 if (params.access_list_filter &&
1942 ! params.access_list_filter->filter(obj.name, index_key.name)) {
1943 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1944 ": skipping past namespaced objects, including \"" << entry.key <<
1945 "\"" << dendl;
7c673cae 1946 continue;
9f95a23c 1947 }
7c673cae 1948
1adf2230 1949 if (params.prefix.size() &&
9f95a23c 1950 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
20effc67
TL
1951 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1952 ": skipping object \"" << entry.key <<
1953 "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
7c673cae 1954 continue;
9f95a23c 1955 }
7c673cae
FG
1956
1957 if (!params.delim.empty()) {
9f95a23c
TL
1958 const int delim_pos = obj.name.find(params.delim, params.prefix.size());
1959 if (delim_pos >= 0) {
1960 // run either the code where delimiter filtering is done a)
1961 // in the OSD/CLS or b) here.
1962 if (cls_filtered) {
1963 // NOTE: this condition is for the newer versions of the
20effc67
TL
1964 // OSD that does filtering on the CLS side should only
1965 // find one delimiter at the end if it finds any after the
1966 // prefix
9f95a23c
TL
1967 if (delim_pos !=
1968 int(obj.name.length() - params.delim.length())) {
20effc67
TL
1969 ldpp_dout(dpp, 0) << "WARNING: " << __PRETTY_FUNCTION__ <<
1970 " found delimiter in place other than the end of "
9f95a23c
TL
1971 "the prefix; obj.name=" << obj.name <<
1972 ", prefix=" << params.prefix << dendl;
1973 }
1974 if (common_prefixes) {
1975 if (count >= max) {
1976 truncated = true;
20effc67
TL
1977 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1978 ": stopping early with common prefix \"" << entry.key <<
1979 "\" because requested number (" << max <<
1980 ") reached (cls filtered)" << dendl;
9f95a23c
TL
1981 goto done;
1982 }
1983
1984 (*common_prefixes)[obj.name] = true;
1985 count++;
1986 }
1987
20effc67
TL
1988 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1989 ": finished entry with common prefix \"" << entry.key <<
1990 "\" so continuing loop (cls filtered)" << dendl;
9f95a23c
TL
1991 continue;
1992 } else {
1993 // NOTE: this condition is for older versions of the OSD
1994 // that do not filter on the CLS side, so the following code
1995 // must do the filtering; once we reach version 16 of ceph,
1996 // this code can be removed along with the conditional that
1997 // can lead this way
1998
1999 /* extract key -with trailing delimiter- for CommonPrefix */
2000 string prefix_key =
2001 obj.name.substr(0, delim_pos + params.delim.length());
2002
2003 if (common_prefixes &&
2004 common_prefixes->find(prefix_key) == common_prefixes->end()) {
2005 if (count >= max) {
2006 truncated = true;
20effc67
TL
2007 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2008 ": stopping early with common prefix \"" << entry.key <<
2009 "\" because requested number (" << max <<
2010 ") reached (not cls filtered)" << dendl;
9f95a23c
TL
2011 goto done;
2012 }
2013 next_marker = prefix_key;
2014 (*common_prefixes)[prefix_key] = true;
2015
2016 count++;
2017 }
2018
20effc67
TL
2019 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2020 ": finished entry with common prefix \"" << entry.key <<
2021 "\" so continuing loop (not cls filtered)" << dendl;
9f95a23c
TL
2022 continue;
2023 } // if we're running an older OSD version
2024 } // if a delimiter was found after prefix
2025 } // if a delimiter was passed in
7c673cae
FG
2026
2027 if (count >= max) {
2028 truncated = true;
20effc67
TL
2029 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2030 ": stopping early with entry \"" << entry.key <<
2031 "\" because requested number (" << max <<
2032 ") reached" << dendl;
7c673cae
FG
2033 goto done;
2034 }
2035
20effc67
TL
2036 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2037 ": adding entry " << entry.key << " to result" << dendl;
9f95a23c 2038
7c673cae
FG
2039 result->emplace_back(std::move(entry));
2040 count++;
9f95a23c
TL
2041 } // eiter for loop
2042
2043 // NOTE: the following conditional is needed by older versions of
2044 // the OSD that don't do delimiter filtering on the CLS side; once
2045 // we reach version 16 of ceph, the following conditional and the
2046 // code within can be removed
2047 if (!cls_filtered && !params.delim.empty()) {
2048 int marker_delim_pos =
2049 cur_marker.name.find(params.delim, cur_prefix.size());
eafe8130 2050 if (marker_delim_pos >= 0) {
9f95a23c
TL
2051 std::string skip_after_delim =
2052 cur_marker.name.substr(0, marker_delim_pos);
eafe8130
TL
2053 skip_after_delim.append(after_delim_s);
2054
20effc67
TL
2055 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2056 ": skip_after_delim=" << skip_after_delim << dendl;
eafe8130
TL
2057
2058 if (skip_after_delim > cur_marker.name) {
2059 cur_marker = skip_after_delim;
20effc67
TL
2060 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2061 ": setting cur_marker=" << cur_marker.name <<
2062 "[" << cur_marker.instance << "]" << dendl;
eafe8130
TL
2063 }
2064 }
9f95a23c
TL
2065 } // if older osd didn't do delimiter filtering
2066
20effc67
TL
2067 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2068 ": end of outer loop, truncated=" << truncated <<
9f95a23c
TL
2069 ", count=" << count << ", attempt=" << attempt << dendl;
2070
2071 if (!truncated || count >= (max + 1) / 2) {
2072 // if we finished listing, or if we're returning at least half the
2073 // requested entries, that's enough; S3 and swift protocols allow
2074 // returning fewer than max entries
20effc67
TL
2075 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2076 ": exiting attempt loop because we reached end (" << truncated <<
2077 ") or we're returning half the requested entries (" << count <<
2078 " of " << max << ")" << dendl;
9f95a23c 2079 break;
20effc67 2080 } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
9f95a23c
TL
2081 // if we've made at least 8 attempts and we have some, but very
2082 // few, results, return with what we have
20effc67
TL
2083 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2084 ": exiting attempt loop because we made " << attempt <<
2085 " attempts and we're returning " << count << " entries" << dendl;
9f95a23c 2086 break;
eafe8130 2087 }
f6b5b4d7 2088 } // for (uint16_t attempt...
7c673cae
FG
2089
2090done:
9f95a23c
TL
2091
2092 if (is_truncated) {
7c673cae 2093 *is_truncated = truncated;
9f95a23c 2094 }
7c673cae
FG
2095
2096 return 0;
1adf2230
AA
2097} // list_objects_ordered
2098
2099
2100/**
2101 * Get listing of the objects in a bucket and allow the results to be out
2102 * of order.
2103 *
2104 * Even though there are key differences with the ordered counterpart,
2105 * the parameters are the same to maintain some compatability.
2106 *
2107 * max: maximum number of results to return
2108 * bucket: bucket to list contents of
2109 * prefix: only return results that match this prefix
2110 * delim: should not be set; if it is we should have indicated an error
2111 * marker: if filled in, begin the listing with this object.
2112 * end_marker: if filled in, end the listing with this object.
2113 * result: the objects are put in here.
2114 * common_prefixes: this is never filled with an unordered list; the param
2115 * is maintained for compatibility
2116 * is_truncated: if number of objects in the bucket is bigger than max, then
2117 * truncated.
2118 */
20effc67 2119int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
b3b6e05e 2120 int64_t max_p,
20effc67
TL
2121 std::vector<rgw_bucket_dir_entry>* result,
2122 std::map<std::string, bool>* common_prefixes,
2123 bool* is_truncated,
9f95a23c 2124 optional_yield y)
1adf2230
AA
2125{
2126 RGWRados *store = target->get_store();
1adf2230
AA
2127 int shard_id = target->get_shard_id();
2128
2129 int count = 0;
2130 bool truncated = true;
2131
eafe8130
TL
2132 const int64_t max = // protect against memory issues and negative vals
2133 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2134
1adf2230
AA
2135 // read a few extra in each call to cls_bucket_list_unordered in
2136 // case some are filtered out due to namespace matching, versioning,
2137 // filtering, etc.
2138 const int64_t max_read_ahead = 100;
2139 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2140
2141 result->clear();
2142
9f95a23c
TL
2143 // use a local marker; either the marker will have a previous entry
2144 // or it will be empty; either way it's OK to copy
11fdf7f2
TL
2145 rgw_obj_key marker_obj(params.marker.name,
2146 params.marker.instance,
f91f0fd5 2147 params.ns.empty() ? params.marker.ns : params.ns);
1adf2230
AA
2148 rgw_obj_index_key cur_marker;
2149 marker_obj.get_index_key(&cur_marker);
2150
11fdf7f2
TL
2151 rgw_obj_key end_marker_obj(params.end_marker.name,
2152 params.end_marker.instance,
f91f0fd5 2153 params.ns.empty() ? params.end_marker.ns : params.ns);
1adf2230
AA
2154 rgw_obj_index_key cur_end_marker;
2155 end_marker_obj.get_index_key(&cur_end_marker);
2156 const bool cur_end_marker_valid = !params.end_marker.empty();
2157
2158 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 2159 prefix_obj.set_ns(params.ns);
20effc67 2160 std::string cur_prefix = prefix_obj.get_index_key_name();
1adf2230
AA
2161
2162 while (truncated && count <= max) {
2163 std::vector<rgw_bucket_dir_entry> ent_list;
9f95a23c
TL
2164 ent_list.reserve(read_ahead);
2165
20effc67 2166 int r = store->cls_bucket_list_unordered(dpp,
b3b6e05e 2167 target->get_bucket_info(),
1adf2230
AA
2168 shard_id,
2169 cur_marker,
2170 cur_prefix,
2171 read_ahead,
2172 params.list_versions,
2173 ent_list,
2174 &truncated,
9f95a23c
TL
2175 &cur_marker,
2176 y);
20effc67
TL
2177 if (r < 0) {
2178 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
2179 " cls_bucket_list_unordered returned " << r << " for " <<
2180 target->get_bucket_info().bucket << dendl;
1adf2230 2181 return r;
20effc67 2182 }
1adf2230
AA
2183
2184 // NB: while regions of ent_list will be sorted, we have no
2185 // guarantee that all items will be sorted since they can cross
2186 // shard boundaries
2187
2188 for (auto& entry : ent_list) {
2189 rgw_obj_index_key index_key = entry.key;
2190 rgw_obj_key obj(index_key);
2191
9f95a23c
TL
2192 if (count < max) {
2193 params.marker.set(index_key);
2194 next_marker.set(index_key);
2195 }
2196
1adf2230
AA
2197 /* note that parse_raw_oid() here will not set the correct
2198 * object's instance, as rgw_obj_index_key encodes that
2199 * separately. We don't need to set the instance because it's
2200 * not needed for the checks here and we end up using the raw
2201 * entry for the return vector
2202 */
2203 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2204 if (!valid) {
20effc67
TL
2205 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
2206 " could not parse object name: " << obj.name << dendl;
1adf2230
AA
2207 continue;
2208 }
2209
2210 if (!params.list_versions && !entry.is_visible()) {
20effc67
TL
2211 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2212 ": skippping \"" << index_key <<
2213 "\" because not listing versions and entry not visibile" << dendl;
1adf2230
AA
2214 continue;
2215 }
2216
2217 if (params.enforce_ns && obj.ns != params.ns) {
20effc67
TL
2218 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2219 ": skippping \"" << index_key <<
2220 "\" because namespace does not match" << dendl;
1adf2230
AA
2221 continue;
2222 }
2223
2224 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2225 // we're not guaranteed items will come in order, so we have
2226 // to loop through all
20effc67
TL
2227 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2228 ": skippping \"" << index_key <<
2229 "\" because after end_marker" << dendl;
1adf2230
AA
2230 continue;
2231 }
2232
20effc67
TL
2233 if (params.access_list_filter &&
2234 !params.access_list_filter->filter(obj.name, index_key.name)) {
2235 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2236 ": skippping \"" << index_key <<
2237 "\" because doesn't match filter" << dendl;
1adf2230 2238 continue;
20effc67 2239 }
1adf2230
AA
2240
2241 if (params.prefix.size() &&
20effc67
TL
2242 (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
2243 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2244 ": skippping \"" << index_key <<
2245 "\" because doesn't match prefix" << dendl;
2246 continue;
2247 }
1adf2230
AA
2248
2249 if (count >= max) {
2250 truncated = true;
2251 goto done;
2252 }
2253
2254 result->emplace_back(std::move(entry));
2255 count++;
2256 } // for (auto& entry : ent_list)
2257 } // while (truncated && count <= max)
2258
2259done:
20effc67
TL
2260
2261 if (is_truncated) {
1adf2230 2262 *is_truncated = truncated;
20effc67 2263 }
1adf2230
AA
2264
2265 return 0;
2266} // list_objects_unordered
2267
7c673cae
FG
2268
2269/**
2270 * create a rados pool, associated meta info
2271 * returns 0 on success, -ERR# otherwise.
2272 */
b3b6e05e 2273int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
7c673cae 2274{
c07f9fc5 2275 librados::IoCtx io_ctx;
28e407b8 2276 constexpr bool create = true;
b3b6e05e 2277 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
2278}
2279
9f95a23c 2280void RGWRados::create_bucket_id(string *bucket_id)
7c673cae 2281{
9f95a23c
TL
2282 uint64_t iid = instance_id();
2283 uint64_t bid = next_bucket_id();
2284 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2285 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2286 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2287 *bucket_id = buf;
2288}
7c673cae 2289
11fdf7f2 2290int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
7c673cae 2291 const string& zonegroup_id,
11fdf7f2 2292 const rgw_placement_rule& placement_rule,
7c673cae
FG
2293 const string& swift_ver_location,
2294 const RGWQuotaInfo * pquota_info,
2295 map<std::string, bufferlist>& attrs,
2296 RGWBucketInfo& info,
2297 obj_version *pobjv,
2298 obj_version *pep_objv,
2299 real_time creation_time,
2300 rgw_bucket *pmaster_bucket,
2301 uint32_t *pmaster_num_shards,
f67539c2 2302 optional_yield y,
b3b6e05e 2303 const DoutPrefixProvider *dpp,
7c673cae
FG
2304 bool exclusive)
2305{
2306#define MAX_CREATE_RETRIES 20 /* need to bound retries */
11fdf7f2 2307 rgw_placement_rule selected_placement_rule;
7c673cae
FG
2308 RGWZonePlacementInfo rule_info;
2309
2310 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2311 int ret = 0;
b3b6e05e 2312 ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
f67539c2 2313 &selected_placement_rule, &rule_info, y);
7c673cae
FG
2314 if (ret < 0)
2315 return ret;
2316
2317 if (!pmaster_bucket) {
2318 create_bucket_id(&bucket.marker);
2319 bucket.bucket_id = bucket.marker;
2320 } else {
2321 bucket.marker = pmaster_bucket->marker;
2322 bucket.bucket_id = pmaster_bucket->bucket_id;
2323 }
2324
2325 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2326
9f95a23c
TL
2327 objv_tracker.read_version.clear();
2328
7c673cae
FG
2329 if (pobjv) {
2330 objv_tracker.write_version = *pobjv;
2331 } else {
2332 objv_tracker.generate_new_write_ver(cct);
2333 }
2334
2335 info.bucket = bucket;
2336 info.owner = owner.user_id;
2337 info.zonegroup = zonegroup_id;
11fdf7f2 2338 info.placement_rule = selected_placement_rule;
7c673cae
FG
2339 info.swift_ver_location = swift_ver_location;
2340 info.swift_versioning = (!swift_ver_location.empty());
f67539c2
TL
2341
2342 init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
2343 pmaster_num_shards ?
2344 std::optional{*pmaster_num_shards} :
2345 std::nullopt,
2346 rule_info.index_type);
2347
7c673cae
FG
2348 info.requester_pays = false;
2349 if (real_clock::is_zero(creation_time)) {
2350 info.creation_time = ceph::real_clock::now();
2351 } else {
2352 info.creation_time = creation_time;
2353 }
2354 if (pquota_info) {
2355 info.quota = *pquota_info;
2356 }
2357
b3b6e05e 2358 int r = svc.bi->init_index(dpp, info);
11fdf7f2
TL
2359 if (r < 0) {
2360 return r;
2361 }
7c673cae 2362
b3b6e05e 2363 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
9f95a23c
TL
2364 if (ret == -ECANCELED) {
2365 ret = -EEXIST;
2366 }
11fdf7f2 2367 if (ret == -EEXIST) {
11fdf7f2 2368 /* we need to reread the info and return it, caller will have a use for it */
9f95a23c
TL
2369 RGWBucketInfo orig_info;
2370 r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
11fdf7f2
TL
2371 if (r < 0) {
2372 if (r == -ENOENT) {
2373 continue;
2374 }
b3b6e05e 2375 ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
11fdf7f2
TL
2376 return r;
2377 }
7c673cae 2378
11fdf7f2 2379 /* only remove it if it's a different bucket instance */
9f95a23c 2380 if (orig_info.bucket.bucket_id != bucket.bucket_id) {
b3b6e05e 2381 int r = svc.bi->clean_index(dpp, info);
9f95a23c 2382 if (r < 0) {
b3b6e05e 2383 ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
9f95a23c 2384 }
b3b6e05e 2385 r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
9f95a23c 2386 if (r < 0) {
b3b6e05e 2387 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
9f95a23c
TL
2388 /* continue anyway */
2389 }
11fdf7f2 2390 }
9f95a23c
TL
2391
2392 info = std::move(orig_info);
2393 /* ret == -EEXIST here */
11fdf7f2 2394 }
7c673cae 2395 return ret;
7c673cae
FG
2396 }
2397
11fdf7f2 2398 /* this is highly unlikely */
b3b6e05e 2399 ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
11fdf7f2 2400 return -ENOENT;
7c673cae
FG
2401}
2402
20effc67 2403// returns true on success, false on failure
11fdf7f2 2404bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
7c673cae 2405{
11fdf7f2
TL
2406 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2407}
c07f9fc5 2408
11fdf7f2
TL
2409bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2410{
2411 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
c07f9fc5 2412
11fdf7f2 2413 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
7c673cae
FG
2414}
2415
20effc67
TL
2416std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
2417{
2418 return svc.rados->cluster_fsid();
2419}
2420
39ae355f
TL
2421int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp,
2422 const RGWBucketInfo& bucket_info,
2423 const rgw_obj& obj,
2424 librados::IoCtx *ioctx)
7c673cae 2425{
39ae355f 2426 std::string oid, key;
7c673cae
FG
2427 get_obj_bucket_and_oid_loc(obj, oid, key);
2428
2429 rgw_pool pool;
2430 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
39ae355f
TL
2431 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj <<
2432 ", probably misconfiguration" << dendl;
7c673cae
FG
2433 return -EIO;
2434 }
2435
b3b6e05e 2436 int r = open_pool_ctx(dpp, pool, *ioctx, false);
7c673cae 2437 if (r < 0) {
39ae355f
TL
2438 ldpp_dout(dpp, 0) << "ERROR: unable to open data-pool=" << pool.to_str() <<
2439 " for obj=" << obj << " with error-code=" << r << dendl;
7c673cae
FG
2440 return r;
2441 }
2442
2443 ioctx->locator_set_key(key);
2444
2445 return 0;
2446}
2447
20effc67
TL
2448int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
2449 const rgw_placement_rule& target_placement_rule,
2450 const rgw_obj& obj,
2451 rgw_rados_ref *ref)
7c673cae 2452{
11fdf7f2 2453 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
7c673cae
FG
2454
2455 rgw_pool pool;
20effc67 2456 if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
b3b6e05e 2457 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
7c673cae
FG
2458 return -EIO;
2459 }
2460
9f95a23c
TL
2461 ref->pool = svc.rados->pool(pool);
2462
b3b6e05e 2463 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
9f95a23c 2464 .set_mostly_omap(false));
7c673cae 2465 if (r < 0) {
b3b6e05e 2466 ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
7c673cae
FG
2467 return r;
2468 }
2469
9f95a23c 2470 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2471
2472 return 0;
2473}
2474
20effc67
TL
2475int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
2476 const RGWBucketInfo& bucket_info,
2477 const rgw_obj& obj,
2478 rgw_rados_ref *ref)
2479{
2480 return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
2481}
2482
b3b6e05e 2483int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2484{
11fdf7f2 2485 ref->obj = obj;
7c673cae 2486
11fdf7f2
TL
2487 if (ref->obj.oid.empty()) {
2488 ref->obj.oid = obj.pool.to_str();
2489 ref->obj.pool = svc.zone->get_zone_params().domain_root;
7c673cae 2490 }
9f95a23c 2491 ref->pool = svc.rados->pool(obj.pool);
b3b6e05e 2492 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
9f95a23c
TL
2493 .set_mostly_omap(false));
2494 if (r < 0) {
b3b6e05e 2495 ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
7c673cae 2496 return r;
9f95a23c 2497 }
7c673cae 2498
9f95a23c 2499 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2500
2501 return 0;
2502}
2503
b3b6e05e 2504int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2505{
b3b6e05e 2506 return get_raw_obj_ref(dpp, obj, ref);
7c673cae
FG
2507}
2508
2509/*
2510 * fixes an issue where head objects were supposed to have a locator created, but ended
2511 * up without one
2512 */
b3b6e05e 2513int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
7c673cae
FG
2514{
2515 const rgw_bucket& bucket = bucket_info.bucket;
2516 string oid;
2517 string locator;
2518
2519 rgw_obj obj(bucket, key);
2520
2521 get_obj_bucket_and_oid_loc(obj, oid, locator);
2522
2523 if (locator.empty()) {
b3b6e05e 2524 ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
7c673cae
FG
2525 return 0;
2526 }
2527
2528 librados::IoCtx ioctx;
2529
b3b6e05e 2530 int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
7c673cae
FG
2531 if (ret < 0) {
2532 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2533 return ret;
2534 }
2535 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2536
2537 uint64_t size;
2538 bufferlist data;
2539
2540 struct timespec mtime_ts;
2541 map<string, bufferlist> attrs;
2542 librados::ObjectReadOperation op;
2543 op.getxattrs(&attrs, NULL);
2544 op.stat2(&size, &mtime_ts, NULL);
2545#define HEAD_SIZE 512 * 1024
2546 op.read(0, HEAD_SIZE, &data, NULL);
2547
b3b6e05e 2548 ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
7c673cae 2549 if (ret < 0) {
b3b6e05e 2550 ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
7c673cae
FG
2551 return ret;
2552 }
2553
2554 if (size > HEAD_SIZE) {
b3b6e05e 2555 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
7c673cae
FG
2556 return -EIO;
2557 }
2558
2559 if (size != data.length()) {
b3b6e05e 2560 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
7c673cae
FG
2561 return -EIO;
2562 }
2563
2564 if (copy_obj) {
2565 librados::ObjectWriteOperation wop;
2566
2567 wop.mtime2(&mtime_ts);
2568
2569 map<string, bufferlist>::iterator iter;
2570 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
2571 wop.setxattr(iter->first.c_str(), iter->second);
2572 }
2573
2574 wop.write(0, data);
2575
2576 ioctx.locator_set_key(locator);
b3b6e05e 2577 rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
7c673cae
FG
2578 }
2579
2580 if (remove_bad) {
2581 ioctx.locator_set_key(string());
2582
2583 ret = ioctx.remove(oid);
2584 if (ret < 0) {
b3b6e05e 2585 ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
7c673cae
FG
2586 return ret;
2587 }
2588 }
2589
2590 return 0;
2591}
2592
20effc67 2593int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
b3b6e05e 2594 librados::IoCtx& src_ioctx,
7c673cae
FG
2595 const string& src_oid, const string& src_locator,
2596 librados::IoCtx& dst_ioctx,
2597 const string& dst_oid, const string& dst_locator)
2598{
2599
2600#define COPY_BUF_SIZE (4 * 1024 * 1024)
2601 bool done = false;
2602 uint64_t chunk_size = COPY_BUF_SIZE;
2603 uint64_t ofs = 0;
2604 int ret = 0;
2605 real_time mtime;
2606 struct timespec mtime_ts;
2607 uint64_t size;
2608
2609 if (src_oid == dst_oid && src_locator == dst_locator) {
2610 return 0;
2611 }
2612
2613 src_ioctx.locator_set_key(src_locator);
2614 dst_ioctx.locator_set_key(dst_locator);
2615
2616 do {
2617 bufferlist data;
2618 ObjectReadOperation rop;
2619 ObjectWriteOperation wop;
2620
2621 if (ofs == 0) {
2622 rop.stat2(&size, &mtime_ts, NULL);
2623 mtime = real_clock::from_timespec(mtime_ts);
2624 }
2625 rop.read(ofs, chunk_size, &data, NULL);
b3b6e05e 2626 ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
7c673cae
FG
2627 if (ret < 0) {
2628 goto done_err;
2629 }
2630
2631 if (data.length() == 0) {
2632 break;
2633 }
2634
2635 if (ofs == 0) {
2636 wop.create(true); /* make it exclusive */
2637 wop.mtime2(&mtime_ts);
2638 mtime = real_clock::from_timespec(mtime_ts);
2639 }
2640 wop.write(ofs, data);
b3b6e05e 2641 ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
11fdf7f2
TL
2642 if (ret < 0) {
2643 goto done_err;
2644 }
7c673cae
FG
2645 ofs += data.length();
2646 done = data.length() != chunk_size;
2647 } while (!done);
2648
2649 if (ofs != size) {
b3b6e05e 2650 ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
7c673cae
FG
2651 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
2652 ret = -EIO;
2653 goto done_err;
2654 }
2655
2656 src_ioctx.remove(src_oid);
2657
2658 return 0;
2659
2660done_err:
11fdf7f2 2661 // TODO: clean up dst_oid if we created it
b3b6e05e 2662 ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
7c673cae
FG
2663 return ret;
2664}
2665
2666/*
2667 * fixes an issue where head objects were supposed to have a locator created, but ended
2668 * up without one
2669 */
b3b6e05e 2670int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y)
7c673cae
FG
2671{
2672 const rgw_bucket& bucket = bucket_info.bucket;
2673 rgw_obj obj(bucket, key);
2674
2675 if (need_fix) {
2676 *need_fix = false;
2677 }
2678
2679 rgw_rados_ref ref;
b3b6e05e 2680 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
2681 if (r < 0) {
2682 return r;
2683 }
2684
2685 RGWObjState *astate = NULL;
9f95a23c 2686 RGWObjectCtx rctx(this->store);
b3b6e05e 2687 r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
2688 if (r < 0)
2689 return r;
2690
9f95a23c 2691 if (astate->manifest) {
7c673cae 2692 RGWObjManifest::obj_iterator miter;
9f95a23c 2693 RGWObjManifest& manifest = *astate->manifest;
b3b6e05e 2694 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
f67539c2 2695 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store);
7c673cae
FG
2696 rgw_obj loc;
2697 string oid;
2698 string locator;
2699
9f95a23c 2700 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
7c673cae
FG
2701
2702 if (loc.key.ns.empty()) {
2703 /* continue, we're only interested in tail objects */
2704 continue;
2705 }
2706
9f95a23c
TL
2707 auto& ioctx = ref.pool.ioctx();
2708
7c673cae 2709 get_obj_bucket_and_oid_loc(loc, oid, locator);
9f95a23c 2710 ref.pool.ioctx().locator_set_key(locator);
7c673cae 2711
b3b6e05e 2712 ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
7c673cae 2713
9f95a23c 2714 r = ioctx.stat(oid, NULL, NULL);
7c673cae
FG
2715 if (r != -ENOENT) {
2716 continue;
2717 }
2718
2719 string bad_loc;
2720 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
2721
2722 /* create a new ioctx with the bad locator */
2723 librados::IoCtx src_ioctx;
9f95a23c 2724 src_ioctx.dup(ioctx);
7c673cae
FG
2725 src_ioctx.locator_set_key(bad_loc);
2726
2727 r = src_ioctx.stat(oid, NULL, NULL);
2728 if (r != 0) {
2729 /* cannot find a broken part */
2730 continue;
2731 }
b3b6e05e 2732 ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
7c673cae
FG
2733 if (need_fix) {
2734 *need_fix = true;
2735 }
2736 if (fix) {
b3b6e05e 2737 r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
7c673cae 2738 if (r < 0) {
b3b6e05e 2739 ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
7c673cae
FG
2740 }
2741 }
2742 }
2743 }
2744
2745 return 0;
2746}
2747
f64942e4
AA
2748int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2749 const rgw_obj& obj,
b3b6e05e
TL
2750 RGWBucketInfo* bucket_info_out,
2751 const DoutPrefixProvider *dpp)
7c673cae
FG
2752{
2753 bucket = _bucket;
2754
11fdf7f2 2755 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
2756
2757 RGWBucketInfo bucket_info;
f64942e4
AA
2758 RGWBucketInfo* bucket_info_p =
2759 bucket_info_out ? bucket_info_out : &bucket_info;
2760
b3b6e05e 2761 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
7c673cae
FG
2762 if (ret < 0) {
2763 return ret;
2764 }
2765
9f95a23c
TL
2766 string oid;
2767
b3b6e05e 2768 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae 2769 if (ret < 0) {
b3b6e05e 2770 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
7c673cae
FG
2771 return ret;
2772 }
b3b6e05e 2773 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2774
2775 return 0;
2776}
2777
f64942e4 2778int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
f67539c2 2779 int sid, const rgw::bucket_index_layout_generation& idx_layout,
b3b6e05e
TL
2780 RGWBucketInfo* bucket_info_out,
2781 const DoutPrefixProvider *dpp)
7c673cae
FG
2782{
2783 bucket = _bucket;
2784 shard_id = sid;
2785
11fdf7f2 2786 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 2787
f67539c2 2788
7c673cae 2789 RGWBucketInfo bucket_info;
f64942e4
AA
2790 RGWBucketInfo* bucket_info_p =
2791 bucket_info_out ? bucket_info_out : &bucket_info;
b3b6e05e 2792 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
7c673cae
FG
2793 if (ret < 0) {
2794 return ret;
2795 }
2796
9f95a23c
TL
2797 string oid;
2798
b3b6e05e 2799 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, shard_id, idx_layout, &bucket_obj);
7c673cae 2800 if (ret < 0) {
b3b6e05e 2801 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
7c673cae
FG
2802 return ret;
2803 }
b3b6e05e 2804 ldpp_dout(dpp, 20) << " bucket index oid: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2805
2806 return 0;
2807}
2808
b3b6e05e 2809int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
a8e16298
TL
2810 const rgw_obj& obj)
2811{
2812 bucket = bucket_info.bucket;
2813
b3b6e05e 2814 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
9f95a23c
TL
2815 obj.get_hash_object(),
2816 &bucket_obj,
2817 &shard_id);
a8e16298 2818 if (ret < 0) {
b3b6e05e 2819 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
a8e16298
TL
2820 return ret;
2821 }
b3b6e05e 2822 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
a8e16298
TL
2823
2824 return 0;
2825}
2826
b3b6e05e 2827int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int sid)
b32b8144
FG
2828{
2829 bucket = bucket_info.bucket;
2830 shard_id = sid;
2831
b3b6e05e 2832 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id, idx_layout, &bucket_obj);
b32b8144 2833 if (ret < 0) {
b3b6e05e 2834 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
b32b8144
FG
2835 return ret;
2836 }
b3b6e05e 2837 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
b32b8144
FG
2838
2839 return 0;
2840}
2841
7c673cae
FG
2842
2843/* Execute @handler on last item in bucket listing for bucket specified
2844 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2845 * to objects matching these criterias. */
20effc67 2846int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
b3b6e05e 2847 RGWBucketInfo& bucket_info,
7c673cae
FG
2848 const std::string& obj_prefix,
2849 const std::string& obj_delim,
2850 std::function<int(const rgw_bucket_dir_entry&)> handler)
2851{
2852 RGWRados::Bucket target(this, bucket_info);
2853 RGWRados::Bucket::List list_op(&target);
2854
2855 list_op.params.prefix = obj_prefix;
2856 list_op.params.delim = obj_delim;
2857
b3b6e05e 2858 ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
7c673cae
FG
2859 << ", obj_prefix=" << obj_prefix
2860 << ", obj_delim=" << obj_delim
2861 << dendl;
2862
2863 bool is_truncated = false;
2864
2865 boost::optional<rgw_bucket_dir_entry> last_entry;
2866 /* We need to rewind to the last object in a listing. */
2867 do {
2868 /* List bucket entries in chunks. */
2869 static constexpr int MAX_LIST_OBJS = 100;
2870 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
2871
b3b6e05e 2872 int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
9f95a23c 2873 &is_truncated, null_yield);
7c673cae
FG
2874 if (ret < 0) {
2875 return ret;
2876 } else if (!entries.empty()) {
2877 last_entry = entries.back();
2878 }
2879 } while (is_truncated);
2880
2881 if (last_entry) {
2882 return handler(*last_entry);
2883 }
2884
2885 /* Empty listing - no items we can run handler on. */
2886 return 0;
2887}
2888
20effc67 2889bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const
f67539c2
TL
2890{
2891 return bucket->get_info().has_swift_versioning() &&
2892 bucket->get_info().swift_ver_location.size();
2893}
7c673cae
FG
2894
2895int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
2896 const rgw_user& user,
20effc67
TL
2897 rgw::sal::Bucket* bucket,
2898 rgw::sal::Object* obj,
9f95a23c
TL
2899 const DoutPrefixProvider *dpp,
2900 optional_yield y)
7c673cae 2901{
f67539c2 2902 if (! swift_versioning_enabled(bucket)) {
7c673cae
FG
2903 return 0;
2904 }
2905
f67539c2 2906 obj->set_atomic(&obj_ctx);
7c673cae
FG
2907
2908 RGWObjState * state = nullptr;
b3b6e05e 2909 int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj->get_obj(), &state, false, y);
7c673cae
FG
2910 if (r < 0) {
2911 return r;
2912 }
2913
2914 if (!state->exists) {
2915 return 0;
2916 }
2917
f67539c2 2918 const string& src_name = obj->get_oid();
7c673cae
FG
2919 char buf[src_name.size() + 32];
2920 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
2921 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
2922 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
2923
2924 RGWBucketInfo dest_bucket_info;
2925
f67539c2 2926 r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
7c673cae 2927 if (r < 0) {
b3b6e05e 2928 ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
7c673cae
FG
2929 if (r == -ENOENT) {
2930 return -ERR_PRECONDITION_FAILED;
2931 }
2932 return r;
2933 }
2934
f67539c2 2935 if (dest_bucket_info.owner != bucket->get_info().owner) {
7c673cae
FG
2936 return -ERR_PRECONDITION_FAILED;
2937 }
2938
20effc67
TL
2939 rgw::sal::RadosBucket dest_bucket(store, dest_bucket_info);
2940 rgw::sal::RadosObject dest_obj(store, rgw_obj_key(buf), &dest_bucket);
11fdf7f2
TL
2941
2942 if (dest_bucket_info.versioning_enabled()){
f67539c2 2943 dest_obj.gen_rand_obj_instance_name();
11fdf7f2
TL
2944 }
2945
f67539c2 2946 dest_obj.set_atomic(&obj_ctx);
7c673cae 2947
9f95a23c 2948 rgw_zone_id no_zone;
7c673cae
FG
2949
2950 r = copy_obj(obj_ctx,
2951 user,
7c673cae
FG
2952 NULL, /* req_info *info */
2953 no_zone,
f67539c2 2954 &dest_obj,
7c673cae 2955 obj,
f67539c2
TL
2956 &dest_bucket,
2957 bucket,
2958 bucket->get_placement_rule(),
7c673cae
FG
2959 NULL, /* time_t *src_mtime */
2960 NULL, /* time_t *mtime */
2961 NULL, /* const time_t *mod_ptr */
2962 NULL, /* const time_t *unmod_ptr */
2963 false, /* bool high_precision_time */
2964 NULL, /* const char *if_match */
2965 NULL, /* const char *if_nomatch */
2966 RGWRados::ATTRSMOD_NONE,
2967 true, /* bool copy_if_newer */
2968 state->attrset,
11fdf7f2 2969 RGWObjCategory::Main,
7c673cae
FG
2970 0, /* uint64_t olh_epoch */
2971 real_time(), /* time_t delete_at */
2972 NULL, /* string *version_id */
2973 NULL, /* string *ptag */
2974 NULL, /* string *petag */
7c673cae 2975 NULL, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2976 NULL, /* void *progress_data */
2977 dpp,
2978 null_yield);
7c673cae
FG
2979 if (r == -ECANCELED || r == -ENOENT) {
2980 /* Has already been overwritten, meaning another rgw process already
2981 * copied it out */
2982 return 0;
2983 }
2984
2985 return r;
2986}
2987
9f95a23c 2988int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
7c673cae 2989 const rgw_user& user,
20effc67
TL
2990 rgw::sal::Bucket* bucket,
2991 rgw::sal::Object* obj,
9f95a23c
TL
2992 bool& restored, /* out */
2993 const DoutPrefixProvider *dpp)
7c673cae 2994{
f67539c2 2995 if (! swift_versioning_enabled(bucket)) {
7c673cae
FG
2996 return 0;
2997 }
2998
2999 /* Bucket info of the bucket that stores previous versions of our object. */
3000 RGWBucketInfo archive_binfo;
3001
f67539c2
TL
3002 int ret = get_bucket_info(&svc, bucket->get_tenant(),
3003 bucket->get_info().swift_ver_location,
3004 archive_binfo, nullptr, null_yield, nullptr);
7c673cae
FG
3005 if (ret < 0) {
3006 return ret;
3007 }
3008
3009 /* Abort the operation if the bucket storing our archive belongs to someone
3010 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3011 * into consideration. For we can live with that.
3012 *
3013 * TODO: delegate this check to un upper layer and compare with ACLs. */
f67539c2 3014 if (bucket->get_info().owner != archive_binfo.owner) {
7c673cae
FG
3015 return -EPERM;
3016 }
3017
3018 /* This code will be executed on latest version of the object. */
3019 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
9f95a23c 3020 rgw_zone_id no_zone;
7c673cae
FG
3021
3022 /* We don't support object versioning of Swift API on those buckets that
3023 * are already versioned using the S3 mechanism. This affects also bucket
3024 * storing archived objects. Otherwise the delete operation would create
3025 * a deletion marker. */
3026 if (archive_binfo.versioned()) {
3027 restored = false;
3028 return -ERR_PRECONDITION_FAILED;
3029 }
3030
3031 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3032 * irrelevant and may be safely skipped. */
3033 std::map<std::string, ceph::bufferlist> no_attrs;
3034
20effc67
TL
3035 rgw::sal::RadosBucket archive_bucket(store, archive_binfo);
3036 rgw::sal::RadosObject archive_obj(store, entry.key, &archive_bucket);
11fdf7f2 3037
f67539c2
TL
3038 if (bucket->versioning_enabled()){
3039 obj->gen_rand_obj_instance_name();
11fdf7f2
TL
3040 }
3041
f67539c2
TL
3042 archive_obj.set_atomic(&obj_ctx);
3043 obj->set_atomic(&obj_ctx);
7c673cae
FG
3044
3045 int ret = copy_obj(obj_ctx,
3046 user,
7c673cae
FG
3047 nullptr, /* req_info *info */
3048 no_zone,
3049 obj, /* dest obj */
f67539c2
TL
3050 &archive_obj, /* src obj */
3051 bucket, /* dest bucket info */
3052 &archive_bucket, /* src bucket info */
3053 bucket->get_placement_rule(), /* placement_rule */
7c673cae
FG
3054 nullptr, /* time_t *src_mtime */
3055 nullptr, /* time_t *mtime */
3056 nullptr, /* const time_t *mod_ptr */
3057 nullptr, /* const time_t *unmod_ptr */
3058 false, /* bool high_precision_time */
3059 nullptr, /* const char *if_match */
3060 nullptr, /* const char *if_nomatch */
3061 RGWRados::ATTRSMOD_NONE,
3062 true, /* bool copy_if_newer */
3063 no_attrs,
11fdf7f2 3064 RGWObjCategory::Main,
7c673cae
FG
3065 0, /* uint64_t olh_epoch */
3066 real_time(), /* time_t delete_at */
3067 nullptr, /* string *version_id */
3068 nullptr, /* string *ptag */
3069 nullptr, /* string *petag */
7c673cae 3070 nullptr, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
3071 nullptr, /* void *progress_data */
3072 dpp,
3073 null_yield);
7c673cae
FG
3074 if (ret == -ECANCELED || ret == -ENOENT) {
3075 /* Has already been overwritten, meaning another rgw process already
3076 * copied it out */
3077 return 0;
3078 } else if (ret < 0) {
3079 return ret;
3080 } else {
3081 restored = true;
3082 }
3083
3084 /* Need to remove the archived copy. */
b3b6e05e 3085 ret = delete_obj(dpp, obj_ctx, archive_binfo, archive_obj.get_obj(),
7c673cae
FG
3086 archive_binfo.versioning_status());
3087
3088 return ret;
3089 };
3090
f67539c2 3091 const std::string& obj_name = obj->get_oid();
7c673cae
FG
3092 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
3093 % obj_name);
3094
b3b6e05e 3095 return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
7c673cae
FG
3096 handler);
3097}
3098
b3b6e05e
TL
3099int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
3100 uint64_t size, uint64_t accounted_size,
181888fb
FG
3101 map<string, bufferlist>& attrs,
3102 bool assume_noent, bool modify_tail,
9f95a23c 3103 void *_index_op, optional_yield y)
7c673cae
FG
3104{
3105 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
3106 RGWRados *store = target->get_store();
3107
3108 ObjectWriteOperation op;
11fdf7f2
TL
3109#ifdef WITH_LTTNG
3110 const struct req_state* s = get_req_state();
3111 string req_id;
3112 if (!s) {
3113 // fake req_id
3114 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
3115 } else {
3116 req_id = s->req_id;
3117 }
3118#endif
7c673cae
FG
3119
3120 RGWObjState *state;
b3b6e05e 3121 int r = target->get_state(dpp, &state, false, y, assume_noent);
7c673cae
FG
3122 if (r < 0)
3123 return r;
3124
3125 rgw_obj& obj = target->get_obj();
3126
3127 if (obj.get_oid().empty()) {
b3b6e05e 3128 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
7c673cae
FG
3129 return -EIO;
3130 }
3131
224ce89b 3132 rgw_rados_ref ref;
20effc67 3133 r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
7c673cae
FG
3134 if (r < 0)
3135 return r;
3136
3137 bool is_olh = state->is_olh;
3138
3139 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
3140
3141 const string *ptag = meta.ptag;
3142 if (!ptag && !index_op->get_optag()->empty()) {
3143 ptag = index_op->get_optag();
3144 }
b3b6e05e 3145 r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
7c673cae
FG
3146 if (r < 0)
3147 return r;
3148
3149 if (real_clock::is_zero(meta.set_mtime)) {
3150 meta.set_mtime = real_clock::now();
3151 }
3152
eafe8130
TL
3153 if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
3154 auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
3155 if (iter == attrs.end()) {
3156 real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
3157 string mode = target->bucket_info.obj_lock.get_mode();
3158 RGWObjectRetention obj_retention(mode, lock_until_date);
3159 bufferlist bl;
3160 obj_retention.encode(bl);
3161 op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
3162 }
3163 }
3164
7c673cae
FG
3165 if (state->is_olh) {
3166 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3167 }
3168
3169 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3170 op.mtime2(&mtime_ts);
3171
3172 if (meta.data) {
3173 /* if we want to overwrite the data, we also want to overwrite the
3174 xattrs, so just remove the object */
3175 op.write_full(*meta.data);
20effc67
TL
3176 if (state->compressed) {
3177 uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
3178 op.set_alloc_hint2(0, 0, alloc_hint_flags);
3179 }
7c673cae
FG
3180 }
3181
3182 string etag;
3183 string content_type;
3184 bufferlist acl_bl;
11fdf7f2 3185 string storage_class;
7c673cae
FG
3186
3187 map<string, bufferlist>::iterator iter;
3188 if (meta.rmattrs) {
3189 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3190 const string& name = iter->first;
3191 op.rmxattr(name.c_str());
3192 }
3193 }
3194
3195 if (meta.manifest) {
11fdf7f2
TL
3196 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3197
7c673cae
FG
3198 /* remove existing manifest attr */
3199 iter = attrs.find(RGW_ATTR_MANIFEST);
3200 if (iter != attrs.end())
3201 attrs.erase(iter);
3202
3203 bufferlist bl;
11fdf7f2 3204 encode(*meta.manifest, bl);
7c673cae
FG
3205 op.setxattr(RGW_ATTR_MANIFEST, bl);
3206 }
3207
3208 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3209 const string& name = iter->first;
3210 bufferlist& bl = iter->second;
3211
3212 if (!bl.length())
3213 continue;
3214
3215 op.setxattr(name.c_str(), bl);
3216
3217 if (name.compare(RGW_ATTR_ETAG) == 0) {
11fdf7f2 3218 etag = rgw_bl_str(bl);
7c673cae 3219 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
11fdf7f2 3220 content_type = rgw_bl_str(bl);
7c673cae
FG
3221 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3222 acl_bl = bl;
3223 }
3224 }
3225 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3226 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3227 }
3228
3229 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3230 bufferlist bl;
11fdf7f2 3231 encode(store->svc.zone->get_zone_short_id(), bl);
7c673cae
FG
3232 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3233 }
3234
11fdf7f2
TL
3235 if (!storage_class.empty()) {
3236 bufferlist bl;
3237 bl.append(storage_class);
3238 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3239 }
3240
7c673cae
FG
3241 if (!op.size())
3242 return 0;
3243
3244 uint64_t epoch;
3245 int64_t poolid;
224ce89b
WB
3246 bool orig_exists;
3247 uint64_t orig_size;
3248
3249 if (!reset_obj) { //Multipart upload, it has immutable head.
3250 orig_exists = false;
3251 orig_size = 0;
3252 } else {
3253 orig_exists = state->exists;
3254 orig_size = state->accounted_size;
3255 }
7c673cae 3256
91327a77
AA
3257 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3258 !obj.key.instance.empty();
7c673cae
FG
3259
3260 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3261
3262 if (versioned_op) {
3263 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3264 }
3265
3266 if (!index_op->is_prepared()) {
11fdf7f2 3267 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
b3b6e05e 3268 r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
11fdf7f2 3269 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
7c673cae
FG
3270 if (r < 0)
3271 return r;
3272 }
3273
9f95a23c
TL
3274 auto& ioctx = ref.pool.ioctx();
3275
11fdf7f2 3276 tracepoint(rgw_rados, operate_enter, req_id.c_str());
b3b6e05e 3277 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
11fdf7f2 3278 tracepoint(rgw_rados, operate_exit, req_id.c_str());
7c673cae
FG
3279 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3280 or -ENOENT if was removed, or -EEXIST if it did not exist
3281 before and now it does */
3282 if (r == -EEXIST && assume_noent) {
3283 target->invalidate_state();
3284 return r;
3285 }
3286 goto done_cancel;
3287 }
3288
9f95a23c
TL
3289 epoch = ioctx.get_last_version();
3290 poolid = ioctx.get_id();
7c673cae 3291
b3b6e05e 3292 r = target->complete_atomic_modification(dpp);
7c673cae 3293 if (r < 0) {
b3b6e05e 3294 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7c673cae
FG
3295 }
3296
11fdf7f2 3297 tracepoint(rgw_rados, complete_enter, req_id.c_str());
b3b6e05e 3298 r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
11fdf7f2
TL
3299 meta.set_mtime, etag, content_type,
3300 storage_class, &acl_bl,
3301 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3302 tracepoint(rgw_rados, complete_exit, req_id.c_str());
7c673cae
FG
3303 if (r < 0)
3304 goto done_cancel;
3305
3306 if (meta.mtime) {
3307 *meta.mtime = meta.set_mtime;
3308 }
3309
3310 /* note that index_op was using state so we couldn't invalidate it earlier */
3311 target->invalidate_state();
3312 state = NULL;
3313
91327a77 3314 if (versioned_op && meta.olh_epoch) {
b3b6e05e 3315 r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
7c673cae
FG
3316 if (r < 0) {
3317 return r;
3318 }
3319 }
3320
3321 if (!real_clock::is_zero(meta.delete_at)) {
3322 rgw_obj_index_key obj_key;
3323 obj.key.get_index_key(&obj_key);
3324
b3b6e05e 3325 r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
9f95a23c 3326 obj.bucket.bucket_id, obj_key);
7c673cae 3327 if (r < 0) {
b3b6e05e 3328 ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7c673cae
FG
3329 /* ignoring error, nothing we can do at this point */
3330 }
3331 }
3332 meta.canceled = false;
3333
3334 /* update quota cache */
3efd9988
FG
3335 if (meta.completeMultipart){
3336 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3337 0, orig_size);
3338 }
3339 else {
3340 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3341 accounted_size, orig_size);
3342 }
7c673cae
FG
3343 return 0;
3344
3345done_cancel:
20effc67 3346 int ret = index_op->cancel(dpp, meta.remove_objs);
7c673cae 3347 if (ret < 0) {
b3b6e05e 3348 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7c673cae
FG
3349 }
3350
3351 meta.canceled = true;
3352
3353 /* we lost in a race. There are a few options:
3354 * - existing object was rewritten (ECANCELED)
3355 * - non existing object was created (EEXIST)
3356 * - object was removed (ENOENT)
3357 * should treat it as a success
3358 */
3359 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3360 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3361 r = 0;
3362 }
3363 } else {
3364 if (meta.if_match != NULL) {
3365 // only overwrite existing object
3366 if (strcmp(meta.if_match, "*") == 0) {
3367 if (r == -ENOENT) {
3368 r = -ERR_PRECONDITION_FAILED;
3369 } else if (r == -ECANCELED) {
3370 r = 0;
3371 }
3372 }
3373 }
3374
3375 if (meta.if_nomatch != NULL) {
3376 // only create a new object
3377 if (strcmp(meta.if_nomatch, "*") == 0) {
3378 if (r == -EEXIST) {
3379 r = -ERR_PRECONDITION_FAILED;
3380 } else if (r == -ENOENT) {
3381 r = 0;
3382 }
3383 }
3384 }
3385 }
3386
3387 return r;
3388}
3389
b3b6e05e 3390int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
9f95a23c 3391 map<string, bufferlist>& attrs, optional_yield y)
7c673cae
FG
3392{
3393 RGWBucketInfo& bucket_info = target->get_bucket_info();
3394
3395 RGWRados::Bucket bop(target->get_store(), bucket_info);
3396 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
3397 index_op.set_zones_trace(meta.zones_trace);
3398
7c673cae
FG
3399 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3400 int r;
3401 if (assume_noent) {
b3b6e05e 3402 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3403 if (r == -EEXIST) {
3404 assume_noent = false;
3405 }
3406 }
3407 if (!assume_noent) {
b3b6e05e 3408 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3409 }
3410 return r;
3411}
3412
11fdf7f2 3413class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
7c673cae 3414{
b3b6e05e 3415 const DoutPrefixProvider *dpp;
7c673cae
FG
3416 CephContext* cct;
3417 rgw_obj obj;
20effc67 3418 rgw::sal::DataProcessor *filter;
7c673cae 3419 boost::optional<RGWPutObj_Compress>& compressor;
adb31ebb
TL
3420 bool try_etag_verify;
3421 rgw::putobj::etag_verifier_ptr etag_verifier;
11fdf7f2 3422 boost::optional<rgw::putobj::ChunkProcessor> buffering;
7c673cae 3423 CompressorRef& plugin;
20effc67 3424 rgw::sal::ObjectProcessor *processor;
7c673cae
FG
3425 void (*progress_cb)(off_t, void *);
3426 void *progress_data;
adb31ebb
TL
3427 bufferlist extra_data_bl, manifest_bl;
3428 std::optional<RGWCompressionInfo> compression_info;
11fdf7f2
TL
3429 uint64_t extra_data_left{0};
3430 bool need_to_process_attrs{true};
3431 uint64_t data_len{0};
7c673cae 3432 map<string, bufferlist> src_attrs;
11fdf7f2
TL
3433 uint64_t ofs{0};
3434 uint64_t lofs{0}; /* logical ofs */
9f95a23c 3435 std::function<int(map<string, bufferlist>&)> attrs_handler;
20effc67 3436
7c673cae 3437public:
20effc67 3438 RGWRadosPutObj(const DoutPrefixProvider *dpp,
b3b6e05e 3439 CephContext* cct,
7c673cae
FG
3440 CompressorRef& plugin,
3441 boost::optional<RGWPutObj_Compress>& compressor,
20effc67 3442 rgw::sal::ObjectProcessor *p,
7c673cae 3443 void (*_progress_cb)(off_t, void *),
11fdf7f2 3444 void *_progress_data,
9f95a23c 3445 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
b3b6e05e 3446 dpp(dpp),
7c673cae
FG
3447 cct(cct),
3448 filter(p),
3449 compressor(compressor),
adb31ebb 3450 try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
7c673cae
FG
3451 plugin(plugin),
3452 processor(p),
7c673cae
FG
3453 progress_cb(_progress_cb),
3454 progress_data(_progress_data),
11fdf7f2 3455 attrs_handler(_attrs_handler) {}
7c673cae 3456
20effc67 3457
7c673cae
FG
3458 int process_attrs(void) {
3459 if (extra_data_bl.length()) {
3460 JSONParser jp;
3461 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
b3b6e05e 3462 ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7c673cae
FG
3463 return -EIO;
3464 }
3465
3466 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3467
adb31ebb
TL
3468 auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
3469 if (iter != src_attrs.end()) {
3470 const bufferlist bl = std::move(iter->second);
3471 src_attrs.erase(iter); // don't preserve source compression info
3472
3473 if (try_etag_verify) {
3474 // if we're trying to verify etags, we need to convert compressed
3475 // ranges in the manifest back into logical multipart part offsets
3476 RGWCompressionInfo info;
3477 bool compressed = false;
3478 int r = rgw_compression_info_from_attr(bl, compressed, info);
3479 if (r < 0) {
b3b6e05e 3480 ldpp_dout(dpp, 4) << "failed to decode compression info, "
adb31ebb
TL
3481 "disabling etag verification" << dendl;
3482 try_etag_verify = false;
3483 } else if (compressed) {
3484 compression_info = std::move(info);
3485 }
3486 }
3487 }
3488 /* We need the manifest to recompute the ETag for verification */
3489 iter = src_attrs.find(RGW_ATTR_MANIFEST);
3490 if (iter != src_attrs.end()) {
3491 manifest_bl = std::move(iter->second);
3492 src_attrs.erase(iter);
3493 }
a8e16298
TL
3494
3495 // filter out olh attributes
adb31ebb 3496 iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
a8e16298
TL
3497 while (iter != src_attrs.end()) {
3498 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3499 break;
3500 }
3501 iter = src_attrs.erase(iter);
3502 }
7c673cae
FG
3503 }
3504
11fdf7f2
TL
3505 int ret = attrs_handler(src_attrs);
3506 if (ret < 0) {
3507 return ret;
3508 }
3509
7c673cae
FG
3510 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3511 //do not compress if object is encrypted
3512 compressor = boost::in_place(cct, plugin, filter);
11fdf7f2
TL
3513 // add a filter that buffers data so we don't try to compress tiny blocks.
3514 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3515 // compression ratio
28e407b8
AA
3516 constexpr unsigned buffer_size = 512 * 1024;
3517 buffering = boost::in_place(&*compressor, buffer_size);
3518 filter = &*buffering;
7c673cae 3519 }
11fdf7f2 3520
adb31ebb
TL
3521 /*
3522 * Presently we don't support ETag based verification if encryption is
3523 * requested. We can enable simultaneous support once we have a mechanism
3524 * to know the sequence in which the filters must be applied.
3525 */
3526 if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
b3b6e05e 3527 ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
adb31ebb
TL
3528 compression_info,
3529 etag_verifier);
3530 if (ret < 0) {
b3b6e05e 3531 ldpp_dout(dpp, 4) << "failed to initial etag verifier, "
adb31ebb
TL
3532 "disabling etag verification" << dendl;
3533 } else {
3534 filter = etag_verifier.get();
3535 }
3536 }
3537
11fdf7f2
TL
3538 need_to_process_attrs = false;
3539
7c673cae
FG
3540 return 0;
3541 }
3542
11fdf7f2 3543 int handle_data(bufferlist& bl, bool *pause) override {
7c673cae 3544 if (progress_cb) {
11fdf7f2 3545 progress_cb(data_len, progress_data);
7c673cae 3546 }
b32b8144 3547 if (extra_data_left) {
11fdf7f2 3548 uint64_t extra_len = bl.length();
b32b8144
FG
3549 if (extra_len > extra_data_left)
3550 extra_len = extra_data_left;
7c673cae
FG
3551
3552 bufferlist extra;
3553 bl.splice(0, extra_len, &extra);
3554 extra_data_bl.append(extra);
3555
b32b8144
FG
3556 extra_data_left -= extra_len;
3557 if (extra_data_left == 0) {
7c673cae
FG
3558 int res = process_attrs();
3559 if (res < 0)
3560 return res;
3561 }
11fdf7f2 3562 ofs += extra_len;
7c673cae
FG
3563 if (bl.length() == 0) {
3564 return 0;
3565 }
3566 }
11fdf7f2
TL
3567 if (need_to_process_attrs) {
3568 /* need to call process_attrs() even if we don't get any attrs,
3569 * need it to call attrs_handler().
3570 */
3571 int res = process_attrs();
3572 if (res < 0) {
3573 return res;
3574 }
3575 }
7c673cae 3576
11fdf7f2 3577 ceph_assert(uint64_t(ofs) >= extra_data_len);
7c673cae 3578
11fdf7f2
TL
3579 uint64_t size = bl.length();
3580 ofs += size;
7c673cae 3581
11fdf7f2
TL
3582 const uint64_t lofs = data_len;
3583 data_len += size;
7c673cae 3584
11fdf7f2 3585 return filter->process(std::move(bl), lofs);
7c673cae
FG
3586 }
3587
28e407b8 3588 int flush() {
11fdf7f2 3589 return filter->process({}, data_len);
28e407b8
AA
3590 }
3591
7c673cae
FG
3592 bufferlist& get_extra_data() { return extra_data_bl; }
3593
3594 map<string, bufferlist>& get_attrs() { return src_attrs; }
3595
3596 void set_extra_data_len(uint64_t len) override {
b32b8144 3597 extra_data_left = len;
11fdf7f2 3598 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
7c673cae
FG
3599 }
3600
3601 uint64_t get_data_len() {
3602 return data_len;
3603 }
adb31ebb
TL
3604
3605 std::string get_verifier_etag() {
3606 if (etag_verifier) {
3607 etag_verifier->calculate_etag();
3608 return etag_verifier->get_calculated_etag();
3609 } else {
3610 return "";
3611 }
3612 }
7c673cae
FG
3613};
3614
3615/*
3616 * prepare attrset depending on attrs_mod.
3617 */
3618static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3619 map<string, bufferlist>& attrs,
3620 RGWRados::AttrsMod attrs_mod)
3621{
3622 switch (attrs_mod) {
3623 case RGWRados::ATTRSMOD_NONE:
3624 attrs = src_attrs;
3625 break;
3626 case RGWRados::ATTRSMOD_REPLACE:
3627 if (!attrs[RGW_ATTR_ETAG].length()) {
3628 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3629 }
181888fb
FG
3630 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3631 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3632 if (ttiter != src_attrs.end()) {
3633 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3634 }
3635 }
7c673cae
FG
3636 break;
3637 case RGWRados::ATTRSMOD_MERGE:
3638 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3639 if (attrs.find(it->first) == attrs.end()) {
3640 attrs[it->first] = it->second;
3641 }
3642 }
3643 break;
3644 }
3645}
3646
20effc67 3647int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y)
7c673cae 3648{
9f95a23c 3649 RGWObjectCtx rctx(this->store);
20effc67
TL
3650 rgw::sal::Attrs attrset;
3651 uint64_t obj_size;
3652 ceph::real_time mtime;
3653 RGWRados::Object op_target(this, obj->get_bucket()->get_info(), rctx, obj->get_obj());
3654 RGWRados::Object::Read read_op(&op_target);
3655
3656 read_op.params.attrs = &attrset;
3657 read_op.params.obj_size = &obj_size;
3658 read_op.params.lastmod = &mtime;
3659
3660 int ret = read_op.prepare(y, dpp);
3661 if (ret < 0)
3662 return ret;
7c673cae 3663
20effc67
TL
3664 attrset.erase(RGW_ATTR_ID_TAG);
3665 attrset.erase(RGW_ATTR_TAIL_TAG);
3666
3667 return store->getRados()->copy_obj_data(rctx, obj->get_bucket(),
3668 obj->get_bucket()->get_info().placement_rule,
3669 read_op, obj_size - 1, obj, NULL, mtime,
3670 attrset, 0, real_time(), NULL, dpp, y);
7c673cae
FG
3671}
3672
3673struct obj_time_weight {
3674 real_time mtime;
3675 uint32_t zone_short_id;
3676 uint64_t pg_ver;
3677 bool high_precision;
3678
3679 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3680
3681 bool compare_low_precision(const obj_time_weight& rhs) {
3682 struct timespec l = ceph::real_clock::to_timespec(mtime);
3683 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
3684 l.tv_nsec = 0;
3685 r.tv_nsec = 0;
3686 if (l > r) {
3687 return false;
3688 }
3689 if (l < r) {
3690 return true;
3691 }
11fdf7f2
TL
3692 if (!zone_short_id || !rhs.zone_short_id) {
3693 /* don't compare zone ids, if one wasn't provided */
3694 return false;
3695 }
7c673cae
FG
3696 if (zone_short_id != rhs.zone_short_id) {
3697 return (zone_short_id < rhs.zone_short_id);
3698 }
3699 return (pg_ver < rhs.pg_ver);
3700
3701 }
3702
3703 bool operator<(const obj_time_weight& rhs) {
3704 if (!high_precision || !rhs.high_precision) {
3705 return compare_low_precision(rhs);
3706 }
3707 if (mtime > rhs.mtime) {
3708 return false;
3709 }
3710 if (mtime < rhs.mtime) {
3711 return true;
3712 }
11fdf7f2
TL
3713 if (!zone_short_id || !rhs.zone_short_id) {
3714 /* don't compare zone ids, if one wasn't provided */
3715 return false;
3716 }
7c673cae
FG
3717 if (zone_short_id != rhs.zone_short_id) {
3718 return (zone_short_id < rhs.zone_short_id);
3719 }
3720 return (pg_ver < rhs.pg_ver);
3721 }
3722
3723 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
3724 mtime = _mtime;
3725 zone_short_id = _short_id;
3726 pg_ver = _pg_ver;
3727 }
3728
3729 void init(RGWObjState *state) {
3730 mtime = state->mtime;
3731 zone_short_id = state->zone_short_id;
3732 pg_ver = state->pg_ver;
3733 }
3734};
3735
3736inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
3737 out << o.mtime;
3738
3739 if (o.zone_short_id != 0 || o.pg_ver != 0) {
3740 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
3741 }
3742
3743 return out;
3744}
3745
11fdf7f2 3746class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
7c673cae
FG
3747 bufferlist extra_data;
3748public:
3749 RGWGetExtraDataCB() {}
11fdf7f2
TL
3750 int handle_data(bufferlist& bl, bool *pause) override {
3751 int bl_len = (int)bl.length();
7c673cae
FG
3752 if (extra_data.length() < extra_data_len) {
3753 off_t max = extra_data_len - extra_data.length();
3754 if (max > bl_len) {
3755 max = bl_len;
3756 }
3757 bl.splice(0, max, &extra_data);
3758 }
3759 return bl_len;
3760 }
3761
3762 bufferlist& get_extra_data() {
3763 return extra_data;
3764 }
3765};
3766
b3b6e05e
TL
3767int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
3768 RGWObjectCtx& obj_ctx,
7c673cae 3769 const rgw_user& user_id,
7c673cae 3770 req_info *info,
9f95a23c 3771 const rgw_zone_id& source_zone,
20effc67 3772 rgw::sal::Object* src_obj,
9f95a23c 3773 const RGWBucketInfo *src_bucket_info,
7c673cae
FG
3774 real_time *src_mtime,
3775 uint64_t *psize,
3776 const real_time *mod_ptr,
3777 const real_time *unmod_ptr,
3778 bool high_precision_time,
3779 const char *if_match,
3780 const char *if_nomatch,
3781 map<string, bufferlist> *pattrs,
11fdf7f2 3782 map<string, string> *pheaders,
7c673cae
FG
3783 string *version_id,
3784 string *ptag,
3785 string *petag)
3786{
3787 /* source is in a different zonegroup, copy from there */
3788
3789 RGWRESTStreamRWRequest *in_stream_req;
3790 string tag;
3791 map<string, bufferlist> src_attrs;
3792 append_rand_alpha(cct, tag, tag, 32);
3793 obj_time_weight set_mtime_weight;
3794 set_mtime_weight.high_precision = high_precision_time;
3795
3796 RGWRESTConn *conn;
3797 if (source_zone.empty()) {
9f95a23c 3798 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
7c673cae 3799 /* source is in the master zonegroup */
11fdf7f2 3800 conn = svc.zone->get_master_conn();
7c673cae 3801 } else {
11fdf7f2 3802 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
9f95a23c 3803 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
7c673cae 3804 if (iter == zonegroup_conn_map.end()) {
20effc67 3805 ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7c673cae
FG
3806 return -ENOENT;
3807 }
3808 conn = iter->second;
3809 }
3810 } else {
11fdf7f2 3811 auto& zone_conn_map = svc.zone->get_zone_conn_map();
9f95a23c 3812 auto iter = zone_conn_map.find(source_zone);
7c673cae 3813 if (iter == zone_conn_map.end()) {
20effc67 3814 ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7c673cae
FG
3815 return -ENOENT;
3816 }
3817 conn = iter->second;
3818 }
3819
3820 RGWGetExtraDataCB cb;
7c673cae
FG
3821 map<string, string> req_headers;
3822 real_time set_mtime;
3823
3824 const real_time *pmod = mod_ptr;
3825
3826 obj_time_weight dest_mtime_weight;
3827
181888fb
FG
3828 constexpr bool prepend_meta = true;
3829 constexpr bool get_op = true;
3830 constexpr bool rgwx_stat = true;
3831 constexpr bool sync_manifest = true;
3832 constexpr bool skip_decrypt = true;
b3b6e05e 3833 int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
7c673cae 3834 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3835 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3836 sync_manifest, skip_decrypt,
3837 true, &cb, &in_stream_req);
7c673cae
FG
3838 if (ret < 0) {
3839 return ret;
3840 }
3841
f67539c2
TL
3842 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
3843 nullptr, pheaders, null_yield);
7c673cae
FG
3844 if (ret < 0) {
3845 return ret;
3846 }
3847
3848 bufferlist& extra_data_bl = cb.get_extra_data();
3849 if (extra_data_bl.length()) {
3850 JSONParser jp;
3851 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
20effc67 3852 ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7c673cae
FG
3853 return -EIO;
3854 }
3855
3856 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3857
3858 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3859 }
3860
3861 if (src_mtime) {
3862 *src_mtime = set_mtime;
3863 }
3864
3865 if (petag) {
3866 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
3867 if (iter != src_attrs.end()) {
3868 bufferlist& etagbl = iter->second;
3869 *petag = etagbl.to_str();
11fdf7f2
TL
3870 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
3871 *petag = petag->substr(0, petag->size() - 1);
3872 }
7c673cae
FG
3873 }
3874 }
3875
3876 if (pattrs) {
11fdf7f2 3877 *pattrs = std::move(src_attrs);
7c673cae
FG
3878 }
3879
3880 return 0;
3881}
3882
9f95a23c
TL
3883int RGWFetchObjFilter_Default::filter(CephContext *cct,
3884 const rgw_obj_key& source_key,
3885 const RGWBucketInfo& dest_bucket_info,
3886 std::optional<rgw_placement_rule> dest_placement_rule,
3887 const map<string, bufferlist>& obj_attrs,
3888 std::optional<rgw_user> *poverride_owner,
3889 const rgw_placement_rule **prule)
3890{
3891 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
3892 if (!ptail_rule) {
3893 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
3894 if (iter != obj_attrs.end()) {
3895 dest_rule.storage_class = iter->second.to_str();
3896 dest_rule.inherit_from(dest_bucket_info.placement_rule);
3897 ptail_rule = &dest_rule;
3898 } else {
3899 ptail_rule = &dest_bucket_info.placement_rule;
3900 }
3901 }
3902 *prule = ptail_rule;
3903 return 0;
3904}
3905
7c673cae
FG
3906int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
3907 const rgw_user& user_id,
7c673cae 3908 req_info *info,
9f95a23c 3909 const rgw_zone_id& source_zone,
20effc67
TL
3910 rgw::sal::Object* dest_obj,
3911 rgw::sal::Object* src_obj,
3912 rgw::sal::Bucket* dest_bucket,
3913 rgw::sal::Bucket* src_bucket,
11fdf7f2 3914 std::optional<rgw_placement_rule> dest_placement_rule,
7c673cae
FG
3915 real_time *src_mtime,
3916 real_time *mtime,
3917 const real_time *mod_ptr,
3918 const real_time *unmod_ptr,
3919 bool high_precision_time,
3920 const char *if_match,
3921 const char *if_nomatch,
3922 AttrsMod attrs_mod,
3923 bool copy_if_newer,
20effc67 3924 rgw::sal::Attrs& attrs,
7c673cae 3925 RGWObjCategory category,
11fdf7f2 3926 std::optional<uint64_t> olh_epoch,
7c673cae 3927 real_time delete_at,
7c673cae 3928 string *ptag,
11fdf7f2 3929 string *petag,
7c673cae 3930 void (*progress_cb)(off_t, void *),
31f18b77 3931 void *progress_data,
9f95a23c
TL
3932 const DoutPrefixProvider *dpp,
3933 RGWFetchObjFilter *filter,
81eedcae
TL
3934 rgw_zone_set *zones_trace,
3935 std::optional<uint64_t>* bytes_transferred)
7c673cae
FG
3936{
3937 /* source is in a different zonegroup, copy from there */
3938
3939 RGWRESTStreamRWRequest *in_stream_req;
3940 string tag;
3941 int i;
3942 append_rand_alpha(cct, tag, tag, 32);
3943 obj_time_weight set_mtime_weight;
3944 set_mtime_weight.high_precision = high_precision_time;
11fdf7f2 3945 int ret;
7c673cae 3946
9f95a23c 3947 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 3948 using namespace rgw::putobj;
20effc67 3949 AtomicObjectProcessor processor(&aio, this->store, nullptr, user_id,
f67539c2
TL
3950 obj_ctx, dest_obj->clone(), olh_epoch,
3951 tag, dpp, null_yield);
7c673cae 3952 RGWRESTConn *conn;
11fdf7f2
TL
3953 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3954 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
7c673cae 3955 if (source_zone.empty()) {
f67539c2 3956 if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
7c673cae 3957 /* source is in the master zonegroup */
11fdf7f2 3958 conn = svc.zone->get_master_conn();
7c673cae 3959 } else {
f67539c2 3960 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
7c673cae 3961 if (iter == zonegroup_conn_map.end()) {
b3b6e05e 3962 ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7c673cae
FG
3963 return -ENOENT;
3964 }
3965 conn = iter->second;
3966 }
3967 } else {
9f95a23c 3968 auto iter = zone_conn_map.find(source_zone);
7c673cae 3969 if (iter == zone_conn_map.end()) {
b3b6e05e 3970 ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
11fdf7f2 3971 return -ENOENT;
7c673cae 3972 }
11fdf7f2 3973 conn = iter->second;
7c673cae
FG
3974 }
3975
3976 boost::optional<RGWPutObj_Compress> compressor;
3977 CompressorRef plugin;
3978
9f95a23c
TL
3979 RGWFetchObjFilter_Default source_filter;
3980 if (!filter) {
3981 filter = &source_filter;
3982 }
3983
3984 std::optional<rgw_user> override_owner;
3985
b3b6e05e 3986 RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
9f95a23c
TL
3987 [&](map<string, bufferlist>& obj_attrs) {
3988 const rgw_placement_rule *ptail_rule;
3989
3990 int ret = filter->filter(cct,
f67539c2
TL
3991 src_obj->get_key(),
3992 dest_bucket->get_info(),
9f95a23c
TL
3993 dest_placement_rule,
3994 obj_attrs,
3995 &override_owner,
3996 &ptail_rule);
3997 if (ret < 0) {
b3b6e05e 3998 ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
9f95a23c 3999 return ret;
11fdf7f2 4000 }
9f95a23c
TL
4001
4002 processor.set_tail_placement(*ptail_rule);
4003
11fdf7f2
TL
4004 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
4005 if (compression_type != "none") {
4006 plugin = Compressor::create(cct, compression_type);
4007 if (!plugin) {
b3b6e05e 4008 ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
11fdf7f2
TL
4009 << compression_type << dendl;
4010 }
4011 }
4012
9f95a23c 4013 ret = processor.prepare(null_yield);
11fdf7f2
TL
4014 if (ret < 0) {
4015 return ret;
4016 }
4017 return 0;
4018 });
7c673cae
FG
4019
4020 string etag;
7c673cae 4021 real_time set_mtime;
81eedcae 4022 uint64_t expected_size = 0;
7c673cae
FG
4023
4024 RGWObjState *dest_state = NULL;
4025
4026 const real_time *pmod = mod_ptr;
4027
4028 obj_time_weight dest_mtime_weight;
4029
4030 if (copy_if_newer) {
4031 /* need to get mtime for destination */
b3b6e05e 4032 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
7c673cae
FG
4033 if (ret < 0)
4034 goto set_err_state;
4035
4036 if (!real_clock::is_zero(dest_state->mtime)) {
4037 dest_mtime_weight.init(dest_state);
4038 pmod = &dest_mtime_weight.mtime;
4039 }
4040 }
4041
181888fb
FG
4042 static constexpr bool prepend_meta = true;
4043 static constexpr bool get_op = true;
4044 static constexpr bool rgwx_stat = false;
4045 static constexpr bool sync_manifest = true;
4046 static constexpr bool skip_decrypt = true;
b3b6e05e 4047 ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
7c673cae 4048 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 4049 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
4050 sync_manifest, skip_decrypt,
4051 true,
4052 &cb, &in_stream_req);
7c673cae
FG
4053 if (ret < 0) {
4054 goto set_err_state;
4055 }
4056
81eedcae 4057 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
f67539c2 4058 &expected_size, nullptr, nullptr, null_yield);
7c673cae
FG
4059 if (ret < 0) {
4060 goto set_err_state;
4061 }
28e407b8
AA
4062 ret = cb.flush();
4063 if (ret < 0) {
4064 goto set_err_state;
4065 }
81eedcae
TL
4066 if (cb.get_data_len() != expected_size) {
4067 ret = -EIO;
b3b6e05e 4068 ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
81eedcae
TL
4069 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
4070 goto set_err_state;
4071 }
7c673cae
FG
4072 if (compressor && compressor->is_compressed()) {
4073 bufferlist tmp;
4074 RGWCompressionInfo cs_info;
4075 cs_info.compression_type = plugin->get_type_name();
4076 cs_info.orig_size = cb.get_data_len();
f67539c2 4077 cs_info.compressor_message = compressor->get_compressor_message();
7c673cae 4078 cs_info.blocks = move(compressor->get_compression_blocks());
11fdf7f2 4079 encode(cs_info, tmp);
7c673cae
FG
4080 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
4081 }
4082
9f95a23c
TL
4083 if (override_owner) {
4084 processor.set_owner(*override_owner);
4085
4086 auto& obj_attrs = cb.get_attrs();
4087
4088 RGWUserInfo owner_info;
b3b6e05e
TL
4089 if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
4090 ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
9f95a23c
TL
4091 return -EINVAL;
4092 }
4093
4094 RGWAccessControlPolicy acl;
4095
4096 auto aiter = obj_attrs.find(RGW_ATTR_ACL);
4097 if (aiter == obj_attrs.end()) {
b3b6e05e 4098 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
9f95a23c
TL
4099 acl.create_default(owner_info.user_id, owner_info.display_name);
4100 } else {
4101 auto iter = aiter->second.cbegin();
4102 try {
4103 acl.decode(iter);
4104 } catch (buffer::error& err) {
b3b6e05e 4105 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
9f95a23c
TL
4106 return -EIO;
4107 }
4108 }
4109
4110 ACLOwner new_owner;
4111 new_owner.set_id(*override_owner);
4112 new_owner.set_name(owner_info.display_name);
4113
4114 acl.set_owner(new_owner);
4115
4116 bufferlist bl;
4117 acl.encode(bl);
4118 obj_attrs[RGW_ATTR_ACL] = std::move(bl);
4119 }
4120
7c673cae
FG
4121 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4122 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
4123 } else {
4124 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
4125 if (iter != cb.get_attrs().end()) {
4126 try {
11fdf7f2 4127 decode(delete_at, iter->second);
7c673cae 4128 } catch (buffer::error& err) {
b3b6e05e 4129 ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7c673cae
FG
4130 }
4131 }
4132 }
4133
4134 if (src_mtime) {
4135 *src_mtime = set_mtime;
4136 }
4137
4138 if (petag) {
4139 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
4140 if (iter != cb.get_attrs().end()) {
11fdf7f2 4141 *petag = iter->second.to_str();
7c673cae
FG
4142 }
4143 }
4144
11fdf7f2
TL
4145 //erase the append attr
4146 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
4147
7c673cae
FG
4148 if (source_zone.empty()) {
4149 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
4150 } else {
4151 attrs = cb.get_attrs();
4152 }
4153
4154 if (copy_if_newer) {
4155 uint64_t pg_ver = 0;
4156 auto i = attrs.find(RGW_ATTR_PG_VER);
4157 if (i != attrs.end() && i->second.length() > 0) {
11fdf7f2 4158 auto iter = i->second.cbegin();
7c673cae 4159 try {
11fdf7f2 4160 decode(pg_ver, iter);
7c673cae 4161 } catch (buffer::error& err) {
b3b6e05e 4162 ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7c673cae
FG
4163 /* non critical error */
4164 }
4165 }
11fdf7f2 4166 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
7c673cae
FG
4167 }
4168
adb31ebb
TL
4169 /* Perform ETag verification is we have computed the object's MD5 sum at our end */
4170 if (const auto& verifier_etag = cb.get_verifier_etag();
4171 !verifier_etag.empty()) {
4172 string trimmed_etag = etag;
4173
4174 /* Remove the leading and trailing double quotes from etag */
4175 trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
4176 trimmed_etag.end());
4177
4178 if (verifier_etag != trimmed_etag) {
4179 ret = -EIO;
b3b6e05e 4180 ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
adb31ebb
TL
4181 << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
4182 goto set_err_state;
4183 }
4184 }
4185
7c673cae
FG
4186#define MAX_COMPLETE_RETRY 100
4187 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
11fdf7f2
TL
4188 bool canceled = false;
4189 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
4190 attrs, delete_at, nullptr, nullptr, nullptr,
9f95a23c 4191 zones_trace, &canceled, null_yield);
7c673cae
FG
4192 if (ret < 0) {
4193 goto set_err_state;
4194 }
adb31ebb 4195
11fdf7f2 4196 if (copy_if_newer && canceled) {
b3b6e05e 4197 ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
f67539c2 4198 obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
b3b6e05e 4199 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
7c673cae 4200 if (ret < 0) {
b3b6e05e 4201 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7c673cae
FG
4202 goto set_err_state;
4203 }
4204 dest_mtime_weight.init(dest_state);
4205 dest_mtime_weight.high_precision = high_precision_time;
4206 if (!dest_state->exists ||
4207 dest_mtime_weight < set_mtime_weight) {
b3b6e05e 4208 ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7c673cae
FG
4209 continue;
4210 } else {
b3b6e05e 4211 ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7c673cae
FG
4212 }
4213 }
4214 break;
4215 }
4216
4217 if (i == MAX_COMPLETE_RETRY) {
b3b6e05e 4218 ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7c673cae
FG
4219 ret = -EIO;
4220 goto set_err_state;
4221 }
4222
81eedcae
TL
4223 if (bytes_transferred) {
4224 *bytes_transferred = cb.get_data_len();
4225 }
7c673cae
FG
4226 return 0;
4227set_err_state:
4228 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
4229 // we may have already fetched during sync of OP_ADD, but were waiting
4230 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4231 if (olh_epoch && *olh_epoch > 0) {
4232 constexpr bool log_data_change = true;
b3b6e05e 4233 ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), false, nullptr,
9f95a23c 4234 *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
91327a77
AA
4235 } else {
4236 // we already have the latest copy
4237 ret = 0;
4238 }
7c673cae 4239 }
7c673cae
FG
4240 return ret;
4241}
4242
4243
20effc67 4244int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
b3b6e05e 4245 RGWObjState *astate,
7c673cae
FG
4246 map<string, bufferlist>& src_attrs,
4247 RGWRados::Object::Read& read_op,
4248 const rgw_user& user_id,
20effc67 4249 rgw::sal::Object* dest_obj,
7c673cae
FG
4250 real_time *mtime)
4251{
4252 string etag;
4253
11fdf7f2 4254 RGWRESTStreamS3PutObj *out_stream_req;
7c673cae 4255
11fdf7f2
TL
4256 auto rest_master_conn = svc.zone->get_master_conn();
4257
20effc67 4258 int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
7c673cae 4259 if (ret < 0) {
7c673cae
FG
4260 return ret;
4261 }
4262
20effc67
TL
4263 out_stream_req->set_send_length(astate->size);
4264
4265 ret = RGWHTTP::send(out_stream_req);
4266 if (ret < 0) {
4267 delete out_stream_req;
4268 return ret;
4269 }
4270
b3b6e05e 4271 ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
224ce89b
WB
4272 if (ret < 0) {
4273 delete out_stream_req;
7c673cae 4274 return ret;
224ce89b 4275 }
7c673cae 4276
f67539c2 4277 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
7c673cae
FG
4278 if (ret < 0)
4279 return ret;
4280
4281 return 0;
4282}
4283
4284/**
4285 * Copy an object.
4286 * dest_obj: the object to copy into
4287 * src_obj: the object to copy from
4288 * attrs: usage depends on attrs_mod parameter
4289 * attrs_mod: the modification mode of the attrs, may have the following values:
4290 * ATTRSMOD_NONE - the attributes of the source object will be
4291 * copied without modifications, attrs parameter is ignored;
4292 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4293 * parameter, source object attributes are not copied;
4294 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4295 * are overwritten by values contained in attrs parameter.
4296 * err: stores any errors resulting from the get of the original object
4297 * Returns: 0 on success, -ERR# otherwise.
4298 */
4299int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4300 const rgw_user& user_id,
7c673cae 4301 req_info *info,
9f95a23c 4302 const rgw_zone_id& source_zone,
20effc67
TL
4303 rgw::sal::Object* dest_obj,
4304 rgw::sal::Object* src_obj,
4305 rgw::sal::Bucket* dest_bucket,
4306 rgw::sal::Bucket* src_bucket,
11fdf7f2 4307 const rgw_placement_rule& dest_placement,
7c673cae
FG
4308 real_time *src_mtime,
4309 real_time *mtime,
4310 const real_time *mod_ptr,
4311 const real_time *unmod_ptr,
4312 bool high_precision_time,
4313 const char *if_match,
4314 const char *if_nomatch,
4315 AttrsMod attrs_mod,
4316 bool copy_if_newer,
20effc67 4317 rgw::sal::Attrs& attrs,
7c673cae
FG
4318 RGWObjCategory category,
4319 uint64_t olh_epoch,
4320 real_time delete_at,
4321 string *version_id,
4322 string *ptag,
11fdf7f2 4323 string *petag,
7c673cae 4324 void (*progress_cb)(off_t, void *),
9f95a23c
TL
4325 void *progress_data,
4326 const DoutPrefixProvider *dpp,
4327 optional_yield y)
7c673cae
FG
4328{
4329 int ret;
4330 uint64_t obj_size;
f67539c2 4331 rgw_obj shadow_obj = dest_obj->get_obj();
7c673cae
FG
4332 string shadow_oid;
4333
4334 bool remote_src;
4335 bool remote_dest;
4336
f67539c2
TL
4337 append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
4338 shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
7c673cae 4339
11fdf7f2
TL
4340 auto& zonegroup = svc.zone->get_zonegroup();
4341
f67539c2
TL
4342 remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
4343 remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
7c673cae
FG
4344
4345 if (remote_src && remote_dest) {
9f95a23c 4346 ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7c673cae
FG
4347 return -EINVAL;
4348 }
4349
f67539c2 4350 ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
7c673cae
FG
4351
4352 if (remote_src || !source_zone.empty()) {
11fdf7f2 4353 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
f67539c2 4354 dest_obj, src_obj, dest_bucket, src_bucket,
11fdf7f2 4355 dest_placement, src_mtime, mtime, mod_ptr,
7c673cae
FG
4356 unmod_ptr, high_precision_time,
4357 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
9f95a23c
TL
4358 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
4359 nullptr /* filter */);
7c673cae
FG
4360 }
4361
4362 map<string, bufferlist> src_attrs;
f67539c2 4363 RGWRados::Object src_op_target(this, src_bucket->get_info(), obj_ctx, src_obj->get_obj());
7c673cae
FG
4364 RGWRados::Object::Read read_op(&src_op_target);
4365
4366 read_op.conds.mod_ptr = mod_ptr;
4367 read_op.conds.unmod_ptr = unmod_ptr;
4368 read_op.conds.high_precision_time = high_precision_time;
4369 read_op.conds.if_match = if_match;
4370 read_op.conds.if_nomatch = if_nomatch;
4371 read_op.params.attrs = &src_attrs;
4372 read_op.params.lastmod = src_mtime;
4373 read_op.params.obj_size = &obj_size;
7c673cae 4374
b3b6e05e 4375 ret = read_op.prepare(y, dpp);
7c673cae
FG
4376 if (ret < 0) {
4377 return ret;
4378 }
94b18763
FG
4379 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4380 // Current implementation does not follow S3 spec and even
4381 // may result in data corruption silently when copying
4382 // multipart objects across pools. So reject COPY operations
4383 //on encrypted objects before it is fully functional.
9f95a23c 4384 ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
94b18763
FG
4385 << " has not been implemented." << dendl;
4386 return -ERR_NOT_IMPLEMENTED;
4387 }
7c673cae
FG
4388
4389 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4390 src_attrs.erase(RGW_ATTR_DELETE_AT);
4391
20effc67
TL
4392 src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
4393 src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
4394 map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
4395 if (rt != attrs.end())
4396 src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
4397 map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
4398 if (lh != attrs.end())
4399 src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;
4400
7c673cae
FG
4401 set_copy_attrs(src_attrs, attrs, attrs_mod);
4402 attrs.erase(RGW_ATTR_ID_TAG);
4403 attrs.erase(RGW_ATTR_PG_VER);
4404 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4405 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4406 if (cmp != src_attrs.end())
4407 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4408
4409 RGWObjManifest manifest;
4410 RGWObjState *astate = NULL;
4411
b3b6e05e 4412 ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj->get_obj(), &astate, y);
7c673cae
FG
4413 if (ret < 0) {
4414 return ret;
4415 }
4416
4417 vector<rgw_raw_obj> ref_objs;
4418
4419 if (remote_dest) {
4420 /* dest is in a different zonegroup, copy it there */
b3b6e05e 4421 return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
7c673cae
FG
4422 }
4423 uint64_t max_chunk_size;
4424
b3b6e05e 4425 ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
7c673cae 4426 if (ret < 0) {
f67539c2 4427 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
7c673cae
FG
4428 return ret;
4429 }
4430
4431 rgw_pool src_pool;
4432 rgw_pool dest_pool;
11fdf7f2
TL
4433
4434 const rgw_placement_rule *src_rule{nullptr};
4435
9f95a23c
TL
4436 if (astate->manifest) {
4437 src_rule = &astate->manifest->get_tail_placement().placement_rule;
4438 ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
11fdf7f2
TL
4439 }
4440
4441 if (!src_rule || src_rule->empty()) {
f67539c2 4442 src_rule = &src_bucket->get_placement_rule();
11fdf7f2
TL
4443 }
4444
f67539c2 4445 if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
9f95a23c 4446 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7c673cae
FG
4447 return -EIO;
4448 }
11fdf7f2 4449
f67539c2 4450 if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
9f95a23c 4451 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7c673cae
FG
4452 return -EIO;
4453 }
4454
9f95a23c 4455 ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
11fdf7f2
TL
4456 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4457
9f95a23c 4458 bool copy_data = (!astate->manifest) ||
11fdf7f2
TL
4459 (*src_rule != dest_placement) ||
4460 (src_pool != dest_pool);
7c673cae 4461
7c673cae 4462 bool copy_first = false;
9f95a23c
TL
4463 if (astate->manifest) {
4464 if (!astate->manifest->has_tail()) {
7c673cae
FG
4465 copy_data = true;
4466 } else {
9f95a23c 4467 uint64_t head_size = astate->manifest->get_head_size();
7c673cae
FG
4468
4469 if (head_size > 0) {
4470 if (head_size > max_chunk_size) {
4471 copy_data = true;
4472 } else {
4473 copy_first = true;
4474 }
4475 }
4476 }
4477 }
4478
4479 if (petag) {
4480 const auto iter = attrs.find(RGW_ATTR_ETAG);
4481 if (iter != attrs.end()) {
11fdf7f2 4482 *petag = iter->second.to_str();
7c673cae
FG
4483 }
4484 }
4485
4486 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
11fdf7f2 4487 attrs.erase(RGW_ATTR_TAIL_TAG);
f67539c2 4488 return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
9f95a23c 4489 mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
7c673cae
FG
4490 }
4491
b3b6e05e 4492 RGWObjManifest::obj_iterator miter = astate->manifest->obj_begin(dpp);
7c673cae
FG
4493
4494 if (copy_first) { // we need to copy first chunk, not increase refcount
4495 ++miter;
4496 }
4497
4498 rgw_rados_ref ref;
b3b6e05e 4499 ret = get_raw_obj_ref(dpp, miter.get_location().get_raw_obj(store), &ref);
7c673cae
FG
4500 if (ret < 0) {
4501 return ret;
4502 }
4503
7c673cae
FG
4504 bufferlist first_chunk;
4505
20effc67 4506 const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
7c673cae 4507 RGWObjManifest *pmanifest;
9f95a23c 4508 ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae 4509
f67539c2 4510 RGWRados::Object dest_op_target(this, dest_bucket->get_info(), obj_ctx, dest_obj->get_obj());
7c673cae
FG
4511 RGWRados::Object::Write write_op(&dest_op_target);
4512
4513 string tag;
4514
4515 if (ptag) {
4516 tag = *ptag;
4517 }
4518
4519 if (tag.empty()) {
4520 append_rand_alpha(cct, tag, tag, 32);
4521 }
4522
4523 if (!copy_itself) {
181888fb 4524 attrs.erase(RGW_ATTR_TAIL_TAG);
9f95a23c 4525 manifest = *astate->manifest;
7c673cae
FG
4526 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4527 if (tail_placement.bucket.name.empty()) {
f67539c2 4528 manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
7c673cae 4529 }
3efd9988 4530 string ref_tag;
b3b6e05e 4531 for (; miter != astate->manifest->obj_end(dpp); ++miter) {
7c673cae 4532 ObjectWriteOperation op;
3efd9988
FG
4533 ref_tag = tag + '\0';
4534 cls_refcount_get(op, ref_tag, true);
f67539c2 4535 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store);
7c673cae 4536
9f95a23c
TL
4537 auto& ioctx = ref.pool.ioctx();
4538 ioctx.locator_set_key(loc.loc);
4539
b3b6e05e 4540 ret = rgw_rados_operate(dpp, ioctx, loc.oid, &op, null_yield);
7c673cae
FG
4541 if (ret < 0) {
4542 goto done_ret;
4543 }
4544
4545 ref_objs.push_back(loc);
4546 }
4547
4548 pmanifest = &manifest;
4549 } else {
9f95a23c 4550 pmanifest = &(*astate->manifest);
7c673cae
FG
4551 /* don't send the object's tail for garbage collection */
4552 astate->keep_tail = true;
4553 }
4554
4555 if (copy_first) {
b3b6e05e 4556 ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
7c673cae
FG
4557 if (ret < 0) {
4558 goto done_ret;
4559 }
4560
f67539c2 4561 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
7c673cae 4562 } else {
f67539c2 4563 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
7c673cae
FG
4564 }
4565
4566 write_op.meta.data = &first_chunk;
4567 write_op.meta.manifest = pmanifest;
4568 write_op.meta.ptag = &tag;
f67539c2 4569 write_op.meta.owner = dest_bucket->get_info().owner;
7c673cae
FG
4570 write_op.meta.mtime = mtime;
4571 write_op.meta.flags = PUT_OBJ_CREATE;
4572 write_op.meta.category = category;
4573 write_op.meta.olh_epoch = olh_epoch;
4574 write_op.meta.delete_at = delete_at;
181888fb 4575 write_op.meta.modify_tail = !copy_itself;
7c673cae 4576
b3b6e05e 4577 ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
7c673cae
FG
4578 if (ret < 0) {
4579 goto done_ret;
4580 }
4581
4582 return 0;
4583
4584done_ret:
4585 if (!copy_itself) {
4586 vector<rgw_raw_obj>::iterator riter;
4587
7c673cae 4588 /* rollback reference */
92f5a8d4 4589 string ref_tag = tag + '\0';
7c673cae
FG
4590 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4591 ObjectWriteOperation op;
92f5a8d4 4592 cls_refcount_put(op, ref_tag, true);
7c673cae 4593
9f95a23c 4594 ref.pool.ioctx().locator_set_key(riter->loc);
7c673cae 4595
b3b6e05e 4596 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), riter->oid, &op, null_yield);
7c673cae 4597 if (r < 0) {
9f95a23c 4598 ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
7c673cae
FG
4599 }
4600 }
4601 }
4602 return ret;
4603}
4604
4605
4606int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
20effc67 4607 rgw::sal::Bucket* bucket,
11fdf7f2 4608 const rgw_placement_rule& dest_placement,
7c673cae 4609 RGWRados::Object::Read& read_op, off_t end,
20effc67 4610 rgw::sal::Object* dest_obj,
7c673cae
FG
4611 real_time *mtime,
4612 real_time set_mtime,
20effc67 4613 rgw::sal::Attrs& attrs,
7c673cae
FG
4614 uint64_t olh_epoch,
4615 real_time delete_at,
9f95a23c
TL
4616 string *petag,
4617 const DoutPrefixProvider *dpp,
4618 optional_yield y)
7c673cae 4619{
7c673cae
FG
4620 string tag;
4621 append_rand_alpha(cct, tag, tag, 32);
4622
9f95a23c 4623 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 4624 using namespace rgw::putobj;
9f95a23c
TL
4625 // do not change the null_yield in the initialization of this AtomicObjectProcessor
4626 // it causes crashes in the ragweed tests
20effc67 4627 AtomicObjectProcessor processor(&aio, this->store, &dest_placement,
f67539c2
TL
4628 bucket->get_info().owner, obj_ctx,
4629 dest_obj->clone(), olh_epoch, tag,
4630 dpp, null_yield);
9f95a23c 4631 int ret = processor.prepare(y);
7c673cae
FG
4632 if (ret < 0)
4633 return ret;
4634
4635 off_t ofs = 0;
4636
4637 do {
4638 bufferlist bl;
b3b6e05e 4639 ret = read_op.read(ofs, end, bl, y, dpp);
11fdf7f2 4640 if (ret < 0) {
9f95a23c 4641 ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
11fdf7f2
TL
4642 return ret;
4643 }
7c673cae
FG
4644
4645 uint64_t read_len = ret;
11fdf7f2
TL
4646 ret = processor.process(std::move(bl), ofs);
4647 if (ret < 0) {
4648 return ret;
4649 }
7c673cae
FG
4650
4651 ofs += read_len;
4652 } while (ofs <= end);
4653
11fdf7f2
TL
4654 // flush
4655 ret = processor.process({}, ofs);
4656 if (ret < 0) {
4657 return ret;
4658 }
4659
7c673cae
FG
4660 string etag;
4661 auto iter = attrs.find(RGW_ATTR_ETAG);
4662 if (iter != attrs.end()) {
4663 bufferlist& bl = iter->second;
11fdf7f2 4664 etag = bl.to_str();
7c673cae 4665 if (petag) {
11fdf7f2 4666 *petag = etag;
7c673cae
FG
4667 }
4668 }
4669
4670 uint64_t accounted_size;
4671 {
4672 bool compressed{false};
4673 RGWCompressionInfo cs_info;
4674 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4675 if (ret < 0) {
9f95a23c 4676 ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
7c673cae
FG
4677 return ret;
4678 }
4679 // pass original size if compressed
4680 accounted_size = compressed ? cs_info.orig_size : ofs;
4681 }
4682
11fdf7f2 4683 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
9f95a23c 4684 nullptr, nullptr, nullptr, nullptr, nullptr, y);
7c673cae
FG
4685}
4686
11fdf7f2 4687int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
20effc67
TL
4688 rgw::sal::Bucket* bucket,
4689 rgw::sal::Object& obj,
11fdf7f2
TL
4690 const rgw_placement_rule& placement_rule,
4691 const real_time& mtime,
9f95a23c
TL
4692 uint64_t olh_epoch,
4693 const DoutPrefixProvider *dpp,
4694 optional_yield y)
7c673cae 4695{
20effc67 4696 rgw::sal::Attrs attrs;
11fdf7f2
TL
4697 real_time read_mtime;
4698 uint64_t obj_size;
7c673cae 4699
f67539c2
TL
4700 obj.set_atomic(&obj_ctx);
4701 RGWRados::Object op_target(this, bucket->get_info(), obj_ctx, obj.get_obj());
11fdf7f2 4702 RGWRados::Object::Read read_op(&op_target);
7c673cae 4703
11fdf7f2
TL
4704 read_op.params.attrs = &attrs;
4705 read_op.params.lastmod = &read_mtime;
4706 read_op.params.obj_size = &obj_size;
7c673cae 4707
b3b6e05e 4708 int ret = read_op.prepare(y, dpp);
11fdf7f2
TL
4709 if (ret < 0) {
4710 return ret;
7c673cae
FG
4711 }
4712
11fdf7f2
TL
4713 if (read_mtime != mtime) {
4714 /* raced */
4715 return -ECANCELED;
7c673cae
FG
4716 }
4717
9f95a23c
TL
4718 attrs.erase(RGW_ATTR_ID_TAG);
4719 attrs.erase(RGW_ATTR_TAIL_TAG);
4720
11fdf7f2 4721 ret = copy_obj_data(obj_ctx,
f67539c2 4722 bucket,
11fdf7f2
TL
4723 placement_rule,
4724 read_op,
4725 obj_size - 1,
f67539c2 4726 &obj,
11fdf7f2
TL
4727 nullptr /* pmtime */,
4728 mtime,
4729 attrs,
4730 olh_epoch,
4731 real_time(),
9f95a23c
TL
4732 nullptr /* petag */,
4733 dpp,
4734 y);
11fdf7f2
TL
4735 if (ret < 0) {
4736 return ret;
7c673cae
FG
4737 }
4738
11fdf7f2 4739 return 0;
7c673cae
FG
4740}
4741
b3b6e05e 4742int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
7c673cae 4743{
9f95a23c
TL
4744 constexpr uint NUM_ENTRIES = 1000u;
4745
7c673cae
FG
4746 rgw_obj_index_key marker;
4747 string prefix;
4748 bool is_truncated;
4749
4750 do {
9f95a23c
TL
4751 std::vector<rgw_bucket_dir_entry> ent_list;
4752 ent_list.reserve(NUM_ENTRIES);
4753
20effc67 4754 int r = cls_bucket_list_unordered(dpp,
b3b6e05e 4755 bucket_info,
1adf2230
AA
4756 RGW_NO_SHARD,
4757 marker,
4758 prefix,
4759 NUM_ENTRIES,
4760 true,
4761 ent_list,
4762 &is_truncated,
9f95a23c
TL
4763 &marker,
4764 y);
4765 if (r < 0) {
7c673cae 4766 return r;
9f95a23c 4767 }
7c673cae
FG
4768
4769 string ns;
1adf2230 4770 for (auto const& dirent : ent_list) {
7c673cae
FG
4771 rgw_obj_key obj;
4772
9f95a23c 4773 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
7c673cae 4774 return -ENOTEMPTY;
9f95a23c 4775 }
7c673cae
FG
4776 }
4777 } while (is_truncated);
1adf2230 4778
7c673cae
FG
4779 return 0;
4780}
4781
4782/**
4783 * Delete a bucket.
4784 * bucket: the name of the bucket to delete
4785 * Returns 0 on success, -ERR# otherwise.
4786 */
b3b6e05e 4787int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
7c673cae
FG
4788{
4789 const rgw_bucket& bucket = bucket_info.bucket;
9f95a23c 4790 RGWSI_RADOS::Pool index_pool;
7c673cae 4791 map<int, string> bucket_objs;
b3b6e05e 4792 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
4793 if (r < 0)
4794 return r;
4795
4796 if (check_empty) {
b3b6e05e 4797 r = check_bucket_empty(dpp, bucket_info, y);
7c673cae
FG
4798 if (r < 0) {
4799 return r;
4800 }
4801 }
9f95a23c
TL
4802
4803 bool remove_ep = true;
4804
4805 if (objv_tracker.read_version.empty()) {
4806 RGWBucketEntryPoint ep;
4807 r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
4808 &ep,
4809 null_yield,
b3b6e05e 4810 dpp,
9f95a23c
TL
4811 RGWBucketCtl::Bucket::GetParams()
4812 .set_objv_tracker(&objv_tracker));
4813 if (r < 0 ||
4814 (!bucket_info.bucket.bucket_id.empty() &&
4815 ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
4816 if (r != -ENOENT) {
b3b6e05e 4817 ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
9f95a23c
TL
4818 /* we have no idea what caused the error, will not try to remove it */
4819 }
4820 /*
4821 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4822 * requested
4823 */
4824 remove_ep = false;
4825 }
4826 }
4827
4828 if (remove_ep) {
b3b6e05e 4829 r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
9f95a23c
TL
4830 RGWBucketCtl::Bucket::RemoveParams()
4831 .set_objv_tracker(&objv_tracker));
4832 if (r < 0)
4833 return r;
4834 }
7c673cae
FG
4835
4836 /* if the bucket is not synced we can remove the meta file */
11fdf7f2 4837 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
7c673cae 4838 RGWObjVersionTracker objv_tracker;
b3b6e05e 4839 r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
7c673cae
FG
4840 if (r < 0) {
4841 return r;
4842 }
f64942e4
AA
4843
4844 /* remove bucket index objects asynchronously by best effort */
9f95a23c 4845 (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
f64942e4
AA
4846 bucket_objs,
4847 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 4848 }
f64942e4 4849
7c673cae
FG
4850 return 0;
4851}
4852
b3b6e05e 4853int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
7c673cae
FG
4854{
4855 RGWBucketInfo info;
4856 map<string, bufferlist> attrs;
31f18b77 4857 int r;
9f95a23c
TL
4858 auto obj_ctx = svc.sysobj->init_obj_ctx();
4859
31f18b77 4860 if (bucket.bucket_id.empty()) {
b3b6e05e 4861 r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
31f18b77 4862 } else {
b3b6e05e 4863 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs, null_yield, dpp);
31f18b77 4864 }
7c673cae 4865 if (r < 0) {
b3b6e05e 4866 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7c673cae
FG
4867 return r;
4868 }
4869
4870 info.owner = owner.get_id();
4871
b3b6e05e 4872 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
7c673cae 4873 if (r < 0) {
b3b6e05e 4874 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7c673cae
FG
4875 return r;
4876 }
4877
4878 return 0;
4879}
4880
4881
b3b6e05e 4882int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
7c673cae
FG
4883{
4884 int ret = 0;
4885
4886 vector<rgw_bucket>::iterator iter;
4887
4888 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
4889 rgw_bucket& bucket = *iter;
b3b6e05e
TL
4890 if (enabled) {
4891 ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
4892 } else {
4893 ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
4894 }
7c673cae
FG
4895
4896 RGWBucketInfo info;
4897 map<string, bufferlist> attrs;
b3b6e05e 4898 int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
7c673cae 4899 if (r < 0) {
b3b6e05e 4900 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
7c673cae
FG
4901 ret = r;
4902 continue;
4903 }
4904 if (enabled) {
4905 info.flags &= ~BUCKET_SUSPENDED;
4906 } else {
4907 info.flags |= BUCKET_SUSPENDED;
4908 }
4909
b3b6e05e 4910 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
7c673cae 4911 if (r < 0) {
b3b6e05e 4912 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
7c673cae
FG
4913 ret = r;
4914 continue;
4915 }
4916 }
4917 return ret;
4918}
4919
b3b6e05e 4920int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
7c673cae
FG
4921{
4922 RGWBucketInfo bucket_info;
b3b6e05e 4923 int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
7c673cae
FG
4924 if (ret < 0) {
4925 return ret;
4926 }
4927
4928 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
4929 return 0;
4930}
4931
b3b6e05e 4932int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
7c673cae 4933{
9f95a23c 4934 if ((!state->manifest)|| state->keep_tail)
7c673cae
FG
4935 return 0;
4936
4937 cls_rgw_obj_chain chain;
b3b6e05e 4938 store->update_gc_chain(dpp, obj, *state->manifest, &chain);
7c673cae
FG
4939
4940 if (chain.empty()) {
4941 return 0;
4942 }
4943
181888fb 4944 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
1d09f67e
TL
4945 if (store->gc == nullptr) {
4946 ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
4947 //Delete objects inline just in case gc hasn't been initialised, prevents crashes
b3b6e05e 4948 store->delete_objs_inline(dpp, chain, tag);
1d09f67e 4949 } else {
39ae355f
TL
4950 auto [ret, leftover_chain] = store->gc->send_split_chain(chain, tag); // do it synchronously
4951 if (ret < 0 && leftover_chain) {
1d09f67e 4952 //Delete objects inline if send chain to gc fails
39ae355f 4953 store->delete_objs_inline(dpp, *leftover_chain, tag);
1d09f67e 4954 }
9f95a23c
TL
4955 }
4956 return 0;
7c673cae
FG
4957}
4958
b3b6e05e 4959void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
7c673cae
FG
4960{
4961 RGWObjManifest::obj_iterator iter;
4962 rgw_raw_obj raw_head;
4963 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
b3b6e05e 4964 for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
f67539c2 4965 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(store);
7c673cae
FG
4966 if (mobj == raw_head)
4967 continue;
4968 cls_rgw_obj_key key(mobj.oid);
4969 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
4970 }
4971}
4972
39ae355f 4973std::tuple<int, std::optional<cls_rgw_obj_chain>> RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4974{
f67539c2 4975 if (chain.empty()) {
39ae355f 4976 return {0, std::nullopt};
f67539c2
TL
4977 }
4978
39ae355f 4979 return gc->send_split_chain(chain, tag);
7c673cae
FG
4980}
4981
b3b6e05e 4982void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4983{
9f95a23c
TL
4984 string last_pool;
4985 std::unique_ptr<IoCtx> ctx(new IoCtx);
4986 int ret = 0;
4987 for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
4988 cls_rgw_obj& obj = *liter;
4989 if (obj.pool != last_pool) {
4990 ctx.reset(new IoCtx);
b3b6e05e 4991 ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
9f95a23c
TL
4992 if (ret < 0) {
4993 last_pool = "";
b3b6e05e 4994 ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
9f95a23c
TL
4995 obj.pool << dendl;
4996 continue;
4997 }
4998 last_pool = obj.pool;
4999 }
5000 ctx->locator_set_key(obj.loc);
5001 const string& oid = obj.key.name; /* just stored raw oid there */
b3b6e05e 5002 ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
9f95a23c
TL
5003 ":" << obj.key.name << dendl;
5004 ObjectWriteOperation op;
5005 cls_refcount_put(op, tag, true);
5006 ret = ctx->operate(oid, &op);
5007 if (ret < 0) {
b3b6e05e 5008 ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
9f95a23c 5009 }
7c673cae 5010 }
7c673cae
FG
5011}
5012
5013static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
5014 map<RGWObjCategory, RGWStorageStats>& stats)
5015{
5016 for (const auto& pair : header.stats) {
5017 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
5018 const rgw_bucket_category_stats& header_stats = pair.second;
5019
5020 RGWStorageStats& s = stats[category];
5021
5022 s.category = category;
5023 s.size += header_stats.total_size;
5024 s.size_rounded += header_stats.total_size_rounded;
5025 s.size_utilized += header_stats.actual_size;
5026 s.num_objects += header_stats.num_entries;
5027 }
5028}
5029
b3b6e05e 5030int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
7c673cae
FG
5031 map<RGWObjCategory, RGWStorageStats> *existing_stats,
5032 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
5033{
9f95a23c 5034 RGWSI_RADOS::Pool index_pool;
20effc67 5035
7c673cae
FG
5036 // key - bucket index object id
5037 // value - bucket index check OP returned result with the given bucket index object (shard)
5038 map<int, string> oids;
b3b6e05e 5039 int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &oids, nullptr);
31f18b77 5040 if (ret < 0) {
20effc67 5041 return ret;
31f18b77 5042 }
7c673cae 5043
20effc67
TL
5044 // declare and pre-populate
5045 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
9f95a23c 5046 for (auto& iter : oids) {
20effc67 5047 bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
9f95a23c
TL
5048 }
5049
5050 ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77 5051 if (ret < 0) {
20effc67 5052 return ret;
31f18b77 5053 }
7c673cae 5054
20effc67
TL
5055 // aggregate results (from different shards if there are any)
5056 for (const auto& iter : bucket_objs_ret) {
5057 accumulate_raw_stats(iter.second.existing_header, *existing_stats);
5058 accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
7c673cae
FG
5059 }
5060
5061 return 0;
5062}
5063
b3b6e05e 5064int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
7c673cae 5065{
9f95a23c 5066 RGWSI_RADOS::Pool index_pool;
7c673cae 5067 map<int, string> bucket_objs;
31f18b77 5068
b3b6e05e 5069 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77 5070 if (r < 0) {
7c673cae 5071 return r;
31f18b77 5072 }
7c673cae 5073
9f95a23c 5074 return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
5075}
5076
b3b6e05e 5077int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77 5078{
9f95a23c 5079 RGWSI_RADOS::Pool index_pool;
31f18b77
FG
5080 map<int, string> bucket_objs;
5081
b3b6e05e 5082 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77
FG
5083 if (r < 0) {
5084 return r;
5085 }
5086
39ae355f
TL
5087 r = CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
5088 if (r < 0) {
5089 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
5090 ": unable to issue set bucket resharding, r=" << r << " (" <<
5091 cpp_strerror(-r) << ")" << dendl;
5092 }
5093
5094 return r;
31f18b77 5095}
7c673cae 5096
b3b6e05e 5097int RGWRados::defer_gc(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
7c673cae
FG
5098{
5099 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5100 std::string oid, key;
5101 get_obj_bucket_and_oid_loc(obj, oid, key);
5102 if (!rctx)
5103 return 0;
5104
5105 RGWObjState *state = NULL;
5106
b3b6e05e 5107 int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, false, y);
7c673cae
FG
5108 if (r < 0)
5109 return r;
5110
5111 if (!state->is_atomic) {
b3b6e05e 5112 ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
7c673cae
FG
5113 return -EINVAL;
5114 }
5115
181888fb
FG
5116 string tag;
5117
5118 if (state->tail_tag.length() > 0) {
5119 tag = state->tail_tag.c_str();
5120 } else if (state->obj_tag.length() > 0) {
5121 tag = state->obj_tag.c_str();
5122 } else {
b3b6e05e 5123 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
7c673cae
FG
5124 return -EINVAL;
5125 }
5126
b3b6e05e 5127 ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
7c673cae 5128
9f95a23c 5129 cls_rgw_obj_chain chain;
b3b6e05e 5130 update_gc_chain(dpp, state->obj, *state->manifest, &chain);
9f95a23c 5131 return gc->async_defer_chain(tag, chain);
7c673cae
FG
5132}
5133
5134void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
5135{
5136 list<string> prefixes;
5137 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
5138 cls_rgw_remove_obj(op, prefixes);
5139}
5140
5141void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
5142{
5143 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
5144}
5145
5146void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
5147{
5148 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
5149}
5150
9f95a23c
TL
5151struct tombstone_entry {
5152 ceph::real_time mtime;
5153 uint32_t zone_short_id;
5154 uint64_t pg_ver;
5155
5156 tombstone_entry() = default;
5157 explicit tombstone_entry(const RGWObjState& state)
5158 : mtime(state.mtime), zone_short_id(state.zone_short_id),
5159 pg_ver(state.pg_ver) {}
5160};
7c673cae
FG
5161
5162/**
5163 * Delete an object.
5164 * bucket: name of the bucket storing the object
5165 * obj: name of the object to delete
5166 * Returns: 0 on success, -ERR# otherwise.
5167 */
// Delete the object this operation targets.
//
// Two paths, selected by params.versioning_status / params.marker_version_id:
//  - versioned bucket or explicit marker version: either an OLH update is
//    issued via set_olh() (result.delete_marker is set to true), or the named
//    instance is unlinked via unlink_obj_instance(); a data log entry is then
//    added and the function returns.
//  - otherwise: the head object is removed with a single rados write op,
//    bracketed by a bucket index prepare and complete_del()/cancel().
//
// result.version_id and result.delete_marker are filled in on the versioned
// path. Returns 0 on success, a negative error code otherwise
// (-ERR_PRECONDITION_FAILED for a failed unmod_since / expiration_time
// check, -ENOENT if the object state says it does not exist).
b3b6e05e 5168int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
5169{
5170 RGWRados *store = target->get_store();
5171 rgw_obj& src_obj = target->get_obj();
5172 const string& instance = src_obj.key.instance;
5173 rgw_obj obj = src_obj;
5174
  // the literal instance "null" is handled as the empty instance
5175 if (instance == "null") {
5176 obj.key.instance.clear();
5177 }
5178
5179 bool explicit_marker_version = (!params.marker_version_id.empty());
5180
5181 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
5182 if (instance.empty() || explicit_marker_version) {
5183 rgw_obj marker = obj;
5184
  // pick the marker's instance: the explicit id (unless it is "null"), or a
  // random one when versioning is not suspended; otherwise it stays as is
5185 if (!params.marker_version_id.empty()) {
5186 if (params.marker_version_id != "null") {
5187 marker.key.set_instance(params.marker_version_id);
5188 }
5189 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
5190 store->gen_rand_obj_instance_name(&marker);
5191 }
5192
5193 result.version_id = marker.key.instance;
91327a77
AA
5194 if (result.version_id.empty())
5195 result.version_id = "null";
7c673cae
FG
5196 result.delete_marker = true;
5197
5198 struct rgw_bucket_dir_entry_meta meta;
5199
5200 meta.owner = params.obj_owner.get_id().to_str();
5201 meta.owner_display_name = params.obj_owner.get_display_name();
5202
  // a zero params.mtime means "now"
5203 if (real_clock::is_zero(params.mtime)) {
5204 meta.mtime = real_clock::now();
5205 } else {
5206 meta.mtime = params.mtime;
5207 }
5208
  // NOTE(review): the literal 'true' passed to set_olh() presumably marks
  // this OLH update as a delete marker -- confirm against set_olh()
b3b6e05e 5209 int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
7c673cae
FG
5210 if (r < 0) {
5211 return r;
5212 }
5213 } else {
  // a specific instance was named: report whether it is a delete marker,
  // then unlink that instance
5214 rgw_bucket_dir_entry dirent;
5215
b3b6e05e 5216 int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
7c673cae
FG
5217 if (r < 0) {
5218 return r;
5219 }
5220 result.delete_marker = dirent.is_delete_marker();
b3b6e05e 5221 r = store->unlink_obj_instance(dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
7c673cae
FG
5222 if (r < 0) {
5223 return r;
5224 }
5225 result.version_id = instance;
5226 }
5227
  // both versioned branches finish by adding a data log entry for the
  // object's bucket shard
20effc67 5228 BucketShard *bs = nullptr;
b3b6e05e 5229 int r = target->get_bucket_shard(&bs, dpp);
7c673cae 5230 if (r < 0) {
b3b6e05e 5231 ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
7c673cae
FG
5232 return r;
5233 }
5234
b3b6e05e 5235 r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 5236 if (r < 0) {
b3b6e05e 5237 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
9f95a23c 5238 return r;
7c673cae
FG
5239 }
5240
5241 return 0;
5242 }
5243
  // ---- non-versioned path ----
5244 rgw_rados_ref ref;
b3b6e05e 5245 int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
7c673cae
FG
5246 if (r < 0) {
5247 return r;
5248 }
5249
5250 RGWObjState *state;
b3b6e05e 5251 r = target->get_state(dpp, &state, false, y);
7c673cae
FG
5252 if (r < 0)
5253 return r;
5254
5255 ObjectWriteOperation op;
5256
  // If-Unmodified-Since: checked here against the cached state and added to
  // the write op as an mtime check; sub-second precision is dropped from the
  // local comparison unless high_precision_time is set
5257 if (!real_clock::is_zero(params.unmod_since)) {
5258 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
5259 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
5260 if (!params.high_precision_time) {
5261 ctime.tv_nsec = 0;
5262 unmod.tv_nsec = 0;
5263 }
5264
b3b6e05e 5265 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
7c673cae
FG
5266 if (ctime > unmod) {
5267 return -ERR_PRECONDITION_FAILED;
5268 }
5269
5270 /* only delete object if mtime is less than or equal to params.unmod_since */
5271 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
5272 }
11fdf7f2 5273 uint64_t obj_accounted_size = state->accounted_size;
7c673cae 5274
  // when params.abortmp is set, the size used for the quota update below is
  // params.parts_accounted_size instead of the state's accounted size
9f95a23c
TL
5275 if(params.abortmp) {
5276 obj_accounted_size = params.parts_accounted_size;
5277 }
5278
  // a non-zero expiration_time must equal the object's stored
  // RGW_ATTR_DELETE_AT; a missing attribute also fails the precondition
7c673cae
FG
5279 if (!real_clock::is_zero(params.expiration_time)) {
5280 bufferlist bl;
5281 real_time delete_at;
5282
5283 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5284 try {
11fdf7f2
TL
5285 auto iter = bl.cbegin();
5286 decode(delete_at, iter);
7c673cae 5287 } catch (buffer::error& err) {
b3b6e05e 5288 ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
7c673cae
FG
5289 return -EIO;
5290 }
5291
5292 if (params.expiration_time != delete_at) {
5293 return -ERR_PRECONDITION_FAILED;
5294 }
5295 } else {
5296 return -ERR_PRECONDITION_FAILED;
5297 }
5298 }
5299
5300 if (!state->exists) {
5301 target->invalidate_state();
5302 return -ENOENT;
5303 }
5304
b3b6e05e 5305 r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
7c673cae
FG
5306 if (r < 0)
5307 return r;
5308
5309 RGWBucketInfo& bucket_info = target->get_bucket_info();
5310
5311 RGWRados::Bucket bop(store, bucket_info);
5312 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
5313
5314 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
5315 index_op.set_bilog_flags(params.bilog_flags);
5316
  // index prepare comes first; it is later matched by complete_del() when
  // the rados op succeeds, or by cancel() when it fails
b3b6e05e 5317 r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
7c673cae
FG
5318 if (r < 0)
5319 return r;
5320
5321 store->remove_rgw_head_obj(op);
9f95a23c
TL
5322
5323 auto& ioctx = ref.pool.ioctx();
b3b6e05e 5324 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
94b18763
FG
5325
5326 /* raced with another operation, object state is indeterminate */
5327 const bool need_invalidate = (r == -ECANCELED);
7c673cae 5328
9f95a23c 5329 int64_t poolid = ioctx.get_id();
7c673cae
FG
5330 if (r >= 0) {
  // success: record a tombstone (if the cache exists), complete the index
  // delete, then release the object's tail via complete_atomic_modification()
5331 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5332 if (obj_tombstone_cache) {
5333 tombstone_entry entry{*state};
5334 obj_tombstone_cache->add(obj, entry);
5335 }
b3b6e05e 5336 r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 5337
b3b6e05e 5338 int ret = target->complete_atomic_modification(dpp);
7c673cae 5339 if (ret < 0) {
b3b6e05e 5340 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
7c673cae
FG
5341 }
5342 /* other than that, no need to propagate error */
224ce89b 5343 } else {
  // failure: cancel the index operation; the rados error in r is what gets
  // returned below, a cancel failure is only logged
20effc67 5344 int ret = index_op.cancel(dpp, params.remove_objs);
224ce89b 5345 if (ret < 0) {
b3b6e05e 5346 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
224ce89b 5347 }
7c673cae
FG
5348 }
5349
5350 if (need_invalidate) {
5351 target->invalidate_state();
5352 }
5353
5354 if (r < 0)
5355 return r;
5356
5357 /* update quota cache */
  // NOTE(review): the -1 / 0 arguments presumably are the object-count delta
  // and added bytes, with obj_accounted_size as removed bytes -- confirm
  // against the quota handler's update_stats()
11fdf7f2 5358 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
7c673cae
FG
5359
5360 return 0;
5361}
5362
b3b6e05e
TL
5363int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
5364 RGWObjectCtx& obj_ctx,
7c673cae
FG
5365 const RGWBucketInfo& bucket_info,
5366 const rgw_obj& obj,
20effc67 5367 int versioning_status, // versioning flags defined in enum RGWBucketFlags
7c673cae 5368 uint16_t bilog_flags,
31f18b77
FG
5369 const real_time& expiration_time,
5370 rgw_zone_set *zones_trace)
7c673cae
FG
5371{
5372 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5373 RGWRados::Object::Delete del_op(&del_target);
5374
5375 del_op.params.bucket_owner = bucket_info.owner;
5376 del_op.params.versioning_status = versioning_status;
5377 del_op.params.bilog_flags = bilog_flags;
5378 del_op.params.expiration_time = expiration_time;
31f18b77 5379 del_op.params.zones_trace = zones_trace;
7c673cae 5380
b3b6e05e 5381 return del_op.delete_obj(null_yield, dpp);
7c673cae
FG
5382}
5383
b3b6e05e 5384int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
7c673cae
FG
5385{
5386 rgw_rados_ref ref;
b3b6e05e 5387 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
5388 if (r < 0) {
5389 return r;
5390 }
5391
5392 ObjectWriteOperation op;
5393
5394 op.remove();
b3b6e05e 5395 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
5396 if (r < 0)
5397 return r;
5398
5399 return 0;
5400}
5401
b3b6e05e 5402int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
7c673cae
FG
5403{
5404 std::string oid, key;
5405 get_obj_bucket_and_oid_loc(obj, oid, key);
5406
11fdf7f2 5407 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5408
5409 RGWBucketInfo bucket_info;
b3b6e05e 5410 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
7c673cae 5411 if (ret < 0) {
b3b6e05e 5412 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
7c673cae
FG
5413 return ret;
5414 }
5415
5416 RGWRados::Bucket bop(this, bucket_info);
5417 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5418
b3b6e05e 5419 return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
7c673cae
FG
5420}
5421
20effc67 5422static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Store* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
7c673cae
FG
5423{
5424 string tag;
5425
b3b6e05e
TL
5426 RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
5427 if (mi != manifest.obj_end(dpp)) {
7c673cae
FG
5428 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5429 ++mi;
20effc67
TL
5430 rgw::sal::RadosStore* rstore = dynamic_cast<rgw::sal::RadosStore*>(store);
5431 tag = mi.get_location().get_raw_obj(rstore).oid;
7c673cae
FG
5432 tag.append("_");
5433 }
5434
5435 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5436 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5437 MD5 hash;
20effc67
TL
5438 // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
5439 hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
11fdf7f2 5440 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
7c673cae
FG
5441
5442 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5443 if (iter != attrset.end()) {
5444 bufferlist& bl = iter->second;
11fdf7f2 5445 hash.Update((const unsigned char *)bl.c_str(), bl.length());
7c673cae
FG
5446 }
5447
5448 hash.Final(md5);
5449 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5450 tag.append(md5_str);
5451
20effc67 5452 ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;
7c673cae
FG
5453
5454 tag_bl.append(tag.c_str(), tag.size() + 1);
5455}
5456
5457static bool is_olh(map<string, bufferlist>& attrs)
5458{
5459 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5460 return (iter != attrs.end());
5461}
5462
5463static bool has_olh_tag(map<string, bufferlist>& attrs)
5464{
5465 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5466 return (iter != attrs.end());
5467}
5468
b3b6e05e 5469int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5470 RGWObjState *olh_state, RGWObjState **target_state, optional_yield y)
7c673cae 5471{
11fdf7f2 5472 ceph_assert(olh_state->is_olh);
7c673cae
FG
5473
5474 rgw_obj target;
b3b6e05e 5475 int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
7c673cae
FG
5476 if (r < 0) {
5477 return r;
5478 }
b3b6e05e 5479 r = get_obj_state(dpp, &obj_ctx, bucket_info, target, target_state, false, y);
7c673cae
FG
5480 if (r < 0) {
5481 return r;
5482 }
5483
5484 return 0;
5485}
5486
b3b6e05e 5487int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5488 RGWObjState **state, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5489{
5490 if (obj.empty()) {
5491 return -EINVAL;
5492 }
5493
5494 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5495
11fdf7f2 5496 RGWObjState *s = rctx->get_state(obj);
b3b6e05e 5497 ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
7c673cae
FG
5498 *state = s;
5499 if (s->has_attrs) {
5500 if (s->is_olh && need_follow_olh) {
b3b6e05e 5501 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
7c673cae
FG
5502 }
5503 return 0;
5504 }
5505
5506 s->obj = obj;
5507
5508 rgw_raw_obj raw_obj;
5509 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5510
5511 int r = -ENOENT;
5512
5513 if (!assume_noent) {
b3b6e05e 5514 r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
7c673cae
FG
5515 }
5516
5517 if (r == -ENOENT) {
5518 s->exists = false;
5519 s->has_attrs = true;
5520 tombstone_entry entry;
5521 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5522 s->mtime = entry.mtime;
5523 s->zone_short_id = entry.zone_short_id;
5524 s->pg_ver = entry.pg_ver;
b3b6e05e 5525 ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
7c673cae
FG
5526 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5527 } else {
5528 s->mtime = real_time();
5529 }
5530 return 0;
5531 }
5532 if (r < 0)
5533 return r;
5534
5535 s->exists = true;
5536 s->has_attrs = true;
5537 s->accounted_size = s->size;
5538
11fdf7f2
TL
5539 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5540 if (iter != s->attrset.end()) {
5541 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5542 bufferlist& bletag = iter->second;
5543 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5544 bufferlist newbl;
5545 bletag.splice(0, bletag.length() - 1, &newbl);
f67539c2 5546 bletag = std::move(newbl);
11fdf7f2
TL
5547 }
5548 }
5549
5550 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
5551 const bool compressed = (iter != s->attrset.end());
5552 if (compressed) {
7c673cae
FG
5553 // use uncompressed size for accounted_size
5554 try {
5555 RGWCompressionInfo info;
11fdf7f2
TL
5556 auto p = iter->second.cbegin();
5557 decode(info, p);
31f18b77 5558 s->accounted_size = info.orig_size;
7c673cae 5559 } catch (buffer::error&) {
b3b6e05e 5560 ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
7c673cae
FG
5561 return -EIO;
5562 }
5563 }
5564
5565 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5566 if (iter != s->attrset.end()) {
5567 bufferlist bl = iter->second;
5568 bufferlist::iterator it = bl.begin();
5569 it.copy(bl.length(), s->shadow_obj);
5570 s->shadow_obj[bl.length()] = '\0';
5571 }
5572 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
5573 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5574 if (ttiter != s->attrset.end()) {
5575 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5576 }
7c673cae
FG
5577
5578 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5579 if (manifest_bl.length()) {
11fdf7f2 5580 auto miter = manifest_bl.cbegin();
7c673cae 5581 try {
9f95a23c
TL
5582 s->manifest.emplace();
5583 decode(*s->manifest, miter);
5584 s->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
7c673cae 5585 broken due to old bugs */
9f95a23c 5586 s->size = s->manifest->get_obj_size();
31f18b77
FG
5587 if (!compressed)
5588 s->accounted_size = s->size;
7c673cae 5589 } catch (buffer::error& err) {
b3b6e05e 5590 ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
7c673cae
FG
5591 return -EIO;
5592 }
b3b6e05e 5593 ldpp_dout(dpp, 10) << "manifest: total_size = " << s->manifest->get_obj_size() << dendl;
11fdf7f2 5594 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
9f95a23c 5595 s->manifest->has_explicit_objs()) {
7c673cae 5596 RGWObjManifest::obj_iterator mi;
b3b6e05e
TL
5597 for (mi = s->manifest->obj_begin(dpp); mi != s->manifest->obj_end(dpp); ++mi) {
5598 ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(store) << dendl;
7c673cae
FG
5599 }
5600 }
5601
5602 if (!s->obj_tag.length()) {
5603 /*
5604 * Uh oh, something's wrong, object with manifest should have tag. Let's
5605 * create one out of the manifest, would be unique
5606 */
b3b6e05e 5607 generate_fake_tag(dpp, store, s->attrset, *s->manifest, manifest_bl, s->obj_tag);
7c673cae
FG
5608 s->fake_tag = true;
5609 }
5610 }
5611 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5612 if (aiter != s->attrset.end()) {
5613 bufferlist& pg_ver_bl = aiter->second;
5614 if (pg_ver_bl.length()) {
11fdf7f2 5615 auto pgbl = pg_ver_bl.cbegin();
7c673cae 5616 try {
11fdf7f2 5617 decode(s->pg_ver, pgbl);
7c673cae 5618 } catch (buffer::error& err) {
b3b6e05e 5619 ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
7c673cae
FG
5620 }
5621 }
5622 }
5623 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5624 if (aiter != s->attrset.end()) {
5625 bufferlist& zone_short_id_bl = aiter->second;
5626 if (zone_short_id_bl.length()) {
11fdf7f2 5627 auto zbl = zone_short_id_bl.cbegin();
7c673cae 5628 try {
11fdf7f2 5629 decode(s->zone_short_id, zbl);
7c673cae 5630 } catch (buffer::error& err) {
b3b6e05e 5631 ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
7c673cae
FG
5632 }
5633 }
5634 }
b3b6e05e
TL
5635 if (s->obj_tag.length()) {
5636 ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
5637 } else {
5638 ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5639 }
7c673cae
FG
5640
5641 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5642 * it exist, and not only if is_olh() returns true
5643 */
5644 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5645 if (iter != s->attrset.end()) {
5646 s->olh_tag = iter->second;
5647 }
5648
5649 if (is_olh(s->attrset)) {
5650 s->is_olh = true;
5651
b3b6e05e 5652 ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
7c673cae
FG
5653
5654 if (need_follow_olh) {
b3b6e05e 5655 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
9f95a23c 5656 } else if (obj.key.have_null_instance() && !s->manifest) {
11fdf7f2
TL
5657 // read null version, and the head object only have olh info
5658 s->exists = false;
5659 return -ENOENT;
7c673cae
FG
5660 }
5661 }
5662
5663 return 0;
5664}
5665
b3b6e05e 5666int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9f95a23c 5667 bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5668{
5669 int ret;
5670
5671 do {
b3b6e05e 5672 ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, follow_olh, y, assume_noent);
7c673cae
FG
5673 } while (ret == -EAGAIN);
5674
5675 return ret;
5676}
5677
b3b6e05e 5678int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
7c673cae
FG
5679{
5680 RGWObjState *astate;
b3b6e05e 5681 int r = get_state(dpp, &astate, true, y);
7c673cae
FG
5682 if (r < 0) {
5683 return r;
5684 }
5685
9f95a23c 5686 *pmanifest = &(*astate->manifest);
7c673cae
FG
5687
5688 return 0;
5689}
5690
b3b6e05e 5691int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
7c673cae
FG
5692{
5693 RGWObjState *state;
b3b6e05e 5694 int r = source->get_state(dpp, &state, true, y);
7c673cae
FG
5695 if (r < 0)
5696 return r;
5697 if (!state->exists)
5698 return -ENOENT;
5699 if (!state->get_attr(name, dest))
5700 return -ENODATA;
5701
5702 return 0;
5703}
5704
b3b6e05e 5705int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
7c673cae
FG
5706{
5707 RGWObjectCtx& ctx = source->get_ctx();
5708 rgw_obj& obj = source->get_obj();
5709 RGWRados *store = source->get_store();
5710
11fdf7f2 5711 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
7c673cae
FG
5712 result.obj = obj;
5713 if (s->has_attrs) {
5714 state.ret = 0;
5715 result.size = s->size;
5716 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5717 result.attrs = s->attrset;
7c673cae
FG
5718 result.manifest = s->manifest;
5719 return 0;
5720 }
5721
5722 string oid;
5723 string loc;
5724 get_obj_bucket_and_oid_loc(obj, oid, loc);
5725
b3b6e05e 5726 int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
7c673cae
FG
5727 if (r < 0) {
5728 return r;
5729 }
5730
5731 librados::ObjectReadOperation op;
5732 op.stat2(&result.size, &result.mtime, NULL);
5733 op.getxattrs(&result.attrs, NULL);
9f95a23c 5734 state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
7c673cae
FG
5735 state.io_ctx.locator_set_key(loc);
5736 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5737 if (r < 0) {
b3b6e05e 5738 ldpp_dout(dpp, 5) << __func__
7c673cae
FG
5739 << ": ERROR: aio_operate() returned ret=" << r
5740 << dendl;
5741 return r;
5742 }
5743
5744 return 0;
5745}
5746
5747
20effc67 5748int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
7c673cae
FG
5749{
5750 if (!state.completion) {
5751 return state.ret;
5752 }
5753
9f95a23c 5754 state.completion->wait_for_complete();
7c673cae
FG
5755 state.ret = state.completion->get_return_value();
5756 state.completion->release();
5757
5758 if (state.ret != 0) {
5759 return state.ret;
5760 }
5761
20effc67 5762 return finish(dpp);
7c673cae
FG
5763}
5764
20effc67 5765int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
7c673cae
FG
5766{
5767 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5768 if (iter != result.attrs.end()) {
5769 bufferlist& bl = iter->second;
11fdf7f2 5770 auto biter = bl.cbegin();
7c673cae 5771 try {
9f95a23c
TL
5772 result.manifest.emplace();
5773 decode(*result.manifest, biter);
7c673cae 5774 } catch (buffer::error& err) {
20effc67 5775 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
7c673cae
FG
5776 return -EIO;
5777 }
7c673cae
FG
5778 }
5779
5780 return 0;
5781}
5782
b3b6e05e 5783int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
7c673cae 5784 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5785 ObjectOperation& op, RGWObjState **pstate, optional_yield y)
7c673cae
FG
5786{
5787 if (!rctx)
5788 return 0;
5789
b3b6e05e 5790 int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, false, y);
7c673cae
FG
5791 if (r < 0)
5792 return r;
5793
b3b6e05e 5794 return append_atomic_test(dpp, *pstate, op);
11fdf7f2 5795}
7c673cae 5796
b3b6e05e
TL
5797int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
5798 const RGWObjState* state,
11fdf7f2
TL
5799 librados::ObjectOperation& op)
5800{
7c673cae 5801 if (!state->is_atomic) {
b3b6e05e 5802 ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
7c673cae
FG
5803 return 0;
5804 }
5805
5806 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
5807 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5808 } else {
b3b6e05e 5809 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
7c673cae
FG
5810 }
5811 return 0;
5812}
5813
b3b6e05e 5814int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae 5815{
b3b6e05e 5816 return store->get_obj_state(dpp, &ctx, bucket_info, obj, pstate, follow_olh, y, assume_noent);
7c673cae
FG
5817}
5818
5819void RGWRados::Object::invalidate_state()
5820{
11fdf7f2 5821 ctx.invalidate(obj);
7c673cae
FG
5822}
5823
20effc67 5824int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
b3b6e05e 5825 ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb 5826 const char *if_match, const char *if_nomatch, bool removal_op,
9f95a23c 5827 bool modify_tail, optional_yield y)
7c673cae 5828{
b3b6e05e 5829 int r = get_state(dpp, &state, false, y);
7c673cae
FG
5830 if (r < 0)
5831 return r;
5832
9f95a23c 5833 bool need_guard = ((state->manifest) || (state->obj_tag.length() != 0) ||
7c673cae
FG
5834 if_match != NULL || if_nomatch != NULL) &&
5835 (!state->fake_tag);
5836
5837 if (!state->is_atomic) {
b3b6e05e 5838 ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
7c673cae
FG
5839
5840 if (reset_obj) {
5841 op.create(false);
5842 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
5843 }
5844
5845 return 0;
5846 }
5847
5848 if (need_guard) {
5849 /* first verify that the object wasn't replaced under */
5850 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
5851 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5852 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5853 }
5854
5855 if (if_match) {
5856 if (strcmp(if_match, "*") == 0) {
5857 // test the object is existing
5858 if (!state->exists) {
5859 return -ERR_PRECONDITION_FAILED;
5860 }
5861 } else {
5862 bufferlist bl;
5863 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5864 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
5865 return -ERR_PRECONDITION_FAILED;
5866 }
5867 }
5868 }
5869
5870 if (if_nomatch) {
5871 if (strcmp(if_nomatch, "*") == 0) {
5872 // test the object is NOT existing
5873 if (state->exists) {
5874 return -ERR_PRECONDITION_FAILED;
5875 }
5876 } else {
5877 bufferlist bl;
5878 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5879 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
5880 return -ERR_PRECONDITION_FAILED;
5881 }
5882 }
5883 }
5884 }
5885
5886 if (reset_obj) {
5887 if (state->exists) {
5888 op.create(false);
5889 store->remove_rgw_head_obj(op);
5890 } else {
5891 op.create(true);
5892 }
5893 }
5894
5895 if (removal_op) {
5896 /* the object is being removed, no need to update its tag */
5897 return 0;
5898 }
5899
5900 if (ptag) {
5901 state->write_tag = *ptag;
5902 } else {
5903 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
5904 }
5905 bufferlist bl;
5906 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
5907
b3b6e05e 5908 ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
7c673cae
FG
5909
5910 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
5911 if (modify_tail) {
5912 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
5913 }
7c673cae
FG
5914
5915 return 0;
5916}
5917
7c673cae
FG
5918/**
5919 * Set an attr on an object.
5920 * bucket: name of the bucket holding the object
5921 * obj: name of the object to set the attr on
5922 * name: the attr to set
5923 * bl: the contents of the attr
5924 * Returns: 0 on success, -ERR# otherwise.
5925 */
b3b6e05e 5926int RGWRados::set_attr(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
7c673cae
FG
5927{
5928 map<string, bufferlist> attrs;
5929 attrs[name] = bl;
b3b6e05e 5930 return set_attrs(dpp, ctx, bucket_info, obj, attrs, NULL, null_yield);
7c673cae
FG
5931}
5932
b3b6e05e 5933int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
7c673cae 5934 map<string, bufferlist>& attrs,
9f95a23c
TL
5935 map<string, bufferlist>* rmattrs,
5936 optional_yield y)
7c673cae 5937{
494da23a
TL
5938 rgw_obj obj = src_obj;
5939 if (obj.key.instance == "null") {
5940 obj.key.instance.clear();
5941 }
5942
7c673cae 5943 rgw_rados_ref ref;
b3b6e05e 5944 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
5945 if (r < 0) {
5946 return r;
5947 }
5948 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5949
5950 ObjectWriteOperation op;
5951 RGWObjState *state = NULL;
5952
b3b6e05e 5953 r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, y);
7c673cae
FG
5954 if (r < 0)
5955 return r;
5956
494da23a 5957 // ensure null version object exist
9f95a23c 5958 if (src_obj.key.instance == "null" && !state->manifest) {
494da23a
TL
5959 return -ENOENT;
5960 }
5961
7c673cae
FG
5962 map<string, bufferlist>::iterator iter;
5963 if (rmattrs) {
5964 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5965 const string& name = iter->first;
5966 op.rmxattr(name.c_str());
5967 }
5968 }
5969
5970 const rgw_bucket& bucket = obj.bucket;
5971
5972 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5973 const string& name = iter->first;
5974 bufferlist& bl = iter->second;
5975
5976 if (!bl.length())
5977 continue;
5978
5979 op.setxattr(name.c_str(), bl);
5980
5981 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
5982 real_time ts;
5983 try {
11fdf7f2 5984 decode(ts, bl);
7c673cae
FG
5985
5986 rgw_obj_index_key obj_key;
5987 obj.key.get_index_key(&obj_key);
5988
b3b6e05e 5989 obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
7c673cae 5990 } catch (buffer::error& err) {
b3b6e05e 5991 ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
7c673cae
FG
5992 }
5993 }
5994 }
5995
5996 if (!op.size())
5997 return 0;
5998
9f95a23c 5999 RGWObjectCtx obj_ctx(this->store);
7c673cae
FG
6000
6001 bufferlist bl;
6002 RGWRados::Bucket bop(this, bucket_info);
6003 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
6004
6005 if (state) {
6006 string tag;
6007 append_rand_alpha(cct, tag, tag, 32);
6008 state->write_tag = tag;
b3b6e05e 6009 r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
7c673cae
FG
6010
6011 if (r < 0)
6012 return r;
6013
6014 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
6015 op.setxattr(RGW_ATTR_ID_TAG, bl);
6016 }
6017
3efd9988
FG
6018
6019 real_time mtime = real_clock::now();
6020 struct timespec mtime_ts = real_clock::to_timespec(mtime);
6021 op.mtime2(&mtime_ts);
9f95a23c 6022 auto& ioctx = ref.pool.ioctx();
b3b6e05e 6023 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
7c673cae
FG
6024 if (state) {
6025 if (r >= 0) {
6026 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
6027 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
6028 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
11fdf7f2
TL
6029 string etag = rgw_bl_str(etag_bl);
6030 string content_type = rgw_bl_str(content_type_bl);
6031 string storage_class;
6032 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
6033 if (iter != attrs.end()) {
6034 storage_class = rgw_bl_str(iter->second);
6035 }
9f95a23c
TL
6036 uint64_t epoch = ioctx.get_last_version();
6037 int64_t poolid = ioctx.get_id();
b3b6e05e 6038 r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
11fdf7f2
TL
6039 mtime, etag, content_type, storage_class, &acl_bl,
6040 RGWObjCategory::Main, NULL);
7c673cae 6041 } else {
20effc67 6042 int ret = index_op.cancel(dpp, nullptr);
7c673cae 6043 if (ret < 0) {
b3b6e05e 6044 ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
7c673cae
FG
6045 }
6046 }
6047 }
6048 if (r < 0)
6049 return r;
6050
6051 if (state) {
6052 state->obj_tag.swap(bl);
6053 if (rmattrs) {
6054 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6055 state->attrset.erase(iter->first);
6056 }
6057 }
92f5a8d4 6058
7c673cae
FG
6059 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6060 state->attrset[iter->first] = iter->second;
6061 }
92f5a8d4
TL
6062
6063 auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
6064 if (iter != state->attrset.end()) {
6065 iter->second = state->obj_tag;
6066 }
7c673cae
FG
6067 }
6068
6069 return 0;
6070}
6071
b3b6e05e 6072int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
6073{
6074 RGWRados *store = source->get_store();
6075 CephContext *cct = store->ctx();
6076
6077 bufferlist etag;
6078
6079 map<string, bufferlist>::iterator iter;
6080
6081 RGWObjState *astate;
b3b6e05e 6082 int r = source->get_state(dpp, &astate, true, y);
7c673cae
FG
6083 if (r < 0)
6084 return r;
6085
6086 if (!astate->exists) {
6087 return -ENOENT;
6088 }
6089
6090 const RGWBucketInfo& bucket_info = source->get_bucket_info();
6091
6092 state.obj = astate->obj;
6093 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
6094
11fdf7f2
TL
6095 state.cur_pool = state.head_obj.pool;
6096 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
6097
b3b6e05e 6098 r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
7c673cae
FG
6099 if (r < 0) {
6100 return r;
6101 }
eafe8130
TL
6102 if (params.target_obj) {
6103 *params.target_obj = state.obj;
6104 }
7c673cae
FG
6105 if (params.attrs) {
6106 *params.attrs = astate->attrset;
11fdf7f2 6107 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
7c673cae 6108 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
b3b6e05e 6109 ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
7c673cae
FG
6110 }
6111 }
6112 }
6113
6114 /* Convert all times go GMT to make them compatible */
6115 if (conds.mod_ptr || conds.unmod_ptr) {
6116 obj_time_weight src_weight;
6117 src_weight.init(astate);
6118 src_weight.high_precision = conds.high_precision_time;
6119
6120 obj_time_weight dest_weight;
6121 dest_weight.high_precision = conds.high_precision_time;
6122
9f95a23c 6123 if (conds.mod_ptr && !conds.if_nomatch) {
7c673cae 6124 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
b3b6e05e 6125 ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
7c673cae
FG
6126 if (!(dest_weight < src_weight)) {
6127 return -ERR_NOT_MODIFIED;
6128 }
6129 }
6130
9f95a23c 6131 if (conds.unmod_ptr && !conds.if_match) {
7c673cae 6132 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
b3b6e05e 6133 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
7c673cae
FG
6134 if (dest_weight < src_weight) {
6135 return -ERR_PRECONDITION_FAILED;
6136 }
6137 }
6138 }
6139 if (conds.if_match || conds.if_nomatch) {
b3b6e05e 6140 r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
7c673cae
FG
6141 if (r < 0)
6142 return r;
6143
6144 if (conds.if_match) {
6145 string if_match_str = rgw_string_unquote(conds.if_match);
b3b6e05e 6146 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
11fdf7f2 6147 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
7c673cae
FG
6148 return -ERR_PRECONDITION_FAILED;
6149 }
6150 }
6151
6152 if (conds.if_nomatch) {
6153 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
b3b6e05e 6154 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
11fdf7f2 6155 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
7c673cae
FG
6156 return -ERR_NOT_MODIFIED;
6157 }
6158 }
6159 }
6160
6161 if (params.obj_size)
6162 *params.obj_size = astate->size;
6163 if (params.lastmod)
6164 *params.lastmod = astate->mtime;
6165
6166 return 0;
6167}
6168
6169int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
6170{
6171 if (ofs < 0) {
6172 ofs += obj_size;
11fdf7f2
TL
6173 if (ofs < 0)
6174 ofs = 0;
6175 end = obj_size - 1;
6176 } else if (end < 0) {
6177 end = obj_size - 1;
7c673cae
FG
6178 }
6179
11fdf7f2
TL
6180 if (obj_size > 0) {
6181 if (ofs >= (off_t)obj_size) {
6182 return -ERANGE;
6183 }
6184 if (end >= (off_t)obj_size) {
6185 end = obj_size - 1;
7c673cae
FG
6186 }
6187 }
7c673cae
FG
6188 return 0;
6189}
6190
b3b6e05e 6191int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, BucketShard **pbs, std::function<int(BucketShard *)> call)
31f18b77
FG
6192{
6193 RGWRados *store = target->get_store();
20effc67 6194 BucketShard *bs = nullptr;
31f18b77
FG
6195 int r;
6196
6197#define NUM_RESHARD_RETRIES 10
6198 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
b3b6e05e 6199 int ret = get_bucket_shard(&bs, dpp);
31f18b77 6200 if (ret < 0) {
b3b6e05e 6201 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
31f18b77
FG
6202 return ret;
6203 }
39ae355f 6204
31f18b77 6205 r = call(bs);
39ae355f 6206 if (r != -ERR_BUSY_RESHARDING && r != -ENOENT) {
31f18b77
FG
6207 break;
6208 }
39ae355f
TL
6209
6210 std::string new_bucket_id;
6211
6212 // different logic depending whether resharding completed or is
6213 // underway
6214
6215 if (r == -ENOENT) { // case where resharding must have completed
6216 ldpp_dout(dpp, 0) <<
6217 "NOTICE: resharding operation recently completed, invalidating "
6218 "old BucketInfo" << dendl;
6219
6220 r = store->fetch_new_bucket_id(target->bucket_info,
6221 nullptr,
6222 new_bucket_id, dpp);
6223 if (r == -ENOENT) {
6224 // apparently this op raced with a bucket deletion
6225 ldpp_dout(dpp, 10) << "WARNING: " << __func__ <<
6226 " unable to fetch bucket_id, apparently due to race "
6227 "with deletion of bucket: " <<
6228 target->bucket_info.bucket.get_key() << dendl;
6229 return -ERR_NO_SUCH_BUCKET;
6230 } else if (r < 0) {
6231 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
6232 " unable to refresh stale bucket_id after reshard; r=" <<
6233 r << dendl;
6234 return r;
6235 }
6236 } else { // must have been resharding at the time
6237 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6238
6239 r = store->block_while_resharding(bs, &new_bucket_id,
6240 target->bucket_info, null_yield, dpp);
6241 if (r == -ERR_BUSY_RESHARDING) {
6242 continue;
6243 }
6244 if (r < 0) {
6245 return r;
6246 }
6247
6248 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6249 i = 0; /* resharding is finished, make sure we can retry */
31f18b77 6250 }
39ae355f
TL
6251
6252 // common portion -- finished resharding either way
6253
b3b6e05e 6254 r = target->update_bucket_id(new_bucket_id, dpp);
31f18b77 6255 if (r < 0) {
b3b6e05e 6256 ldpp_dout(dpp, 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
31f18b77
FG
6257 return r;
6258 }
39ae355f 6259
31f18b77 6260 invalidate_bs();
81eedcae 6261 } // for loop
31f18b77
FG
6262
6263 if (r < 0) {
6264 return r;
6265 }
6266
6267 if (pbs) {
6268 *pbs = bs;
6269 }
6270
6271 return 0;
6272}
6273
b3b6e05e 6274int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
7c673cae
FG
6275{
6276 if (blind) {
6277 return 0;
6278 }
6279 RGWRados *store = target->get_store();
7c673cae
FG
6280
6281 if (write_tag && write_tag->length()) {
6282 optag = string(write_tag->c_str(), write_tag->length());
6283 } else {
6284 if (optag.empty()) {
6285 append_rand_alpha(store->ctx(), optag, optag, 32);
6286 }
6287 }
6288
b3b6e05e
TL
6289 int r = guard_reshard(dpp, nullptr, [&](BucketShard *bs) -> int {
6290 return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
f64942e4 6291 });
31f18b77 6292
7c673cae
FG
6293 if (r < 0) {
6294 return r;
6295 }
6296 prepared = true;
31f18b77 6297
7c673cae
FG
6298 return 0;
6299}
6300
b3b6e05e 6301int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
7c673cae
FG
6302 uint64_t size, uint64_t accounted_size,
6303 ceph::real_time& ut, const string& etag,
11fdf7f2 6304 const string& content_type, const string& storage_class,
7c673cae
FG
6305 bufferlist *acl_bl,
6306 RGWObjCategory category,
11fdf7f2
TL
6307 list<rgw_obj_index_key> *remove_objs, const string *user_data,
6308 bool appendable)
7c673cae
FG
6309{
6310 if (blind) {
6311 return 0;
6312 }
6313 RGWRados *store = target->get_store();
20effc67 6314 BucketShard *bs = nullptr;
31f18b77 6315
b3b6e05e 6316 int ret = get_bucket_shard(&bs, dpp);
7c673cae 6317 if (ret < 0) {
b3b6e05e 6318 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
7c673cae
FG
6319 return ret;
6320 }
6321
6322 rgw_bucket_dir_entry ent;
6323 obj.key.get_index_key(&ent.key);
6324 ent.meta.size = size;
6325 ent.meta.accounted_size = accounted_size;
6326 ent.meta.mtime = ut;
6327 ent.meta.etag = etag;
11fdf7f2 6328 ent.meta.storage_class = storage_class;
7c673cae
FG
6329 if (user_data)
6330 ent.meta.user_data = *user_data;
6331
6332 ACLOwner owner;
6333 if (acl_bl && acl_bl->length()) {
20effc67 6334 int ret = store->decode_policy(dpp, *acl_bl, &owner);
7c673cae 6335 if (ret < 0) {
b3b6e05e 6336 ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
7c673cae
FG
6337 }
6338 }
6339 ent.meta.owner = owner.get_id().to_str();
6340 ent.meta.owner_display_name = owner.get_display_name();
6341 ent.meta.content_type = content_type;
11fdf7f2 6342 ent.meta.appendable = appendable;
7c673cae 6343
31f18b77 6344 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 6345
b3b6e05e 6346 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6347 if (r < 0) {
b3b6e05e 6348 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6349 }
6350
6351 return ret;
6352}
6353
20effc67 6354int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
b3b6e05e 6355 int64_t poolid, uint64_t epoch,
7c673cae
FG
6356 real_time& removed_mtime,
6357 list<rgw_obj_index_key> *remove_objs)
6358{
6359 if (blind) {
6360 return 0;
6361 }
6362 RGWRados *store = target->get_store();
20effc67 6363 BucketShard *bs = nullptr;
31f18b77 6364
b3b6e05e 6365 int ret = get_bucket_shard(&bs, dpp);
7c673cae 6366 if (ret < 0) {
b3b6e05e 6367 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
7c673cae
FG
6368 return ret;
6369 }
6370
31f18b77 6371 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 6372
b3b6e05e 6373 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6374 if (r < 0) {
b3b6e05e 6375 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6376 }
6377
6378 return ret;
6379}
6380
6381
20effc67
TL
6382int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
6383 list<rgw_obj_index_key> *remove_objs)
7c673cae
FG
6384{
6385 if (blind) {
6386 return 0;
6387 }
6388 RGWRados *store = target->get_store();
6389 BucketShard *bs;
7c673cae 6390
b3b6e05e 6391 int ret = guard_reshard(dpp, &bs, [&](BucketShard *bs) -> int {
20effc67 6392 return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
f64942e4 6393 });
7c673cae
FG
6394
6395 /*
6396 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6397 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6398 * have no way to tell that they're all caught up
6399 */
b3b6e05e 6400 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6401 if (r < 0) {
b3b6e05e 6402 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6403 }
6404
6405 return ret;
6406}
6407
b3b6e05e 6408int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
6409{
6410 RGWRados *store = source->get_store();
7c673cae 6411
7c673cae
FG
6412 rgw_raw_obj read_obj;
6413 uint64_t read_ofs = ofs;
6414 uint64_t len, read_len;
6415 bool reading_from_head = true;
6416 ObjectReadOperation op;
6417
6418 bool merge_bl = false;
6419 bufferlist *pbl = &bl;
6420 bufferlist read_bl;
6421 uint64_t max_chunk_size;
6422
6423 RGWObjState *astate;
b3b6e05e 6424 int r = source->get_state(dpp, &astate, true, y);
7c673cae
FG
6425 if (r < 0)
6426 return r;
6427
11fdf7f2
TL
6428 if (astate->size == 0) {
6429 end = 0;
6430 } else if (end >= (int64_t)astate->size) {
6431 end = astate->size - 1;
6432 }
6433
7c673cae
FG
6434 if (end < 0)
6435 len = 0;
6436 else
6437 len = end - ofs + 1;
6438
9f95a23c 6439 if (astate->manifest && astate->manifest->has_tail()) {
7c673cae 6440 /* now get the relevant object part */
b3b6e05e 6441 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
7c673cae
FG
6442
6443 uint64_t stripe_ofs = iter.get_stripe_ofs();
f67539c2 6444 read_obj = iter.get_location().get_raw_obj(store->store);
11fdf7f2 6445 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6446 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6447 reading_from_head = (read_obj == state.head_obj);
6448 } else {
6449 read_obj = state.head_obj;
6450 }
6451
b3b6e05e 6452 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
7c673cae 6453 if (r < 0) {
b3b6e05e 6454 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
7c673cae
FG
6455 return r;
6456 }
6457
6458 if (len > max_chunk_size)
6459 len = max_chunk_size;
6460
6461
7c673cae
FG
6462 read_len = len;
6463
6464 if (reading_from_head) {
6465 /* only when reading from the head object do we need to do the atomic test */
b3b6e05e 6466 r = store->append_atomic_test(dpp, &source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, y);
7c673cae
FG
6467 if (r < 0)
6468 return r;
6469
6470 if (astate && astate->prefetch_data) {
6471 if (!ofs && astate->data.length() >= len) {
6472 bl = astate->data;
6473 return bl.length();
6474 }
6475
6476 if (ofs < astate->data.length()) {
11fdf7f2 6477 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
9f95a23c 6478 astate->data.begin(ofs).copy(copy_len, bl);
7c673cae
FG
6479 read_len -= copy_len;
6480 read_ofs += copy_len;
6481 if (!read_len)
6482 return bl.length();
6483
6484 merge_bl = true;
6485 pbl = &read_bl;
6486 }
6487 }
6488 }
6489
b3b6e05e 6490 ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
7c673cae
FG
6491 op.read(read_ofs, read_len, pbl, NULL);
6492
11fdf7f2
TL
6493 if (state.cur_pool != read_obj.pool) {
6494 auto iter = state.io_ctxs.find(read_obj.pool);
6495 if (iter == state.io_ctxs.end()) {
6496 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
b3b6e05e 6497 r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
11fdf7f2 6498 if (r < 0) {
b3b6e05e 6499 ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
11fdf7f2
TL
6500 return r;
6501 }
6502 } else {
6503 state.cur_ioctx = &iter->second;
7c673cae 6504 }
11fdf7f2 6505 state.cur_pool = read_obj.pool;
7c673cae
FG
6506 }
6507
11fdf7f2 6508 state.cur_ioctx->locator_set_key(read_obj.loc);
7c673cae 6509
11fdf7f2 6510 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
b3b6e05e 6511 ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
7c673cae 6512
7c673cae 6513 if (r < 0) {
7c673cae
FG
6514 return r;
6515 }
7c673cae 6516
11fdf7f2
TL
6517 if (merge_bl) {
6518 bl.append(read_bl);
7c673cae
FG
6519 }
6520
7c673cae
FG
6521 return bl.length();
6522}
6523
20effc67
TL
6524int get_obj_data::flush(rgw::AioResultList&& results) {
6525 int r = rgw::check_for_errors(results);
6526 if (r < 0) {
6527 return r;
6528 }
6529 std::list<bufferlist> bl_list;
7c673cae 6530
20effc67
TL
6531 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6532 results.sort(cmp); // merge() requires results to be sorted first
6533 completed.merge(results, cmp); // merge results in sorted order
7c673cae 6534
20effc67
TL
6535 while (!completed.empty() && completed.front().id == offset) {
6536 auto bl = std::move(completed.front().data);
7c673cae 6537
20effc67
TL
6538 bl_list.push_back(bl);
6539 offset += bl.length();
6540 int r = client_cb->handle_data(bl, 0, bl.length());
6541 if (r < 0) {
6542 return r;
7c673cae 6543 }
7c673cae 6544
20effc67
TL
6545 if (rgwrados->get_use_datacache()) {
6546 const std::lock_guard l(d3n_get_data.d3n_lock);
6547 auto oid = completed.front().obj.get_ref().obj.oid;
6548 if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
6549 lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
6550 rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
6551 } else {
6552 lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
7c673cae 6553 }
7c673cae 6554 }
20effc67 6555 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
7c673cae 6556 }
20effc67
TL
6557 return 0;
6558}
7c673cae 6559
20effc67 6560static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
b3b6e05e 6561 const rgw_raw_obj& read_obj, off_t obj_ofs,
11fdf7f2
TL
6562 off_t read_ofs, off_t len, bool is_head_obj,
6563 RGWObjState *astate, void *arg)
7c673cae 6564{
20effc67
TL
6565 struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
6566 return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
11fdf7f2 6567 is_head_obj, astate, arg);
7c673cae
FG
6568}
6569
b3b6e05e
TL
6570int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
6571 const rgw_raw_obj& read_obj, off_t obj_ofs,
11fdf7f2
TL
6572 off_t read_ofs, off_t len, bool is_head_obj,
6573 RGWObjState *astate, void *arg)
7c673cae 6574{
7c673cae 6575 ObjectReadOperation op;
20effc67 6576 struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
7c673cae 6577 string oid, key;
7c673cae
FG
6578
6579 if (is_head_obj) {
6580 /* only when reading from the head object do we need to do the atomic test */
b3b6e05e 6581 int r = append_atomic_test(dpp, astate, op);
7c673cae
FG
6582 if (r < 0)
6583 return r;
6584
6585 if (astate &&
6586 obj_ofs < astate->data.length()) {
11fdf7f2 6587 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
7c673cae 6588
7c673cae 6589 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
7c673cae
FG
6590 if (r < 0)
6591 return r;
6592
7c673cae 6593 len -= chunk_len;
11fdf7f2 6594 d->offset += chunk_len;
7c673cae
FG
6595 read_ofs += chunk_len;
6596 obj_ofs += chunk_len;
6597 if (!len)
6598 return 0;
6599 }
6600 }
6601
20effc67 6602 auto obj = d->rgwrados->svc.rados->obj(read_obj);
b3b6e05e 6603 int r = obj.open(dpp);
7c673cae 6604 if (r < 0) {
b3b6e05e 6605 ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
11fdf7f2 6606 return r;
7c673cae
FG
6607 }
6608
b3b6e05e 6609 ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
11fdf7f2 6610 op.read(read_ofs, len, nullptr, nullptr);
7c673cae 6611
11fdf7f2
TL
6612 const uint64_t cost = len;
6613 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
7c673cae 6614
9f95a23c 6615 auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
7c673cae 6616
11fdf7f2 6617 return d->flush(std::move(completed));
7c673cae
FG
6618}
6619
b3b6e05e 6620int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
9f95a23c 6621 optional_yield y)
7c673cae
FG
6622{
6623 RGWRados *store = source->get_store();
6624 CephContext *cct = store->ctx();
7c673cae 6625 RGWObjectCtx& obj_ctx = source->get_ctx();
11fdf7f2
TL
6626 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6627 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
7c673cae 6628
9f95a23c
TL
6629 auto aio = rgw::make_throttle(window_size, y);
6630 get_obj_data data(store, cb, &*aio, ofs, y);
7c673cae 6631
b3b6e05e 6632 int r = store->iterate_obj(dpp, obj_ctx, source->get_bucket_info(), state.obj,
9f95a23c 6633 ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
7c673cae 6634 if (r < 0) {
b3b6e05e 6635 ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
11fdf7f2
TL
6636 data.cancel(); // drain completions without writing back to client
6637 return r;
7c673cae
FG
6638 }
6639
11fdf7f2 6640 return data.drain();
7c673cae
FG
6641}
6642
b3b6e05e 6643int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
7c673cae 6644 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11fdf7f2 6645 off_t ofs, off_t end, uint64_t max_chunk_size,
9f95a23c 6646 iterate_obj_cb cb, void *arg, optional_yield y)
7c673cae
FG
6647{
6648 rgw_raw_obj head_obj;
6649 rgw_raw_obj read_obj;
6650 uint64_t read_ofs = ofs;
6651 uint64_t len;
6652 bool reading_from_head = true;
6653 RGWObjState *astate = NULL;
6654
6655 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6656
b3b6e05e 6657 int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
6658 if (r < 0) {
6659 return r;
6660 }
6661
6662 if (end < 0)
6663 len = 0;
6664 else
6665 len = end - ofs + 1;
6666
9f95a23c 6667 if (astate->manifest) {
7c673cae 6668 /* now get the relevant object stripe */
b3b6e05e 6669 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
7c673cae 6670
b3b6e05e 6671 RGWObjManifest::obj_iterator obj_end = astate->manifest->obj_end(dpp);
7c673cae
FG
6672
6673 for (; iter != obj_end && ofs <= end; ++iter) {
6674 off_t stripe_ofs = iter.get_stripe_ofs();
6675 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6676
6677 while (ofs < next_stripe_ofs && ofs <= end) {
f67539c2 6678 read_obj = iter.get_location().get_raw_obj(store);
11fdf7f2 6679 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6680 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6681
6682 if (read_len > max_chunk_size) {
6683 read_len = max_chunk_size;
6684 }
6685
6686 reading_from_head = (read_obj == head_obj);
b3b6e05e 6687 r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6688 if (r < 0) {
6689 return r;
6690 }
6691
6692 len -= read_len;
6693 ofs += read_len;
6694 }
6695 }
6696 } else {
6697 while (ofs <= end) {
6698 read_obj = head_obj;
11fdf7f2 6699 uint64_t read_len = std::min(len, max_chunk_size);
7c673cae 6700
b3b6e05e 6701 r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6702 if (r < 0) {
6703 return r;
6704 }
6705
6706 len -= read_len;
6707 ofs += read_len;
6708 }
6709 }
6710
6711 return 0;
6712}
6713
b3b6e05e 6714int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
7c673cae
FG
6715{
6716 rgw_rados_ref ref;
b3b6e05e 6717 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
6718 if (r < 0) {
6719 return r;
6720 }
6721
b3b6e05e 6722 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
7c673cae
FG
6723}
6724
b3b6e05e 6725int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
7c673cae
FG
6726{
6727 rgw_rados_ref ref;
b3b6e05e 6728 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
6729 if (r < 0) {
6730 return r;
6731 }
6732
6733 bufferlist outbl;
6734
b3b6e05e 6735 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
7c673cae
FG
6736}
6737
b3b6e05e 6738int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
7c673cae
FG
6739{
6740 ObjectWriteOperation op;
6741
11fdf7f2 6742 ceph_assert(olh_obj.key.instance.empty());
7c673cae
FG
6743
6744 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6745
6746 if (!state.exists) {
6747 op.create(true);
6748 } else {
6749 op.assert_exists();
b32b8144
FG
6750 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6751 op.mtime2(&mtime_ts);
7c673cae
FG
6752 }
6753
6754 /*
6755 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6756 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6757 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6758 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6759 * log will reflect that.
6760 *
6761 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6762 * is used for object data instance, olh_tag for olh instance.
6763 */
6764 if (has_tag) {
6765 /* guard against racing writes */
b3b6e05e 6766 bucket_index_guard_olh_op(dpp, state, op);
7c673cae
FG
6767 }
6768
6769 if (!has_tag) {
6770 /* obj tag */
9f95a23c 6771 string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6772
7c673cae
FG
6773 bufferlist bl;
6774 bl.append(obj_tag.c_str(), obj_tag.size());
6775 op.setxattr(RGW_ATTR_ID_TAG, bl);
6776
6777 state.attrset[RGW_ATTR_ID_TAG] = bl;
6778 state.obj_tag = bl;
6779
6780 /* olh tag */
9f95a23c 6781 string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6782
7c673cae
FG
6783 bufferlist olh_bl;
6784 olh_bl.append(olh_tag.c_str(), olh_tag.size());
6785 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
6786
6787 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
6788 state.olh_tag = olh_bl;
6789 state.is_olh = true;
6790
6791 bufferlist verbl;
6792 op.setxattr(RGW_ATTR_OLH_VER, verbl);
6793 }
6794
6795 bufferlist bl;
6796 RGWOLHPendingInfo pending_info;
6797 pending_info.time = real_clock::now();
11fdf7f2 6798 encode(pending_info, bl);
7c673cae
FG
6799
6800#define OLH_PENDING_TAG_LEN 32
6801 /* tag will start with current time epoch, this so that entries are sorted by time */
6802 char buf[32];
6803 utime_t ut(pending_info.time);
6804 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
6805 *op_tag = buf;
6806
9f95a23c 6807 string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
11fdf7f2 6808
7c673cae
FG
6809 op_tag->append(s);
6810
6811 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
6812 attr_name.append(*op_tag);
6813
6814 op.setxattr(attr_name.c_str(), bl);
6815
b3b6e05e 6816 int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
7c673cae
FG
6817 if (ret < 0) {
6818 return ret;
6819 }
6820
6821 state.exists = true;
6822 state.attrset[attr_name] = bl;
6823
6824 return 0;
6825}
6826
b3b6e05e 6827int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
7c673cae
FG
6828{
6829 int ret;
6830
b3b6e05e 6831 ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
7c673cae
FG
6832 if (ret == -EEXIST) {
6833 ret = -ECANCELED;
6834 }
6835
6836 return ret;
6837}
6838
20effc67 6839int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
b3b6e05e 6840 BucketShard *bs,
f64942e4
AA
6841 const rgw_obj& obj_instance,
6842 const RGWBucketInfo& bucket_info,
6843 std::function<int(BucketShard *)> call)
31f18b77
FG
6844{
6845 rgw_obj obj;
6846 const rgw_obj *pobj = &obj_instance;
6847 int r;
6848
6849 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
b3b6e05e 6850 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
31f18b77 6851 if (r < 0) {
b3b6e05e 6852 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
31f18b77
FG
6853 return r;
6854 }
6855 r = call(bs);
6856 if (r != -ERR_BUSY_RESHARDING) {
6857 break;
6858 }
b3b6e05e 6859 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
31f18b77 6860 string new_bucket_id;
b3b6e05e 6861 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield, dpp);
31f18b77
FG
6862 if (r == -ERR_BUSY_RESHARDING) {
6863 continue;
6864 }
6865 if (r < 0) {
6866 return r;
6867 }
b3b6e05e 6868 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
31f18b77
FG
6869 i = 0; /* resharding is finished, make sure we can retry */
6870
6871 obj = *pobj;
6872 obj.bucket.update_bucket_id(new_bucket_id);
6873 pobj = &obj;
81eedcae 6874 } // for loop
31f18b77
FG
6875
6876 if (r < 0) {
6877 return r;
6878 }
6879
6880 return 0;
6881}
6882
39ae355f
TL
6883
6884int RGWRados::fetch_new_bucket_id(
6885 const RGWBucketInfo& curr_bucket_info,
6886 RGWBucketInfo* save_bucket_info, // nullptr -> no save
6887 std::string& new_bucket_id,
6888 const DoutPrefixProvider* dpp)
6889{
6890 RGWBucketInfo local_bucket_info; // use if save_bucket_info is null
6891 RGWBucketInfo* bip = save_bucket_info ? save_bucket_info : &local_bucket_info;
6892 *bip = curr_bucket_info; // copy
6893
6894 int ret = try_refresh_bucket_info(*bip, nullptr, dpp);
6895 if (ret < 0) {
6896 return ret;
6897 }
6898
6899 new_bucket_id = bip->bucket.bucket_id;
6900 return 0;
6901} // fetch_new_bucket_id
6902
6903
f64942e4 6904int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
39ae355f 6905 std::string *new_bucket_id,
11fdf7f2 6906 const RGWBucketInfo& bucket_info,
b3b6e05e
TL
6907 optional_yield y,
6908 const DoutPrefixProvider *dpp)
31f18b77 6909{
11fdf7f2
TL
6910 int ret = 0;
6911 cls_rgw_bucket_instance_entry entry;
6912
81eedcae
TL
6913 constexpr int num_retries = 10;
6914 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
9f95a23c
TL
6915 auto& ref = bs->bucket_obj.get_ref();
6916 ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
81eedcae 6917 if (ret == -ENOENT) {
39ae355f
TL
6918 ret = fetch_new_bucket_id(bucket_info, nullptr, *new_bucket_id, dpp);
6919 if (ret < 0) {
6920 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
6921 " failed to refresh bucket info after reshard when get bucket "
6922 "resharding failed, error: " << cpp_strerror(-ret) << dendl;
6923 return ret;
6924 }
81eedcae 6925 } else if (ret < 0) {
b3b6e05e 6926 ldpp_dout(dpp, 0) << __func__ <<
81eedcae
TL
6927 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
6928 dendl;
11fdf7f2
TL
6929 return ret;
6930 }
81eedcae 6931
11fdf7f2 6932 if (!entry.resharding_in_progress()) {
39ae355f
TL
6933 ret = fetch_new_bucket_id(bucket_info, nullptr, *new_bucket_id, dpp);
6934 if (ret < 0) {
6935 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
6936 " failed to refresh bucket info after reshard when get bucket "
6937 "resharding succeeded, error: " << cpp_strerror(-ret) << dendl;
6938 return ret;
6939 }
11fdf7f2 6940 }
31f18b77 6941
b3b6e05e 6942 ldpp_dout(dpp, 20) << "NOTICE: reshard still in progress; " <<
81eedcae
TL
6943 (i < num_retries ? "retrying" : "too many retries") << dendl;
6944
6945 if (i == num_retries) {
11fdf7f2
TL
6946 break;
6947 }
6948
6949 // If bucket is erroneously marked as resharding (e.g., crash or
6950 // other error) then fix it. If we can take the bucket reshard
6951 // lock then it means no other resharding should be taking place,
6952 // and we're free to clear the flags.
6953 {
6954 // since we expect to do this rarely, we'll do our work in a
6955 // block and erase our work after each try
6956
9f95a23c 6957 RGWObjectCtx obj_ctx(this->store);
11fdf7f2
TL
6958 const rgw_bucket& b = bs->bucket;
6959 std::string bucket_id = b.get_key();
9f95a23c 6960 RGWBucketReshardLock reshard_lock(this->store, bucket_info, true);
20effc67 6961 ret = reshard_lock.lock(dpp);
39ae355f
TL
6962 if (ret == -ENOENT) {
6963 continue;
6964 } else if (ret < 0) {
20effc67
TL
6965 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
6966 ": failed to take reshard lock for bucket " <<
11fdf7f2
TL
6967 bucket_id << "; expected if resharding underway" << dendl;
6968 } else {
20effc67
TL
6969 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
6970 ": was able to take reshard lock for bucket " <<
11fdf7f2 6971 bucket_id << dendl;
39ae355f 6972
b3b6e05e 6973 ret = RGWBucketReshard::clear_resharding(dpp, this->store, bucket_info);
39ae355f
TL
6974 reshard_lock.unlock();
6975
6976 if (ret == -ENOENT) {
6977 ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ <<
6978 " INFO: no need to reset reshard flags; old shards apparently"
6979 " removed after successful resharding of bucket " <<
6980 bucket_id << dendl;
6981 continue;
6982 } else if (ret < 0) {
20effc67 6983 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
11fdf7f2 6984 " ERROR: failed to clear resharding flags for bucket " <<
39ae355f 6985 bucket_id << ", " << cpp_strerror(-ret) << dendl;
11fdf7f2 6986 } else {
20effc67
TL
6987 ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ <<
6988 ": apparently successfully cleared resharding flags for "
11fdf7f2
TL
6989 "bucket " << bucket_id << dendl;
6990 continue; // if we apparently succeed immediately test again
6991 } // if clear resharding succeeded
6992 } // if taking of lock succeeded
6993 } // block to encapsulate recovery from incomplete reshard
6994
6995 ret = reshard_wait->wait(y);
6996 if (ret < 0) {
20effc67 6997 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
81eedcae 6998 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2
TL
6999 return ret;
7000 }
81eedcae
TL
7001 } // for loop
7002
20effc67 7003 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
81eedcae 7004 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2 7005 return -ERR_BUSY_RESHARDING;
31f18b77
FG
7006}
7007
b3b6e05e 7008int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
7c673cae
FG
7009 bool delete_marker,
7010 const string& op_tag,
7011 struct rgw_bucket_dir_entry_meta *meta,
7012 uint64_t olh_epoch,
91327a77
AA
7013 real_time unmod_since, bool high_precision_time,
7014 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
7015{
7016 rgw_rados_ref ref;
b3b6e05e 7017 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7018 if (r < 0) {
7019 return r;
7020 }
7021
31f18b77
FG
7022 rgw_zone_set zones_trace;
7023 if (_zones_trace) {
7024 zones_trace = *_zones_trace;
7c673cae 7025 }
9f95a23c 7026 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
7c673cae 7027
31f18b77
FG
7028 BucketShard bs(this);
7029
b3b6e05e 7030 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 7031 [&](BucketShard *bs) -> int {
9f95a23c
TL
7032 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
7033 auto& ref = bs->bucket_obj.get_ref();
f64942e4 7034 librados::ObjectWriteOperation op;
39ae355f 7035 op.assert_exists(); // bucket index shard must exist
f64942e4 7036 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
7037 cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
7038 delete_marker, op_tag, meta, olh_epoch,
7039 unmod_since, high_precision_time,
7040 svc.zone->get_zone().log_data, zones_trace);
b3b6e05e 7041 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
7042 });
7043 if (r < 0) {
b3b6e05e 7044 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
31f18b77 7045 return r;
7c673cae
FG
7046 }
7047
b3b6e05e 7048 r = svc.datalog_rados->add_entry(dpp, bucket_info, bs.shard_id);
9f95a23c 7049 if (r < 0) {
b3b6e05e 7050 ldpp_dout(dpp, 0) << "ERROR: failed writing data log" << dendl;
91327a77
AA
7051 }
7052
7c673cae
FG
7053 return 0;
7054}
7055
b3b6e05e 7056void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
7c673cae 7057{
b3b6e05e 7058 ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
7c673cae
FG
7059 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
7060}
7061
b3b6e05e 7062int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 7063 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
7064{
7065 rgw_rados_ref ref;
b3b6e05e 7066 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7067 if (r < 0) {
7068 return r;
7069 }
7070
31f18b77
FG
7071 rgw_zone_set zones_trace;
7072 if (_zones_trace) {
7073 zones_trace = *_zones_trace;
7c673cae 7074 }
9f95a23c 7075 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
31f18b77
FG
7076
7077 BucketShard bs(this);
7c673cae
FG
7078
7079 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
b3b6e05e 7080 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 7081 [&](BucketShard *bs) -> int {
9f95a23c 7082 auto& ref = bs->bucket_obj.get_ref();
f64942e4 7083 librados::ObjectWriteOperation op;
39ae355f 7084 op.assert_exists(); // bucket index shard must exist
f64942e4 7085 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
7086 cls_rgw_bucket_unlink_instance(op, key, op_tag,
7087 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
b3b6e05e 7088 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
7089 });
7090 if (r < 0) {
b3b6e05e 7091 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
31f18b77 7092 return r;
7c673cae
FG
7093 }
7094
7095 return 0;
7096}
7097
20effc67 7098int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
b3b6e05e 7099 const RGWBucketInfo& bucket_info, RGWObjState& state,
7c673cae
FG
7100 const rgw_obj& obj_instance, uint64_t ver_marker,
7101 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
7102 bool *is_truncated)
7103{
7104 rgw_rados_ref ref;
b3b6e05e 7105 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7106 if (r < 0) {
7107 return r;
7108 }
7109
7110 BucketShard bs(this);
f64942e4 7111 int ret =
b3b6e05e 7112 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 7113 if (ret < 0) {
b3b6e05e 7114 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
7115 return ret;
7116 }
7117
7118 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7119
7120 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7121
39ae355f
TL
7122 auto& shard_ref = bs.bucket_obj.get_ref();
7123 ObjectReadOperation op;
9f95a23c 7124
39ae355f
TL
7125 rgw_cls_read_olh_log_ret log_ret;
7126 int op_ret = 0;
7127 cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
7128 bufferlist outbl;
7129 r = rgw_rados_operate(dpp, shard_ref.pool.ioctx(), shard_ref.obj.oid, &op, &outbl, null_yield);
7130 if (r < 0) {
7131 return r;
7132 }
7133 if (op_ret < 0) {
7134 ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned op_ret=" << op_ret << dendl;
7135 return op_ret;
31f18b77 7136 }
7c673cae 7137
39ae355f
TL
7138 *log = std::move(log_ret.log);
7139 *is_truncated = log_ret.is_truncated;
7140
7c673cae
FG
7141 return 0;
7142}
7143
a8e16298
TL
7144// a multisite sync bug resulted in the OLH head attributes being overwritten by
7145// the attributes from another zone, causing link_olh() to fail endlessly due to
7146// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7147// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
b3b6e05e 7148int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
a8e16298
TL
7149 const rgw_obj& obj)
7150{
7151 // fetch the current olh entry from the bucket index
7152 rgw_bucket_olh_entry olh;
b3b6e05e 7153 int r = bi_get_olh(dpp, bucket_info, obj, &olh);
a8e16298 7154 if (r < 0) {
b3b6e05e 7155 ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
a8e16298
TL
7156 return r;
7157 }
11fdf7f2 7158 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
a8e16298
TL
7159 return 0;
7160 }
7161
b3b6e05e 7162 ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
a8e16298
TL
7163 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
7164
7165 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7166 ObjectWriteOperation op;
7167 // assert this is the same olh tag we think we're fixing
b3b6e05e 7168 bucket_index_guard_olh_op(dpp, *state, op);
a8e16298
TL
7169 // preserve existing mtime
7170 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
7171 op.mtime2(&mtime_ts);
7172 {
7173 bufferlist bl;
7174 bl.append(olh.tag.c_str(), olh.tag.size());
7175 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
7176 }
7177 {
7178 RGWOLHInfo info;
7179 info.target = rgw_obj(bucket_info.bucket, olh.key);
7180 info.removed = olh.delete_marker;
7181 bufferlist bl;
7182 encode(info, bl);
7183 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7184 }
7185 rgw_rados_ref ref;
b3b6e05e 7186 r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
a8e16298
TL
7187 if (r < 0) {
7188 return r;
7189 }
b3b6e05e 7190 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
a8e16298 7191 if (r < 0) {
b3b6e05e 7192 ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
a8e16298
TL
7193 << cpp_strerror(r) << dendl;
7194 return r;
7195 }
7196 return 0;
7197}
7198
b3b6e05e 7199int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
7c673cae
FG
7200{
7201 rgw_rados_ref ref;
b3b6e05e 7202 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7203 if (r < 0) {
7204 return r;
7205 }
7206
7207 BucketShard bs(this);
f64942e4 7208 int ret =
b3b6e05e 7209 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 7210 if (ret < 0) {
b3b6e05e 7211 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
7212 return ret;
7213 }
7214
7215 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7216
7217 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7218
b3b6e05e 7219 ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4
AA
7220 [&](BucketShard *pbs) -> int {
7221 ObjectWriteOperation op;
39ae355f 7222 op.assert_exists(); // bucket index shard must exist
f64942e4
AA
7223 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7224 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
b3b6e05e 7225 return pbs->bucket_obj.operate(dpp, &op, null_yield);
31f18b77
FG
7226 });
7227 if (ret < 0) {
b3b6e05e 7228 ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 7229 return ret;
31f18b77 7230 }
7c673cae
FG
7231
7232 return 0;
7233}
7234
b3b6e05e 7235int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
7c673cae
FG
7236{
7237 rgw_rados_ref ref;
b3b6e05e 7238 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7239 if (r < 0) {
7240 return r;
7241 }
7242
7243 BucketShard bs(this);
7c673cae
FG
7244
7245 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7246
7247 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7248
b3b6e05e 7249 int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4
AA
7250 [&](BucketShard *pbs) -> int {
7251 ObjectWriteOperation op;
39ae355f 7252 op.assert_exists(); // bucket index shard must exist
9f95a23c 7253 auto& ref = pbs->bucket_obj.get_ref();
f64942e4 7254 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c 7255 cls_rgw_clear_olh(op, key, olh_tag);
b3b6e05e 7256 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77 7257 });
7c673cae 7258 if (ret < 0) {
b3b6e05e 7259 ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
7c673cae
FG
7260 return ret;
7261 }
7262
7263 return 0;
7264}
7265
20effc67 7266static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
92f5a8d4
TL
7267{
7268 try {
7269 auto biter = bl.cbegin();
7270 decode(*olh, biter);
7271 return 0;
7272 } catch (buffer::error& err) {
20effc67 7273 ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
92f5a8d4
TL
7274 return -EIO;
7275 }
7276}
7277
522d829b
TL
7278int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
7279 RGWObjectCtx& obj_ctx,
7280 RGWObjState& state,
7281 const RGWBucketInfo& bucket_info,
7282 const rgw_obj& obj,
7283 bufferlist& olh_tag,
7284 std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
7285 uint64_t *plast_ver,
7286 rgw_zone_set* zones_trace)
7c673cae
FG
7287{
7288 if (log.empty()) {
7289 return 0;
7290 }
7291
7292 librados::ObjectWriteOperation op;
7293
7294 uint64_t last_ver = log.rbegin()->first;
7295 *plast_ver = last_ver;
7296
7297 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
7298
7299 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
92f5a8d4 7300 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
7c673cae 7301
a8e16298
TL
7302 bufferlist ver_bl;
7303 string last_ver_s = to_string(last_ver);
7304 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
7305 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
7306
b32b8144
FG
7307 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
7308 op.mtime2(&mtime_ts);
7309
7c673cae 7310 bool need_to_link = false;
92f5a8d4 7311 uint64_t link_epoch = 0;
7c673cae
FG
7312 cls_rgw_obj_key key;
7313 bool delete_marker = false;
7314 list<cls_rgw_obj_key> remove_instances;
7315 bool need_to_remove = false;
7316
92f5a8d4
TL
7317 // decode current epoch and instance
7318 auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
7319 if (olh_ver != state.attrset.end()) {
7320 std::string str = olh_ver->second.to_str();
7321 std::string err;
7322 link_epoch = strict_strtoll(str.c_str(), 10, &err);
7323 }
7324 auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
7325 if (olh_info != state.attrset.end()) {
7326 RGWOLHInfo info;
20effc67 7327 int r = decode_olh_info(dpp, cct, olh_info->second, &info);
92f5a8d4
TL
7328 if (r < 0) {
7329 return r;
7330 }
7331 info.target.key.get_index_key(&key);
7332 delete_marker = info.removed;
7333 }
7334
7c673cae
FG
7335 for (iter = log.begin(); iter != log.end(); ++iter) {
7336 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7337 for (; viter != iter->second.end(); ++viter) {
7338 rgw_bucket_olh_log_entry& entry = *viter;
7339
b3b6e05e 7340 ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
7c673cae
FG
7341 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7342 << (entry.delete_marker ? "(delete)" : "") << dendl;
7343 switch (entry.op) {
7344 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7345 remove_instances.push_back(entry.key);
7346 break;
7347 case CLS_RGW_OLH_OP_LINK_OLH:
92f5a8d4
TL
7348 // only overwrite a link of the same epoch if its key sorts before
7349 if (link_epoch < iter->first || key.instance.empty() ||
7350 key.instance > entry.key.instance) {
b3b6e05e 7351 ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
92f5a8d4
TL
7352 << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7353 need_to_link = true;
7354 need_to_remove = false;
7355 key = entry.key;
7356 delete_marker = entry.delete_marker;
7357 } else {
b3b6e05e 7358 ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
92f5a8d4
TL
7359 << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7360 }
7c673cae
FG
7361 break;
7362 case CLS_RGW_OLH_OP_UNLINK_OLH:
7363 need_to_remove = true;
7364 need_to_link = false;
7365 break;
7366 default:
b3b6e05e 7367 ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7c673cae
FG
7368 return -EIO;
7369 }
7370 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7371 attr_name.append(entry.op_tag);
7372 op.rmxattr(attr_name.c_str());
7373 }
7374 }
7375
7376 rgw_rados_ref ref;
b3b6e05e 7377 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
7378 if (r < 0) {
7379 return r;
7380 }
7381
7382 const rgw_bucket& bucket = obj.bucket;
7383
7384 if (need_to_link) {
7385 rgw_obj target(bucket, key);
7386 RGWOLHInfo info;
7387 info.target = target;
7388 info.removed = delete_marker;
7389 bufferlist bl;
11fdf7f2 7390 encode(info, bl);
7c673cae
FG
7391 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7392 }
7393
7394 /* first remove object instances */
7395 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7396 liter != remove_instances.end(); ++liter) {
7397 cls_rgw_obj_key& key = *liter;
7398 rgw_obj obj_instance(bucket, key);
b3b6e05e 7399 int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae 7400 if (ret < 0 && ret != -ENOENT) {
b3b6e05e 7401 ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7c673cae
FG
7402 return ret;
7403 }
7404 }
7405
7406 /* update olh object */
b3b6e05e 7407 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
7408 if (r == -ECANCELED) {
7409 r = 0;
7410 }
7411 if (r < 0) {
b3b6e05e 7412 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7c673cae
FG
7413 return r;
7414 }
7415
b3b6e05e 7416 r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj, last_ver);
7c673cae 7417 if (r < 0) {
b3b6e05e 7418 ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7c673cae
FG
7419 return r;
7420 }
7421
7422 if (need_to_remove) {
7423 ObjectWriteOperation rm_op;
7424
7425 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
a8e16298 7426 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7c673cae
FG
7427 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7428 rm_op.remove();
7429
b3b6e05e 7430 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
7c673cae
FG
7431 if (r == -ECANCELED) {
7432 return 0; /* someone else won this race */
7433 } else {
7434 /*
7435 * only clear if was successful, otherwise we might clobber pending operations on this object
7436 */
b3b6e05e 7437 r = bucket_index_clear_olh(dpp, bucket_info, state, obj);
7c673cae 7438 if (r < 0) {
b3b6e05e 7439 ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7c673cae
FG
7440 return r;
7441 }
7442 }
7443 }
7444
7445 return 0;
7446}
7447
7448/*
7449 * read olh log and apply it
7450 */
b3b6e05e 7451int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
7452{
7453 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7454 bool is_truncated;
7455 uint64_t ver_marker = 0;
7456
7457 do {
b3b6e05e 7458 int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7c673cae
FG
7459 if (ret < 0) {
7460 return ret;
7461 }
b3b6e05e 7462 ret = apply_olh_log(dpp, obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
7463 if (ret < 0) {
7464 return ret;
7465 }
7466 } while (is_truncated);
7467
7468 return 0;
7469}
7470
b3b6e05e 7471int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77 7472 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
9f95a23c 7473 optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
7474{
7475 string op_tag;
7476
7477 rgw_obj olh_obj = target_obj;
7478 olh_obj.key.instance.clear();
7479
7480 RGWObjState *state = NULL;
7481
7482 int ret = 0;
7483 int i;
31f18b77 7484
7c673cae
FG
7485#define MAX_ECANCELED_RETRY 100
7486 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7487 if (ret == -ECANCELED) {
11fdf7f2 7488 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7489 }
7490
b3b6e05e 7491 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7492 if (ret < 0) {
7493 return ret;
7494 }
7495
b3b6e05e 7496 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7c673cae 7497 if (ret < 0) {
b3b6e05e 7498 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7c673cae
FG
7499 if (ret == -ECANCELED) {
7500 continue;
7501 }
7502 return ret;
7503 }
b3b6e05e 7504 ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj, delete_marker,
91327a77
AA
7505 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7506 zones_trace, log_data_change);
7c673cae 7507 if (ret < 0) {
b3b6e05e 7508 ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7c673cae 7509 if (ret == -ECANCELED) {
a8e16298
TL
7510 // the bucket index rejected the link_olh() due to olh tag mismatch;
7511 // attempt to reconstruct olh head attributes based on the bucket index
b3b6e05e 7512 int r2 = repair_olh(dpp, state, bucket_info, olh_obj);
a8e16298
TL
7513 if (r2 < 0 && r2 != -ECANCELED) {
7514 return r2;
7515 }
7c673cae
FG
7516 continue;
7517 }
7518 return ret;
7519 }
7520 break;
7521 }
7522
7523 if (i == MAX_ECANCELED_RETRY) {
b3b6e05e 7524 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7c673cae
FG
7525 return -EIO;
7526 }
7527
b3b6e05e 7528 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7c673cae
FG
7529 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7530 ret = 0;
7531 }
7532 if (ret < 0) {
b3b6e05e 7533 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7534 return ret;
7535 }
7536
7537 return 0;
7538}
7539
b3b6e05e 7540int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
9f95a23c 7541 uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
7c673cae
FG
7542{
7543 string op_tag;
7544
7545 rgw_obj olh_obj = target_obj;
7546 olh_obj.key.instance.clear();
7547
7548 RGWObjState *state = NULL;
7549
7550 int ret = 0;
7551 int i;
7552
7553 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7554 if (ret == -ECANCELED) {
11fdf7f2 7555 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7556 }
7557
b3b6e05e 7558 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7559 if (ret < 0)
7560 return ret;
7561
b3b6e05e 7562 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7c673cae 7563 if (ret < 0) {
b3b6e05e 7564 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7565 if (ret == -ECANCELED) {
7566 continue;
7567 }
7568 return ret;
7569 }
7570
7571 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7572
b3b6e05e 7573 ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae 7574 if (ret < 0) {
b3b6e05e 7575 ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7576 if (ret == -ECANCELED) {
7577 continue;
7578 }
7579 return ret;
7580 }
7581 break;
7582 }
7583
7584 if (i == MAX_ECANCELED_RETRY) {
b3b6e05e 7585 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7c673cae
FG
7586 return -EIO;
7587 }
7588
b3b6e05e 7589 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
7590 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7591 return 0;
7592 }
7593 if (ret < 0) {
b3b6e05e 7594 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7595 return ret;
7596 }
7597
7598 return 0;
7599}
7600
11fdf7f2 7601void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7c673cae
FG
7602{
7603#define OBJ_INSTANCE_LEN 32
7604 char buf[OBJ_INSTANCE_LEN + 1];
7605
7606 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7607 no underscore for instance name due to the way we encode the raw keys */
7608
11fdf7f2 7609 target_key->set_instance(buf);
7c673cae
FG
7610}
7611
11fdf7f2 7612void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7c673cae 7613{
11fdf7f2 7614 gen_rand_obj_instance_name(&target_obj->key);
7c673cae
FG
7615}
7616
b3b6e05e 7617int RGWRados::get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7c673cae 7618{
92f5a8d4 7619 map<string, bufferlist> attrset;
7c673cae
FG
7620
7621 ObjectReadOperation op;
92f5a8d4 7622 op.getxattrs(&attrset, NULL);
7c673cae 7623
b3b6e05e 7624 int r = obj_operate(dpp, bucket_info, obj, &op);
7c673cae
FG
7625 if (r < 0) {
7626 return r;
7627 }
7c673cae 7628
92f5a8d4 7629 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7c673cae
FG
7630 if (iter == attrset.end()) { /* not an olh */
7631 return -EINVAL;
7632 }
7633
20effc67 7634 return decode_olh_info(dpp, cct, iter->second, olh);
7c673cae
FG
7635}
7636
20effc67 7637void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp, map<string, bufferlist>& pending_entries,
7c673cae
FG
7638 map<string, bufferlist> *rm_pending_entries)
7639{
7640 map<string, bufferlist>::iterator iter = pending_entries.begin();
7641
7642 real_time now = real_clock::now();
7643
7644 while (iter != pending_entries.end()) {
11fdf7f2 7645 auto biter = iter->second.cbegin();
7c673cae
FG
7646 RGWOLHPendingInfo pending_info;
7647 try {
11fdf7f2 7648 decode(pending_info, biter);
7c673cae
FG
7649 } catch (buffer::error& err) {
7650 /* skipping bad entry, we could remove it but it might hide a bug */
20effc67 7651 ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7c673cae
FG
7652 ++iter;
7653 continue;
7654 }
7655
7656 map<string, bufferlist>::iterator cur_iter = iter;
7657 ++iter;
7658 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7659 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7660 pending_entries.erase(cur_iter);
7661 } else {
7662 /* entries names are sorted by time (rounded to a second) */
7663 break;
7664 }
7665 }
7666}
7667
b3b6e05e 7668int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7c673cae 7669{
7c673cae 7670 rgw_rados_ref ref;
b3b6e05e 7671 int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
7c673cae
FG
7672 if (r < 0) {
7673 return r;
7674 }
7675
81eedcae
TL
7676 // trim no more than 1000 entries per osd op
7677 constexpr int max_entries = 1000;
7c673cae 7678
81eedcae
TL
7679 auto i = pending_attrs.begin();
7680 while (i != pending_attrs.end()) {
7681 ObjectWriteOperation op;
b3b6e05e 7682 bucket_index_guard_olh_op(dpp, state, op);
81eedcae
TL
7683
7684 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7685 op.rmxattr(i->first.c_str());
7686 }
7687
b3b6e05e 7688 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
81eedcae
TL
7689 if (r == -ENOENT || r == -ECANCELED) {
7690 /* raced with some other change, shouldn't sweat about it */
7691 return 0;
7692 }
7693 if (r < 0) {
b3b6e05e 7694 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
81eedcae
TL
7695 return r;
7696 }
7697 }
7c673cae
FG
7698 return 0;
7699}
7700
b3b6e05e 7701int RGWRados::follow_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7c673cae
FG
7702{
7703 map<string, bufferlist> pending_entries;
11fdf7f2 7704 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7c673cae
FG
7705
7706 map<string, bufferlist> rm_pending_entries;
20effc67 7707 check_pending_olh_entries(dpp,pending_entries, &rm_pending_entries);
7c673cae
FG
7708
7709 if (!rm_pending_entries.empty()) {
b3b6e05e 7710 int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj, rm_pending_entries);
7c673cae 7711 if (ret < 0) {
b3b6e05e 7712 ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7c673cae
FG
7713 return ret;
7714 }
7715 }
7716 if (!pending_entries.empty()) {
b3b6e05e 7717 ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7c673cae 7718
b3b6e05e 7719 int ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7c673cae
FG
7720 if (ret < 0) {
7721 return ret;
7722 }
7723 }
7724
92f5a8d4
TL
7725 auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7726 if (iter == state->attrset.end()) {
7727 return -EINVAL;
7728 }
7729
7c673cae 7730 RGWOLHInfo olh;
20effc67 7731 int ret = decode_olh_info(dpp, cct, iter->second, &olh);
92f5a8d4
TL
7732 if (ret < 0) {
7733 return ret;
7c673cae
FG
7734 }
7735
7736 if (olh.removed) {
7737 return -ENOENT;
7738 }
7739
7740 *target = olh.target;
7741
7742 return 0;
7743}
7744
20effc67 7745int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
b3b6e05e 7746 rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7c673cae 7747 map<string, bufferlist> *attrs, bufferlist *first_chunk,
9f95a23c 7748 RGWObjVersionTracker *objv_tracker, optional_yield y)
7c673cae
FG
7749{
7750 rgw_rados_ref ref;
b3b6e05e 7751 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
7752 if (r < 0) {
7753 return r;
7754 }
7755
7756 map<string, bufferlist> unfiltered_attrset;
7757 uint64_t size = 0;
7758 struct timespec mtime_ts;
7759
7760 ObjectReadOperation op;
7761 if (objv_tracker) {
7762 objv_tracker->prepare_op_for_read(&op);
7763 }
7764 if (attrs) {
7765 op.getxattrs(&unfiltered_attrset, NULL);
7766 }
7767 if (psize || pmtime) {
7768 op.stat2(&size, &mtime_ts, NULL);
7769 }
7770 if (first_chunk) {
7771 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7772 }
7773 bufferlist outbl;
b3b6e05e 7774 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
7c673cae
FG
7775
7776 if (epoch) {
9f95a23c 7777 *epoch = ref.pool.ioctx().get_last_version();
7c673cae
FG
7778 }
7779
7780 if (r < 0)
7781 return r;
7782
7783 if (psize)
7784 *psize = size;
7785 if (pmtime)
7786 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7787 if (attrs) {
11fdf7f2 7788 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7c673cae
FG
7789 }
7790
7791 return 0;
7792}
7793
b3b6e05e 7794int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 7795 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae 7796{
a8e16298 7797 vector<rgw_bucket_dir_header> headers;
7c673cae 7798 map<int, string> bucket_instance_ids;
b3b6e05e 7799 int r = cls_bucket_head(dpp, bucket_info, shard_id, headers, &bucket_instance_ids);
7c673cae
FG
7800 if (r < 0) {
7801 return r;
7802 }
7803
11fdf7f2 7804 ceph_assert(headers.size() == bucket_instance_ids.size());
7c673cae 7805
a8e16298 7806 auto iter = headers.begin();
7c673cae
FG
7807 map<int, string>::iterator viter = bucket_instance_ids.begin();
7808 BucketIndexShardsManager ver_mgr;
7809 BucketIndexShardsManager master_ver_mgr;
7810 BucketIndexShardsManager marker_mgr;
7c673cae
FG
7811 char buf[64];
7812 for(; iter != headers.end(); ++iter, ++viter) {
a8e16298
TL
7813 accumulate_raw_stats(*iter, stats);
7814 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7c673cae 7815 ver_mgr.add(viter->first, string(buf));
a8e16298 7816 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7c673cae
FG
7817 master_ver_mgr.add(viter->first, string(buf));
7818 if (shard_id >= 0) {
a8e16298 7819 *max_marker = iter->max_marker;
7c673cae 7820 } else {
a8e16298 7821 marker_mgr.add(viter->first, iter->max_marker);
7c673cae 7822 }
c07f9fc5 7823 if (syncstopped != NULL)
a8e16298 7824 *syncstopped = iter->syncstopped;
7c673cae
FG
7825 }
7826 ver_mgr.to_string(bucket_ver);
7827 master_ver_mgr.to_string(master_ver);
7828 if (shard_id < 0) {
7829 marker_mgr.to_string(max_marker);
7830 }
7831 return 0;
7832}
7833
7c673cae
FG
7834class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
7835 RGWGetBucketStats_CB *cb;
7836 uint32_t pendings;
7837 map<RGWObjCategory, RGWStorageStats> stats;
7838 int ret_code;
7839 bool should_cb;
9f95a23c 7840 ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
7c673cae
FG
7841
7842public:
7843 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
9f95a23c
TL
7844 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
7845 {}
7c673cae
FG
7846
7847 void handle_response(int r, rgw_bucket_dir_header& header) override {
9f95a23c 7848 std::lock_guard l{lock};
7c673cae
FG
7849 if (should_cb) {
7850 if ( r >= 0) {
7851 accumulate_raw_stats(header, stats);
7852 } else {
7853 ret_code = r;
7854 }
7855
7856 // Are we all done?
7857 if (--pendings == 0) {
7858 if (!ret_code) {
7859 cb->set_response(&stats);
7860 }
7861 cb->handle_response(ret_code);
7862 cb->put();
7863 }
7864 }
7865 }
7866
7867 void unset_cb() {
9f95a23c 7868 std::lock_guard l{lock};
7c673cae
FG
7869 should_cb = false;
7870 }
7871};
7872
b3b6e05e 7873int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
7c673cae
FG
7874{
7875 int num_aio = 0;
f67539c2 7876 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
11fdf7f2 7877 ceph_assert(get_ctx);
b3b6e05e 7878 int r = cls_bucket_head_async(dpp, bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
7879 if (r < 0) {
7880 ctx->put();
7881 if (num_aio) {
7882 get_ctx->unset_cb();
7883 }
7884 }
c07f9fc5 7885 get_ctx->put();
7c673cae
FG
7886 return r;
7887}
7888
e306af50
TL
7889int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx,
7890 const string& meta_key,
7891 RGWBucketInfo& info,
7892 real_time *pmtime,
7893 map<string, bufferlist> *pattrs,
b3b6e05e
TL
7894 optional_yield y,
7895 const DoutPrefixProvider *dpp)
9f95a23c
TL
7896{
7897 rgw_bucket bucket;
7898 rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
7c673cae 7899
b3b6e05e 7900 return get_bucket_instance_info(obj_ctx, bucket, info, pmtime, pattrs, y, dpp);
9f95a23c 7901}
7c673cae 7902
11fdf7f2 7903int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
b3b6e05e
TL
7904 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
7905 const DoutPrefixProvider *dpp)
7c673cae 7906{
9f95a23c
TL
7907 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7908 return ctl.bucket->read_bucket_instance_info(bucket, &info,
7909 y,
b3b6e05e 7910 dpp,
9f95a23c
TL
7911 RGWBucketCtl::BucketInstance::GetParams()
7912 .set_mtime(pmtime)
7913 .set_attrs(pattrs)
7914 .set_bectx_params(bectx_params));
7c673cae
FG
7915}
7916
9f95a23c 7917int RGWRados::get_bucket_info(RGWServices *svc,
b32b8144
FG
7918 const string& tenant, const string& bucket_name,
7919 RGWBucketInfo& info,
9f95a23c 7920 real_time *pmtime,
b3b6e05e
TL
7921 optional_yield y,
7922 const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
b32b8144 7923{
9f95a23c
TL
7924 auto obj_ctx = svc->sysobj->init_obj_ctx();
7925 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7926 rgw_bucket bucket;
7927 bucket.tenant = tenant;
7928 bucket.name = bucket_name;
b3b6e05e 7929 return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
9f95a23c
TL
7930 RGWBucketCtl::BucketInstance::GetParams()
7931 .set_mtime(pmtime)
7932 .set_attrs(pattrs)
7933 .set_bectx_params(bectx_params));
b32b8144
FG
7934}
7935
7936int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
7937 ceph::real_time *pmtime,
b3b6e05e 7938 const DoutPrefixProvider *dpp,
b32b8144
FG
7939 map<string, bufferlist> *pattrs)
7940{
9f95a23c
TL
7941 rgw_bucket bucket = info.bucket;
7942 bucket.bucket_id.clear();
b32b8144 7943
9f95a23c 7944 auto rv = info.objv_tracker.read_version;
b32b8144 7945
b3b6e05e 7946 return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
9f95a23c
TL
7947 RGWBucketCtl::BucketInstance::GetParams()
7948 .set_mtime(pmtime)
7949 .set_attrs(pattrs)
7950 .set_refresh_version(rv));
7c673cae
FG
7951}
7952
7953int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
b3b6e05e
TL
7954 real_time mtime, map<string, bufferlist> *pattrs,
7955 const DoutPrefixProvider *dpp)
7c673cae 7956{
b3b6e05e 7957 return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
9f95a23c
TL
7958 RGWBucketCtl::BucketInstance::PutParams()
7959 .set_exclusive(exclusive)
7960 .set_mtime(mtime)
7961 .set_attrs(pattrs));
7c673cae
FG
7962}
7963
7964int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
b3b6e05e
TL
7965 map<string, bufferlist> *pattrs, bool create_entry_point,
7966 const DoutPrefixProvider *dpp)
7c673cae
FG
7967{
7968 bool create_head = !info.has_instance_obj || create_entry_point;
7969
b3b6e05e 7970 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
7c673cae
FG
7971 if (ret < 0) {
7972 return ret;
7973 }
7974
7975 if (!create_head)
7976 return 0; /* done! */
7977
7978 RGWBucketEntryPoint entry_point;
7979 entry_point.bucket = info.bucket;
7980 entry_point.owner = info.owner;
7981 entry_point.creation_time = info.creation_time;
7982 entry_point.linked = true;
7983 RGWObjVersionTracker ot;
7984 if (pep_objv && !pep_objv->tag.empty()) {
7985 ot.write_version = *pep_objv;
7986 } else {
7987 ot.generate_new_write_ver(cct);
7988 if (pep_objv) {
7989 *pep_objv = ot.write_version;
7990 }
7991 }
b3b6e05e 7992 ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
9f95a23c
TL
7993 .set_exclusive(exclusive)
7994 .set_objv_tracker(&ot)
7995 .set_mtime(mtime));
7c673cae
FG
7996 if (ret < 0)
7997 return ret;
7998
7999 return 0;
8000}
8001
b3b6e05e 8002int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
7c673cae 8003{
11fdf7f2 8004 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
8005
8006 map<string, RGWBucketEnt>::iterator iter;
8007 for (iter = m.begin(); iter != m.end(); ++iter) {
8008 RGWBucketEnt& ent = iter->second;
8009 rgw_bucket& bucket = ent.bucket;
8010 ent.count = 0;
8011 ent.size = 0;
8012 ent.size_rounded = 0;
8013
a8e16298 8014 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
8015
8016 RGWBucketInfo bucket_info;
b3b6e05e 8017 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL, null_yield, dpp);
7c673cae
FG
8018 if (ret < 0) {
8019 return ret;
8020 }
8021
b3b6e05e 8022 int r = cls_bucket_head(dpp, bucket_info, RGW_NO_SHARD, headers);
7c673cae
FG
8023 if (r < 0)
8024 return r;
8025
a8e16298 8026 auto hiter = headers.begin();
7c673cae
FG
8027 for (; hiter != headers.end(); ++hiter) {
8028 RGWObjCategory category = main_category;
11fdf7f2 8029 auto iter = (hiter->stats).find(category);
a8e16298 8030 if (iter != hiter->stats.end()) {
7c673cae
FG
8031 struct rgw_bucket_category_stats& stats = iter->second;
8032 ent.count += stats.num_entries;
8033 ent.size += stats.total_size;
8034 ent.size_rounded += stats.total_size_rounded;
8035 }
8036 }
3efd9988
FG
8037
8038 // fill in placement_rule from the bucket instance for use in swift's
8039 // per-storage policy statistics
8040 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
8041 }
8042
8043 return m.size();
8044}
8045
b3b6e05e 8046int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
7c673cae
FG
8047{
8048 rgw_rados_ref ref;
b3b6e05e 8049 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
8050 if (r < 0) {
8051 return r;
8052 }
8053 librados::Rados *rad = get_rados_handle();
9f95a23c 8054 librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
7c673cae 8055
9f95a23c 8056 r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
7c673cae
FG
8057 completion->release();
8058 return r;
8059}
8060
b3b6e05e 8061int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
7c673cae
FG
8062{
8063 librados::IoCtx& io_ctx = ctx.io_ctx;
8064 librados::NObjectIterator& iter = ctx.iter;
8065
b3b6e05e 8066 int r = open_pool_ctx(dpp, pool, io_ctx, false);
7c673cae
FG
8067 if (r < 0)
8068 return r;
8069
8070 iter = io_ctx.nobjects_begin();
8071
8072 return 0;
8073}
8074
b3b6e05e 8075int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
181888fb
FG
8076{
8077 librados::IoCtx& io_ctx = ctx.io_ctx;
8078 librados::NObjectIterator& iter = ctx.iter;
8079
b3b6e05e 8080 int r = open_pool_ctx(dpp, pool, io_ctx, false);
181888fb
FG
8081 if (r < 0)
8082 return r;
8083
8084 librados::ObjectCursor oc;
8085 if (!oc.from_str(cursor)) {
b3b6e05e 8086 ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
181888fb
FG
8087 return -EINVAL;
8088 }
8089
f64942e4
AA
8090 try {
8091 iter = io_ctx.nobjects_begin(oc);
8092 return 0;
8093 } catch (const std::system_error& e) {
8094 r = -e.code().value();
b3b6e05e 8095 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
f64942e4
AA
8096 << ", returning " << r << dendl;
8097 return r;
8098 } catch (const std::exception& e) {
b3b6e05e 8099 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
f64942e4
AA
8100 << ", returning -5" << dendl;
8101 return -EIO;
8102 }
181888fb
FG
8103}
8104
8105string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
8106{
8107 return ctx.iter.get_cursor().to_str();
8108}
8109
20effc67 8110static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
f64942e4 8111 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
8112 bool *is_truncated, RGWAccessListFilter *filter)
8113{
8114 librados::IoCtx& io_ctx = ctx.io_ctx;
8115 librados::NObjectIterator& iter = ctx.iter;
8116
8117 if (iter == io_ctx.nobjects_end())
8118 return -ENOENT;
8119
8120 uint32_t i;
8121
8122 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
8123 rgw_bucket_dir_entry e;
8124
8125 string oid = iter->get_oid();
20effc67 8126 ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
7c673cae
FG
8127
8128 // fill it in with initial values; we may correct later
8129 if (filter && !filter->filter(oid, oid))
8130 continue;
8131
8132 e.key = oid;
8133 objs.push_back(e);
8134 }
8135
8136 if (is_truncated)
8137 *is_truncated = (iter != io_ctx.nobjects_end());
8138
8139 return objs.size();
8140}
7c673cae 8141
20effc67 8142int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
f64942e4
AA
8143 bool *is_truncated, RGWAccessListFilter *filter)
8144{
8145 // catch exceptions from NObjectIterator::operator++()
8146 try {
20effc67 8147 return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
f64942e4
AA
8148 } catch (const std::system_error& e) {
8149 int r = -e.code().value();
20effc67 8150 ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
f64942e4
AA
8151 << ", returning " << r << dendl;
8152 return r;
8153 } catch (const std::exception& e) {
20effc67 8154 ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
f64942e4
AA
8155 << ", returning -5" << dendl;
8156 return -EIO;
8157 }
8158}
8159
b3b6e05e 8160int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 8161{
181888fb 8162 if (!ctx->initialized) {
b3b6e05e 8163 int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
7c673cae 8164 if (r < 0) {
b3b6e05e 8165 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
7c673cae
FG
8166 return r;
8167 }
181888fb 8168 ctx->initialized = true;
7c673cae 8169 }
181888fb
FG
8170 return 0;
8171}
7c673cae 8172
b3b6e05e 8173int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
181888fb
FG
8174 RGWListRawObjsCtx& ctx, list<string>& oids,
8175 bool *is_truncated)
8176{
8177 if (!ctx.initialized) {
8178 return -EINVAL;
8179 }
8180 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae 8181 vector<rgw_bucket_dir_entry> objs;
20effc67 8182 int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
7c673cae
FG
8183 if (r < 0) {
8184 if(r != -ENOENT)
b3b6e05e 8185 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
7c673cae
FG
8186 return r;
8187 }
8188
8189 vector<rgw_bucket_dir_entry>::iterator iter;
8190 for (iter = objs.begin(); iter != objs.end(); ++iter) {
8191 oids.push_back(iter->key.name);
8192 }
8193
8194 return oids.size();
8195}
8196
b3b6e05e 8197int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
181888fb
FG
8198 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
8199 bool *is_truncated)
8200{
8201 if (!ctx.initialized) {
b3b6e05e 8202 int r = list_raw_objects_init(dpp, pool, string(), &ctx);
181888fb
FG
8203 if (r < 0) {
8204 return r;
8205 }
8206 }
8207
b3b6e05e 8208 return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
181888fb
FG
8209}
8210
8211string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
8212{
8213 return pool_iterate_get_cursor(ctx.iter_ctx);
8214}
8215
b3b6e05e 8216int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298 8217 rgw_bucket_dir_entry *dirent)
7c673cae 8218{
a8e16298 8219 rgw_cls_bi_entry bi_entry;
b3b6e05e 8220 int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
a8e16298 8221 if (r < 0 && r != -ENOENT) {
b3b6e05e 8222 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
a8e16298 8223 }
7c673cae
FG
8224 if (r < 0) {
8225 return r;
8226 }
11fdf7f2 8227 auto iter = bi_entry.data.cbegin();
a8e16298 8228 try {
11fdf7f2 8229 decode(*dirent, iter);
a8e16298 8230 } catch (buffer::error& err) {
b3b6e05e 8231 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
a8e16298
TL
8232 return -EIO;
8233 }
8234
8235 return 0;
8236}
7c673cae 8237
b3b6e05e 8238int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298
TL
8239 rgw_bucket_olh_entry *olh)
8240{
7c673cae 8241 rgw_cls_bi_entry bi_entry;
b3b6e05e 8242 int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
7c673cae 8243 if (r < 0 && r != -ENOENT) {
b3b6e05e 8244 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7c673cae
FG
8245 }
8246 if (r < 0) {
8247 return r;
8248 }
11fdf7f2 8249 auto iter = bi_entry.data.cbegin();
7c673cae 8250 try {
a8e16298 8251 decode(*olh, iter);
7c673cae 8252 } catch (buffer::error& err) {
b3b6e05e 8253 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7c673cae
FG
8254 return -EIO;
8255 }
8256
8257 return 0;
8258}
8259
b3b6e05e 8260int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298 8261 BIIndexType index_type, rgw_cls_bi_entry *entry)
7c673cae
FG
8262{
8263 BucketShard bs(this);
b3b6e05e 8264 int ret = bs.init(dpp, bucket_info, obj);
7c673cae 8265 if (ret < 0) {
b3b6e05e 8266 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8267 return ret;
8268 }
8269
8270 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
9f95a23c
TL
8271
8272 auto& ref = bs.bucket_obj.get_ref();
7c673cae 8273
9f95a23c 8274 return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
7c673cae
FG
8275}
8276
8277void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
8278{
9f95a23c
TL
8279 auto& ref = bs.bucket_obj.get_ref();
8280 cls_rgw_bi_put(op, ref.obj.oid, entry);
7c673cae
FG
8281}
8282
8283int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
8284{
9f95a23c
TL
8285 auto& ref = bs.bucket_obj.get_ref();
8286 int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
7c673cae
FG
8287 if (ret < 0)
8288 return ret;
8289
8290 return 0;
8291}
8292
b3b6e05e 8293int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
7c673cae 8294{
20effc67
TL
8295 // make sure incomplete multipart uploads are hashed correctly
8296 if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
8297 RGWMPObj mp;
8298 mp.from_meta(obj.key.name);
8299 obj.index_hash_source = mp.get_key();
8300 }
7c673cae 8301 BucketShard bs(this);
20effc67 8302
b3b6e05e 8303 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8304 if (ret < 0) {
b3b6e05e 8305 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8306 return ret;
8307 }
8308
8309 return bi_put(bs, entry);
8310}
8311
20effc67
TL
8312int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
8313 const string& obj_name_filter, const string& marker, uint32_t max,
8314 list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae 8315{
20effc67 8316 rgw_obj obj(bucket, obj_name_filter);
7c673cae 8317 BucketShard bs(this);
b3b6e05e 8318 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8319 if (ret < 0) {
b3b6e05e 8320 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8321 return ret;
8322 }
8323
9f95a23c 8324 auto& ref = bs.bucket_obj.get_ref();
20effc67 8325 ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
31f18b77
FG
8326 if (ret == -ENOENT) {
8327 *is_truncated = false;
8328 }
7c673cae
FG
8329 if (ret < 0)
8330 return ret;
8331
8332 return 0;
8333}
8334
20effc67
TL
8335int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
8336 list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae 8337{
9f95a23c 8338 auto& ref = bs.bucket_obj.get_ref();
20effc67 8339 int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
7c673cae
FG
8340 if (ret < 0)
8341 return ret;
8342
8343 return 0;
8344}
8345
20effc67
TL
8346int RGWRados::bi_list(const DoutPrefixProvider *dpp,
8347 const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
8348 list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae 8349{
20effc67
TL
8350 BucketShard bs(this);
8351 int ret = bs.init(bucket_info.bucket, shard_id, bucket_info.layout.current_index, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8352 if (ret < 0) {
20effc67 8353 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8354 return ret;
8355 }
8356
20effc67 8357 return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
7c673cae
FG
8358}
8359
20effc67 8360int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
7c673cae 8361{
20effc67
TL
8362 auto& ref = bs.bucket_obj.get_ref();
8363 int ret = ref.pool.ioctx().remove(ref.obj.oid);
8364 if (ret == -ENOENT) {
8365 ret = 0;
8366 }
7c673cae 8367 if (ret < 0) {
20effc67 8368 ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
7c673cae
FG
8369 return ret;
8370 }
8371
20effc67 8372 return 0;
7c673cae
FG
8373}
8374
b3b6e05e 8375int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
7c673cae 8376{
b3b6e05e 8377 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
7c673cae
FG
8378}
8379
9f95a23c
TL
8380int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
8381 librados::ObjectWriteOperation *op)
7c673cae 8382{
9f95a23c 8383 return gc_pool_ctx.aio_operate(oid, c, op);
7c673cae
FG
8384}
8385
b3b6e05e 8386int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
7c673cae 8387{
b3b6e05e 8388 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
7c673cae
FG
8389}
8390
9f95a23c 8391int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
7c673cae 8392{
9f95a23c 8393 return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
7c673cae
FG
8394}
8395
11fdf7f2 8396int RGWRados::process_gc(bool expired_only)
7c673cae 8397{
11fdf7f2 8398 return gc->process(expired_only);
7c673cae
FG
8399}
8400
f6b5b4d7 8401int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
f67539c2 8402 vector<rgw::sal::Lifecycle::LCEntry>& progress_map,
f6b5b4d7 8403 int& index)
7c673cae 8404{
f6b5b4d7 8405 return lc->list_lc_progress(marker, max_entries, progress_map, index);
7c673cae
FG
8406}
8407
20effc67 8408int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
7c673cae 8409{
f6b5b4d7
TL
8410 RGWLC lc;
8411 lc.initialize(cct, this->store);
8412 RGWLC::LCWorker worker(&lc, cct, &lc, 0);
20effc67 8413 auto ret = lc.process(&worker, optional_bucket, true /* once */);
f6b5b4d7
TL
8414 lc.stop_processor(); // sets down_flag, but returns immediately
8415 return ret;
7c673cae
FG
8416}
8417
b3b6e05e 8418bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
7c673cae 8419{
b3b6e05e 8420 return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
7c673cae
FG
8421}
8422
b3b6e05e 8423int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
9f95a23c 8424 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
7c673cae 8425{
31f18b77
FG
8426 rgw_zone_set zones_trace;
8427 if (_zones_trace) {
8428 zones_trace = *_zones_trace;
8429 }
9f95a23c 8430 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8431
7c673cae 8432 ObjectWriteOperation o;
39ae355f
TL
8433 o.assert_exists(); // bucket index shard must exist
8434
7c673cae 8435 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77 8436 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
11fdf7f2 8437 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
b3b6e05e 8438 return bs.bucket_obj.operate(dpp, &o, y);
7c673cae
FG
8439}
8440
31f18b77 8441int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
8442 int64_t pool, uint64_t epoch,
8443 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8444 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 8445{
7c673cae 8446 ObjectWriteOperation o;
39ae355f
TL
8447 o.assert_exists(); // bucket index shard must exist
8448
7c673cae
FG
8449 rgw_bucket_dir_entry_meta dir_meta;
8450 dir_meta = ent.meta;
8451 dir_meta.category = category;
8452
1adf2230
AA
8453 rgw_zone_set zones_trace;
8454 if (_zones_trace) {
8455 zones_trace = *_zones_trace;
8456 }
9f95a23c 8457 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8458
7c673cae
FG
8459 rgw_bucket_entry_ver ver;
8460 ver.pool = pool;
8461 ver.epoch = epoch;
8462 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
8463 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
8464 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8465 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
8466 complete_op_data *arg;
8467 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8468 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77 8469 librados::AioCompletion *completion = arg->rados_completion;
9f95a23c 8470 int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
31f18b77 8471 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
8472 return ret;
8473}
8474
31f18b77 8475int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
8476 int64_t pool, uint64_t epoch,
8477 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8478 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 8479{
31f18b77 8480 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
8481}
8482
8483int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
8484 int64_t pool, uint64_t epoch,
8485 rgw_obj& obj,
8486 real_time& removed_mtime,
8487 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
8488 uint16_t bilog_flags,
8489 rgw_zone_set *zones_trace)
7c673cae
FG
8490{
8491 rgw_bucket_dir_entry ent;
8492 ent.meta.mtime = removed_mtime;
8493 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8494 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
8495 ent, RGWObjCategory::None, remove_objs,
8496 bilog_flags, zones_trace);
7c673cae
FG
8497}
8498
20effc67
TL
8499int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
8500 list<rgw_obj_index_key> *remove_objs,
8501 uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
8502{
8503 rgw_bucket_dir_entry ent;
8504 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8505 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
8506 -1 /* pool id */, 0, ent,
20effc67 8507 RGWObjCategory::None, remove_objs, bilog_flags,
11fdf7f2 8508 zones_trace);
7c673cae
FG
8509}
8510
b3b6e05e 8511int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
7c673cae 8512{
9f95a23c 8513 RGWSI_RADOS::Pool index_pool;
7c673cae 8514 map<int, string> bucket_objs;
b3b6e05e 8515 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
8516 if (r < 0)
8517 return r;
8518
9f95a23c
TL
8519 return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
8520}
8521
8522
8523uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
8524 uint32_t num_shards)
8525{
8526 // We want to minimize the chances that when num_shards >>
8527 // num_entries that we return much fewer than num_entries to the
8528 // client. Given all the overhead of making a cls call to the osd,
8529 // returning a few entries is not much more work than returning one
8530 // entry. This minimum might be better tuned based on future
8531 // experiments where num_shards >> num_entries. (Note: ">>" should
8532 // be interpreted as "much greater than".)
8533 constexpr uint32_t min_read = 8;
8534
8535 // The following is based on _"Balls into Bins" -- A Simple and
8536 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8537 // cases when num_shards >> num_entries (it almost serves as a
8538 // ceiling calculation). We also assume alpha is 1.0 and extract it
8539 // from the calculation. Future work could involve memoizing some of
8540 // the transcendental functions to minimize repeatedly re-calling
8541 // them with the same parameters, which we expect to be the case the
8542 // majority of the time.
8543 uint32_t calc_read =
8544 1 +
8545 static_cast<uint32_t>((num_entries / num_shards) +
8546 sqrt((2 * num_entries) *
8547 log(num_shards) / num_shards));
8548
8549 return std::max(min_read, calc_read);
7c673cae
FG
8550}
8551
1adf2230 8552
20effc67 8553int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
b3b6e05e 8554 RGWBucketInfo& bucket_info,
9f95a23c
TL
8555 const int shard_id,
8556 const rgw_obj_index_key& start_after,
20effc67
TL
8557 const std::string& prefix,
8558 const std::string& delimiter,
9f95a23c
TL
8559 const uint32_t num_entries,
8560 const bool list_versions,
8561 const uint16_t expansion_factor,
8562 ent_map_t& m,
8563 bool* is_truncated,
8564 bool* cls_filtered,
20effc67 8565 rgw_obj_index_key* last_entry,
9f95a23c 8566 optional_yield y,
20effc67 8567 RGWBucketListNameFilter force_check_filter)
7c673cae 8568{
9f95a23c
TL
8569 /* expansion_factor allows the number of entries to read to grow
8570 * exponentially; this is used when earlier reads are producing too
8571 * few results, perhaps due to filtering or to a series of
8572 * namespaced entries */
8573
20effc67
TL
8574 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": " <<
8575 bucket_info.bucket <<
8576 " start_after=\"" << start_after <<
8577 "\", prefix=\"" << prefix <<
8578 ", delimiter=\"" << delimiter <<
8579 "\", shard_id=" << shard_id <<
8580 "\", num_entries=" << num_entries <<
9f95a23c 8581 ", list_versions=" << list_versions <<
20effc67
TL
8582 ", expansion_factor=" << expansion_factor <<
8583 ", force_check_filter is " <<
8584 (force_check_filter ? "set" : "unset") << dendl;
7c673cae 8585
9f95a23c
TL
8586 m.clear();
8587
8588 RGWSI_RADOS::Pool index_pool;
7c673cae 8589 // key - oid (for different shards if there is any)
1adf2230
AA
8590 // value - list result for the corresponding oid (shard), it is filled by
8591 // the AIO callback
20effc67 8592 std::map<int, std::string> shard_oids;
b3b6e05e 8593 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id,
9f95a23c
TL
8594 &index_pool, &shard_oids,
8595 nullptr);
8596 if (r < 0) {
20effc67
TL
8597 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
8598 ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
7c673cae 8599 return r;
9f95a23c
TL
8600 }
8601
8602 const uint32_t shard_count = shard_oids.size();
8603 uint32_t num_entries_per_shard;
8604 if (expansion_factor == 0) {
8605 num_entries_per_shard =
8606 calc_ordered_bucket_list_per_shard(num_entries, shard_count);
8607 } else if (expansion_factor <= 11) {
8608 // we'll max out the exponential multiplication factor at 1024 (2<<10)
8609 num_entries_per_shard =
8610 std::min(num_entries,
8611 (uint32_t(1 << (expansion_factor - 1)) *
8612 calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
8613 } else {
8614 num_entries_per_shard = num_entries;
8615 }
8616
20effc67
TL
8617 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
8618 ": request from each of " << shard_count <<
9f95a23c
TL
8619 " shard(s) for " << num_entries_per_shard << " entries to get " <<
8620 num_entries << " total entries" << dendl;
7c673cae 8621
9f95a23c 8622 auto& ioctx = index_pool.ioctx();
20effc67 8623 std::map<int, rgw_cls_list_ret> shard_list_results;
9f95a23c
TL
8624 cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
8625 r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
8626 num_entries_per_shard,
8627 list_versions, shard_oids, shard_list_results,
1adf2230 8628 cct->_conf->rgw_bucket_index_max_aio)();
9f95a23c 8629 if (r < 0) {
20effc67
TL
8630 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
8631 ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
8632 " failed" << dendl;
7c673cae 8633 return r;
9f95a23c 8634 }
7c673cae 8635
9f95a23c
TL
8636 // to manage the iterators through each shard's list results
8637 struct ShardTracker {
8638 const size_t shard_idx;
8639 rgw_cls_list_ret& result;
8640 const std::string& oid_name;
8641 RGWRados::ent_map_t::iterator cursor;
8642 RGWRados::ent_map_t::iterator end;
8643
8644 // manages an iterator through a shard and provides other
8645 // accessors
8646 ShardTracker(size_t _shard_idx,
8647 rgw_cls_list_ret& _result,
8648 const std::string& _oid_name):
8649 shard_idx(_shard_idx),
8650 result(_result),
8651 oid_name(_oid_name),
8652 cursor(_result.dir.m.begin()),
8653 end(_result.dir.m.end())
8654 {}
8655
8656 inline const std::string& entry_name() const {
8657 return cursor->first;
8658 }
8659 rgw_bucket_dir_entry& dir_entry() const {
8660 return cursor->second;
8661 }
8662 inline bool is_truncated() const {
8663 return result.is_truncated;
8664 }
8665 inline ShardTracker& advance() {
8666 ++cursor;
8667 // return a self-reference to allow for chaining of calls, such
8668 // as x.advance().at_end()
8669 return *this;
8670 }
8671 inline bool at_end() const {
8672 return cursor == end;
8673 }
8674 }; // ShardTracker
8675
8676 // add the next unique candidate, or return false if we reach the end
f67539c2 8677 auto next_candidate = [] (CephContext *cct, ShardTracker& t,
39ae355f 8678 std::multimap<std::string, size_t>& candidates,
9f95a23c 8679 size_t tracker_idx) {
39ae355f
TL
8680 if (!t.at_end()) {
8681 candidates.emplace(t.entry_name(), tracker_idx);
9f95a23c 8682 }
39ae355f 8683 return;
9f95a23c
TL
8684 };
8685
8686 // one tracker per shard requested (may not be all shards)
8687 std::vector<ShardTracker> results_trackers;
8688 results_trackers.reserve(shard_list_results.size());
8689 for (auto& r : shard_list_results) {
8690 results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
8691
8692 // if any *one* shard's result is trucated, the entire result is
8693 // truncated
8694 *is_truncated = *is_truncated || r.second.is_truncated;
8695
8696 // unless *all* are shards are cls_filtered, the entire result is
8697 // not filtered
8698 *cls_filtered = *cls_filtered && r.second.cls_filtered;
7c673cae
FG
8699 }
8700
9f95a23c
TL
8701 // create a map to track the next candidate entry from ShardTracker
8702 // (key=candidate, value=index into results_trackers); as we consume
8703 // entries from shards, we replace them with the next entries in the
8704 // shards until we run out
39ae355f 8705 std::multimap<std::string, size_t> candidates;
9f95a23c 8706 size_t tracker_idx = 0;
39ae355f
TL
8707 std::vector<size_t> vidx;
8708 vidx.reserve(shard_list_results.size());
9f95a23c
TL
8709 for (auto& t : results_trackers) {
8710 // it's important that the values in the map refer to the index
8711 // into the results_trackers vector, which may not be the same
8712 // as the shard number (i.e., when not all shards are requested)
f67539c2 8713 next_candidate(cct, t, candidates, tracker_idx);
9f95a23c 8714 ++tracker_idx;
7c673cae
FG
8715 }
8716
9f95a23c
TL
8717 rgw_bucket_dir_entry*
8718 last_entry_visited = nullptr; // to set last_entry (marker)
20effc67 8719 std::map<std::string, bufferlist> updates;
7c673cae
FG
8720 uint32_t count = 0;
8721 while (count < num_entries && !candidates.empty()) {
8722 r = 0;
9f95a23c
TL
8723 // select the next entry in lexical order (first key in map);
8724 // again tracker_idx is not necessarily shard number, but is index
8725 // into results_trackers vector
8726 tracker_idx = candidates.begin()->second;
8727 auto& tracker = results_trackers.at(tracker_idx);
e306af50 8728
20effc67 8729 const std::string& name = tracker.entry_name();
9f95a23c
TL
8730 rgw_bucket_dir_entry& dirent = tracker.dir_entry();
8731
20effc67 8732 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ": currently processing " <<
9f95a23c
TL
8733 dirent.key << " from shard " << tracker.shard_idx << dendl;
8734
8735 const bool force_check =
8736 force_check_filter && force_check_filter(dirent.key.name);
8737
8738 if ((!dirent.exists &&
8739 !dirent.is_delete_marker() &&
8740 !dirent.is_common_prefix()) ||
3efd9988
FG
8741 !dirent.pending_map.empty() ||
8742 force_check) {
9f95a23c
TL
8743 /* there are uncommitted ops. We need to check the current
8744 * state, and if the tags are old we need to do clean-up as
8745 * well. */
7c673cae 8746 librados::IoCtx sub_ctx;
9f95a23c 8747 sub_ctx.dup(ioctx);
b3b6e05e 8748 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
9f95a23c 8749 updates[tracker.oid_name], y);
7c673cae 8750 if (r < 0 && r != -ENOENT) {
20effc67
TL
8751 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
8752 ": check_disk_state for \"" << dirent.key <<
8753 "\" failed with r=" << r << dendl;
9f95a23c 8754 return r;
7c673cae 8755 }
eafe8130 8756 } else {
9f95a23c 8757 r = 0;
7c673cae 8758 }
9f95a23c 8759
20effc67
TL
8760 // at this point either r >= 0 or r == -ENOENT
8761 if (r >= 0) { // i.e., if r != -ENOENT
8762 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": got " <<
8763 dirent.key << dendl;
8764
8765 auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
8766 last_entry_visited = &it->second;
8767 if (inserted) {
8768 ++count;
8769 } else {
8770 ldpp_dout(dpp, 0) << "WARNING: " << __PRETTY_FUNCTION__ <<
8771 " reassigned map value at \"" << name <<
8772 "\", which should not happen" << dendl;
8773 }
9f95a23c 8774 } else {
20effc67 8775 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": skipping " <<
9f95a23c 8776 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
e306af50 8777 last_entry_visited = &tracker.dir_entry();
7c673cae
FG
8778 }
8779
9f95a23c 8780 // refresh the candidates map
39ae355f
TL
8781 vidx.clear();
8782 bool need_to_stop = false;
8783 auto range = candidates.equal_range(name);
8784 for (auto i = range.first; i != range.second; ++i) {
8785 vidx.push_back(i->second);
8786 }
8787 candidates.erase(range.first, range.second);
8788 for (auto idx : vidx) {
8789 auto& tracker_match = results_trackers.at(idx);
8790 tracker_match.advance();
8791 next_candidate(cct, tracker_match, candidates, idx);
8792 if (tracker_match.at_end() && tracker_match.is_truncated()) {
8793 need_to_stop = true;
8794 break;
8795 }
8796 }
8797 if (need_to_stop) {
9f95a23c
TL
8798 // once we exhaust one shard that is truncated, we need to stop,
8799 // as we cannot be certain that one of the next entries needs to
8800 // come from that shard; S3 and swift protocols allow returning
8801 // fewer than what was requested
20effc67
TL
8802 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
8803 ": stopped accumulating results at count=" << count <<
8804 ", dirent=\"" << dirent.key <<
39ae355f 8805 "\", because its shard is truncated and exhausted" << dendl;
9f95a23c 8806 break;
7c673cae 8807 }
9f95a23c 8808 } // while we haven't provided requested # of result entries
7c673cae 8809
9f95a23c
TL
8810 // suggest updates if there are any
8811 for (auto& miter : updates) {
8812 if (miter.second.length()) {
7c673cae 8813 ObjectWriteOperation o;
9f95a23c 8814 cls_rgw_suggest_changes(o, miter.second);
7c673cae 8815 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8816 AioCompletion *c =
8817 librados::Rados::aio_create_completion(nullptr, nullptr);
8818 ioctx.aio_operate(miter.first, c, &o);
1adf2230 8819 c->release();
7c673cae 8820 }
9f95a23c 8821 } // updates loop
7c673cae 8822
9f95a23c
TL
8823 // determine truncation by checking if all the returned entries are
8824 // consumed or not
8825 *is_truncated = false;
8826 for (const auto& t : results_trackers) {
8827 if (!t.at_end() || t.is_truncated()) {
7c673cae 8828 *is_truncated = true;
1adf2230
AA
8829 break;
8830 }
7c673cae 8831 }
92f5a8d4 8832
20effc67 8833 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
9f95a23c
TL
8834 ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
8835 dendl;
8836
8837 if (*is_truncated && count < num_entries) {
20effc67
TL
8838 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
8839 ": requested " << num_entries << " entries but returning " <<
9f95a23c
TL
8840 count << ", which is truncated" << dendl;
8841 }
8842
8843 if (last_entry_visited != nullptr && last_entry) {
e306af50 8844 *last_entry = last_entry_visited->key;
20effc67 8845 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
9f95a23c
TL
8846 ": returning, last_entry=" << *last_entry << dendl;
8847 } else {
20effc67 8848 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
9f95a23c
TL
8849 ": returning, last_entry NOT SET" << dendl;
8850 }
7c673cae
FG
8851
8852 return 0;
8853}
8854
1adf2230 8855
20effc67
TL
// Derive the index hash source of an incomplete multipart entry: the part
// of `oid_wo_ns` that precedes its second-to-last period.
//
// Returns 0 and stores the result in *index_hash_source on success.
// Returns -EINVAL (leaving *index_hash_source untouched) when the name has
// fewer than two periods or when either of the last two periods would leave
// an empty leading part.
static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
  constexpr std::size_t not_found = std::string::npos;

  const std::size_t last_dot = oid_wo_ns.rfind('.');
  if (last_dot == not_found || last_dot == 0) {
    return -EINVAL;
  }

  const std::size_t prev_dot = oid_wo_ns.rfind('.', last_dot - 1);
  if (prev_dot == not_found || prev_dot == 0) {
    return -EINVAL;
  }

  *index_hash_source = oid_wo_ns.substr(0, prev_dot);
  return 0;
}
8871
8872
20effc67 8873int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
b3b6e05e 8874 RGWBucketInfo& bucket_info,
1adf2230 8875 int shard_id,
9f95a23c 8876 const rgw_obj_index_key& start_after,
20effc67 8877 const std::string& prefix,
1adf2230
AA
8878 uint32_t num_entries,
8879 bool list_versions,
8880 std::vector<rgw_bucket_dir_entry>& ent_list,
8881 bool *is_truncated,
8882 rgw_obj_index_key *last_entry,
9f95a23c 8883 optional_yield y,
20effc67
TL
8884 RGWBucketListNameFilter force_check_filter) {
8885 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << " " <<
8886 bucket_info.bucket <<
8887 " start_after=\"" << start_after <<
8888 "\", prefix=\"" << prefix <<
8889 "\", shard_id=" << shard_id <<
8890 "\", num_entries=" << num_entries <<
8891 ", list_versions=" << list_versions <<
8892 ", force_check_filter is " <<
8893 (force_check_filter ? "set" : "unset") << dendl;
1adf2230 8894
9f95a23c 8895 ent_list.clear();
11fdf7f2
TL
8896 static MultipartMetaFilter multipart_meta_filter;
8897
1adf2230 8898 *is_truncated = false;
9f95a23c 8899 RGWSI_RADOS::Pool index_pool;
1adf2230 8900
20effc67 8901 std::map<int, std::string> oids;
b3b6e05e 8902 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, nullptr);
20effc67 8903 if (r < 0) {
1adf2230 8904 return r;
20effc67 8905 }
9f95a23c
TL
8906
8907 auto& ioctx = index_pool.ioctx();
8908
1adf2230
AA
8909 const uint32_t num_shards = oids.size();
8910
9f95a23c 8911 rgw_obj_index_key marker = start_after;
1adf2230
AA
8912 uint32_t current_shard;
8913 if (shard_id >= 0) {
8914 current_shard = shard_id;
9f95a23c 8915 } else if (start_after.empty()) {
1adf2230
AA
8916 current_shard = 0u;
8917 } else {
9f95a23c
TL
8918 // at this point we have a marker (start_after) that has something
8919 // in it, so we need to get to the bucket shard index, so we can
11fdf7f2
TL
8920 // start reading from there
8921
11fdf7f2
TL
8922
8923 // now convert the key (oid) to an rgw_obj_key since that will
8924 // separate out the namespace, name, and instance
8925 rgw_obj_key obj_key;
522d829b 8926 bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
11fdf7f2 8927 if (!parsed) {
20effc67
TL
8928 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8929 " received an invalid start marker: \"" << start_after << "\"" <<
8930 dendl;
11fdf7f2
TL
8931 return -EINVAL;
8932 } else if (obj_key.name.empty()) {
8933 // if the name is empty that means the object name came in with
8934 // a namespace only, and therefore we need to start our scan at
8935 // the first bucket index shard
8936 current_shard = 0u;
8937 } else {
8938 // so now we have the key used to compute the bucket index shard
8939 // and can extract the specific shard from it
522d829b
TL
8940 if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
8941 // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
8942 // the implementation relying on MultipartMetaFilter
8943 // because MultipartMetaFilter only checks .meta suffix, which may
8944 // exclude data multiparts but include some regular objects with .meta suffix
8945 // by mistake.
8946 string index_hash_source;
8947 r = parse_index_hash_source(obj_key.name, &index_hash_source);
8948 if (r < 0) {
20effc67
TL
8949 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8950 " parse_index_hash_source unable to parse \"" << obj_key.name <<
8951 "\", r=" << r << dendl;
522d829b
TL
8952 return r;
8953 }
8954 current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
8955 } else {
8956 current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
8957 }
11fdf7f2 8958 }
1adf2230
AA
8959 }
8960
8961 uint32_t count = 0u;
20effc67 8962 std::map<std::string, bufferlist> updates;
11fdf7f2 8963 rgw_obj_index_key last_added_entry;
1adf2230
AA
8964 while (count <= num_entries &&
8965 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
8966 current_shard < num_shards)) {
81eedcae
TL
8967 const std::string& oid = oids[current_shard];
8968 rgw_cls_list_ret result;
8969
8970 librados::ObjectReadOperation op;
20effc67 8971 const std::string empty_delimiter;
9f95a23c
TL
8972 cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
8973 num_entries,
81eedcae 8974 list_versions, &result);
b3b6e05e 8975 r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
20effc67
TL
8976 if (r < 0) {
8977 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8978 " error in rgw_rados_operate (bucket list op), r=" << r << dendl;
1adf2230 8979 return r;
20effc67 8980 }
1adf2230 8981
1adf2230
AA
8982 for (auto& entry : result.dir.m) {
8983 rgw_bucket_dir_entry& dirent = entry.second;
8984
8985 bool force_check = force_check_filter &&
8986 force_check_filter(dirent.key.name);
8987 if ((!dirent.exists && !dirent.is_delete_marker()) ||
8988 !dirent.pending_map.empty() ||
8989 force_check) {
8990 /* there are uncommitted ops. We need to check the current state,
8991 * and if the tags are old we need to do cleanup as well. */
8992 librados::IoCtx sub_ctx;
9f95a23c 8993 sub_ctx.dup(ioctx);
b3b6e05e 8994 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
1adf2230 8995 if (r < 0 && r != -ENOENT) {
20effc67
TL
8996 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8997 " error in check_disk_state, r=" << r << dendl;
1adf2230
AA
8998 return r;
8999 }
eafe8130
TL
9000 } else {
9001 r = 0;
1adf2230
AA
9002 }
9003
20effc67 9004 // at this point either r >= 0 or r == -ENOENT
1adf2230 9005 if (r >= 0) { // i.e., if r != -ENOENT
20effc67
TL
9006 ldpp_dout(dpp, 10) << __func__ << ": got " <<
9007 dirent.key << dendl;
1adf2230
AA
9008
9009 if (count < num_entries) {
11fdf7f2 9010 marker = last_added_entry = dirent.key; // double assign
1adf2230
AA
9011 ent_list.emplace_back(std::move(dirent));
9012 ++count;
9013 } else {
20effc67 9014 last_added_entry = dirent.key;
1adf2230 9015 *is_truncated = true;
20effc67
TL
9016 ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
9017 ": reached max entries (" << num_entries << ") to return at \"" <<
9018 dirent.key << "\"" << dendl;
1adf2230
AA
9019 goto check_updates;
9020 }
9021 } else { // r == -ENOENT
9022 // in the case of -ENOENT, make sure we're advancing marker
9023 // for possible next call to CLSRGWIssueBucketList
11fdf7f2 9024 marker = dirent.key;
1adf2230
AA
9025 }
9026 } // entry for loop
9027
9028 if (!result.is_truncated) {
9029 // if we reached the end of the shard read next shard
9030 ++current_shard;
11fdf7f2 9031 marker = rgw_obj_index_key();
1adf2230
AA
9032 }
9033 } // shard loop
9034
9035check_updates:
11fdf7f2 9036
1adf2230 9037 // suggest updates if there is any
20effc67 9038 std::map<std::string, bufferlist>::iterator miter = updates.begin();
1adf2230
AA
9039 for (; miter != updates.end(); ++miter) {
9040 if (miter->second.length()) {
9041 ObjectWriteOperation o;
9042 cls_rgw_suggest_changes(o, miter->second);
9043 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
9044 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9045 ioctx.aio_operate(miter->first, c, &o);
1adf2230
AA
9046 c->release();
9047 }
9048 }
9049
9050 if (last_entry && !ent_list.empty()) {
9051 *last_entry = last_added_entry;
9052 }
9053
9054 return 0;
11fdf7f2 9055} // RGWRados::cls_bucket_list_unordered
1adf2230
AA
9056
9057
b3b6e05e 9058int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
1adf2230 9059 rgw_usage_log_info& info)
7c673cae 9060{
11fdf7f2 9061 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9062
9063 rgw_rados_ref ref;
b3b6e05e 9064 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
9065 if (r < 0) {
9066 return r;
9067 }
9068
9069 ObjectWriteOperation op;
9070 cls_rgw_usage_log_add(op, info);
9071
b3b6e05e 9072 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
9073 return r;
9074}
9075
b3b6e05e 9076int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
11fdf7f2
TL
9077 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
9078 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
9079 bool *is_truncated)
7c673cae 9080{
11fdf7f2 9081 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9082
9083 rgw_rados_ref ref;
b3b6e05e 9084 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
9085 if (r < 0) {
9086 return r;
9087 }
9088
9089 *is_truncated = false;
9090
9f95a23c 9091 r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
7c673cae
FG
9092 max_entries, read_iter, usage, is_truncated);
9093
9094 return r;
9095}
9096
b3b6e05e 9097static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
9f95a23c
TL
9098{
9099 bool done = false;
9100 do {
9101 librados::ObjectWriteOperation op;
9102 cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
b3b6e05e 9103 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
9f95a23c
TL
9104 if (r == -ENODATA)
9105 done = true;
9106 else if (r < 0)
9107 return r;
9108 } while (!done);
9109
9110 return 0;
9111}
9112
b3b6e05e 9113int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
11fdf7f2 9114 uint64_t start_epoch, uint64_t end_epoch)
7c673cae 9115{
11fdf7f2 9116 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9117
9118 rgw_rados_ref ref;
b3b6e05e 9119 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
9120 if (r < 0) {
9121 return r;
9122 }
9123
b3b6e05e 9124 r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
11fdf7f2
TL
9125 return r;
9126}
9127
b3b6e05e 9128int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
11fdf7f2
TL
9129{
9130 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9131
9132 rgw_rados_ref ref;
b3b6e05e 9133 int r = get_raw_obj_ref(dpp, obj, &ref);
11fdf7f2
TL
9134 if (r < 0) {
9135 return r;
9136 }
9137 librados::ObjectWriteOperation op;
9138 cls_rgw_usage_log_clear(op);
b3b6e05e 9139 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
9140 return r;
9141}
9142
11fdf7f2 9143
b3b6e05e 9144int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
7c673cae 9145{
9f95a23c 9146 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
9147 string dir_oid;
9148
11fdf7f2 9149 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae 9150
b3b6e05e 9151 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, &index_pool, &dir_oid);
7c673cae
FG
9152 if (r < 0)
9153 return r;
9154
9155 bufferlist updates;
9156
9157 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
9158 rgw_bucket_dir_entry entry;
9159 entry.key = *iter;
b3b6e05e 9160 ldpp_dout(dpp, 2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
7c673cae
FG
9161 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9162 updates.append(CEPH_RGW_REMOVE | suggest_flag);
11fdf7f2 9163 encode(entry, updates);
7c673cae
FG
9164 }
9165
9166 bufferlist out;
9167
9f95a23c 9168 r = index_pool.ioctx().exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
7c673cae
FG
9169
9170 return r;
9171}
9172
20effc67 9173int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
b3b6e05e 9174 librados::IoCtx io_ctx,
7c673cae
FG
9175 const RGWBucketInfo& bucket_info,
9176 rgw_bucket_dir_entry& list_state,
9177 rgw_bucket_dir_entry& object,
9f95a23c
TL
9178 bufferlist& suggested_updates,
9179 optional_yield y)
7c673cae
FG
9180{
9181 const rgw_bucket& bucket = bucket_info.bucket;
11fdf7f2 9182 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae
FG
9183
9184 std::string loc;
9185
9186 rgw_obj obj(bucket, list_state.key);
9187
9188 string oid;
9189 get_obj_bucket_and_oid_loc(obj, oid, loc);
9190
9191 if (loc != list_state.locator) {
b3b6e05e 9192 ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
7c673cae
FG
9193 }
9194
9195 io_ctx.locator_set_key(list_state.locator);
9196
9197 RGWObjState *astate = NULL;
9f95a23c 9198 RGWObjectCtx rctx(this->store);
b3b6e05e 9199 int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
9200 if (r < 0)
9201 return r;
9202
9203 list_state.pending_map.clear(); // we don't need this and it inflates size
9f95a23c 9204 if (!list_state.is_delete_marker() && !astate->exists) {
7c673cae
FG
9205 /* object doesn't exist right now -- hopefully because it's
9206 * marked as !exists and got deleted */
9207 if (list_state.exists) {
9208 /* FIXME: what should happen now? Work out if there are any
9209 * non-bad ways this could happen (there probably are, but annoying
9210 * to handle!) */
9211 }
20effc67 9212
7c673cae
FG
9213 // encode a suggested removal of that key
9214 list_state.ver.epoch = io_ctx.get_last_version();
9215 list_state.ver.pool = io_ctx.get_id();
9216 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
9217 return -ENOENT;
9218 }
9219
9220 string etag;
9221 string content_type;
2a845540 9222 string storage_class;
7c673cae
FG
9223 ACLOwner owner;
9224
9225 object.meta.size = astate->size;
9226 object.meta.accounted_size = astate->accounted_size;
9227 object.meta.mtime = astate->mtime;
9228
9229 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
9230 if (iter != astate->attrset.end()) {
11fdf7f2 9231 etag = rgw_bl_str(iter->second);
7c673cae
FG
9232 }
9233 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
9234 if (iter != astate->attrset.end()) {
11fdf7f2 9235 content_type = rgw_bl_str(iter->second);
7c673cae 9236 }
2a845540
TL
9237 iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
9238 if (iter != astate->attrset.end()) {
9239 storage_class = rgw_bl_str(iter->second);
9240 }
7c673cae
FG
9241 iter = astate->attrset.find(RGW_ATTR_ACL);
9242 if (iter != astate->attrset.end()) {
20effc67 9243 r = decode_policy(dpp, iter->second, &owner);
7c673cae 9244 if (r < 0) {
b3b6e05e 9245 ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
7c673cae
FG
9246 }
9247 }
9248
9f95a23c 9249 if (astate->manifest) {
7c673cae 9250 RGWObjManifest::obj_iterator miter;
9f95a23c 9251 RGWObjManifest& manifest = *astate->manifest;
b3b6e05e 9252 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
f67539c2 9253 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(store);
7c673cae 9254 rgw_obj loc;
9f95a23c 9255 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
7c673cae
FG
9256
9257 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
b3b6e05e
TL
9258 ldpp_dout(dpp, 0) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
9259 r = delete_obj_index(loc, astate->mtime, dpp);
7c673cae 9260 if (r < 0) {
b3b6e05e 9261 ldpp_dout(dpp, 0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
7c673cae
FG
9262 }
9263 }
9264 }
9265 }
9266
9267 object.meta.etag = etag;
9268 object.meta.content_type = content_type;
2a845540 9269 object.meta.storage_class = storage_class;
7c673cae
FG
9270 object.meta.owner = owner.get_id().to_str();
9271 object.meta.owner_display_name = owner.get_display_name();
9272
9273 // encode suggested updates
20effc67 9274
7c673cae
FG
9275 list_state.meta.size = object.meta.size;
9276 list_state.meta.accounted_size = object.meta.accounted_size;
9277 list_state.meta.mtime = object.meta.mtime;
9278 list_state.meta.category = main_category;
9279 list_state.meta.etag = etag;
9280 list_state.meta.content_type = content_type;
2a845540 9281 list_state.meta.storage_class = storage_class;
20effc67
TL
9282
9283 librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
39ae355f
TL
9284 r = get_obj_head_ioctx(dpp, bucket_info, obj, &head_obj_ctx);
9285 if (r < 0) {
20effc67
TL
9286 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
9287 " WARNING: unable to find head object data pool for \"" <<
9288 obj << "\", not updating version pool/epoch" << dendl;
1d09f67e
TL
9289 } else {
9290 list_state.ver.pool = head_obj_ctx.get_id();
9291 list_state.ver.epoch = astate->epoch;
20effc67
TL
9292 }
9293
9294 if (astate->obj_tag.length() > 0) {
7c673cae 9295 list_state.tag = astate->obj_tag.c_str();
20effc67
TL
9296 }
9297
7c673cae
FG
9298 list_state.meta.owner = owner.get_id().to_str();
9299 list_state.meta.owner_display_name = owner.get_display_name();
9300
9301 list_state.exists = true;
20effc67 9302
7c673cae
FG
9303 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
9304 return 0;
9305}
9306
b3b6e05e 9307int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
7c673cae 9308{
9f95a23c 9309 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
9310 map<int, string> oids;
9311 map<int, struct rgw_cls_list_ret> list_results;
b3b6e05e 9312 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, bucket_instance_ids);
9f95a23c 9313 if (r < 0) {
b3b6e05e 9314 ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
9f95a23c 9315 << r << dendl;
7c673cae 9316 return r;
9f95a23c 9317 }
7c673cae 9318
9f95a23c
TL
9319 r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
9320 if (r < 0) {
b3b6e05e 9321 ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
9f95a23c 9322 << r << dendl;
7c673cae 9323 return r;
9f95a23c 9324 }
7c673cae
FG
9325
9326 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9327 for(; iter != list_results.end(); ++iter) {
a8e16298 9328 headers.push_back(std::move(iter->second.dir.header));
7c673cae
FG
9329 }
9330 return 0;
9331}
9332
b3b6e05e 9333int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
7c673cae 9334{
9f95a23c 9335 RGWSI_RADOS::Pool index_pool;
7c673cae 9336 map<int, string> bucket_objs;
b3b6e05e 9337 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
9338 if (r < 0)
9339 return r;
9340
9341 map<int, string>::iterator iter = bucket_objs.begin();
9342 for (; iter != bucket_objs.end(); ++iter) {
9f95a23c 9343 r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
7c673cae
FG
9344 if (r < 0) {
9345 ctx->put();
9346 break;
9347 } else {
9348 (*num_aio)++;
9349 }
9350 }
9351 return r;
9352}
9353
9f95a23c
TL
9354int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
9355 const rgw_bucket& bucket,
b3b6e05e
TL
9356 uint64_t num_objs,
9357 const DoutPrefixProvider *dpp)
31f18b77 9358{
11fdf7f2 9359 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
31f18b77
FG
9360 return 0;
9361 }
9362
9363 bool need_resharding = false;
9f95a23c 9364 uint32_t num_source_shards =
f67539c2 9365 (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
9f95a23c
TL
9366 const uint32_t max_dynamic_shards =
9367 uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
9368
9369 if (num_source_shards >= max_dynamic_shards) {
9370 return 0;
9371 }
31f18b77 9372
9f95a23c 9373 uint32_t suggested_num_shards = 0;
11fdf7f2
TL
9374 const uint64_t max_objs_per_shard =
9375 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9f95a23c 9376
20effc67 9377 quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
9f95a23c
TL
9378 num_objs, need_resharding, &suggested_num_shards);
9379 if (! need_resharding) {
9380 return 0;
31f18b77
FG
9381 }
9382
9f95a23c
TL
9383 const uint32_t final_num_shards =
9384 RGWBucketReshard::get_preferred_shards(suggested_num_shards,
9385 max_dynamic_shards);
9386 // final verification, so we don't reduce number of shards
9387 if (final_num_shards <= num_source_shards) {
9388 return 0;
31f18b77
FG
9389 }
9390
b3b6e05e 9391 ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
f67539c2 9392 " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
9f95a23c
TL
9393 "; new num shards " << final_num_shards << " (suggested " <<
9394 suggested_num_shards << ")" << dendl;
9395
b3b6e05e 9396 return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
31f18b77
FG
9397}
9398
b3b6e05e 9399int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
31f18b77 9400{
b3b6e05e 9401 RGWReshard reshard(this->store, dpp);
31f18b77 9402
f67539c2 9403 uint32_t num_source_shards = (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
31f18b77 9404
11fdf7f2 9405 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
31f18b77 9406 if (new_num_shards <= num_source_shards) {
b3b6e05e 9407 ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
31f18b77
FG
9408 return 0;
9409 }
9410
9411 cls_rgw_reshard_entry entry;
9412 entry.time = real_clock::now();
9413 entry.tenant = bucket_info.owner.tenant;
9414 entry.bucket_name = bucket_info.bucket.name;
9415 entry.bucket_id = bucket_info.bucket.bucket_id;
9416 entry.old_num_shards = num_source_shards;
9417 entry.new_num_shards = new_num_shards;
9418
b3b6e05e 9419 return reshard.add(dpp, entry);
31f18b77
FG
9420}
9421
20effc67 9422int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
f67539c2
TL
9423 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
9424 uint64_t obj_size, optional_yield y,
9425 bool check_size_only)
7c673cae 9426{
11fdf7f2
TL
9427 // if we only check size, then num_objs will set to 0
9428 if(check_size_only)
20effc67 9429 return quota_handler->check_quota(dpp, bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size, y);
11fdf7f2 9430
20effc67 9431 return quota_handler->check_quota(dpp, bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size, y);
7c673cae
FG
9432}
9433
f67539c2 9434int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
11fdf7f2 9435 int *shard_id)
7c673cae 9436{
11fdf7f2 9437 int r = 0;
f67539c2
TL
9438 switch (layout.hash_type) {
9439 case rgw::BucketHashType::Mod:
9440 if (!layout.num_shards) {
11fdf7f2
TL
9441 if (shard_id) {
9442 *shard_id = -1;
9443 }
9444 } else {
f67539c2 9445 uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
11fdf7f2
TL
9446 if (shard_id) {
9447 *shard_id = (int)sid;
9448 }
9449 }
9450 break;
9451 default:
9452 r = -ENOTSUP;
7c673cae 9453 }
11fdf7f2 9454 return r;
7c673cae
FG
9455}
9456
7c673cae
FG
9457uint64_t RGWRados::instance_id()
9458{
9459 return get_rados_handle()->get_instance_id();
9460}
9461
9462uint64_t RGWRados::next_bucket_id()
9463{
9f95a23c 9464 std::lock_guard l{bucket_id_lock};
7c673cae
FG
9465 return ++max_bucket_id;
9466}
9467
7c673cae
FG
9468librados::Rados* RGWRados::get_rados_handle()
9469{
494da23a 9470 return &rados;
7c673cae
FG
9471}
9472
b3b6e05e 9473int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
7c673cae
FG
9474{
9475 rgw_rados_ref ref;
b3b6e05e 9476 int ret = get_raw_obj_ref(dpp, obj, &ref);
7c673cae 9477 if (ret < 0) {
b3b6e05e 9478 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
7c673cae
FG
9479 return ret;
9480 }
9481
9482 ObjectWriteOperation op;
9483 list<string> prefixes;
9484 cls_rgw_remove_obj(op, prefixes);
9485
9f95a23c
TL
9486 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9487 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae 9488 if (ret < 0) {
b3b6e05e 9489 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
7c673cae
FG
9490 c->release();
9491 return ret;
9492 }
9493
9494 handles.push_back(c);
9495
9496 return 0;
9497}
9498
b3b6e05e 9499int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
7c673cae 9500 RGWBucketInfo& bucket_info, RGWObjState *astate,
9f95a23c
TL
9501 list<librados::AioCompletion *>& handles, bool keep_index_consistent,
9502 optional_yield y)
7c673cae
FG
9503{
9504 rgw_rados_ref ref;
b3b6e05e 9505 int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae 9506 if (ret < 0) {
b3b6e05e 9507 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
7c673cae
FG
9508 return ret;
9509 }
9510
9511 if (keep_index_consistent) {
9512 RGWRados::Bucket bop(this, bucket_info);
9513 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9514
b3b6e05e 9515 ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
7c673cae 9516 if (ret < 0) {
b3b6e05e 9517 ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
7c673cae
FG
9518 return ret;
9519 }
9520 }
9521
9522 ObjectWriteOperation op;
9523 list<string> prefixes;
9524 cls_rgw_remove_obj(op, prefixes);
9525
9f95a23c
TL
9526 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9527 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae 9528 if (ret < 0) {
b3b6e05e 9529 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
7c673cae
FG
9530 c->release();
9531 return ret;
9532 }
9533
9534 handles.push_back(c);
9535
9536 if (keep_index_consistent) {
b3b6e05e 9537 ret = delete_obj_index(obj, astate->mtime, dpp);
7c673cae 9538 if (ret < 0) {
b3b6e05e 9539 ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
7c673cae
FG
9540 return ret;
9541 }
9542 }
9543 return ret;
9544}
20effc67
TL
9545
9546void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
9547{
9548 auto it = new objexp_hint_entry;
9549 it->tenant = "tenant1";
9550 it->bucket_name = "bucket1";
9551 it->bucket_id = "1234";
9552 it->obj_key = rgw_obj_key("obj");
9553 o.push_back(it);
9554 o.push_back(new objexp_hint_entry);
9555}
9556
9557void objexp_hint_entry::dump(Formatter *f) const
9558{
9559 f->open_object_section("objexp_hint_entry");
9560 encode_json("tenant", tenant, f);
9561 encode_json("bucket_name", bucket_name, f);
9562 encode_json("bucket_id", bucket_id, f);
9563 encode_json("rgw_obj_key", obj_key, f);
9564 utime_t ut(exp_time);
9565 encode_json("exp_time", ut, f);
9566 f->close_section();
9567}
9568
9569void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
9570{
9571 RGWOLHInfo *olh = new RGWOLHInfo;
9572 olh->removed = false;
9573 o.push_back(olh);
9574 o.push_back(new RGWOLHInfo);
9575}
9576
9577void RGWOLHInfo::dump(Formatter *f) const
9578{
9579 encode_json("target", target, f);
9580}
9581
9582void RGWOLHPendingInfo::dump(Formatter *f) const
9583{
9584 utime_t ut(time);
9585 encode_json("time", ut, f);
9586}
9587