]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
import ceph quincy 17.2.4
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
9f95a23c 2// vim: ts=8 sw=2 smarttab ft=cpp
7c673cae 3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
9f95a23c
TL
8#include <sstream>
9
7c673cae 10#include <boost/algorithm/string.hpp>
11fdf7f2 11#include <string_view>
7c673cae 12
11fdf7f2 13#include <boost/container/flat_set.hpp>
7c673cae
FG
14#include <boost/format.hpp>
15#include <boost/optional.hpp>
16#include <boost/utility/in_place_factory.hpp>
17
18#include "common/ceph_json.h"
7c673cae
FG
19
20#include "common/errno.h"
21#include "common/Formatter.h"
22#include "common/Throttle.h"
7c673cae 23
9f95a23c 24#include "rgw_sal.h"
11fdf7f2 25#include "rgw_zone.h"
7c673cae
FG
26#include "rgw_cache.h"
27#include "rgw_acl.h"
28#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
11fdf7f2 29#include "rgw_aio_throttle.h"
7c673cae
FG
30#include "rgw_bucket.h"
31#include "rgw_rest_conn.h"
32#include "rgw_cr_rados.h"
33#include "rgw_cr_rest.h"
f67539c2 34#include "rgw_datalog.h"
11fdf7f2 35#include "rgw_putobj_processor.h"
7c673cae
FG
36
37#include "cls/rgw/cls_rgw_ops.h"
7c673cae
FG
38#include "cls/rgw/cls_rgw_client.h"
39#include "cls/rgw/cls_rgw_const.h"
40#include "cls/refcount/cls_refcount_client.h"
41#include "cls/version/cls_version_client.h"
c07f9fc5 42#include "osd/osd_types.h"
7c673cae
FG
43
44#include "rgw_tools.h"
45#include "rgw_coroutine.h"
46#include "rgw_compression.h"
adb31ebb 47#include "rgw_etag_verifier.h"
9f95a23c 48#include "rgw_worker.h"
f67539c2 49#include "rgw_notify.h"
7c673cae 50
7c673cae
FG
51#undef fork // fails to compile RGWPeriod::fork() below
52
53#include "common/Clock.h"
54
7c673cae
FG
55using namespace librados;
56
57#include <string>
58#include <iostream>
59#include <vector>
60#include <atomic>
61#include <list>
62#include <map>
11fdf7f2 63#include "include/random.h"
7c673cae
FG
64
65#include "rgw_gc.h"
66#include "rgw_lc.h"
67
68#include "rgw_object_expirer_core.h"
69#include "rgw_sync.h"
81eedcae 70#include "rgw_sync_counters.h"
11fdf7f2 71#include "rgw_sync_trace.h"
9f95a23c
TL
72#include "rgw_trim_datalog.h"
73#include "rgw_trim_mdlog.h"
7c673cae
FG
74#include "rgw_data_sync.h"
75#include "rgw_realm_watcher.h"
31f18b77 76#include "rgw_reshard.h"
7c673cae 77
11fdf7f2
TL
78#include "services/svc_zone.h"
79#include "services/svc_zone_utils.h"
80#include "services/svc_quota.h"
81#include "services/svc_sync_modules.h"
82#include "services/svc_sys_obj.h"
83#include "services/svc_sys_obj_cache.h"
9f95a23c
TL
84#include "services/svc_bucket.h"
85#include "services/svc_mdlog.h"
11fdf7f2 86
7c673cae
FG
87#include "compressor/Compressor.h"
88
20effc67
TL
89#include "rgw_d3n_datacache.h"
90
11fdf7f2
TL
91#ifdef WITH_LTTNG
92#define TRACEPOINT_DEFINE
93#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
94#include "tracing/rgw_rados.h"
95#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
96#undef TRACEPOINT_DEFINE
97#else
98#define tracepoint(...)
99#endif
100
7c673cae
FG
101#define dout_context g_ceph_context
102#define dout_subsys ceph_subsys_rgw
103
7c673cae 104
7c673cae 105static string shadow_ns = "shadow";
7c673cae
FG
106static string default_bucket_index_pool_suffix = "rgw.buckets.index";
107static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
11fdf7f2 108
11fdf7f2 109static RGWObjCategory main_category = RGWObjCategory::Main;
7c673cae 110#define RGW_USAGE_OBJ_PREFIX "usage."
7c673cae 111
7c673cae 112
20effc67 113// returns true on success, false on failure
7c673cae 114static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
115 const rgw_placement_rule& head_placement_rule,
116 const rgw_obj& obj, rgw_pool *pool)
7c673cae 117{
11fdf7f2 118 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
7c673cae 119 RGWZonePlacementInfo placement;
11fdf7f2 120 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
7c673cae
FG
121 return false;
122 }
123
124 if (!obj.in_extra_data) {
11fdf7f2 125 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
7c673cae 126 } else {
31f18b77 127 *pool = placement.get_data_extra_pool();
7c673cae
FG
128 }
129 }
130
131 return true;
132}
133
134static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
135 const rgw_placement_rule& head_placement_rule,
136 const rgw_obj& obj, rgw_raw_obj *raw_obj)
7c673cae
FG
137{
138 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
139
11fdf7f2 140 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
7c673cae
FG
141}
142
143rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
144{
145 if (!is_raw) {
146 rgw_raw_obj r;
147 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
148 return r;
149 }
150 return raw_obj;
151}
152
20effc67 153rgw_raw_obj rgw_obj_select::get_raw_obj(rgw::sal::RadosStore* store) const
7c673cae
FG
154{
155 if (!is_raw) {
156 rgw_raw_obj r;
f67539c2 157 store->get_raw_obj(placement_rule, obj, &r);
7c673cae
FG
158 return r;
159 }
160 return raw_obj;
161}
162
11fdf7f2
TL
163void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
164{
165 obj_version *check_objv = version_for_check();
7c673cae 166
11fdf7f2
TL
167 if (check_objv) {
168 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae
FG
169 }
170
11fdf7f2 171 cls_version_read(*op, &read_version);
7c673cae
FG
172}
173
11fdf7f2
TL
174void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
175{
176 obj_version *check_objv = version_for_check();
177 obj_version *modify_version = version_for_write();
7c673cae 178
11fdf7f2
TL
179 if (check_objv) {
180 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae 181 }
7c673cae 182
11fdf7f2
TL
183 if (modify_version) {
184 cls_version_set(*op, *modify_version);
185 } else {
186 cls_version_inc(*op);
7c673cae 187 }
7c673cae
FG
188}
189
f91f0fd5
TL
190void RGWObjVersionTracker::apply_write()
191{
192 const bool checked = (read_version.ver != 0);
193 const bool incremented = (write_version.ver == 0);
194
195 if (checked && incremented) {
196 // apply cls_version_inc() so our next operation can recheck it
197 ++read_version.ver;
198 } else {
199 read_version = write_version;
200 }
201 write_version = obj_version();
202}
203
9f95a23c 204RGWObjState::RGWObjState() {
7c673cae
FG
205}
206
9f95a23c 207RGWObjState::~RGWObjState() {
7c673cae
FG
208}
209
9f95a23c
TL
210RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
211 is_atomic = rhs.is_atomic;
212 has_attrs = rhs.has_attrs;
213 exists = rhs.exists;
214 size = rhs.size;
215 accounted_size = rhs.accounted_size;
216 mtime = rhs.mtime;
217 epoch = rhs.epoch;
218 if (rhs.obj_tag.length()) {
219 obj_tag = rhs.obj_tag;
7c673cae 220 }
9f95a23c
TL
221 if (rhs.tail_tag.length()) {
222 tail_tag = rhs.tail_tag;
7c673cae 223 }
9f95a23c
TL
224 write_tag = rhs.write_tag;
225 fake_tag = rhs.fake_tag;
226 manifest = rhs.manifest;
227 shadow_obj = rhs.shadow_obj;
228 has_data = rhs.has_data;
229 if (rhs.data.length()) {
230 data = rhs.data;
7c673cae 231 }
9f95a23c
TL
232 prefetch_data = rhs.prefetch_data;
233 keep_tail = rhs.keep_tail;
234 is_olh = rhs.is_olh;
235 objv_tracker = rhs.objv_tracker;
236 pg_ver = rhs.pg_ver;
20effc67 237 compressed = rhs.compressed;
7c673cae
FG
238}
239
9f95a23c
TL
240RGWObjState *RGWObjectCtx::get_state(const rgw_obj& obj) {
241 RGWObjState *result;
242 typename std::map<rgw_obj, RGWObjState>::iterator iter;
243 lock.lock_shared();
244 assert (!obj.empty());
245 iter = objs_state.find(obj);
246 if (iter != objs_state.end()) {
247 result = &iter->second;
248 lock.unlock_shared();
249 } else {
250 lock.unlock_shared();
251 lock.lock();
252 result = &objs_state[obj];
253 lock.unlock();
224ce89b 254 }
9f95a23c 255 return result;
7c673cae
FG
256}
257
20effc67
TL
258void RGWObjectCtx::set_compressed(const rgw_obj& obj) {
259 std::unique_lock wl{lock};
260 assert (!obj.empty());
261 objs_state[obj].compressed = true;
262}
263
9f95a23c
TL
264void RGWObjectCtx::set_atomic(rgw_obj& obj) {
265 std::unique_lock wl{lock};
266 assert (!obj.empty());
267 objs_state[obj].is_atomic = true;
7c673cae 268}
9f95a23c
TL
269void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
270 std::unique_lock wl{lock};
271 assert (!obj.empty());
272 objs_state[obj].prefetch_data = true;
7c673cae
FG
273}
274
9f95a23c
TL
275void RGWObjectCtx::invalidate(const rgw_obj& obj) {
276 std::unique_lock wl{lock};
277 auto iter = objs_state.find(obj);
278 if (iter == objs_state.end()) {
11fdf7f2 279 return;
7c673cae 280 }
9f95a23c
TL
281 bool is_atomic = iter->second.is_atomic;
282 bool prefetch_data = iter->second.prefetch_data;
20effc67 283 bool compressed = iter->second.compressed;
7c673cae 284
9f95a23c 285 objs_state.erase(iter);
7c673cae 286
20effc67 287 if (is_atomic || prefetch_data || compressed) {
9f95a23c
TL
288 auto& state = objs_state[obj];
289 state.is_atomic = is_atomic;
290 state.prefetch_data = prefetch_data;
20effc67 291 state.compressed = compressed;
11fdf7f2 292 }
7c673cae
FG
293}
294
11fdf7f2 295void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
7c673cae 296{
11fdf7f2
TL
297 write_version.ver = 1;
298#define TAG_LEN 24
7c673cae 299
11fdf7f2
TL
300 write_version.tag.clear();
301 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
7c673cae
FG
302}
303
7c673cae 304class RGWMetaNotifierManager : public RGWCoroutinesManager {
20effc67 305 RGWRados* store;
7c673cae
FG
306 RGWHTTPManager http_manager;
307
308public:
309 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
310 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 311 http_manager.start();
7c673cae
FG
312 }
313
b3b6e05e 314 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
7c673cae
FG
315 rgw_http_param_pair pairs[] = { { "type", "metadata" },
316 { "notify", NULL },
317 { NULL, NULL } };
318
319 list<RGWCoroutinesStack *> stacks;
9f95a23c 320 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
321 RGWRESTConn *conn = iter->second;
322 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
323 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
324
325 stacks.push_back(stack);
326 }
b3b6e05e 327 return run(dpp, stacks);
7c673cae
FG
328 }
329};
330
331class RGWDataNotifierManager : public RGWCoroutinesManager {
20effc67 332 RGWRados* store;
7c673cae
FG
333 RGWHTTPManager http_manager;
334
335public:
336 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
337 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 338 http_manager.start();
7c673cae
FG
339 }
340
b3b6e05e 341 int notify_all(const DoutPrefixProvider *dpp, map<rgw_zone_id, RGWRESTConn *>& conn_map,
f67539c2 342 bc::flat_map<int, bc::flat_set<string> >& shards) {
7c673cae
FG
343 rgw_http_param_pair pairs[] = { { "type", "data" },
344 { "notify", NULL },
11fdf7f2 345 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
7c673cae
FG
346 { NULL, NULL } };
347
348 list<RGWCoroutinesStack *> stacks;
9f95a23c 349 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
350 RGWRESTConn *conn = iter->second;
351 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
f67539c2 352 stack->call(new RGWPostRESTResourceCR<bc::flat_map<int, bc::flat_set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
7c673cae
FG
353
354 stacks.push_back(stack);
355 }
b3b6e05e 356 return run(dpp, stacks);
7c673cae
FG
357 }
358};
359
11fdf7f2
TL
360/* class RGWRadosThread */
361
7c673cae
FG
362void RGWRadosThread::start()
363{
364 worker = new Worker(cct, this);
365 worker->create(thread_name.c_str());
366}
367
368void RGWRadosThread::stop()
369{
370 down_flag = true;
371 stop_process();
372 if (worker) {
31f18b77 373 worker->signal();
7c673cae
FG
374 worker->join();
375 }
376 delete worker;
377 worker = NULL;
378}
379
380void *RGWRadosThread::Worker::entry() {
381 uint64_t msec = processor->interval_msec();
9f95a23c 382 auto interval = std::chrono::milliseconds(msec);
7c673cae
FG
383
384 do {
9f95a23c 385 auto start = ceph::real_clock::now();
b3b6e05e 386 int r = processor->process(this);
7c673cae 387 if (r < 0) {
b3b6e05e 388 ldpp_dout(this, 0) << "ERROR: processor->process() returned error r=" << r << dendl;
7c673cae
FG
389 }
390
391 if (processor->going_down())
392 break;
393
9f95a23c 394 auto end = ceph::real_clock::now() - start;
7c673cae
FG
395
396 uint64_t cur_msec = processor->interval_msec();
397 if (cur_msec != msec) { /* was it reconfigured? */
398 msec = cur_msec;
9f95a23c 399 interval = std::chrono::milliseconds(msec);
7c673cae
FG
400 }
401
402 if (cur_msec > 0) {
403 if (interval <= end)
404 continue; // next round
405
9f95a23c 406 auto wait_time = interval - end;
31f18b77 407 wait_interval(wait_time);
7c673cae 408 } else {
31f18b77 409 wait();
7c673cae
FG
410 }
411 } while (!processor->going_down());
412
413 return NULL;
414}
415
416class RGWMetaNotifier : public RGWRadosThread {
417 RGWMetaNotifierManager notify_mgr;
418 RGWMetadataLog *const log;
419
420 uint64_t interval_msec() override {
421 return cct->_conf->rgw_md_notify_interval_msec;
422 }
1adf2230
AA
423 void stop_process() override {
424 notify_mgr.stop();
425 }
7c673cae
FG
426public:
427 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
428 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
429
b3b6e05e 430 int process(const DoutPrefixProvider *dpp) override;
7c673cae
FG
431};
432
b3b6e05e 433int RGWMetaNotifier::process(const DoutPrefixProvider *dpp)
7c673cae
FG
434{
435 set<int> shards;
436
437 log->read_clear_modified(shards);
438
439 if (shards.empty()) {
440 return 0;
441 }
442
443 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
b3b6e05e 444 ldpp_dout(dpp, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
7c673cae
FG
445 }
446
b3b6e05e 447 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_conn_map(), shards);
7c673cae
FG
448
449 return 0;
450}
451
452class RGWDataNotifier : public RGWRadosThread {
453 RGWDataNotifierManager notify_mgr;
454
455 uint64_t interval_msec() override {
11fdf7f2 456 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 457 }
1adf2230
AA
458 void stop_process() override {
459 notify_mgr.stop();
460 }
7c673cae
FG
461public:
462 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
463
b3b6e05e 464 int process(const DoutPrefixProvider *dpp) override;
7c673cae
FG
465};
466
b3b6e05e 467int RGWDataNotifier::process(const DoutPrefixProvider *dpp)
7c673cae 468{
f67539c2 469 auto data_log = store->svc.datalog_rados;
9f95a23c 470 if (!data_log) {
7c673cae
FG
471 return 0;
472 }
473
f67539c2 474 auto shards = data_log->read_clear_modified();
7c673cae
FG
475
476 if (shards.empty()) {
477 return 0;
478 }
479
f67539c2 480 for (const auto& [shard_id, keys] : shards) {
b3b6e05e 481 ldpp_dout(dpp, 20) << __func__ << "(): notifying datalog change, shard_id="
f67539c2 482 << shard_id << ": " << keys << dendl;
7c673cae
FG
483 }
484
b3b6e05e 485 notify_mgr.notify_all(dpp, store->svc.zone->get_zone_data_notify_to_map(), shards);
7c673cae
FG
486
487 return 0;
488}
489
490class RGWSyncProcessorThread : public RGWRadosThread {
491public:
492 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
493 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
494 ~RGWSyncProcessorThread() override {}
b3b6e05e
TL
495 int init(const DoutPrefixProvider *dpp) override = 0 ;
496 int process(const DoutPrefixProvider *dpp) override = 0;
7c673cae
FG
497};
498
499class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
500{
501 RGWMetaSyncStatusManager sync;
502
503 uint64_t interval_msec() override {
504 return 0; /* no interval associated, it'll run once until stopped */
505 }
506 void stop_process() override {
507 sync.stop();
508 }
509public:
20effc67 510 RGWMetaSyncProcessorThread(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados)
9f95a23c 511 : RGWSyncProcessorThread(_store->getRados(), "meta-sync"), sync(_store, async_rados) {}
7c673cae
FG
512
513 void wakeup_sync_shards(set<int>& shard_ids) {
514 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
515 sync.wakeup(*iter);
516 }
517 }
518 RGWMetaSyncStatusManager* get_manager() { return &sync; }
519
b3b6e05e
TL
520 int init(const DoutPrefixProvider *dpp) override {
521 int ret = sync.init(dpp);
7c673cae 522 if (ret < 0) {
b3b6e05e 523 ldpp_dout(dpp, 0) << "ERROR: sync.init() returned " << ret << dendl;
7c673cae
FG
524 return ret;
525 }
526 return 0;
527 }
528
b3b6e05e
TL
529 int process(const DoutPrefixProvider *dpp) override {
530 sync.run(dpp, null_yield);
7c673cae
FG
531 return 0;
532 }
533};
534
535class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
536{
81eedcae 537 PerfCountersRef counters;
7c673cae
FG
538 RGWDataSyncStatusManager sync;
539 bool initialized;
540
541 uint64_t interval_msec() override {
542 if (initialized) {
543 return 0; /* no interval associated, it'll run once until stopped */
544 } else {
545#define DATA_SYNC_INIT_WAIT_SEC 20
546 return DATA_SYNC_INIT_WAIT_SEC * 1000;
547 }
548 }
549 void stop_process() override {
550 sync.stop();
551 }
552public:
20effc67 553 RGWDataSyncProcessorThread(rgw::sal::RadosStore* _store, RGWAsyncRadosProcessor *async_rados,
81eedcae 554 const RGWZone* source_zone)
9f95a23c 555 : RGWSyncProcessorThread(_store->getRados(), "data-sync"),
81eedcae
TL
556 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
557 sync(_store, async_rados, source_zone->id, counters.get()),
7c673cae
FG
558 initialized(false) {}
559
560 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
561 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
562 sync.wakeup(iter->first, iter->second);
563 }
564 }
565 RGWDataSyncStatusManager* get_manager() { return &sync; }
566
b3b6e05e 567 int init(const DoutPrefixProvider *dpp) override {
7c673cae
FG
568 return 0;
569 }
570
b3b6e05e 571 int process(const DoutPrefixProvider *dpp) override {
7c673cae
FG
572 while (!initialized) {
573 if (going_down()) {
574 return 0;
575 }
b3b6e05e 576 int ret = sync.init(dpp);
7c673cae
FG
577 if (ret >= 0) {
578 initialized = true;
579 break;
580 }
581 /* we'll be back! */
582 return 0;
583 }
b3b6e05e 584 sync.run(dpp);
7c673cae
FG
585 return 0;
586 }
587};
588
11fdf7f2 589class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
7c673cae
FG
590{
591 RGWCoroutinesManager crs;
20effc67 592 rgw::sal::RadosStore* store;
b32b8144 593 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
594 RGWHTTPManager http;
595 const utime_t trim_interval;
596
597 uint64_t interval_msec() override { return 0; }
598 void stop_process() override { crs.stop(); }
599public:
20effc67 600 RGWSyncLogTrimThread(rgw::sal::RadosStore* store, rgw::BucketTrimManager *bucket_trim,
b32b8144 601 int interval)
9f95a23c
TL
602 : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
603 crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
b32b8144 604 bucket_trim(bucket_trim),
7c673cae
FG
605 http(store->ctx(), crs.get_completion_mgr()),
606 trim_interval(interval, 0)
607 {}
608
b3b6e05e 609 int init(const DoutPrefixProvider *dpp) override {
11fdf7f2 610 return http.start();
7c673cae 611 }
b3b6e05e 612 int process(const DoutPrefixProvider *dpp) override {
7c673cae 613 list<RGWCoroutinesStack*> stacks;
20effc67
TL
614 auto metatrimcr = create_meta_log_trim_cr(this, static_cast<rgw::sal::RadosStore*>(store), &http,
615 cct->_conf->rgw_md_log_max_shards,
616 trim_interval);
617 if (!metatrimcr) {
618 ldpp_dout(dpp, -1) << "Bailing out of trim thread!" << dendl;
619 return -EINVAL;
620 }
7c673cae 621 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
20effc67
TL
622 meta->call(metatrimcr);
623
7c673cae
FG
624 stacks.push_back(meta);
625
9f95a23c
TL
626 if (store->svc()->zone->sync_module_exports_data()) {
627 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
20effc67 628 data->call(create_data_log_trim_cr(dpp, static_cast<rgw::sal::RadosStore*>(store), &http,
9f95a23c
TL
629 cct->_conf->rgw_data_log_num_shards,
630 trim_interval));
631 stacks.push_back(data);
7c673cae 632
9f95a23c
TL
633 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
634 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
635 stacks.push_back(bucket);
636 }
b32b8144 637
b3b6e05e 638 crs.run(dpp, stacks);
7c673cae
FG
639 return 0;
640 }
11fdf7f2
TL
641
642 // implements DoutPrefixProvider
643 CephContext *get_cct() const override { return store->ctx(); }
9f95a23c 644 unsigned get_subsys() const override
11fdf7f2
TL
645 {
646 return dout_subsys;
647 }
648
9f95a23c 649 std::ostream& gen_prefix(std::ostream& out) const override
11fdf7f2
TL
650 {
651 return out << "sync log trim: ";
652 }
653
7c673cae
FG
654};
655
656void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
657{
9f95a23c 658 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
659 if (meta_sync_processor_thread) {
660 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
661 }
662}
663
20effc67 664void RGWRados::wakeup_data_sync_shards(const DoutPrefixProvider *dpp, const rgw_zone_id& source_zone, map<int, set<string> >& shard_ids)
7c673cae 665{
20effc67 666 ldpp_dout(dpp, 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
9f95a23c
TL
667 std::lock_guard l{data_sync_thread_lock};
668 auto iter = data_sync_processor_threads.find(source_zone);
7c673cae 669 if (iter == data_sync_processor_threads.end()) {
20effc67 670 ldpp_dout(dpp, 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
7c673cae
FG
671 return;
672 }
673
674 RGWDataSyncProcessorThread *thread = iter->second;
11fdf7f2 675 ceph_assert(thread);
7c673cae
FG
676 thread->wakeup_sync_shards(shard_ids);
677}
678
679RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
680{
9f95a23c 681 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
682 if (meta_sync_processor_thread) {
683 return meta_sync_processor_thread->get_manager();
684 }
685 return nullptr;
686}
687
9f95a23c 688RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
7c673cae 689{
9f95a23c 690 std::lock_guard l{data_sync_thread_lock};
7c673cae
FG
691 auto thread = data_sync_processor_threads.find(source_zone);
692 if (thread == data_sync_processor_threads.end()) {
693 return nullptr;
694 }
695 return thread->second->get_manager();
696}
697
b3b6e05e 698int RGWRados::get_required_alignment(const DoutPrefixProvider *dpp, const rgw_pool& pool, uint64_t *alignment)
7c673cae
FG
699{
700 IoCtx ioctx;
b3b6e05e 701 int r = open_pool_ctx(dpp, pool, ioctx, false);
7c673cae 702 if (r < 0) {
20effc67 703 ldpp_dout(dpp, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
7c673cae
FG
704 return r;
705 }
706
707 bool requires;
708 r = ioctx.pool_requires_alignment2(&requires);
709 if (r < 0) {
20effc67 710 ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
7c673cae
FG
711 << r << dendl;
712 return r;
713 }
714
715 if (!requires) {
716 *alignment = 0;
717 return 0;
718 }
719
720 uint64_t align;
721 r = ioctx.pool_required_alignment2(&align);
722 if (r < 0) {
20effc67 723 ldpp_dout(dpp, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
7c673cae
FG
724 << r << dendl;
725 return r;
726 }
727 if (align != 0) {
20effc67 728 ldpp_dout(dpp, 20) << "required alignment=" << align << dendl;
7c673cae
FG
729 }
730 *alignment = align;
731 return 0;
732}
733
11fdf7f2
TL
734void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
735{
736 if (alignment == 0) {
737 *max_size = size;
738 return;
739 }
740
741 if (size <= alignment) {
742 *max_size = alignment;
743 return;
744 }
745
746 *max_size = size - (size % alignment);
747}
748
b3b6e05e 749int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
7c673cae 750{
11fdf7f2 751 uint64_t alignment;
b3b6e05e 752 int r = get_required_alignment(dpp, pool, &alignment);
7c673cae
FG
753 if (r < 0) {
754 return r;
755 }
756
11fdf7f2
TL
757 if (palignment) {
758 *palignment = alignment;
7c673cae
FG
759 }
760
11fdf7f2 761 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
7c673cae 762
11fdf7f2 763 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
7c673cae 764
b3b6e05e 765 ldpp_dout(dpp, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
7c673cae
FG
766
767 return 0;
768}
769
11fdf7f2 770int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
b3b6e05e 771 uint64_t *max_chunk_size, const DoutPrefixProvider *dpp, uint64_t *palignment)
7c673cae
FG
772{
773 rgw_pool pool;
774 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
b3b6e05e 775 ldpp_dout(dpp, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
7c673cae
FG
776 return -EIO;
777 }
b3b6e05e 778 return get_max_chunk_size(pool, max_chunk_size, dpp, palignment);
7c673cae
FG
779}
780
31f18b77
FG
781class RGWIndexCompletionManager;
782
783struct complete_op_data {
9f95a23c 784 ceph::mutex lock = ceph::make_mutex("complete_op_data");
31f18b77
FG
785 AioCompletion *rados_completion{nullptr};
786 int manager_shard_id{-1};
787 RGWIndexCompletionManager *manager{nullptr};
788 rgw_obj obj;
789 RGWModifyOp op;
790 string tag;
791 rgw_bucket_entry_ver ver;
792 cls_rgw_obj_key key;
793 rgw_bucket_dir_entry_meta dir_meta;
794 list<cls_rgw_obj_key> remove_objs;
795 bool log_op;
796 uint16_t bilog_op;
797 rgw_zone_set zones_trace;
798
799 bool stopped{false};
800
801 void stop() {
9f95a23c 802 std::lock_guard l{lock};
31f18b77
FG
803 stopped = true;
804 }
805};
806
b3b6e05e 807class RGWIndexCompletionThread : public RGWRadosThread, public DoutPrefixProvider {
31f18b77
FG
808 RGWRados *store;
809
810 uint64_t interval_msec() override {
811 return 0;
812 }
813
814 list<complete_op_data *> completions;
815
9f95a23c
TL
816 ceph::mutex completions_lock =
817 ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
31f18b77
FG
818public:
819 RGWIndexCompletionThread(RGWRados *_store)
9f95a23c 820 : RGWRadosThread(_store, "index-complete"), store(_store) {}
31f18b77 821
b3b6e05e 822 int process(const DoutPrefixProvider *dpp) override;
31f18b77
FG
823
824 void add_completion(complete_op_data *completion) {
825 {
9f95a23c 826 std::lock_guard l{completions_lock};
31f18b77
FG
827 completions.push_back(completion);
828 }
829
830 signal();
831 }
b3b6e05e
TL
832
833 CephContext *get_cct() const override { return store->ctx(); }
834 unsigned get_subsys() const { return dout_subsys; }
835 std::ostream& gen_prefix(std::ostream& out) const { return out << "rgw index completion thread: "; }
31f18b77
FG
836};
837
b3b6e05e 838int RGWIndexCompletionThread::process(const DoutPrefixProvider *dpp)
31f18b77
FG
839{
840 list<complete_op_data *> comps;
841
842 {
9f95a23c 843 std::lock_guard l{completions_lock};
31f18b77
FG
844 completions.swap(comps);
845 }
846
847 for (auto c : comps) {
848 std::unique_ptr<complete_op_data> up{c};
849
850 if (going_down()) {
851 continue;
852 }
b3b6e05e 853 ldpp_dout(this, 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
31f18b77
FG
854
855 RGWRados::BucketShard bs(store);
f64942e4 856 RGWBucketInfo bucket_info;
31f18b77 857
b3b6e05e 858 int r = bs.init(c->obj.bucket, c->obj, &bucket_info, this);
31f18b77 859 if (r < 0) {
b3b6e05e 860 ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
31f18b77
FG
861 /* not much to do */
862 continue;
863 }
864
b3b6e05e 865 r = store->guard_reshard(this, &bs, c->obj, bucket_info,
f64942e4
AA
866 [&](RGWRados::BucketShard *bs) -> int {
867 librados::ObjectWriteOperation o;
868 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
869 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
870 c->log_op, c->bilog_op, &c->zones_trace);
b3b6e05e 871 return bs->bucket_obj.operate(this, &o, null_yield);
31f18b77
FG
872 });
873 if (r < 0) {
b3b6e05e 874 ldpp_dout(this, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
31f18b77
FG
875 /* ignoring error, can't do anything about it */
876 continue;
877 }
b3b6e05e 878 r = store->svc.datalog_rados->add_entry(this, bucket_info, bs.shard_id);
31f18b77 879 if (r < 0) {
b3b6e05e 880 ldpp_dout(this, -1) << "ERROR: failed writing data log" << dendl;
31f18b77
FG
881 }
882 }
883
884 return 0;
885}
886
887class RGWIndexCompletionManager {
888 RGWRados *store{nullptr};
9f95a23c 889 ceph::containers::tiny_vector<ceph::mutex> locks;
31f18b77
FG
890 vector<set<complete_op_data *> > completions;
891
892 RGWIndexCompletionThread *completion_thread{nullptr};
893
894 int num_shards;
895
896 std::atomic<int> cur_shard {0};
897
898
899public:
9f95a23c
TL
900 RGWIndexCompletionManager(RGWRados *_store) :
901 store(_store),
902 locks{ceph::make_lock_container<ceph::mutex>(
903 store->ctx()->_conf->rgw_thread_pool_size,
904 [](const size_t i) {
905 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
906 std::to_string(i));
907 })}
908 {
31f18b77 909 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
31f18b77
FG
910 completions.resize(num_shards);
911 }
912 ~RGWIndexCompletionManager() {
913 stop();
31f18b77
FG
914 }
915
916 int next_shard() {
917 int result = cur_shard % num_shards;
918 cur_shard++;
919 return result;
920 }
921
922 void create_completion(const rgw_obj& obj,
923 RGWModifyOp op, string& tag,
924 rgw_bucket_entry_ver& ver,
925 const cls_rgw_obj_key& key,
926 rgw_bucket_dir_entry_meta& dir_meta,
927 list<cls_rgw_obj_key> *remove_objs, bool log_op,
928 uint16_t bilog_op,
929 rgw_zone_set *zones_trace,
930 complete_op_data **result);
931 bool handle_completion(completion_t cb, complete_op_data *arg);
932
b3b6e05e 933 int start(const DoutPrefixProvider *dpp) {
31f18b77 934 completion_thread = new RGWIndexCompletionThread(store);
b3b6e05e 935 int ret = completion_thread->init(dpp);
31f18b77
FG
936 if (ret < 0) {
937 return ret;
938 }
939 completion_thread->start();
940 return 0;
941 }
942 void stop() {
943 if (completion_thread) {
944 completion_thread->stop();
945 delete completion_thread;
946 }
947
948 for (int i = 0; i < num_shards; ++i) {
9f95a23c 949 std::lock_guard l{locks[i]};
31f18b77 950 for (auto c : completions[i]) {
31f18b77
FG
951 c->stop();
952 }
953 }
954 completions.clear();
955 }
956};
957
958static void obj_complete_cb(completion_t cb, void *arg)
959{
960 complete_op_data *completion = (complete_op_data *)arg;
9f95a23c 961 completion->lock.lock();
31f18b77 962 if (completion->stopped) {
9f95a23c 963 completion->lock.unlock(); /* can drop lock, no one else is referencing us */
31f18b77
FG
964 delete completion;
965 return;
966 }
967 bool need_delete = completion->manager->handle_completion(cb, completion);
9f95a23c 968 completion->lock.unlock();
31f18b77
FG
969 if (need_delete) {
970 delete completion;
971 }
972}
973
974
975void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
976 RGWModifyOp op, string& tag,
977 rgw_bucket_entry_ver& ver,
978 const cls_rgw_obj_key& key,
979 rgw_bucket_dir_entry_meta& dir_meta,
980 list<cls_rgw_obj_key> *remove_objs, bool log_op,
981 uint16_t bilog_op,
982 rgw_zone_set *zones_trace,
983 complete_op_data **result)
984{
985 complete_op_data *entry = new complete_op_data;
986
987 int shard_id = next_shard();
988
989 entry->manager_shard_id = shard_id;
990 entry->manager = this;
991 entry->obj = obj;
992 entry->op = op;
993 entry->tag = tag;
994 entry->ver = ver;
995 entry->key = key;
996 entry->dir_meta = dir_meta;
997 entry->log_op = log_op;
998 entry->bilog_op = bilog_op;
999
1000 if (remove_objs) {
1001 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
1002 entry->remove_objs.push_back(*iter);
1003 }
1004 }
1005
1006 if (zones_trace) {
1007 entry->zones_trace = *zones_trace;
1008 } else {
9f95a23c 1009 entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
31f18b77
FG
1010 }
1011
1012 *result = entry;
1013
9f95a23c 1014 entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
31f18b77 1015
9f95a23c 1016 std::lock_guard l{locks[shard_id]};
31f18b77
FG
1017 completions[shard_id].insert(entry);
1018}
1019
1020bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
1021{
1022 int shard_id = arg->manager_shard_id;
1023 {
9f95a23c 1024 std::lock_guard l{locks[shard_id]};
31f18b77
FG
1025
1026 auto& comps = completions[shard_id];
1027
1028 auto iter = comps.find(arg);
1029 if (iter == comps.end()) {
1030 return true;
1031 }
1032
1033 comps.erase(iter);
1034 }
1035
1036 int r = rados_aio_get_return_value(cb);
1037 if (r != -ERR_BUSY_RESHARDING) {
1038 return true;
1039 }
1040 completion_thread->add_completion(arg);
1041 return false;
1042}
1043
7c673cae
FG
1044void RGWRados::finalize()
1045{
1046 if (run_sync_thread) {
9f95a23c 1047 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
1048 meta_sync_processor_thread->stop();
1049
9f95a23c 1050 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1051 for (auto iter : data_sync_processor_threads) {
1052 RGWDataSyncProcessorThread *thread = iter.second;
1053 thread->stop();
1054 }
1055 if (sync_log_trimmer) {
1056 sync_log_trimmer->stop();
1057 }
1058 }
7c673cae
FG
1059 if (run_sync_thread) {
1060 delete meta_sync_processor_thread;
1061 meta_sync_processor_thread = NULL;
9f95a23c 1062 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1063 for (auto iter : data_sync_processor_threads) {
1064 RGWDataSyncProcessorThread *thread = iter.second;
1065 delete thread;
1066 }
1067 data_sync_processor_threads.clear();
1068 delete sync_log_trimmer;
1069 sync_log_trimmer = nullptr;
b32b8144 1070 bucket_trim = boost::none;
7c673cae 1071 }
7c673cae
FG
1072 if (meta_notifier) {
1073 meta_notifier->stop();
1074 delete meta_notifier;
1075 }
1076 if (data_notifier) {
1077 data_notifier->stop();
1078 delete data_notifier;
1079 }
11fdf7f2 1080 delete sync_tracer;
11fdf7f2
TL
1081
1082 delete lc;
1083 lc = NULL;
7c673cae 1084
11fdf7f2
TL
1085 delete gc;
1086 gc = NULL;
7c673cae 1087
11fdf7f2
TL
1088 delete obj_expirer;
1089 obj_expirer = NULL;
7c673cae 1090
11fdf7f2
TL
1091 RGWQuotaHandler::free_handler(quota_handler);
1092 if (cr_registry) {
1093 cr_registry->put();
7c673cae
FG
1094 }
1095
11fdf7f2 1096 svc.shutdown();
7c673cae 1097
11fdf7f2
TL
1098 delete binfo_cache;
1099 delete obj_tombstone_cache;
20effc67
TL
1100 if (d3n_data_cache)
1101 delete d3n_data_cache;
7c673cae 1102
11fdf7f2
TL
1103 if (reshard_wait.get()) {
1104 reshard_wait->stop();
1105 reshard_wait.reset();
7c673cae
FG
1106 }
1107
11fdf7f2
TL
1108 if (run_reshard_thread) {
1109 reshard->stop_processor();
7c673cae 1110 }
11fdf7f2
TL
1111 delete reshard;
1112 delete index_completion_manager;
f67539c2
TL
1113
1114 rgw::notify::shutdown();
11fdf7f2
TL
1115}
1116
1117/**
1118 * Initialize the RADOS instance and prepare to do other ops
1119 * Returns 0 on success, -ERR# on failure.
1120 */
1121int RGWRados::init_rados()
1122{
1123 int ret = 0;
7c673cae 1124
494da23a
TL
1125 ret = rados.init_with_context(cct);
1126 if (ret < 0) {
1127 return ret;
1128 }
1129 ret = rados.connect();
1130 if (ret < 0) {
1131 return ret;
7c673cae 1132 }
11fdf7f2
TL
1133
1134 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1135 new RGWCoroutinesManagerRegistry(cct)};
1136 ret = crs->hook_to_admin_command("cr dump");
1137 if (ret < 0) {
1138 return ret;
7c673cae
FG
1139 }
1140
11fdf7f2 1141 cr_registry = crs.release();
20effc67
TL
1142
1143 if (use_datacache) {
1144 d3n_data_cache = new D3nDataCache();
1145 d3n_data_cache->init(cct);
1146 }
1147
11fdf7f2 1148 return ret;
7c673cae
FG
1149}
1150
20effc67 1151int RGWRados::register_to_service_map(const DoutPrefixProvider *dpp, const string& daemon_type, const map<string, string>& meta)
7c673cae 1152{
f67539c2
TL
1153 string name = cct->_conf->name.get_id();
1154 if (name.compare(0, 4, "rgw.") == 0) {
1155 name = name.substr(4);
1156 }
11fdf7f2 1157 map<string,string> metadata = meta;
494da23a 1158 metadata["num_handles"] = "1"s;
11fdf7f2
TL
1159 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1160 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1161 metadata["zone_name"] = svc.zone->zone_name();
9f95a23c 1162 metadata["zone_id"] = svc.zone->zone_id().id;
522d829b
TL
1163 metadata["realm_name"] = svc.zone->get_realm().get_name();
1164 metadata["realm_id"] = svc.zone->get_realm().get_id();
f67539c2
TL
1165 metadata["id"] = name;
1166 int ret = rados.service_daemon_register(
1167 daemon_type,
1168 stringify(rados.get_instance_id()),
1169 metadata);
11fdf7f2 1170 if (ret < 0) {
20effc67 1171 ldpp_dout(dpp, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
11fdf7f2 1172 return ret;
7c673cae
FG
1173 }
1174
1175 return 0;
1176}
1177
20effc67 1178int RGWRados::update_service_map(const DoutPrefixProvider *dpp, std::map<std::string, std::string>&& status)
7c673cae 1179{
494da23a 1180 int ret = rados.service_daemon_update_status(move(status));
11fdf7f2 1181 if (ret < 0) {
20effc67 1182 ldpp_dout(dpp, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
11fdf7f2
TL
1183 return ret;
1184 }
1185
1186 return 0;
7c673cae
FG
1187}
1188
1189/**
1190 * Initialize the RADOS instance and prepare to do other ops
1191 * Returns 0 on success, -ERR# on failure.
1192 */
b3b6e05e 1193int RGWRados::init_complete(const DoutPrefixProvider *dpp)
7c673cae 1194{
11fdf7f2 1195 int ret;
7c673cae 1196
11fdf7f2
TL
1197 /*
1198 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1199 */
9f95a23c 1200 sync_module = svc.sync_modules->get_sync_module();
7c673cae 1201
b3b6e05e 1202 ret = open_root_pool_ctx(dpp);
7c673cae
FG
1203 if (ret < 0)
1204 return ret;
1205
b3b6e05e 1206 ret = open_gc_pool_ctx(dpp);
7c673cae
FG
1207 if (ret < 0)
1208 return ret;
1209
b3b6e05e 1210 ret = open_lc_pool_ctx(dpp);
7c673cae
FG
1211 if (ret < 0)
1212 return ret;
1213
b3b6e05e 1214 ret = open_objexp_pool_ctx(dpp);
7c673cae
FG
1215 if (ret < 0)
1216 return ret;
1217
b3b6e05e 1218 ret = open_reshard_pool_ctx(dpp);
31f18b77
FG
1219 if (ret < 0)
1220 return ret;
1221
b3b6e05e 1222 ret = open_notif_pool_ctx(dpp);
f67539c2
TL
1223 if (ret < 0)
1224 return ret;
1225
7c673cae
FG
1226 pools_initialized = true;
1227
522d829b
TL
1228 if (use_gc) {
1229 gc = new RGWGC();
1230 gc->initialize(cct, this);
1231 } else {
1232 ldpp_dout(dpp, 5) << "note: GC not initialized" << dendl;
1233 }
7c673cae 1234
9f95a23c 1235 obj_expirer = new RGWObjectExpirer(this->store);
7c673cae 1236
522d829b 1237 if (use_gc_thread && use_gc) {
7c673cae
FG
1238 gc->start_processor();
1239 obj_expirer->start_processor();
1240 }
1241
11fdf7f2
TL
1242 auto& current_period = svc.zone->get_current_period();
1243 auto& zonegroup = svc.zone->get_zonegroup();
1244 auto& zone_params = svc.zone->get_zone_params();
1245 auto& zone = svc.zone->get_zone();
1246
7c673cae
FG
1247 /* no point of running sync thread if we don't have a master zone configured
1248 or there is no rest_master_conn */
9f95a23c 1249 if (!svc.zone->need_to_sync()) {
7c673cae
FG
1250 run_sync_thread = false;
1251 }
1252
11fdf7f2 1253 if (svc.zone->is_meta_master()) {
9f95a23c 1254 auto md_log = svc.mdlog->get_log(current_period.get_id());
7c673cae
FG
1255 meta_notifier = new RGWMetaNotifier(this, md_log);
1256 meta_notifier->start();
1257 }
1258
11fdf7f2
TL
1259 /* init it anyway, might run sync through radosgw-admin explicitly */
1260 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1261 sync_tracer->init(this);
1262 ret = sync_tracer->hook_to_admin_command();
1263 if (ret < 0) {
1264 return ret;
1265 }
1266
7c673cae 1267 if (run_sync_thread) {
11fdf7f2
TL
1268 for (const auto &pt: zonegroup.placement_targets) {
1269 if (zone_params.placement_pools.find(pt.second.name)
1270 == zone_params.placement_pools.end()){
b3b6e05e 1271 ldpp_dout(dpp, 0) << "WARNING: This zone does not contain the placement target "
11fdf7f2
TL
1272 << pt.second.name << " present in zonegroup" << dendl;
1273 }
1274 }
9f95a23c
TL
1275 auto async_processor = svc.rados->get_async_processor();
1276 std::lock_guard l{meta_sync_thread_lock};
1277 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->store, async_processor);
b3b6e05e 1278 ret = meta_sync_processor_thread->init(dpp);
7c673cae 1279 if (ret < 0) {
b3b6e05e 1280 ldpp_dout(dpp, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
7c673cae
FG
1281 return ret;
1282 }
1283 meta_sync_processor_thread->start();
1284
b32b8144
FG
1285 // configure the bucket trim manager
1286 rgw::BucketTrimConfig config;
1287 rgw::configure_bucket_trim(cct, config);
1288
9f95a23c 1289 bucket_trim.emplace(this->store, config);
b32b8144
FG
1290 ret = bucket_trim->init();
1291 if (ret < 0) {
b3b6e05e 1292 ldpp_dout(dpp, 0) << "ERROR: failed to start bucket trim manager" << dendl;
b32b8144
FG
1293 return ret;
1294 }
9f95a23c 1295 svc.datalog_rados->set_observer(&*bucket_trim);
b32b8144 1296
9f95a23c 1297 std::lock_guard dl{data_sync_thread_lock};
81eedcae 1298 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
b3b6e05e 1299 ldpp_dout(dpp, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
9f95a23c 1300 auto *thread = new RGWDataSyncProcessorThread(this->store, svc.rados->get_async_processor(), source_zone);
b3b6e05e 1301 ret = thread->init(dpp);
7c673cae 1302 if (ret < 0) {
b3b6e05e 1303 ldpp_dout(dpp, 0) << "ERROR: failed to initialize data sync thread" << dendl;
7c673cae
FG
1304 return ret;
1305 }
1306 thread->start();
9f95a23c 1307 data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
7c673cae
FG
1308 }
1309 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1310 if (interval > 0) {
9f95a23c 1311 sync_log_trimmer = new RGWSyncLogTrimThread(this->store, &*bucket_trim, interval);
b3b6e05e 1312 ret = sync_log_trimmer->init(dpp);
7c673cae 1313 if (ret < 0) {
b3b6e05e 1314 ldpp_dout(dpp, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
7c673cae
FG
1315 return ret;
1316 }
1317 sync_log_trimmer->start();
1318 }
1319 }
20effc67
TL
1320 if (cct->_conf->rgw_data_notify_interval_msec) {
1321 data_notifier = new RGWDataNotifier(this);
1322 data_notifier->start();
1323 }
7c673cae 1324
92f5a8d4
TL
1325 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1326 binfo_cache->init(svc.cache);
1327
7c673cae 1328 lc = new RGWLC();
9f95a23c 1329 lc->initialize(cct, this->store);
31f18b77 1330
7c673cae
FG
1331 if (use_lc_thread)
1332 lc->start_processor();
31f18b77 1333
b3b6e05e 1334 quota_handler = RGWQuotaHandler::generate_handler(dpp, this->store, quota_threads);
7c673cae
FG
1335
1336 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
11fdf7f2 1337 zone.bucket_index_max_shards);
31f18b77
FG
1338 if (bucket_index_max_shards > get_max_bucket_shards()) {
1339 bucket_index_max_shards = get_max_bucket_shards();
b3b6e05e 1340 ldpp_dout(dpp, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 1341 << get_max_bucket_shards() << dendl;
7c673cae 1342 }
b3b6e05e 1343 ldpp_dout(dpp, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
7c673cae 1344
11fdf7f2 1345 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
7c673cae
FG
1346
1347 if (need_tombstone_cache) {
1348 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1349 }
1350
11fdf7f2 1351 reshard_wait = std::make_shared<RGWReshardWait>();
31f18b77 1352
9f95a23c 1353 reshard = new RGWReshard(this->store);
31f18b77
FG
1354
1355 /* only the master zone in the zonegroup reshards buckets */
11fdf7f2 1356 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
31f18b77
FG
1357 if (run_reshard_thread) {
1358 reshard->start_processor();
1359 }
1360
1361 index_completion_manager = new RGWIndexCompletionManager(this);
b3b6e05e 1362 ret = index_completion_manager->start(dpp);
f67539c2
TL
1363 if (ret < 0) {
1364 return ret;
1365 }
b3b6e05e 1366 ret = rgw::notify::init(cct, store, dpp);
f67539c2 1367 if (ret < 0 ) {
b3b6e05e 1368 ldpp_dout(dpp, 1) << "ERROR: failed to initialize notification manager" << dendl;
f67539c2 1369 }
31f18b77 1370
7c673cae
FG
1371 return ret;
1372}
1373
b3b6e05e 1374int RGWRados::init_svc(bool raw, const DoutPrefixProvider *dpp)
11fdf7f2
TL
1375{
1376 if (raw) {
b3b6e05e 1377 return svc.init_raw(cct, use_cache, null_yield, dpp);
11fdf7f2
TL
1378 }
1379
b3b6e05e 1380 return svc.init(cct, use_cache, run_sync_thread, null_yield, dpp);
9f95a23c
TL
1381}
1382
b3b6e05e 1383int RGWRados::init_ctl(const DoutPrefixProvider *dpp)
9f95a23c 1384{
b3b6e05e 1385 return ctl.init(&svc, dpp);
11fdf7f2
TL
1386}
1387
7c673cae
FG
1388/**
1389 * Initialize the RADOS instance and prepare to do other ops
1390 * Returns 0 on success, -ERR# on failure.
1391 */
b3b6e05e 1392int RGWRados::initialize(const DoutPrefixProvider *dpp)
7c673cae
FG
1393{
1394 int ret;
1395
11fdf7f2
TL
1396 inject_notify_timeout_probability =
1397 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1398 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
7c673cae 1399
b3b6e05e 1400 ret = init_svc(false, dpp);
7c673cae 1401 if (ret < 0) {
b3b6e05e 1402 ldpp_dout(dpp, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
7c673cae
FG
1403 return ret;
1404 }
7c673cae 1405
b3b6e05e 1406 ret = init_ctl(dpp);
9f95a23c 1407 if (ret < 0) {
b3b6e05e 1408 ldpp_dout(dpp, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
9f95a23c
TL
1409 return ret;
1410 }
1411
11fdf7f2 1412 host_id = svc.zone_utils->gen_host_id();
7c673cae 1413
11fdf7f2
TL
1414 ret = init_rados();
1415 if (ret < 0)
1416 return ret;
1417
b3b6e05e 1418 return init_complete(dpp);
7c673cae
FG
1419}
1420
1421/**
1422 * Open the pool used as root for this gateway
1423 * Returns: 0 on success, -ERR# otherwise.
1424 */
b3b6e05e 1425int RGWRados::open_root_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1426{
b3b6e05e 1427 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
7c673cae
FG
1428}
1429
b3b6e05e 1430int RGWRados::open_gc_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1431{
b3b6e05e 1432 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
7c673cae
FG
1433}
1434
b3b6e05e 1435int RGWRados::open_lc_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1436{
b3b6e05e 1437 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
7c673cae
FG
1438}
1439
b3b6e05e 1440int RGWRados::open_objexp_pool_ctx(const DoutPrefixProvider *dpp)
7c673cae 1441{
b3b6e05e 1442 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
7c673cae
FG
1443}
1444
b3b6e05e 1445int RGWRados::open_reshard_pool_ctx(const DoutPrefixProvider *dpp)
31f18b77 1446{
b3b6e05e 1447 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
7c673cae
FG
1448}
1449
b3b6e05e 1450int RGWRados::open_notif_pool_ctx(const DoutPrefixProvider *dpp)
f67539c2 1451{
b3b6e05e 1452 return rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().notif_pool, notif_pool_ctx, true, true);
f67539c2
TL
1453}
1454
b3b6e05e 1455int RGWRados::open_pool_ctx(const DoutPrefixProvider *dpp, const rgw_pool& pool, librados::IoCtx& io_ctx,
494da23a 1456 bool mostly_omap)
7c673cae 1457{
28e407b8 1458 constexpr bool create = true; // create the pool if it doesn't exist
b3b6e05e 1459 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create, mostly_omap);
7c673cae
FG
1460}
1461
7c673cae
FG
1462/**** logs ****/
1463
1464struct log_list_state {
1465 string prefix;
1466 librados::IoCtx io_ctx;
1467 librados::NObjectIterator obit;
1468};
1469
b3b6e05e 1470int RGWRados::log_list_init(const DoutPrefixProvider *dpp, const string& prefix, RGWAccessHandle *handle)
7c673cae
FG
1471{
1472 log_list_state *state = new log_list_state;
b3b6e05e 1473 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1474 if (r < 0) {
1475 delete state;
1476 return r;
1477 }
1478 state->prefix = prefix;
1479 state->obit = state->io_ctx.nobjects_begin();
1480 *handle = (RGWAccessHandle)state;
1481 return 0;
1482}
1483
1484int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1485{
1486 log_list_state *state = static_cast<log_list_state *>(handle);
1487 while (true) {
1488 if (state->obit == state->io_ctx.nobjects_end()) {
1489 delete state;
1490 return -ENOENT;
1491 }
1492 if (state->prefix.length() &&
1493 state->obit->get_oid().find(state->prefix) != 0) {
1494 state->obit++;
1495 continue;
1496 }
1497 *name = state->obit->get_oid();
1498 state->obit++;
1499 break;
1500 }
1501 return 0;
1502}
1503
b3b6e05e 1504int RGWRados::log_remove(const DoutPrefixProvider *dpp, const string& name)
7c673cae
FG
1505{
1506 librados::IoCtx io_ctx;
b3b6e05e 1507 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
1508 if (r < 0)
1509 return r;
1510 return io_ctx.remove(name);
1511}
1512
1513struct log_show_state {
1514 librados::IoCtx io_ctx;
1515 bufferlist bl;
11fdf7f2 1516 bufferlist::const_iterator p;
7c673cae
FG
1517 string name;
1518 uint64_t pos;
1519 bool eof;
1520 log_show_state() : pos(0), eof(false) {}
1521};
1522
b3b6e05e 1523int RGWRados::log_show_init(const DoutPrefixProvider *dpp, const string& name, RGWAccessHandle *handle)
7c673cae
FG
1524{
1525 log_show_state *state = new log_show_state;
b3b6e05e 1526 int r = rgw_init_ioctx(dpp, get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1527 if (r < 0) {
1528 delete state;
1529 return r;
1530 }
1531 state->name = name;
1532 *handle = (RGWAccessHandle)state;
1533 return 0;
1534}
1535
20effc67 1536int RGWRados::log_show_next(const DoutPrefixProvider *dpp, RGWAccessHandle handle, rgw_log_entry *entry)
7c673cae
FG
1537{
1538 log_show_state *state = static_cast<log_show_state *>(handle);
1539 off_t off = state->p.get_off();
1540
20effc67 1541 ldpp_dout(dpp, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
7c673cae
FG
1542 << " off " << off
1543 << " eof " << (int)state->eof
1544 << dendl;
1545 // read some?
1546 unsigned chunk = 1024*1024;
1547 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1548 bufferlist more;
1549 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1550 if (r < 0)
1551 return r;
1552 state->pos += r;
1553 bufferlist old;
1554 try {
1555 old.substr_of(state->bl, off, state->bl.length() - off);
1556 } catch (buffer::error& err) {
1557 return -EINVAL;
1558 }
f67539c2 1559 state->bl = std::move(old);
7c673cae 1560 state->bl.claim_append(more);
11fdf7f2 1561 state->p = state->bl.cbegin();
7c673cae
FG
1562 if ((unsigned)r < chunk)
1563 state->eof = true;
20effc67 1564 ldpp_dout(dpp, 10) << " read " << r << dendl;
7c673cae
FG
1565 }
1566
1567 if (state->p.end())
1568 return 0; // end of file
1569 try {
11fdf7f2 1570 decode(*entry, state->p);
7c673cae
FG
1571 }
1572 catch (const buffer::error &e) {
1573 return -EINVAL;
1574 }
1575 return 1;
1576}
1577
1578/**
1579 * usage_log_hash: get usage log key hash, based on name and index
1580 *
1581 * Get the usage object name. Since a user may have more than 1
1582 * object holding that info (multiple shards), we use index to
1583 * specify that shard number. Once index exceeds max shards it
1584 * wraps.
1585 * If name is not being set, results for all users will be returned
1586 * and index will wrap only after total shards number.
1587 *
1588 * @param cct [in] ceph context
1589 * @param name [in] user name
1590 * @param hash [out] hash value
1591 * @param index [in] shard index number
1592 */
1593static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1594{
1595 uint32_t val = index;
1596
1597 if (!name.empty()) {
c07f9fc5 1598 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
1599 val %= max_user_shards;
1600 val += ceph_str_hash_linux(name.c_str(), name.size());
1601 }
1602 char buf[17];
c07f9fc5 1603 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
1604 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1605 hash = buf;
1606}
1607
b3b6e05e 1608int RGWRados::log_usage(const DoutPrefixProvider *dpp, map<rgw_user_bucket, RGWUsageBatch>& usage_info)
7c673cae
FG
1609{
1610 uint32_t index = 0;
1611
1612 map<string, rgw_usage_log_info> log_objs;
1613
1614 string hash;
1615 string last_user;
1616
1617 /* restructure usage map, zone by object hash */
1618 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1619 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1620 const rgw_user_bucket& ub = iter->first;
1621 RGWUsageBatch& info = iter->second;
1622
1623 if (ub.user.empty()) {
b3b6e05e 1624 ldpp_dout(dpp, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
7c673cae
FG
1625 continue;
1626 }
1627
1628 if (ub.user != last_user) {
1629 /* index *should* be random, but why waste extra cycles
1630 in most cases max user shards is not going to exceed 1,
1631 so just incrementing it */
1632 usage_log_hash(cct, ub.user, hash, index++);
1633 }
1634 last_user = ub.user;
1635 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1636
1637 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1638 v.push_back(miter->second);
1639 }
1640 }
1641
1642 map<string, rgw_usage_log_info>::iterator liter;
1643
1644 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
b3b6e05e 1645 int r = cls_obj_usage_log_add(dpp, liter->first, liter->second);
7c673cae
FG
1646 if (r < 0)
1647 return r;
1648 }
1649 return 0;
1650}
1651
b3b6e05e 1652int RGWRados::read_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
11fdf7f2
TL
1653 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1654 rgw_usage_log_entry>& usage)
7c673cae
FG
1655{
1656 uint32_t num = max_entries;
1657 string hash, first_hash;
1658 string user_str = user.to_str();
1659 usage_log_hash(cct, user_str, first_hash, 0);
1660
1661 if (usage_iter.index) {
1662 usage_log_hash(cct, user_str, hash, usage_iter.index);
1663 } else {
1664 hash = first_hash;
1665 }
1666
1667 usage.clear();
1668
1669 do {
1670 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1671 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1672
b3b6e05e 1673 int ret = cls_obj_usage_log_read(dpp, hash, user_str, bucket_name, start_epoch, end_epoch, num,
7c673cae
FG
1674 usage_iter.read_iter, ret_usage, is_truncated);
1675 if (ret == -ENOENT)
1676 goto next;
1677
1678 if (ret < 0)
1679 return ret;
1680
1681 num -= ret_usage.size();
1682
1683 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1684 usage[iter->first].aggregate(iter->second);
1685 }
1686
1687next:
1688 if (!*is_truncated) {
1689 usage_iter.read_iter.clear();
1690 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1691 }
1692 } while (num && !*is_truncated && hash != first_hash);
1693 return 0;
1694}
1695
b3b6e05e 1696int RGWRados::trim_usage(const DoutPrefixProvider *dpp, const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
7c673cae
FG
1697{
1698 uint32_t index = 0;
1699 string hash, first_hash;
1700 string user_str = user.to_str();
1701 usage_log_hash(cct, user_str, first_hash, index);
1702
1703 hash = first_hash;
7c673cae 1704 do {
b3b6e05e 1705 int ret = cls_obj_usage_log_trim(dpp, hash, user_str, bucket_name, start_epoch, end_epoch);
7c673cae 1706
b32b8144 1707 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
1708 return ret;
1709
7c673cae
FG
1710 usage_log_hash(cct, user_str, hash, ++index);
1711 } while (hash != first_hash);
1712
1713 return 0;
1714}
1715
11fdf7f2 1716
b3b6e05e 1717int RGWRados::clear_usage(const DoutPrefixProvider *dpp)
11fdf7f2
TL
1718{
1719 auto max_shards = cct->_conf->rgw_usage_max_shards;
1720 int ret=0;
1721 for (unsigned i=0; i < max_shards; i++){
1722 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
b3b6e05e 1723 ret = cls_obj_usage_log_clear(dpp, oid);
11fdf7f2 1724 if (ret < 0){
b3b6e05e 1725 ldpp_dout(dpp,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
11fdf7f2
TL
1726 return ret;
1727 }
1728 }
1729 return ret;
1730}
1731
20effc67 1732int RGWRados::decode_policy(const DoutPrefixProvider *dpp, bufferlist& bl, ACLOwner *owner)
7c673cae 1733{
9f95a23c
TL
1734 auto i = bl.cbegin();
1735 RGWAccessControlPolicy policy(cct);
1736 try {
1737 policy.decode_owner(i);
1738 } catch (buffer::error& err) {
20effc67 1739 ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
9f95a23c 1740 return -EIO;
7c673cae 1741 }
9f95a23c
TL
1742 *owner = policy.get_owner();
1743 return 0;
7c673cae
FG
1744}
1745
b3b6e05e 1746int rgw_policy_from_attrset(const DoutPrefixProvider *dpp, CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
7c673cae 1747{
9f95a23c
TL
1748 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
1749 if (aiter == attrset.end())
1750 return -EIO;
7c673cae 1751
9f95a23c
TL
1752 bufferlist& bl = aiter->second;
1753 auto iter = bl.cbegin();
1754 try {
1755 policy->decode(iter);
1756 } catch (buffer::error& err) {
b3b6e05e 1757 ldpp_dout(dpp, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
9f95a23c
TL
1758 return -EIO;
1759 }
1760 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
1761 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
b3b6e05e 1762 ldpp_dout(dpp, 15) << __func__ << " Read AccessControlPolicy";
9f95a23c
TL
1763 s3policy->to_xml(*_dout);
1764 *_dout << dendl;
1765 }
1766 return 0;
7c673cae
FG
1767}
1768
7c673cae 1769
b3b6e05e 1770int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id, const DoutPrefixProvider *dpp)
7c673cae 1771{
9f95a23c
TL
1772 rgw_bucket bucket = bucket_info.bucket;
1773 bucket.update_bucket_id(new_bucket_id);
7c673cae 1774
9f95a23c 1775 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 1776
9f95a23c 1777 bucket_info.objv_tracker.clear();
b3b6e05e 1778 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr, null_yield, dpp);
9f95a23c
TL
1779 if (ret < 0) {
1780 return ret;
7c673cae
FG
1781 }
1782
9f95a23c 1783 return 0;
eafe8130
TL
1784}
1785
1786
1adf2230
AA
1787/**
1788 * Get ordered listing of the objects in a bucket.
7c673cae 1789 *
9f95a23c 1790 * max_p: maximum number of results to return
7c673cae
FG
1791 * bucket: bucket to list contents of
1792 * prefix: only return results that match this prefix
1793 * delim: do not include results that match this string.
1794 * Any skipped results will have the matching portion of their name
1795 * inserted in common_prefixes with a "true" mark.
1796 * marker: if filled in, begin the listing with this object.
1797 * end_marker: if filled in, end the listing with this object.
1798 * result: the objects are put in here.
11fdf7f2
TL
1799 * common_prefixes: if delim is filled in, any matching prefixes are
1800 * placed here.
1801 * is_truncated: if number of objects in the bucket is bigger than
1802 * max, then truncated.
7c673cae 1803 */
11fdf7f2 1804int RGWRados::Bucket::List::list_objects_ordered(
b3b6e05e 1805 const DoutPrefixProvider *dpp,
eafe8130 1806 int64_t max_p,
20effc67
TL
1807 std::vector<rgw_bucket_dir_entry> *result,
1808 std::map<std::string, bool> *common_prefixes,
9f95a23c
TL
1809 bool *is_truncated,
1810 optional_yield y)
7c673cae
FG
1811{
1812 RGWRados *store = target->get_store();
1813 CephContext *cct = store->ctx();
1814 int shard_id = target->get_shard_id();
1815
1816 int count = 0;
1817 bool truncated = true;
9f95a23c 1818 bool cls_filtered = false;
eafe8130
TL
1819 const int64_t max = // protect against memory issues and negative vals
1820 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
1821 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
7c673cae
FG
1822
1823 result->clear();
1824
9f95a23c
TL
1825 // use a local marker; either the marker will have a previous entry
1826 // or it will be empty; either way it's OK to copy
1827 rgw_obj_key marker_obj(params.marker.name,
1828 params.marker.instance,
f91f0fd5 1829 params.ns.empty() ? params.marker.ns : params.ns);
7c673cae
FG
1830 rgw_obj_index_key cur_marker;
1831 marker_obj.get_index_key(&cur_marker);
1832
9f95a23c
TL
1833 rgw_obj_key end_marker_obj(params.end_marker.name,
1834 params.end_marker.instance,
f91f0fd5 1835 params.ns.empty() ? params.end_marker.ns : params.ns);
3efd9988
FG
1836 rgw_obj_index_key cur_end_marker;
1837 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
1838 const bool cur_end_marker_valid = !params.end_marker.empty();
1839
1840 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 1841 prefix_obj.set_ns(params.ns);
20effc67
TL
1842 std::string cur_prefix = prefix_obj.get_index_key_name();
1843 std::string after_delim_s; /* needed in !params.delim.empty() AND later */
7c673cae
FG
1844
1845 if (!params.delim.empty()) {
9f95a23c 1846 after_delim_s = cls_rgw_after_delim(params.delim);
11fdf7f2
TL
1847 /* if marker points at a common prefix, fast forward it into its
1848 * upper bound string */
224ce89b 1849 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
1850 if (delim_pos >= 0) {
1851 string s = cur_marker.name.substr(0, delim_pos);
11fdf7f2 1852 s.append(after_delim_s);
7c673cae
FG
1853 cur_marker = s;
1854 }
1855 }
1adf2230 1856
20effc67
TL
1857 // we'll stop after this many attempts as long we return at least
1858 // one entry; but we will also go beyond this number of attempts
1859 // until we return at least one entry
1860 constexpr uint16_t SOFT_MAX_ATTEMPTS = 8;
1861
9f95a23c 1862 rgw_obj_index_key prev_marker;
f6b5b4d7 1863 for (uint16_t attempt = 1; /* empty */; ++attempt) {
20effc67
TL
1864 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1865 ": starting attempt " << attempt << dendl;
9f95a23c
TL
1866
1867 if (attempt > 1 && !(prev_marker < cur_marker)) {
1868 // we've failed to make forward progress
20effc67
TL
1869 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
1870 " marker failed to make forward progress; attempt=" << attempt <<
9f95a23c
TL
1871 ", prev_marker=" << prev_marker <<
1872 ", cur_marker=" << cur_marker << dendl;
1873 break;
1874 }
1875 prev_marker = cur_marker;
1876
1877 ent_map_t ent_map;
1878 ent_map.reserve(read_ahead);
b3b6e05e
TL
1879 int r = store->cls_bucket_list_ordered(dpp,
1880 target->get_bucket_info(),
1adf2230
AA
1881 shard_id,
1882 cur_marker,
1883 cur_prefix,
9f95a23c 1884 params.delim,
1adf2230
AA
1885 read_ahead + 1 - count,
1886 params.list_versions,
9f95a23c 1887 attempt,
1adf2230
AA
1888 ent_map,
1889 &truncated,
9f95a23c
TL
1890 &cls_filtered,
1891 &cur_marker,
20effc67
TL
1892 y,
1893 params.force_check_filter);
9f95a23c 1894 if (r < 0) {
7c673cae 1895 return r;
9f95a23c 1896 }
7c673cae 1897
1adf2230 1898 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
1899 rgw_bucket_dir_entry& entry = eiter->second;
1900 rgw_obj_index_key index_key = entry.key;
7c673cae
FG
1901 rgw_obj_key obj(index_key);
1902
20effc67
TL
1903 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1904 ": considering entry " << entry.key << dendl;
9f95a23c 1905
1adf2230
AA
1906 /* note that parse_raw_oid() here will not set the correct
1907 * object's instance, as rgw_obj_index_key encodes that
1908 * separately. We don't need to set the instance because it's
1909 * not needed for the checks here and we end up using the raw
1910 * entry for the return vector
7c673cae
FG
1911 */
1912 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
1913 if (!valid) {
20effc67
TL
1914 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
1915 " could not parse object name: " << obj.name << dendl;
7c673cae
FG
1916 continue;
1917 }
11fdf7f2 1918
9f95a23c 1919 bool matched_ns = (obj.ns == params.ns);
7c673cae 1920 if (!params.list_versions && !entry.is_visible()) {
20effc67
TL
1921 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1922 ": skipping not visible entry \"" << entry.key << "\"" << dendl;
7c673cae
FG
1923 continue;
1924 }
1925
9f95a23c 1926 if (params.enforce_ns && !matched_ns) {
7c673cae
FG
1927 if (!params.ns.empty()) {
1928 /* we've iterated past the namespace we're searching -- done now */
1929 truncated = false;
20effc67
TL
1930 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1931 ": finished due to getting past requested namespace \"" <<
1932 params.ns << "\"" << dendl;
7c673cae
FG
1933 goto done;
1934 }
1935
20effc67
TL
1936 /* we're skipping past namespaced objects */
1937 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1938 ": skipping past namespaced objects, including \"" << entry.key <<
1939 "\"" << dendl;
7c673cae
FG
1940 continue;
1941 }
1942
1943 if (cur_end_marker_valid && cur_end_marker <= index_key) {
1944 truncated = false;
20effc67
TL
1945 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1946 ": finished due to gitting end marker of \"" << cur_end_marker <<
1947 "\" with \"" << entry.key << "\"" << dendl;
7c673cae
FG
1948 goto done;
1949 }
1950
1951 if (count < max) {
9f95a23c
TL
1952 params.marker = index_key;
1953 next_marker = index_key;
7c673cae
FG
1954 }
1955
20effc67
TL
1956 if (params.access_list_filter &&
1957 ! params.access_list_filter->filter(obj.name, index_key.name)) {
1958 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1959 ": skipping past namespaced objects, including \"" << entry.key <<
1960 "\"" << dendl;
7c673cae 1961 continue;
9f95a23c 1962 }
7c673cae 1963
1adf2230 1964 if (params.prefix.size() &&
9f95a23c 1965 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
20effc67
TL
1966 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
1967 ": skipping object \"" << entry.key <<
1968 "\" that doesn't match prefix \"" << params.prefix << "\"" << dendl;
7c673cae 1969 continue;
9f95a23c 1970 }
7c673cae
FG
1971
1972 if (!params.delim.empty()) {
9f95a23c
TL
1973 const int delim_pos = obj.name.find(params.delim, params.prefix.size());
1974 if (delim_pos >= 0) {
1975 // run either the code where delimiter filtering is done a)
1976 // in the OSD/CLS or b) here.
1977 if (cls_filtered) {
1978 // NOTE: this condition is for the newer versions of the
20effc67
TL
1979 // OSD that does filtering on the CLS side should only
1980 // find one delimiter at the end if it finds any after the
1981 // prefix
9f95a23c
TL
1982 if (delim_pos !=
1983 int(obj.name.length() - params.delim.length())) {
20effc67
TL
1984 ldpp_dout(dpp, 0) << "WARNING: " << __PRETTY_FUNCTION__ <<
1985 " found delimiter in place other than the end of "
9f95a23c
TL
1986 "the prefix; obj.name=" << obj.name <<
1987 ", prefix=" << params.prefix << dendl;
1988 }
1989 if (common_prefixes) {
1990 if (count >= max) {
1991 truncated = true;
20effc67
TL
1992 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
1993 ": stopping early with common prefix \"" << entry.key <<
1994 "\" because requested number (" << max <<
1995 ") reached (cls filtered)" << dendl;
9f95a23c
TL
1996 goto done;
1997 }
1998
1999 (*common_prefixes)[obj.name] = true;
2000 count++;
2001 }
2002
20effc67
TL
2003 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2004 ": finished entry with common prefix \"" << entry.key <<
2005 "\" so continuing loop (cls filtered)" << dendl;
9f95a23c
TL
2006 continue;
2007 } else {
2008 // NOTE: this condition is for older versions of the OSD
2009 // that do not filter on the CLS side, so the following code
2010 // must do the filtering; once we reach version 16 of ceph,
2011 // this code can be removed along with the conditional that
2012 // can lead this way
2013
2014 /* extract key -with trailing delimiter- for CommonPrefix */
2015 string prefix_key =
2016 obj.name.substr(0, delim_pos + params.delim.length());
2017
2018 if (common_prefixes &&
2019 common_prefixes->find(prefix_key) == common_prefixes->end()) {
2020 if (count >= max) {
2021 truncated = true;
20effc67
TL
2022 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2023 ": stopping early with common prefix \"" << entry.key <<
2024 "\" because requested number (" << max <<
2025 ") reached (not cls filtered)" << dendl;
9f95a23c
TL
2026 goto done;
2027 }
2028 next_marker = prefix_key;
2029 (*common_prefixes)[prefix_key] = true;
2030
2031 count++;
2032 }
2033
20effc67
TL
2034 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2035 ": finished entry with common prefix \"" << entry.key <<
2036 "\" so continuing loop (not cls filtered)" << dendl;
9f95a23c
TL
2037 continue;
2038 } // if we're running an older OSD version
2039 } // if a delimiter was found after prefix
2040 } // if a delimiter was passed in
7c673cae
FG
2041
2042 if (count >= max) {
2043 truncated = true;
20effc67
TL
2044 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2045 ": stopping early with entry \"" << entry.key <<
2046 "\" because requested number (" << max <<
2047 ") reached" << dendl;
7c673cae
FG
2048 goto done;
2049 }
2050
20effc67
TL
2051 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2052 ": adding entry " << entry.key << " to result" << dendl;
9f95a23c 2053
7c673cae
FG
2054 result->emplace_back(std::move(entry));
2055 count++;
9f95a23c
TL
2056 } // eiter for loop
2057
2058 // NOTE: the following conditional is needed by older versions of
2059 // the OSD that don't do delimiter filtering on the CLS side; once
2060 // we reach version 16 of ceph, the following conditional and the
2061 // code within can be removed
2062 if (!cls_filtered && !params.delim.empty()) {
2063 int marker_delim_pos =
2064 cur_marker.name.find(params.delim, cur_prefix.size());
eafe8130 2065 if (marker_delim_pos >= 0) {
9f95a23c
TL
2066 std::string skip_after_delim =
2067 cur_marker.name.substr(0, marker_delim_pos);
eafe8130
TL
2068 skip_after_delim.append(after_delim_s);
2069
20effc67
TL
2070 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2071 ": skip_after_delim=" << skip_after_delim << dendl;
eafe8130
TL
2072
2073 if (skip_after_delim > cur_marker.name) {
2074 cur_marker = skip_after_delim;
20effc67
TL
2075 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2076 ": setting cur_marker=" << cur_marker.name <<
2077 "[" << cur_marker.instance << "]" << dendl;
eafe8130
TL
2078 }
2079 }
9f95a23c
TL
2080 } // if older osd didn't do delimiter filtering
2081
20effc67
TL
2082 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2083 ": end of outer loop, truncated=" << truncated <<
9f95a23c
TL
2084 ", count=" << count << ", attempt=" << attempt << dendl;
2085
2086 if (!truncated || count >= (max + 1) / 2) {
2087 // if we finished listing, or if we're returning at least half the
2088 // requested entries, that's enough; S3 and swift protocols allow
2089 // returning fewer than max entries
20effc67
TL
2090 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2091 ": exiting attempt loop because we reached end (" << truncated <<
2092 ") or we're returning half the requested entries (" << count <<
2093 " of " << max << ")" << dendl;
9f95a23c 2094 break;
20effc67 2095 } else if (attempt > SOFT_MAX_ATTEMPTS && count >= 1) {
9f95a23c
TL
2096 // if we've made at least 8 attempts and we have some, but very
2097 // few, results, return with what we have
20effc67
TL
2098 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
2099 ": exiting attempt loop because we made " << attempt <<
2100 " attempts and we're returning " << count << " entries" << dendl;
9f95a23c 2101 break;
eafe8130 2102 }
f6b5b4d7 2103 } // for (uint16_t attempt...
7c673cae
FG
2104
2105done:
9f95a23c
TL
2106
2107 if (is_truncated) {
7c673cae 2108 *is_truncated = truncated;
9f95a23c 2109 }
7c673cae
FG
2110
2111 return 0;
1adf2230
AA
2112} // list_objects_ordered
2113
2114
2115/**
2116 * Get listing of the objects in a bucket and allow the results to be out
2117 * of order.
2118 *
2119 * Even though there are key differences with the ordered counterpart,
2120 * the parameters are the same to maintain some compatability.
2121 *
2122 * max: maximum number of results to return
2123 * bucket: bucket to list contents of
2124 * prefix: only return results that match this prefix
2125 * delim: should not be set; if it is we should have indicated an error
2126 * marker: if filled in, begin the listing with this object.
2127 * end_marker: if filled in, end the listing with this object.
2128 * result: the objects are put in here.
2129 * common_prefixes: this is never filled with an unordered list; the param
2130 * is maintained for compatibility
2131 * is_truncated: if number of objects in the bucket is bigger than max, then
2132 * truncated.
2133 */
20effc67 2134int RGWRados::Bucket::List::list_objects_unordered(const DoutPrefixProvider *dpp,
b3b6e05e 2135 int64_t max_p,
20effc67
TL
2136 std::vector<rgw_bucket_dir_entry>* result,
2137 std::map<std::string, bool>* common_prefixes,
2138 bool* is_truncated,
9f95a23c 2139 optional_yield y)
1adf2230
AA
2140{
2141 RGWRados *store = target->get_store();
1adf2230
AA
2142 int shard_id = target->get_shard_id();
2143
2144 int count = 0;
2145 bool truncated = true;
2146
eafe8130
TL
2147 const int64_t max = // protect against memory issues and negative vals
2148 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2149
1adf2230
AA
2150 // read a few extra in each call to cls_bucket_list_unordered in
2151 // case some are filtered out due to namespace matching, versioning,
2152 // filtering, etc.
2153 const int64_t max_read_ahead = 100;
2154 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2155
2156 result->clear();
2157
9f95a23c
TL
2158 // use a local marker; either the marker will have a previous entry
2159 // or it will be empty; either way it's OK to copy
11fdf7f2
TL
2160 rgw_obj_key marker_obj(params.marker.name,
2161 params.marker.instance,
f91f0fd5 2162 params.ns.empty() ? params.marker.ns : params.ns);
1adf2230
AA
2163 rgw_obj_index_key cur_marker;
2164 marker_obj.get_index_key(&cur_marker);
2165
11fdf7f2
TL
2166 rgw_obj_key end_marker_obj(params.end_marker.name,
2167 params.end_marker.instance,
f91f0fd5 2168 params.ns.empty() ? params.end_marker.ns : params.ns);
1adf2230
AA
2169 rgw_obj_index_key cur_end_marker;
2170 end_marker_obj.get_index_key(&cur_end_marker);
2171 const bool cur_end_marker_valid = !params.end_marker.empty();
2172
2173 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 2174 prefix_obj.set_ns(params.ns);
20effc67 2175 std::string cur_prefix = prefix_obj.get_index_key_name();
1adf2230
AA
2176
2177 while (truncated && count <= max) {
2178 std::vector<rgw_bucket_dir_entry> ent_list;
9f95a23c
TL
2179 ent_list.reserve(read_ahead);
2180
20effc67 2181 int r = store->cls_bucket_list_unordered(dpp,
b3b6e05e 2182 target->get_bucket_info(),
1adf2230
AA
2183 shard_id,
2184 cur_marker,
2185 cur_prefix,
2186 read_ahead,
2187 params.list_versions,
2188 ent_list,
2189 &truncated,
9f95a23c
TL
2190 &cur_marker,
2191 y);
20effc67
TL
2192 if (r < 0) {
2193 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
2194 " cls_bucket_list_unordered returned " << r << " for " <<
2195 target->get_bucket_info().bucket << dendl;
1adf2230 2196 return r;
20effc67 2197 }
1adf2230
AA
2198
2199 // NB: while regions of ent_list will be sorted, we have no
2200 // guarantee that all items will be sorted since they can cross
2201 // shard boundaries
2202
2203 for (auto& entry : ent_list) {
2204 rgw_obj_index_key index_key = entry.key;
2205 rgw_obj_key obj(index_key);
2206
9f95a23c
TL
2207 if (count < max) {
2208 params.marker.set(index_key);
2209 next_marker.set(index_key);
2210 }
2211
1adf2230
AA
2212 /* note that parse_raw_oid() here will not set the correct
2213 * object's instance, as rgw_obj_index_key encodes that
2214 * separately. We don't need to set the instance because it's
2215 * not needed for the checks here and we end up using the raw
2216 * entry for the return vector
2217 */
2218 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2219 if (!valid) {
20effc67
TL
2220 ldpp_dout(dpp, 0) << "ERROR: " << __PRETTY_FUNCTION__ <<
2221 " could not parse object name: " << obj.name << dendl;
1adf2230
AA
2222 continue;
2223 }
2224
2225 if (!params.list_versions && !entry.is_visible()) {
20effc67
TL
2226 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2227 ": skippping \"" << index_key <<
2228 "\" because not listing versions and entry not visibile" << dendl;
1adf2230
AA
2229 continue;
2230 }
2231
2232 if (params.enforce_ns && obj.ns != params.ns) {
20effc67
TL
2233 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2234 ": skippping \"" << index_key <<
2235 "\" because namespace does not match" << dendl;
1adf2230
AA
2236 continue;
2237 }
2238
2239 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2240 // we're not guaranteed items will come in order, so we have
2241 // to loop through all
20effc67
TL
2242 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2243 ": skippping \"" << index_key <<
2244 "\" because after end_marker" << dendl;
1adf2230
AA
2245 continue;
2246 }
2247
20effc67
TL
2248 if (params.access_list_filter &&
2249 !params.access_list_filter->filter(obj.name, index_key.name)) {
2250 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2251 ": skippping \"" << index_key <<
2252 "\" because doesn't match filter" << dendl;
1adf2230 2253 continue;
20effc67 2254 }
1adf2230
AA
2255
2256 if (params.prefix.size() &&
20effc67
TL
2257 (0 != obj.name.compare(0, params.prefix.size(), params.prefix))) {
2258 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
2259 ": skippping \"" << index_key <<
2260 "\" because doesn't match prefix" << dendl;
2261 continue;
2262 }
1adf2230
AA
2263
2264 if (count >= max) {
2265 truncated = true;
2266 goto done;
2267 }
2268
2269 result->emplace_back(std::move(entry));
2270 count++;
2271 } // for (auto& entry : ent_list)
2272 } // while (truncated && count <= max)
2273
2274done:
20effc67
TL
2275
2276 if (is_truncated) {
1adf2230 2277 *is_truncated = truncated;
20effc67 2278 }
1adf2230
AA
2279
2280 return 0;
2281} // list_objects_unordered
2282
7c673cae
FG
2283
2284/**
2285 * create a rados pool, associated meta info
2286 * returns 0 on success, -ERR# otherwise.
2287 */
b3b6e05e 2288int RGWRados::create_pool(const DoutPrefixProvider *dpp, const rgw_pool& pool)
7c673cae 2289{
c07f9fc5 2290 librados::IoCtx io_ctx;
28e407b8 2291 constexpr bool create = true;
b3b6e05e 2292 return rgw_init_ioctx(dpp, get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
2293}
2294
9f95a23c 2295void RGWRados::create_bucket_id(string *bucket_id)
7c673cae 2296{
9f95a23c
TL
2297 uint64_t iid = instance_id();
2298 uint64_t bid = next_bucket_id();
2299 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2300 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2301 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2302 *bucket_id = buf;
2303}
7c673cae 2304
11fdf7f2 2305int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
7c673cae 2306 const string& zonegroup_id,
11fdf7f2 2307 const rgw_placement_rule& placement_rule,
7c673cae
FG
2308 const string& swift_ver_location,
2309 const RGWQuotaInfo * pquota_info,
2310 map<std::string, bufferlist>& attrs,
2311 RGWBucketInfo& info,
2312 obj_version *pobjv,
2313 obj_version *pep_objv,
2314 real_time creation_time,
2315 rgw_bucket *pmaster_bucket,
2316 uint32_t *pmaster_num_shards,
f67539c2 2317 optional_yield y,
b3b6e05e 2318 const DoutPrefixProvider *dpp,
7c673cae
FG
2319 bool exclusive)
2320{
2321#define MAX_CREATE_RETRIES 20 /* need to bound retries */
11fdf7f2 2322 rgw_placement_rule selected_placement_rule;
7c673cae
FG
2323 RGWZonePlacementInfo rule_info;
2324
2325 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2326 int ret = 0;
b3b6e05e 2327 ret = svc.zone->select_bucket_placement(dpp, owner, zonegroup_id, placement_rule,
f67539c2 2328 &selected_placement_rule, &rule_info, y);
7c673cae
FG
2329 if (ret < 0)
2330 return ret;
2331
2332 if (!pmaster_bucket) {
2333 create_bucket_id(&bucket.marker);
2334 bucket.bucket_id = bucket.marker;
2335 } else {
2336 bucket.marker = pmaster_bucket->marker;
2337 bucket.bucket_id = pmaster_bucket->bucket_id;
2338 }
2339
2340 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2341
9f95a23c
TL
2342 objv_tracker.read_version.clear();
2343
7c673cae
FG
2344 if (pobjv) {
2345 objv_tracker.write_version = *pobjv;
2346 } else {
2347 objv_tracker.generate_new_write_ver(cct);
2348 }
2349
2350 info.bucket = bucket;
2351 info.owner = owner.user_id;
2352 info.zonegroup = zonegroup_id;
11fdf7f2 2353 info.placement_rule = selected_placement_rule;
7c673cae
FG
2354 info.swift_ver_location = swift_ver_location;
2355 info.swift_versioning = (!swift_ver_location.empty());
f67539c2
TL
2356
2357 init_default_bucket_layout(cct, info.layout, svc.zone->get_zone(),
2358 pmaster_num_shards ?
2359 std::optional{*pmaster_num_shards} :
2360 std::nullopt,
2361 rule_info.index_type);
2362
7c673cae
FG
2363 info.requester_pays = false;
2364 if (real_clock::is_zero(creation_time)) {
2365 info.creation_time = ceph::real_clock::now();
2366 } else {
2367 info.creation_time = creation_time;
2368 }
2369 if (pquota_info) {
2370 info.quota = *pquota_info;
2371 }
2372
b3b6e05e 2373 int r = svc.bi->init_index(dpp, info);
11fdf7f2
TL
2374 if (r < 0) {
2375 return r;
2376 }
7c673cae 2377
b3b6e05e 2378 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true, dpp);
9f95a23c
TL
2379 if (ret == -ECANCELED) {
2380 ret = -EEXIST;
2381 }
11fdf7f2 2382 if (ret == -EEXIST) {
11fdf7f2 2383 /* we need to reread the info and return it, caller will have a use for it */
9f95a23c
TL
2384 RGWBucketInfo orig_info;
2385 r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
11fdf7f2
TL
2386 if (r < 0) {
2387 if (r == -ENOENT) {
2388 continue;
2389 }
b3b6e05e 2390 ldpp_dout(dpp, 0) << "get_bucket_info returned " << r << dendl;
11fdf7f2
TL
2391 return r;
2392 }
7c673cae 2393
11fdf7f2 2394 /* only remove it if it's a different bucket instance */
9f95a23c 2395 if (orig_info.bucket.bucket_id != bucket.bucket_id) {
b3b6e05e 2396 int r = svc.bi->clean_index(dpp, info);
9f95a23c 2397 if (r < 0) {
b3b6e05e 2398 ldpp_dout(dpp, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
9f95a23c 2399 }
b3b6e05e 2400 r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield, dpp);
9f95a23c 2401 if (r < 0) {
b3b6e05e 2402 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
9f95a23c
TL
2403 /* continue anyway */
2404 }
11fdf7f2 2405 }
9f95a23c
TL
2406
2407 info = std::move(orig_info);
2408 /* ret == -EEXIST here */
11fdf7f2 2409 }
7c673cae 2410 return ret;
7c673cae
FG
2411 }
2412
11fdf7f2 2413 /* this is highly unlikely */
b3b6e05e 2414 ldpp_dout(dpp, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
11fdf7f2 2415 return -ENOENT;
7c673cae
FG
2416}
2417
20effc67 2418// returns true on success, false on failure
11fdf7f2 2419bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
7c673cae 2420{
11fdf7f2
TL
2421 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2422}
c07f9fc5 2423
11fdf7f2
TL
2424bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2425{
2426 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
c07f9fc5 2427
11fdf7f2 2428 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
7c673cae
FG
2429}
2430
20effc67
TL
2431std::string RGWRados::get_cluster_fsid(const DoutPrefixProvider *dpp, optional_yield y)
2432{
2433 return svc.rados->cluster_fsid();
2434}
2435
b3b6e05e 2436int RGWRados::get_obj_head_ioctx(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
7c673cae
FG
2437{
2438 string oid, key;
2439 get_obj_bucket_and_oid_loc(obj, oid, key);
2440
2441 rgw_pool pool;
2442 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
b3b6e05e 2443 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
7c673cae
FG
2444 return -EIO;
2445 }
2446
b3b6e05e 2447 int r = open_pool_ctx(dpp, pool, *ioctx, false);
7c673cae
FG
2448 if (r < 0) {
2449 return r;
2450 }
2451
2452 ioctx->locator_set_key(key);
2453
2454 return 0;
2455}
2456
20effc67
TL
2457int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
2458 const rgw_placement_rule& target_placement_rule,
2459 const rgw_obj& obj,
2460 rgw_rados_ref *ref)
7c673cae 2461{
11fdf7f2 2462 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
7c673cae
FG
2463
2464 rgw_pool pool;
20effc67 2465 if (!get_obj_data_pool(target_placement_rule, obj, &pool)) {
b3b6e05e 2466 ldpp_dout(dpp, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
7c673cae
FG
2467 return -EIO;
2468 }
2469
9f95a23c
TL
2470 ref->pool = svc.rados->pool(pool);
2471
b3b6e05e 2472 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
9f95a23c 2473 .set_mostly_omap(false));
7c673cae 2474 if (r < 0) {
b3b6e05e 2475 ldpp_dout(dpp, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
7c673cae
FG
2476 return r;
2477 }
2478
9f95a23c 2479 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2480
2481 return 0;
2482}
2483
20effc67
TL
2484int RGWRados::get_obj_head_ref(const DoutPrefixProvider *dpp,
2485 const RGWBucketInfo& bucket_info,
2486 const rgw_obj& obj,
2487 rgw_rados_ref *ref)
2488{
2489 return get_obj_head_ref(dpp, bucket_info.placement_rule, obj, ref);
2490}
2491
b3b6e05e 2492int RGWRados::get_raw_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2493{
11fdf7f2 2494 ref->obj = obj;
7c673cae 2495
11fdf7f2
TL
2496 if (ref->obj.oid.empty()) {
2497 ref->obj.oid = obj.pool.to_str();
2498 ref->obj.pool = svc.zone->get_zone_params().domain_root;
7c673cae 2499 }
9f95a23c 2500 ref->pool = svc.rados->pool(obj.pool);
b3b6e05e 2501 int r = ref->pool.open(dpp, RGWSI_RADOS::OpenParams()
9f95a23c
TL
2502 .set_mostly_omap(false));
2503 if (r < 0) {
b3b6e05e 2504 ldpp_dout(dpp, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
7c673cae 2505 return r;
9f95a23c 2506 }
7c673cae 2507
9f95a23c 2508 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2509
2510 return 0;
2511}
2512
b3b6e05e 2513int RGWRados::get_system_obj_ref(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2514{
b3b6e05e 2515 return get_raw_obj_ref(dpp, obj, ref);
7c673cae
FG
2516}
2517
2518/*
2519 * fixes an issue where head objects were supposed to have a locator created, but ended
2520 * up without one
2521 */
b3b6e05e 2522int RGWRados::fix_head_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
7c673cae
FG
2523{
2524 const rgw_bucket& bucket = bucket_info.bucket;
2525 string oid;
2526 string locator;
2527
2528 rgw_obj obj(bucket, key);
2529
2530 get_obj_bucket_and_oid_loc(obj, oid, locator);
2531
2532 if (locator.empty()) {
b3b6e05e 2533 ldpp_dout(dpp, 20) << "object does not have a locator, nothing to fix" << dendl;
7c673cae
FG
2534 return 0;
2535 }
2536
2537 librados::IoCtx ioctx;
2538
b3b6e05e 2539 int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &ioctx);
7c673cae
FG
2540 if (ret < 0) {
2541 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2542 return ret;
2543 }
2544 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2545
2546 uint64_t size;
2547 bufferlist data;
2548
2549 struct timespec mtime_ts;
2550 map<string, bufferlist> attrs;
2551 librados::ObjectReadOperation op;
2552 op.getxattrs(&attrs, NULL);
2553 op.stat2(&size, &mtime_ts, NULL);
2554#define HEAD_SIZE 512 * 1024
2555 op.read(0, HEAD_SIZE, &data, NULL);
2556
b3b6e05e 2557 ret = rgw_rados_operate(dpp, ioctx, oid, &op, &data, null_yield);
7c673cae 2558 if (ret < 0) {
b3b6e05e 2559 ldpp_dout(dpp, -1) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
7c673cae
FG
2560 return ret;
2561 }
2562
2563 if (size > HEAD_SIZE) {
b3b6e05e 2564 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
7c673cae
FG
2565 return -EIO;
2566 }
2567
2568 if (size != data.length()) {
b3b6e05e 2569 ldpp_dout(dpp, -1) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
7c673cae
FG
2570 return -EIO;
2571 }
2572
2573 if (copy_obj) {
2574 librados::ObjectWriteOperation wop;
2575
2576 wop.mtime2(&mtime_ts);
2577
2578 map<string, bufferlist>::iterator iter;
2579 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
2580 wop.setxattr(iter->first.c_str(), iter->second);
2581 }
2582
2583 wop.write(0, data);
2584
2585 ioctx.locator_set_key(locator);
b3b6e05e 2586 rgw_rados_operate(dpp, ioctx, oid, &wop, null_yield);
7c673cae
FG
2587 }
2588
2589 if (remove_bad) {
2590 ioctx.locator_set_key(string());
2591
2592 ret = ioctx.remove(oid);
2593 if (ret < 0) {
b3b6e05e 2594 ldpp_dout(dpp, -1) << "ERROR: failed to remove original bad object" << dendl;
7c673cae
FG
2595 return ret;
2596 }
2597 }
2598
2599 return 0;
2600}
2601
20effc67 2602int RGWRados::move_rados_obj(const DoutPrefixProvider *dpp,
b3b6e05e 2603 librados::IoCtx& src_ioctx,
7c673cae
FG
2604 const string& src_oid, const string& src_locator,
2605 librados::IoCtx& dst_ioctx,
2606 const string& dst_oid, const string& dst_locator)
2607{
2608
2609#define COPY_BUF_SIZE (4 * 1024 * 1024)
2610 bool done = false;
2611 uint64_t chunk_size = COPY_BUF_SIZE;
2612 uint64_t ofs = 0;
2613 int ret = 0;
2614 real_time mtime;
2615 struct timespec mtime_ts;
2616 uint64_t size;
2617
2618 if (src_oid == dst_oid && src_locator == dst_locator) {
2619 return 0;
2620 }
2621
2622 src_ioctx.locator_set_key(src_locator);
2623 dst_ioctx.locator_set_key(dst_locator);
2624
2625 do {
2626 bufferlist data;
2627 ObjectReadOperation rop;
2628 ObjectWriteOperation wop;
2629
2630 if (ofs == 0) {
2631 rop.stat2(&size, &mtime_ts, NULL);
2632 mtime = real_clock::from_timespec(mtime_ts);
2633 }
2634 rop.read(ofs, chunk_size, &data, NULL);
b3b6e05e 2635 ret = rgw_rados_operate(dpp, src_ioctx, src_oid, &rop, &data, null_yield);
7c673cae
FG
2636 if (ret < 0) {
2637 goto done_err;
2638 }
2639
2640 if (data.length() == 0) {
2641 break;
2642 }
2643
2644 if (ofs == 0) {
2645 wop.create(true); /* make it exclusive */
2646 wop.mtime2(&mtime_ts);
2647 mtime = real_clock::from_timespec(mtime_ts);
2648 }
2649 wop.write(ofs, data);
b3b6e05e 2650 ret = rgw_rados_operate(dpp, dst_ioctx, dst_oid, &wop, null_yield);
11fdf7f2
TL
2651 if (ret < 0) {
2652 goto done_err;
2653 }
7c673cae
FG
2654 ofs += data.length();
2655 done = data.length() != chunk_size;
2656 } while (!done);
2657
2658 if (ofs != size) {
b3b6e05e 2659 ldpp_dout(dpp, -1) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
7c673cae
FG
2660 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
2661 ret = -EIO;
2662 goto done_err;
2663 }
2664
2665 src_ioctx.remove(src_oid);
2666
2667 return 0;
2668
2669done_err:
11fdf7f2 2670 // TODO: clean up dst_oid if we created it
b3b6e05e 2671 ldpp_dout(dpp, -1) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
7c673cae
FG
2672 return ret;
2673}
2674
2675/*
2676 * fixes an issue where head objects were supposed to have a locator created, but ended
2677 * up without one
2678 */
b3b6e05e 2679int RGWRados::fix_tail_obj_locator(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y)
7c673cae
FG
2680{
2681 const rgw_bucket& bucket = bucket_info.bucket;
2682 rgw_obj obj(bucket, key);
2683
2684 if (need_fix) {
2685 *need_fix = false;
2686 }
2687
2688 rgw_rados_ref ref;
b3b6e05e 2689 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
2690 if (r < 0) {
2691 return r;
2692 }
2693
2694 RGWObjState *astate = NULL;
9f95a23c 2695 RGWObjectCtx rctx(this->store);
b3b6e05e 2696 r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
2697 if (r < 0)
2698 return r;
2699
9f95a23c 2700 if (astate->manifest) {
7c673cae 2701 RGWObjManifest::obj_iterator miter;
9f95a23c 2702 RGWObjManifest& manifest = *astate->manifest;
b3b6e05e 2703 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
f67539c2 2704 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(store);
7c673cae
FG
2705 rgw_obj loc;
2706 string oid;
2707 string locator;
2708
9f95a23c 2709 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
7c673cae
FG
2710
2711 if (loc.key.ns.empty()) {
2712 /* continue, we're only interested in tail objects */
2713 continue;
2714 }
2715
9f95a23c
TL
2716 auto& ioctx = ref.pool.ioctx();
2717
7c673cae 2718 get_obj_bucket_and_oid_loc(loc, oid, locator);
9f95a23c 2719 ref.pool.ioctx().locator_set_key(locator);
7c673cae 2720
b3b6e05e 2721 ldpp_dout(dpp, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
7c673cae 2722
9f95a23c 2723 r = ioctx.stat(oid, NULL, NULL);
7c673cae
FG
2724 if (r != -ENOENT) {
2725 continue;
2726 }
2727
2728 string bad_loc;
2729 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
2730
2731 /* create a new ioctx with the bad locator */
2732 librados::IoCtx src_ioctx;
9f95a23c 2733 src_ioctx.dup(ioctx);
7c673cae
FG
2734 src_ioctx.locator_set_key(bad_loc);
2735
2736 r = src_ioctx.stat(oid, NULL, NULL);
2737 if (r != 0) {
2738 /* cannot find a broken part */
2739 continue;
2740 }
b3b6e05e 2741 ldpp_dout(dpp, 20) << __func__ << ": found bad object part: " << loc << dendl;
7c673cae
FG
2742 if (need_fix) {
2743 *need_fix = true;
2744 }
2745 if (fix) {
b3b6e05e 2746 r = move_rados_obj(dpp, src_ioctx, oid, bad_loc, ioctx, oid, locator);
7c673cae 2747 if (r < 0) {
b3b6e05e 2748 ldpp_dout(dpp, -1) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
7c673cae
FG
2749 }
2750 }
2751 }
2752 }
2753
2754 return 0;
2755}
2756
f64942e4
AA
2757int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2758 const rgw_obj& obj,
b3b6e05e
TL
2759 RGWBucketInfo* bucket_info_out,
2760 const DoutPrefixProvider *dpp)
7c673cae
FG
2761{
2762 bucket = _bucket;
2763
11fdf7f2 2764 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
2765
2766 RGWBucketInfo bucket_info;
f64942e4
AA
2767 RGWBucketInfo* bucket_info_p =
2768 bucket_info_out ? bucket_info_out : &bucket_info;
2769
b3b6e05e 2770 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
7c673cae
FG
2771 if (ret < 0) {
2772 return ret;
2773 }
2774
9f95a23c
TL
2775 string oid;
2776
b3b6e05e 2777 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae 2778 if (ret < 0) {
b3b6e05e 2779 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
7c673cae
FG
2780 return ret;
2781 }
b3b6e05e 2782 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2783
2784 return 0;
2785}
2786
f64942e4 2787int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
f67539c2 2788 int sid, const rgw::bucket_index_layout_generation& idx_layout,
b3b6e05e
TL
2789 RGWBucketInfo* bucket_info_out,
2790 const DoutPrefixProvider *dpp)
7c673cae
FG
2791{
2792 bucket = _bucket;
2793 shard_id = sid;
2794
11fdf7f2 2795 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 2796
f67539c2 2797
7c673cae 2798 RGWBucketInfo bucket_info;
f64942e4
AA
2799 RGWBucketInfo* bucket_info_p =
2800 bucket_info_out ? bucket_info_out : &bucket_info;
b3b6e05e 2801 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield, dpp);
7c673cae
FG
2802 if (ret < 0) {
2803 return ret;
2804 }
2805
9f95a23c
TL
2806 string oid;
2807
b3b6e05e 2808 ret = store->svc.bi_rados->open_bucket_index_shard(dpp, *bucket_info_p, shard_id, idx_layout, &bucket_obj);
7c673cae 2809 if (ret < 0) {
b3b6e05e 2810 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
7c673cae
FG
2811 return ret;
2812 }
b3b6e05e 2813 ldpp_dout(dpp, 20) << " bucket index oid: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2814
2815 return 0;
2816}
2817
b3b6e05e 2818int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info,
a8e16298
TL
2819 const rgw_obj& obj)
2820{
2821 bucket = bucket_info.bucket;
2822
b3b6e05e 2823 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info,
9f95a23c
TL
2824 obj.get_hash_object(),
2825 &bucket_obj,
2826 &shard_id);
a8e16298 2827 if (ret < 0) {
b3b6e05e 2828 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
a8e16298
TL
2829 return ret;
2830 }
b3b6e05e 2831 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
a8e16298
TL
2832
2833 return 0;
2834}
2835
b3b6e05e 2836int RGWRados::BucketShard::init(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw::bucket_index_layout_generation& idx_layout, int sid)
b32b8144
FG
2837{
2838 bucket = bucket_info.bucket;
2839 shard_id = sid;
2840
b3b6e05e 2841 int ret = store->svc.bi_rados->open_bucket_index_shard(dpp, bucket_info, shard_id, idx_layout, &bucket_obj);
b32b8144 2842 if (ret < 0) {
b3b6e05e 2843 ldpp_dout(dpp, 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
b32b8144
FG
2844 return ret;
2845 }
b3b6e05e 2846 ldpp_dout(dpp, 20) << " bucket index object: " << bucket_obj << dendl;
b32b8144
FG
2847
2848 return 0;
2849}
2850
7c673cae
FG
2851
2852/* Execute @handler on last item in bucket listing for bucket specified
2853 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2854 * to objects matching these criterias. */
20effc67 2855int RGWRados::on_last_entry_in_listing(const DoutPrefixProvider *dpp,
b3b6e05e 2856 RGWBucketInfo& bucket_info,
7c673cae
FG
2857 const std::string& obj_prefix,
2858 const std::string& obj_delim,
2859 std::function<int(const rgw_bucket_dir_entry&)> handler)
2860{
2861 RGWRados::Bucket target(this, bucket_info);
2862 RGWRados::Bucket::List list_op(&target);
2863
2864 list_op.params.prefix = obj_prefix;
2865 list_op.params.delim = obj_delim;
2866
b3b6e05e 2867 ldpp_dout(dpp, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
7c673cae
FG
2868 << ", obj_prefix=" << obj_prefix
2869 << ", obj_delim=" << obj_delim
2870 << dendl;
2871
2872 bool is_truncated = false;
2873
2874 boost::optional<rgw_bucket_dir_entry> last_entry;
2875 /* We need to rewind to the last object in a listing. */
2876 do {
2877 /* List bucket entries in chunks. */
2878 static constexpr int MAX_LIST_OBJS = 100;
2879 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
2880
b3b6e05e 2881 int ret = list_op.list_objects(dpp, MAX_LIST_OBJS, &entries, nullptr,
9f95a23c 2882 &is_truncated, null_yield);
7c673cae
FG
2883 if (ret < 0) {
2884 return ret;
2885 } else if (!entries.empty()) {
2886 last_entry = entries.back();
2887 }
2888 } while (is_truncated);
2889
2890 if (last_entry) {
2891 return handler(*last_entry);
2892 }
2893
2894 /* Empty listing - no items we can run handler on. */
2895 return 0;
2896}
2897
20effc67 2898bool RGWRados::swift_versioning_enabled(rgw::sal::Bucket* bucket) const
f67539c2
TL
2899{
2900 return bucket->get_info().has_swift_versioning() &&
2901 bucket->get_info().swift_ver_location.size();
2902}
7c673cae
FG
2903
2904int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
2905 const rgw_user& user,
20effc67
TL
2906 rgw::sal::Bucket* bucket,
2907 rgw::sal::Object* obj,
9f95a23c
TL
2908 const DoutPrefixProvider *dpp,
2909 optional_yield y)
7c673cae 2910{
f67539c2 2911 if (! swift_versioning_enabled(bucket)) {
7c673cae
FG
2912 return 0;
2913 }
2914
f67539c2 2915 obj->set_atomic(&obj_ctx);
7c673cae
FG
2916
2917 RGWObjState * state = nullptr;
b3b6e05e 2918 int r = get_obj_state(dpp, &obj_ctx, bucket->get_info(), obj->get_obj(), &state, false, y);
7c673cae
FG
2919 if (r < 0) {
2920 return r;
2921 }
2922
2923 if (!state->exists) {
2924 return 0;
2925 }
2926
f67539c2 2927 const string& src_name = obj->get_oid();
7c673cae
FG
2928 char buf[src_name.size() + 32];
2929 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
2930 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
2931 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
2932
2933 RGWBucketInfo dest_bucket_info;
2934
f67539c2 2935 r = get_bucket_info(&svc, bucket->get_tenant(), bucket->get_info().swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
7c673cae 2936 if (r < 0) {
b3b6e05e 2937 ldpp_dout(dpp, 10) << "failed to read dest bucket info: r=" << r << dendl;
7c673cae
FG
2938 if (r == -ENOENT) {
2939 return -ERR_PRECONDITION_FAILED;
2940 }
2941 return r;
2942 }
2943
f67539c2 2944 if (dest_bucket_info.owner != bucket->get_info().owner) {
7c673cae
FG
2945 return -ERR_PRECONDITION_FAILED;
2946 }
2947
20effc67
TL
2948 rgw::sal::RadosBucket dest_bucket(store, dest_bucket_info);
2949 rgw::sal::RadosObject dest_obj(store, rgw_obj_key(buf), &dest_bucket);
11fdf7f2
TL
2950
2951 if (dest_bucket_info.versioning_enabled()){
f67539c2 2952 dest_obj.gen_rand_obj_instance_name();
11fdf7f2
TL
2953 }
2954
f67539c2 2955 dest_obj.set_atomic(&obj_ctx);
7c673cae 2956
9f95a23c 2957 rgw_zone_id no_zone;
7c673cae
FG
2958
2959 r = copy_obj(obj_ctx,
2960 user,
7c673cae
FG
2961 NULL, /* req_info *info */
2962 no_zone,
f67539c2 2963 &dest_obj,
7c673cae 2964 obj,
f67539c2
TL
2965 &dest_bucket,
2966 bucket,
2967 bucket->get_placement_rule(),
7c673cae
FG
2968 NULL, /* time_t *src_mtime */
2969 NULL, /* time_t *mtime */
2970 NULL, /* const time_t *mod_ptr */
2971 NULL, /* const time_t *unmod_ptr */
2972 false, /* bool high_precision_time */
2973 NULL, /* const char *if_match */
2974 NULL, /* const char *if_nomatch */
2975 RGWRados::ATTRSMOD_NONE,
2976 true, /* bool copy_if_newer */
2977 state->attrset,
11fdf7f2 2978 RGWObjCategory::Main,
7c673cae
FG
2979 0, /* uint64_t olh_epoch */
2980 real_time(), /* time_t delete_at */
2981 NULL, /* string *version_id */
2982 NULL, /* string *ptag */
2983 NULL, /* string *petag */
7c673cae 2984 NULL, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2985 NULL, /* void *progress_data */
2986 dpp,
2987 null_yield);
7c673cae
FG
2988 if (r == -ECANCELED || r == -ENOENT) {
2989 /* Has already been overwritten, meaning another rgw process already
2990 * copied it out */
2991 return 0;
2992 }
2993
2994 return r;
2995}
2996
9f95a23c 2997int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
7c673cae 2998 const rgw_user& user,
20effc67
TL
2999 rgw::sal::Bucket* bucket,
3000 rgw::sal::Object* obj,
9f95a23c
TL
3001 bool& restored, /* out */
3002 const DoutPrefixProvider *dpp)
7c673cae 3003{
f67539c2 3004 if (! swift_versioning_enabled(bucket)) {
7c673cae
FG
3005 return 0;
3006 }
3007
3008 /* Bucket info of the bucket that stores previous versions of our object. */
3009 RGWBucketInfo archive_binfo;
3010
f67539c2
TL
3011 int ret = get_bucket_info(&svc, bucket->get_tenant(),
3012 bucket->get_info().swift_ver_location,
3013 archive_binfo, nullptr, null_yield, nullptr);
7c673cae
FG
3014 if (ret < 0) {
3015 return ret;
3016 }
3017
3018 /* Abort the operation if the bucket storing our archive belongs to someone
3019 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3020 * into consideration. For we can live with that.
3021 *
3022 * TODO: delegate this check to un upper layer and compare with ACLs. */
f67539c2 3023 if (bucket->get_info().owner != archive_binfo.owner) {
7c673cae
FG
3024 return -EPERM;
3025 }
3026
3027 /* This code will be executed on latest version of the object. */
3028 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
9f95a23c 3029 rgw_zone_id no_zone;
7c673cae
FG
3030
3031 /* We don't support object versioning of Swift API on those buckets that
3032 * are already versioned using the S3 mechanism. This affects also bucket
3033 * storing archived objects. Otherwise the delete operation would create
3034 * a deletion marker. */
3035 if (archive_binfo.versioned()) {
3036 restored = false;
3037 return -ERR_PRECONDITION_FAILED;
3038 }
3039
3040 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3041 * irrelevant and may be safely skipped. */
3042 std::map<std::string, ceph::bufferlist> no_attrs;
3043
20effc67
TL
3044 rgw::sal::RadosBucket archive_bucket(store, archive_binfo);
3045 rgw::sal::RadosObject archive_obj(store, entry.key, &archive_bucket);
11fdf7f2 3046
f67539c2
TL
3047 if (bucket->versioning_enabled()){
3048 obj->gen_rand_obj_instance_name();
11fdf7f2
TL
3049 }
3050
f67539c2
TL
3051 archive_obj.set_atomic(&obj_ctx);
3052 obj->set_atomic(&obj_ctx);
7c673cae
FG
3053
3054 int ret = copy_obj(obj_ctx,
3055 user,
7c673cae
FG
3056 nullptr, /* req_info *info */
3057 no_zone,
3058 obj, /* dest obj */
f67539c2
TL
3059 &archive_obj, /* src obj */
3060 bucket, /* dest bucket info */
3061 &archive_bucket, /* src bucket info */
3062 bucket->get_placement_rule(), /* placement_rule */
7c673cae
FG
3063 nullptr, /* time_t *src_mtime */
3064 nullptr, /* time_t *mtime */
3065 nullptr, /* const time_t *mod_ptr */
3066 nullptr, /* const time_t *unmod_ptr */
3067 false, /* bool high_precision_time */
3068 nullptr, /* const char *if_match */
3069 nullptr, /* const char *if_nomatch */
3070 RGWRados::ATTRSMOD_NONE,
3071 true, /* bool copy_if_newer */
3072 no_attrs,
11fdf7f2 3073 RGWObjCategory::Main,
7c673cae
FG
3074 0, /* uint64_t olh_epoch */
3075 real_time(), /* time_t delete_at */
3076 nullptr, /* string *version_id */
3077 nullptr, /* string *ptag */
3078 nullptr, /* string *petag */
7c673cae 3079 nullptr, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
3080 nullptr, /* void *progress_data */
3081 dpp,
3082 null_yield);
7c673cae
FG
3083 if (ret == -ECANCELED || ret == -ENOENT) {
3084 /* Has already been overwritten, meaning another rgw process already
3085 * copied it out */
3086 return 0;
3087 } else if (ret < 0) {
3088 return ret;
3089 } else {
3090 restored = true;
3091 }
3092
3093 /* Need to remove the archived copy. */
b3b6e05e 3094 ret = delete_obj(dpp, obj_ctx, archive_binfo, archive_obj.get_obj(),
7c673cae
FG
3095 archive_binfo.versioning_status());
3096
3097 return ret;
3098 };
3099
f67539c2 3100 const std::string& obj_name = obj->get_oid();
7c673cae
FG
3101 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
3102 % obj_name);
3103
b3b6e05e 3104 return on_last_entry_in_listing(dpp, archive_binfo, prefix, std::string(),
7c673cae
FG
3105 handler);
3106}
3107
b3b6e05e
TL
3108int RGWRados::Object::Write::_do_write_meta(const DoutPrefixProvider *dpp,
3109 uint64_t size, uint64_t accounted_size,
181888fb
FG
3110 map<string, bufferlist>& attrs,
3111 bool assume_noent, bool modify_tail,
9f95a23c 3112 void *_index_op, optional_yield y)
7c673cae
FG
3113{
3114 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
3115 RGWRados *store = target->get_store();
3116
3117 ObjectWriteOperation op;
11fdf7f2
TL
3118#ifdef WITH_LTTNG
3119 const struct req_state* s = get_req_state();
3120 string req_id;
3121 if (!s) {
3122 // fake req_id
3123 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
3124 } else {
3125 req_id = s->req_id;
3126 }
3127#endif
7c673cae
FG
3128
3129 RGWObjState *state;
b3b6e05e 3130 int r = target->get_state(dpp, &state, false, y, assume_noent);
7c673cae
FG
3131 if (r < 0)
3132 return r;
3133
3134 rgw_obj& obj = target->get_obj();
3135
3136 if (obj.get_oid().empty()) {
b3b6e05e 3137 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
7c673cae
FG
3138 return -EIO;
3139 }
3140
224ce89b 3141 rgw_rados_ref ref;
20effc67 3142 r = store->get_obj_head_ref(dpp, target->get_meta_placement_rule(), obj, &ref);
7c673cae
FG
3143 if (r < 0)
3144 return r;
3145
3146 bool is_olh = state->is_olh;
3147
3148 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
3149
3150 const string *ptag = meta.ptag;
3151 if (!ptag && !index_op->get_optag()->empty()) {
3152 ptag = index_op->get_optag();
3153 }
b3b6e05e 3154 r = target->prepare_atomic_modification(dpp, op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
7c673cae
FG
3155 if (r < 0)
3156 return r;
3157
3158 if (real_clock::is_zero(meta.set_mtime)) {
3159 meta.set_mtime = real_clock::now();
3160 }
3161
eafe8130
TL
3162 if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
3163 auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
3164 if (iter == attrs.end()) {
3165 real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
3166 string mode = target->bucket_info.obj_lock.get_mode();
3167 RGWObjectRetention obj_retention(mode, lock_until_date);
3168 bufferlist bl;
3169 obj_retention.encode(bl);
3170 op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
3171 }
3172 }
3173
7c673cae
FG
3174 if (state->is_olh) {
3175 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3176 }
3177
3178 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3179 op.mtime2(&mtime_ts);
3180
3181 if (meta.data) {
3182 /* if we want to overwrite the data, we also want to overwrite the
3183 xattrs, so just remove the object */
3184 op.write_full(*meta.data);
20effc67
TL
3185 if (state->compressed) {
3186 uint32_t alloc_hint_flags = librados::ALLOC_HINT_FLAG_INCOMPRESSIBLE;
3187 op.set_alloc_hint2(0, 0, alloc_hint_flags);
3188 }
7c673cae
FG
3189 }
3190
3191 string etag;
3192 string content_type;
3193 bufferlist acl_bl;
11fdf7f2 3194 string storage_class;
7c673cae
FG
3195
3196 map<string, bufferlist>::iterator iter;
3197 if (meta.rmattrs) {
3198 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3199 const string& name = iter->first;
3200 op.rmxattr(name.c_str());
3201 }
3202 }
3203
3204 if (meta.manifest) {
11fdf7f2
TL
3205 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3206
7c673cae
FG
3207 /* remove existing manifest attr */
3208 iter = attrs.find(RGW_ATTR_MANIFEST);
3209 if (iter != attrs.end())
3210 attrs.erase(iter);
3211
3212 bufferlist bl;
11fdf7f2 3213 encode(*meta.manifest, bl);
7c673cae
FG
3214 op.setxattr(RGW_ATTR_MANIFEST, bl);
3215 }
3216
3217 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3218 const string& name = iter->first;
3219 bufferlist& bl = iter->second;
3220
3221 if (!bl.length())
3222 continue;
3223
3224 op.setxattr(name.c_str(), bl);
3225
3226 if (name.compare(RGW_ATTR_ETAG) == 0) {
11fdf7f2 3227 etag = rgw_bl_str(bl);
7c673cae 3228 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
11fdf7f2 3229 content_type = rgw_bl_str(bl);
7c673cae
FG
3230 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3231 acl_bl = bl;
3232 }
3233 }
3234 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3235 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3236 }
3237
3238 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3239 bufferlist bl;
11fdf7f2 3240 encode(store->svc.zone->get_zone_short_id(), bl);
7c673cae
FG
3241 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3242 }
3243
11fdf7f2
TL
3244 if (!storage_class.empty()) {
3245 bufferlist bl;
3246 bl.append(storage_class);
3247 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3248 }
3249
7c673cae
FG
3250 if (!op.size())
3251 return 0;
3252
3253 uint64_t epoch;
3254 int64_t poolid;
224ce89b
WB
3255 bool orig_exists;
3256 uint64_t orig_size;
3257
3258 if (!reset_obj) { //Multipart upload, it has immutable head.
3259 orig_exists = false;
3260 orig_size = 0;
3261 } else {
3262 orig_exists = state->exists;
3263 orig_size = state->accounted_size;
3264 }
7c673cae 3265
91327a77
AA
3266 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3267 !obj.key.instance.empty();
7c673cae
FG
3268
3269 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3270
3271 if (versioned_op) {
3272 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3273 }
3274
3275 if (!index_op->is_prepared()) {
11fdf7f2 3276 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
b3b6e05e 3277 r = index_op->prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
11fdf7f2 3278 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
7c673cae
FG
3279 if (r < 0)
3280 return r;
3281 }
3282
9f95a23c
TL
3283 auto& ioctx = ref.pool.ioctx();
3284
11fdf7f2 3285 tracepoint(rgw_rados, operate_enter, req_id.c_str());
b3b6e05e 3286 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
11fdf7f2 3287 tracepoint(rgw_rados, operate_exit, req_id.c_str());
7c673cae
FG
3288 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3289 or -ENOENT if was removed, or -EEXIST if it did not exist
3290 before and now it does */
3291 if (r == -EEXIST && assume_noent) {
3292 target->invalidate_state();
3293 return r;
3294 }
3295 goto done_cancel;
3296 }
3297
9f95a23c
TL
3298 epoch = ioctx.get_last_version();
3299 poolid = ioctx.get_id();
7c673cae 3300
b3b6e05e 3301 r = target->complete_atomic_modification(dpp);
7c673cae 3302 if (r < 0) {
b3b6e05e 3303 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
7c673cae
FG
3304 }
3305
11fdf7f2 3306 tracepoint(rgw_rados, complete_enter, req_id.c_str());
b3b6e05e 3307 r = index_op->complete(dpp, poolid, epoch, size, accounted_size,
11fdf7f2
TL
3308 meta.set_mtime, etag, content_type,
3309 storage_class, &acl_bl,
3310 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3311 tracepoint(rgw_rados, complete_exit, req_id.c_str());
7c673cae
FG
3312 if (r < 0)
3313 goto done_cancel;
3314
3315 if (meta.mtime) {
3316 *meta.mtime = meta.set_mtime;
3317 }
3318
3319 /* note that index_op was using state so we couldn't invalidate it earlier */
3320 target->invalidate_state();
3321 state = NULL;
3322
91327a77 3323 if (versioned_op && meta.olh_epoch) {
b3b6e05e 3324 r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
7c673cae
FG
3325 if (r < 0) {
3326 return r;
3327 }
3328 }
3329
3330 if (!real_clock::is_zero(meta.delete_at)) {
3331 rgw_obj_index_key obj_key;
3332 obj.key.get_index_key(&obj_key);
3333
b3b6e05e 3334 r = store->obj_expirer->hint_add(dpp, meta.delete_at, obj.bucket.tenant, obj.bucket.name,
9f95a23c 3335 obj.bucket.bucket_id, obj_key);
7c673cae 3336 if (r < 0) {
b3b6e05e 3337 ldpp_dout(dpp, 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
7c673cae
FG
3338 /* ignoring error, nothing we can do at this point */
3339 }
3340 }
3341 meta.canceled = false;
3342
3343 /* update quota cache */
3efd9988
FG
3344 if (meta.completeMultipart){
3345 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3346 0, orig_size);
3347 }
3348 else {
3349 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3350 accounted_size, orig_size);
3351 }
7c673cae
FG
3352 return 0;
3353
3354done_cancel:
20effc67 3355 int ret = index_op->cancel(dpp, meta.remove_objs);
7c673cae 3356 if (ret < 0) {
b3b6e05e 3357 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
7c673cae
FG
3358 }
3359
3360 meta.canceled = true;
3361
3362 /* we lost in a race. There are a few options:
3363 * - existing object was rewritten (ECANCELED)
3364 * - non existing object was created (EEXIST)
3365 * - object was removed (ENOENT)
3366 * should treat it as a success
3367 */
3368 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3369 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3370 r = 0;
3371 }
3372 } else {
3373 if (meta.if_match != NULL) {
3374 // only overwrite existing object
3375 if (strcmp(meta.if_match, "*") == 0) {
3376 if (r == -ENOENT) {
3377 r = -ERR_PRECONDITION_FAILED;
3378 } else if (r == -ECANCELED) {
3379 r = 0;
3380 }
3381 }
3382 }
3383
3384 if (meta.if_nomatch != NULL) {
3385 // only create a new object
3386 if (strcmp(meta.if_nomatch, "*") == 0) {
3387 if (r == -EEXIST) {
3388 r = -ERR_PRECONDITION_FAILED;
3389 } else if (r == -ENOENT) {
3390 r = 0;
3391 }
3392 }
3393 }
3394 }
3395
3396 return r;
3397}
3398
b3b6e05e 3399int RGWRados::Object::Write::write_meta(const DoutPrefixProvider *dpp, uint64_t size, uint64_t accounted_size,
9f95a23c 3400 map<string, bufferlist>& attrs, optional_yield y)
7c673cae
FG
3401{
3402 RGWBucketInfo& bucket_info = target->get_bucket_info();
3403
3404 RGWRados::Bucket bop(target->get_store(), bucket_info);
3405 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
3406 index_op.set_zones_trace(meta.zones_trace);
3407
7c673cae
FG
3408 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3409 int r;
3410 if (assume_noent) {
b3b6e05e 3411 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3412 if (r == -EEXIST) {
3413 assume_noent = false;
3414 }
3415 }
3416 if (!assume_noent) {
b3b6e05e 3417 r = _do_write_meta(dpp, size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3418 }
3419 return r;
3420}
3421
11fdf7f2 3422class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
7c673cae 3423{
b3b6e05e 3424 const DoutPrefixProvider *dpp;
7c673cae
FG
3425 CephContext* cct;
3426 rgw_obj obj;
20effc67 3427 rgw::sal::DataProcessor *filter;
7c673cae 3428 boost::optional<RGWPutObj_Compress>& compressor;
adb31ebb
TL
3429 bool try_etag_verify;
3430 rgw::putobj::etag_verifier_ptr etag_verifier;
11fdf7f2 3431 boost::optional<rgw::putobj::ChunkProcessor> buffering;
7c673cae 3432 CompressorRef& plugin;
20effc67 3433 rgw::sal::ObjectProcessor *processor;
7c673cae
FG
3434 void (*progress_cb)(off_t, void *);
3435 void *progress_data;
adb31ebb
TL
3436 bufferlist extra_data_bl, manifest_bl;
3437 std::optional<RGWCompressionInfo> compression_info;
11fdf7f2
TL
3438 uint64_t extra_data_left{0};
3439 bool need_to_process_attrs{true};
3440 uint64_t data_len{0};
7c673cae 3441 map<string, bufferlist> src_attrs;
11fdf7f2
TL
3442 uint64_t ofs{0};
3443 uint64_t lofs{0}; /* logical ofs */
9f95a23c 3444 std::function<int(map<string, bufferlist>&)> attrs_handler;
20effc67 3445
7c673cae 3446public:
20effc67 3447 RGWRadosPutObj(const DoutPrefixProvider *dpp,
b3b6e05e 3448 CephContext* cct,
7c673cae
FG
3449 CompressorRef& plugin,
3450 boost::optional<RGWPutObj_Compress>& compressor,
20effc67 3451 rgw::sal::ObjectProcessor *p,
7c673cae 3452 void (*_progress_cb)(off_t, void *),
11fdf7f2 3453 void *_progress_data,
9f95a23c 3454 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
b3b6e05e 3455 dpp(dpp),
7c673cae
FG
3456 cct(cct),
3457 filter(p),
3458 compressor(compressor),
adb31ebb 3459 try_etag_verify(cct->_conf->rgw_sync_obj_etag_verify),
7c673cae
FG
3460 plugin(plugin),
3461 processor(p),
7c673cae
FG
3462 progress_cb(_progress_cb),
3463 progress_data(_progress_data),
11fdf7f2 3464 attrs_handler(_attrs_handler) {}
7c673cae 3465
20effc67 3466
7c673cae
FG
3467 int process_attrs(void) {
3468 if (extra_data_bl.length()) {
3469 JSONParser jp;
3470 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
b3b6e05e 3471 ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7c673cae
FG
3472 return -EIO;
3473 }
3474
3475 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3476
adb31ebb
TL
3477 auto iter = src_attrs.find(RGW_ATTR_COMPRESSION);
3478 if (iter != src_attrs.end()) {
3479 const bufferlist bl = std::move(iter->second);
3480 src_attrs.erase(iter); // don't preserve source compression info
3481
3482 if (try_etag_verify) {
3483 // if we're trying to verify etags, we need to convert compressed
3484 // ranges in the manifest back into logical multipart part offsets
3485 RGWCompressionInfo info;
3486 bool compressed = false;
3487 int r = rgw_compression_info_from_attr(bl, compressed, info);
3488 if (r < 0) {
b3b6e05e 3489 ldpp_dout(dpp, 4) << "failed to decode compression info, "
adb31ebb
TL
3490 "disabling etag verification" << dendl;
3491 try_etag_verify = false;
3492 } else if (compressed) {
3493 compression_info = std::move(info);
3494 }
3495 }
3496 }
3497 /* We need the manifest to recompute the ETag for verification */
3498 iter = src_attrs.find(RGW_ATTR_MANIFEST);
3499 if (iter != src_attrs.end()) {
3500 manifest_bl = std::move(iter->second);
3501 src_attrs.erase(iter);
3502 }
a8e16298
TL
3503
3504 // filter out olh attributes
adb31ebb 3505 iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
a8e16298
TL
3506 while (iter != src_attrs.end()) {
3507 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3508 break;
3509 }
3510 iter = src_attrs.erase(iter);
3511 }
7c673cae
FG
3512 }
3513
11fdf7f2
TL
3514 int ret = attrs_handler(src_attrs);
3515 if (ret < 0) {
3516 return ret;
3517 }
3518
7c673cae
FG
3519 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3520 //do not compress if object is encrypted
3521 compressor = boost::in_place(cct, plugin, filter);
11fdf7f2
TL
3522 // add a filter that buffers data so we don't try to compress tiny blocks.
3523 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3524 // compression ratio
28e407b8
AA
3525 constexpr unsigned buffer_size = 512 * 1024;
3526 buffering = boost::in_place(&*compressor, buffer_size);
3527 filter = &*buffering;
7c673cae 3528 }
11fdf7f2 3529
adb31ebb
TL
3530 /*
3531 * Presently we don't support ETag based verification if encryption is
3532 * requested. We can enable simultaneous support once we have a mechanism
3533 * to know the sequence in which the filters must be applied.
3534 */
3535 if (try_etag_verify && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
b3b6e05e 3536 ret = rgw::putobj::create_etag_verifier(dpp, cct, filter, manifest_bl,
adb31ebb
TL
3537 compression_info,
3538 etag_verifier);
3539 if (ret < 0) {
b3b6e05e 3540 ldpp_dout(dpp, 4) << "failed to initial etag verifier, "
adb31ebb
TL
3541 "disabling etag verification" << dendl;
3542 } else {
3543 filter = etag_verifier.get();
3544 }
3545 }
3546
11fdf7f2
TL
3547 need_to_process_attrs = false;
3548
7c673cae
FG
3549 return 0;
3550 }
3551
11fdf7f2 3552 int handle_data(bufferlist& bl, bool *pause) override {
7c673cae 3553 if (progress_cb) {
11fdf7f2 3554 progress_cb(data_len, progress_data);
7c673cae 3555 }
b32b8144 3556 if (extra_data_left) {
11fdf7f2 3557 uint64_t extra_len = bl.length();
b32b8144
FG
3558 if (extra_len > extra_data_left)
3559 extra_len = extra_data_left;
7c673cae
FG
3560
3561 bufferlist extra;
3562 bl.splice(0, extra_len, &extra);
3563 extra_data_bl.append(extra);
3564
b32b8144
FG
3565 extra_data_left -= extra_len;
3566 if (extra_data_left == 0) {
7c673cae
FG
3567 int res = process_attrs();
3568 if (res < 0)
3569 return res;
3570 }
11fdf7f2 3571 ofs += extra_len;
7c673cae
FG
3572 if (bl.length() == 0) {
3573 return 0;
3574 }
3575 }
11fdf7f2
TL
3576 if (need_to_process_attrs) {
3577 /* need to call process_attrs() even if we don't get any attrs,
3578 * need it to call attrs_handler().
3579 */
3580 int res = process_attrs();
3581 if (res < 0) {
3582 return res;
3583 }
3584 }
7c673cae 3585
11fdf7f2 3586 ceph_assert(uint64_t(ofs) >= extra_data_len);
7c673cae 3587
11fdf7f2
TL
3588 uint64_t size = bl.length();
3589 ofs += size;
7c673cae 3590
11fdf7f2
TL
3591 const uint64_t lofs = data_len;
3592 data_len += size;
7c673cae 3593
11fdf7f2 3594 return filter->process(std::move(bl), lofs);
7c673cae
FG
3595 }
3596
28e407b8 3597 int flush() {
11fdf7f2 3598 return filter->process({}, data_len);
28e407b8
AA
3599 }
3600
7c673cae
FG
3601 bufferlist& get_extra_data() { return extra_data_bl; }
3602
3603 map<string, bufferlist>& get_attrs() { return src_attrs; }
3604
3605 void set_extra_data_len(uint64_t len) override {
b32b8144 3606 extra_data_left = len;
11fdf7f2 3607 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
7c673cae
FG
3608 }
3609
3610 uint64_t get_data_len() {
3611 return data_len;
3612 }
adb31ebb
TL
3613
3614 std::string get_verifier_etag() {
3615 if (etag_verifier) {
3616 etag_verifier->calculate_etag();
3617 return etag_verifier->get_calculated_etag();
3618 } else {
3619 return "";
3620 }
3621 }
7c673cae
FG
3622};
3623
3624/*
3625 * prepare attrset depending on attrs_mod.
3626 */
3627static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3628 map<string, bufferlist>& attrs,
3629 RGWRados::AttrsMod attrs_mod)
3630{
3631 switch (attrs_mod) {
3632 case RGWRados::ATTRSMOD_NONE:
3633 attrs = src_attrs;
3634 break;
3635 case RGWRados::ATTRSMOD_REPLACE:
3636 if (!attrs[RGW_ATTR_ETAG].length()) {
3637 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3638 }
181888fb
FG
3639 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3640 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3641 if (ttiter != src_attrs.end()) {
3642 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3643 }
3644 }
7c673cae
FG
3645 break;
3646 case RGWRados::ATTRSMOD_MERGE:
3647 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3648 if (attrs.find(it->first) == attrs.end()) {
3649 attrs[it->first] = it->second;
3650 }
3651 }
3652 break;
3653 }
3654}
3655
20effc67 3656int RGWRados::rewrite_obj(rgw::sal::Object* obj, const DoutPrefixProvider *dpp, optional_yield y)
7c673cae 3657{
9f95a23c 3658 RGWObjectCtx rctx(this->store);
20effc67
TL
3659 rgw::sal::Attrs attrset;
3660 uint64_t obj_size;
3661 ceph::real_time mtime;
3662 RGWRados::Object op_target(this, obj->get_bucket()->get_info(), rctx, obj->get_obj());
3663 RGWRados::Object::Read read_op(&op_target);
3664
3665 read_op.params.attrs = &attrset;
3666 read_op.params.obj_size = &obj_size;
3667 read_op.params.lastmod = &mtime;
3668
3669 int ret = read_op.prepare(y, dpp);
3670 if (ret < 0)
3671 return ret;
7c673cae 3672
20effc67
TL
3673 attrset.erase(RGW_ATTR_ID_TAG);
3674 attrset.erase(RGW_ATTR_TAIL_TAG);
3675
3676 return store->getRados()->copy_obj_data(rctx, obj->get_bucket(),
3677 obj->get_bucket()->get_info().placement_rule,
3678 read_op, obj_size - 1, obj, NULL, mtime,
3679 attrset, 0, real_time(), NULL, dpp, y);
7c673cae
FG
3680}
3681
3682struct obj_time_weight {
3683 real_time mtime;
3684 uint32_t zone_short_id;
3685 uint64_t pg_ver;
3686 bool high_precision;
3687
3688 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3689
3690 bool compare_low_precision(const obj_time_weight& rhs) {
3691 struct timespec l = ceph::real_clock::to_timespec(mtime);
3692 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
3693 l.tv_nsec = 0;
3694 r.tv_nsec = 0;
3695 if (l > r) {
3696 return false;
3697 }
3698 if (l < r) {
3699 return true;
3700 }
11fdf7f2
TL
3701 if (!zone_short_id || !rhs.zone_short_id) {
3702 /* don't compare zone ids, if one wasn't provided */
3703 return false;
3704 }
7c673cae
FG
3705 if (zone_short_id != rhs.zone_short_id) {
3706 return (zone_short_id < rhs.zone_short_id);
3707 }
3708 return (pg_ver < rhs.pg_ver);
3709
3710 }
3711
3712 bool operator<(const obj_time_weight& rhs) {
3713 if (!high_precision || !rhs.high_precision) {
3714 return compare_low_precision(rhs);
3715 }
3716 if (mtime > rhs.mtime) {
3717 return false;
3718 }
3719 if (mtime < rhs.mtime) {
3720 return true;
3721 }
11fdf7f2
TL
3722 if (!zone_short_id || !rhs.zone_short_id) {
3723 /* don't compare zone ids, if one wasn't provided */
3724 return false;
3725 }
7c673cae
FG
3726 if (zone_short_id != rhs.zone_short_id) {
3727 return (zone_short_id < rhs.zone_short_id);
3728 }
3729 return (pg_ver < rhs.pg_ver);
3730 }
3731
3732 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
3733 mtime = _mtime;
3734 zone_short_id = _short_id;
3735 pg_ver = _pg_ver;
3736 }
3737
3738 void init(RGWObjState *state) {
3739 mtime = state->mtime;
3740 zone_short_id = state->zone_short_id;
3741 pg_ver = state->pg_ver;
3742 }
3743};
3744
3745inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
3746 out << o.mtime;
3747
3748 if (o.zone_short_id != 0 || o.pg_ver != 0) {
3749 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
3750 }
3751
3752 return out;
3753}
3754
11fdf7f2 3755class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
7c673cae
FG
3756 bufferlist extra_data;
3757public:
3758 RGWGetExtraDataCB() {}
11fdf7f2
TL
3759 int handle_data(bufferlist& bl, bool *pause) override {
3760 int bl_len = (int)bl.length();
7c673cae
FG
3761 if (extra_data.length() < extra_data_len) {
3762 off_t max = extra_data_len - extra_data.length();
3763 if (max > bl_len) {
3764 max = bl_len;
3765 }
3766 bl.splice(0, max, &extra_data);
3767 }
3768 return bl_len;
3769 }
3770
3771 bufferlist& get_extra_data() {
3772 return extra_data;
3773 }
3774};
3775
b3b6e05e
TL
3776int RGWRados::stat_remote_obj(const DoutPrefixProvider *dpp,
3777 RGWObjectCtx& obj_ctx,
7c673cae 3778 const rgw_user& user_id,
7c673cae 3779 req_info *info,
9f95a23c 3780 const rgw_zone_id& source_zone,
20effc67 3781 rgw::sal::Object* src_obj,
9f95a23c 3782 const RGWBucketInfo *src_bucket_info,
7c673cae
FG
3783 real_time *src_mtime,
3784 uint64_t *psize,
3785 const real_time *mod_ptr,
3786 const real_time *unmod_ptr,
3787 bool high_precision_time,
3788 const char *if_match,
3789 const char *if_nomatch,
3790 map<string, bufferlist> *pattrs,
11fdf7f2 3791 map<string, string> *pheaders,
7c673cae
FG
3792 string *version_id,
3793 string *ptag,
3794 string *petag)
3795{
3796 /* source is in a different zonegroup, copy from there */
3797
3798 RGWRESTStreamRWRequest *in_stream_req;
3799 string tag;
3800 map<string, bufferlist> src_attrs;
3801 append_rand_alpha(cct, tag, tag, 32);
3802 obj_time_weight set_mtime_weight;
3803 set_mtime_weight.high_precision = high_precision_time;
3804
3805 RGWRESTConn *conn;
3806 if (source_zone.empty()) {
9f95a23c 3807 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
7c673cae 3808 /* source is in the master zonegroup */
11fdf7f2 3809 conn = svc.zone->get_master_conn();
7c673cae 3810 } else {
11fdf7f2 3811 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
9f95a23c 3812 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
7c673cae 3813 if (iter == zonegroup_conn_map.end()) {
20effc67 3814 ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7c673cae
FG
3815 return -ENOENT;
3816 }
3817 conn = iter->second;
3818 }
3819 } else {
11fdf7f2 3820 auto& zone_conn_map = svc.zone->get_zone_conn_map();
9f95a23c 3821 auto iter = zone_conn_map.find(source_zone);
7c673cae 3822 if (iter == zone_conn_map.end()) {
20effc67 3823 ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
7c673cae
FG
3824 return -ENOENT;
3825 }
3826 conn = iter->second;
3827 }
3828
3829 RGWGetExtraDataCB cb;
7c673cae
FG
3830 map<string, string> req_headers;
3831 real_time set_mtime;
3832
3833 const real_time *pmod = mod_ptr;
3834
3835 obj_time_weight dest_mtime_weight;
3836
181888fb
FG
3837 constexpr bool prepend_meta = true;
3838 constexpr bool get_op = true;
3839 constexpr bool rgwx_stat = true;
3840 constexpr bool sync_manifest = true;
3841 constexpr bool skip_decrypt = true;
b3b6e05e 3842 int ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
7c673cae 3843 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3844 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3845 sync_manifest, skip_decrypt,
3846 true, &cb, &in_stream_req);
7c673cae
FG
3847 if (ret < 0) {
3848 return ret;
3849 }
3850
f67539c2
TL
3851 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize,
3852 nullptr, pheaders, null_yield);
7c673cae
FG
3853 if (ret < 0) {
3854 return ret;
3855 }
3856
3857 bufferlist& extra_data_bl = cb.get_extra_data();
3858 if (extra_data_bl.length()) {
3859 JSONParser jp;
3860 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
20effc67 3861 ldpp_dout(dpp, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
7c673cae
FG
3862 return -EIO;
3863 }
3864
3865 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3866
3867 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3868 }
3869
3870 if (src_mtime) {
3871 *src_mtime = set_mtime;
3872 }
3873
3874 if (petag) {
3875 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
3876 if (iter != src_attrs.end()) {
3877 bufferlist& etagbl = iter->second;
3878 *petag = etagbl.to_str();
11fdf7f2
TL
3879 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
3880 *petag = petag->substr(0, petag->size() - 1);
3881 }
7c673cae
FG
3882 }
3883 }
3884
3885 if (pattrs) {
11fdf7f2 3886 *pattrs = std::move(src_attrs);
7c673cae
FG
3887 }
3888
3889 return 0;
3890}
3891
9f95a23c
TL
3892int RGWFetchObjFilter_Default::filter(CephContext *cct,
3893 const rgw_obj_key& source_key,
3894 const RGWBucketInfo& dest_bucket_info,
3895 std::optional<rgw_placement_rule> dest_placement_rule,
3896 const map<string, bufferlist>& obj_attrs,
3897 std::optional<rgw_user> *poverride_owner,
3898 const rgw_placement_rule **prule)
3899{
3900 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
3901 if (!ptail_rule) {
3902 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
3903 if (iter != obj_attrs.end()) {
3904 dest_rule.storage_class = iter->second.to_str();
3905 dest_rule.inherit_from(dest_bucket_info.placement_rule);
3906 ptail_rule = &dest_rule;
3907 } else {
3908 ptail_rule = &dest_bucket_info.placement_rule;
3909 }
3910 }
3911 *prule = ptail_rule;
3912 return 0;
3913}
3914
7c673cae
FG
3915int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
3916 const rgw_user& user_id,
7c673cae 3917 req_info *info,
9f95a23c 3918 const rgw_zone_id& source_zone,
20effc67
TL
3919 rgw::sal::Object* dest_obj,
3920 rgw::sal::Object* src_obj,
3921 rgw::sal::Bucket* dest_bucket,
3922 rgw::sal::Bucket* src_bucket,
11fdf7f2 3923 std::optional<rgw_placement_rule> dest_placement_rule,
7c673cae
FG
3924 real_time *src_mtime,
3925 real_time *mtime,
3926 const real_time *mod_ptr,
3927 const real_time *unmod_ptr,
3928 bool high_precision_time,
3929 const char *if_match,
3930 const char *if_nomatch,
3931 AttrsMod attrs_mod,
3932 bool copy_if_newer,
20effc67 3933 rgw::sal::Attrs& attrs,
7c673cae 3934 RGWObjCategory category,
11fdf7f2 3935 std::optional<uint64_t> olh_epoch,
7c673cae 3936 real_time delete_at,
7c673cae 3937 string *ptag,
11fdf7f2 3938 string *petag,
7c673cae 3939 void (*progress_cb)(off_t, void *),
31f18b77 3940 void *progress_data,
9f95a23c
TL
3941 const DoutPrefixProvider *dpp,
3942 RGWFetchObjFilter *filter,
81eedcae
TL
3943 rgw_zone_set *zones_trace,
3944 std::optional<uint64_t>* bytes_transferred)
7c673cae
FG
3945{
3946 /* source is in a different zonegroup, copy from there */
3947
3948 RGWRESTStreamRWRequest *in_stream_req;
3949 string tag;
3950 int i;
3951 append_rand_alpha(cct, tag, tag, 32);
3952 obj_time_weight set_mtime_weight;
3953 set_mtime_weight.high_precision = high_precision_time;
11fdf7f2 3954 int ret;
7c673cae 3955
9f95a23c 3956 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 3957 using namespace rgw::putobj;
20effc67 3958 AtomicObjectProcessor processor(&aio, this->store, nullptr, user_id,
f67539c2
TL
3959 obj_ctx, dest_obj->clone(), olh_epoch,
3960 tag, dpp, null_yield);
7c673cae 3961 RGWRESTConn *conn;
11fdf7f2
TL
3962 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3963 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
7c673cae 3964 if (source_zone.empty()) {
f67539c2 3965 if (!src_bucket || src_bucket->get_info().zonegroup.empty()) {
7c673cae 3966 /* source is in the master zonegroup */
11fdf7f2 3967 conn = svc.zone->get_master_conn();
7c673cae 3968 } else {
f67539c2 3969 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket->get_info().zonegroup);
7c673cae 3970 if (iter == zonegroup_conn_map.end()) {
b3b6e05e 3971 ldpp_dout(dpp, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
7c673cae
FG
3972 return -ENOENT;
3973 }
3974 conn = iter->second;
3975 }
3976 } else {
9f95a23c 3977 auto iter = zone_conn_map.find(source_zone);
7c673cae 3978 if (iter == zone_conn_map.end()) {
b3b6e05e 3979 ldpp_dout(dpp, 0) << "could not find zone connection to zone: " << source_zone << dendl;
11fdf7f2 3980 return -ENOENT;
7c673cae 3981 }
11fdf7f2 3982 conn = iter->second;
7c673cae
FG
3983 }
3984
3985 boost::optional<RGWPutObj_Compress> compressor;
3986 CompressorRef plugin;
3987
9f95a23c
TL
3988 RGWFetchObjFilter_Default source_filter;
3989 if (!filter) {
3990 filter = &source_filter;
3991 }
3992
3993 std::optional<rgw_user> override_owner;
3994
b3b6e05e 3995 RGWRadosPutObj cb(dpp, cct, plugin, compressor, &processor, progress_cb, progress_data,
9f95a23c
TL
3996 [&](map<string, bufferlist>& obj_attrs) {
3997 const rgw_placement_rule *ptail_rule;
3998
3999 int ret = filter->filter(cct,
f67539c2
TL
4000 src_obj->get_key(),
4001 dest_bucket->get_info(),
9f95a23c
TL
4002 dest_placement_rule,
4003 obj_attrs,
4004 &override_owner,
4005 &ptail_rule);
4006 if (ret < 0) {
b3b6e05e 4007 ldpp_dout(dpp, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
9f95a23c 4008 return ret;
11fdf7f2 4009 }
9f95a23c
TL
4010
4011 processor.set_tail_placement(*ptail_rule);
4012
11fdf7f2
TL
4013 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
4014 if (compression_type != "none") {
4015 plugin = Compressor::create(cct, compression_type);
4016 if (!plugin) {
b3b6e05e 4017 ldpp_dout(dpp, 1) << "Cannot load plugin for compression type "
11fdf7f2
TL
4018 << compression_type << dendl;
4019 }
4020 }
4021
9f95a23c 4022 ret = processor.prepare(null_yield);
11fdf7f2
TL
4023 if (ret < 0) {
4024 return ret;
4025 }
4026 return 0;
4027 });
7c673cae
FG
4028
4029 string etag;
7c673cae 4030 real_time set_mtime;
81eedcae 4031 uint64_t expected_size = 0;
7c673cae
FG
4032
4033 RGWObjState *dest_state = NULL;
4034
4035 const real_time *pmod = mod_ptr;
4036
4037 obj_time_weight dest_mtime_weight;
4038
4039 if (copy_if_newer) {
4040 /* need to get mtime for destination */
b3b6e05e 4041 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
7c673cae
FG
4042 if (ret < 0)
4043 goto set_err_state;
4044
4045 if (!real_clock::is_zero(dest_state->mtime)) {
4046 dest_mtime_weight.init(dest_state);
4047 pmod = &dest_mtime_weight.mtime;
4048 }
4049 }
4050
181888fb
FG
4051 static constexpr bool prepend_meta = true;
4052 static constexpr bool get_op = true;
4053 static constexpr bool rgwx_stat = false;
4054 static constexpr bool sync_manifest = true;
4055 static constexpr bool skip_decrypt = true;
b3b6e05e 4056 ret = conn->get_obj(dpp, user_id, info, src_obj, pmod, unmod_ptr,
7c673cae 4057 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 4058 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
4059 sync_manifest, skip_decrypt,
4060 true,
4061 &cb, &in_stream_req);
7c673cae
FG
4062 if (ret < 0) {
4063 goto set_err_state;
4064 }
4065
81eedcae 4066 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
f67539c2 4067 &expected_size, nullptr, nullptr, null_yield);
7c673cae
FG
4068 if (ret < 0) {
4069 goto set_err_state;
4070 }
28e407b8
AA
4071 ret = cb.flush();
4072 if (ret < 0) {
4073 goto set_err_state;
4074 }
81eedcae
TL
4075 if (cb.get_data_len() != expected_size) {
4076 ret = -EIO;
b3b6e05e 4077 ldpp_dout(dpp, 0) << "ERROR: object truncated during fetching, expected "
81eedcae
TL
4078 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
4079 goto set_err_state;
4080 }
7c673cae
FG
4081 if (compressor && compressor->is_compressed()) {
4082 bufferlist tmp;
4083 RGWCompressionInfo cs_info;
4084 cs_info.compression_type = plugin->get_type_name();
4085 cs_info.orig_size = cb.get_data_len();
f67539c2 4086 cs_info.compressor_message = compressor->get_compressor_message();
7c673cae 4087 cs_info.blocks = move(compressor->get_compression_blocks());
11fdf7f2 4088 encode(cs_info, tmp);
7c673cae
FG
4089 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
4090 }
4091
9f95a23c
TL
4092 if (override_owner) {
4093 processor.set_owner(*override_owner);
4094
4095 auto& obj_attrs = cb.get_attrs();
4096
4097 RGWUserInfo owner_info;
b3b6e05e
TL
4098 if (ctl.user->get_info_by_uid(dpp, *override_owner, &owner_info, null_yield) < 0) {
4099 ldpp_dout(dpp, 10) << "owner info does not exist" << dendl;
9f95a23c
TL
4100 return -EINVAL;
4101 }
4102
4103 RGWAccessControlPolicy acl;
4104
4105 auto aiter = obj_attrs.find(RGW_ATTR_ACL);
4106 if (aiter == obj_attrs.end()) {
b3b6e05e 4107 ldpp_dout(dpp, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
9f95a23c
TL
4108 acl.create_default(owner_info.user_id, owner_info.display_name);
4109 } else {
4110 auto iter = aiter->second.cbegin();
4111 try {
4112 acl.decode(iter);
4113 } catch (buffer::error& err) {
b3b6e05e 4114 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
9f95a23c
TL
4115 return -EIO;
4116 }
4117 }
4118
4119 ACLOwner new_owner;
4120 new_owner.set_id(*override_owner);
4121 new_owner.set_name(owner_info.display_name);
4122
4123 acl.set_owner(new_owner);
4124
4125 bufferlist bl;
4126 acl.encode(bl);
4127 obj_attrs[RGW_ATTR_ACL] = std::move(bl);
4128 }
4129
7c673cae
FG
4130 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4131 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
4132 } else {
4133 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
4134 if (iter != cb.get_attrs().end()) {
4135 try {
11fdf7f2 4136 decode(delete_at, iter->second);
7c673cae 4137 } catch (buffer::error& err) {
b3b6e05e 4138 ldpp_dout(dpp, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
7c673cae
FG
4139 }
4140 }
4141 }
4142
4143 if (src_mtime) {
4144 *src_mtime = set_mtime;
4145 }
4146
4147 if (petag) {
4148 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
4149 if (iter != cb.get_attrs().end()) {
11fdf7f2 4150 *petag = iter->second.to_str();
7c673cae
FG
4151 }
4152 }
4153
11fdf7f2
TL
4154 //erase the append attr
4155 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
4156
7c673cae
FG
4157 if (source_zone.empty()) {
4158 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
4159 } else {
4160 attrs = cb.get_attrs();
4161 }
4162
4163 if (copy_if_newer) {
4164 uint64_t pg_ver = 0;
4165 auto i = attrs.find(RGW_ATTR_PG_VER);
4166 if (i != attrs.end() && i->second.length() > 0) {
11fdf7f2 4167 auto iter = i->second.cbegin();
7c673cae 4168 try {
11fdf7f2 4169 decode(pg_ver, iter);
7c673cae 4170 } catch (buffer::error& err) {
b3b6e05e 4171 ldpp_dout(dpp, 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
7c673cae
FG
4172 /* non critical error */
4173 }
4174 }
11fdf7f2 4175 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
7c673cae
FG
4176 }
4177
adb31ebb
TL
4178 /* Perform ETag verification is we have computed the object's MD5 sum at our end */
4179 if (const auto& verifier_etag = cb.get_verifier_etag();
4180 !verifier_etag.empty()) {
4181 string trimmed_etag = etag;
4182
4183 /* Remove the leading and trailing double quotes from etag */
4184 trimmed_etag.erase(std::remove(trimmed_etag.begin(), trimmed_etag.end(),'\"'),
4185 trimmed_etag.end());
4186
4187 if (verifier_etag != trimmed_etag) {
4188 ret = -EIO;
b3b6e05e 4189 ldpp_dout(dpp, 0) << "ERROR: source and destination objects don't match. Expected etag:"
adb31ebb
TL
4190 << trimmed_etag << " Computed etag:" << verifier_etag << dendl;
4191 goto set_err_state;
4192 }
4193 }
4194
7c673cae
FG
4195#define MAX_COMPLETE_RETRY 100
4196 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
11fdf7f2
TL
4197 bool canceled = false;
4198 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
4199 attrs, delete_at, nullptr, nullptr, nullptr,
9f95a23c 4200 zones_trace, &canceled, null_yield);
7c673cae
FG
4201 if (ret < 0) {
4202 goto set_err_state;
4203 }
adb31ebb 4204
11fdf7f2 4205 if (copy_if_newer && canceled) {
b3b6e05e 4206 ldpp_dout(dpp, 20) << "raced with another write of obj: " << dest_obj << dendl;
f67539c2 4207 obj_ctx.invalidate(dest_obj->get_obj()); /* object was overwritten */
b3b6e05e 4208 ret = get_obj_state(dpp, &obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), &dest_state, false, null_yield);
7c673cae 4209 if (ret < 0) {
b3b6e05e 4210 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
7c673cae
FG
4211 goto set_err_state;
4212 }
4213 dest_mtime_weight.init(dest_state);
4214 dest_mtime_weight.high_precision = high_precision_time;
4215 if (!dest_state->exists ||
4216 dest_mtime_weight < set_mtime_weight) {
b3b6e05e 4217 ldpp_dout(dpp, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7c673cae
FG
4218 continue;
4219 } else {
b3b6e05e 4220 ldpp_dout(dpp, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
7c673cae
FG
4221 }
4222 }
4223 break;
4224 }
4225
4226 if (i == MAX_COMPLETE_RETRY) {
b3b6e05e 4227 ldpp_dout(dpp, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
7c673cae
FG
4228 ret = -EIO;
4229 goto set_err_state;
4230 }
4231
81eedcae
TL
4232 if (bytes_transferred) {
4233 *bytes_transferred = cb.get_data_len();
4234 }
7c673cae
FG
4235 return 0;
4236set_err_state:
4237 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
4238 // we may have already fetched during sync of OP_ADD, but were waiting
4239 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4240 if (olh_epoch && *olh_epoch > 0) {
4241 constexpr bool log_data_change = true;
b3b6e05e 4242 ret = set_olh(dpp, obj_ctx, dest_bucket->get_info(), dest_obj->get_obj(), false, nullptr,
9f95a23c 4243 *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
91327a77
AA
4244 } else {
4245 // we already have the latest copy
4246 ret = 0;
4247 }
7c673cae 4248 }
7c673cae
FG
4249 return ret;
4250}
4251
4252
20effc67 4253int RGWRados::copy_obj_to_remote_dest(const DoutPrefixProvider *dpp,
b3b6e05e 4254 RGWObjState *astate,
7c673cae
FG
4255 map<string, bufferlist>& src_attrs,
4256 RGWRados::Object::Read& read_op,
4257 const rgw_user& user_id,
20effc67 4258 rgw::sal::Object* dest_obj,
7c673cae
FG
4259 real_time *mtime)
4260{
4261 string etag;
4262
11fdf7f2 4263 RGWRESTStreamS3PutObj *out_stream_req;
7c673cae 4264
11fdf7f2
TL
4265 auto rest_master_conn = svc.zone->get_master_conn();
4266
20effc67 4267 int ret = rest_master_conn->put_obj_async_init(dpp, user_id, dest_obj, src_attrs, &out_stream_req);
7c673cae 4268 if (ret < 0) {
7c673cae
FG
4269 return ret;
4270 }
4271
20effc67
TL
4272 out_stream_req->set_send_length(astate->size);
4273
4274 ret = RGWHTTP::send(out_stream_req);
4275 if (ret < 0) {
4276 delete out_stream_req;
4277 return ret;
4278 }
4279
b3b6e05e 4280 ret = read_op.iterate(dpp, 0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
224ce89b
WB
4281 if (ret < 0) {
4282 delete out_stream_req;
7c673cae 4283 return ret;
224ce89b 4284 }
7c673cae 4285
f67539c2 4286 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime, null_yield);
7c673cae
FG
4287 if (ret < 0)
4288 return ret;
4289
4290 return 0;
4291}
4292
4293/**
4294 * Copy an object.
4295 * dest_obj: the object to copy into
4296 * src_obj: the object to copy from
4297 * attrs: usage depends on attrs_mod parameter
4298 * attrs_mod: the modification mode of the attrs, may have the following values:
4299 * ATTRSMOD_NONE - the attributes of the source object will be
4300 * copied without modifications, attrs parameter is ignored;
4301 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4302 * parameter, source object attributes are not copied;
4303 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4304 * are overwritten by values contained in attrs parameter.
4305 * err: stores any errors resulting from the get of the original object
4306 * Returns: 0 on success, -ERR# otherwise.
4307 */
4308int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4309 const rgw_user& user_id,
7c673cae 4310 req_info *info,
9f95a23c 4311 const rgw_zone_id& source_zone,
20effc67
TL
4312 rgw::sal::Object* dest_obj,
4313 rgw::sal::Object* src_obj,
4314 rgw::sal::Bucket* dest_bucket,
4315 rgw::sal::Bucket* src_bucket,
11fdf7f2 4316 const rgw_placement_rule& dest_placement,
7c673cae
FG
4317 real_time *src_mtime,
4318 real_time *mtime,
4319 const real_time *mod_ptr,
4320 const real_time *unmod_ptr,
4321 bool high_precision_time,
4322 const char *if_match,
4323 const char *if_nomatch,
4324 AttrsMod attrs_mod,
4325 bool copy_if_newer,
20effc67 4326 rgw::sal::Attrs& attrs,
7c673cae
FG
4327 RGWObjCategory category,
4328 uint64_t olh_epoch,
4329 real_time delete_at,
4330 string *version_id,
4331 string *ptag,
11fdf7f2 4332 string *petag,
7c673cae 4333 void (*progress_cb)(off_t, void *),
9f95a23c
TL
4334 void *progress_data,
4335 const DoutPrefixProvider *dpp,
4336 optional_yield y)
7c673cae
FG
4337{
4338 int ret;
4339 uint64_t obj_size;
f67539c2 4340 rgw_obj shadow_obj = dest_obj->get_obj();
7c673cae
FG
4341 string shadow_oid;
4342
4343 bool remote_src;
4344 bool remote_dest;
4345
f67539c2
TL
4346 append_rand_alpha(cct, dest_obj->get_oid(), shadow_oid, 32);
4347 shadow_obj.init_ns(dest_obj->get_bucket()->get_key(), shadow_oid, shadow_ns);
7c673cae 4348
11fdf7f2
TL
4349 auto& zonegroup = svc.zone->get_zonegroup();
4350
f67539c2
TL
4351 remote_dest = !zonegroup.equals(dest_bucket->get_info().zonegroup);
4352 remote_src = !zonegroup.equals(src_bucket->get_info().zonegroup);
7c673cae
FG
4353
4354 if (remote_src && remote_dest) {
9f95a23c 4355 ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7c673cae
FG
4356 return -EINVAL;
4357 }
4358
f67539c2 4359 ldpp_dout(dpp, 5) << "Copy object " << src_obj->get_bucket() << ":" << src_obj->get_oid() << " => " << dest_obj->get_bucket() << ":" << dest_obj->get_oid() << dendl;
7c673cae
FG
4360
4361 if (remote_src || !source_zone.empty()) {
11fdf7f2 4362 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
f67539c2 4363 dest_obj, src_obj, dest_bucket, src_bucket,
11fdf7f2 4364 dest_placement, src_mtime, mtime, mod_ptr,
7c673cae
FG
4365 unmod_ptr, high_precision_time,
4366 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
9f95a23c
TL
4367 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
4368 nullptr /* filter */);
7c673cae
FG
4369 }
4370
4371 map<string, bufferlist> src_attrs;
f67539c2 4372 RGWRados::Object src_op_target(this, src_bucket->get_info(), obj_ctx, src_obj->get_obj());
7c673cae
FG
4373 RGWRados::Object::Read read_op(&src_op_target);
4374
4375 read_op.conds.mod_ptr = mod_ptr;
4376 read_op.conds.unmod_ptr = unmod_ptr;
4377 read_op.conds.high_precision_time = high_precision_time;
4378 read_op.conds.if_match = if_match;
4379 read_op.conds.if_nomatch = if_nomatch;
4380 read_op.params.attrs = &src_attrs;
4381 read_op.params.lastmod = src_mtime;
4382 read_op.params.obj_size = &obj_size;
7c673cae 4383
b3b6e05e 4384 ret = read_op.prepare(y, dpp);
7c673cae
FG
4385 if (ret < 0) {
4386 return ret;
4387 }
94b18763
FG
4388 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4389 // Current implementation does not follow S3 spec and even
4390 // may result in data corruption silently when copying
4391 // multipart objects acorss pools. So reject COPY operations
4392 //on encrypted objects before it is fully functional.
9f95a23c 4393 ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
94b18763
FG
4394 << " has not been implemented." << dendl;
4395 return -ERR_NOT_IMPLEMENTED;
4396 }
7c673cae
FG
4397
4398 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4399 src_attrs.erase(RGW_ATTR_DELETE_AT);
4400
20effc67
TL
4401 src_attrs.erase(RGW_ATTR_OBJECT_RETENTION);
4402 src_attrs.erase(RGW_ATTR_OBJECT_LEGAL_HOLD);
4403 map<string, bufferlist>::iterator rt = attrs.find(RGW_ATTR_OBJECT_RETENTION);
4404 if (rt != attrs.end())
4405 src_attrs[RGW_ATTR_OBJECT_RETENTION] = rt->second;
4406 map<string, bufferlist>::iterator lh = attrs.find(RGW_ATTR_OBJECT_LEGAL_HOLD);
4407 if (lh != attrs.end())
4408 src_attrs[RGW_ATTR_OBJECT_LEGAL_HOLD] = lh->second;
4409
7c673cae
FG
4410 set_copy_attrs(src_attrs, attrs, attrs_mod);
4411 attrs.erase(RGW_ATTR_ID_TAG);
4412 attrs.erase(RGW_ATTR_PG_VER);
4413 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4414 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4415 if (cmp != src_attrs.end())
4416 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4417
4418 RGWObjManifest manifest;
4419 RGWObjState *astate = NULL;
4420
b3b6e05e 4421 ret = get_obj_state(dpp, &obj_ctx, src_bucket->get_info(), src_obj->get_obj(), &astate, y);
7c673cae
FG
4422 if (ret < 0) {
4423 return ret;
4424 }
4425
4426 vector<rgw_raw_obj> ref_objs;
4427
4428 if (remote_dest) {
4429 /* dest is in a different zonegroup, copy it there */
b3b6e05e 4430 return copy_obj_to_remote_dest(dpp, astate, attrs, read_op, user_id, dest_obj, mtime);
7c673cae
FG
4431 }
4432 uint64_t max_chunk_size;
4433
b3b6e05e 4434 ret = get_max_chunk_size(dest_bucket->get_placement_rule(), dest_obj->get_obj(), &max_chunk_size, dpp);
7c673cae 4435 if (ret < 0) {
f67539c2 4436 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj->get_bucket() << dendl;
7c673cae
FG
4437 return ret;
4438 }
4439
4440 rgw_pool src_pool;
4441 rgw_pool dest_pool;
11fdf7f2
TL
4442
4443 const rgw_placement_rule *src_rule{nullptr};
4444
9f95a23c
TL
4445 if (astate->manifest) {
4446 src_rule = &astate->manifest->get_tail_placement().placement_rule;
4447 ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
11fdf7f2
TL
4448 }
4449
4450 if (!src_rule || src_rule->empty()) {
f67539c2 4451 src_rule = &src_bucket->get_placement_rule();
11fdf7f2
TL
4452 }
4453
f67539c2 4454 if (!get_obj_data_pool(*src_rule, src_obj->get_obj(), &src_pool)) {
9f95a23c 4455 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7c673cae
FG
4456 return -EIO;
4457 }
11fdf7f2 4458
f67539c2 4459 if (!get_obj_data_pool(dest_placement, dest_obj->get_obj(), &dest_pool)) {
9f95a23c 4460 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7c673cae
FG
4461 return -EIO;
4462 }
4463
9f95a23c 4464 ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
11fdf7f2
TL
4465 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4466
9f95a23c 4467 bool copy_data = (!astate->manifest) ||
11fdf7f2
TL
4468 (*src_rule != dest_placement) ||
4469 (src_pool != dest_pool);
7c673cae 4470
7c673cae 4471 bool copy_first = false;
9f95a23c
TL
4472 if (astate->manifest) {
4473 if (!astate->manifest->has_tail()) {
7c673cae
FG
4474 copy_data = true;
4475 } else {
9f95a23c 4476 uint64_t head_size = astate->manifest->get_head_size();
7c673cae
FG
4477
4478 if (head_size > 0) {
4479 if (head_size > max_chunk_size) {
4480 copy_data = true;
4481 } else {
4482 copy_first = true;
4483 }
4484 }
4485 }
4486 }
4487
4488 if (petag) {
4489 const auto iter = attrs.find(RGW_ATTR_ETAG);
4490 if (iter != attrs.end()) {
11fdf7f2 4491 *petag = iter->second.to_str();
7c673cae
FG
4492 }
4493 }
4494
4495 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
11fdf7f2 4496 attrs.erase(RGW_ATTR_TAIL_TAG);
f67539c2 4497 return copy_obj_data(obj_ctx, dest_bucket, dest_placement, read_op, obj_size - 1, dest_obj,
9f95a23c 4498 mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
7c673cae
FG
4499 }
4500
b3b6e05e 4501 RGWObjManifest::obj_iterator miter = astate->manifest->obj_begin(dpp);
7c673cae
FG
4502
4503 if (copy_first) { // we need to copy first chunk, not increase refcount
4504 ++miter;
4505 }
4506
4507 rgw_rados_ref ref;
b3b6e05e 4508 ret = get_raw_obj_ref(dpp, miter.get_location().get_raw_obj(store), &ref);
7c673cae
FG
4509 if (ret < 0) {
4510 return ret;
4511 }
4512
7c673cae
FG
4513 bufferlist first_chunk;
4514
20effc67 4515 const bool copy_itself = (dest_obj->get_obj() == src_obj->get_obj());
7c673cae 4516 RGWObjManifest *pmanifest;
9f95a23c 4517 ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae 4518
f67539c2 4519 RGWRados::Object dest_op_target(this, dest_bucket->get_info(), obj_ctx, dest_obj->get_obj());
7c673cae
FG
4520 RGWRados::Object::Write write_op(&dest_op_target);
4521
4522 string tag;
4523
4524 if (ptag) {
4525 tag = *ptag;
4526 }
4527
4528 if (tag.empty()) {
4529 append_rand_alpha(cct, tag, tag, 32);
4530 }
4531
4532 if (!copy_itself) {
181888fb 4533 attrs.erase(RGW_ATTR_TAIL_TAG);
9f95a23c 4534 manifest = *astate->manifest;
7c673cae
FG
4535 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4536 if (tail_placement.bucket.name.empty()) {
f67539c2 4537 manifest.set_tail_placement(tail_placement.placement_rule, src_obj->get_bucket()->get_key());
7c673cae 4538 }
3efd9988 4539 string ref_tag;
b3b6e05e 4540 for (; miter != astate->manifest->obj_end(dpp); ++miter) {
7c673cae 4541 ObjectWriteOperation op;
3efd9988
FG
4542 ref_tag = tag + '\0';
4543 cls_refcount_get(op, ref_tag, true);
f67539c2 4544 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(store);
7c673cae 4545
9f95a23c
TL
4546 auto& ioctx = ref.pool.ioctx();
4547 ioctx.locator_set_key(loc.loc);
4548
b3b6e05e 4549 ret = rgw_rados_operate(dpp, ioctx, loc.oid, &op, null_yield);
7c673cae
FG
4550 if (ret < 0) {
4551 goto done_ret;
4552 }
4553
4554 ref_objs.push_back(loc);
4555 }
4556
4557 pmanifest = &manifest;
4558 } else {
9f95a23c 4559 pmanifest = &(*astate->manifest);
7c673cae
FG
4560 /* don't send the object's tail for garbage collection */
4561 astate->keep_tail = true;
4562 }
4563
4564 if (copy_first) {
b3b6e05e 4565 ret = read_op.read(0, max_chunk_size, first_chunk, y, dpp);
7c673cae
FG
4566 if (ret < 0) {
4567 goto done_ret;
4568 }
4569
f67539c2 4570 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), first_chunk.length());
7c673cae 4571 } else {
f67539c2 4572 pmanifest->set_head(dest_bucket->get_placement_rule(), dest_obj->get_obj(), 0);
7c673cae
FG
4573 }
4574
4575 write_op.meta.data = &first_chunk;
4576 write_op.meta.manifest = pmanifest;
4577 write_op.meta.ptag = &tag;
f67539c2 4578 write_op.meta.owner = dest_bucket->get_info().owner;
7c673cae
FG
4579 write_op.meta.mtime = mtime;
4580 write_op.meta.flags = PUT_OBJ_CREATE;
4581 write_op.meta.category = category;
4582 write_op.meta.olh_epoch = olh_epoch;
4583 write_op.meta.delete_at = delete_at;
181888fb 4584 write_op.meta.modify_tail = !copy_itself;
7c673cae 4585
b3b6e05e 4586 ret = write_op.write_meta(dpp, obj_size, astate->accounted_size, attrs, y);
7c673cae
FG
4587 if (ret < 0) {
4588 goto done_ret;
4589 }
4590
4591 return 0;
4592
4593done_ret:
4594 if (!copy_itself) {
4595 vector<rgw_raw_obj>::iterator riter;
4596
7c673cae 4597 /* rollback reference */
92f5a8d4 4598 string ref_tag = tag + '\0';
7c673cae
FG
4599 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4600 ObjectWriteOperation op;
92f5a8d4 4601 cls_refcount_put(op, ref_tag, true);
7c673cae 4602
9f95a23c 4603 ref.pool.ioctx().locator_set_key(riter->loc);
7c673cae 4604
b3b6e05e 4605 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), riter->oid, &op, null_yield);
7c673cae 4606 if (r < 0) {
9f95a23c 4607 ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
7c673cae
FG
4608 }
4609 }
4610 }
4611 return ret;
4612}
4613
4614
4615int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
20effc67 4616 rgw::sal::Bucket* bucket,
11fdf7f2 4617 const rgw_placement_rule& dest_placement,
7c673cae 4618 RGWRados::Object::Read& read_op, off_t end,
20effc67 4619 rgw::sal::Object* dest_obj,
7c673cae
FG
4620 real_time *mtime,
4621 real_time set_mtime,
20effc67 4622 rgw::sal::Attrs& attrs,
7c673cae
FG
4623 uint64_t olh_epoch,
4624 real_time delete_at,
9f95a23c
TL
4625 string *petag,
4626 const DoutPrefixProvider *dpp,
4627 optional_yield y)
7c673cae 4628{
7c673cae
FG
4629 string tag;
4630 append_rand_alpha(cct, tag, tag, 32);
4631
9f95a23c 4632 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 4633 using namespace rgw::putobj;
9f95a23c
TL
4634 // do not change the null_yield in the initialization of this AtomicObjectProcessor
4635 // it causes crashes in the ragweed tests
20effc67 4636 AtomicObjectProcessor processor(&aio, this->store, &dest_placement,
f67539c2
TL
4637 bucket->get_info().owner, obj_ctx,
4638 dest_obj->clone(), olh_epoch, tag,
4639 dpp, null_yield);
9f95a23c 4640 int ret = processor.prepare(y);
7c673cae
FG
4641 if (ret < 0)
4642 return ret;
4643
4644 off_t ofs = 0;
4645
4646 do {
4647 bufferlist bl;
b3b6e05e 4648 ret = read_op.read(ofs, end, bl, y, dpp);
11fdf7f2 4649 if (ret < 0) {
9f95a23c 4650 ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
11fdf7f2
TL
4651 return ret;
4652 }
7c673cae
FG
4653
4654 uint64_t read_len = ret;
11fdf7f2
TL
4655 ret = processor.process(std::move(bl), ofs);
4656 if (ret < 0) {
4657 return ret;
4658 }
7c673cae
FG
4659
4660 ofs += read_len;
4661 } while (ofs <= end);
4662
11fdf7f2
TL
4663 // flush
4664 ret = processor.process({}, ofs);
4665 if (ret < 0) {
4666 return ret;
4667 }
4668
7c673cae
FG
4669 string etag;
4670 auto iter = attrs.find(RGW_ATTR_ETAG);
4671 if (iter != attrs.end()) {
4672 bufferlist& bl = iter->second;
11fdf7f2 4673 etag = bl.to_str();
7c673cae 4674 if (petag) {
11fdf7f2 4675 *petag = etag;
7c673cae
FG
4676 }
4677 }
4678
4679 uint64_t accounted_size;
4680 {
4681 bool compressed{false};
4682 RGWCompressionInfo cs_info;
4683 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4684 if (ret < 0) {
9f95a23c 4685 ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
7c673cae
FG
4686 return ret;
4687 }
4688 // pass original size if compressed
4689 accounted_size = compressed ? cs_info.orig_size : ofs;
4690 }
4691
11fdf7f2 4692 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
9f95a23c 4693 nullptr, nullptr, nullptr, nullptr, nullptr, y);
7c673cae
FG
4694}
4695
11fdf7f2 4696int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
20effc67
TL
4697 rgw::sal::Bucket* bucket,
4698 rgw::sal::Object& obj,
11fdf7f2
TL
4699 const rgw_placement_rule& placement_rule,
4700 const real_time& mtime,
9f95a23c
TL
4701 uint64_t olh_epoch,
4702 const DoutPrefixProvider *dpp,
4703 optional_yield y)
7c673cae 4704{
20effc67 4705 rgw::sal::Attrs attrs;
11fdf7f2
TL
4706 real_time read_mtime;
4707 uint64_t obj_size;
7c673cae 4708
f67539c2
TL
4709 obj.set_atomic(&obj_ctx);
4710 RGWRados::Object op_target(this, bucket->get_info(), obj_ctx, obj.get_obj());
11fdf7f2 4711 RGWRados::Object::Read read_op(&op_target);
7c673cae 4712
11fdf7f2
TL
4713 read_op.params.attrs = &attrs;
4714 read_op.params.lastmod = &read_mtime;
4715 read_op.params.obj_size = &obj_size;
7c673cae 4716
b3b6e05e 4717 int ret = read_op.prepare(y, dpp);
11fdf7f2
TL
4718 if (ret < 0) {
4719 return ret;
7c673cae
FG
4720 }
4721
11fdf7f2
TL
4722 if (read_mtime != mtime) {
4723 /* raced */
4724 return -ECANCELED;
7c673cae
FG
4725 }
4726
9f95a23c
TL
4727 attrs.erase(RGW_ATTR_ID_TAG);
4728 attrs.erase(RGW_ATTR_TAIL_TAG);
4729
11fdf7f2 4730 ret = copy_obj_data(obj_ctx,
f67539c2 4731 bucket,
11fdf7f2
TL
4732 placement_rule,
4733 read_op,
4734 obj_size - 1,
f67539c2 4735 &obj,
11fdf7f2
TL
4736 nullptr /* pmtime */,
4737 mtime,
4738 attrs,
4739 olh_epoch,
4740 real_time(),
9f95a23c
TL
4741 nullptr /* petag */,
4742 dpp,
4743 y);
11fdf7f2
TL
4744 if (ret < 0) {
4745 return ret;
7c673cae
FG
4746 }
4747
11fdf7f2 4748 return 0;
7c673cae
FG
4749}
4750
b3b6e05e 4751int RGWRados::check_bucket_empty(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, optional_yield y)
7c673cae 4752{
9f95a23c
TL
4753 constexpr uint NUM_ENTRIES = 1000u;
4754
7c673cae
FG
4755 rgw_obj_index_key marker;
4756 string prefix;
4757 bool is_truncated;
4758
4759 do {
9f95a23c
TL
4760 std::vector<rgw_bucket_dir_entry> ent_list;
4761 ent_list.reserve(NUM_ENTRIES);
4762
20effc67 4763 int r = cls_bucket_list_unordered(dpp,
b3b6e05e 4764 bucket_info,
1adf2230
AA
4765 RGW_NO_SHARD,
4766 marker,
4767 prefix,
4768 NUM_ENTRIES,
4769 true,
4770 ent_list,
4771 &is_truncated,
9f95a23c
TL
4772 &marker,
4773 y);
4774 if (r < 0) {
7c673cae 4775 return r;
9f95a23c 4776 }
7c673cae
FG
4777
4778 string ns;
1adf2230 4779 for (auto const& dirent : ent_list) {
7c673cae
FG
4780 rgw_obj_key obj;
4781
9f95a23c 4782 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
7c673cae 4783 return -ENOTEMPTY;
9f95a23c 4784 }
7c673cae
FG
4785 }
4786 } while (is_truncated);
1adf2230 4787
7c673cae
FG
4788 return 0;
4789}
4790
4791/**
4792 * Delete a bucket.
4793 * bucket: the name of the bucket to delete
4794 * Returns 0 on success, -ERR# otherwise.
4795 */
b3b6e05e 4796int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, const DoutPrefixProvider *dpp, bool check_empty)
7c673cae
FG
4797{
4798 const rgw_bucket& bucket = bucket_info.bucket;
9f95a23c 4799 RGWSI_RADOS::Pool index_pool;
7c673cae 4800 map<int, string> bucket_objs;
b3b6e05e 4801 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
4802 if (r < 0)
4803 return r;
4804
4805 if (check_empty) {
b3b6e05e 4806 r = check_bucket_empty(dpp, bucket_info, y);
7c673cae
FG
4807 if (r < 0) {
4808 return r;
4809 }
4810 }
9f95a23c
TL
4811
4812 bool remove_ep = true;
4813
4814 if (objv_tracker.read_version.empty()) {
4815 RGWBucketEntryPoint ep;
4816 r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
4817 &ep,
4818 null_yield,
b3b6e05e 4819 dpp,
9f95a23c
TL
4820 RGWBucketCtl::Bucket::GetParams()
4821 .set_objv_tracker(&objv_tracker));
4822 if (r < 0 ||
4823 (!bucket_info.bucket.bucket_id.empty() &&
4824 ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
4825 if (r != -ENOENT) {
b3b6e05e 4826 ldpp_dout(dpp, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
9f95a23c
TL
4827 /* we have no idea what caused the error, will not try to remove it */
4828 }
4829 /*
4830 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4831 * requested
4832 */
4833 remove_ep = false;
4834 }
4835 }
4836
4837 if (remove_ep) {
b3b6e05e 4838 r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield, dpp,
9f95a23c
TL
4839 RGWBucketCtl::Bucket::RemoveParams()
4840 .set_objv_tracker(&objv_tracker));
4841 if (r < 0)
4842 return r;
4843 }
7c673cae
FG
4844
4845 /* if the bucket is not synced we can remove the meta file */
11fdf7f2 4846 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
7c673cae 4847 RGWObjVersionTracker objv_tracker;
b3b6e05e 4848 r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield, dpp);
7c673cae
FG
4849 if (r < 0) {
4850 return r;
4851 }
f64942e4
AA
4852
4853 /* remove bucket index objects asynchronously by best effort */
9f95a23c 4854 (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
f64942e4
AA
4855 bucket_objs,
4856 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 4857 }
f64942e4 4858
7c673cae
FG
4859 return 0;
4860}
4861
b3b6e05e 4862int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner, const DoutPrefixProvider *dpp)
7c673cae
FG
4863{
4864 RGWBucketInfo info;
4865 map<string, bufferlist> attrs;
31f18b77 4866 int r;
9f95a23c
TL
4867 auto obj_ctx = svc.sysobj->init_obj_ctx();
4868
31f18b77 4869 if (bucket.bucket_id.empty()) {
b3b6e05e 4870 r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
31f18b77 4871 } else {
b3b6e05e 4872 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs, null_yield, dpp);
31f18b77 4873 }
7c673cae 4874 if (r < 0) {
b3b6e05e 4875 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7c673cae
FG
4876 return r;
4877 }
4878
4879 info.owner = owner.get_id();
4880
b3b6e05e 4881 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
7c673cae 4882 if (r < 0) {
b3b6e05e 4883 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
7c673cae
FG
4884 return r;
4885 }
4886
4887 return 0;
4888}
4889
4890
b3b6e05e 4891int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled, const DoutPrefixProvider *dpp)
7c673cae
FG
4892{
4893 int ret = 0;
4894
4895 vector<rgw_bucket>::iterator iter;
4896
4897 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
4898 rgw_bucket& bucket = *iter;
b3b6e05e
TL
4899 if (enabled) {
4900 ldpp_dout(dpp, 20) << "enabling bucket name=" << bucket.name << dendl;
4901 } else {
4902 ldpp_dout(dpp, 20) << "disabling bucket name=" << bucket.name << dendl;
4903 }
7c673cae
FG
4904
4905 RGWBucketInfo info;
4906 map<string, bufferlist> attrs;
b3b6e05e 4907 int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, dpp, &attrs);
7c673cae 4908 if (r < 0) {
b3b6e05e 4909 ldpp_dout(dpp, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
7c673cae
FG
4910 ret = r;
4911 continue;
4912 }
4913 if (enabled) {
4914 info.flags &= ~BUCKET_SUSPENDED;
4915 } else {
4916 info.flags |= BUCKET_SUSPENDED;
4917 }
4918
b3b6e05e 4919 r = put_bucket_instance_info(info, false, real_time(), &attrs, dpp);
7c673cae 4920 if (r < 0) {
b3b6e05e 4921 ldpp_dout(dpp, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
7c673cae
FG
4922 ret = r;
4923 continue;
4924 }
4925 }
4926 return ret;
4927}
4928
b3b6e05e 4929int RGWRados::bucket_suspended(const DoutPrefixProvider *dpp, rgw_bucket& bucket, bool *suspended)
7c673cae
FG
4930{
4931 RGWBucketInfo bucket_info;
b3b6e05e 4932 int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield, dpp);
7c673cae
FG
4933 if (ret < 0) {
4934 return ret;
4935 }
4936
4937 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
4938 return 0;
4939}
4940
b3b6e05e 4941int RGWRados::Object::complete_atomic_modification(const DoutPrefixProvider *dpp)
7c673cae 4942{
9f95a23c 4943 if ((!state->manifest)|| state->keep_tail)
7c673cae
FG
4944 return 0;
4945
4946 cls_rgw_obj_chain chain;
b3b6e05e 4947 store->update_gc_chain(dpp, obj, *state->manifest, &chain);
7c673cae
FG
4948
4949 if (chain.empty()) {
4950 return 0;
4951 }
4952
181888fb 4953 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
1d09f67e
TL
4954 if (store->gc == nullptr) {
4955 ldpp_dout(dpp, 0) << "deleting objects inline since gc isn't initialized" << dendl;
4956 //Delete objects inline just in case gc hasn't been initialised, prevents crashes
b3b6e05e 4957 store->delete_objs_inline(dpp, chain, tag);
1d09f67e
TL
4958 } else {
4959 auto ret = store->gc->send_chain(chain, tag); // do it synchronously
4960 if (ret < 0) {
4961 //Delete objects inline if send chain to gc fails
4962 store->delete_objs_inline(dpp, chain, tag);
4963 }
9f95a23c
TL
4964 }
4965 return 0;
7c673cae
FG
4966}
4967
b3b6e05e 4968void RGWRados::update_gc_chain(const DoutPrefixProvider *dpp, rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
7c673cae
FG
4969{
4970 RGWObjManifest::obj_iterator iter;
4971 rgw_raw_obj raw_head;
4972 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
b3b6e05e 4973 for (iter = manifest.obj_begin(dpp); iter != manifest.obj_end(dpp); ++iter) {
f67539c2 4974 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(store);
7c673cae
FG
4975 if (mobj == raw_head)
4976 continue;
4977 cls_rgw_obj_key key(mobj.oid);
4978 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
4979 }
4980}
4981
9f95a23c 4982int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4983{
f67539c2
TL
4984 if (chain.empty()) {
4985 return 0;
4986 }
4987
9f95a23c 4988 return gc->send_chain(chain, tag);
7c673cae
FG
4989}
4990
b3b6e05e 4991void RGWRados::delete_objs_inline(const DoutPrefixProvider *dpp, cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4992{
9f95a23c
TL
4993 string last_pool;
4994 std::unique_ptr<IoCtx> ctx(new IoCtx);
4995 int ret = 0;
4996 for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
4997 cls_rgw_obj& obj = *liter;
4998 if (obj.pool != last_pool) {
4999 ctx.reset(new IoCtx);
b3b6e05e 5000 ret = rgw_init_ioctx(dpp, get_rados_handle(), obj.pool, *ctx);
9f95a23c
TL
5001 if (ret < 0) {
5002 last_pool = "";
b3b6e05e 5003 ldpp_dout(dpp, 0) << "ERROR: failed to create ioctx pool=" <<
9f95a23c
TL
5004 obj.pool << dendl;
5005 continue;
5006 }
5007 last_pool = obj.pool;
5008 }
5009 ctx->locator_set_key(obj.loc);
5010 const string& oid = obj.key.name; /* just stored raw oid there */
b3b6e05e 5011 ldpp_dout(dpp, 5) << "delete_objs_inline: removing " << obj.pool <<
9f95a23c
TL
5012 ":" << obj.key.name << dendl;
5013 ObjectWriteOperation op;
5014 cls_refcount_put(op, tag, true);
5015 ret = ctx->operate(oid, &op);
5016 if (ret < 0) {
b3b6e05e 5017 ldpp_dout(dpp, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
9f95a23c 5018 }
7c673cae 5019 }
7c673cae
FG
5020}
5021
5022static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
5023 map<RGWObjCategory, RGWStorageStats>& stats)
5024{
5025 for (const auto& pair : header.stats) {
5026 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
5027 const rgw_bucket_category_stats& header_stats = pair.second;
5028
5029 RGWStorageStats& s = stats[category];
5030
5031 s.category = category;
5032 s.size += header_stats.total_size;
5033 s.size_rounded += header_stats.total_size_rounded;
5034 s.size_utilized += header_stats.actual_size;
5035 s.num_objects += header_stats.num_entries;
5036 }
5037}
5038
b3b6e05e 5039int RGWRados::bucket_check_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info,
7c673cae
FG
5040 map<RGWObjCategory, RGWStorageStats> *existing_stats,
5041 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
5042{
9f95a23c 5043 RGWSI_RADOS::Pool index_pool;
20effc67 5044
7c673cae
FG
5045 // key - bucket index object id
5046 // value - bucket index check OP returned result with the given bucket index object (shard)
5047 map<int, string> oids;
b3b6e05e 5048 int ret = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &oids, nullptr);
31f18b77 5049 if (ret < 0) {
20effc67 5050 return ret;
31f18b77 5051 }
7c673cae 5052
20effc67
TL
5053 // declare and pre-populate
5054 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
9f95a23c 5055 for (auto& iter : oids) {
20effc67 5056 bucket_objs_ret.emplace(iter.first, rgw_cls_check_index_ret());
9f95a23c
TL
5057 }
5058
5059 ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77 5060 if (ret < 0) {
20effc67 5061 return ret;
31f18b77 5062 }
7c673cae 5063
20effc67
TL
5064 // aggregate results (from different shards if there are any)
5065 for (const auto& iter : bucket_objs_ret) {
5066 accumulate_raw_stats(iter.second.existing_header, *existing_stats);
5067 accumulate_raw_stats(iter.second.calculated_header, *calculated_stats);
7c673cae
FG
5068 }
5069
5070 return 0;
5071}
5072
b3b6e05e 5073int RGWRados::bucket_rebuild_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info)
7c673cae 5074{
9f95a23c 5075 RGWSI_RADOS::Pool index_pool;
7c673cae 5076 map<int, string> bucket_objs;
31f18b77 5077
b3b6e05e 5078 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77 5079 if (r < 0) {
7c673cae 5080 return r;
31f18b77 5081 }
7c673cae 5082
9f95a23c 5083 return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
5084}
5085
b3b6e05e 5086int RGWRados::bucket_set_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77 5087{
9f95a23c 5088 RGWSI_RADOS::Pool index_pool;
31f18b77
FG
5089 map<int, string> bucket_objs;
5090
b3b6e05e 5091 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77
FG
5092 if (r < 0) {
5093 return r;
5094 }
5095
9f95a23c 5096 return CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77 5097}
7c673cae 5098
b3b6e05e 5099int RGWRados::defer_gc(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
7c673cae
FG
5100{
5101 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5102 std::string oid, key;
5103 get_obj_bucket_and_oid_loc(obj, oid, key);
5104 if (!rctx)
5105 return 0;
5106
5107 RGWObjState *state = NULL;
5108
b3b6e05e 5109 int r = get_obj_state(dpp, rctx, bucket_info, obj, &state, false, y);
7c673cae
FG
5110 if (r < 0)
5111 return r;
5112
5113 if (!state->is_atomic) {
b3b6e05e 5114 ldpp_dout(dpp, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
7c673cae
FG
5115 return -EINVAL;
5116 }
5117
181888fb
FG
5118 string tag;
5119
5120 if (state->tail_tag.length() > 0) {
5121 tag = state->tail_tag.c_str();
5122 } else if (state->obj_tag.length() > 0) {
5123 tag = state->obj_tag.c_str();
5124 } else {
b3b6e05e 5125 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
7c673cae
FG
5126 return -EINVAL;
5127 }
5128
b3b6e05e 5129 ldpp_dout(dpp, 0) << "defer chain tag=" << tag << dendl;
7c673cae 5130
9f95a23c 5131 cls_rgw_obj_chain chain;
b3b6e05e 5132 update_gc_chain(dpp, state->obj, *state->manifest, &chain);
9f95a23c 5133 return gc->async_defer_chain(tag, chain);
7c673cae
FG
5134}
5135
5136void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
5137{
5138 list<string> prefixes;
5139 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
5140 cls_rgw_remove_obj(op, prefixes);
5141}
5142
5143void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
5144{
5145 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
5146}
5147
5148void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
5149{
5150 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
5151}
5152
9f95a23c
TL
5153struct tombstone_entry {
5154 ceph::real_time mtime;
5155 uint32_t zone_short_id;
5156 uint64_t pg_ver;
5157
5158 tombstone_entry() = default;
5159 explicit tombstone_entry(const RGWObjState& state)
5160 : mtime(state.mtime), zone_short_id(state.zone_short_id),
5161 pg_ver(state.pg_ver) {}
5162};
7c673cae
FG
5163
5164/**
5165 * Delete an object.
5166 * bucket: name of the bucket storing the object
5167 * obj: name of the object to delete
5168 * Returns: 0 on success, -ERR# otherwise.
5169 */
// Overview of the flow below:
//  - An instance of "null" is treated as an empty instance.
//  - Versioned path (bucket versioned, or an explicit marker version id is
//    given): either a delete marker is installed via set_olh(), or the named
//    instance is unlinked via unlink_obj_instance(); a data log entry is then
//    added and the function returns.
//  - Plain path: the If-Unmodified-Since and expiration-time preconditions
//    are checked against the object state, the bucket index op is prepared,
//    the head object is removed, and the index op is completed on success or
//    cancelled on failure. The quota stats are updated at the end.
b3b6e05e 5170int RGWRados::Object::Delete::delete_obj(optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
5171{
5172 RGWRados *store = target->get_store();
5173 rgw_obj& src_obj = target->get_obj();
5174 const string& instance = src_obj.key.instance;
5175 rgw_obj obj = src_obj;
5176
// work on a copy so that the "null" instance can be mapped to an empty one
5177 if (instance == "null") {
5178 obj.key.instance.clear();
5179 }
5180
5181 bool explicit_marker_version = (!params.marker_version_id.empty());
5182
5183 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
5184 if (instance.empty() || explicit_marker_version) {
// no specific instance requested (or marker id supplied): create a delete marker
5185 rgw_obj marker = obj;
5186
5187 if (!params.marker_version_id.empty()) {
5188 if (params.marker_version_id != "null") {
5189 marker.key.set_instance(params.marker_version_id);
5190 }
5191 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
5192 store->gen_rand_obj_instance_name(&marker);
5193 }
5194
5195 result.version_id = marker.key.instance;
91327a77
AA
5196 if (result.version_id.empty())
5197 result.version_id = "null";
7c673cae
FG
5198 result.delete_marker = true;
5199
5200 struct rgw_bucket_dir_entry_meta meta;
5201
5202 meta.owner = params.obj_owner.get_id().to_str();
5203 meta.owner_display_name = params.obj_owner.get_display_name();
5204
// fall back to the current time when the caller gave no mtime
5205 if (real_clock::is_zero(params.mtime)) {
5206 meta.mtime = real_clock::now();
5207 } else {
5208 meta.mtime = params.mtime;
5209 }
5210
b3b6e05e 5211 int r = store->set_olh(dpp, target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
7c673cae
FG
5212 if (r < 0) {
5213 return r;
5214 }
5215 } else {
// a specific instance was requested: unlink just that instance
5216 rgw_bucket_dir_entry dirent;
5217
b3b6e05e 5218 int r = store->bi_get_instance(dpp, target->get_bucket_info(), obj, &dirent);
7c673cae
FG
5219 if (r < 0) {
5220 return r;
5221 }
5222 result.delete_marker = dirent.is_delete_marker();
b3b6e05e 5223 r = store->unlink_obj_instance(dpp, target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
7c673cae
FG
5224 if (r < 0) {
5225 return r;
5226 }
5227 result.version_id = instance;
5228 }
5229
// both versioned branches end by adding a data log entry for the bucket shard
20effc67 5230 BucketShard *bs = nullptr;
b3b6e05e 5231 int r = target->get_bucket_shard(&bs, dpp);
7c673cae 5232 if (r < 0) {
b3b6e05e 5233 ldpp_dout(dpp, 5) << "failed to get BucketShard object: r=" << r << dendl;
7c673cae
FG
5234 return r;
5235 }
5236
b3b6e05e 5237 r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 5238 if (r < 0) {
b3b6e05e 5239 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
9f95a23c 5240 return r;
7c673cae
FG
5241 }
5242
5243 return 0;
5244 }
5245
// plain (non-versioned) path starts here
5246 rgw_rados_ref ref;
b3b6e05e 5247 int r = store->get_obj_head_ref(dpp, target->get_bucket_info(), obj, &ref);
7c673cae
FG
5248 if (r < 0) {
5249 return r;
5250 }
5251
5252 RGWObjState *state;
b3b6e05e 5253 r = target->get_state(dpp, &state, false, y);
7c673cae
FG
5254 if (r < 0)
5255 return r;
5256
5257 ObjectWriteOperation op;
5258
5259 if (!real_clock::is_zero(params.unmod_since)) {
5260 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
5261 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
// without high precision, compare at whole-second granularity
5262 if (!params.high_precision_time) {
5263 ctime.tv_nsec = 0;
5264 unmod.tv_nsec = 0;
5265 }
5266
b3b6e05e 5267 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
7c673cae
FG
5268 if (ctime > unmod) {
5269 return -ERR_PRECONDITION_FAILED;
5270 }
5271
5272 /* only delete object if mtime is less than or equal to params.unmod_since */
5273 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
5274 }
11fdf7f2 5275 uint64_t obj_accounted_size = state->accounted_size;
7c673cae 5276
// when params.abortmp is set, the size used for the quota update is params.parts_accounted_size
9f95a23c
TL
5277 if(params.abortmp) {
5278 obj_accounted_size = params.parts_accounted_size;
5279 }
5280
7c673cae
FG
5281 if (!real_clock::is_zero(params.expiration_time)) {
// the caller's expiration time must match the stored RGW_ATTR_DELETE_AT attr
5282 bufferlist bl;
5283 real_time delete_at;
5284
5285 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5286 try {
11fdf7f2
TL
5287 auto iter = bl.cbegin();
5288 decode(delete_at, iter);
7c673cae 5289 } catch (buffer::error& err) {
b3b6e05e 5290 ldpp_dout(dpp, 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
7c673cae
FG
5291 return -EIO;
5292 }
5293
5294 if (params.expiration_time != delete_at) {
5295 return -ERR_PRECONDITION_FAILED;
5296 }
5297 } else {
5298 return -ERR_PRECONDITION_FAILED;
5299 }
5300 }
5301
5302 if (!state->exists) {
5303 target->invalidate_state();
5304 return -ENOENT;
5305 }
5306
b3b6e05e 5307 r = target->prepare_atomic_modification(dpp, op, false, NULL, NULL, NULL, true, false, y);
7c673cae
FG
5308 if (r < 0)
5309 return r;
5310
5311 RGWBucketInfo& bucket_info = target->get_bucket_info();
5312
5313 RGWRados::Bucket bop(store, bucket_info);
5314 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
5315
5316 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
5317 index_op.set_bilog_flags(params.bilog_flags);
5318
// prepare the bucket index delete before touching the head object
b3b6e05e 5319 r = index_op.prepare(dpp, CLS_RGW_OP_DEL, &state->write_tag, y);
7c673cae
FG
5320 if (r < 0)
5321 return r;
5322
5323 store->remove_rgw_head_obj(op);
9f95a23c
TL
5324
5325 auto& ioctx = ref.pool.ioctx();
b3b6e05e 5326 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
94b18763
FG
5327
5328 /* raced with another operation, object state is indeterminate */
5329 const bool need_invalidate = (r == -ECANCELED);
7c673cae 5330
9f95a23c 5331 int64_t poolid = ioctx.get_id();
7c673cae
FG
5332 if (r >= 0) {
// head removal succeeded: add a tombstone cache entry (if a cache exists) and complete the index op
5333 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5334 if (obj_tombstone_cache) {
5335 tombstone_entry entry{*state};
5336 obj_tombstone_cache->add(obj, entry);
5337 }
b3b6e05e 5338 r = index_op.complete_del(dpp, poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 5339
b3b6e05e 5340 int ret = target->complete_atomic_modification(dpp);
7c673cae 5341 if (ret < 0) {
b3b6e05e 5342 ldpp_dout(dpp, 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
7c673cae
FG
5343 }
5344 /* other than that, no need to propagate error */
224ce89b 5345 } else {
// head removal failed: cancel the prepared index op; r keeps the original error
20effc67 5346 int ret = index_op.cancel(dpp, params.remove_objs);
224ce89b 5347 if (ret < 0) {
b3b6e05e 5348 ldpp_dout(dpp, 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
224ce89b 5349 }
7c673cae
FG
5350 }
5351
5352 if (need_invalidate) {
5353 target->invalidate_state();
5354 }
5355
5356 if (r < 0)
5357 return r;
5358
5359 /* update quota cache */
11fdf7f2 5360 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
7c673cae
FG
5361
5362 return 0;
5363}
5364
b3b6e05e
TL
5365int RGWRados::delete_obj(const DoutPrefixProvider *dpp,
5366 RGWObjectCtx& obj_ctx,
7c673cae
FG
5367 const RGWBucketInfo& bucket_info,
5368 const rgw_obj& obj,
20effc67 5369 int versioning_status, // versioning flags defined in enum RGWBucketFlags
7c673cae 5370 uint16_t bilog_flags,
31f18b77
FG
5371 const real_time& expiration_time,
5372 rgw_zone_set *zones_trace)
7c673cae
FG
5373{
5374 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5375 RGWRados::Object::Delete del_op(&del_target);
5376
5377 del_op.params.bucket_owner = bucket_info.owner;
5378 del_op.params.versioning_status = versioning_status;
5379 del_op.params.bilog_flags = bilog_flags;
5380 del_op.params.expiration_time = expiration_time;
31f18b77 5381 del_op.params.zones_trace = zones_trace;
7c673cae 5382
b3b6e05e 5383 return del_op.delete_obj(null_yield, dpp);
7c673cae
FG
5384}
5385
b3b6e05e 5386int RGWRados::delete_raw_obj(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj)
7c673cae
FG
5387{
5388 rgw_rados_ref ref;
b3b6e05e 5389 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
5390 if (r < 0) {
5391 return r;
5392 }
5393
5394 ObjectWriteOperation op;
5395
5396 op.remove();
b3b6e05e 5397 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
5398 if (r < 0)
5399 return r;
5400
5401 return 0;
5402}
5403
b3b6e05e 5404int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime, const DoutPrefixProvider *dpp)
7c673cae
FG
5405{
5406 std::string oid, key;
5407 get_obj_bucket_and_oid_loc(obj, oid, key);
5408
11fdf7f2 5409 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5410
5411 RGWBucketInfo bucket_info;
b3b6e05e 5412 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL, null_yield, dpp);
7c673cae 5413 if (ret < 0) {
b3b6e05e 5414 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
7c673cae
FG
5415 return ret;
5416 }
5417
5418 RGWRados::Bucket bop(this, bucket_info);
5419 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5420
b3b6e05e 5421 return index_op.complete_del(dpp, -1 /* pool */, 0, mtime, NULL);
7c673cae
FG
5422}
5423
20effc67 5424static void generate_fake_tag(const DoutPrefixProvider *dpp, rgw::sal::Store* store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
7c673cae
FG
5425{
5426 string tag;
5427
b3b6e05e
TL
5428 RGWObjManifest::obj_iterator mi = manifest.obj_begin(dpp);
5429 if (mi != manifest.obj_end(dpp)) {
7c673cae
FG
5430 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5431 ++mi;
20effc67
TL
5432 rgw::sal::RadosStore* rstore = dynamic_cast<rgw::sal::RadosStore*>(store);
5433 tag = mi.get_location().get_raw_obj(rstore).oid;
7c673cae
FG
5434 tag.append("_");
5435 }
5436
5437 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5438 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5439 MD5 hash;
20effc67
TL
5440 // Allow use of MD5 digest in FIPS mode for non-cryptographic purposes
5441 hash.SetFlags(EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
11fdf7f2 5442 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
7c673cae
FG
5443
5444 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5445 if (iter != attrset.end()) {
5446 bufferlist& bl = iter->second;
11fdf7f2 5447 hash.Update((const unsigned char *)bl.c_str(), bl.length());
7c673cae
FG
5448 }
5449
5450 hash.Final(md5);
5451 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5452 tag.append(md5_str);
5453
20effc67 5454 ldpp_dout(dpp, 10) << "generate_fake_tag new tag=" << tag << dendl;
7c673cae
FG
5455
5456 tag_bl.append(tag.c_str(), tag.size() + 1);
5457}
5458
5459static bool is_olh(map<string, bufferlist>& attrs)
5460{
5461 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5462 return (iter != attrs.end());
5463}
5464
5465static bool has_olh_tag(map<string, bufferlist>& attrs)
5466{
5467 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5468 return (iter != attrs.end());
5469}
5470
b3b6e05e 5471int RGWRados::get_olh_target_state(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5472 RGWObjState *olh_state, RGWObjState **target_state, optional_yield y)
7c673cae 5473{
11fdf7f2 5474 ceph_assert(olh_state->is_olh);
7c673cae
FG
5475
5476 rgw_obj target;
b3b6e05e 5477 int r = RGWRados::follow_olh(dpp, bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
7c673cae
FG
5478 if (r < 0) {
5479 return r;
5480 }
b3b6e05e 5481 r = get_obj_state(dpp, &obj_ctx, bucket_info, target, target_state, false, y);
7c673cae
FG
5482 if (r < 0) {
5483 return r;
5484 }
5485
5486 return 0;
5487}
5488
b3b6e05e 5489int RGWRados::get_obj_state_impl(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5490 RGWObjState **state, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5491{
5492 if (obj.empty()) {
5493 return -EINVAL;
5494 }
5495
5496 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5497
11fdf7f2 5498 RGWObjState *s = rctx->get_state(obj);
b3b6e05e 5499 ldpp_dout(dpp, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
7c673cae
FG
5500 *state = s;
5501 if (s->has_attrs) {
5502 if (s->is_olh && need_follow_olh) {
b3b6e05e 5503 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
7c673cae
FG
5504 }
5505 return 0;
5506 }
5507
5508 s->obj = obj;
5509
5510 rgw_raw_obj raw_obj;
5511 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5512
5513 int r = -ENOENT;
5514
5515 if (!assume_noent) {
b3b6e05e 5516 r = RGWRados::raw_obj_stat(dpp, raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
7c673cae
FG
5517 }
5518
5519 if (r == -ENOENT) {
5520 s->exists = false;
5521 s->has_attrs = true;
5522 tombstone_entry entry;
5523 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5524 s->mtime = entry.mtime;
5525 s->zone_short_id = entry.zone_short_id;
5526 s->pg_ver = entry.pg_ver;
b3b6e05e 5527 ldpp_dout(dpp, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
7c673cae
FG
5528 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5529 } else {
5530 s->mtime = real_time();
5531 }
5532 return 0;
5533 }
5534 if (r < 0)
5535 return r;
5536
5537 s->exists = true;
5538 s->has_attrs = true;
5539 s->accounted_size = s->size;
5540
11fdf7f2
TL
5541 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5542 if (iter != s->attrset.end()) {
5543 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5544 bufferlist& bletag = iter->second;
5545 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5546 bufferlist newbl;
5547 bletag.splice(0, bletag.length() - 1, &newbl);
f67539c2 5548 bletag = std::move(newbl);
11fdf7f2
TL
5549 }
5550 }
5551
5552 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
5553 const bool compressed = (iter != s->attrset.end());
5554 if (compressed) {
7c673cae
FG
5555 // use uncompressed size for accounted_size
5556 try {
5557 RGWCompressionInfo info;
11fdf7f2
TL
5558 auto p = iter->second.cbegin();
5559 decode(info, p);
31f18b77 5560 s->accounted_size = info.orig_size;
7c673cae 5561 } catch (buffer::error&) {
b3b6e05e 5562 ldpp_dout(dpp, 0) << "ERROR: could not decode compression info for object: " << obj << dendl;
7c673cae
FG
5563 return -EIO;
5564 }
5565 }
5566
5567 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5568 if (iter != s->attrset.end()) {
5569 bufferlist bl = iter->second;
5570 bufferlist::iterator it = bl.begin();
5571 it.copy(bl.length(), s->shadow_obj);
5572 s->shadow_obj[bl.length()] = '\0';
5573 }
5574 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
5575 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5576 if (ttiter != s->attrset.end()) {
5577 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5578 }
7c673cae
FG
5579
5580 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5581 if (manifest_bl.length()) {
11fdf7f2 5582 auto miter = manifest_bl.cbegin();
7c673cae 5583 try {
9f95a23c
TL
5584 s->manifest.emplace();
5585 decode(*s->manifest, miter);
5586 s->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
7c673cae 5587 broken due to old bugs */
9f95a23c 5588 s->size = s->manifest->get_obj_size();
31f18b77
FG
5589 if (!compressed)
5590 s->accounted_size = s->size;
7c673cae 5591 } catch (buffer::error& err) {
b3b6e05e 5592 ldpp_dout(dpp, 0) << "ERROR: couldn't decode manifest" << dendl;
7c673cae
FG
5593 return -EIO;
5594 }
b3b6e05e 5595 ldpp_dout(dpp, 10) << "manifest: total_size = " << s->manifest->get_obj_size() << dendl;
11fdf7f2 5596 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
9f95a23c 5597 s->manifest->has_explicit_objs()) {
7c673cae 5598 RGWObjManifest::obj_iterator mi;
b3b6e05e
TL
5599 for (mi = s->manifest->obj_begin(dpp); mi != s->manifest->obj_end(dpp); ++mi) {
5600 ldpp_dout(dpp, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(store) << dendl;
7c673cae
FG
5601 }
5602 }
5603
5604 if (!s->obj_tag.length()) {
5605 /*
5606 * Uh oh, something's wrong, object with manifest should have tag. Let's
5607 * create one out of the manifest, would be unique
5608 */
b3b6e05e 5609 generate_fake_tag(dpp, store, s->attrset, *s->manifest, manifest_bl, s->obj_tag);
7c673cae
FG
5610 s->fake_tag = true;
5611 }
5612 }
5613 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5614 if (aiter != s->attrset.end()) {
5615 bufferlist& pg_ver_bl = aiter->second;
5616 if (pg_ver_bl.length()) {
11fdf7f2 5617 auto pgbl = pg_ver_bl.cbegin();
7c673cae 5618 try {
11fdf7f2 5619 decode(s->pg_ver, pgbl);
7c673cae 5620 } catch (buffer::error& err) {
b3b6e05e 5621 ldpp_dout(dpp, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
7c673cae
FG
5622 }
5623 }
5624 }
5625 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5626 if (aiter != s->attrset.end()) {
5627 bufferlist& zone_short_id_bl = aiter->second;
5628 if (zone_short_id_bl.length()) {
11fdf7f2 5629 auto zbl = zone_short_id_bl.cbegin();
7c673cae 5630 try {
11fdf7f2 5631 decode(s->zone_short_id, zbl);
7c673cae 5632 } catch (buffer::error& err) {
b3b6e05e 5633 ldpp_dout(dpp, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
7c673cae
FG
5634 }
5635 }
5636 }
b3b6e05e
TL
5637 if (s->obj_tag.length()) {
5638 ldpp_dout(dpp, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
5639 } else {
5640 ldpp_dout(dpp, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5641 }
7c673cae
FG
5642
5643 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5644 * it exist, and not only if is_olh() returns true
5645 */
5646 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5647 if (iter != s->attrset.end()) {
5648 s->olh_tag = iter->second;
5649 }
5650
5651 if (is_olh(s->attrset)) {
5652 s->is_olh = true;
5653
b3b6e05e 5654 ldpp_dout(dpp, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
7c673cae
FG
5655
5656 if (need_follow_olh) {
b3b6e05e 5657 return get_olh_target_state(dpp, *rctx, bucket_info, obj, s, state, y);
9f95a23c 5658 } else if (obj.key.have_null_instance() && !s->manifest) {
11fdf7f2
TL
5659 // read null version, and the head object only have olh info
5660 s->exists = false;
5661 return -ENOENT;
7c673cae
FG
5662 }
5663 }
5664
5665 return 0;
5666}
5667
b3b6e05e 5668int RGWRados::get_obj_state(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9f95a23c 5669 bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5670{
5671 int ret;
5672
5673 do {
b3b6e05e 5674 ret = get_obj_state_impl(dpp, rctx, bucket_info, obj, state, follow_olh, y, assume_noent);
7c673cae
FG
5675 } while (ret == -EAGAIN);
5676
5677 return ret;
5678}
5679
b3b6e05e 5680int RGWRados::Object::get_manifest(const DoutPrefixProvider *dpp, RGWObjManifest **pmanifest, optional_yield y)
7c673cae
FG
5681{
5682 RGWObjState *astate;
b3b6e05e 5683 int r = get_state(dpp, &astate, true, y);
7c673cae
FG
5684 if (r < 0) {
5685 return r;
5686 }
5687
9f95a23c 5688 *pmanifest = &(*astate->manifest);
7c673cae
FG
5689
5690 return 0;
5691}
5692
b3b6e05e 5693int RGWRados::Object::Read::get_attr(const DoutPrefixProvider *dpp, const char *name, bufferlist& dest, optional_yield y)
7c673cae
FG
5694{
5695 RGWObjState *state;
b3b6e05e 5696 int r = source->get_state(dpp, &state, true, y);
7c673cae
FG
5697 if (r < 0)
5698 return r;
5699 if (!state->exists)
5700 return -ENOENT;
5701 if (!state->get_attr(name, dest))
5702 return -ENODATA;
5703
5704 return 0;
5705}
5706
b3b6e05e 5707int RGWRados::Object::Stat::stat_async(const DoutPrefixProvider *dpp)
7c673cae
FG
5708{
5709 RGWObjectCtx& ctx = source->get_ctx();
5710 rgw_obj& obj = source->get_obj();
5711 RGWRados *store = source->get_store();
5712
11fdf7f2 5713 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
7c673cae
FG
5714 result.obj = obj;
5715 if (s->has_attrs) {
5716 state.ret = 0;
5717 result.size = s->size;
5718 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5719 result.attrs = s->attrset;
7c673cae
FG
5720 result.manifest = s->manifest;
5721 return 0;
5722 }
5723
5724 string oid;
5725 string loc;
5726 get_obj_bucket_and_oid_loc(obj, oid, loc);
5727
b3b6e05e 5728 int r = store->get_obj_head_ioctx(dpp, source->get_bucket_info(), obj, &state.io_ctx);
7c673cae
FG
5729 if (r < 0) {
5730 return r;
5731 }
5732
5733 librados::ObjectReadOperation op;
5734 op.stat2(&result.size, &result.mtime, NULL);
5735 op.getxattrs(&result.attrs, NULL);
9f95a23c 5736 state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
7c673cae
FG
5737 state.io_ctx.locator_set_key(loc);
5738 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5739 if (r < 0) {
b3b6e05e 5740 ldpp_dout(dpp, 5) << __func__
7c673cae
FG
5741 << ": ERROR: aio_operate() returned ret=" << r
5742 << dendl;
5743 return r;
5744 }
5745
5746 return 0;
5747}
5748
5749
20effc67 5750int RGWRados::Object::Stat::wait(const DoutPrefixProvider *dpp)
7c673cae
FG
5751{
5752 if (!state.completion) {
5753 return state.ret;
5754 }
5755
9f95a23c 5756 state.completion->wait_for_complete();
7c673cae
FG
5757 state.ret = state.completion->get_return_value();
5758 state.completion->release();
5759
5760 if (state.ret != 0) {
5761 return state.ret;
5762 }
5763
20effc67 5764 return finish(dpp);
7c673cae
FG
5765}
5766
20effc67 5767int RGWRados::Object::Stat::finish(const DoutPrefixProvider *dpp)
7c673cae
FG
5768{
5769 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5770 if (iter != result.attrs.end()) {
5771 bufferlist& bl = iter->second;
11fdf7f2 5772 auto biter = bl.cbegin();
7c673cae 5773 try {
9f95a23c
TL
5774 result.manifest.emplace();
5775 decode(*result.manifest, biter);
7c673cae 5776 } catch (buffer::error& err) {
20effc67 5777 ldpp_dout(dpp, 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
7c673cae
FG
5778 return -EIO;
5779 }
7c673cae
FG
5780 }
5781
5782 return 0;
5783}
5784
b3b6e05e 5785int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp, RGWObjectCtx *rctx,
7c673cae 5786 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5787 ObjectOperation& op, RGWObjState **pstate, optional_yield y)
7c673cae
FG
5788{
5789 if (!rctx)
5790 return 0;
5791
b3b6e05e 5792 int r = get_obj_state(dpp, rctx, bucket_info, obj, pstate, false, y);
7c673cae
FG
5793 if (r < 0)
5794 return r;
5795
b3b6e05e 5796 return append_atomic_test(dpp, *pstate, op);
11fdf7f2 5797}
7c673cae 5798
b3b6e05e
TL
5799int RGWRados::append_atomic_test(const DoutPrefixProvider *dpp,
5800 const RGWObjState* state,
11fdf7f2
TL
5801 librados::ObjectOperation& op)
5802{
7c673cae 5803 if (!state->is_atomic) {
b3b6e05e 5804 ldpp_dout(dpp, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
7c673cae
FG
5805 return 0;
5806 }
5807
5808 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
5809 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5810 } else {
b3b6e05e 5811 ldpp_dout(dpp, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
7c673cae
FG
5812 }
5813 return 0;
5814}
5815
b3b6e05e 5816int RGWRados::Object::get_state(const DoutPrefixProvider *dpp, RGWObjState **pstate, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae 5817{
b3b6e05e 5818 return store->get_obj_state(dpp, &ctx, bucket_info, obj, pstate, follow_olh, y, assume_noent);
7c673cae
FG
5819}
5820
5821void RGWRados::Object::invalidate_state()
5822{
11fdf7f2 5823 ctx.invalidate(obj);
7c673cae
FG
5824}
5825
20effc67 5826int RGWRados::Object::prepare_atomic_modification(const DoutPrefixProvider *dpp,
b3b6e05e 5827 ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb 5828 const char *if_match, const char *if_nomatch, bool removal_op,
9f95a23c 5829 bool modify_tail, optional_yield y)
7c673cae 5830{
b3b6e05e 5831 int r = get_state(dpp, &state, false, y);
7c673cae
FG
5832 if (r < 0)
5833 return r;
5834
9f95a23c 5835 bool need_guard = ((state->manifest) || (state->obj_tag.length() != 0) ||
7c673cae
FG
5836 if_match != NULL || if_nomatch != NULL) &&
5837 (!state->fake_tag);
5838
5839 if (!state->is_atomic) {
b3b6e05e 5840 ldpp_dout(dpp, 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
7c673cae
FG
5841
5842 if (reset_obj) {
5843 op.create(false);
5844 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
5845 }
5846
5847 return 0;
5848 }
5849
5850 if (need_guard) {
5851 /* first verify that the object wasn't replaced under */
5852 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
5853 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5854 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5855 }
5856
5857 if (if_match) {
5858 if (strcmp(if_match, "*") == 0) {
5859 // test the object is existing
5860 if (!state->exists) {
5861 return -ERR_PRECONDITION_FAILED;
5862 }
5863 } else {
5864 bufferlist bl;
5865 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5866 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
5867 return -ERR_PRECONDITION_FAILED;
5868 }
5869 }
5870 }
5871
5872 if (if_nomatch) {
5873 if (strcmp(if_nomatch, "*") == 0) {
5874 // test the object is NOT existing
5875 if (state->exists) {
5876 return -ERR_PRECONDITION_FAILED;
5877 }
5878 } else {
5879 bufferlist bl;
5880 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5881 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
5882 return -ERR_PRECONDITION_FAILED;
5883 }
5884 }
5885 }
5886 }
5887
5888 if (reset_obj) {
5889 if (state->exists) {
5890 op.create(false);
5891 store->remove_rgw_head_obj(op);
5892 } else {
5893 op.create(true);
5894 }
5895 }
5896
5897 if (removal_op) {
5898 /* the object is being removed, no need to update its tag */
5899 return 0;
5900 }
5901
5902 if (ptag) {
5903 state->write_tag = *ptag;
5904 } else {
5905 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
5906 }
5907 bufferlist bl;
5908 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
5909
b3b6e05e 5910 ldpp_dout(dpp, 10) << "setting object write_tag=" << state->write_tag << dendl;
7c673cae
FG
5911
5912 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
5913 if (modify_tail) {
5914 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
5915 }
7c673cae
FG
5916
5917 return 0;
5918}
5919
7c673cae
FG
5920/**
5921 * Set an attr on an object.
5922 * bucket: name of the bucket holding the object
5923 * obj: name of the object to set the attr on
5924 * name: the attr to set
5925 * bl: the contents of the attr
5926 * Returns: 0 on success, -ERR# otherwise.
5927 */
b3b6e05e 5928int RGWRados::set_attr(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
7c673cae
FG
5929{
5930 map<string, bufferlist> attrs;
5931 attrs[name] = bl;
b3b6e05e 5932 return set_attrs(dpp, ctx, bucket_info, obj, attrs, NULL, null_yield);
7c673cae
FG
5933}
5934
b3b6e05e 5935int RGWRados::set_attrs(const DoutPrefixProvider *dpp, void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
7c673cae 5936 map<string, bufferlist>& attrs,
9f95a23c
TL
5937 map<string, bufferlist>* rmattrs,
5938 optional_yield y)
7c673cae 5939{
494da23a
TL
5940 rgw_obj obj = src_obj;
5941 if (obj.key.instance == "null") {
5942 obj.key.instance.clear();
5943 }
5944
7c673cae 5945 rgw_rados_ref ref;
b3b6e05e 5946 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
5947 if (r < 0) {
5948 return r;
5949 }
5950 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5951
5952 ObjectWriteOperation op;
5953 RGWObjState *state = NULL;
5954
b3b6e05e 5955 r = append_atomic_test(dpp, rctx, bucket_info, obj, op, &state, y);
7c673cae
FG
5956 if (r < 0)
5957 return r;
5958
494da23a 5959 // ensure null version object exist
9f95a23c 5960 if (src_obj.key.instance == "null" && !state->manifest) {
494da23a
TL
5961 return -ENOENT;
5962 }
5963
7c673cae
FG
5964 map<string, bufferlist>::iterator iter;
5965 if (rmattrs) {
5966 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5967 const string& name = iter->first;
5968 op.rmxattr(name.c_str());
5969 }
5970 }
5971
5972 const rgw_bucket& bucket = obj.bucket;
5973
5974 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5975 const string& name = iter->first;
5976 bufferlist& bl = iter->second;
5977
5978 if (!bl.length())
5979 continue;
5980
5981 op.setxattr(name.c_str(), bl);
5982
5983 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
5984 real_time ts;
5985 try {
11fdf7f2 5986 decode(ts, bl);
7c673cae
FG
5987
5988 rgw_obj_index_key obj_key;
5989 obj.key.get_index_key(&obj_key);
5990
b3b6e05e 5991 obj_expirer->hint_add(dpp, ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
7c673cae 5992 } catch (buffer::error& err) {
b3b6e05e 5993 ldpp_dout(dpp, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
7c673cae
FG
5994 }
5995 }
5996 }
5997
5998 if (!op.size())
5999 return 0;
6000
9f95a23c 6001 RGWObjectCtx obj_ctx(this->store);
7c673cae
FG
6002
6003 bufferlist bl;
6004 RGWRados::Bucket bop(this, bucket_info);
6005 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
6006
6007 if (state) {
6008 string tag;
6009 append_rand_alpha(cct, tag, tag, 32);
6010 state->write_tag = tag;
b3b6e05e 6011 r = index_op.prepare(dpp, CLS_RGW_OP_ADD, &state->write_tag, y);
7c673cae
FG
6012
6013 if (r < 0)
6014 return r;
6015
6016 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
6017 op.setxattr(RGW_ATTR_ID_TAG, bl);
6018 }
6019
3efd9988
FG
6020
6021 real_time mtime = real_clock::now();
6022 struct timespec mtime_ts = real_clock::to_timespec(mtime);
6023 op.mtime2(&mtime_ts);
9f95a23c 6024 auto& ioctx = ref.pool.ioctx();
b3b6e05e 6025 r = rgw_rados_operate(dpp, ioctx, ref.obj.oid, &op, null_yield);
7c673cae
FG
6026 if (state) {
6027 if (r >= 0) {
6028 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
6029 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
6030 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
11fdf7f2
TL
6031 string etag = rgw_bl_str(etag_bl);
6032 string content_type = rgw_bl_str(content_type_bl);
6033 string storage_class;
6034 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
6035 if (iter != attrs.end()) {
6036 storage_class = rgw_bl_str(iter->second);
6037 }
9f95a23c
TL
6038 uint64_t epoch = ioctx.get_last_version();
6039 int64_t poolid = ioctx.get_id();
b3b6e05e 6040 r = index_op.complete(dpp, poolid, epoch, state->size, state->accounted_size,
11fdf7f2
TL
6041 mtime, etag, content_type, storage_class, &acl_bl,
6042 RGWObjCategory::Main, NULL);
7c673cae 6043 } else {
20effc67 6044 int ret = index_op.cancel(dpp, nullptr);
7c673cae 6045 if (ret < 0) {
b3b6e05e 6046 ldpp_dout(dpp, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
7c673cae
FG
6047 }
6048 }
6049 }
6050 if (r < 0)
6051 return r;
6052
6053 if (state) {
6054 state->obj_tag.swap(bl);
6055 if (rmattrs) {
6056 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6057 state->attrset.erase(iter->first);
6058 }
6059 }
92f5a8d4 6060
7c673cae
FG
6061 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6062 state->attrset[iter->first] = iter->second;
6063 }
92f5a8d4
TL
6064
6065 auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
6066 if (iter != state->attrset.end()) {
6067 iter->second = state->obj_tag;
6068 }
7c673cae
FG
6069 }
6070
6071 return 0;
6072}
6073
b3b6e05e 6074int RGWRados::Object::Read::prepare(optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
6075{
6076 RGWRados *store = source->get_store();
6077 CephContext *cct = store->ctx();
6078
6079 bufferlist etag;
6080
6081 map<string, bufferlist>::iterator iter;
6082
6083 RGWObjState *astate;
b3b6e05e 6084 int r = source->get_state(dpp, &astate, true, y);
7c673cae
FG
6085 if (r < 0)
6086 return r;
6087
6088 if (!astate->exists) {
6089 return -ENOENT;
6090 }
6091
6092 const RGWBucketInfo& bucket_info = source->get_bucket_info();
6093
6094 state.obj = astate->obj;
6095 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
6096
11fdf7f2
TL
6097 state.cur_pool = state.head_obj.pool;
6098 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
6099
b3b6e05e 6100 r = store->get_obj_head_ioctx(dpp, bucket_info, state.obj, state.cur_ioctx);
7c673cae
FG
6101 if (r < 0) {
6102 return r;
6103 }
eafe8130
TL
6104 if (params.target_obj) {
6105 *params.target_obj = state.obj;
6106 }
7c673cae
FG
6107 if (params.attrs) {
6108 *params.attrs = astate->attrset;
11fdf7f2 6109 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
7c673cae 6110 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
b3b6e05e 6111 ldpp_dout(dpp, 20) << "Read xattr rgw_rados: " << iter->first << dendl;
7c673cae
FG
6112 }
6113 }
6114 }
6115
6116 /* Convert all times go GMT to make them compatible */
6117 if (conds.mod_ptr || conds.unmod_ptr) {
6118 obj_time_weight src_weight;
6119 src_weight.init(astate);
6120 src_weight.high_precision = conds.high_precision_time;
6121
6122 obj_time_weight dest_weight;
6123 dest_weight.high_precision = conds.high_precision_time;
6124
9f95a23c 6125 if (conds.mod_ptr && !conds.if_nomatch) {
7c673cae 6126 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
b3b6e05e 6127 ldpp_dout(dpp, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
7c673cae
FG
6128 if (!(dest_weight < src_weight)) {
6129 return -ERR_NOT_MODIFIED;
6130 }
6131 }
6132
9f95a23c 6133 if (conds.unmod_ptr && !conds.if_match) {
7c673cae 6134 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
b3b6e05e 6135 ldpp_dout(dpp, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
7c673cae
FG
6136 if (dest_weight < src_weight) {
6137 return -ERR_PRECONDITION_FAILED;
6138 }
6139 }
6140 }
6141 if (conds.if_match || conds.if_nomatch) {
b3b6e05e 6142 r = get_attr(dpp, RGW_ATTR_ETAG, etag, y);
7c673cae
FG
6143 if (r < 0)
6144 return r;
6145
6146 if (conds.if_match) {
6147 string if_match_str = rgw_string_unquote(conds.if_match);
b3b6e05e 6148 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
11fdf7f2 6149 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
7c673cae
FG
6150 return -ERR_PRECONDITION_FAILED;
6151 }
6152 }
6153
6154 if (conds.if_nomatch) {
6155 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
b3b6e05e 6156 ldpp_dout(dpp, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
11fdf7f2 6157 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
7c673cae
FG
6158 return -ERR_NOT_MODIFIED;
6159 }
6160 }
6161 }
6162
6163 if (params.obj_size)
6164 *params.obj_size = astate->size;
6165 if (params.lastmod)
6166 *params.lastmod = astate->mtime;
6167
6168 return 0;
6169}
6170
6171int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
6172{
6173 if (ofs < 0) {
6174 ofs += obj_size;
11fdf7f2
TL
6175 if (ofs < 0)
6176 ofs = 0;
6177 end = obj_size - 1;
6178 } else if (end < 0) {
6179 end = obj_size - 1;
7c673cae
FG
6180 }
6181
11fdf7f2
TL
6182 if (obj_size > 0) {
6183 if (ofs >= (off_t)obj_size) {
6184 return -ERANGE;
6185 }
6186 if (end >= (off_t)obj_size) {
6187 end = obj_size - 1;
7c673cae
FG
6188 }
6189 }
7c673cae
FG
6190 return 0;
6191}
6192
b3b6e05e 6193int RGWRados::Bucket::UpdateIndex::guard_reshard(const DoutPrefixProvider *dpp, BucketShard **pbs, std::function<int(BucketShard *)> call)
31f18b77
FG
6194{
6195 RGWRados *store = target->get_store();
20effc67 6196 BucketShard *bs = nullptr;
31f18b77
FG
6197 int r;
6198
6199#define NUM_RESHARD_RETRIES 10
6200 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
b3b6e05e 6201 int ret = get_bucket_shard(&bs, dpp);
31f18b77 6202 if (ret < 0) {
b3b6e05e 6203 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
31f18b77
FG
6204 return ret;
6205 }
6206 r = call(bs);
6207 if (r != -ERR_BUSY_RESHARDING) {
6208 break;
6209 }
b3b6e05e 6210 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
31f18b77 6211 string new_bucket_id;
11fdf7f2 6212 r = store->block_while_resharding(bs, &new_bucket_id,
b3b6e05e 6213 target->bucket_info, null_yield, dpp);
31f18b77
FG
6214 if (r == -ERR_BUSY_RESHARDING) {
6215 continue;
6216 }
6217 if (r < 0) {
6218 return r;
6219 }
b3b6e05e 6220 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
31f18b77 6221 i = 0; /* resharding is finished, make sure we can retry */
b3b6e05e 6222 r = target->update_bucket_id(new_bucket_id, dpp);
31f18b77 6223 if (r < 0) {
b3b6e05e 6224 ldpp_dout(dpp, 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
31f18b77
FG
6225 return r;
6226 }
6227 invalidate_bs();
81eedcae 6228 } // for loop
31f18b77
FG
6229
6230 if (r < 0) {
6231 return r;
6232 }
6233
6234 if (pbs) {
6235 *pbs = bs;
6236 }
6237
6238 return 0;
6239}
6240
b3b6e05e 6241int RGWRados::Bucket::UpdateIndex::prepare(const DoutPrefixProvider *dpp, RGWModifyOp op, const string *write_tag, optional_yield y)
7c673cae
FG
6242{
6243 if (blind) {
6244 return 0;
6245 }
6246 RGWRados *store = target->get_store();
7c673cae
FG
6247
6248 if (write_tag && write_tag->length()) {
6249 optag = string(write_tag->c_str(), write_tag->length());
6250 } else {
6251 if (optag.empty()) {
6252 append_rand_alpha(store->ctx(), optag, optag, 32);
6253 }
6254 }
6255
b3b6e05e
TL
6256 int r = guard_reshard(dpp, nullptr, [&](BucketShard *bs) -> int {
6257 return store->cls_obj_prepare_op(dpp, *bs, op, optag, obj, bilog_flags, y, zones_trace);
f64942e4 6258 });
31f18b77 6259
7c673cae
FG
6260 if (r < 0) {
6261 return r;
6262 }
6263 prepared = true;
31f18b77 6264
7c673cae
FG
6265 return 0;
6266}
6267
b3b6e05e 6268int RGWRados::Bucket::UpdateIndex::complete(const DoutPrefixProvider *dpp, int64_t poolid, uint64_t epoch,
7c673cae
FG
6269 uint64_t size, uint64_t accounted_size,
6270 ceph::real_time& ut, const string& etag,
11fdf7f2 6271 const string& content_type, const string& storage_class,
7c673cae
FG
6272 bufferlist *acl_bl,
6273 RGWObjCategory category,
11fdf7f2
TL
6274 list<rgw_obj_index_key> *remove_objs, const string *user_data,
6275 bool appendable)
7c673cae
FG
6276{
6277 if (blind) {
6278 return 0;
6279 }
6280 RGWRados *store = target->get_store();
20effc67 6281 BucketShard *bs = nullptr;
31f18b77 6282
b3b6e05e 6283 int ret = get_bucket_shard(&bs, dpp);
7c673cae 6284 if (ret < 0) {
b3b6e05e 6285 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
7c673cae
FG
6286 return ret;
6287 }
6288
6289 rgw_bucket_dir_entry ent;
6290 obj.key.get_index_key(&ent.key);
6291 ent.meta.size = size;
6292 ent.meta.accounted_size = accounted_size;
6293 ent.meta.mtime = ut;
6294 ent.meta.etag = etag;
11fdf7f2 6295 ent.meta.storage_class = storage_class;
7c673cae
FG
6296 if (user_data)
6297 ent.meta.user_data = *user_data;
6298
6299 ACLOwner owner;
6300 if (acl_bl && acl_bl->length()) {
20effc67 6301 int ret = store->decode_policy(dpp, *acl_bl, &owner);
7c673cae 6302 if (ret < 0) {
b3b6e05e 6303 ldpp_dout(dpp, 0) << "WARNING: could not decode policy ret=" << ret << dendl;
7c673cae
FG
6304 }
6305 }
6306 ent.meta.owner = owner.get_id().to_str();
6307 ent.meta.owner_display_name = owner.get_display_name();
6308 ent.meta.content_type = content_type;
11fdf7f2 6309 ent.meta.appendable = appendable;
7c673cae 6310
31f18b77 6311 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 6312
b3b6e05e 6313 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6314 if (r < 0) {
b3b6e05e 6315 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6316 }
6317
6318 return ret;
6319}
6320
20effc67 6321int RGWRados::Bucket::UpdateIndex::complete_del(const DoutPrefixProvider *dpp,
b3b6e05e 6322 int64_t poolid, uint64_t epoch,
7c673cae
FG
6323 real_time& removed_mtime,
6324 list<rgw_obj_index_key> *remove_objs)
6325{
6326 if (blind) {
6327 return 0;
6328 }
6329 RGWRados *store = target->get_store();
20effc67 6330 BucketShard *bs = nullptr;
31f18b77 6331
b3b6e05e 6332 int ret = get_bucket_shard(&bs, dpp);
7c673cae 6333 if (ret < 0) {
b3b6e05e 6334 ldpp_dout(dpp, 5) << "failed to get BucketShard object: ret=" << ret << dendl;
7c673cae
FG
6335 return ret;
6336 }
6337
31f18b77 6338 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 6339
b3b6e05e 6340 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6341 if (r < 0) {
b3b6e05e 6342 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6343 }
6344
6345 return ret;
6346}
6347
6348
20effc67
TL
6349int RGWRados::Bucket::UpdateIndex::cancel(const DoutPrefixProvider *dpp,
6350 list<rgw_obj_index_key> *remove_objs)
7c673cae
FG
6351{
6352 if (blind) {
6353 return 0;
6354 }
6355 RGWRados *store = target->get_store();
6356 BucketShard *bs;
7c673cae 6357
b3b6e05e 6358 int ret = guard_reshard(dpp, &bs, [&](BucketShard *bs) -> int {
20effc67 6359 return store->cls_obj_complete_cancel(*bs, optag, obj, remove_objs, bilog_flags, zones_trace);
f64942e4 6360 });
7c673cae
FG
6361
6362 /*
6363 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6364 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6365 * have no way to tell that they're all caught up
6366 */
b3b6e05e 6367 int r = store->svc.datalog_rados->add_entry(dpp, target->bucket_info, bs->shard_id);
9f95a23c 6368 if (r < 0) {
b3b6e05e 6369 ldpp_dout(dpp, -1) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6370 }
6371
6372 return ret;
6373}
6374
b3b6e05e 6375int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y, const DoutPrefixProvider *dpp)
7c673cae
FG
6376{
6377 RGWRados *store = source->get_store();
7c673cae 6378
7c673cae
FG
6379 rgw_raw_obj read_obj;
6380 uint64_t read_ofs = ofs;
6381 uint64_t len, read_len;
6382 bool reading_from_head = true;
6383 ObjectReadOperation op;
6384
6385 bool merge_bl = false;
6386 bufferlist *pbl = &bl;
6387 bufferlist read_bl;
6388 uint64_t max_chunk_size;
6389
6390 RGWObjState *astate;
b3b6e05e 6391 int r = source->get_state(dpp, &astate, true, y);
7c673cae
FG
6392 if (r < 0)
6393 return r;
6394
11fdf7f2
TL
6395 if (astate->size == 0) {
6396 end = 0;
6397 } else if (end >= (int64_t)astate->size) {
6398 end = astate->size - 1;
6399 }
6400
7c673cae
FG
6401 if (end < 0)
6402 len = 0;
6403 else
6404 len = end - ofs + 1;
6405
9f95a23c 6406 if (astate->manifest && astate->manifest->has_tail()) {
7c673cae 6407 /* now get the relevant object part */
b3b6e05e 6408 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
7c673cae
FG
6409
6410 uint64_t stripe_ofs = iter.get_stripe_ofs();
f67539c2 6411 read_obj = iter.get_location().get_raw_obj(store->store);
11fdf7f2 6412 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6413 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6414 reading_from_head = (read_obj == state.head_obj);
6415 } else {
6416 read_obj = state.head_obj;
6417 }
6418
b3b6e05e 6419 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size, dpp);
7c673cae 6420 if (r < 0) {
b3b6e05e 6421 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
7c673cae
FG
6422 return r;
6423 }
6424
6425 if (len > max_chunk_size)
6426 len = max_chunk_size;
6427
6428
7c673cae
FG
6429 read_len = len;
6430
6431 if (reading_from_head) {
6432 /* only when reading from the head object do we need to do the atomic test */
b3b6e05e 6433 r = store->append_atomic_test(dpp, &source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, y);
7c673cae
FG
6434 if (r < 0)
6435 return r;
6436
6437 if (astate && astate->prefetch_data) {
6438 if (!ofs && astate->data.length() >= len) {
6439 bl = astate->data;
6440 return bl.length();
6441 }
6442
6443 if (ofs < astate->data.length()) {
11fdf7f2 6444 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
9f95a23c 6445 astate->data.begin(ofs).copy(copy_len, bl);
7c673cae
FG
6446 read_len -= copy_len;
6447 read_ofs += copy_len;
6448 if (!read_len)
6449 return bl.length();
6450
6451 merge_bl = true;
6452 pbl = &read_bl;
6453 }
6454 }
6455 }
6456
b3b6e05e 6457 ldpp_dout(dpp, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
7c673cae
FG
6458 op.read(read_ofs, read_len, pbl, NULL);
6459
11fdf7f2
TL
6460 if (state.cur_pool != read_obj.pool) {
6461 auto iter = state.io_ctxs.find(read_obj.pool);
6462 if (iter == state.io_ctxs.end()) {
6463 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
b3b6e05e 6464 r = store->open_pool_ctx(dpp, read_obj.pool, *state.cur_ioctx, false);
11fdf7f2 6465 if (r < 0) {
b3b6e05e 6466 ldpp_dout(dpp, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
11fdf7f2
TL
6467 return r;
6468 }
6469 } else {
6470 state.cur_ioctx = &iter->second;
7c673cae 6471 }
11fdf7f2 6472 state.cur_pool = read_obj.pool;
7c673cae
FG
6473 }
6474
11fdf7f2 6475 state.cur_ioctx->locator_set_key(read_obj.loc);
7c673cae 6476
11fdf7f2 6477 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
b3b6e05e 6478 ldpp_dout(dpp, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
7c673cae 6479
7c673cae 6480 if (r < 0) {
7c673cae
FG
6481 return r;
6482 }
7c673cae 6483
11fdf7f2
TL
6484 if (merge_bl) {
6485 bl.append(read_bl);
7c673cae
FG
6486 }
6487
7c673cae
FG
6488 return bl.length();
6489}
6490
20effc67
TL
6491int get_obj_data::flush(rgw::AioResultList&& results) {
6492 int r = rgw::check_for_errors(results);
6493 if (r < 0) {
6494 return r;
6495 }
6496 std::list<bufferlist> bl_list;
7c673cae 6497
20effc67
TL
6498 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6499 results.sort(cmp); // merge() requires results to be sorted first
6500 completed.merge(results, cmp); // merge results in sorted order
7c673cae 6501
20effc67
TL
6502 while (!completed.empty() && completed.front().id == offset) {
6503 auto bl = std::move(completed.front().data);
7c673cae 6504
20effc67
TL
6505 bl_list.push_back(bl);
6506 offset += bl.length();
6507 int r = client_cb->handle_data(bl, 0, bl.length());
6508 if (r < 0) {
6509 return r;
7c673cae 6510 }
7c673cae 6511
20effc67
TL
6512 if (rgwrados->get_use_datacache()) {
6513 const std::lock_guard l(d3n_get_data.d3n_lock);
6514 auto oid = completed.front().obj.get_ref().obj.oid;
6515 if (bl.length() <= g_conf()->rgw_get_obj_max_req_size && !d3n_bypass_cache_write) {
6516 lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): bl.length <= rgw_get_obj_max_req_size (default 4MB) - write to datacache, bl.length=" << bl.length() << dendl;
6517 rgwrados->d3n_data_cache->put(bl, bl.length(), oid);
6518 } else {
6519 lsubdout(g_ceph_context, rgw_datacache, 10) << "D3nDataCache: " << __func__ << "(): not writing to datacache - bl.length > rgw_get_obj_max_req_size (default 4MB), bl.length=" << bl.length() << " or d3n_bypass_cache_write=" << d3n_bypass_cache_write << dendl;
7c673cae 6520 }
7c673cae 6521 }
20effc67 6522 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
7c673cae 6523 }
20effc67
TL
6524 return 0;
6525}
7c673cae 6526
20effc67 6527static int _get_obj_iterate_cb(const DoutPrefixProvider *dpp,
b3b6e05e 6528 const rgw_raw_obj& read_obj, off_t obj_ofs,
11fdf7f2
TL
6529 off_t read_ofs, off_t len, bool is_head_obj,
6530 RGWObjState *astate, void *arg)
7c673cae 6531{
20effc67
TL
6532 struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
6533 return d->rgwrados->get_obj_iterate_cb(dpp, read_obj, obj_ofs, read_ofs, len,
11fdf7f2 6534 is_head_obj, astate, arg);
7c673cae
FG
6535}
6536
b3b6e05e
TL
6537int RGWRados::get_obj_iterate_cb(const DoutPrefixProvider *dpp,
6538 const rgw_raw_obj& read_obj, off_t obj_ofs,
11fdf7f2
TL
6539 off_t read_ofs, off_t len, bool is_head_obj,
6540 RGWObjState *astate, void *arg)
7c673cae 6541{
7c673cae 6542 ObjectReadOperation op;
20effc67 6543 struct get_obj_data* d = static_cast<struct get_obj_data*>(arg);
7c673cae 6544 string oid, key;
7c673cae
FG
6545
6546 if (is_head_obj) {
6547 /* only when reading from the head object do we need to do the atomic test */
b3b6e05e 6548 int r = append_atomic_test(dpp, astate, op);
7c673cae
FG
6549 if (r < 0)
6550 return r;
6551
6552 if (astate &&
6553 obj_ofs < astate->data.length()) {
11fdf7f2 6554 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
7c673cae 6555
7c673cae 6556 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
7c673cae
FG
6557 if (r < 0)
6558 return r;
6559
7c673cae 6560 len -= chunk_len;
11fdf7f2 6561 d->offset += chunk_len;
7c673cae
FG
6562 read_ofs += chunk_len;
6563 obj_ofs += chunk_len;
6564 if (!len)
6565 return 0;
6566 }
6567 }
6568
20effc67 6569 auto obj = d->rgwrados->svc.rados->obj(read_obj);
b3b6e05e 6570 int r = obj.open(dpp);
7c673cae 6571 if (r < 0) {
b3b6e05e 6572 ldpp_dout(dpp, 4) << "failed to open rados context for " << read_obj << dendl;
11fdf7f2 6573 return r;
7c673cae
FG
6574 }
6575
b3b6e05e 6576 ldpp_dout(dpp, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
11fdf7f2 6577 op.read(read_ofs, len, nullptr, nullptr);
7c673cae 6578
11fdf7f2
TL
6579 const uint64_t cost = len;
6580 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
7c673cae 6581
9f95a23c 6582 auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
7c673cae 6583
11fdf7f2 6584 return d->flush(std::move(completed));
7c673cae
FG
6585}
6586
b3b6e05e 6587int RGWRados::Object::Read::iterate(const DoutPrefixProvider *dpp, int64_t ofs, int64_t end, RGWGetDataCB *cb,
9f95a23c 6588 optional_yield y)
7c673cae
FG
6589{
6590 RGWRados *store = source->get_store();
6591 CephContext *cct = store->ctx();
7c673cae 6592 RGWObjectCtx& obj_ctx = source->get_ctx();
11fdf7f2
TL
6593 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6594 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
7c673cae 6595
9f95a23c
TL
6596 auto aio = rgw::make_throttle(window_size, y);
6597 get_obj_data data(store, cb, &*aio, ofs, y);
7c673cae 6598
b3b6e05e 6599 int r = store->iterate_obj(dpp, obj_ctx, source->get_bucket_info(), state.obj,
9f95a23c 6600 ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
7c673cae 6601 if (r < 0) {
b3b6e05e 6602 ldpp_dout(dpp, 0) << "iterate_obj() failed with " << r << dendl;
11fdf7f2
TL
6603 data.cancel(); // drain completions without writing back to client
6604 return r;
7c673cae
FG
6605 }
6606
11fdf7f2 6607 return data.drain();
7c673cae
FG
6608}
6609
b3b6e05e 6610int RGWRados::iterate_obj(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx,
7c673cae 6611 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11fdf7f2 6612 off_t ofs, off_t end, uint64_t max_chunk_size,
9f95a23c 6613 iterate_obj_cb cb, void *arg, optional_yield y)
7c673cae
FG
6614{
6615 rgw_raw_obj head_obj;
6616 rgw_raw_obj read_obj;
6617 uint64_t read_ofs = ofs;
6618 uint64_t len;
6619 bool reading_from_head = true;
6620 RGWObjState *astate = NULL;
6621
6622 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6623
b3b6e05e 6624 int r = get_obj_state(dpp, &obj_ctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
6625 if (r < 0) {
6626 return r;
6627 }
6628
6629 if (end < 0)
6630 len = 0;
6631 else
6632 len = end - ofs + 1;
6633
9f95a23c 6634 if (astate->manifest) {
7c673cae 6635 /* now get the relevant object stripe */
b3b6e05e 6636 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(dpp, ofs);
7c673cae 6637
b3b6e05e 6638 RGWObjManifest::obj_iterator obj_end = astate->manifest->obj_end(dpp);
7c673cae
FG
6639
6640 for (; iter != obj_end && ofs <= end; ++iter) {
6641 off_t stripe_ofs = iter.get_stripe_ofs();
6642 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6643
6644 while (ofs < next_stripe_ofs && ofs <= end) {
f67539c2 6645 read_obj = iter.get_location().get_raw_obj(store);
11fdf7f2 6646 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6647 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6648
6649 if (read_len > max_chunk_size) {
6650 read_len = max_chunk_size;
6651 }
6652
6653 reading_from_head = (read_obj == head_obj);
b3b6e05e 6654 r = cb(dpp, read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6655 if (r < 0) {
6656 return r;
6657 }
6658
6659 len -= read_len;
6660 ofs += read_len;
6661 }
6662 }
6663 } else {
6664 while (ofs <= end) {
6665 read_obj = head_obj;
11fdf7f2 6666 uint64_t read_len = std::min(len, max_chunk_size);
7c673cae 6667
b3b6e05e 6668 r = cb(dpp, read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6669 if (r < 0) {
6670 return r;
6671 }
6672
6673 len -= read_len;
6674 ofs += read_len;
6675 }
6676 }
6677
6678 return 0;
6679}
6680
b3b6e05e 6681int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
7c673cae
FG
6682{
6683 rgw_rados_ref ref;
b3b6e05e 6684 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
6685 if (r < 0) {
6686 return r;
6687 }
6688
b3b6e05e 6689 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, null_yield);
7c673cae
FG
6690}
6691
b3b6e05e 6692int RGWRados::obj_operate(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
7c673cae
FG
6693{
6694 rgw_rados_ref ref;
b3b6e05e 6695 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
6696 if (r < 0) {
6697 return r;
6698 }
6699
6700 bufferlist outbl;
6701
b3b6e05e 6702 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
7c673cae
FG
6703}
6704
b3b6e05e 6705int RGWRados::olh_init_modification_impl(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
7c673cae
FG
6706{
6707 ObjectWriteOperation op;
6708
11fdf7f2 6709 ceph_assert(olh_obj.key.instance.empty());
7c673cae
FG
6710
6711 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6712
6713 if (!state.exists) {
6714 op.create(true);
6715 } else {
6716 op.assert_exists();
b32b8144
FG
6717 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6718 op.mtime2(&mtime_ts);
7c673cae
FG
6719 }
6720
6721 /*
6722 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6723 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6724 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6725 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6726 * log will reflect that.
6727 *
6728 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6729 * is used for object data instance, olh_tag for olh instance.
6730 */
6731 if (has_tag) {
6732 /* guard against racing writes */
b3b6e05e 6733 bucket_index_guard_olh_op(dpp, state, op);
7c673cae
FG
6734 }
6735
6736 if (!has_tag) {
6737 /* obj tag */
9f95a23c 6738 string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6739
7c673cae
FG
6740 bufferlist bl;
6741 bl.append(obj_tag.c_str(), obj_tag.size());
6742 op.setxattr(RGW_ATTR_ID_TAG, bl);
6743
6744 state.attrset[RGW_ATTR_ID_TAG] = bl;
6745 state.obj_tag = bl;
6746
6747 /* olh tag */
9f95a23c 6748 string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6749
7c673cae
FG
6750 bufferlist olh_bl;
6751 olh_bl.append(olh_tag.c_str(), olh_tag.size());
6752 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
6753
6754 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
6755 state.olh_tag = olh_bl;
6756 state.is_olh = true;
6757
6758 bufferlist verbl;
6759 op.setxattr(RGW_ATTR_OLH_VER, verbl);
6760 }
6761
6762 bufferlist bl;
6763 RGWOLHPendingInfo pending_info;
6764 pending_info.time = real_clock::now();
11fdf7f2 6765 encode(pending_info, bl);
7c673cae
FG
6766
6767#define OLH_PENDING_TAG_LEN 32
6768 /* tag will start with current time epoch, this so that entries are sorted by time */
6769 char buf[32];
6770 utime_t ut(pending_info.time);
6771 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
6772 *op_tag = buf;
6773
9f95a23c 6774 string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
11fdf7f2 6775
7c673cae
FG
6776 op_tag->append(s);
6777
6778 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
6779 attr_name.append(*op_tag);
6780
6781 op.setxattr(attr_name.c_str(), bl);
6782
b3b6e05e 6783 int ret = obj_operate(dpp, bucket_info, olh_obj, &op);
7c673cae
FG
6784 if (ret < 0) {
6785 return ret;
6786 }
6787
6788 state.exists = true;
6789 state.attrset[attr_name] = bl;
6790
6791 return 0;
6792}
6793
b3b6e05e 6794int RGWRados::olh_init_modification(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
7c673cae
FG
6795{
6796 int ret;
6797
b3b6e05e 6798 ret = olh_init_modification_impl(dpp, bucket_info, state, obj, op_tag);
7c673cae
FG
6799 if (ret == -EEXIST) {
6800 ret = -ECANCELED;
6801 }
6802
6803 return ret;
6804}
6805
20effc67 6806int RGWRados::guard_reshard(const DoutPrefixProvider *dpp,
b3b6e05e 6807 BucketShard *bs,
f64942e4
AA
6808 const rgw_obj& obj_instance,
6809 const RGWBucketInfo& bucket_info,
6810 std::function<int(BucketShard *)> call)
31f18b77
FG
6811{
6812 rgw_obj obj;
6813 const rgw_obj *pobj = &obj_instance;
6814 int r;
6815
6816 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
b3b6e05e 6817 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */, dpp);
31f18b77 6818 if (r < 0) {
b3b6e05e 6819 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << r << dendl;
31f18b77
FG
6820 return r;
6821 }
6822 r = call(bs);
6823 if (r != -ERR_BUSY_RESHARDING) {
6824 break;
6825 }
b3b6e05e 6826 ldpp_dout(dpp, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
31f18b77 6827 string new_bucket_id;
b3b6e05e 6828 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield, dpp);
31f18b77
FG
6829 if (r == -ERR_BUSY_RESHARDING) {
6830 continue;
6831 }
6832 if (r < 0) {
6833 return r;
6834 }
b3b6e05e 6835 ldpp_dout(dpp, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
31f18b77
FG
6836 i = 0; /* resharding is finished, make sure we can retry */
6837
6838 obj = *pobj;
6839 obj.bucket.update_bucket_id(new_bucket_id);
6840 pobj = &obj;
81eedcae 6841 } // for loop
31f18b77
FG
6842
6843 if (r < 0) {
6844 return r;
6845 }
6846
6847 return 0;
6848}
6849
f64942e4
AA
6850int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
6851 string *new_bucket_id,
11fdf7f2 6852 const RGWBucketInfo& bucket_info,
b3b6e05e
TL
6853 optional_yield y,
6854 const DoutPrefixProvider *dpp)
31f18b77 6855{
11fdf7f2
TL
6856 int ret = 0;
6857 cls_rgw_bucket_instance_entry entry;
6858
81eedcae
TL
6859 // since we want to run this recovery code from two distinct places,
6860 // let's just put it in a lambda so we can easily re-use; if the
6861 // lambda successfully fetches a new bucket id, it sets
6862 // new_bucket_id and returns 0, otherwise it returns a negative
6863 // error code
6864 auto fetch_new_bucket_id =
b3b6e05e 6865 [this, &bucket_info, dpp](const std::string& log_tag,
9f95a23c 6866 std::string* new_bucket_id) -> int {
81eedcae 6867 RGWBucketInfo fresh_bucket_info = bucket_info;
b3b6e05e 6868 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr, dpp);
81eedcae 6869 if (ret < 0) {
b3b6e05e 6870 ldpp_dout(dpp, 0) << __func__ <<
81eedcae
TL
6871 " ERROR: failed to refresh bucket info after reshard at " <<
6872 log_tag << ": " << cpp_strerror(-ret) << dendl;
6873 return ret;
6874 }
6875 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
6876 return 0;
6877 };
6878
6879 constexpr int num_retries = 10;
6880 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
9f95a23c
TL
6881 auto& ref = bs->bucket_obj.get_ref();
6882 ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
81eedcae
TL
6883 if (ret == -ENOENT) {
6884 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
6885 } else if (ret < 0) {
b3b6e05e 6886 ldpp_dout(dpp, 0) << __func__ <<
81eedcae
TL
6887 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
6888 dendl;
11fdf7f2
TL
6889 return ret;
6890 }
81eedcae 6891
11fdf7f2 6892 if (!entry.resharding_in_progress()) {
81eedcae
TL
6893 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
6894 new_bucket_id);
11fdf7f2 6895 }
31f18b77 6896
b3b6e05e 6897 ldpp_dout(dpp, 20) << "NOTICE: reshard still in progress; " <<
81eedcae
TL
6898 (i < num_retries ? "retrying" : "too many retries") << dendl;
6899
6900 if (i == num_retries) {
11fdf7f2
TL
6901 break;
6902 }
6903
6904 // If bucket is erroneously marked as resharding (e.g., crash or
6905 // other error) then fix it. If we can take the bucket reshard
6906 // lock then it means no other resharding should be taking place,
6907 // and we're free to clear the flags.
6908 {
6909 // since we expect to do this rarely, we'll do our work in a
6910 // block and erase our work after each try
6911
9f95a23c 6912 RGWObjectCtx obj_ctx(this->store);
11fdf7f2
TL
6913 const rgw_bucket& b = bs->bucket;
6914 std::string bucket_id = b.get_key();
9f95a23c 6915 RGWBucketReshardLock reshard_lock(this->store, bucket_info, true);
20effc67 6916 ret = reshard_lock.lock(dpp);
11fdf7f2 6917 if (ret < 0) {
20effc67
TL
6918 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
6919 ": failed to take reshard lock for bucket " <<
11fdf7f2
TL
6920 bucket_id << "; expected if resharding underway" << dendl;
6921 } else {
20effc67
TL
6922 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
6923 ": was able to take reshard lock for bucket " <<
11fdf7f2 6924 bucket_id << dendl;
b3b6e05e 6925 ret = RGWBucketReshard::clear_resharding(dpp, this->store, bucket_info);
11fdf7f2
TL
6926 if (ret < 0) {
6927 reshard_lock.unlock();
20effc67 6928 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
11fdf7f2
TL
6929 " ERROR: failed to clear resharding flags for bucket " <<
6930 bucket_id << dendl;
6931 } else {
6932 reshard_lock.unlock();
20effc67
TL
6933 ldpp_dout(dpp, 5) << __PRETTY_FUNCTION__ <<
6934 ": apparently successfully cleared resharding flags for "
11fdf7f2
TL
6935 "bucket " << bucket_id << dendl;
6936 continue; // if we apparently succeed immediately test again
6937 } // if clear resharding succeeded
6938 } // if taking of lock succeeded
6939 } // block to encapsulate recovery from incomplete reshard
6940
6941 ret = reshard_wait->wait(y);
6942 if (ret < 0) {
20effc67 6943 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
81eedcae 6944 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2
TL
6945 return ret;
6946 }
81eedcae
TL
6947 } // for loop
6948
20effc67 6949 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
81eedcae 6950 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2 6951 return -ERR_BUSY_RESHARDING;
31f18b77
FG
6952}
6953
b3b6e05e 6954int RGWRados::bucket_index_link_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
7c673cae
FG
6955 bool delete_marker,
6956 const string& op_tag,
6957 struct rgw_bucket_dir_entry_meta *meta,
6958 uint64_t olh_epoch,
91327a77
AA
6959 real_time unmod_since, bool high_precision_time,
6960 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
6961{
6962 rgw_rados_ref ref;
b3b6e05e 6963 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
6964 if (r < 0) {
6965 return r;
6966 }
6967
31f18b77
FG
6968 rgw_zone_set zones_trace;
6969 if (_zones_trace) {
6970 zones_trace = *_zones_trace;
7c673cae 6971 }
9f95a23c 6972 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
7c673cae 6973
31f18b77
FG
6974 BucketShard bs(this);
6975
b3b6e05e 6976 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 6977 [&](BucketShard *bs) -> int {
9f95a23c
TL
6978 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
6979 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6980 librados::ObjectWriteOperation op;
6981 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6982 cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
6983 delete_marker, op_tag, meta, olh_epoch,
6984 unmod_since, high_precision_time,
6985 svc.zone->get_zone().log_data, zones_trace);
b3b6e05e 6986 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
6987 });
6988 if (r < 0) {
b3b6e05e 6989 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
31f18b77 6990 return r;
7c673cae
FG
6991 }
6992
b3b6e05e 6993 r = svc.datalog_rados->add_entry(dpp, bucket_info, bs.shard_id);
9f95a23c 6994 if (r < 0) {
b3b6e05e 6995 ldpp_dout(dpp, 0) << "ERROR: failed writing data log" << dendl;
91327a77
AA
6996 }
6997
7c673cae
FG
6998 return 0;
6999}
7000
b3b6e05e 7001void RGWRados::bucket_index_guard_olh_op(const DoutPrefixProvider *dpp, RGWObjState& olh_state, ObjectOperation& op)
7c673cae 7002{
b3b6e05e 7003 ldpp_dout(dpp, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
7c673cae
FG
7004 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
7005}
7006
b3b6e05e 7007int RGWRados::bucket_index_unlink_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 7008 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
7009{
7010 rgw_rados_ref ref;
b3b6e05e 7011 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7012 if (r < 0) {
7013 return r;
7014 }
7015
31f18b77
FG
7016 rgw_zone_set zones_trace;
7017 if (_zones_trace) {
7018 zones_trace = *_zones_trace;
7c673cae 7019 }
9f95a23c 7020 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
31f18b77
FG
7021
7022 BucketShard bs(this);
7c673cae
FG
7023
7024 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
b3b6e05e 7025 r = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 7026 [&](BucketShard *bs) -> int {
9f95a23c 7027 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
7028 librados::ObjectWriteOperation op;
7029 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
7030 cls_rgw_bucket_unlink_instance(op, key, op_tag,
7031 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
b3b6e05e 7032 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
7033 });
7034 if (r < 0) {
b3b6e05e 7035 ldpp_dout(dpp, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
31f18b77 7036 return r;
7c673cae
FG
7037 }
7038
7039 return 0;
7040}
7041
20effc67 7042int RGWRados::bucket_index_read_olh_log(const DoutPrefixProvider *dpp,
b3b6e05e 7043 const RGWBucketInfo& bucket_info, RGWObjState& state,
7c673cae
FG
7044 const rgw_obj& obj_instance, uint64_t ver_marker,
7045 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
7046 bool *is_truncated)
7047{
7048 rgw_rados_ref ref;
b3b6e05e 7049 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7050 if (r < 0) {
7051 return r;
7052 }
7053
7054 BucketShard bs(this);
f64942e4 7055 int ret =
b3b6e05e 7056 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 7057 if (ret < 0) {
b3b6e05e 7058 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
7059 return ret;
7060 }
7061
7062 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7063
7064 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7065
b3b6e05e 7066 ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4 7067 [&](BucketShard *bs) -> int {
9f95a23c 7068 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
7069 ObjectReadOperation op;
7070 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
7071
7072 rgw_cls_read_olh_log_ret log_ret;
7073 int op_ret = 0;
7074 cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
7075 bufferlist outbl;
b3b6e05e 7076 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
9f95a23c
TL
7077 if (r < 0) {
7078 return r;
7079 }
7080 if (op_ret < 0) {
7081 return op_ret;
7082 }
7083
7084 *log = std::move(log_ret.log);
7085 *is_truncated = log_ret.is_truncated;
7086 return r;
f64942e4 7087 });
31f18b77 7088 if (ret < 0) {
b3b6e05e 7089 ldpp_dout(dpp, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 7090 return ret;
31f18b77 7091 }
7c673cae
FG
7092
7093 return 0;
7094}
7095
a8e16298
TL
7096// a multisite sync bug resulted in the OLH head attributes being overwritten by
7097// the attributes from another zone, causing link_olh() to fail endlessly due to
7098// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7099// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
b3b6e05e 7100int RGWRados::repair_olh(const DoutPrefixProvider *dpp, RGWObjState* state, const RGWBucketInfo& bucket_info,
a8e16298
TL
7101 const rgw_obj& obj)
7102{
7103 // fetch the current olh entry from the bucket index
7104 rgw_bucket_olh_entry olh;
b3b6e05e 7105 int r = bi_get_olh(dpp, bucket_info, obj, &olh);
a8e16298 7106 if (r < 0) {
b3b6e05e 7107 ldpp_dout(dpp, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
a8e16298
TL
7108 return r;
7109 }
11fdf7f2 7110 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
a8e16298
TL
7111 return 0;
7112 }
7113
b3b6e05e 7114 ldpp_dout(dpp, 4) << "repair_olh setting olh_tag=" << olh.tag
a8e16298
TL
7115 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
7116
7117 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7118 ObjectWriteOperation op;
7119 // assert this is the same olh tag we think we're fixing
b3b6e05e 7120 bucket_index_guard_olh_op(dpp, *state, op);
a8e16298
TL
7121 // preserve existing mtime
7122 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
7123 op.mtime2(&mtime_ts);
7124 {
7125 bufferlist bl;
7126 bl.append(olh.tag.c_str(), olh.tag.size());
7127 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
7128 }
7129 {
7130 RGWOLHInfo info;
7131 info.target = rgw_obj(bucket_info.bucket, olh.key);
7132 info.removed = olh.delete_marker;
7133 bufferlist bl;
7134 encode(info, bl);
7135 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7136 }
7137 rgw_rados_ref ref;
b3b6e05e 7138 r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
a8e16298
TL
7139 if (r < 0) {
7140 return r;
7141 }
b3b6e05e 7142 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
a8e16298 7143 if (r < 0) {
b3b6e05e 7144 ldpp_dout(dpp, 0) << "repair_olh failed to write olh attributes with "
a8e16298
TL
7145 << cpp_strerror(r) << dendl;
7146 return r;
7147 }
7148 return 0;
7149}
7150
b3b6e05e 7151int RGWRados::bucket_index_trim_olh_log(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
7c673cae
FG
7152{
7153 rgw_rados_ref ref;
b3b6e05e 7154 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7155 if (r < 0) {
7156 return r;
7157 }
7158
7159 BucketShard bs(this);
f64942e4 7160 int ret =
b3b6e05e 7161 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 7162 if (ret < 0) {
b3b6e05e 7163 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
7164 return ret;
7165 }
7166
7167 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7168
7169 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7170
b3b6e05e 7171 ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4
AA
7172 [&](BucketShard *pbs) -> int {
7173 ObjectWriteOperation op;
7174 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7175 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
b3b6e05e 7176 return pbs->bucket_obj.operate(dpp, &op, null_yield);
31f18b77
FG
7177 });
7178 if (ret < 0) {
b3b6e05e 7179 ldpp_dout(dpp, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 7180 return ret;
31f18b77 7181 }
7c673cae
FG
7182
7183 return 0;
7184}
7185
b3b6e05e 7186int RGWRados::bucket_index_clear_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
7c673cae
FG
7187{
7188 rgw_rados_ref ref;
b3b6e05e 7189 int r = get_obj_head_ref(dpp, bucket_info, obj_instance, &ref);
7c673cae
FG
7190 if (r < 0) {
7191 return r;
7192 }
7193
7194 BucketShard bs(this);
7c673cae
FG
7195
7196 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7197
7198 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7199
b3b6e05e 7200 int ret = guard_reshard(dpp, &bs, obj_instance, bucket_info,
f64942e4
AA
7201 [&](BucketShard *pbs) -> int {
7202 ObjectWriteOperation op;
9f95a23c 7203 auto& ref = pbs->bucket_obj.get_ref();
f64942e4 7204 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c 7205 cls_rgw_clear_olh(op, key, olh_tag);
b3b6e05e 7206 return rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77 7207 });
7c673cae 7208 if (ret < 0) {
b3b6e05e 7209 ldpp_dout(dpp, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
7c673cae
FG
7210 return ret;
7211 }
7212
7213 return 0;
7214}
7215
20effc67 7216static int decode_olh_info(const DoutPrefixProvider *dpp, CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
92f5a8d4
TL
7217{
7218 try {
7219 auto biter = bl.cbegin();
7220 decode(*olh, biter);
7221 return 0;
7222 } catch (buffer::error& err) {
20effc67 7223 ldpp_dout(dpp, 0) << "ERROR: failed to decode olh info" << dendl;
92f5a8d4
TL
7224 return -EIO;
7225 }
7226}
7227
522d829b
TL
7228int RGWRados::apply_olh_log(const DoutPrefixProvider *dpp,
7229 RGWObjectCtx& obj_ctx,
7230 RGWObjState& state,
7231 const RGWBucketInfo& bucket_info,
7232 const rgw_obj& obj,
7233 bufferlist& olh_tag,
7234 std::map<uint64_t, std::vector<rgw_bucket_olh_log_entry> >& log,
7235 uint64_t *plast_ver,
7236 rgw_zone_set* zones_trace)
7c673cae
FG
7237{
7238 if (log.empty()) {
7239 return 0;
7240 }
7241
7242 librados::ObjectWriteOperation op;
7243
7244 uint64_t last_ver = log.rbegin()->first;
7245 *plast_ver = last_ver;
7246
7247 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
7248
7249 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
92f5a8d4 7250 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
7c673cae 7251
a8e16298
TL
7252 bufferlist ver_bl;
7253 string last_ver_s = to_string(last_ver);
7254 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
7255 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
7256
b32b8144
FG
7257 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
7258 op.mtime2(&mtime_ts);
7259
7c673cae 7260 bool need_to_link = false;
92f5a8d4 7261 uint64_t link_epoch = 0;
7c673cae
FG
7262 cls_rgw_obj_key key;
7263 bool delete_marker = false;
7264 list<cls_rgw_obj_key> remove_instances;
7265 bool need_to_remove = false;
7266
92f5a8d4
TL
7267 // decode current epoch and instance
7268 auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
7269 if (olh_ver != state.attrset.end()) {
7270 std::string str = olh_ver->second.to_str();
7271 std::string err;
7272 link_epoch = strict_strtoll(str.c_str(), 10, &err);
7273 }
7274 auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
7275 if (olh_info != state.attrset.end()) {
7276 RGWOLHInfo info;
20effc67 7277 int r = decode_olh_info(dpp, cct, olh_info->second, &info);
92f5a8d4
TL
7278 if (r < 0) {
7279 return r;
7280 }
7281 info.target.key.get_index_key(&key);
7282 delete_marker = info.removed;
7283 }
7284
7c673cae
FG
7285 for (iter = log.begin(); iter != log.end(); ++iter) {
7286 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7287 for (; viter != iter->second.end(); ++viter) {
7288 rgw_bucket_olh_log_entry& entry = *viter;
7289
b3b6e05e 7290 ldpp_dout(dpp, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
7c673cae
FG
7291 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7292 << (entry.delete_marker ? "(delete)" : "") << dendl;
7293 switch (entry.op) {
7294 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7295 remove_instances.push_back(entry.key);
7296 break;
7297 case CLS_RGW_OLH_OP_LINK_OLH:
92f5a8d4
TL
7298 // only overwrite a link of the same epoch if its key sorts before
7299 if (link_epoch < iter->first || key.instance.empty() ||
7300 key.instance > entry.key.instance) {
b3b6e05e 7301 ldpp_dout(dpp, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
92f5a8d4
TL
7302 << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7303 need_to_link = true;
7304 need_to_remove = false;
7305 key = entry.key;
7306 delete_marker = entry.delete_marker;
7307 } else {
b3b6e05e 7308 ldpp_dout(dpp, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
92f5a8d4
TL
7309 << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7310 }
7c673cae
FG
7311 break;
7312 case CLS_RGW_OLH_OP_UNLINK_OLH:
7313 need_to_remove = true;
7314 need_to_link = false;
7315 break;
7316 default:
b3b6e05e 7317 ldpp_dout(dpp, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7c673cae
FG
7318 return -EIO;
7319 }
7320 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7321 attr_name.append(entry.op_tag);
7322 op.rmxattr(attr_name.c_str());
7323 }
7324 }
7325
7326 rgw_rados_ref ref;
b3b6e05e 7327 int r = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae
FG
7328 if (r < 0) {
7329 return r;
7330 }
7331
7332 const rgw_bucket& bucket = obj.bucket;
7333
7334 if (need_to_link) {
7335 rgw_obj target(bucket, key);
7336 RGWOLHInfo info;
7337 info.target = target;
7338 info.removed = delete_marker;
7339 bufferlist bl;
11fdf7f2 7340 encode(info, bl);
7c673cae
FG
7341 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7342 }
7343
7344 /* first remove object instances */
7345 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7346 liter != remove_instances.end(); ++liter) {
7347 cls_rgw_obj_key& key = *liter;
7348 rgw_obj obj_instance(bucket, key);
b3b6e05e 7349 int ret = delete_obj(dpp, obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae 7350 if (ret < 0 && ret != -ENOENT) {
b3b6e05e 7351 ldpp_dout(dpp, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7c673cae
FG
7352 return ret;
7353 }
7354 }
7355
7356 /* update olh object */
b3b6e05e 7357 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
7358 if (r == -ECANCELED) {
7359 r = 0;
7360 }
7361 if (r < 0) {
b3b6e05e 7362 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7c673cae
FG
7363 return r;
7364 }
7365
b3b6e05e 7366 r = bucket_index_trim_olh_log(dpp, bucket_info, state, obj, last_ver);
7c673cae 7367 if (r < 0) {
b3b6e05e 7368 ldpp_dout(dpp, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7c673cae
FG
7369 return r;
7370 }
7371
7372 if (need_to_remove) {
7373 ObjectWriteOperation rm_op;
7374
7375 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
a8e16298 7376 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7c673cae
FG
7377 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7378 rm_op.remove();
7379
b3b6e05e 7380 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
7c673cae
FG
7381 if (r == -ECANCELED) {
7382 return 0; /* someone else won this race */
7383 } else {
7384 /*
7385 * only clear if was successful, otherwise we might clobber pending operations on this object
7386 */
b3b6e05e 7387 r = bucket_index_clear_olh(dpp, bucket_info, state, obj);
7c673cae 7388 if (r < 0) {
b3b6e05e 7389 ldpp_dout(dpp, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7c673cae
FG
7390 return r;
7391 }
7392 }
7393 }
7394
7395 return 0;
7396}
7397
7398/*
7399 * read olh log and apply it
7400 */
b3b6e05e 7401int RGWRados::update_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
7402{
7403 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7404 bool is_truncated;
7405 uint64_t ver_marker = 0;
7406
7407 do {
b3b6e05e 7408 int ret = bucket_index_read_olh_log(dpp, bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7c673cae
FG
7409 if (ret < 0) {
7410 return ret;
7411 }
b3b6e05e 7412 ret = apply_olh_log(dpp, obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
7413 if (ret < 0) {
7414 return ret;
7415 }
7416 } while (is_truncated);
7417
7418 return 0;
7419}
7420
b3b6e05e 7421int RGWRados::set_olh(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77 7422 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
9f95a23c 7423 optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
7424{
7425 string op_tag;
7426
7427 rgw_obj olh_obj = target_obj;
7428 olh_obj.key.instance.clear();
7429
7430 RGWObjState *state = NULL;
7431
7432 int ret = 0;
7433 int i;
31f18b77 7434
7c673cae
FG
7435#define MAX_ECANCELED_RETRY 100
7436 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7437 if (ret == -ECANCELED) {
11fdf7f2 7438 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7439 }
7440
b3b6e05e 7441 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7442 if (ret < 0) {
7443 return ret;
7444 }
7445
b3b6e05e 7446 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7c673cae 7447 if (ret < 0) {
b3b6e05e 7448 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7c673cae
FG
7449 if (ret == -ECANCELED) {
7450 continue;
7451 }
7452 return ret;
7453 }
b3b6e05e 7454 ret = bucket_index_link_olh(dpp, bucket_info, *state, target_obj, delete_marker,
91327a77
AA
7455 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7456 zones_trace, log_data_change);
7c673cae 7457 if (ret < 0) {
b3b6e05e 7458 ldpp_dout(dpp, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7c673cae 7459 if (ret == -ECANCELED) {
a8e16298
TL
7460 // the bucket index rejected the link_olh() due to olh tag mismatch;
7461 // attempt to reconstruct olh head attributes based on the bucket index
b3b6e05e 7462 int r2 = repair_olh(dpp, state, bucket_info, olh_obj);
a8e16298
TL
7463 if (r2 < 0 && r2 != -ECANCELED) {
7464 return r2;
7465 }
7c673cae
FG
7466 continue;
7467 }
7468 return ret;
7469 }
7470 break;
7471 }
7472
7473 if (i == MAX_ECANCELED_RETRY) {
b3b6e05e 7474 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7c673cae
FG
7475 return -EIO;
7476 }
7477
b3b6e05e 7478 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7c673cae
FG
7479 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7480 ret = 0;
7481 }
7482 if (ret < 0) {
b3b6e05e 7483 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7484 return ret;
7485 }
7486
7487 return 0;
7488}
7489
b3b6e05e 7490int RGWRados::unlink_obj_instance(const DoutPrefixProvider *dpp, RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
9f95a23c 7491 uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
7c673cae
FG
7492{
7493 string op_tag;
7494
7495 rgw_obj olh_obj = target_obj;
7496 olh_obj.key.instance.clear();
7497
7498 RGWObjState *state = NULL;
7499
7500 int ret = 0;
7501 int i;
7502
7503 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7504 if (ret == -ECANCELED) {
11fdf7f2 7505 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7506 }
7507
b3b6e05e 7508 ret = get_obj_state(dpp, &obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7509 if (ret < 0)
7510 return ret;
7511
b3b6e05e 7512 ret = olh_init_modification(dpp, bucket_info, *state, olh_obj, &op_tag);
7c673cae 7513 if (ret < 0) {
b3b6e05e 7514 ldpp_dout(dpp, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7515 if (ret == -ECANCELED) {
7516 continue;
7517 }
7518 return ret;
7519 }
7520
7521 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7522
b3b6e05e 7523 ret = bucket_index_unlink_instance(dpp, bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae 7524 if (ret < 0) {
b3b6e05e 7525 ldpp_dout(dpp, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7526 if (ret == -ECANCELED) {
7527 continue;
7528 }
7529 return ret;
7530 }
7531 break;
7532 }
7533
7534 if (i == MAX_ECANCELED_RETRY) {
b3b6e05e 7535 ldpp_dout(dpp, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7c673cae
FG
7536 return -EIO;
7537 }
7538
b3b6e05e 7539 ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
7540 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7541 return 0;
7542 }
7543 if (ret < 0) {
b3b6e05e 7544 ldpp_dout(dpp, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7c673cae
FG
7545 return ret;
7546 }
7547
7548 return 0;
7549}
7550
11fdf7f2 7551void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7c673cae
FG
7552{
7553#define OBJ_INSTANCE_LEN 32
7554 char buf[OBJ_INSTANCE_LEN + 1];
7555
7556 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7557 no underscore for instance name due to the way we encode the raw keys */
7558
11fdf7f2 7559 target_key->set_instance(buf);
7c673cae
FG
7560}
7561
11fdf7f2 7562void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7c673cae 7563{
11fdf7f2 7564 gen_rand_obj_instance_name(&target_obj->key);
7c673cae
FG
7565}
7566
b3b6e05e 7567int RGWRados::get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7c673cae 7568{
92f5a8d4 7569 map<string, bufferlist> attrset;
7c673cae
FG
7570
7571 ObjectReadOperation op;
92f5a8d4 7572 op.getxattrs(&attrset, NULL);
7c673cae 7573
b3b6e05e 7574 int r = obj_operate(dpp, bucket_info, obj, &op);
7c673cae
FG
7575 if (r < 0) {
7576 return r;
7577 }
7c673cae 7578
92f5a8d4 7579 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7c673cae
FG
7580 if (iter == attrset.end()) { /* not an olh */
7581 return -EINVAL;
7582 }
7583
20effc67 7584 return decode_olh_info(dpp, cct, iter->second, olh);
7c673cae
FG
7585}
7586
20effc67 7587void RGWRados::check_pending_olh_entries(const DoutPrefixProvider *dpp, map<string, bufferlist>& pending_entries,
7c673cae
FG
7588 map<string, bufferlist> *rm_pending_entries)
7589{
7590 map<string, bufferlist>::iterator iter = pending_entries.begin();
7591
7592 real_time now = real_clock::now();
7593
7594 while (iter != pending_entries.end()) {
11fdf7f2 7595 auto biter = iter->second.cbegin();
7c673cae
FG
7596 RGWOLHPendingInfo pending_info;
7597 try {
11fdf7f2 7598 decode(pending_info, biter);
7c673cae
FG
7599 } catch (buffer::error& err) {
7600 /* skipping bad entry, we could remove it but it might hide a bug */
20effc67 7601 ldpp_dout(dpp, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7c673cae
FG
7602 ++iter;
7603 continue;
7604 }
7605
7606 map<string, bufferlist>::iterator cur_iter = iter;
7607 ++iter;
7608 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7609 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7610 pending_entries.erase(cur_iter);
7611 } else {
7612 /* entries names are sorted by time (rounded to a second) */
7613 break;
7614 }
7615 }
7616}
7617
b3b6e05e 7618int RGWRados::remove_olh_pending_entries(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7c673cae 7619{
7c673cae 7620 rgw_rados_ref ref;
b3b6e05e 7621 int r = get_obj_head_ref(dpp, bucket_info, olh_obj, &ref);
7c673cae
FG
7622 if (r < 0) {
7623 return r;
7624 }
7625
81eedcae
TL
7626 // trim no more than 1000 entries per osd op
7627 constexpr int max_entries = 1000;
7c673cae 7628
81eedcae
TL
7629 auto i = pending_attrs.begin();
7630 while (i != pending_attrs.end()) {
7631 ObjectWriteOperation op;
b3b6e05e 7632 bucket_index_guard_olh_op(dpp, state, op);
81eedcae
TL
7633
7634 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7635 op.rmxattr(i->first.c_str());
7636 }
7637
b3b6e05e 7638 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
81eedcae
TL
7639 if (r == -ENOENT || r == -ECANCELED) {
7640 /* raced with some other change, shouldn't sweat about it */
7641 return 0;
7642 }
7643 if (r < 0) {
b3b6e05e 7644 ldpp_dout(dpp, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
81eedcae
TL
7645 return r;
7646 }
7647 }
7c673cae
FG
7648 return 0;
7649}
7650
b3b6e05e 7651int RGWRados::follow_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7c673cae
FG
7652{
7653 map<string, bufferlist> pending_entries;
11fdf7f2 7654 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7c673cae
FG
7655
7656 map<string, bufferlist> rm_pending_entries;
20effc67 7657 check_pending_olh_entries(dpp,pending_entries, &rm_pending_entries);
7c673cae
FG
7658
7659 if (!rm_pending_entries.empty()) {
b3b6e05e 7660 int ret = remove_olh_pending_entries(dpp, bucket_info, *state, olh_obj, rm_pending_entries);
7c673cae 7661 if (ret < 0) {
b3b6e05e 7662 ldpp_dout(dpp, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7c673cae
FG
7663 return ret;
7664 }
7665 }
7666 if (!pending_entries.empty()) {
b3b6e05e 7667 ldpp_dout(dpp, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7c673cae 7668
b3b6e05e 7669 int ret = update_olh(dpp, obj_ctx, state, bucket_info, olh_obj);
7c673cae
FG
7670 if (ret < 0) {
7671 return ret;
7672 }
7673 }
7674
92f5a8d4
TL
7675 auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7676 if (iter == state->attrset.end()) {
7677 return -EINVAL;
7678 }
7679
7c673cae 7680 RGWOLHInfo olh;
20effc67 7681 int ret = decode_olh_info(dpp, cct, iter->second, &olh);
92f5a8d4
TL
7682 if (ret < 0) {
7683 return ret;
7c673cae
FG
7684 }
7685
7686 if (olh.removed) {
7687 return -ENOENT;
7688 }
7689
7690 *target = olh.target;
7691
7692 return 0;
7693}
7694
20effc67 7695int RGWRados::raw_obj_stat(const DoutPrefixProvider *dpp,
b3b6e05e 7696 rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7c673cae 7697 map<string, bufferlist> *attrs, bufferlist *first_chunk,
9f95a23c 7698 RGWObjVersionTracker *objv_tracker, optional_yield y)
7c673cae
FG
7699{
7700 rgw_rados_ref ref;
b3b6e05e 7701 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
7702 if (r < 0) {
7703 return r;
7704 }
7705
7706 map<string, bufferlist> unfiltered_attrset;
7707 uint64_t size = 0;
7708 struct timespec mtime_ts;
7709
7710 ObjectReadOperation op;
7711 if (objv_tracker) {
7712 objv_tracker->prepare_op_for_read(&op);
7713 }
7714 if (attrs) {
7715 op.getxattrs(&unfiltered_attrset, NULL);
7716 }
7717 if (psize || pmtime) {
7718 op.stat2(&size, &mtime_ts, NULL);
7719 }
7720 if (first_chunk) {
7721 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7722 }
7723 bufferlist outbl;
b3b6e05e 7724 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
7c673cae
FG
7725
7726 if (epoch) {
9f95a23c 7727 *epoch = ref.pool.ioctx().get_last_version();
7c673cae
FG
7728 }
7729
7730 if (r < 0)
7731 return r;
7732
7733 if (psize)
7734 *psize = size;
7735 if (pmtime)
7736 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7737 if (attrs) {
11fdf7f2 7738 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7c673cae
FG
7739 }
7740
7741 return 0;
7742}
7743
b3b6e05e 7744int RGWRados::get_bucket_stats(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 7745 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae 7746{
a8e16298 7747 vector<rgw_bucket_dir_header> headers;
7c673cae 7748 map<int, string> bucket_instance_ids;
b3b6e05e 7749 int r = cls_bucket_head(dpp, bucket_info, shard_id, headers, &bucket_instance_ids);
7c673cae
FG
7750 if (r < 0) {
7751 return r;
7752 }
7753
11fdf7f2 7754 ceph_assert(headers.size() == bucket_instance_ids.size());
7c673cae 7755
a8e16298 7756 auto iter = headers.begin();
7c673cae
FG
7757 map<int, string>::iterator viter = bucket_instance_ids.begin();
7758 BucketIndexShardsManager ver_mgr;
7759 BucketIndexShardsManager master_ver_mgr;
7760 BucketIndexShardsManager marker_mgr;
7c673cae
FG
7761 char buf[64];
7762 for(; iter != headers.end(); ++iter, ++viter) {
a8e16298
TL
7763 accumulate_raw_stats(*iter, stats);
7764 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7c673cae 7765 ver_mgr.add(viter->first, string(buf));
a8e16298 7766 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7c673cae
FG
7767 master_ver_mgr.add(viter->first, string(buf));
7768 if (shard_id >= 0) {
a8e16298 7769 *max_marker = iter->max_marker;
7c673cae 7770 } else {
a8e16298 7771 marker_mgr.add(viter->first, iter->max_marker);
7c673cae 7772 }
c07f9fc5 7773 if (syncstopped != NULL)
a8e16298 7774 *syncstopped = iter->syncstopped;
7c673cae
FG
7775 }
7776 ver_mgr.to_string(bucket_ver);
7777 master_ver_mgr.to_string(master_ver);
7778 if (shard_id < 0) {
7779 marker_mgr.to_string(max_marker);
7780 }
7781 return 0;
7782}
7783
7c673cae
FG
7784class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
7785 RGWGetBucketStats_CB *cb;
7786 uint32_t pendings;
7787 map<RGWObjCategory, RGWStorageStats> stats;
7788 int ret_code;
7789 bool should_cb;
9f95a23c 7790 ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
7c673cae
FG
7791
7792public:
7793 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
9f95a23c
TL
7794 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
7795 {}
7c673cae
FG
7796
7797 void handle_response(int r, rgw_bucket_dir_header& header) override {
9f95a23c 7798 std::lock_guard l{lock};
7c673cae
FG
7799 if (should_cb) {
7800 if ( r >= 0) {
7801 accumulate_raw_stats(header, stats);
7802 } else {
7803 ret_code = r;
7804 }
7805
7806 // Are we all done?
7807 if (--pendings == 0) {
7808 if (!ret_code) {
7809 cb->set_response(&stats);
7810 }
7811 cb->handle_response(ret_code);
7812 cb->put();
7813 }
7814 }
7815 }
7816
7817 void unset_cb() {
9f95a23c 7818 std::lock_guard l{lock};
7c673cae
FG
7819 should_cb = false;
7820 }
7821};
7822
b3b6e05e 7823int RGWRados::get_bucket_stats_async(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
7c673cae
FG
7824{
7825 int num_aio = 0;
f67539c2 7826 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.layout.current_index.layout.normal.num_shards ? : 1);
11fdf7f2 7827 ceph_assert(get_ctx);
b3b6e05e 7828 int r = cls_bucket_head_async(dpp, bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
7829 if (r < 0) {
7830 ctx->put();
7831 if (num_aio) {
7832 get_ctx->unset_cb();
7833 }
7834 }
c07f9fc5 7835 get_ctx->put();
7c673cae
FG
7836 return r;
7837}
7838
e306af50
TL
7839int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx,
7840 const string& meta_key,
7841 RGWBucketInfo& info,
7842 real_time *pmtime,
7843 map<string, bufferlist> *pattrs,
b3b6e05e
TL
7844 optional_yield y,
7845 const DoutPrefixProvider *dpp)
9f95a23c
TL
7846{
7847 rgw_bucket bucket;
7848 rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
7c673cae 7849
b3b6e05e 7850 return get_bucket_instance_info(obj_ctx, bucket, info, pmtime, pattrs, y, dpp);
9f95a23c 7851}
7c673cae 7852
11fdf7f2 7853int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
b3b6e05e
TL
7854 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y,
7855 const DoutPrefixProvider *dpp)
7c673cae 7856{
9f95a23c
TL
7857 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7858 return ctl.bucket->read_bucket_instance_info(bucket, &info,
7859 y,
b3b6e05e 7860 dpp,
9f95a23c
TL
7861 RGWBucketCtl::BucketInstance::GetParams()
7862 .set_mtime(pmtime)
7863 .set_attrs(pattrs)
7864 .set_bectx_params(bectx_params));
7c673cae
FG
7865}
7866
9f95a23c 7867int RGWRados::get_bucket_info(RGWServices *svc,
b32b8144
FG
7868 const string& tenant, const string& bucket_name,
7869 RGWBucketInfo& info,
9f95a23c 7870 real_time *pmtime,
b3b6e05e
TL
7871 optional_yield y,
7872 const DoutPrefixProvider *dpp, map<string, bufferlist> *pattrs)
b32b8144 7873{
9f95a23c
TL
7874 auto obj_ctx = svc->sysobj->init_obj_ctx();
7875 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7876 rgw_bucket bucket;
7877 bucket.tenant = tenant;
7878 bucket.name = bucket_name;
b3b6e05e 7879 return ctl.bucket->read_bucket_info(bucket, &info, y, dpp,
9f95a23c
TL
7880 RGWBucketCtl::BucketInstance::GetParams()
7881 .set_mtime(pmtime)
7882 .set_attrs(pattrs)
7883 .set_bectx_params(bectx_params));
b32b8144
FG
7884}
7885
7886int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
7887 ceph::real_time *pmtime,
b3b6e05e 7888 const DoutPrefixProvider *dpp,
b32b8144
FG
7889 map<string, bufferlist> *pattrs)
7890{
9f95a23c
TL
7891 rgw_bucket bucket = info.bucket;
7892 bucket.bucket_id.clear();
b32b8144 7893
9f95a23c 7894 auto rv = info.objv_tracker.read_version;
b32b8144 7895
b3b6e05e 7896 return ctl.bucket->read_bucket_info(bucket, &info, null_yield, dpp,
9f95a23c
TL
7897 RGWBucketCtl::BucketInstance::GetParams()
7898 .set_mtime(pmtime)
7899 .set_attrs(pattrs)
7900 .set_refresh_version(rv));
7c673cae
FG
7901}
7902
7903int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
b3b6e05e
TL
7904 real_time mtime, map<string, bufferlist> *pattrs,
7905 const DoutPrefixProvider *dpp)
7c673cae 7906{
b3b6e05e 7907 return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield, dpp,
9f95a23c
TL
7908 RGWBucketCtl::BucketInstance::PutParams()
7909 .set_exclusive(exclusive)
7910 .set_mtime(mtime)
7911 .set_attrs(pattrs));
7c673cae
FG
7912}
7913
7914int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
b3b6e05e
TL
7915 map<string, bufferlist> *pattrs, bool create_entry_point,
7916 const DoutPrefixProvider *dpp)
7c673cae
FG
7917{
7918 bool create_head = !info.has_instance_obj || create_entry_point;
7919
b3b6e05e 7920 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs, dpp);
7c673cae
FG
7921 if (ret < 0) {
7922 return ret;
7923 }
7924
7925 if (!create_head)
7926 return 0; /* done! */
7927
7928 RGWBucketEntryPoint entry_point;
7929 entry_point.bucket = info.bucket;
7930 entry_point.owner = info.owner;
7931 entry_point.creation_time = info.creation_time;
7932 entry_point.linked = true;
7933 RGWObjVersionTracker ot;
7934 if (pep_objv && !pep_objv->tag.empty()) {
7935 ot.write_version = *pep_objv;
7936 } else {
7937 ot.generate_new_write_ver(cct);
7938 if (pep_objv) {
7939 *pep_objv = ot.write_version;
7940 }
7941 }
b3b6e05e 7942 ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, dpp, RGWBucketCtl::Bucket::PutParams()
9f95a23c
TL
7943 .set_exclusive(exclusive)
7944 .set_objv_tracker(&ot)
7945 .set_mtime(mtime));
7c673cae
FG
7946 if (ret < 0)
7947 return ret;
7948
7949 return 0;
7950}
7951
b3b6e05e 7952int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m, const DoutPrefixProvider *dpp)
7c673cae 7953{
11fdf7f2 7954 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
7955
7956 map<string, RGWBucketEnt>::iterator iter;
7957 for (iter = m.begin(); iter != m.end(); ++iter) {
7958 RGWBucketEnt& ent = iter->second;
7959 rgw_bucket& bucket = ent.bucket;
7960 ent.count = 0;
7961 ent.size = 0;
7962 ent.size_rounded = 0;
7963
a8e16298 7964 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7965
7966 RGWBucketInfo bucket_info;
b3b6e05e 7967 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL, null_yield, dpp);
7c673cae
FG
7968 if (ret < 0) {
7969 return ret;
7970 }
7971
b3b6e05e 7972 int r = cls_bucket_head(dpp, bucket_info, RGW_NO_SHARD, headers);
7c673cae
FG
7973 if (r < 0)
7974 return r;
7975
a8e16298 7976 auto hiter = headers.begin();
7c673cae
FG
7977 for (; hiter != headers.end(); ++hiter) {
7978 RGWObjCategory category = main_category;
11fdf7f2 7979 auto iter = (hiter->stats).find(category);
a8e16298 7980 if (iter != hiter->stats.end()) {
7c673cae
FG
7981 struct rgw_bucket_category_stats& stats = iter->second;
7982 ent.count += stats.num_entries;
7983 ent.size += stats.total_size;
7984 ent.size_rounded += stats.total_size_rounded;
7985 }
7986 }
3efd9988
FG
7987
7988 // fill in placement_rule from the bucket instance for use in swift's
7989 // per-storage policy statistics
7990 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
7991 }
7992
7993 return m.size();
7994}
7995
b3b6e05e 7996int RGWRados::append_async(const DoutPrefixProvider *dpp, rgw_raw_obj& obj, size_t size, bufferlist& bl)
7c673cae
FG
7997{
7998 rgw_rados_ref ref;
b3b6e05e 7999 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
8000 if (r < 0) {
8001 return r;
8002 }
8003 librados::Rados *rad = get_rados_handle();
9f95a23c 8004 librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
7c673cae 8005
9f95a23c 8006 r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
7c673cae
FG
8007 completion->release();
8008 return r;
8009}
8010
b3b6e05e 8011int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, RGWPoolIterCtx& ctx)
7c673cae
FG
8012{
8013 librados::IoCtx& io_ctx = ctx.io_ctx;
8014 librados::NObjectIterator& iter = ctx.iter;
8015
b3b6e05e 8016 int r = open_pool_ctx(dpp, pool, io_ctx, false);
7c673cae
FG
8017 if (r < 0)
8018 return r;
8019
8020 iter = io_ctx.nobjects_begin();
8021
8022 return 0;
8023}
8024
b3b6e05e 8025int RGWRados::pool_iterate_begin(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
181888fb
FG
8026{
8027 librados::IoCtx& io_ctx = ctx.io_ctx;
8028 librados::NObjectIterator& iter = ctx.iter;
8029
b3b6e05e 8030 int r = open_pool_ctx(dpp, pool, io_ctx, false);
181888fb
FG
8031 if (r < 0)
8032 return r;
8033
8034 librados::ObjectCursor oc;
8035 if (!oc.from_str(cursor)) {
b3b6e05e 8036 ldpp_dout(dpp, 10) << "failed to parse cursor: " << cursor << dendl;
181888fb
FG
8037 return -EINVAL;
8038 }
8039
f64942e4
AA
8040 try {
8041 iter = io_ctx.nobjects_begin(oc);
8042 return 0;
8043 } catch (const std::system_error& e) {
8044 r = -e.code().value();
b3b6e05e 8045 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
f64942e4
AA
8046 << ", returning " << r << dendl;
8047 return r;
8048 } catch (const std::exception& e) {
b3b6e05e 8049 ldpp_dout(dpp, 10) << "nobjects_begin threw " << e.what()
f64942e4
AA
8050 << ", returning -5" << dendl;
8051 return -EIO;
8052 }
181888fb
FG
8053}
8054
8055string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
8056{
8057 return ctx.iter.get_cursor().to_str();
8058}
8059
20effc67 8060static int do_pool_iterate(const DoutPrefixProvider *dpp, CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
f64942e4 8061 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
8062 bool *is_truncated, RGWAccessListFilter *filter)
8063{
8064 librados::IoCtx& io_ctx = ctx.io_ctx;
8065 librados::NObjectIterator& iter = ctx.iter;
8066
8067 if (iter == io_ctx.nobjects_end())
8068 return -ENOENT;
8069
8070 uint32_t i;
8071
8072 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
8073 rgw_bucket_dir_entry e;
8074
8075 string oid = iter->get_oid();
20effc67 8076 ldpp_dout(dpp, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
7c673cae
FG
8077
8078 // fill it in with initial values; we may correct later
8079 if (filter && !filter->filter(oid, oid))
8080 continue;
8081
8082 e.key = oid;
8083 objs.push_back(e);
8084 }
8085
8086 if (is_truncated)
8087 *is_truncated = (iter != io_ctx.nobjects_end());
8088
8089 return objs.size();
8090}
7c673cae 8091
20effc67 8092int RGWRados::pool_iterate(const DoutPrefixProvider *dpp, RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
f64942e4
AA
8093 bool *is_truncated, RGWAccessListFilter *filter)
8094{
8095 // catch exceptions from NObjectIterator::operator++()
8096 try {
20effc67 8097 return do_pool_iterate(dpp, cct, ctx, num, objs, is_truncated, filter);
f64942e4
AA
8098 } catch (const std::system_error& e) {
8099 int r = -e.code().value();
20effc67 8100 ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
f64942e4
AA
8101 << ", returning " << r << dendl;
8102 return r;
8103 } catch (const std::exception& e) {
20effc67 8104 ldpp_dout(dpp, 10) << "NObjectIterator threw exception " << e.what()
f64942e4
AA
8105 << ", returning -5" << dendl;
8106 return -EIO;
8107 }
8108}
8109
b3b6e05e 8110int RGWRados::list_raw_objects_init(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 8111{
181888fb 8112 if (!ctx->initialized) {
b3b6e05e 8113 int r = pool_iterate_begin(dpp, pool, marker, ctx->iter_ctx);
7c673cae 8114 if (r < 0) {
b3b6e05e 8115 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
7c673cae
FG
8116 return r;
8117 }
181888fb 8118 ctx->initialized = true;
7c673cae 8119 }
181888fb
FG
8120 return 0;
8121}
7c673cae 8122
b3b6e05e 8123int RGWRados::list_raw_objects_next(const DoutPrefixProvider *dpp, const string& prefix_filter, int max,
181888fb
FG
8124 RGWListRawObjsCtx& ctx, list<string>& oids,
8125 bool *is_truncated)
8126{
8127 if (!ctx.initialized) {
8128 return -EINVAL;
8129 }
8130 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae 8131 vector<rgw_bucket_dir_entry> objs;
20effc67 8132 int r = pool_iterate(dpp, ctx.iter_ctx, max, objs, is_truncated, &filter);
7c673cae
FG
8133 if (r < 0) {
8134 if(r != -ENOENT)
b3b6e05e 8135 ldpp_dout(dpp, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
7c673cae
FG
8136 return r;
8137 }
8138
8139 vector<rgw_bucket_dir_entry>::iterator iter;
8140 for (iter = objs.begin(); iter != objs.end(); ++iter) {
8141 oids.push_back(iter->key.name);
8142 }
8143
8144 return oids.size();
8145}
8146
b3b6e05e 8147int RGWRados::list_raw_objects(const DoutPrefixProvider *dpp, const rgw_pool& pool, const string& prefix_filter,
181888fb
FG
8148 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
8149 bool *is_truncated)
8150{
8151 if (!ctx.initialized) {
b3b6e05e 8152 int r = list_raw_objects_init(dpp, pool, string(), &ctx);
181888fb
FG
8153 if (r < 0) {
8154 return r;
8155 }
8156 }
8157
b3b6e05e 8158 return list_raw_objects_next(dpp, prefix_filter, max, ctx, oids, is_truncated);
181888fb
FG
8159}
8160
8161string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
8162{
8163 return pool_iterate_get_cursor(ctx.iter_ctx);
8164}
8165
b3b6e05e 8166int RGWRados::bi_get_instance(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298 8167 rgw_bucket_dir_entry *dirent)
7c673cae 8168{
a8e16298 8169 rgw_cls_bi_entry bi_entry;
b3b6e05e 8170 int r = bi_get(dpp, bucket_info, obj, BIIndexType::Instance, &bi_entry);
a8e16298 8171 if (r < 0 && r != -ENOENT) {
b3b6e05e 8172 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
a8e16298 8173 }
7c673cae
FG
8174 if (r < 0) {
8175 return r;
8176 }
11fdf7f2 8177 auto iter = bi_entry.data.cbegin();
a8e16298 8178 try {
11fdf7f2 8179 decode(*dirent, iter);
a8e16298 8180 } catch (buffer::error& err) {
b3b6e05e 8181 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
a8e16298
TL
8182 return -EIO;
8183 }
8184
8185 return 0;
8186}
7c673cae 8187
b3b6e05e 8188int RGWRados::bi_get_olh(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298
TL
8189 rgw_bucket_olh_entry *olh)
8190{
7c673cae 8191 rgw_cls_bi_entry bi_entry;
b3b6e05e 8192 int r = bi_get(dpp, bucket_info, obj, BIIndexType::OLH, &bi_entry);
7c673cae 8193 if (r < 0 && r != -ENOENT) {
b3b6e05e 8194 ldpp_dout(dpp, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7c673cae
FG
8195 }
8196 if (r < 0) {
8197 return r;
8198 }
11fdf7f2 8199 auto iter = bi_entry.data.cbegin();
7c673cae 8200 try {
a8e16298 8201 decode(*olh, iter);
7c673cae 8202 } catch (buffer::error& err) {
b3b6e05e 8203 ldpp_dout(dpp, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7c673cae
FG
8204 return -EIO;
8205 }
8206
8207 return 0;
8208}
8209
b3b6e05e 8210int RGWRados::bi_get(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
a8e16298 8211 BIIndexType index_type, rgw_cls_bi_entry *entry)
7c673cae
FG
8212{
8213 BucketShard bs(this);
b3b6e05e 8214 int ret = bs.init(dpp, bucket_info, obj);
7c673cae 8215 if (ret < 0) {
b3b6e05e 8216 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8217 return ret;
8218 }
8219
8220 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
9f95a23c
TL
8221
8222 auto& ref = bs.bucket_obj.get_ref();
7c673cae 8223
9f95a23c 8224 return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
7c673cae
FG
8225}
8226
8227void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
8228{
9f95a23c
TL
8229 auto& ref = bs.bucket_obj.get_ref();
8230 cls_rgw_bi_put(op, ref.obj.oid, entry);
7c673cae
FG
8231}
8232
8233int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
8234{
9f95a23c
TL
8235 auto& ref = bs.bucket_obj.get_ref();
8236 int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
7c673cae
FG
8237 if (ret < 0)
8238 return ret;
8239
8240 return 0;
8241}
8242
b3b6e05e 8243int RGWRados::bi_put(const DoutPrefixProvider *dpp, rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
7c673cae 8244{
20effc67
TL
8245 // make sure incomplete multipart uploads are hashed correctly
8246 if (obj.key.ns == RGW_OBJ_NS_MULTIPART) {
8247 RGWMPObj mp;
8248 mp.from_meta(obj.key.name);
8249 obj.index_hash_source = mp.get_key();
8250 }
7c673cae 8251 BucketShard bs(this);
20effc67 8252
b3b6e05e 8253 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8254 if (ret < 0) {
b3b6e05e 8255 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8256 return ret;
8257 }
8258
8259 return bi_put(bs, entry);
8260}
8261
20effc67
TL
8262int RGWRados::bi_list(const DoutPrefixProvider *dpp, rgw_bucket& bucket,
8263 const string& obj_name_filter, const string& marker, uint32_t max,
8264 list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae 8265{
20effc67 8266 rgw_obj obj(bucket, obj_name_filter);
7c673cae 8267 BucketShard bs(this);
b3b6e05e 8268 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8269 if (ret < 0) {
b3b6e05e 8270 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8271 return ret;
8272 }
8273
9f95a23c 8274 auto& ref = bs.bucket_obj.get_ref();
20effc67 8275 ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
31f18b77
FG
8276 if (ret == -ENOENT) {
8277 *is_truncated = false;
8278 }
7c673cae
FG
8279 if (ret < 0)
8280 return ret;
8281
8282 return 0;
8283}
8284
20effc67
TL
8285int RGWRados::bi_list(BucketShard& bs, const string& obj_name_filter, const string& marker, uint32_t max,
8286 list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae 8287{
9f95a23c 8288 auto& ref = bs.bucket_obj.get_ref();
20effc67 8289 int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name_filter, marker, max, entries, is_truncated);
7c673cae
FG
8290 if (ret < 0)
8291 return ret;
8292
8293 return 0;
8294}
8295
20effc67
TL
8296int RGWRados::bi_list(const DoutPrefixProvider *dpp,
8297 const RGWBucketInfo& bucket_info, int shard_id, const string& obj_name_filter, const string& marker, uint32_t max,
8298 list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7c673cae 8299{
20effc67
TL
8300 BucketShard bs(this);
8301 int ret = bs.init(bucket_info.bucket, shard_id, bucket_info.layout.current_index, nullptr /* no RGWBucketInfo */, dpp);
7c673cae 8302 if (ret < 0) {
20effc67 8303 ldpp_dout(dpp, 5) << "bs.init() returned ret=" << ret << dendl;
7c673cae
FG
8304 return ret;
8305 }
8306
20effc67 8307 return bi_list(bs, obj_name_filter, marker, max, entries, is_truncated);
7c673cae
FG
8308}
8309
20effc67 8310int RGWRados::bi_remove(const DoutPrefixProvider *dpp, BucketShard& bs)
7c673cae 8311{
20effc67
TL
8312 auto& ref = bs.bucket_obj.get_ref();
8313 int ret = ref.pool.ioctx().remove(ref.obj.oid);
8314 if (ret == -ENOENT) {
8315 ret = 0;
8316 }
7c673cae 8317 if (ret < 0) {
20effc67 8318 ldpp_dout(dpp, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
7c673cae
FG
8319 return ret;
8320 }
8321
20effc67 8322 return 0;
7c673cae
FG
8323}
8324
b3b6e05e 8325int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectWriteOperation *op)
7c673cae 8326{
b3b6e05e 8327 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, null_yield);
7c673cae
FG
8328}
8329
9f95a23c
TL
8330int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
8331 librados::ObjectWriteOperation *op)
7c673cae 8332{
9f95a23c 8333 return gc_pool_ctx.aio_operate(oid, c, op);
7c673cae
FG
8334}
8335
b3b6e05e 8336int RGWRados::gc_operate(const DoutPrefixProvider *dpp, string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
7c673cae 8337{
b3b6e05e 8338 return rgw_rados_operate(dpp, gc_pool_ctx, oid, op, pbl, null_yield);
7c673cae
FG
8339}
8340
9f95a23c 8341int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
7c673cae 8342{
9f95a23c 8343 return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
7c673cae
FG
8344}
8345
11fdf7f2 8346int RGWRados::process_gc(bool expired_only)
7c673cae 8347{
11fdf7f2 8348 return gc->process(expired_only);
7c673cae
FG
8349}
8350
f6b5b4d7 8351int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
f67539c2 8352 vector<rgw::sal::Lifecycle::LCEntry>& progress_map,
f6b5b4d7 8353 int& index)
7c673cae 8354{
f6b5b4d7 8355 return lc->list_lc_progress(marker, max_entries, progress_map, index);
7c673cae
FG
8356}
8357
20effc67 8358int RGWRados::process_lc(const std::unique_ptr<rgw::sal::Bucket>& optional_bucket)
7c673cae 8359{
f6b5b4d7
TL
8360 RGWLC lc;
8361 lc.initialize(cct, this->store);
8362 RGWLC::LCWorker worker(&lc, cct, &lc, 0);
20effc67 8363 auto ret = lc.process(&worker, optional_bucket, true /* once */);
f6b5b4d7
TL
8364 lc.stop_processor(); // sets down_flag, but returns immediately
8365 return ret;
7c673cae
FG
8366}
8367
b3b6e05e 8368bool RGWRados::process_expire_objects(const DoutPrefixProvider *dpp)
7c673cae 8369{
b3b6e05e 8370 return obj_expirer->inspect_all_shards(dpp, utime_t(), ceph_clock_now());
7c673cae
FG
8371}
8372
b3b6e05e 8373int RGWRados::cls_obj_prepare_op(const DoutPrefixProvider *dpp, BucketShard& bs, RGWModifyOp op, string& tag,
9f95a23c 8374 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
7c673cae 8375{
31f18b77
FG
8376 rgw_zone_set zones_trace;
8377 if (_zones_trace) {
8378 zones_trace = *_zones_trace;
8379 }
9f95a23c 8380 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8381
7c673cae
FG
8382 ObjectWriteOperation o;
8383 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77 8384 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
11fdf7f2 8385 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
b3b6e05e 8386 return bs.bucket_obj.operate(dpp, &o, y);
7c673cae
FG
8387}
8388
31f18b77 8389int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
8390 int64_t pool, uint64_t epoch,
8391 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8392 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 8393{
7c673cae
FG
8394 ObjectWriteOperation o;
8395 rgw_bucket_dir_entry_meta dir_meta;
8396 dir_meta = ent.meta;
8397 dir_meta.category = category;
8398
1adf2230
AA
8399 rgw_zone_set zones_trace;
8400 if (_zones_trace) {
8401 zones_trace = *_zones_trace;
8402 }
9f95a23c 8403 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8404
7c673cae
FG
8405 rgw_bucket_entry_ver ver;
8406 ver.pool = pool;
8407 ver.epoch = epoch;
8408 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
8409 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
8410 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8411 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
8412 complete_op_data *arg;
8413 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8414 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77 8415 librados::AioCompletion *completion = arg->rados_completion;
9f95a23c 8416 int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
31f18b77 8417 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
8418 return ret;
8419}
8420
31f18b77 8421int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
8422 int64_t pool, uint64_t epoch,
8423 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8424 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 8425{
31f18b77 8426 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
8427}
8428
8429int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
8430 int64_t pool, uint64_t epoch,
8431 rgw_obj& obj,
8432 real_time& removed_mtime,
8433 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
8434 uint16_t bilog_flags,
8435 rgw_zone_set *zones_trace)
7c673cae
FG
8436{
8437 rgw_bucket_dir_entry ent;
8438 ent.meta.mtime = removed_mtime;
8439 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8440 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
8441 ent, RGWObjCategory::None, remove_objs,
8442 bilog_flags, zones_trace);
7c673cae
FG
8443}
8444
20effc67
TL
8445int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj,
8446 list<rgw_obj_index_key> *remove_objs,
8447 uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
8448{
8449 rgw_bucket_dir_entry ent;
8450 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8451 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
8452 -1 /* pool id */, 0, ent,
20effc67 8453 RGWObjCategory::None, remove_objs, bilog_flags,
11fdf7f2 8454 zones_trace);
7c673cae
FG
8455}
8456
b3b6e05e 8457int RGWRados::cls_obj_set_bucket_tag_timeout(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, uint64_t timeout)
7c673cae 8458{
9f95a23c 8459 RGWSI_RADOS::Pool index_pool;
7c673cae 8460 map<int, string> bucket_objs;
b3b6e05e 8461 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
8462 if (r < 0)
8463 return r;
8464
9f95a23c
TL
8465 return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
8466}
8467
8468
8469uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
8470 uint32_t num_shards)
8471{
8472 // We want to minimize the chances that when num_shards >>
8473 // num_entries that we return much fewer than num_entries to the
8474 // client. Given all the overhead of making a cls call to the osd,
8475 // returning a few entries is not much more work than returning one
8476 // entry. This minimum might be better tuned based on future
8477 // experiments where num_shards >> num_entries. (Note: ">>" should
8478 // be interpreted as "much greater than".)
8479 constexpr uint32_t min_read = 8;
8480
8481 // The following is based on _"Balls into Bins" -- A Simple and
8482 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8483 // cases when num_shards >> num_entries (it almost serves as a
8484 // ceiling calculation). We also assume alpha is 1.0 and extract it
8485 // from the calculation. Future work could involve memoizing some of
8486 // the transcendental functions to minimize repeatedly re-calling
8487 // them with the same parameters, which we expect to be the case the
8488 // majority of the time.
8489 uint32_t calc_read =
8490 1 +
8491 static_cast<uint32_t>((num_entries / num_shards) +
8492 sqrt((2 * num_entries) *
8493 log(num_shards) / num_shards));
8494
8495 return std::max(min_read, calc_read);
7c673cae
FG
8496}
8497
1adf2230 8498
20effc67 8499int RGWRados::cls_bucket_list_ordered(const DoutPrefixProvider *dpp,
b3b6e05e 8500 RGWBucketInfo& bucket_info,
9f95a23c
TL
8501 const int shard_id,
8502 const rgw_obj_index_key& start_after,
20effc67
TL
8503 const std::string& prefix,
8504 const std::string& delimiter,
9f95a23c
TL
8505 const uint32_t num_entries,
8506 const bool list_versions,
8507 const uint16_t expansion_factor,
8508 ent_map_t& m,
8509 bool* is_truncated,
8510 bool* cls_filtered,
20effc67 8511 rgw_obj_index_key* last_entry,
9f95a23c 8512 optional_yield y,
20effc67 8513 RGWBucketListNameFilter force_check_filter)
7c673cae 8514{
9f95a23c
TL
8515 /* expansion_factor allows the number of entries to read to grow
8516 * exponentially; this is used when earlier reads are producing too
8517 * few results, perhaps due to filtering or to a series of
8518 * namespaced entries */
8519
20effc67
TL
8520 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": " <<
8521 bucket_info.bucket <<
8522 " start_after=\"" << start_after <<
8523 "\", prefix=\"" << prefix <<
8524 ", delimiter=\"" << delimiter <<
8525 "\", shard_id=" << shard_id <<
8526 "\", num_entries=" << num_entries <<
9f95a23c 8527 ", list_versions=" << list_versions <<
20effc67
TL
8528 ", expansion_factor=" << expansion_factor <<
8529 ", force_check_filter is " <<
8530 (force_check_filter ? "set" : "unset") << dendl;
7c673cae 8531
9f95a23c
TL
8532 m.clear();
8533
8534 RGWSI_RADOS::Pool index_pool;
7c673cae 8535 // key - oid (for different shards if there is any)
1adf2230
AA
8536 // value - list result for the corresponding oid (shard), it is filled by
8537 // the AIO callback
20effc67 8538 std::map<int, std::string> shard_oids;
b3b6e05e 8539 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id,
9f95a23c
TL
8540 &index_pool, &shard_oids,
8541 nullptr);
8542 if (r < 0) {
20effc67
TL
8543 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
8544 ": open_bucket_index for " << bucket_info.bucket << " failed" << dendl;
7c673cae 8545 return r;
9f95a23c
TL
8546 }
8547
8548 const uint32_t shard_count = shard_oids.size();
8549 uint32_t num_entries_per_shard;
8550 if (expansion_factor == 0) {
8551 num_entries_per_shard =
8552 calc_ordered_bucket_list_per_shard(num_entries, shard_count);
8553 } else if (expansion_factor <= 11) {
8554 // we'll max out the exponential multiplication factor at 1024 (2<<10)
8555 num_entries_per_shard =
8556 std::min(num_entries,
8557 (uint32_t(1 << (expansion_factor - 1)) *
8558 calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
8559 } else {
8560 num_entries_per_shard = num_entries;
8561 }
8562
20effc67
TL
8563 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
8564 ": request from each of " << shard_count <<
9f95a23c
TL
8565 " shard(s) for " << num_entries_per_shard << " entries to get " <<
8566 num_entries << " total entries" << dendl;
7c673cae 8567
9f95a23c 8568 auto& ioctx = index_pool.ioctx();
20effc67 8569 std::map<int, rgw_cls_list_ret> shard_list_results;
9f95a23c
TL
8570 cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
8571 r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
8572 num_entries_per_shard,
8573 list_versions, shard_oids, shard_list_results,
1adf2230 8574 cct->_conf->rgw_bucket_index_max_aio)();
9f95a23c 8575 if (r < 0) {
20effc67
TL
8576 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
8577 ": CLSRGWIssueBucketList for " << bucket_info.bucket <<
8578 " failed" << dendl;
7c673cae 8579 return r;
9f95a23c 8580 }
7c673cae 8581
9f95a23c
TL
8582 // to manage the iterators through each shard's list results
8583 struct ShardTracker {
8584 const size_t shard_idx;
8585 rgw_cls_list_ret& result;
8586 const std::string& oid_name;
8587 RGWRados::ent_map_t::iterator cursor;
8588 RGWRados::ent_map_t::iterator end;
8589
8590 // manages an iterator through a shard and provides other
8591 // accessors
8592 ShardTracker(size_t _shard_idx,
8593 rgw_cls_list_ret& _result,
8594 const std::string& _oid_name):
8595 shard_idx(_shard_idx),
8596 result(_result),
8597 oid_name(_oid_name),
8598 cursor(_result.dir.m.begin()),
8599 end(_result.dir.m.end())
8600 {}
8601
8602 inline const std::string& entry_name() const {
8603 return cursor->first;
8604 }
8605 rgw_bucket_dir_entry& dir_entry() const {
8606 return cursor->second;
8607 }
8608 inline bool is_truncated() const {
8609 return result.is_truncated;
8610 }
8611 inline ShardTracker& advance() {
8612 ++cursor;
8613 // return a self-reference to allow for chaining of calls, such
8614 // as x.advance().at_end()
8615 return *this;
8616 }
8617 inline bool at_end() const {
8618 return cursor == end;
8619 }
8620 }; // ShardTracker
8621
8622 // add the next unique candidate, or return false if we reach the end
f67539c2 8623 auto next_candidate = [] (CephContext *cct, ShardTracker& t,
9f95a23c
TL
8624 std::map<std::string, size_t>& candidates,
8625 size_t tracker_idx) {
8626 while (!t.at_end()) {
8627 if (candidates.emplace(t.entry_name(), tracker_idx).second) {
8628 return;
8629 }
8630 t.advance(); // skip duplicate common prefixes
8631 }
8632 };
8633
8634 // one tracker per shard requested (may not be all shards)
8635 std::vector<ShardTracker> results_trackers;
8636 results_trackers.reserve(shard_list_results.size());
8637 for (auto& r : shard_list_results) {
8638 results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
8639
8640 // if any *one* shard's result is trucated, the entire result is
8641 // truncated
8642 *is_truncated = *is_truncated || r.second.is_truncated;
8643
8644 // unless *all* are shards are cls_filtered, the entire result is
8645 // not filtered
8646 *cls_filtered = *cls_filtered && r.second.cls_filtered;
7c673cae
FG
8647 }
8648
9f95a23c
TL
8649 // create a map to track the next candidate entry from ShardTracker
8650 // (key=candidate, value=index into results_trackers); as we consume
8651 // entries from shards, we replace them with the next entries in the
8652 // shards until we run out
20effc67 8653 std::map<std::string, size_t> candidates;
9f95a23c
TL
8654 size_t tracker_idx = 0;
8655 for (auto& t : results_trackers) {
8656 // it's important that the values in the map refer to the index
8657 // into the results_trackers vector, which may not be the same
8658 // as the shard number (i.e., when not all shards are requested)
f67539c2 8659 next_candidate(cct, t, candidates, tracker_idx);
9f95a23c 8660 ++tracker_idx;
7c673cae
FG
8661 }
8662
9f95a23c
TL
8663 rgw_bucket_dir_entry*
8664 last_entry_visited = nullptr; // to set last_entry (marker)
20effc67 8665 std::map<std::string, bufferlist> updates;
7c673cae
FG
8666 uint32_t count = 0;
8667 while (count < num_entries && !candidates.empty()) {
8668 r = 0;
9f95a23c
TL
8669 // select the next entry in lexical order (first key in map);
8670 // again tracker_idx is not necessarily shard number, but is index
8671 // into results_trackers vector
8672 tracker_idx = candidates.begin()->second;
8673 auto& tracker = results_trackers.at(tracker_idx);
e306af50 8674
20effc67 8675 const std::string& name = tracker.entry_name();
9f95a23c
TL
8676 rgw_bucket_dir_entry& dirent = tracker.dir_entry();
8677
20effc67 8678 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ << ": currently processing " <<
9f95a23c
TL
8679 dirent.key << " from shard " << tracker.shard_idx << dendl;
8680
8681 const bool force_check =
8682 force_check_filter && force_check_filter(dirent.key.name);
8683
8684 if ((!dirent.exists &&
8685 !dirent.is_delete_marker() &&
8686 !dirent.is_common_prefix()) ||
3efd9988
FG
8687 !dirent.pending_map.empty() ||
8688 force_check) {
9f95a23c
TL
8689 /* there are uncommitted ops. We need to check the current
8690 * state, and if the tags are old we need to do clean-up as
8691 * well. */
7c673cae 8692 librados::IoCtx sub_ctx;
9f95a23c 8693 sub_ctx.dup(ioctx);
b3b6e05e 8694 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent,
9f95a23c 8695 updates[tracker.oid_name], y);
7c673cae 8696 if (r < 0 && r != -ENOENT) {
20effc67
TL
8697 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
8698 ": check_disk_state for \"" << dirent.key <<
8699 "\" failed with r=" << r << dendl;
9f95a23c 8700 return r;
7c673cae 8701 }
eafe8130 8702 } else {
9f95a23c 8703 r = 0;
7c673cae 8704 }
9f95a23c 8705
20effc67
TL
8706 // at this point either r >= 0 or r == -ENOENT
8707 if (r >= 0) { // i.e., if r != -ENOENT
8708 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": got " <<
8709 dirent.key << dendl;
8710
8711 auto [it, inserted] = m.insert_or_assign(name, std::move(dirent));
8712 last_entry_visited = &it->second;
8713 if (inserted) {
8714 ++count;
8715 } else {
8716 ldpp_dout(dpp, 0) << "WARNING: " << __PRETTY_FUNCTION__ <<
8717 " reassigned map value at \"" << name <<
8718 "\", which should not happen" << dendl;
8719 }
9f95a23c 8720 } else {
20effc67 8721 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << ": skipping " <<
9f95a23c 8722 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
e306af50 8723 last_entry_visited = &tracker.dir_entry();
7c673cae
FG
8724 }
8725
9f95a23c 8726 // refresh the candidates map
7c673cae 8727 candidates.erase(candidates.begin());
9f95a23c
TL
8728 tracker.advance();
8729
f67539c2 8730 next_candidate(cct, tracker, candidates, tracker_idx);
9f95a23c
TL
8731
8732 if (tracker.at_end() && tracker.is_truncated()) {
8733 // once we exhaust one shard that is truncated, we need to stop,
8734 // as we cannot be certain that one of the next entries needs to
8735 // come from that shard; S3 and swift protocols allow returning
8736 // fewer than what was requested
20effc67
TL
8737 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
8738 ": stopped accumulating results at count=" << count <<
8739 ", dirent=\"" << dirent.key <<
8740 "\", because its shard is untruncated and exhaused" << dendl;
9f95a23c 8741 break;
7c673cae 8742 }
9f95a23c 8743 } // while we haven't provided requested # of result entries
7c673cae 8744
9f95a23c
TL
8745 // suggest updates if there are any
8746 for (auto& miter : updates) {
8747 if (miter.second.length()) {
7c673cae 8748 ObjectWriteOperation o;
9f95a23c 8749 cls_rgw_suggest_changes(o, miter.second);
7c673cae 8750 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8751 AioCompletion *c =
8752 librados::Rados::aio_create_completion(nullptr, nullptr);
8753 ioctx.aio_operate(miter.first, c, &o);
1adf2230 8754 c->release();
7c673cae 8755 }
9f95a23c 8756 } // updates loop
7c673cae 8757
9f95a23c
TL
8758 // determine truncation by checking if all the returned entries are
8759 // consumed or not
8760 *is_truncated = false;
8761 for (const auto& t : results_trackers) {
8762 if (!t.at_end() || t.is_truncated()) {
7c673cae 8763 *is_truncated = true;
1adf2230
AA
8764 break;
8765 }
7c673cae 8766 }
92f5a8d4 8767
20effc67 8768 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
9f95a23c
TL
8769 ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
8770 dendl;
8771
8772 if (*is_truncated && count < num_entries) {
20effc67
TL
8773 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ <<
8774 ": requested " << num_entries << " entries but returning " <<
9f95a23c
TL
8775 count << ", which is truncated" << dendl;
8776 }
8777
8778 if (last_entry_visited != nullptr && last_entry) {
e306af50 8779 *last_entry = last_entry_visited->key;
20effc67 8780 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
9f95a23c
TL
8781 ": returning, last_entry=" << *last_entry << dendl;
8782 } else {
20effc67 8783 ldpp_dout(dpp, 20) << __PRETTY_FUNCTION__ <<
9f95a23c
TL
8784 ": returning, last_entry NOT SET" << dendl;
8785 }
7c673cae
FG
8786
8787 return 0;
8788}
8789
1adf2230 8790
20effc67
TL
// Extract the index hash source of an incomplete multipart entry: the part
// of the namespace-less oid that precedes its second-to-last period.
// Returns 0 and stores the result in *index_hash_source on success, or
// -EINVAL when the oid has fewer than two periods or when either of the
// last two periods sits at index 0.
static int parse_index_hash_source(const std::string& oid_wo_ns, std::string *index_hash_source) {
  const std::size_t last_dot = oid_wo_ns.rfind('.');
  if (last_dot == std::string::npos || last_dot == 0) {
    return -EINVAL;
  }

  const std::size_t prev_dot = oid_wo_ns.rfind('.', last_dot - 1);
  if (prev_dot == std::string::npos || prev_dot == 0) {
    return -EINVAL;
  }

  index_hash_source->assign(oid_wo_ns, 0, prev_dot);
  return 0;
}
8806
8807
20effc67 8808int RGWRados::cls_bucket_list_unordered(const DoutPrefixProvider *dpp,
b3b6e05e 8809 RGWBucketInfo& bucket_info,
1adf2230 8810 int shard_id,
9f95a23c 8811 const rgw_obj_index_key& start_after,
20effc67 8812 const std::string& prefix,
1adf2230
AA
8813 uint32_t num_entries,
8814 bool list_versions,
8815 std::vector<rgw_bucket_dir_entry>& ent_list,
8816 bool *is_truncated,
8817 rgw_obj_index_key *last_entry,
9f95a23c 8818 optional_yield y,
20effc67
TL
8819 RGWBucketListNameFilter force_check_filter) {
8820 ldpp_dout(dpp, 10) << __PRETTY_FUNCTION__ << " " <<
8821 bucket_info.bucket <<
8822 " start_after=\"" << start_after <<
8823 "\", prefix=\"" << prefix <<
8824 "\", shard_id=" << shard_id <<
8825 "\", num_entries=" << num_entries <<
8826 ", list_versions=" << list_versions <<
8827 ", force_check_filter is " <<
8828 (force_check_filter ? "set" : "unset") << dendl;
1adf2230 8829
9f95a23c 8830 ent_list.clear();
11fdf7f2
TL
8831 static MultipartMetaFilter multipart_meta_filter;
8832
1adf2230 8833 *is_truncated = false;
9f95a23c 8834 RGWSI_RADOS::Pool index_pool;
1adf2230 8835
20effc67 8836 std::map<int, std::string> oids;
b3b6e05e 8837 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, nullptr);
20effc67 8838 if (r < 0) {
1adf2230 8839 return r;
20effc67 8840 }
9f95a23c
TL
8841
8842 auto& ioctx = index_pool.ioctx();
8843
1adf2230
AA
8844 const uint32_t num_shards = oids.size();
8845
9f95a23c 8846 rgw_obj_index_key marker = start_after;
1adf2230
AA
8847 uint32_t current_shard;
8848 if (shard_id >= 0) {
8849 current_shard = shard_id;
9f95a23c 8850 } else if (start_after.empty()) {
1adf2230
AA
8851 current_shard = 0u;
8852 } else {
9f95a23c
TL
8853 // at this point we have a marker (start_after) that has something
8854 // in it, so we need to get to the bucket shard index, so we can
11fdf7f2
TL
8855 // start reading from there
8856
11fdf7f2
TL
8857
8858 // now convert the key (oid) to an rgw_obj_key since that will
8859 // separate out the namespace, name, and instance
8860 rgw_obj_key obj_key;
522d829b 8861 bool parsed = rgw_obj_key::parse_raw_oid(start_after.name, &obj_key);
11fdf7f2 8862 if (!parsed) {
20effc67
TL
8863 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8864 " received an invalid start marker: \"" << start_after << "\"" <<
8865 dendl;
11fdf7f2
TL
8866 return -EINVAL;
8867 } else if (obj_key.name.empty()) {
8868 // if the name is empty that means the object name came in with
8869 // a namespace only, and therefore we need to start our scan at
8870 // the first bucket index shard
8871 current_shard = 0u;
8872 } else {
8873 // so now we have the key used to compute the bucket index shard
8874 // and can extract the specific shard from it
522d829b
TL
8875 if (obj_key.ns == RGW_OBJ_NS_MULTIPART) {
8876 // Use obj_key.ns == RGW_OBJ_NS_MULTIPART instead of
8877 // the implementation relying on MultipartMetaFilter
8878 // because MultipartMetaFilter only checks .meta suffix, which may
8879 // exclude data multiparts but include some regular objects with .meta suffix
8880 // by mistake.
8881 string index_hash_source;
8882 r = parse_index_hash_source(obj_key.name, &index_hash_source);
8883 if (r < 0) {
20effc67
TL
8884 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8885 " parse_index_hash_source unable to parse \"" << obj_key.name <<
8886 "\", r=" << r << dendl;
522d829b
TL
8887 return r;
8888 }
8889 current_shard = svc.bi_rados->bucket_shard_index(index_hash_source, num_shards);
8890 } else {
8891 current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
8892 }
11fdf7f2 8893 }
1adf2230
AA
8894 }
8895
8896 uint32_t count = 0u;
20effc67 8897 std::map<std::string, bufferlist> updates;
11fdf7f2 8898 rgw_obj_index_key last_added_entry;
1adf2230
AA
8899 while (count <= num_entries &&
8900 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
8901 current_shard < num_shards)) {
81eedcae
TL
8902 const std::string& oid = oids[current_shard];
8903 rgw_cls_list_ret result;
8904
8905 librados::ObjectReadOperation op;
20effc67 8906 const std::string empty_delimiter;
9f95a23c
TL
8907 cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
8908 num_entries,
81eedcae 8909 list_versions, &result);
b3b6e05e 8910 r = rgw_rados_operate(dpp, ioctx, oid, &op, nullptr, null_yield);
20effc67
TL
8911 if (r < 0) {
8912 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8913 " error in rgw_rados_operate (bucket list op), r=" << r << dendl;
1adf2230 8914 return r;
20effc67 8915 }
1adf2230 8916
1adf2230
AA
8917 for (auto& entry : result.dir.m) {
8918 rgw_bucket_dir_entry& dirent = entry.second;
8919
8920 bool force_check = force_check_filter &&
8921 force_check_filter(dirent.key.name);
8922 if ((!dirent.exists && !dirent.is_delete_marker()) ||
8923 !dirent.pending_map.empty() ||
8924 force_check) {
8925 /* there are uncommitted ops. We need to check the current state,
8926 * and if the tags are old we need to do cleanup as well. */
8927 librados::IoCtx sub_ctx;
9f95a23c 8928 sub_ctx.dup(ioctx);
b3b6e05e 8929 r = check_disk_state(dpp, sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
1adf2230 8930 if (r < 0 && r != -ENOENT) {
20effc67
TL
8931 ldpp_dout(dpp, 0) << "ERROR: " << __func__ <<
8932 " error in check_disk_state, r=" << r << dendl;
1adf2230
AA
8933 return r;
8934 }
eafe8130
TL
8935 } else {
8936 r = 0;
1adf2230
AA
8937 }
8938
20effc67 8939 // at this point either r >= 0 or r == -ENOENT
1adf2230 8940 if (r >= 0) { // i.e., if r != -ENOENT
20effc67
TL
8941 ldpp_dout(dpp, 10) << __func__ << ": got " <<
8942 dirent.key << dendl;
1adf2230
AA
8943
8944 if (count < num_entries) {
11fdf7f2 8945 marker = last_added_entry = dirent.key; // double assign
1adf2230
AA
8946 ent_list.emplace_back(std::move(dirent));
8947 ++count;
8948 } else {
20effc67 8949 last_added_entry = dirent.key;
1adf2230 8950 *is_truncated = true;
20effc67
TL
8951 ldpp_dout(dpp, 10) << "INFO: " << __func__ <<
8952 ": reached max entries (" << num_entries << ") to return at \"" <<
8953 dirent.key << "\"" << dendl;
1adf2230
AA
8954 goto check_updates;
8955 }
8956 } else { // r == -ENOENT
8957 // in the case of -ENOENT, make sure we're advancing marker
8958 // for possible next call to CLSRGWIssueBucketList
11fdf7f2 8959 marker = dirent.key;
1adf2230
AA
8960 }
8961 } // entry for loop
8962
8963 if (!result.is_truncated) {
8964 // if we reached the end of the shard read next shard
8965 ++current_shard;
11fdf7f2 8966 marker = rgw_obj_index_key();
1adf2230
AA
8967 }
8968 } // shard loop
8969
8970check_updates:
11fdf7f2 8971
1adf2230 8972 // suggest updates if there is any
20effc67 8973 std::map<std::string, bufferlist>::iterator miter = updates.begin();
1adf2230
AA
8974 for (; miter != updates.end(); ++miter) {
8975 if (miter->second.length()) {
8976 ObjectWriteOperation o;
8977 cls_rgw_suggest_changes(o, miter->second);
8978 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8979 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
8980 ioctx.aio_operate(miter->first, c, &o);
1adf2230
AA
8981 c->release();
8982 }
8983 }
8984
8985 if (last_entry && !ent_list.empty()) {
8986 *last_entry = last_added_entry;
8987 }
8988
8989 return 0;
11fdf7f2 8990} // RGWRados::cls_bucket_list_unordered
1adf2230
AA
8991
8992
b3b6e05e 8993int RGWRados::cls_obj_usage_log_add(const DoutPrefixProvider *dpp, const string& oid,
1adf2230 8994 rgw_usage_log_info& info)
7c673cae 8995{
11fdf7f2 8996 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8997
8998 rgw_rados_ref ref;
b3b6e05e 8999 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
9000 if (r < 0) {
9001 return r;
9002 }
9003
9004 ObjectWriteOperation op;
9005 cls_rgw_usage_log_add(op, info);
9006
b3b6e05e 9007 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
9008 return r;
9009}
9010
b3b6e05e 9011int RGWRados::cls_obj_usage_log_read(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
11fdf7f2
TL
9012 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
9013 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
9014 bool *is_truncated)
7c673cae 9015{
11fdf7f2 9016 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9017
9018 rgw_rados_ref ref;
b3b6e05e 9019 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
9020 if (r < 0) {
9021 return r;
9022 }
9023
9024 *is_truncated = false;
9025
9f95a23c 9026 r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
7c673cae
FG
9027 max_entries, read_iter, usage, is_truncated);
9028
9029 return r;
9030}
9031
b3b6e05e 9032static int cls_rgw_usage_log_trim_repeat(const DoutPrefixProvider *dpp, rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
9f95a23c
TL
9033{
9034 bool done = false;
9035 do {
9036 librados::ObjectWriteOperation op;
9037 cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
b3b6e05e 9038 int r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
9f95a23c
TL
9039 if (r == -ENODATA)
9040 done = true;
9041 else if (r < 0)
9042 return r;
9043 } while (!done);
9044
9045 return 0;
9046}
9047
b3b6e05e 9048int RGWRados::cls_obj_usage_log_trim(const DoutPrefixProvider *dpp, const string& oid, const string& user, const string& bucket,
11fdf7f2 9049 uint64_t start_epoch, uint64_t end_epoch)
7c673cae 9050{
11fdf7f2 9051 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9052
9053 rgw_rados_ref ref;
b3b6e05e 9054 int r = get_raw_obj_ref(dpp, obj, &ref);
7c673cae
FG
9055 if (r < 0) {
9056 return r;
9057 }
9058
b3b6e05e 9059 r = cls_rgw_usage_log_trim_repeat(dpp, ref, user, bucket, start_epoch, end_epoch);
11fdf7f2
TL
9060 return r;
9061}
9062
b3b6e05e 9063int RGWRados::cls_obj_usage_log_clear(const DoutPrefixProvider *dpp, string& oid)
11fdf7f2
TL
9064{
9065 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9066
9067 rgw_rados_ref ref;
b3b6e05e 9068 int r = get_raw_obj_ref(dpp, obj, &ref);
11fdf7f2
TL
9069 if (r < 0) {
9070 return r;
9071 }
9072 librados::ObjectWriteOperation op;
9073 cls_rgw_usage_log_clear(op);
b3b6e05e 9074 r = rgw_rados_operate(dpp, ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
9075 return r;
9076}
9077
11fdf7f2 9078
b3b6e05e 9079int RGWRados::remove_objs_from_index(const DoutPrefixProvider *dpp, RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
7c673cae 9080{
9f95a23c 9081 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
9082 string dir_oid;
9083
11fdf7f2 9084 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae 9085
b3b6e05e 9086 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, &index_pool, &dir_oid);
7c673cae
FG
9087 if (r < 0)
9088 return r;
9089
9090 bufferlist updates;
9091
9092 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
9093 rgw_bucket_dir_entry entry;
9094 entry.key = *iter;
b3b6e05e 9095 ldpp_dout(dpp, 2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
7c673cae
FG
9096 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9097 updates.append(CEPH_RGW_REMOVE | suggest_flag);
11fdf7f2 9098 encode(entry, updates);
7c673cae
FG
9099 }
9100
9101 bufferlist out;
9102
9f95a23c 9103 r = index_pool.ioctx().exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
7c673cae
FG
9104
9105 return r;
9106}
9107
20effc67 9108int RGWRados::check_disk_state(const DoutPrefixProvider *dpp,
b3b6e05e 9109 librados::IoCtx io_ctx,
7c673cae
FG
9110 const RGWBucketInfo& bucket_info,
9111 rgw_bucket_dir_entry& list_state,
9112 rgw_bucket_dir_entry& object,
9f95a23c
TL
9113 bufferlist& suggested_updates,
9114 optional_yield y)
7c673cae
FG
9115{
9116 const rgw_bucket& bucket = bucket_info.bucket;
11fdf7f2 9117 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae
FG
9118
9119 std::string loc;
9120
9121 rgw_obj obj(bucket, list_state.key);
9122
9123 string oid;
9124 get_obj_bucket_and_oid_loc(obj, oid, loc);
9125
9126 if (loc != list_state.locator) {
b3b6e05e 9127 ldpp_dout(dpp, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
7c673cae
FG
9128 }
9129
9130 io_ctx.locator_set_key(list_state.locator);
9131
9132 RGWObjState *astate = NULL;
9f95a23c 9133 RGWObjectCtx rctx(this->store);
b3b6e05e 9134 int r = get_obj_state(dpp, &rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
9135 if (r < 0)
9136 return r;
9137
9138 list_state.pending_map.clear(); // we don't need this and it inflates size
9f95a23c 9139 if (!list_state.is_delete_marker() && !astate->exists) {
7c673cae
FG
9140 /* object doesn't exist right now -- hopefully because it's
9141 * marked as !exists and got deleted */
9142 if (list_state.exists) {
9143 /* FIXME: what should happen now? Work out if there are any
9144 * non-bad ways this could happen (there probably are, but annoying
9145 * to handle!) */
9146 }
20effc67 9147
7c673cae
FG
9148 // encode a suggested removal of that key
9149 list_state.ver.epoch = io_ctx.get_last_version();
9150 list_state.ver.pool = io_ctx.get_id();
9151 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
9152 return -ENOENT;
9153 }
9154
9155 string etag;
9156 string content_type;
2a845540 9157 string storage_class;
7c673cae
FG
9158 ACLOwner owner;
9159
9160 object.meta.size = astate->size;
9161 object.meta.accounted_size = astate->accounted_size;
9162 object.meta.mtime = astate->mtime;
9163
9164 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
9165 if (iter != astate->attrset.end()) {
11fdf7f2 9166 etag = rgw_bl_str(iter->second);
7c673cae
FG
9167 }
9168 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
9169 if (iter != astate->attrset.end()) {
11fdf7f2 9170 content_type = rgw_bl_str(iter->second);
7c673cae 9171 }
2a845540
TL
9172 iter = astate->attrset.find(RGW_ATTR_STORAGE_CLASS);
9173 if (iter != astate->attrset.end()) {
9174 storage_class = rgw_bl_str(iter->second);
9175 }
7c673cae
FG
9176 iter = astate->attrset.find(RGW_ATTR_ACL);
9177 if (iter != astate->attrset.end()) {
20effc67 9178 r = decode_policy(dpp, iter->second, &owner);
7c673cae 9179 if (r < 0) {
b3b6e05e 9180 ldpp_dout(dpp, 0) << "WARNING: could not decode policy for object: " << obj << dendl;
7c673cae
FG
9181 }
9182 }
9183
9f95a23c 9184 if (astate->manifest) {
7c673cae 9185 RGWObjManifest::obj_iterator miter;
9f95a23c 9186 RGWObjManifest& manifest = *astate->manifest;
b3b6e05e 9187 for (miter = manifest.obj_begin(dpp); miter != manifest.obj_end(dpp); ++miter) {
f67539c2 9188 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(store);
7c673cae 9189 rgw_obj loc;
9f95a23c 9190 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
7c673cae
FG
9191
9192 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
b3b6e05e
TL
9193 ldpp_dout(dpp, 0) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
9194 r = delete_obj_index(loc, astate->mtime, dpp);
7c673cae 9195 if (r < 0) {
b3b6e05e 9196 ldpp_dout(dpp, 0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
7c673cae
FG
9197 }
9198 }
9199 }
9200 }
9201
9202 object.meta.etag = etag;
9203 object.meta.content_type = content_type;
2a845540 9204 object.meta.storage_class = storage_class;
7c673cae
FG
9205 object.meta.owner = owner.get_id().to_str();
9206 object.meta.owner_display_name = owner.get_display_name();
9207
9208 // encode suggested updates
20effc67 9209
7c673cae
FG
9210 list_state.meta.size = object.meta.size;
9211 list_state.meta.accounted_size = object.meta.accounted_size;
9212 list_state.meta.mtime = object.meta.mtime;
9213 list_state.meta.category = main_category;
9214 list_state.meta.etag = etag;
9215 list_state.meta.content_type = content_type;
2a845540 9216 list_state.meta.storage_class = storage_class;
20effc67
TL
9217
9218 librados::IoCtx head_obj_ctx; // initialize to data pool so we can get pool id
1d09f67e
TL
9219 int ret = get_obj_head_ioctx(dpp, bucket_info, obj, &head_obj_ctx);
9220 if (ret < 0) {
20effc67
TL
9221 ldpp_dout(dpp, 0) << __PRETTY_FUNCTION__ <<
9222 " WARNING: unable to find head object data pool for \"" <<
9223 obj << "\", not updating version pool/epoch" << dendl;
1d09f67e
TL
9224 } else {
9225 list_state.ver.pool = head_obj_ctx.get_id();
9226 list_state.ver.epoch = astate->epoch;
20effc67
TL
9227 }
9228
9229 if (astate->obj_tag.length() > 0) {
7c673cae 9230 list_state.tag = astate->obj_tag.c_str();
20effc67
TL
9231 }
9232
7c673cae
FG
9233 list_state.meta.owner = owner.get_id().to_str();
9234 list_state.meta.owner_display_name = owner.get_display_name();
9235
9236 list_state.exists = true;
20effc67 9237
7c673cae
FG
9238 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
9239 return 0;
9240}
9241
b3b6e05e 9242int RGWRados::cls_bucket_head(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
7c673cae 9243{
9f95a23c 9244 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
9245 map<int, string> oids;
9246 map<int, struct rgw_cls_list_ret> list_results;
b3b6e05e 9247 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &oids, bucket_instance_ids);
9f95a23c 9248 if (r < 0) {
b3b6e05e 9249 ldpp_dout(dpp, 20) << "cls_bucket_head: open_bucket_index() returned "
9f95a23c 9250 << r << dendl;
7c673cae 9251 return r;
9f95a23c 9252 }
7c673cae 9253
9f95a23c
TL
9254 r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
9255 if (r < 0) {
b3b6e05e 9256 ldpp_dout(dpp, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
9f95a23c 9257 << r << dendl;
7c673cae 9258 return r;
9f95a23c 9259 }
7c673cae
FG
9260
9261 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9262 for(; iter != list_results.end(); ++iter) {
a8e16298 9263 headers.push_back(std::move(iter->second.dir.header));
7c673cae
FG
9264 }
9265 return 0;
9266}
9267
b3b6e05e 9268int RGWRados::cls_bucket_head_async(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
7c673cae 9269{
9f95a23c 9270 RGWSI_RADOS::Pool index_pool;
7c673cae 9271 map<int, string> bucket_objs;
b3b6e05e 9272 int r = svc.bi_rados->open_bucket_index(dpp, bucket_info, shard_id, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
9273 if (r < 0)
9274 return r;
9275
9276 map<int, string>::iterator iter = bucket_objs.begin();
9277 for (; iter != bucket_objs.end(); ++iter) {
9f95a23c 9278 r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
7c673cae
FG
9279 if (r < 0) {
9280 ctx->put();
9281 break;
9282 } else {
9283 (*num_aio)++;
9284 }
9285 }
9286 return r;
9287}
9288
9f95a23c
TL
9289int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
9290 const rgw_bucket& bucket,
b3b6e05e
TL
9291 uint64_t num_objs,
9292 const DoutPrefixProvider *dpp)
31f18b77 9293{
11fdf7f2 9294 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
31f18b77
FG
9295 return 0;
9296 }
9297
9298 bool need_resharding = false;
9f95a23c 9299 uint32_t num_source_shards =
f67539c2 9300 (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
9f95a23c
TL
9301 const uint32_t max_dynamic_shards =
9302 uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
9303
9304 if (num_source_shards >= max_dynamic_shards) {
9305 return 0;
9306 }
31f18b77 9307
9f95a23c 9308 uint32_t suggested_num_shards = 0;
11fdf7f2
TL
9309 const uint64_t max_objs_per_shard =
9310 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9f95a23c 9311
20effc67 9312 quota_handler->check_bucket_shards(dpp, max_objs_per_shard, num_source_shards,
9f95a23c
TL
9313 num_objs, need_resharding, &suggested_num_shards);
9314 if (! need_resharding) {
9315 return 0;
31f18b77
FG
9316 }
9317
9f95a23c
TL
9318 const uint32_t final_num_shards =
9319 RGWBucketReshard::get_preferred_shards(suggested_num_shards,
9320 max_dynamic_shards);
9321 // final verification, so we don't reduce number of shards
9322 if (final_num_shards <= num_source_shards) {
9323 return 0;
31f18b77
FG
9324 }
9325
b3b6e05e 9326 ldpp_dout(dpp, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
f67539c2 9327 " needs resharding; current num shards " << bucket_info.layout.current_index.layout.normal.num_shards <<
9f95a23c
TL
9328 "; new num shards " << final_num_shards << " (suggested " <<
9329 suggested_num_shards << ")" << dendl;
9330
b3b6e05e 9331 return add_bucket_to_reshard(dpp, bucket_info, final_num_shards);
31f18b77
FG
9332}
9333
b3b6e05e 9334int RGWRados::add_bucket_to_reshard(const DoutPrefixProvider *dpp, const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
31f18b77 9335{
b3b6e05e 9336 RGWReshard reshard(this->store, dpp);
31f18b77 9337
f67539c2 9338 uint32_t num_source_shards = (bucket_info.layout.current_index.layout.normal.num_shards > 0 ? bucket_info.layout.current_index.layout.normal.num_shards : 1);
31f18b77 9339
11fdf7f2 9340 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
31f18b77 9341 if (new_num_shards <= num_source_shards) {
b3b6e05e 9342 ldpp_dout(dpp, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
31f18b77
FG
9343 return 0;
9344 }
9345
9346 cls_rgw_reshard_entry entry;
9347 entry.time = real_clock::now();
9348 entry.tenant = bucket_info.owner.tenant;
9349 entry.bucket_name = bucket_info.bucket.name;
9350 entry.bucket_id = bucket_info.bucket.bucket_id;
9351 entry.old_num_shards = num_source_shards;
9352 entry.new_num_shards = new_num_shards;
9353
b3b6e05e 9354 return reshard.add(dpp, entry);
31f18b77
FG
9355}
9356
20effc67 9357int RGWRados::check_quota(const DoutPrefixProvider *dpp, const rgw_user& bucket_owner, rgw_bucket& bucket,
f67539c2
TL
9358 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota,
9359 uint64_t obj_size, optional_yield y,
9360 bool check_size_only)
7c673cae 9361{
11fdf7f2
TL
9362 // if we only check size, then num_objs will set to 0
9363 if(check_size_only)
20effc67 9364 return quota_handler->check_quota(dpp, bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size, y);
11fdf7f2 9365
20effc67 9366 return quota_handler->check_quota(dpp, bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size, y);
7c673cae
FG
9367}
9368
f67539c2 9369int RGWRados::get_target_shard_id(const rgw::bucket_index_normal_layout& layout, const string& obj_key,
11fdf7f2 9370 int *shard_id)
7c673cae 9371{
11fdf7f2 9372 int r = 0;
f67539c2
TL
9373 switch (layout.hash_type) {
9374 case rgw::BucketHashType::Mod:
9375 if (!layout.num_shards) {
11fdf7f2
TL
9376 if (shard_id) {
9377 *shard_id = -1;
9378 }
9379 } else {
f67539c2 9380 uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, layout.num_shards);
11fdf7f2
TL
9381 if (shard_id) {
9382 *shard_id = (int)sid;
9383 }
9384 }
9385 break;
9386 default:
9387 r = -ENOTSUP;
7c673cae 9388 }
11fdf7f2 9389 return r;
7c673cae
FG
9390}
9391
7c673cae
FG
9392uint64_t RGWRados::instance_id()
9393{
9394 return get_rados_handle()->get_instance_id();
9395}
9396
9397uint64_t RGWRados::next_bucket_id()
9398{
9f95a23c 9399 std::lock_guard l{bucket_id_lock};
7c673cae
FG
9400 return ++max_bucket_id;
9401}
9402
7c673cae
FG
9403librados::Rados* RGWRados::get_rados_handle()
9404{
494da23a 9405 return &rados;
7c673cae
FG
9406}
9407
b3b6e05e 9408int RGWRados::delete_raw_obj_aio(const DoutPrefixProvider *dpp, const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
7c673cae
FG
9409{
9410 rgw_rados_ref ref;
b3b6e05e 9411 int ret = get_raw_obj_ref(dpp, obj, &ref);
7c673cae 9412 if (ret < 0) {
b3b6e05e 9413 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
7c673cae
FG
9414 return ret;
9415 }
9416
9417 ObjectWriteOperation op;
9418 list<string> prefixes;
9419 cls_rgw_remove_obj(op, prefixes);
9420
9f95a23c
TL
9421 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9422 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae 9423 if (ret < 0) {
b3b6e05e 9424 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
7c673cae
FG
9425 c->release();
9426 return ret;
9427 }
9428
9429 handles.push_back(c);
9430
9431 return 0;
9432}
9433
b3b6e05e 9434int RGWRados::delete_obj_aio(const DoutPrefixProvider *dpp, const rgw_obj& obj,
7c673cae 9435 RGWBucketInfo& bucket_info, RGWObjState *astate,
9f95a23c
TL
9436 list<librados::AioCompletion *>& handles, bool keep_index_consistent,
9437 optional_yield y)
7c673cae
FG
9438{
9439 rgw_rados_ref ref;
b3b6e05e 9440 int ret = get_obj_head_ref(dpp, bucket_info, obj, &ref);
7c673cae 9441 if (ret < 0) {
b3b6e05e 9442 ldpp_dout(dpp, -1) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
7c673cae
FG
9443 return ret;
9444 }
9445
9446 if (keep_index_consistent) {
9447 RGWRados::Bucket bop(this, bucket_info);
9448 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9449
b3b6e05e 9450 ret = index_op.prepare(dpp, CLS_RGW_OP_DEL, &astate->write_tag, y);
7c673cae 9451 if (ret < 0) {
b3b6e05e 9452 ldpp_dout(dpp, -1) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
7c673cae
FG
9453 return ret;
9454 }
9455 }
9456
9457 ObjectWriteOperation op;
9458 list<string> prefixes;
9459 cls_rgw_remove_obj(op, prefixes);
9460
9f95a23c
TL
9461 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9462 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae 9463 if (ret < 0) {
b3b6e05e 9464 ldpp_dout(dpp, -1) << "ERROR: AioOperate failed with ret=" << ret << dendl;
7c673cae
FG
9465 c->release();
9466 return ret;
9467 }
9468
9469 handles.push_back(c);
9470
9471 if (keep_index_consistent) {
b3b6e05e 9472 ret = delete_obj_index(obj, astate->mtime, dpp);
7c673cae 9473 if (ret < 0) {
b3b6e05e 9474 ldpp_dout(dpp, -1) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
7c673cae
FG
9475 return ret;
9476 }
9477 }
9478 return ret;
9479}
20effc67
TL
9480
9481void objexp_hint_entry::generate_test_instances(list<objexp_hint_entry*>& o)
9482{
9483 auto it = new objexp_hint_entry;
9484 it->tenant = "tenant1";
9485 it->bucket_name = "bucket1";
9486 it->bucket_id = "1234";
9487 it->obj_key = rgw_obj_key("obj");
9488 o.push_back(it);
9489 o.push_back(new objexp_hint_entry);
9490}
9491
9492void objexp_hint_entry::dump(Formatter *f) const
9493{
9494 f->open_object_section("objexp_hint_entry");
9495 encode_json("tenant", tenant, f);
9496 encode_json("bucket_name", bucket_name, f);
9497 encode_json("bucket_id", bucket_id, f);
9498 encode_json("rgw_obj_key", obj_key, f);
9499 utime_t ut(exp_time);
9500 encode_json("exp_time", ut, f);
9501 f->close_section();
9502}
9503
9504void RGWOLHInfo::generate_test_instances(list<RGWOLHInfo*> &o)
9505{
9506 RGWOLHInfo *olh = new RGWOLHInfo;
9507 olh->removed = false;
9508 o.push_back(olh);
9509 o.push_back(new RGWOLHInfo);
9510}
9511
9512void RGWOLHInfo::dump(Formatter *f) const
9513{
9514 encode_json("target", target, f);
9515}
9516
9517void RGWOLHPendingInfo::dump(Formatter *f) const
9518{
9519 utime_t ut(time);
9520 encode_json("time", ut, f);
9521}
9522