]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
bump version to 15.2.6-pve1
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
9f95a23c 2// vim: ts=8 sw=2 smarttab ft=cpp
7c673cae 3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
9f95a23c
TL
8#include <sstream>
9
7c673cae 10#include <boost/algorithm/string.hpp>
11fdf7f2 11#include <string_view>
7c673cae 12
11fdf7f2 13#include <boost/container/flat_set.hpp>
7c673cae
FG
14#include <boost/format.hpp>
15#include <boost/optional.hpp>
16#include <boost/utility/in_place_factory.hpp>
17
18#include "common/ceph_json.h"
7c673cae
FG
19
20#include "common/errno.h"
21#include "common/Formatter.h"
22#include "common/Throttle.h"
7c673cae 23
9f95a23c 24#include "rgw_sal.h"
11fdf7f2 25#include "rgw_zone.h"
7c673cae
FG
26#include "rgw_cache.h"
27#include "rgw_acl.h"
28#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
11fdf7f2 29#include "rgw_aio_throttle.h"
7c673cae
FG
30#include "rgw_bucket.h"
31#include "rgw_rest_conn.h"
32#include "rgw_cr_rados.h"
33#include "rgw_cr_rest.h"
11fdf7f2 34#include "rgw_putobj_processor.h"
7c673cae
FG
35
36#include "cls/rgw/cls_rgw_ops.h"
7c673cae
FG
37#include "cls/rgw/cls_rgw_client.h"
38#include "cls/rgw/cls_rgw_const.h"
39#include "cls/refcount/cls_refcount_client.h"
40#include "cls/version/cls_version_client.h"
c07f9fc5 41#include "osd/osd_types.h"
7c673cae
FG
42
43#include "rgw_tools.h"
44#include "rgw_coroutine.h"
45#include "rgw_compression.h"
9f95a23c 46#include "rgw_worker.h"
7c673cae 47
7c673cae
FG
48#undef fork // fails to compile RGWPeriod::fork() below
49
50#include "common/Clock.h"
51
7c673cae
FG
52using namespace librados;
53
54#include <string>
55#include <iostream>
56#include <vector>
57#include <atomic>
58#include <list>
59#include <map>
11fdf7f2 60#include "include/random.h"
7c673cae
FG
61
62#include "rgw_gc.h"
63#include "rgw_lc.h"
64
65#include "rgw_object_expirer_core.h"
66#include "rgw_sync.h"
81eedcae 67#include "rgw_sync_counters.h"
11fdf7f2 68#include "rgw_sync_trace.h"
9f95a23c
TL
69#include "rgw_trim_datalog.h"
70#include "rgw_trim_mdlog.h"
7c673cae
FG
71#include "rgw_data_sync.h"
72#include "rgw_realm_watcher.h"
31f18b77 73#include "rgw_reshard.h"
7c673cae 74
11fdf7f2
TL
75#include "services/svc_zone.h"
76#include "services/svc_zone_utils.h"
77#include "services/svc_quota.h"
78#include "services/svc_sync_modules.h"
79#include "services/svc_sys_obj.h"
80#include "services/svc_sys_obj_cache.h"
9f95a23c
TL
81#include "services/svc_bucket.h"
82#include "services/svc_mdlog.h"
83#include "services/svc_datalog_rados.h"
11fdf7f2 84
7c673cae
FG
85#include "compressor/Compressor.h"
86
11fdf7f2
TL
87#ifdef WITH_LTTNG
88#define TRACEPOINT_DEFINE
89#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
90#include "tracing/rgw_rados.h"
91#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
92#undef TRACEPOINT_DEFINE
93#else
94#define tracepoint(...)
95#endif
96
7c673cae
FG
97#define dout_context g_ceph_context
98#define dout_subsys ceph_subsys_rgw
99
7c673cae 100
7c673cae 101static string shadow_ns = "shadow";
7c673cae
FG
102static string default_bucket_index_pool_suffix = "rgw.buckets.index";
103static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
11fdf7f2 104
11fdf7f2 105static RGWObjCategory main_category = RGWObjCategory::Main;
7c673cae 106#define RGW_USAGE_OBJ_PREFIX "usage."
7c673cae
FG
107
108#define dout_subsys ceph_subsys_rgw
109
110
111static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
112 const rgw_placement_rule& head_placement_rule,
113 const rgw_obj& obj, rgw_pool *pool)
7c673cae 114{
11fdf7f2 115 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
7c673cae 116 RGWZonePlacementInfo placement;
11fdf7f2 117 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
7c673cae
FG
118 return false;
119 }
120
121 if (!obj.in_extra_data) {
11fdf7f2 122 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
7c673cae 123 } else {
31f18b77 124 *pool = placement.get_data_extra_pool();
7c673cae
FG
125 }
126 }
127
128 return true;
129}
130
131static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
132 const rgw_placement_rule& head_placement_rule,
133 const rgw_obj& obj, rgw_raw_obj *raw_obj)
7c673cae
FG
134{
135 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
136
11fdf7f2 137 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
7c673cae
FG
138}
139
140rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
141{
142 if (!is_raw) {
143 rgw_raw_obj r;
144 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
145 return r;
146 }
147 return raw_obj;
148}
149
150rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
151{
152 if (!is_raw) {
153 rgw_raw_obj r;
154 store->obj_to_raw(placement_rule, obj, &r);
155 return r;
156 }
157 return raw_obj;
158}
159
11fdf7f2
TL
160void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
161{
162 obj_version *check_objv = version_for_check();
7c673cae 163
11fdf7f2
TL
164 if (check_objv) {
165 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae
FG
166 }
167
11fdf7f2 168 cls_version_read(*op, &read_version);
7c673cae
FG
169}
170
11fdf7f2
TL
171void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
172{
173 obj_version *check_objv = version_for_check();
174 obj_version *modify_version = version_for_write();
7c673cae 175
11fdf7f2
TL
176 if (check_objv) {
177 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae 178 }
7c673cae 179
11fdf7f2
TL
180 if (modify_version) {
181 cls_version_set(*op, *modify_version);
182 } else {
183 cls_version_inc(*op);
7c673cae 184 }
7c673cae
FG
185}
186
9f95a23c 187RGWObjState::RGWObjState() {
7c673cae
FG
188}
189
9f95a23c 190RGWObjState::~RGWObjState() {
7c673cae
FG
191}
192
9f95a23c
TL
193RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
194 is_atomic = rhs.is_atomic;
195 has_attrs = rhs.has_attrs;
196 exists = rhs.exists;
197 size = rhs.size;
198 accounted_size = rhs.accounted_size;
199 mtime = rhs.mtime;
200 epoch = rhs.epoch;
201 if (rhs.obj_tag.length()) {
202 obj_tag = rhs.obj_tag;
7c673cae 203 }
9f95a23c
TL
204 if (rhs.tail_tag.length()) {
205 tail_tag = rhs.tail_tag;
7c673cae 206 }
9f95a23c
TL
207 write_tag = rhs.write_tag;
208 fake_tag = rhs.fake_tag;
209 manifest = rhs.manifest;
210 shadow_obj = rhs.shadow_obj;
211 has_data = rhs.has_data;
212 if (rhs.data.length()) {
213 data = rhs.data;
7c673cae 214 }
9f95a23c
TL
215 prefetch_data = rhs.prefetch_data;
216 keep_tail = rhs.keep_tail;
217 is_olh = rhs.is_olh;
218 objv_tracker = rhs.objv_tracker;
219 pg_ver = rhs.pg_ver;
7c673cae
FG
220}
221
9f95a23c
TL
222RGWObjState *RGWObjectCtx::get_state(const rgw_obj& obj) {
223 RGWObjState *result;
224 typename std::map<rgw_obj, RGWObjState>::iterator iter;
225 lock.lock_shared();
226 assert (!obj.empty());
227 iter = objs_state.find(obj);
228 if (iter != objs_state.end()) {
229 result = &iter->second;
230 lock.unlock_shared();
231 } else {
232 lock.unlock_shared();
233 lock.lock();
234 result = &objs_state[obj];
235 lock.unlock();
224ce89b 236 }
9f95a23c 237 return result;
7c673cae
FG
238}
239
9f95a23c
TL
240void RGWObjectCtx::set_atomic(rgw_obj& obj) {
241 std::unique_lock wl{lock};
242 assert (!obj.empty());
243 objs_state[obj].is_atomic = true;
7c673cae 244}
9f95a23c
TL
245void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
246 std::unique_lock wl{lock};
247 assert (!obj.empty());
248 objs_state[obj].prefetch_data = true;
7c673cae
FG
249}
250
9f95a23c
TL
251void RGWObjectCtx::invalidate(const rgw_obj& obj) {
252 std::unique_lock wl{lock};
253 auto iter = objs_state.find(obj);
254 if (iter == objs_state.end()) {
11fdf7f2 255 return;
7c673cae 256 }
9f95a23c
TL
257 bool is_atomic = iter->second.is_atomic;
258 bool prefetch_data = iter->second.prefetch_data;
7c673cae 259
9f95a23c 260 objs_state.erase(iter);
7c673cae 261
9f95a23c
TL
262 if (is_atomic || prefetch_data) {
263 auto& state = objs_state[obj];
264 state.is_atomic = is_atomic;
265 state.prefetch_data = prefetch_data;
11fdf7f2 266 }
7c673cae
FG
267}
268
11fdf7f2 269void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
7c673cae 270{
11fdf7f2
TL
271 write_version.ver = 1;
272#define TAG_LEN 24
7c673cae 273
11fdf7f2
TL
274 write_version.tag.clear();
275 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
7c673cae
FG
276}
277
7c673cae
FG
278class RGWMetaNotifierManager : public RGWCoroutinesManager {
279 RGWRados *store;
280 RGWHTTPManager http_manager;
281
282public:
283 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
284 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 285 http_manager.start();
7c673cae
FG
286 }
287
9f95a23c 288 int notify_all(map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
7c673cae
FG
289 rgw_http_param_pair pairs[] = { { "type", "metadata" },
290 { "notify", NULL },
291 { NULL, NULL } };
292
293 list<RGWCoroutinesStack *> stacks;
9f95a23c 294 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
295 RGWRESTConn *conn = iter->second;
296 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
297 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
298
299 stacks.push_back(stack);
300 }
301 return run(stacks);
302 }
303};
304
305class RGWDataNotifierManager : public RGWCoroutinesManager {
306 RGWRados *store;
307 RGWHTTPManager http_manager;
308
309public:
310 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
311 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 312 http_manager.start();
7c673cae
FG
313 }
314
9f95a23c 315 int notify_all(map<rgw_zone_id, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
7c673cae
FG
316 rgw_http_param_pair pairs[] = { { "type", "data" },
317 { "notify", NULL },
11fdf7f2 318 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
7c673cae
FG
319 { NULL, NULL } };
320
321 list<RGWCoroutinesStack *> stacks;
9f95a23c 322 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
323 RGWRESTConn *conn = iter->second;
324 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
325 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
326
327 stacks.push_back(stack);
328 }
329 return run(stacks);
330 }
331};
332
11fdf7f2
TL
333/* class RGWRadosThread */
334
7c673cae
FG
335void RGWRadosThread::start()
336{
337 worker = new Worker(cct, this);
338 worker->create(thread_name.c_str());
339}
340
341void RGWRadosThread::stop()
342{
343 down_flag = true;
344 stop_process();
345 if (worker) {
31f18b77 346 worker->signal();
7c673cae
FG
347 worker->join();
348 }
349 delete worker;
350 worker = NULL;
351}
352
353void *RGWRadosThread::Worker::entry() {
354 uint64_t msec = processor->interval_msec();
9f95a23c 355 auto interval = std::chrono::milliseconds(msec);
7c673cae
FG
356
357 do {
9f95a23c 358 auto start = ceph::real_clock::now();
7c673cae
FG
359 int r = processor->process();
360 if (r < 0) {
361 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
362 }
363
364 if (processor->going_down())
365 break;
366
9f95a23c 367 auto end = ceph::real_clock::now() - start;
7c673cae
FG
368
369 uint64_t cur_msec = processor->interval_msec();
370 if (cur_msec != msec) { /* was it reconfigured? */
371 msec = cur_msec;
9f95a23c 372 interval = std::chrono::milliseconds(msec);
7c673cae
FG
373 }
374
375 if (cur_msec > 0) {
376 if (interval <= end)
377 continue; // next round
378
9f95a23c 379 auto wait_time = interval - end;
31f18b77 380 wait_interval(wait_time);
7c673cae 381 } else {
31f18b77 382 wait();
7c673cae
FG
383 }
384 } while (!processor->going_down());
385
386 return NULL;
387}
388
389class RGWMetaNotifier : public RGWRadosThread {
390 RGWMetaNotifierManager notify_mgr;
391 RGWMetadataLog *const log;
392
393 uint64_t interval_msec() override {
394 return cct->_conf->rgw_md_notify_interval_msec;
395 }
1adf2230
AA
396 void stop_process() override {
397 notify_mgr.stop();
398 }
7c673cae
FG
399public:
400 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
401 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
402
403 int process() override;
404};
405
406int RGWMetaNotifier::process()
407{
408 set<int> shards;
409
410 log->read_clear_modified(shards);
411
412 if (shards.empty()) {
413 return 0;
414 }
415
416 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
417 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
418 }
419
11fdf7f2 420 notify_mgr.notify_all(store->svc.zone->get_zone_conn_map(), shards);
7c673cae
FG
421
422 return 0;
423}
424
425class RGWDataNotifier : public RGWRadosThread {
426 RGWDataNotifierManager notify_mgr;
427
428 uint64_t interval_msec() override {
11fdf7f2 429 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 430 }
1adf2230
AA
431 void stop_process() override {
432 notify_mgr.stop();
433 }
7c673cae
FG
434public:
435 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
436
437 int process() override;
438};
439
440int RGWDataNotifier::process()
441{
9f95a23c
TL
442 auto data_log = store->svc.datalog_rados->get_log();
443 if (!data_log) {
7c673cae
FG
444 return 0;
445 }
446
447 map<int, set<string> > shards;
448
9f95a23c 449 data_log->read_clear_modified(shards);
7c673cae
FG
450
451 if (shards.empty()) {
452 return 0;
453 }
454
455 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
456 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
457 }
458
11fdf7f2 459 notify_mgr.notify_all(store->svc.zone->get_zone_data_notify_to_map(), shards);
7c673cae
FG
460
461 return 0;
462}
463
464class RGWSyncProcessorThread : public RGWRadosThread {
465public:
466 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
467 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
468 ~RGWSyncProcessorThread() override {}
469 int init() override = 0 ;
470 int process() override = 0;
471};
472
473class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
474{
475 RGWMetaSyncStatusManager sync;
476
477 uint64_t interval_msec() override {
478 return 0; /* no interval associated, it'll run once until stopped */
479 }
480 void stop_process() override {
481 sync.stop();
482 }
483public:
9f95a23c
TL
484 RGWMetaSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados)
485 : RGWSyncProcessorThread(_store->getRados(), "meta-sync"), sync(_store, async_rados) {}
7c673cae
FG
486
487 void wakeup_sync_shards(set<int>& shard_ids) {
488 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
489 sync.wakeup(*iter);
490 }
491 }
492 RGWMetaSyncStatusManager* get_manager() { return &sync; }
493
494 int init() override {
495 int ret = sync.init();
496 if (ret < 0) {
497 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
498 return ret;
499 }
500 return 0;
501 }
502
503 int process() override {
504 sync.run();
505 return 0;
506 }
507};
508
509class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
510{
81eedcae 511 PerfCountersRef counters;
7c673cae
FG
512 RGWDataSyncStatusManager sync;
513 bool initialized;
514
515 uint64_t interval_msec() override {
516 if (initialized) {
517 return 0; /* no interval associated, it'll run once until stopped */
518 } else {
519#define DATA_SYNC_INIT_WAIT_SEC 20
520 return DATA_SYNC_INIT_WAIT_SEC * 1000;
521 }
522 }
523 void stop_process() override {
524 sync.stop();
525 }
526public:
9f95a23c 527 RGWDataSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados,
81eedcae 528 const RGWZone* source_zone)
9f95a23c 529 : RGWSyncProcessorThread(_store->getRados(), "data-sync"),
81eedcae
TL
530 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
531 sync(_store, async_rados, source_zone->id, counters.get()),
7c673cae
FG
532 initialized(false) {}
533
534 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
535 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
536 sync.wakeup(iter->first, iter->second);
537 }
538 }
539 RGWDataSyncStatusManager* get_manager() { return &sync; }
540
541 int init() override {
542 return 0;
543 }
544
545 int process() override {
546 while (!initialized) {
547 if (going_down()) {
548 return 0;
549 }
550 int ret = sync.init();
551 if (ret >= 0) {
552 initialized = true;
553 break;
554 }
555 /* we'll be back! */
556 return 0;
557 }
558 sync.run();
559 return 0;
560 }
561};
562
11fdf7f2 563class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
7c673cae
FG
564{
565 RGWCoroutinesManager crs;
9f95a23c 566 rgw::sal::RGWRadosStore *store;
b32b8144 567 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
568 RGWHTTPManager http;
569 const utime_t trim_interval;
570
571 uint64_t interval_msec() override { return 0; }
572 void stop_process() override { crs.stop(); }
573public:
9f95a23c 574 RGWSyncLogTrimThread(rgw::sal::RGWRadosStore *store, rgw::BucketTrimManager *bucket_trim,
b32b8144 575 int interval)
9f95a23c
TL
576 : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
577 crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
b32b8144 578 bucket_trim(bucket_trim),
7c673cae
FG
579 http(store->ctx(), crs.get_completion_mgr()),
580 trim_interval(interval, 0)
581 {}
582
583 int init() override {
11fdf7f2 584 return http.start();
7c673cae
FG
585 }
586 int process() override {
587 list<RGWCoroutinesStack*> stacks;
588 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
11fdf7f2 589 meta->call(create_meta_log_trim_cr(this, store, &http,
7c673cae
FG
590 cct->_conf->rgw_md_log_max_shards,
591 trim_interval));
592 stacks.push_back(meta);
593
9f95a23c
TL
594 if (store->svc()->zone->sync_module_exports_data()) {
595 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
596 data->call(create_data_log_trim_cr(store, &http,
597 cct->_conf->rgw_data_log_num_shards,
598 trim_interval));
599 stacks.push_back(data);
7c673cae 600
9f95a23c
TL
601 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
602 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
603 stacks.push_back(bucket);
604 }
b32b8144 605
7c673cae
FG
606 crs.run(stacks);
607 return 0;
608 }
11fdf7f2
TL
609
610 // implements DoutPrefixProvider
611 CephContext *get_cct() const override { return store->ctx(); }
9f95a23c 612 unsigned get_subsys() const override
11fdf7f2
TL
613 {
614 return dout_subsys;
615 }
616
9f95a23c 617 std::ostream& gen_prefix(std::ostream& out) const override
11fdf7f2
TL
618 {
619 return out << "sync log trim: ";
620 }
621
7c673cae
FG
622};
623
624void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
625{
9f95a23c 626 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
627 if (meta_sync_processor_thread) {
628 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
629 }
630}
631
9f95a23c 632void RGWRados::wakeup_data_sync_shards(const rgw_zone_id& source_zone, map<int, set<string> >& shard_ids)
7c673cae
FG
633{
634 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
9f95a23c
TL
635 std::lock_guard l{data_sync_thread_lock};
636 auto iter = data_sync_processor_threads.find(source_zone);
7c673cae
FG
637 if (iter == data_sync_processor_threads.end()) {
638 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
639 return;
640 }
641
642 RGWDataSyncProcessorThread *thread = iter->second;
11fdf7f2 643 ceph_assert(thread);
7c673cae
FG
644 thread->wakeup_sync_shards(shard_ids);
645}
646
647RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
648{
9f95a23c 649 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
650 if (meta_sync_processor_thread) {
651 return meta_sync_processor_thread->get_manager();
652 }
653 return nullptr;
654}
655
9f95a23c 656RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
7c673cae 657{
9f95a23c 658 std::lock_guard l{data_sync_thread_lock};
7c673cae
FG
659 auto thread = data_sync_processor_threads.find(source_zone);
660 if (thread == data_sync_processor_threads.end()) {
661 return nullptr;
662 }
663 return thread->second->get_manager();
664}
665
666int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
667{
668 IoCtx ioctx;
494da23a 669 int r = open_pool_ctx(pool, ioctx, false);
7c673cae
FG
670 if (r < 0) {
671 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
672 return r;
673 }
674
675 bool requires;
676 r = ioctx.pool_requires_alignment2(&requires);
677 if (r < 0) {
678 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
679 << r << dendl;
680 return r;
681 }
682
683 if (!requires) {
684 *alignment = 0;
685 return 0;
686 }
687
688 uint64_t align;
689 r = ioctx.pool_required_alignment2(&align);
690 if (r < 0) {
691 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
692 << r << dendl;
693 return r;
694 }
695 if (align != 0) {
696 ldout(cct, 20) << "required alignment=" << align << dendl;
697 }
698 *alignment = align;
699 return 0;
700}
701
11fdf7f2
TL
702void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
703{
704 if (alignment == 0) {
705 *max_size = size;
706 return;
707 }
708
709 if (size <= alignment) {
710 *max_size = alignment;
711 return;
712 }
713
714 *max_size = size - (size % alignment);
715}
716
717int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment)
7c673cae 718{
11fdf7f2 719 uint64_t alignment;
7c673cae
FG
720 int r = get_required_alignment(pool, &alignment);
721 if (r < 0) {
722 return r;
723 }
724
11fdf7f2
TL
725 if (palignment) {
726 *palignment = alignment;
7c673cae
FG
727 }
728
11fdf7f2 729 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
7c673cae 730
11fdf7f2 731 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
7c673cae
FG
732
733 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
734
735 return 0;
736}
737
11fdf7f2
TL
738int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
739 uint64_t *max_chunk_size, uint64_t *palignment)
7c673cae
FG
740{
741 rgw_pool pool;
742 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
743 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
744 return -EIO;
745 }
11fdf7f2 746 return get_max_chunk_size(pool, max_chunk_size, palignment);
7c673cae
FG
747}
748
31f18b77
FG
749class RGWIndexCompletionManager;
750
751struct complete_op_data {
9f95a23c 752 ceph::mutex lock = ceph::make_mutex("complete_op_data");
31f18b77
FG
753 AioCompletion *rados_completion{nullptr};
754 int manager_shard_id{-1};
755 RGWIndexCompletionManager *manager{nullptr};
756 rgw_obj obj;
757 RGWModifyOp op;
758 string tag;
759 rgw_bucket_entry_ver ver;
760 cls_rgw_obj_key key;
761 rgw_bucket_dir_entry_meta dir_meta;
762 list<cls_rgw_obj_key> remove_objs;
763 bool log_op;
764 uint16_t bilog_op;
765 rgw_zone_set zones_trace;
766
767 bool stopped{false};
768
769 void stop() {
9f95a23c 770 std::lock_guard l{lock};
31f18b77
FG
771 stopped = true;
772 }
773};
774
775class RGWIndexCompletionThread : public RGWRadosThread {
776 RGWRados *store;
777
778 uint64_t interval_msec() override {
779 return 0;
780 }
781
782 list<complete_op_data *> completions;
783
9f95a23c
TL
784 ceph::mutex completions_lock =
785 ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
31f18b77
FG
786public:
787 RGWIndexCompletionThread(RGWRados *_store)
9f95a23c 788 : RGWRadosThread(_store, "index-complete"), store(_store) {}
31f18b77
FG
789
790 int process() override;
791
792 void add_completion(complete_op_data *completion) {
793 {
9f95a23c 794 std::lock_guard l{completions_lock};
31f18b77
FG
795 completions.push_back(completion);
796 }
797
798 signal();
799 }
800};
801
802int RGWIndexCompletionThread::process()
803{
804 list<complete_op_data *> comps;
805
806 {
9f95a23c 807 std::lock_guard l{completions_lock};
31f18b77
FG
808 completions.swap(comps);
809 }
810
811 for (auto c : comps) {
812 std::unique_ptr<complete_op_data> up{c};
813
814 if (going_down()) {
815 continue;
816 }
817 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
818
819 RGWRados::BucketShard bs(store);
f64942e4 820 RGWBucketInfo bucket_info;
31f18b77 821
f64942e4 822 int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
31f18b77
FG
823 if (r < 0) {
824 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
825 /* not much to do */
826 continue;
827 }
828
f64942e4
AA
829 r = store->guard_reshard(&bs, c->obj, bucket_info,
830 [&](RGWRados::BucketShard *bs) -> int {
831 librados::ObjectWriteOperation o;
832 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
833 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
834 c->log_op, c->bilog_op, &c->zones_trace);
9f95a23c 835 return bs->bucket_obj.operate(&o, null_yield);
31f18b77
FG
836 });
837 if (r < 0) {
838 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
839 /* ignoring error, can't do anything about it */
840 continue;
841 }
9f95a23c 842 r = store->svc.datalog_rados->add_entry(bucket_info, bs.shard_id);
31f18b77
FG
843 if (r < 0) {
844 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
845 }
846 }
847
848 return 0;
849}
850
851class RGWIndexCompletionManager {
852 RGWRados *store{nullptr};
9f95a23c 853 ceph::containers::tiny_vector<ceph::mutex> locks;
31f18b77
FG
854 vector<set<complete_op_data *> > completions;
855
856 RGWIndexCompletionThread *completion_thread{nullptr};
857
858 int num_shards;
859
860 std::atomic<int> cur_shard {0};
861
862
863public:
9f95a23c
TL
864 RGWIndexCompletionManager(RGWRados *_store) :
865 store(_store),
866 locks{ceph::make_lock_container<ceph::mutex>(
867 store->ctx()->_conf->rgw_thread_pool_size,
868 [](const size_t i) {
869 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
870 std::to_string(i));
871 })}
872 {
31f18b77 873 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
31f18b77
FG
874 completions.resize(num_shards);
875 }
876 ~RGWIndexCompletionManager() {
877 stop();
31f18b77
FG
878 }
879
880 int next_shard() {
881 int result = cur_shard % num_shards;
882 cur_shard++;
883 return result;
884 }
885
886 void create_completion(const rgw_obj& obj,
887 RGWModifyOp op, string& tag,
888 rgw_bucket_entry_ver& ver,
889 const cls_rgw_obj_key& key,
890 rgw_bucket_dir_entry_meta& dir_meta,
891 list<cls_rgw_obj_key> *remove_objs, bool log_op,
892 uint16_t bilog_op,
893 rgw_zone_set *zones_trace,
894 complete_op_data **result);
895 bool handle_completion(completion_t cb, complete_op_data *arg);
896
897 int start() {
898 completion_thread = new RGWIndexCompletionThread(store);
899 int ret = completion_thread->init();
900 if (ret < 0) {
901 return ret;
902 }
903 completion_thread->start();
904 return 0;
905 }
906 void stop() {
907 if (completion_thread) {
908 completion_thread->stop();
909 delete completion_thread;
910 }
911
912 for (int i = 0; i < num_shards; ++i) {
9f95a23c 913 std::lock_guard l{locks[i]};
31f18b77 914 for (auto c : completions[i]) {
31f18b77
FG
915 c->stop();
916 }
917 }
918 completions.clear();
919 }
920};
921
922static void obj_complete_cb(completion_t cb, void *arg)
923{
924 complete_op_data *completion = (complete_op_data *)arg;
9f95a23c 925 completion->lock.lock();
31f18b77 926 if (completion->stopped) {
9f95a23c 927 completion->lock.unlock(); /* can drop lock, no one else is referencing us */
31f18b77
FG
928 delete completion;
929 return;
930 }
931 bool need_delete = completion->manager->handle_completion(cb, completion);
9f95a23c 932 completion->lock.unlock();
31f18b77
FG
933 if (need_delete) {
934 delete completion;
935 }
936}
937
938
939void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
940 RGWModifyOp op, string& tag,
941 rgw_bucket_entry_ver& ver,
942 const cls_rgw_obj_key& key,
943 rgw_bucket_dir_entry_meta& dir_meta,
944 list<cls_rgw_obj_key> *remove_objs, bool log_op,
945 uint16_t bilog_op,
946 rgw_zone_set *zones_trace,
947 complete_op_data **result)
948{
949 complete_op_data *entry = new complete_op_data;
950
951 int shard_id = next_shard();
952
953 entry->manager_shard_id = shard_id;
954 entry->manager = this;
955 entry->obj = obj;
956 entry->op = op;
957 entry->tag = tag;
958 entry->ver = ver;
959 entry->key = key;
960 entry->dir_meta = dir_meta;
961 entry->log_op = log_op;
962 entry->bilog_op = bilog_op;
963
964 if (remove_objs) {
965 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
966 entry->remove_objs.push_back(*iter);
967 }
968 }
969
970 if (zones_trace) {
971 entry->zones_trace = *zones_trace;
972 } else {
9f95a23c 973 entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
31f18b77
FG
974 }
975
976 *result = entry;
977
9f95a23c 978 entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
31f18b77 979
9f95a23c 980 std::lock_guard l{locks[shard_id]};
31f18b77
FG
981 completions[shard_id].insert(entry);
982}
983
984bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
985{
986 int shard_id = arg->manager_shard_id;
987 {
9f95a23c 988 std::lock_guard l{locks[shard_id]};
31f18b77
FG
989
990 auto& comps = completions[shard_id];
991
992 auto iter = comps.find(arg);
993 if (iter == comps.end()) {
994 return true;
995 }
996
997 comps.erase(iter);
998 }
999
1000 int r = rados_aio_get_return_value(cb);
1001 if (r != -ERR_BUSY_RESHARDING) {
1002 return true;
1003 }
1004 completion_thread->add_completion(arg);
1005 return false;
1006}
1007
7c673cae
FG
1008void RGWRados::finalize()
1009{
1010 if (run_sync_thread) {
9f95a23c 1011 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
1012 meta_sync_processor_thread->stop();
1013
9f95a23c 1014 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1015 for (auto iter : data_sync_processor_threads) {
1016 RGWDataSyncProcessorThread *thread = iter.second;
1017 thread->stop();
1018 }
1019 if (sync_log_trimmer) {
1020 sync_log_trimmer->stop();
1021 }
1022 }
7c673cae
FG
1023 if (run_sync_thread) {
1024 delete meta_sync_processor_thread;
1025 meta_sync_processor_thread = NULL;
9f95a23c 1026 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1027 for (auto iter : data_sync_processor_threads) {
1028 RGWDataSyncProcessorThread *thread = iter.second;
1029 delete thread;
1030 }
1031 data_sync_processor_threads.clear();
1032 delete sync_log_trimmer;
1033 sync_log_trimmer = nullptr;
b32b8144 1034 bucket_trim = boost::none;
7c673cae 1035 }
7c673cae
FG
1036 if (meta_notifier) {
1037 meta_notifier->stop();
1038 delete meta_notifier;
1039 }
1040 if (data_notifier) {
1041 data_notifier->stop();
1042 delete data_notifier;
1043 }
11fdf7f2 1044 delete sync_tracer;
11fdf7f2
TL
1045
1046 delete lc;
1047 lc = NULL;
7c673cae 1048
11fdf7f2
TL
1049 delete gc;
1050 gc = NULL;
7c673cae 1051
11fdf7f2
TL
1052 delete obj_expirer;
1053 obj_expirer = NULL;
7c673cae 1054
11fdf7f2
TL
1055 RGWQuotaHandler::free_handler(quota_handler);
1056 if (cr_registry) {
1057 cr_registry->put();
7c673cae
FG
1058 }
1059
11fdf7f2 1060 svc.shutdown();
7c673cae 1061
11fdf7f2
TL
1062 delete binfo_cache;
1063 delete obj_tombstone_cache;
7c673cae 1064
11fdf7f2
TL
1065 if (reshard_wait.get()) {
1066 reshard_wait->stop();
1067 reshard_wait.reset();
7c673cae
FG
1068 }
1069
11fdf7f2
TL
1070 if (run_reshard_thread) {
1071 reshard->stop_processor();
7c673cae 1072 }
11fdf7f2
TL
1073 delete reshard;
1074 delete index_completion_manager;
1075}
1076
1077/**
1078 * Initialize the RADOS instance and prepare to do other ops
1079 * Returns 0 on success, -ERR# on failure.
1080 */
1081int RGWRados::init_rados()
1082{
1083 int ret = 0;
7c673cae 1084
494da23a
TL
1085 ret = rados.init_with_context(cct);
1086 if (ret < 0) {
1087 return ret;
1088 }
1089 ret = rados.connect();
1090 if (ret < 0) {
1091 return ret;
7c673cae 1092 }
11fdf7f2
TL
1093
1094 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1095 new RGWCoroutinesManagerRegistry(cct)};
1096 ret = crs->hook_to_admin_command("cr dump");
1097 if (ret < 0) {
1098 return ret;
7c673cae
FG
1099 }
1100
11fdf7f2 1101 cr_registry = crs.release();
11fdf7f2 1102 return ret;
7c673cae
FG
1103}
1104
11fdf7f2 1105int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
7c673cae 1106{
11fdf7f2 1107 map<string,string> metadata = meta;
494da23a 1108 metadata["num_handles"] = "1"s;
11fdf7f2
TL
1109 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1110 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1111 metadata["zone_name"] = svc.zone->zone_name();
9f95a23c 1112 metadata["zone_id"] = svc.zone->zone_id().id;
11fdf7f2
TL
1113 string name = cct->_conf->name.get_id();
1114 if (name.compare(0, 4, "rgw.") == 0) {
1115 name = name.substr(4);
7c673cae 1116 }
494da23a 1117 int ret = rados.service_daemon_register(daemon_type, name, metadata);
11fdf7f2
TL
1118 if (ret < 0) {
1119 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1120 return ret;
7c673cae
FG
1121 }
1122
1123 return 0;
1124}
1125
11fdf7f2 1126int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
7c673cae 1127{
494da23a 1128 int ret = rados.service_daemon_update_status(move(status));
11fdf7f2
TL
1129 if (ret < 0) {
1130 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1131 return ret;
1132 }
1133
1134 return 0;
7c673cae
FG
1135}
1136
1137/**
1138 * Initialize the RADOS instance and prepare to do other ops
1139 * Returns 0 on success, -ERR# on failure.
1140 */
1141int RGWRados::init_complete()
1142{
11fdf7f2 1143 int ret;
7c673cae 1144
11fdf7f2
TL
1145 /*
1146 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1147 */
9f95a23c 1148 sync_module = svc.sync_modules->get_sync_module();
7c673cae
FG
1149
1150 ret = open_root_pool_ctx();
1151 if (ret < 0)
1152 return ret;
1153
1154 ret = open_gc_pool_ctx();
1155 if (ret < 0)
1156 return ret;
1157
1158 ret = open_lc_pool_ctx();
1159 if (ret < 0)
1160 return ret;
1161
1162 ret = open_objexp_pool_ctx();
1163 if (ret < 0)
1164 return ret;
1165
31f18b77
FG
1166 ret = open_reshard_pool_ctx();
1167 if (ret < 0)
1168 return ret;
1169
7c673cae
FG
1170 pools_initialized = true;
1171
1172 gc = new RGWGC();
1173 gc->initialize(cct, this);
1174
9f95a23c 1175 obj_expirer = new RGWObjectExpirer(this->store);
7c673cae
FG
1176
1177 if (use_gc_thread) {
1178 gc->start_processor();
1179 obj_expirer->start_processor();
1180 }
1181
11fdf7f2
TL
1182 auto& current_period = svc.zone->get_current_period();
1183 auto& zonegroup = svc.zone->get_zonegroup();
1184 auto& zone_params = svc.zone->get_zone_params();
1185 auto& zone = svc.zone->get_zone();
1186
7c673cae
FG
1187 /* no point of running sync thread if we don't have a master zone configured
1188 or there is no rest_master_conn */
9f95a23c 1189 if (!svc.zone->need_to_sync()) {
7c673cae
FG
1190 run_sync_thread = false;
1191 }
1192
11fdf7f2 1193 if (svc.zone->is_meta_master()) {
9f95a23c 1194 auto md_log = svc.mdlog->get_log(current_period.get_id());
7c673cae
FG
1195 meta_notifier = new RGWMetaNotifier(this, md_log);
1196 meta_notifier->start();
1197 }
1198
11fdf7f2
TL
1199 /* init it anyway, might run sync through radosgw-admin explicitly */
1200 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1201 sync_tracer->init(this);
1202 ret = sync_tracer->hook_to_admin_command();
1203 if (ret < 0) {
1204 return ret;
1205 }
1206
7c673cae 1207 if (run_sync_thread) {
11fdf7f2
TL
1208 for (const auto &pt: zonegroup.placement_targets) {
1209 if (zone_params.placement_pools.find(pt.second.name)
1210 == zone_params.placement_pools.end()){
1211 ldout(cct, 0) << "WARNING: This zone does not contain the placement target "
1212 << pt.second.name << " present in zonegroup" << dendl;
1213 }
1214 }
9f95a23c
TL
1215 auto async_processor = svc.rados->get_async_processor();
1216 std::lock_guard l{meta_sync_thread_lock};
1217 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->store, async_processor);
7c673cae
FG
1218 ret = meta_sync_processor_thread->init();
1219 if (ret < 0) {
1220 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
1221 return ret;
1222 }
1223 meta_sync_processor_thread->start();
1224
b32b8144
FG
1225 // configure the bucket trim manager
1226 rgw::BucketTrimConfig config;
1227 rgw::configure_bucket_trim(cct, config);
1228
9f95a23c 1229 bucket_trim.emplace(this->store, config);
b32b8144
FG
1230 ret = bucket_trim->init();
1231 if (ret < 0) {
1232 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
1233 return ret;
1234 }
9f95a23c 1235 svc.datalog_rados->set_observer(&*bucket_trim);
b32b8144 1236
9f95a23c 1237 std::lock_guard dl{data_sync_thread_lock};
81eedcae
TL
1238 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
1239 ldout(cct, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
9f95a23c 1240 auto *thread = new RGWDataSyncProcessorThread(this->store, svc.rados->get_async_processor(), source_zone);
7c673cae
FG
1241 ret = thread->init();
1242 if (ret < 0) {
1243 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
1244 return ret;
1245 }
1246 thread->start();
9f95a23c 1247 data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
7c673cae
FG
1248 }
1249 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1250 if (interval > 0) {
9f95a23c 1251 sync_log_trimmer = new RGWSyncLogTrimThread(this->store, &*bucket_trim, interval);
7c673cae
FG
1252 ret = sync_log_trimmer->init();
1253 if (ret < 0) {
1254 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
1255 return ret;
1256 }
1257 sync_log_trimmer->start();
1258 }
1259 }
1260 data_notifier = new RGWDataNotifier(this);
1261 data_notifier->start();
1262
92f5a8d4
TL
1263 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1264 binfo_cache->init(svc.cache);
1265
7c673cae 1266 lc = new RGWLC();
9f95a23c 1267 lc->initialize(cct, this->store);
31f18b77 1268
7c673cae
FG
1269 if (use_lc_thread)
1270 lc->start_processor();
31f18b77 1271
9f95a23c 1272 quota_handler = RGWQuotaHandler::generate_handler(this->store, quota_threads);
7c673cae
FG
1273
1274 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
11fdf7f2 1275 zone.bucket_index_max_shards);
31f18b77
FG
1276 if (bucket_index_max_shards > get_max_bucket_shards()) {
1277 bucket_index_max_shards = get_max_bucket_shards();
7c673cae 1278 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 1279 << get_max_bucket_shards() << dendl;
7c673cae
FG
1280 }
1281 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
1282
11fdf7f2 1283 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
7c673cae
FG
1284
1285 if (need_tombstone_cache) {
1286 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1287 }
1288
11fdf7f2 1289 reshard_wait = std::make_shared<RGWReshardWait>();
31f18b77 1290
9f95a23c 1291 reshard = new RGWReshard(this->store);
31f18b77
FG
1292
1293 /* only the master zone in the zonegroup reshards buckets */
11fdf7f2 1294 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
31f18b77
FG
1295 if (run_reshard_thread) {
1296 reshard->start_processor();
1297 }
1298
1299 index_completion_manager = new RGWIndexCompletionManager(this);
1300 ret = index_completion_manager->start();
1301
7c673cae
FG
1302 return ret;
1303}
1304
11fdf7f2
TL
1305int RGWRados::init_svc(bool raw)
1306{
1307 if (raw) {
1308 return svc.init_raw(cct, use_cache);
1309 }
1310
9f95a23c
TL
1311 return svc.init(cct, use_cache, run_sync_thread);
1312}
1313
1314int RGWRados::init_ctl()
1315{
1316 return ctl.init(&svc);
11fdf7f2
TL
1317}
1318
7c673cae
FG
1319/**
1320 * Initialize the RADOS instance and prepare to do other ops
1321 * Returns 0 on success, -ERR# on failure.
1322 */
1323int RGWRados::initialize()
1324{
1325 int ret;
1326
11fdf7f2
TL
1327 inject_notify_timeout_probability =
1328 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1329 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
7c673cae 1330
11fdf7f2 1331 ret = init_svc(false);
7c673cae 1332 if (ret < 0) {
11fdf7f2 1333 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
7c673cae
FG
1334 return ret;
1335 }
7c673cae 1336
9f95a23c
TL
1337 ret = init_ctl();
1338 if (ret < 0) {
1339 ldout(cct, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
1340 return ret;
1341 }
1342
11fdf7f2 1343 host_id = svc.zone_utils->gen_host_id();
7c673cae 1344
11fdf7f2
TL
1345 ret = init_rados();
1346 if (ret < 0)
1347 return ret;
1348
1349 return init_complete();
7c673cae
FG
1350}
1351
1352/**
1353 * Open the pool used as root for this gateway
1354 * Returns: 0 on success, -ERR# otherwise.
1355 */
1356int RGWRados::open_root_pool_ctx()
1357{
494da23a 1358 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
7c673cae
FG
1359}
1360
1361int RGWRados::open_gc_pool_ctx()
1362{
494da23a 1363 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
7c673cae
FG
1364}
1365
1366int RGWRados::open_lc_pool_ctx()
1367{
494da23a 1368 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
7c673cae
FG
1369}
1370
1371int RGWRados::open_objexp_pool_ctx()
1372{
494da23a 1373 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
7c673cae
FG
1374}
1375
31f18b77
FG
1376int RGWRados::open_reshard_pool_ctx()
1377{
494da23a 1378 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
7c673cae
FG
1379}
1380
494da23a
TL
1381int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx,
1382 bool mostly_omap)
7c673cae 1383{
28e407b8 1384 constexpr bool create = true; // create the pool if it doesn't exist
494da23a 1385 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create, mostly_omap);
7c673cae
FG
1386}
1387
7c673cae
FG
1388/**** logs ****/
1389
1390struct log_list_state {
1391 string prefix;
1392 librados::IoCtx io_ctx;
1393 librados::NObjectIterator obit;
1394};
1395
1396int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
1397{
1398 log_list_state *state = new log_list_state;
11fdf7f2 1399 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1400 if (r < 0) {
1401 delete state;
1402 return r;
1403 }
1404 state->prefix = prefix;
1405 state->obit = state->io_ctx.nobjects_begin();
1406 *handle = (RGWAccessHandle)state;
1407 return 0;
1408}
1409
1410int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1411{
1412 log_list_state *state = static_cast<log_list_state *>(handle);
1413 while (true) {
1414 if (state->obit == state->io_ctx.nobjects_end()) {
1415 delete state;
1416 return -ENOENT;
1417 }
1418 if (state->prefix.length() &&
1419 state->obit->get_oid().find(state->prefix) != 0) {
1420 state->obit++;
1421 continue;
1422 }
1423 *name = state->obit->get_oid();
1424 state->obit++;
1425 break;
1426 }
1427 return 0;
1428}
1429
1430int RGWRados::log_remove(const string& name)
1431{
1432 librados::IoCtx io_ctx;
11fdf7f2 1433 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
1434 if (r < 0)
1435 return r;
1436 return io_ctx.remove(name);
1437}
1438
1439struct log_show_state {
1440 librados::IoCtx io_ctx;
1441 bufferlist bl;
11fdf7f2 1442 bufferlist::const_iterator p;
7c673cae
FG
1443 string name;
1444 uint64_t pos;
1445 bool eof;
1446 log_show_state() : pos(0), eof(false) {}
1447};
1448
1449int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
1450{
1451 log_show_state *state = new log_show_state;
11fdf7f2 1452 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1453 if (r < 0) {
1454 delete state;
1455 return r;
1456 }
1457 state->name = name;
1458 *handle = (RGWAccessHandle)state;
1459 return 0;
1460}
1461
1462int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
1463{
1464 log_show_state *state = static_cast<log_show_state *>(handle);
1465 off_t off = state->p.get_off();
1466
1467 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
1468 << " off " << off
1469 << " eof " << (int)state->eof
1470 << dendl;
1471 // read some?
1472 unsigned chunk = 1024*1024;
1473 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1474 bufferlist more;
1475 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1476 if (r < 0)
1477 return r;
1478 state->pos += r;
1479 bufferlist old;
1480 try {
1481 old.substr_of(state->bl, off, state->bl.length() - off);
1482 } catch (buffer::error& err) {
1483 return -EINVAL;
1484 }
1485 state->bl.clear();
1486 state->bl.claim(old);
1487 state->bl.claim_append(more);
11fdf7f2 1488 state->p = state->bl.cbegin();
7c673cae
FG
1489 if ((unsigned)r < chunk)
1490 state->eof = true;
1491 ldout(cct, 10) << " read " << r << dendl;
1492 }
1493
1494 if (state->p.end())
1495 return 0; // end of file
1496 try {
11fdf7f2 1497 decode(*entry, state->p);
7c673cae
FG
1498 }
1499 catch (const buffer::error &e) {
1500 return -EINVAL;
1501 }
1502 return 1;
1503}
1504
1505/**
1506 * usage_log_hash: get usage log key hash, based on name and index
1507 *
1508 * Get the usage object name. Since a user may have more than 1
1509 * object holding that info (multiple shards), we use index to
1510 * specify that shard number. Once index exceeds max shards it
1511 * wraps.
1512 * If name is not being set, results for all users will be returned
1513 * and index will wrap only after total shards number.
1514 *
1515 * @param cct [in] ceph context
1516 * @param name [in] user name
1517 * @param hash [out] hash value
1518 * @param index [in] shard index number
1519 */
1520static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1521{
1522 uint32_t val = index;
1523
1524 if (!name.empty()) {
c07f9fc5 1525 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
1526 val %= max_user_shards;
1527 val += ceph_str_hash_linux(name.c_str(), name.size());
1528 }
1529 char buf[17];
c07f9fc5 1530 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
1531 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1532 hash = buf;
1533}
1534
1535int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
1536{
1537 uint32_t index = 0;
1538
1539 map<string, rgw_usage_log_info> log_objs;
1540
1541 string hash;
1542 string last_user;
1543
1544 /* restructure usage map, zone by object hash */
1545 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1546 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1547 const rgw_user_bucket& ub = iter->first;
1548 RGWUsageBatch& info = iter->second;
1549
1550 if (ub.user.empty()) {
1551 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
1552 continue;
1553 }
1554
1555 if (ub.user != last_user) {
1556 /* index *should* be random, but why waste extra cycles
1557 in most cases max user shards is not going to exceed 1,
1558 so just incrementing it */
1559 usage_log_hash(cct, ub.user, hash, index++);
1560 }
1561 last_user = ub.user;
1562 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1563
1564 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1565 v.push_back(miter->second);
1566 }
1567 }
1568
1569 map<string, rgw_usage_log_info>::iterator liter;
1570
1571 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
1572 int r = cls_obj_usage_log_add(liter->first, liter->second);
1573 if (r < 0)
1574 return r;
1575 }
1576 return 0;
1577}
1578
11fdf7f2
TL
1579int RGWRados::read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
1580 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1581 rgw_usage_log_entry>& usage)
7c673cae
FG
1582{
1583 uint32_t num = max_entries;
1584 string hash, first_hash;
1585 string user_str = user.to_str();
1586 usage_log_hash(cct, user_str, first_hash, 0);
1587
1588 if (usage_iter.index) {
1589 usage_log_hash(cct, user_str, hash, usage_iter.index);
1590 } else {
1591 hash = first_hash;
1592 }
1593
1594 usage.clear();
1595
1596 do {
1597 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1598 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1599
11fdf7f2 1600 int ret = cls_obj_usage_log_read(hash, user_str, bucket_name, start_epoch, end_epoch, num,
7c673cae
FG
1601 usage_iter.read_iter, ret_usage, is_truncated);
1602 if (ret == -ENOENT)
1603 goto next;
1604
1605 if (ret < 0)
1606 return ret;
1607
1608 num -= ret_usage.size();
1609
1610 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1611 usage[iter->first].aggregate(iter->second);
1612 }
1613
1614next:
1615 if (!*is_truncated) {
1616 usage_iter.read_iter.clear();
1617 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1618 }
1619 } while (num && !*is_truncated && hash != first_hash);
1620 return 0;
1621}
1622
11fdf7f2 1623int RGWRados::trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
7c673cae
FG
1624{
1625 uint32_t index = 0;
1626 string hash, first_hash;
1627 string user_str = user.to_str();
1628 usage_log_hash(cct, user_str, first_hash, index);
1629
1630 hash = first_hash;
7c673cae 1631 do {
11fdf7f2 1632 int ret = cls_obj_usage_log_trim(hash, user_str, bucket_name, start_epoch, end_epoch);
7c673cae 1633
b32b8144 1634 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
1635 return ret;
1636
7c673cae
FG
1637 usage_log_hash(cct, user_str, hash, ++index);
1638 } while (hash != first_hash);
1639
1640 return 0;
1641}
1642
11fdf7f2
TL
1643
1644int RGWRados::clear_usage()
1645{
1646 auto max_shards = cct->_conf->rgw_usage_max_shards;
1647 int ret=0;
1648 for (unsigned i=0; i < max_shards; i++){
1649 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
1650 ret = cls_obj_usage_log_clear(oid);
1651 if (ret < 0){
1652 ldout(cct,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
1653 return ret;
1654 }
1655 }
1656 return ret;
1657}
1658
9f95a23c 1659int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
7c673cae 1660{
9f95a23c
TL
1661 auto i = bl.cbegin();
1662 RGWAccessControlPolicy policy(cct);
1663 try {
1664 policy.decode_owner(i);
1665 } catch (buffer::error& err) {
1666 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1667 return -EIO;
7c673cae 1668 }
9f95a23c
TL
1669 *owner = policy.get_owner();
1670 return 0;
7c673cae
FG
1671}
1672
9f95a23c 1673int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
7c673cae 1674{
9f95a23c
TL
1675 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
1676 if (aiter == attrset.end())
1677 return -EIO;
7c673cae 1678
9f95a23c
TL
1679 bufferlist& bl = aiter->second;
1680 auto iter = bl.cbegin();
1681 try {
1682 policy->decode(iter);
1683 } catch (buffer::error& err) {
1684 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1685 return -EIO;
1686 }
1687 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
1688 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
1689 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
1690 s3policy->to_xml(*_dout);
1691 *_dout << dendl;
1692 }
1693 return 0;
7c673cae
FG
1694}
1695
7c673cae 1696
9f95a23c 1697int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
7c673cae 1698{
9f95a23c
TL
1699 rgw_bucket bucket = bucket_info.bucket;
1700 bucket.update_bucket_id(new_bucket_id);
7c673cae 1701
9f95a23c 1702 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 1703
9f95a23c
TL
1704 bucket_info.objv_tracker.clear();
1705 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr, null_yield);
1706 if (ret < 0) {
1707 return ret;
7c673cae
FG
1708 }
1709
9f95a23c 1710 return 0;
eafe8130
TL
1711}
1712
1713
1adf2230
AA
1714/**
1715 * Get ordered listing of the objects in a bucket.
7c673cae 1716 *
9f95a23c 1717 * max_p: maximum number of results to return
7c673cae
FG
1718 * bucket: bucket to list contents of
1719 * prefix: only return results that match this prefix
1720 * delim: do not include results that match this string.
1721 * Any skipped results will have the matching portion of their name
1722 * inserted in common_prefixes with a "true" mark.
1723 * marker: if filled in, begin the listing with this object.
1724 * end_marker: if filled in, end the listing with this object.
1725 * result: the objects are put in here.
11fdf7f2
TL
1726 * common_prefixes: if delim is filled in, any matching prefixes are
1727 * placed here.
1728 * is_truncated: if number of objects in the bucket is bigger than
1729 * max, then truncated.
7c673cae 1730 */
11fdf7f2 1731int RGWRados::Bucket::List::list_objects_ordered(
eafe8130 1732 int64_t max_p,
11fdf7f2
TL
1733 vector<rgw_bucket_dir_entry> *result,
1734 map<string, bool> *common_prefixes,
9f95a23c
TL
1735 bool *is_truncated,
1736 optional_yield y)
7c673cae
FG
1737{
1738 RGWRados *store = target->get_store();
1739 CephContext *cct = store->ctx();
1740 int shard_id = target->get_shard_id();
1741
1742 int count = 0;
1743 bool truncated = true;
9f95a23c 1744 bool cls_filtered = false;
eafe8130
TL
1745 const int64_t max = // protect against memory issues and negative vals
1746 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
1747 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
7c673cae
FG
1748
1749 result->clear();
1750
9f95a23c
TL
1751 // use a local marker; either the marker will have a previous entry
1752 // or it will be empty; either way it's OK to copy
1753 rgw_obj_key marker_obj(params.marker.name,
1754 params.marker.instance,
1755 params.marker.ns);
7c673cae
FG
1756 rgw_obj_index_key cur_marker;
1757 marker_obj.get_index_key(&cur_marker);
1758
9f95a23c
TL
1759 rgw_obj_key end_marker_obj(params.end_marker.name,
1760 params.end_marker.instance,
1761 params.end_marker.ns);
3efd9988
FG
1762 rgw_obj_index_key cur_end_marker;
1763 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
1764 const bool cur_end_marker_valid = !params.end_marker.empty();
1765
1766 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 1767 prefix_obj.set_ns(params.ns);
7c673cae 1768 string cur_prefix = prefix_obj.get_index_key_name();
11fdf7f2 1769 string after_delim_s; /* needed in !params.delim.empty() AND later */
7c673cae
FG
1770
1771 if (!params.delim.empty()) {
9f95a23c 1772 after_delim_s = cls_rgw_after_delim(params.delim);
11fdf7f2
TL
1773 /* if marker points at a common prefix, fast forward it into its
1774 * upper bound string */
224ce89b 1775 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
1776 if (delim_pos >= 0) {
1777 string s = cur_marker.name.substr(0, delim_pos);
11fdf7f2 1778 s.append(after_delim_s);
7c673cae
FG
1779 cur_marker = s;
1780 }
1781 }
1adf2230 1782
9f95a23c 1783 rgw_obj_index_key prev_marker;
f6b5b4d7 1784 for (uint16_t attempt = 1; /* empty */; ++attempt) {
9f95a23c 1785 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
f6b5b4d7 1786 " starting attempt " << attempt << dendl;
9f95a23c
TL
1787
1788 if (attempt > 1 && !(prev_marker < cur_marker)) {
1789 // we've failed to make forward progress
1790 ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ <<
1791 ": ERROR marker failed to make forward progress; attempt=" << attempt <<
1792 ", prev_marker=" << prev_marker <<
1793 ", cur_marker=" << cur_marker << dendl;
1794 break;
1795 }
1796 prev_marker = cur_marker;
1797
1798 ent_map_t ent_map;
1799 ent_map.reserve(read_ahead);
1adf2230
AA
1800 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
1801 shard_id,
1802 cur_marker,
1803 cur_prefix,
9f95a23c 1804 params.delim,
1adf2230
AA
1805 read_ahead + 1 - count,
1806 params.list_versions,
9f95a23c 1807 attempt,
1adf2230
AA
1808 ent_map,
1809 &truncated,
9f95a23c
TL
1810 &cls_filtered,
1811 &cur_marker,
1812 y);
1813 if (r < 0) {
7c673cae 1814 return r;
9f95a23c 1815 }
7c673cae 1816
1adf2230 1817 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
1818 rgw_bucket_dir_entry& entry = eiter->second;
1819 rgw_obj_index_key index_key = entry.key;
7c673cae
FG
1820 rgw_obj_key obj(index_key);
1821
9f95a23c
TL
1822 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
1823 " considering entry " << entry.key << dendl;
1824
1adf2230
AA
1825 /* note that parse_raw_oid() here will not set the correct
1826 * object's instance, as rgw_obj_index_key encodes that
1827 * separately. We don't need to set the instance because it's
1828 * not needed for the checks here and we end up using the raw
1829 * entry for the return vector
7c673cae
FG
1830 */
1831 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
1832 if (!valid) {
9f95a23c
TL
1833 ldout(cct, 0) << "ERROR: could not parse object name: " <<
1834 obj.name << dendl;
7c673cae
FG
1835 continue;
1836 }
11fdf7f2 1837
9f95a23c 1838 bool matched_ns = (obj.ns == params.ns);
7c673cae
FG
1839 if (!params.list_versions && !entry.is_visible()) {
1840 continue;
1841 }
1842
9f95a23c 1843 if (params.enforce_ns && !matched_ns) {
7c673cae
FG
1844 if (!params.ns.empty()) {
1845 /* we've iterated past the namespace we're searching -- done now */
1846 truncated = false;
1847 goto done;
1848 }
1849
1850 /* we're not looking at the namespace this object is in, next! */
1851 continue;
1852 }
1853
1854 if (cur_end_marker_valid && cur_end_marker <= index_key) {
1855 truncated = false;
1856 goto done;
1857 }
1858
1859 if (count < max) {
9f95a23c
TL
1860 params.marker = index_key;
1861 next_marker = index_key;
7c673cae
FG
1862 }
1863
9f95a23c
TL
1864 if (params.filter &&
1865 ! params.filter->filter(obj.name, index_key.name)) {
7c673cae 1866 continue;
9f95a23c 1867 }
7c673cae 1868
1adf2230 1869 if (params.prefix.size() &&
9f95a23c 1870 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
7c673cae 1871 continue;
9f95a23c 1872 }
7c673cae
FG
1873
1874 if (!params.delim.empty()) {
9f95a23c
TL
1875 const int delim_pos = obj.name.find(params.delim, params.prefix.size());
1876 if (delim_pos >= 0) {
1877 // run either the code where delimiter filtering is done a)
1878 // in the OSD/CLS or b) here.
1879 if (cls_filtered) {
1880 // NOTE: this condition is for the newer versions of the
1881 // OSD that does filtering on the CLS side
1882
1883 // should only find one delimiter at the end if it finds any
1884 // after the prefix
1885 if (delim_pos !=
1886 int(obj.name.length() - params.delim.length())) {
1887 ldout(cct, 0) <<
1888 "WARNING: found delimiter in place other than the end of "
1889 "the prefix; obj.name=" << obj.name <<
1890 ", prefix=" << params.prefix << dendl;
1891 }
1892 if (common_prefixes) {
1893 if (count >= max) {
1894 truncated = true;
1895 goto done;
1896 }
1897
1898 (*common_prefixes)[obj.name] = true;
1899 count++;
1900 }
1901
1902 continue;
1903 } else {
1904 // NOTE: this condition is for older versions of the OSD
1905 // that do not filter on the CLS side, so the following code
1906 // must do the filtering; once we reach version 16 of ceph,
1907 // this code can be removed along with the conditional that
1908 // can lead this way
1909
1910 /* extract key -with trailing delimiter- for CommonPrefix */
1911 string prefix_key =
1912 obj.name.substr(0, delim_pos + params.delim.length());
1913
1914 if (common_prefixes &&
1915 common_prefixes->find(prefix_key) == common_prefixes->end()) {
1916 if (count >= max) {
1917 truncated = true;
1918 goto done;
1919 }
1920 next_marker = prefix_key;
1921 (*common_prefixes)[prefix_key] = true;
1922
1923 count++;
1924 }
1925
1926 continue;
1927 } // if we're running an older OSD version
1928 } // if a delimiter was found after prefix
1929 } // if a delimiter was passed in
7c673cae
FG
1930
1931 if (count >= max) {
1932 truncated = true;
1933 goto done;
1934 }
1935
9f95a23c
TL
1936 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
1937 " adding entry " << entry.key << " to result" << dendl;
1938
7c673cae
FG
1939 result->emplace_back(std::move(entry));
1940 count++;
9f95a23c
TL
1941 } // eiter for loop
1942
1943 // NOTE: the following conditional is needed by older versions of
1944 // the OSD that don't do delimiter filtering on the CLS side; once
1945 // we reach version 16 of ceph, the following conditional and the
1946 // code within can be removed
1947 if (!cls_filtered && !params.delim.empty()) {
1948 int marker_delim_pos =
1949 cur_marker.name.find(params.delim, cur_prefix.size());
eafe8130 1950 if (marker_delim_pos >= 0) {
9f95a23c
TL
1951 std::string skip_after_delim =
1952 cur_marker.name.substr(0, marker_delim_pos);
eafe8130
TL
1953 skip_after_delim.append(after_delim_s);
1954
1955 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
1956
1957 if (skip_after_delim > cur_marker.name) {
1958 cur_marker = skip_after_delim;
1959 ldout(cct, 20) << "setting cur_marker="
1960 << cur_marker.name
1961 << "[" << cur_marker.instance << "]"
1962 << dendl;
1963 }
1964 }
9f95a23c
TL
1965 } // if older osd didn't do delimiter filtering
1966
1967 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
1968 " INFO end of outer loop, truncated=" << truncated <<
1969 ", count=" << count << ", attempt=" << attempt << dendl;
1970
1971 if (!truncated || count >= (max + 1) / 2) {
1972 // if we finished listing, or if we're returning at least half the
1973 // requested entries, that's enough; S3 and swift protocols allow
1974 // returning fewer than max entries
1975 break;
1976 } else if (attempt > 8 && count >= 1) {
1977 // if we've made at least 8 attempts and we have some, but very
1978 // few, results, return with what we have
1979 break;
eafe8130 1980 }
f6b5b4d7 1981 } // for (uint16_t attempt...
7c673cae
FG
1982
1983done:
9f95a23c
TL
1984
1985 if (is_truncated) {
7c673cae 1986 *is_truncated = truncated;
9f95a23c 1987 }
7c673cae
FG
1988
1989 return 0;
1adf2230
AA
1990} // list_objects_ordered
1991
1992
1993/**
1994 * Get listing of the objects in a bucket and allow the results to be out
1995 * of order.
1996 *
1997 * Even though there are key differences with the ordered counterpart,
1998 * the parameters are the same to maintain some compatability.
1999 *
2000 * max: maximum number of results to return
2001 * bucket: bucket to list contents of
2002 * prefix: only return results that match this prefix
2003 * delim: should not be set; if it is we should have indicated an error
2004 * marker: if filled in, begin the listing with this object.
2005 * end_marker: if filled in, end the listing with this object.
2006 * result: the objects are put in here.
2007 * common_prefixes: this is never filled with an unordered list; the param
2008 * is maintained for compatibility
2009 * is_truncated: if number of objects in the bucket is bigger than max, then
2010 * truncated.
2011 */
eafe8130 2012int RGWRados::Bucket::List::list_objects_unordered(int64_t max_p,
1adf2230
AA
2013 vector<rgw_bucket_dir_entry> *result,
2014 map<string, bool> *common_prefixes,
9f95a23c
TL
2015 bool *is_truncated,
2016 optional_yield y)
1adf2230
AA
2017{
2018 RGWRados *store = target->get_store();
2019 CephContext *cct = store->ctx();
2020 int shard_id = target->get_shard_id();
2021
2022 int count = 0;
2023 bool truncated = true;
2024
eafe8130
TL
2025 const int64_t max = // protect against memory issues and negative vals
2026 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2027
1adf2230
AA
2028 // read a few extra in each call to cls_bucket_list_unordered in
2029 // case some are filtered out due to namespace matching, versioning,
2030 // filtering, etc.
2031 const int64_t max_read_ahead = 100;
2032 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2033
2034 result->clear();
2035
9f95a23c
TL
2036 // use a local marker; either the marker will have a previous entry
2037 // or it will be empty; either way it's OK to copy
11fdf7f2
TL
2038 rgw_obj_key marker_obj(params.marker.name,
2039 params.marker.instance,
9f95a23c 2040 params.marker.ns);
1adf2230
AA
2041 rgw_obj_index_key cur_marker;
2042 marker_obj.get_index_key(&cur_marker);
2043
11fdf7f2
TL
2044 rgw_obj_key end_marker_obj(params.end_marker.name,
2045 params.end_marker.instance,
9f95a23c 2046 params.end_marker.ns);
1adf2230
AA
2047 rgw_obj_index_key cur_end_marker;
2048 end_marker_obj.get_index_key(&cur_end_marker);
2049 const bool cur_end_marker_valid = !params.end_marker.empty();
2050
2051 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 2052 prefix_obj.set_ns(params.ns);
1adf2230
AA
2053 string cur_prefix = prefix_obj.get_index_key_name();
2054
2055 while (truncated && count <= max) {
2056 std::vector<rgw_bucket_dir_entry> ent_list;
9f95a23c
TL
2057 ent_list.reserve(read_ahead);
2058
1adf2230
AA
2059 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
2060 shard_id,
2061 cur_marker,
2062 cur_prefix,
2063 read_ahead,
2064 params.list_versions,
2065 ent_list,
2066 &truncated,
9f95a23c
TL
2067 &cur_marker,
2068 y);
1adf2230
AA
2069 if (r < 0)
2070 return r;
2071
2072 // NB: while regions of ent_list will be sorted, we have no
2073 // guarantee that all items will be sorted since they can cross
2074 // shard boundaries
2075
2076 for (auto& entry : ent_list) {
2077 rgw_obj_index_key index_key = entry.key;
2078 rgw_obj_key obj(index_key);
2079
9f95a23c
TL
2080 if (count < max) {
2081 params.marker.set(index_key);
2082 next_marker.set(index_key);
2083 }
2084
1adf2230
AA
2085 /* note that parse_raw_oid() here will not set the correct
2086 * object's instance, as rgw_obj_index_key encodes that
2087 * separately. We don't need to set the instance because it's
2088 * not needed for the checks here and we end up using the raw
2089 * entry for the return vector
2090 */
2091 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2092 if (!valid) {
2093 ldout(cct, 0) << "ERROR: could not parse object name: " <<
2094 obj.name << dendl;
2095 continue;
2096 }
2097
2098 if (!params.list_versions && !entry.is_visible()) {
2099 continue;
2100 }
2101
2102 if (params.enforce_ns && obj.ns != params.ns) {
2103 continue;
2104 }
2105
2106 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2107 // we're not guaranteed items will come in order, so we have
2108 // to loop through all
2109 continue;
2110 }
2111
1adf2230
AA
2112 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2113 continue;
2114
2115 if (params.prefix.size() &&
2116 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
2117 continue;
2118
2119 if (count >= max) {
2120 truncated = true;
2121 goto done;
2122 }
2123
2124 result->emplace_back(std::move(entry));
2125 count++;
2126 } // for (auto& entry : ent_list)
2127 } // while (truncated && count <= max)
2128
2129done:
2130 if (is_truncated)
2131 *is_truncated = truncated;
2132
2133 return 0;
2134} // list_objects_unordered
2135
7c673cae
FG
2136
2137/**
2138 * create a rados pool, associated meta info
2139 * returns 0 on success, -ERR# otherwise.
2140 */
2141int RGWRados::create_pool(const rgw_pool& pool)
2142{
c07f9fc5 2143 librados::IoCtx io_ctx;
28e407b8
AA
2144 constexpr bool create = true;
2145 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
2146}
2147
9f95a23c 2148void RGWRados::create_bucket_id(string *bucket_id)
7c673cae 2149{
9f95a23c
TL
2150 uint64_t iid = instance_id();
2151 uint64_t bid = next_bucket_id();
2152 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2153 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2154 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2155 *bucket_id = buf;
2156}
7c673cae 2157
11fdf7f2 2158int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
7c673cae 2159 const string& zonegroup_id,
11fdf7f2 2160 const rgw_placement_rule& placement_rule,
7c673cae
FG
2161 const string& swift_ver_location,
2162 const RGWQuotaInfo * pquota_info,
2163 map<std::string, bufferlist>& attrs,
2164 RGWBucketInfo& info,
2165 obj_version *pobjv,
2166 obj_version *pep_objv,
2167 real_time creation_time,
2168 rgw_bucket *pmaster_bucket,
2169 uint32_t *pmaster_num_shards,
2170 bool exclusive)
2171{
2172#define MAX_CREATE_RETRIES 20 /* need to bound retries */
11fdf7f2 2173 rgw_placement_rule selected_placement_rule;
7c673cae
FG
2174 RGWZonePlacementInfo rule_info;
2175
2176 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2177 int ret = 0;
11fdf7f2
TL
2178 ret = svc.zone->select_bucket_placement(owner, zonegroup_id, placement_rule,
2179 &selected_placement_rule, &rule_info);
7c673cae
FG
2180 if (ret < 0)
2181 return ret;
2182
2183 if (!pmaster_bucket) {
2184 create_bucket_id(&bucket.marker);
2185 bucket.bucket_id = bucket.marker;
2186 } else {
2187 bucket.marker = pmaster_bucket->marker;
2188 bucket.bucket_id = pmaster_bucket->bucket_id;
2189 }
2190
2191 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2192
9f95a23c
TL
2193 objv_tracker.read_version.clear();
2194
7c673cae
FG
2195 if (pobjv) {
2196 objv_tracker.write_version = *pobjv;
2197 } else {
2198 objv_tracker.generate_new_write_ver(cct);
2199 }
2200
2201 info.bucket = bucket;
2202 info.owner = owner.user_id;
2203 info.zonegroup = zonegroup_id;
11fdf7f2 2204 info.placement_rule = selected_placement_rule;
7c673cae
FG
2205 info.index_type = rule_info.index_type;
2206 info.swift_ver_location = swift_ver_location;
2207 info.swift_versioning = (!swift_ver_location.empty());
2208 if (pmaster_num_shards) {
2209 info.num_shards = *pmaster_num_shards;
2210 } else {
2211 info.num_shards = bucket_index_max_shards;
2212 }
2213 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
2214 info.requester_pays = false;
2215 if (real_clock::is_zero(creation_time)) {
2216 info.creation_time = ceph::real_clock::now();
2217 } else {
2218 info.creation_time = creation_time;
2219 }
2220 if (pquota_info) {
2221 info.quota = *pquota_info;
2222 }
2223
9f95a23c 2224 int r = svc.bi->init_index(info);
11fdf7f2
TL
2225 if (r < 0) {
2226 return r;
2227 }
7c673cae 2228
11fdf7f2 2229 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
9f95a23c
TL
2230 if (ret == -ECANCELED) {
2231 ret = -EEXIST;
2232 }
11fdf7f2 2233 if (ret == -EEXIST) {
11fdf7f2 2234 /* we need to reread the info and return it, caller will have a use for it */
9f95a23c
TL
2235 RGWBucketInfo orig_info;
2236 r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
11fdf7f2
TL
2237 if (r < 0) {
2238 if (r == -ENOENT) {
2239 continue;
2240 }
2241 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
2242 return r;
2243 }
7c673cae 2244
11fdf7f2 2245 /* only remove it if it's a different bucket instance */
9f95a23c
TL
2246 if (orig_info.bucket.bucket_id != bucket.bucket_id) {
2247 int r = svc.bi->clean_index(info);
2248 if (r < 0) {
2249 ldout(cct, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
2250 }
2251 r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield);
2252 if (r < 0) {
2253 ldout(cct, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
2254 /* continue anyway */
2255 }
11fdf7f2 2256 }
9f95a23c
TL
2257
2258 info = std::move(orig_info);
2259 /* ret == -EEXIST here */
11fdf7f2 2260 }
7c673cae 2261 return ret;
7c673cae
FG
2262 }
2263
11fdf7f2
TL
2264 /* this is highly unlikely */
2265 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
2266 return -ENOENT;
7c673cae
FG
2267}
2268
11fdf7f2 2269bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
7c673cae 2270{
11fdf7f2
TL
2271 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2272}
c07f9fc5 2273
11fdf7f2
TL
2274bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2275{
2276 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
c07f9fc5 2277
11fdf7f2 2278 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
7c673cae
FG
2279}
2280
2281int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
2282{
2283 string oid, key;
2284 get_obj_bucket_and_oid_loc(obj, oid, key);
2285
2286 rgw_pool pool;
2287 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2288 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2289 return -EIO;
2290 }
2291
494da23a 2292 int r = open_pool_ctx(pool, *ioctx, false);
7c673cae
FG
2293 if (r < 0) {
2294 return r;
2295 }
2296
2297 ioctx->locator_set_key(key);
2298
2299 return 0;
2300}
2301
2302int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
2303{
11fdf7f2 2304 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
7c673cae
FG
2305
2306 rgw_pool pool;
2307 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2308 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2309 return -EIO;
2310 }
2311
9f95a23c
TL
2312 ref->pool = svc.rados->pool(pool);
2313
2314 int r = ref->pool.open(RGWSI_RADOS::OpenParams()
2315 .set_mostly_omap(false));
7c673cae 2316 if (r < 0) {
9f95a23c 2317 ldout(cct, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
7c673cae
FG
2318 return r;
2319 }
2320
9f95a23c 2321 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2322
2323 return 0;
2324}
2325
224ce89b 2326int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2327{
11fdf7f2 2328 ref->obj = obj;
7c673cae 2329
11fdf7f2
TL
2330 if (ref->obj.oid.empty()) {
2331 ref->obj.oid = obj.pool.to_str();
2332 ref->obj.pool = svc.zone->get_zone_params().domain_root;
7c673cae 2333 }
9f95a23c
TL
2334 ref->pool = svc.rados->pool(obj.pool);
2335 int r = ref->pool.open(RGWSI_RADOS::OpenParams()
2336 .set_mostly_omap(false));
2337 if (r < 0) {
2338 ldout(cct, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
7c673cae 2339 return r;
9f95a23c 2340 }
7c673cae 2341
9f95a23c 2342 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2343
2344 return 0;
2345}
2346
224ce89b 2347int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2348{
224ce89b 2349 return get_raw_obj_ref(obj, ref);
7c673cae
FG
2350}
2351
2352/*
2353 * fixes an issue where head objects were supposed to have a locator created, but ended
2354 * up without one
2355 */
2356int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
2357{
2358 const rgw_bucket& bucket = bucket_info.bucket;
2359 string oid;
2360 string locator;
2361
2362 rgw_obj obj(bucket, key);
2363
2364 get_obj_bucket_and_oid_loc(obj, oid, locator);
2365
2366 if (locator.empty()) {
2367 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
2368 return 0;
2369 }
2370
2371 librados::IoCtx ioctx;
2372
2373 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
2374 if (ret < 0) {
2375 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2376 return ret;
2377 }
2378 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2379
2380 uint64_t size;
2381 bufferlist data;
2382
2383 struct timespec mtime_ts;
2384 map<string, bufferlist> attrs;
2385 librados::ObjectReadOperation op;
2386 op.getxattrs(&attrs, NULL);
2387 op.stat2(&size, &mtime_ts, NULL);
2388#define HEAD_SIZE 512 * 1024
2389 op.read(0, HEAD_SIZE, &data, NULL);
2390
9f95a23c 2391 ret = rgw_rados_operate(ioctx, oid, &op, &data, null_yield);
7c673cae 2392 if (ret < 0) {
9f95a23c 2393 lderr(cct) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
7c673cae
FG
2394 return ret;
2395 }
2396
2397 if (size > HEAD_SIZE) {
2398 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
2399 return -EIO;
2400 }
2401
2402 if (size != data.length()) {
2403 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
2404 return -EIO;
2405 }
2406
2407 if (copy_obj) {
2408 librados::ObjectWriteOperation wop;
2409
2410 wop.mtime2(&mtime_ts);
2411
2412 map<string, bufferlist>::iterator iter;
2413 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
2414 wop.setxattr(iter->first.c_str(), iter->second);
2415 }
2416
2417 wop.write(0, data);
2418
2419 ioctx.locator_set_key(locator);
9f95a23c 2420 rgw_rados_operate(ioctx, oid, &wop, null_yield);
7c673cae
FG
2421 }
2422
2423 if (remove_bad) {
2424 ioctx.locator_set_key(string());
2425
2426 ret = ioctx.remove(oid);
2427 if (ret < 0) {
2428 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
2429 return ret;
2430 }
2431 }
2432
2433 return 0;
2434}
2435
2436int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
2437 const string& src_oid, const string& src_locator,
2438 librados::IoCtx& dst_ioctx,
2439 const string& dst_oid, const string& dst_locator)
2440{
2441
2442#define COPY_BUF_SIZE (4 * 1024 * 1024)
2443 bool done = false;
2444 uint64_t chunk_size = COPY_BUF_SIZE;
2445 uint64_t ofs = 0;
2446 int ret = 0;
2447 real_time mtime;
2448 struct timespec mtime_ts;
2449 uint64_t size;
2450
2451 if (src_oid == dst_oid && src_locator == dst_locator) {
2452 return 0;
2453 }
2454
2455 src_ioctx.locator_set_key(src_locator);
2456 dst_ioctx.locator_set_key(dst_locator);
2457
2458 do {
2459 bufferlist data;
2460 ObjectReadOperation rop;
2461 ObjectWriteOperation wop;
2462
2463 if (ofs == 0) {
2464 rop.stat2(&size, &mtime_ts, NULL);
2465 mtime = real_clock::from_timespec(mtime_ts);
2466 }
2467 rop.read(ofs, chunk_size, &data, NULL);
9f95a23c 2468 ret = rgw_rados_operate(src_ioctx, src_oid, &rop, &data, null_yield);
7c673cae
FG
2469 if (ret < 0) {
2470 goto done_err;
2471 }
2472
2473 if (data.length() == 0) {
2474 break;
2475 }
2476
2477 if (ofs == 0) {
2478 wop.create(true); /* make it exclusive */
2479 wop.mtime2(&mtime_ts);
2480 mtime = real_clock::from_timespec(mtime_ts);
2481 }
2482 wop.write(ofs, data);
9f95a23c 2483 ret = rgw_rados_operate(dst_ioctx, dst_oid, &wop, null_yield);
11fdf7f2
TL
2484 if (ret < 0) {
2485 goto done_err;
2486 }
7c673cae
FG
2487 ofs += data.length();
2488 done = data.length() != chunk_size;
2489 } while (!done);
2490
2491 if (ofs != size) {
2492 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
2493 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
2494 ret = -EIO;
2495 goto done_err;
2496 }
2497
2498 src_ioctx.remove(src_oid);
2499
2500 return 0;
2501
2502done_err:
11fdf7f2 2503 // TODO: clean up dst_oid if we created it
7c673cae
FG
2504 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
2505 return ret;
2506}
2507
2508/*
2509 * fixes an issue where head objects were supposed to have a locator created, but ended
2510 * up without one
2511 */
9f95a23c 2512int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y)
7c673cae
FG
2513{
2514 const rgw_bucket& bucket = bucket_info.bucket;
2515 rgw_obj obj(bucket, key);
2516
2517 if (need_fix) {
2518 *need_fix = false;
2519 }
2520
2521 rgw_rados_ref ref;
2522 int r = get_obj_head_ref(bucket_info, obj, &ref);
2523 if (r < 0) {
2524 return r;
2525 }
2526
2527 RGWObjState *astate = NULL;
9f95a23c
TL
2528 RGWObjectCtx rctx(this->store);
2529 r = get_obj_state(&rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
2530 if (r < 0)
2531 return r;
2532
9f95a23c 2533 if (astate->manifest) {
7c673cae 2534 RGWObjManifest::obj_iterator miter;
9f95a23c 2535 RGWObjManifest& manifest = *astate->manifest;
7c673cae
FG
2536 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
2537 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
2538 rgw_obj loc;
2539 string oid;
2540 string locator;
2541
9f95a23c 2542 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
7c673cae
FG
2543
2544 if (loc.key.ns.empty()) {
2545 /* continue, we're only interested in tail objects */
2546 continue;
2547 }
2548
9f95a23c
TL
2549 auto& ioctx = ref.pool.ioctx();
2550
7c673cae 2551 get_obj_bucket_and_oid_loc(loc, oid, locator);
9f95a23c 2552 ref.pool.ioctx().locator_set_key(locator);
7c673cae
FG
2553
2554 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
2555
9f95a23c 2556 r = ioctx.stat(oid, NULL, NULL);
7c673cae
FG
2557 if (r != -ENOENT) {
2558 continue;
2559 }
2560
2561 string bad_loc;
2562 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
2563
2564 /* create a new ioctx with the bad locator */
2565 librados::IoCtx src_ioctx;
9f95a23c 2566 src_ioctx.dup(ioctx);
7c673cae
FG
2567 src_ioctx.locator_set_key(bad_loc);
2568
2569 r = src_ioctx.stat(oid, NULL, NULL);
2570 if (r != 0) {
2571 /* cannot find a broken part */
2572 continue;
2573 }
2574 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
2575 if (need_fix) {
2576 *need_fix = true;
2577 }
2578 if (fix) {
9f95a23c 2579 r = move_rados_obj(src_ioctx, oid, bad_loc, ioctx, oid, locator);
7c673cae
FG
2580 if (r < 0) {
2581 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
2582 }
2583 }
2584 }
2585 }
2586
2587 return 0;
2588}
2589
f64942e4
AA
2590int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2591 const rgw_obj& obj,
2592 RGWBucketInfo* bucket_info_out)
7c673cae
FG
2593{
2594 bucket = _bucket;
2595
11fdf7f2 2596 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
2597
2598 RGWBucketInfo bucket_info;
f64942e4
AA
2599 RGWBucketInfo* bucket_info_p =
2600 bucket_info_out ? bucket_info_out : &bucket_info;
2601
9f95a23c 2602 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield);
7c673cae
FG
2603 if (ret < 0) {
2604 return ret;
2605 }
2606
9f95a23c
TL
2607 string oid;
2608
2609 ret = store->svc.bi_rados->open_bucket_index_shard(*bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae
FG
2610 if (ret < 0) {
2611 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2612 return ret;
2613 }
9f95a23c 2614 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2615
2616 return 0;
2617}
2618
f64942e4
AA
2619int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2620 int sid,
2621 RGWBucketInfo* bucket_info_out)
7c673cae
FG
2622{
2623 bucket = _bucket;
2624 shard_id = sid;
2625
11fdf7f2 2626 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
2627
2628 RGWBucketInfo bucket_info;
f64942e4
AA
2629 RGWBucketInfo* bucket_info_p =
2630 bucket_info_out ? bucket_info_out : &bucket_info;
9f95a23c 2631 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield);
7c673cae
FG
2632 if (ret < 0) {
2633 return ret;
2634 }
2635
9f95a23c
TL
2636 string oid;
2637
2638 ret = store->svc.bi_rados->open_bucket_index_shard(*bucket_info_p, shard_id, &bucket_obj);
7c673cae
FG
2639 if (ret < 0) {
2640 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2641 return ret;
2642 }
9f95a23c 2643 ldout(store->ctx(), 20) << " bucket index oid: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2644
2645 return 0;
2646}
2647
a8e16298
TL
2648int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info,
2649 const rgw_obj& obj)
2650{
2651 bucket = bucket_info.bucket;
2652
9f95a23c
TL
2653 int ret = store->svc.bi_rados->open_bucket_index_shard(bucket_info,
2654 obj.get_hash_object(),
2655 &bucket_obj,
2656 &shard_id);
a8e16298
TL
2657 if (ret < 0) {
2658 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2659 return ret;
2660 }
2661 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
2662
2663 return 0;
2664}
2665
b32b8144
FG
2666int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
2667{
2668 bucket = bucket_info.bucket;
2669 shard_id = sid;
2670
9f95a23c 2671 int ret = store->svc.bi_rados->open_bucket_index_shard(bucket_info, shard_id, &bucket_obj);
b32b8144
FG
2672 if (ret < 0) {
2673 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2674 return ret;
2675 }
2676 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
2677
2678 return 0;
2679}
2680
7c673cae
FG
2681
2682/* Execute @handler on last item in bucket listing for bucket specified
2683 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2684 * to objects matching these criterias. */
2685int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
2686 const std::string& obj_prefix,
2687 const std::string& obj_delim,
2688 std::function<int(const rgw_bucket_dir_entry&)> handler)
2689{
2690 RGWRados::Bucket target(this, bucket_info);
2691 RGWRados::Bucket::List list_op(&target);
2692
2693 list_op.params.prefix = obj_prefix;
2694 list_op.params.delim = obj_delim;
2695
2696 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
2697 << ", obj_prefix=" << obj_prefix
2698 << ", obj_delim=" << obj_delim
2699 << dendl;
2700
2701 bool is_truncated = false;
2702
2703 boost::optional<rgw_bucket_dir_entry> last_entry;
2704 /* We need to rewind to the last object in a listing. */
2705 do {
2706 /* List bucket entries in chunks. */
2707 static constexpr int MAX_LIST_OBJS = 100;
2708 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
2709
2710 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
9f95a23c 2711 &is_truncated, null_yield);
7c673cae
FG
2712 if (ret < 0) {
2713 return ret;
2714 } else if (!entries.empty()) {
2715 last_entry = entries.back();
2716 }
2717 } while (is_truncated);
2718
2719 if (last_entry) {
2720 return handler(*last_entry);
2721 }
2722
2723 /* Empty listing - no items we can run handler on. */
2724 return 0;
2725}
2726
2727
2728int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
2729 const rgw_user& user,
2730 RGWBucketInfo& bucket_info,
9f95a23c
TL
2731 rgw_obj& obj,
2732 const DoutPrefixProvider *dpp,
2733 optional_yield y)
7c673cae
FG
2734{
2735 if (! swift_versioning_enabled(bucket_info)) {
2736 return 0;
2737 }
2738
11fdf7f2 2739 obj_ctx.set_atomic(obj);
7c673cae
FG
2740
2741 RGWObjState * state = nullptr;
9f95a23c 2742 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false, y);
7c673cae
FG
2743 if (r < 0) {
2744 return r;
2745 }
2746
2747 if (!state->exists) {
2748 return 0;
2749 }
2750
7c673cae
FG
2751 const string& src_name = obj.get_oid();
2752 char buf[src_name.size() + 32];
2753 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
2754 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
2755 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
2756
2757 RGWBucketInfo dest_bucket_info;
2758
9f95a23c 2759 r = get_bucket_info(&svc, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
7c673cae
FG
2760 if (r < 0) {
2761 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
2762 if (r == -ENOENT) {
2763 return -ERR_PRECONDITION_FAILED;
2764 }
2765 return r;
2766 }
2767
2768 if (dest_bucket_info.owner != bucket_info.owner) {
2769 return -ERR_PRECONDITION_FAILED;
2770 }
2771
2772 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
11fdf7f2
TL
2773
2774 if (dest_bucket_info.versioning_enabled()){
2775 gen_rand_obj_instance_name(&dest_obj);
2776 }
2777
2778 obj_ctx.set_atomic(dest_obj);
7c673cae 2779
9f95a23c 2780 rgw_zone_id no_zone;
7c673cae
FG
2781
2782 r = copy_obj(obj_ctx,
2783 user,
7c673cae
FG
2784 NULL, /* req_info *info */
2785 no_zone,
2786 dest_obj,
2787 obj,
2788 dest_bucket_info,
2789 bucket_info,
11fdf7f2 2790 bucket_info.placement_rule,
7c673cae
FG
2791 NULL, /* time_t *src_mtime */
2792 NULL, /* time_t *mtime */
2793 NULL, /* const time_t *mod_ptr */
2794 NULL, /* const time_t *unmod_ptr */
2795 false, /* bool high_precision_time */
2796 NULL, /* const char *if_match */
2797 NULL, /* const char *if_nomatch */
2798 RGWRados::ATTRSMOD_NONE,
2799 true, /* bool copy_if_newer */
2800 state->attrset,
11fdf7f2 2801 RGWObjCategory::Main,
7c673cae
FG
2802 0, /* uint64_t olh_epoch */
2803 real_time(), /* time_t delete_at */
2804 NULL, /* string *version_id */
2805 NULL, /* string *ptag */
2806 NULL, /* string *petag */
7c673cae 2807 NULL, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2808 NULL, /* void *progress_data */
2809 dpp,
2810 null_yield);
7c673cae
FG
2811 if (r == -ECANCELED || r == -ENOENT) {
2812 /* Has already been overwritten, meaning another rgw process already
2813 * copied it out */
2814 return 0;
2815 }
2816
2817 return r;
2818}
2819
9f95a23c 2820int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
7c673cae
FG
2821 const rgw_user& user,
2822 RGWBucketInfo& bucket_info,
2823 rgw_obj& obj,
9f95a23c
TL
2824 bool& restored, /* out */
2825 const DoutPrefixProvider *dpp)
7c673cae
FG
2826{
2827 if (! swift_versioning_enabled(bucket_info)) {
2828 return 0;
2829 }
2830
2831 /* Bucket info of the bucket that stores previous versions of our object. */
2832 RGWBucketInfo archive_binfo;
2833
9f95a23c 2834 int ret = get_bucket_info(&svc, bucket_info.bucket.tenant,
7c673cae 2835 bucket_info.swift_ver_location, archive_binfo,
9f95a23c 2836 nullptr, null_yield, nullptr);
7c673cae
FG
2837 if (ret < 0) {
2838 return ret;
2839 }
2840
2841 /* Abort the operation if the bucket storing our archive belongs to someone
2842 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
2843 * into consideration. For we can live with that.
2844 *
2845 * TODO: delegate this check to un upper layer and compare with ACLs. */
2846 if (bucket_info.owner != archive_binfo.owner) {
2847 return -EPERM;
2848 }
2849
2850 /* This code will be executed on latest version of the object. */
2851 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
9f95a23c 2852 rgw_zone_id no_zone;
7c673cae
FG
2853
2854 /* We don't support object versioning of Swift API on those buckets that
2855 * are already versioned using the S3 mechanism. This affects also bucket
2856 * storing archived objects. Otherwise the delete operation would create
2857 * a deletion marker. */
2858 if (archive_binfo.versioned()) {
2859 restored = false;
2860 return -ERR_PRECONDITION_FAILED;
2861 }
2862
2863 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
2864 * irrelevant and may be safely skipped. */
2865 std::map<std::string, ceph::bufferlist> no_attrs;
2866
2867 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
11fdf7f2
TL
2868
2869 if (bucket_info.versioning_enabled()){
2870 gen_rand_obj_instance_name(&obj);
2871 }
2872
2873 obj_ctx.set_atomic(archive_obj);
2874 obj_ctx.set_atomic(obj);
7c673cae
FG
2875
2876 int ret = copy_obj(obj_ctx,
2877 user,
7c673cae
FG
2878 nullptr, /* req_info *info */
2879 no_zone,
2880 obj, /* dest obj */
2881 archive_obj, /* src obj */
2882 bucket_info, /* dest bucket info */
2883 archive_binfo, /* src bucket info */
11fdf7f2 2884 bucket_info.placement_rule, /* placement_rule */
7c673cae
FG
2885 nullptr, /* time_t *src_mtime */
2886 nullptr, /* time_t *mtime */
2887 nullptr, /* const time_t *mod_ptr */
2888 nullptr, /* const time_t *unmod_ptr */
2889 false, /* bool high_precision_time */
2890 nullptr, /* const char *if_match */
2891 nullptr, /* const char *if_nomatch */
2892 RGWRados::ATTRSMOD_NONE,
2893 true, /* bool copy_if_newer */
2894 no_attrs,
11fdf7f2 2895 RGWObjCategory::Main,
7c673cae
FG
2896 0, /* uint64_t olh_epoch */
2897 real_time(), /* time_t delete_at */
2898 nullptr, /* string *version_id */
2899 nullptr, /* string *ptag */
2900 nullptr, /* string *petag */
7c673cae 2901 nullptr, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2902 nullptr, /* void *progress_data */
2903 dpp,
2904 null_yield);
7c673cae
FG
2905 if (ret == -ECANCELED || ret == -ENOENT) {
2906 /* Has already been overwritten, meaning another rgw process already
2907 * copied it out */
2908 return 0;
2909 } else if (ret < 0) {
2910 return ret;
2911 } else {
2912 restored = true;
2913 }
2914
2915 /* Need to remove the archived copy. */
2916 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
2917 archive_binfo.versioning_status());
2918
2919 return ret;
2920 };
2921
2922 const std::string& obj_name = obj.get_oid();
2923 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
2924 % obj_name);
2925
2926 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
2927 handler);
2928}
2929
7c673cae 2930int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
181888fb
FG
2931 map<string, bufferlist>& attrs,
2932 bool assume_noent, bool modify_tail,
9f95a23c 2933 void *_index_op, optional_yield y)
7c673cae
FG
2934{
2935 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
2936 RGWRados *store = target->get_store();
2937
2938 ObjectWriteOperation op;
11fdf7f2
TL
2939#ifdef WITH_LTTNG
2940 const struct req_state* s = get_req_state();
2941 string req_id;
2942 if (!s) {
2943 // fake req_id
2944 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
2945 } else {
2946 req_id = s->req_id;
2947 }
2948#endif
7c673cae
FG
2949
2950 RGWObjState *state;
9f95a23c 2951 int r = target->get_state(&state, false, y, assume_noent);
7c673cae
FG
2952 if (r < 0)
2953 return r;
2954
2955 rgw_obj& obj = target->get_obj();
2956
2957 if (obj.get_oid().empty()) {
2958 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
2959 return -EIO;
2960 }
2961
224ce89b 2962 rgw_rados_ref ref;
7c673cae
FG
2963 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
2964 if (r < 0)
2965 return r;
2966
2967 bool is_olh = state->is_olh;
2968
2969 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
2970
2971 const string *ptag = meta.ptag;
2972 if (!ptag && !index_op->get_optag()->empty()) {
2973 ptag = index_op->get_optag();
2974 }
9f95a23c 2975 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
7c673cae
FG
2976 if (r < 0)
2977 return r;
2978
2979 if (real_clock::is_zero(meta.set_mtime)) {
2980 meta.set_mtime = real_clock::now();
2981 }
2982
eafe8130
TL
2983 if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
2984 auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
2985 if (iter == attrs.end()) {
2986 real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
2987 string mode = target->bucket_info.obj_lock.get_mode();
2988 RGWObjectRetention obj_retention(mode, lock_until_date);
2989 bufferlist bl;
2990 obj_retention.encode(bl);
2991 op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
2992 }
2993 }
2994
7c673cae
FG
2995 if (state->is_olh) {
2996 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
2997 }
2998
2999 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3000 op.mtime2(&mtime_ts);
3001
3002 if (meta.data) {
3003 /* if we want to overwrite the data, we also want to overwrite the
3004 xattrs, so just remove the object */
3005 op.write_full(*meta.data);
3006 }
3007
3008 string etag;
3009 string content_type;
3010 bufferlist acl_bl;
11fdf7f2 3011 string storage_class;
7c673cae
FG
3012
3013 map<string, bufferlist>::iterator iter;
3014 if (meta.rmattrs) {
3015 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3016 const string& name = iter->first;
3017 op.rmxattr(name.c_str());
3018 }
3019 }
3020
3021 if (meta.manifest) {
11fdf7f2
TL
3022 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3023
7c673cae
FG
3024 /* remove existing manifest attr */
3025 iter = attrs.find(RGW_ATTR_MANIFEST);
3026 if (iter != attrs.end())
3027 attrs.erase(iter);
3028
3029 bufferlist bl;
11fdf7f2 3030 encode(*meta.manifest, bl);
7c673cae
FG
3031 op.setxattr(RGW_ATTR_MANIFEST, bl);
3032 }
3033
3034 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3035 const string& name = iter->first;
3036 bufferlist& bl = iter->second;
3037
3038 if (!bl.length())
3039 continue;
3040
3041 op.setxattr(name.c_str(), bl);
3042
3043 if (name.compare(RGW_ATTR_ETAG) == 0) {
11fdf7f2 3044 etag = rgw_bl_str(bl);
7c673cae 3045 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
11fdf7f2 3046 content_type = rgw_bl_str(bl);
7c673cae
FG
3047 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3048 acl_bl = bl;
3049 }
3050 }
3051 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3052 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3053 }
3054
3055 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3056 bufferlist bl;
11fdf7f2 3057 encode(store->svc.zone->get_zone_short_id(), bl);
7c673cae
FG
3058 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3059 }
3060
11fdf7f2
TL
3061 if (!storage_class.empty()) {
3062 bufferlist bl;
3063 bl.append(storage_class);
3064 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3065 }
3066
7c673cae
FG
3067 if (!op.size())
3068 return 0;
3069
3070 uint64_t epoch;
3071 int64_t poolid;
224ce89b
WB
3072 bool orig_exists;
3073 uint64_t orig_size;
3074
3075 if (!reset_obj) { //Multipart upload, it has immutable head.
3076 orig_exists = false;
3077 orig_size = 0;
3078 } else {
3079 orig_exists = state->exists;
3080 orig_size = state->accounted_size;
3081 }
7c673cae 3082
91327a77
AA
3083 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3084 !obj.key.instance.empty();
7c673cae
FG
3085
3086 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3087
3088 if (versioned_op) {
3089 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3090 }
3091
3092 if (!index_op->is_prepared()) {
11fdf7f2 3093 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
9f95a23c 3094 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag, y);
11fdf7f2 3095 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
7c673cae
FG
3096 if (r < 0)
3097 return r;
3098 }
3099
9f95a23c
TL
3100 auto& ioctx = ref.pool.ioctx();
3101
11fdf7f2 3102 tracepoint(rgw_rados, operate_enter, req_id.c_str());
9f95a23c 3103 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
11fdf7f2 3104 tracepoint(rgw_rados, operate_exit, req_id.c_str());
7c673cae
FG
3105 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3106 or -ENOENT if was removed, or -EEXIST if it did not exist
3107 before and now it does */
3108 if (r == -EEXIST && assume_noent) {
3109 target->invalidate_state();
3110 return r;
3111 }
3112 goto done_cancel;
3113 }
3114
9f95a23c
TL
3115 epoch = ioctx.get_last_version();
3116 poolid = ioctx.get_id();
7c673cae
FG
3117
3118 r = target->complete_atomic_modification();
3119 if (r < 0) {
3120 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
3121 }
3122
11fdf7f2 3123 tracepoint(rgw_rados, complete_enter, req_id.c_str());
7c673cae 3124 r = index_op->complete(poolid, epoch, size, accounted_size,
11fdf7f2
TL
3125 meta.set_mtime, etag, content_type,
3126 storage_class, &acl_bl,
3127 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3128 tracepoint(rgw_rados, complete_exit, req_id.c_str());
7c673cae
FG
3129 if (r < 0)
3130 goto done_cancel;
3131
3132 if (meta.mtime) {
3133 *meta.mtime = meta.set_mtime;
3134 }
3135
3136 /* note that index_op was using state so we couldn't invalidate it earlier */
3137 target->invalidate_state();
3138 state = NULL;
3139
91327a77 3140 if (versioned_op && meta.olh_epoch) {
9f95a23c 3141 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
7c673cae
FG
3142 if (r < 0) {
3143 return r;
3144 }
3145 }
3146
3147 if (!real_clock::is_zero(meta.delete_at)) {
3148 rgw_obj_index_key obj_key;
3149 obj.key.get_index_key(&obj_key);
3150
9f95a23c
TL
3151 r = store->obj_expirer->hint_add(meta.delete_at, obj.bucket.tenant, obj.bucket.name,
3152 obj.bucket.bucket_id, obj_key);
7c673cae
FG
3153 if (r < 0) {
3154 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
3155 /* ignoring error, nothing we can do at this point */
3156 }
3157 }
3158 meta.canceled = false;
3159
3160 /* update quota cache */
3efd9988
FG
3161 if (meta.completeMultipart){
3162 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3163 0, orig_size);
3164 }
3165 else {
3166 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3167 accounted_size, orig_size);
3168 }
7c673cae
FG
3169 return 0;
3170
3171done_cancel:
3172 int ret = index_op->cancel();
3173 if (ret < 0) {
3174 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
3175 }
3176
3177 meta.canceled = true;
3178
3179 /* we lost in a race. There are a few options:
3180 * - existing object was rewritten (ECANCELED)
3181 * - non existing object was created (EEXIST)
3182 * - object was removed (ENOENT)
3183 * should treat it as a success
3184 */
3185 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3186 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3187 r = 0;
3188 }
3189 } else {
3190 if (meta.if_match != NULL) {
3191 // only overwrite existing object
3192 if (strcmp(meta.if_match, "*") == 0) {
3193 if (r == -ENOENT) {
3194 r = -ERR_PRECONDITION_FAILED;
3195 } else if (r == -ECANCELED) {
3196 r = 0;
3197 }
3198 }
3199 }
3200
3201 if (meta.if_nomatch != NULL) {
3202 // only create a new object
3203 if (strcmp(meta.if_nomatch, "*") == 0) {
3204 if (r == -EEXIST) {
3205 r = -ERR_PRECONDITION_FAILED;
3206 } else if (r == -ENOENT) {
3207 r = 0;
3208 }
3209 }
3210 }
3211 }
3212
3213 return r;
3214}
3215
3216int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
9f95a23c 3217 map<string, bufferlist>& attrs, optional_yield y)
7c673cae
FG
3218{
3219 RGWBucketInfo& bucket_info = target->get_bucket_info();
3220
3221 RGWRados::Bucket bop(target->get_store(), bucket_info);
3222 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
3223 index_op.set_zones_trace(meta.zones_trace);
3224
7c673cae
FG
3225 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3226 int r;
3227 if (assume_noent) {
9f95a23c 3228 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3229 if (r == -EEXIST) {
3230 assume_noent = false;
3231 }
3232 }
3233 if (!assume_noent) {
9f95a23c 3234 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3235 }
3236 return r;
3237}
3238
11fdf7f2 3239class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
7c673cae
FG
3240{
3241 CephContext* cct;
3242 rgw_obj obj;
11fdf7f2 3243 rgw::putobj::DataProcessor *filter;
7c673cae 3244 boost::optional<RGWPutObj_Compress>& compressor;
11fdf7f2 3245 boost::optional<rgw::putobj::ChunkProcessor> buffering;
7c673cae 3246 CompressorRef& plugin;
11fdf7f2 3247 rgw::putobj::ObjectProcessor *processor;
7c673cae
FG
3248 void (*progress_cb)(off_t, void *);
3249 void *progress_data;
3250 bufferlist extra_data_bl;
11fdf7f2
TL
3251 uint64_t extra_data_left{0};
3252 bool need_to_process_attrs{true};
3253 uint64_t data_len{0};
7c673cae 3254 map<string, bufferlist> src_attrs;
11fdf7f2
TL
3255 uint64_t ofs{0};
3256 uint64_t lofs{0}; /* logical ofs */
9f95a23c 3257 std::function<int(map<string, bufferlist>&)> attrs_handler;
7c673cae
FG
3258public:
3259 RGWRadosPutObj(CephContext* cct,
3260 CompressorRef& plugin,
3261 boost::optional<RGWPutObj_Compress>& compressor,
11fdf7f2 3262 rgw::putobj::ObjectProcessor *p,
7c673cae 3263 void (*_progress_cb)(off_t, void *),
11fdf7f2 3264 void *_progress_data,
9f95a23c 3265 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
7c673cae
FG
3266 cct(cct),
3267 filter(p),
3268 compressor(compressor),
3269 plugin(plugin),
3270 processor(p),
7c673cae
FG
3271 progress_cb(_progress_cb),
3272 progress_data(_progress_data),
11fdf7f2 3273 attrs_handler(_attrs_handler) {}
7c673cae
FG
3274
3275 int process_attrs(void) {
3276 if (extra_data_bl.length()) {
3277 JSONParser jp;
3278 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3279 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3280 return -EIO;
3281 }
3282
3283 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3284
3285 src_attrs.erase(RGW_ATTR_COMPRESSION);
3286 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
a8e16298
TL
3287
3288 // filter out olh attributes
3289 auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
3290 while (iter != src_attrs.end()) {
3291 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3292 break;
3293 }
3294 iter = src_attrs.erase(iter);
3295 }
7c673cae
FG
3296 }
3297
11fdf7f2
TL
3298 int ret = attrs_handler(src_attrs);
3299 if (ret < 0) {
3300 return ret;
3301 }
3302
7c673cae
FG
3303 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3304 //do not compress if object is encrypted
3305 compressor = boost::in_place(cct, plugin, filter);
11fdf7f2
TL
3306 // add a filter that buffers data so we don't try to compress tiny blocks.
3307 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3308 // compression ratio
28e407b8
AA
3309 constexpr unsigned buffer_size = 512 * 1024;
3310 buffering = boost::in_place(&*compressor, buffer_size);
3311 filter = &*buffering;
7c673cae 3312 }
11fdf7f2
TL
3313
3314 need_to_process_attrs = false;
3315
7c673cae
FG
3316 return 0;
3317 }
3318
11fdf7f2 3319 int handle_data(bufferlist& bl, bool *pause) override {
7c673cae 3320 if (progress_cb) {
11fdf7f2 3321 progress_cb(data_len, progress_data);
7c673cae 3322 }
b32b8144 3323 if (extra_data_left) {
11fdf7f2 3324 uint64_t extra_len = bl.length();
b32b8144
FG
3325 if (extra_len > extra_data_left)
3326 extra_len = extra_data_left;
7c673cae
FG
3327
3328 bufferlist extra;
3329 bl.splice(0, extra_len, &extra);
3330 extra_data_bl.append(extra);
3331
b32b8144
FG
3332 extra_data_left -= extra_len;
3333 if (extra_data_left == 0) {
7c673cae
FG
3334 int res = process_attrs();
3335 if (res < 0)
3336 return res;
3337 }
11fdf7f2 3338 ofs += extra_len;
7c673cae
FG
3339 if (bl.length() == 0) {
3340 return 0;
3341 }
3342 }
11fdf7f2
TL
3343 if (need_to_process_attrs) {
3344 /* need to call process_attrs() even if we don't get any attrs,
3345 * need it to call attrs_handler().
3346 */
3347 int res = process_attrs();
3348 if (res < 0) {
3349 return res;
3350 }
3351 }
7c673cae 3352
11fdf7f2 3353 ceph_assert(uint64_t(ofs) >= extra_data_len);
7c673cae 3354
11fdf7f2
TL
3355 uint64_t size = bl.length();
3356 ofs += size;
7c673cae 3357
11fdf7f2
TL
3358 const uint64_t lofs = data_len;
3359 data_len += size;
7c673cae 3360
11fdf7f2 3361 return filter->process(std::move(bl), lofs);
7c673cae
FG
3362 }
3363
28e407b8 3364 int flush() {
11fdf7f2 3365 return filter->process({}, data_len);
28e407b8
AA
3366 }
3367
7c673cae
FG
3368 bufferlist& get_extra_data() { return extra_data_bl; }
3369
3370 map<string, bufferlist>& get_attrs() { return src_attrs; }
3371
3372 void set_extra_data_len(uint64_t len) override {
b32b8144 3373 extra_data_left = len;
11fdf7f2 3374 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
7c673cae
FG
3375 }
3376
3377 uint64_t get_data_len() {
3378 return data_len;
3379 }
7c673cae
FG
3380};
3381
3382/*
3383 * prepare attrset depending on attrs_mod.
3384 */
3385static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3386 map<string, bufferlist>& attrs,
3387 RGWRados::AttrsMod attrs_mod)
3388{
3389 switch (attrs_mod) {
3390 case RGWRados::ATTRSMOD_NONE:
3391 attrs = src_attrs;
3392 break;
3393 case RGWRados::ATTRSMOD_REPLACE:
3394 if (!attrs[RGW_ATTR_ETAG].length()) {
3395 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3396 }
181888fb
FG
3397 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3398 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3399 if (ttiter != src_attrs.end()) {
3400 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3401 }
3402 }
7c673cae
FG
3403 break;
3404 case RGWRados::ATTRSMOD_MERGE:
3405 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3406 if (attrs.find(it->first) == attrs.end()) {
3407 attrs[it->first] = it->second;
3408 }
3409 }
3410 break;
3411 }
3412}
3413
9f95a23c 3414int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y)
7c673cae
FG
3415{
3416 map<string, bufferlist> attrset;
3417
3418 real_time mtime;
3419 uint64_t obj_size;
9f95a23c 3420 RGWObjectCtx rctx(this->store);
7c673cae
FG
3421
3422 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
3423 RGWRados::Object::Read read_op(&op_target);
3424
3425 read_op.params.attrs = &attrset;
3426 read_op.params.lastmod = &mtime;
3427 read_op.params.obj_size = &obj_size;
3428
9f95a23c 3429 int ret = read_op.prepare(y);
7c673cae
FG
3430 if (ret < 0)
3431 return ret;
3432
3433 attrset.erase(RGW_ATTR_ID_TAG);
181888fb 3434 attrset.erase(RGW_ATTR_TAIL_TAG);
7c673cae 3435
11fdf7f2
TL
3436 return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
3437 read_op, obj_size - 1, obj, NULL, mtime, attrset,
9f95a23c 3438 0, real_time(), NULL, dpp, y);
7c673cae
FG
3439}
3440
3441struct obj_time_weight {
3442 real_time mtime;
3443 uint32_t zone_short_id;
3444 uint64_t pg_ver;
3445 bool high_precision;
3446
3447 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3448
3449 bool compare_low_precision(const obj_time_weight& rhs) {
3450 struct timespec l = ceph::real_clock::to_timespec(mtime);
3451 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
3452 l.tv_nsec = 0;
3453 r.tv_nsec = 0;
3454 if (l > r) {
3455 return false;
3456 }
3457 if (l < r) {
3458 return true;
3459 }
11fdf7f2
TL
3460 if (!zone_short_id || !rhs.zone_short_id) {
3461 /* don't compare zone ids, if one wasn't provided */
3462 return false;
3463 }
7c673cae
FG
3464 if (zone_short_id != rhs.zone_short_id) {
3465 return (zone_short_id < rhs.zone_short_id);
3466 }
3467 return (pg_ver < rhs.pg_ver);
3468
3469 }
3470
3471 bool operator<(const obj_time_weight& rhs) {
3472 if (!high_precision || !rhs.high_precision) {
3473 return compare_low_precision(rhs);
3474 }
3475 if (mtime > rhs.mtime) {
3476 return false;
3477 }
3478 if (mtime < rhs.mtime) {
3479 return true;
3480 }
11fdf7f2
TL
3481 if (!zone_short_id || !rhs.zone_short_id) {
3482 /* don't compare zone ids, if one wasn't provided */
3483 return false;
3484 }
7c673cae
FG
3485 if (zone_short_id != rhs.zone_short_id) {
3486 return (zone_short_id < rhs.zone_short_id);
3487 }
3488 return (pg_ver < rhs.pg_ver);
3489 }
3490
3491 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
3492 mtime = _mtime;
3493 zone_short_id = _short_id;
3494 pg_ver = _pg_ver;
3495 }
3496
3497 void init(RGWObjState *state) {
3498 mtime = state->mtime;
3499 zone_short_id = state->zone_short_id;
3500 pg_ver = state->pg_ver;
3501 }
3502};
3503
3504inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
3505 out << o.mtime;
3506
3507 if (o.zone_short_id != 0 || o.pg_ver != 0) {
3508 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
3509 }
3510
3511 return out;
3512}
3513
11fdf7f2 3514class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
7c673cae
FG
3515 bufferlist extra_data;
3516public:
3517 RGWGetExtraDataCB() {}
11fdf7f2
TL
3518 int handle_data(bufferlist& bl, bool *pause) override {
3519 int bl_len = (int)bl.length();
7c673cae
FG
3520 if (extra_data.length() < extra_data_len) {
3521 off_t max = extra_data_len - extra_data.length();
3522 if (max > bl_len) {
3523 max = bl_len;
3524 }
3525 bl.splice(0, max, &extra_data);
3526 }
3527 return bl_len;
3528 }
3529
3530 bufferlist& get_extra_data() {
3531 return extra_data;
3532 }
3533};
3534
3535int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
3536 const rgw_user& user_id,
7c673cae 3537 req_info *info,
9f95a23c 3538 const rgw_zone_id& source_zone,
7c673cae 3539 rgw_obj& src_obj,
9f95a23c 3540 const RGWBucketInfo *src_bucket_info,
7c673cae
FG
3541 real_time *src_mtime,
3542 uint64_t *psize,
3543 const real_time *mod_ptr,
3544 const real_time *unmod_ptr,
3545 bool high_precision_time,
3546 const char *if_match,
3547 const char *if_nomatch,
3548 map<string, bufferlist> *pattrs,
11fdf7f2 3549 map<string, string> *pheaders,
7c673cae
FG
3550 string *version_id,
3551 string *ptag,
3552 string *petag)
3553{
3554 /* source is in a different zonegroup, copy from there */
3555
3556 RGWRESTStreamRWRequest *in_stream_req;
3557 string tag;
3558 map<string, bufferlist> src_attrs;
3559 append_rand_alpha(cct, tag, tag, 32);
3560 obj_time_weight set_mtime_weight;
3561 set_mtime_weight.high_precision = high_precision_time;
3562
3563 RGWRESTConn *conn;
3564 if (source_zone.empty()) {
9f95a23c 3565 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
7c673cae 3566 /* source is in the master zonegroup */
11fdf7f2 3567 conn = svc.zone->get_master_conn();
7c673cae 3568 } else {
11fdf7f2 3569 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
9f95a23c 3570 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
7c673cae
FG
3571 if (iter == zonegroup_conn_map.end()) {
3572 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
3573 return -ENOENT;
3574 }
3575 conn = iter->second;
3576 }
3577 } else {
11fdf7f2 3578 auto& zone_conn_map = svc.zone->get_zone_conn_map();
9f95a23c 3579 auto iter = zone_conn_map.find(source_zone);
7c673cae
FG
3580 if (iter == zone_conn_map.end()) {
3581 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
3582 return -ENOENT;
3583 }
3584 conn = iter->second;
3585 }
3586
3587 RGWGetExtraDataCB cb;
7c673cae
FG
3588 map<string, string> req_headers;
3589 real_time set_mtime;
3590
3591 const real_time *pmod = mod_ptr;
3592
3593 obj_time_weight dest_mtime_weight;
3594
181888fb
FG
3595 constexpr bool prepend_meta = true;
3596 constexpr bool get_op = true;
3597 constexpr bool rgwx_stat = true;
3598 constexpr bool sync_manifest = true;
3599 constexpr bool skip_decrypt = true;
7c673cae
FG
3600 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
3601 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3602 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3603 sync_manifest, skip_decrypt,
3604 true, &cb, &in_stream_req);
7c673cae
FG
3605 if (ret < 0) {
3606 return ret;
3607 }
3608
11fdf7f2 3609 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, nullptr, pheaders);
7c673cae
FG
3610 if (ret < 0) {
3611 return ret;
3612 }
3613
3614 bufferlist& extra_data_bl = cb.get_extra_data();
3615 if (extra_data_bl.length()) {
3616 JSONParser jp;
3617 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3618 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3619 return -EIO;
3620 }
3621
3622 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3623
3624 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3625 }
3626
3627 if (src_mtime) {
3628 *src_mtime = set_mtime;
3629 }
3630
3631 if (petag) {
3632 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
3633 if (iter != src_attrs.end()) {
3634 bufferlist& etagbl = iter->second;
3635 *petag = etagbl.to_str();
11fdf7f2
TL
3636 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
3637 *petag = petag->substr(0, petag->size() - 1);
3638 }
7c673cae
FG
3639 }
3640 }
3641
3642 if (pattrs) {
11fdf7f2 3643 *pattrs = std::move(src_attrs);
7c673cae
FG
3644 }
3645
3646 return 0;
3647}
3648
9f95a23c
TL
3649int RGWFetchObjFilter_Default::filter(CephContext *cct,
3650 const rgw_obj_key& source_key,
3651 const RGWBucketInfo& dest_bucket_info,
3652 std::optional<rgw_placement_rule> dest_placement_rule,
3653 const map<string, bufferlist>& obj_attrs,
3654 std::optional<rgw_user> *poverride_owner,
3655 const rgw_placement_rule **prule)
3656{
3657 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
3658 if (!ptail_rule) {
3659 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
3660 if (iter != obj_attrs.end()) {
3661 dest_rule.storage_class = iter->second.to_str();
3662 dest_rule.inherit_from(dest_bucket_info.placement_rule);
3663 ptail_rule = &dest_rule;
3664 } else {
3665 ptail_rule = &dest_bucket_info.placement_rule;
3666 }
3667 }
3668 *prule = ptail_rule;
3669 return 0;
3670}
3671
7c673cae
FG
3672int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
3673 const rgw_user& user_id,
7c673cae 3674 req_info *info,
9f95a23c 3675 const rgw_zone_id& source_zone,
11fdf7f2
TL
3676 const rgw_obj& dest_obj,
3677 const rgw_obj& src_obj,
9f95a23c
TL
3678 const RGWBucketInfo& dest_bucket_info,
3679 const RGWBucketInfo *src_bucket_info,
11fdf7f2 3680 std::optional<rgw_placement_rule> dest_placement_rule,
7c673cae
FG
3681 real_time *src_mtime,
3682 real_time *mtime,
3683 const real_time *mod_ptr,
3684 const real_time *unmod_ptr,
3685 bool high_precision_time,
3686 const char *if_match,
3687 const char *if_nomatch,
3688 AttrsMod attrs_mod,
3689 bool copy_if_newer,
3690 map<string, bufferlist>& attrs,
3691 RGWObjCategory category,
11fdf7f2 3692 std::optional<uint64_t> olh_epoch,
7c673cae 3693 real_time delete_at,
7c673cae 3694 string *ptag,
11fdf7f2 3695 string *petag,
7c673cae 3696 void (*progress_cb)(off_t, void *),
31f18b77 3697 void *progress_data,
9f95a23c
TL
3698 const DoutPrefixProvider *dpp,
3699 RGWFetchObjFilter *filter,
81eedcae
TL
3700 rgw_zone_set *zones_trace,
3701 std::optional<uint64_t>* bytes_transferred)
7c673cae
FG
3702{
3703 /* source is in a different zonegroup, copy from there */
3704
3705 RGWRESTStreamRWRequest *in_stream_req;
3706 string tag;
3707 int i;
3708 append_rand_alpha(cct, tag, tag, 32);
3709 obj_time_weight set_mtime_weight;
3710 set_mtime_weight.high_precision = high_precision_time;
11fdf7f2 3711 int ret;
7c673cae 3712
9f95a23c 3713 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 3714 using namespace rgw::putobj;
9f95a23c
TL
3715 AtomicObjectProcessor processor(&aio, this->store, dest_bucket_info, nullptr, user_id,
3716 obj_ctx, dest_obj, olh_epoch, tag, dpp, null_yield);
7c673cae 3717 RGWRESTConn *conn;
11fdf7f2
TL
3718 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3719 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
7c673cae 3720 if (source_zone.empty()) {
9f95a23c 3721 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
7c673cae 3722 /* source is in the master zonegroup */
11fdf7f2 3723 conn = svc.zone->get_master_conn();
7c673cae 3724 } else {
9f95a23c 3725 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
7c673cae
FG
3726 if (iter == zonegroup_conn_map.end()) {
3727 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
3728 return -ENOENT;
3729 }
3730 conn = iter->second;
3731 }
3732 } else {
9f95a23c 3733 auto iter = zone_conn_map.find(source_zone);
7c673cae
FG
3734 if (iter == zone_conn_map.end()) {
3735 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
11fdf7f2 3736 return -ENOENT;
7c673cae 3737 }
11fdf7f2 3738 conn = iter->second;
7c673cae
FG
3739 }
3740
3741 boost::optional<RGWPutObj_Compress> compressor;
3742 CompressorRef plugin;
3743
9f95a23c
TL
3744 RGWFetchObjFilter_Default source_filter;
3745 if (!filter) {
3746 filter = &source_filter;
3747 }
3748
3749 std::optional<rgw_user> override_owner;
3750
11fdf7f2 3751 RGWRadosPutObj cb(cct, plugin, compressor, &processor, progress_cb, progress_data,
9f95a23c
TL
3752 [&](map<string, bufferlist>& obj_attrs) {
3753 const rgw_placement_rule *ptail_rule;
3754
3755 int ret = filter->filter(cct,
3756 src_obj.key,
3757 dest_bucket_info,
3758 dest_placement_rule,
3759 obj_attrs,
3760 &override_owner,
3761 &ptail_rule);
3762 if (ret < 0) {
3763 ldout(cct, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
3764 return ret;
11fdf7f2 3765 }
9f95a23c
TL
3766
3767 processor.set_tail_placement(*ptail_rule);
3768
11fdf7f2
TL
3769 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
3770 if (compression_type != "none") {
3771 plugin = Compressor::create(cct, compression_type);
3772 if (!plugin) {
3773 ldout(cct, 1) << "Cannot load plugin for compression type "
3774 << compression_type << dendl;
3775 }
3776 }
3777
9f95a23c 3778 ret = processor.prepare(null_yield);
11fdf7f2
TL
3779 if (ret < 0) {
3780 return ret;
3781 }
3782 return 0;
3783 });
7c673cae
FG
3784
3785 string etag;
7c673cae 3786 real_time set_mtime;
81eedcae 3787 uint64_t expected_size = 0;
7c673cae
FG
3788
3789 RGWObjState *dest_state = NULL;
3790
3791 const real_time *pmod = mod_ptr;
3792
3793 obj_time_weight dest_mtime_weight;
3794
3795 if (copy_if_newer) {
3796 /* need to get mtime for destination */
9f95a23c 3797 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false, null_yield);
7c673cae
FG
3798 if (ret < 0)
3799 goto set_err_state;
3800
3801 if (!real_clock::is_zero(dest_state->mtime)) {
3802 dest_mtime_weight.init(dest_state);
3803 pmod = &dest_mtime_weight.mtime;
3804 }
3805 }
3806
181888fb
FG
3807 static constexpr bool prepend_meta = true;
3808 static constexpr bool get_op = true;
3809 static constexpr bool rgwx_stat = false;
3810 static constexpr bool sync_manifest = true;
3811 static constexpr bool skip_decrypt = true;
7c673cae
FG
3812 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
3813 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3814 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3815 sync_manifest, skip_decrypt,
3816 true,
3817 &cb, &in_stream_req);
7c673cae
FG
3818 if (ret < 0) {
3819 goto set_err_state;
3820 }
3821
81eedcae
TL
3822 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
3823 &expected_size, nullptr, nullptr);
7c673cae
FG
3824 if (ret < 0) {
3825 goto set_err_state;
3826 }
28e407b8
AA
3827 ret = cb.flush();
3828 if (ret < 0) {
3829 goto set_err_state;
3830 }
81eedcae
TL
3831 if (cb.get_data_len() != expected_size) {
3832 ret = -EIO;
3833 ldout(cct, 0) << "ERROR: object truncated during fetching, expected "
3834 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
3835 goto set_err_state;
3836 }
7c673cae
FG
3837 if (compressor && compressor->is_compressed()) {
3838 bufferlist tmp;
3839 RGWCompressionInfo cs_info;
3840 cs_info.compression_type = plugin->get_type_name();
3841 cs_info.orig_size = cb.get_data_len();
3842 cs_info.blocks = move(compressor->get_compression_blocks());
11fdf7f2 3843 encode(cs_info, tmp);
7c673cae
FG
3844 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
3845 }
3846
9f95a23c
TL
3847 if (override_owner) {
3848 processor.set_owner(*override_owner);
3849
3850 auto& obj_attrs = cb.get_attrs();
3851
3852 RGWUserInfo owner_info;
3853 if (ctl.user->get_info_by_uid(*override_owner, &owner_info, null_yield) < 0) {
3854 ldout(cct, 10) << "owner info does not exist" << dendl;
3855 return -EINVAL;
3856 }
3857
3858 RGWAccessControlPolicy acl;
3859
3860 auto aiter = obj_attrs.find(RGW_ATTR_ACL);
3861 if (aiter == obj_attrs.end()) {
3862 ldout(cct, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
3863 acl.create_default(owner_info.user_id, owner_info.display_name);
3864 } else {
3865 auto iter = aiter->second.cbegin();
3866 try {
3867 acl.decode(iter);
3868 } catch (buffer::error& err) {
3869 ldout(cct, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
3870 return -EIO;
3871 }
3872 }
3873
3874 ACLOwner new_owner;
3875 new_owner.set_id(*override_owner);
3876 new_owner.set_name(owner_info.display_name);
3877
3878 acl.set_owner(new_owner);
3879
3880 bufferlist bl;
3881 acl.encode(bl);
3882 obj_attrs[RGW_ATTR_ACL] = std::move(bl);
3883 }
3884
7c673cae
FG
3885 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
3886 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
3887 } else {
3888 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
3889 if (iter != cb.get_attrs().end()) {
3890 try {
11fdf7f2 3891 decode(delete_at, iter->second);
7c673cae
FG
3892 } catch (buffer::error& err) {
3893 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
3894 }
3895 }
3896 }
3897
3898 if (src_mtime) {
3899 *src_mtime = set_mtime;
3900 }
3901
3902 if (petag) {
3903 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
3904 if (iter != cb.get_attrs().end()) {
11fdf7f2 3905 *petag = iter->second.to_str();
7c673cae
FG
3906 }
3907 }
3908
11fdf7f2
TL
3909 //erase the append attr
3910 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
3911
7c673cae
FG
3912 if (source_zone.empty()) {
3913 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
3914 } else {
3915 attrs = cb.get_attrs();
3916 }
3917
3918 if (copy_if_newer) {
3919 uint64_t pg_ver = 0;
3920 auto i = attrs.find(RGW_ATTR_PG_VER);
3921 if (i != attrs.end() && i->second.length() > 0) {
11fdf7f2 3922 auto iter = i->second.cbegin();
7c673cae 3923 try {
11fdf7f2 3924 decode(pg_ver, iter);
7c673cae
FG
3925 } catch (buffer::error& err) {
3926 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
3927 /* non critical error */
3928 }
3929 }
11fdf7f2 3930 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
7c673cae
FG
3931 }
3932
3933#define MAX_COMPLETE_RETRY 100
3934 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
11fdf7f2
TL
3935 bool canceled = false;
3936 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
3937 attrs, delete_at, nullptr, nullptr, nullptr,
9f95a23c 3938 zones_trace, &canceled, null_yield);
7c673cae
FG
3939 if (ret < 0) {
3940 goto set_err_state;
3941 }
11fdf7f2 3942 if (copy_if_newer && canceled) {
7c673cae 3943 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
11fdf7f2 3944 obj_ctx.invalidate(dest_obj); /* object was overwritten */
9f95a23c 3945 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false, null_yield);
7c673cae
FG
3946 if (ret < 0) {
3947 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
3948 goto set_err_state;
3949 }
3950 dest_mtime_weight.init(dest_state);
3951 dest_mtime_weight.high_precision = high_precision_time;
3952 if (!dest_state->exists ||
3953 dest_mtime_weight < set_mtime_weight) {
3954 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
3955 continue;
3956 } else {
3957 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
3958 }
3959 }
3960 break;
3961 }
3962
3963 if (i == MAX_COMPLETE_RETRY) {
3964 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
3965 ret = -EIO;
3966 goto set_err_state;
3967 }
3968
81eedcae
TL
3969 if (bytes_transferred) {
3970 *bytes_transferred = cb.get_data_len();
3971 }
7c673cae
FG
3972 return 0;
3973set_err_state:
3974 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
3975 // we may have already fetched during sync of OP_ADD, but were waiting
3976 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
3977 if (olh_epoch && *olh_epoch > 0) {
3978 constexpr bool log_data_change = true;
3979 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
9f95a23c 3980 *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
91327a77
AA
3981 } else {
3982 // we already have the latest copy
3983 ret = 0;
3984 }
7c673cae 3985 }
7c673cae
FG
3986 return ret;
3987}
3988
3989
3990int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
3991 map<string, bufferlist>& src_attrs,
3992 RGWRados::Object::Read& read_op,
3993 const rgw_user& user_id,
3994 rgw_obj& dest_obj,
3995 real_time *mtime)
3996{
3997 string etag;
3998
11fdf7f2 3999 RGWRESTStreamS3PutObj *out_stream_req;
7c673cae 4000
11fdf7f2
TL
4001 auto rest_master_conn = svc.zone->get_master_conn();
4002
4003 int ret = rest_master_conn->put_obj_async(user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
7c673cae 4004 if (ret < 0) {
7c673cae
FG
4005 return ret;
4006 }
4007
9f95a23c 4008 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
224ce89b
WB
4009 if (ret < 0) {
4010 delete out_stream_req;
7c673cae 4011 return ret;
224ce89b 4012 }
7c673cae
FG
4013
4014 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
4015 if (ret < 0)
4016 return ret;
4017
4018 return 0;
4019}
4020
4021/**
4022 * Copy an object.
4023 * dest_obj: the object to copy into
4024 * src_obj: the object to copy from
4025 * attrs: usage depends on attrs_mod parameter
4026 * attrs_mod: the modification mode of the attrs, may have the following values:
4027 * ATTRSMOD_NONE - the attributes of the source object will be
4028 * copied without modifications, attrs parameter is ignored;
4029 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4030 * parameter, source object attributes are not copied;
4031 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4032 * are overwritten by values contained in attrs parameter.
4033 * err: stores any errors resulting from the get of the original object
4034 * Returns: 0 on success, -ERR# otherwise.
4035 */
4036int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4037 const rgw_user& user_id,
7c673cae 4038 req_info *info,
9f95a23c 4039 const rgw_zone_id& source_zone,
7c673cae
FG
4040 rgw_obj& dest_obj,
4041 rgw_obj& src_obj,
4042 RGWBucketInfo& dest_bucket_info,
4043 RGWBucketInfo& src_bucket_info,
11fdf7f2 4044 const rgw_placement_rule& dest_placement,
7c673cae
FG
4045 real_time *src_mtime,
4046 real_time *mtime,
4047 const real_time *mod_ptr,
4048 const real_time *unmod_ptr,
4049 bool high_precision_time,
4050 const char *if_match,
4051 const char *if_nomatch,
4052 AttrsMod attrs_mod,
4053 bool copy_if_newer,
4054 map<string, bufferlist>& attrs,
4055 RGWObjCategory category,
4056 uint64_t olh_epoch,
4057 real_time delete_at,
4058 string *version_id,
4059 string *ptag,
11fdf7f2 4060 string *petag,
7c673cae 4061 void (*progress_cb)(off_t, void *),
9f95a23c
TL
4062 void *progress_data,
4063 const DoutPrefixProvider *dpp,
4064 optional_yield y)
7c673cae
FG
4065{
4066 int ret;
4067 uint64_t obj_size;
4068 rgw_obj shadow_obj = dest_obj;
4069 string shadow_oid;
4070
4071 bool remote_src;
4072 bool remote_dest;
4073
4074 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
4075 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
4076
11fdf7f2
TL
4077 auto& zonegroup = svc.zone->get_zonegroup();
4078
4079 remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
4080 remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
7c673cae
FG
4081
4082 if (remote_src && remote_dest) {
9f95a23c 4083 ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7c673cae
FG
4084 return -EINVAL;
4085 }
4086
9f95a23c 4087 ldpp_dout(dpp, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
7c673cae
FG
4088
4089 if (remote_src || !source_zone.empty()) {
11fdf7f2 4090 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
9f95a23c 4091 dest_obj, src_obj, dest_bucket_info, &src_bucket_info,
11fdf7f2 4092 dest_placement, src_mtime, mtime, mod_ptr,
7c673cae
FG
4093 unmod_ptr, high_precision_time,
4094 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
9f95a23c
TL
4095 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
4096 nullptr /* filter */);
7c673cae
FG
4097 }
4098
4099 map<string, bufferlist> src_attrs;
4100 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
4101 RGWRados::Object::Read read_op(&src_op_target);
4102
4103 read_op.conds.mod_ptr = mod_ptr;
4104 read_op.conds.unmod_ptr = unmod_ptr;
4105 read_op.conds.high_precision_time = high_precision_time;
4106 read_op.conds.if_match = if_match;
4107 read_op.conds.if_nomatch = if_nomatch;
4108 read_op.params.attrs = &src_attrs;
4109 read_op.params.lastmod = src_mtime;
4110 read_op.params.obj_size = &obj_size;
7c673cae 4111
9f95a23c 4112 ret = read_op.prepare(y);
7c673cae
FG
4113 if (ret < 0) {
4114 return ret;
4115 }
94b18763
FG
4116 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4117 // Current implementation does not follow S3 spec and even
4118 // may result in data corruption silently when copying
4119 // multipart objects acorss pools. So reject COPY operations
4120 //on encrypted objects before it is fully functional.
9f95a23c 4121 ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
94b18763
FG
4122 << " has not been implemented." << dendl;
4123 return -ERR_NOT_IMPLEMENTED;
4124 }
7c673cae
FG
4125
4126 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4127 src_attrs.erase(RGW_ATTR_DELETE_AT);
4128
4129 set_copy_attrs(src_attrs, attrs, attrs_mod);
4130 attrs.erase(RGW_ATTR_ID_TAG);
4131 attrs.erase(RGW_ATTR_PG_VER);
4132 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4133 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4134 if (cmp != src_attrs.end())
4135 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4136
4137 RGWObjManifest manifest;
4138 RGWObjState *astate = NULL;
4139
9f95a23c 4140 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate, y);
7c673cae
FG
4141 if (ret < 0) {
4142 return ret;
4143 }
4144
4145 vector<rgw_raw_obj> ref_objs;
4146
4147 if (remote_dest) {
4148 /* dest is in a different zonegroup, copy it there */
4149 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
4150 }
4151 uint64_t max_chunk_size;
4152
4153 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
4154 if (ret < 0) {
9f95a23c 4155 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
7c673cae
FG
4156 return ret;
4157 }
4158
4159 rgw_pool src_pool;
4160 rgw_pool dest_pool;
11fdf7f2
TL
4161
4162 const rgw_placement_rule *src_rule{nullptr};
4163
9f95a23c
TL
4164 if (astate->manifest) {
4165 src_rule = &astate->manifest->get_tail_placement().placement_rule;
4166 ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
11fdf7f2
TL
4167 }
4168
4169 if (!src_rule || src_rule->empty()) {
4170 src_rule = &src_bucket_info.placement_rule;
4171 }
4172
4173 if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
9f95a23c 4174 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7c673cae
FG
4175 return -EIO;
4176 }
11fdf7f2
TL
4177
4178 if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
9f95a23c 4179 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7c673cae
FG
4180 return -EIO;
4181 }
4182
9f95a23c 4183 ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
11fdf7f2
TL
4184 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4185
9f95a23c 4186 bool copy_data = (!astate->manifest) ||
11fdf7f2
TL
4187 (*src_rule != dest_placement) ||
4188 (src_pool != dest_pool);
7c673cae 4189
7c673cae 4190 bool copy_first = false;
9f95a23c
TL
4191 if (astate->manifest) {
4192 if (!astate->manifest->has_tail()) {
7c673cae
FG
4193 copy_data = true;
4194 } else {
9f95a23c 4195 uint64_t head_size = astate->manifest->get_head_size();
7c673cae
FG
4196
4197 if (head_size > 0) {
4198 if (head_size > max_chunk_size) {
4199 copy_data = true;
4200 } else {
4201 copy_first = true;
4202 }
4203 }
4204 }
4205 }
4206
4207 if (petag) {
4208 const auto iter = attrs.find(RGW_ATTR_ETAG);
4209 if (iter != attrs.end()) {
11fdf7f2 4210 *petag = iter->second.to_str();
7c673cae
FG
4211 }
4212 }
4213
4214 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
11fdf7f2
TL
4215 attrs.erase(RGW_ATTR_TAIL_TAG);
4216 return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
9f95a23c 4217 mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
7c673cae
FG
4218 }
4219
9f95a23c 4220 RGWObjManifest::obj_iterator miter = astate->manifest->obj_begin();
7c673cae
FG
4221
4222 if (copy_first) { // we need to copy first chunk, not increase refcount
4223 ++miter;
4224 }
4225
4226 rgw_rados_ref ref;
4227 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
4228 if (ret < 0) {
4229 return ret;
4230 }
4231
7c673cae
FG
4232 bufferlist first_chunk;
4233
4234 bool copy_itself = (dest_obj == src_obj);
4235 RGWObjManifest *pmanifest;
9f95a23c 4236 ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae
FG
4237
4238 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
4239 RGWRados::Object::Write write_op(&dest_op_target);
4240
4241 string tag;
4242
4243 if (ptag) {
4244 tag = *ptag;
4245 }
4246
4247 if (tag.empty()) {
4248 append_rand_alpha(cct, tag, tag, 32);
4249 }
4250
4251 if (!copy_itself) {
181888fb 4252 attrs.erase(RGW_ATTR_TAIL_TAG);
9f95a23c 4253 manifest = *astate->manifest;
7c673cae
FG
4254 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4255 if (tail_placement.bucket.name.empty()) {
4256 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
4257 }
3efd9988 4258 string ref_tag;
9f95a23c 4259 for (; miter != astate->manifest->obj_end(); ++miter) {
7c673cae 4260 ObjectWriteOperation op;
3efd9988
FG
4261 ref_tag = tag + '\0';
4262 cls_refcount_get(op, ref_tag, true);
7c673cae 4263 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
7c673cae 4264
9f95a23c
TL
4265 auto& ioctx = ref.pool.ioctx();
4266 ioctx.locator_set_key(loc.loc);
4267
4268 ret = rgw_rados_operate(ioctx, loc.oid, &op, null_yield);
7c673cae
FG
4269 if (ret < 0) {
4270 goto done_ret;
4271 }
4272
4273 ref_objs.push_back(loc);
4274 }
4275
4276 pmanifest = &manifest;
4277 } else {
9f95a23c 4278 pmanifest = &(*astate->manifest);
7c673cae
FG
4279 /* don't send the object's tail for garbage collection */
4280 astate->keep_tail = true;
4281 }
4282
4283 if (copy_first) {
9f95a23c 4284 ret = read_op.read(0, max_chunk_size, first_chunk, y);
7c673cae
FG
4285 if (ret < 0) {
4286 goto done_ret;
4287 }
4288
4289 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
4290 } else {
4291 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
4292 }
4293
4294 write_op.meta.data = &first_chunk;
4295 write_op.meta.manifest = pmanifest;
4296 write_op.meta.ptag = &tag;
4297 write_op.meta.owner = dest_bucket_info.owner;
4298 write_op.meta.mtime = mtime;
4299 write_op.meta.flags = PUT_OBJ_CREATE;
4300 write_op.meta.category = category;
4301 write_op.meta.olh_epoch = olh_epoch;
4302 write_op.meta.delete_at = delete_at;
181888fb 4303 write_op.meta.modify_tail = !copy_itself;
7c673cae 4304
9f95a23c 4305 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs, y);
7c673cae
FG
4306 if (ret < 0) {
4307 goto done_ret;
4308 }
4309
4310 return 0;
4311
4312done_ret:
4313 if (!copy_itself) {
4314 vector<rgw_raw_obj>::iterator riter;
4315
7c673cae 4316 /* rollback reference */
92f5a8d4 4317 string ref_tag = tag + '\0';
7c673cae
FG
4318 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4319 ObjectWriteOperation op;
92f5a8d4 4320 cls_refcount_put(op, ref_tag, true);
7c673cae 4321
9f95a23c 4322 ref.pool.ioctx().locator_set_key(riter->loc);
7c673cae 4323
9f95a23c 4324 int r = rgw_rados_operate(ref.pool.ioctx(), riter->oid, &op, null_yield);
7c673cae 4325 if (r < 0) {
9f95a23c 4326 ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
7c673cae
FG
4327 }
4328 }
4329 }
4330 return ret;
4331}
4332
4333
4334int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
4335 RGWBucketInfo& dest_bucket_info,
11fdf7f2 4336 const rgw_placement_rule& dest_placement,
7c673cae 4337 RGWRados::Object::Read& read_op, off_t end,
11fdf7f2 4338 const rgw_obj& dest_obj,
7c673cae
FG
4339 real_time *mtime,
4340 real_time set_mtime,
4341 map<string, bufferlist>& attrs,
7c673cae
FG
4342 uint64_t olh_epoch,
4343 real_time delete_at,
9f95a23c
TL
4344 string *petag,
4345 const DoutPrefixProvider *dpp,
4346 optional_yield y)
7c673cae 4347{
7c673cae
FG
4348 string tag;
4349 append_rand_alpha(cct, tag, tag, 32);
4350
9f95a23c 4351 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 4352 using namespace rgw::putobj;
9f95a23c
TL
4353 // do not change the null_yield in the initialization of this AtomicObjectProcessor
4354 // it causes crashes in the ragweed tests
4355 AtomicObjectProcessor processor(&aio, this->store, dest_bucket_info, &dest_placement,
11fdf7f2 4356 dest_bucket_info.owner, obj_ctx,
9f95a23c
TL
4357 dest_obj, olh_epoch, tag, dpp, null_yield);
4358 int ret = processor.prepare(y);
7c673cae
FG
4359 if (ret < 0)
4360 return ret;
4361
4362 off_t ofs = 0;
4363
4364 do {
4365 bufferlist bl;
9f95a23c 4366 ret = read_op.read(ofs, end, bl, y);
11fdf7f2 4367 if (ret < 0) {
9f95a23c 4368 ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
11fdf7f2
TL
4369 return ret;
4370 }
7c673cae
FG
4371
4372 uint64_t read_len = ret;
11fdf7f2
TL
4373 ret = processor.process(std::move(bl), ofs);
4374 if (ret < 0) {
4375 return ret;
4376 }
7c673cae
FG
4377
4378 ofs += read_len;
4379 } while (ofs <= end);
4380
11fdf7f2
TL
4381 // flush
4382 ret = processor.process({}, ofs);
4383 if (ret < 0) {
4384 return ret;
4385 }
4386
7c673cae
FG
4387 string etag;
4388 auto iter = attrs.find(RGW_ATTR_ETAG);
4389 if (iter != attrs.end()) {
4390 bufferlist& bl = iter->second;
11fdf7f2 4391 etag = bl.to_str();
7c673cae 4392 if (petag) {
11fdf7f2 4393 *petag = etag;
7c673cae
FG
4394 }
4395 }
4396
4397 uint64_t accounted_size;
4398 {
4399 bool compressed{false};
4400 RGWCompressionInfo cs_info;
4401 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4402 if (ret < 0) {
9f95a23c 4403 ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
7c673cae
FG
4404 return ret;
4405 }
4406 // pass original size if compressed
4407 accounted_size = compressed ? cs_info.orig_size : ofs;
4408 }
4409
11fdf7f2 4410 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
9f95a23c 4411 nullptr, nullptr, nullptr, nullptr, nullptr, y);
7c673cae
FG
4412}
4413
11fdf7f2
TL
4414int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
4415 RGWBucketInfo& bucket_info,
4416 rgw_obj& obj,
4417 const rgw_placement_rule& placement_rule,
4418 const real_time& mtime,
9f95a23c
TL
4419 uint64_t olh_epoch,
4420 const DoutPrefixProvider *dpp,
4421 optional_yield y)
7c673cae 4422{
11fdf7f2
TL
4423 map<string, bufferlist> attrs;
4424 real_time read_mtime;
4425 uint64_t obj_size;
7c673cae 4426
9f95a23c
TL
4427 obj_ctx.set_atomic(obj);
4428
11fdf7f2
TL
4429 RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
4430 RGWRados::Object::Read read_op(&op_target);
7c673cae 4431
11fdf7f2
TL
4432 read_op.params.attrs = &attrs;
4433 read_op.params.lastmod = &read_mtime;
4434 read_op.params.obj_size = &obj_size;
7c673cae 4435
9f95a23c 4436 int ret = read_op.prepare(y);
11fdf7f2
TL
4437 if (ret < 0) {
4438 return ret;
7c673cae
FG
4439 }
4440
11fdf7f2
TL
4441 if (read_mtime != mtime) {
4442 /* raced */
4443 return -ECANCELED;
7c673cae
FG
4444 }
4445
9f95a23c
TL
4446 attrs.erase(RGW_ATTR_ID_TAG);
4447 attrs.erase(RGW_ATTR_TAIL_TAG);
4448
11fdf7f2
TL
4449 ret = copy_obj_data(obj_ctx,
4450 bucket_info,
4451 placement_rule,
4452 read_op,
4453 obj_size - 1,
4454 obj,
4455 nullptr /* pmtime */,
4456 mtime,
4457 attrs,
4458 olh_epoch,
4459 real_time(),
9f95a23c
TL
4460 nullptr /* petag */,
4461 dpp,
4462 y);
11fdf7f2
TL
4463 if (ret < 0) {
4464 return ret;
7c673cae
FG
4465 }
4466
11fdf7f2 4467 return 0;
7c673cae
FG
4468}
4469
9f95a23c 4470int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info, optional_yield y)
7c673cae 4471{
9f95a23c
TL
4472 constexpr uint NUM_ENTRIES = 1000u;
4473
7c673cae
FG
4474 rgw_obj_index_key marker;
4475 string prefix;
4476 bool is_truncated;
4477
4478 do {
9f95a23c
TL
4479 std::vector<rgw_bucket_dir_entry> ent_list;
4480 ent_list.reserve(NUM_ENTRIES);
4481
1adf2230
AA
4482 int r = cls_bucket_list_unordered(bucket_info,
4483 RGW_NO_SHARD,
4484 marker,
4485 prefix,
4486 NUM_ENTRIES,
4487 true,
4488 ent_list,
4489 &is_truncated,
9f95a23c
TL
4490 &marker,
4491 y);
4492 if (r < 0) {
7c673cae 4493 return r;
9f95a23c 4494 }
7c673cae
FG
4495
4496 string ns;
1adf2230 4497 for (auto const& dirent : ent_list) {
7c673cae
FG
4498 rgw_obj_key obj;
4499
9f95a23c 4500 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
7c673cae 4501 return -ENOTEMPTY;
9f95a23c 4502 }
7c673cae
FG
4503 }
4504 } while (is_truncated);
1adf2230 4505
7c673cae
FG
4506 return 0;
4507}
4508
4509/**
4510 * Delete a bucket.
4511 * bucket: the name of the bucket to delete
4512 * Returns 0 on success, -ERR# otherwise.
4513 */
9f95a23c 4514int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, bool check_empty)
7c673cae
FG
4515{
4516 const rgw_bucket& bucket = bucket_info.bucket;
9f95a23c 4517 RGWSI_RADOS::Pool index_pool;
7c673cae 4518 map<int, string> bucket_objs;
9f95a23c 4519 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
4520 if (r < 0)
4521 return r;
4522
4523 if (check_empty) {
9f95a23c 4524 r = check_bucket_empty(bucket_info, y);
7c673cae
FG
4525 if (r < 0) {
4526 return r;
4527 }
4528 }
9f95a23c
TL
4529
4530 bool remove_ep = true;
4531
4532 if (objv_tracker.read_version.empty()) {
4533 RGWBucketEntryPoint ep;
4534 r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
4535 &ep,
4536 null_yield,
4537 RGWBucketCtl::Bucket::GetParams()
4538 .set_objv_tracker(&objv_tracker));
4539 if (r < 0 ||
4540 (!bucket_info.bucket.bucket_id.empty() &&
4541 ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
4542 if (r != -ENOENT) {
4543 ldout(cct, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
4544 /* we have no idea what caused the error, will not try to remove it */
4545 }
4546 /*
4547 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4548 * requested
4549 */
4550 remove_ep = false;
4551 }
4552 }
4553
4554 if (remove_ep) {
4555 r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield,
4556 RGWBucketCtl::Bucket::RemoveParams()
4557 .set_objv_tracker(&objv_tracker));
4558 if (r < 0)
4559 return r;
4560 }
7c673cae
FG
4561
4562 /* if the bucket is not synced we can remove the meta file */
11fdf7f2 4563 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
7c673cae 4564 RGWObjVersionTracker objv_tracker;
9f95a23c 4565 r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield);
7c673cae
FG
4566 if (r < 0) {
4567 return r;
4568 }
f64942e4
AA
4569
4570 /* remove bucket index objects asynchronously by best effort */
9f95a23c 4571 (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
f64942e4
AA
4572 bucket_objs,
4573 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 4574 }
f64942e4 4575
7c673cae
FG
4576 return 0;
4577}
4578
4579int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
4580{
4581 RGWBucketInfo info;
4582 map<string, bufferlist> attrs;
31f18b77 4583 int r;
9f95a23c
TL
4584 auto obj_ctx = svc.sysobj->init_obj_ctx();
4585
31f18b77 4586 if (bucket.bucket_id.empty()) {
9f95a23c 4587 r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, &attrs);
31f18b77 4588 } else {
9f95a23c 4589 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs, null_yield);
31f18b77 4590 }
7c673cae
FG
4591 if (r < 0) {
4592 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4593 return r;
4594 }
4595
4596 info.owner = owner.get_id();
4597
4598 r = put_bucket_instance_info(info, false, real_time(), &attrs);
4599 if (r < 0) {
4600 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4601 return r;
4602 }
4603
4604 return 0;
4605}
4606
4607
4608int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
4609{
4610 int ret = 0;
4611
4612 vector<rgw_bucket>::iterator iter;
4613
4614 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
4615 rgw_bucket& bucket = *iter;
4616 if (enabled)
4617 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
4618 else
4619 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
4620
4621 RGWBucketInfo info;
4622 map<string, bufferlist> attrs;
9f95a23c 4623 int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, &attrs);
7c673cae
FG
4624 if (r < 0) {
4625 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4626 ret = r;
4627 continue;
4628 }
4629 if (enabled) {
4630 info.flags &= ~BUCKET_SUSPENDED;
4631 } else {
4632 info.flags |= BUCKET_SUSPENDED;
4633 }
4634
4635 r = put_bucket_instance_info(info, false, real_time(), &attrs);
4636 if (r < 0) {
4637 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4638 ret = r;
4639 continue;
4640 }
4641 }
4642 return ret;
4643}
4644
4645int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
4646{
4647 RGWBucketInfo bucket_info;
9f95a23c 4648 int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield);
7c673cae
FG
4649 if (ret < 0) {
4650 return ret;
4651 }
4652
4653 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
4654 return 0;
4655}
4656
4657int RGWRados::Object::complete_atomic_modification()
4658{
9f95a23c 4659 if ((!state->manifest)|| state->keep_tail)
7c673cae
FG
4660 return 0;
4661
4662 cls_rgw_obj_chain chain;
9f95a23c 4663 store->update_gc_chain(obj, *state->manifest, &chain);
7c673cae
FG
4664
4665 if (chain.empty()) {
4666 return 0;
4667 }
4668
181888fb 4669 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
9f95a23c
TL
4670 auto ret = store->gc->send_chain(chain, tag); // do it synchronously
4671 if (ret < 0) {
4672 //Delete objects inline if send chain to gc fails
4673 store->delete_objs_inline(chain, tag);
4674 }
4675 return 0;
7c673cae
FG
4676}
4677
4678void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
4679{
4680 RGWObjManifest::obj_iterator iter;
4681 rgw_raw_obj raw_head;
4682 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
4683 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
4684 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
4685 if (mobj == raw_head)
4686 continue;
4687 cls_rgw_obj_key key(mobj.oid);
4688 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
4689 }
4690}
4691
9f95a23c 4692int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4693{
9f95a23c 4694 return gc->send_chain(chain, tag);
7c673cae
FG
4695}
4696
9f95a23c 4697void RGWRados::delete_objs_inline(cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4698{
9f95a23c
TL
4699 string last_pool;
4700 std::unique_ptr<IoCtx> ctx(new IoCtx);
4701 int ret = 0;
4702 for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
4703 cls_rgw_obj& obj = *liter;
4704 if (obj.pool != last_pool) {
4705 ctx.reset(new IoCtx);
4706 ret = rgw_init_ioctx(get_rados_handle(), obj.pool, *ctx);
4707 if (ret < 0) {
4708 last_pool = "";
4709 ldout(cct, 0) << "ERROR: failed to create ioctx pool=" <<
4710 obj.pool << dendl;
4711 continue;
4712 }
4713 last_pool = obj.pool;
4714 }
4715 ctx->locator_set_key(obj.loc);
4716 const string& oid = obj.key.name; /* just stored raw oid there */
4717 ldout(cct, 5) << "delete_objs_inline: removing " << obj.pool <<
4718 ":" << obj.key.name << dendl;
4719 ObjectWriteOperation op;
4720 cls_refcount_put(op, tag, true);
4721 ret = ctx->operate(oid, &op);
4722 if (ret < 0) {
4723 ldout(cct, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
4724 }
7c673cae 4725 }
7c673cae
FG
4726}
4727
4728static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
4729 map<RGWObjCategory, RGWStorageStats>& stats)
4730{
4731 for (const auto& pair : header.stats) {
4732 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
4733 const rgw_bucket_category_stats& header_stats = pair.second;
4734
4735 RGWStorageStats& s = stats[category];
4736
4737 s.category = category;
4738 s.size += header_stats.total_size;
4739 s.size_rounded += header_stats.total_size_rounded;
4740 s.size_utilized += header_stats.actual_size;
4741 s.num_objects += header_stats.num_entries;
4742 }
4743}
4744
4745int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
4746 map<RGWObjCategory, RGWStorageStats> *existing_stats,
4747 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
4748{
9f95a23c 4749 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
4750 // key - bucket index object id
4751 // value - bucket index check OP returned result with the given bucket index object (shard)
4752 map<int, string> oids;
4753 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
31f18b77 4754
9f95a23c 4755 int ret = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &oids, nullptr);
31f18b77
FG
4756 if (ret < 0) {
4757 return ret;
4758 }
7c673cae 4759
9f95a23c
TL
4760 for (auto& iter : oids) {
4761 bucket_objs_ret[iter.first] = rgw_cls_check_index_ret();
4762 }
4763
4764 ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77
FG
4765 if (ret < 0) {
4766 return ret;
4767 }
7c673cae
FG
4768
4769 // Aggregate results (from different shards if there is any)
4770 map<int, struct rgw_cls_check_index_ret>::iterator iter;
4771 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
4772 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
4773 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
4774 }
4775
4776 return 0;
4777}
4778
4779int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
4780{
9f95a23c 4781 RGWSI_RADOS::Pool index_pool;
7c673cae 4782 map<int, string> bucket_objs;
31f18b77 4783
9f95a23c 4784 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77 4785 if (r < 0) {
7c673cae 4786 return r;
31f18b77 4787 }
7c673cae 4788
9f95a23c 4789 return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
4790}
4791
f64942e4 4792int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77 4793{
9f95a23c 4794 RGWSI_RADOS::Pool index_pool;
31f18b77
FG
4795 map<int, string> bucket_objs;
4796
9f95a23c 4797 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77
FG
4798 if (r < 0) {
4799 return r;
4800 }
4801
9f95a23c 4802 return CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77 4803}
7c673cae 4804
9f95a23c 4805int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
7c673cae
FG
4806{
4807 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
4808 std::string oid, key;
4809 get_obj_bucket_and_oid_loc(obj, oid, key);
4810 if (!rctx)
4811 return 0;
4812
4813 RGWObjState *state = NULL;
4814
9f95a23c 4815 int r = get_obj_state(rctx, bucket_info, obj, &state, false, y);
7c673cae
FG
4816 if (r < 0)
4817 return r;
4818
4819 if (!state->is_atomic) {
4820 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
4821 return -EINVAL;
4822 }
4823
181888fb
FG
4824 string tag;
4825
4826 if (state->tail_tag.length() > 0) {
4827 tag = state->tail_tag.c_str();
4828 } else if (state->obj_tag.length() > 0) {
4829 tag = state->obj_tag.c_str();
4830 } else {
7c673cae
FG
4831 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
4832 return -EINVAL;
4833 }
4834
7c673cae
FG
4835 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
4836
9f95a23c
TL
4837 cls_rgw_obj_chain chain;
4838 update_gc_chain(state->obj, *state->manifest, &chain);
4839 return gc->async_defer_chain(tag, chain);
7c673cae
FG
4840}
4841
4842void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
4843{
4844 list<string> prefixes;
4845 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
4846 cls_rgw_remove_obj(op, prefixes);
4847}
4848
4849void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
4850{
4851 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
4852}
4853
4854void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
4855{
4856 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
4857}
4858
9f95a23c
TL
4859struct tombstone_entry {
4860 ceph::real_time mtime;
4861 uint32_t zone_short_id;
4862 uint64_t pg_ver;
4863
4864 tombstone_entry() = default;
4865 explicit tombstone_entry(const RGWObjState& state)
4866 : mtime(state.mtime), zone_short_id(state.zone_short_id),
4867 pg_ver(state.pg_ver) {}
4868};
7c673cae
FG
4869
4870/**
4871 * Delete an object.
4872 * bucket: name of the bucket storing the object
4873 * obj: name of the object to delete
4874 * Returns: 0 on success, -ERR# otherwise.
4875 */
9f95a23c 4876int RGWRados::Object::Delete::delete_obj(optional_yield y)
7c673cae
FG
4877{
4878 RGWRados *store = target->get_store();
4879 rgw_obj& src_obj = target->get_obj();
4880 const string& instance = src_obj.key.instance;
4881 rgw_obj obj = src_obj;
4882
4883 if (instance == "null") {
4884 obj.key.instance.clear();
4885 }
4886
4887 bool explicit_marker_version = (!params.marker_version_id.empty());
4888
4889 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
4890 if (instance.empty() || explicit_marker_version) {
4891 rgw_obj marker = obj;
4892
4893 if (!params.marker_version_id.empty()) {
4894 if (params.marker_version_id != "null") {
4895 marker.key.set_instance(params.marker_version_id);
4896 }
4897 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
4898 store->gen_rand_obj_instance_name(&marker);
4899 }
4900
4901 result.version_id = marker.key.instance;
91327a77
AA
4902 if (result.version_id.empty())
4903 result.version_id = "null";
7c673cae
FG
4904 result.delete_marker = true;
4905
4906 struct rgw_bucket_dir_entry_meta meta;
4907
4908 meta.owner = params.obj_owner.get_id().to_str();
4909 meta.owner_display_name = params.obj_owner.get_display_name();
4910
4911 if (real_clock::is_zero(params.mtime)) {
4912 meta.mtime = real_clock::now();
4913 } else {
4914 meta.mtime = params.mtime;
4915 }
4916
9f95a23c 4917 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
7c673cae
FG
4918 if (r < 0) {
4919 return r;
4920 }
4921 } else {
4922 rgw_bucket_dir_entry dirent;
4923
4924 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
4925 if (r < 0) {
4926 return r;
4927 }
4928 result.delete_marker = dirent.is_delete_marker();
9f95a23c 4929 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
7c673cae
FG
4930 if (r < 0) {
4931 return r;
4932 }
4933 result.version_id = instance;
4934 }
4935
4936 BucketShard *bs;
4937 int r = target->get_bucket_shard(&bs);
4938 if (r < 0) {
4939 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
4940 return r;
4941 }
4942
9f95a23c
TL
4943 r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
4944 if (r < 0) {
4945 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
4946 return r;
7c673cae
FG
4947 }
4948
4949 return 0;
4950 }
4951
4952 rgw_rados_ref ref;
4953 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
4954 if (r < 0) {
4955 return r;
4956 }
4957
4958 RGWObjState *state;
9f95a23c 4959 r = target->get_state(&state, false, y);
7c673cae
FG
4960 if (r < 0)
4961 return r;
4962
4963 ObjectWriteOperation op;
4964
4965 if (!real_clock::is_zero(params.unmod_since)) {
4966 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
4967 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
4968 if (!params.high_precision_time) {
4969 ctime.tv_nsec = 0;
4970 unmod.tv_nsec = 0;
4971 }
4972
4973 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
4974 if (ctime > unmod) {
4975 return -ERR_PRECONDITION_FAILED;
4976 }
4977
4978 /* only delete object if mtime is less than or equal to params.unmod_since */
4979 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
4980 }
11fdf7f2 4981 uint64_t obj_accounted_size = state->accounted_size;
7c673cae 4982
9f95a23c
TL
4983 if(params.abortmp) {
4984 obj_accounted_size = params.parts_accounted_size;
4985 }
4986
7c673cae
FG
4987 if (!real_clock::is_zero(params.expiration_time)) {
4988 bufferlist bl;
4989 real_time delete_at;
4990
4991 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
4992 try {
11fdf7f2
TL
4993 auto iter = bl.cbegin();
4994 decode(delete_at, iter);
7c673cae
FG
4995 } catch (buffer::error& err) {
4996 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
4997 return -EIO;
4998 }
4999
5000 if (params.expiration_time != delete_at) {
5001 return -ERR_PRECONDITION_FAILED;
5002 }
5003 } else {
5004 return -ERR_PRECONDITION_FAILED;
5005 }
5006 }
5007
5008 if (!state->exists) {
5009 target->invalidate_state();
5010 return -ENOENT;
5011 }
5012
9f95a23c 5013 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false, y);
7c673cae
FG
5014 if (r < 0)
5015 return r;
5016
5017 RGWBucketInfo& bucket_info = target->get_bucket_info();
5018
5019 RGWRados::Bucket bop(store, bucket_info);
5020 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
5021
5022 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
5023 index_op.set_bilog_flags(params.bilog_flags);
5024
9f95a23c 5025 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag, y);
7c673cae
FG
5026 if (r < 0)
5027 return r;
5028
5029 store->remove_rgw_head_obj(op);
9f95a23c
TL
5030
5031 auto& ioctx = ref.pool.ioctx();
5032 r = rgw_rados_operate(ioctx, ref.obj.oid, &op, null_yield);
94b18763
FG
5033
5034 /* raced with another operation, object state is indeterminate */
5035 const bool need_invalidate = (r == -ECANCELED);
7c673cae 5036
9f95a23c 5037 int64_t poolid = ioctx.get_id();
7c673cae
FG
5038 if (r >= 0) {
5039 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5040 if (obj_tombstone_cache) {
5041 tombstone_entry entry{*state};
5042 obj_tombstone_cache->add(obj, entry);
5043 }
9f95a23c 5044 r = index_op.complete_del(poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 5045
7c673cae
FG
5046 int ret = target->complete_atomic_modification();
5047 if (ret < 0) {
5048 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
5049 }
5050 /* other than that, no need to propagate error */
224ce89b
WB
5051 } else {
5052 int ret = index_op.cancel();
5053 if (ret < 0) {
5054 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
5055 }
7c673cae
FG
5056 }
5057
5058 if (need_invalidate) {
5059 target->invalidate_state();
5060 }
5061
5062 if (r < 0)
5063 return r;
5064
5065 /* update quota cache */
11fdf7f2 5066 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
7c673cae
FG
5067
5068 return 0;
5069}
5070
5071int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
5072 const RGWBucketInfo& bucket_info,
5073 const rgw_obj& obj,
5074 int versioning_status,
5075 uint16_t bilog_flags,
31f18b77
FG
5076 const real_time& expiration_time,
5077 rgw_zone_set *zones_trace)
7c673cae
FG
5078{
5079 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5080 RGWRados::Object::Delete del_op(&del_target);
5081
5082 del_op.params.bucket_owner = bucket_info.owner;
5083 del_op.params.versioning_status = versioning_status;
5084 del_op.params.bilog_flags = bilog_flags;
5085 del_op.params.expiration_time = expiration_time;
31f18b77 5086 del_op.params.zones_trace = zones_trace;
7c673cae 5087
9f95a23c 5088 return del_op.delete_obj(null_yield);
7c673cae
FG
5089}
5090
5091int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
5092{
5093 rgw_rados_ref ref;
224ce89b 5094 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
5095 if (r < 0) {
5096 return r;
5097 }
5098
5099 ObjectWriteOperation op;
5100
5101 op.remove();
9f95a23c 5102 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
5103 if (r < 0)
5104 return r;
5105
5106 return 0;
5107}
5108
494da23a 5109int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime)
7c673cae
FG
5110{
5111 std::string oid, key;
5112 get_obj_bucket_and_oid_loc(obj, oid, key);
5113
11fdf7f2 5114 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5115
5116 RGWBucketInfo bucket_info;
9f95a23c 5117 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL, null_yield);
7c673cae
FG
5118 if (ret < 0) {
5119 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
5120 return ret;
5121 }
5122
5123 RGWRados::Bucket bop(this, bucket_info);
5124 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5125
494da23a 5126 return index_op.complete_del(-1 /* pool */, 0, mtime, NULL);
7c673cae
FG
5127}
5128
5129static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
5130{
5131 string tag;
5132
5133 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
5134 if (mi != manifest.obj_end()) {
5135 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5136 ++mi;
5137 tag = mi.get_location().get_raw_obj(store).oid;
5138 tag.append("_");
5139 }
5140
5141 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5142 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5143 MD5 hash;
11fdf7f2 5144 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
7c673cae
FG
5145
5146 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5147 if (iter != attrset.end()) {
5148 bufferlist& bl = iter->second;
11fdf7f2 5149 hash.Update((const unsigned char *)bl.c_str(), bl.length());
7c673cae
FG
5150 }
5151
5152 hash.Final(md5);
5153 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5154 tag.append(md5_str);
5155
5156 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
5157
5158 tag_bl.append(tag.c_str(), tag.size() + 1);
5159}
5160
5161static bool is_olh(map<string, bufferlist>& attrs)
5162{
5163 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5164 return (iter != attrs.end());
5165}
5166
5167static bool has_olh_tag(map<string, bufferlist>& attrs)
5168{
5169 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5170 return (iter != attrs.end());
5171}
5172
5173int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5174 RGWObjState *olh_state, RGWObjState **target_state, optional_yield y)
7c673cae 5175{
11fdf7f2 5176 ceph_assert(olh_state->is_olh);
7c673cae
FG
5177
5178 rgw_obj target;
5179 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
5180 if (r < 0) {
5181 return r;
5182 }
9f95a23c 5183 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false, y);
7c673cae
FG
5184 if (r < 0) {
5185 return r;
5186 }
5187
5188 return 0;
5189}
5190
7c673cae 5191int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5192 RGWObjState **state, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5193{
5194 if (obj.empty()) {
5195 return -EINVAL;
5196 }
5197
5198 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5199
11fdf7f2 5200 RGWObjState *s = rctx->get_state(obj);
7c673cae
FG
5201 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
5202 *state = s;
5203 if (s->has_attrs) {
5204 if (s->is_olh && need_follow_olh) {
9f95a23c 5205 return get_olh_target_state(*rctx, bucket_info, obj, s, state, y);
7c673cae
FG
5206 }
5207 return 0;
5208 }
5209
5210 s->obj = obj;
5211
5212 rgw_raw_obj raw_obj;
5213 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5214
5215 int r = -ENOENT;
5216
5217 if (!assume_noent) {
9f95a23c 5218 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
7c673cae
FG
5219 }
5220
5221 if (r == -ENOENT) {
5222 s->exists = false;
5223 s->has_attrs = true;
5224 tombstone_entry entry;
5225 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5226 s->mtime = entry.mtime;
5227 s->zone_short_id = entry.zone_short_id;
5228 s->pg_ver = entry.pg_ver;
5229 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
5230 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5231 } else {
5232 s->mtime = real_time();
5233 }
5234 return 0;
5235 }
5236 if (r < 0)
5237 return r;
5238
5239 s->exists = true;
5240 s->has_attrs = true;
5241 s->accounted_size = s->size;
5242
11fdf7f2
TL
5243 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5244 if (iter != s->attrset.end()) {
5245 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5246 bufferlist& bletag = iter->second;
5247 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5248 bufferlist newbl;
5249 bletag.splice(0, bletag.length() - 1, &newbl);
5250 bletag.claim(newbl);
5251 }
5252 }
5253
5254 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
5255 const bool compressed = (iter != s->attrset.end());
5256 if (compressed) {
7c673cae
FG
5257 // use uncompressed size for accounted_size
5258 try {
5259 RGWCompressionInfo info;
11fdf7f2
TL
5260 auto p = iter->second.cbegin();
5261 decode(info, p);
31f18b77 5262 s->accounted_size = info.orig_size;
7c673cae
FG
5263 } catch (buffer::error&) {
5264 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
5265 return -EIO;
5266 }
5267 }
5268
5269 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5270 if (iter != s->attrset.end()) {
5271 bufferlist bl = iter->second;
5272 bufferlist::iterator it = bl.begin();
5273 it.copy(bl.length(), s->shadow_obj);
5274 s->shadow_obj[bl.length()] = '\0';
5275 }
5276 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
5277 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5278 if (ttiter != s->attrset.end()) {
5279 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5280 }
7c673cae
FG
5281
5282 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5283 if (manifest_bl.length()) {
11fdf7f2 5284 auto miter = manifest_bl.cbegin();
7c673cae 5285 try {
9f95a23c
TL
5286 s->manifest.emplace();
5287 decode(*s->manifest, miter);
5288 s->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
7c673cae 5289 broken due to old bugs */
9f95a23c 5290 s->size = s->manifest->get_obj_size();
31f18b77
FG
5291 if (!compressed)
5292 s->accounted_size = s->size;
7c673cae
FG
5293 } catch (buffer::error& err) {
5294 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
5295 return -EIO;
5296 }
9f95a23c 5297 ldout(cct, 10) << "manifest: total_size = " << s->manifest->get_obj_size() << dendl;
11fdf7f2 5298 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
9f95a23c 5299 s->manifest->has_explicit_objs()) {
7c673cae 5300 RGWObjManifest::obj_iterator mi;
9f95a23c 5301 for (mi = s->manifest->obj_begin(); mi != s->manifest->obj_end(); ++mi) {
7c673cae
FG
5302 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
5303 }
5304 }
5305
5306 if (!s->obj_tag.length()) {
5307 /*
5308 * Uh oh, something's wrong, object with manifest should have tag. Let's
5309 * create one out of the manifest, would be unique
5310 */
9f95a23c 5311 generate_fake_tag(this, s->attrset, *s->manifest, manifest_bl, s->obj_tag);
7c673cae
FG
5312 s->fake_tag = true;
5313 }
5314 }
5315 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5316 if (aiter != s->attrset.end()) {
5317 bufferlist& pg_ver_bl = aiter->second;
5318 if (pg_ver_bl.length()) {
11fdf7f2 5319 auto pgbl = pg_ver_bl.cbegin();
7c673cae 5320 try {
11fdf7f2 5321 decode(s->pg_ver, pgbl);
7c673cae
FG
5322 } catch (buffer::error& err) {
5323 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5324 }
5325 }
5326 }
5327 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5328 if (aiter != s->attrset.end()) {
5329 bufferlist& zone_short_id_bl = aiter->second;
5330 if (zone_short_id_bl.length()) {
11fdf7f2 5331 auto zbl = zone_short_id_bl.cbegin();
7c673cae 5332 try {
11fdf7f2 5333 decode(s->zone_short_id, zbl);
7c673cae
FG
5334 } catch (buffer::error& err) {
5335 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5336 }
5337 }
5338 }
5339 if (s->obj_tag.length())
31f18b77 5340 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
7c673cae
FG
5341 else
5342 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5343
5344 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5345 * it exist, and not only if is_olh() returns true
5346 */
5347 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5348 if (iter != s->attrset.end()) {
5349 s->olh_tag = iter->second;
5350 }
5351
5352 if (is_olh(s->attrset)) {
5353 s->is_olh = true;
5354
5355 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
5356
5357 if (need_follow_olh) {
9f95a23c
TL
5358 return get_olh_target_state(*rctx, bucket_info, obj, s, state, y);
5359 } else if (obj.key.have_null_instance() && !s->manifest) {
11fdf7f2
TL
5360 // read null version, and the head object only have olh info
5361 s->exists = false;
5362 return -ENOENT;
7c673cae
FG
5363 }
5364 }
5365
5366 return 0;
5367}
5368
5369int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9f95a23c 5370 bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5371{
5372 int ret;
5373
5374 do {
9f95a23c 5375 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, y, assume_noent);
7c673cae
FG
5376 } while (ret == -EAGAIN);
5377
5378 return ret;
5379}
5380
9f95a23c 5381int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest, optional_yield y)
7c673cae
FG
5382{
5383 RGWObjState *astate;
9f95a23c 5384 int r = get_state(&astate, true, y);
7c673cae
FG
5385 if (r < 0) {
5386 return r;
5387 }
5388
9f95a23c 5389 *pmanifest = &(*astate->manifest);
7c673cae
FG
5390
5391 return 0;
5392}
5393
9f95a23c 5394int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest, optional_yield y)
7c673cae
FG
5395{
5396 RGWObjState *state;
9f95a23c 5397 int r = source->get_state(&state, true, y);
7c673cae
FG
5398 if (r < 0)
5399 return r;
5400 if (!state->exists)
5401 return -ENOENT;
5402 if (!state->get_attr(name, dest))
5403 return -ENODATA;
5404
5405 return 0;
5406}
5407
7c673cae
FG
5408int RGWRados::Object::Stat::stat_async()
5409{
5410 RGWObjectCtx& ctx = source->get_ctx();
5411 rgw_obj& obj = source->get_obj();
5412 RGWRados *store = source->get_store();
5413
11fdf7f2 5414 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
7c673cae
FG
5415 result.obj = obj;
5416 if (s->has_attrs) {
5417 state.ret = 0;
5418 result.size = s->size;
5419 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5420 result.attrs = s->attrset;
7c673cae
FG
5421 result.manifest = s->manifest;
5422 return 0;
5423 }
5424
5425 string oid;
5426 string loc;
5427 get_obj_bucket_and_oid_loc(obj, oid, loc);
5428
5429 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
5430 if (r < 0) {
5431 return r;
5432 }
5433
5434 librados::ObjectReadOperation op;
5435 op.stat2(&result.size, &result.mtime, NULL);
5436 op.getxattrs(&result.attrs, NULL);
9f95a23c 5437 state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
7c673cae
FG
5438 state.io_ctx.locator_set_key(loc);
5439 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5440 if (r < 0) {
5441 ldout(store->ctx(), 5) << __func__
5442 << ": ERROR: aio_operate() returned ret=" << r
5443 << dendl;
5444 return r;
5445 }
5446
5447 return 0;
5448}
5449
5450
5451int RGWRados::Object::Stat::wait()
5452{
5453 if (!state.completion) {
5454 return state.ret;
5455 }
5456
9f95a23c 5457 state.completion->wait_for_complete();
7c673cae
FG
5458 state.ret = state.completion->get_return_value();
5459 state.completion->release();
5460
5461 if (state.ret != 0) {
5462 return state.ret;
5463 }
5464
5465 return finish();
5466}
5467
5468int RGWRados::Object::Stat::finish()
5469{
5470 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5471 if (iter != result.attrs.end()) {
5472 bufferlist& bl = iter->second;
11fdf7f2 5473 auto biter = bl.cbegin();
7c673cae 5474 try {
9f95a23c
TL
5475 result.manifest.emplace();
5476 decode(*result.manifest, biter);
7c673cae
FG
5477 } catch (buffer::error& err) {
5478 RGWRados *store = source->get_store();
5479 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
5480 return -EIO;
5481 }
7c673cae
FG
5482 }
5483
5484 return 0;
5485}
5486
7c673cae
FG
5487int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
5488 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5489 ObjectOperation& op, RGWObjState **pstate, optional_yield y)
7c673cae
FG
5490{
5491 if (!rctx)
5492 return 0;
5493
9f95a23c 5494 int r = get_obj_state(rctx, bucket_info, obj, pstate, false, y);
7c673cae
FG
5495 if (r < 0)
5496 return r;
5497
11fdf7f2
TL
5498 return append_atomic_test(*pstate, op);
5499}
7c673cae 5500
11fdf7f2
TL
5501int RGWRados::append_atomic_test(const RGWObjState* state,
5502 librados::ObjectOperation& op)
5503{
7c673cae 5504 if (!state->is_atomic) {
11fdf7f2 5505 ldout(cct, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
7c673cae
FG
5506 return 0;
5507 }
5508
5509 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
5510 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5511 } else {
5512 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
5513 }
5514 return 0;
5515}
5516
9f95a23c 5517int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae 5518{
9f95a23c 5519 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, y, assume_noent);
7c673cae
FG
5520}
5521
5522void RGWRados::Object::invalidate_state()
5523{
11fdf7f2 5524 ctx.invalidate(obj);
7c673cae
FG
5525}
5526
5527int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb 5528 const char *if_match, const char *if_nomatch, bool removal_op,
9f95a23c 5529 bool modify_tail, optional_yield y)
7c673cae 5530{
9f95a23c 5531 int r = get_state(&state, false, y);
7c673cae
FG
5532 if (r < 0)
5533 return r;
5534
9f95a23c 5535 bool need_guard = ((state->manifest) || (state->obj_tag.length() != 0) ||
7c673cae
FG
5536 if_match != NULL || if_nomatch != NULL) &&
5537 (!state->fake_tag);
5538
5539 if (!state->is_atomic) {
5540 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
5541
5542 if (reset_obj) {
5543 op.create(false);
5544 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
5545 }
5546
5547 return 0;
5548 }
5549
5550 if (need_guard) {
5551 /* first verify that the object wasn't replaced under */
5552 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
5553 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5554 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5555 }
5556
5557 if (if_match) {
5558 if (strcmp(if_match, "*") == 0) {
5559 // test the object is existing
5560 if (!state->exists) {
5561 return -ERR_PRECONDITION_FAILED;
5562 }
5563 } else {
5564 bufferlist bl;
5565 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5566 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
5567 return -ERR_PRECONDITION_FAILED;
5568 }
5569 }
5570 }
5571
5572 if (if_nomatch) {
5573 if (strcmp(if_nomatch, "*") == 0) {
5574 // test the object is NOT existing
5575 if (state->exists) {
5576 return -ERR_PRECONDITION_FAILED;
5577 }
5578 } else {
5579 bufferlist bl;
5580 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5581 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
5582 return -ERR_PRECONDITION_FAILED;
5583 }
5584 }
5585 }
5586 }
5587
5588 if (reset_obj) {
5589 if (state->exists) {
5590 op.create(false);
5591 store->remove_rgw_head_obj(op);
5592 } else {
5593 op.create(true);
5594 }
5595 }
5596
5597 if (removal_op) {
5598 /* the object is being removed, no need to update its tag */
5599 return 0;
5600 }
5601
5602 if (ptag) {
5603 state->write_tag = *ptag;
5604 } else {
5605 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
5606 }
5607 bufferlist bl;
5608 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
5609
5610 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
5611
5612 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
5613 if (modify_tail) {
5614 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
5615 }
7c673cae
FG
5616
5617 return 0;
5618}
5619
7c673cae
FG
5620/**
5621 * Set an attr on an object.
5622 * bucket: name of the bucket holding the object
5623 * obj: name of the object to set the attr on
5624 * name: the attr to set
5625 * bl: the contents of the attr
5626 * Returns: 0 on success, -ERR# otherwise.
5627 */
5628int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
5629{
5630 map<string, bufferlist> attrs;
5631 attrs[name] = bl;
9f95a23c 5632 return set_attrs(ctx, bucket_info, obj, attrs, NULL, null_yield);
7c673cae
FG
5633}
5634
494da23a 5635int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
7c673cae 5636 map<string, bufferlist>& attrs,
9f95a23c
TL
5637 map<string, bufferlist>* rmattrs,
5638 optional_yield y)
7c673cae 5639{
494da23a
TL
5640 rgw_obj obj = src_obj;
5641 if (obj.key.instance == "null") {
5642 obj.key.instance.clear();
5643 }
5644
7c673cae
FG
5645 rgw_rados_ref ref;
5646 int r = get_obj_head_ref(bucket_info, obj, &ref);
5647 if (r < 0) {
5648 return r;
5649 }
5650 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5651
5652 ObjectWriteOperation op;
5653 RGWObjState *state = NULL;
5654
9f95a23c 5655 r = append_atomic_test(rctx, bucket_info, obj, op, &state, y);
7c673cae
FG
5656 if (r < 0)
5657 return r;
5658
494da23a 5659 // ensure null version object exist
9f95a23c 5660 if (src_obj.key.instance == "null" && !state->manifest) {
494da23a
TL
5661 return -ENOENT;
5662 }
5663
7c673cae
FG
5664 map<string, bufferlist>::iterator iter;
5665 if (rmattrs) {
5666 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5667 const string& name = iter->first;
5668 op.rmxattr(name.c_str());
5669 }
5670 }
5671
5672 const rgw_bucket& bucket = obj.bucket;
5673
5674 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5675 const string& name = iter->first;
5676 bufferlist& bl = iter->second;
5677
5678 if (!bl.length())
5679 continue;
5680
5681 op.setxattr(name.c_str(), bl);
5682
5683 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
5684 real_time ts;
5685 try {
11fdf7f2 5686 decode(ts, bl);
7c673cae
FG
5687
5688 rgw_obj_index_key obj_key;
5689 obj.key.get_index_key(&obj_key);
5690
9f95a23c 5691 obj_expirer->hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
7c673cae
FG
5692 } catch (buffer::error& err) {
5693 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
5694 }
5695 }
5696 }
5697
5698 if (!op.size())
5699 return 0;
5700
9f95a23c 5701 RGWObjectCtx obj_ctx(this->store);
7c673cae
FG
5702
5703 bufferlist bl;
5704 RGWRados::Bucket bop(this, bucket_info);
5705 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5706
5707 if (state) {
5708 string tag;
5709 append_rand_alpha(cct, tag, tag, 32);
5710 state->write_tag = tag;
9f95a23c 5711 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag, y);
7c673cae
FG
5712
5713 if (r < 0)
5714 return r;
5715
5716 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
5717 op.setxattr(RGW_ATTR_ID_TAG, bl);
5718 }
5719
3efd9988
FG
5720
5721 real_time mtime = real_clock::now();
5722 struct timespec mtime_ts = real_clock::to_timespec(mtime);
5723 op.mtime2(&mtime_ts);
9f95a23c
TL
5724 auto& ioctx = ref.pool.ioctx();
5725 r = rgw_rados_operate(ioctx, ref.obj.oid, &op, null_yield);
7c673cae
FG
5726 if (state) {
5727 if (r >= 0) {
5728 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
5729 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
5730 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
11fdf7f2
TL
5731 string etag = rgw_bl_str(etag_bl);
5732 string content_type = rgw_bl_str(content_type_bl);
5733 string storage_class;
5734 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
5735 if (iter != attrs.end()) {
5736 storage_class = rgw_bl_str(iter->second);
5737 }
9f95a23c
TL
5738 uint64_t epoch = ioctx.get_last_version();
5739 int64_t poolid = ioctx.get_id();
7c673cae 5740 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
11fdf7f2
TL
5741 mtime, etag, content_type, storage_class, &acl_bl,
5742 RGWObjCategory::Main, NULL);
7c673cae
FG
5743 } else {
5744 int ret = index_op.cancel();
5745 if (ret < 0) {
5746 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
5747 }
5748 }
5749 }
5750 if (r < 0)
5751 return r;
5752
5753 if (state) {
5754 state->obj_tag.swap(bl);
5755 if (rmattrs) {
5756 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5757 state->attrset.erase(iter->first);
5758 }
5759 }
92f5a8d4 5760
7c673cae
FG
5761 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5762 state->attrset[iter->first] = iter->second;
5763 }
92f5a8d4
TL
5764
5765 auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
5766 if (iter != state->attrset.end()) {
5767 iter->second = state->obj_tag;
5768 }
7c673cae
FG
5769 }
5770
5771 return 0;
5772}
5773
9f95a23c 5774int RGWRados::Object::Read::prepare(optional_yield y)
7c673cae
FG
5775{
5776 RGWRados *store = source->get_store();
5777 CephContext *cct = store->ctx();
5778
5779 bufferlist etag;
5780
5781 map<string, bufferlist>::iterator iter;
5782
5783 RGWObjState *astate;
9f95a23c 5784 int r = source->get_state(&astate, true, y);
7c673cae
FG
5785 if (r < 0)
5786 return r;
5787
5788 if (!astate->exists) {
5789 return -ENOENT;
5790 }
5791
5792 const RGWBucketInfo& bucket_info = source->get_bucket_info();
5793
5794 state.obj = astate->obj;
5795 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
5796
11fdf7f2
TL
5797 state.cur_pool = state.head_obj.pool;
5798 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
5799
5800 r = store->get_obj_head_ioctx(bucket_info, state.obj, state.cur_ioctx);
7c673cae
FG
5801 if (r < 0) {
5802 return r;
5803 }
eafe8130
TL
5804 if (params.target_obj) {
5805 *params.target_obj = state.obj;
5806 }
7c673cae
FG
5807 if (params.attrs) {
5808 *params.attrs = astate->attrset;
11fdf7f2 5809 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
7c673cae
FG
5810 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
5811 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
5812 }
5813 }
5814 }
5815
5816 /* Convert all times go GMT to make them compatible */
5817 if (conds.mod_ptr || conds.unmod_ptr) {
5818 obj_time_weight src_weight;
5819 src_weight.init(astate);
5820 src_weight.high_precision = conds.high_precision_time;
5821
5822 obj_time_weight dest_weight;
5823 dest_weight.high_precision = conds.high_precision_time;
5824
9f95a23c 5825 if (conds.mod_ptr && !conds.if_nomatch) {
7c673cae
FG
5826 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
5827 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
5828 if (!(dest_weight < src_weight)) {
5829 return -ERR_NOT_MODIFIED;
5830 }
5831 }
5832
9f95a23c 5833 if (conds.unmod_ptr && !conds.if_match) {
7c673cae
FG
5834 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
5835 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
5836 if (dest_weight < src_weight) {
5837 return -ERR_PRECONDITION_FAILED;
5838 }
5839 }
5840 }
5841 if (conds.if_match || conds.if_nomatch) {
9f95a23c 5842 r = get_attr(RGW_ATTR_ETAG, etag, y);
7c673cae
FG
5843 if (r < 0)
5844 return r;
5845
5846 if (conds.if_match) {
5847 string if_match_str = rgw_string_unquote(conds.if_match);
11fdf7f2
TL
5848 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
5849 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
7c673cae
FG
5850 return -ERR_PRECONDITION_FAILED;
5851 }
5852 }
5853
5854 if (conds.if_nomatch) {
5855 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
11fdf7f2
TL
5856 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
5857 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
7c673cae
FG
5858 return -ERR_NOT_MODIFIED;
5859 }
5860 }
5861 }
5862
5863 if (params.obj_size)
5864 *params.obj_size = astate->size;
5865 if (params.lastmod)
5866 *params.lastmod = astate->mtime;
5867
5868 return 0;
5869}
5870
5871int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
5872{
5873 if (ofs < 0) {
5874 ofs += obj_size;
11fdf7f2
TL
5875 if (ofs < 0)
5876 ofs = 0;
5877 end = obj_size - 1;
5878 } else if (end < 0) {
5879 end = obj_size - 1;
7c673cae
FG
5880 }
5881
11fdf7f2
TL
5882 if (obj_size > 0) {
5883 if (ofs >= (off_t)obj_size) {
5884 return -ERANGE;
5885 }
5886 if (end >= (off_t)obj_size) {
5887 end = obj_size - 1;
7c673cae
FG
5888 }
5889 }
7c673cae
FG
5890 return 0;
5891}
5892
31f18b77
FG
5893int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
5894{
5895 RGWRados *store = target->get_store();
5896 BucketShard *bs;
5897 int r;
5898
5899#define NUM_RESHARD_RETRIES 10
5900 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
5901 int ret = get_bucket_shard(&bs);
5902 if (ret < 0) {
5903 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
5904 return ret;
5905 }
5906 r = call(bs);
5907 if (r != -ERR_BUSY_RESHARDING) {
5908 break;
5909 }
5910 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
5911 string new_bucket_id;
11fdf7f2
TL
5912 r = store->block_while_resharding(bs, &new_bucket_id,
5913 target->bucket_info, null_yield);
31f18b77
FG
5914 if (r == -ERR_BUSY_RESHARDING) {
5915 continue;
5916 }
5917 if (r < 0) {
5918 return r;
5919 }
5920 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
5921 i = 0; /* resharding is finished, make sure we can retry */
5922 r = target->update_bucket_id(new_bucket_id);
5923 if (r < 0) {
5924 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
5925 return r;
5926 }
5927 invalidate_bs();
81eedcae 5928 } // for loop
31f18b77
FG
5929
5930 if (r < 0) {
5931 return r;
5932 }
5933
5934 if (pbs) {
5935 *pbs = bs;
5936 }
5937
5938 return 0;
5939}
5940
9f95a23c 5941int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag, optional_yield y)
7c673cae
FG
5942{
5943 if (blind) {
5944 return 0;
5945 }
5946 RGWRados *store = target->get_store();
7c673cae
FG
5947
5948 if (write_tag && write_tag->length()) {
5949 optag = string(write_tag->c_str(), write_tag->length());
5950 } else {
5951 if (optag.empty()) {
5952 append_rand_alpha(store->ctx(), optag, optag, 32);
5953 }
5954 }
5955
f64942e4 5956 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
9f95a23c 5957 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, y, zones_trace);
f64942e4 5958 });
31f18b77 5959
7c673cae
FG
5960 if (r < 0) {
5961 return r;
5962 }
5963 prepared = true;
31f18b77 5964
7c673cae
FG
5965 return 0;
5966}
5967
5968int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
5969 uint64_t size, uint64_t accounted_size,
5970 ceph::real_time& ut, const string& etag,
11fdf7f2 5971 const string& content_type, const string& storage_class,
7c673cae
FG
5972 bufferlist *acl_bl,
5973 RGWObjCategory category,
11fdf7f2
TL
5974 list<rgw_obj_index_key> *remove_objs, const string *user_data,
5975 bool appendable)
7c673cae
FG
5976{
5977 if (blind) {
5978 return 0;
5979 }
5980 RGWRados *store = target->get_store();
5981 BucketShard *bs;
31f18b77 5982
7c673cae
FG
5983 int ret = get_bucket_shard(&bs);
5984 if (ret < 0) {
5985 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
5986 return ret;
5987 }
5988
5989 rgw_bucket_dir_entry ent;
5990 obj.key.get_index_key(&ent.key);
5991 ent.meta.size = size;
5992 ent.meta.accounted_size = accounted_size;
5993 ent.meta.mtime = ut;
5994 ent.meta.etag = etag;
11fdf7f2 5995 ent.meta.storage_class = storage_class;
7c673cae
FG
5996 if (user_data)
5997 ent.meta.user_data = *user_data;
5998
5999 ACLOwner owner;
6000 if (acl_bl && acl_bl->length()) {
6001 int ret = store->decode_policy(*acl_bl, &owner);
6002 if (ret < 0) {
6003 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
6004 }
6005 }
6006 ent.meta.owner = owner.get_id().to_str();
6007 ent.meta.owner_display_name = owner.get_display_name();
6008 ent.meta.content_type = content_type;
11fdf7f2 6009 ent.meta.appendable = appendable;
7c673cae 6010
31f18b77 6011 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 6012
9f95a23c
TL
6013 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6014 if (r < 0) {
6015 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6016 }
6017
6018 return ret;
6019}
6020
6021int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
6022 real_time& removed_mtime,
6023 list<rgw_obj_index_key> *remove_objs)
6024{
6025 if (blind) {
6026 return 0;
6027 }
6028 RGWRados *store = target->get_store();
6029 BucketShard *bs;
31f18b77 6030
7c673cae
FG
6031 int ret = get_bucket_shard(&bs);
6032 if (ret < 0) {
6033 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6034 return ret;
6035 }
6036
31f18b77 6037 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 6038
9f95a23c
TL
6039 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6040 if (r < 0) {
6041 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6042 }
6043
6044 return ret;
6045}
6046
6047
6048int RGWRados::Bucket::UpdateIndex::cancel()
6049{
6050 if (blind) {
6051 return 0;
6052 }
6053 RGWRados *store = target->get_store();
6054 BucketShard *bs;
7c673cae 6055
f64942e4
AA
6056 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
6057 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6058 });
7c673cae
FG
6059
6060 /*
6061 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6062 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6063 * have no way to tell that they're all caught up
6064 */
9f95a23c
TL
6065 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6066 if (r < 0) {
6067 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6068 }
6069
6070 return ret;
6071}
6072
9f95a23c 6073int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y)
7c673cae
FG
6074{
6075 RGWRados *store = source->get_store();
6076 CephContext *cct = store->ctx();
6077
7c673cae
FG
6078 rgw_raw_obj read_obj;
6079 uint64_t read_ofs = ofs;
6080 uint64_t len, read_len;
6081 bool reading_from_head = true;
6082 ObjectReadOperation op;
6083
6084 bool merge_bl = false;
6085 bufferlist *pbl = &bl;
6086 bufferlist read_bl;
6087 uint64_t max_chunk_size;
6088
6089 RGWObjState *astate;
9f95a23c 6090 int r = source->get_state(&astate, true, y);
7c673cae
FG
6091 if (r < 0)
6092 return r;
6093
11fdf7f2
TL
6094 if (astate->size == 0) {
6095 end = 0;
6096 } else if (end >= (int64_t)astate->size) {
6097 end = astate->size - 1;
6098 }
6099
7c673cae
FG
6100 if (end < 0)
6101 len = 0;
6102 else
6103 len = end - ofs + 1;
6104
9f95a23c 6105 if (astate->manifest && astate->manifest->has_tail()) {
7c673cae 6106 /* now get the relevant object part */
9f95a23c 6107 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(ofs);
7c673cae
FG
6108
6109 uint64_t stripe_ofs = iter.get_stripe_ofs();
6110 read_obj = iter.get_location().get_raw_obj(store);
11fdf7f2 6111 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6112 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6113 reading_from_head = (read_obj == state.head_obj);
6114 } else {
6115 read_obj = state.head_obj;
6116 }
6117
6118 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
6119 if (r < 0) {
6120 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
6121 return r;
6122 }
6123
6124 if (len > max_chunk_size)
6125 len = max_chunk_size;
6126
6127
7c673cae
FG
6128 read_len = len;
6129
6130 if (reading_from_head) {
6131 /* only when reading from the head object do we need to do the atomic test */
9f95a23c 6132 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, y);
7c673cae
FG
6133 if (r < 0)
6134 return r;
6135
6136 if (astate && astate->prefetch_data) {
6137 if (!ofs && astate->data.length() >= len) {
6138 bl = astate->data;
6139 return bl.length();
6140 }
6141
6142 if (ofs < astate->data.length()) {
11fdf7f2 6143 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
9f95a23c 6144 astate->data.begin(ofs).copy(copy_len, bl);
7c673cae
FG
6145 read_len -= copy_len;
6146 read_ofs += copy_len;
6147 if (!read_len)
6148 return bl.length();
6149
6150 merge_bl = true;
6151 pbl = &read_bl;
6152 }
6153 }
6154 }
6155
6156 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
6157 op.read(read_ofs, read_len, pbl, NULL);
6158
11fdf7f2
TL
6159 if (state.cur_pool != read_obj.pool) {
6160 auto iter = state.io_ctxs.find(read_obj.pool);
6161 if (iter == state.io_ctxs.end()) {
6162 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
494da23a 6163 r = store->open_pool_ctx(read_obj.pool, *state.cur_ioctx, false);
11fdf7f2
TL
6164 if (r < 0) {
6165 ldout(cct, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
6166 return r;
6167 }
6168 } else {
6169 state.cur_ioctx = &iter->second;
7c673cae 6170 }
11fdf7f2 6171 state.cur_pool = read_obj.pool;
7c673cae
FG
6172 }
6173
11fdf7f2 6174 state.cur_ioctx->locator_set_key(read_obj.loc);
7c673cae 6175
11fdf7f2
TL
6176 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
6177 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
7c673cae 6178
7c673cae 6179 if (r < 0) {
7c673cae
FG
6180 return r;
6181 }
7c673cae 6182
11fdf7f2
TL
6183 if (merge_bl) {
6184 bl.append(read_bl);
7c673cae
FG
6185 }
6186
7c673cae
FG
6187 return bl.length();
6188}
6189
11fdf7f2
TL
6190struct get_obj_data {
6191 RGWRados* store;
6192 RGWGetDataCB* client_cb;
6193 rgw::Aio* aio;
6194 uint64_t offset; // next offset to write to client
6195 rgw::AioResultList completed; // completed read results, sorted by offset
9f95a23c 6196 optional_yield yield;
7c673cae 6197
9f95a23c
TL
6198 get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio,
6199 uint64_t offset, optional_yield yield)
6200 : store(store), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
7c673cae 6201
11fdf7f2
TL
6202 int flush(rgw::AioResultList&& results) {
6203 int r = rgw::check_for_errors(results);
6204 if (r < 0) {
6205 return r;
7c673cae 6206 }
7c673cae 6207
11fdf7f2
TL
6208 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6209 results.sort(cmp); // merge() requires results to be sorted first
6210 completed.merge(results, cmp); // merge results in sorted order
7c673cae 6211
11fdf7f2
TL
6212 while (!completed.empty() && completed.front().id == offset) {
6213 auto bl = std::move(completed.front().data);
6214 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
7c673cae 6215
11fdf7f2
TL
6216 offset += bl.length();
6217 int r = client_cb->handle_data(bl, 0, bl.length());
6218 if (r < 0) {
6219 return r;
6220 }
7c673cae 6221 }
11fdf7f2 6222 return 0;
7c673cae
FG
6223 }
6224
11fdf7f2
TL
6225 void cancel() {
6226 // wait for all completions to drain and ignore the results
6227 aio->drain();
7c673cae
FG
6228 }
6229
11fdf7f2
TL
6230 int drain() {
6231 auto c = aio->wait();
6232 while (!c.empty()) {
6233 int r = flush(std::move(c));
7c673cae 6234 if (r < 0) {
11fdf7f2 6235 cancel();
7c673cae
FG
6236 return r;
6237 }
11fdf7f2 6238 c = aio->wait();
7c673cae 6239 }
11fdf7f2 6240 return flush(std::move(c));
7c673cae
FG
6241 }
6242};
6243
11fdf7f2
TL
6244static int _get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6245 off_t read_ofs, off_t len, bool is_head_obj,
6246 RGWObjState *astate, void *arg)
7c673cae
FG
6247{
6248 struct get_obj_data *d = (struct get_obj_data *)arg;
6249
11fdf7f2
TL
6250 return d->store->get_obj_iterate_cb(read_obj, obj_ofs, read_ofs, len,
6251 is_head_obj, astate, arg);
7c673cae
FG
6252}
6253
11fdf7f2
TL
6254int RGWRados::get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6255 off_t read_ofs, off_t len, bool is_head_obj,
6256 RGWObjState *astate, void *arg)
7c673cae 6257{
7c673cae
FG
6258 ObjectReadOperation op;
6259 struct get_obj_data *d = (struct get_obj_data *)arg;
6260 string oid, key;
7c673cae
FG
6261
6262 if (is_head_obj) {
6263 /* only when reading from the head object do we need to do the atomic test */
11fdf7f2 6264 int r = append_atomic_test(astate, op);
7c673cae
FG
6265 if (r < 0)
6266 return r;
6267
6268 if (astate &&
6269 obj_ofs < astate->data.length()) {
11fdf7f2 6270 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
7c673cae 6271
7c673cae 6272 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
7c673cae
FG
6273 if (r < 0)
6274 return r;
6275
7c673cae 6276 len -= chunk_len;
11fdf7f2 6277 d->offset += chunk_len;
7c673cae
FG
6278 read_ofs += chunk_len;
6279 obj_ofs += chunk_len;
6280 if (!len)
6281 return 0;
6282 }
6283 }
6284
11fdf7f2
TL
6285 auto obj = d->store->svc.rados->obj(read_obj);
6286 int r = obj.open();
7c673cae 6287 if (r < 0) {
11fdf7f2
TL
6288 ldout(cct, 4) << "failed to open rados context for " << read_obj << dendl;
6289 return r;
7c673cae
FG
6290 }
6291
11fdf7f2
TL
6292 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
6293 op.read(read_ofs, len, nullptr, nullptr);
7c673cae 6294
11fdf7f2
TL
6295 const uint64_t cost = len;
6296 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
7c673cae 6297
9f95a23c 6298 auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
7c673cae 6299
11fdf7f2 6300 return d->flush(std::move(completed));
7c673cae
FG
6301}
6302
9f95a23c
TL
6303int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb,
6304 optional_yield y)
7c673cae
FG
6305{
6306 RGWRados *store = source->get_store();
6307 CephContext *cct = store->ctx();
7c673cae 6308 RGWObjectCtx& obj_ctx = source->get_ctx();
11fdf7f2
TL
6309 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6310 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
7c673cae 6311
9f95a23c
TL
6312 auto aio = rgw::make_throttle(window_size, y);
6313 get_obj_data data(store, cb, &*aio, ofs, y);
7c673cae 6314
11fdf7f2 6315 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj,
9f95a23c 6316 ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
7c673cae 6317 if (r < 0) {
11fdf7f2
TL
6318 ldout(cct, 0) << "iterate_obj() failed with " << r << dendl;
6319 data.cancel(); // drain completions without writing back to client
6320 return r;
7c673cae
FG
6321 }
6322
11fdf7f2 6323 return data.drain();
7c673cae
FG
6324}
6325
6326int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
6327 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11fdf7f2 6328 off_t ofs, off_t end, uint64_t max_chunk_size,
9f95a23c 6329 iterate_obj_cb cb, void *arg, optional_yield y)
7c673cae
FG
6330{
6331 rgw_raw_obj head_obj;
6332 rgw_raw_obj read_obj;
6333 uint64_t read_ofs = ofs;
6334 uint64_t len;
6335 bool reading_from_head = true;
6336 RGWObjState *astate = NULL;
6337
6338 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6339
9f95a23c 6340 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
6341 if (r < 0) {
6342 return r;
6343 }
6344
6345 if (end < 0)
6346 len = 0;
6347 else
6348 len = end - ofs + 1;
6349
9f95a23c 6350 if (astate->manifest) {
7c673cae 6351 /* now get the relevant object stripe */
9f95a23c 6352 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(ofs);
7c673cae 6353
9f95a23c 6354 RGWObjManifest::obj_iterator obj_end = astate->manifest->obj_end();
7c673cae
FG
6355
6356 for (; iter != obj_end && ofs <= end; ++iter) {
6357 off_t stripe_ofs = iter.get_stripe_ofs();
6358 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6359
6360 while (ofs < next_stripe_ofs && ofs <= end) {
6361 read_obj = iter.get_location().get_raw_obj(this);
11fdf7f2 6362 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6363 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6364
6365 if (read_len > max_chunk_size) {
6366 read_len = max_chunk_size;
6367 }
6368
6369 reading_from_head = (read_obj == head_obj);
11fdf7f2 6370 r = cb(read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6371 if (r < 0) {
6372 return r;
6373 }
6374
6375 len -= read_len;
6376 ofs += read_len;
6377 }
6378 }
6379 } else {
6380 while (ofs <= end) {
6381 read_obj = head_obj;
11fdf7f2 6382 uint64_t read_len = std::min(len, max_chunk_size);
7c673cae 6383
11fdf7f2 6384 r = cb(read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6385 if (r < 0) {
6386 return r;
6387 }
6388
6389 len -= read_len;
6390 ofs += read_len;
6391 }
6392 }
6393
6394 return 0;
6395}
6396
6397int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
6398{
6399 rgw_rados_ref ref;
6400 int r = get_obj_head_ref(bucket_info, obj, &ref);
6401 if (r < 0) {
6402 return r;
6403 }
6404
9f95a23c 6405 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, op, null_yield);
7c673cae
FG
6406}
6407
6408int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
6409{
6410 rgw_rados_ref ref;
6411 int r = get_obj_head_ref(bucket_info, obj, &ref);
6412 if (r < 0) {
6413 return r;
6414 }
6415
6416 bufferlist outbl;
6417
9f95a23c 6418 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
7c673cae
FG
6419}
6420
6421int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
6422{
6423 ObjectWriteOperation op;
6424
11fdf7f2 6425 ceph_assert(olh_obj.key.instance.empty());
7c673cae
FG
6426
6427 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6428
6429 if (!state.exists) {
6430 op.create(true);
6431 } else {
6432 op.assert_exists();
b32b8144
FG
6433 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6434 op.mtime2(&mtime_ts);
7c673cae
FG
6435 }
6436
6437 /*
6438 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6439 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6440 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6441 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6442 * log will reflect that.
6443 *
6444 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6445 * is used for object data instance, olh_tag for olh instance.
6446 */
6447 if (has_tag) {
6448 /* guard against racing writes */
6449 bucket_index_guard_olh_op(state, op);
6450 }
6451
6452 if (!has_tag) {
6453 /* obj tag */
9f95a23c 6454 string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6455
7c673cae
FG
6456 bufferlist bl;
6457 bl.append(obj_tag.c_str(), obj_tag.size());
6458 op.setxattr(RGW_ATTR_ID_TAG, bl);
6459
6460 state.attrset[RGW_ATTR_ID_TAG] = bl;
6461 state.obj_tag = bl;
6462
6463 /* olh tag */
9f95a23c 6464 string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6465
7c673cae
FG
6466 bufferlist olh_bl;
6467 olh_bl.append(olh_tag.c_str(), olh_tag.size());
6468 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
6469
6470 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
6471 state.olh_tag = olh_bl;
6472 state.is_olh = true;
6473
6474 bufferlist verbl;
6475 op.setxattr(RGW_ATTR_OLH_VER, verbl);
6476 }
6477
6478 bufferlist bl;
6479 RGWOLHPendingInfo pending_info;
6480 pending_info.time = real_clock::now();
11fdf7f2 6481 encode(pending_info, bl);
7c673cae
FG
6482
6483#define OLH_PENDING_TAG_LEN 32
6484 /* tag will start with current time epoch, this so that entries are sorted by time */
6485 char buf[32];
6486 utime_t ut(pending_info.time);
6487 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
6488 *op_tag = buf;
6489
9f95a23c 6490 string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
11fdf7f2 6491
7c673cae
FG
6492 op_tag->append(s);
6493
6494 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
6495 attr_name.append(*op_tag);
6496
6497 op.setxattr(attr_name.c_str(), bl);
6498
11fdf7f2 6499 int ret = obj_operate(bucket_info, olh_obj, &op);
7c673cae
FG
6500 if (ret < 0) {
6501 return ret;
6502 }
6503
6504 state.exists = true;
6505 state.attrset[attr_name] = bl;
6506
6507 return 0;
6508}
6509
6510int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
6511{
6512 int ret;
6513
6514 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
6515 if (ret == -EEXIST) {
6516 ret = -ECANCELED;
6517 }
6518
6519 return ret;
6520}
6521
f64942e4
AA
6522int RGWRados::guard_reshard(BucketShard *bs,
6523 const rgw_obj& obj_instance,
6524 const RGWBucketInfo& bucket_info,
6525 std::function<int(BucketShard *)> call)
31f18b77
FG
6526{
6527 rgw_obj obj;
6528 const rgw_obj *pobj = &obj_instance;
6529 int r;
6530
6531 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
f64942e4 6532 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
31f18b77
FG
6533 if (r < 0) {
6534 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
6535 return r;
6536 }
6537 r = call(bs);
6538 if (r != -ERR_BUSY_RESHARDING) {
6539 break;
6540 }
6541 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6542 string new_bucket_id;
11fdf7f2 6543 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield);
31f18b77
FG
6544 if (r == -ERR_BUSY_RESHARDING) {
6545 continue;
6546 }
6547 if (r < 0) {
6548 return r;
6549 }
6550 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6551 i = 0; /* resharding is finished, make sure we can retry */
6552
6553 obj = *pobj;
6554 obj.bucket.update_bucket_id(new_bucket_id);
6555 pobj = &obj;
81eedcae 6556 } // for loop
31f18b77
FG
6557
6558 if (r < 0) {
6559 return r;
6560 }
6561
6562 return 0;
6563}
6564
f64942e4
AA
6565int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
6566 string *new_bucket_id,
11fdf7f2
TL
6567 const RGWBucketInfo& bucket_info,
6568 optional_yield y)
31f18b77 6569{
11fdf7f2
TL
6570 int ret = 0;
6571 cls_rgw_bucket_instance_entry entry;
6572
81eedcae
TL
6573 // since we want to run this recovery code from two distinct places,
6574 // let's just put it in a lambda so we can easily re-use; if the
6575 // lambda successfully fetches a new bucket id, it sets
6576 // new_bucket_id and returns 0, otherwise it returns a negative
6577 // error code
6578 auto fetch_new_bucket_id =
9f95a23c
TL
6579 [this, &bucket_info](const std::string& log_tag,
6580 std::string* new_bucket_id) -> int {
81eedcae
TL
6581 RGWBucketInfo fresh_bucket_info = bucket_info;
6582 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr);
6583 if (ret < 0) {
6584 ldout(cct, 0) << __func__ <<
6585 " ERROR: failed to refresh bucket info after reshard at " <<
6586 log_tag << ": " << cpp_strerror(-ret) << dendl;
6587 return ret;
6588 }
6589 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
6590 return 0;
6591 };
6592
6593 constexpr int num_retries = 10;
6594 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
9f95a23c
TL
6595 auto& ref = bs->bucket_obj.get_ref();
6596 ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
81eedcae
TL
6597 if (ret == -ENOENT) {
6598 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
6599 } else if (ret < 0) {
6600 ldout(cct, 0) << __func__ <<
6601 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
6602 dendl;
11fdf7f2
TL
6603 return ret;
6604 }
81eedcae 6605
11fdf7f2 6606 if (!entry.resharding_in_progress()) {
81eedcae
TL
6607 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
6608 new_bucket_id);
11fdf7f2 6609 }
31f18b77 6610
81eedcae
TL
6611 ldout(cct, 20) << "NOTICE: reshard still in progress; " <<
6612 (i < num_retries ? "retrying" : "too many retries") << dendl;
6613
6614 if (i == num_retries) {
11fdf7f2
TL
6615 break;
6616 }
6617
6618 // If bucket is erroneously marked as resharding (e.g., crash or
6619 // other error) then fix it. If we can take the bucket reshard
6620 // lock then it means no other resharding should be taking place,
6621 // and we're free to clear the flags.
6622 {
6623 // since we expect to do this rarely, we'll do our work in a
6624 // block and erase our work after each try
6625
9f95a23c 6626 RGWObjectCtx obj_ctx(this->store);
11fdf7f2
TL
6627 const rgw_bucket& b = bs->bucket;
6628 std::string bucket_id = b.get_key();
9f95a23c 6629 RGWBucketReshardLock reshard_lock(this->store, bucket_info, true);
11fdf7f2
TL
6630 ret = reshard_lock.lock();
6631 if (ret < 0) {
6632 ldout(cct, 20) << __func__ <<
6633 " INFO: failed to take reshard lock for bucket " <<
6634 bucket_id << "; expected if resharding underway" << dendl;
6635 } else {
6636 ldout(cct, 10) << __func__ <<
6637 " INFO: was able to take reshard lock for bucket " <<
6638 bucket_id << dendl;
9f95a23c 6639 ret = RGWBucketReshard::clear_resharding(this->store, bucket_info);
11fdf7f2
TL
6640 if (ret < 0) {
6641 reshard_lock.unlock();
6642 ldout(cct, 0) << __func__ <<
6643 " ERROR: failed to clear resharding flags for bucket " <<
6644 bucket_id << dendl;
6645 } else {
6646 reshard_lock.unlock();
6647 ldout(cct, 5) << __func__ <<
6648 " INFO: apparently successfully cleared resharding flags for "
6649 "bucket " << bucket_id << dendl;
6650 continue; // if we apparently succeed immediately test again
6651 } // if clear resharding succeeded
6652 } // if taking of lock succeeded
6653 } // block to encapsulate recovery from incomplete reshard
6654
6655 ret = reshard_wait->wait(y);
6656 if (ret < 0) {
81eedcae
TL
6657 ldout(cct, 0) << __func__ <<
6658 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2
TL
6659 return ret;
6660 }
81eedcae
TL
6661 } // for loop
6662
6663 ldout(cct, 0) << __func__ <<
6664 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2 6665 return -ERR_BUSY_RESHARDING;
31f18b77
FG
6666}
6667
7c673cae
FG
6668int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
6669 bool delete_marker,
6670 const string& op_tag,
6671 struct rgw_bucket_dir_entry_meta *meta,
6672 uint64_t olh_epoch,
91327a77
AA
6673 real_time unmod_since, bool high_precision_time,
6674 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
6675{
6676 rgw_rados_ref ref;
6677 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6678 if (r < 0) {
6679 return r;
6680 }
6681
31f18b77
FG
6682 rgw_zone_set zones_trace;
6683 if (_zones_trace) {
6684 zones_trace = *_zones_trace;
7c673cae 6685 }
9f95a23c 6686 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
7c673cae 6687
31f18b77
FG
6688 BucketShard bs(this);
6689
f64942e4
AA
6690 r = guard_reshard(&bs, obj_instance, bucket_info,
6691 [&](BucketShard *bs) -> int {
9f95a23c
TL
6692 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
6693 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6694 librados::ObjectWriteOperation op;
6695 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6696 cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
6697 delete_marker, op_tag, meta, olh_epoch,
6698 unmod_since, high_precision_time,
6699 svc.zone->get_zone().log_data, zones_trace);
6700 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
6701 });
6702 if (r < 0) {
9f95a23c 6703 ldout(cct, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
31f18b77 6704 return r;
7c673cae
FG
6705 }
6706
9f95a23c
TL
6707 r = svc.datalog_rados->add_entry(bucket_info, bs.shard_id);
6708 if (r < 0) {
6709 ldout(cct, 0) << "ERROR: failed writing data log" << dendl;
91327a77
AA
6710 }
6711
7c673cae
FG
6712 return 0;
6713}
6714
6715void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
6716{
6717 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
6718 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
6719}
6720
6721int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 6722 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
6723{
6724 rgw_rados_ref ref;
6725 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6726 if (r < 0) {
6727 return r;
6728 }
6729
31f18b77
FG
6730 rgw_zone_set zones_trace;
6731 if (_zones_trace) {
6732 zones_trace = *_zones_trace;
7c673cae 6733 }
9f95a23c 6734 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
31f18b77
FG
6735
6736 BucketShard bs(this);
7c673cae
FG
6737
6738 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
6739 r = guard_reshard(&bs, obj_instance, bucket_info,
6740 [&](BucketShard *bs) -> int {
9f95a23c 6741 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6742 librados::ObjectWriteOperation op;
6743 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6744 cls_rgw_bucket_unlink_instance(op, key, op_tag,
6745 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
6746 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
6747 });
6748 if (r < 0) {
9f95a23c 6749 ldout(cct, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
31f18b77 6750 return r;
7c673cae
FG
6751 }
6752
6753 return 0;
6754}
6755
6756int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
6757 const rgw_obj& obj_instance, uint64_t ver_marker,
6758 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
6759 bool *is_truncated)
6760{
6761 rgw_rados_ref ref;
6762 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6763 if (r < 0) {
6764 return r;
6765 }
6766
6767 BucketShard bs(this);
f64942e4
AA
6768 int ret =
6769 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
6770 if (ret < 0) {
6771 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
6772 return ret;
6773 }
6774
6775 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6776
6777 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6778
f64942e4
AA
6779 ret = guard_reshard(&bs, obj_instance, bucket_info,
6780 [&](BucketShard *bs) -> int {
9f95a23c 6781 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6782 ObjectReadOperation op;
6783 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6784
6785 rgw_cls_read_olh_log_ret log_ret;
6786 int op_ret = 0;
6787 cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
6788 bufferlist outbl;
6789 int r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
6790 if (r < 0) {
6791 return r;
6792 }
6793 if (op_ret < 0) {
6794 return op_ret;
6795 }
6796
6797 *log = std::move(log_ret.log);
6798 *is_truncated = log_ret.is_truncated;
6799 return r;
f64942e4 6800 });
31f18b77
FG
6801 if (ret < 0) {
6802 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 6803 return ret;
31f18b77 6804 }
7c673cae
FG
6805
6806 return 0;
6807}
6808
a8e16298
TL
6809// a multisite sync bug resulted in the OLH head attributes being overwritten by
6810// the attributes from another zone, causing link_olh() to fail endlessly due to
6811// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
6812// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
6813int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info,
6814 const rgw_obj& obj)
6815{
6816 // fetch the current olh entry from the bucket index
6817 rgw_bucket_olh_entry olh;
6818 int r = bi_get_olh(bucket_info, obj, &olh);
6819 if (r < 0) {
6820 ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
6821 return r;
6822 }
11fdf7f2 6823 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
a8e16298
TL
6824 return 0;
6825 }
6826
6827 ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag
6828 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
6829
6830 // rewrite OLH_ID_TAG and OLH_INFO from current olh
6831 ObjectWriteOperation op;
6832 // assert this is the same olh tag we think we're fixing
6833 bucket_index_guard_olh_op(*state, op);
6834 // preserve existing mtime
6835 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
6836 op.mtime2(&mtime_ts);
6837 {
6838 bufferlist bl;
6839 bl.append(olh.tag.c_str(), olh.tag.size());
6840 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
6841 }
6842 {
6843 RGWOLHInfo info;
6844 info.target = rgw_obj(bucket_info.bucket, olh.key);
6845 info.removed = olh.delete_marker;
6846 bufferlist bl;
6847 encode(info, bl);
6848 op.setxattr(RGW_ATTR_OLH_INFO, bl);
6849 }
6850 rgw_rados_ref ref;
6851 r = get_obj_head_ref(bucket_info, obj, &ref);
6852 if (r < 0) {
6853 return r;
6854 }
9f95a23c 6855 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
a8e16298
TL
6856 if (r < 0) {
6857 ldout(cct, 0) << "repair_olh failed to write olh attributes with "
6858 << cpp_strerror(r) << dendl;
6859 return r;
6860 }
6861 return 0;
6862}
6863
7c673cae
FG
6864int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
6865{
6866 rgw_rados_ref ref;
6867 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6868 if (r < 0) {
6869 return r;
6870 }
6871
6872 BucketShard bs(this);
f64942e4
AA
6873 int ret =
6874 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
6875 if (ret < 0) {
6876 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
6877 return ret;
6878 }
6879
6880 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6881
6882 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6883
f64942e4
AA
6884 ret = guard_reshard(&bs, obj_instance, bucket_info,
6885 [&](BucketShard *pbs) -> int {
6886 ObjectWriteOperation op;
6887 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
6888 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
9f95a23c 6889 return pbs->bucket_obj.operate(&op, null_yield);
31f18b77
FG
6890 });
6891 if (ret < 0) {
6892 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 6893 return ret;
31f18b77 6894 }
7c673cae
FG
6895
6896 return 0;
6897}
6898
6899int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
6900{
6901 rgw_rados_ref ref;
6902 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6903 if (r < 0) {
6904 return r;
6905 }
6906
6907 BucketShard bs(this);
7c673cae
FG
6908
6909 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6910
6911 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6912
f64942e4
AA
6913 int ret = guard_reshard(&bs, obj_instance, bucket_info,
6914 [&](BucketShard *pbs) -> int {
6915 ObjectWriteOperation op;
9f95a23c 6916 auto& ref = pbs->bucket_obj.get_ref();
f64942e4 6917 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6918 cls_rgw_clear_olh(op, key, olh_tag);
6919 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77 6920 });
7c673cae 6921 if (ret < 0) {
9f95a23c 6922 ldout(cct, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
7c673cae
FG
6923 return ret;
6924 }
6925
6926 return 0;
6927}
6928
92f5a8d4
TL
6929static int decode_olh_info(CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
6930{
6931 try {
6932 auto biter = bl.cbegin();
6933 decode(*olh, biter);
6934 return 0;
6935 } catch (buffer::error& err) {
6936 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
6937 return -EIO;
6938 }
6939}
6940
7c673cae
FG
6941int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6942 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
31f18b77 6943 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7c673cae
FG
6944{
6945 if (log.empty()) {
6946 return 0;
6947 }
6948
6949 librados::ObjectWriteOperation op;
6950
6951 uint64_t last_ver = log.rbegin()->first;
6952 *plast_ver = last_ver;
6953
6954 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
6955
6956 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
92f5a8d4 6957 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
7c673cae 6958
a8e16298
TL
6959 bufferlist ver_bl;
6960 string last_ver_s = to_string(last_ver);
6961 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
6962 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
6963
b32b8144
FG
6964 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6965 op.mtime2(&mtime_ts);
6966
7c673cae 6967 bool need_to_link = false;
92f5a8d4 6968 uint64_t link_epoch = 0;
7c673cae
FG
6969 cls_rgw_obj_key key;
6970 bool delete_marker = false;
6971 list<cls_rgw_obj_key> remove_instances;
6972 bool need_to_remove = false;
6973
92f5a8d4
TL
6974 // decode current epoch and instance
6975 auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
6976 if (olh_ver != state.attrset.end()) {
6977 std::string str = olh_ver->second.to_str();
6978 std::string err;
6979 link_epoch = strict_strtoll(str.c_str(), 10, &err);
6980 }
6981 auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
6982 if (olh_info != state.attrset.end()) {
6983 RGWOLHInfo info;
6984 int r = decode_olh_info(cct, olh_info->second, &info);
6985 if (r < 0) {
6986 return r;
6987 }
6988 info.target.key.get_index_key(&key);
6989 delete_marker = info.removed;
6990 }
6991
7c673cae
FG
6992 for (iter = log.begin(); iter != log.end(); ++iter) {
6993 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
6994 for (; viter != iter->second.end(); ++viter) {
6995 rgw_bucket_olh_log_entry& entry = *viter;
6996
92f5a8d4 6997 ldout(cct, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
7c673cae
FG
6998 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
6999 << (entry.delete_marker ? "(delete)" : "") << dendl;
7000 switch (entry.op) {
7001 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7002 remove_instances.push_back(entry.key);
7003 break;
7004 case CLS_RGW_OLH_OP_LINK_OLH:
92f5a8d4
TL
7005 // only overwrite a link of the same epoch if its key sorts before
7006 if (link_epoch < iter->first || key.instance.empty() ||
7007 key.instance > entry.key.instance) {
7008 ldout(cct, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7009 << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7010 need_to_link = true;
7011 need_to_remove = false;
7012 key = entry.key;
7013 delete_marker = entry.delete_marker;
7014 } else {
7015 ldout(cct, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7016 << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7017 }
7c673cae
FG
7018 break;
7019 case CLS_RGW_OLH_OP_UNLINK_OLH:
7020 need_to_remove = true;
7021 need_to_link = false;
7022 break;
7023 default:
7024 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7025 return -EIO;
7026 }
7027 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7028 attr_name.append(entry.op_tag);
7029 op.rmxattr(attr_name.c_str());
7030 }
7031 }
7032
7033 rgw_rados_ref ref;
7034 int r = get_obj_head_ref(bucket_info, obj, &ref);
7035 if (r < 0) {
7036 return r;
7037 }
7038
7039 const rgw_bucket& bucket = obj.bucket;
7040
7041 if (need_to_link) {
7042 rgw_obj target(bucket, key);
7043 RGWOLHInfo info;
7044 info.target = target;
7045 info.removed = delete_marker;
7046 bufferlist bl;
11fdf7f2 7047 encode(info, bl);
7c673cae
FG
7048 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7049 }
7050
7051 /* first remove object instances */
7052 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7053 liter != remove_instances.end(); ++liter) {
7054 cls_rgw_obj_key& key = *liter;
7055 rgw_obj obj_instance(bucket, key);
31f18b77 7056 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae
FG
7057 if (ret < 0 && ret != -ENOENT) {
7058 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7059 return ret;
7060 }
7061 }
7062
7063 /* update olh object */
9f95a23c 7064 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
7065 if (r == -ECANCELED) {
7066 r = 0;
7067 }
7068 if (r < 0) {
7069 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7070 return r;
7071 }
7072
7073 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
7074 if (r < 0) {
7075 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7076 return r;
7077 }
7078
7079 if (need_to_remove) {
7080 ObjectWriteOperation rm_op;
7081
7082 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
a8e16298 7083 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7c673cae
FG
7084 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7085 rm_op.remove();
7086
9f95a23c 7087 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
7c673cae
FG
7088 if (r == -ECANCELED) {
7089 return 0; /* someone else won this race */
7090 } else {
7091 /*
7092 * only clear if was successful, otherwise we might clobber pending operations on this object
7093 */
7094 r = bucket_index_clear_olh(bucket_info, state, obj);
7095 if (r < 0) {
7096 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7097 return r;
7098 }
7099 }
7100 }
7101
7102 return 0;
7103}
7104
7105/*
7106 * read olh log and apply it
7107 */
31f18b77 7108int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
7109{
7110 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7111 bool is_truncated;
7112 uint64_t ver_marker = 0;
7113
7114 do {
7115 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7116 if (ret < 0) {
7117 return ret;
7118 }
31f18b77 7119 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
7120 if (ret < 0) {
7121 return ret;
7122 }
7123 } while (is_truncated);
7124
7125 return 0;
7126}
7127
9f95a23c 7128int RGWRados::set_olh(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77 7129 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
9f95a23c 7130 optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
7131{
7132 string op_tag;
7133
7134 rgw_obj olh_obj = target_obj;
7135 olh_obj.key.instance.clear();
7136
7137 RGWObjState *state = NULL;
7138
7139 int ret = 0;
7140 int i;
31f18b77 7141
7c673cae
FG
7142#define MAX_ECANCELED_RETRY 100
7143 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7144 if (ret == -ECANCELED) {
11fdf7f2 7145 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7146 }
7147
9f95a23c 7148 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7149 if (ret < 0) {
7150 return ret;
7151 }
7152
7153 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7154 if (ret < 0) {
7155 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7156 if (ret == -ECANCELED) {
7157 continue;
7158 }
7159 return ret;
7160 }
91327a77
AA
7161 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
7162 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7163 zones_trace, log_data_change);
7c673cae
FG
7164 if (ret < 0) {
7165 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7166 if (ret == -ECANCELED) {
a8e16298
TL
7167 // the bucket index rejected the link_olh() due to olh tag mismatch;
7168 // attempt to reconstruct olh head attributes based on the bucket index
7169 int r2 = repair_olh(state, bucket_info, olh_obj);
7170 if (r2 < 0 && r2 != -ECANCELED) {
7171 return r2;
7172 }
7c673cae
FG
7173 continue;
7174 }
7175 return ret;
7176 }
7177 break;
7178 }
7179
7180 if (i == MAX_ECANCELED_RETRY) {
7181 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7182 return -EIO;
7183 }
7184
7185 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7186 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7187 ret = 0;
7188 }
7189 if (ret < 0) {
7190 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7191 return ret;
7192 }
7193
7194 return 0;
7195}
7196
7197int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
9f95a23c 7198 uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
7c673cae
FG
7199{
7200 string op_tag;
7201
7202 rgw_obj olh_obj = target_obj;
7203 olh_obj.key.instance.clear();
7204
7205 RGWObjState *state = NULL;
7206
7207 int ret = 0;
7208 int i;
7209
7210 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7211 if (ret == -ECANCELED) {
11fdf7f2 7212 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7213 }
7214
9f95a23c 7215 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7216 if (ret < 0)
7217 return ret;
7218
7219 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7220 if (ret < 0) {
7221 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7222 if (ret == -ECANCELED) {
7223 continue;
7224 }
7225 return ret;
7226 }
7227
7228 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7229
31f18b77 7230 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae
FG
7231 if (ret < 0) {
7232 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7233 if (ret == -ECANCELED) {
7234 continue;
7235 }
7236 return ret;
7237 }
7238 break;
7239 }
7240
7241 if (i == MAX_ECANCELED_RETRY) {
7242 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7243 return -EIO;
7244 }
7245
31f18b77 7246 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
7247 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7248 return 0;
7249 }
7250 if (ret < 0) {
7251 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7252 return ret;
7253 }
7254
7255 return 0;
7256}
7257
11fdf7f2 7258void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7c673cae
FG
7259{
7260#define OBJ_INSTANCE_LEN 32
7261 char buf[OBJ_INSTANCE_LEN + 1];
7262
7263 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7264 no underscore for instance name due to the way we encode the raw keys */
7265
11fdf7f2 7266 target_key->set_instance(buf);
7c673cae
FG
7267}
7268
11fdf7f2 7269void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7c673cae 7270{
11fdf7f2 7271 gen_rand_obj_instance_name(&target_obj->key);
7c673cae
FG
7272}
7273
7274int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7275{
92f5a8d4 7276 map<string, bufferlist> attrset;
7c673cae
FG
7277
7278 ObjectReadOperation op;
92f5a8d4 7279 op.getxattrs(&attrset, NULL);
7c673cae 7280
7c673cae 7281 int r = obj_operate(bucket_info, obj, &op);
7c673cae
FG
7282 if (r < 0) {
7283 return r;
7284 }
7c673cae 7285
92f5a8d4 7286 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7c673cae
FG
7287 if (iter == attrset.end()) { /* not an olh */
7288 return -EINVAL;
7289 }
7290
92f5a8d4 7291 return decode_olh_info(cct, iter->second, olh);
7c673cae
FG
7292}
7293
7294void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
7295 map<string, bufferlist> *rm_pending_entries)
7296{
7297 map<string, bufferlist>::iterator iter = pending_entries.begin();
7298
7299 real_time now = real_clock::now();
7300
7301 while (iter != pending_entries.end()) {
11fdf7f2 7302 auto biter = iter->second.cbegin();
7c673cae
FG
7303 RGWOLHPendingInfo pending_info;
7304 try {
11fdf7f2 7305 decode(pending_info, biter);
7c673cae
FG
7306 } catch (buffer::error& err) {
7307 /* skipping bad entry, we could remove it but it might hide a bug */
7308 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7309 ++iter;
7310 continue;
7311 }
7312
7313 map<string, bufferlist>::iterator cur_iter = iter;
7314 ++iter;
7315 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7316 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7317 pending_entries.erase(cur_iter);
7318 } else {
7319 /* entries names are sorted by time (rounded to a second) */
7320 break;
7321 }
7322 }
7323}
7324
7325int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7326{
7c673cae
FG
7327 rgw_rados_ref ref;
7328 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
7329 if (r < 0) {
7330 return r;
7331 }
7332
81eedcae
TL
7333 // trim no more than 1000 entries per osd op
7334 constexpr int max_entries = 1000;
7c673cae 7335
81eedcae
TL
7336 auto i = pending_attrs.begin();
7337 while (i != pending_attrs.end()) {
7338 ObjectWriteOperation op;
7339 bucket_index_guard_olh_op(state, op);
7340
7341 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7342 op.rmxattr(i->first.c_str());
7343 }
7344
9f95a23c 7345 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
81eedcae
TL
7346 if (r == -ENOENT || r == -ECANCELED) {
7347 /* raced with some other change, shouldn't sweat about it */
7348 return 0;
7349 }
7350 if (r < 0) {
7351 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7352 return r;
7353 }
7354 }
7c673cae
FG
7355 return 0;
7356}
7357
7358int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7359{
7360 map<string, bufferlist> pending_entries;
11fdf7f2 7361 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7c673cae
FG
7362
7363 map<string, bufferlist> rm_pending_entries;
7364 check_pending_olh_entries(pending_entries, &rm_pending_entries);
7365
7366 if (!rm_pending_entries.empty()) {
7367 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
7368 if (ret < 0) {
7369 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7370 return ret;
7371 }
7372 }
7373 if (!pending_entries.empty()) {
7374 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7375
7376 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7377 if (ret < 0) {
7378 return ret;
7379 }
7380 }
7381
92f5a8d4
TL
7382 auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7383 if (iter == state->attrset.end()) {
7384 return -EINVAL;
7385 }
7386
7c673cae 7387 RGWOLHInfo olh;
92f5a8d4
TL
7388 int ret = decode_olh_info(cct, iter->second, &olh);
7389 if (ret < 0) {
7390 return ret;
7c673cae
FG
7391 }
7392
7393 if (olh.removed) {
7394 return -ENOENT;
7395 }
7396
7397 *target = olh.target;
7398
7399 return 0;
7400}
7401
7402int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7403 map<string, bufferlist> *attrs, bufferlist *first_chunk,
9f95a23c 7404 RGWObjVersionTracker *objv_tracker, optional_yield y)
7c673cae
FG
7405{
7406 rgw_rados_ref ref;
7407 int r = get_raw_obj_ref(obj, &ref);
7408 if (r < 0) {
7409 return r;
7410 }
7411
7412 map<string, bufferlist> unfiltered_attrset;
7413 uint64_t size = 0;
7414 struct timespec mtime_ts;
7415
7416 ObjectReadOperation op;
7417 if (objv_tracker) {
7418 objv_tracker->prepare_op_for_read(&op);
7419 }
7420 if (attrs) {
7421 op.getxattrs(&unfiltered_attrset, NULL);
7422 }
7423 if (psize || pmtime) {
7424 op.stat2(&size, &mtime_ts, NULL);
7425 }
7426 if (first_chunk) {
7427 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7428 }
7429 bufferlist outbl;
9f95a23c 7430 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
7c673cae
FG
7431
7432 if (epoch) {
9f95a23c 7433 *epoch = ref.pool.ioctx().get_last_version();
7c673cae
FG
7434 }
7435
7436 if (r < 0)
7437 return r;
7438
7439 if (psize)
7440 *psize = size;
7441 if (pmtime)
7442 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7443 if (attrs) {
11fdf7f2 7444 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7c673cae
FG
7445 }
7446
7447 return 0;
7448}
7449
7450int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 7451 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae 7452{
a8e16298 7453 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7454 map<int, string> bucket_instance_ids;
7455 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7456 if (r < 0) {
7457 return r;
7458 }
7459
11fdf7f2 7460 ceph_assert(headers.size() == bucket_instance_ids.size());
7c673cae 7461
a8e16298 7462 auto iter = headers.begin();
7c673cae
FG
7463 map<int, string>::iterator viter = bucket_instance_ids.begin();
7464 BucketIndexShardsManager ver_mgr;
7465 BucketIndexShardsManager master_ver_mgr;
7466 BucketIndexShardsManager marker_mgr;
7c673cae
FG
7467 char buf[64];
7468 for(; iter != headers.end(); ++iter, ++viter) {
a8e16298
TL
7469 accumulate_raw_stats(*iter, stats);
7470 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7c673cae 7471 ver_mgr.add(viter->first, string(buf));
a8e16298 7472 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7c673cae
FG
7473 master_ver_mgr.add(viter->first, string(buf));
7474 if (shard_id >= 0) {
a8e16298 7475 *max_marker = iter->max_marker;
7c673cae 7476 } else {
a8e16298 7477 marker_mgr.add(viter->first, iter->max_marker);
7c673cae 7478 }
c07f9fc5 7479 if (syncstopped != NULL)
a8e16298 7480 *syncstopped = iter->syncstopped;
7c673cae
FG
7481 }
7482 ver_mgr.to_string(bucket_ver);
7483 master_ver_mgr.to_string(master_ver);
7484 if (shard_id < 0) {
7485 marker_mgr.to_string(max_marker);
7486 }
7487 return 0;
7488}
7489
7c673cae
FG
7490class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
7491 RGWGetBucketStats_CB *cb;
7492 uint32_t pendings;
7493 map<RGWObjCategory, RGWStorageStats> stats;
7494 int ret_code;
7495 bool should_cb;
9f95a23c 7496 ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
7c673cae
FG
7497
7498public:
7499 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
9f95a23c
TL
7500 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
7501 {}
7c673cae
FG
7502
7503 void handle_response(int r, rgw_bucket_dir_header& header) override {
9f95a23c 7504 std::lock_guard l{lock};
7c673cae
FG
7505 if (should_cb) {
7506 if ( r >= 0) {
7507 accumulate_raw_stats(header, stats);
7508 } else {
7509 ret_code = r;
7510 }
7511
7512 // Are we all done?
7513 if (--pendings == 0) {
7514 if (!ret_code) {
7515 cb->set_response(&stats);
7516 }
7517 cb->handle_response(ret_code);
7518 cb->put();
7519 }
7520 }
7521 }
7522
7523 void unset_cb() {
9f95a23c 7524 std::lock_guard l{lock};
7c673cae
FG
7525 should_cb = false;
7526 }
7527};
7528
7529int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
7530{
7531 int num_aio = 0;
c07f9fc5 7532 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11fdf7f2 7533 ceph_assert(get_ctx);
7c673cae 7534 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
7535 if (r < 0) {
7536 ctx->put();
7537 if (num_aio) {
7538 get_ctx->unset_cb();
7539 }
7540 }
c07f9fc5 7541 get_ctx->put();
7c673cae
FG
7542 return r;
7543}
7544
e306af50
TL
7545int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx,
7546 const string& meta_key,
7547 RGWBucketInfo& info,
7548 real_time *pmtime,
7549 map<string, bufferlist> *pattrs,
7550 optional_yield y)
9f95a23c
TL
7551{
7552 rgw_bucket bucket;
7553 rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
7c673cae 7554
9f95a23c
TL
7555 return get_bucket_instance_info(obj_ctx, bucket, info, pmtime, pattrs, y);
7556}
7c673cae 7557
11fdf7f2 7558int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
9f95a23c 7559 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y)
7c673cae 7560{
9f95a23c
TL
7561 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7562 return ctl.bucket->read_bucket_instance_info(bucket, &info,
7563 y,
7564 RGWBucketCtl::BucketInstance::GetParams()
7565 .set_mtime(pmtime)
7566 .set_attrs(pattrs)
7567 .set_bectx_params(bectx_params));
7c673cae
FG
7568}
7569
9f95a23c 7570int RGWRados::get_bucket_info(RGWServices *svc,
b32b8144
FG
7571 const string& tenant, const string& bucket_name,
7572 RGWBucketInfo& info,
9f95a23c
TL
7573 real_time *pmtime,
7574 optional_yield y, map<string, bufferlist> *pattrs)
b32b8144 7575{
9f95a23c
TL
7576 auto obj_ctx = svc->sysobj->init_obj_ctx();
7577 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7578 rgw_bucket bucket;
7579 bucket.tenant = tenant;
7580 bucket.name = bucket_name;
7581 return ctl.bucket->read_bucket_info(bucket, &info, y,
7582 RGWBucketCtl::BucketInstance::GetParams()
7583 .set_mtime(pmtime)
7584 .set_attrs(pattrs)
7585 .set_bectx_params(bectx_params));
b32b8144
FG
7586}
7587
7588int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
7589 ceph::real_time *pmtime,
7590 map<string, bufferlist> *pattrs)
7591{
9f95a23c
TL
7592 rgw_bucket bucket = info.bucket;
7593 bucket.bucket_id.clear();
b32b8144 7594
9f95a23c 7595 auto rv = info.objv_tracker.read_version;
b32b8144 7596
9f95a23c
TL
7597 return ctl.bucket->read_bucket_info(bucket, &info, null_yield,
7598 RGWBucketCtl::BucketInstance::GetParams()
7599 .set_mtime(pmtime)
7600 .set_attrs(pattrs)
7601 .set_refresh_version(rv));
7c673cae
FG
7602}
7603
7604int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
7605 real_time mtime, map<string, bufferlist> *pattrs)
7606{
9f95a23c
TL
7607 return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield,
7608 RGWBucketCtl::BucketInstance::PutParams()
7609 .set_exclusive(exclusive)
7610 .set_mtime(mtime)
7611 .set_attrs(pattrs));
7c673cae
FG
7612}
7613
7614int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
7615 map<string, bufferlist> *pattrs, bool create_entry_point)
7616{
7617 bool create_head = !info.has_instance_obj || create_entry_point;
7618
7619 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
7620 if (ret < 0) {
7621 return ret;
7622 }
7623
7624 if (!create_head)
7625 return 0; /* done! */
7626
7627 RGWBucketEntryPoint entry_point;
7628 entry_point.bucket = info.bucket;
7629 entry_point.owner = info.owner;
7630 entry_point.creation_time = info.creation_time;
7631 entry_point.linked = true;
7632 RGWObjVersionTracker ot;
7633 if (pep_objv && !pep_objv->tag.empty()) {
7634 ot.write_version = *pep_objv;
7635 } else {
7636 ot.generate_new_write_ver(cct);
7637 if (pep_objv) {
7638 *pep_objv = ot.write_version;
7639 }
7640 }
9f95a23c
TL
7641 ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, RGWBucketCtl::Bucket::PutParams()
7642 .set_exclusive(exclusive)
7643 .set_objv_tracker(&ot)
7644 .set_mtime(mtime));
7c673cae
FG
7645 if (ret < 0)
7646 return ret;
7647
7648 return 0;
7649}
7650
7c673cae
FG
7651int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
7652{
11fdf7f2 7653 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
7654
7655 map<string, RGWBucketEnt>::iterator iter;
7656 for (iter = m.begin(); iter != m.end(); ++iter) {
7657 RGWBucketEnt& ent = iter->second;
7658 rgw_bucket& bucket = ent.bucket;
7659 ent.count = 0;
7660 ent.size = 0;
7661 ent.size_rounded = 0;
7662
a8e16298 7663 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7664
7665 RGWBucketInfo bucket_info;
9f95a23c 7666 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL, null_yield);
7c673cae
FG
7667 if (ret < 0) {
7668 return ret;
7669 }
7670
7671 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
7672 if (r < 0)
7673 return r;
7674
a8e16298 7675 auto hiter = headers.begin();
7c673cae
FG
7676 for (; hiter != headers.end(); ++hiter) {
7677 RGWObjCategory category = main_category;
11fdf7f2 7678 auto iter = (hiter->stats).find(category);
a8e16298 7679 if (iter != hiter->stats.end()) {
7c673cae
FG
7680 struct rgw_bucket_category_stats& stats = iter->second;
7681 ent.count += stats.num_entries;
7682 ent.size += stats.total_size;
7683 ent.size_rounded += stats.total_size_rounded;
7684 }
7685 }
3efd9988
FG
7686
7687 // fill in placement_rule from the bucket instance for use in swift's
7688 // per-storage policy statistics
7689 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
7690 }
7691
7692 return m.size();
7693}
7694
7695int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
7696{
7697 rgw_rados_ref ref;
7698 int r = get_raw_obj_ref(obj, &ref);
7699 if (r < 0) {
7700 return r;
7701 }
7702 librados::Rados *rad = get_rados_handle();
9f95a23c 7703 librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
7c673cae 7704
9f95a23c 7705 r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
7c673cae
FG
7706 completion->release();
7707 return r;
7708}
7709
7c673cae
FG
7710int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
7711{
7712 librados::IoCtx& io_ctx = ctx.io_ctx;
7713 librados::NObjectIterator& iter = ctx.iter;
7714
494da23a 7715 int r = open_pool_ctx(pool, io_ctx, false);
7c673cae
FG
7716 if (r < 0)
7717 return r;
7718
7719 iter = io_ctx.nobjects_begin();
7720
7721 return 0;
7722}
7723
181888fb
FG
7724int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
7725{
7726 librados::IoCtx& io_ctx = ctx.io_ctx;
7727 librados::NObjectIterator& iter = ctx.iter;
7728
494da23a 7729 int r = open_pool_ctx(pool, io_ctx, false);
181888fb
FG
7730 if (r < 0)
7731 return r;
7732
7733 librados::ObjectCursor oc;
7734 if (!oc.from_str(cursor)) {
7735 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
7736 return -EINVAL;
7737 }
7738
f64942e4
AA
7739 try {
7740 iter = io_ctx.nobjects_begin(oc);
7741 return 0;
7742 } catch (const std::system_error& e) {
7743 r = -e.code().value();
7744 ldout(cct, 10) << "nobjects_begin threw " << e.what()
7745 << ", returning " << r << dendl;
7746 return r;
7747 } catch (const std::exception& e) {
7748 ldout(cct, 10) << "nobjects_begin threw " << e.what()
7749 << ", returning -5" << dendl;
7750 return -EIO;
7751 }
181888fb
FG
7752}
7753
7754string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
7755{
7756 return ctx.iter.get_cursor().to_str();
7757}
7758
f64942e4
AA
7759static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
7760 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
7761 bool *is_truncated, RGWAccessListFilter *filter)
7762{
7763 librados::IoCtx& io_ctx = ctx.io_ctx;
7764 librados::NObjectIterator& iter = ctx.iter;
7765
7766 if (iter == io_ctx.nobjects_end())
7767 return -ENOENT;
7768
7769 uint32_t i;
7770
7771 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
7772 rgw_bucket_dir_entry e;
7773
7774 string oid = iter->get_oid();
7775 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
7776
7777 // fill it in with initial values; we may correct later
7778 if (filter && !filter->filter(oid, oid))
7779 continue;
7780
7781 e.key = oid;
7782 objs.push_back(e);
7783 }
7784
7785 if (is_truncated)
7786 *is_truncated = (iter != io_ctx.nobjects_end());
7787
7788 return objs.size();
7789}
7c673cae 7790
f64942e4
AA
7791int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
7792 bool *is_truncated, RGWAccessListFilter *filter)
7793{
7794 // catch exceptions from NObjectIterator::operator++()
7795 try {
7796 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
7797 } catch (const std::system_error& e) {
7798 int r = -e.code().value();
7799 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7800 << ", returning " << r << dendl;
7801 return r;
7802 } catch (const std::exception& e) {
7803 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7804 << ", returning -5" << dendl;
7805 return -EIO;
7806 }
7807}
7808
181888fb 7809int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 7810{
181888fb
FG
7811 if (!ctx->initialized) {
7812 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
7c673cae
FG
7813 if (r < 0) {
7814 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
7815 return r;
7816 }
181888fb 7817 ctx->initialized = true;
7c673cae 7818 }
181888fb
FG
7819 return 0;
7820}
7c673cae 7821
181888fb
FG
7822int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
7823 RGWListRawObjsCtx& ctx, list<string>& oids,
7824 bool *is_truncated)
7825{
7826 if (!ctx.initialized) {
7827 return -EINVAL;
7828 }
7829 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae
FG
7830 vector<rgw_bucket_dir_entry> objs;
7831 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
7832 if (r < 0) {
7833 if(r != -ENOENT)
7834 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
7835 return r;
7836 }
7837
7838 vector<rgw_bucket_dir_entry>::iterator iter;
7839 for (iter = objs.begin(); iter != objs.end(); ++iter) {
7840 oids.push_back(iter->key.name);
7841 }
7842
7843 return oids.size();
7844}
7845
181888fb
FG
7846int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
7847 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
7848 bool *is_truncated)
7849{
7850 if (!ctx.initialized) {
7851 int r = list_raw_objects_init(pool, string(), &ctx);
7852 if (r < 0) {
7853 return r;
7854 }
7855 }
7856
7857 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
7858}
7859
7860string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
7861{
7862 return pool_iterate_get_cursor(ctx.iter_ctx);
7863}
7864
a8e16298
TL
7865int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7866 rgw_bucket_dir_entry *dirent)
7c673cae 7867{
a8e16298 7868 rgw_cls_bi_entry bi_entry;
11fdf7f2 7869 int r = bi_get(bucket_info, obj, BIIndexType::Instance, &bi_entry);
a8e16298
TL
7870 if (r < 0 && r != -ENOENT) {
7871 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7872 }
7c673cae
FG
7873 if (r < 0) {
7874 return r;
7875 }
11fdf7f2 7876 auto iter = bi_entry.data.cbegin();
a8e16298 7877 try {
11fdf7f2 7878 decode(*dirent, iter);
a8e16298
TL
7879 } catch (buffer::error& err) {
7880 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7881 return -EIO;
7882 }
7883
7884 return 0;
7885}
7c673cae 7886
a8e16298
TL
7887int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7888 rgw_bucket_olh_entry *olh)
7889{
7c673cae 7890 rgw_cls_bi_entry bi_entry;
11fdf7f2 7891 int r = bi_get(bucket_info, obj, BIIndexType::OLH, &bi_entry);
7c673cae
FG
7892 if (r < 0 && r != -ENOENT) {
7893 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7894 }
7895 if (r < 0) {
7896 return r;
7897 }
11fdf7f2 7898 auto iter = bi_entry.data.cbegin();
7c673cae 7899 try {
a8e16298 7900 decode(*olh, iter);
7c673cae
FG
7901 } catch (buffer::error& err) {
7902 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7903 return -EIO;
7904 }
7905
7906 return 0;
7907}
7908
a8e16298
TL
7909int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7910 BIIndexType index_type, rgw_cls_bi_entry *entry)
7c673cae
FG
7911{
7912 BucketShard bs(this);
a8e16298 7913 int ret = bs.init(bucket_info, obj);
7c673cae
FG
7914 if (ret < 0) {
7915 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7916 return ret;
7917 }
7918
7919 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
9f95a23c
TL
7920
7921 auto& ref = bs.bucket_obj.get_ref();
7c673cae 7922
9f95a23c 7923 return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
7c673cae
FG
7924}
7925
7926void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
7927{
9f95a23c
TL
7928 auto& ref = bs.bucket_obj.get_ref();
7929 cls_rgw_bi_put(op, ref.obj.oid, entry);
7c673cae
FG
7930}
7931
7932int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
7933{
9f95a23c
TL
7934 auto& ref = bs.bucket_obj.get_ref();
7935 int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
7c673cae
FG
7936 if (ret < 0)
7937 return ret;
7938
7939 return 0;
7940}
7941
7942int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
7943{
7944 BucketShard bs(this);
f64942e4 7945 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
7946 if (ret < 0) {
7947 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7948 return ret;
7949 }
7950
7951 return bi_put(bs, entry);
7952}
7953
7954int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7955{
7956 rgw_obj obj(bucket, obj_name);
7957 BucketShard bs(this);
f64942e4 7958 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
7959 if (ret < 0) {
7960 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7961 return ret;
7962 }
7963
9f95a23c
TL
7964 auto& ref = bs.bucket_obj.get_ref();
7965 ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name, marker, max, entries, is_truncated);
31f18b77
FG
7966 if (ret == -ENOENT) {
7967 *is_truncated = false;
7968 }
7c673cae
FG
7969 if (ret < 0)
7970 return ret;
7971
7972 return 0;
7973}
7974
7975int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7976{
9f95a23c
TL
7977 auto& ref = bs.bucket_obj.get_ref();
7978 int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, filter_obj, marker, max, entries, is_truncated);
7c673cae
FG
7979 if (ret < 0)
7980 return ret;
7981
7982 return 0;
7983}
7984
7985int RGWRados::bi_remove(BucketShard& bs)
7986{
9f95a23c
TL
7987 auto& ref = bs.bucket_obj.get_ref();
7988 int ret = ref.pool.ioctx().remove(ref.obj.oid);
7c673cae
FG
7989 if (ret == -ENOENT) {
7990 ret = 0;
7991 }
7992 if (ret < 0) {
7993 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
7994 return ret;
7995 }
7996
7997 return 0;
7998}
7999
8000int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8001{
8002 BucketShard bs(this);
f64942e4 8003 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
7c673cae
FG
8004 if (ret < 0) {
8005 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8006 return ret;
8007 }
8008
8009 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8010}
8011
8012int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
8013{
9f95a23c 8014 return rgw_rados_operate(gc_pool_ctx, oid, op, null_yield);
7c673cae
FG
8015}
8016
9f95a23c
TL
8017int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
8018 librados::ObjectWriteOperation *op)
7c673cae 8019{
9f95a23c 8020 return gc_pool_ctx.aio_operate(oid, c, op);
7c673cae
FG
8021}
8022
8023int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
8024{
9f95a23c 8025 return rgw_rados_operate(gc_pool_ctx, oid, op, pbl, null_yield);
7c673cae
FG
8026}
8027
9f95a23c 8028int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
7c673cae 8029{
9f95a23c 8030 return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
7c673cae
FG
8031}
8032
11fdf7f2 8033int RGWRados::process_gc(bool expired_only)
7c673cae 8034{
11fdf7f2 8035 return gc->process(expired_only);
7c673cae
FG
8036}
8037
f6b5b4d7
TL
8038int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
8039 vector<cls_rgw_lc_entry>& progress_map,
8040 int& index)
7c673cae 8041{
f6b5b4d7 8042 return lc->list_lc_progress(marker, max_entries, progress_map, index);
7c673cae
FG
8043}
8044
8045int RGWRados::process_lc()
8046{
f6b5b4d7
TL
8047 RGWLC lc;
8048 lc.initialize(cct, this->store);
8049 RGWLC::LCWorker worker(&lc, cct, &lc, 0);
8050 auto ret = lc.process(&worker, true /* once */);
8051 lc.stop_processor(); // sets down_flag, but returns immediately
8052 return ret;
7c673cae
FG
8053}
8054
1adf2230 8055bool RGWRados::process_expire_objects()
7c673cae 8056{
1adf2230 8057 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
7c673cae
FG
8058}
8059
7c673cae 8060int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
9f95a23c 8061 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
7c673cae 8062{
31f18b77
FG
8063 rgw_zone_set zones_trace;
8064 if (_zones_trace) {
8065 zones_trace = *_zones_trace;
8066 }
9f95a23c 8067 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8068
7c673cae
FG
8069 ObjectWriteOperation o;
8070 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77 8071 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
11fdf7f2 8072 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
9f95a23c 8073 return bs.bucket_obj.operate(&o, y);
7c673cae
FG
8074}
8075
31f18b77 8076int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
8077 int64_t pool, uint64_t epoch,
8078 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8079 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 8080{
7c673cae
FG
8081 ObjectWriteOperation o;
8082 rgw_bucket_dir_entry_meta dir_meta;
8083 dir_meta = ent.meta;
8084 dir_meta.category = category;
8085
1adf2230
AA
8086 rgw_zone_set zones_trace;
8087 if (_zones_trace) {
8088 zones_trace = *_zones_trace;
8089 }
9f95a23c 8090 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8091
7c673cae
FG
8092 rgw_bucket_entry_ver ver;
8093 ver.pool = pool;
8094 ver.epoch = epoch;
8095 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
8096 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
8097 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8098 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
8099 complete_op_data *arg;
8100 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8101 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77 8102 librados::AioCompletion *completion = arg->rados_completion;
9f95a23c 8103 int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
31f18b77 8104 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
8105 return ret;
8106}
8107
31f18b77 8108int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
8109 int64_t pool, uint64_t epoch,
8110 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8111 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 8112{
31f18b77 8113 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
8114}
8115
8116int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
8117 int64_t pool, uint64_t epoch,
8118 rgw_obj& obj,
8119 real_time& removed_mtime,
8120 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
8121 uint16_t bilog_flags,
8122 rgw_zone_set *zones_trace)
7c673cae
FG
8123{
8124 rgw_bucket_dir_entry ent;
8125 ent.meta.mtime = removed_mtime;
8126 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8127 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
8128 ent, RGWObjCategory::None, remove_objs,
8129 bilog_flags, zones_trace);
7c673cae
FG
8130}
8131
31f18b77 8132int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
8133{
8134 rgw_bucket_dir_entry ent;
8135 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8136 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
8137 -1 /* pool id */, 0, ent,
8138 RGWObjCategory::None, NULL, bilog_flags,
8139 zones_trace);
7c673cae
FG
8140}
8141
8142int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
8143{
9f95a23c 8144 RGWSI_RADOS::Pool index_pool;
7c673cae 8145 map<int, string> bucket_objs;
9f95a23c 8146 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
8147 if (r < 0)
8148 return r;
8149
9f95a23c
TL
8150 return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
8151}
8152
8153
8154uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
8155 uint32_t num_shards)
8156{
8157 // We want to minimize the chances that when num_shards >>
8158 // num_entries that we return much fewer than num_entries to the
8159 // client. Given all the overhead of making a cls call to the osd,
8160 // returning a few entries is not much more work than returning one
8161 // entry. This minimum might be better tuned based on future
8162 // experiments where num_shards >> num_entries. (Note: ">>" should
8163 // be interpreted as "much greater than".)
8164 constexpr uint32_t min_read = 8;
8165
8166 // The following is based on _"Balls into Bins" -- A Simple and
8167 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8168 // cases when num_shards >> num_entries (it almost serves as a
8169 // ceiling calculation). We also assume alpha is 1.0 and extract it
8170 // from the calculation. Future work could involve memoizing some of
8171 // the transcendental functions to minimize repeatedly re-calling
8172 // them with the same parameters, which we expect to be the case the
8173 // majority of the time.
8174 uint32_t calc_read =
8175 1 +
8176 static_cast<uint32_t>((num_entries / num_shards) +
8177 sqrt((2 * num_entries) *
8178 log(num_shards) / num_shards));
8179
8180 return std::max(min_read, calc_read);
7c673cae
FG
8181}
8182
1adf2230
AA
8183
// List bucket index entries in lexical order across one or more shards.
//
// The bucket index is opened for shard_id, a listing of
// num_entries_per_shard entries is issued to every opened shard, and the
// per-shard results are merged by always taking the smallest pending entry
// name (tracked in the `candidates` map) until num_entries entries have
// been produced or a truncated shard runs dry.
//
// Entries that have pending ops, that are reported as neither existing nor
// a delete marker nor a common prefix, or for which force_check_filter
// returns true are verified with check_disk_state(); an -ENOENT result
// from that check makes the entry be skipped. Any index corrections
// collected by those checks are sent off asynchronously at the end without
// waiting for the result.
//
// Outputs:
//   m             cleared on entry, then filled with the accepted entries
//   is_truncated  set to true if any shard still has unconsumed results or
//                 reported itself truncated; must be non-null
//   cls_filtered  ANDed with each shard's cls_filtered flag; the value
//                 stored by the caller before the call takes part in the
//                 AND, so it is expected to be initialized (NOTE(review):
//                 presumably to true -- confirm against callers); must be
//                 non-null
//   last_entry    if non-null and at least one entry was visited, set to
//                 the key of the last entry visited (accepted or skipped)
//
// Returns 0 on success or a negative error code.
8184int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
9f95a23c
TL
8185 const int shard_id,
8186 const rgw_obj_index_key& start_after,
1adf2230 8187 const string& prefix,
9f95a23c
TL
8188 const string& delimiter,
8189 const uint32_t num_entries,
8190 const bool list_versions,
8191 const uint16_t expansion_factor,
8192 ent_map_t& m,
8193 bool* is_truncated,
8194 bool* cls_filtered,
1adf2230 8195 rgw_obj_index_key *last_entry,
9f95a23c
TL
8196 optional_yield y,
8197 check_filter_t force_check_filter)
7c673cae 8198{
9f95a23c
TL
8199 /* expansion_factor allows the number of entries to read to grow
8200 * exponentially; this is used when earlier reads are producing too
8201 * few results, perhaps due to filtering or to a series of
8202 * namespaced entries */
8203
8204 ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket <<
8205 " start_after=\"" << start_after.name <<
8206 "[" << start_after.instance <<
8207 "]\", prefix=\"" << prefix <<
8208 "\" num_entries=" << num_entries <<
8209 ", list_versions=" << list_versions <<
8210 ", expansion_factor=" << expansion_factor << dendl;
7c673cae 8211
9f95a23c
TL
8212 m.clear();
8213
8214 RGWSI_RADOS::Pool index_pool;
7c673cae 8215 // key - oid (for different shards if there is any)
1adf2230
AA
8216 // value - list result for the corresponding oid (shard), it is filled by
8217 // the AIO callback
9f95a23c
TL
8218 map<int, string> shard_oids;
8219 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id,
8220 &index_pool, &shard_oids,
8221 nullptr);
8222 if (r < 0) {
7c673cae 8223 return r;
9f95a23c
TL
8224 }
8225
8226 const uint32_t shard_count = shard_oids.size();
8227 uint32_t num_entries_per_shard;
8228 if (expansion_factor == 0) {
8229 num_entries_per_shard =
8230 calc_ordered_bucket_list_per_shard(num_entries, shard_count);
8231 } else if (expansion_factor <= 11) {
8232 // we'll max out the exponential multiplication factor at 1024
// (1 << 10, reached when expansion_factor is 11); the result is also
// capped at num_entries
8233 num_entries_per_shard =
8234 std::min(num_entries,
8235 (uint32_t(1 << (expansion_factor - 1)) *
8236 calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
8237 } else {
8238 num_entries_per_shard = num_entries;
8239 }
8240
8241 ldout(cct, 10) << "RGWRados::" << __func__ <<
8242 " request from each of " << shard_count <<
8243 " shard(s) for " << num_entries_per_shard << " entries to get " <<
8244 num_entries << " total entries" << dendl;
7c673cae 8245
9f95a23c
TL
8246 auto& ioctx = index_pool.ioctx();
8247 map<int, rgw_cls_list_ret> shard_list_results;
8248 cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
8249 r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
8250 num_entries_per_shard,
8251 list_versions, shard_oids, shard_list_results,
1adf2230 8252 cct->_conf->rgw_bucket_index_max_aio)();
9f95a23c 8253 if (r < 0) {
7c673cae 8254 return r;
9f95a23c 8255 }
7c673cae 8256
9f95a23c
TL
8257 // to manage the iterators through each shard's list results
8258 struct ShardTracker {
8259 const size_t shard_idx;
8260 rgw_cls_list_ret& result;
8261 const std::string& oid_name;
8262 RGWRados::ent_map_t::iterator cursor;
8263 RGWRados::ent_map_t::iterator end;
8264
8265 // manages an iterator through a shard and provides other
8266 // accessors
8267 ShardTracker(size_t _shard_idx,
8268 rgw_cls_list_ret& _result,
8269 const std::string& _oid_name):
8270 shard_idx(_shard_idx),
8271 result(_result),
8272 oid_name(_oid_name),
8273 cursor(_result.dir.m.begin()),
8274 end(_result.dir.m.end())
8275 {}
8276
8277 inline const std::string& entry_name() const {
8278 return cursor->first;
8279 }
8280 rgw_bucket_dir_entry& dir_entry() const {
8281 return cursor->second;
8282 }
8283 inline bool is_truncated() const {
8284 return result.is_truncated;
8285 }
8286 inline ShardTracker& advance() {
8287 ++cursor;
8288 // return a self-reference to allow for chaining of calls, such
8289 // as x.advance().at_end()
8290 return *this;
8291 }
8292 inline bool at_end() const {
8293 return cursor == end;
8294 }
8295 }; // ShardTracker
8296
8297 // add the tracker's current entry to candidates if its name is not
// already present, advancing past entries whose name is; does nothing
// once the tracker is at its end (no value is returned)
8298 auto next_candidate = [] (ShardTracker& t,
8299 std::map<std::string, size_t>& candidates,
8300 size_t tracker_idx) {
8301 while (!t.at_end()) {
8302 if (candidates.emplace(t.entry_name(), tracker_idx).second) {
8303 return;
8304 }
8305 t.advance(); // skip duplicate common prefixes
8306 }
8307 };
8308
8309 // one tracker per shard requested (may not be all shards)
8310 std::vector<ShardTracker> results_trackers;
8311 results_trackers.reserve(shard_list_results.size());
8312 for (auto& r : shard_list_results) {
8313 results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
8314
8315 // if any *one* shard's result is truncated, the entire result is
8316 // truncated (note: *is_truncated is recomputed from scratch after
// the merge loop below)
8317 *is_truncated = *is_truncated || r.second.is_truncated;
8318
8319 // unless *all* shards are cls_filtered, the entire result is
8320 // not filtered
8321 *cls_filtered = *cls_filtered && r.second.cls_filtered;
7c673cae
FG
8322 }
8323
9f95a23c
TL
8324 // create a map to track the next candidate entry from ShardTracker
8325 // (key=candidate, value=index into results_trackers); as we consume
8326 // entries from shards, we replace them with the next entries in the
8327 // shards until we run out
7c673cae 8328 map<string, size_t> candidates;
9f95a23c
TL
8329 size_t tracker_idx = 0;
8330 for (auto& t : results_trackers) {
8331 // it's important that the values in the map refer to the index
8332 // into the results_trackers vector, which may not be the same
8333 // as the shard number (i.e., when not all shards are requested)
8334 next_candidate(t, candidates, tracker_idx);
8335 ++tracker_idx;
7c673cae
FG
8336 }
8337
9f95a23c
TL
8338 rgw_bucket_dir_entry*
8339 last_entry_visited = nullptr; // to set last_entry (marker)
7c673cae
FG
8340 map<string, bufferlist> updates;
8341 uint32_t count = 0;
8342 while (count < num_entries && !candidates.empty()) {
8343 r = 0;
9f95a23c
TL
8344 // select the next entry in lexical order (first key in map);
8345 // again tracker_idx is not necessarily shard number, but is index
8346 // into results_trackers vector
8347 tracker_idx = candidates.begin()->second;
8348 auto& tracker = results_trackers.at(tracker_idx);
e306af50 8349
9f95a23c
TL
8350 const string& name = tracker.entry_name();
8351 rgw_bucket_dir_entry& dirent = tracker.dir_entry();
8352
8353 ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " <<
8354 dirent.key << " from shard " << tracker.shard_idx << dendl;
8355
8356 const bool force_check =
8357 force_check_filter && force_check_filter(dirent.key.name);
8358
8359 if ((!dirent.exists &&
8360 !dirent.is_delete_marker() &&
8361 !dirent.is_common_prefix()) ||
3efd9988
FG
8362 !dirent.pending_map.empty() ||
8363 force_check) {
9f95a23c
TL
8364 /* there are uncommitted ops. We need to check the current
8365 * state, and if the tags are old we need to do clean-up as
8366 * well. */
7c673cae 8367 librados::IoCtx sub_ctx;
9f95a23c 8368 sub_ctx.dup(ioctx);
1adf2230 8369 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
9f95a23c 8370 updates[tracker.oid_name], y);
7c673cae 8371 if (r < 0 && r != -ENOENT) {
9f95a23c 8372 return r;
7c673cae 8373 }
eafe8130 8374 } else {
9f95a23c 8375 r = 0;
7c673cae 8376 }
9f95a23c 8377
// at this point r is either >= 0 (keep the entry) or -ENOENT (skip it)
7c673cae 8378 if (r >= 0) {
9f95a23c 8379 ldout(cct, 10) << "RGWRados::" << __func__ << ": got " <<
1adf2230 8380 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
7c673cae 8381 m[name] = std::move(dirent);
e306af50 8382 last_entry_visited = &(m[name]);
7c673cae 8383 ++count;
9f95a23c
TL
8384 } else {
8385 ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " <<
8386 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
e306af50 8387 last_entry_visited = &tracker.dir_entry();
7c673cae
FG
8388 }
8389
9f95a23c 8390 // refresh the candidates map
7c673cae 8391 candidates.erase(candidates.begin());
9f95a23c
TL
8392 tracker.advance();
8393
8394 next_candidate(tracker, candidates, tracker_idx);
8395
8396 if (tracker.at_end() && tracker.is_truncated()) {
8397 // once we exhaust one shard that is truncated, we need to stop,
8398 // as we cannot be certain that one of the next entries needs to
8399 // come from that shard; S3 and swift protocols allow returning
8400 // fewer than what was requested
8401 break;
7c673cae 8402 }
9f95a23c 8403 } // while we haven't provided requested # of result entries
7c673cae 8404
9f95a23c
TL
8405 // suggest updates if there are any
8406 for (auto& miter : updates) {
8407 if (miter.second.length()) {
7c673cae 8408 ObjectWriteOperation o;
9f95a23c 8409 cls_rgw_suggest_changes(o, miter.second);
7c673cae 8410 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8411 AioCompletion *c =
8412 librados::Rados::aio_create_completion(nullptr, nullptr);
8413 ioctx.aio_operate(miter.first, c, &o);
1adf2230 8414 c->release();
7c673cae 8415 }
9f95a23c 8416 } // updates loop
7c673cae 8417
9f95a23c
TL
8418 // determine truncation by checking if all the returned entries are
8419 // consumed or not
8420 *is_truncated = false;
8421 for (const auto& t : results_trackers) {
8422 if (!t.at_end() || t.is_truncated()) {
7c673cae 8423 *is_truncated = true;
1adf2230
AA
8424 break;
8425 }
7c673cae 8426 }
92f5a8d4 8427
9f95a23c
TL
8428 ldout(cct, 20) << "RGWRados::" << __func__ <<
8429 ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
8430 dendl;
8431
8432 if (*is_truncated && count < num_entries) {
8433 ldout(cct, 10) << "RGWRados::" << __func__ <<
8434 ": INFO requested " << num_entries << " entries but returning " <<
8435 count << ", which is truncated" << dendl;
8436 }
8437
8438 if (last_entry_visited != nullptr && last_entry) {
e306af50 8439 *last_entry = last_entry_visited->key;
9f95a23c
TL
8440 ldout(cct, 20) << "RGWRados::" << __func__ <<
8441 ": returning, last_entry=" << *last_entry << dendl;
8442 } else {
8443 ldout(cct, 20) << "RGWRados::" << __func__ <<
8444 ": returning, last_entry NOT SET" << dendl;
8445 }
7c673cae
FG
8446
8447 return 0;
8448}
8449
1adf2230
AA
8450
8451int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
8452 int shard_id,
9f95a23c 8453 const rgw_obj_index_key& start_after,
1adf2230
AA
8454 const string& prefix,
8455 uint32_t num_entries,
8456 bool list_versions,
8457 std::vector<rgw_bucket_dir_entry>& ent_list,
8458 bool *is_truncated,
8459 rgw_obj_index_key *last_entry,
9f95a23c
TL
8460 optional_yield y,
8461 check_filter_t force_check_filter) {
1adf2230 8462 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
9f95a23c 8463 " start_after " << start_after.name << "[" << start_after.instance <<
1adf2230
AA
8464 "] num_entries " << num_entries << dendl;
8465
9f95a23c 8466 ent_list.clear();
11fdf7f2
TL
8467 static MultipartMetaFilter multipart_meta_filter;
8468
1adf2230 8469 *is_truncated = false;
9f95a23c 8470 RGWSI_RADOS::Pool index_pool;
1adf2230 8471
1adf2230 8472 map<int, string> oids;
9f95a23c 8473 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &oids, nullptr);
1adf2230
AA
8474 if (r < 0)
8475 return r;
9f95a23c
TL
8476
8477 auto& ioctx = index_pool.ioctx();
8478
1adf2230
AA
8479 const uint32_t num_shards = oids.size();
8480
9f95a23c 8481 rgw_obj_index_key marker = start_after;
1adf2230
AA
8482 uint32_t current_shard;
8483 if (shard_id >= 0) {
8484 current_shard = shard_id;
9f95a23c 8485 } else if (start_after.empty()) {
1adf2230
AA
8486 current_shard = 0u;
8487 } else {
9f95a23c
TL
8488 // at this point we have a marker (start_after) that has something
8489 // in it, so we need to get to the bucket shard index, so we can
11fdf7f2
TL
8490 // start reading from there
8491
8492 std::string key;
8493 // test whether object name is a multipart meta name
9f95a23c 8494 if(! multipart_meta_filter.filter(start_after.name, key)) {
11fdf7f2
TL
8495 // if multipart_meta_filter fails, must be "regular" (i.e.,
8496 // unadorned) and the name is the key
9f95a23c 8497 key = start_after.name;
11fdf7f2
TL
8498 }
8499
8500 // now convert the key (oid) to an rgw_obj_key since that will
8501 // separate out the namespace, name, and instance
8502 rgw_obj_key obj_key;
8503 bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key);
8504 if (!parsed) {
8505 ldout(cct, 0) <<
8506 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9f95a23c 8507 "start marker: '" << start_after << "'" << dendl;
11fdf7f2
TL
8508 return -EINVAL;
8509 } else if (obj_key.name.empty()) {
8510 // if the name is empty that means the object name came in with
8511 // a namespace only, and therefore we need to start our scan at
8512 // the first bucket index shard
8513 current_shard = 0u;
8514 } else {
8515 // so now we have the key used to compute the bucket index shard
8516 // and can extract the specific shard from it
9f95a23c 8517 current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
11fdf7f2 8518 }
1adf2230
AA
8519 }
8520
8521 uint32_t count = 0u;
8522 map<string, bufferlist> updates;
11fdf7f2 8523 rgw_obj_index_key last_added_entry;
1adf2230
AA
8524 while (count <= num_entries &&
8525 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
8526 current_shard < num_shards)) {
81eedcae
TL
8527 const std::string& oid = oids[current_shard];
8528 rgw_cls_list_ret result;
8529
8530 librados::ObjectReadOperation op;
9f95a23c
TL
8531 string empty_delimiter;
8532 cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
8533 num_entries,
81eedcae 8534 list_versions, &result);
9f95a23c 8535 r = rgw_rados_operate(ioctx, oid, &op, nullptr, null_yield);
1adf2230
AA
8536 if (r < 0)
8537 return r;
8538
1adf2230
AA
8539 for (auto& entry : result.dir.m) {
8540 rgw_bucket_dir_entry& dirent = entry.second;
8541
8542 bool force_check = force_check_filter &&
8543 force_check_filter(dirent.key.name);
8544 if ((!dirent.exists && !dirent.is_delete_marker()) ||
8545 !dirent.pending_map.empty() ||
8546 force_check) {
8547 /* there are uncommitted ops. We need to check the current state,
8548 * and if the tags are old we need to do cleanup as well. */
8549 librados::IoCtx sub_ctx;
9f95a23c
TL
8550 sub_ctx.dup(ioctx);
8551 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
1adf2230
AA
8552 if (r < 0 && r != -ENOENT) {
8553 return r;
8554 }
eafe8130
TL
8555 } else {
8556 r = 0;
1adf2230
AA
8557 }
8558
8559 // at this point either r >=0 or r == -ENOENT
8560 if (r >= 0) { // i.e., if r != -ENOENT
8561 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
8562 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8563
8564 if (count < num_entries) {
11fdf7f2 8565 marker = last_added_entry = dirent.key; // double assign
1adf2230
AA
8566 ent_list.emplace_back(std::move(dirent));
8567 ++count;
8568 } else {
8569 *is_truncated = true;
8570 goto check_updates;
8571 }
8572 } else { // r == -ENOENT
8573 // in the case of -ENOENT, make sure we're advancing marker
8574 // for possible next call to CLSRGWIssueBucketList
11fdf7f2 8575 marker = dirent.key;
1adf2230
AA
8576 }
8577 } // entry for loop
8578
8579 if (!result.is_truncated) {
8580 // if we reached the end of the shard read next shard
8581 ++current_shard;
11fdf7f2 8582 marker = rgw_obj_index_key();
1adf2230
AA
8583 }
8584 } // shard loop
8585
8586check_updates:
11fdf7f2 8587
1adf2230
AA
8588 // suggest updates if there is any
8589 map<string, bufferlist>::iterator miter = updates.begin();
8590 for (; miter != updates.end(); ++miter) {
8591 if (miter->second.length()) {
8592 ObjectWriteOperation o;
8593 cls_rgw_suggest_changes(o, miter->second);
8594 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8595 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
8596 ioctx.aio_operate(miter->first, c, &o);
1adf2230
AA
8597 c->release();
8598 }
8599 }
8600
8601 if (last_entry && !ent_list.empty()) {
8602 *last_entry = last_added_entry;
8603 }
8604
8605 return 0;
11fdf7f2 8606} // RGWRados::cls_bucket_list_unordered
1adf2230
AA
8607
8608
8609int RGWRados::cls_obj_usage_log_add(const string& oid,
8610 rgw_usage_log_info& info)
7c673cae 8611{
11fdf7f2 8612 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8613
8614 rgw_rados_ref ref;
224ce89b 8615 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
8616 if (r < 0) {
8617 return r;
8618 }
8619
8620 ObjectWriteOperation op;
8621 cls_rgw_usage_log_add(op, info);
8622
9f95a23c 8623 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
8624 return r;
8625}
8626
11fdf7f2
TL
8627int RGWRados::cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket,
8628 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
8629 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
8630 bool *is_truncated)
7c673cae 8631{
11fdf7f2 8632 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8633
8634 rgw_rados_ref ref;
224ce89b 8635 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
8636 if (r < 0) {
8637 return r;
8638 }
8639
8640 *is_truncated = false;
8641
9f95a23c 8642 r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
7c673cae
FG
8643 max_entries, read_iter, usage, is_truncated);
8644
8645 return r;
8646}
8647
9f95a23c
TL
8648static int cls_rgw_usage_log_trim_repeat(rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
8649{
8650 bool done = false;
8651 do {
8652 librados::ObjectWriteOperation op;
8653 cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
8654 int r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8655 if (r == -ENODATA)
8656 done = true;
8657 else if (r < 0)
8658 return r;
8659 } while (!done);
8660
8661 return 0;
8662}
8663
11fdf7f2
TL
8664int RGWRados::cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket,
8665 uint64_t start_epoch, uint64_t end_epoch)
7c673cae 8666{
11fdf7f2 8667 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8668
8669 rgw_rados_ref ref;
224ce89b 8670 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
8671 if (r < 0) {
8672 return r;
8673 }
8674
9f95a23c 8675 r = cls_rgw_usage_log_trim_repeat(ref, user, bucket, start_epoch, end_epoch);
11fdf7f2
TL
8676 return r;
8677}
8678
8679int RGWRados::cls_obj_usage_log_clear(string& oid)
8680{
8681 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8682
8683 rgw_rados_ref ref;
8684 int r = get_raw_obj_ref(obj, &ref);
8685 if (r < 0) {
8686 return r;
8687 }
8688 librados::ObjectWriteOperation op;
8689 cls_rgw_usage_log_clear(op);
9f95a23c 8690 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
8691 return r;
8692}
8693
11fdf7f2 8694
7c673cae
FG
8695int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
8696{
9f95a23c 8697 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
8698 string dir_oid;
8699
11fdf7f2 8700 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae 8701
9f95a23c 8702 int r = svc.bi_rados->open_bucket_index(bucket_info, &index_pool, &dir_oid);
7c673cae
FG
8703 if (r < 0)
8704 return r;
8705
8706 bufferlist updates;
8707
8708 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
8709 rgw_bucket_dir_entry entry;
8710 entry.key = *iter;
8711 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
8712 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
8713 updates.append(CEPH_RGW_REMOVE | suggest_flag);
11fdf7f2 8714 encode(entry, updates);
7c673cae
FG
8715 }
8716
8717 bufferlist out;
8718
9f95a23c 8719 r = index_pool.ioctx().exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
7c673cae
FG
8720
8721 return r;
8722}
8723
8724int RGWRados::check_disk_state(librados::IoCtx io_ctx,
8725 const RGWBucketInfo& bucket_info,
8726 rgw_bucket_dir_entry& list_state,
8727 rgw_bucket_dir_entry& object,
9f95a23c
TL
8728 bufferlist& suggested_updates,
8729 optional_yield y)
7c673cae
FG
8730{
8731 const rgw_bucket& bucket = bucket_info.bucket;
11fdf7f2 8732 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae
FG
8733
8734 std::string loc;
8735
8736 rgw_obj obj(bucket, list_state.key);
8737
8738 string oid;
8739 get_obj_bucket_and_oid_loc(obj, oid, loc);
8740
8741 if (loc != list_state.locator) {
8742 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
8743 }
8744
8745 io_ctx.locator_set_key(list_state.locator);
8746
8747 RGWObjState *astate = NULL;
9f95a23c
TL
8748 RGWObjectCtx rctx(this->store);
8749 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
8750 if (r < 0)
8751 return r;
8752
8753 list_state.pending_map.clear(); // we don't need this and it inflates size
9f95a23c 8754 if (!list_state.is_delete_marker() && !astate->exists) {
7c673cae
FG
8755 /* object doesn't exist right now -- hopefully because it's
8756 * marked as !exists and got deleted */
8757 if (list_state.exists) {
8758 /* FIXME: what should happen now? Work out if there are any
8759 * non-bad ways this could happen (there probably are, but annoying
8760 * to handle!) */
8761 }
8762 // encode a suggested removal of that key
8763 list_state.ver.epoch = io_ctx.get_last_version();
8764 list_state.ver.pool = io_ctx.get_id();
8765 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
8766 return -ENOENT;
8767 }
8768
8769 string etag;
8770 string content_type;
8771 ACLOwner owner;
8772
8773 object.meta.size = astate->size;
8774 object.meta.accounted_size = astate->accounted_size;
8775 object.meta.mtime = astate->mtime;
8776
8777 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
8778 if (iter != astate->attrset.end()) {
11fdf7f2 8779 etag = rgw_bl_str(iter->second);
7c673cae
FG
8780 }
8781 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
8782 if (iter != astate->attrset.end()) {
11fdf7f2 8783 content_type = rgw_bl_str(iter->second);
7c673cae
FG
8784 }
8785 iter = astate->attrset.find(RGW_ATTR_ACL);
8786 if (iter != astate->attrset.end()) {
8787 r = decode_policy(iter->second, &owner);
8788 if (r < 0) {
8789 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
8790 }
8791 }
8792
9f95a23c 8793 if (astate->manifest) {
7c673cae 8794 RGWObjManifest::obj_iterator miter;
9f95a23c 8795 RGWObjManifest& manifest = *astate->manifest;
7c673cae
FG
8796 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
8797 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
8798 rgw_obj loc;
9f95a23c 8799 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
7c673cae
FG
8800
8801 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
8802 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
494da23a 8803 r = delete_obj_index(loc, astate->mtime);
7c673cae
FG
8804 if (r < 0) {
8805 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
8806 }
8807 }
8808 }
8809 }
8810
8811 object.meta.etag = etag;
8812 object.meta.content_type = content_type;
8813 object.meta.owner = owner.get_id().to_str();
8814 object.meta.owner_display_name = owner.get_display_name();
8815
8816 // encode suggested updates
8817 list_state.ver.pool = io_ctx.get_id();
8818 list_state.ver.epoch = astate->epoch;
8819 list_state.meta.size = object.meta.size;
8820 list_state.meta.accounted_size = object.meta.accounted_size;
8821 list_state.meta.mtime = object.meta.mtime;
8822 list_state.meta.category = main_category;
8823 list_state.meta.etag = etag;
8824 list_state.meta.content_type = content_type;
8825 if (astate->obj_tag.length() > 0)
8826 list_state.tag = astate->obj_tag.c_str();
8827 list_state.meta.owner = owner.get_id().to_str();
8828 list_state.meta.owner_display_name = owner.get_display_name();
8829
8830 list_state.exists = true;
8831 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
8832 return 0;
8833}
8834
a8e16298 8835int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
7c673cae 8836{
9f95a23c 8837 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
8838 map<int, string> oids;
8839 map<int, struct rgw_cls_list_ret> list_results;
9f95a23c
TL
8840 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &oids, bucket_instance_ids);
8841 if (r < 0) {
8842 ldout(cct, 20) << "cls_bucket_head: open_bucket_index() returned "
8843 << r << dendl;
7c673cae 8844 return r;
9f95a23c 8845 }
7c673cae 8846
9f95a23c
TL
8847 r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
8848 if (r < 0) {
8849 ldout(cct, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
8850 << r << dendl;
7c673cae 8851 return r;
9f95a23c 8852 }
7c673cae
FG
8853
8854 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
8855 for(; iter != list_results.end(); ++iter) {
a8e16298 8856 headers.push_back(std::move(iter->second.dir.header));
7c673cae
FG
8857 }
8858 return 0;
8859}
8860
8861int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
8862{
9f95a23c 8863 RGWSI_RADOS::Pool index_pool;
7c673cae 8864 map<int, string> bucket_objs;
9f95a23c 8865 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
8866 if (r < 0)
8867 return r;
8868
8869 map<int, string>::iterator iter = bucket_objs.begin();
8870 for (; iter != bucket_objs.end(); ++iter) {
9f95a23c 8871 r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
7c673cae
FG
8872 if (r < 0) {
8873 ctx->put();
8874 break;
8875 } else {
8876 (*num_aio)++;
8877 }
8878 }
8879 return r;
8880}
8881
9f95a23c
TL
8882int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
8883 const rgw_bucket& bucket,
8884 uint64_t num_objs)
31f18b77 8885{
11fdf7f2 8886 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
31f18b77
FG
8887 return 0;
8888 }
8889
8890 bool need_resharding = false;
9f95a23c
TL
8891 uint32_t num_source_shards =
8892 (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
8893 const uint32_t max_dynamic_shards =
8894 uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
8895
8896 if (num_source_shards >= max_dynamic_shards) {
8897 return 0;
8898 }
31f18b77 8899
9f95a23c 8900 uint32_t suggested_num_shards = 0;
11fdf7f2
TL
8901 const uint64_t max_objs_per_shard =
8902 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9f95a23c
TL
8903
8904 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
8905 num_objs, need_resharding, &suggested_num_shards);
8906 if (! need_resharding) {
8907 return 0;
31f18b77
FG
8908 }
8909
9f95a23c
TL
8910 const uint32_t final_num_shards =
8911 RGWBucketReshard::get_preferred_shards(suggested_num_shards,
8912 max_dynamic_shards);
8913 // final verification, so we don't reduce number of shards
8914 if (final_num_shards <= num_source_shards) {
8915 return 0;
31f18b77
FG
8916 }
8917
9f95a23c
TL
8918 ldout(cct, 20) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
8919 " needs resharding; current num shards " << bucket_info.num_shards <<
8920 "; new num shards " << final_num_shards << " (suggested " <<
8921 suggested_num_shards << ")" << dendl;
8922
8923 return add_bucket_to_reshard(bucket_info, final_num_shards);
31f18b77
FG
8924}
8925
8926int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
8927{
9f95a23c 8928 RGWReshard reshard(this->store);
31f18b77
FG
8929
8930 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
8931
11fdf7f2 8932 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
31f18b77
FG
8933 if (new_num_shards <= num_source_shards) {
8934 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
8935 return 0;
8936 }
8937
8938 cls_rgw_reshard_entry entry;
8939 entry.time = real_clock::now();
8940 entry.tenant = bucket_info.owner.tenant;
8941 entry.bucket_name = bucket_info.bucket.name;
8942 entry.bucket_id = bucket_info.bucket.bucket_id;
8943 entry.old_num_shards = num_source_shards;
8944 entry.new_num_shards = new_num_shards;
8945
8946 return reshard.add(entry);
8947}
8948
7c673cae 8949int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
11fdf7f2 8950 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only)
7c673cae 8951{
11fdf7f2
TL
8952 // if we only check size, then num_objs will set to 0
8953 if(check_size_only)
8954 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size);
8955
7c673cae
FG
8956 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
8957}
8958
11fdf7f2
TL
8959int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
8960 int *shard_id)
7c673cae 8961{
11fdf7f2
TL
8962 int r = 0;
8963 switch (bucket_info.bucket_index_shard_hash_type) {
8964 case RGWBucketInfo::MOD:
8965 if (!bucket_info.num_shards) {
8966 if (shard_id) {
8967 *shard_id = -1;
8968 }
8969 } else {
9f95a23c 8970 uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, bucket_info.num_shards);
11fdf7f2
TL
8971 if (shard_id) {
8972 *shard_id = (int)sid;
8973 }
8974 }
8975 break;
8976 default:
8977 r = -ENOTSUP;
7c673cae 8978 }
11fdf7f2 8979 return r;
7c673cae
FG
8980}
8981
7c673cae
FG
8982uint64_t RGWRados::instance_id()
8983{
8984 return get_rados_handle()->get_instance_id();
8985}
8986
8987uint64_t RGWRados::next_bucket_id()
8988{
9f95a23c 8989 std::lock_guard l{bucket_id_lock};
7c673cae
FG
8990 return ++max_bucket_id;
8991}
8992
7c673cae
FG
8993librados::Rados* RGWRados::get_rados_handle()
8994{
494da23a 8995 return &rados;
7c673cae
FG
8996}
8997
8998int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
8999{
9000 rgw_rados_ref ref;
9001 int ret = get_raw_obj_ref(obj, &ref);
9002 if (ret < 0) {
9003 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
9004 return ret;
9005 }
9006
9007 ObjectWriteOperation op;
9008 list<string> prefixes;
9009 cls_rgw_remove_obj(op, prefixes);
9010
9f95a23c
TL
9011 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9012 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae
FG
9013 if (ret < 0) {
9014 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
9015 c->release();
9016 return ret;
9017 }
9018
9019 handles.push_back(c);
9020
9021 return 0;
9022}
9023
9024int RGWRados::delete_obj_aio(const rgw_obj& obj,
9025 RGWBucketInfo& bucket_info, RGWObjState *astate,
9f95a23c
TL
9026 list<librados::AioCompletion *>& handles, bool keep_index_consistent,
9027 optional_yield y)
7c673cae
FG
9028{
9029 rgw_rados_ref ref;
9030 int ret = get_obj_head_ref(bucket_info, obj, &ref);
9031 if (ret < 0) {
9032 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
9033 return ret;
9034 }
9035
9036 if (keep_index_consistent) {
9037 RGWRados::Bucket bop(this, bucket_info);
9038 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9039
9f95a23c 9040 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag, y);
7c673cae
FG
9041 if (ret < 0) {
9042 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
9043 return ret;
9044 }
9045 }
9046
9047 ObjectWriteOperation op;
9048 list<string> prefixes;
9049 cls_rgw_remove_obj(op, prefixes);
9050
9f95a23c
TL
9051 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9052 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae
FG
9053 if (ret < 0) {
9054 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
9055 c->release();
9056 return ret;
9057 }
9058
9059 handles.push_back(c);
9060
9061 if (keep_index_consistent) {
494da23a 9062 ret = delete_obj_index(obj, astate->mtime);
7c673cae
FG
9063 if (ret < 0) {
9064 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
9065 return ret;
9066 }
9067 }
9068 return ret;
9069}