]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
9f95a23c 2// vim: ts=8 sw=2 smarttab ft=cpp
7c673cae 3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
9f95a23c
TL
8#include <sstream>
9
7c673cae 10#include <boost/algorithm/string.hpp>
11fdf7f2 11#include <string_view>
7c673cae 12
11fdf7f2 13#include <boost/container/flat_set.hpp>
7c673cae
FG
14#include <boost/format.hpp>
15#include <boost/optional.hpp>
16#include <boost/utility/in_place_factory.hpp>
17
18#include "common/ceph_json.h"
7c673cae
FG
19
20#include "common/errno.h"
21#include "common/Formatter.h"
22#include "common/Throttle.h"
7c673cae 23
9f95a23c 24#include "rgw_sal.h"
11fdf7f2 25#include "rgw_zone.h"
7c673cae
FG
26#include "rgw_cache.h"
27#include "rgw_acl.h"
28#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
11fdf7f2 29#include "rgw_aio_throttle.h"
7c673cae
FG
30#include "rgw_bucket.h"
31#include "rgw_rest_conn.h"
32#include "rgw_cr_rados.h"
33#include "rgw_cr_rest.h"
11fdf7f2 34#include "rgw_putobj_processor.h"
7c673cae
FG
35
36#include "cls/rgw/cls_rgw_ops.h"
7c673cae
FG
37#include "cls/rgw/cls_rgw_client.h"
38#include "cls/rgw/cls_rgw_const.h"
39#include "cls/refcount/cls_refcount_client.h"
40#include "cls/version/cls_version_client.h"
c07f9fc5 41#include "osd/osd_types.h"
7c673cae
FG
42
43#include "rgw_tools.h"
44#include "rgw_coroutine.h"
45#include "rgw_compression.h"
9f95a23c 46#include "rgw_worker.h"
7c673cae 47
7c673cae
FG
48#undef fork // fails to compile RGWPeriod::fork() below
49
50#include "common/Clock.h"
51
7c673cae
FG
52using namespace librados;
53
54#include <string>
55#include <iostream>
56#include <vector>
57#include <atomic>
58#include <list>
59#include <map>
11fdf7f2 60#include "include/random.h"
7c673cae
FG
61
62#include "rgw_gc.h"
63#include "rgw_lc.h"
64
65#include "rgw_object_expirer_core.h"
66#include "rgw_sync.h"
81eedcae 67#include "rgw_sync_counters.h"
11fdf7f2 68#include "rgw_sync_trace.h"
9f95a23c
TL
69#include "rgw_trim_datalog.h"
70#include "rgw_trim_mdlog.h"
7c673cae
FG
71#include "rgw_data_sync.h"
72#include "rgw_realm_watcher.h"
31f18b77 73#include "rgw_reshard.h"
7c673cae 74
11fdf7f2
TL
75#include "services/svc_zone.h"
76#include "services/svc_zone_utils.h"
77#include "services/svc_quota.h"
78#include "services/svc_sync_modules.h"
79#include "services/svc_sys_obj.h"
80#include "services/svc_sys_obj_cache.h"
9f95a23c
TL
81#include "services/svc_bucket.h"
82#include "services/svc_mdlog.h"
83#include "services/svc_datalog_rados.h"
11fdf7f2 84
7c673cae
FG
85#include "compressor/Compressor.h"
86
11fdf7f2
TL
87#ifdef WITH_LTTNG
88#define TRACEPOINT_DEFINE
89#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
90#include "tracing/rgw_rados.h"
91#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
92#undef TRACEPOINT_DEFINE
93#else
94#define tracepoint(...)
95#endif
96
7c673cae
FG
97#define dout_context g_ceph_context
98#define dout_subsys ceph_subsys_rgw
99
7c673cae 100
7c673cae 101static string shadow_ns = "shadow";
7c673cae
FG
102static string default_bucket_index_pool_suffix = "rgw.buckets.index";
103static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
11fdf7f2 104
11fdf7f2 105static RGWObjCategory main_category = RGWObjCategory::Main;
7c673cae 106#define RGW_USAGE_OBJ_PREFIX "usage."
7c673cae
FG
107
108#define dout_subsys ceph_subsys_rgw
109
110
111static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
112 const rgw_placement_rule& head_placement_rule,
113 const rgw_obj& obj, rgw_pool *pool)
7c673cae 114{
11fdf7f2 115 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
7c673cae 116 RGWZonePlacementInfo placement;
11fdf7f2 117 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
7c673cae
FG
118 return false;
119 }
120
121 if (!obj.in_extra_data) {
11fdf7f2 122 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
7c673cae 123 } else {
31f18b77 124 *pool = placement.get_data_extra_pool();
7c673cae
FG
125 }
126 }
127
128 return true;
129}
130
131static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
132 const rgw_placement_rule& head_placement_rule,
133 const rgw_obj& obj, rgw_raw_obj *raw_obj)
7c673cae
FG
134{
135 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
136
11fdf7f2 137 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
7c673cae
FG
138}
139
140rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
141{
142 if (!is_raw) {
143 rgw_raw_obj r;
144 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
145 return r;
146 }
147 return raw_obj;
148}
149
150rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
151{
152 if (!is_raw) {
153 rgw_raw_obj r;
154 store->obj_to_raw(placement_rule, obj, &r);
155 return r;
156 }
157 return raw_obj;
158}
159
11fdf7f2
TL
160void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
161{
162 obj_version *check_objv = version_for_check();
7c673cae 163
11fdf7f2
TL
164 if (check_objv) {
165 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae
FG
166 }
167
11fdf7f2 168 cls_version_read(*op, &read_version);
7c673cae
FG
169}
170
11fdf7f2
TL
171void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
172{
173 obj_version *check_objv = version_for_check();
174 obj_version *modify_version = version_for_write();
7c673cae 175
11fdf7f2
TL
176 if (check_objv) {
177 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae 178 }
7c673cae 179
11fdf7f2
TL
180 if (modify_version) {
181 cls_version_set(*op, *modify_version);
182 } else {
183 cls_version_inc(*op);
7c673cae 184 }
7c673cae
FG
185}
186
f91f0fd5
TL
187void RGWObjVersionTracker::apply_write()
188{
189 const bool checked = (read_version.ver != 0);
190 const bool incremented = (write_version.ver == 0);
191
192 if (checked && incremented) {
193 // apply cls_version_inc() so our next operation can recheck it
194 ++read_version.ver;
195 } else {
196 read_version = write_version;
197 }
198 write_version = obj_version();
199}
200
9f95a23c 201RGWObjState::RGWObjState() {
7c673cae
FG
202}
203
9f95a23c 204RGWObjState::~RGWObjState() {
7c673cae
FG
205}
206
9f95a23c
TL
207RGWObjState::RGWObjState(const RGWObjState& rhs) : obj (rhs.obj) {
208 is_atomic = rhs.is_atomic;
209 has_attrs = rhs.has_attrs;
210 exists = rhs.exists;
211 size = rhs.size;
212 accounted_size = rhs.accounted_size;
213 mtime = rhs.mtime;
214 epoch = rhs.epoch;
215 if (rhs.obj_tag.length()) {
216 obj_tag = rhs.obj_tag;
7c673cae 217 }
9f95a23c
TL
218 if (rhs.tail_tag.length()) {
219 tail_tag = rhs.tail_tag;
7c673cae 220 }
9f95a23c
TL
221 write_tag = rhs.write_tag;
222 fake_tag = rhs.fake_tag;
223 manifest = rhs.manifest;
224 shadow_obj = rhs.shadow_obj;
225 has_data = rhs.has_data;
226 if (rhs.data.length()) {
227 data = rhs.data;
7c673cae 228 }
9f95a23c
TL
229 prefetch_data = rhs.prefetch_data;
230 keep_tail = rhs.keep_tail;
231 is_olh = rhs.is_olh;
232 objv_tracker = rhs.objv_tracker;
233 pg_ver = rhs.pg_ver;
7c673cae
FG
234}
235
9f95a23c
TL
236RGWObjState *RGWObjectCtx::get_state(const rgw_obj& obj) {
237 RGWObjState *result;
238 typename std::map<rgw_obj, RGWObjState>::iterator iter;
239 lock.lock_shared();
240 assert (!obj.empty());
241 iter = objs_state.find(obj);
242 if (iter != objs_state.end()) {
243 result = &iter->second;
244 lock.unlock_shared();
245 } else {
246 lock.unlock_shared();
247 lock.lock();
248 result = &objs_state[obj];
249 lock.unlock();
224ce89b 250 }
9f95a23c 251 return result;
7c673cae
FG
252}
253
9f95a23c
TL
254void RGWObjectCtx::set_atomic(rgw_obj& obj) {
255 std::unique_lock wl{lock};
256 assert (!obj.empty());
257 objs_state[obj].is_atomic = true;
7c673cae 258}
9f95a23c
TL
259void RGWObjectCtx::set_prefetch_data(const rgw_obj& obj) {
260 std::unique_lock wl{lock};
261 assert (!obj.empty());
262 objs_state[obj].prefetch_data = true;
7c673cae
FG
263}
264
9f95a23c
TL
265void RGWObjectCtx::invalidate(const rgw_obj& obj) {
266 std::unique_lock wl{lock};
267 auto iter = objs_state.find(obj);
268 if (iter == objs_state.end()) {
11fdf7f2 269 return;
7c673cae 270 }
9f95a23c
TL
271 bool is_atomic = iter->second.is_atomic;
272 bool prefetch_data = iter->second.prefetch_data;
7c673cae 273
9f95a23c 274 objs_state.erase(iter);
7c673cae 275
9f95a23c
TL
276 if (is_atomic || prefetch_data) {
277 auto& state = objs_state[obj];
278 state.is_atomic = is_atomic;
279 state.prefetch_data = prefetch_data;
11fdf7f2 280 }
7c673cae
FG
281}
282
11fdf7f2 283void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
7c673cae 284{
11fdf7f2
TL
285 write_version.ver = 1;
286#define TAG_LEN 24
7c673cae 287
11fdf7f2
TL
288 write_version.tag.clear();
289 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
7c673cae
FG
290}
291
7c673cae
FG
292class RGWMetaNotifierManager : public RGWCoroutinesManager {
293 RGWRados *store;
294 RGWHTTPManager http_manager;
295
296public:
297 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
298 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 299 http_manager.start();
7c673cae
FG
300 }
301
9f95a23c 302 int notify_all(map<rgw_zone_id, RGWRESTConn *>& conn_map, set<int>& shards) {
7c673cae
FG
303 rgw_http_param_pair pairs[] = { { "type", "metadata" },
304 { "notify", NULL },
305 { NULL, NULL } };
306
307 list<RGWCoroutinesStack *> stacks;
9f95a23c 308 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
309 RGWRESTConn *conn = iter->second;
310 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
311 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
312
313 stacks.push_back(stack);
314 }
315 return run(stacks);
316 }
317};
318
319class RGWDataNotifierManager : public RGWCoroutinesManager {
320 RGWRados *store;
321 RGWHTTPManager http_manager;
322
323public:
324 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
325 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 326 http_manager.start();
7c673cae
FG
327 }
328
9f95a23c 329 int notify_all(map<rgw_zone_id, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
7c673cae
FG
330 rgw_http_param_pair pairs[] = { { "type", "data" },
331 { "notify", NULL },
11fdf7f2 332 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
7c673cae
FG
333 { NULL, NULL } };
334
335 list<RGWCoroutinesStack *> stacks;
9f95a23c 336 for (auto iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
7c673cae
FG
337 RGWRESTConn *conn = iter->second;
338 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
339 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
340
341 stacks.push_back(stack);
342 }
343 return run(stacks);
344 }
345};
346
11fdf7f2
TL
347/* class RGWRadosThread */
348
7c673cae
FG
349void RGWRadosThread::start()
350{
351 worker = new Worker(cct, this);
352 worker->create(thread_name.c_str());
353}
354
355void RGWRadosThread::stop()
356{
357 down_flag = true;
358 stop_process();
359 if (worker) {
31f18b77 360 worker->signal();
7c673cae
FG
361 worker->join();
362 }
363 delete worker;
364 worker = NULL;
365}
366
367void *RGWRadosThread::Worker::entry() {
368 uint64_t msec = processor->interval_msec();
9f95a23c 369 auto interval = std::chrono::milliseconds(msec);
7c673cae
FG
370
371 do {
9f95a23c 372 auto start = ceph::real_clock::now();
7c673cae
FG
373 int r = processor->process();
374 if (r < 0) {
375 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
376 }
377
378 if (processor->going_down())
379 break;
380
9f95a23c 381 auto end = ceph::real_clock::now() - start;
7c673cae
FG
382
383 uint64_t cur_msec = processor->interval_msec();
384 if (cur_msec != msec) { /* was it reconfigured? */
385 msec = cur_msec;
9f95a23c 386 interval = std::chrono::milliseconds(msec);
7c673cae
FG
387 }
388
389 if (cur_msec > 0) {
390 if (interval <= end)
391 continue; // next round
392
9f95a23c 393 auto wait_time = interval - end;
31f18b77 394 wait_interval(wait_time);
7c673cae 395 } else {
31f18b77 396 wait();
7c673cae
FG
397 }
398 } while (!processor->going_down());
399
400 return NULL;
401}
402
403class RGWMetaNotifier : public RGWRadosThread {
404 RGWMetaNotifierManager notify_mgr;
405 RGWMetadataLog *const log;
406
407 uint64_t interval_msec() override {
408 return cct->_conf->rgw_md_notify_interval_msec;
409 }
1adf2230
AA
410 void stop_process() override {
411 notify_mgr.stop();
412 }
7c673cae
FG
413public:
414 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
415 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
416
417 int process() override;
418};
419
420int RGWMetaNotifier::process()
421{
422 set<int> shards;
423
424 log->read_clear_modified(shards);
425
426 if (shards.empty()) {
427 return 0;
428 }
429
430 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
431 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
432 }
433
11fdf7f2 434 notify_mgr.notify_all(store->svc.zone->get_zone_conn_map(), shards);
7c673cae
FG
435
436 return 0;
437}
438
439class RGWDataNotifier : public RGWRadosThread {
440 RGWDataNotifierManager notify_mgr;
441
442 uint64_t interval_msec() override {
11fdf7f2 443 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 444 }
1adf2230
AA
445 void stop_process() override {
446 notify_mgr.stop();
447 }
7c673cae
FG
448public:
449 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
450
451 int process() override;
452};
453
454int RGWDataNotifier::process()
455{
9f95a23c
TL
456 auto data_log = store->svc.datalog_rados->get_log();
457 if (!data_log) {
7c673cae
FG
458 return 0;
459 }
460
461 map<int, set<string> > shards;
462
9f95a23c 463 data_log->read_clear_modified(shards);
7c673cae
FG
464
465 if (shards.empty()) {
466 return 0;
467 }
468
469 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
470 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
471 }
472
11fdf7f2 473 notify_mgr.notify_all(store->svc.zone->get_zone_data_notify_to_map(), shards);
7c673cae
FG
474
475 return 0;
476}
477
478class RGWSyncProcessorThread : public RGWRadosThread {
479public:
480 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
481 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
482 ~RGWSyncProcessorThread() override {}
483 int init() override = 0 ;
484 int process() override = 0;
485};
486
487class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
488{
489 RGWMetaSyncStatusManager sync;
490
491 uint64_t interval_msec() override {
492 return 0; /* no interval associated, it'll run once until stopped */
493 }
494 void stop_process() override {
495 sync.stop();
496 }
497public:
9f95a23c
TL
498 RGWMetaSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados)
499 : RGWSyncProcessorThread(_store->getRados(), "meta-sync"), sync(_store, async_rados) {}
7c673cae
FG
500
501 void wakeup_sync_shards(set<int>& shard_ids) {
502 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
503 sync.wakeup(*iter);
504 }
505 }
506 RGWMetaSyncStatusManager* get_manager() { return &sync; }
507
508 int init() override {
509 int ret = sync.init();
510 if (ret < 0) {
511 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
512 return ret;
513 }
514 return 0;
515 }
516
517 int process() override {
518 sync.run();
519 return 0;
520 }
521};
522
523class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
524{
81eedcae 525 PerfCountersRef counters;
7c673cae
FG
526 RGWDataSyncStatusManager sync;
527 bool initialized;
528
529 uint64_t interval_msec() override {
530 if (initialized) {
531 return 0; /* no interval associated, it'll run once until stopped */
532 } else {
533#define DATA_SYNC_INIT_WAIT_SEC 20
534 return DATA_SYNC_INIT_WAIT_SEC * 1000;
535 }
536 }
537 void stop_process() override {
538 sync.stop();
539 }
540public:
9f95a23c 541 RGWDataSyncProcessorThread(rgw::sal::RGWRadosStore *_store, RGWAsyncRadosProcessor *async_rados,
81eedcae 542 const RGWZone* source_zone)
9f95a23c 543 : RGWSyncProcessorThread(_store->getRados(), "data-sync"),
81eedcae
TL
544 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
545 sync(_store, async_rados, source_zone->id, counters.get()),
7c673cae
FG
546 initialized(false) {}
547
548 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
549 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
550 sync.wakeup(iter->first, iter->second);
551 }
552 }
553 RGWDataSyncStatusManager* get_manager() { return &sync; }
554
555 int init() override {
556 return 0;
557 }
558
559 int process() override {
560 while (!initialized) {
561 if (going_down()) {
562 return 0;
563 }
564 int ret = sync.init();
565 if (ret >= 0) {
566 initialized = true;
567 break;
568 }
569 /* we'll be back! */
570 return 0;
571 }
572 sync.run();
573 return 0;
574 }
575};
576
11fdf7f2 577class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
7c673cae
FG
578{
579 RGWCoroutinesManager crs;
9f95a23c 580 rgw::sal::RGWRadosStore *store;
b32b8144 581 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
582 RGWHTTPManager http;
583 const utime_t trim_interval;
584
585 uint64_t interval_msec() override { return 0; }
586 void stop_process() override { crs.stop(); }
587public:
9f95a23c 588 RGWSyncLogTrimThread(rgw::sal::RGWRadosStore *store, rgw::BucketTrimManager *bucket_trim,
b32b8144 589 int interval)
9f95a23c
TL
590 : RGWSyncProcessorThread(store->getRados(), "sync-log-trim"),
591 crs(store->ctx(), store->getRados()->get_cr_registry()), store(store),
b32b8144 592 bucket_trim(bucket_trim),
7c673cae
FG
593 http(store->ctx(), crs.get_completion_mgr()),
594 trim_interval(interval, 0)
595 {}
596
597 int init() override {
11fdf7f2 598 return http.start();
7c673cae
FG
599 }
600 int process() override {
601 list<RGWCoroutinesStack*> stacks;
602 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
11fdf7f2 603 meta->call(create_meta_log_trim_cr(this, store, &http,
7c673cae
FG
604 cct->_conf->rgw_md_log_max_shards,
605 trim_interval));
606 stacks.push_back(meta);
607
9f95a23c
TL
608 if (store->svc()->zone->sync_module_exports_data()) {
609 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
610 data->call(create_data_log_trim_cr(store, &http,
611 cct->_conf->rgw_data_log_num_shards,
612 trim_interval));
613 stacks.push_back(data);
7c673cae 614
9f95a23c
TL
615 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
616 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
617 stacks.push_back(bucket);
618 }
b32b8144 619
7c673cae
FG
620 crs.run(stacks);
621 return 0;
622 }
11fdf7f2
TL
623
624 // implements DoutPrefixProvider
625 CephContext *get_cct() const override { return store->ctx(); }
9f95a23c 626 unsigned get_subsys() const override
11fdf7f2
TL
627 {
628 return dout_subsys;
629 }
630
9f95a23c 631 std::ostream& gen_prefix(std::ostream& out) const override
11fdf7f2
TL
632 {
633 return out << "sync log trim: ";
634 }
635
7c673cae
FG
636};
637
638void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
639{
9f95a23c 640 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
641 if (meta_sync_processor_thread) {
642 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
643 }
644}
645
9f95a23c 646void RGWRados::wakeup_data_sync_shards(const rgw_zone_id& source_zone, map<int, set<string> >& shard_ids)
7c673cae
FG
647{
648 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
9f95a23c
TL
649 std::lock_guard l{data_sync_thread_lock};
650 auto iter = data_sync_processor_threads.find(source_zone);
7c673cae
FG
651 if (iter == data_sync_processor_threads.end()) {
652 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
653 return;
654 }
655
656 RGWDataSyncProcessorThread *thread = iter->second;
11fdf7f2 657 ceph_assert(thread);
7c673cae
FG
658 thread->wakeup_sync_shards(shard_ids);
659}
660
661RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
662{
9f95a23c 663 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
664 if (meta_sync_processor_thread) {
665 return meta_sync_processor_thread->get_manager();
666 }
667 return nullptr;
668}
669
9f95a23c 670RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const rgw_zone_id& source_zone)
7c673cae 671{
9f95a23c 672 std::lock_guard l{data_sync_thread_lock};
7c673cae
FG
673 auto thread = data_sync_processor_threads.find(source_zone);
674 if (thread == data_sync_processor_threads.end()) {
675 return nullptr;
676 }
677 return thread->second->get_manager();
678}
679
680int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
681{
682 IoCtx ioctx;
494da23a 683 int r = open_pool_ctx(pool, ioctx, false);
7c673cae
FG
684 if (r < 0) {
685 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
686 return r;
687 }
688
689 bool requires;
690 r = ioctx.pool_requires_alignment2(&requires);
691 if (r < 0) {
692 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
693 << r << dendl;
694 return r;
695 }
696
697 if (!requires) {
698 *alignment = 0;
699 return 0;
700 }
701
702 uint64_t align;
703 r = ioctx.pool_required_alignment2(&align);
704 if (r < 0) {
705 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
706 << r << dendl;
707 return r;
708 }
709 if (align != 0) {
710 ldout(cct, 20) << "required alignment=" << align << dendl;
711 }
712 *alignment = align;
713 return 0;
714}
715
11fdf7f2
TL
716void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
717{
718 if (alignment == 0) {
719 *max_size = size;
720 return;
721 }
722
723 if (size <= alignment) {
724 *max_size = alignment;
725 return;
726 }
727
728 *max_size = size - (size % alignment);
729}
730
731int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment)
7c673cae 732{
11fdf7f2 733 uint64_t alignment;
7c673cae
FG
734 int r = get_required_alignment(pool, &alignment);
735 if (r < 0) {
736 return r;
737 }
738
11fdf7f2
TL
739 if (palignment) {
740 *palignment = alignment;
7c673cae
FG
741 }
742
11fdf7f2 743 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
7c673cae 744
11fdf7f2 745 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
7c673cae
FG
746
747 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
748
749 return 0;
750}
751
11fdf7f2
TL
752int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
753 uint64_t *max_chunk_size, uint64_t *palignment)
7c673cae
FG
754{
755 rgw_pool pool;
756 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
757 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
758 return -EIO;
759 }
11fdf7f2 760 return get_max_chunk_size(pool, max_chunk_size, palignment);
7c673cae
FG
761}
762
31f18b77
FG
763class RGWIndexCompletionManager;
764
765struct complete_op_data {
9f95a23c 766 ceph::mutex lock = ceph::make_mutex("complete_op_data");
31f18b77
FG
767 AioCompletion *rados_completion{nullptr};
768 int manager_shard_id{-1};
769 RGWIndexCompletionManager *manager{nullptr};
770 rgw_obj obj;
771 RGWModifyOp op;
772 string tag;
773 rgw_bucket_entry_ver ver;
774 cls_rgw_obj_key key;
775 rgw_bucket_dir_entry_meta dir_meta;
776 list<cls_rgw_obj_key> remove_objs;
777 bool log_op;
778 uint16_t bilog_op;
779 rgw_zone_set zones_trace;
780
781 bool stopped{false};
782
783 void stop() {
9f95a23c 784 std::lock_guard l{lock};
31f18b77
FG
785 stopped = true;
786 }
787};
788
789class RGWIndexCompletionThread : public RGWRadosThread {
790 RGWRados *store;
791
792 uint64_t interval_msec() override {
793 return 0;
794 }
795
796 list<complete_op_data *> completions;
797
9f95a23c
TL
798 ceph::mutex completions_lock =
799 ceph::make_mutex("RGWIndexCompletionThread::completions_lock");
31f18b77
FG
800public:
801 RGWIndexCompletionThread(RGWRados *_store)
9f95a23c 802 : RGWRadosThread(_store, "index-complete"), store(_store) {}
31f18b77
FG
803
804 int process() override;
805
806 void add_completion(complete_op_data *completion) {
807 {
9f95a23c 808 std::lock_guard l{completions_lock};
31f18b77
FG
809 completions.push_back(completion);
810 }
811
812 signal();
813 }
814};
815
816int RGWIndexCompletionThread::process()
817{
818 list<complete_op_data *> comps;
819
820 {
9f95a23c 821 std::lock_guard l{completions_lock};
31f18b77
FG
822 completions.swap(comps);
823 }
824
825 for (auto c : comps) {
826 std::unique_ptr<complete_op_data> up{c};
827
828 if (going_down()) {
829 continue;
830 }
831 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
832
833 RGWRados::BucketShard bs(store);
f64942e4 834 RGWBucketInfo bucket_info;
31f18b77 835
f64942e4 836 int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
31f18b77
FG
837 if (r < 0) {
838 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
839 /* not much to do */
840 continue;
841 }
842
f64942e4
AA
843 r = store->guard_reshard(&bs, c->obj, bucket_info,
844 [&](RGWRados::BucketShard *bs) -> int {
845 librados::ObjectWriteOperation o;
846 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
847 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
848 c->log_op, c->bilog_op, &c->zones_trace);
9f95a23c 849 return bs->bucket_obj.operate(&o, null_yield);
31f18b77
FG
850 });
851 if (r < 0) {
852 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
853 /* ignoring error, can't do anything about it */
854 continue;
855 }
9f95a23c 856 r = store->svc.datalog_rados->add_entry(bucket_info, bs.shard_id);
31f18b77
FG
857 if (r < 0) {
858 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
859 }
860 }
861
862 return 0;
863}
864
865class RGWIndexCompletionManager {
866 RGWRados *store{nullptr};
9f95a23c 867 ceph::containers::tiny_vector<ceph::mutex> locks;
31f18b77
FG
868 vector<set<complete_op_data *> > completions;
869
870 RGWIndexCompletionThread *completion_thread{nullptr};
871
872 int num_shards;
873
874 std::atomic<int> cur_shard {0};
875
876
877public:
9f95a23c
TL
878 RGWIndexCompletionManager(RGWRados *_store) :
879 store(_store),
880 locks{ceph::make_lock_container<ceph::mutex>(
881 store->ctx()->_conf->rgw_thread_pool_size,
882 [](const size_t i) {
883 return ceph::make_mutex("RGWIndexCompletionManager::lock::" +
884 std::to_string(i));
885 })}
886 {
31f18b77 887 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
31f18b77
FG
888 completions.resize(num_shards);
889 }
890 ~RGWIndexCompletionManager() {
891 stop();
31f18b77
FG
892 }
893
894 int next_shard() {
895 int result = cur_shard % num_shards;
896 cur_shard++;
897 return result;
898 }
899
900 void create_completion(const rgw_obj& obj,
901 RGWModifyOp op, string& tag,
902 rgw_bucket_entry_ver& ver,
903 const cls_rgw_obj_key& key,
904 rgw_bucket_dir_entry_meta& dir_meta,
905 list<cls_rgw_obj_key> *remove_objs, bool log_op,
906 uint16_t bilog_op,
907 rgw_zone_set *zones_trace,
908 complete_op_data **result);
909 bool handle_completion(completion_t cb, complete_op_data *arg);
910
911 int start() {
912 completion_thread = new RGWIndexCompletionThread(store);
913 int ret = completion_thread->init();
914 if (ret < 0) {
915 return ret;
916 }
917 completion_thread->start();
918 return 0;
919 }
920 void stop() {
921 if (completion_thread) {
922 completion_thread->stop();
923 delete completion_thread;
924 }
925
926 for (int i = 0; i < num_shards; ++i) {
9f95a23c 927 std::lock_guard l{locks[i]};
31f18b77 928 for (auto c : completions[i]) {
31f18b77
FG
929 c->stop();
930 }
931 }
932 completions.clear();
933 }
934};
935
936static void obj_complete_cb(completion_t cb, void *arg)
937{
938 complete_op_data *completion = (complete_op_data *)arg;
9f95a23c 939 completion->lock.lock();
31f18b77 940 if (completion->stopped) {
9f95a23c 941 completion->lock.unlock(); /* can drop lock, no one else is referencing us */
31f18b77
FG
942 delete completion;
943 return;
944 }
945 bool need_delete = completion->manager->handle_completion(cb, completion);
9f95a23c 946 completion->lock.unlock();
31f18b77
FG
947 if (need_delete) {
948 delete completion;
949 }
950}
951
952
953void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
954 RGWModifyOp op, string& tag,
955 rgw_bucket_entry_ver& ver,
956 const cls_rgw_obj_key& key,
957 rgw_bucket_dir_entry_meta& dir_meta,
958 list<cls_rgw_obj_key> *remove_objs, bool log_op,
959 uint16_t bilog_op,
960 rgw_zone_set *zones_trace,
961 complete_op_data **result)
962{
963 complete_op_data *entry = new complete_op_data;
964
965 int shard_id = next_shard();
966
967 entry->manager_shard_id = shard_id;
968 entry->manager = this;
969 entry->obj = obj;
970 entry->op = op;
971 entry->tag = tag;
972 entry->ver = ver;
973 entry->key = key;
974 entry->dir_meta = dir_meta;
975 entry->log_op = log_op;
976 entry->bilog_op = bilog_op;
977
978 if (remove_objs) {
979 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
980 entry->remove_objs.push_back(*iter);
981 }
982 }
983
984 if (zones_trace) {
985 entry->zones_trace = *zones_trace;
986 } else {
9f95a23c 987 entry->zones_trace.insert(store->svc.zone->get_zone().id, obj.bucket.get_key());
31f18b77
FG
988 }
989
990 *result = entry;
991
9f95a23c 992 entry->rados_completion = librados::Rados::aio_create_completion(entry, obj_complete_cb);
31f18b77 993
9f95a23c 994 std::lock_guard l{locks[shard_id]};
31f18b77
FG
995 completions[shard_id].insert(entry);
996}
997
998bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
999{
1000 int shard_id = arg->manager_shard_id;
1001 {
9f95a23c 1002 std::lock_guard l{locks[shard_id]};
31f18b77
FG
1003
1004 auto& comps = completions[shard_id];
1005
1006 auto iter = comps.find(arg);
1007 if (iter == comps.end()) {
1008 return true;
1009 }
1010
1011 comps.erase(iter);
1012 }
1013
1014 int r = rados_aio_get_return_value(cb);
1015 if (r != -ERR_BUSY_RESHARDING) {
1016 return true;
1017 }
1018 completion_thread->add_completion(arg);
1019 return false;
1020}
1021
7c673cae
FG
1022void RGWRados::finalize()
1023{
1024 if (run_sync_thread) {
9f95a23c 1025 std::lock_guard l{meta_sync_thread_lock};
7c673cae
FG
1026 meta_sync_processor_thread->stop();
1027
9f95a23c 1028 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1029 for (auto iter : data_sync_processor_threads) {
1030 RGWDataSyncProcessorThread *thread = iter.second;
1031 thread->stop();
1032 }
1033 if (sync_log_trimmer) {
1034 sync_log_trimmer->stop();
1035 }
1036 }
7c673cae
FG
1037 if (run_sync_thread) {
1038 delete meta_sync_processor_thread;
1039 meta_sync_processor_thread = NULL;
9f95a23c 1040 std::lock_guard dl{data_sync_thread_lock};
7c673cae
FG
1041 for (auto iter : data_sync_processor_threads) {
1042 RGWDataSyncProcessorThread *thread = iter.second;
1043 delete thread;
1044 }
1045 data_sync_processor_threads.clear();
1046 delete sync_log_trimmer;
1047 sync_log_trimmer = nullptr;
b32b8144 1048 bucket_trim = boost::none;
7c673cae 1049 }
7c673cae
FG
1050 if (meta_notifier) {
1051 meta_notifier->stop();
1052 delete meta_notifier;
1053 }
1054 if (data_notifier) {
1055 data_notifier->stop();
1056 delete data_notifier;
1057 }
11fdf7f2 1058 delete sync_tracer;
11fdf7f2
TL
1059
1060 delete lc;
1061 lc = NULL;
7c673cae 1062
11fdf7f2
TL
1063 delete gc;
1064 gc = NULL;
7c673cae 1065
11fdf7f2
TL
1066 delete obj_expirer;
1067 obj_expirer = NULL;
7c673cae 1068
11fdf7f2
TL
1069 RGWQuotaHandler::free_handler(quota_handler);
1070 if (cr_registry) {
1071 cr_registry->put();
7c673cae
FG
1072 }
1073
11fdf7f2 1074 svc.shutdown();
7c673cae 1075
11fdf7f2
TL
1076 delete binfo_cache;
1077 delete obj_tombstone_cache;
7c673cae 1078
11fdf7f2
TL
1079 if (reshard_wait.get()) {
1080 reshard_wait->stop();
1081 reshard_wait.reset();
7c673cae
FG
1082 }
1083
11fdf7f2
TL
1084 if (run_reshard_thread) {
1085 reshard->stop_processor();
7c673cae 1086 }
11fdf7f2
TL
1087 delete reshard;
1088 delete index_completion_manager;
1089}
1090
1091/**
1092 * Initialize the RADOS instance and prepare to do other ops
1093 * Returns 0 on success, -ERR# on failure.
1094 */
1095int RGWRados::init_rados()
1096{
1097 int ret = 0;
7c673cae 1098
494da23a
TL
1099 ret = rados.init_with_context(cct);
1100 if (ret < 0) {
1101 return ret;
1102 }
1103 ret = rados.connect();
1104 if (ret < 0) {
1105 return ret;
7c673cae 1106 }
11fdf7f2
TL
1107
1108 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1109 new RGWCoroutinesManagerRegistry(cct)};
1110 ret = crs->hook_to_admin_command("cr dump");
1111 if (ret < 0) {
1112 return ret;
7c673cae
FG
1113 }
1114
11fdf7f2 1115 cr_registry = crs.release();
11fdf7f2 1116 return ret;
7c673cae
FG
1117}
1118
11fdf7f2 1119int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
7c673cae 1120{
11fdf7f2 1121 map<string,string> metadata = meta;
494da23a 1122 metadata["num_handles"] = "1"s;
11fdf7f2
TL
1123 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1124 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1125 metadata["zone_name"] = svc.zone->zone_name();
9f95a23c 1126 metadata["zone_id"] = svc.zone->zone_id().id;
11fdf7f2
TL
1127 string name = cct->_conf->name.get_id();
1128 if (name.compare(0, 4, "rgw.") == 0) {
1129 name = name.substr(4);
7c673cae 1130 }
494da23a 1131 int ret = rados.service_daemon_register(daemon_type, name, metadata);
11fdf7f2
TL
1132 if (ret < 0) {
1133 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1134 return ret;
7c673cae
FG
1135 }
1136
1137 return 0;
1138}
1139
11fdf7f2 1140int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
7c673cae 1141{
494da23a 1142 int ret = rados.service_daemon_update_status(move(status));
11fdf7f2
TL
1143 if (ret < 0) {
1144 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1145 return ret;
1146 }
1147
1148 return 0;
7c673cae
FG
1149}
1150
1151/**
1152 * Initialize the RADOS instance and prepare to do other ops
1153 * Returns 0 on success, -ERR# on failure.
1154 */
// NOTE: the header above is the same text used for init_rados() and
// initialize(). What this function does: it opens the pool io contexts and
// then creates/starts the GC, object expirer, notifiers, sync tracer, sync
// threads, bucket info cache, lifecycle, quota handler, reshard and index
// completion helpers. The first failing step's negative code is returned
// as-is; objects created by earlier steps are not released here.
1155int RGWRados::init_complete()
1156{
11fdf7f2 1157 int ret;
7c673cae 1158
11fdf7f2
TL
1159 /*
1160 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1161 */
9f95a23c 1162 sync_module = svc.sync_modules->get_sync_module();
7c673cae
FG
1163
// open the io contexts for the root, gc, lc, object-expirer and reshard
// pools; stop at the first failure
1164 ret = open_root_pool_ctx();
1165 if (ret < 0)
1166 return ret;
1167
1168 ret = open_gc_pool_ctx();
1169 if (ret < 0)
1170 return ret;
1171
1172 ret = open_lc_pool_ctx();
1173 if (ret < 0)
1174 return ret;
1175
1176 ret = open_objexp_pool_ctx();
1177 if (ret < 0)
1178 return ret;
1179
31f18b77
FG
1180 ret = open_reshard_pool_ctx();
1181 if (ret < 0)
1182 return ret;
1183
7c673cae
FG
// only set once all five pool contexts were opened
1184 pools_initialized = true;
1185
1186 gc = new RGWGC();
1187 gc->initialize(cct, this);
1188
9f95a23c 1189 obj_expirer = new RGWObjectExpirer(this->store);
7c673cae
FG
1190
// gc and the object expirer are always created; the same flag gates
// whether their background processors are started
1191 if (use_gc_thread) {
1192 gc->start_processor();
1193 obj_expirer->start_processor();
1194 }
1195
11fdf7f2
TL
1196 auto& current_period = svc.zone->get_current_period();
1197 auto& zonegroup = svc.zone->get_zonegroup();
1198 auto& zone_params = svc.zone->get_zone_params();
1199 auto& zone = svc.zone->get_zone();
1200
7c673cae
FG
1201 /* no point of running sync thread if we don't have a master zone configured
1202 or there is no rest_master_conn */
9f95a23c 1203 if (!svc.zone->need_to_sync()) {
7c673cae
FG
1204 run_sync_thread = false;
1205 }
1206
// the metadata notifier is only created on the metadata master
11fdf7f2 1207 if (svc.zone->is_meta_master()) {
9f95a23c 1208 auto md_log = svc.mdlog->get_log(current_period.get_id());
7c673cae
FG
1209 meta_notifier = new RGWMetaNotifier(this, md_log);
1210 meta_notifier->start();
1211 }
1212
11fdf7f2
TL
1213 /* init it anyway, might run sync through radosgw-admin explicitly */
1214 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1215 sync_tracer->init(this);
1216 ret = sync_tracer->hook_to_admin_command();
1217 if (ret < 0) {
1218 return ret;
1219 }
1220
7c673cae 1221 if (run_sync_thread) {
11fdf7f2
TL
// a placement target missing from this zone is only logged, it does not
// abort start-up
1222 for (const auto &pt: zonegroup.placement_targets) {
1223 if (zone_params.placement_pools.find(pt.second.name)
1224 == zone_params.placement_pools.end()){
1225 ldout(cct, 0) << "WARNING: This zone does not contain the placement target "
1226 << pt.second.name << " present in zonegroup" << dendl;
1227 }
1228 }
9f95a23c
TL
1229 auto async_processor = svc.rados->get_async_processor();
// this guard lives until the end of the enclosing if-block, so
// meta_sync_thread_lock stays held while the data sync threads and the
// log trimmer below are set up
1230 std::lock_guard l{meta_sync_thread_lock};
1231 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this->store, async_processor);
7c673cae
FG
1232 ret = meta_sync_processor_thread->init();
1233 if (ret < 0) {
1234 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
1235 return ret;
1236 }
1237 meta_sync_processor_thread->start();
1238
b32b8144
FG
1239 // configure the bucket trim manager
1240 rgw::BucketTrimConfig config;
1241 rgw::configure_bucket_trim(cct, config);
1242
9f95a23c 1243 bucket_trim.emplace(this->store, config);
b32b8144
FG
1244 ret = bucket_trim->init();
1245 if (ret < 0) {
1246 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
1247 return ret;
1248 }
9f95a23c 1249 svc.datalog_rados->set_observer(&*bucket_trim);
b32b8144 1250
9f95a23c 1251 std::lock_guard dl{data_sync_thread_lock};
81eedcae
TL
// one data sync thread per source zone, stored by zone id
1252 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
1253 ldout(cct, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
9f95a23c 1254 auto *thread = new RGWDataSyncProcessorThread(this->store, svc.rados->get_async_processor(), source_zone);
7c673cae
FG
1255 ret = thread->init();
1256 if (ret < 0) {
// NOTE(review): 'thread' has not been stored in the map yet at this
// point and is not deleted before returning
1257 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
1258 return ret;
1259 }
1260 thread->start();
9f95a23c 1261 data_sync_processor_threads[rgw_zone_id(source_zone->id)] = thread;
7c673cae
FG
1262 }
// the sync log trimmer is only created for a positive trim interval
1263 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1264 if (interval > 0) {
9f95a23c 1265 sync_log_trimmer = new RGWSyncLogTrimThread(this->store, &*bucket_trim, interval);
7c673cae
FG
1266 ret = sync_log_trimmer->init();
1267 if (ret < 0) {
1268 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
1269 return ret;
1270 }
1271 sync_log_trimmer->start();
1272 }
1273 }
// the data notifier is started regardless of run_sync_thread
1274 data_notifier = new RGWDataNotifier(this);
1275 data_notifier->start();
1276
92f5a8d4
TL
1277 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1278 binfo_cache->init(svc.cache);
1279
7c673cae 1280 lc = new RGWLC();
9f95a23c 1281 lc->initialize(cct, this->store);
31f18b77 1282
7c673cae
FG
1283 if (use_lc_thread)
1284 lc->start_processor();
31f18b77 1285
9f95a23c 1286 quota_handler = RGWQuotaHandler::generate_handler(this->store, quota_threads);
7c673cae
FG
1287
// a non-zero rgw_override_bucket_index_max_shards takes precedence over
// the zone's own setting; the result is then capped at
// get_max_bucket_shards()
1288 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
11fdf7f2 1289 zone.bucket_index_max_shards);
31f18b77
FG
1290 if (bucket_index_max_shards > get_max_bucket_shards()) {
1291 bucket_index_max_shards = get_max_bucket_shards();
7c673cae 1292 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
31f18b77 1293 << get_max_bucket_shards() << dendl;
7c673cae
FG
1294 }
1295 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
1296
11fdf7f2 1297 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
7c673cae
FG
1298
1299 if (need_tombstone_cache) {
1300 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1301 }
1302
11fdf7f2 1303 reshard_wait = std::make_shared<RGWReshardWait>();
31f18b77 1304
9f95a23c 1305 reshard = new RGWReshard(this->store);
31f18b77
FG
1306
1307 /* only the master zone in the zonegroup reshards buckets */
11fdf7f2 1308 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
31f18b77
FG
1309 if (run_reshard_thread) {
1310 reshard->start_processor();
1311 }
1312
// the result of starting the index completion manager is the function's
// return value
1313 index_completion_manager = new RGWIndexCompletionManager(this);
1314 ret = index_completion_manager->start();
1315
7c673cae
FG
1316 return ret;
1317}
1318
11fdf7f2
TL
1319int RGWRados::init_svc(bool raw)
1320{
1321 if (raw) {
1322 return svc.init_raw(cct, use_cache);
1323 }
1324
9f95a23c
TL
1325 return svc.init(cct, use_cache, run_sync_thread);
1326}
1327
1328int RGWRados::init_ctl()
1329{
1330 return ctl.init(&svc);
11fdf7f2
TL
1331}
1332
7c673cae
FG
1333/**
1334 * Initialize the RADOS instance and prepare to do other ops
1335 * Returns 0 on success, -ERR# on failure.
1336 */
1337int RGWRados::initialize()
1338{
1339 int ret;
1340
11fdf7f2
TL
1341 inject_notify_timeout_probability =
1342 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1343 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
7c673cae 1344
11fdf7f2 1345 ret = init_svc(false);
7c673cae 1346 if (ret < 0) {
11fdf7f2 1347 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
7c673cae
FG
1348 return ret;
1349 }
7c673cae 1350
9f95a23c
TL
1351 ret = init_ctl();
1352 if (ret < 0) {
1353 ldout(cct, 0) << "ERROR: failed to init ctls (ret=" << cpp_strerror(-ret) << ")" << dendl;
1354 return ret;
1355 }
1356
11fdf7f2 1357 host_id = svc.zone_utils->gen_host_id();
7c673cae 1358
11fdf7f2
TL
1359 ret = init_rados();
1360 if (ret < 0)
1361 return ret;
1362
1363 return init_complete();
7c673cae
FG
1364}
1365
1366/**
1367 * Open the pool used as root for this gateway
1368 * Returns: 0 on success, -ERR# otherwise.
1369 */
1370int RGWRados::open_root_pool_ctx()
1371{
494da23a 1372 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
7c673cae
FG
1373}
1374
1375int RGWRados::open_gc_pool_ctx()
1376{
494da23a 1377 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
7c673cae
FG
1378}
1379
1380int RGWRados::open_lc_pool_ctx()
1381{
494da23a 1382 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
7c673cae
FG
1383}
1384
1385int RGWRados::open_objexp_pool_ctx()
1386{
494da23a 1387 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
7c673cae
FG
1388}
1389
31f18b77
FG
1390int RGWRados::open_reshard_pool_ctx()
1391{
494da23a 1392 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
7c673cae
FG
1393}
1394
494da23a
TL
1395int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx,
1396 bool mostly_omap)
7c673cae 1397{
28e407b8 1398 constexpr bool create = true; // create the pool if it doesn't exist
494da23a 1399 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create, mostly_omap);
7c673cae
FG
1400}
1401
7c673cae
FG
1402/**** logs ****/
1403
// Iteration state behind the opaque handle returned by log_list_init():
// the name prefix to match, the io context opened on the log pool and the
// current position of the object listing. It is allocated in
// log_list_init() and deleted by log_list_next() once the listing reaches
// its end.
1404struct log_list_state {
1405 string prefix;
1406 librados::IoCtx io_ctx;
1407 librados::NObjectIterator obit;
1408};
1409
1410int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
1411{
1412 log_list_state *state = new log_list_state;
11fdf7f2 1413 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1414 if (r < 0) {
1415 delete state;
1416 return r;
1417 }
1418 state->prefix = prefix;
1419 state->obit = state->io_ctx.nobjects_begin();
1420 *handle = (RGWAccessHandle)state;
1421 return 0;
1422}
1423
1424int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1425{
1426 log_list_state *state = static_cast<log_list_state *>(handle);
1427 while (true) {
1428 if (state->obit == state->io_ctx.nobjects_end()) {
1429 delete state;
1430 return -ENOENT;
1431 }
1432 if (state->prefix.length() &&
1433 state->obit->get_oid().find(state->prefix) != 0) {
1434 state->obit++;
1435 continue;
1436 }
1437 *name = state->obit->get_oid();
1438 state->obit++;
1439 break;
1440 }
1441 return 0;
1442}
1443
1444int RGWRados::log_remove(const string& name)
1445{
1446 librados::IoCtx io_ctx;
11fdf7f2 1447 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
1448 if (r < 0)
1449 return r;
1450 return io_ctx.remove(name);
1451}
1452
1453struct log_show_state {
1454 librados::IoCtx io_ctx;
1455 bufferlist bl;
11fdf7f2 1456 bufferlist::const_iterator p;
7c673cae
FG
1457 string name;
1458 uint64_t pos;
1459 bool eof;
1460 log_show_state() : pos(0), eof(false) {}
1461};
1462
1463int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
1464{
1465 log_show_state *state = new log_show_state;
11fdf7f2 1466 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1467 if (r < 0) {
1468 delete state;
1469 return r;
1470 }
1471 state->name = name;
1472 *handle = (RGWAccessHandle)state;
1473 return 0;
1474}
1475
1476int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
1477{
1478 log_show_state *state = static_cast<log_show_state *>(handle);
1479 off_t off = state->p.get_off();
1480
1481 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
1482 << " off " << off
1483 << " eof " << (int)state->eof
1484 << dendl;
1485 // read some?
1486 unsigned chunk = 1024*1024;
1487 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1488 bufferlist more;
1489 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1490 if (r < 0)
1491 return r;
1492 state->pos += r;
1493 bufferlist old;
1494 try {
1495 old.substr_of(state->bl, off, state->bl.length() - off);
1496 } catch (buffer::error& err) {
1497 return -EINVAL;
1498 }
1499 state->bl.clear();
1500 state->bl.claim(old);
1501 state->bl.claim_append(more);
11fdf7f2 1502 state->p = state->bl.cbegin();
7c673cae
FG
1503 if ((unsigned)r < chunk)
1504 state->eof = true;
1505 ldout(cct, 10) << " read " << r << dendl;
1506 }
1507
1508 if (state->p.end())
1509 return 0; // end of file
1510 try {
11fdf7f2 1511 decode(*entry, state->p);
7c673cae
FG
1512 }
1513 catch (const buffer::error &e) {
1514 return -EINVAL;
1515 }
1516 return 1;
1517}
1518
1519/**
1520 * usage_log_hash: get usage log key hash, based on name and index
1521 *
1522 * Get the usage object name. Since a user may have more than 1
1523 * object holding that info (multiple shards), we use index to
1524 * specify that shard number. Once index exceeds max shards it
1525 * wraps.
1526 * If name is not being set, results for all users will be returned
1527 * and index will wrap only after total shards number.
1528 *
1529 * @param cct [in] ceph context
1530 * @param name [in] user name
1531 * @param hash [out] hash value
1532 * @param index [in] shard index number
1533 */
1534static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1535{
1536 uint32_t val = index;
1537
1538 if (!name.empty()) {
c07f9fc5 1539 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
1540 val %= max_user_shards;
1541 val += ceph_str_hash_linux(name.c_str(), name.size());
1542 }
1543 char buf[17];
c07f9fc5 1544 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
1545 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1546 hash = buf;
1547}
1548
1549int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
1550{
1551 uint32_t index = 0;
1552
1553 map<string, rgw_usage_log_info> log_objs;
1554
1555 string hash;
1556 string last_user;
1557
1558 /* restructure usage map, zone by object hash */
1559 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1560 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1561 const rgw_user_bucket& ub = iter->first;
1562 RGWUsageBatch& info = iter->second;
1563
1564 if (ub.user.empty()) {
1565 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
1566 continue;
1567 }
1568
1569 if (ub.user != last_user) {
1570 /* index *should* be random, but why waste extra cycles
1571 in most cases max user shards is not going to exceed 1,
1572 so just incrementing it */
1573 usage_log_hash(cct, ub.user, hash, index++);
1574 }
1575 last_user = ub.user;
1576 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1577
1578 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1579 v.push_back(miter->second);
1580 }
1581 }
1582
1583 map<string, rgw_usage_log_info>::iterator liter;
1584
1585 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
1586 int r = cls_obj_usage_log_add(liter->first, liter->second);
1587 if (r < 0)
1588 return r;
1589 }
1590 return 0;
1591}
1592
11fdf7f2
TL
1593int RGWRados::read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
1594 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1595 rgw_usage_log_entry>& usage)
7c673cae
FG
1596{
1597 uint32_t num = max_entries;
1598 string hash, first_hash;
1599 string user_str = user.to_str();
1600 usage_log_hash(cct, user_str, first_hash, 0);
1601
1602 if (usage_iter.index) {
1603 usage_log_hash(cct, user_str, hash, usage_iter.index);
1604 } else {
1605 hash = first_hash;
1606 }
1607
1608 usage.clear();
1609
1610 do {
1611 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1612 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1613
11fdf7f2 1614 int ret = cls_obj_usage_log_read(hash, user_str, bucket_name, start_epoch, end_epoch, num,
7c673cae
FG
1615 usage_iter.read_iter, ret_usage, is_truncated);
1616 if (ret == -ENOENT)
1617 goto next;
1618
1619 if (ret < 0)
1620 return ret;
1621
1622 num -= ret_usage.size();
1623
1624 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1625 usage[iter->first].aggregate(iter->second);
1626 }
1627
1628next:
1629 if (!*is_truncated) {
1630 usage_iter.read_iter.clear();
1631 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1632 }
1633 } while (num && !*is_truncated && hash != first_hash);
1634 return 0;
1635}
1636
11fdf7f2 1637int RGWRados::trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
7c673cae
FG
1638{
1639 uint32_t index = 0;
1640 string hash, first_hash;
1641 string user_str = user.to_str();
1642 usage_log_hash(cct, user_str, first_hash, index);
1643
1644 hash = first_hash;
7c673cae 1645 do {
11fdf7f2 1646 int ret = cls_obj_usage_log_trim(hash, user_str, bucket_name, start_epoch, end_epoch);
7c673cae 1647
b32b8144 1648 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
1649 return ret;
1650
7c673cae
FG
1651 usage_log_hash(cct, user_str, hash, ++index);
1652 } while (hash != first_hash);
1653
1654 return 0;
1655}
1656
11fdf7f2
TL
1657
1658int RGWRados::clear_usage()
1659{
1660 auto max_shards = cct->_conf->rgw_usage_max_shards;
1661 int ret=0;
1662 for (unsigned i=0; i < max_shards; i++){
1663 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
1664 ret = cls_obj_usage_log_clear(oid);
1665 if (ret < 0){
1666 ldout(cct,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
1667 return ret;
1668 }
1669 }
1670 return ret;
1671}
1672
9f95a23c 1673int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
7c673cae 1674{
9f95a23c
TL
1675 auto i = bl.cbegin();
1676 RGWAccessControlPolicy policy(cct);
1677 try {
1678 policy.decode_owner(i);
1679 } catch (buffer::error& err) {
1680 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1681 return -EIO;
7c673cae 1682 }
9f95a23c
TL
1683 *owner = policy.get_owner();
1684 return 0;
7c673cae
FG
1685}
1686
9f95a23c 1687int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
7c673cae 1688{
9f95a23c
TL
1689 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
1690 if (aiter == attrset.end())
1691 return -EIO;
7c673cae 1692
9f95a23c
TL
1693 bufferlist& bl = aiter->second;
1694 auto iter = bl.cbegin();
1695 try {
1696 policy->decode(iter);
1697 } catch (buffer::error& err) {
1698 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
1699 return -EIO;
1700 }
1701 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
1702 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
1703 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
1704 s3policy->to_xml(*_dout);
1705 *_dout << dendl;
1706 }
1707 return 0;
7c673cae
FG
1708}
1709
7c673cae 1710
9f95a23c 1711int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
7c673cae 1712{
9f95a23c
TL
1713 rgw_bucket bucket = bucket_info.bucket;
1714 bucket.update_bucket_id(new_bucket_id);
7c673cae 1715
9f95a23c 1716 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae 1717
9f95a23c
TL
1718 bucket_info.objv_tracker.clear();
1719 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr, null_yield);
1720 if (ret < 0) {
1721 return ret;
7c673cae
FG
1722 }
1723
9f95a23c 1724 return 0;
eafe8130
TL
1725}
1726
1727
1adf2230
AA
1728/**
1729 * Get ordered listing of the objects in a bucket.
7c673cae 1730 *
9f95a23c 1731 * max_p: maximum number of results to return
7c673cae
FG
1732 * bucket: bucket to list contents of
1733 * prefix: only return results that match this prefix
1734 * delim: do not include results that match this string.
1735 * Any skipped results will have the matching portion of their name
1736 * inserted in common_prefixes with a "true" mark.
1737 * marker: if filled in, begin the listing with this object.
1738 * end_marker: if filled in, end the listing with this object.
1739 * result: the objects are put in here.
11fdf7f2
TL
1740 * common_prefixes: if delim is filled in, any matching prefixes are
1741 * placed here.
1742 * is_truncated: if number of objects in the bucket is bigger than
1743 * max, then truncated.
7c673cae 1744 */
// Of the names listed above only max_p, result, common_prefixes and
// is_truncated (plus the yield context y) are actual arguments; prefix,
// delim, marker and end_marker are read from the 'params' member and the
// bucket comes from 'target'.
// Returns 0, or the negative result of cls_bucket_list_ordered().
// is_truncated may be null. params.marker and next_marker are updated as
// entries are consumed.
11fdf7f2 1745int RGWRados::Bucket::List::list_objects_ordered(
eafe8130 1746 int64_t max_p,
11fdf7f2
TL
1747 vector<rgw_bucket_dir_entry> *result,
1748 map<string, bool> *common_prefixes,
9f95a23c
TL
1749 bool *is_truncated,
1750 optional_yield y)
7c673cae
FG
1751{
1752 RGWRados *store = target->get_store();
1753 CephContext *cct = store->ctx();
1754 int shard_id = target->get_shard_id();
1755
// count is the number of results produced so far: entries appended to
// result plus common prefixes recorded
1756 int count = 0;
1757 bool truncated = true;
9f95a23c 1758 bool cls_filtered = false;
eafe8130
TL
// max is max_p clamped to [0, bucket_list_objects_absolute_max]
1759 const int64_t max = // protect against memory issues and negative vals
1760 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
// NOTE(review): the std::max result is narrowed into an int here
1761 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
7c673cae
FG
1762
1763 result->clear();
1764
9f95a23c
TL
1765 // use a local marker; either the marker will have a previous entry
1766 // or it will be empty; either way it's OK to copy
1767 rgw_obj_key marker_obj(params.marker.name,
1768 params.marker.instance,
f91f0fd5 1769 params.ns.empty() ? params.marker.ns : params.ns);
7c673cae
FG
1770 rgw_obj_index_key cur_marker;
1771 marker_obj.get_index_key(&cur_marker);
1772
9f95a23c
TL
1773 rgw_obj_key end_marker_obj(params.end_marker.name,
1774 params.end_marker.instance,
f91f0fd5 1775 params.ns.empty() ? params.end_marker.ns : params.ns);
3efd9988
FG
1776 rgw_obj_index_key cur_end_marker;
1777 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
1778 const bool cur_end_marker_valid = !params.end_marker.empty();
1779
// the prefix is converted to its index-key form (namespace applied) for
// the bucket index request
1780 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 1781 prefix_obj.set_ns(params.ns);
7c673cae 1782 string cur_prefix = prefix_obj.get_index_key_name();
11fdf7f2 1783 string after_delim_s; /* needed in !params.delim.empty() AND later */
7c673cae
FG
1784
1785 if (!params.delim.empty()) {
9f95a23c 1786 after_delim_s = cls_rgw_after_delim(params.delim);
11fdf7f2
TL
1787 /* if marker points at a common prefix, fast forward it into its
1788 * upper bound string */
// the find() result is stored in an int, so "not found" (npos) is
// expected to show up as a negative value in the test below
224ce89b 1789 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
1790 if (delim_pos >= 0) {
1791 string s = cur_marker.name.substr(0, delim_pos);
11fdf7f2 1792 s.append(after_delim_s);
7c673cae
FG
1793 cur_marker = s;
1794 }
1795 }
1adf2230 1796
// Each pass of this loop requests one batch from the bucket index. The
// loop is left through one of the breaks near its end, the
// forward-progress check, or a 'goto done' from the entry loop.
9f95a23c 1797 rgw_obj_index_key prev_marker;
f6b5b4d7 1798 for (uint16_t attempt = 1; /* empty */; ++attempt) {
9f95a23c 1799 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
f6b5b4d7 1800 " starting attempt " << attempt << dendl;
9f95a23c
TL
1801
// from the second attempt on, the marker must have moved strictly
// forward; otherwise stop and return what was collected so far
1802 if (attempt > 1 && !(prev_marker < cur_marker)) {
1803 // we've failed to make forward progress
1804 ldout(cct, 0) << "RGWRados::Bucket::List::" << __func__ <<
1805 ": ERROR marker failed to make forward progress; attempt=" << attempt <<
1806 ", prev_marker=" << prev_marker <<
1807 ", cur_marker=" << cur_marker << dendl;
1808 break;
1809 }
1810 prev_marker = cur_marker;
1811
1812 ent_map_t ent_map;
1813 ent_map.reserve(read_ahead);
// the request size shrinks by the number of results already produced
1adf2230
AA
1814 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
1815 shard_id,
1816 cur_marker,
1817 cur_prefix,
9f95a23c 1818 params.delim,
1adf2230
AA
1819 read_ahead + 1 - count,
1820 params.list_versions,
9f95a23c 1821 attempt,
1adf2230
AA
1822 ent_map,
1823 &truncated,
9f95a23c
TL
1824 &cls_filtered,
1825 &cur_marker,
1826 y);
1827 if (r < 0) {
7c673cae 1828 return r;
9f95a23c 1829 }
7c673cae 1830
1adf2230 1831 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
1832 rgw_bucket_dir_entry& entry = eiter->second;
1833 rgw_obj_index_key index_key = entry.key;
7c673cae
FG
1834 rgw_obj_key obj(index_key);
1835
9f95a23c
TL
1836 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
1837 " considering entry " << entry.key << dendl;
1838
1adf2230
AA
1839 /* note that parse_raw_oid() here will not set the correct
1840 * object's instance, as rgw_obj_index_key encodes that
1841 * separately. We don't need to set the instance because it's
1842 * not needed for the checks here and we end up using the raw
1843 * entry for the return vector
7c673cae
FG
1844 */
1845 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
1846 if (!valid) {
9f95a23c
TL
1847 ldout(cct, 0) << "ERROR: could not parse object name: " <<
1848 obj.name << dendl;
7c673cae
FG
1849 continue;
1850 }
11fdf7f2
// entries that are not visible are skipped unless versions are listed
9f95a23c 1852 bool matched_ns = (obj.ns == params.ns);
7c673cae
FG
1853 if (!params.list_versions && !entry.is_visible()) {
1854 continue;
1855 }
1856
9f95a23c 1857 if (params.enforce_ns && !matched_ns) {
7c673cae
FG
1858 if (!params.ns.empty()) {
1859 /* we've iterated past the namespace we're searching -- done now */
1860 truncated = false;
1861 goto done;
1862 }
1863
1864 /* we're not looking at the namespace this object is in, next! */
1865 continue;
1866 }
1867
// an entry at or past the end marker ends the listing
1868 if (cur_end_marker_valid && cur_end_marker <= index_key) {
1869 truncated = false;
1870 goto done;
1871 }
1872
// the markers follow every entry examined while still under the limit,
// including entries that the filter/prefix checks below then skip
1873 if (count < max) {
9f95a23c
TL
1874 params.marker = index_key;
1875 next_marker = index_key;
7c673cae
FG
1876 }
1877
9f95a23c
TL
1878 if (params.filter &&
1879 ! params.filter->filter(obj.name, index_key.name)) {
7c673cae 1880 continue;
9f95a23c 1881 }
7c673cae 1882
1adf2230 1883 if (params.prefix.size() &&
9f95a23c 1884 0 != obj.name.compare(0, params.prefix.size(), params.prefix)) {
7c673cae 1885 continue;
9f95a23c 1886 }
7c673cae
FG
1887
1888 if (!params.delim.empty()) {
9f95a23c
TL
1889 const int delim_pos = obj.name.find(params.delim, params.prefix.size());
1890 if (delim_pos >= 0) {
1891 // run either the code where delimiter filtering is done a)
1892 // in the OSD/CLS or b) here.
1893 if (cls_filtered) {
1894 // NOTE: this condition is for the newer versions of the
1895 // OSD that does filtering on the CLS side
1896
1897 // should only find one delimiter at the end if it finds any
1898 // after the prefix
1899 if (delim_pos !=
1900 int(obj.name.length() - params.delim.length())) {
1901 ldout(cct, 0) <<
1902 "WARNING: found delimiter in place other than the end of "
1903 "the prefix; obj.name=" << obj.name <<
1904 ", prefix=" << params.prefix << dendl;
1905 }
// in this branch the entry's name itself is recorded as the common
// prefix
1906 if (common_prefixes) {
1907 if (count >= max) {
1908 truncated = true;
1909 goto done;
1910 }
1911
1912 (*common_prefixes)[obj.name] = true;
1913 count++;
1914 }
1915
1916 continue;
1917 } else {
1918 // NOTE: this condition is for older versions of the OSD
1919 // that do not filter on the CLS side, so the following code
1920 // must do the filtering; once we reach version 16 of ceph,
1921 // this code can be removed along with the conditional that
1922 // can lead this way
1923
1924 /* extract key -with trailing delimiter- for CommonPrefix */
1925 string prefix_key =
1926 obj.name.substr(0, delim_pos + params.delim.length());
1927
// a common prefix is counted only the first time it is seen
1928 if (common_prefixes &&
1929 common_prefixes->find(prefix_key) == common_prefixes->end()) {
1930 if (count >= max) {
1931 truncated = true;
1932 goto done;
1933 }
1934 next_marker = prefix_key;
1935 (*common_prefixes)[prefix_key] = true;
1936
1937 count++;
1938 }
1939
1940 continue;
1941 } // if we're running an older OSD version
1942 } // if a delimiter was found after prefix
1943 } // if a delimiter was passed in
7c673cae
FG
1944
// limit reached while another regular entry is available: report
// truncation
1945 if (count >= max) {
1946 truncated = true;
1947 goto done;
1948 }
1949
9f95a23c
TL
1950 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
1951 " adding entry " << entry.key << " to result" << dendl;
1952
// the entry is moved out of ent_map into the result vector
7c673cae
FG
1953 result->emplace_back(std::move(entry));
1954 count++;
9f95a23c
TL
1955 } // eiter for loop
1956
1957 // NOTE: the following conditional is needed by older versions of
1958 // the OSD that don't do delimiter filtering on the CLS side; once
1959 // we reach version 16 of ceph, the following conditional and the
1960 // code within can be removed
1961 if (!cls_filtered && !params.delim.empty()) {
1962 int marker_delim_pos =
1963 cur_marker.name.find(params.delim, cur_prefix.size());
eafe8130 1964 if (marker_delim_pos >= 0) {
9f95a23c
TL
1965 std::string skip_after_delim =
1966 cur_marker.name.substr(0, marker_delim_pos);
eafe8130
TL
1967 skip_after_delim.append(after_delim_s);
1968
1969 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
1970
// the marker is only ever moved forward
1971 if (skip_after_delim > cur_marker.name) {
1972 cur_marker = skip_after_delim;
1973 ldout(cct, 20) << "setting cur_marker="
1974 << cur_marker.name
1975 << "[" << cur_marker.instance << "]"
1976 << dendl;
1977 }
1978 }
9f95a23c
TL
1979 } // if older osd didn't do delimiter filtering
1980
1981 ldout(cct, 20) << "RGWRados::Bucket::List::" << __func__ <<
1982 " INFO end of outer loop, truncated=" << truncated <<
1983 ", count=" << count << ", attempt=" << attempt << dendl;
1984
1985 if (!truncated || count >= (max + 1) / 2) {
1986 // if we finished listing, or if we're returning at least half the
1987 // requested entries, that's enough; S3 and swift protocols allow
1988 // returning fewer than max entries
1989 break;
1990 } else if (attempt > 8 && count >= 1) {
1991 // if we've made at least 8 attempts and we have some, but very
1992 // few, results, return with what we have
1993 break;
eafe8130 1994 }
f6b5b4d7 1995 } // for (uint16_t attempt...
7c673cae
FG
1996
1997done:
9f95a23c
TL
1998
// truncated holds either the value last reported by the bucket index
// call or the override set right before a 'goto done'
1999 if (is_truncated) {
7c673cae 2000 *is_truncated = truncated;
9f95a23c 2001 }
7c673cae
FG
2002
2003 return 0;
1adf2230
AA
2004} // list_objects_ordered
2005
2006
2007/**
2008 * Get listing of the objects in a bucket and allow the results to be out
2009 * of order.
2010 *
2011 * Even though there are key differences with the ordered counterpart,
2012 * the parameters are the same to maintain some compatability.
2013 *
2014 * max: maximum number of results to return
2015 * bucket: bucket to list contents of
2016 * prefix: only return results that match this prefix
2017 * delim: should not be set; if it is we should have indicated an error
2018 * marker: if filled in, begin the listing with this object.
2019 * end_marker: if filled in, end the listing with this object.
2020 * result: the objects are put in here.
2021 * common_prefixes: this is never filled with an unordered list; the param
2022 * is maintained for compatibility
2023 * is_truncated: if number of objects in the bucket is bigger than max, then
2024 * truncated.
2025 */
eafe8130 2026int RGWRados::Bucket::List::list_objects_unordered(int64_t max_p,
1adf2230
AA
2027 vector<rgw_bucket_dir_entry> *result,
2028 map<string, bool> *common_prefixes,
9f95a23c
TL
2029 bool *is_truncated,
2030 optional_yield y)
1adf2230
AA
2031{
2032 RGWRados *store = target->get_store();
2033 CephContext *cct = store->ctx();
2034 int shard_id = target->get_shard_id();
2035
2036 int count = 0;
2037 bool truncated = true;
2038
eafe8130
TL
2039 const int64_t max = // protect against memory issues and negative vals
2040 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2041
1adf2230
AA
2042 // read a few extra in each call to cls_bucket_list_unordered in
2043 // case some are filtered out due to namespace matching, versioning,
2044 // filtering, etc.
2045 const int64_t max_read_ahead = 100;
2046 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2047
2048 result->clear();
2049
9f95a23c
TL
2050 // use a local marker; either the marker will have a previous entry
2051 // or it will be empty; either way it's OK to copy
11fdf7f2
TL
2052 rgw_obj_key marker_obj(params.marker.name,
2053 params.marker.instance,
f91f0fd5 2054 params.ns.empty() ? params.marker.ns : params.ns);
1adf2230
AA
2055 rgw_obj_index_key cur_marker;
2056 marker_obj.get_index_key(&cur_marker);
2057
11fdf7f2
TL
2058 rgw_obj_key end_marker_obj(params.end_marker.name,
2059 params.end_marker.instance,
f91f0fd5 2060 params.ns.empty() ? params.end_marker.ns : params.ns);
1adf2230
AA
2061 rgw_obj_index_key cur_end_marker;
2062 end_marker_obj.get_index_key(&cur_end_marker);
2063 const bool cur_end_marker_valid = !params.end_marker.empty();
2064
2065 rgw_obj_key prefix_obj(params.prefix);
9f95a23c 2066 prefix_obj.set_ns(params.ns);
1adf2230
AA
2067 string cur_prefix = prefix_obj.get_index_key_name();
2068
2069 while (truncated && count <= max) {
2070 std::vector<rgw_bucket_dir_entry> ent_list;
9f95a23c
TL
2071 ent_list.reserve(read_ahead);
2072
1adf2230
AA
2073 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
2074 shard_id,
2075 cur_marker,
2076 cur_prefix,
2077 read_ahead,
2078 params.list_versions,
2079 ent_list,
2080 &truncated,
9f95a23c
TL
2081 &cur_marker,
2082 y);
1adf2230
AA
2083 if (r < 0)
2084 return r;
2085
2086 // NB: while regions of ent_list will be sorted, we have no
2087 // guarantee that all items will be sorted since they can cross
2088 // shard boundaries
2089
2090 for (auto& entry : ent_list) {
2091 rgw_obj_index_key index_key = entry.key;
2092 rgw_obj_key obj(index_key);
2093
9f95a23c
TL
2094 if (count < max) {
2095 params.marker.set(index_key);
2096 next_marker.set(index_key);
2097 }
2098
1adf2230
AA
2099 /* note that parse_raw_oid() here will not set the correct
2100 * object's instance, as rgw_obj_index_key encodes that
2101 * separately. We don't need to set the instance because it's
2102 * not needed for the checks here and we end up using the raw
2103 * entry for the return vector
2104 */
2105 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2106 if (!valid) {
2107 ldout(cct, 0) << "ERROR: could not parse object name: " <<
2108 obj.name << dendl;
2109 continue;
2110 }
2111
2112 if (!params.list_versions && !entry.is_visible()) {
2113 continue;
2114 }
2115
2116 if (params.enforce_ns && obj.ns != params.ns) {
2117 continue;
2118 }
2119
2120 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2121 // we're not guaranteed items will come in order, so we have
2122 // to loop through all
2123 continue;
2124 }
2125
1adf2230
AA
2126 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2127 continue;
2128
2129 if (params.prefix.size() &&
2130 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
2131 continue;
2132
2133 if (count >= max) {
2134 truncated = true;
2135 goto done;
2136 }
2137
2138 result->emplace_back(std::move(entry));
2139 count++;
2140 } // for (auto& entry : ent_list)
2141 } // while (truncated && count <= max)
2142
2143done:
2144 if (is_truncated)
2145 *is_truncated = truncated;
2146
2147 return 0;
2148} // list_objects_unordered
2149
7c673cae
FG
2150
2151/**
2152 * create a rados pool, associated meta info
2153 * returns 0 on success, -ERR# otherwise.
2154 */
2155int RGWRados::create_pool(const rgw_pool& pool)
2156{
c07f9fc5 2157 librados::IoCtx io_ctx;
28e407b8
AA
2158 constexpr bool create = true;
2159 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
2160}
2161
9f95a23c 2162void RGWRados::create_bucket_id(string *bucket_id)
7c673cae 2163{
9f95a23c
TL
2164 uint64_t iid = instance_id();
2165 uint64_t bid = next_bucket_id();
2166 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2167 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2168 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2169 *bucket_id = buf;
2170}
7c673cae 2171
11fdf7f2 2172int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
7c673cae 2173 const string& zonegroup_id,
11fdf7f2 2174 const rgw_placement_rule& placement_rule,
7c673cae
FG
2175 const string& swift_ver_location,
2176 const RGWQuotaInfo * pquota_info,
2177 map<std::string, bufferlist>& attrs,
2178 RGWBucketInfo& info,
2179 obj_version *pobjv,
2180 obj_version *pep_objv,
2181 real_time creation_time,
2182 rgw_bucket *pmaster_bucket,
2183 uint32_t *pmaster_num_shards,
2184 bool exclusive)
2185{
2186#define MAX_CREATE_RETRIES 20 /* need to bound retries */
11fdf7f2 2187 rgw_placement_rule selected_placement_rule;
7c673cae
FG
2188 RGWZonePlacementInfo rule_info;
2189
2190 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2191 int ret = 0;
11fdf7f2
TL
2192 ret = svc.zone->select_bucket_placement(owner, zonegroup_id, placement_rule,
2193 &selected_placement_rule, &rule_info);
7c673cae
FG
2194 if (ret < 0)
2195 return ret;
2196
2197 if (!pmaster_bucket) {
2198 create_bucket_id(&bucket.marker);
2199 bucket.bucket_id = bucket.marker;
2200 } else {
2201 bucket.marker = pmaster_bucket->marker;
2202 bucket.bucket_id = pmaster_bucket->bucket_id;
2203 }
2204
2205 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2206
9f95a23c
TL
2207 objv_tracker.read_version.clear();
2208
7c673cae
FG
2209 if (pobjv) {
2210 objv_tracker.write_version = *pobjv;
2211 } else {
2212 objv_tracker.generate_new_write_ver(cct);
2213 }
2214
2215 info.bucket = bucket;
2216 info.owner = owner.user_id;
2217 info.zonegroup = zonegroup_id;
11fdf7f2 2218 info.placement_rule = selected_placement_rule;
7c673cae
FG
2219 info.index_type = rule_info.index_type;
2220 info.swift_ver_location = swift_ver_location;
2221 info.swift_versioning = (!swift_ver_location.empty());
2222 if (pmaster_num_shards) {
2223 info.num_shards = *pmaster_num_shards;
2224 } else {
2225 info.num_shards = bucket_index_max_shards;
2226 }
2227 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
2228 info.requester_pays = false;
2229 if (real_clock::is_zero(creation_time)) {
2230 info.creation_time = ceph::real_clock::now();
2231 } else {
2232 info.creation_time = creation_time;
2233 }
2234 if (pquota_info) {
2235 info.quota = *pquota_info;
2236 }
2237
9f95a23c 2238 int r = svc.bi->init_index(info);
11fdf7f2
TL
2239 if (r < 0) {
2240 return r;
2241 }
7c673cae 2242
11fdf7f2 2243 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
9f95a23c
TL
2244 if (ret == -ECANCELED) {
2245 ret = -EEXIST;
2246 }
11fdf7f2 2247 if (ret == -EEXIST) {
11fdf7f2 2248 /* we need to reread the info and return it, caller will have a use for it */
9f95a23c
TL
2249 RGWBucketInfo orig_info;
2250 r = get_bucket_info(&svc, bucket.tenant, bucket.name, orig_info, NULL, null_yield, NULL);
11fdf7f2
TL
2251 if (r < 0) {
2252 if (r == -ENOENT) {
2253 continue;
2254 }
2255 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
2256 return r;
2257 }
7c673cae 2258
11fdf7f2 2259 /* only remove it if it's a different bucket instance */
9f95a23c
TL
2260 if (orig_info.bucket.bucket_id != bucket.bucket_id) {
2261 int r = svc.bi->clean_index(info);
2262 if (r < 0) {
2263 ldout(cct, 0) << "WARNING: could not remove bucket index (r=" << r << ")" << dendl;
2264 }
2265 r = ctl.bucket->remove_bucket_instance_info(info.bucket, info, null_yield);
2266 if (r < 0) {
2267 ldout(cct, 0) << "WARNING: " << __func__ << "(): failed to remove bucket instance info: bucket instance=" << info.bucket.get_key() << ": r=" << r << dendl;
2268 /* continue anyway */
2269 }
11fdf7f2 2270 }
9f95a23c
TL
2271
2272 info = std::move(orig_info);
2273 /* ret == -EEXIST here */
11fdf7f2 2274 }
7c673cae 2275 return ret;
7c673cae
FG
2276 }
2277
11fdf7f2
TL
2278 /* this is highly unlikely */
2279 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
2280 return -ENOENT;
7c673cae
FG
2281}
2282
11fdf7f2 2283bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
7c673cae 2284{
11fdf7f2
TL
2285 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2286}
c07f9fc5 2287
11fdf7f2
TL
2288bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2289{
2290 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
c07f9fc5 2291
11fdf7f2 2292 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
7c673cae
FG
2293}
2294
2295int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
2296{
2297 string oid, key;
2298 get_obj_bucket_and_oid_loc(obj, oid, key);
2299
2300 rgw_pool pool;
2301 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2302 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2303 return -EIO;
2304 }
2305
494da23a 2306 int r = open_pool_ctx(pool, *ioctx, false);
7c673cae
FG
2307 if (r < 0) {
2308 return r;
2309 }
2310
2311 ioctx->locator_set_key(key);
2312
2313 return 0;
2314}
2315
2316int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
2317{
11fdf7f2 2318 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
7c673cae
FG
2319
2320 rgw_pool pool;
2321 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2322 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2323 return -EIO;
2324 }
2325
9f95a23c
TL
2326 ref->pool = svc.rados->pool(pool);
2327
2328 int r = ref->pool.open(RGWSI_RADOS::OpenParams()
2329 .set_mostly_omap(false));
7c673cae 2330 if (r < 0) {
9f95a23c 2331 ldout(cct, 0) << "ERROR: failed opening data pool (pool=" << pool << "); r=" << r << dendl;
7c673cae
FG
2332 return r;
2333 }
2334
9f95a23c 2335 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2336
2337 return 0;
2338}
2339
224ce89b 2340int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2341{
11fdf7f2 2342 ref->obj = obj;
7c673cae 2343
11fdf7f2
TL
2344 if (ref->obj.oid.empty()) {
2345 ref->obj.oid = obj.pool.to_str();
2346 ref->obj.pool = svc.zone->get_zone_params().domain_root;
7c673cae 2347 }
9f95a23c
TL
2348 ref->pool = svc.rados->pool(obj.pool);
2349 int r = ref->pool.open(RGWSI_RADOS::OpenParams()
2350 .set_mostly_omap(false));
2351 if (r < 0) {
2352 ldout(cct, 0) << "ERROR: failed opening pool (pool=" << obj.pool << "); r=" << r << dendl;
7c673cae 2353 return r;
9f95a23c 2354 }
7c673cae 2355
9f95a23c 2356 ref->pool.ioctx().locator_set_key(ref->obj.loc);
7c673cae
FG
2357
2358 return 0;
2359}
2360
224ce89b 2361int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2362{
224ce89b 2363 return get_raw_obj_ref(obj, ref);
7c673cae
FG
2364}
2365
2366/*
2367 * fixes an issue where head objects were supposed to have a locator created, but ended
2368 * up without one
2369 */
2370int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
2371{
2372 const rgw_bucket& bucket = bucket_info.bucket;
2373 string oid;
2374 string locator;
2375
2376 rgw_obj obj(bucket, key);
2377
2378 get_obj_bucket_and_oid_loc(obj, oid, locator);
2379
2380 if (locator.empty()) {
2381 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
2382 return 0;
2383 }
2384
2385 librados::IoCtx ioctx;
2386
2387 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
2388 if (ret < 0) {
2389 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2390 return ret;
2391 }
2392 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2393
2394 uint64_t size;
2395 bufferlist data;
2396
2397 struct timespec mtime_ts;
2398 map<string, bufferlist> attrs;
2399 librados::ObjectReadOperation op;
2400 op.getxattrs(&attrs, NULL);
2401 op.stat2(&size, &mtime_ts, NULL);
2402#define HEAD_SIZE 512 * 1024
2403 op.read(0, HEAD_SIZE, &data, NULL);
2404
9f95a23c 2405 ret = rgw_rados_operate(ioctx, oid, &op, &data, null_yield);
7c673cae 2406 if (ret < 0) {
9f95a23c 2407 lderr(cct) << "ERROR: rgw_rados_operate(oid=" << oid << ") returned ret=" << ret << dendl;
7c673cae
FG
2408 return ret;
2409 }
2410
2411 if (size > HEAD_SIZE) {
2412 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
2413 return -EIO;
2414 }
2415
2416 if (size != data.length()) {
2417 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
2418 return -EIO;
2419 }
2420
2421 if (copy_obj) {
2422 librados::ObjectWriteOperation wop;
2423
2424 wop.mtime2(&mtime_ts);
2425
2426 map<string, bufferlist>::iterator iter;
2427 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
2428 wop.setxattr(iter->first.c_str(), iter->second);
2429 }
2430
2431 wop.write(0, data);
2432
2433 ioctx.locator_set_key(locator);
9f95a23c 2434 rgw_rados_operate(ioctx, oid, &wop, null_yield);
7c673cae
FG
2435 }
2436
2437 if (remove_bad) {
2438 ioctx.locator_set_key(string());
2439
2440 ret = ioctx.remove(oid);
2441 if (ret < 0) {
2442 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
2443 return ret;
2444 }
2445 }
2446
2447 return 0;
2448}
2449
2450int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
2451 const string& src_oid, const string& src_locator,
2452 librados::IoCtx& dst_ioctx,
2453 const string& dst_oid, const string& dst_locator)
2454{
2455
2456#define COPY_BUF_SIZE (4 * 1024 * 1024)
2457 bool done = false;
2458 uint64_t chunk_size = COPY_BUF_SIZE;
2459 uint64_t ofs = 0;
2460 int ret = 0;
2461 real_time mtime;
2462 struct timespec mtime_ts;
2463 uint64_t size;
2464
2465 if (src_oid == dst_oid && src_locator == dst_locator) {
2466 return 0;
2467 }
2468
2469 src_ioctx.locator_set_key(src_locator);
2470 dst_ioctx.locator_set_key(dst_locator);
2471
2472 do {
2473 bufferlist data;
2474 ObjectReadOperation rop;
2475 ObjectWriteOperation wop;
2476
2477 if (ofs == 0) {
2478 rop.stat2(&size, &mtime_ts, NULL);
2479 mtime = real_clock::from_timespec(mtime_ts);
2480 }
2481 rop.read(ofs, chunk_size, &data, NULL);
9f95a23c 2482 ret = rgw_rados_operate(src_ioctx, src_oid, &rop, &data, null_yield);
7c673cae
FG
2483 if (ret < 0) {
2484 goto done_err;
2485 }
2486
2487 if (data.length() == 0) {
2488 break;
2489 }
2490
2491 if (ofs == 0) {
2492 wop.create(true); /* make it exclusive */
2493 wop.mtime2(&mtime_ts);
2494 mtime = real_clock::from_timespec(mtime_ts);
2495 }
2496 wop.write(ofs, data);
9f95a23c 2497 ret = rgw_rados_operate(dst_ioctx, dst_oid, &wop, null_yield);
11fdf7f2
TL
2498 if (ret < 0) {
2499 goto done_err;
2500 }
7c673cae
FG
2501 ofs += data.length();
2502 done = data.length() != chunk_size;
2503 } while (!done);
2504
2505 if (ofs != size) {
2506 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
2507 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
2508 ret = -EIO;
2509 goto done_err;
2510 }
2511
2512 src_ioctx.remove(src_oid);
2513
2514 return 0;
2515
2516done_err:
11fdf7f2 2517 // TODO: clean up dst_oid if we created it
7c673cae
FG
2518 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
2519 return ret;
2520}
2521
2522/*
2523 * fixes an issue where head objects were supposed to have a locator created, but ended
2524 * up without one
2525 */
9f95a23c 2526int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix, optional_yield y)
7c673cae
FG
2527{
2528 const rgw_bucket& bucket = bucket_info.bucket;
2529 rgw_obj obj(bucket, key);
2530
2531 if (need_fix) {
2532 *need_fix = false;
2533 }
2534
2535 rgw_rados_ref ref;
2536 int r = get_obj_head_ref(bucket_info, obj, &ref);
2537 if (r < 0) {
2538 return r;
2539 }
2540
2541 RGWObjState *astate = NULL;
9f95a23c
TL
2542 RGWObjectCtx rctx(this->store);
2543 r = get_obj_state(&rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
2544 if (r < 0)
2545 return r;
2546
9f95a23c 2547 if (astate->manifest) {
7c673cae 2548 RGWObjManifest::obj_iterator miter;
9f95a23c 2549 RGWObjManifest& manifest = *astate->manifest;
7c673cae
FG
2550 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
2551 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
2552 rgw_obj loc;
2553 string oid;
2554 string locator;
2555
9f95a23c 2556 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
7c673cae
FG
2557
2558 if (loc.key.ns.empty()) {
2559 /* continue, we're only interested in tail objects */
2560 continue;
2561 }
2562
9f95a23c
TL
2563 auto& ioctx = ref.pool.ioctx();
2564
7c673cae 2565 get_obj_bucket_and_oid_loc(loc, oid, locator);
9f95a23c 2566 ref.pool.ioctx().locator_set_key(locator);
7c673cae
FG
2567
2568 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
2569
9f95a23c 2570 r = ioctx.stat(oid, NULL, NULL);
7c673cae
FG
2571 if (r != -ENOENT) {
2572 continue;
2573 }
2574
2575 string bad_loc;
2576 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
2577
2578 /* create a new ioctx with the bad locator */
2579 librados::IoCtx src_ioctx;
9f95a23c 2580 src_ioctx.dup(ioctx);
7c673cae
FG
2581 src_ioctx.locator_set_key(bad_loc);
2582
2583 r = src_ioctx.stat(oid, NULL, NULL);
2584 if (r != 0) {
2585 /* cannot find a broken part */
2586 continue;
2587 }
2588 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
2589 if (need_fix) {
2590 *need_fix = true;
2591 }
2592 if (fix) {
9f95a23c 2593 r = move_rados_obj(src_ioctx, oid, bad_loc, ioctx, oid, locator);
7c673cae
FG
2594 if (r < 0) {
2595 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
2596 }
2597 }
2598 }
2599 }
2600
2601 return 0;
2602}
2603
f64942e4
AA
2604int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2605 const rgw_obj& obj,
2606 RGWBucketInfo* bucket_info_out)
7c673cae
FG
2607{
2608 bucket = _bucket;
2609
11fdf7f2 2610 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
2611
2612 RGWBucketInfo bucket_info;
f64942e4
AA
2613 RGWBucketInfo* bucket_info_p =
2614 bucket_info_out ? bucket_info_out : &bucket_info;
2615
9f95a23c 2616 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield);
7c673cae
FG
2617 if (ret < 0) {
2618 return ret;
2619 }
2620
9f95a23c
TL
2621 string oid;
2622
2623 ret = store->svc.bi_rados->open_bucket_index_shard(*bucket_info_p, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae
FG
2624 if (ret < 0) {
2625 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2626 return ret;
2627 }
9f95a23c 2628 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2629
2630 return 0;
2631}
2632
f64942e4
AA
2633int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
2634 int sid,
2635 RGWBucketInfo* bucket_info_out)
7c673cae
FG
2636{
2637 bucket = _bucket;
2638 shard_id = sid;
2639
11fdf7f2 2640 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
2641
2642 RGWBucketInfo bucket_info;
f64942e4
AA
2643 RGWBucketInfo* bucket_info_p =
2644 bucket_info_out ? bucket_info_out : &bucket_info;
9f95a23c 2645 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL, null_yield);
7c673cae
FG
2646 if (ret < 0) {
2647 return ret;
2648 }
2649
9f95a23c
TL
2650 string oid;
2651
2652 ret = store->svc.bi_rados->open_bucket_index_shard(*bucket_info_p, shard_id, &bucket_obj);
7c673cae
FG
2653 if (ret < 0) {
2654 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2655 return ret;
2656 }
9f95a23c 2657 ldout(store->ctx(), 20) << " bucket index oid: " << bucket_obj.get_raw_obj() << dendl;
7c673cae
FG
2658
2659 return 0;
2660}
2661
a8e16298
TL
2662int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info,
2663 const rgw_obj& obj)
2664{
2665 bucket = bucket_info.bucket;
2666
9f95a23c
TL
2667 int ret = store->svc.bi_rados->open_bucket_index_shard(bucket_info,
2668 obj.get_hash_object(),
2669 &bucket_obj,
2670 &shard_id);
a8e16298
TL
2671 if (ret < 0) {
2672 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2673 return ret;
2674 }
2675 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
2676
2677 return 0;
2678}
2679
b32b8144
FG
2680int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
2681{
2682 bucket = bucket_info.bucket;
2683 shard_id = sid;
2684
9f95a23c 2685 int ret = store->svc.bi_rados->open_bucket_index_shard(bucket_info, shard_id, &bucket_obj);
b32b8144
FG
2686 if (ret < 0) {
2687 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
2688 return ret;
2689 }
2690 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
2691
2692 return 0;
2693}
2694
7c673cae
FG
2695
2696/* Execute @handler on last item in bucket listing for bucket specified
2697 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
2698 * to objects matching these criterias. */
2699int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
2700 const std::string& obj_prefix,
2701 const std::string& obj_delim,
2702 std::function<int(const rgw_bucket_dir_entry&)> handler)
2703{
2704 RGWRados::Bucket target(this, bucket_info);
2705 RGWRados::Bucket::List list_op(&target);
2706
2707 list_op.params.prefix = obj_prefix;
2708 list_op.params.delim = obj_delim;
2709
2710 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
2711 << ", obj_prefix=" << obj_prefix
2712 << ", obj_delim=" << obj_delim
2713 << dendl;
2714
2715 bool is_truncated = false;
2716
2717 boost::optional<rgw_bucket_dir_entry> last_entry;
2718 /* We need to rewind to the last object in a listing. */
2719 do {
2720 /* List bucket entries in chunks. */
2721 static constexpr int MAX_LIST_OBJS = 100;
2722 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
2723
2724 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
9f95a23c 2725 &is_truncated, null_yield);
7c673cae
FG
2726 if (ret < 0) {
2727 return ret;
2728 } else if (!entries.empty()) {
2729 last_entry = entries.back();
2730 }
2731 } while (is_truncated);
2732
2733 if (last_entry) {
2734 return handler(*last_entry);
2735 }
2736
2737 /* Empty listing - no items we can run handler on. */
2738 return 0;
2739}
2740
2741
2742int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
2743 const rgw_user& user,
2744 RGWBucketInfo& bucket_info,
9f95a23c
TL
2745 rgw_obj& obj,
2746 const DoutPrefixProvider *dpp,
2747 optional_yield y)
7c673cae
FG
2748{
2749 if (! swift_versioning_enabled(bucket_info)) {
2750 return 0;
2751 }
2752
11fdf7f2 2753 obj_ctx.set_atomic(obj);
7c673cae
FG
2754
2755 RGWObjState * state = nullptr;
9f95a23c 2756 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false, y);
7c673cae
FG
2757 if (r < 0) {
2758 return r;
2759 }
2760
2761 if (!state->exists) {
2762 return 0;
2763 }
2764
7c673cae
FG
2765 const string& src_name = obj.get_oid();
2766 char buf[src_name.size() + 32];
2767 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
2768 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
2769 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
2770
2771 RGWBucketInfo dest_bucket_info;
2772
9f95a23c 2773 r = get_bucket_info(&svc, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, null_yield, NULL);
7c673cae
FG
2774 if (r < 0) {
2775 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
2776 if (r == -ENOENT) {
2777 return -ERR_PRECONDITION_FAILED;
2778 }
2779 return r;
2780 }
2781
2782 if (dest_bucket_info.owner != bucket_info.owner) {
2783 return -ERR_PRECONDITION_FAILED;
2784 }
2785
2786 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
11fdf7f2
TL
2787
2788 if (dest_bucket_info.versioning_enabled()){
2789 gen_rand_obj_instance_name(&dest_obj);
2790 }
2791
2792 obj_ctx.set_atomic(dest_obj);
7c673cae 2793
9f95a23c 2794 rgw_zone_id no_zone;
7c673cae
FG
2795
2796 r = copy_obj(obj_ctx,
2797 user,
7c673cae
FG
2798 NULL, /* req_info *info */
2799 no_zone,
2800 dest_obj,
2801 obj,
2802 dest_bucket_info,
2803 bucket_info,
11fdf7f2 2804 bucket_info.placement_rule,
7c673cae
FG
2805 NULL, /* time_t *src_mtime */
2806 NULL, /* time_t *mtime */
2807 NULL, /* const time_t *mod_ptr */
2808 NULL, /* const time_t *unmod_ptr */
2809 false, /* bool high_precision_time */
2810 NULL, /* const char *if_match */
2811 NULL, /* const char *if_nomatch */
2812 RGWRados::ATTRSMOD_NONE,
2813 true, /* bool copy_if_newer */
2814 state->attrset,
11fdf7f2 2815 RGWObjCategory::Main,
7c673cae
FG
2816 0, /* uint64_t olh_epoch */
2817 real_time(), /* time_t delete_at */
2818 NULL, /* string *version_id */
2819 NULL, /* string *ptag */
2820 NULL, /* string *petag */
7c673cae 2821 NULL, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2822 NULL, /* void *progress_data */
2823 dpp,
2824 null_yield);
7c673cae
FG
2825 if (r == -ECANCELED || r == -ENOENT) {
2826 /* Has already been overwritten, meaning another rgw process already
2827 * copied it out */
2828 return 0;
2829 }
2830
2831 return r;
2832}
2833
9f95a23c 2834int RGWRados::swift_versioning_restore(RGWObjectCtx& obj_ctx,
7c673cae
FG
2835 const rgw_user& user,
2836 RGWBucketInfo& bucket_info,
2837 rgw_obj& obj,
9f95a23c
TL
2838 bool& restored, /* out */
2839 const DoutPrefixProvider *dpp)
7c673cae
FG
2840{
2841 if (! swift_versioning_enabled(bucket_info)) {
2842 return 0;
2843 }
2844
2845 /* Bucket info of the bucket that stores previous versions of our object. */
2846 RGWBucketInfo archive_binfo;
2847
9f95a23c 2848 int ret = get_bucket_info(&svc, bucket_info.bucket.tenant,
7c673cae 2849 bucket_info.swift_ver_location, archive_binfo,
9f95a23c 2850 nullptr, null_yield, nullptr);
7c673cae
FG
2851 if (ret < 0) {
2852 return ret;
2853 }
2854
2855 /* Abort the operation if the bucket storing our archive belongs to someone
2856 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
2857 * into consideration. For we can live with that.
2858 *
2859 * TODO: delegate this check to un upper layer and compare with ACLs. */
2860 if (bucket_info.owner != archive_binfo.owner) {
2861 return -EPERM;
2862 }
2863
2864 /* This code will be executed on latest version of the object. */
2865 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
9f95a23c 2866 rgw_zone_id no_zone;
7c673cae
FG
2867
2868 /* We don't support object versioning of Swift API on those buckets that
2869 * are already versioned using the S3 mechanism. This affects also bucket
2870 * storing archived objects. Otherwise the delete operation would create
2871 * a deletion marker. */
2872 if (archive_binfo.versioned()) {
2873 restored = false;
2874 return -ERR_PRECONDITION_FAILED;
2875 }
2876
2877 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
2878 * irrelevant and may be safely skipped. */
2879 std::map<std::string, ceph::bufferlist> no_attrs;
2880
2881 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
11fdf7f2
TL
2882
2883 if (bucket_info.versioning_enabled()){
2884 gen_rand_obj_instance_name(&obj);
2885 }
2886
2887 obj_ctx.set_atomic(archive_obj);
2888 obj_ctx.set_atomic(obj);
7c673cae
FG
2889
2890 int ret = copy_obj(obj_ctx,
2891 user,
7c673cae
FG
2892 nullptr, /* req_info *info */
2893 no_zone,
2894 obj, /* dest obj */
2895 archive_obj, /* src obj */
2896 bucket_info, /* dest bucket info */
2897 archive_binfo, /* src bucket info */
11fdf7f2 2898 bucket_info.placement_rule, /* placement_rule */
7c673cae
FG
2899 nullptr, /* time_t *src_mtime */
2900 nullptr, /* time_t *mtime */
2901 nullptr, /* const time_t *mod_ptr */
2902 nullptr, /* const time_t *unmod_ptr */
2903 false, /* bool high_precision_time */
2904 nullptr, /* const char *if_match */
2905 nullptr, /* const char *if_nomatch */
2906 RGWRados::ATTRSMOD_NONE,
2907 true, /* bool copy_if_newer */
2908 no_attrs,
11fdf7f2 2909 RGWObjCategory::Main,
7c673cae
FG
2910 0, /* uint64_t olh_epoch */
2911 real_time(), /* time_t delete_at */
2912 nullptr, /* string *version_id */
2913 nullptr, /* string *ptag */
2914 nullptr, /* string *petag */
7c673cae 2915 nullptr, /* void (*progress_cb)(off_t, void *) */
9f95a23c
TL
2916 nullptr, /* void *progress_data */
2917 dpp,
2918 null_yield);
7c673cae
FG
2919 if (ret == -ECANCELED || ret == -ENOENT) {
2920 /* Has already been overwritten, meaning another rgw process already
2921 * copied it out */
2922 return 0;
2923 } else if (ret < 0) {
2924 return ret;
2925 } else {
2926 restored = true;
2927 }
2928
2929 /* Need to remove the archived copy. */
2930 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
2931 archive_binfo.versioning_status());
2932
2933 return ret;
2934 };
2935
2936 const std::string& obj_name = obj.get_oid();
2937 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
2938 % obj_name);
2939
2940 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
2941 handler);
2942}
2943
7c673cae 2944int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
181888fb
FG
2945 map<string, bufferlist>& attrs,
2946 bool assume_noent, bool modify_tail,
9f95a23c 2947 void *_index_op, optional_yield y)
7c673cae
FG
2948{
2949 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
2950 RGWRados *store = target->get_store();
2951
2952 ObjectWriteOperation op;
11fdf7f2
TL
2953#ifdef WITH_LTTNG
2954 const struct req_state* s = get_req_state();
2955 string req_id;
2956 if (!s) {
2957 // fake req_id
2958 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
2959 } else {
2960 req_id = s->req_id;
2961 }
2962#endif
7c673cae
FG
2963
2964 RGWObjState *state;
9f95a23c 2965 int r = target->get_state(&state, false, y, assume_noent);
7c673cae
FG
2966 if (r < 0)
2967 return r;
2968
2969 rgw_obj& obj = target->get_obj();
2970
2971 if (obj.get_oid().empty()) {
2972 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
2973 return -EIO;
2974 }
2975
224ce89b 2976 rgw_rados_ref ref;
7c673cae
FG
2977 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
2978 if (r < 0)
2979 return r;
2980
2981 bool is_olh = state->is_olh;
2982
2983 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
2984
2985 const string *ptag = meta.ptag;
2986 if (!ptag && !index_op->get_optag()->empty()) {
2987 ptag = index_op->get_optag();
2988 }
9f95a23c 2989 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail, y);
7c673cae
FG
2990 if (r < 0)
2991 return r;
2992
2993 if (real_clock::is_zero(meta.set_mtime)) {
2994 meta.set_mtime = real_clock::now();
2995 }
2996
eafe8130
TL
2997 if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
2998 auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
2999 if (iter == attrs.end()) {
3000 real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
3001 string mode = target->bucket_info.obj_lock.get_mode();
3002 RGWObjectRetention obj_retention(mode, lock_until_date);
3003 bufferlist bl;
3004 obj_retention.encode(bl);
3005 op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
3006 }
3007 }
3008
7c673cae
FG
3009 if (state->is_olh) {
3010 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3011 }
3012
3013 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3014 op.mtime2(&mtime_ts);
3015
3016 if (meta.data) {
3017 /* if we want to overwrite the data, we also want to overwrite the
3018 xattrs, so just remove the object */
3019 op.write_full(*meta.data);
3020 }
3021
3022 string etag;
3023 string content_type;
3024 bufferlist acl_bl;
11fdf7f2 3025 string storage_class;
7c673cae
FG
3026
3027 map<string, bufferlist>::iterator iter;
3028 if (meta.rmattrs) {
3029 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3030 const string& name = iter->first;
3031 op.rmxattr(name.c_str());
3032 }
3033 }
3034
3035 if (meta.manifest) {
11fdf7f2
TL
3036 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3037
7c673cae
FG
3038 /* remove existing manifest attr */
3039 iter = attrs.find(RGW_ATTR_MANIFEST);
3040 if (iter != attrs.end())
3041 attrs.erase(iter);
3042
3043 bufferlist bl;
11fdf7f2 3044 encode(*meta.manifest, bl);
7c673cae
FG
3045 op.setxattr(RGW_ATTR_MANIFEST, bl);
3046 }
3047
3048 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3049 const string& name = iter->first;
3050 bufferlist& bl = iter->second;
3051
3052 if (!bl.length())
3053 continue;
3054
3055 op.setxattr(name.c_str(), bl);
3056
3057 if (name.compare(RGW_ATTR_ETAG) == 0) {
11fdf7f2 3058 etag = rgw_bl_str(bl);
7c673cae 3059 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
11fdf7f2 3060 content_type = rgw_bl_str(bl);
7c673cae
FG
3061 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3062 acl_bl = bl;
3063 }
3064 }
3065 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3066 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3067 }
3068
3069 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3070 bufferlist bl;
11fdf7f2 3071 encode(store->svc.zone->get_zone_short_id(), bl);
7c673cae
FG
3072 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3073 }
3074
11fdf7f2
TL
3075 if (!storage_class.empty()) {
3076 bufferlist bl;
3077 bl.append(storage_class);
3078 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3079 }
3080
7c673cae
FG
3081 if (!op.size())
3082 return 0;
3083
3084 uint64_t epoch;
3085 int64_t poolid;
224ce89b
WB
3086 bool orig_exists;
3087 uint64_t orig_size;
3088
3089 if (!reset_obj) { //Multipart upload, it has immutable head.
3090 orig_exists = false;
3091 orig_size = 0;
3092 } else {
3093 orig_exists = state->exists;
3094 orig_size = state->accounted_size;
3095 }
7c673cae 3096
91327a77
AA
3097 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3098 !obj.key.instance.empty();
7c673cae
FG
3099
3100 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3101
3102 if (versioned_op) {
3103 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3104 }
3105
3106 if (!index_op->is_prepared()) {
11fdf7f2 3107 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
9f95a23c 3108 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag, y);
11fdf7f2 3109 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
7c673cae
FG
3110 if (r < 0)
3111 return r;
3112 }
3113
9f95a23c
TL
3114 auto& ioctx = ref.pool.ioctx();
3115
11fdf7f2 3116 tracepoint(rgw_rados, operate_enter, req_id.c_str());
9f95a23c 3117 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
11fdf7f2 3118 tracepoint(rgw_rados, operate_exit, req_id.c_str());
7c673cae
FG
3119 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3120 or -ENOENT if was removed, or -EEXIST if it did not exist
3121 before and now it does */
3122 if (r == -EEXIST && assume_noent) {
3123 target->invalidate_state();
3124 return r;
3125 }
3126 goto done_cancel;
3127 }
3128
9f95a23c
TL
3129 epoch = ioctx.get_last_version();
3130 poolid = ioctx.get_id();
7c673cae
FG
3131
3132 r = target->complete_atomic_modification();
3133 if (r < 0) {
3134 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
3135 }
3136
11fdf7f2 3137 tracepoint(rgw_rados, complete_enter, req_id.c_str());
7c673cae 3138 r = index_op->complete(poolid, epoch, size, accounted_size,
11fdf7f2
TL
3139 meta.set_mtime, etag, content_type,
3140 storage_class, &acl_bl,
3141 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3142 tracepoint(rgw_rados, complete_exit, req_id.c_str());
7c673cae
FG
3143 if (r < 0)
3144 goto done_cancel;
3145
3146 if (meta.mtime) {
3147 *meta.mtime = meta.set_mtime;
3148 }
3149
3150 /* note that index_op was using state so we couldn't invalidate it earlier */
3151 target->invalidate_state();
3152 state = NULL;
3153
91327a77 3154 if (versioned_op && meta.olh_epoch) {
9f95a23c 3155 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, y, meta.zones_trace);
7c673cae
FG
3156 if (r < 0) {
3157 return r;
3158 }
3159 }
3160
3161 if (!real_clock::is_zero(meta.delete_at)) {
3162 rgw_obj_index_key obj_key;
3163 obj.key.get_index_key(&obj_key);
3164
9f95a23c
TL
3165 r = store->obj_expirer->hint_add(meta.delete_at, obj.bucket.tenant, obj.bucket.name,
3166 obj.bucket.bucket_id, obj_key);
7c673cae
FG
3167 if (r < 0) {
3168 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
3169 /* ignoring error, nothing we can do at this point */
3170 }
3171 }
3172 meta.canceled = false;
3173
3174 /* update quota cache */
3efd9988
FG
3175 if (meta.completeMultipart){
3176 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3177 0, orig_size);
3178 }
3179 else {
3180 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3181 accounted_size, orig_size);
3182 }
7c673cae
FG
3183 return 0;
3184
3185done_cancel:
3186 int ret = index_op->cancel();
3187 if (ret < 0) {
3188 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
3189 }
3190
3191 meta.canceled = true;
3192
3193 /* we lost in a race. There are a few options:
3194 * - existing object was rewritten (ECANCELED)
3195 * - non existing object was created (EEXIST)
3196 * - object was removed (ENOENT)
3197 * should treat it as a success
3198 */
3199 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3200 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3201 r = 0;
3202 }
3203 } else {
3204 if (meta.if_match != NULL) {
3205 // only overwrite existing object
3206 if (strcmp(meta.if_match, "*") == 0) {
3207 if (r == -ENOENT) {
3208 r = -ERR_PRECONDITION_FAILED;
3209 } else if (r == -ECANCELED) {
3210 r = 0;
3211 }
3212 }
3213 }
3214
3215 if (meta.if_nomatch != NULL) {
3216 // only create a new object
3217 if (strcmp(meta.if_nomatch, "*") == 0) {
3218 if (r == -EEXIST) {
3219 r = -ERR_PRECONDITION_FAILED;
3220 } else if (r == -ENOENT) {
3221 r = 0;
3222 }
3223 }
3224 }
3225 }
3226
3227 return r;
3228}
3229
3230int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
9f95a23c 3231 map<string, bufferlist>& attrs, optional_yield y)
7c673cae
FG
3232{
3233 RGWBucketInfo& bucket_info = target->get_bucket_info();
3234
3235 RGWRados::Bucket bop(target->get_store(), bucket_info);
3236 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
3237 index_op.set_zones_trace(meta.zones_trace);
3238
7c673cae
FG
3239 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3240 int r;
3241 if (assume_noent) {
9f95a23c 3242 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3243 if (r == -EEXIST) {
3244 assume_noent = false;
3245 }
3246 }
3247 if (!assume_noent) {
9f95a23c 3248 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op, y);
7c673cae
FG
3249 }
3250 return r;
3251}
3252
11fdf7f2 3253class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
7c673cae
FG
3254{
3255 CephContext* cct;
3256 rgw_obj obj;
11fdf7f2 3257 rgw::putobj::DataProcessor *filter;
7c673cae 3258 boost::optional<RGWPutObj_Compress>& compressor;
11fdf7f2 3259 boost::optional<rgw::putobj::ChunkProcessor> buffering;
7c673cae 3260 CompressorRef& plugin;
11fdf7f2 3261 rgw::putobj::ObjectProcessor *processor;
7c673cae
FG
3262 void (*progress_cb)(off_t, void *);
3263 void *progress_data;
3264 bufferlist extra_data_bl;
11fdf7f2
TL
3265 uint64_t extra_data_left{0};
3266 bool need_to_process_attrs{true};
3267 uint64_t data_len{0};
7c673cae 3268 map<string, bufferlist> src_attrs;
11fdf7f2
TL
3269 uint64_t ofs{0};
3270 uint64_t lofs{0}; /* logical ofs */
9f95a23c 3271 std::function<int(map<string, bufferlist>&)> attrs_handler;
7c673cae
FG
3272public:
3273 RGWRadosPutObj(CephContext* cct,
3274 CompressorRef& plugin,
3275 boost::optional<RGWPutObj_Compress>& compressor,
11fdf7f2 3276 rgw::putobj::ObjectProcessor *p,
7c673cae 3277 void (*_progress_cb)(off_t, void *),
11fdf7f2 3278 void *_progress_data,
9f95a23c 3279 std::function<int(map<string, bufferlist>&)> _attrs_handler) :
7c673cae
FG
3280 cct(cct),
3281 filter(p),
3282 compressor(compressor),
3283 plugin(plugin),
3284 processor(p),
7c673cae
FG
3285 progress_cb(_progress_cb),
3286 progress_data(_progress_data),
11fdf7f2 3287 attrs_handler(_attrs_handler) {}
7c673cae
FG
3288
3289 int process_attrs(void) {
3290 if (extra_data_bl.length()) {
3291 JSONParser jp;
3292 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3293 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3294 return -EIO;
3295 }
3296
3297 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3298
3299 src_attrs.erase(RGW_ATTR_COMPRESSION);
3300 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
a8e16298
TL
3301
3302 // filter out olh attributes
3303 auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
3304 while (iter != src_attrs.end()) {
3305 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3306 break;
3307 }
3308 iter = src_attrs.erase(iter);
3309 }
7c673cae
FG
3310 }
3311
11fdf7f2
TL
3312 int ret = attrs_handler(src_attrs);
3313 if (ret < 0) {
3314 return ret;
3315 }
3316
7c673cae
FG
3317 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3318 //do not compress if object is encrypted
3319 compressor = boost::in_place(cct, plugin, filter);
11fdf7f2
TL
3320 // add a filter that buffers data so we don't try to compress tiny blocks.
3321 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3322 // compression ratio
28e407b8
AA
3323 constexpr unsigned buffer_size = 512 * 1024;
3324 buffering = boost::in_place(&*compressor, buffer_size);
3325 filter = &*buffering;
7c673cae 3326 }
11fdf7f2
TL
3327
3328 need_to_process_attrs = false;
3329
7c673cae
FG
3330 return 0;
3331 }
3332
11fdf7f2 3333 int handle_data(bufferlist& bl, bool *pause) override {
7c673cae 3334 if (progress_cb) {
11fdf7f2 3335 progress_cb(data_len, progress_data);
7c673cae 3336 }
b32b8144 3337 if (extra_data_left) {
11fdf7f2 3338 uint64_t extra_len = bl.length();
b32b8144
FG
3339 if (extra_len > extra_data_left)
3340 extra_len = extra_data_left;
7c673cae
FG
3341
3342 bufferlist extra;
3343 bl.splice(0, extra_len, &extra);
3344 extra_data_bl.append(extra);
3345
b32b8144
FG
3346 extra_data_left -= extra_len;
3347 if (extra_data_left == 0) {
7c673cae
FG
3348 int res = process_attrs();
3349 if (res < 0)
3350 return res;
3351 }
11fdf7f2 3352 ofs += extra_len;
7c673cae
FG
3353 if (bl.length() == 0) {
3354 return 0;
3355 }
3356 }
11fdf7f2
TL
3357 if (need_to_process_attrs) {
3358 /* need to call process_attrs() even if we don't get any attrs,
3359 * need it to call attrs_handler().
3360 */
3361 int res = process_attrs();
3362 if (res < 0) {
3363 return res;
3364 }
3365 }
7c673cae 3366
11fdf7f2 3367 ceph_assert(uint64_t(ofs) >= extra_data_len);
7c673cae 3368
11fdf7f2
TL
3369 uint64_t size = bl.length();
3370 ofs += size;
7c673cae 3371
11fdf7f2
TL
3372 const uint64_t lofs = data_len;
3373 data_len += size;
7c673cae 3374
11fdf7f2 3375 return filter->process(std::move(bl), lofs);
7c673cae
FG
3376 }
3377
28e407b8 3378 int flush() {
11fdf7f2 3379 return filter->process({}, data_len);
28e407b8
AA
3380 }
3381
7c673cae
FG
3382 bufferlist& get_extra_data() { return extra_data_bl; }
3383
3384 map<string, bufferlist>& get_attrs() { return src_attrs; }
3385
3386 void set_extra_data_len(uint64_t len) override {
b32b8144 3387 extra_data_left = len;
11fdf7f2 3388 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
7c673cae
FG
3389 }
3390
3391 uint64_t get_data_len() {
3392 return data_len;
3393 }
7c673cae
FG
3394};
3395
3396/*
3397 * prepare attrset depending on attrs_mod.
3398 */
3399static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3400 map<string, bufferlist>& attrs,
3401 RGWRados::AttrsMod attrs_mod)
3402{
3403 switch (attrs_mod) {
3404 case RGWRados::ATTRSMOD_NONE:
3405 attrs = src_attrs;
3406 break;
3407 case RGWRados::ATTRSMOD_REPLACE:
3408 if (!attrs[RGW_ATTR_ETAG].length()) {
3409 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3410 }
181888fb
FG
3411 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3412 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3413 if (ttiter != src_attrs.end()) {
3414 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3415 }
3416 }
7c673cae
FG
3417 break;
3418 case RGWRados::ATTRSMOD_MERGE:
3419 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3420 if (attrs.find(it->first) == attrs.end()) {
3421 attrs[it->first] = it->second;
3422 }
3423 }
3424 break;
3425 }
3426}
3427
9f95a23c 3428int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj, const DoutPrefixProvider *dpp, optional_yield y)
7c673cae
FG
3429{
3430 map<string, bufferlist> attrset;
3431
3432 real_time mtime;
3433 uint64_t obj_size;
9f95a23c 3434 RGWObjectCtx rctx(this->store);
7c673cae
FG
3435
3436 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
3437 RGWRados::Object::Read read_op(&op_target);
3438
3439 read_op.params.attrs = &attrset;
3440 read_op.params.lastmod = &mtime;
3441 read_op.params.obj_size = &obj_size;
3442
9f95a23c 3443 int ret = read_op.prepare(y);
7c673cae
FG
3444 if (ret < 0)
3445 return ret;
3446
3447 attrset.erase(RGW_ATTR_ID_TAG);
181888fb 3448 attrset.erase(RGW_ATTR_TAIL_TAG);
7c673cae 3449
11fdf7f2
TL
3450 return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
3451 read_op, obj_size - 1, obj, NULL, mtime, attrset,
9f95a23c 3452 0, real_time(), NULL, dpp, y);
7c673cae
FG
3453}
3454
3455struct obj_time_weight {
3456 real_time mtime;
3457 uint32_t zone_short_id;
3458 uint64_t pg_ver;
3459 bool high_precision;
3460
3461 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
3462
3463 bool compare_low_precision(const obj_time_weight& rhs) {
3464 struct timespec l = ceph::real_clock::to_timespec(mtime);
3465 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
3466 l.tv_nsec = 0;
3467 r.tv_nsec = 0;
3468 if (l > r) {
3469 return false;
3470 }
3471 if (l < r) {
3472 return true;
3473 }
11fdf7f2
TL
3474 if (!zone_short_id || !rhs.zone_short_id) {
3475 /* don't compare zone ids, if one wasn't provided */
3476 return false;
3477 }
7c673cae
FG
3478 if (zone_short_id != rhs.zone_short_id) {
3479 return (zone_short_id < rhs.zone_short_id);
3480 }
3481 return (pg_ver < rhs.pg_ver);
3482
3483 }
3484
3485 bool operator<(const obj_time_weight& rhs) {
3486 if (!high_precision || !rhs.high_precision) {
3487 return compare_low_precision(rhs);
3488 }
3489 if (mtime > rhs.mtime) {
3490 return false;
3491 }
3492 if (mtime < rhs.mtime) {
3493 return true;
3494 }
11fdf7f2
TL
3495 if (!zone_short_id || !rhs.zone_short_id) {
3496 /* don't compare zone ids, if one wasn't provided */
3497 return false;
3498 }
7c673cae
FG
3499 if (zone_short_id != rhs.zone_short_id) {
3500 return (zone_short_id < rhs.zone_short_id);
3501 }
3502 return (pg_ver < rhs.pg_ver);
3503 }
3504
3505 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
3506 mtime = _mtime;
3507 zone_short_id = _short_id;
3508 pg_ver = _pg_ver;
3509 }
3510
3511 void init(RGWObjState *state) {
3512 mtime = state->mtime;
3513 zone_short_id = state->zone_short_id;
3514 pg_ver = state->pg_ver;
3515 }
3516};
3517
3518inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
3519 out << o.mtime;
3520
3521 if (o.zone_short_id != 0 || o.pg_ver != 0) {
3522 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
3523 }
3524
3525 return out;
3526}
3527
11fdf7f2 3528class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
7c673cae
FG
3529 bufferlist extra_data;
3530public:
3531 RGWGetExtraDataCB() {}
11fdf7f2
TL
3532 int handle_data(bufferlist& bl, bool *pause) override {
3533 int bl_len = (int)bl.length();
7c673cae
FG
3534 if (extra_data.length() < extra_data_len) {
3535 off_t max = extra_data_len - extra_data.length();
3536 if (max > bl_len) {
3537 max = bl_len;
3538 }
3539 bl.splice(0, max, &extra_data);
3540 }
3541 return bl_len;
3542 }
3543
3544 bufferlist& get_extra_data() {
3545 return extra_data;
3546 }
3547};
3548
3549int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
3550 const rgw_user& user_id,
7c673cae 3551 req_info *info,
9f95a23c 3552 const rgw_zone_id& source_zone,
7c673cae 3553 rgw_obj& src_obj,
9f95a23c 3554 const RGWBucketInfo *src_bucket_info,
7c673cae
FG
3555 real_time *src_mtime,
3556 uint64_t *psize,
3557 const real_time *mod_ptr,
3558 const real_time *unmod_ptr,
3559 bool high_precision_time,
3560 const char *if_match,
3561 const char *if_nomatch,
3562 map<string, bufferlist> *pattrs,
11fdf7f2 3563 map<string, string> *pheaders,
7c673cae
FG
3564 string *version_id,
3565 string *ptag,
3566 string *petag)
3567{
3568 /* source is in a different zonegroup, copy from there */
3569
3570 RGWRESTStreamRWRequest *in_stream_req;
3571 string tag;
3572 map<string, bufferlist> src_attrs;
3573 append_rand_alpha(cct, tag, tag, 32);
3574 obj_time_weight set_mtime_weight;
3575 set_mtime_weight.high_precision = high_precision_time;
3576
3577 RGWRESTConn *conn;
3578 if (source_zone.empty()) {
9f95a23c 3579 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
7c673cae 3580 /* source is in the master zonegroup */
11fdf7f2 3581 conn = svc.zone->get_master_conn();
7c673cae 3582 } else {
11fdf7f2 3583 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
9f95a23c 3584 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
7c673cae
FG
3585 if (iter == zonegroup_conn_map.end()) {
3586 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
3587 return -ENOENT;
3588 }
3589 conn = iter->second;
3590 }
3591 } else {
11fdf7f2 3592 auto& zone_conn_map = svc.zone->get_zone_conn_map();
9f95a23c 3593 auto iter = zone_conn_map.find(source_zone);
7c673cae
FG
3594 if (iter == zone_conn_map.end()) {
3595 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
3596 return -ENOENT;
3597 }
3598 conn = iter->second;
3599 }
3600
3601 RGWGetExtraDataCB cb;
7c673cae
FG
3602 map<string, string> req_headers;
3603 real_time set_mtime;
3604
3605 const real_time *pmod = mod_ptr;
3606
3607 obj_time_weight dest_mtime_weight;
3608
181888fb
FG
3609 constexpr bool prepend_meta = true;
3610 constexpr bool get_op = true;
3611 constexpr bool rgwx_stat = true;
3612 constexpr bool sync_manifest = true;
3613 constexpr bool skip_decrypt = true;
7c673cae
FG
3614 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
3615 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3616 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3617 sync_manifest, skip_decrypt,
3618 true, &cb, &in_stream_req);
7c673cae
FG
3619 if (ret < 0) {
3620 return ret;
3621 }
3622
11fdf7f2 3623 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, nullptr, pheaders);
7c673cae
FG
3624 if (ret < 0) {
3625 return ret;
3626 }
3627
3628 bufferlist& extra_data_bl = cb.get_extra_data();
3629 if (extra_data_bl.length()) {
3630 JSONParser jp;
3631 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3632 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3633 return -EIO;
3634 }
3635
3636 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3637
3638 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3639 }
3640
3641 if (src_mtime) {
3642 *src_mtime = set_mtime;
3643 }
3644
3645 if (petag) {
3646 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
3647 if (iter != src_attrs.end()) {
3648 bufferlist& etagbl = iter->second;
3649 *petag = etagbl.to_str();
11fdf7f2
TL
3650 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
3651 *petag = petag->substr(0, petag->size() - 1);
3652 }
7c673cae
FG
3653 }
3654 }
3655
3656 if (pattrs) {
11fdf7f2 3657 *pattrs = std::move(src_attrs);
7c673cae
FG
3658 }
3659
3660 return 0;
3661}
3662
9f95a23c
TL
3663int RGWFetchObjFilter_Default::filter(CephContext *cct,
3664 const rgw_obj_key& source_key,
3665 const RGWBucketInfo& dest_bucket_info,
3666 std::optional<rgw_placement_rule> dest_placement_rule,
3667 const map<string, bufferlist>& obj_attrs,
3668 std::optional<rgw_user> *poverride_owner,
3669 const rgw_placement_rule **prule)
3670{
3671 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
3672 if (!ptail_rule) {
3673 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
3674 if (iter != obj_attrs.end()) {
3675 dest_rule.storage_class = iter->second.to_str();
3676 dest_rule.inherit_from(dest_bucket_info.placement_rule);
3677 ptail_rule = &dest_rule;
3678 } else {
3679 ptail_rule = &dest_bucket_info.placement_rule;
3680 }
3681 }
3682 *prule = ptail_rule;
3683 return 0;
3684}
3685
7c673cae
FG
3686int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
3687 const rgw_user& user_id,
7c673cae 3688 req_info *info,
9f95a23c 3689 const rgw_zone_id& source_zone,
11fdf7f2
TL
3690 const rgw_obj& dest_obj,
3691 const rgw_obj& src_obj,
9f95a23c
TL
3692 const RGWBucketInfo& dest_bucket_info,
3693 const RGWBucketInfo *src_bucket_info,
11fdf7f2 3694 std::optional<rgw_placement_rule> dest_placement_rule,
7c673cae
FG
3695 real_time *src_mtime,
3696 real_time *mtime,
3697 const real_time *mod_ptr,
3698 const real_time *unmod_ptr,
3699 bool high_precision_time,
3700 const char *if_match,
3701 const char *if_nomatch,
3702 AttrsMod attrs_mod,
3703 bool copy_if_newer,
3704 map<string, bufferlist>& attrs,
3705 RGWObjCategory category,
11fdf7f2 3706 std::optional<uint64_t> olh_epoch,
7c673cae 3707 real_time delete_at,
7c673cae 3708 string *ptag,
11fdf7f2 3709 string *petag,
7c673cae 3710 void (*progress_cb)(off_t, void *),
31f18b77 3711 void *progress_data,
9f95a23c
TL
3712 const DoutPrefixProvider *dpp,
3713 RGWFetchObjFilter *filter,
81eedcae
TL
3714 rgw_zone_set *zones_trace,
3715 std::optional<uint64_t>* bytes_transferred)
7c673cae
FG
3716{
3717 /* source is in a different zonegroup, copy from there */
3718
3719 RGWRESTStreamRWRequest *in_stream_req;
3720 string tag;
3721 int i;
3722 append_rand_alpha(cct, tag, tag, 32);
3723 obj_time_weight set_mtime_weight;
3724 set_mtime_weight.high_precision = high_precision_time;
11fdf7f2 3725 int ret;
7c673cae 3726
9f95a23c 3727 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 3728 using namespace rgw::putobj;
9f95a23c
TL
3729 AtomicObjectProcessor processor(&aio, this->store, dest_bucket_info, nullptr, user_id,
3730 obj_ctx, dest_obj, olh_epoch, tag, dpp, null_yield);
7c673cae 3731 RGWRESTConn *conn;
11fdf7f2
TL
3732 auto& zone_conn_map = svc.zone->get_zone_conn_map();
3733 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
7c673cae 3734 if (source_zone.empty()) {
9f95a23c 3735 if (!src_bucket_info || src_bucket_info->zonegroup.empty()) {
7c673cae 3736 /* source is in the master zonegroup */
11fdf7f2 3737 conn = svc.zone->get_master_conn();
7c673cae 3738 } else {
9f95a23c 3739 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info->zonegroup);
7c673cae
FG
3740 if (iter == zonegroup_conn_map.end()) {
3741 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
3742 return -ENOENT;
3743 }
3744 conn = iter->second;
3745 }
3746 } else {
9f95a23c 3747 auto iter = zone_conn_map.find(source_zone);
7c673cae
FG
3748 if (iter == zone_conn_map.end()) {
3749 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
11fdf7f2 3750 return -ENOENT;
7c673cae 3751 }
11fdf7f2 3752 conn = iter->second;
7c673cae
FG
3753 }
3754
3755 boost::optional<RGWPutObj_Compress> compressor;
3756 CompressorRef plugin;
3757
9f95a23c
TL
3758 RGWFetchObjFilter_Default source_filter;
3759 if (!filter) {
3760 filter = &source_filter;
3761 }
3762
3763 std::optional<rgw_user> override_owner;
3764
11fdf7f2 3765 RGWRadosPutObj cb(cct, plugin, compressor, &processor, progress_cb, progress_data,
9f95a23c
TL
3766 [&](map<string, bufferlist>& obj_attrs) {
3767 const rgw_placement_rule *ptail_rule;
3768
3769 int ret = filter->filter(cct,
3770 src_obj.key,
3771 dest_bucket_info,
3772 dest_placement_rule,
3773 obj_attrs,
3774 &override_owner,
3775 &ptail_rule);
3776 if (ret < 0) {
3777 ldout(cct, 5) << "Aborting fetch: source object filter returned ret=" << ret << dendl;
3778 return ret;
11fdf7f2 3779 }
9f95a23c
TL
3780
3781 processor.set_tail_placement(*ptail_rule);
3782
11fdf7f2
TL
3783 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
3784 if (compression_type != "none") {
3785 plugin = Compressor::create(cct, compression_type);
3786 if (!plugin) {
3787 ldout(cct, 1) << "Cannot load plugin for compression type "
3788 << compression_type << dendl;
3789 }
3790 }
3791
9f95a23c 3792 ret = processor.prepare(null_yield);
11fdf7f2
TL
3793 if (ret < 0) {
3794 return ret;
3795 }
3796 return 0;
3797 });
7c673cae
FG
3798
3799 string etag;
7c673cae 3800 real_time set_mtime;
81eedcae 3801 uint64_t expected_size = 0;
7c673cae
FG
3802
3803 RGWObjState *dest_state = NULL;
3804
3805 const real_time *pmod = mod_ptr;
3806
3807 obj_time_weight dest_mtime_weight;
3808
3809 if (copy_if_newer) {
3810 /* need to get mtime for destination */
9f95a23c 3811 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false, null_yield);
7c673cae
FG
3812 if (ret < 0)
3813 goto set_err_state;
3814
3815 if (!real_clock::is_zero(dest_state->mtime)) {
3816 dest_mtime_weight.init(dest_state);
3817 pmod = &dest_mtime_weight.mtime;
3818 }
3819 }
3820
181888fb
FG
3821 static constexpr bool prepend_meta = true;
3822 static constexpr bool get_op = true;
3823 static constexpr bool rgwx_stat = false;
3824 static constexpr bool sync_manifest = true;
3825 static constexpr bool skip_decrypt = true;
7c673cae
FG
3826 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
3827 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 3828 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
3829 sync_manifest, skip_decrypt,
3830 true,
3831 &cb, &in_stream_req);
7c673cae
FG
3832 if (ret < 0) {
3833 goto set_err_state;
3834 }
3835
81eedcae
TL
3836 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
3837 &expected_size, nullptr, nullptr);
7c673cae
FG
3838 if (ret < 0) {
3839 goto set_err_state;
3840 }
28e407b8
AA
3841 ret = cb.flush();
3842 if (ret < 0) {
3843 goto set_err_state;
3844 }
81eedcae
TL
3845 if (cb.get_data_len() != expected_size) {
3846 ret = -EIO;
3847 ldout(cct, 0) << "ERROR: object truncated during fetching, expected "
3848 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
3849 goto set_err_state;
3850 }
7c673cae
FG
3851 if (compressor && compressor->is_compressed()) {
3852 bufferlist tmp;
3853 RGWCompressionInfo cs_info;
3854 cs_info.compression_type = plugin->get_type_name();
3855 cs_info.orig_size = cb.get_data_len();
3856 cs_info.blocks = move(compressor->get_compression_blocks());
11fdf7f2 3857 encode(cs_info, tmp);
7c673cae
FG
3858 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
3859 }
3860
9f95a23c
TL
3861 if (override_owner) {
3862 processor.set_owner(*override_owner);
3863
3864 auto& obj_attrs = cb.get_attrs();
3865
3866 RGWUserInfo owner_info;
3867 if (ctl.user->get_info_by_uid(*override_owner, &owner_info, null_yield) < 0) {
3868 ldout(cct, 10) << "owner info does not exist" << dendl;
3869 return -EINVAL;
3870 }
3871
3872 RGWAccessControlPolicy acl;
3873
3874 auto aiter = obj_attrs.find(RGW_ATTR_ACL);
3875 if (aiter == obj_attrs.end()) {
3876 ldout(cct, 0) << "WARNING: " << __func__ << "(): object doesn't have ACL attribute, setting default ACLs" << dendl;
3877 acl.create_default(owner_info.user_id, owner_info.display_name);
3878 } else {
3879 auto iter = aiter->second.cbegin();
3880 try {
3881 acl.decode(iter);
3882 } catch (buffer::error& err) {
3883 ldout(cct, 0) << "ERROR: " << __func__ << "(): could not decode policy, caught buffer::error" << dendl;
3884 return -EIO;
3885 }
3886 }
3887
3888 ACLOwner new_owner;
3889 new_owner.set_id(*override_owner);
3890 new_owner.set_name(owner_info.display_name);
3891
3892 acl.set_owner(new_owner);
3893
3894 bufferlist bl;
3895 acl.encode(bl);
3896 obj_attrs[RGW_ATTR_ACL] = std::move(bl);
3897 }
3898
7c673cae
FG
3899 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
3900 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
3901 } else {
3902 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
3903 if (iter != cb.get_attrs().end()) {
3904 try {
11fdf7f2 3905 decode(delete_at, iter->second);
7c673cae
FG
3906 } catch (buffer::error& err) {
3907 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
3908 }
3909 }
3910 }
3911
3912 if (src_mtime) {
3913 *src_mtime = set_mtime;
3914 }
3915
3916 if (petag) {
3917 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
3918 if (iter != cb.get_attrs().end()) {
11fdf7f2 3919 *petag = iter->second.to_str();
7c673cae
FG
3920 }
3921 }
3922
11fdf7f2
TL
3923 //erase the append attr
3924 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
3925
7c673cae
FG
3926 if (source_zone.empty()) {
3927 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
3928 } else {
3929 attrs = cb.get_attrs();
3930 }
3931
3932 if (copy_if_newer) {
3933 uint64_t pg_ver = 0;
3934 auto i = attrs.find(RGW_ATTR_PG_VER);
3935 if (i != attrs.end() && i->second.length() > 0) {
11fdf7f2 3936 auto iter = i->second.cbegin();
7c673cae 3937 try {
11fdf7f2 3938 decode(pg_ver, iter);
7c673cae
FG
3939 } catch (buffer::error& err) {
3940 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
3941 /* non critical error */
3942 }
3943 }
11fdf7f2 3944 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
7c673cae
FG
3945 }
3946
3947#define MAX_COMPLETE_RETRY 100
3948 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
11fdf7f2
TL
3949 bool canceled = false;
3950 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
3951 attrs, delete_at, nullptr, nullptr, nullptr,
9f95a23c 3952 zones_trace, &canceled, null_yield);
7c673cae
FG
3953 if (ret < 0) {
3954 goto set_err_state;
3955 }
11fdf7f2 3956 if (copy_if_newer && canceled) {
7c673cae 3957 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
11fdf7f2 3958 obj_ctx.invalidate(dest_obj); /* object was overwritten */
9f95a23c 3959 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false, null_yield);
7c673cae
FG
3960 if (ret < 0) {
3961 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
3962 goto set_err_state;
3963 }
3964 dest_mtime_weight.init(dest_state);
3965 dest_mtime_weight.high_precision = high_precision_time;
3966 if (!dest_state->exists ||
3967 dest_mtime_weight < set_mtime_weight) {
3968 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
3969 continue;
3970 } else {
3971 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
3972 }
3973 }
3974 break;
3975 }
3976
3977 if (i == MAX_COMPLETE_RETRY) {
3978 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
3979 ret = -EIO;
3980 goto set_err_state;
3981 }
3982
81eedcae
TL
3983 if (bytes_transferred) {
3984 *bytes_transferred = cb.get_data_len();
3985 }
7c673cae
FG
3986 return 0;
3987set_err_state:
3988 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
3989 // we may have already fetched during sync of OP_ADD, but were waiting
3990 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
3991 if (olh_epoch && *olh_epoch > 0) {
3992 constexpr bool log_data_change = true;
3993 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
9f95a23c 3994 *olh_epoch, real_time(), false, null_yield, zones_trace, log_data_change);
91327a77
AA
3995 } else {
3996 // we already have the latest copy
3997 ret = 0;
3998 }
7c673cae 3999 }
7c673cae
FG
4000 return ret;
4001}
4002
4003
4004int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
4005 map<string, bufferlist>& src_attrs,
4006 RGWRados::Object::Read& read_op,
4007 const rgw_user& user_id,
4008 rgw_obj& dest_obj,
4009 real_time *mtime)
4010{
4011 string etag;
4012
11fdf7f2 4013 RGWRESTStreamS3PutObj *out_stream_req;
7c673cae 4014
11fdf7f2
TL
4015 auto rest_master_conn = svc.zone->get_master_conn();
4016
4017 int ret = rest_master_conn->put_obj_async(user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
7c673cae 4018 if (ret < 0) {
7c673cae
FG
4019 return ret;
4020 }
4021
9f95a23c 4022 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb(), null_yield);
224ce89b
WB
4023 if (ret < 0) {
4024 delete out_stream_req;
7c673cae 4025 return ret;
224ce89b 4026 }
7c673cae
FG
4027
4028 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
4029 if (ret < 0)
4030 return ret;
4031
4032 return 0;
4033}
4034
4035/**
4036 * Copy an object.
4037 * dest_obj: the object to copy into
4038 * src_obj: the object to copy from
4039 * attrs: usage depends on attrs_mod parameter
4040 * attrs_mod: the modification mode of the attrs, may have the following values:
4041 * ATTRSMOD_NONE - the attributes of the source object will be
4042 * copied without modifications, attrs parameter is ignored;
4043 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4044 * parameter, source object attributes are not copied;
4045 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4046 * are overwritten by values contained in attrs parameter.
4047 * err: stores any errors resulting from the get of the original object
4048 * Returns: 0 on success, -ERR# otherwise.
4049 */
4050int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4051 const rgw_user& user_id,
7c673cae 4052 req_info *info,
9f95a23c 4053 const rgw_zone_id& source_zone,
7c673cae
FG
4054 rgw_obj& dest_obj,
4055 rgw_obj& src_obj,
4056 RGWBucketInfo& dest_bucket_info,
4057 RGWBucketInfo& src_bucket_info,
11fdf7f2 4058 const rgw_placement_rule& dest_placement,
7c673cae
FG
4059 real_time *src_mtime,
4060 real_time *mtime,
4061 const real_time *mod_ptr,
4062 const real_time *unmod_ptr,
4063 bool high_precision_time,
4064 const char *if_match,
4065 const char *if_nomatch,
4066 AttrsMod attrs_mod,
4067 bool copy_if_newer,
4068 map<string, bufferlist>& attrs,
4069 RGWObjCategory category,
4070 uint64_t olh_epoch,
4071 real_time delete_at,
4072 string *version_id,
4073 string *ptag,
11fdf7f2 4074 string *petag,
7c673cae 4075 void (*progress_cb)(off_t, void *),
9f95a23c
TL
4076 void *progress_data,
4077 const DoutPrefixProvider *dpp,
4078 optional_yield y)
7c673cae
FG
4079{
4080 int ret;
4081 uint64_t obj_size;
4082 rgw_obj shadow_obj = dest_obj;
4083 string shadow_oid;
4084
4085 bool remote_src;
4086 bool remote_dest;
4087
4088 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
4089 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
4090
11fdf7f2
TL
4091 auto& zonegroup = svc.zone->get_zonegroup();
4092
4093 remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
4094 remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
7c673cae
FG
4095
4096 if (remote_src && remote_dest) {
9f95a23c 4097 ldpp_dout(dpp, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
7c673cae
FG
4098 return -EINVAL;
4099 }
4100
9f95a23c 4101 ldpp_dout(dpp, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
7c673cae
FG
4102
4103 if (remote_src || !source_zone.empty()) {
11fdf7f2 4104 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
9f95a23c 4105 dest_obj, src_obj, dest_bucket_info, &src_bucket_info,
11fdf7f2 4106 dest_placement, src_mtime, mtime, mod_ptr,
7c673cae
FG
4107 unmod_ptr, high_precision_time,
4108 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
9f95a23c
TL
4109 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data, dpp,
4110 nullptr /* filter */);
7c673cae
FG
4111 }
4112
4113 map<string, bufferlist> src_attrs;
4114 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
4115 RGWRados::Object::Read read_op(&src_op_target);
4116
4117 read_op.conds.mod_ptr = mod_ptr;
4118 read_op.conds.unmod_ptr = unmod_ptr;
4119 read_op.conds.high_precision_time = high_precision_time;
4120 read_op.conds.if_match = if_match;
4121 read_op.conds.if_nomatch = if_nomatch;
4122 read_op.params.attrs = &src_attrs;
4123 read_op.params.lastmod = src_mtime;
4124 read_op.params.obj_size = &obj_size;
7c673cae 4125
9f95a23c 4126 ret = read_op.prepare(y);
7c673cae
FG
4127 if (ret < 0) {
4128 return ret;
4129 }
94b18763
FG
4130 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4131 // Current implementation does not follow S3 spec and even
4132 // may result in data corruption silently when copying
4133 // multipart objects acorss pools. So reject COPY operations
4134 //on encrypted objects before it is fully functional.
9f95a23c 4135 ldpp_dout(dpp, 0) << "ERROR: copy op for encrypted object " << src_obj
94b18763
FG
4136 << " has not been implemented." << dendl;
4137 return -ERR_NOT_IMPLEMENTED;
4138 }
7c673cae
FG
4139
4140 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4141 src_attrs.erase(RGW_ATTR_DELETE_AT);
4142
4143 set_copy_attrs(src_attrs, attrs, attrs_mod);
4144 attrs.erase(RGW_ATTR_ID_TAG);
4145 attrs.erase(RGW_ATTR_PG_VER);
4146 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4147 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4148 if (cmp != src_attrs.end())
4149 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4150
4151 RGWObjManifest manifest;
4152 RGWObjState *astate = NULL;
4153
9f95a23c 4154 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate, y);
7c673cae
FG
4155 if (ret < 0) {
4156 return ret;
4157 }
4158
4159 vector<rgw_raw_obj> ref_objs;
4160
4161 if (remote_dest) {
4162 /* dest is in a different zonegroup, copy it there */
4163 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
4164 }
4165 uint64_t max_chunk_size;
4166
4167 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
4168 if (ret < 0) {
9f95a23c 4169 ldpp_dout(dpp, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
7c673cae
FG
4170 return ret;
4171 }
4172
4173 rgw_pool src_pool;
4174 rgw_pool dest_pool;
11fdf7f2
TL
4175
4176 const rgw_placement_rule *src_rule{nullptr};
4177
9f95a23c
TL
4178 if (astate->manifest) {
4179 src_rule = &astate->manifest->get_tail_placement().placement_rule;
4180 ldpp_dout(dpp, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
11fdf7f2
TL
4181 }
4182
4183 if (!src_rule || src_rule->empty()) {
4184 src_rule = &src_bucket_info.placement_rule;
4185 }
4186
4187 if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
9f95a23c 4188 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
7c673cae
FG
4189 return -EIO;
4190 }
11fdf7f2
TL
4191
4192 if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
9f95a23c 4193 ldpp_dout(dpp, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
7c673cae
FG
4194 return -EIO;
4195 }
4196
9f95a23c 4197 ldpp_dout(dpp, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
11fdf7f2
TL
4198 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4199
9f95a23c 4200 bool copy_data = (!astate->manifest) ||
11fdf7f2
TL
4201 (*src_rule != dest_placement) ||
4202 (src_pool != dest_pool);
7c673cae 4203
7c673cae 4204 bool copy_first = false;
9f95a23c
TL
4205 if (astate->manifest) {
4206 if (!astate->manifest->has_tail()) {
7c673cae
FG
4207 copy_data = true;
4208 } else {
9f95a23c 4209 uint64_t head_size = astate->manifest->get_head_size();
7c673cae
FG
4210
4211 if (head_size > 0) {
4212 if (head_size > max_chunk_size) {
4213 copy_data = true;
4214 } else {
4215 copy_first = true;
4216 }
4217 }
4218 }
4219 }
4220
4221 if (petag) {
4222 const auto iter = attrs.find(RGW_ATTR_ETAG);
4223 if (iter != attrs.end()) {
11fdf7f2 4224 *petag = iter->second.to_str();
7c673cae
FG
4225 }
4226 }
4227
4228 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
11fdf7f2
TL
4229 attrs.erase(RGW_ATTR_TAIL_TAG);
4230 return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
9f95a23c 4231 mtime, real_time(), attrs, olh_epoch, delete_at, petag, dpp, y);
7c673cae
FG
4232 }
4233
9f95a23c 4234 RGWObjManifest::obj_iterator miter = astate->manifest->obj_begin();
7c673cae
FG
4235
4236 if (copy_first) { // we need to copy first chunk, not increase refcount
4237 ++miter;
4238 }
4239
4240 rgw_rados_ref ref;
4241 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
4242 if (ret < 0) {
4243 return ret;
4244 }
4245
7c673cae
FG
4246 bufferlist first_chunk;
4247
4248 bool copy_itself = (dest_obj == src_obj);
4249 RGWObjManifest *pmanifest;
9f95a23c 4250 ldpp_dout(dpp, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae
FG
4251
4252 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
4253 RGWRados::Object::Write write_op(&dest_op_target);
4254
4255 string tag;
4256
4257 if (ptag) {
4258 tag = *ptag;
4259 }
4260
4261 if (tag.empty()) {
4262 append_rand_alpha(cct, tag, tag, 32);
4263 }
4264
4265 if (!copy_itself) {
181888fb 4266 attrs.erase(RGW_ATTR_TAIL_TAG);
9f95a23c 4267 manifest = *astate->manifest;
7c673cae
FG
4268 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4269 if (tail_placement.bucket.name.empty()) {
4270 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
4271 }
3efd9988 4272 string ref_tag;
9f95a23c 4273 for (; miter != astate->manifest->obj_end(); ++miter) {
7c673cae 4274 ObjectWriteOperation op;
3efd9988
FG
4275 ref_tag = tag + '\0';
4276 cls_refcount_get(op, ref_tag, true);
7c673cae 4277 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
7c673cae 4278
9f95a23c
TL
4279 auto& ioctx = ref.pool.ioctx();
4280 ioctx.locator_set_key(loc.loc);
4281
4282 ret = rgw_rados_operate(ioctx, loc.oid, &op, null_yield);
7c673cae
FG
4283 if (ret < 0) {
4284 goto done_ret;
4285 }
4286
4287 ref_objs.push_back(loc);
4288 }
4289
4290 pmanifest = &manifest;
4291 } else {
9f95a23c 4292 pmanifest = &(*astate->manifest);
7c673cae
FG
4293 /* don't send the object's tail for garbage collection */
4294 astate->keep_tail = true;
4295 }
4296
4297 if (copy_first) {
9f95a23c 4298 ret = read_op.read(0, max_chunk_size, first_chunk, y);
7c673cae
FG
4299 if (ret < 0) {
4300 goto done_ret;
4301 }
4302
4303 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
4304 } else {
4305 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
4306 }
4307
4308 write_op.meta.data = &first_chunk;
4309 write_op.meta.manifest = pmanifest;
4310 write_op.meta.ptag = &tag;
4311 write_op.meta.owner = dest_bucket_info.owner;
4312 write_op.meta.mtime = mtime;
4313 write_op.meta.flags = PUT_OBJ_CREATE;
4314 write_op.meta.category = category;
4315 write_op.meta.olh_epoch = olh_epoch;
4316 write_op.meta.delete_at = delete_at;
181888fb 4317 write_op.meta.modify_tail = !copy_itself;
7c673cae 4318
9f95a23c 4319 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs, y);
7c673cae
FG
4320 if (ret < 0) {
4321 goto done_ret;
4322 }
4323
4324 return 0;
4325
4326done_ret:
4327 if (!copy_itself) {
4328 vector<rgw_raw_obj>::iterator riter;
4329
7c673cae 4330 /* rollback reference */
92f5a8d4 4331 string ref_tag = tag + '\0';
7c673cae
FG
4332 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4333 ObjectWriteOperation op;
92f5a8d4 4334 cls_refcount_put(op, ref_tag, true);
7c673cae 4335
9f95a23c 4336 ref.pool.ioctx().locator_set_key(riter->loc);
7c673cae 4337
9f95a23c 4338 int r = rgw_rados_operate(ref.pool.ioctx(), riter->oid, &op, null_yield);
7c673cae 4339 if (r < 0) {
9f95a23c 4340 ldpp_dout(dpp, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
7c673cae
FG
4341 }
4342 }
4343 }
4344 return ret;
4345}
4346
4347
4348int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
4349 RGWBucketInfo& dest_bucket_info,
11fdf7f2 4350 const rgw_placement_rule& dest_placement,
7c673cae 4351 RGWRados::Object::Read& read_op, off_t end,
11fdf7f2 4352 const rgw_obj& dest_obj,
7c673cae
FG
4353 real_time *mtime,
4354 real_time set_mtime,
4355 map<string, bufferlist>& attrs,
7c673cae
FG
4356 uint64_t olh_epoch,
4357 real_time delete_at,
9f95a23c
TL
4358 string *petag,
4359 const DoutPrefixProvider *dpp,
4360 optional_yield y)
7c673cae 4361{
7c673cae
FG
4362 string tag;
4363 append_rand_alpha(cct, tag, tag, 32);
4364
9f95a23c 4365 rgw::BlockingAioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
11fdf7f2 4366 using namespace rgw::putobj;
9f95a23c
TL
4367 // do not change the null_yield in the initialization of this AtomicObjectProcessor
4368 // it causes crashes in the ragweed tests
4369 AtomicObjectProcessor processor(&aio, this->store, dest_bucket_info, &dest_placement,
11fdf7f2 4370 dest_bucket_info.owner, obj_ctx,
9f95a23c
TL
4371 dest_obj, olh_epoch, tag, dpp, null_yield);
4372 int ret = processor.prepare(y);
7c673cae
FG
4373 if (ret < 0)
4374 return ret;
4375
4376 off_t ofs = 0;
4377
4378 do {
4379 bufferlist bl;
9f95a23c 4380 ret = read_op.read(ofs, end, bl, y);
11fdf7f2 4381 if (ret < 0) {
9f95a23c 4382 ldpp_dout(dpp, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
11fdf7f2
TL
4383 return ret;
4384 }
7c673cae
FG
4385
4386 uint64_t read_len = ret;
11fdf7f2
TL
4387 ret = processor.process(std::move(bl), ofs);
4388 if (ret < 0) {
4389 return ret;
4390 }
7c673cae
FG
4391
4392 ofs += read_len;
4393 } while (ofs <= end);
4394
11fdf7f2
TL
4395 // flush
4396 ret = processor.process({}, ofs);
4397 if (ret < 0) {
4398 return ret;
4399 }
4400
7c673cae
FG
4401 string etag;
4402 auto iter = attrs.find(RGW_ATTR_ETAG);
4403 if (iter != attrs.end()) {
4404 bufferlist& bl = iter->second;
11fdf7f2 4405 etag = bl.to_str();
7c673cae 4406 if (petag) {
11fdf7f2 4407 *petag = etag;
7c673cae
FG
4408 }
4409 }
4410
4411 uint64_t accounted_size;
4412 {
4413 bool compressed{false};
4414 RGWCompressionInfo cs_info;
4415 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4416 if (ret < 0) {
9f95a23c 4417 ldpp_dout(dpp, 0) << "ERROR: failed to read compression info" << dendl;
7c673cae
FG
4418 return ret;
4419 }
4420 // pass original size if compressed
4421 accounted_size = compressed ? cs_info.orig_size : ofs;
4422 }
4423
11fdf7f2 4424 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
9f95a23c 4425 nullptr, nullptr, nullptr, nullptr, nullptr, y);
7c673cae
FG
4426}
4427
11fdf7f2
TL
4428int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
4429 RGWBucketInfo& bucket_info,
4430 rgw_obj& obj,
4431 const rgw_placement_rule& placement_rule,
4432 const real_time& mtime,
9f95a23c
TL
4433 uint64_t olh_epoch,
4434 const DoutPrefixProvider *dpp,
4435 optional_yield y)
7c673cae 4436{
11fdf7f2
TL
4437 map<string, bufferlist> attrs;
4438 real_time read_mtime;
4439 uint64_t obj_size;
7c673cae 4440
9f95a23c
TL
4441 obj_ctx.set_atomic(obj);
4442
11fdf7f2
TL
4443 RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
4444 RGWRados::Object::Read read_op(&op_target);
7c673cae 4445
11fdf7f2
TL
4446 read_op.params.attrs = &attrs;
4447 read_op.params.lastmod = &read_mtime;
4448 read_op.params.obj_size = &obj_size;
7c673cae 4449
9f95a23c 4450 int ret = read_op.prepare(y);
11fdf7f2
TL
4451 if (ret < 0) {
4452 return ret;
7c673cae
FG
4453 }
4454
11fdf7f2
TL
4455 if (read_mtime != mtime) {
4456 /* raced */
4457 return -ECANCELED;
7c673cae
FG
4458 }
4459
9f95a23c
TL
4460 attrs.erase(RGW_ATTR_ID_TAG);
4461 attrs.erase(RGW_ATTR_TAIL_TAG);
4462
11fdf7f2
TL
4463 ret = copy_obj_data(obj_ctx,
4464 bucket_info,
4465 placement_rule,
4466 read_op,
4467 obj_size - 1,
4468 obj,
4469 nullptr /* pmtime */,
4470 mtime,
4471 attrs,
4472 olh_epoch,
4473 real_time(),
9f95a23c
TL
4474 nullptr /* petag */,
4475 dpp,
4476 y);
11fdf7f2
TL
4477 if (ret < 0) {
4478 return ret;
7c673cae
FG
4479 }
4480
11fdf7f2 4481 return 0;
7c673cae
FG
4482}
4483
9f95a23c 4484int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info, optional_yield y)
7c673cae 4485{
9f95a23c
TL
4486 constexpr uint NUM_ENTRIES = 1000u;
4487
7c673cae
FG
4488 rgw_obj_index_key marker;
4489 string prefix;
4490 bool is_truncated;
4491
4492 do {
9f95a23c
TL
4493 std::vector<rgw_bucket_dir_entry> ent_list;
4494 ent_list.reserve(NUM_ENTRIES);
4495
1adf2230
AA
4496 int r = cls_bucket_list_unordered(bucket_info,
4497 RGW_NO_SHARD,
4498 marker,
4499 prefix,
4500 NUM_ENTRIES,
4501 true,
4502 ent_list,
4503 &is_truncated,
9f95a23c
TL
4504 &marker,
4505 y);
4506 if (r < 0) {
7c673cae 4507 return r;
9f95a23c 4508 }
7c673cae
FG
4509
4510 string ns;
1adf2230 4511 for (auto const& dirent : ent_list) {
7c673cae
FG
4512 rgw_obj_key obj;
4513
9f95a23c 4514 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns)) {
7c673cae 4515 return -ENOTEMPTY;
9f95a23c 4516 }
7c673cae
FG
4517 }
4518 } while (is_truncated);
1adf2230 4519
7c673cae
FG
4520 return 0;
4521}
4522
4523/**
4524 * Delete a bucket.
4525 * bucket: the name of the bucket to delete
4526 * Returns 0 on success, -ERR# otherwise.
4527 */
9f95a23c 4528int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, optional_yield y, bool check_empty)
7c673cae
FG
4529{
4530 const rgw_bucket& bucket = bucket_info.bucket;
9f95a23c 4531 RGWSI_RADOS::Pool index_pool;
7c673cae 4532 map<int, string> bucket_objs;
9f95a23c 4533 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
4534 if (r < 0)
4535 return r;
4536
4537 if (check_empty) {
9f95a23c 4538 r = check_bucket_empty(bucket_info, y);
7c673cae
FG
4539 if (r < 0) {
4540 return r;
4541 }
4542 }
9f95a23c
TL
4543
4544 bool remove_ep = true;
4545
4546 if (objv_tracker.read_version.empty()) {
4547 RGWBucketEntryPoint ep;
4548 r = ctl.bucket->read_bucket_entrypoint_info(bucket_info.bucket,
4549 &ep,
4550 null_yield,
4551 RGWBucketCtl::Bucket::GetParams()
4552 .set_objv_tracker(&objv_tracker));
4553 if (r < 0 ||
4554 (!bucket_info.bucket.bucket_id.empty() &&
4555 ep.bucket.bucket_id != bucket_info.bucket.bucket_id)) {
4556 if (r != -ENOENT) {
4557 ldout(cct, 0) << "ERROR: read_bucket_entrypoint_info() bucket=" << bucket_info.bucket << " returned error: r=" << r << dendl;
4558 /* we have no idea what caused the error, will not try to remove it */
4559 }
4560 /*
4561 * either failed to read bucket entrypoint, or it points to a different bucket instance than
4562 * requested
4563 */
4564 remove_ep = false;
4565 }
4566 }
4567
4568 if (remove_ep) {
4569 r = ctl.bucket->remove_bucket_entrypoint_info(bucket_info.bucket, null_yield,
4570 RGWBucketCtl::Bucket::RemoveParams()
4571 .set_objv_tracker(&objv_tracker));
4572 if (r < 0)
4573 return r;
4574 }
7c673cae
FG
4575
4576 /* if the bucket is not synced we can remove the meta file */
11fdf7f2 4577 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
7c673cae 4578 RGWObjVersionTracker objv_tracker;
9f95a23c 4579 r = ctl.bucket->remove_bucket_instance_info(bucket, bucket_info, null_yield);
7c673cae
FG
4580 if (r < 0) {
4581 return r;
4582 }
f64942e4
AA
4583
4584 /* remove bucket index objects asynchronously by best effort */
9f95a23c 4585 (void) CLSRGWIssueBucketIndexClean(index_pool.ioctx(),
f64942e4
AA
4586 bucket_objs,
4587 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 4588 }
f64942e4 4589
7c673cae
FG
4590 return 0;
4591}
4592
4593int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
4594{
4595 RGWBucketInfo info;
4596 map<string, bufferlist> attrs;
31f18b77 4597 int r;
9f95a23c
TL
4598 auto obj_ctx = svc.sysobj->init_obj_ctx();
4599
31f18b77 4600 if (bucket.bucket_id.empty()) {
9f95a23c 4601 r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, &attrs);
31f18b77 4602 } else {
9f95a23c 4603 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs, null_yield);
31f18b77 4604 }
7c673cae
FG
4605 if (r < 0) {
4606 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4607 return r;
4608 }
4609
4610 info.owner = owner.get_id();
4611
4612 r = put_bucket_instance_info(info, false, real_time(), &attrs);
4613 if (r < 0) {
4614 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
4615 return r;
4616 }
4617
4618 return 0;
4619}
4620
4621
4622int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
4623{
4624 int ret = 0;
4625
4626 vector<rgw_bucket>::iterator iter;
4627
4628 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
4629 rgw_bucket& bucket = *iter;
4630 if (enabled)
4631 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
4632 else
4633 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
4634
4635 RGWBucketInfo info;
4636 map<string, bufferlist> attrs;
9f95a23c 4637 int r = get_bucket_info(&svc, bucket.tenant, bucket.name, info, NULL, null_yield, &attrs);
7c673cae
FG
4638 if (r < 0) {
4639 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4640 ret = r;
4641 continue;
4642 }
4643 if (enabled) {
4644 info.flags &= ~BUCKET_SUSPENDED;
4645 } else {
4646 info.flags |= BUCKET_SUSPENDED;
4647 }
4648
4649 r = put_bucket_instance_info(info, false, real_time(), &attrs);
4650 if (r < 0) {
4651 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
4652 ret = r;
4653 continue;
4654 }
4655 }
4656 return ret;
4657}
4658
4659int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
4660{
4661 RGWBucketInfo bucket_info;
9f95a23c 4662 int ret = get_bucket_info(&svc, bucket.tenant, bucket.name, bucket_info, NULL, null_yield);
7c673cae
FG
4663 if (ret < 0) {
4664 return ret;
4665 }
4666
4667 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
4668 return 0;
4669}
4670
4671int RGWRados::Object::complete_atomic_modification()
4672{
9f95a23c 4673 if ((!state->manifest)|| state->keep_tail)
7c673cae
FG
4674 return 0;
4675
4676 cls_rgw_obj_chain chain;
9f95a23c 4677 store->update_gc_chain(obj, *state->manifest, &chain);
7c673cae
FG
4678
4679 if (chain.empty()) {
4680 return 0;
4681 }
4682
181888fb 4683 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
9f95a23c
TL
4684 auto ret = store->gc->send_chain(chain, tag); // do it synchronously
4685 if (ret < 0) {
4686 //Delete objects inline if send chain to gc fails
4687 store->delete_objs_inline(chain, tag);
4688 }
4689 return 0;
7c673cae
FG
4690}
4691
4692void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
4693{
4694 RGWObjManifest::obj_iterator iter;
4695 rgw_raw_obj raw_head;
4696 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
4697 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
4698 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
4699 if (mobj == raw_head)
4700 continue;
4701 cls_rgw_obj_key key(mobj.oid);
4702 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
4703 }
4704}
4705
9f95a23c 4706int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4707{
9f95a23c 4708 return gc->send_chain(chain, tag);
7c673cae
FG
4709}
4710
9f95a23c 4711void RGWRados::delete_objs_inline(cls_rgw_obj_chain& chain, const string& tag)
7c673cae 4712{
9f95a23c
TL
4713 string last_pool;
4714 std::unique_ptr<IoCtx> ctx(new IoCtx);
4715 int ret = 0;
4716 for (auto liter = chain.objs.begin(); liter != chain.objs.end(); ++liter) {
4717 cls_rgw_obj& obj = *liter;
4718 if (obj.pool != last_pool) {
4719 ctx.reset(new IoCtx);
4720 ret = rgw_init_ioctx(get_rados_handle(), obj.pool, *ctx);
4721 if (ret < 0) {
4722 last_pool = "";
4723 ldout(cct, 0) << "ERROR: failed to create ioctx pool=" <<
4724 obj.pool << dendl;
4725 continue;
4726 }
4727 last_pool = obj.pool;
4728 }
4729 ctx->locator_set_key(obj.loc);
4730 const string& oid = obj.key.name; /* just stored raw oid there */
4731 ldout(cct, 5) << "delete_objs_inline: removing " << obj.pool <<
4732 ":" << obj.key.name << dendl;
4733 ObjectWriteOperation op;
4734 cls_refcount_put(op, tag, true);
4735 ret = ctx->operate(oid, &op);
4736 if (ret < 0) {
4737 ldout(cct, 5) << "delete_objs_inline: refcount put returned error " << ret << dendl;
4738 }
7c673cae 4739 }
7c673cae
FG
4740}
4741
4742static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
4743 map<RGWObjCategory, RGWStorageStats>& stats)
4744{
4745 for (const auto& pair : header.stats) {
4746 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
4747 const rgw_bucket_category_stats& header_stats = pair.second;
4748
4749 RGWStorageStats& s = stats[category];
4750
4751 s.category = category;
4752 s.size += header_stats.total_size;
4753 s.size_rounded += header_stats.total_size_rounded;
4754 s.size_utilized += header_stats.actual_size;
4755 s.num_objects += header_stats.num_entries;
4756 }
4757}
4758
4759int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
4760 map<RGWObjCategory, RGWStorageStats> *existing_stats,
4761 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
4762{
9f95a23c 4763 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
4764 // key - bucket index object id
4765 // value - bucket index check OP returned result with the given bucket index object (shard)
4766 map<int, string> oids;
4767 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
31f18b77 4768
9f95a23c 4769 int ret = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &oids, nullptr);
31f18b77
FG
4770 if (ret < 0) {
4771 return ret;
4772 }
7c673cae 4773
9f95a23c
TL
4774 for (auto& iter : oids) {
4775 bucket_objs_ret[iter.first] = rgw_cls_check_index_ret();
4776 }
4777
4778 ret = CLSRGWIssueBucketCheck(index_pool.ioctx(), oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77
FG
4779 if (ret < 0) {
4780 return ret;
4781 }
7c673cae
FG
4782
4783 // Aggregate results (from different shards if there is any)
4784 map<int, struct rgw_cls_check_index_ret>::iterator iter;
4785 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
4786 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
4787 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
4788 }
4789
4790 return 0;
4791}
4792
4793int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
4794{
9f95a23c 4795 RGWSI_RADOS::Pool index_pool;
7c673cae 4796 map<int, string> bucket_objs;
31f18b77 4797
9f95a23c 4798 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77 4799 if (r < 0) {
7c673cae 4800 return r;
31f18b77 4801 }
7c673cae 4802
9f95a23c 4803 return CLSRGWIssueBucketRebuild(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
4804}
4805
f64942e4 4806int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77 4807{
9f95a23c 4808 RGWSI_RADOS::Pool index_pool;
31f18b77
FG
4809 map<int, string> bucket_objs;
4810
9f95a23c 4811 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
31f18b77
FG
4812 if (r < 0) {
4813 return r;
4814 }
4815
9f95a23c 4816 return CLSRGWIssueSetBucketResharding(index_pool.ioctx(), bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77 4817}
7c673cae 4818
9f95a23c 4819int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, optional_yield y)
7c673cae
FG
4820{
4821 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
4822 std::string oid, key;
4823 get_obj_bucket_and_oid_loc(obj, oid, key);
4824 if (!rctx)
4825 return 0;
4826
4827 RGWObjState *state = NULL;
4828
9f95a23c 4829 int r = get_obj_state(rctx, bucket_info, obj, &state, false, y);
7c673cae
FG
4830 if (r < 0)
4831 return r;
4832
4833 if (!state->is_atomic) {
4834 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
4835 return -EINVAL;
4836 }
4837
181888fb
FG
4838 string tag;
4839
4840 if (state->tail_tag.length() > 0) {
4841 tag = state->tail_tag.c_str();
4842 } else if (state->obj_tag.length() > 0) {
4843 tag = state->obj_tag.c_str();
4844 } else {
7c673cae
FG
4845 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
4846 return -EINVAL;
4847 }
4848
7c673cae
FG
4849 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
4850
9f95a23c
TL
4851 cls_rgw_obj_chain chain;
4852 update_gc_chain(state->obj, *state->manifest, &chain);
4853 return gc->async_defer_chain(tag, chain);
7c673cae
FG
4854}
4855
4856void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
4857{
4858 list<string> prefixes;
4859 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
4860 cls_rgw_remove_obj(op, prefixes);
4861}
4862
4863void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
4864{
4865 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
4866}
4867
4868void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
4869{
4870 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
4871}
4872
9f95a23c
TL
4873struct tombstone_entry {
4874 ceph::real_time mtime;
4875 uint32_t zone_short_id;
4876 uint64_t pg_ver;
4877
4878 tombstone_entry() = default;
4879 explicit tombstone_entry(const RGWObjState& state)
4880 : mtime(state.mtime), zone_short_id(state.zone_short_id),
4881 pg_ver(state.pg_ver) {}
4882};
7c673cae
FG
4883
4884/**
4885 * Delete an object.
4886 * bucket: name of the bucket storing the object
4887 * obj: name of the object to delete
4888 * Returns: 0 on success, -ERR# otherwise.
4889 */
9f95a23c 4890int RGWRados::Object::Delete::delete_obj(optional_yield y)
7c673cae
FG
4891{
4892 RGWRados *store = target->get_store();
4893 rgw_obj& src_obj = target->get_obj();
4894 const string& instance = src_obj.key.instance;
4895 rgw_obj obj = src_obj;
4896
4897 if (instance == "null") {
4898 obj.key.instance.clear();
4899 }
4900
4901 bool explicit_marker_version = (!params.marker_version_id.empty());
4902
4903 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
4904 if (instance.empty() || explicit_marker_version) {
4905 rgw_obj marker = obj;
4906
4907 if (!params.marker_version_id.empty()) {
4908 if (params.marker_version_id != "null") {
4909 marker.key.set_instance(params.marker_version_id);
4910 }
4911 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
4912 store->gen_rand_obj_instance_name(&marker);
4913 }
4914
4915 result.version_id = marker.key.instance;
91327a77
AA
4916 if (result.version_id.empty())
4917 result.version_id = "null";
7c673cae
FG
4918 result.delete_marker = true;
4919
4920 struct rgw_bucket_dir_entry_meta meta;
4921
4922 meta.owner = params.obj_owner.get_id().to_str();
4923 meta.owner_display_name = params.obj_owner.get_display_name();
4924
4925 if (real_clock::is_zero(params.mtime)) {
4926 meta.mtime = real_clock::now();
4927 } else {
4928 meta.mtime = params.mtime;
4929 }
4930
9f95a23c 4931 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, y, params.zones_trace);
7c673cae
FG
4932 if (r < 0) {
4933 return r;
4934 }
4935 } else {
4936 rgw_bucket_dir_entry dirent;
4937
4938 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
4939 if (r < 0) {
4940 return r;
4941 }
4942 result.delete_marker = dirent.is_delete_marker();
9f95a23c 4943 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, y, params.zones_trace);
7c673cae
FG
4944 if (r < 0) {
4945 return r;
4946 }
4947 result.version_id = instance;
4948 }
4949
4950 BucketShard *bs;
4951 int r = target->get_bucket_shard(&bs);
4952 if (r < 0) {
4953 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
4954 return r;
4955 }
4956
9f95a23c
TL
4957 r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
4958 if (r < 0) {
4959 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
4960 return r;
7c673cae
FG
4961 }
4962
4963 return 0;
4964 }
4965
4966 rgw_rados_ref ref;
4967 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
4968 if (r < 0) {
4969 return r;
4970 }
4971
4972 RGWObjState *state;
9f95a23c 4973 r = target->get_state(&state, false, y);
7c673cae
FG
4974 if (r < 0)
4975 return r;
4976
4977 ObjectWriteOperation op;
4978
4979 if (!real_clock::is_zero(params.unmod_since)) {
4980 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
4981 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
4982 if (!params.high_precision_time) {
4983 ctime.tv_nsec = 0;
4984 unmod.tv_nsec = 0;
4985 }
4986
4987 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
4988 if (ctime > unmod) {
4989 return -ERR_PRECONDITION_FAILED;
4990 }
4991
4992 /* only delete object if mtime is less than or equal to params.unmod_since */
4993 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
4994 }
11fdf7f2 4995 uint64_t obj_accounted_size = state->accounted_size;
7c673cae 4996
9f95a23c
TL
4997 if(params.abortmp) {
4998 obj_accounted_size = params.parts_accounted_size;
4999 }
5000
7c673cae
FG
5001 if (!real_clock::is_zero(params.expiration_time)) {
5002 bufferlist bl;
5003 real_time delete_at;
5004
5005 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5006 try {
11fdf7f2
TL
5007 auto iter = bl.cbegin();
5008 decode(delete_at, iter);
7c673cae
FG
5009 } catch (buffer::error& err) {
5010 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
5011 return -EIO;
5012 }
5013
5014 if (params.expiration_time != delete_at) {
5015 return -ERR_PRECONDITION_FAILED;
5016 }
5017 } else {
5018 return -ERR_PRECONDITION_FAILED;
5019 }
5020 }
5021
5022 if (!state->exists) {
5023 target->invalidate_state();
5024 return -ENOENT;
5025 }
5026
9f95a23c 5027 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false, y);
7c673cae
FG
5028 if (r < 0)
5029 return r;
5030
5031 RGWBucketInfo& bucket_info = target->get_bucket_info();
5032
5033 RGWRados::Bucket bop(store, bucket_info);
5034 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
5035
5036 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
5037 index_op.set_bilog_flags(params.bilog_flags);
5038
9f95a23c 5039 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag, y);
7c673cae
FG
5040 if (r < 0)
5041 return r;
5042
5043 store->remove_rgw_head_obj(op);
9f95a23c
TL
5044
5045 auto& ioctx = ref.pool.ioctx();
5046 r = rgw_rados_operate(ioctx, ref.obj.oid, &op, null_yield);
94b18763
FG
5047
5048 /* raced with another operation, object state is indeterminate */
5049 const bool need_invalidate = (r == -ECANCELED);
7c673cae 5050
9f95a23c 5051 int64_t poolid = ioctx.get_id();
7c673cae
FG
5052 if (r >= 0) {
5053 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5054 if (obj_tombstone_cache) {
5055 tombstone_entry entry{*state};
5056 obj_tombstone_cache->add(obj, entry);
5057 }
9f95a23c 5058 r = index_op.complete_del(poolid, ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 5059
7c673cae
FG
5060 int ret = target->complete_atomic_modification();
5061 if (ret < 0) {
5062 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
5063 }
5064 /* other than that, no need to propagate error */
224ce89b
WB
5065 } else {
5066 int ret = index_op.cancel();
5067 if (ret < 0) {
5068 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
5069 }
7c673cae
FG
5070 }
5071
5072 if (need_invalidate) {
5073 target->invalidate_state();
5074 }
5075
5076 if (r < 0)
5077 return r;
5078
5079 /* update quota cache */
11fdf7f2 5080 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
7c673cae
FG
5081
5082 return 0;
5083}
5084
5085int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
5086 const RGWBucketInfo& bucket_info,
5087 const rgw_obj& obj,
5088 int versioning_status,
5089 uint16_t bilog_flags,
31f18b77
FG
5090 const real_time& expiration_time,
5091 rgw_zone_set *zones_trace)
7c673cae
FG
5092{
5093 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5094 RGWRados::Object::Delete del_op(&del_target);
5095
5096 del_op.params.bucket_owner = bucket_info.owner;
5097 del_op.params.versioning_status = versioning_status;
5098 del_op.params.bilog_flags = bilog_flags;
5099 del_op.params.expiration_time = expiration_time;
31f18b77 5100 del_op.params.zones_trace = zones_trace;
7c673cae 5101
9f95a23c 5102 return del_op.delete_obj(null_yield);
7c673cae
FG
5103}
5104
5105int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
5106{
5107 rgw_rados_ref ref;
224ce89b 5108 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
5109 if (r < 0) {
5110 return r;
5111 }
5112
5113 ObjectWriteOperation op;
5114
5115 op.remove();
9f95a23c 5116 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
5117 if (r < 0)
5118 return r;
5119
5120 return 0;
5121}
5122
494da23a 5123int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime)
7c673cae
FG
5124{
5125 std::string oid, key;
5126 get_obj_bucket_and_oid_loc(obj, oid, key);
5127
11fdf7f2 5128 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5129
5130 RGWBucketInfo bucket_info;
9f95a23c 5131 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL, null_yield);
7c673cae
FG
5132 if (ret < 0) {
5133 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
5134 return ret;
5135 }
5136
5137 RGWRados::Bucket bop(this, bucket_info);
5138 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5139
494da23a 5140 return index_op.complete_del(-1 /* pool */, 0, mtime, NULL);
7c673cae
FG
5141}
5142
5143static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
5144{
5145 string tag;
5146
5147 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
5148 if (mi != manifest.obj_end()) {
5149 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5150 ++mi;
5151 tag = mi.get_location().get_raw_obj(store).oid;
5152 tag.append("_");
5153 }
5154
5155 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5156 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5157 MD5 hash;
11fdf7f2 5158 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
7c673cae
FG
5159
5160 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5161 if (iter != attrset.end()) {
5162 bufferlist& bl = iter->second;
11fdf7f2 5163 hash.Update((const unsigned char *)bl.c_str(), bl.length());
7c673cae
FG
5164 }
5165
5166 hash.Final(md5);
5167 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5168 tag.append(md5_str);
5169
5170 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
5171
5172 tag_bl.append(tag.c_str(), tag.size() + 1);
5173}
5174
5175static bool is_olh(map<string, bufferlist>& attrs)
5176{
5177 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5178 return (iter != attrs.end());
5179}
5180
5181static bool has_olh_tag(map<string, bufferlist>& attrs)
5182{
5183 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5184 return (iter != attrs.end());
5185}
5186
5187int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5188 RGWObjState *olh_state, RGWObjState **target_state, optional_yield y)
7c673cae 5189{
11fdf7f2 5190 ceph_assert(olh_state->is_olh);
7c673cae
FG
5191
5192 rgw_obj target;
5193 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
5194 if (r < 0) {
5195 return r;
5196 }
9f95a23c 5197 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false, y);
7c673cae
FG
5198 if (r < 0) {
5199 return r;
5200 }
5201
5202 return 0;
5203}
5204
7c673cae 5205int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5206 RGWObjState **state, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5207{
5208 if (obj.empty()) {
5209 return -EINVAL;
5210 }
5211
5212 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5213
11fdf7f2 5214 RGWObjState *s = rctx->get_state(obj);
7c673cae
FG
5215 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
5216 *state = s;
5217 if (s->has_attrs) {
5218 if (s->is_olh && need_follow_olh) {
9f95a23c 5219 return get_olh_target_state(*rctx, bucket_info, obj, s, state, y);
7c673cae
FG
5220 }
5221 return 0;
5222 }
5223
5224 s->obj = obj;
5225
5226 rgw_raw_obj raw_obj;
5227 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5228
5229 int r = -ENOENT;
5230
5231 if (!assume_noent) {
9f95a23c 5232 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL, y);
7c673cae
FG
5233 }
5234
5235 if (r == -ENOENT) {
5236 s->exists = false;
5237 s->has_attrs = true;
5238 tombstone_entry entry;
5239 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5240 s->mtime = entry.mtime;
5241 s->zone_short_id = entry.zone_short_id;
5242 s->pg_ver = entry.pg_ver;
5243 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
5244 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5245 } else {
5246 s->mtime = real_time();
5247 }
5248 return 0;
5249 }
5250 if (r < 0)
5251 return r;
5252
5253 s->exists = true;
5254 s->has_attrs = true;
5255 s->accounted_size = s->size;
5256
11fdf7f2
TL
5257 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5258 if (iter != s->attrset.end()) {
5259 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5260 bufferlist& bletag = iter->second;
5261 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5262 bufferlist newbl;
5263 bletag.splice(0, bletag.length() - 1, &newbl);
5264 bletag.claim(newbl);
5265 }
5266 }
5267
5268 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
5269 const bool compressed = (iter != s->attrset.end());
5270 if (compressed) {
7c673cae
FG
5271 // use uncompressed size for accounted_size
5272 try {
5273 RGWCompressionInfo info;
11fdf7f2
TL
5274 auto p = iter->second.cbegin();
5275 decode(info, p);
31f18b77 5276 s->accounted_size = info.orig_size;
7c673cae
FG
5277 } catch (buffer::error&) {
5278 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
5279 return -EIO;
5280 }
5281 }
5282
5283 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5284 if (iter != s->attrset.end()) {
5285 bufferlist bl = iter->second;
5286 bufferlist::iterator it = bl.begin();
5287 it.copy(bl.length(), s->shadow_obj);
5288 s->shadow_obj[bl.length()] = '\0';
5289 }
5290 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
5291 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5292 if (ttiter != s->attrset.end()) {
5293 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5294 }
7c673cae
FG
5295
5296 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5297 if (manifest_bl.length()) {
11fdf7f2 5298 auto miter = manifest_bl.cbegin();
7c673cae 5299 try {
9f95a23c
TL
5300 s->manifest.emplace();
5301 decode(*s->manifest, miter);
5302 s->manifest->set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
7c673cae 5303 broken due to old bugs */
9f95a23c 5304 s->size = s->manifest->get_obj_size();
31f18b77
FG
5305 if (!compressed)
5306 s->accounted_size = s->size;
7c673cae
FG
5307 } catch (buffer::error& err) {
5308 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
5309 return -EIO;
5310 }
9f95a23c 5311 ldout(cct, 10) << "manifest: total_size = " << s->manifest->get_obj_size() << dendl;
11fdf7f2 5312 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
9f95a23c 5313 s->manifest->has_explicit_objs()) {
7c673cae 5314 RGWObjManifest::obj_iterator mi;
9f95a23c 5315 for (mi = s->manifest->obj_begin(); mi != s->manifest->obj_end(); ++mi) {
7c673cae
FG
5316 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
5317 }
5318 }
5319
5320 if (!s->obj_tag.length()) {
5321 /*
5322 * Uh oh, something's wrong, object with manifest should have tag. Let's
5323 * create one out of the manifest, would be unique
5324 */
9f95a23c 5325 generate_fake_tag(this, s->attrset, *s->manifest, manifest_bl, s->obj_tag);
7c673cae
FG
5326 s->fake_tag = true;
5327 }
5328 }
5329 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5330 if (aiter != s->attrset.end()) {
5331 bufferlist& pg_ver_bl = aiter->second;
5332 if (pg_ver_bl.length()) {
11fdf7f2 5333 auto pgbl = pg_ver_bl.cbegin();
7c673cae 5334 try {
11fdf7f2 5335 decode(s->pg_ver, pgbl);
7c673cae
FG
5336 } catch (buffer::error& err) {
5337 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5338 }
5339 }
5340 }
5341 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5342 if (aiter != s->attrset.end()) {
5343 bufferlist& zone_short_id_bl = aiter->second;
5344 if (zone_short_id_bl.length()) {
11fdf7f2 5345 auto zbl = zone_short_id_bl.cbegin();
7c673cae 5346 try {
11fdf7f2 5347 decode(s->zone_short_id, zbl);
7c673cae
FG
5348 } catch (buffer::error& err) {
5349 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5350 }
5351 }
5352 }
5353 if (s->obj_tag.length())
31f18b77 5354 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
7c673cae
FG
5355 else
5356 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5357
5358 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5359 * it exist, and not only if is_olh() returns true
5360 */
5361 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5362 if (iter != s->attrset.end()) {
5363 s->olh_tag = iter->second;
5364 }
5365
5366 if (is_olh(s->attrset)) {
5367 s->is_olh = true;
5368
5369 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
5370
5371 if (need_follow_olh) {
9f95a23c
TL
5372 return get_olh_target_state(*rctx, bucket_info, obj, s, state, y);
5373 } else if (obj.key.have_null_instance() && !s->manifest) {
11fdf7f2
TL
5374 // read null version, and the head object only have olh info
5375 s->exists = false;
5376 return -ENOENT;
7c673cae
FG
5377 }
5378 }
5379
5380 return 0;
5381}
5382
5383int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
9f95a23c 5384 bool follow_olh, optional_yield y, bool assume_noent)
7c673cae
FG
5385{
5386 int ret;
5387
5388 do {
9f95a23c 5389 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, y, assume_noent);
7c673cae
FG
5390 } while (ret == -EAGAIN);
5391
5392 return ret;
5393}
5394
9f95a23c 5395int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest, optional_yield y)
7c673cae
FG
5396{
5397 RGWObjState *astate;
9f95a23c 5398 int r = get_state(&astate, true, y);
7c673cae
FG
5399 if (r < 0) {
5400 return r;
5401 }
5402
9f95a23c 5403 *pmanifest = &(*astate->manifest);
7c673cae
FG
5404
5405 return 0;
5406}
5407
9f95a23c 5408int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest, optional_yield y)
7c673cae
FG
5409{
5410 RGWObjState *state;
9f95a23c 5411 int r = source->get_state(&state, true, y);
7c673cae
FG
5412 if (r < 0)
5413 return r;
5414 if (!state->exists)
5415 return -ENOENT;
5416 if (!state->get_attr(name, dest))
5417 return -ENODATA;
5418
5419 return 0;
5420}
5421
7c673cae
FG
5422int RGWRados::Object::Stat::stat_async()
5423{
5424 RGWObjectCtx& ctx = source->get_ctx();
5425 rgw_obj& obj = source->get_obj();
5426 RGWRados *store = source->get_store();
5427
11fdf7f2 5428 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
7c673cae
FG
5429 result.obj = obj;
5430 if (s->has_attrs) {
5431 state.ret = 0;
5432 result.size = s->size;
5433 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5434 result.attrs = s->attrset;
7c673cae
FG
5435 result.manifest = s->manifest;
5436 return 0;
5437 }
5438
5439 string oid;
5440 string loc;
5441 get_obj_bucket_and_oid_loc(obj, oid, loc);
5442
5443 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
5444 if (r < 0) {
5445 return r;
5446 }
5447
5448 librados::ObjectReadOperation op;
5449 op.stat2(&result.size, &result.mtime, NULL);
5450 op.getxattrs(&result.attrs, NULL);
9f95a23c 5451 state.completion = librados::Rados::aio_create_completion(nullptr, nullptr);
7c673cae
FG
5452 state.io_ctx.locator_set_key(loc);
5453 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5454 if (r < 0) {
5455 ldout(store->ctx(), 5) << __func__
5456 << ": ERROR: aio_operate() returned ret=" << r
5457 << dendl;
5458 return r;
5459 }
5460
5461 return 0;
5462}
5463
5464
5465int RGWRados::Object::Stat::wait()
5466{
5467 if (!state.completion) {
5468 return state.ret;
5469 }
5470
9f95a23c 5471 state.completion->wait_for_complete();
7c673cae
FG
5472 state.ret = state.completion->get_return_value();
5473 state.completion->release();
5474
5475 if (state.ret != 0) {
5476 return state.ret;
5477 }
5478
5479 return finish();
5480}
5481
5482int RGWRados::Object::Stat::finish()
5483{
5484 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5485 if (iter != result.attrs.end()) {
5486 bufferlist& bl = iter->second;
11fdf7f2 5487 auto biter = bl.cbegin();
7c673cae 5488 try {
9f95a23c
TL
5489 result.manifest.emplace();
5490 decode(*result.manifest, biter);
7c673cae
FG
5491 } catch (buffer::error& err) {
5492 RGWRados *store = source->get_store();
5493 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
5494 return -EIO;
5495 }
7c673cae
FG
5496 }
5497
5498 return 0;
5499}
5500
7c673cae
FG
5501int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
5502 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
9f95a23c 5503 ObjectOperation& op, RGWObjState **pstate, optional_yield y)
7c673cae
FG
5504{
5505 if (!rctx)
5506 return 0;
5507
9f95a23c 5508 int r = get_obj_state(rctx, bucket_info, obj, pstate, false, y);
7c673cae
FG
5509 if (r < 0)
5510 return r;
5511
11fdf7f2
TL
5512 return append_atomic_test(*pstate, op);
5513}
7c673cae 5514
11fdf7f2
TL
5515int RGWRados::append_atomic_test(const RGWObjState* state,
5516 librados::ObjectOperation& op)
5517{
7c673cae 5518 if (!state->is_atomic) {
11fdf7f2 5519 ldout(cct, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
7c673cae
FG
5520 return 0;
5521 }
5522
5523 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
5524 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5525 } else {
5526 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
5527 }
5528 return 0;
5529}
5530
9f95a23c 5531int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, optional_yield y, bool assume_noent)
7c673cae 5532{
9f95a23c 5533 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, y, assume_noent);
7c673cae
FG
5534}
5535
5536void RGWRados::Object::invalidate_state()
5537{
11fdf7f2 5538 ctx.invalidate(obj);
7c673cae
FG
5539}
5540
5541int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb 5542 const char *if_match, const char *if_nomatch, bool removal_op,
9f95a23c 5543 bool modify_tail, optional_yield y)
7c673cae 5544{
9f95a23c 5545 int r = get_state(&state, false, y);
7c673cae
FG
5546 if (r < 0)
5547 return r;
5548
9f95a23c 5549 bool need_guard = ((state->manifest) || (state->obj_tag.length() != 0) ||
7c673cae
FG
5550 if_match != NULL || if_nomatch != NULL) &&
5551 (!state->fake_tag);
5552
5553 if (!state->is_atomic) {
5554 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
5555
5556 if (reset_obj) {
5557 op.create(false);
5558 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
5559 }
5560
5561 return 0;
5562 }
5563
5564 if (need_guard) {
5565 /* first verify that the object wasn't replaced under */
5566 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
5567 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
5568 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
5569 }
5570
5571 if (if_match) {
5572 if (strcmp(if_match, "*") == 0) {
5573 // test the object is existing
5574 if (!state->exists) {
5575 return -ERR_PRECONDITION_FAILED;
5576 }
5577 } else {
5578 bufferlist bl;
5579 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5580 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
5581 return -ERR_PRECONDITION_FAILED;
5582 }
5583 }
5584 }
5585
5586 if (if_nomatch) {
5587 if (strcmp(if_nomatch, "*") == 0) {
5588 // test the object is NOT existing
5589 if (state->exists) {
5590 return -ERR_PRECONDITION_FAILED;
5591 }
5592 } else {
5593 bufferlist bl;
5594 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
5595 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
5596 return -ERR_PRECONDITION_FAILED;
5597 }
5598 }
5599 }
5600 }
5601
5602 if (reset_obj) {
5603 if (state->exists) {
5604 op.create(false);
5605 store->remove_rgw_head_obj(op);
5606 } else {
5607 op.create(true);
5608 }
5609 }
5610
5611 if (removal_op) {
5612 /* the object is being removed, no need to update its tag */
5613 return 0;
5614 }
5615
5616 if (ptag) {
5617 state->write_tag = *ptag;
5618 } else {
5619 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
5620 }
5621 bufferlist bl;
5622 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
5623
5624 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
5625
5626 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
5627 if (modify_tail) {
5628 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
5629 }
7c673cae
FG
5630
5631 return 0;
5632}
5633
7c673cae
FG
5634/**
5635 * Set an attr on an object.
5636 * bucket: name of the bucket holding the object
5637 * obj: name of the object to set the attr on
5638 * name: the attr to set
5639 * bl: the contents of the attr
5640 * Returns: 0 on success, -ERR# otherwise.
5641 */
5642int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
5643{
5644 map<string, bufferlist> attrs;
5645 attrs[name] = bl;
9f95a23c 5646 return set_attrs(ctx, bucket_info, obj, attrs, NULL, null_yield);
7c673cae
FG
5647}
5648
494da23a 5649int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
7c673cae 5650 map<string, bufferlist>& attrs,
9f95a23c
TL
5651 map<string, bufferlist>* rmattrs,
5652 optional_yield y)
7c673cae 5653{
494da23a
TL
5654 rgw_obj obj = src_obj;
5655 if (obj.key.instance == "null") {
5656 obj.key.instance.clear();
5657 }
5658
7c673cae
FG
5659 rgw_rados_ref ref;
5660 int r = get_obj_head_ref(bucket_info, obj, &ref);
5661 if (r < 0) {
5662 return r;
5663 }
5664 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5665
5666 ObjectWriteOperation op;
5667 RGWObjState *state = NULL;
5668
9f95a23c 5669 r = append_atomic_test(rctx, bucket_info, obj, op, &state, y);
7c673cae
FG
5670 if (r < 0)
5671 return r;
5672
494da23a 5673 // ensure null version object exist
9f95a23c 5674 if (src_obj.key.instance == "null" && !state->manifest) {
494da23a
TL
5675 return -ENOENT;
5676 }
5677
7c673cae
FG
5678 map<string, bufferlist>::iterator iter;
5679 if (rmattrs) {
5680 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5681 const string& name = iter->first;
5682 op.rmxattr(name.c_str());
5683 }
5684 }
5685
5686 const rgw_bucket& bucket = obj.bucket;
5687
5688 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5689 const string& name = iter->first;
5690 bufferlist& bl = iter->second;
5691
5692 if (!bl.length())
5693 continue;
5694
5695 op.setxattr(name.c_str(), bl);
5696
5697 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
5698 real_time ts;
5699 try {
11fdf7f2 5700 decode(ts, bl);
7c673cae
FG
5701
5702 rgw_obj_index_key obj_key;
5703 obj.key.get_index_key(&obj_key);
5704
9f95a23c 5705 obj_expirer->hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
7c673cae
FG
5706 } catch (buffer::error& err) {
5707 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
5708 }
5709 }
5710 }
5711
5712 if (!op.size())
5713 return 0;
5714
9f95a23c 5715 RGWObjectCtx obj_ctx(this->store);
7c673cae
FG
5716
5717 bufferlist bl;
5718 RGWRados::Bucket bop(this, bucket_info);
5719 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5720
5721 if (state) {
5722 string tag;
5723 append_rand_alpha(cct, tag, tag, 32);
5724 state->write_tag = tag;
9f95a23c 5725 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag, y);
7c673cae
FG
5726
5727 if (r < 0)
5728 return r;
5729
5730 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
5731 op.setxattr(RGW_ATTR_ID_TAG, bl);
5732 }
5733
3efd9988
FG
5734
5735 real_time mtime = real_clock::now();
5736 struct timespec mtime_ts = real_clock::to_timespec(mtime);
5737 op.mtime2(&mtime_ts);
9f95a23c
TL
5738 auto& ioctx = ref.pool.ioctx();
5739 r = rgw_rados_operate(ioctx, ref.obj.oid, &op, null_yield);
7c673cae
FG
5740 if (state) {
5741 if (r >= 0) {
5742 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
5743 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
5744 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
11fdf7f2
TL
5745 string etag = rgw_bl_str(etag_bl);
5746 string content_type = rgw_bl_str(content_type_bl);
5747 string storage_class;
5748 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
5749 if (iter != attrs.end()) {
5750 storage_class = rgw_bl_str(iter->second);
5751 }
9f95a23c
TL
5752 uint64_t epoch = ioctx.get_last_version();
5753 int64_t poolid = ioctx.get_id();
7c673cae 5754 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
11fdf7f2
TL
5755 mtime, etag, content_type, storage_class, &acl_bl,
5756 RGWObjCategory::Main, NULL);
7c673cae
FG
5757 } else {
5758 int ret = index_op.cancel();
5759 if (ret < 0) {
5760 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
5761 }
5762 }
5763 }
5764 if (r < 0)
5765 return r;
5766
5767 if (state) {
5768 state->obj_tag.swap(bl);
5769 if (rmattrs) {
5770 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
5771 state->attrset.erase(iter->first);
5772 }
5773 }
92f5a8d4 5774
7c673cae
FG
5775 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
5776 state->attrset[iter->first] = iter->second;
5777 }
92f5a8d4
TL
5778
5779 auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
5780 if (iter != state->attrset.end()) {
5781 iter->second = state->obj_tag;
5782 }
7c673cae
FG
5783 }
5784
5785 return 0;
5786}
5787
9f95a23c 5788int RGWRados::Object::Read::prepare(optional_yield y)
7c673cae
FG
5789{
5790 RGWRados *store = source->get_store();
5791 CephContext *cct = store->ctx();
5792
5793 bufferlist etag;
5794
5795 map<string, bufferlist>::iterator iter;
5796
5797 RGWObjState *astate;
9f95a23c 5798 int r = source->get_state(&astate, true, y);
7c673cae
FG
5799 if (r < 0)
5800 return r;
5801
5802 if (!astate->exists) {
5803 return -ENOENT;
5804 }
5805
5806 const RGWBucketInfo& bucket_info = source->get_bucket_info();
5807
5808 state.obj = astate->obj;
5809 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
5810
11fdf7f2
TL
5811 state.cur_pool = state.head_obj.pool;
5812 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
5813
5814 r = store->get_obj_head_ioctx(bucket_info, state.obj, state.cur_ioctx);
7c673cae
FG
5815 if (r < 0) {
5816 return r;
5817 }
eafe8130
TL
5818 if (params.target_obj) {
5819 *params.target_obj = state.obj;
5820 }
7c673cae
FG
5821 if (params.attrs) {
5822 *params.attrs = astate->attrset;
11fdf7f2 5823 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
7c673cae
FG
5824 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
5825 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
5826 }
5827 }
5828 }
5829
5830 /* Convert all times go GMT to make them compatible */
5831 if (conds.mod_ptr || conds.unmod_ptr) {
5832 obj_time_weight src_weight;
5833 src_weight.init(astate);
5834 src_weight.high_precision = conds.high_precision_time;
5835
5836 obj_time_weight dest_weight;
5837 dest_weight.high_precision = conds.high_precision_time;
5838
9f95a23c 5839 if (conds.mod_ptr && !conds.if_nomatch) {
7c673cae
FG
5840 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
5841 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
5842 if (!(dest_weight < src_weight)) {
5843 return -ERR_NOT_MODIFIED;
5844 }
5845 }
5846
9f95a23c 5847 if (conds.unmod_ptr && !conds.if_match) {
7c673cae
FG
5848 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
5849 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
5850 if (dest_weight < src_weight) {
5851 return -ERR_PRECONDITION_FAILED;
5852 }
5853 }
5854 }
5855 if (conds.if_match || conds.if_nomatch) {
9f95a23c 5856 r = get_attr(RGW_ATTR_ETAG, etag, y);
7c673cae
FG
5857 if (r < 0)
5858 return r;
5859
5860 if (conds.if_match) {
5861 string if_match_str = rgw_string_unquote(conds.if_match);
11fdf7f2
TL
5862 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
5863 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
7c673cae
FG
5864 return -ERR_PRECONDITION_FAILED;
5865 }
5866 }
5867
5868 if (conds.if_nomatch) {
5869 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
11fdf7f2
TL
5870 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
5871 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
7c673cae
FG
5872 return -ERR_NOT_MODIFIED;
5873 }
5874 }
5875 }
5876
5877 if (params.obj_size)
5878 *params.obj_size = astate->size;
5879 if (params.lastmod)
5880 *params.lastmod = astate->mtime;
5881
5882 return 0;
5883}
5884
5885int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
5886{
5887 if (ofs < 0) {
5888 ofs += obj_size;
11fdf7f2
TL
5889 if (ofs < 0)
5890 ofs = 0;
5891 end = obj_size - 1;
5892 } else if (end < 0) {
5893 end = obj_size - 1;
7c673cae
FG
5894 }
5895
11fdf7f2
TL
5896 if (obj_size > 0) {
5897 if (ofs >= (off_t)obj_size) {
5898 return -ERANGE;
5899 }
5900 if (end >= (off_t)obj_size) {
5901 end = obj_size - 1;
7c673cae
FG
5902 }
5903 }
7c673cae
FG
5904 return 0;
5905}
5906
31f18b77
FG
5907int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
5908{
5909 RGWRados *store = target->get_store();
5910 BucketShard *bs;
5911 int r;
5912
5913#define NUM_RESHARD_RETRIES 10
5914 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
5915 int ret = get_bucket_shard(&bs);
5916 if (ret < 0) {
5917 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
5918 return ret;
5919 }
5920 r = call(bs);
5921 if (r != -ERR_BUSY_RESHARDING) {
5922 break;
5923 }
5924 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
5925 string new_bucket_id;
11fdf7f2
TL
5926 r = store->block_while_resharding(bs, &new_bucket_id,
5927 target->bucket_info, null_yield);
31f18b77
FG
5928 if (r == -ERR_BUSY_RESHARDING) {
5929 continue;
5930 }
5931 if (r < 0) {
5932 return r;
5933 }
5934 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
5935 i = 0; /* resharding is finished, make sure we can retry */
5936 r = target->update_bucket_id(new_bucket_id);
5937 if (r < 0) {
5938 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
5939 return r;
5940 }
5941 invalidate_bs();
81eedcae 5942 } // for loop
31f18b77
FG
5943
5944 if (r < 0) {
5945 return r;
5946 }
5947
5948 if (pbs) {
5949 *pbs = bs;
5950 }
5951
5952 return 0;
5953}
5954
9f95a23c 5955int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag, optional_yield y)
7c673cae
FG
5956{
5957 if (blind) {
5958 return 0;
5959 }
5960 RGWRados *store = target->get_store();
7c673cae
FG
5961
5962 if (write_tag && write_tag->length()) {
5963 optag = string(write_tag->c_str(), write_tag->length());
5964 } else {
5965 if (optag.empty()) {
5966 append_rand_alpha(store->ctx(), optag, optag, 32);
5967 }
5968 }
5969
f64942e4 5970 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
9f95a23c 5971 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, y, zones_trace);
f64942e4 5972 });
31f18b77 5973
7c673cae
FG
5974 if (r < 0) {
5975 return r;
5976 }
5977 prepared = true;
31f18b77 5978
7c673cae
FG
5979 return 0;
5980}
5981
5982int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
5983 uint64_t size, uint64_t accounted_size,
5984 ceph::real_time& ut, const string& etag,
11fdf7f2 5985 const string& content_type, const string& storage_class,
7c673cae
FG
5986 bufferlist *acl_bl,
5987 RGWObjCategory category,
11fdf7f2
TL
5988 list<rgw_obj_index_key> *remove_objs, const string *user_data,
5989 bool appendable)
7c673cae
FG
5990{
5991 if (blind) {
5992 return 0;
5993 }
5994 RGWRados *store = target->get_store();
5995 BucketShard *bs;
31f18b77 5996
7c673cae
FG
5997 int ret = get_bucket_shard(&bs);
5998 if (ret < 0) {
5999 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6000 return ret;
6001 }
6002
6003 rgw_bucket_dir_entry ent;
6004 obj.key.get_index_key(&ent.key);
6005 ent.meta.size = size;
6006 ent.meta.accounted_size = accounted_size;
6007 ent.meta.mtime = ut;
6008 ent.meta.etag = etag;
11fdf7f2 6009 ent.meta.storage_class = storage_class;
7c673cae
FG
6010 if (user_data)
6011 ent.meta.user_data = *user_data;
6012
6013 ACLOwner owner;
6014 if (acl_bl && acl_bl->length()) {
6015 int ret = store->decode_policy(*acl_bl, &owner);
6016 if (ret < 0) {
6017 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
6018 }
6019 }
6020 ent.meta.owner = owner.get_id().to_str();
6021 ent.meta.owner_display_name = owner.get_display_name();
6022 ent.meta.content_type = content_type;
11fdf7f2 6023 ent.meta.appendable = appendable;
7c673cae 6024
31f18b77 6025 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 6026
9f95a23c
TL
6027 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6028 if (r < 0) {
6029 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6030 }
6031
6032 return ret;
6033}
6034
6035int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
6036 real_time& removed_mtime,
6037 list<rgw_obj_index_key> *remove_objs)
6038{
6039 if (blind) {
6040 return 0;
6041 }
6042 RGWRados *store = target->get_store();
6043 BucketShard *bs;
31f18b77 6044
7c673cae
FG
6045 int ret = get_bucket_shard(&bs);
6046 if (ret < 0) {
6047 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6048 return ret;
6049 }
6050
31f18b77 6051 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 6052
9f95a23c
TL
6053 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6054 if (r < 0) {
6055 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6056 }
6057
6058 return ret;
6059}
6060
6061
6062int RGWRados::Bucket::UpdateIndex::cancel()
6063{
6064 if (blind) {
6065 return 0;
6066 }
6067 RGWRados *store = target->get_store();
6068 BucketShard *bs;
7c673cae 6069
f64942e4
AA
6070 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
6071 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6072 });
7c673cae
FG
6073
6074 /*
6075 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6076 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6077 * have no way to tell that they're all caught up
6078 */
9f95a23c
TL
6079 int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id);
6080 if (r < 0) {
6081 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
7c673cae
FG
6082 }
6083
6084 return ret;
6085}
6086
9f95a23c 6087int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl, optional_yield y)
7c673cae
FG
6088{
6089 RGWRados *store = source->get_store();
6090 CephContext *cct = store->ctx();
6091
7c673cae
FG
6092 rgw_raw_obj read_obj;
6093 uint64_t read_ofs = ofs;
6094 uint64_t len, read_len;
6095 bool reading_from_head = true;
6096 ObjectReadOperation op;
6097
6098 bool merge_bl = false;
6099 bufferlist *pbl = &bl;
6100 bufferlist read_bl;
6101 uint64_t max_chunk_size;
6102
6103 RGWObjState *astate;
9f95a23c 6104 int r = source->get_state(&astate, true, y);
7c673cae
FG
6105 if (r < 0)
6106 return r;
6107
11fdf7f2
TL
6108 if (astate->size == 0) {
6109 end = 0;
6110 } else if (end >= (int64_t)astate->size) {
6111 end = astate->size - 1;
6112 }
6113
7c673cae
FG
6114 if (end < 0)
6115 len = 0;
6116 else
6117 len = end - ofs + 1;
6118
9f95a23c 6119 if (astate->manifest && astate->manifest->has_tail()) {
7c673cae 6120 /* now get the relevant object part */
9f95a23c 6121 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(ofs);
7c673cae
FG
6122
6123 uint64_t stripe_ofs = iter.get_stripe_ofs();
6124 read_obj = iter.get_location().get_raw_obj(store);
11fdf7f2 6125 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6126 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6127 reading_from_head = (read_obj == state.head_obj);
6128 } else {
6129 read_obj = state.head_obj;
6130 }
6131
6132 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
6133 if (r < 0) {
6134 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
6135 return r;
6136 }
6137
6138 if (len > max_chunk_size)
6139 len = max_chunk_size;
6140
6141
7c673cae
FG
6142 read_len = len;
6143
6144 if (reading_from_head) {
6145 /* only when reading from the head object do we need to do the atomic test */
9f95a23c 6146 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate, y);
7c673cae
FG
6147 if (r < 0)
6148 return r;
6149
6150 if (astate && astate->prefetch_data) {
6151 if (!ofs && astate->data.length() >= len) {
6152 bl = astate->data;
6153 return bl.length();
6154 }
6155
6156 if (ofs < astate->data.length()) {
11fdf7f2 6157 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
9f95a23c 6158 astate->data.begin(ofs).copy(copy_len, bl);
7c673cae
FG
6159 read_len -= copy_len;
6160 read_ofs += copy_len;
6161 if (!read_len)
6162 return bl.length();
6163
6164 merge_bl = true;
6165 pbl = &read_bl;
6166 }
6167 }
6168 }
6169
6170 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
6171 op.read(read_ofs, read_len, pbl, NULL);
6172
11fdf7f2
TL
6173 if (state.cur_pool != read_obj.pool) {
6174 auto iter = state.io_ctxs.find(read_obj.pool);
6175 if (iter == state.io_ctxs.end()) {
6176 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
494da23a 6177 r = store->open_pool_ctx(read_obj.pool, *state.cur_ioctx, false);
11fdf7f2
TL
6178 if (r < 0) {
6179 ldout(cct, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
6180 return r;
6181 }
6182 } else {
6183 state.cur_ioctx = &iter->second;
7c673cae 6184 }
11fdf7f2 6185 state.cur_pool = read_obj.pool;
7c673cae
FG
6186 }
6187
11fdf7f2 6188 state.cur_ioctx->locator_set_key(read_obj.loc);
7c673cae 6189
11fdf7f2
TL
6190 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
6191 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
7c673cae 6192
7c673cae 6193 if (r < 0) {
7c673cae
FG
6194 return r;
6195 }
7c673cae 6196
11fdf7f2
TL
6197 if (merge_bl) {
6198 bl.append(read_bl);
7c673cae
FG
6199 }
6200
7c673cae
FG
6201 return bl.length();
6202}
6203
11fdf7f2
TL
6204struct get_obj_data {
6205 RGWRados* store;
6206 RGWGetDataCB* client_cb;
6207 rgw::Aio* aio;
6208 uint64_t offset; // next offset to write to client
6209 rgw::AioResultList completed; // completed read results, sorted by offset
9f95a23c 6210 optional_yield yield;
7c673cae 6211
9f95a23c
TL
6212 get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio,
6213 uint64_t offset, optional_yield yield)
6214 : store(store), client_cb(cb), aio(aio), offset(offset), yield(yield) {}
7c673cae 6215
11fdf7f2
TL
6216 int flush(rgw::AioResultList&& results) {
6217 int r = rgw::check_for_errors(results);
6218 if (r < 0) {
6219 return r;
7c673cae 6220 }
7c673cae 6221
11fdf7f2
TL
6222 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6223 results.sort(cmp); // merge() requires results to be sorted first
6224 completed.merge(results, cmp); // merge results in sorted order
7c673cae 6225
11fdf7f2
TL
6226 while (!completed.empty() && completed.front().id == offset) {
6227 auto bl = std::move(completed.front().data);
6228 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
7c673cae 6229
11fdf7f2
TL
6230 offset += bl.length();
6231 int r = client_cb->handle_data(bl, 0, bl.length());
6232 if (r < 0) {
6233 return r;
6234 }
7c673cae 6235 }
11fdf7f2 6236 return 0;
7c673cae
FG
6237 }
6238
11fdf7f2
TL
6239 void cancel() {
6240 // wait for all completions to drain and ignore the results
6241 aio->drain();
7c673cae
FG
6242 }
6243
11fdf7f2
TL
6244 int drain() {
6245 auto c = aio->wait();
6246 while (!c.empty()) {
6247 int r = flush(std::move(c));
7c673cae 6248 if (r < 0) {
11fdf7f2 6249 cancel();
7c673cae
FG
6250 return r;
6251 }
11fdf7f2 6252 c = aio->wait();
7c673cae 6253 }
11fdf7f2 6254 return flush(std::move(c));
7c673cae
FG
6255 }
6256};
6257
11fdf7f2
TL
6258static int _get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6259 off_t read_ofs, off_t len, bool is_head_obj,
6260 RGWObjState *astate, void *arg)
7c673cae
FG
6261{
6262 struct get_obj_data *d = (struct get_obj_data *)arg;
6263
11fdf7f2
TL
6264 return d->store->get_obj_iterate_cb(read_obj, obj_ofs, read_ofs, len,
6265 is_head_obj, astate, arg);
7c673cae
FG
6266}
6267
11fdf7f2
TL
6268int RGWRados::get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6269 off_t read_ofs, off_t len, bool is_head_obj,
6270 RGWObjState *astate, void *arg)
7c673cae 6271{
7c673cae
FG
6272 ObjectReadOperation op;
6273 struct get_obj_data *d = (struct get_obj_data *)arg;
6274 string oid, key;
7c673cae
FG
6275
6276 if (is_head_obj) {
6277 /* only when reading from the head object do we need to do the atomic test */
11fdf7f2 6278 int r = append_atomic_test(astate, op);
7c673cae
FG
6279 if (r < 0)
6280 return r;
6281
6282 if (astate &&
6283 obj_ofs < astate->data.length()) {
11fdf7f2 6284 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
7c673cae 6285
7c673cae 6286 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
7c673cae
FG
6287 if (r < 0)
6288 return r;
6289
7c673cae 6290 len -= chunk_len;
11fdf7f2 6291 d->offset += chunk_len;
7c673cae
FG
6292 read_ofs += chunk_len;
6293 obj_ofs += chunk_len;
6294 if (!len)
6295 return 0;
6296 }
6297 }
6298
11fdf7f2
TL
6299 auto obj = d->store->svc.rados->obj(read_obj);
6300 int r = obj.open();
7c673cae 6301 if (r < 0) {
11fdf7f2
TL
6302 ldout(cct, 4) << "failed to open rados context for " << read_obj << dendl;
6303 return r;
7c673cae
FG
6304 }
6305
11fdf7f2
TL
6306 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
6307 op.read(read_ofs, len, nullptr, nullptr);
7c673cae 6308
11fdf7f2
TL
6309 const uint64_t cost = len;
6310 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
7c673cae 6311
9f95a23c 6312 auto completed = d->aio->get(obj, rgw::Aio::librados_op(std::move(op), d->yield), cost, id);
7c673cae 6313
11fdf7f2 6314 return d->flush(std::move(completed));
7c673cae
FG
6315}
6316
9f95a23c
TL
6317int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb,
6318 optional_yield y)
7c673cae
FG
6319{
6320 RGWRados *store = source->get_store();
6321 CephContext *cct = store->ctx();
7c673cae 6322 RGWObjectCtx& obj_ctx = source->get_ctx();
11fdf7f2
TL
6323 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6324 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
7c673cae 6325
9f95a23c
TL
6326 auto aio = rgw::make_throttle(window_size, y);
6327 get_obj_data data(store, cb, &*aio, ofs, y);
7c673cae 6328
11fdf7f2 6329 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj,
9f95a23c 6330 ofs, end, chunk_size, _get_obj_iterate_cb, &data, y);
7c673cae 6331 if (r < 0) {
11fdf7f2
TL
6332 ldout(cct, 0) << "iterate_obj() failed with " << r << dendl;
6333 data.cancel(); // drain completions without writing back to client
6334 return r;
7c673cae
FG
6335 }
6336
11fdf7f2 6337 return data.drain();
7c673cae
FG
6338}
6339
6340int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
6341 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11fdf7f2 6342 off_t ofs, off_t end, uint64_t max_chunk_size,
9f95a23c 6343 iterate_obj_cb cb, void *arg, optional_yield y)
7c673cae
FG
6344{
6345 rgw_raw_obj head_obj;
6346 rgw_raw_obj read_obj;
6347 uint64_t read_ofs = ofs;
6348 uint64_t len;
6349 bool reading_from_head = true;
6350 RGWObjState *astate = NULL;
6351
6352 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6353
9f95a23c 6354 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
6355 if (r < 0) {
6356 return r;
6357 }
6358
6359 if (end < 0)
6360 len = 0;
6361 else
6362 len = end - ofs + 1;
6363
9f95a23c 6364 if (astate->manifest) {
7c673cae 6365 /* now get the relevant object stripe */
9f95a23c 6366 RGWObjManifest::obj_iterator iter = astate->manifest->obj_find(ofs);
7c673cae 6367
9f95a23c 6368 RGWObjManifest::obj_iterator obj_end = astate->manifest->obj_end();
7c673cae
FG
6369
6370 for (; iter != obj_end && ofs <= end; ++iter) {
6371 off_t stripe_ofs = iter.get_stripe_ofs();
6372 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6373
6374 while (ofs < next_stripe_ofs && ofs <= end) {
6375 read_obj = iter.get_location().get_raw_obj(this);
11fdf7f2 6376 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6377 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6378
6379 if (read_len > max_chunk_size) {
6380 read_len = max_chunk_size;
6381 }
6382
6383 reading_from_head = (read_obj == head_obj);
11fdf7f2 6384 r = cb(read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6385 if (r < 0) {
6386 return r;
6387 }
6388
6389 len -= read_len;
6390 ofs += read_len;
6391 }
6392 }
6393 } else {
6394 while (ofs <= end) {
6395 read_obj = head_obj;
11fdf7f2 6396 uint64_t read_len = std::min(len, max_chunk_size);
7c673cae 6397
11fdf7f2 6398 r = cb(read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6399 if (r < 0) {
6400 return r;
6401 }
6402
6403 len -= read_len;
6404 ofs += read_len;
6405 }
6406 }
6407
6408 return 0;
6409}
6410
6411int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
6412{
6413 rgw_rados_ref ref;
6414 int r = get_obj_head_ref(bucket_info, obj, &ref);
6415 if (r < 0) {
6416 return r;
6417 }
6418
9f95a23c 6419 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, op, null_yield);
7c673cae
FG
6420}
6421
6422int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
6423{
6424 rgw_rados_ref ref;
6425 int r = get_obj_head_ref(bucket_info, obj, &ref);
6426 if (r < 0) {
6427 return r;
6428 }
6429
6430 bufferlist outbl;
6431
9f95a23c 6432 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, op, &outbl, null_yield);
7c673cae
FG
6433}
6434
6435int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
6436{
6437 ObjectWriteOperation op;
6438
11fdf7f2 6439 ceph_assert(olh_obj.key.instance.empty());
7c673cae
FG
6440
6441 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6442
6443 if (!state.exists) {
6444 op.create(true);
6445 } else {
6446 op.assert_exists();
b32b8144
FG
6447 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6448 op.mtime2(&mtime_ts);
7c673cae
FG
6449 }
6450
6451 /*
6452 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6453 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6454 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6455 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6456 * log will reflect that.
6457 *
6458 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6459 * is used for object data instance, olh_tag for olh instance.
6460 */
6461 if (has_tag) {
6462 /* guard against racing writes */
6463 bucket_index_guard_olh_op(state, op);
6464 }
6465
6466 if (!has_tag) {
6467 /* obj tag */
9f95a23c 6468 string obj_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6469
7c673cae
FG
6470 bufferlist bl;
6471 bl.append(obj_tag.c_str(), obj_tag.size());
6472 op.setxattr(RGW_ATTR_ID_TAG, bl);
6473
6474 state.attrset[RGW_ATTR_ID_TAG] = bl;
6475 state.obj_tag = bl;
6476
6477 /* olh tag */
9f95a23c 6478 string olh_tag = gen_rand_alphanumeric_lower(cct, 32);
11fdf7f2 6479
7c673cae
FG
6480 bufferlist olh_bl;
6481 olh_bl.append(olh_tag.c_str(), olh_tag.size());
6482 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
6483
6484 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
6485 state.olh_tag = olh_bl;
6486 state.is_olh = true;
6487
6488 bufferlist verbl;
6489 op.setxattr(RGW_ATTR_OLH_VER, verbl);
6490 }
6491
6492 bufferlist bl;
6493 RGWOLHPendingInfo pending_info;
6494 pending_info.time = real_clock::now();
11fdf7f2 6495 encode(pending_info, bl);
7c673cae
FG
6496
6497#define OLH_PENDING_TAG_LEN 32
6498 /* tag will start with current time epoch, this so that entries are sorted by time */
6499 char buf[32];
6500 utime_t ut(pending_info.time);
6501 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
6502 *op_tag = buf;
6503
9f95a23c 6504 string s = gen_rand_alphanumeric_lower(cct, OLH_PENDING_TAG_LEN - op_tag->size());
11fdf7f2 6505
7c673cae
FG
6506 op_tag->append(s);
6507
6508 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
6509 attr_name.append(*op_tag);
6510
6511 op.setxattr(attr_name.c_str(), bl);
6512
11fdf7f2 6513 int ret = obj_operate(bucket_info, olh_obj, &op);
7c673cae
FG
6514 if (ret < 0) {
6515 return ret;
6516 }
6517
6518 state.exists = true;
6519 state.attrset[attr_name] = bl;
6520
6521 return 0;
6522}
6523
6524int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
6525{
6526 int ret;
6527
6528 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
6529 if (ret == -EEXIST) {
6530 ret = -ECANCELED;
6531 }
6532
6533 return ret;
6534}
6535
f64942e4
AA
6536int RGWRados::guard_reshard(BucketShard *bs,
6537 const rgw_obj& obj_instance,
6538 const RGWBucketInfo& bucket_info,
6539 std::function<int(BucketShard *)> call)
31f18b77
FG
6540{
6541 rgw_obj obj;
6542 const rgw_obj *pobj = &obj_instance;
6543 int r;
6544
6545 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
f64942e4 6546 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
31f18b77
FG
6547 if (r < 0) {
6548 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
6549 return r;
6550 }
6551 r = call(bs);
6552 if (r != -ERR_BUSY_RESHARDING) {
6553 break;
6554 }
6555 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6556 string new_bucket_id;
11fdf7f2 6557 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield);
31f18b77
FG
6558 if (r == -ERR_BUSY_RESHARDING) {
6559 continue;
6560 }
6561 if (r < 0) {
6562 return r;
6563 }
6564 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6565 i = 0; /* resharding is finished, make sure we can retry */
6566
6567 obj = *pobj;
6568 obj.bucket.update_bucket_id(new_bucket_id);
6569 pobj = &obj;
81eedcae 6570 } // for loop
31f18b77
FG
6571
6572 if (r < 0) {
6573 return r;
6574 }
6575
6576 return 0;
6577}
6578
f64942e4
AA
6579int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
6580 string *new_bucket_id,
11fdf7f2
TL
6581 const RGWBucketInfo& bucket_info,
6582 optional_yield y)
31f18b77 6583{
11fdf7f2
TL
6584 int ret = 0;
6585 cls_rgw_bucket_instance_entry entry;
6586
81eedcae
TL
6587 // since we want to run this recovery code from two distinct places,
6588 // let's just put it in a lambda so we can easily re-use; if the
6589 // lambda successfully fetches a new bucket id, it sets
6590 // new_bucket_id and returns 0, otherwise it returns a negative
6591 // error code
6592 auto fetch_new_bucket_id =
9f95a23c
TL
6593 [this, &bucket_info](const std::string& log_tag,
6594 std::string* new_bucket_id) -> int {
81eedcae
TL
6595 RGWBucketInfo fresh_bucket_info = bucket_info;
6596 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr);
6597 if (ret < 0) {
6598 ldout(cct, 0) << __func__ <<
6599 " ERROR: failed to refresh bucket info after reshard at " <<
6600 log_tag << ": " << cpp_strerror(-ret) << dendl;
6601 return ret;
6602 }
6603 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
6604 return 0;
6605 };
6606
6607 constexpr int num_retries = 10;
6608 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
9f95a23c
TL
6609 auto& ref = bs->bucket_obj.get_ref();
6610 ret = cls_rgw_get_bucket_resharding(ref.pool.ioctx(), ref.obj.oid, &entry);
81eedcae
TL
6611 if (ret == -ENOENT) {
6612 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
6613 } else if (ret < 0) {
6614 ldout(cct, 0) << __func__ <<
6615 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
6616 dendl;
11fdf7f2
TL
6617 return ret;
6618 }
81eedcae 6619
11fdf7f2 6620 if (!entry.resharding_in_progress()) {
81eedcae
TL
6621 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
6622 new_bucket_id);
11fdf7f2 6623 }
31f18b77 6624
81eedcae
TL
6625 ldout(cct, 20) << "NOTICE: reshard still in progress; " <<
6626 (i < num_retries ? "retrying" : "too many retries") << dendl;
6627
6628 if (i == num_retries) {
11fdf7f2
TL
6629 break;
6630 }
6631
6632 // If bucket is erroneously marked as resharding (e.g., crash or
6633 // other error) then fix it. If we can take the bucket reshard
6634 // lock then it means no other resharding should be taking place,
6635 // and we're free to clear the flags.
6636 {
6637 // since we expect to do this rarely, we'll do our work in a
6638 // block and erase our work after each try
6639
9f95a23c 6640 RGWObjectCtx obj_ctx(this->store);
11fdf7f2
TL
6641 const rgw_bucket& b = bs->bucket;
6642 std::string bucket_id = b.get_key();
9f95a23c 6643 RGWBucketReshardLock reshard_lock(this->store, bucket_info, true);
11fdf7f2
TL
6644 ret = reshard_lock.lock();
6645 if (ret < 0) {
6646 ldout(cct, 20) << __func__ <<
6647 " INFO: failed to take reshard lock for bucket " <<
6648 bucket_id << "; expected if resharding underway" << dendl;
6649 } else {
6650 ldout(cct, 10) << __func__ <<
6651 " INFO: was able to take reshard lock for bucket " <<
6652 bucket_id << dendl;
9f95a23c 6653 ret = RGWBucketReshard::clear_resharding(this->store, bucket_info);
11fdf7f2
TL
6654 if (ret < 0) {
6655 reshard_lock.unlock();
6656 ldout(cct, 0) << __func__ <<
6657 " ERROR: failed to clear resharding flags for bucket " <<
6658 bucket_id << dendl;
6659 } else {
6660 reshard_lock.unlock();
6661 ldout(cct, 5) << __func__ <<
6662 " INFO: apparently successfully cleared resharding flags for "
6663 "bucket " << bucket_id << dendl;
6664 continue; // if we apparently succeed immediately test again
6665 } // if clear resharding succeeded
6666 } // if taking of lock succeeded
6667 } // block to encapsulate recovery from incomplete reshard
6668
6669 ret = reshard_wait->wait(y);
6670 if (ret < 0) {
81eedcae
TL
6671 ldout(cct, 0) << __func__ <<
6672 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2
TL
6673 return ret;
6674 }
81eedcae
TL
6675 } // for loop
6676
6677 ldout(cct, 0) << __func__ <<
6678 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2 6679 return -ERR_BUSY_RESHARDING;
31f18b77
FG
6680}
6681
7c673cae
FG
6682int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
6683 bool delete_marker,
6684 const string& op_tag,
6685 struct rgw_bucket_dir_entry_meta *meta,
6686 uint64_t olh_epoch,
91327a77
AA
6687 real_time unmod_since, bool high_precision_time,
6688 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
6689{
6690 rgw_rados_ref ref;
6691 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6692 if (r < 0) {
6693 return r;
6694 }
6695
31f18b77
FG
6696 rgw_zone_set zones_trace;
6697 if (_zones_trace) {
6698 zones_trace = *_zones_trace;
7c673cae 6699 }
9f95a23c 6700 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
7c673cae 6701
31f18b77
FG
6702 BucketShard bs(this);
6703
f64942e4
AA
6704 r = guard_reshard(&bs, obj_instance, bucket_info,
6705 [&](BucketShard *bs) -> int {
9f95a23c
TL
6706 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
6707 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6708 librados::ObjectWriteOperation op;
6709 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6710 cls_rgw_bucket_link_olh(op, key, olh_state.olh_tag,
6711 delete_marker, op_tag, meta, olh_epoch,
6712 unmod_since, high_precision_time,
6713 svc.zone->get_zone().log_data, zones_trace);
6714 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
6715 });
6716 if (r < 0) {
9f95a23c 6717 ldout(cct, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_olh() returned r=" << r << dendl;
31f18b77 6718 return r;
7c673cae
FG
6719 }
6720
9f95a23c
TL
6721 r = svc.datalog_rados->add_entry(bucket_info, bs.shard_id);
6722 if (r < 0) {
6723 ldout(cct, 0) << "ERROR: failed writing data log" << dendl;
91327a77
AA
6724 }
6725
7c673cae
FG
6726 return 0;
6727}
6728
6729void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
6730{
6731 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
6732 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
6733}
6734
6735int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 6736 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
6737{
6738 rgw_rados_ref ref;
6739 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6740 if (r < 0) {
6741 return r;
6742 }
6743
31f18b77
FG
6744 rgw_zone_set zones_trace;
6745 if (_zones_trace) {
6746 zones_trace = *_zones_trace;
7c673cae 6747 }
9f95a23c 6748 zones_trace.insert(svc.zone->get_zone().id, bucket_info.bucket.get_key());
31f18b77
FG
6749
6750 BucketShard bs(this);
7c673cae
FG
6751
6752 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
6753 r = guard_reshard(&bs, obj_instance, bucket_info,
6754 [&](BucketShard *bs) -> int {
9f95a23c 6755 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6756 librados::ObjectWriteOperation op;
6757 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6758 cls_rgw_bucket_unlink_instance(op, key, op_tag,
6759 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
6760 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77
FG
6761 });
6762 if (r < 0) {
9f95a23c 6763 ldout(cct, 20) << "rgw_rados_operate() after cls_rgw_bucket_link_instance() returned r=" << r << dendl;
31f18b77 6764 return r;
7c673cae
FG
6765 }
6766
6767 return 0;
6768}
6769
6770int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
6771 const rgw_obj& obj_instance, uint64_t ver_marker,
6772 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
6773 bool *is_truncated)
6774{
6775 rgw_rados_ref ref;
6776 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6777 if (r < 0) {
6778 return r;
6779 }
6780
6781 BucketShard bs(this);
f64942e4
AA
6782 int ret =
6783 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
6784 if (ret < 0) {
6785 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
6786 return ret;
6787 }
6788
6789 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6790
6791 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6792
f64942e4
AA
6793 ret = guard_reshard(&bs, obj_instance, bucket_info,
6794 [&](BucketShard *bs) -> int {
9f95a23c 6795 auto& ref = bs->bucket_obj.get_ref();
f64942e4
AA
6796 ObjectReadOperation op;
6797 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6798
6799 rgw_cls_read_olh_log_ret log_ret;
6800 int op_ret = 0;
6801 cls_rgw_get_olh_log(op, key, ver_marker, olh_tag, log_ret, op_ret);
6802 bufferlist outbl;
6803 int r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
6804 if (r < 0) {
6805 return r;
6806 }
6807 if (op_ret < 0) {
6808 return op_ret;
6809 }
6810
6811 *log = std::move(log_ret.log);
6812 *is_truncated = log_ret.is_truncated;
6813 return r;
f64942e4 6814 });
31f18b77
FG
6815 if (ret < 0) {
6816 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 6817 return ret;
31f18b77 6818 }
7c673cae
FG
6819
6820 return 0;
6821}
6822
a8e16298
TL
6823// a multisite sync bug resulted in the OLH head attributes being overwritten by
6824// the attributes from another zone, causing link_olh() to fail endlessly due to
6825// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
6826// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
6827int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info,
6828 const rgw_obj& obj)
6829{
6830 // fetch the current olh entry from the bucket index
6831 rgw_bucket_olh_entry olh;
6832 int r = bi_get_olh(bucket_info, obj, &olh);
6833 if (r < 0) {
6834 ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
6835 return r;
6836 }
11fdf7f2 6837 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
a8e16298
TL
6838 return 0;
6839 }
6840
6841 ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag
6842 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
6843
6844 // rewrite OLH_ID_TAG and OLH_INFO from current olh
6845 ObjectWriteOperation op;
6846 // assert this is the same olh tag we think we're fixing
6847 bucket_index_guard_olh_op(*state, op);
6848 // preserve existing mtime
6849 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
6850 op.mtime2(&mtime_ts);
6851 {
6852 bufferlist bl;
6853 bl.append(olh.tag.c_str(), olh.tag.size());
6854 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
6855 }
6856 {
6857 RGWOLHInfo info;
6858 info.target = rgw_obj(bucket_info.bucket, olh.key);
6859 info.removed = olh.delete_marker;
6860 bufferlist bl;
6861 encode(info, bl);
6862 op.setxattr(RGW_ATTR_OLH_INFO, bl);
6863 }
6864 rgw_rados_ref ref;
6865 r = get_obj_head_ref(bucket_info, obj, &ref);
6866 if (r < 0) {
6867 return r;
6868 }
9f95a23c 6869 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
a8e16298
TL
6870 if (r < 0) {
6871 ldout(cct, 0) << "repair_olh failed to write olh attributes with "
6872 << cpp_strerror(r) << dendl;
6873 return r;
6874 }
6875 return 0;
6876}
6877
7c673cae
FG
6878int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
6879{
6880 rgw_rados_ref ref;
6881 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6882 if (r < 0) {
6883 return r;
6884 }
6885
6886 BucketShard bs(this);
f64942e4
AA
6887 int ret =
6888 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
6889 if (ret < 0) {
6890 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
6891 return ret;
6892 }
6893
6894 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6895
6896 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6897
f64942e4
AA
6898 ret = guard_reshard(&bs, obj_instance, bucket_info,
6899 [&](BucketShard *pbs) -> int {
6900 ObjectWriteOperation op;
6901 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
6902 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
9f95a23c 6903 return pbs->bucket_obj.operate(&op, null_yield);
31f18b77
FG
6904 });
6905 if (ret < 0) {
6906 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 6907 return ret;
31f18b77 6908 }
7c673cae
FG
6909
6910 return 0;
6911}
6912
6913int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
6914{
6915 rgw_rados_ref ref;
6916 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
6917 if (r < 0) {
6918 return r;
6919 }
6920
6921 BucketShard bs(this);
7c673cae
FG
6922
6923 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
6924
6925 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
6926
f64942e4
AA
6927 int ret = guard_reshard(&bs, obj_instance, bucket_info,
6928 [&](BucketShard *pbs) -> int {
6929 ObjectWriteOperation op;
9f95a23c 6930 auto& ref = pbs->bucket_obj.get_ref();
f64942e4 6931 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
9f95a23c
TL
6932 cls_rgw_clear_olh(op, key, olh_tag);
6933 return rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
31f18b77 6934 });
7c673cae 6935 if (ret < 0) {
9f95a23c 6936 ldout(cct, 5) << "rgw_rados_operate() after cls_rgw_clear_olh() returned ret=" << ret << dendl;
7c673cae
FG
6937 return ret;
6938 }
6939
6940 return 0;
6941}
6942
92f5a8d4
TL
6943static int decode_olh_info(CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
6944{
6945 try {
6946 auto biter = bl.cbegin();
6947 decode(*olh, biter);
6948 return 0;
6949 } catch (buffer::error& err) {
6950 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
6951 return -EIO;
6952 }
6953}
6954
7c673cae
FG
6955int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6956 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
31f18b77 6957 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7c673cae
FG
6958{
6959 if (log.empty()) {
6960 return 0;
6961 }
6962
6963 librados::ObjectWriteOperation op;
6964
6965 uint64_t last_ver = log.rbegin()->first;
6966 *plast_ver = last_ver;
6967
6968 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
6969
6970 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
92f5a8d4 6971 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
7c673cae 6972
a8e16298
TL
6973 bufferlist ver_bl;
6974 string last_ver_s = to_string(last_ver);
6975 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
6976 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
6977
b32b8144
FG
6978 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6979 op.mtime2(&mtime_ts);
6980
7c673cae 6981 bool need_to_link = false;
92f5a8d4 6982 uint64_t link_epoch = 0;
7c673cae
FG
6983 cls_rgw_obj_key key;
6984 bool delete_marker = false;
6985 list<cls_rgw_obj_key> remove_instances;
6986 bool need_to_remove = false;
6987
92f5a8d4
TL
6988 // decode current epoch and instance
6989 auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
6990 if (olh_ver != state.attrset.end()) {
6991 std::string str = olh_ver->second.to_str();
6992 std::string err;
6993 link_epoch = strict_strtoll(str.c_str(), 10, &err);
6994 }
6995 auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
6996 if (olh_info != state.attrset.end()) {
6997 RGWOLHInfo info;
6998 int r = decode_olh_info(cct, olh_info->second, &info);
6999 if (r < 0) {
7000 return r;
7001 }
7002 info.target.key.get_index_key(&key);
7003 delete_marker = info.removed;
7004 }
7005
7c673cae
FG
7006 for (iter = log.begin(); iter != log.end(); ++iter) {
7007 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7008 for (; viter != iter->second.end(); ++viter) {
7009 rgw_bucket_olh_log_entry& entry = *viter;
7010
92f5a8d4 7011 ldout(cct, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
7c673cae
FG
7012 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7013 << (entry.delete_marker ? "(delete)" : "") << dendl;
7014 switch (entry.op) {
7015 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7016 remove_instances.push_back(entry.key);
7017 break;
7018 case CLS_RGW_OLH_OP_LINK_OLH:
92f5a8d4
TL
7019 // only overwrite a link of the same epoch if its key sorts before
7020 if (link_epoch < iter->first || key.instance.empty() ||
7021 key.instance > entry.key.instance) {
7022 ldout(cct, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7023 << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7024 need_to_link = true;
7025 need_to_remove = false;
7026 key = entry.key;
7027 delete_marker = entry.delete_marker;
7028 } else {
7029 ldout(cct, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7030 << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7031 }
7c673cae
FG
7032 break;
7033 case CLS_RGW_OLH_OP_UNLINK_OLH:
7034 need_to_remove = true;
7035 need_to_link = false;
7036 break;
7037 default:
7038 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7039 return -EIO;
7040 }
7041 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7042 attr_name.append(entry.op_tag);
7043 op.rmxattr(attr_name.c_str());
7044 }
7045 }
7046
7047 rgw_rados_ref ref;
7048 int r = get_obj_head_ref(bucket_info, obj, &ref);
7049 if (r < 0) {
7050 return r;
7051 }
7052
7053 const rgw_bucket& bucket = obj.bucket;
7054
7055 if (need_to_link) {
7056 rgw_obj target(bucket, key);
7057 RGWOLHInfo info;
7058 info.target = target;
7059 info.removed = delete_marker;
7060 bufferlist bl;
11fdf7f2 7061 encode(info, bl);
7c673cae
FG
7062 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7063 }
7064
7065 /* first remove object instances */
7066 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7067 liter != remove_instances.end(); ++liter) {
7068 cls_rgw_obj_key& key = *liter;
7069 rgw_obj obj_instance(bucket, key);
31f18b77 7070 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae
FG
7071 if (ret < 0 && ret != -ENOENT) {
7072 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7073 return ret;
7074 }
7075 }
7076
7077 /* update olh object */
9f95a23c 7078 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
7079 if (r == -ECANCELED) {
7080 r = 0;
7081 }
7082 if (r < 0) {
7083 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7084 return r;
7085 }
7086
7087 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
7088 if (r < 0) {
7089 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7090 return r;
7091 }
7092
7093 if (need_to_remove) {
7094 ObjectWriteOperation rm_op;
7095
7096 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
a8e16298 7097 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7c673cae
FG
7098 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7099 rm_op.remove();
7100
9f95a23c 7101 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &rm_op, null_yield);
7c673cae
FG
7102 if (r == -ECANCELED) {
7103 return 0; /* someone else won this race */
7104 } else {
7105 /*
7106 * only clear if was successful, otherwise we might clobber pending operations on this object
7107 */
7108 r = bucket_index_clear_olh(bucket_info, state, obj);
7109 if (r < 0) {
7110 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7111 return r;
7112 }
7113 }
7114 }
7115
7116 return 0;
7117}
7118
7119/*
7120 * read olh log and apply it
7121 */
31f18b77 7122int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
7123{
7124 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7125 bool is_truncated;
7126 uint64_t ver_marker = 0;
7127
7128 do {
7129 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7130 if (ret < 0) {
7131 return ret;
7132 }
31f18b77 7133 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
7134 if (ret < 0) {
7135 return ret;
7136 }
7137 } while (is_truncated);
7138
7139 return 0;
7140}
7141
9f95a23c 7142int RGWRados::set_olh(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77 7143 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
9f95a23c 7144 optional_yield y, rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
7145{
7146 string op_tag;
7147
7148 rgw_obj olh_obj = target_obj;
7149 olh_obj.key.instance.clear();
7150
7151 RGWObjState *state = NULL;
7152
7153 int ret = 0;
7154 int i;
31f18b77 7155
7c673cae
FG
7156#define MAX_ECANCELED_RETRY 100
7157 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7158 if (ret == -ECANCELED) {
11fdf7f2 7159 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7160 }
7161
9f95a23c 7162 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7163 if (ret < 0) {
7164 return ret;
7165 }
7166
7167 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7168 if (ret < 0) {
7169 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7170 if (ret == -ECANCELED) {
7171 continue;
7172 }
7173 return ret;
7174 }
91327a77
AA
7175 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
7176 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7177 zones_trace, log_data_change);
7c673cae
FG
7178 if (ret < 0) {
7179 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7180 if (ret == -ECANCELED) {
a8e16298
TL
7181 // the bucket index rejected the link_olh() due to olh tag mismatch;
7182 // attempt to reconstruct olh head attributes based on the bucket index
7183 int r2 = repair_olh(state, bucket_info, olh_obj);
7184 if (r2 < 0 && r2 != -ECANCELED) {
7185 return r2;
7186 }
7c673cae
FG
7187 continue;
7188 }
7189 return ret;
7190 }
7191 break;
7192 }
7193
7194 if (i == MAX_ECANCELED_RETRY) {
7195 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7196 return -EIO;
7197 }
7198
7199 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7200 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7201 ret = 0;
7202 }
7203 if (ret < 0) {
7204 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7205 return ret;
7206 }
7207
7208 return 0;
7209}
7210
7211int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
9f95a23c 7212 uint64_t olh_epoch, optional_yield y, rgw_zone_set *zones_trace)
7c673cae
FG
7213{
7214 string op_tag;
7215
7216 rgw_obj olh_obj = target_obj;
7217 olh_obj.key.instance.clear();
7218
7219 RGWObjState *state = NULL;
7220
7221 int ret = 0;
7222 int i;
7223
7224 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7225 if (ret == -ECANCELED) {
11fdf7f2 7226 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7227 }
7228
9f95a23c 7229 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false, y); /* don't follow olh */
7c673cae
FG
7230 if (ret < 0)
7231 return ret;
7232
7233 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7234 if (ret < 0) {
7235 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7236 if (ret == -ECANCELED) {
7237 continue;
7238 }
7239 return ret;
7240 }
7241
7242 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7243
31f18b77 7244 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae
FG
7245 if (ret < 0) {
7246 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7247 if (ret == -ECANCELED) {
7248 continue;
7249 }
7250 return ret;
7251 }
7252 break;
7253 }
7254
7255 if (i == MAX_ECANCELED_RETRY) {
7256 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7257 return -EIO;
7258 }
7259
31f18b77 7260 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
7261 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7262 return 0;
7263 }
7264 if (ret < 0) {
7265 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7266 return ret;
7267 }
7268
7269 return 0;
7270}
7271
11fdf7f2 7272void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7c673cae
FG
7273{
7274#define OBJ_INSTANCE_LEN 32
7275 char buf[OBJ_INSTANCE_LEN + 1];
7276
7277 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7278 no underscore for instance name due to the way we encode the raw keys */
7279
11fdf7f2 7280 target_key->set_instance(buf);
7c673cae
FG
7281}
7282
11fdf7f2 7283void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7c673cae 7284{
11fdf7f2 7285 gen_rand_obj_instance_name(&target_obj->key);
7c673cae
FG
7286}
7287
7288int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7289{
92f5a8d4 7290 map<string, bufferlist> attrset;
7c673cae
FG
7291
7292 ObjectReadOperation op;
92f5a8d4 7293 op.getxattrs(&attrset, NULL);
7c673cae 7294
7c673cae 7295 int r = obj_operate(bucket_info, obj, &op);
7c673cae
FG
7296 if (r < 0) {
7297 return r;
7298 }
7c673cae 7299
92f5a8d4 7300 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7c673cae
FG
7301 if (iter == attrset.end()) { /* not an olh */
7302 return -EINVAL;
7303 }
7304
92f5a8d4 7305 return decode_olh_info(cct, iter->second, olh);
7c673cae
FG
7306}
7307
7308void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
7309 map<string, bufferlist> *rm_pending_entries)
7310{
7311 map<string, bufferlist>::iterator iter = pending_entries.begin();
7312
7313 real_time now = real_clock::now();
7314
7315 while (iter != pending_entries.end()) {
11fdf7f2 7316 auto biter = iter->second.cbegin();
7c673cae
FG
7317 RGWOLHPendingInfo pending_info;
7318 try {
11fdf7f2 7319 decode(pending_info, biter);
7c673cae
FG
7320 } catch (buffer::error& err) {
7321 /* skipping bad entry, we could remove it but it might hide a bug */
7322 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7323 ++iter;
7324 continue;
7325 }
7326
7327 map<string, bufferlist>::iterator cur_iter = iter;
7328 ++iter;
7329 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7330 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7331 pending_entries.erase(cur_iter);
7332 } else {
7333 /* entries names are sorted by time (rounded to a second) */
7334 break;
7335 }
7336 }
7337}
7338
7339int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7340{
7c673cae
FG
7341 rgw_rados_ref ref;
7342 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
7343 if (r < 0) {
7344 return r;
7345 }
7346
81eedcae
TL
7347 // trim no more than 1000 entries per osd op
7348 constexpr int max_entries = 1000;
7c673cae 7349
81eedcae
TL
7350 auto i = pending_attrs.begin();
7351 while (i != pending_attrs.end()) {
7352 ObjectWriteOperation op;
7353 bucket_index_guard_olh_op(state, op);
7354
7355 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7356 op.rmxattr(i->first.c_str());
7357 }
7358
9f95a23c 7359 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
81eedcae
TL
7360 if (r == -ENOENT || r == -ECANCELED) {
7361 /* raced with some other change, shouldn't sweat about it */
7362 return 0;
7363 }
7364 if (r < 0) {
7365 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7366 return r;
7367 }
7368 }
7c673cae
FG
7369 return 0;
7370}
7371
7372int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7373{
7374 map<string, bufferlist> pending_entries;
11fdf7f2 7375 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7c673cae
FG
7376
7377 map<string, bufferlist> rm_pending_entries;
7378 check_pending_olh_entries(pending_entries, &rm_pending_entries);
7379
7380 if (!rm_pending_entries.empty()) {
7381 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
7382 if (ret < 0) {
7383 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7384 return ret;
7385 }
7386 }
7387 if (!pending_entries.empty()) {
7388 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7389
7390 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7391 if (ret < 0) {
7392 return ret;
7393 }
7394 }
7395
92f5a8d4
TL
7396 auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7397 if (iter == state->attrset.end()) {
7398 return -EINVAL;
7399 }
7400
7c673cae 7401 RGWOLHInfo olh;
92f5a8d4
TL
7402 int ret = decode_olh_info(cct, iter->second, &olh);
7403 if (ret < 0) {
7404 return ret;
7c673cae
FG
7405 }
7406
7407 if (olh.removed) {
7408 return -ENOENT;
7409 }
7410
7411 *target = olh.target;
7412
7413 return 0;
7414}
7415
7416int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7417 map<string, bufferlist> *attrs, bufferlist *first_chunk,
9f95a23c 7418 RGWObjVersionTracker *objv_tracker, optional_yield y)
7c673cae
FG
7419{
7420 rgw_rados_ref ref;
7421 int r = get_raw_obj_ref(obj, &ref);
7422 if (r < 0) {
7423 return r;
7424 }
7425
7426 map<string, bufferlist> unfiltered_attrset;
7427 uint64_t size = 0;
7428 struct timespec mtime_ts;
7429
7430 ObjectReadOperation op;
7431 if (objv_tracker) {
7432 objv_tracker->prepare_op_for_read(&op);
7433 }
7434 if (attrs) {
7435 op.getxattrs(&unfiltered_attrset, NULL);
7436 }
7437 if (psize || pmtime) {
7438 op.stat2(&size, &mtime_ts, NULL);
7439 }
7440 if (first_chunk) {
7441 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7442 }
7443 bufferlist outbl;
9f95a23c 7444 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, &outbl, null_yield);
7c673cae
FG
7445
7446 if (epoch) {
9f95a23c 7447 *epoch = ref.pool.ioctx().get_last_version();
7c673cae
FG
7448 }
7449
7450 if (r < 0)
7451 return r;
7452
7453 if (psize)
7454 *psize = size;
7455 if (pmtime)
7456 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7457 if (attrs) {
11fdf7f2 7458 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7c673cae
FG
7459 }
7460
7461 return 0;
7462}
7463
7464int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 7465 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae 7466{
a8e16298 7467 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7468 map<int, string> bucket_instance_ids;
7469 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7470 if (r < 0) {
7471 return r;
7472 }
7473
11fdf7f2 7474 ceph_assert(headers.size() == bucket_instance_ids.size());
7c673cae 7475
a8e16298 7476 auto iter = headers.begin();
7c673cae
FG
7477 map<int, string>::iterator viter = bucket_instance_ids.begin();
7478 BucketIndexShardsManager ver_mgr;
7479 BucketIndexShardsManager master_ver_mgr;
7480 BucketIndexShardsManager marker_mgr;
7c673cae
FG
7481 char buf[64];
7482 for(; iter != headers.end(); ++iter, ++viter) {
a8e16298
TL
7483 accumulate_raw_stats(*iter, stats);
7484 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7c673cae 7485 ver_mgr.add(viter->first, string(buf));
a8e16298 7486 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7c673cae
FG
7487 master_ver_mgr.add(viter->first, string(buf));
7488 if (shard_id >= 0) {
a8e16298 7489 *max_marker = iter->max_marker;
7c673cae 7490 } else {
a8e16298 7491 marker_mgr.add(viter->first, iter->max_marker);
7c673cae 7492 }
c07f9fc5 7493 if (syncstopped != NULL)
a8e16298 7494 *syncstopped = iter->syncstopped;
7c673cae
FG
7495 }
7496 ver_mgr.to_string(bucket_ver);
7497 master_ver_mgr.to_string(master_ver);
7498 if (shard_id < 0) {
7499 marker_mgr.to_string(max_marker);
7500 }
7501 return 0;
7502}
7503
7c673cae
FG
7504class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
7505 RGWGetBucketStats_CB *cb;
7506 uint32_t pendings;
7507 map<RGWObjCategory, RGWStorageStats> stats;
7508 int ret_code;
7509 bool should_cb;
9f95a23c 7510 ceph::mutex lock = ceph::make_mutex("RGWGetBucketStatsContext");
7c673cae
FG
7511
7512public:
7513 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
9f95a23c
TL
7514 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true)
7515 {}
7c673cae
FG
7516
7517 void handle_response(int r, rgw_bucket_dir_header& header) override {
9f95a23c 7518 std::lock_guard l{lock};
7c673cae
FG
7519 if (should_cb) {
7520 if ( r >= 0) {
7521 accumulate_raw_stats(header, stats);
7522 } else {
7523 ret_code = r;
7524 }
7525
7526 // Are we all done?
7527 if (--pendings == 0) {
7528 if (!ret_code) {
7529 cb->set_response(&stats);
7530 }
7531 cb->handle_response(ret_code);
7532 cb->put();
7533 }
7534 }
7535 }
7536
7537 void unset_cb() {
9f95a23c 7538 std::lock_guard l{lock};
7c673cae
FG
7539 should_cb = false;
7540 }
7541};
7542
7543int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
7544{
7545 int num_aio = 0;
c07f9fc5 7546 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11fdf7f2 7547 ceph_assert(get_ctx);
7c673cae 7548 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
7549 if (r < 0) {
7550 ctx->put();
7551 if (num_aio) {
7552 get_ctx->unset_cb();
7553 }
7554 }
c07f9fc5 7555 get_ctx->put();
7c673cae
FG
7556 return r;
7557}
7558
e306af50
TL
7559int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx,
7560 const string& meta_key,
7561 RGWBucketInfo& info,
7562 real_time *pmtime,
7563 map<string, bufferlist> *pattrs,
7564 optional_yield y)
9f95a23c
TL
7565{
7566 rgw_bucket bucket;
7567 rgw_bucket_parse_bucket_key(cct, meta_key, &bucket, nullptr);
7c673cae 7568
9f95a23c
TL
7569 return get_bucket_instance_info(obj_ctx, bucket, info, pmtime, pattrs, y);
7570}
7c673cae 7571
11fdf7f2 7572int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
9f95a23c 7573 real_time *pmtime, map<string, bufferlist> *pattrs, optional_yield y)
7c673cae 7574{
9f95a23c
TL
7575 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7576 return ctl.bucket->read_bucket_instance_info(bucket, &info,
7577 y,
7578 RGWBucketCtl::BucketInstance::GetParams()
7579 .set_mtime(pmtime)
7580 .set_attrs(pattrs)
7581 .set_bectx_params(bectx_params));
7c673cae
FG
7582}
7583
9f95a23c 7584int RGWRados::get_bucket_info(RGWServices *svc,
b32b8144
FG
7585 const string& tenant, const string& bucket_name,
7586 RGWBucketInfo& info,
9f95a23c
TL
7587 real_time *pmtime,
7588 optional_yield y, map<string, bufferlist> *pattrs)
b32b8144 7589{
9f95a23c
TL
7590 auto obj_ctx = svc->sysobj->init_obj_ctx();
7591 RGWSI_MetaBackend_CtxParams bectx_params = RGWSI_MetaBackend_CtxParams_SObj(&obj_ctx);
7592 rgw_bucket bucket;
7593 bucket.tenant = tenant;
7594 bucket.name = bucket_name;
7595 return ctl.bucket->read_bucket_info(bucket, &info, y,
7596 RGWBucketCtl::BucketInstance::GetParams()
7597 .set_mtime(pmtime)
7598 .set_attrs(pattrs)
7599 .set_bectx_params(bectx_params));
b32b8144
FG
7600}
7601
7602int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
7603 ceph::real_time *pmtime,
7604 map<string, bufferlist> *pattrs)
7605{
9f95a23c
TL
7606 rgw_bucket bucket = info.bucket;
7607 bucket.bucket_id.clear();
b32b8144 7608
9f95a23c 7609 auto rv = info.objv_tracker.read_version;
b32b8144 7610
9f95a23c
TL
7611 return ctl.bucket->read_bucket_info(bucket, &info, null_yield,
7612 RGWBucketCtl::BucketInstance::GetParams()
7613 .set_mtime(pmtime)
7614 .set_attrs(pattrs)
7615 .set_refresh_version(rv));
7c673cae
FG
7616}
7617
7618int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
7619 real_time mtime, map<string, bufferlist> *pattrs)
7620{
9f95a23c
TL
7621 return ctl.bucket->store_bucket_instance_info(info.bucket, info, null_yield,
7622 RGWBucketCtl::BucketInstance::PutParams()
7623 .set_exclusive(exclusive)
7624 .set_mtime(mtime)
7625 .set_attrs(pattrs));
7c673cae
FG
7626}
7627
7628int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
7629 map<string, bufferlist> *pattrs, bool create_entry_point)
7630{
7631 bool create_head = !info.has_instance_obj || create_entry_point;
7632
7633 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
7634 if (ret < 0) {
7635 return ret;
7636 }
7637
7638 if (!create_head)
7639 return 0; /* done! */
7640
7641 RGWBucketEntryPoint entry_point;
7642 entry_point.bucket = info.bucket;
7643 entry_point.owner = info.owner;
7644 entry_point.creation_time = info.creation_time;
7645 entry_point.linked = true;
7646 RGWObjVersionTracker ot;
7647 if (pep_objv && !pep_objv->tag.empty()) {
7648 ot.write_version = *pep_objv;
7649 } else {
7650 ot.generate_new_write_ver(cct);
7651 if (pep_objv) {
7652 *pep_objv = ot.write_version;
7653 }
7654 }
9f95a23c
TL
7655 ret = ctl.bucket->store_bucket_entrypoint_info(info.bucket, entry_point, null_yield, RGWBucketCtl::Bucket::PutParams()
7656 .set_exclusive(exclusive)
7657 .set_objv_tracker(&ot)
7658 .set_mtime(mtime));
7c673cae
FG
7659 if (ret < 0)
7660 return ret;
7661
7662 return 0;
7663}
7664
7c673cae
FG
7665int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
7666{
11fdf7f2 7667 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
7668
7669 map<string, RGWBucketEnt>::iterator iter;
7670 for (iter = m.begin(); iter != m.end(); ++iter) {
7671 RGWBucketEnt& ent = iter->second;
7672 rgw_bucket& bucket = ent.bucket;
7673 ent.count = 0;
7674 ent.size = 0;
7675 ent.size_rounded = 0;
7676
a8e16298 7677 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7678
7679 RGWBucketInfo bucket_info;
9f95a23c 7680 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL, null_yield);
7c673cae
FG
7681 if (ret < 0) {
7682 return ret;
7683 }
7684
7685 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
7686 if (r < 0)
7687 return r;
7688
a8e16298 7689 auto hiter = headers.begin();
7c673cae
FG
7690 for (; hiter != headers.end(); ++hiter) {
7691 RGWObjCategory category = main_category;
11fdf7f2 7692 auto iter = (hiter->stats).find(category);
a8e16298 7693 if (iter != hiter->stats.end()) {
7c673cae
FG
7694 struct rgw_bucket_category_stats& stats = iter->second;
7695 ent.count += stats.num_entries;
7696 ent.size += stats.total_size;
7697 ent.size_rounded += stats.total_size_rounded;
7698 }
7699 }
3efd9988
FG
7700
7701 // fill in placement_rule from the bucket instance for use in swift's
7702 // per-storage policy statistics
7703 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
7704 }
7705
7706 return m.size();
7707}
7708
7709int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
7710{
7711 rgw_rados_ref ref;
7712 int r = get_raw_obj_ref(obj, &ref);
7713 if (r < 0) {
7714 return r;
7715 }
7716 librados::Rados *rad = get_rados_handle();
9f95a23c 7717 librados::AioCompletion *completion = rad->aio_create_completion(nullptr, nullptr);
7c673cae 7718
9f95a23c 7719 r = ref.pool.ioctx().aio_append(ref.obj.oid, completion, bl, size);
7c673cae
FG
7720 completion->release();
7721 return r;
7722}
7723
7c673cae
FG
7724int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
7725{
7726 librados::IoCtx& io_ctx = ctx.io_ctx;
7727 librados::NObjectIterator& iter = ctx.iter;
7728
494da23a 7729 int r = open_pool_ctx(pool, io_ctx, false);
7c673cae
FG
7730 if (r < 0)
7731 return r;
7732
7733 iter = io_ctx.nobjects_begin();
7734
7735 return 0;
7736}
7737
181888fb
FG
7738int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
7739{
7740 librados::IoCtx& io_ctx = ctx.io_ctx;
7741 librados::NObjectIterator& iter = ctx.iter;
7742
494da23a 7743 int r = open_pool_ctx(pool, io_ctx, false);
181888fb
FG
7744 if (r < 0)
7745 return r;
7746
7747 librados::ObjectCursor oc;
7748 if (!oc.from_str(cursor)) {
7749 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
7750 return -EINVAL;
7751 }
7752
f64942e4
AA
7753 try {
7754 iter = io_ctx.nobjects_begin(oc);
7755 return 0;
7756 } catch (const std::system_error& e) {
7757 r = -e.code().value();
7758 ldout(cct, 10) << "nobjects_begin threw " << e.what()
7759 << ", returning " << r << dendl;
7760 return r;
7761 } catch (const std::exception& e) {
7762 ldout(cct, 10) << "nobjects_begin threw " << e.what()
7763 << ", returning -5" << dendl;
7764 return -EIO;
7765 }
181888fb
FG
7766}
7767
7768string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
7769{
7770 return ctx.iter.get_cursor().to_str();
7771}
7772
f64942e4
AA
7773static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
7774 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
7775 bool *is_truncated, RGWAccessListFilter *filter)
7776{
7777 librados::IoCtx& io_ctx = ctx.io_ctx;
7778 librados::NObjectIterator& iter = ctx.iter;
7779
7780 if (iter == io_ctx.nobjects_end())
7781 return -ENOENT;
7782
7783 uint32_t i;
7784
7785 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
7786 rgw_bucket_dir_entry e;
7787
7788 string oid = iter->get_oid();
7789 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
7790
7791 // fill it in with initial values; we may correct later
7792 if (filter && !filter->filter(oid, oid))
7793 continue;
7794
7795 e.key = oid;
7796 objs.push_back(e);
7797 }
7798
7799 if (is_truncated)
7800 *is_truncated = (iter != io_ctx.nobjects_end());
7801
7802 return objs.size();
7803}
7c673cae 7804
f64942e4
AA
7805int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
7806 bool *is_truncated, RGWAccessListFilter *filter)
7807{
7808 // catch exceptions from NObjectIterator::operator++()
7809 try {
7810 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
7811 } catch (const std::system_error& e) {
7812 int r = -e.code().value();
7813 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7814 << ", returning " << r << dendl;
7815 return r;
7816 } catch (const std::exception& e) {
7817 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
7818 << ", returning -5" << dendl;
7819 return -EIO;
7820 }
7821}
7822
181888fb 7823int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 7824{
181888fb
FG
7825 if (!ctx->initialized) {
7826 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
7c673cae
FG
7827 if (r < 0) {
7828 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
7829 return r;
7830 }
181888fb 7831 ctx->initialized = true;
7c673cae 7832 }
181888fb
FG
7833 return 0;
7834}
7c673cae 7835
181888fb
FG
7836int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
7837 RGWListRawObjsCtx& ctx, list<string>& oids,
7838 bool *is_truncated)
7839{
7840 if (!ctx.initialized) {
7841 return -EINVAL;
7842 }
7843 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae
FG
7844 vector<rgw_bucket_dir_entry> objs;
7845 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
7846 if (r < 0) {
7847 if(r != -ENOENT)
7848 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
7849 return r;
7850 }
7851
7852 vector<rgw_bucket_dir_entry>::iterator iter;
7853 for (iter = objs.begin(); iter != objs.end(); ++iter) {
7854 oids.push_back(iter->key.name);
7855 }
7856
7857 return oids.size();
7858}
7859
181888fb
FG
7860int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
7861 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
7862 bool *is_truncated)
7863{
7864 if (!ctx.initialized) {
7865 int r = list_raw_objects_init(pool, string(), &ctx);
7866 if (r < 0) {
7867 return r;
7868 }
7869 }
7870
7871 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
7872}
7873
7874string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
7875{
7876 return pool_iterate_get_cursor(ctx.iter_ctx);
7877}
7878
a8e16298
TL
7879int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7880 rgw_bucket_dir_entry *dirent)
7c673cae 7881{
a8e16298 7882 rgw_cls_bi_entry bi_entry;
11fdf7f2 7883 int r = bi_get(bucket_info, obj, BIIndexType::Instance, &bi_entry);
a8e16298
TL
7884 if (r < 0 && r != -ENOENT) {
7885 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7886 }
7c673cae
FG
7887 if (r < 0) {
7888 return r;
7889 }
11fdf7f2 7890 auto iter = bi_entry.data.cbegin();
a8e16298 7891 try {
11fdf7f2 7892 decode(*dirent, iter);
a8e16298
TL
7893 } catch (buffer::error& err) {
7894 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7895 return -EIO;
7896 }
7897
7898 return 0;
7899}
7c673cae 7900
a8e16298
TL
7901int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7902 rgw_bucket_olh_entry *olh)
7903{
7c673cae 7904 rgw_cls_bi_entry bi_entry;
11fdf7f2 7905 int r = bi_get(bucket_info, obj, BIIndexType::OLH, &bi_entry);
7c673cae
FG
7906 if (r < 0 && r != -ENOENT) {
7907 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
7908 }
7909 if (r < 0) {
7910 return r;
7911 }
11fdf7f2 7912 auto iter = bi_entry.data.cbegin();
7c673cae 7913 try {
a8e16298 7914 decode(*olh, iter);
7c673cae
FG
7915 } catch (buffer::error& err) {
7916 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
7917 return -EIO;
7918 }
7919
7920 return 0;
7921}
7922
a8e16298
TL
7923int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7924 BIIndexType index_type, rgw_cls_bi_entry *entry)
7c673cae
FG
7925{
7926 BucketShard bs(this);
a8e16298 7927 int ret = bs.init(bucket_info, obj);
7c673cae
FG
7928 if (ret < 0) {
7929 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7930 return ret;
7931 }
7932
7933 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
9f95a23c
TL
7934
7935 auto& ref = bs.bucket_obj.get_ref();
7c673cae 7936
9f95a23c 7937 return cls_rgw_bi_get(ref.pool.ioctx(), ref.obj.oid, index_type, key, entry);
7c673cae
FG
7938}
7939
7940void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
7941{
9f95a23c
TL
7942 auto& ref = bs.bucket_obj.get_ref();
7943 cls_rgw_bi_put(op, ref.obj.oid, entry);
7c673cae
FG
7944}
7945
7946int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
7947{
9f95a23c
TL
7948 auto& ref = bs.bucket_obj.get_ref();
7949 int ret = cls_rgw_bi_put(ref.pool.ioctx(), ref.obj.oid, entry);
7c673cae
FG
7950 if (ret < 0)
7951 return ret;
7952
7953 return 0;
7954}
7955
7956int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
7957{
7958 BucketShard bs(this);
f64942e4 7959 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
7960 if (ret < 0) {
7961 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7962 return ret;
7963 }
7964
7965 return bi_put(bs, entry);
7966}
7967
7968int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7969{
7970 rgw_obj obj(bucket, obj_name);
7971 BucketShard bs(this);
f64942e4 7972 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
7973 if (ret < 0) {
7974 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7975 return ret;
7976 }
7977
9f95a23c
TL
7978 auto& ref = bs.bucket_obj.get_ref();
7979 ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, obj_name, marker, max, entries, is_truncated);
31f18b77
FG
7980 if (ret == -ENOENT) {
7981 *is_truncated = false;
7982 }
7c673cae
FG
7983 if (ret < 0)
7984 return ret;
7985
7986 return 0;
7987}
7988
7989int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
7990{
9f95a23c
TL
7991 auto& ref = bs.bucket_obj.get_ref();
7992 int ret = cls_rgw_bi_list(ref.pool.ioctx(), ref.obj.oid, filter_obj, marker, max, entries, is_truncated);
7c673cae
FG
7993 if (ret < 0)
7994 return ret;
7995
7996 return 0;
7997}
7998
7999int RGWRados::bi_remove(BucketShard& bs)
8000{
9f95a23c
TL
8001 auto& ref = bs.bucket_obj.get_ref();
8002 int ret = ref.pool.ioctx().remove(ref.obj.oid);
7c673cae
FG
8003 if (ret == -ENOENT) {
8004 ret = 0;
8005 }
8006 if (ret < 0) {
8007 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
8008 return ret;
8009 }
8010
8011 return 0;
8012}
8013
8014int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8015{
8016 BucketShard bs(this);
f64942e4 8017 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
7c673cae
FG
8018 if (ret < 0) {
8019 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8020 return ret;
8021 }
8022
8023 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8024}
8025
8026int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
8027{
9f95a23c 8028 return rgw_rados_operate(gc_pool_ctx, oid, op, null_yield);
7c673cae
FG
8029}
8030
9f95a23c
TL
8031int RGWRados::gc_aio_operate(const string& oid, librados::AioCompletion *c,
8032 librados::ObjectWriteOperation *op)
7c673cae 8033{
9f95a23c 8034 return gc_pool_ctx.aio_operate(oid, c, op);
7c673cae
FG
8035}
8036
8037int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
8038{
9f95a23c 8039 return rgw_rados_operate(gc_pool_ctx, oid, op, pbl, null_yield);
7c673cae
FG
8040}
8041
9f95a23c 8042int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated, bool& processing_queue)
7c673cae 8043{
9f95a23c 8044 return gc->list(index, marker, max, expired_only, result, truncated, processing_queue);
7c673cae
FG
8045}
8046
11fdf7f2 8047int RGWRados::process_gc(bool expired_only)
7c673cae 8048{
11fdf7f2 8049 return gc->process(expired_only);
7c673cae
FG
8050}
8051
f6b5b4d7
TL
8052int RGWRados::list_lc_progress(string& marker, uint32_t max_entries,
8053 vector<cls_rgw_lc_entry>& progress_map,
8054 int& index)
7c673cae 8055{
f6b5b4d7 8056 return lc->list_lc_progress(marker, max_entries, progress_map, index);
7c673cae
FG
8057}
8058
8059int RGWRados::process_lc()
8060{
f6b5b4d7
TL
8061 RGWLC lc;
8062 lc.initialize(cct, this->store);
8063 RGWLC::LCWorker worker(&lc, cct, &lc, 0);
8064 auto ret = lc.process(&worker, true /* once */);
8065 lc.stop_processor(); // sets down_flag, but returns immediately
8066 return ret;
7c673cae
FG
8067}
8068
1adf2230 8069bool RGWRados::process_expire_objects()
7c673cae 8070{
1adf2230 8071 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
7c673cae
FG
8072}
8073
7c673cae 8074int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
9f95a23c 8075 rgw_obj& obj, uint16_t bilog_flags, optional_yield y, rgw_zone_set *_zones_trace)
7c673cae 8076{
31f18b77
FG
8077 rgw_zone_set zones_trace;
8078 if (_zones_trace) {
8079 zones_trace = *_zones_trace;
8080 }
9f95a23c 8081 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8082
7c673cae
FG
8083 ObjectWriteOperation o;
8084 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77 8085 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
11fdf7f2 8086 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
9f95a23c 8087 return bs.bucket_obj.operate(&o, y);
7c673cae
FG
8088}
8089
31f18b77 8090int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
8091 int64_t pool, uint64_t epoch,
8092 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8093 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 8094{
7c673cae
FG
8095 ObjectWriteOperation o;
8096 rgw_bucket_dir_entry_meta dir_meta;
8097 dir_meta = ent.meta;
8098 dir_meta.category = category;
8099
1adf2230
AA
8100 rgw_zone_set zones_trace;
8101 if (_zones_trace) {
8102 zones_trace = *_zones_trace;
8103 }
9f95a23c 8104 zones_trace.insert(svc.zone->get_zone().id, bs.bucket.get_key());
1adf2230 8105
7c673cae
FG
8106 rgw_bucket_entry_ver ver;
8107 ver.pool = pool;
8108 ver.epoch = epoch;
8109 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
8110 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
8111 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8112 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
8113 complete_op_data *arg;
8114 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 8115 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77 8116 librados::AioCompletion *completion = arg->rados_completion;
9f95a23c 8117 int ret = bs.bucket_obj.aio_operate(arg->rados_completion, &o);
31f18b77 8118 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
8119 return ret;
8120}
8121
31f18b77 8122int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
8123 int64_t pool, uint64_t epoch,
8124 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8125 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 8126{
31f18b77 8127 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
8128}
8129
8130int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
8131 int64_t pool, uint64_t epoch,
8132 rgw_obj& obj,
8133 real_time& removed_mtime,
8134 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
8135 uint16_t bilog_flags,
8136 rgw_zone_set *zones_trace)
7c673cae
FG
8137{
8138 rgw_bucket_dir_entry ent;
8139 ent.meta.mtime = removed_mtime;
8140 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8141 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
8142 ent, RGWObjCategory::None, remove_objs,
8143 bilog_flags, zones_trace);
7c673cae
FG
8144}
8145
31f18b77 8146int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
8147{
8148 rgw_bucket_dir_entry ent;
8149 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
8150 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
8151 -1 /* pool id */, 0, ent,
8152 RGWObjCategory::None, NULL, bilog_flags,
8153 zones_trace);
7c673cae
FG
8154}
8155
8156int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
8157{
9f95a23c 8158 RGWSI_RADOS::Pool index_pool;
7c673cae 8159 map<int, string> bucket_objs;
9f95a23c 8160 int r = svc.bi_rados->open_bucket_index(bucket_info, std::nullopt, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
8161 if (r < 0)
8162 return r;
8163
9f95a23c
TL
8164 return CLSRGWIssueSetTagTimeout(index_pool.ioctx(), bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
8165}
8166
8167
8168uint32_t RGWRados::calc_ordered_bucket_list_per_shard(uint32_t num_entries,
8169 uint32_t num_shards)
8170{
8171 // We want to minimize the chances that when num_shards >>
8172 // num_entries that we return much fewer than num_entries to the
8173 // client. Given all the overhead of making a cls call to the osd,
8174 // returning a few entries is not much more work than returning one
8175 // entry. This minimum might be better tuned based on future
8176 // experiments where num_shards >> num_entries. (Note: ">>" should
8177 // be interpreted as "much greater than".)
8178 constexpr uint32_t min_read = 8;
8179
8180 // The following is based on _"Balls into Bins" -- A Simple and
8181 // Tight Analysis_ by Raab and Steger. We add 1 as a way to handle
8182 // cases when num_shards >> num_entries (it almost serves as a
8183 // ceiling calculation). We also assume alpha is 1.0 and extract it
8184 // from the calculation. Future work could involve memoizing some of
8185 // the transcendental functions to minimize repeatedly re-calling
8186 // them with the same parameters, which we expect to be the case the
8187 // majority of the time.
8188 uint32_t calc_read =
8189 1 +
8190 static_cast<uint32_t>((num_entries / num_shards) +
8191 sqrt((2 * num_entries) *
8192 log(num_shards) / num_shards));
8193
8194 return std::max(min_read, calc_read);
7c673cae
FG
8195}
8196
1adf2230
AA
8197
8198int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
9f95a23c
TL
8199 const int shard_id,
8200 const rgw_obj_index_key& start_after,
1adf2230 8201 const string& prefix,
9f95a23c
TL
8202 const string& delimiter,
8203 const uint32_t num_entries,
8204 const bool list_versions,
8205 const uint16_t expansion_factor,
8206 ent_map_t& m,
8207 bool* is_truncated,
8208 bool* cls_filtered,
1adf2230 8209 rgw_obj_index_key *last_entry,
9f95a23c
TL
8210 optional_yield y,
8211 check_filter_t force_check_filter)
7c673cae 8212{
9f95a23c
TL
8213 /* expansion_factor allows the number of entries to read to grow
8214 * exponentially; this is used when earlier reads are producing too
8215 * few results, perhaps due to filtering or to a series of
8216 * namespaced entries */
8217
8218 ldout(cct, 10) << "RGWRados::" << __func__ << ": " << bucket_info.bucket <<
8219 " start_after=\"" << start_after.name <<
8220 "[" << start_after.instance <<
8221 "]\", prefix=\"" << prefix <<
8222 "\" num_entries=" << num_entries <<
8223 ", list_versions=" << list_versions <<
8224 ", expansion_factor=" << expansion_factor << dendl;
7c673cae 8225
9f95a23c
TL
8226 m.clear();
8227
8228 RGWSI_RADOS::Pool index_pool;
7c673cae 8229 // key - oid (for different shards if there is any)
1adf2230
AA
8230 // value - list result for the corresponding oid (shard), it is filled by
8231 // the AIO callback
9f95a23c
TL
8232 map<int, string> shard_oids;
8233 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id,
8234 &index_pool, &shard_oids,
8235 nullptr);
8236 if (r < 0) {
7c673cae 8237 return r;
9f95a23c
TL
8238 }
8239
8240 const uint32_t shard_count = shard_oids.size();
8241 uint32_t num_entries_per_shard;
8242 if (expansion_factor == 0) {
8243 num_entries_per_shard =
8244 calc_ordered_bucket_list_per_shard(num_entries, shard_count);
8245 } else if (expansion_factor <= 11) {
8246 // we'll max out the exponential multiplication factor at 1024 (2<<10)
8247 num_entries_per_shard =
8248 std::min(num_entries,
8249 (uint32_t(1 << (expansion_factor - 1)) *
8250 calc_ordered_bucket_list_per_shard(num_entries, shard_count)));
8251 } else {
8252 num_entries_per_shard = num_entries;
8253 }
8254
8255 ldout(cct, 10) << "RGWRados::" << __func__ <<
8256 " request from each of " << shard_count <<
8257 " shard(s) for " << num_entries_per_shard << " entries to get " <<
8258 num_entries << " total entries" << dendl;
7c673cae 8259
9f95a23c
TL
8260 auto& ioctx = index_pool.ioctx();
8261 map<int, rgw_cls_list_ret> shard_list_results;
8262 cls_rgw_obj_key start_after_key(start_after.name, start_after.instance);
8263 r = CLSRGWIssueBucketList(ioctx, start_after_key, prefix, delimiter,
8264 num_entries_per_shard,
8265 list_versions, shard_oids, shard_list_results,
1adf2230 8266 cct->_conf->rgw_bucket_index_max_aio)();
9f95a23c 8267 if (r < 0) {
7c673cae 8268 return r;
9f95a23c 8269 }
7c673cae 8270
9f95a23c
TL
8271 // to manage the iterators through each shard's list results
8272 struct ShardTracker {
8273 const size_t shard_idx;
8274 rgw_cls_list_ret& result;
8275 const std::string& oid_name;
8276 RGWRados::ent_map_t::iterator cursor;
8277 RGWRados::ent_map_t::iterator end;
8278
8279 // manages an iterator through a shard and provides other
8280 // accessors
8281 ShardTracker(size_t _shard_idx,
8282 rgw_cls_list_ret& _result,
8283 const std::string& _oid_name):
8284 shard_idx(_shard_idx),
8285 result(_result),
8286 oid_name(_oid_name),
8287 cursor(_result.dir.m.begin()),
8288 end(_result.dir.m.end())
8289 {}
8290
8291 inline const std::string& entry_name() const {
8292 return cursor->first;
8293 }
8294 rgw_bucket_dir_entry& dir_entry() const {
8295 return cursor->second;
8296 }
8297 inline bool is_truncated() const {
8298 return result.is_truncated;
8299 }
8300 inline ShardTracker& advance() {
8301 ++cursor;
8302 // return a self-reference to allow for chaining of calls, such
8303 // as x.advance().at_end()
8304 return *this;
8305 }
8306 inline bool at_end() const {
8307 return cursor == end;
8308 }
8309 }; // ShardTracker
8310
8311 // add the next unique candidate, or return false if we reach the end
8312 auto next_candidate = [] (ShardTracker& t,
8313 std::map<std::string, size_t>& candidates,
8314 size_t tracker_idx) {
8315 while (!t.at_end()) {
8316 if (candidates.emplace(t.entry_name(), tracker_idx).second) {
8317 return;
8318 }
8319 t.advance(); // skip duplicate common prefixes
8320 }
8321 };
8322
8323 // one tracker per shard requested (may not be all shards)
8324 std::vector<ShardTracker> results_trackers;
8325 results_trackers.reserve(shard_list_results.size());
8326 for (auto& r : shard_list_results) {
8327 results_trackers.emplace_back(r.first, r.second, shard_oids[r.first]);
8328
8329 // if any *one* shard's result is trucated, the entire result is
8330 // truncated
8331 *is_truncated = *is_truncated || r.second.is_truncated;
8332
8333 // unless *all* are shards are cls_filtered, the entire result is
8334 // not filtered
8335 *cls_filtered = *cls_filtered && r.second.cls_filtered;
7c673cae
FG
8336 }
8337
9f95a23c
TL
8338 // create a map to track the next candidate entry from ShardTracker
8339 // (key=candidate, value=index into results_trackers); as we consume
8340 // entries from shards, we replace them with the next entries in the
8341 // shards until we run out
7c673cae 8342 map<string, size_t> candidates;
9f95a23c
TL
8343 size_t tracker_idx = 0;
8344 for (auto& t : results_trackers) {
8345 // it's important that the values in the map refer to the index
8346 // into the results_trackers vector, which may not be the same
8347 // as the shard number (i.e., when not all shards are requested)
8348 next_candidate(t, candidates, tracker_idx);
8349 ++tracker_idx;
7c673cae
FG
8350 }
8351
9f95a23c
TL
8352 rgw_bucket_dir_entry*
8353 last_entry_visited = nullptr; // to set last_entry (marker)
7c673cae
FG
8354 map<string, bufferlist> updates;
8355 uint32_t count = 0;
8356 while (count < num_entries && !candidates.empty()) {
8357 r = 0;
9f95a23c
TL
8358 // select the next entry in lexical order (first key in map);
8359 // again tracker_idx is not necessarily shard number, but is index
8360 // into results_trackers vector
8361 tracker_idx = candidates.begin()->second;
8362 auto& tracker = results_trackers.at(tracker_idx);
e306af50 8363
9f95a23c
TL
8364 const string& name = tracker.entry_name();
8365 rgw_bucket_dir_entry& dirent = tracker.dir_entry();
8366
8367 ldout(cct, 20) << "RGWRados::" << __func__ << " currently processing " <<
8368 dirent.key << " from shard " << tracker.shard_idx << dendl;
8369
8370 const bool force_check =
8371 force_check_filter && force_check_filter(dirent.key.name);
8372
8373 if ((!dirent.exists &&
8374 !dirent.is_delete_marker() &&
8375 !dirent.is_common_prefix()) ||
3efd9988
FG
8376 !dirent.pending_map.empty() ||
8377 force_check) {
9f95a23c
TL
8378 /* there are uncommitted ops. We need to check the current
8379 * state, and if the tags are old we need to do clean-up as
8380 * well. */
7c673cae 8381 librados::IoCtx sub_ctx;
9f95a23c 8382 sub_ctx.dup(ioctx);
1adf2230 8383 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
9f95a23c 8384 updates[tracker.oid_name], y);
7c673cae 8385 if (r < 0 && r != -ENOENT) {
9f95a23c 8386 return r;
7c673cae 8387 }
eafe8130 8388 } else {
9f95a23c 8389 r = 0;
7c673cae 8390 }
9f95a23c 8391
7c673cae 8392 if (r >= 0) {
9f95a23c 8393 ldout(cct, 10) << "RGWRados::" << __func__ << ": got " <<
1adf2230 8394 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
7c673cae 8395 m[name] = std::move(dirent);
e306af50 8396 last_entry_visited = &(m[name]);
7c673cae 8397 ++count;
9f95a23c
TL
8398 } else {
8399 ldout(cct, 10) << "RGWRados::" << __func__ << ": skipping " <<
8400 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
e306af50 8401 last_entry_visited = &tracker.dir_entry();
7c673cae
FG
8402 }
8403
9f95a23c 8404 // refresh the candidates map
7c673cae 8405 candidates.erase(candidates.begin());
9f95a23c
TL
8406 tracker.advance();
8407
8408 next_candidate(tracker, candidates, tracker_idx);
8409
8410 if (tracker.at_end() && tracker.is_truncated()) {
8411 // once we exhaust one shard that is truncated, we need to stop,
8412 // as we cannot be certain that one of the next entries needs to
8413 // come from that shard; S3 and swift protocols allow returning
8414 // fewer than what was requested
8415 break;
7c673cae 8416 }
9f95a23c 8417 } // while we haven't provided requested # of result entries
7c673cae 8418
9f95a23c
TL
8419 // suggest updates if there are any
8420 for (auto& miter : updates) {
8421 if (miter.second.length()) {
7c673cae 8422 ObjectWriteOperation o;
9f95a23c 8423 cls_rgw_suggest_changes(o, miter.second);
7c673cae 8424 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8425 AioCompletion *c =
8426 librados::Rados::aio_create_completion(nullptr, nullptr);
8427 ioctx.aio_operate(miter.first, c, &o);
1adf2230 8428 c->release();
7c673cae 8429 }
9f95a23c 8430 } // updates loop
7c673cae 8431
9f95a23c
TL
8432 // determine truncation by checking if all the returned entries are
8433 // consumed or not
8434 *is_truncated = false;
8435 for (const auto& t : results_trackers) {
8436 if (!t.at_end() || t.is_truncated()) {
7c673cae 8437 *is_truncated = true;
1adf2230
AA
8438 break;
8439 }
7c673cae 8440 }
92f5a8d4 8441
9f95a23c
TL
8442 ldout(cct, 20) << "RGWRados::" << __func__ <<
8443 ": returning, count=" << count << ", is_truncated=" << *is_truncated <<
8444 dendl;
8445
8446 if (*is_truncated && count < num_entries) {
8447 ldout(cct, 10) << "RGWRados::" << __func__ <<
8448 ": INFO requested " << num_entries << " entries but returning " <<
8449 count << ", which is truncated" << dendl;
8450 }
8451
8452 if (last_entry_visited != nullptr && last_entry) {
e306af50 8453 *last_entry = last_entry_visited->key;
9f95a23c
TL
8454 ldout(cct, 20) << "RGWRados::" << __func__ <<
8455 ": returning, last_entry=" << *last_entry << dendl;
8456 } else {
8457 ldout(cct, 20) << "RGWRados::" << __func__ <<
8458 ": returning, last_entry NOT SET" << dendl;
8459 }
7c673cae
FG
8460
8461 return 0;
8462}
8463
1adf2230
AA
8464
8465int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
8466 int shard_id,
9f95a23c 8467 const rgw_obj_index_key& start_after,
1adf2230
AA
8468 const string& prefix,
8469 uint32_t num_entries,
8470 bool list_versions,
8471 std::vector<rgw_bucket_dir_entry>& ent_list,
8472 bool *is_truncated,
8473 rgw_obj_index_key *last_entry,
9f95a23c
TL
8474 optional_yield y,
8475 check_filter_t force_check_filter) {
1adf2230 8476 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
9f95a23c 8477 " start_after " << start_after.name << "[" << start_after.instance <<
1adf2230
AA
8478 "] num_entries " << num_entries << dendl;
8479
9f95a23c 8480 ent_list.clear();
11fdf7f2
TL
8481 static MultipartMetaFilter multipart_meta_filter;
8482
1adf2230 8483 *is_truncated = false;
9f95a23c 8484 RGWSI_RADOS::Pool index_pool;
1adf2230 8485
1adf2230 8486 map<int, string> oids;
9f95a23c 8487 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &oids, nullptr);
1adf2230
AA
8488 if (r < 0)
8489 return r;
9f95a23c
TL
8490
8491 auto& ioctx = index_pool.ioctx();
8492
1adf2230
AA
8493 const uint32_t num_shards = oids.size();
8494
9f95a23c 8495 rgw_obj_index_key marker = start_after;
1adf2230
AA
8496 uint32_t current_shard;
8497 if (shard_id >= 0) {
8498 current_shard = shard_id;
9f95a23c 8499 } else if (start_after.empty()) {
1adf2230
AA
8500 current_shard = 0u;
8501 } else {
9f95a23c
TL
8502 // at this point we have a marker (start_after) that has something
8503 // in it, so we need to get to the bucket shard index, so we can
11fdf7f2
TL
8504 // start reading from there
8505
8506 std::string key;
8507 // test whether object name is a multipart meta name
9f95a23c 8508 if(! multipart_meta_filter.filter(start_after.name, key)) {
11fdf7f2
TL
8509 // if multipart_meta_filter fails, must be "regular" (i.e.,
8510 // unadorned) and the name is the key
9f95a23c 8511 key = start_after.name;
11fdf7f2
TL
8512 }
8513
8514 // now convert the key (oid) to an rgw_obj_key since that will
8515 // separate out the namespace, name, and instance
8516 rgw_obj_key obj_key;
8517 bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key);
8518 if (!parsed) {
8519 ldout(cct, 0) <<
8520 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9f95a23c 8521 "start marker: '" << start_after << "'" << dendl;
11fdf7f2
TL
8522 return -EINVAL;
8523 } else if (obj_key.name.empty()) {
8524 // if the name is empty that means the object name came in with
8525 // a namespace only, and therefore we need to start our scan at
8526 // the first bucket index shard
8527 current_shard = 0u;
8528 } else {
8529 // so now we have the key used to compute the bucket index shard
8530 // and can extract the specific shard from it
9f95a23c 8531 current_shard = svc.bi_rados->bucket_shard_index(obj_key.name, num_shards);
11fdf7f2 8532 }
1adf2230
AA
8533 }
8534
8535 uint32_t count = 0u;
8536 map<string, bufferlist> updates;
11fdf7f2 8537 rgw_obj_index_key last_added_entry;
1adf2230
AA
8538 while (count <= num_entries &&
8539 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
8540 current_shard < num_shards)) {
81eedcae
TL
8541 const std::string& oid = oids[current_shard];
8542 rgw_cls_list_ret result;
8543
8544 librados::ObjectReadOperation op;
9f95a23c
TL
8545 string empty_delimiter;
8546 cls_rgw_bucket_list_op(op, marker, prefix, empty_delimiter,
8547 num_entries,
81eedcae 8548 list_versions, &result);
9f95a23c 8549 r = rgw_rados_operate(ioctx, oid, &op, nullptr, null_yield);
1adf2230
AA
8550 if (r < 0)
8551 return r;
8552
1adf2230
AA
8553 for (auto& entry : result.dir.m) {
8554 rgw_bucket_dir_entry& dirent = entry.second;
8555
8556 bool force_check = force_check_filter &&
8557 force_check_filter(dirent.key.name);
8558 if ((!dirent.exists && !dirent.is_delete_marker()) ||
8559 !dirent.pending_map.empty() ||
8560 force_check) {
8561 /* there are uncommitted ops. We need to check the current state,
8562 * and if the tags are old we need to do cleanup as well. */
8563 librados::IoCtx sub_ctx;
9f95a23c
TL
8564 sub_ctx.dup(ioctx);
8565 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid], y);
1adf2230
AA
8566 if (r < 0 && r != -ENOENT) {
8567 return r;
8568 }
eafe8130
TL
8569 } else {
8570 r = 0;
1adf2230
AA
8571 }
8572
8573 // at this point either r >=0 or r == -ENOENT
8574 if (r >= 0) { // i.e., if r != -ENOENT
8575 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
8576 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
8577
8578 if (count < num_entries) {
11fdf7f2 8579 marker = last_added_entry = dirent.key; // double assign
1adf2230
AA
8580 ent_list.emplace_back(std::move(dirent));
8581 ++count;
8582 } else {
8583 *is_truncated = true;
8584 goto check_updates;
8585 }
8586 } else { // r == -ENOENT
8587 // in the case of -ENOENT, make sure we're advancing marker
8588 // for possible next call to CLSRGWIssueBucketList
11fdf7f2 8589 marker = dirent.key;
1adf2230
AA
8590 }
8591 } // entry for loop
8592
8593 if (!result.is_truncated) {
8594 // if we reached the end of the shard read next shard
8595 ++current_shard;
11fdf7f2 8596 marker = rgw_obj_index_key();
1adf2230
AA
8597 }
8598 } // shard loop
8599
8600check_updates:
11fdf7f2 8601
1adf2230
AA
8602 // suggest updates if there is any
8603 map<string, bufferlist>::iterator miter = updates.begin();
8604 for (; miter != updates.end(); ++miter) {
8605 if (miter->second.length()) {
8606 ObjectWriteOperation o;
8607 cls_rgw_suggest_changes(o, miter->second);
8608 // we don't care if we lose suggested updates, send them off blindly
9f95a23c
TL
8609 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
8610 ioctx.aio_operate(miter->first, c, &o);
1adf2230
AA
8611 c->release();
8612 }
8613 }
8614
8615 if (last_entry && !ent_list.empty()) {
8616 *last_entry = last_added_entry;
8617 }
8618
8619 return 0;
11fdf7f2 8620} // RGWRados::cls_bucket_list_unordered
1adf2230
AA
8621
8622
8623int RGWRados::cls_obj_usage_log_add(const string& oid,
8624 rgw_usage_log_info& info)
7c673cae 8625{
11fdf7f2 8626 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8627
8628 rgw_rados_ref ref;
224ce89b 8629 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
8630 if (r < 0) {
8631 return r;
8632 }
8633
8634 ObjectWriteOperation op;
8635 cls_rgw_usage_log_add(op, info);
8636
9f95a23c 8637 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
8638 return r;
8639}
8640
11fdf7f2
TL
8641int RGWRados::cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket,
8642 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
8643 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
8644 bool *is_truncated)
7c673cae 8645{
11fdf7f2 8646 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8647
8648 rgw_rados_ref ref;
224ce89b 8649 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
8650 if (r < 0) {
8651 return r;
8652 }
8653
8654 *is_truncated = false;
8655
9f95a23c 8656 r = cls_rgw_usage_log_read(ref.pool.ioctx(), ref.obj.oid, user, bucket, start_epoch, end_epoch,
7c673cae
FG
8657 max_entries, read_iter, usage, is_truncated);
8658
8659 return r;
8660}
8661
9f95a23c
TL
8662static int cls_rgw_usage_log_trim_repeat(rgw_rados_ref ref, const string& user, const string& bucket, uint64_t start_epoch, uint64_t end_epoch)
8663{
8664 bool done = false;
8665 do {
8666 librados::ObjectWriteOperation op;
8667 cls_rgw_usage_log_trim(op, user, bucket, start_epoch, end_epoch);
8668 int r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
8669 if (r == -ENODATA)
8670 done = true;
8671 else if (r < 0)
8672 return r;
8673 } while (!done);
8674
8675 return 0;
8676}
8677
11fdf7f2
TL
8678int RGWRados::cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket,
8679 uint64_t start_epoch, uint64_t end_epoch)
7c673cae 8680{
11fdf7f2 8681 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
8682
8683 rgw_rados_ref ref;
224ce89b 8684 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
8685 if (r < 0) {
8686 return r;
8687 }
8688
9f95a23c 8689 r = cls_rgw_usage_log_trim_repeat(ref, user, bucket, start_epoch, end_epoch);
11fdf7f2
TL
8690 return r;
8691}
8692
8693int RGWRados::cls_obj_usage_log_clear(string& oid)
8694{
8695 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
8696
8697 rgw_rados_ref ref;
8698 int r = get_raw_obj_ref(obj, &ref);
8699 if (r < 0) {
8700 return r;
8701 }
8702 librados::ObjectWriteOperation op;
8703 cls_rgw_usage_log_clear(op);
9f95a23c 8704 r = rgw_rados_operate(ref.pool.ioctx(), ref.obj.oid, &op, null_yield);
7c673cae
FG
8705 return r;
8706}
8707
11fdf7f2 8708
7c673cae
FG
8709int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
8710{
9f95a23c 8711 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
8712 string dir_oid;
8713
11fdf7f2 8714 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae 8715
9f95a23c 8716 int r = svc.bi_rados->open_bucket_index(bucket_info, &index_pool, &dir_oid);
7c673cae
FG
8717 if (r < 0)
8718 return r;
8719
8720 bufferlist updates;
8721
8722 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
8723 rgw_bucket_dir_entry entry;
8724 entry.key = *iter;
8725 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
8726 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
8727 updates.append(CEPH_RGW_REMOVE | suggest_flag);
11fdf7f2 8728 encode(entry, updates);
7c673cae
FG
8729 }
8730
8731 bufferlist out;
8732
9f95a23c 8733 r = index_pool.ioctx().exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
7c673cae
FG
8734
8735 return r;
8736}
8737
8738int RGWRados::check_disk_state(librados::IoCtx io_ctx,
8739 const RGWBucketInfo& bucket_info,
8740 rgw_bucket_dir_entry& list_state,
8741 rgw_bucket_dir_entry& object,
9f95a23c
TL
8742 bufferlist& suggested_updates,
8743 optional_yield y)
7c673cae
FG
8744{
8745 const rgw_bucket& bucket = bucket_info.bucket;
11fdf7f2 8746 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae
FG
8747
8748 std::string loc;
8749
8750 rgw_obj obj(bucket, list_state.key);
8751
8752 string oid;
8753 get_obj_bucket_and_oid_loc(obj, oid, loc);
8754
8755 if (loc != list_state.locator) {
8756 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
8757 }
8758
8759 io_ctx.locator_set_key(list_state.locator);
8760
8761 RGWObjState *astate = NULL;
9f95a23c
TL
8762 RGWObjectCtx rctx(this->store);
8763 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false, y);
7c673cae
FG
8764 if (r < 0)
8765 return r;
8766
8767 list_state.pending_map.clear(); // we don't need this and it inflates size
9f95a23c 8768 if (!list_state.is_delete_marker() && !astate->exists) {
7c673cae
FG
8769 /* object doesn't exist right now -- hopefully because it's
8770 * marked as !exists and got deleted */
8771 if (list_state.exists) {
8772 /* FIXME: what should happen now? Work out if there are any
8773 * non-bad ways this could happen (there probably are, but annoying
8774 * to handle!) */
8775 }
8776 // encode a suggested removal of that key
8777 list_state.ver.epoch = io_ctx.get_last_version();
8778 list_state.ver.pool = io_ctx.get_id();
8779 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
8780 return -ENOENT;
8781 }
8782
8783 string etag;
8784 string content_type;
8785 ACLOwner owner;
8786
8787 object.meta.size = astate->size;
8788 object.meta.accounted_size = astate->accounted_size;
8789 object.meta.mtime = astate->mtime;
8790
8791 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
8792 if (iter != astate->attrset.end()) {
11fdf7f2 8793 etag = rgw_bl_str(iter->second);
7c673cae
FG
8794 }
8795 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
8796 if (iter != astate->attrset.end()) {
11fdf7f2 8797 content_type = rgw_bl_str(iter->second);
7c673cae
FG
8798 }
8799 iter = astate->attrset.find(RGW_ATTR_ACL);
8800 if (iter != astate->attrset.end()) {
8801 r = decode_policy(iter->second, &owner);
8802 if (r < 0) {
8803 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
8804 }
8805 }
8806
9f95a23c 8807 if (astate->manifest) {
7c673cae 8808 RGWObjManifest::obj_iterator miter;
9f95a23c 8809 RGWObjManifest& manifest = *astate->manifest;
7c673cae
FG
8810 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
8811 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
8812 rgw_obj loc;
9f95a23c 8813 RGWSI_Tier_RADOS::raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
7c673cae
FG
8814
8815 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
8816 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
494da23a 8817 r = delete_obj_index(loc, astate->mtime);
7c673cae
FG
8818 if (r < 0) {
8819 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
8820 }
8821 }
8822 }
8823 }
8824
8825 object.meta.etag = etag;
8826 object.meta.content_type = content_type;
8827 object.meta.owner = owner.get_id().to_str();
8828 object.meta.owner_display_name = owner.get_display_name();
8829
8830 // encode suggested updates
8831 list_state.ver.pool = io_ctx.get_id();
8832 list_state.ver.epoch = astate->epoch;
8833 list_state.meta.size = object.meta.size;
8834 list_state.meta.accounted_size = object.meta.accounted_size;
8835 list_state.meta.mtime = object.meta.mtime;
8836 list_state.meta.category = main_category;
8837 list_state.meta.etag = etag;
8838 list_state.meta.content_type = content_type;
8839 if (astate->obj_tag.length() > 0)
8840 list_state.tag = astate->obj_tag.c_str();
8841 list_state.meta.owner = owner.get_id().to_str();
8842 list_state.meta.owner_display_name = owner.get_display_name();
8843
8844 list_state.exists = true;
8845 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
8846 return 0;
8847}
8848
a8e16298 8849int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
7c673cae 8850{
9f95a23c 8851 RGWSI_RADOS::Pool index_pool;
7c673cae
FG
8852 map<int, string> oids;
8853 map<int, struct rgw_cls_list_ret> list_results;
9f95a23c
TL
8854 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &oids, bucket_instance_ids);
8855 if (r < 0) {
8856 ldout(cct, 20) << "cls_bucket_head: open_bucket_index() returned "
8857 << r << dendl;
7c673cae 8858 return r;
9f95a23c 8859 }
7c673cae 8860
9f95a23c
TL
8861 r = CLSRGWIssueGetDirHeader(index_pool.ioctx(), oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
8862 if (r < 0) {
8863 ldout(cct, 20) << "cls_bucket_head: CLSRGWIssueGetDirHeader() returned "
8864 << r << dendl;
7c673cae 8865 return r;
9f95a23c 8866 }
7c673cae
FG
8867
8868 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
8869 for(; iter != list_results.end(); ++iter) {
a8e16298 8870 headers.push_back(std::move(iter->second.dir.header));
7c673cae
FG
8871 }
8872 return 0;
8873}
8874
8875int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
8876{
9f95a23c 8877 RGWSI_RADOS::Pool index_pool;
7c673cae 8878 map<int, string> bucket_objs;
9f95a23c 8879 int r = svc.bi_rados->open_bucket_index(bucket_info, shard_id, &index_pool, &bucket_objs, nullptr);
7c673cae
FG
8880 if (r < 0)
8881 return r;
8882
8883 map<int, string>::iterator iter = bucket_objs.begin();
8884 for (; iter != bucket_objs.end(); ++iter) {
9f95a23c 8885 r = cls_rgw_get_dir_header_async(index_pool.ioctx(), iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
7c673cae
FG
8886 if (r < 0) {
8887 ctx->put();
8888 break;
8889 } else {
8890 (*num_aio)++;
8891 }
8892 }
8893 return r;
8894}
8895
9f95a23c
TL
8896int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info,
8897 const rgw_bucket& bucket,
8898 uint64_t num_objs)
31f18b77 8899{
11fdf7f2 8900 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
31f18b77
FG
8901 return 0;
8902 }
8903
8904 bool need_resharding = false;
9f95a23c
TL
8905 uint32_t num_source_shards =
8906 (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
8907 const uint32_t max_dynamic_shards =
8908 uint32_t(cct->_conf.get_val<uint64_t>("rgw_max_dynamic_shards"));
8909
8910 if (num_source_shards >= max_dynamic_shards) {
8911 return 0;
8912 }
31f18b77 8913
9f95a23c 8914 uint32_t suggested_num_shards = 0;
11fdf7f2
TL
8915 const uint64_t max_objs_per_shard =
8916 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9f95a23c
TL
8917
8918 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
8919 num_objs, need_resharding, &suggested_num_shards);
8920 if (! need_resharding) {
8921 return 0;
31f18b77
FG
8922 }
8923
9f95a23c
TL
8924 const uint32_t final_num_shards =
8925 RGWBucketReshard::get_preferred_shards(suggested_num_shards,
8926 max_dynamic_shards);
8927 // final verification, so we don't reduce number of shards
8928 if (final_num_shards <= num_source_shards) {
8929 return 0;
31f18b77
FG
8930 }
8931
f91f0fd5 8932 ldout(cct, 1) << "RGWRados::" << __func__ << " bucket " << bucket.name <<
9f95a23c
TL
8933 " needs resharding; current num shards " << bucket_info.num_shards <<
8934 "; new num shards " << final_num_shards << " (suggested " <<
8935 suggested_num_shards << ")" << dendl;
8936
8937 return add_bucket_to_reshard(bucket_info, final_num_shards);
31f18b77
FG
8938}
8939
8940int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
8941{
9f95a23c 8942 RGWReshard reshard(this->store);
31f18b77
FG
8943
8944 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
8945
11fdf7f2 8946 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
31f18b77
FG
8947 if (new_num_shards <= num_source_shards) {
8948 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
8949 return 0;
8950 }
8951
8952 cls_rgw_reshard_entry entry;
8953 entry.time = real_clock::now();
8954 entry.tenant = bucket_info.owner.tenant;
8955 entry.bucket_name = bucket_info.bucket.name;
8956 entry.bucket_id = bucket_info.bucket.bucket_id;
8957 entry.old_num_shards = num_source_shards;
8958 entry.new_num_shards = new_num_shards;
8959
8960 return reshard.add(entry);
8961}
8962
7c673cae 8963int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
11fdf7f2 8964 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only)
7c673cae 8965{
11fdf7f2
TL
8966 // if we only check size, then num_objs will set to 0
8967 if(check_size_only)
8968 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size);
8969
7c673cae
FG
8970 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
8971}
8972
11fdf7f2
TL
8973int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
8974 int *shard_id)
7c673cae 8975{
11fdf7f2
TL
8976 int r = 0;
8977 switch (bucket_info.bucket_index_shard_hash_type) {
8978 case RGWBucketInfo::MOD:
8979 if (!bucket_info.num_shards) {
8980 if (shard_id) {
8981 *shard_id = -1;
8982 }
8983 } else {
9f95a23c 8984 uint32_t sid = svc.bi_rados->bucket_shard_index(obj_key, bucket_info.num_shards);
11fdf7f2
TL
8985 if (shard_id) {
8986 *shard_id = (int)sid;
8987 }
8988 }
8989 break;
8990 default:
8991 r = -ENOTSUP;
7c673cae 8992 }
11fdf7f2 8993 return r;
7c673cae
FG
8994}
8995
7c673cae
FG
8996uint64_t RGWRados::instance_id()
8997{
8998 return get_rados_handle()->get_instance_id();
8999}
9000
9001uint64_t RGWRados::next_bucket_id()
9002{
9f95a23c 9003 std::lock_guard l{bucket_id_lock};
7c673cae
FG
9004 return ++max_bucket_id;
9005}
9006
7c673cae
FG
9007librados::Rados* RGWRados::get_rados_handle()
9008{
494da23a 9009 return &rados;
7c673cae
FG
9010}
9011
9012int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
9013{
9014 rgw_rados_ref ref;
9015 int ret = get_raw_obj_ref(obj, &ref);
9016 if (ret < 0) {
9017 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
9018 return ret;
9019 }
9020
9021 ObjectWriteOperation op;
9022 list<string> prefixes;
9023 cls_rgw_remove_obj(op, prefixes);
9024
9f95a23c
TL
9025 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9026 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae
FG
9027 if (ret < 0) {
9028 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
9029 c->release();
9030 return ret;
9031 }
9032
9033 handles.push_back(c);
9034
9035 return 0;
9036}
9037
9038int RGWRados::delete_obj_aio(const rgw_obj& obj,
9039 RGWBucketInfo& bucket_info, RGWObjState *astate,
9f95a23c
TL
9040 list<librados::AioCompletion *>& handles, bool keep_index_consistent,
9041 optional_yield y)
7c673cae
FG
9042{
9043 rgw_rados_ref ref;
9044 int ret = get_obj_head_ref(bucket_info, obj, &ref);
9045 if (ret < 0) {
9046 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
9047 return ret;
9048 }
9049
9050 if (keep_index_consistent) {
9051 RGWRados::Bucket bop(this, bucket_info);
9052 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
9053
9f95a23c 9054 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag, y);
7c673cae
FG
9055 if (ret < 0) {
9056 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
9057 return ret;
9058 }
9059 }
9060
9061 ObjectWriteOperation op;
9062 list<string> prefixes;
9063 cls_rgw_remove_obj(op, prefixes);
9064
9f95a23c
TL
9065 AioCompletion *c = librados::Rados::aio_create_completion(nullptr, nullptr);
9066 ret = ref.pool.ioctx().aio_operate(ref.obj.oid, c, &op);
7c673cae
FG
9067 if (ret < 0) {
9068 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
9069 c->release();
9070 return ret;
9071 }
9072
9073 handles.push_back(c);
9074
9075 if (keep_index_consistent) {
494da23a 9076 ret = delete_obj_index(obj, astate->mtime);
7c673cae
FG
9077 if (ret < 0) {
9078 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
9079 return ret;
9080 }
9081 }
9082 return ret;
9083}