]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <boost/algorithm/string.hpp>
9 #include <string_view>
10
11 #include <boost/container/flat_set.hpp>
12 #include <boost/format.hpp>
13 #include <boost/optional.hpp>
14 #include <boost/utility/in_place_factory.hpp>
15
16 #include "common/ceph_json.h"
17
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
21
22 #include "rgw_rados.h"
23 #include "rgw_zone.h"
24 #include "rgw_cache.h"
25 #include "rgw_acl.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_aio_throttle.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32 #include "rgw_putobj_processor.h"
33
34 #include "cls/rgw/cls_rgw_ops.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "cls/otp/cls_otp_client.h"
44 #include "osd/osd_types.h"
45
46 #include "rgw_tools.h"
47 #include "rgw_coroutine.h"
48 #include "rgw_compression.h"
49
50 #undef fork // fails to compile RGWPeriod::fork() below
51
52 #include "common/Clock.h"
53
54 using namespace librados;
55
56 #include <string>
57 #include <iostream>
58 #include <vector>
59 #include <atomic>
60 #include <list>
61 #include <map>
62 #include "include/random.h"
63
64 #include "rgw_gc.h"
65 #include "rgw_lc.h"
66
67 #include "rgw_object_expirer_core.h"
68 #include "rgw_sync.h"
69 #include "rgw_sync_counters.h"
70 #include "rgw_sync_trace.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
74
75 #include "services/svc_zone.h"
76 #include "services/svc_zone_utils.h"
77 #include "services/svc_quota.h"
78 #include "services/svc_sync_modules.h"
79 #include "services/svc_sys_obj.h"
80 #include "services/svc_sys_obj_cache.h"
81
82 #include "compressor/Compressor.h"
83
84 #ifdef WITH_LTTNG
85 #define TRACEPOINT_DEFINE
86 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
87 #include "tracing/rgw_rados.h"
88 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
89 #undef TRACEPOINT_DEFINE
90 #else
91 #define tracepoint(...)
92 #endif
93
94 #define dout_context g_ceph_context
95 #define dout_subsys ceph_subsys_rgw
96
97
/* File-local string constants (internal linkage via `static`). */
98 static string shadow_ns = "shadow";
99 static string dir_oid_prefix = ".dir.";
100 static string default_bucket_index_pool_suffix = "rgw.buckets.index";
101 static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
102
103 static string log_lock_name = "rgw_log_lock";
104 static RGWObjCategory main_category = RGWObjCategory::Main;
105 #define RGW_USAGE_OBJ_PREFIX "usage."
106
/* NOTE(review): dout_subsys is already defined with the identical value a
 * few lines above; this repeated definition looks redundant. */
107 #define dout_subsys ceph_subsys_rgw
108
/* Suffix constant ".meta"; NOTE(review): the MP_ prefix presumably stands for
 * multipart -- confirm against the users of this constant. */
109 const std::string MP_META_SUFFIX = ".meta";
110
111
112 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
113 const rgw_placement_rule& head_placement_rule,
114 const rgw_obj& obj, rgw_pool *pool)
115 {
116 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
117 RGWZonePlacementInfo placement;
118 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
119 return false;
120 }
121
122 if (!obj.in_extra_data) {
123 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
124 } else {
125 *pool = placement.get_data_extra_pool();
126 }
127 }
128
129 return true;
130 }
131
132 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
133 const rgw_placement_rule& head_placement_rule,
134 const rgw_obj& obj, rgw_raw_obj *raw_obj)
135 {
136 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
137
138 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
139 }
140
141 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
142 {
143 if (!is_raw) {
144 rgw_raw_obj r;
145 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
146 return r;
147 }
148 return raw_obj;
149 }
150
151 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
152 {
153 if (!is_raw) {
154 rgw_raw_obj r;
155 store->obj_to_raw(placement_rule, obj, &r);
156 return r;
157 }
158 return raw_obj;
159 }
160
161 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
162 {
163 obj_version *check_objv = version_for_check();
164
165 if (check_objv) {
166 cls_version_check(*op, *check_objv, VER_COND_EQ);
167 }
168
169 cls_version_read(*op, &read_version);
170 }
171
172 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
173 {
174 obj_version *check_objv = version_for_check();
175 obj_version *modify_version = version_for_write();
176
177 if (check_objv) {
178 cls_version_check(*op, *check_objv, VER_COND_EQ);
179 }
180
181 if (modify_version) {
182 cls_version_set(*op, *modify_version);
183 } else {
184 cls_version_inc(*op);
185 }
186 }
187
188 void RGWObjManifest::obj_iterator::operator++()
189 {
190 if (manifest->explicit_objs) {
191 ++explicit_iter;
192
193 update_explicit_pos();
194
195 update_location();
196 return;
197 }
198
199 uint64_t obj_size = manifest->get_obj_size();
200 uint64_t head_size = manifest->get_head_size();
201
202 if (ofs == obj_size) {
203 return;
204 }
205
206 if (manifest->rules.empty()) {
207 return;
208 }
209
210 /* are we still pointing at the head? */
211 if (ofs < head_size) {
212 rule_iter = manifest->rules.begin();
213 RGWObjManifestRule *rule = &rule_iter->second;
214 ofs = std::min(head_size, obj_size);
215 stripe_ofs = ofs;
216 cur_stripe = 1;
217 stripe_size = std::min(obj_size - ofs, rule->stripe_max_size);
218 if (rule->part_size > 0) {
219 stripe_size = std::min(stripe_size, rule->part_size);
220 }
221 update_location();
222 return;
223 }
224
225 RGWObjManifestRule *rule = &rule_iter->second;
226
227 stripe_ofs += rule->stripe_max_size;
228 cur_stripe++;
229 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
230
231 if (rule->part_size > 0) {
232 /* multi part, multi stripes object */
233
234 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
235
236 if (stripe_ofs >= part_ofs + rule->part_size) {
237 /* moved to the next part */
238 cur_stripe = 0;
239 part_ofs += rule->part_size;
240 stripe_ofs = part_ofs;
241
242 bool last_rule = (next_rule_iter == manifest->rules.end());
243 /* move to the next rule? */
244 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
245 rule_iter = next_rule_iter;
246 last_rule = (next_rule_iter == manifest->rules.end());
247 if (!last_rule) {
248 ++next_rule_iter;
249 }
250 cur_part_id = rule_iter->second.start_part_num;
251 } else {
252 cur_part_id++;
253 }
254
255 rule = &rule_iter->second;
256 }
257
258 stripe_size = std::min(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
259 }
260
261 cur_override_prefix = rule->override_prefix;
262
263 ofs = stripe_ofs;
264 if (ofs > obj_size) {
265 ofs = obj_size;
266 stripe_ofs = ofs;
267 stripe_size = 0;
268 }
269
270 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
271 update_location();
272 }
273
274 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m,
275 const rgw_placement_rule& head_placement_rule,
276 const rgw_placement_rule *tail_placement_rule,
277 const rgw_bucket& _b, const rgw_obj& _obj)
278 {
279 manifest = _m;
280
281 if (!tail_placement_rule) {
282 manifest->set_tail_placement(head_placement_rule, _b);
283 } else {
284 rgw_placement_rule new_tail_rule = *tail_placement_rule;
285 new_tail_rule.inherit_from(head_placement_rule);
286 manifest->set_tail_placement(new_tail_rule, _b);
287 }
288
289 manifest->set_head(head_placement_rule, _obj, 0);
290 last_ofs = 0;
291
292 if (manifest->get_prefix().empty()) {
293 char buf[33];
294 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
295
296 string oid_prefix = ".";
297 oid_prefix.append(buf);
298 oid_prefix.append("_");
299
300 manifest->set_prefix(oid_prefix);
301 }
302
303 bool found = manifest->get_rule(0, &rule);
304 if (!found) {
305 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
306 return -EIO;
307 }
308
309 uint64_t head_size = manifest->get_head_size();
310
311 if (head_size > 0) {
312 cur_stripe_size = head_size;
313 } else {
314 cur_stripe_size = rule.stripe_max_size;
315 }
316
317 cur_part_id = rule.start_part_num;
318
319 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
320
321 // Normal object which not generated through copy operation
322 manifest->set_tail_instance(_obj.key.instance);
323
324 manifest->update_iterators();
325
326 return 0;
327 }
328
329 int RGWObjManifest::generator::create_next(uint64_t ofs)
330 {
331 if (ofs < last_ofs) /* only going forward */
332 return -EINVAL;
333
334 uint64_t max_head_size = manifest->get_max_head_size();
335
336 if (ofs < max_head_size) {
337 manifest->set_head_size(ofs);
338 }
339
340 if (ofs >= max_head_size) {
341 manifest->set_head_size(max_head_size);
342 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
343 cur_stripe_size = rule.stripe_max_size;
344
345 if (cur_part_id == 0 && max_head_size > 0) {
346 cur_stripe++;
347 }
348 }
349
350 last_ofs = ofs;
351 manifest->set_obj_size(ofs);
352
353 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
354
355 manifest->update_iterators();
356
357 return 0;
358 }
359
360 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
361 {
362 return begin_iter;
363 }
364
365 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
366 {
367 return end_iter;
368 }
369
370 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
371 {
372 if (ofs > obj_size) {
373 ofs = obj_size;
374 }
375 RGWObjManifest::obj_iterator iter(this);
376 iter.seek(ofs);
377 return iter;
378 }
379
380 int RGWObjManifest::append(RGWObjManifest& m, const RGWZoneGroup& zonegroup,
381 const RGWZoneParams& zone_params)
382 {
383 if (explicit_objs || m.explicit_objs) {
384 return append_explicit(m, zonegroup, zone_params);
385 }
386
387 if (rules.empty()) {
388 *this = m;
389 return 0;
390 }
391
392 string override_prefix;
393
394 if (prefix.empty()) {
395 prefix = m.prefix;
396 }
397
398 if (prefix != m.prefix) {
399 override_prefix = m.prefix;
400 }
401
402 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
403 if (miter == m.rules.end()) {
404 return append_explicit(m, zonegroup, zone_params);
405 }
406
407 for (; miter != m.rules.end(); ++miter) {
408 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
409
410 RGWObjManifestRule& rule = last_rule->second;
411
412 if (rule.part_size == 0) {
413 rule.part_size = obj_size - rule.start_ofs;
414 }
415
416 RGWObjManifestRule& next_rule = miter->second;
417 if (!next_rule.part_size) {
418 next_rule.part_size = m.obj_size - next_rule.start_ofs;
419 }
420
421 string rule_prefix = prefix;
422 if (!rule.override_prefix.empty()) {
423 rule_prefix = rule.override_prefix;
424 }
425
426 string next_rule_prefix = m.prefix;
427 if (!next_rule.override_prefix.empty()) {
428 next_rule_prefix = next_rule.override_prefix;
429 }
430
431 if (rule.part_size != next_rule.part_size ||
432 rule.stripe_max_size != next_rule.stripe_max_size ||
433 rule_prefix != next_rule_prefix) {
434 if (next_rule_prefix != prefix) {
435 append_rules(m, miter, &next_rule_prefix);
436 } else {
437 append_rules(m, miter, NULL);
438 }
439 break;
440 }
441
442 uint64_t expected_part_num = rule.start_part_num + 1;
443 if (rule.part_size > 0) {
444 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
445 }
446
447 if (expected_part_num != next_rule.start_part_num) {
448 append_rules(m, miter, NULL);
449 break;
450 }
451 }
452
453 set_obj_size(obj_size + m.obj_size);
454
455 return 0;
456 }
457
458 int RGWObjManifest::append(RGWObjManifest& m, RGWSI_Zone *zone_svc)
459 {
460 return append(m, zone_svc->get_zonegroup(), zone_svc->get_zone_params());
461 }
462
463 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
464 string *override_prefix)
465 {
466 for (; miter != m.rules.end(); ++miter) {
467 RGWObjManifestRule rule = miter->second;
468 rule.start_ofs += obj_size;
469 if (override_prefix)
470 rule.override_prefix = *override_prefix;
471 rules[rule.start_ofs] = rule;
472 }
473 }
474
475 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
476 {
477 if (explicit_objs) {
478 return;
479 }
480 obj_iterator iter = obj_begin();
481
482 while (iter != obj_end()) {
483 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
484 const rgw_obj_select& os = iter.get_location();
485 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
486 part.loc_ofs = 0;
487
488 uint64_t ofs = iter.get_stripe_ofs();
489
490 if (ofs == 0) {
491 part.loc = obj;
492 } else {
493 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
494 }
495 ++iter;
496 uint64_t next_ofs = iter.get_stripe_ofs();
497
498 part.size = next_ofs - ofs;
499 }
500
501 explicit_objs = true;
502 rules.clear();
503 prefix.clear();
504 }
505
506 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
507 {
508 if (!explicit_objs) {
509 convert_to_explicit(zonegroup, zone_params);
510 }
511 if (!m.explicit_objs) {
512 m.convert_to_explicit(zonegroup, zone_params);
513 }
514 map<uint64_t, RGWObjManifestPart>::iterator iter;
515 uint64_t base = obj_size;
516 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
517 RGWObjManifestPart& part = iter->second;
518 objs[base + iter->first] = part;
519 }
520 obj_size += m.obj_size;
521
522 return 0;
523 }
524
525 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
526 {
527 if (rules.empty()) {
528 return false;
529 }
530
531 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
532 if (iter != rules.begin()) {
533 --iter;
534 }
535
536 *rule = iter->second;
537
538 return true;
539 }
540
541 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
542 {
543 write_version.ver = 1;
544 #define TAG_LEN 24
545
546 write_version.tag.clear();
547 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
548 }
549
550 class RGWMetaNotifierManager : public RGWCoroutinesManager {
551 RGWRados *store;
552 RGWHTTPManager http_manager;
553
554 public:
555 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
556 http_manager(store->ctx(), completion_mgr) {
557 http_manager.start();
558 }
559
560 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
561 rgw_http_param_pair pairs[] = { { "type", "metadata" },
562 { "notify", NULL },
563 { NULL, NULL } };
564
565 list<RGWCoroutinesStack *> stacks;
566 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
567 RGWRESTConn *conn = iter->second;
568 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
569 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
570
571 stacks.push_back(stack);
572 }
573 return run(stacks);
574 }
575 };
576
577 class RGWDataNotifierManager : public RGWCoroutinesManager {
578 RGWRados *store;
579 RGWHTTPManager http_manager;
580
581 public:
582 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
583 http_manager(store->ctx(), completion_mgr) {
584 http_manager.start();
585 }
586
587 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
588 rgw_http_param_pair pairs[] = { { "type", "data" },
589 { "notify", NULL },
590 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
591 { NULL, NULL } };
592
593 list<RGWCoroutinesStack *> stacks;
594 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
595 RGWRESTConn *conn = iter->second;
596 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
597 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
598
599 stacks.push_back(stack);
600 }
601 return run(stacks);
602 }
603 };
604
605 /* class RGWRadosThread */
606
607 void RGWRadosThread::start()
608 {
609 worker = new Worker(cct, this);
610 worker->create(thread_name.c_str());
611 }
612
613 void RGWRadosThread::stop()
614 {
615 down_flag = true;
616 stop_process();
617 if (worker) {
618 worker->signal();
619 worker->join();
620 }
621 delete worker;
622 worker = NULL;
623 }
624
625 void *RGWRadosThread::Worker::entry() {
626 uint64_t msec = processor->interval_msec();
627 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
628
629 do {
630 utime_t start = ceph_clock_now();
631 int r = processor->process();
632 if (r < 0) {
633 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
634 }
635
636 if (processor->going_down())
637 break;
638
639 utime_t end = ceph_clock_now();
640 end -= start;
641
642 uint64_t cur_msec = processor->interval_msec();
643 if (cur_msec != msec) { /* was it reconfigured? */
644 msec = cur_msec;
645 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
646 }
647
648 if (cur_msec > 0) {
649 if (interval <= end)
650 continue; // next round
651
652 utime_t wait_time = interval;
653 wait_time -= end;
654
655 wait_interval(wait_time);
656 } else {
657 wait();
658 }
659 } while (!processor->going_down());
660
661 return NULL;
662 }
663
664 class RGWMetaNotifier : public RGWRadosThread {
665 RGWMetaNotifierManager notify_mgr;
666 RGWMetadataLog *const log;
667
668 uint64_t interval_msec() override {
669 return cct->_conf->rgw_md_notify_interval_msec;
670 }
671 void stop_process() override {
672 notify_mgr.stop();
673 }
674 public:
675 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
676 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
677
678 int process() override;
679 };
680
681 int RGWMetaNotifier::process()
682 {
683 set<int> shards;
684
685 log->read_clear_modified(shards);
686
687 if (shards.empty()) {
688 return 0;
689 }
690
691 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
692 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
693 }
694
695 notify_mgr.notify_all(store->svc.zone->get_zone_conn_map(), shards);
696
697 return 0;
698 }
699
700 class RGWDataNotifier : public RGWRadosThread {
701 RGWDataNotifierManager notify_mgr;
702
703 uint64_t interval_msec() override {
704 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
705 }
706 void stop_process() override {
707 notify_mgr.stop();
708 }
709 public:
710 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
711
712 int process() override;
713 };
714
715 int RGWDataNotifier::process()
716 {
717 if (!store->data_log) {
718 return 0;
719 }
720
721 map<int, set<string> > shards;
722
723 store->data_log->read_clear_modified(shards);
724
725 if (shards.empty()) {
726 return 0;
727 }
728
729 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
730 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
731 }
732
733 notify_mgr.notify_all(store->svc.zone->get_zone_data_notify_to_map(), shards);
734
735 return 0;
736 }
737
738 class RGWSyncProcessorThread : public RGWRadosThread {
739 public:
740 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
741 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
742 ~RGWSyncProcessorThread() override {}
743 int init() override = 0 ;
744 int process() override = 0;
745 };
746
747 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
748 {
749 RGWMetaSyncStatusManager sync;
750
751 uint64_t interval_msec() override {
752 return 0; /* no interval associated, it'll run once until stopped */
753 }
754 void stop_process() override {
755 sync.stop();
756 }
757 public:
758 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
759 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
760
761 void wakeup_sync_shards(set<int>& shard_ids) {
762 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
763 sync.wakeup(*iter);
764 }
765 }
766 RGWMetaSyncStatusManager* get_manager() { return &sync; }
767
768 int init() override {
769 int ret = sync.init();
770 if (ret < 0) {
771 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
772 return ret;
773 }
774 return 0;
775 }
776
777 int process() override {
778 sync.run();
779 return 0;
780 }
781 };
782
783 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
784 {
785 PerfCountersRef counters;
786 RGWDataSyncStatusManager sync;
787 bool initialized;
788
789 uint64_t interval_msec() override {
790 if (initialized) {
791 return 0; /* no interval associated, it'll run once until stopped */
792 } else {
793 #define DATA_SYNC_INIT_WAIT_SEC 20
794 return DATA_SYNC_INIT_WAIT_SEC * 1000;
795 }
796 }
797 void stop_process() override {
798 sync.stop();
799 }
800 public:
801 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
802 const RGWZone* source_zone)
803 : RGWSyncProcessorThread(_store, "data-sync"),
804 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
805 sync(_store, async_rados, source_zone->id, counters.get()),
806 initialized(false) {}
807
808 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
809 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
810 sync.wakeup(iter->first, iter->second);
811 }
812 }
813 RGWDataSyncStatusManager* get_manager() { return &sync; }
814
815 int init() override {
816 return 0;
817 }
818
819 int process() override {
820 while (!initialized) {
821 if (going_down()) {
822 return 0;
823 }
824 int ret = sync.init();
825 if (ret >= 0) {
826 initialized = true;
827 break;
828 }
829 /* we'll be back! */
830 return 0;
831 }
832 sync.run();
833 return 0;
834 }
835 };
836
837 class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
838 {
839 RGWCoroutinesManager crs;
840 RGWRados *store;
841 rgw::BucketTrimManager *bucket_trim;
842 RGWHTTPManager http;
843 const utime_t trim_interval;
844
845 uint64_t interval_msec() override { return 0; }
846 void stop_process() override { crs.stop(); }
847 public:
848 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
849 int interval)
850 : RGWSyncProcessorThread(store, "sync-log-trim"),
851 crs(store->ctx(), store->get_cr_registry()), store(store),
852 bucket_trim(bucket_trim),
853 http(store->ctx(), crs.get_completion_mgr()),
854 trim_interval(interval, 0)
855 {}
856
857 int init() override {
858 return http.start();
859 }
860 int process() override {
861 list<RGWCoroutinesStack*> stacks;
862 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
863 meta->call(create_meta_log_trim_cr(this, store, &http,
864 cct->_conf->rgw_md_log_max_shards,
865 trim_interval));
866 stacks.push_back(meta);
867
868 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
869 data->call(create_data_log_trim_cr(store, &http,
870 cct->_conf->rgw_data_log_num_shards,
871 trim_interval));
872 stacks.push_back(data);
873
874 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
875 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
876 stacks.push_back(bucket);
877
878 crs.run(stacks);
879 return 0;
880 }
881
882 // implements DoutPrefixProvider
883 CephContext *get_cct() const override { return store->ctx(); }
884 unsigned get_subsys() const
885 {
886 return dout_subsys;
887 }
888
889 std::ostream& gen_prefix(std::ostream& out) const
890 {
891 return out << "sync log trim: ";
892 }
893
894 };
895
896 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
897 {
898 Mutex::Locker l(meta_sync_thread_lock);
899 if (meta_sync_processor_thread) {
900 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
901 }
902 }
903
904 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
905 {
906 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
907 Mutex::Locker l(data_sync_thread_lock);
908 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
909 if (iter == data_sync_processor_threads.end()) {
910 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
911 return;
912 }
913
914 RGWDataSyncProcessorThread *thread = iter->second;
915 ceph_assert(thread);
916 thread->wakeup_sync_shards(shard_ids);
917 }
918
919 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
920 {
921 Mutex::Locker l(meta_sync_thread_lock);
922 if (meta_sync_processor_thread) {
923 return meta_sync_processor_thread->get_manager();
924 }
925 return nullptr;
926 }
927
928 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
929 {
930 Mutex::Locker l(data_sync_thread_lock);
931 auto thread = data_sync_processor_threads.find(source_zone);
932 if (thread == data_sync_processor_threads.end()) {
933 return nullptr;
934 }
935 return thread->second->get_manager();
936 }
937
938 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
939 {
940 IoCtx ioctx;
941 int r = open_pool_ctx(pool, ioctx, false);
942 if (r < 0) {
943 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
944 return r;
945 }
946
947 bool requires;
948 r = ioctx.pool_requires_alignment2(&requires);
949 if (r < 0) {
950 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
951 << r << dendl;
952 return r;
953 }
954
955 if (!requires) {
956 *alignment = 0;
957 return 0;
958 }
959
960 uint64_t align;
961 r = ioctx.pool_required_alignment2(&align);
962 if (r < 0) {
963 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
964 << r << dendl;
965 return r;
966 }
967 if (align != 0) {
968 ldout(cct, 20) << "required alignment=" << align << dendl;
969 }
970 *alignment = align;
971 return 0;
972 }
973
974 void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
975 {
976 if (alignment == 0) {
977 *max_size = size;
978 return;
979 }
980
981 if (size <= alignment) {
982 *max_size = alignment;
983 return;
984 }
985
986 *max_size = size - (size % alignment);
987 }
988
989 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment)
990 {
991 uint64_t alignment;
992 int r = get_required_alignment(pool, &alignment);
993 if (r < 0) {
994 return r;
995 }
996
997 if (palignment) {
998 *palignment = alignment;
999 }
1000
1001 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
1002
1003 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
1004
1005 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
1006
1007 return 0;
1008 }
1009
1010 int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
1011 uint64_t *max_chunk_size, uint64_t *palignment)
1012 {
1013 rgw_pool pool;
1014 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
1015 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
1016 return -EIO;
1017 }
1018 return get_max_chunk_size(pool, max_chunk_size, palignment);
1019 }
1020
1021 class RGWIndexCompletionManager;
1022
1023 struct complete_op_data {
1024 Mutex lock{"complete_op_data"};
1025 AioCompletion *rados_completion{nullptr};
1026 int manager_shard_id{-1};
1027 RGWIndexCompletionManager *manager{nullptr};
1028 rgw_obj obj;
1029 RGWModifyOp op;
1030 string tag;
1031 rgw_bucket_entry_ver ver;
1032 cls_rgw_obj_key key;
1033 rgw_bucket_dir_entry_meta dir_meta;
1034 list<cls_rgw_obj_key> remove_objs;
1035 bool log_op;
1036 uint16_t bilog_op;
1037 rgw_zone_set zones_trace;
1038
1039 bool stopped{false};
1040
1041 void stop() {
1042 Mutex::Locker l(lock);
1043 stopped = true;
1044 }
1045 };
1046
1047 class RGWIndexCompletionThread : public RGWRadosThread {
1048 RGWRados *store;
1049
1050 uint64_t interval_msec() override {
1051 return 0;
1052 }
1053
1054 list<complete_op_data *> completions;
1055
1056 Mutex completions_lock;
1057 public:
1058 RGWIndexCompletionThread(RGWRados *_store)
1059 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
1060
1061 int process() override;
1062
1063 void add_completion(complete_op_data *completion) {
1064 {
1065 Mutex::Locker l(completions_lock);
1066 completions.push_back(completion);
1067 }
1068
1069 signal();
1070 }
1071 };
1072
1073 int RGWIndexCompletionThread::process()
1074 {
1075 list<complete_op_data *> comps;
1076
1077 {
1078 Mutex::Locker l(completions_lock);
1079 completions.swap(comps);
1080 }
1081
1082 for (auto c : comps) {
1083 std::unique_ptr<complete_op_data> up{c};
1084
1085 if (going_down()) {
1086 continue;
1087 }
1088 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
1089
1090 RGWRados::BucketShard bs(store);
1091 RGWBucketInfo bucket_info;
1092
1093 int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
1094 if (r < 0) {
1095 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
1096 /* not much to do */
1097 continue;
1098 }
1099
1100 r = store->guard_reshard(&bs, c->obj, bucket_info,
1101 [&](RGWRados::BucketShard *bs) -> int {
1102 librados::ObjectWriteOperation o;
1103 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
1104 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
1105 c->log_op, c->bilog_op, &c->zones_trace);
1106 return bs->index_ctx.operate(bs->bucket_obj, &o);
1107 });
1108 if (r < 0) {
1109 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
1110 /* ignoring error, can't do anything about it */
1111 continue;
1112 }
1113 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
1114 if (r < 0) {
1115 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
1116 }
1117 }
1118
1119 return 0;
1120 }
1121
1122 class RGWIndexCompletionManager {
1123 RGWRados *store{nullptr};
1124 vector<Mutex *> locks;
1125 vector<set<complete_op_data *> > completions;
1126
1127 RGWIndexCompletionThread *completion_thread{nullptr};
1128
1129 int num_shards;
1130
1131 std::atomic<int> cur_shard {0};
1132
1133
1134 public:
1135 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
1136 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
1137
1138 for (int i = 0; i < num_shards; i++) {
1139 char buf[64];
1140 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
1141 locks.push_back(new Mutex(buf));
1142 }
1143
1144 completions.resize(num_shards);
1145 }
1146 ~RGWIndexCompletionManager() {
1147 stop();
1148
1149 for (auto l : locks) {
1150 delete l;
1151 }
1152 }
1153
1154 int next_shard() {
1155 int result = cur_shard % num_shards;
1156 cur_shard++;
1157 return result;
1158 }
1159
1160 void create_completion(const rgw_obj& obj,
1161 RGWModifyOp op, string& tag,
1162 rgw_bucket_entry_ver& ver,
1163 const cls_rgw_obj_key& key,
1164 rgw_bucket_dir_entry_meta& dir_meta,
1165 list<cls_rgw_obj_key> *remove_objs, bool log_op,
1166 uint16_t bilog_op,
1167 rgw_zone_set *zones_trace,
1168 complete_op_data **result);
1169 bool handle_completion(completion_t cb, complete_op_data *arg);
1170
1171 int start() {
1172 completion_thread = new RGWIndexCompletionThread(store);
1173 int ret = completion_thread->init();
1174 if (ret < 0) {
1175 return ret;
1176 }
1177 completion_thread->start();
1178 return 0;
1179 }
1180 void stop() {
1181 if (completion_thread) {
1182 completion_thread->stop();
1183 delete completion_thread;
1184 }
1185
1186 for (int i = 0; i < num_shards; ++i) {
1187 Mutex::Locker l(*locks[i]);
1188 for (auto c : completions[i]) {
1189 c->stop();
1190 }
1191 }
1192 completions.clear();
1193 }
1194 };
1195
1196 static void obj_complete_cb(completion_t cb, void *arg)
1197 {
1198 complete_op_data *completion = (complete_op_data *)arg;
1199 completion->lock.Lock();
1200 if (completion->stopped) {
1201 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
1202 delete completion;
1203 return;
1204 }
1205 bool need_delete = completion->manager->handle_completion(cb, completion);
1206 completion->lock.Unlock();
1207 if (need_delete) {
1208 delete completion;
1209 }
1210 }
1211
1212
1213 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
1214 RGWModifyOp op, string& tag,
1215 rgw_bucket_entry_ver& ver,
1216 const cls_rgw_obj_key& key,
1217 rgw_bucket_dir_entry_meta& dir_meta,
1218 list<cls_rgw_obj_key> *remove_objs, bool log_op,
1219 uint16_t bilog_op,
1220 rgw_zone_set *zones_trace,
1221 complete_op_data **result)
1222 {
1223 complete_op_data *entry = new complete_op_data;
1224
1225 int shard_id = next_shard();
1226
1227 entry->manager_shard_id = shard_id;
1228 entry->manager = this;
1229 entry->obj = obj;
1230 entry->op = op;
1231 entry->tag = tag;
1232 entry->ver = ver;
1233 entry->key = key;
1234 entry->dir_meta = dir_meta;
1235 entry->log_op = log_op;
1236 entry->bilog_op = bilog_op;
1237
1238 if (remove_objs) {
1239 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
1240 entry->remove_objs.push_back(*iter);
1241 }
1242 }
1243
1244 if (zones_trace) {
1245 entry->zones_trace = *zones_trace;
1246 } else {
1247 entry->zones_trace.insert(store->svc.zone->get_zone().id);
1248 }
1249
1250 *result = entry;
1251
1252 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
1253
1254 Mutex::Locker l(*locks[shard_id]);
1255 completions[shard_id].insert(entry);
1256 }
1257
1258 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
1259 {
1260 int shard_id = arg->manager_shard_id;
1261 {
1262 Mutex::Locker l(*locks[shard_id]);
1263
1264 auto& comps = completions[shard_id];
1265
1266 auto iter = comps.find(arg);
1267 if (iter == comps.end()) {
1268 return true;
1269 }
1270
1271 comps.erase(iter);
1272 }
1273
1274 int r = rados_aio_get_return_value(cb);
1275 if (r != -ERR_BUSY_RESHARDING) {
1276 return true;
1277 }
1278 completion_thread->add_completion(arg);
1279 return false;
1280 }
1281
1282 void RGWRados::finalize()
1283 {
1284 cct->get_admin_socket()->unregister_commands(this);
1285 if (run_sync_thread) {
1286 Mutex::Locker l(meta_sync_thread_lock);
1287 meta_sync_processor_thread->stop();
1288
1289 Mutex::Locker dl(data_sync_thread_lock);
1290 for (auto iter : data_sync_processor_threads) {
1291 RGWDataSyncProcessorThread *thread = iter.second;
1292 thread->stop();
1293 }
1294 if (sync_log_trimmer) {
1295 sync_log_trimmer->stop();
1296 }
1297 }
1298 if (async_rados) {
1299 async_rados->stop();
1300 }
1301 if (run_sync_thread) {
1302 delete meta_sync_processor_thread;
1303 meta_sync_processor_thread = NULL;
1304 Mutex::Locker dl(data_sync_thread_lock);
1305 for (auto iter : data_sync_processor_threads) {
1306 RGWDataSyncProcessorThread *thread = iter.second;
1307 delete thread;
1308 }
1309 data_sync_processor_threads.clear();
1310 delete sync_log_trimmer;
1311 sync_log_trimmer = nullptr;
1312 bucket_trim = boost::none;
1313 }
1314 if (meta_notifier) {
1315 meta_notifier->stop();
1316 delete meta_notifier;
1317 }
1318 if (data_notifier) {
1319 data_notifier->stop();
1320 delete data_notifier;
1321 }
1322 delete data_log;
1323 delete sync_tracer;
1324 if (async_rados) {
1325 delete async_rados;
1326 }
1327
1328 delete lc;
1329 lc = NULL;
1330
1331 delete gc;
1332 gc = NULL;
1333
1334 delete obj_expirer;
1335 obj_expirer = NULL;
1336
1337 RGWQuotaHandler::free_handler(quota_handler);
1338 if (cr_registry) {
1339 cr_registry->put();
1340 }
1341
1342 svc.shutdown();
1343
1344 delete meta_mgr;
1345 delete binfo_cache;
1346 delete obj_tombstone_cache;
1347
1348 if (reshard_wait.get()) {
1349 reshard_wait->stop();
1350 reshard_wait.reset();
1351 }
1352
1353 if (run_reshard_thread) {
1354 reshard->stop_processor();
1355 }
1356 delete reshard;
1357 delete index_completion_manager;
1358 }
1359
1360 /**
1361 * Initialize the RADOS instance and prepare to do other ops
1362 * Returns 0 on success, -ERR# on failure.
1363 */
1364 int RGWRados::init_rados()
1365 {
1366 int ret = 0;
1367 auto admin_socket = cct->get_admin_socket();
1368 for (auto cmd : admin_commands) {
1369 int r = admin_socket->register_command(cmd[0], cmd[1], this,
1370 cmd[2]);
1371 if (r < 0) {
1372 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
1373 << ")" << dendl;
1374 return r;
1375 }
1376 }
1377
1378 ret = rados.init_with_context(cct);
1379 if (ret < 0) {
1380 return ret;
1381 }
1382 ret = rados.connect();
1383 if (ret < 0) {
1384 return ret;
1385 }
1386
1387 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1388 new RGWCoroutinesManagerRegistry(cct)};
1389 ret = crs->hook_to_admin_command("cr dump");
1390 if (ret < 0) {
1391 return ret;
1392 }
1393
1394 meta_mgr = new RGWMetadataManager(cct, this);
1395 data_log = new RGWDataChangesLog(cct, this);
1396 cr_registry = crs.release();
1397 return ret;
1398 }
1399
1400 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
1401 {
1402 map<string,string> metadata = meta;
1403 metadata["num_handles"] = "1"s;
1404 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1405 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1406 metadata["zone_name"] = svc.zone->zone_name();
1407 metadata["zone_id"] = svc.zone->zone_id();
1408 string name = cct->_conf->name.get_id();
1409 if (name.compare(0, 4, "rgw.") == 0) {
1410 name = name.substr(4);
1411 }
1412 int ret = rados.service_daemon_register(daemon_type, name, metadata);
1413 if (ret < 0) {
1414 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1415 return ret;
1416 }
1417
1418 return 0;
1419 }
1420
1421 int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
1422 {
1423 int ret = rados.service_daemon_update_status(move(status));
1424 if (ret < 0) {
1425 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1426 return ret;
1427 }
1428
1429 return 0;
1430 }
1431
1432 /**
1433 * Initialize the RADOS instance and prepare to do other ops
1434 * Returns 0 on success, -ERR# on failure.
1435 */
1436 int RGWRados::init_complete()
1437 {
1438 int ret;
1439
1440 /*
1441 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1442 */
1443 auto& zone_public_config = svc.zone->get_zone();
1444 ret = svc.sync_modules->get_manager()->create_instance(cct, zone_public_config.tier_type, svc.zone->get_zone_params().tier_config, &sync_module);
1445 if (ret < 0) {
1446 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
1447 if (ret == -ENOENT) {
1448 lderr(cct) << "ERROR: " << zone_public_config.tier_type
1449 << " sync module does not exist. valid sync modules: "
1450 << svc.sync_modules->get_manager()->get_registered_module_names()
1451 << dendl;
1452 }
1453 return ret;
1454 }
1455
1456 period_puller.reset(new RGWPeriodPuller(this));
1457 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
1458 svc.zone->get_current_period()));
1459
1460 ret = open_root_pool_ctx();
1461 if (ret < 0)
1462 return ret;
1463
1464 ret = open_gc_pool_ctx();
1465 if (ret < 0)
1466 return ret;
1467
1468 ret = open_lc_pool_ctx();
1469 if (ret < 0)
1470 return ret;
1471
1472 ret = open_objexp_pool_ctx();
1473 if (ret < 0)
1474 return ret;
1475
1476 ret = open_reshard_pool_ctx();
1477 if (ret < 0)
1478 return ret;
1479
1480 pools_initialized = true;
1481
1482 gc = new RGWGC();
1483 gc->initialize(cct, this);
1484
1485 obj_expirer = new RGWObjectExpirer(this);
1486
1487 if (use_gc_thread) {
1488 gc->start_processor();
1489 obj_expirer->start_processor();
1490 }
1491
1492 auto& current_period = svc.zone->get_current_period();
1493 auto& zonegroup = svc.zone->get_zonegroup();
1494 auto& zone_params = svc.zone->get_zone_params();
1495 auto& zone = svc.zone->get_zone();
1496
1497 /* no point of running sync thread if we don't have a master zone configured
1498 or there is no rest_master_conn */
1499 if (zonegroup.master_zone.empty() || !svc.zone->get_master_conn()
1500 || current_period.get_id().empty()) {
1501 run_sync_thread = false;
1502 }
1503
1504 if (run_sync_thread) {
1505 // initialize the log period history
1506 meta_mgr->init_oldest_log_period();
1507 }
1508
1509 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
1510 async_rados->start();
1511
1512 ret = meta_mgr->init(current_period.get_id());
1513 if (ret < 0) {
1514 lderr(cct) << "ERROR: failed to initialize metadata log: "
1515 << cpp_strerror(-ret) << dendl;
1516 return ret;
1517 }
1518
1519 if (svc.zone->is_meta_master()) {
1520 auto md_log = meta_mgr->get_log(current_period.get_id());
1521 meta_notifier = new RGWMetaNotifier(this, md_log);
1522 meta_notifier->start();
1523 }
1524
1525 /* init it anyway, might run sync through radosgw-admin explicitly */
1526 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1527 sync_tracer->init(this);
1528 ret = sync_tracer->hook_to_admin_command();
1529 if (ret < 0) {
1530 return ret;
1531 }
1532
1533 if (run_sync_thread) {
1534 for (const auto &pt: zonegroup.placement_targets) {
1535 if (zone_params.placement_pools.find(pt.second.name)
1536 == zone_params.placement_pools.end()){
1537 ldout(cct, 0) << "WARNING: This zone does not contain the placement target "
1538 << pt.second.name << " present in zonegroup" << dendl;
1539 }
1540 }
1541 Mutex::Locker l(meta_sync_thread_lock);
1542 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
1543 ret = meta_sync_processor_thread->init();
1544 if (ret < 0) {
1545 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
1546 return ret;
1547 }
1548 meta_sync_processor_thread->start();
1549
1550 // configure the bucket trim manager
1551 rgw::BucketTrimConfig config;
1552 rgw::configure_bucket_trim(cct, config);
1553
1554 bucket_trim.emplace(this, config);
1555 ret = bucket_trim->init();
1556 if (ret < 0) {
1557 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
1558 return ret;
1559 }
1560 data_log->set_observer(&*bucket_trim);
1561
1562 Mutex::Locker dl(data_sync_thread_lock);
1563 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
1564 ldout(cct, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
1565 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, source_zone);
1566 ret = thread->init();
1567 if (ret < 0) {
1568 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
1569 return ret;
1570 }
1571 thread->start();
1572 data_sync_processor_threads[source_zone->id] = thread;
1573 }
1574 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1575 if (interval > 0) {
1576 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
1577 ret = sync_log_trimmer->init();
1578 if (ret < 0) {
1579 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
1580 return ret;
1581 }
1582 sync_log_trimmer->start();
1583 }
1584 }
1585 data_notifier = new RGWDataNotifier(this);
1586 data_notifier->start();
1587
1588 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1589 binfo_cache->init(svc.cache);
1590
1591 lc = new RGWLC();
1592 lc->initialize(cct, this);
1593
1594 if (use_lc_thread)
1595 lc->start_processor();
1596
1597 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
1598
1599 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
1600 zone.bucket_index_max_shards);
1601 if (bucket_index_max_shards > get_max_bucket_shards()) {
1602 bucket_index_max_shards = get_max_bucket_shards();
1603 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
1604 << get_max_bucket_shards() << dendl;
1605 }
1606 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
1607
1608 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
1609
1610 if (need_tombstone_cache) {
1611 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1612 }
1613
1614 reshard_wait = std::make_shared<RGWReshardWait>();
1615
1616 reshard = new RGWReshard(this);
1617
1618 /* only the master zone in the zonegroup reshards buckets */
1619 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
1620 if (run_reshard_thread) {
1621 reshard->start_processor();
1622 }
1623
1624 index_completion_manager = new RGWIndexCompletionManager(this);
1625 ret = index_completion_manager->start();
1626
1627 return ret;
1628 }
1629
1630 int RGWRados::init_svc(bool raw)
1631 {
1632 if (raw) {
1633 return svc.init_raw(cct, use_cache);
1634 }
1635
1636 return svc.init(cct, use_cache);
1637 }
1638
1639 /**
1640 * Initialize the RADOS instance and prepare to do other ops
1641 * Returns 0 on success, -ERR# on failure.
1642 */
1643 int RGWRados::initialize()
1644 {
1645 int ret;
1646
1647 inject_notify_timeout_probability =
1648 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1649 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
1650
1651 ret = init_svc(false);
1652 if (ret < 0) {
1653 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
1654 return ret;
1655 }
1656
1657 host_id = svc.zone_utils->gen_host_id();
1658
1659 ret = init_rados();
1660 if (ret < 0)
1661 return ret;
1662
1663 return init_complete();
1664 }
1665
1666 /**
1667 * Open the pool used as root for this gateway
1668 * Returns: 0 on success, -ERR# otherwise.
1669 */
1670 int RGWRados::open_root_pool_ctx()
1671 {
1672 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
1673 }
1674
1675 int RGWRados::open_gc_pool_ctx()
1676 {
1677 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
1678 }
1679
1680 int RGWRados::open_lc_pool_ctx()
1681 {
1682 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
1683 }
1684
1685 int RGWRados::open_objexp_pool_ctx()
1686 {
1687 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
1688 }
1689
1690 int RGWRados::open_reshard_pool_ctx()
1691 {
1692 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
1693 }
1694
1695 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx,
1696 bool mostly_omap)
1697 {
1698 constexpr bool create = true; // create the pool if it doesn't exist
1699 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create, mostly_omap);
1700 }
1701
1702 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
1703 string *marker) {
1704 if (marker) {
1705 *marker = shard_id_str;
1706 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
1707 marker->append(shard_marker);
1708 }
1709 }
1710
1711 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
1712 {
1713 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
1714
1715 if (!explicit_pool.empty()) {
1716 return open_pool_ctx(explicit_pool, index_ctx, false);
1717 }
1718
1719 auto& zonegroup = svc.zone->get_zonegroup();
1720 auto& zone_params = svc.zone->get_zone_params();
1721
1722 const rgw_placement_rule *rule = &bucket_info.placement_rule;
1723 if (rule->empty()) {
1724 rule = &zonegroup.default_placement;
1725 }
1726 auto iter = zone_params.placement_pools.find(rule->name);
1727 if (iter == zone_params.placement_pools.end()) {
1728 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
1729 return -EINVAL;
1730 }
1731
1732 int r = open_pool_ctx(iter->second.index_pool, index_ctx, true);
1733 if (r < 0)
1734 return r;
1735
1736 return 0;
1737 }
1738
1739 /**** logs ****/
1740
1741 struct log_list_state {
1742 string prefix;
1743 librados::IoCtx io_ctx;
1744 librados::NObjectIterator obit;
1745 };
1746
1747 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
1748 {
1749 log_list_state *state = new log_list_state;
1750 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1751 if (r < 0) {
1752 delete state;
1753 return r;
1754 }
1755 state->prefix = prefix;
1756 state->obit = state->io_ctx.nobjects_begin();
1757 *handle = (RGWAccessHandle)state;
1758 return 0;
1759 }
1760
1761 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1762 {
1763 log_list_state *state = static_cast<log_list_state *>(handle);
1764 while (true) {
1765 if (state->obit == state->io_ctx.nobjects_end()) {
1766 delete state;
1767 return -ENOENT;
1768 }
1769 if (state->prefix.length() &&
1770 state->obit->get_oid().find(state->prefix) != 0) {
1771 state->obit++;
1772 continue;
1773 }
1774 *name = state->obit->get_oid();
1775 state->obit++;
1776 break;
1777 }
1778 return 0;
1779 }
1780
1781 int RGWRados::log_remove(const string& name)
1782 {
1783 librados::IoCtx io_ctx;
1784 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
1785 if (r < 0)
1786 return r;
1787 return io_ctx.remove(name);
1788 }
1789
1790 struct log_show_state {
1791 librados::IoCtx io_ctx;
1792 bufferlist bl;
1793 bufferlist::const_iterator p;
1794 string name;
1795 uint64_t pos;
1796 bool eof;
1797 log_show_state() : pos(0), eof(false) {}
1798 };
1799
1800 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
1801 {
1802 log_show_state *state = new log_show_state;
1803 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1804 if (r < 0) {
1805 delete state;
1806 return r;
1807 }
1808 state->name = name;
1809 *handle = (RGWAccessHandle)state;
1810 return 0;
1811 }
1812
1813 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
1814 {
1815 log_show_state *state = static_cast<log_show_state *>(handle);
1816 off_t off = state->p.get_off();
1817
1818 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
1819 << " off " << off
1820 << " eof " << (int)state->eof
1821 << dendl;
1822 // read some?
1823 unsigned chunk = 1024*1024;
1824 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1825 bufferlist more;
1826 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1827 if (r < 0)
1828 return r;
1829 state->pos += r;
1830 bufferlist old;
1831 try {
1832 old.substr_of(state->bl, off, state->bl.length() - off);
1833 } catch (buffer::error& err) {
1834 return -EINVAL;
1835 }
1836 state->bl.clear();
1837 state->bl.claim(old);
1838 state->bl.claim_append(more);
1839 state->p = state->bl.cbegin();
1840 if ((unsigned)r < chunk)
1841 state->eof = true;
1842 ldout(cct, 10) << " read " << r << dendl;
1843 }
1844
1845 if (state->p.end())
1846 return 0; // end of file
1847 try {
1848 decode(*entry, state->p);
1849 }
1850 catch (const buffer::error &e) {
1851 return -EINVAL;
1852 }
1853 return 1;
1854 }
1855
1856 /**
1857 * usage_log_hash: get usage log key hash, based on name and index
1858 *
1859 * Get the usage object name. Since a user may have more than 1
1860 * object holding that info (multiple shards), we use index to
1861 * specify that shard number. Once index exceeds max shards it
1862 * wraps.
1863 * If name is not being set, results for all users will be returned
1864 * and index will wrap only after total shards number.
1865 *
1866 * @param cct [in] ceph context
1867 * @param name [in] user name
1868 * @param hash [out] hash value
1869 * @param index [in] shard index number
1870 */
1871 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1872 {
1873 uint32_t val = index;
1874
1875 if (!name.empty()) {
1876 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
1877 val %= max_user_shards;
1878 val += ceph_str_hash_linux(name.c_str(), name.size());
1879 }
1880 char buf[17];
1881 int max_shards = cct->_conf->rgw_usage_max_shards;
1882 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1883 hash = buf;
1884 }
1885
1886 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
1887 {
1888 uint32_t index = 0;
1889
1890 map<string, rgw_usage_log_info> log_objs;
1891
1892 string hash;
1893 string last_user;
1894
1895 /* restructure usage map, zone by object hash */
1896 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1897 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1898 const rgw_user_bucket& ub = iter->first;
1899 RGWUsageBatch& info = iter->second;
1900
1901 if (ub.user.empty()) {
1902 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
1903 continue;
1904 }
1905
1906 if (ub.user != last_user) {
1907 /* index *should* be random, but why waste extra cycles
1908 in most cases max user shards is not going to exceed 1,
1909 so just incrementing it */
1910 usage_log_hash(cct, ub.user, hash, index++);
1911 }
1912 last_user = ub.user;
1913 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1914
1915 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1916 v.push_back(miter->second);
1917 }
1918 }
1919
1920 map<string, rgw_usage_log_info>::iterator liter;
1921
1922 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
1923 int r = cls_obj_usage_log_add(liter->first, liter->second);
1924 if (r < 0)
1925 return r;
1926 }
1927 return 0;
1928 }
1929
1930 int RGWRados::read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
1931 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1932 rgw_usage_log_entry>& usage)
1933 {
1934 uint32_t num = max_entries;
1935 string hash, first_hash;
1936 string user_str = user.to_str();
1937 usage_log_hash(cct, user_str, first_hash, 0);
1938
1939 if (usage_iter.index) {
1940 usage_log_hash(cct, user_str, hash, usage_iter.index);
1941 } else {
1942 hash = first_hash;
1943 }
1944
1945 usage.clear();
1946
1947 do {
1948 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1949 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1950
1951 int ret = cls_obj_usage_log_read(hash, user_str, bucket_name, start_epoch, end_epoch, num,
1952 usage_iter.read_iter, ret_usage, is_truncated);
1953 if (ret == -ENOENT)
1954 goto next;
1955
1956 if (ret < 0)
1957 return ret;
1958
1959 num -= ret_usage.size();
1960
1961 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1962 usage[iter->first].aggregate(iter->second);
1963 }
1964
1965 next:
1966 if (!*is_truncated) {
1967 usage_iter.read_iter.clear();
1968 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1969 }
1970 } while (num && !*is_truncated && hash != first_hash);
1971 return 0;
1972 }
1973
1974 int RGWRados::trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
1975 {
1976 uint32_t index = 0;
1977 string hash, first_hash;
1978 string user_str = user.to_str();
1979 usage_log_hash(cct, user_str, first_hash, index);
1980
1981 hash = first_hash;
1982 do {
1983 int ret = cls_obj_usage_log_trim(hash, user_str, bucket_name, start_epoch, end_epoch);
1984
1985 if (ret < 0 && ret != -ENOENT)
1986 return ret;
1987
1988 usage_log_hash(cct, user_str, hash, ++index);
1989 } while (hash != first_hash);
1990
1991 return 0;
1992 }
1993
1994
1995 int RGWRados::clear_usage()
1996 {
1997 auto max_shards = cct->_conf->rgw_usage_max_shards;
1998 int ret=0;
1999 for (unsigned i=0; i < max_shards; i++){
2000 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
2001 ret = cls_obj_usage_log_clear(oid);
2002 if (ret < 0){
2003 ldout(cct,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
2004 return ret;
2005 }
2006 }
2007 return ret;
2008 }
2009
2010 int RGWRados::key_to_shard_id(const string& key, int max_shards)
2011 {
2012 return rgw_shard_id(key, max_shards);
2013 }
2014
2015 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
2016 {
2017 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
2018 char buf[16];
2019 if (shard_id) {
2020 *shard_id = val % max_shards;
2021 }
2022 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
2023 name = prefix + buf;
2024 }
2025
2026 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
2027 {
2028 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
2029 val ^= ceph_str_hash_linux(section.c_str(), section.size());
2030 char buf[16];
2031 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
2032 name = prefix + buf;
2033 }
2034
2035 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
2036 {
2037 char buf[16];
2038 snprintf(buf, sizeof(buf), "%u", shard_id);
2039 name = prefix + buf;
2040
2041 }
2042
2043 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
2044 {
2045 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
2046 }
2047
2048 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
2049 {
2050 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx, true);
2051
2052 }
2053
2054 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
2055 {
2056 librados::IoCtx io_ctx;
2057
2058 int r = time_log_add_init(io_ctx);
2059 if (r < 0) {
2060 return r;
2061 }
2062
2063 ObjectWriteOperation op;
2064 utime_t t(ut);
2065 cls_log_add(op, t, section, key, bl);
2066
2067 return io_ctx.operate(oid, &op);
2068 }
2069
2070 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
2071 librados::AioCompletion *completion, bool monotonic_inc)
2072 {
2073 librados::IoCtx io_ctx;
2074
2075 int r = time_log_add_init(io_ctx);
2076 if (r < 0) {
2077 return r;
2078 }
2079
2080 ObjectWriteOperation op;
2081 cls_log_add(op, entries, monotonic_inc);
2082
2083 if (!completion) {
2084 r = io_ctx.operate(oid, &op);
2085 } else {
2086 r = io_ctx.aio_operate(oid, completion, &op);
2087 }
2088 return r;
2089 }
2090
2091 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
2092 int max_entries, list<cls_log_entry>& entries,
2093 const string& marker,
2094 string *out_marker,
2095 bool *truncated)
2096 {
2097 librados::IoCtx io_ctx;
2098
2099 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2100 if (r < 0)
2101 return r;
2102 librados::ObjectReadOperation op;
2103
2104 utime_t st(start_time);
2105 utime_t et(end_time);
2106
2107 cls_log_list(op, st, et, marker, max_entries, entries,
2108 out_marker, truncated);
2109
2110 bufferlist obl;
2111
2112 int ret = io_ctx.operate(oid, &op, &obl);
2113 if (ret < 0)
2114 return ret;
2115
2116 return 0;
2117 }
2118
2119 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
2120 {
2121 librados::IoCtx io_ctx;
2122
2123 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2124 if (r < 0)
2125 return r;
2126 librados::ObjectReadOperation op;
2127
2128 cls_log_info(op, header);
2129
2130 bufferlist obl;
2131
2132 int ret = io_ctx.operate(oid, &op, &obl);
2133 if (ret < 0)
2134 return ret;
2135
2136 return 0;
2137 }
2138
2139 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
2140 {
2141 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2142 if (r < 0)
2143 return r;
2144
2145 librados::ObjectReadOperation op;
2146
2147 cls_log_info(op, header);
2148
2149 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
2150 if (ret < 0)
2151 return ret;
2152
2153 return 0;
2154 }
2155
2156 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
2157 const string& from_marker, const string& to_marker,
2158 librados::AioCompletion *completion)
2159 {
2160 librados::IoCtx io_ctx;
2161
2162 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2163 if (r < 0)
2164 return r;
2165
2166 utime_t st(start_time);
2167 utime_t et(end_time);
2168
2169 ObjectWriteOperation op;
2170 cls_log_trim(op, st, et, from_marker, to_marker);
2171
2172 if (!completion) {
2173 r = io_ctx.operate(oid, &op);
2174 } else {
2175 r = io_ctx.aio_operate(oid, completion, &op);
2176 }
2177 return r;
2178 }
2179
2180 string RGWRados::objexp_hint_get_shardname(int shard_num)
2181 {
2182 char buf[32];
2183 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
2184
2185 string objname("obj_delete_at_hint.");
2186 return objname + buf;
2187 }
2188
2189 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
2190 {
2191 string obj_key = key.name + key.instance;
2192 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
2193 return rgw_bucket_shard_index(obj_key, num_shards);
2194 }
2195
2196 static string objexp_hint_get_keyext(const string& tenant_name,
2197 const string& bucket_name,
2198 const string& bucket_id,
2199 const rgw_obj_key& obj_key)
2200 {
2201 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
2202 ":" + obj_key.name + ":" + obj_key.instance;
2203 }
2204
2205 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
2206 const string& tenant_name,
2207 const string& bucket_name,
2208 const string& bucket_id,
2209 const rgw_obj_index_key& obj_key)
2210 {
2211 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
2212 bucket_id, obj_key);
2213 objexp_hint_entry he = {
2214 .tenant = tenant_name,
2215 .bucket_name = bucket_name,
2216 .bucket_id = bucket_id,
2217 .obj_key = obj_key,
2218 .exp_time = delete_at };
2219 bufferlist hebl;
2220 encode(he, hebl);
2221 ObjectWriteOperation op;
2222 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
2223
2224 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
2225 return objexp_pool_ctx.operate(shard_name, &op);
2226 }
2227
2228 void RGWRados::objexp_get_shard(int shard_num,
2229 string& shard) /* out */
2230 {
2231 shard = objexp_hint_get_shardname(shard_num);
2232 }
2233
2234 int RGWRados::objexp_hint_list(const string& oid,
2235 const ceph::real_time& start_time,
2236 const ceph::real_time& end_time,
2237 const int max_entries,
2238 const string& marker,
2239 list<cls_timeindex_entry>& entries, /* out */
2240 string *out_marker, /* out */
2241 bool *truncated) /* out */
2242 {
2243 librados::ObjectReadOperation op;
2244 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
2245 out_marker, truncated);
2246
2247 bufferlist obl;
2248 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
2249
2250 if ((ret < 0 ) && (ret != -ENOENT)) {
2251 return ret;
2252 }
2253
2254 if ((ret == -ENOENT) && truncated) {
2255 *truncated = false;
2256 }
2257
2258 return 0;
2259 }
2260
2261 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
2262 objexp_hint_entry& hint_entry) /* out */
2263 {
2264 try {
2265 auto iter = ti_entry.value.cbegin();
2266 decode(hint_entry, iter);
2267 } catch (buffer::error& err) {
2268 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
2269 }
2270
2271 return 0;
2272 }
2273
2274 int RGWRados::objexp_hint_trim(const string& oid,
2275 const ceph::real_time& start_time,
2276 const ceph::real_time& end_time,
2277 const string& from_marker,
2278 const string& to_marker)
2279 {
2280 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
2281 from_marker, to_marker);
2282 if ((ret < 0 ) && (ret != -ENOENT)) {
2283 return ret;
2284 }
2285
2286 return 0;
2287 }
2288
2289 int RGWRados::lock_exclusive(const rgw_pool& pool, const string& oid, timespan& duration,
2290 string& zone_id, string& owner_id) {
2291 librados::IoCtx io_ctx;
2292
2293 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
2294 if (r < 0) {
2295 return r;
2296 }
2297 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
2298 utime_t ut(msec / 1000, msec % 1000);
2299
2300 rados::cls::lock::Lock l(log_lock_name);
2301 l.set_duration(ut);
2302 l.set_cookie(owner_id);
2303 l.set_tag(zone_id);
2304 l.set_may_renew(true);
2305
2306 return l.lock_exclusive(&io_ctx, oid);
2307 }
2308
2309 int RGWRados::unlock(const rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
2310 librados::IoCtx io_ctx;
2311
2312 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
2313 if (r < 0) {
2314 return r;
2315 }
2316
2317 rados::cls::lock::Lock l(log_lock_name);
2318 l.set_tag(zone_id);
2319 l.set_cookie(owner_id);
2320
2321 return l.unlock(&io_ctx, oid);
2322 }
2323
2324 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
2325 {
2326 auto i = bl.cbegin();
2327 RGWAccessControlPolicy policy(cct);
2328 try {
2329 policy.decode_owner(i);
2330 } catch (buffer::error& err) {
2331 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
2332 return -EIO;
2333 }
2334 *owner = policy.get_owner();
2335 return 0;
2336 }
2337
2338 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
2339 {
2340 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
2341 if (aiter == attrset.end())
2342 return -EIO;
2343
2344 bufferlist& bl = aiter->second;
2345 auto iter = bl.cbegin();
2346 try {
2347 policy->decode(iter);
2348 } catch (buffer::error& err) {
2349 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
2350 return -EIO;
2351 }
2352 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
2353 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
2354 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
2355 s3policy->to_xml(*_dout);
2356 *_dout << dendl;
2357 }
2358 return 0;
2359 }
2360
2361
2362 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
2363 {
2364 rgw_bucket bucket = bucket_info.bucket;
2365 bucket.update_bucket_id(new_bucket_id);
2366
2367 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
2368
2369 bucket_info.objv_tracker.clear();
2370 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
2371 if (ret < 0) {
2372 return ret;
2373 }
2374
2375 return 0;
2376 }
2377
2378
/**
 * Return @delim with a single char(255) appended.
 *
 * The listing code uses the result as an upper-bound string to skip
 * past everything that shares a common prefix ending in the delimiter.
 * Callers are expected to pass a non-empty delimiter.
 */
static inline std::string after_delim(std::string_view delim)
{
  // assert: ! delim.empty()
  std::string upper_bound(delim);
  upper_bound.push_back(static_cast<char>(255));
  return upper_bound;
}
2386
2387
2388 /**
2389 * Get ordered listing of the objects in a bucket.
2390 *
2391 * max: maximum number of results to return
2392 * bucket: bucket to list contents of
2393 * prefix: only return results that match this prefix
2394 * delim: do not include results that match this string.
2395 * Any skipped results will have the matching portion of their name
2396 * inserted in common_prefixes with a "true" mark.
2397 * marker: if filled in, begin the listing with this object.
2398 * end_marker: if filled in, end the listing with this object.
2399 * result: the objects are put in here.
2400 * common_prefixes: if delim is filled in, any matching prefixes are
2401 * placed here.
2402 * is_truncated: if number of objects in the bucket is bigger than
2403 * max, then truncated.
2404 */
2405 int RGWRados::Bucket::List::list_objects_ordered(
2406 int64_t max_p,
2407 vector<rgw_bucket_dir_entry> *result,
2408 map<string, bool> *common_prefixes,
2409 bool *is_truncated)
2410 {
2411 RGWRados *store = target->get_store();
2412 CephContext *cct = store->ctx();
2413 int shard_id = target->get_shard_id();
2414
2415 int count = 0;
2416 bool truncated = true;
2417 const int64_t max = // protect against memory issues and negative vals
2418 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2419 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead, max);
2420
2421 result->clear();
2422
2423 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
2424 rgw_obj_index_key cur_marker;
2425 marker_obj.get_index_key(&cur_marker);
2426
2427 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
2428 params.ns);
2429 rgw_obj_index_key cur_end_marker;
2430 end_marker_obj.get_index_key(&cur_end_marker);
2431 const bool cur_end_marker_valid = !params.end_marker.empty();
2432
2433 rgw_obj_key prefix_obj(params.prefix);
2434 prefix_obj.ns = params.ns;
2435 string cur_prefix = prefix_obj.get_index_key_name();
2436 string after_delim_s; /* needed in !params.delim.empty() AND later */
2437
2438 if (!params.delim.empty()) {
2439 after_delim_s = after_delim(params.delim);
2440 /* if marker points at a common prefix, fast forward it into its
2441 * upper bound string */
2442 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
2443 if (delim_pos >= 0) {
2444 string s = cur_marker.name.substr(0, delim_pos);
2445 s.append(after_delim_s);
2446 cur_marker = s;
2447 }
2448 }
2449
2450 string skip_after_delim;
2451 while (truncated && count <= max) {
2452 std::map<string, rgw_bucket_dir_entry> ent_map;
2453 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
2454 shard_id,
2455 cur_marker,
2456 cur_prefix,
2457 read_ahead + 1 - count,
2458 params.list_versions,
2459 ent_map,
2460 &truncated,
2461 &cur_marker);
2462 if (r < 0)
2463 return r;
2464
2465 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
2466 rgw_bucket_dir_entry& entry = eiter->second;
2467 rgw_obj_index_key index_key = entry.key;
2468
2469 rgw_obj_key obj(index_key);
2470
2471 /* note that parse_raw_oid() here will not set the correct
2472 * object's instance, as rgw_obj_index_key encodes that
2473 * separately. We don't need to set the instance because it's
2474 * not needed for the checks here and we end up using the raw
2475 * entry for the return vector
2476 */
2477 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2478 if (!valid) {
2479 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
2480 continue;
2481 }
2482
2483 bool check_ns = (obj.ns == params.ns);
2484 if (!params.list_versions && !entry.is_visible()) {
2485 continue;
2486 }
2487
2488 if (params.enforce_ns && !check_ns) {
2489 if (!params.ns.empty()) {
2490 /* we've iterated past the namespace we're searching -- done now */
2491 truncated = false;
2492 goto done;
2493 }
2494
2495 /* we're not looking at the namespace this object is in, next! */
2496 continue;
2497 }
2498
2499 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2500 truncated = false;
2501 goto done;
2502 }
2503
2504 if (count < max) {
2505 params.marker = index_key;
2506 next_marker = index_key;
2507 }
2508
2509 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2510 continue;
2511
2512 if (params.prefix.size() &&
2513 (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
2514 continue;
2515
2516 if (!params.delim.empty()) {
2517 int delim_pos = obj.name.find(params.delim, params.prefix.size());
2518
2519 if (delim_pos >= 0) {
2520 /* extract key -with trailing delimiter- for CommonPrefix */
2521 string prefix_key =
2522 obj.name.substr(0, delim_pos + params.delim.length());
2523
2524 if (common_prefixes &&
2525 common_prefixes->find(prefix_key) == common_prefixes->end()) {
2526 if (count >= max) {
2527 truncated = true;
2528 goto done;
2529 }
2530 next_marker = prefix_key;
2531 (*common_prefixes)[prefix_key] = true;
2532
2533 count++;
2534 }
2535
2536 continue;
2537 }
2538 }
2539
2540 if (count >= max) {
2541 truncated = true;
2542 goto done;
2543 }
2544
2545 result->emplace_back(std::move(entry));
2546 count++;
2547 }
2548
2549 if (!params.delim.empty()) {
2550 int marker_delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
2551 if (marker_delim_pos >= 0) {
2552 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
2553 skip_after_delim.append(after_delim_s);
2554
2555 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
2556
2557 if (skip_after_delim > cur_marker.name) {
2558 cur_marker = skip_after_delim;
2559 ldout(cct, 20) << "setting cur_marker="
2560 << cur_marker.name
2561 << "[" << cur_marker.instance << "]"
2562 << dendl;
2563 }
2564 }
2565 }
2566 }
2567
2568 done:
2569 if (is_truncated)
2570 *is_truncated = truncated;
2571
2572 return 0;
2573 } // list_objects_ordered
2574
2575
2576 /**
2577 * Get listing of the objects in a bucket and allow the results to be out
2578 * of order.
2579 *
2580 * Even though there are key differences with the ordered counterpart,
2581 * the parameters are the same to maintain some compatability.
2582 *
2583 * max: maximum number of results to return
2584 * bucket: bucket to list contents of
2585 * prefix: only return results that match this prefix
2586 * delim: should not be set; if it is we should have indicated an error
2587 * marker: if filled in, begin the listing with this object.
2588 * end_marker: if filled in, end the listing with this object.
2589 * result: the objects are put in here.
2590 * common_prefixes: this is never filled with an unordered list; the param
2591 * is maintained for compatibility
2592 * is_truncated: if number of objects in the bucket is bigger than max, then
2593 * truncated.
2594 */
2595 int RGWRados::Bucket::List::list_objects_unordered(int64_t max_p,
2596 vector<rgw_bucket_dir_entry> *result,
2597 map<string, bool> *common_prefixes,
2598 bool *is_truncated)
2599 {
2600 RGWRados *store = target->get_store();
2601 CephContext *cct = store->ctx();
2602 int shard_id = target->get_shard_id();
2603
2604 int count = 0;
2605 bool truncated = true;
2606
2607 const int64_t max = // protect against memory issues and negative vals
2608 std::min(bucket_list_objects_absolute_max, std::max(int64_t(0), max_p));
2609
2610 // read a few extra in each call to cls_bucket_list_unordered in
2611 // case some are filtered out due to namespace matching, versioning,
2612 // filtering, etc.
2613 const int64_t max_read_ahead = 100;
2614 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2615
2616 result->clear();
2617
2618 rgw_obj_key marker_obj(params.marker.name,
2619 params.marker.instance,
2620 params.ns);
2621 rgw_obj_index_key cur_marker;
2622 marker_obj.get_index_key(&cur_marker);
2623
2624 rgw_obj_key end_marker_obj(params.end_marker.name,
2625 params.end_marker.instance,
2626 params.ns);
2627 rgw_obj_index_key cur_end_marker;
2628 end_marker_obj.get_index_key(&cur_end_marker);
2629 const bool cur_end_marker_valid = !params.end_marker.empty();
2630
2631 rgw_obj_key prefix_obj(params.prefix);
2632 prefix_obj.ns = params.ns;
2633 string cur_prefix = prefix_obj.get_index_key_name();
2634
2635 while (truncated && count <= max) {
2636 std::vector<rgw_bucket_dir_entry> ent_list;
2637 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
2638 shard_id,
2639 cur_marker,
2640 cur_prefix,
2641 read_ahead,
2642 params.list_versions,
2643 ent_list,
2644 &truncated,
2645 &cur_marker);
2646 if (r < 0)
2647 return r;
2648
2649 // NB: while regions of ent_list will be sorted, we have no
2650 // guarantee that all items will be sorted since they can cross
2651 // shard boundaries
2652
2653 for (auto& entry : ent_list) {
2654 rgw_obj_index_key index_key = entry.key;
2655 rgw_obj_key obj(index_key);
2656
2657 /* note that parse_raw_oid() here will not set the correct
2658 * object's instance, as rgw_obj_index_key encodes that
2659 * separately. We don't need to set the instance because it's
2660 * not needed for the checks here and we end up using the raw
2661 * entry for the return vector
2662 */
2663 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2664 if (!valid) {
2665 ldout(cct, 0) << "ERROR: could not parse object name: " <<
2666 obj.name << dendl;
2667 continue;
2668 }
2669
2670 if (!params.list_versions && !entry.is_visible()) {
2671 continue;
2672 }
2673
2674 if (params.enforce_ns && obj.ns != params.ns) {
2675 continue;
2676 }
2677
2678 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2679 // we're not guaranteed items will come in order, so we have
2680 // to loop through all
2681 continue;
2682 }
2683
2684 if (count < max) {
2685 params.marker.set(index_key);
2686 next_marker.set(index_key);
2687 }
2688
2689 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2690 continue;
2691
2692 if (params.prefix.size() &&
2693 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
2694 continue;
2695
2696 if (count >= max) {
2697 truncated = true;
2698 goto done;
2699 }
2700
2701 result->emplace_back(std::move(entry));
2702 count++;
2703 } // for (auto& entry : ent_list)
2704 } // while (truncated && count <= max)
2705
2706 done:
2707 if (is_truncated)
2708 *is_truncated = truncated;
2709
2710 return 0;
2711 } // list_objects_unordered
2712
2713
2714 /**
2715 * create a rados pool, associated meta info
2716 * returns 0 on success, -ERR# otherwise.
2717 */
2718 int RGWRados::create_pool(const rgw_pool& pool)
2719 {
2720 librados::IoCtx io_ctx;
2721 constexpr bool create = true;
2722 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
2723 }
2724
2725 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
2726 {
2727 librados::IoCtx index_ctx;
2728
2729 string dir_oid = dir_oid_prefix;
2730 int r = open_bucket_index_ctx(bucket_info, index_ctx);
2731 if (r < 0) {
2732 return r;
2733 }
2734
2735 dir_oid.append(bucket_info.bucket.bucket_id);
2736
2737 map<int, string> bucket_objs;
2738 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
2739
2740 return CLSRGWIssueBucketIndexInit(index_ctx,
2741 bucket_objs,
2742 cct->_conf->rgw_bucket_index_max_aio)();
2743 }
2744
2745 int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
2746 {
2747 librados::IoCtx index_ctx;
2748
2749 std::string dir_oid = dir_oid_prefix;
2750 int r = open_bucket_index_ctx(bucket_info, index_ctx);
2751 if (r < 0) {
2752 return r;
2753 }
2754
2755 dir_oid.append(bucket_info.bucket.bucket_id);
2756
2757 std::map<int, std::string> bucket_objs;
2758 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
2759
2760 return CLSRGWIssueBucketIndexClean(index_ctx,
2761 bucket_objs,
2762 cct->_conf->rgw_bucket_index_max_aio)();
2763 }
2764
2765 void RGWRados::create_bucket_id(string *bucket_id)
2766 {
2767 uint64_t iid = instance_id();
2768 uint64_t bid = next_bucket_id();
2769 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2770 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2771 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2772 *bucket_id = buf;
2773 }
2774
2775 int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
2776 const string& zonegroup_id,
2777 const rgw_placement_rule& placement_rule,
2778 const string& swift_ver_location,
2779 const RGWQuotaInfo * pquota_info,
2780 map<std::string, bufferlist>& attrs,
2781 RGWBucketInfo& info,
2782 obj_version *pobjv,
2783 obj_version *pep_objv,
2784 real_time creation_time,
2785 rgw_bucket *pmaster_bucket,
2786 uint32_t *pmaster_num_shards,
2787 bool exclusive)
2788 {
2789 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
2790 rgw_placement_rule selected_placement_rule;
2791 RGWZonePlacementInfo rule_info;
2792
2793 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2794 int ret = 0;
2795 ret = svc.zone->select_bucket_placement(owner, zonegroup_id, placement_rule,
2796 &selected_placement_rule, &rule_info);
2797 if (ret < 0)
2798 return ret;
2799
2800 if (!pmaster_bucket) {
2801 create_bucket_id(&bucket.marker);
2802 bucket.bucket_id = bucket.marker;
2803 } else {
2804 bucket.marker = pmaster_bucket->marker;
2805 bucket.bucket_id = pmaster_bucket->bucket_id;
2806 }
2807
2808 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2809
2810 if (pobjv) {
2811 objv_tracker.write_version = *pobjv;
2812 } else {
2813 objv_tracker.generate_new_write_ver(cct);
2814 }
2815
2816 info.bucket = bucket;
2817 info.owner = owner.user_id;
2818 info.zonegroup = zonegroup_id;
2819 info.placement_rule = selected_placement_rule;
2820 info.index_type = rule_info.index_type;
2821 info.swift_ver_location = swift_ver_location;
2822 info.swift_versioning = (!swift_ver_location.empty());
2823 if (pmaster_num_shards) {
2824 info.num_shards = *pmaster_num_shards;
2825 } else {
2826 info.num_shards = bucket_index_max_shards;
2827 }
2828 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
2829 info.requester_pays = false;
2830 if (real_clock::is_zero(creation_time)) {
2831 info.creation_time = ceph::real_clock::now();
2832 } else {
2833 info.creation_time = creation_time;
2834 }
2835 if (pquota_info) {
2836 info.quota = *pquota_info;
2837 }
2838
2839 int r = init_bucket_index(info, info.num_shards);
2840 if (r < 0) {
2841 return r;
2842 }
2843
2844 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
2845 if (ret == -EEXIST) {
2846 librados::IoCtx index_ctx;
2847 map<int, string> bucket_objs;
2848 int r = open_bucket_index(info, index_ctx, bucket_objs);
2849 if (r < 0)
2850 return r;
2851
2852 /* we need to reread the info and return it, caller will have a use for it */
2853 RGWObjVersionTracker instance_ver = info.objv_tracker;
2854 info.objv_tracker.clear();
2855 auto obj_ctx = svc.sysobj->init_obj_ctx();
2856 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
2857 if (r < 0) {
2858 if (r == -ENOENT) {
2859 continue;
2860 }
2861 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
2862 return r;
2863 }
2864
2865 /* only remove it if it's a different bucket instance */
2866 if (info.bucket.bucket_id != bucket.bucket_id) {
2867 /* remove bucket meta instance */
2868 r = rgw_bucket_instance_remove_entry(this,
2869 bucket.get_key(),
2870 &instance_ver);
2871 if (r < 0)
2872 return r;
2873
2874 /* remove bucket index objects asynchronously by best effort */
2875 (void) CLSRGWIssueBucketIndexClean(index_ctx,
2876 bucket_objs,
2877 cct->_conf->rgw_bucket_index_max_aio)();
2878 }
2879 /* ret == -ENOENT here */
2880 }
2881 return ret;
2882 }
2883
2884 /* this is highly unlikely */
2885 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
2886 return -ENOENT;
2887 }
2888
2889 bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
2890 {
2891 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2892 }
2893
2894 bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2895 {
2896 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
2897
2898 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
2899 }
2900
2901 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
2902 {
2903 string oid, key;
2904 get_obj_bucket_and_oid_loc(obj, oid, key);
2905
2906 rgw_pool pool;
2907 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2908 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2909 return -EIO;
2910 }
2911
2912 int r = open_pool_ctx(pool, *ioctx, false);
2913 if (r < 0) {
2914 return r;
2915 }
2916
2917 ioctx->locator_set_key(key);
2918
2919 return 0;
2920 }
2921
2922 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
2923 {
2924 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
2925
2926 rgw_pool pool;
2927 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2928 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2929 return -EIO;
2930 }
2931
2932 int r = open_pool_ctx(pool, ref->ioctx, false);
2933 if (r < 0) {
2934 return r;
2935 }
2936
2937 ref->ioctx.locator_set_key(ref->obj.loc);
2938
2939 return 0;
2940 }
2941
2942 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
2943 {
2944 ref->obj = obj;
2945
2946 int r;
2947
2948 if (ref->obj.oid.empty()) {
2949 ref->obj.oid = obj.pool.to_str();
2950 ref->obj.pool = svc.zone->get_zone_params().domain_root;
2951 }
2952 r = open_pool_ctx(ref->obj.pool, ref->ioctx, false);
2953 if (r < 0)
2954 return r;
2955
2956 ref->ioctx.locator_set_key(ref->obj.loc);
2957
2958 return 0;
2959 }
2960
2961 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
2962 {
2963 return get_raw_obj_ref(obj, ref);
2964 }
2965
2966 /*
2967 * fixes an issue where head objects were supposed to have a locator created, but ended
2968 * up without one
2969 */
2970 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
2971 {
2972 const rgw_bucket& bucket = bucket_info.bucket;
2973 string oid;
2974 string locator;
2975
2976 rgw_obj obj(bucket, key);
2977
2978 get_obj_bucket_and_oid_loc(obj, oid, locator);
2979
2980 if (locator.empty()) {
2981 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
2982 return 0;
2983 }
2984
2985 librados::IoCtx ioctx;
2986
2987 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
2988 if (ret < 0) {
2989 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2990 return ret;
2991 }
2992 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2993
2994 uint64_t size;
2995 bufferlist data;
2996
2997 struct timespec mtime_ts;
2998 map<string, bufferlist> attrs;
2999 librados::ObjectReadOperation op;
3000 op.getxattrs(&attrs, NULL);
3001 op.stat2(&size, &mtime_ts, NULL);
3002 #define HEAD_SIZE 512 * 1024
3003 op.read(0, HEAD_SIZE, &data, NULL);
3004
3005 ret = ioctx.operate(oid, &op, NULL);
3006 if (ret < 0) {
3007 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
3008 return ret;
3009 }
3010
3011 if (size > HEAD_SIZE) {
3012 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
3013 return -EIO;
3014 }
3015
3016 if (size != data.length()) {
3017 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
3018 return -EIO;
3019 }
3020
3021 if (copy_obj) {
3022 librados::ObjectWriteOperation wop;
3023
3024 wop.mtime2(&mtime_ts);
3025
3026 map<string, bufferlist>::iterator iter;
3027 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3028 wop.setxattr(iter->first.c_str(), iter->second);
3029 }
3030
3031 wop.write(0, data);
3032
3033 ioctx.locator_set_key(locator);
3034 ioctx.operate(oid, &wop);
3035 }
3036
3037 if (remove_bad) {
3038 ioctx.locator_set_key(string());
3039
3040 ret = ioctx.remove(oid);
3041 if (ret < 0) {
3042 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
3043 return ret;
3044 }
3045 }
3046
3047 return 0;
3048 }
3049
3050 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
3051 const string& src_oid, const string& src_locator,
3052 librados::IoCtx& dst_ioctx,
3053 const string& dst_oid, const string& dst_locator)
3054 {
3055
3056 #define COPY_BUF_SIZE (4 * 1024 * 1024)
3057 bool done = false;
3058 uint64_t chunk_size = COPY_BUF_SIZE;
3059 uint64_t ofs = 0;
3060 int ret = 0;
3061 real_time mtime;
3062 struct timespec mtime_ts;
3063 uint64_t size;
3064
3065 if (src_oid == dst_oid && src_locator == dst_locator) {
3066 return 0;
3067 }
3068
3069 src_ioctx.locator_set_key(src_locator);
3070 dst_ioctx.locator_set_key(dst_locator);
3071
3072 do {
3073 bufferlist data;
3074 ObjectReadOperation rop;
3075 ObjectWriteOperation wop;
3076
3077 if (ofs == 0) {
3078 rop.stat2(&size, &mtime_ts, NULL);
3079 mtime = real_clock::from_timespec(mtime_ts);
3080 }
3081 rop.read(ofs, chunk_size, &data, NULL);
3082 ret = src_ioctx.operate(src_oid, &rop, NULL);
3083 if (ret < 0) {
3084 goto done_err;
3085 }
3086
3087 if (data.length() == 0) {
3088 break;
3089 }
3090
3091 if (ofs == 0) {
3092 wop.create(true); /* make it exclusive */
3093 wop.mtime2(&mtime_ts);
3094 mtime = real_clock::from_timespec(mtime_ts);
3095 }
3096 wop.write(ofs, data);
3097 ret = dst_ioctx.operate(dst_oid, &wop);
3098 if (ret < 0) {
3099 goto done_err;
3100 }
3101 ofs += data.length();
3102 done = data.length() != chunk_size;
3103 } while (!done);
3104
3105 if (ofs != size) {
3106 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
3107 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
3108 ret = -EIO;
3109 goto done_err;
3110 }
3111
3112 src_ioctx.remove(src_oid);
3113
3114 return 0;
3115
3116 done_err:
3117 // TODO: clean up dst_oid if we created it
3118 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
3119 return ret;
3120 }
3121
3122 /*
3123 * fixes an issue where head objects were supposed to have a locator created, but ended
3124 * up without one
3125 */
3126 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
3127 {
3128 const rgw_bucket& bucket = bucket_info.bucket;
3129 rgw_obj obj(bucket, key);
3130
3131 if (need_fix) {
3132 *need_fix = false;
3133 }
3134
3135 rgw_rados_ref ref;
3136 int r = get_obj_head_ref(bucket_info, obj, &ref);
3137 if (r < 0) {
3138 return r;
3139 }
3140
3141 RGWObjState *astate = NULL;
3142 RGWObjectCtx rctx(this);
3143 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
3144 if (r < 0)
3145 return r;
3146
3147 if (astate->has_manifest) {
3148 RGWObjManifest::obj_iterator miter;
3149 RGWObjManifest& manifest = astate->manifest;
3150 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
3151 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
3152 rgw_obj loc;
3153 string oid;
3154 string locator;
3155
3156 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
3157
3158 if (loc.key.ns.empty()) {
3159 /* continue, we're only interested in tail objects */
3160 continue;
3161 }
3162
3163 get_obj_bucket_and_oid_loc(loc, oid, locator);
3164 ref.ioctx.locator_set_key(locator);
3165
3166 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
3167
3168 r = ref.ioctx.stat(oid, NULL, NULL);
3169 if (r != -ENOENT) {
3170 continue;
3171 }
3172
3173 string bad_loc;
3174 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
3175
3176 /* create a new ioctx with the bad locator */
3177 librados::IoCtx src_ioctx;
3178 src_ioctx.dup(ref.ioctx);
3179 src_ioctx.locator_set_key(bad_loc);
3180
3181 r = src_ioctx.stat(oid, NULL, NULL);
3182 if (r != 0) {
3183 /* cannot find a broken part */
3184 continue;
3185 }
3186 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
3187 if (need_fix) {
3188 *need_fix = true;
3189 }
3190 if (fix) {
3191 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
3192 if (r < 0) {
3193 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
3194 }
3195 }
3196 }
3197 }
3198
3199 return 0;
3200 }
3201
3202 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
3203 const rgw_obj& obj,
3204 RGWBucketInfo* bucket_info_out)
3205 {
3206 bucket = _bucket;
3207
3208 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
3209
3210 RGWBucketInfo bucket_info;
3211 RGWBucketInfo* bucket_info_p =
3212 bucket_info_out ? bucket_info_out : &bucket_info;
3213
3214 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
3215 if (ret < 0) {
3216 return ret;
3217 }
3218
3219 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
3220 if (ret < 0) {
3221 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3222 return ret;
3223 }
3224 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3225
3226 return 0;
3227 }
3228
3229 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
3230 int sid,
3231 RGWBucketInfo* bucket_info_out)
3232 {
3233 bucket = _bucket;
3234 shard_id = sid;
3235
3236 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
3237
3238 RGWBucketInfo bucket_info;
3239 RGWBucketInfo* bucket_info_p =
3240 bucket_info_out ? bucket_info_out : &bucket_info;
3241 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
3242 if (ret < 0) {
3243 return ret;
3244 }
3245
3246 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj);
3247 if (ret < 0) {
3248 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3249 return ret;
3250 }
3251 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3252
3253 return 0;
3254 }
3255
3256 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info,
3257 const rgw_obj& obj)
3258 {
3259 bucket = bucket_info.bucket;
3260
3261 int ret = store->open_bucket_index_shard(bucket_info, index_ctx,
3262 obj.get_hash_object(), &bucket_obj,
3263 &shard_id);
3264 if (ret < 0) {
3265 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3266 return ret;
3267 }
3268 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3269
3270 return 0;
3271 }
3272
3273 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
3274 {
3275 bucket = bucket_info.bucket;
3276 shard_id = sid;
3277
3278 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
3279 if (ret < 0) {
3280 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3281 return ret;
3282 }
3283 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3284
3285 return 0;
3286 }
3287
3288
3289 /* Execute @handler on last item in bucket listing for bucket specified
3290 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
3291 * to objects matching these criterias. */
3292 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
3293 const std::string& obj_prefix,
3294 const std::string& obj_delim,
3295 std::function<int(const rgw_bucket_dir_entry&)> handler)
3296 {
3297 RGWRados::Bucket target(this, bucket_info);
3298 RGWRados::Bucket::List list_op(&target);
3299
3300 list_op.params.prefix = obj_prefix;
3301 list_op.params.delim = obj_delim;
3302
3303 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
3304 << ", obj_prefix=" << obj_prefix
3305 << ", obj_delim=" << obj_delim
3306 << dendl;
3307
3308 bool is_truncated = false;
3309
3310 boost::optional<rgw_bucket_dir_entry> last_entry;
3311 /* We need to rewind to the last object in a listing. */
3312 do {
3313 /* List bucket entries in chunks. */
3314 static constexpr int MAX_LIST_OBJS = 100;
3315 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
3316
3317 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
3318 &is_truncated);
3319 if (ret < 0) {
3320 return ret;
3321 } else if (!entries.empty()) {
3322 last_entry = entries.back();
3323 }
3324 } while (is_truncated);
3325
3326 if (last_entry) {
3327 return handler(*last_entry);
3328 }
3329
3330 /* Empty listing - no items we can run handler on. */
3331 return 0;
3332 }
3333
3334
3335 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
3336 const rgw_user& user,
3337 RGWBucketInfo& bucket_info,
3338 rgw_obj& obj)
3339 {
3340 if (! swift_versioning_enabled(bucket_info)) {
3341 return 0;
3342 }
3343
3344 obj_ctx.set_atomic(obj);
3345
3346 RGWObjState * state = nullptr;
3347 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
3348 if (r < 0) {
3349 return r;
3350 }
3351
3352 if (!state->exists) {
3353 return 0;
3354 }
3355
3356 const string& src_name = obj.get_oid();
3357 char buf[src_name.size() + 32];
3358 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
3359 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
3360 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
3361
3362 RGWBucketInfo dest_bucket_info;
3363
3364 auto sysobj_ctx = svc.sysobj->init_obj_ctx();
3365
3366 r = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
3367 if (r < 0) {
3368 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
3369 if (r == -ENOENT) {
3370 return -ERR_PRECONDITION_FAILED;
3371 }
3372 return r;
3373 }
3374
3375 if (dest_bucket_info.owner != bucket_info.owner) {
3376 return -ERR_PRECONDITION_FAILED;
3377 }
3378
3379 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
3380
3381 if (dest_bucket_info.versioning_enabled()){
3382 gen_rand_obj_instance_name(&dest_obj);
3383 }
3384
3385 obj_ctx.set_atomic(dest_obj);
3386
3387 string no_zone;
3388
3389 r = copy_obj(obj_ctx,
3390 user,
3391 NULL, /* req_info *info */
3392 no_zone,
3393 dest_obj,
3394 obj,
3395 dest_bucket_info,
3396 bucket_info,
3397 bucket_info.placement_rule,
3398 NULL, /* time_t *src_mtime */
3399 NULL, /* time_t *mtime */
3400 NULL, /* const time_t *mod_ptr */
3401 NULL, /* const time_t *unmod_ptr */
3402 false, /* bool high_precision_time */
3403 NULL, /* const char *if_match */
3404 NULL, /* const char *if_nomatch */
3405 RGWRados::ATTRSMOD_NONE,
3406 true, /* bool copy_if_newer */
3407 state->attrset,
3408 RGWObjCategory::Main,
3409 0, /* uint64_t olh_epoch */
3410 real_time(), /* time_t delete_at */
3411 NULL, /* string *version_id */
3412 NULL, /* string *ptag */
3413 NULL, /* string *petag */
3414 NULL, /* void (*progress_cb)(off_t, void *) */
3415 NULL); /* void *progress_data */
3416 if (r == -ECANCELED || r == -ENOENT) {
3417 /* Has already been overwritten, meaning another rgw process already
3418 * copied it out */
3419 return 0;
3420 }
3421
3422 return r;
3423 }
3424
3425 int RGWRados::swift_versioning_restore(RGWSysObjectCtx& sysobj_ctx,
3426 RGWObjectCtx& obj_ctx,
3427 const rgw_user& user,
3428 RGWBucketInfo& bucket_info,
3429 rgw_obj& obj,
3430 bool& restored) /* out */
3431 {
3432 if (! swift_versioning_enabled(bucket_info)) {
3433 return 0;
3434 }
3435
3436 /* Bucket info of the bucket that stores previous versions of our object. */
3437 RGWBucketInfo archive_binfo;
3438
3439 int ret = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant,
3440 bucket_info.swift_ver_location, archive_binfo,
3441 nullptr, nullptr);
3442 if (ret < 0) {
3443 return ret;
3444 }
3445
3446 /* Abort the operation if the bucket storing our archive belongs to someone
3447 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3448 * into consideration. For we can live with that.
3449 *
3450 * TODO: delegate this check to un upper layer and compare with ACLs. */
3451 if (bucket_info.owner != archive_binfo.owner) {
3452 return -EPERM;
3453 }
3454
3455 /* This code will be executed on latest version of the object. */
3456 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
3457 std::string no_zone;
3458
3459 /* We don't support object versioning of Swift API on those buckets that
3460 * are already versioned using the S3 mechanism. This affects also bucket
3461 * storing archived objects. Otherwise the delete operation would create
3462 * a deletion marker. */
3463 if (archive_binfo.versioned()) {
3464 restored = false;
3465 return -ERR_PRECONDITION_FAILED;
3466 }
3467
3468 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3469 * irrelevant and may be safely skipped. */
3470 std::map<std::string, ceph::bufferlist> no_attrs;
3471
3472 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
3473
3474 if (bucket_info.versioning_enabled()){
3475 gen_rand_obj_instance_name(&obj);
3476 }
3477
3478 obj_ctx.set_atomic(archive_obj);
3479 obj_ctx.set_atomic(obj);
3480
3481 int ret = copy_obj(obj_ctx,
3482 user,
3483 nullptr, /* req_info *info */
3484 no_zone,
3485 obj, /* dest obj */
3486 archive_obj, /* src obj */
3487 bucket_info, /* dest bucket info */
3488 archive_binfo, /* src bucket info */
3489 bucket_info.placement_rule, /* placement_rule */
3490 nullptr, /* time_t *src_mtime */
3491 nullptr, /* time_t *mtime */
3492 nullptr, /* const time_t *mod_ptr */
3493 nullptr, /* const time_t *unmod_ptr */
3494 false, /* bool high_precision_time */
3495 nullptr, /* const char *if_match */
3496 nullptr, /* const char *if_nomatch */
3497 RGWRados::ATTRSMOD_NONE,
3498 true, /* bool copy_if_newer */
3499 no_attrs,
3500 RGWObjCategory::Main,
3501 0, /* uint64_t olh_epoch */
3502 real_time(), /* time_t delete_at */
3503 nullptr, /* string *version_id */
3504 nullptr, /* string *ptag */
3505 nullptr, /* string *petag */
3506 nullptr, /* void (*progress_cb)(off_t, void *) */
3507 nullptr); /* void *progress_data */
3508 if (ret == -ECANCELED || ret == -ENOENT) {
3509 /* Has already been overwritten, meaning another rgw process already
3510 * copied it out */
3511 return 0;
3512 } else if (ret < 0) {
3513 return ret;
3514 } else {
3515 restored = true;
3516 }
3517
3518 /* Need to remove the archived copy. */
3519 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
3520 archive_binfo.versioning_status());
3521
3522 return ret;
3523 };
3524
3525 const std::string& obj_name = obj.get_oid();
3526 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
3527 % obj_name);
3528
3529 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
3530 handler);
3531 }
3532
// Write the head object's metadata (xattrs, optional manifest, optional
// data) in a single ObjectWriteOperation and record the change in the
// bucket index through the supplied index op.
//
// @size, @accounted_size: forwarded to index_op->complete(); accounted_size
//   is also reported to the quota handler (unless meta.completeMultipart).
// @attrs: xattrs to set on the head object. Entries with an empty value are
//   skipped. An RGW_ATTR_MANIFEST entry is erased from @attrs when
//   meta.manifest is set (the manifest is encoded from meta.manifest).
// @assume_noent: forwarded to target->get_state(). When true and the rados
//   operation fails with -EEXIST, the cached state is invalidated and
//   -EEXIST is returned without cancelling the index op.
// @modify_tail: forwarded to target->prepare_atomic_modification().
// @_index_op: must point at a RGWRados::Bucket::UpdateIndex; it is
//   static_cast without any check.
//
// Returns 0 on success or a negative error code. On the done_cancel path
// the index op is cancelled, meta.canceled is set to true and race errors
// (-ECANCELED/-ENOENT/-EEXIST) are remapped depending on
// meta.if_match / meta.if_nomatch.
3533 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
3534 map<string, bufferlist>& attrs,
3535 bool assume_noent, bool modify_tail,
3536 void *_index_op)
3537 {
3538 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
3539 RGWRados *store = target->get_store();
3540
3541 ObjectWriteOperation op;
// req_id is only declared when WITH_LTTNG is defined; it is referenced solely
// from the tracepoint() calls below.
// NOTE(review): presumably tracepoint() expands to nothing without
// WITH_LTTNG - confirm.
3542 #ifdef WITH_LTTNG
3543 const struct req_state* s = get_req_state();
3544 string req_id;
3545 if (!s) {
3546 // fake req_id
3547 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
3548 } else {
3549 req_id = s->req_id;
3550 }
3551 #endif
3552
3553 RGWObjState *state;
3554 int r = target->get_state(&state, false, assume_noent);
3555 if (r < 0)
3556 return r;
3557
3558 rgw_obj& obj = target->get_obj();
3559
3560 if (obj.get_oid().empty()) {
3561 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
3562 return -EIO;
3563 }
3564
3565 rgw_rados_ref ref;
3566 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
3567 if (r < 0)
3568 return r;
3569
// Snapshot is_olh now: state is invalidated (and set to NULL) before the
// versioned_op decision is consumed further down.
3570 bool is_olh = state->is_olh;
3571
3572 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
3573
// Use the caller-supplied tag if any, otherwise fall back to the index op's
// tag when that one is non-empty.
3574 const string *ptag = meta.ptag;
3575 if (!ptag && !index_op->get_optag()->empty()) {
3576 ptag = index_op->get_optag();
3577 }
3578 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
3579 if (r < 0)
3580 return r;
3581
3582 if (real_clock::is_zero(meta.set_mtime)) {
3583 meta.set_mtime = real_clock::now();
3584 }
3585
// Apply the bucket's default object-lock retention when the caller did not
// supply RGW_ATTR_OBJECT_RETENTION.
// NOTE(review): this tests meta.flags == PUT_OBJ_CREATE (exact equality),
// whereas reset_obj above tests the bit with '&' - confirm this is intended.
3586 if (target->bucket_info.obj_lock_enabled() && target->bucket_info.obj_lock.has_rule() && meta.flags == PUT_OBJ_CREATE) {
3587 auto iter = attrs.find(RGW_ATTR_OBJECT_RETENTION);
3588 if (iter == attrs.end()) {
3589 real_time lock_until_date = target->bucket_info.obj_lock.get_lock_until_date(meta.set_mtime);
3590 string mode = target->bucket_info.obj_lock.get_mode();
3591 RGWObjectRetention obj_retention(mode, lock_until_date);
3592 bufferlist bl;
3593 obj_retention.encode(bl);
3594 op.setxattr(RGW_ATTR_OBJECT_RETENTION, bl);
3595 }
3596 }
3597
3598 if (state->is_olh) {
3599 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3600 }
3601
3602 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3603 op.mtime2(&mtime_ts);
3604
3605 if (meta.data) {
3606 /* if we want to overwrite the data, we also want to overwrite the
3607 xattrs, so just remove the object */
// (the code below issues write_full() on the op rather than an explicit
// remove)
3608 op.write_full(*meta.data);
3609 }
3610
// These are captured while walking attrs and later handed to the index op.
3611 string etag;
3612 string content_type;
3613 bufferlist acl_bl;
3614 string storage_class;
3615
3616 map<string, bufferlist>::iterator iter;
3617 if (meta.rmattrs) {
3618 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3619 const string& name = iter->first;
3620 op.rmxattr(name.c_str());
3621 }
3622 }
3623
3624 if (meta.manifest) {
3625 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3626
3627 /* remove existing manifest attr */
3628 iter = attrs.find(RGW_ATTR_MANIFEST);
3629 if (iter != attrs.end())
3630 attrs.erase(iter);
3631
3632 bufferlist bl;
3633 encode(*meta.manifest, bl);
3634 op.setxattr(RGW_ATTR_MANIFEST, bl);
3635 }
3636
3637 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3638 const string& name = iter->first;
3639 bufferlist& bl = iter->second;
3640
// empty attribute values are not written
3641 if (!bl.length())
3642 continue;
3643
3644 op.setxattr(name.c_str(), bl);
3645
3646 if (name.compare(RGW_ATTR_ETAG) == 0) {
3647 etag = rgw_bl_str(bl);
3648 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
3649 content_type = rgw_bl_str(bl);
3650 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3651 acl_bl = bl;
3652 }
3653 }
3654 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3655 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3656 }
3657
// Default the source zone to this zone's short id when the caller did not
// provide one.
3658 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3659 bufferlist bl;
3660 encode(store->svc.zone->get_zone_short_id(), bl);
3661 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3662 }
3663
3664 if (!storage_class.empty()) {
3665 bufferlist bl;
3666 bl.append(storage_class);
3667 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3668 }
3669
// nothing was queued on the op: nothing to write
3670 if (!op.size())
3671 return 0;
3672
// Declared (uninitialized) ahead of the 'goto done_cancel' jumps below.
3673 uint64_t epoch;
3674 int64_t poolid;
3675 bool orig_exists;
3676 uint64_t orig_size;
3677
3678 if (!reset_obj) { //Multipart upload, it has immutable head.
3679 orig_exists = false;
3680 orig_size = 0;
3681 } else {
3682 orig_exists = state->exists;
3683 orig_size = state->accounted_size;
3684 }
3685
3686 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3687 !obj.key.instance.empty();
3688
3689 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3690
3691 if (versioned_op) {
3692 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3693 }
3694
// The index op may already be prepared, e.g. when write_meta() calls this
// function a second time with the same index op after -EEXIST.
3695 if (!index_op->is_prepared()) {
3696 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
3697 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
3698 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
3699 if (r < 0)
3700 return r;
3701 }
3702
3703 tracepoint(rgw_rados, operate_enter, req_id.c_str());
3704 r = ref.ioctx.operate(ref.obj.oid, &op);
3705 tracepoint(rgw_rados, operate_exit, req_id.c_str());
3706 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3707 or -ENOENT if was removed, or -EEXIST if it did not exist
3708 before and now it does */
// This early return skips done_cancel, so index_op->cancel() is not called
// and the index op stays prepared.
3709 if (r == -EEXIST && assume_noent) {
3710 target->invalidate_state();
3711 return r;
3712 }
3713 goto done_cancel;
3714 }
3715
3716 epoch = ref.ioctx.get_last_version();
3717 poolid = ref.ioctx.get_id();
3718
// A failure here is only logged; r is overwritten by index_op->complete().
3719 r = target->complete_atomic_modification();
3720 if (r < 0) {
3721 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
3722 }
3723
3724 tracepoint(rgw_rados, complete_enter, req_id.c_str());
3725 r = index_op->complete(poolid, epoch, size, accounted_size,
3726 meta.set_mtime, etag, content_type,
3727 storage_class, &acl_bl,
3728 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3729 tracepoint(rgw_rados, complete_exit, req_id.c_str());
3730 if (r < 0)
3731 goto done_cancel;
3732
3733 if (meta.mtime) {
3734 *meta.mtime = meta.set_mtime;
3735 }
3736
3737 /* note that index_op was using state so we couldn't invalidate it earlier */
3738 target->invalidate_state();
3739 state = NULL;
3740
3741 if (versioned_op && meta.olh_epoch) {
3742 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, meta.zones_trace);
3743 if (r < 0) {
3744 return r;
3745 }
3746 }
3747
3748 if (!real_clock::is_zero(meta.delete_at)) {
3749 rgw_obj_index_key obj_key;
3750 obj.key.get_index_key(&obj_key);
3751
3752 r = store->objexp_hint_add(meta.delete_at,
3753 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
3754 if (r < 0) {
3755 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
3756 /* ignoring error, nothing we can do at this point */
3757 }
3758 }
3759 meta.canceled = false;
3760
3761 /* update quota cache */
// For meta.completeMultipart the added-bytes argument is 0 instead of
// accounted_size.
3762 if (meta.completeMultipart){
3763 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3764 0, orig_size);
3765 }
3766 else {
3767 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3768 accounted_size, orig_size);
3769 }
3770 return 0;
3771
// Failure path after the index op was prepared: cancel it (a cancel failure
// is only logged) and translate the error held in r.
3772 done_cancel:
3773 int ret = index_op->cancel();
3774 if (ret < 0) {
3775 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
3776 }
3777
3778 meta.canceled = true;
3779
3780 /* we lost in a race. There are a few options:
3781 * - existing object was rewritten (ECANCELED)
3782 * - non existing object was created (EEXIST)
3783 * - object was removed (ENOENT)
3784 * should treat it as a success
3785 */
3786 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3787 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3788 r = 0;
3789 }
3790 } else {
// Conditional write: only the "*" wildcard forms are remapped here; any
// other if_match/if_nomatch value leaves r untouched.
3791 if (meta.if_match != NULL) {
3792 // only overwrite existing object
3793 if (strcmp(meta.if_match, "*") == 0) {
3794 if (r == -ENOENT) {
3795 r = -ERR_PRECONDITION_FAILED;
3796 } else if (r == -ECANCELED) {
3797 r = 0;
3798 }
3799 }
3800 }
3801
3802 if (meta.if_nomatch != NULL) {
3803 // only create a new object
3804 if (strcmp(meta.if_nomatch, "*") == 0) {
3805 if (r == -EEXIST) {
3806 r = -ERR_PRECONDITION_FAILED;
3807 } else if (r == -ENOENT) {
3808 r = 0;
3809 }
3810 }
3811 }
3812 }
3813
3814 return r;
3815 }
3816
3817 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
3818 map<string, bufferlist>& attrs)
3819 {
3820 RGWBucketInfo& bucket_info = target->get_bucket_info();
3821
3822 RGWRados::Bucket bop(target->get_store(), bucket_info);
3823 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
3824 index_op.set_zones_trace(meta.zones_trace);
3825
3826 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3827 int r;
3828 if (assume_noent) {
3829 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
3830 if (r == -EEXIST) {
3831 assume_noent = false;
3832 }
3833 }
3834 if (!assume_noent) {
3835 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
3836 }
3837 return r;
3838 }
3839
3840 class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
3841 {
3842 CephContext* cct;
3843 rgw_obj obj;
3844 rgw::putobj::DataProcessor *filter;
3845 boost::optional<RGWPutObj_Compress>& compressor;
3846 boost::optional<rgw::putobj::ChunkProcessor> buffering;
3847 CompressorRef& plugin;
3848 rgw::putobj::ObjectProcessor *processor;
3849 void (*progress_cb)(off_t, void *);
3850 void *progress_data;
3851 bufferlist extra_data_bl;
3852 uint64_t extra_data_left{0};
3853 bool need_to_process_attrs{true};
3854 uint64_t data_len{0};
3855 map<string, bufferlist> src_attrs;
3856 uint64_t ofs{0};
3857 uint64_t lofs{0}; /* logical ofs */
3858 std::function<int(const map<string, bufferlist>&)> attrs_handler;
3859 public:
3860 RGWRadosPutObj(CephContext* cct,
3861 CompressorRef& plugin,
3862 boost::optional<RGWPutObj_Compress>& compressor,
3863 rgw::putobj::ObjectProcessor *p,
3864 void (*_progress_cb)(off_t, void *),
3865 void *_progress_data,
3866 std::function<int(const map<string, bufferlist>&)> _attrs_handler) :
3867 cct(cct),
3868 filter(p),
3869 compressor(compressor),
3870 plugin(plugin),
3871 processor(p),
3872 progress_cb(_progress_cb),
3873 progress_data(_progress_data),
3874 attrs_handler(_attrs_handler) {}
3875
3876 int process_attrs(void) {
3877 if (extra_data_bl.length()) {
3878 JSONParser jp;
3879 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3880 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3881 return -EIO;
3882 }
3883
3884 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3885
3886 src_attrs.erase(RGW_ATTR_COMPRESSION);
3887 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3888
3889 // filter out olh attributes
3890 auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
3891 while (iter != src_attrs.end()) {
3892 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3893 break;
3894 }
3895 iter = src_attrs.erase(iter);
3896 }
3897 }
3898
3899 int ret = attrs_handler(src_attrs);
3900 if (ret < 0) {
3901 return ret;
3902 }
3903
3904 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3905 //do not compress if object is encrypted
3906 compressor = boost::in_place(cct, plugin, filter);
3907 // add a filter that buffers data so we don't try to compress tiny blocks.
3908 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3909 // compression ratio
3910 constexpr unsigned buffer_size = 512 * 1024;
3911 buffering = boost::in_place(&*compressor, buffer_size);
3912 filter = &*buffering;
3913 }
3914
3915 need_to_process_attrs = false;
3916
3917 return 0;
3918 }
3919
3920 int handle_data(bufferlist& bl, bool *pause) override {
3921 if (progress_cb) {
3922 progress_cb(data_len, progress_data);
3923 }
3924 if (extra_data_left) {
3925 uint64_t extra_len = bl.length();
3926 if (extra_len > extra_data_left)
3927 extra_len = extra_data_left;
3928
3929 bufferlist extra;
3930 bl.splice(0, extra_len, &extra);
3931 extra_data_bl.append(extra);
3932
3933 extra_data_left -= extra_len;
3934 if (extra_data_left == 0) {
3935 int res = process_attrs();
3936 if (res < 0)
3937 return res;
3938 }
3939 ofs += extra_len;
3940 if (bl.length() == 0) {
3941 return 0;
3942 }
3943 }
3944 if (need_to_process_attrs) {
3945 /* need to call process_attrs() even if we don't get any attrs,
3946 * need it to call attrs_handler().
3947 */
3948 int res = process_attrs();
3949 if (res < 0) {
3950 return res;
3951 }
3952 }
3953
3954 ceph_assert(uint64_t(ofs) >= extra_data_len);
3955
3956 uint64_t size = bl.length();
3957 ofs += size;
3958
3959 const uint64_t lofs = data_len;
3960 data_len += size;
3961
3962 return filter->process(std::move(bl), lofs);
3963 }
3964
3965 int flush() {
3966 return filter->process({}, data_len);
3967 }
3968
3969 bufferlist& get_extra_data() { return extra_data_bl; }
3970
3971 map<string, bufferlist>& get_attrs() { return src_attrs; }
3972
3973 void set_extra_data_len(uint64_t len) override {
3974 extra_data_left = len;
3975 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
3976 }
3977
3978 uint64_t get_data_len() {
3979 return data_len;
3980 }
3981 };
3982
3983 /*
3984 * prepare attrset depending on attrs_mod.
3985 */
3986 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3987 map<string, bufferlist>& attrs,
3988 RGWRados::AttrsMod attrs_mod)
3989 {
3990 switch (attrs_mod) {
3991 case RGWRados::ATTRSMOD_NONE:
3992 attrs = src_attrs;
3993 break;
3994 case RGWRados::ATTRSMOD_REPLACE:
3995 if (!attrs[RGW_ATTR_ETAG].length()) {
3996 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3997 }
3998 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3999 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
4000 if (ttiter != src_attrs.end()) {
4001 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
4002 }
4003 }
4004 break;
4005 case RGWRados::ATTRSMOD_MERGE:
4006 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
4007 if (attrs.find(it->first) == attrs.end()) {
4008 attrs[it->first] = it->second;
4009 }
4010 }
4011 break;
4012 }
4013 }
4014
4015 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj)
4016 {
4017 map<string, bufferlist> attrset;
4018
4019 real_time mtime;
4020 uint64_t obj_size;
4021 RGWObjectCtx rctx(this);
4022
4023 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
4024 RGWRados::Object::Read read_op(&op_target);
4025
4026 read_op.params.attrs = &attrset;
4027 read_op.params.lastmod = &mtime;
4028 read_op.params.obj_size = &obj_size;
4029
4030 int ret = read_op.prepare();
4031 if (ret < 0)
4032 return ret;
4033
4034 attrset.erase(RGW_ATTR_ID_TAG);
4035 attrset.erase(RGW_ATTR_TAIL_TAG);
4036
4037 return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
4038 read_op, obj_size - 1, obj, NULL, mtime, attrset,
4039 0, real_time(), NULL);
4040 }
4041
4042 struct obj_time_weight {
4043 real_time mtime;
4044 uint32_t zone_short_id;
4045 uint64_t pg_ver;
4046 bool high_precision;
4047
4048 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
4049
4050 bool compare_low_precision(const obj_time_weight& rhs) {
4051 struct timespec l = ceph::real_clock::to_timespec(mtime);
4052 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
4053 l.tv_nsec = 0;
4054 r.tv_nsec = 0;
4055 if (l > r) {
4056 return false;
4057 }
4058 if (l < r) {
4059 return true;
4060 }
4061 if (!zone_short_id || !rhs.zone_short_id) {
4062 /* don't compare zone ids, if one wasn't provided */
4063 return false;
4064 }
4065 if (zone_short_id != rhs.zone_short_id) {
4066 return (zone_short_id < rhs.zone_short_id);
4067 }
4068 return (pg_ver < rhs.pg_ver);
4069
4070 }
4071
4072 bool operator<(const obj_time_weight& rhs) {
4073 if (!high_precision || !rhs.high_precision) {
4074 return compare_low_precision(rhs);
4075 }
4076 if (mtime > rhs.mtime) {
4077 return false;
4078 }
4079 if (mtime < rhs.mtime) {
4080 return true;
4081 }
4082 if (!zone_short_id || !rhs.zone_short_id) {
4083 /* don't compare zone ids, if one wasn't provided */
4084 return false;
4085 }
4086 if (zone_short_id != rhs.zone_short_id) {
4087 return (zone_short_id < rhs.zone_short_id);
4088 }
4089 return (pg_ver < rhs.pg_ver);
4090 }
4091
4092 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
4093 mtime = _mtime;
4094 zone_short_id = _short_id;
4095 pg_ver = _pg_ver;
4096 }
4097
4098 void init(RGWObjState *state) {
4099 mtime = state->mtime;
4100 zone_short_id = state->zone_short_id;
4101 pg_ver = state->pg_ver;
4102 }
4103 };
4104
4105 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
4106 out << o.mtime;
4107
4108 if (o.zone_short_id != 0 || o.pg_ver != 0) {
4109 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
4110 }
4111
4112 return out;
4113 }
4114
4115 class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
4116 bufferlist extra_data;
4117 public:
4118 RGWGetExtraDataCB() {}
4119 int handle_data(bufferlist& bl, bool *pause) override {
4120 int bl_len = (int)bl.length();
4121 if (extra_data.length() < extra_data_len) {
4122 off_t max = extra_data_len - extra_data.length();
4123 if (max > bl_len) {
4124 max = bl_len;
4125 }
4126 bl.splice(0, max, &extra_data);
4127 }
4128 return bl_len;
4129 }
4130
4131 bufferlist& get_extra_data() {
4132 return extra_data;
4133 }
4134 };
4135
4136 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
4137 const rgw_user& user_id,
4138 req_info *info,
4139 const string& source_zone,
4140 rgw_obj& src_obj,
4141 RGWBucketInfo& src_bucket_info,
4142 real_time *src_mtime,
4143 uint64_t *psize,
4144 const real_time *mod_ptr,
4145 const real_time *unmod_ptr,
4146 bool high_precision_time,
4147 const char *if_match,
4148 const char *if_nomatch,
4149 map<string, bufferlist> *pattrs,
4150 map<string, string> *pheaders,
4151 string *version_id,
4152 string *ptag,
4153 string *petag)
4154 {
4155 /* source is in a different zonegroup, copy from there */
4156
4157 RGWRESTStreamRWRequest *in_stream_req;
4158 string tag;
4159 map<string, bufferlist> src_attrs;
4160 append_rand_alpha(cct, tag, tag, 32);
4161 obj_time_weight set_mtime_weight;
4162 set_mtime_weight.high_precision = high_precision_time;
4163
4164 RGWRESTConn *conn;
4165 if (source_zone.empty()) {
4166 if (src_bucket_info.zonegroup.empty()) {
4167 /* source is in the master zonegroup */
4168 conn = svc.zone->get_master_conn();
4169 } else {
4170 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
4171 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
4172 if (iter == zonegroup_conn_map.end()) {
4173 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
4174 return -ENOENT;
4175 }
4176 conn = iter->second;
4177 }
4178 } else {
4179 auto& zone_conn_map = svc.zone->get_zone_conn_map();
4180 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
4181 if (iter == zone_conn_map.end()) {
4182 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
4183 return -ENOENT;
4184 }
4185 conn = iter->second;
4186 }
4187
4188 RGWGetExtraDataCB cb;
4189 map<string, string> req_headers;
4190 real_time set_mtime;
4191
4192 const real_time *pmod = mod_ptr;
4193
4194 obj_time_weight dest_mtime_weight;
4195
4196 constexpr bool prepend_meta = true;
4197 constexpr bool get_op = true;
4198 constexpr bool rgwx_stat = true;
4199 constexpr bool sync_manifest = true;
4200 constexpr bool skip_decrypt = true;
4201 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
4202 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
4203 prepend_meta, get_op, rgwx_stat,
4204 sync_manifest, skip_decrypt,
4205 true, &cb, &in_stream_req);
4206 if (ret < 0) {
4207 return ret;
4208 }
4209
4210 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, nullptr, pheaders);
4211 if (ret < 0) {
4212 return ret;
4213 }
4214
4215 bufferlist& extra_data_bl = cb.get_extra_data();
4216 if (extra_data_bl.length()) {
4217 JSONParser jp;
4218 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
4219 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
4220 return -EIO;
4221 }
4222
4223 JSONDecoder::decode_json("attrs", src_attrs, &jp);
4224
4225 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
4226 }
4227
4228 if (src_mtime) {
4229 *src_mtime = set_mtime;
4230 }
4231
4232 if (petag) {
4233 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
4234 if (iter != src_attrs.end()) {
4235 bufferlist& etagbl = iter->second;
4236 *petag = etagbl.to_str();
4237 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
4238 *petag = petag->substr(0, petag->size() - 1);
4239 }
4240 }
4241 }
4242
4243 if (pattrs) {
4244 *pattrs = std::move(src_attrs);
4245 }
4246
4247 return 0;
4248 }
4249
4250 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
4251 const rgw_user& user_id,
4252 req_info *info,
4253 const string& source_zone,
4254 const rgw_obj& dest_obj,
4255 const rgw_obj& src_obj,
4256 RGWBucketInfo& dest_bucket_info,
4257 RGWBucketInfo& src_bucket_info,
4258 std::optional<rgw_placement_rule> dest_placement_rule,
4259 real_time *src_mtime,
4260 real_time *mtime,
4261 const real_time *mod_ptr,
4262 const real_time *unmod_ptr,
4263 bool high_precision_time,
4264 const char *if_match,
4265 const char *if_nomatch,
4266 AttrsMod attrs_mod,
4267 bool copy_if_newer,
4268 map<string, bufferlist>& attrs,
4269 RGWObjCategory category,
4270 std::optional<uint64_t> olh_epoch,
4271 real_time delete_at,
4272 string *ptag,
4273 string *petag,
4274 void (*progress_cb)(off_t, void *),
4275 void *progress_data,
4276 rgw_zone_set *zones_trace,
4277 std::optional<uint64_t>* bytes_transferred)
4278 {
4279 /* source is in a different zonegroup, copy from there */
4280
4281 RGWRESTStreamRWRequest *in_stream_req;
4282 string tag;
4283 int i;
4284 append_rand_alpha(cct, tag, tag, 32);
4285 obj_time_weight set_mtime_weight;
4286 set_mtime_weight.high_precision = high_precision_time;
4287 int ret;
4288
4289 rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
4290 using namespace rgw::putobj;
4291 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
4292 AtomicObjectProcessor processor(&aio, this, dest_bucket_info, ptail_rule, user_id,
4293 obj_ctx, dest_obj, olh_epoch, tag);
4294 RGWRESTConn *conn;
4295 auto& zone_conn_map = svc.zone->get_zone_conn_map();
4296 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
4297 if (source_zone.empty()) {
4298 if (dest_bucket_info.zonegroup.empty()) {
4299 /* source is in the master zonegroup */
4300 conn = svc.zone->get_master_conn();
4301 } else {
4302 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
4303 if (iter == zonegroup_conn_map.end()) {
4304 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
4305 return -ENOENT;
4306 }
4307 conn = iter->second;
4308 }
4309 } else {
4310 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
4311 if (iter == zone_conn_map.end()) {
4312 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
4313 return -ENOENT;
4314 }
4315 conn = iter->second;
4316 }
4317
4318 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
4319
4320 boost::optional<RGWPutObj_Compress> compressor;
4321 CompressorRef plugin;
4322
4323 rgw_placement_rule dest_rule;
4324 RGWRadosPutObj cb(cct, plugin, compressor, &processor, progress_cb, progress_data,
4325 [&](const map<string, bufferlist>& obj_attrs) {
4326 if (!ptail_rule) {
4327 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
4328 if (iter != obj_attrs.end()) {
4329 dest_rule.storage_class = iter->second.to_str();
4330 dest_rule.inherit_from(dest_bucket_info.placement_rule);
4331 processor.set_tail_placement(std::move(dest_rule));
4332 ptail_rule = &dest_rule;
4333 } else {
4334 ptail_rule = &dest_bucket_info.placement_rule;
4335 }
4336 }
4337 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
4338 if (compression_type != "none") {
4339 plugin = Compressor::create(cct, compression_type);
4340 if (!plugin) {
4341 ldout(cct, 1) << "Cannot load plugin for compression type "
4342 << compression_type << dendl;
4343 }
4344 }
4345
4346 int ret = processor.prepare();
4347 if (ret < 0) {
4348 return ret;
4349 }
4350 return 0;
4351 });
4352
4353 string etag;
4354 real_time set_mtime;
4355 uint64_t expected_size = 0;
4356
4357 RGWObjState *dest_state = NULL;
4358
4359 const real_time *pmod = mod_ptr;
4360
4361 obj_time_weight dest_mtime_weight;
4362
4363 if (copy_if_newer) {
4364 /* need to get mtime for destination */
4365 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
4366 if (ret < 0)
4367 goto set_err_state;
4368
4369 if (!real_clock::is_zero(dest_state->mtime)) {
4370 dest_mtime_weight.init(dest_state);
4371 pmod = &dest_mtime_weight.mtime;
4372 }
4373 }
4374
4375 static constexpr bool prepend_meta = true;
4376 static constexpr bool get_op = true;
4377 static constexpr bool rgwx_stat = false;
4378 static constexpr bool sync_manifest = true;
4379 static constexpr bool skip_decrypt = true;
4380 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
4381 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
4382 prepend_meta, get_op, rgwx_stat,
4383 sync_manifest, skip_decrypt,
4384 true,
4385 &cb, &in_stream_req);
4386 if (ret < 0) {
4387 goto set_err_state;
4388 }
4389
4390 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
4391 &expected_size, nullptr, nullptr);
4392 if (ret < 0) {
4393 goto set_err_state;
4394 }
4395 ret = cb.flush();
4396 if (ret < 0) {
4397 goto set_err_state;
4398 }
4399 if (cb.get_data_len() != expected_size) {
4400 ret = -EIO;
4401 ldout(cct, 0) << "ERROR: object truncated during fetching, expected "
4402 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
4403 goto set_err_state;
4404 }
4405 if (compressor && compressor->is_compressed()) {
4406 bufferlist tmp;
4407 RGWCompressionInfo cs_info;
4408 cs_info.compression_type = plugin->get_type_name();
4409 cs_info.orig_size = cb.get_data_len();
4410 cs_info.blocks = move(compressor->get_compression_blocks());
4411 encode(cs_info, tmp);
4412 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
4413 }
4414
4415 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4416 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
4417 } else {
4418 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
4419 if (iter != cb.get_attrs().end()) {
4420 try {
4421 decode(delete_at, iter->second);
4422 } catch (buffer::error& err) {
4423 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
4424 }
4425 }
4426 }
4427
4428 if (src_mtime) {
4429 *src_mtime = set_mtime;
4430 }
4431
4432 if (petag) {
4433 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
4434 if (iter != cb.get_attrs().end()) {
4435 *petag = iter->second.to_str();
4436 }
4437 }
4438
4439 //erase the append attr
4440 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
4441
4442 if (source_zone.empty()) {
4443 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
4444 } else {
4445 attrs = cb.get_attrs();
4446 }
4447
4448 if (copy_if_newer) {
4449 uint64_t pg_ver = 0;
4450 auto i = attrs.find(RGW_ATTR_PG_VER);
4451 if (i != attrs.end() && i->second.length() > 0) {
4452 auto iter = i->second.cbegin();
4453 try {
4454 decode(pg_ver, iter);
4455 } catch (buffer::error& err) {
4456 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
4457 /* non critical error */
4458 }
4459 }
4460 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
4461 }
4462
4463 #define MAX_COMPLETE_RETRY 100
4464 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
4465 bool canceled = false;
4466 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
4467 attrs, delete_at, nullptr, nullptr, nullptr,
4468 zones_trace, &canceled);
4469 if (ret < 0) {
4470 goto set_err_state;
4471 }
4472 if (copy_if_newer && canceled) {
4473 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
4474 obj_ctx.invalidate(dest_obj); /* object was overwritten */
4475 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
4476 if (ret < 0) {
4477 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
4478 goto set_err_state;
4479 }
4480 dest_mtime_weight.init(dest_state);
4481 dest_mtime_weight.high_precision = high_precision_time;
4482 if (!dest_state->exists ||
4483 dest_mtime_weight < set_mtime_weight) {
4484 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4485 continue;
4486 } else {
4487 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4488 }
4489 }
4490 break;
4491 }
4492
4493 if (i == MAX_COMPLETE_RETRY) {
4494 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
4495 ret = -EIO;
4496 goto set_err_state;
4497 }
4498
4499 if (bytes_transferred) {
4500 *bytes_transferred = cb.get_data_len();
4501 }
4502 return 0;
4503 set_err_state:
4504 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
4505 // we may have already fetched during sync of OP_ADD, but were waiting
4506 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4507 if (olh_epoch && *olh_epoch > 0) {
4508 constexpr bool log_data_change = true;
4509 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
4510 *olh_epoch, real_time(), false, zones_trace, log_data_change);
4511 } else {
4512 // we already have the latest copy
4513 ret = 0;
4514 }
4515 }
4516 return ret;
4517 }
4518
4519
4520 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
4521 map<string, bufferlist>& src_attrs,
4522 RGWRados::Object::Read& read_op,
4523 const rgw_user& user_id,
4524 rgw_obj& dest_obj,
4525 real_time *mtime)
4526 {
4527 string etag;
4528
4529 RGWRESTStreamS3PutObj *out_stream_req;
4530
4531 auto rest_master_conn = svc.zone->get_master_conn();
4532
4533 int ret = rest_master_conn->put_obj_async(user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
4534 if (ret < 0) {
4535 return ret;
4536 }
4537
4538 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
4539 if (ret < 0) {
4540 delete out_stream_req;
4541 return ret;
4542 }
4543
4544 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
4545 if (ret < 0)
4546 return ret;
4547
4548 return 0;
4549 }
4550
4551 /**
4552 * Copy an object.
4553 * dest_obj: the object to copy into
4554 * src_obj: the object to copy from
4555 * attrs: usage depends on attrs_mod parameter
4556 * attrs_mod: the modification mode of the attrs, may have the following values:
4557 * ATTRSMOD_NONE - the attributes of the source object will be
4558 * copied without modifications, attrs parameter is ignored;
4559 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4560 * parameter, source object attributes are not copied;
4561 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4562 * are overwritten by values contained in attrs parameter.
4563 * err: stores any errors resulting from the get of the original object
4564 * Returns: 0 on success, -ERR# otherwise.
4565 */
4566 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4567 const rgw_user& user_id,
4568 req_info *info,
4569 const string& source_zone,
4570 rgw_obj& dest_obj,
4571 rgw_obj& src_obj,
4572 RGWBucketInfo& dest_bucket_info,
4573 RGWBucketInfo& src_bucket_info,
4574 const rgw_placement_rule& dest_placement,
4575 real_time *src_mtime,
4576 real_time *mtime,
4577 const real_time *mod_ptr,
4578 const real_time *unmod_ptr,
4579 bool high_precision_time,
4580 const char *if_match,
4581 const char *if_nomatch,
4582 AttrsMod attrs_mod,
4583 bool copy_if_newer,
4584 map<string, bufferlist>& attrs,
4585 RGWObjCategory category,
4586 uint64_t olh_epoch,
4587 real_time delete_at,
4588 string *version_id,
4589 string *ptag,
4590 string *petag,
4591 void (*progress_cb)(off_t, void *),
4592 void *progress_data)
4593 {
4594 int ret;
4595 uint64_t obj_size;
4596 rgw_obj shadow_obj = dest_obj;
4597 string shadow_oid;
4598
4599 bool remote_src;
4600 bool remote_dest;
4601
4602 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
4603 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
4604
4605 auto& zonegroup = svc.zone->get_zonegroup();
4606
4607 remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
4608 remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
4609
4610 if (remote_src && remote_dest) {
4611 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
4612 return -EINVAL;
4613 }
4614
4615 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
4616
4617 if (remote_src || !source_zone.empty()) {
4618 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
4619 dest_obj, src_obj, dest_bucket_info, src_bucket_info,
4620 dest_placement, src_mtime, mtime, mod_ptr,
4621 unmod_ptr, high_precision_time,
4622 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
4623 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data);
4624 }
4625
4626 map<string, bufferlist> src_attrs;
4627 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
4628 RGWRados::Object::Read read_op(&src_op_target);
4629
4630 read_op.conds.mod_ptr = mod_ptr;
4631 read_op.conds.unmod_ptr = unmod_ptr;
4632 read_op.conds.high_precision_time = high_precision_time;
4633 read_op.conds.if_match = if_match;
4634 read_op.conds.if_nomatch = if_nomatch;
4635 read_op.params.attrs = &src_attrs;
4636 read_op.params.lastmod = src_mtime;
4637 read_op.params.obj_size = &obj_size;
4638
4639 ret = read_op.prepare();
4640 if (ret < 0) {
4641 return ret;
4642 }
4643 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4644 // Current implementation does not follow S3 spec and even
4645 // may result in data corruption silently when copying
4646 // multipart objects acorss pools. So reject COPY operations
4647 //on encrypted objects before it is fully functional.
4648 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
4649 << " has not been implemented." << dendl;
4650 return -ERR_NOT_IMPLEMENTED;
4651 }
4652
4653 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4654 src_attrs.erase(RGW_ATTR_DELETE_AT);
4655
4656 set_copy_attrs(src_attrs, attrs, attrs_mod);
4657 attrs.erase(RGW_ATTR_ID_TAG);
4658 attrs.erase(RGW_ATTR_PG_VER);
4659 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4660 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4661 if (cmp != src_attrs.end())
4662 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4663
4664 RGWObjManifest manifest;
4665 RGWObjState *astate = NULL;
4666
4667 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
4668 if (ret < 0) {
4669 return ret;
4670 }
4671
4672 vector<rgw_raw_obj> ref_objs;
4673
4674 if (remote_dest) {
4675 /* dest is in a different zonegroup, copy it there */
4676 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
4677 }
4678 uint64_t max_chunk_size;
4679
4680 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
4681 if (ret < 0) {
4682 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
4683 return ret;
4684 }
4685
4686 rgw_pool src_pool;
4687 rgw_pool dest_pool;
4688
4689 const rgw_placement_rule *src_rule{nullptr};
4690
4691 if (astate->has_manifest) {
4692 src_rule = &astate->manifest.get_tail_placement().placement_rule;
4693 ldout(cct, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
4694 }
4695
4696 if (!src_rule || src_rule->empty()) {
4697 src_rule = &src_bucket_info.placement_rule;
4698 }
4699
4700 if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
4701 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
4702 return -EIO;
4703 }
4704
4705 if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
4706 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
4707 return -EIO;
4708 }
4709
4710 ldout(cct, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
4711 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4712
4713 bool copy_data = !astate->has_manifest ||
4714 (*src_rule != dest_placement) ||
4715 (src_pool != dest_pool);
4716
4717 bool copy_first = false;
4718 if (astate->has_manifest) {
4719 if (!astate->manifest.has_tail()) {
4720 copy_data = true;
4721 } else {
4722 uint64_t head_size = astate->manifest.get_head_size();
4723
4724 if (head_size > 0) {
4725 if (head_size > max_chunk_size) {
4726 copy_data = true;
4727 } else {
4728 copy_first = true;
4729 }
4730 }
4731 }
4732 }
4733
4734 if (petag) {
4735 const auto iter = attrs.find(RGW_ATTR_ETAG);
4736 if (iter != attrs.end()) {
4737 *petag = iter->second.to_str();
4738 }
4739 }
4740
4741 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
4742 attrs.erase(RGW_ATTR_TAIL_TAG);
4743 return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
4744 mtime, real_time(), attrs, olh_epoch, delete_at, petag);
4745 }
4746
4747 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
4748
4749 if (copy_first) { // we need to copy first chunk, not increase refcount
4750 ++miter;
4751 }
4752
4753 rgw_rados_ref ref;
4754 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
4755 if (ret < 0) {
4756 return ret;
4757 }
4758
4759 bufferlist first_chunk;
4760
4761 bool copy_itself = (dest_obj == src_obj);
4762 RGWObjManifest *pmanifest;
4763 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
4764
4765 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
4766 RGWRados::Object::Write write_op(&dest_op_target);
4767
4768 string tag;
4769
4770 if (ptag) {
4771 tag = *ptag;
4772 }
4773
4774 if (tag.empty()) {
4775 append_rand_alpha(cct, tag, tag, 32);
4776 }
4777
4778 if (!copy_itself) {
4779 attrs.erase(RGW_ATTR_TAIL_TAG);
4780 manifest = astate->manifest;
4781 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4782 if (tail_placement.bucket.name.empty()) {
4783 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
4784 }
4785 string ref_tag;
4786 for (; miter != astate->manifest.obj_end(); ++miter) {
4787 ObjectWriteOperation op;
4788 ref_tag = tag + '\0';
4789 cls_refcount_get(op, ref_tag, true);
4790 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
4791 ref.ioctx.locator_set_key(loc.loc);
4792
4793 ret = ref.ioctx.operate(loc.oid, &op);
4794 if (ret < 0) {
4795 goto done_ret;
4796 }
4797
4798 ref_objs.push_back(loc);
4799 }
4800
4801 pmanifest = &manifest;
4802 } else {
4803 pmanifest = &astate->manifest;
4804 /* don't send the object's tail for garbage collection */
4805 astate->keep_tail = true;
4806 }
4807
4808 if (copy_first) {
4809 ret = read_op.read(0, max_chunk_size, first_chunk);
4810 if (ret < 0) {
4811 goto done_ret;
4812 }
4813
4814 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
4815 } else {
4816 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
4817 }
4818
4819 write_op.meta.data = &first_chunk;
4820 write_op.meta.manifest = pmanifest;
4821 write_op.meta.ptag = &tag;
4822 write_op.meta.owner = dest_bucket_info.owner;
4823 write_op.meta.mtime = mtime;
4824 write_op.meta.flags = PUT_OBJ_CREATE;
4825 write_op.meta.category = category;
4826 write_op.meta.olh_epoch = olh_epoch;
4827 write_op.meta.delete_at = delete_at;
4828 write_op.meta.modify_tail = !copy_itself;
4829
4830 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
4831 if (ret < 0) {
4832 goto done_ret;
4833 }
4834
4835 return 0;
4836
4837 done_ret:
4838 if (!copy_itself) {
4839 vector<rgw_raw_obj>::iterator riter;
4840
4841 /* rollback reference */
4842 string ref_tag = tag + '\0';
4843 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4844 ObjectWriteOperation op;
4845 cls_refcount_put(op, ref_tag, true);
4846
4847 ref.ioctx.locator_set_key(riter->loc);
4848
4849 int r = ref.ioctx.operate(riter->oid, &op);
4850 if (r < 0) {
4851 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
4852 }
4853 }
4854 }
4855 return ret;
4856 }
4857
4858
4859 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
4860 RGWBucketInfo& dest_bucket_info,
4861 const rgw_placement_rule& dest_placement,
4862 RGWRados::Object::Read& read_op, off_t end,
4863 const rgw_obj& dest_obj,
4864 real_time *mtime,
4865 real_time set_mtime,
4866 map<string, bufferlist>& attrs,
4867 uint64_t olh_epoch,
4868 real_time delete_at,
4869 string *petag)
4870 {
4871 string tag;
4872 append_rand_alpha(cct, tag, tag, 32);
4873
4874 rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
4875 using namespace rgw::putobj;
4876 AtomicObjectProcessor processor(&aio, this, dest_bucket_info, &dest_placement,
4877 dest_bucket_info.owner, obj_ctx,
4878 dest_obj, olh_epoch, tag);
4879 int ret = processor.prepare();
4880 if (ret < 0)
4881 return ret;
4882
4883 off_t ofs = 0;
4884
4885 do {
4886 bufferlist bl;
4887 ret = read_op.read(ofs, end, bl);
4888 if (ret < 0) {
4889 ldout(cct, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
4890 return ret;
4891 }
4892
4893 uint64_t read_len = ret;
4894 ret = processor.process(std::move(bl), ofs);
4895 if (ret < 0) {
4896 return ret;
4897 }
4898
4899 ofs += read_len;
4900 } while (ofs <= end);
4901
4902 // flush
4903 ret = processor.process({}, ofs);
4904 if (ret < 0) {
4905 return ret;
4906 }
4907
4908 string etag;
4909 auto iter = attrs.find(RGW_ATTR_ETAG);
4910 if (iter != attrs.end()) {
4911 bufferlist& bl = iter->second;
4912 etag = bl.to_str();
4913 if (petag) {
4914 *petag = etag;
4915 }
4916 }
4917
4918 uint64_t accounted_size;
4919 {
4920 bool compressed{false};
4921 RGWCompressionInfo cs_info;
4922 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4923 if (ret < 0) {
4924 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
4925 return ret;
4926 }
4927 // pass original size if compressed
4928 accounted_size = compressed ? cs_info.orig_size : ofs;
4929 }
4930
4931 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
4932 nullptr, nullptr, nullptr, nullptr, nullptr);
4933 }
4934
4935 int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
4936 RGWBucketInfo& bucket_info,
4937 rgw_obj& obj,
4938 const rgw_placement_rule& placement_rule,
4939 const real_time& mtime,
4940 uint64_t olh_epoch)
4941 {
4942 map<string, bufferlist> attrs;
4943 real_time read_mtime;
4944 uint64_t obj_size;
4945
4946 RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
4947 RGWRados::Object::Read read_op(&op_target);
4948
4949 read_op.params.attrs = &attrs;
4950 read_op.params.lastmod = &read_mtime;
4951 read_op.params.obj_size = &obj_size;
4952
4953 int ret = read_op.prepare();
4954 if (ret < 0) {
4955 return ret;
4956 }
4957
4958 if (read_mtime != mtime) {
4959 /* raced */
4960 return -ECANCELED;
4961 }
4962
4963 ret = copy_obj_data(obj_ctx,
4964 bucket_info,
4965 placement_rule,
4966 read_op,
4967 obj_size - 1,
4968 obj,
4969 nullptr /* pmtime */,
4970 mtime,
4971 attrs,
4972 olh_epoch,
4973 real_time(),
4974 nullptr /* petag */);
4975 if (ret < 0) {
4976 return ret;
4977 }
4978
4979 return 0;
4980 }
4981
4982 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
4983 {
4984 std::vector<rgw_bucket_dir_entry> ent_list;
4985 rgw_obj_index_key marker;
4986 string prefix;
4987 bool is_truncated;
4988
4989 do {
4990 constexpr uint NUM_ENTRIES = 1000u;
4991 int r = cls_bucket_list_unordered(bucket_info,
4992 RGW_NO_SHARD,
4993 marker,
4994 prefix,
4995 NUM_ENTRIES,
4996 true,
4997 ent_list,
4998 &is_truncated,
4999 &marker);
5000 if (r < 0)
5001 return r;
5002
5003 string ns;
5004 for (auto const& dirent : ent_list) {
5005 rgw_obj_key obj;
5006
5007 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
5008 return -ENOTEMPTY;
5009 }
5010 } while (is_truncated);
5011
5012 return 0;
5013 }
5014
5015 /**
5016 * Delete a bucket.
5017 * bucket: the name of the bucket to delete
5018 * Returns 0 on success, -ERR# otherwise.
5019 */
5020 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
5021 {
5022 const rgw_bucket& bucket = bucket_info.bucket;
5023 librados::IoCtx index_ctx;
5024 map<int, string> bucket_objs;
5025 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5026 if (r < 0)
5027 return r;
5028
5029 if (check_empty) {
5030 r = check_bucket_empty(bucket_info);
5031 if (r < 0) {
5032 return r;
5033 }
5034 }
5035
5036 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
5037 if (r < 0)
5038 return r;
5039
5040 /* if the bucket is not synced we can remove the meta file */
5041 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
5042 RGWObjVersionTracker objv_tracker;
5043 r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker);
5044 if (r < 0) {
5045 return r;
5046 }
5047
5048 /* remove bucket index objects asynchronously by best effort */
5049 (void) CLSRGWIssueBucketIndexClean(index_ctx,
5050 bucket_objs,
5051 cct->_conf->rgw_bucket_index_max_aio)();
5052 }
5053
5054 return 0;
5055 }
5056
5057 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
5058 {
5059 RGWBucketInfo info;
5060 map<string, bufferlist> attrs;
5061 auto obj_ctx = svc.sysobj->init_obj_ctx();
5062 int r;
5063 if (bucket.bucket_id.empty()) {
5064 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
5065 } else {
5066 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
5067 }
5068 if (r < 0) {
5069 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
5070 return r;
5071 }
5072
5073 info.owner = owner.get_id();
5074
5075 r = put_bucket_instance_info(info, false, real_time(), &attrs);
5076 if (r < 0) {
5077 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
5078 return r;
5079 }
5080
5081 return 0;
5082 }
5083
5084
5085 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
5086 {
5087 int ret = 0;
5088
5089 vector<rgw_bucket>::iterator iter;
5090
5091 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
5092 rgw_bucket& bucket = *iter;
5093 if (enabled)
5094 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
5095 else
5096 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
5097
5098 RGWBucketInfo info;
5099 map<string, bufferlist> attrs;
5100 auto obj_ctx = svc.sysobj->init_obj_ctx();
5101 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
5102 if (r < 0) {
5103 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
5104 ret = r;
5105 continue;
5106 }
5107 if (enabled) {
5108 info.flags &= ~BUCKET_SUSPENDED;
5109 } else {
5110 info.flags |= BUCKET_SUSPENDED;
5111 }
5112
5113 r = put_bucket_instance_info(info, false, real_time(), &attrs);
5114 if (r < 0) {
5115 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
5116 ret = r;
5117 continue;
5118 }
5119 }
5120 return ret;
5121 }
5122
5123 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
5124 {
5125 RGWBucketInfo bucket_info;
5126 auto obj_ctx = svc.sysobj->init_obj_ctx();
5127 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
5128 if (ret < 0) {
5129 return ret;
5130 }
5131
5132 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
5133 return 0;
5134 }
5135
5136 int RGWRados::Object::complete_atomic_modification()
5137 {
5138 if (!state->has_manifest || state->keep_tail)
5139 return 0;
5140
5141 cls_rgw_obj_chain chain;
5142 store->update_gc_chain(obj, state->manifest, &chain);
5143
5144 if (chain.empty()) {
5145 return 0;
5146 }
5147
5148 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
5149 return store->gc->send_chain(chain, tag, false); // do it async
5150 }
5151
5152 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
5153 {
5154 RGWObjManifest::obj_iterator iter;
5155 rgw_raw_obj raw_head;
5156 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
5157 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
5158 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
5159 if (mobj == raw_head)
5160 continue;
5161 cls_rgw_obj_key key(mobj.oid);
5162 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
5163 }
5164 }
5165
5166 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
5167 {
5168 return gc->send_chain(chain, tag, sync);
5169 }
5170
5171 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
5172 librados::IoCtx& index_ctx,
5173 string& bucket_oid)
5174 {
5175 const rgw_bucket& bucket = bucket_info.bucket;
5176 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5177 if (r < 0)
5178 return r;
5179
5180 if (bucket.bucket_id.empty()) {
5181 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
5182 return -EIO;
5183 }
5184
5185 bucket_oid = dir_oid_prefix;
5186 bucket_oid.append(bucket.bucket_id);
5187
5188 return 0;
5189 }
5190
5191 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
5192 librados::IoCtx& index_ctx,
5193 string& bucket_oid_base) {
5194 const rgw_bucket& bucket = bucket_info.bucket;
5195 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5196 if (r < 0)
5197 return r;
5198
5199 if (bucket.bucket_id.empty()) {
5200 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
5201 return -EIO;
5202 }
5203
5204 bucket_oid_base = dir_oid_prefix;
5205 bucket_oid_base.append(bucket.bucket_id);
5206
5207 return 0;
5208
5209 }
5210
5211 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
5212 librados::IoCtx& index_ctx,
5213 map<int, string>& bucket_objs,
5214 int shard_id,
5215 map<int, string> *bucket_instance_ids) {
5216 string bucket_oid_base;
5217 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5218 if (ret < 0) {
5219 return ret;
5220 }
5221
5222 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
5223 if (bucket_instance_ids) {
5224 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
5225 }
5226 return 0;
5227 }
5228
5229 template<typename T>
5230 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5231 map<int, string>& oids, map<int, T>& bucket_objs,
5232 int shard_id, map<int, string> *bucket_instance_ids)
5233 {
5234 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
5235 if (ret < 0)
5236 return ret;
5237
5238 map<int, string>::const_iterator iter = oids.begin();
5239 for (; iter != oids.end(); ++iter) {
5240 bucket_objs[iter->first] = T();
5241 }
5242 return 0;
5243 }
5244
5245 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5246 const string& obj_key, string *bucket_obj, int *shard_id)
5247 {
5248 string bucket_oid_base;
5249 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5250 if (ret < 0)
5251 return ret;
5252
5253 RGWObjectCtx obj_ctx(this);
5254
5255 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
5256 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
5257 if (ret < 0) {
5258 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
5259 return ret;
5260 }
5261 return 0;
5262 }
5263
5264 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5265 int shard_id, string *bucket_obj)
5266 {
5267 string bucket_oid_base;
5268 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5269 if (ret < 0)
5270 return ret;
5271
5272 RGWObjectCtx obj_ctx(this);
5273
5274 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
5275 shard_id, bucket_obj);
5276 return 0;
5277 }
5278
5279 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
5280 map<RGWObjCategory, RGWStorageStats>& stats)
5281 {
5282 for (const auto& pair : header.stats) {
5283 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
5284 const rgw_bucket_category_stats& header_stats = pair.second;
5285
5286 RGWStorageStats& s = stats[category];
5287
5288 s.category = category;
5289 s.size += header_stats.total_size;
5290 s.size_rounded += header_stats.total_size_rounded;
5291 s.size_utilized += header_stats.actual_size;
5292 s.num_objects += header_stats.num_entries;
5293 }
5294 }
5295
5296 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
5297 map<RGWObjCategory, RGWStorageStats> *existing_stats,
5298 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
5299 {
5300 librados::IoCtx index_ctx;
5301 // key - bucket index object id
5302 // value - bucket index check OP returned result with the given bucket index object (shard)
5303 map<int, string> oids;
5304 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
5305
5306 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
5307 if (ret < 0) {
5308 return ret;
5309 }
5310
5311 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
5312 if (ret < 0) {
5313 return ret;
5314 }
5315
5316 // Aggregate results (from different shards if there is any)
5317 map<int, struct rgw_cls_check_index_ret>::iterator iter;
5318 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
5319 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
5320 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
5321 }
5322
5323 return 0;
5324 }
5325
5326 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
5327 {
5328 librados::IoCtx index_ctx;
5329 map<int, string> bucket_objs;
5330
5331 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5332 if (r < 0) {
5333 return r;
5334 }
5335
5336 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5337 }
5338
5339 int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
5340 {
5341 librados::IoCtx index_ctx;
5342 map<int, string> bucket_objs;
5343
5344 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5345 if (r < 0) {
5346 return r;
5347 }
5348
5349 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
5350 }
5351
5352 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
5353 {
5354 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5355 std::string oid, key;
5356 get_obj_bucket_and_oid_loc(obj, oid, key);
5357 if (!rctx)
5358 return 0;
5359
5360 RGWObjState *state = NULL;
5361
5362 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
5363 if (r < 0)
5364 return r;
5365
5366 if (!state->is_atomic) {
5367 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
5368 return -EINVAL;
5369 }
5370
5371 string tag;
5372
5373 if (state->tail_tag.length() > 0) {
5374 tag = state->tail_tag.c_str();
5375 } else if (state->obj_tag.length() > 0) {
5376 tag = state->obj_tag.c_str();
5377 } else {
5378 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
5379 return -EINVAL;
5380 }
5381
5382 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
5383
5384 return gc->defer_chain(tag, false);
5385 }
5386
5387 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
5388 {
5389 list<string> prefixes;
5390 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
5391 cls_rgw_remove_obj(op, prefixes);
5392 }
5393
5394 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
5395 {
5396 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
5397 }
5398
5399 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
5400 {
5401 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
5402 }
5403
5404
5405 /**
5406 * Delete an object.
5407 * bucket: name of the bucket storing the object
5408 * obj: name of the object to delete
5409 * Returns: 0 on success, -ERR# otherwise.
5410 */
5411 int RGWRados::Object::Delete::delete_obj()
5412 {
5413 RGWRados *store = target->get_store();
5414 rgw_obj& src_obj = target->get_obj();
5415 const string& instance = src_obj.key.instance;
5416 rgw_obj obj = src_obj;
5417
5418 if (instance == "null") {
5419 obj.key.instance.clear();
5420 }
5421
5422 bool explicit_marker_version = (!params.marker_version_id.empty());
5423
5424 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
5425 if (instance.empty() || explicit_marker_version) {
5426 rgw_obj marker = obj;
5427
5428 if (!params.marker_version_id.empty()) {
5429 if (params.marker_version_id != "null") {
5430 marker.key.set_instance(params.marker_version_id);
5431 }
5432 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
5433 store->gen_rand_obj_instance_name(&marker);
5434 }
5435
5436 result.version_id = marker.key.instance;
5437 if (result.version_id.empty())
5438 result.version_id = "null";
5439 result.delete_marker = true;
5440
5441 struct rgw_bucket_dir_entry_meta meta;
5442
5443 meta.owner = params.obj_owner.get_id().to_str();
5444 meta.owner_display_name = params.obj_owner.get_display_name();
5445
5446 if (real_clock::is_zero(params.mtime)) {
5447 meta.mtime = real_clock::now();
5448 } else {
5449 meta.mtime = params.mtime;
5450 }
5451
5452 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
5453 if (r < 0) {
5454 return r;
5455 }
5456 } else {
5457 rgw_bucket_dir_entry dirent;
5458
5459 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
5460 if (r < 0) {
5461 return r;
5462 }
5463 result.delete_marker = dirent.is_delete_marker();
5464 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
5465 if (r < 0) {
5466 return r;
5467 }
5468 result.version_id = instance;
5469 }
5470
5471 BucketShard *bs;
5472 int r = target->get_bucket_shard(&bs);
5473 if (r < 0) {
5474 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
5475 return r;
5476 }
5477
5478 if (target->bucket_info.datasync_flag_enabled()) {
5479 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
5480 if (r < 0) {
5481 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
5482 return r;
5483 }
5484 }
5485
5486 return 0;
5487 }
5488
5489 rgw_rados_ref ref;
5490 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
5491 if (r < 0) {
5492 return r;
5493 }
5494
5495 RGWObjState *state;
5496 r = target->get_state(&state, false);
5497 if (r < 0)
5498 return r;
5499
5500 ObjectWriteOperation op;
5501
5502 if (!real_clock::is_zero(params.unmod_since)) {
5503 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
5504 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
5505 if (!params.high_precision_time) {
5506 ctime.tv_nsec = 0;
5507 unmod.tv_nsec = 0;
5508 }
5509
5510 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
5511 if (ctime > unmod) {
5512 return -ERR_PRECONDITION_FAILED;
5513 }
5514
5515 /* only delete object if mtime is less than or equal to params.unmod_since */
5516 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
5517 }
5518 uint64_t obj_accounted_size = state->accounted_size;
5519
5520 if (!real_clock::is_zero(params.expiration_time)) {
5521 bufferlist bl;
5522 real_time delete_at;
5523
5524 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5525 try {
5526 auto iter = bl.cbegin();
5527 decode(delete_at, iter);
5528 } catch (buffer::error& err) {
5529 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
5530 return -EIO;
5531 }
5532
5533 if (params.expiration_time != delete_at) {
5534 return -ERR_PRECONDITION_FAILED;
5535 }
5536 } else {
5537 return -ERR_PRECONDITION_FAILED;
5538 }
5539 }
5540
5541 if (!state->exists) {
5542 target->invalidate_state();
5543 return -ENOENT;
5544 }
5545
5546 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
5547 if (r < 0)
5548 return r;
5549
5550 RGWBucketInfo& bucket_info = target->get_bucket_info();
5551
5552 RGWRados::Bucket bop(store, bucket_info);
5553 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5554
5555 index_op.set_zones_trace(params.zones_trace);
5556 index_op.set_bilog_flags(params.bilog_flags);
5557
5558 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
5559 if (r < 0)
5560 return r;
5561
5562 store->remove_rgw_head_obj(op);
5563 r = ref.ioctx.operate(ref.obj.oid, &op);
5564
5565 /* raced with another operation, object state is indeterminate */
5566 const bool need_invalidate = (r == -ECANCELED);
5567
5568 int64_t poolid = ref.ioctx.get_id();
5569 if (r >= 0) {
5570 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5571 if (obj_tombstone_cache) {
5572 tombstone_entry entry{*state};
5573 obj_tombstone_cache->add(obj, entry);
5574 }
5575 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
5576
5577 int ret = target->complete_atomic_modification();
5578 if (ret < 0) {
5579 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
5580 }
5581 /* other than that, no need to propagate error */
5582 } else {
5583 int ret = index_op.cancel();
5584 if (ret < 0) {
5585 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
5586 }
5587 }
5588
5589 if (need_invalidate) {
5590 target->invalidate_state();
5591 }
5592
5593 if (r < 0)
5594 return r;
5595
5596 /* update quota cache */
5597 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
5598
5599 return 0;
5600 }
5601
5602 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
5603 const RGWBucketInfo& bucket_info,
5604 const rgw_obj& obj,
5605 int versioning_status,
5606 uint16_t bilog_flags,
5607 const real_time& expiration_time,
5608 rgw_zone_set *zones_trace)
5609 {
5610 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5611 RGWRados::Object::Delete del_op(&del_target);
5612
5613 del_op.params.bucket_owner = bucket_info.owner;
5614 del_op.params.versioning_status = versioning_status;
5615 del_op.params.bilog_flags = bilog_flags;
5616 del_op.params.expiration_time = expiration_time;
5617 del_op.params.zones_trace = zones_trace;
5618
5619 return del_op.delete_obj();
5620 }
5621
5622 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
5623 {
5624 rgw_rados_ref ref;
5625 int r = get_raw_obj_ref(obj, &ref);
5626 if (r < 0) {
5627 return r;
5628 }
5629
5630 ObjectWriteOperation op;
5631
5632 op.remove();
5633 r = ref.ioctx.operate(ref.obj.oid, &op);
5634 if (r < 0)
5635 return r;
5636
5637 return 0;
5638 }
5639
5640 int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime)
5641 {
5642 std::string oid, key;
5643 get_obj_bucket_and_oid_loc(obj, oid, key);
5644
5645 auto obj_ctx = svc.sysobj->init_obj_ctx();
5646
5647 RGWBucketInfo bucket_info;
5648 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
5649 if (ret < 0) {
5650 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
5651 return ret;
5652 }
5653
5654 RGWRados::Bucket bop(this, bucket_info);
5655 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5656
5657 return index_op.complete_del(-1 /* pool */, 0, mtime, NULL);
5658 }
5659
5660 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
5661 {
5662 string tag;
5663
5664 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
5665 if (mi != manifest.obj_end()) {
5666 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5667 ++mi;
5668 tag = mi.get_location().get_raw_obj(store).oid;
5669 tag.append("_");
5670 }
5671
5672 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5673 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5674 MD5 hash;
5675 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
5676
5677 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5678 if (iter != attrset.end()) {
5679 bufferlist& bl = iter->second;
5680 hash.Update((const unsigned char *)bl.c_str(), bl.length());
5681 }
5682
5683 hash.Final(md5);
5684 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5685 tag.append(md5_str);
5686
5687 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
5688
5689 tag_bl.append(tag.c_str(), tag.size() + 1);
5690 }
5691
5692 static bool is_olh(map<string, bufferlist>& attrs)
5693 {
5694 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5695 return (iter != attrs.end());
5696 }
5697
5698 static bool has_olh_tag(map<string, bufferlist>& attrs)
5699 {
5700 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5701 return (iter != attrs.end());
5702 }
5703
5704 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5705 RGWObjState *olh_state, RGWObjState **target_state)
5706 {
5707 ceph_assert(olh_state->is_olh);
5708
5709 rgw_obj target;
5710 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
5711 if (r < 0) {
5712 return r;
5713 }
5714 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
5715 if (r < 0) {
5716 return r;
5717 }
5718
5719 return 0;
5720 }
5721
5722 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5723 RGWObjState **state, bool follow_olh, bool assume_noent)
5724 {
5725 if (obj.empty()) {
5726 return -EINVAL;
5727 }
5728
5729 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5730
5731 RGWObjState *s = rctx->get_state(obj);
5732 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
5733 *state = s;
5734 if (s->has_attrs) {
5735 if (s->is_olh && need_follow_olh) {
5736 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
5737 }
5738 return 0;
5739 }
5740
5741 s->obj = obj;
5742
5743 rgw_raw_obj raw_obj;
5744 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5745
5746 int r = -ENOENT;
5747
5748 if (!assume_noent) {
5749 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
5750 }
5751
5752 if (r == -ENOENT) {
5753 s->exists = false;
5754 s->has_attrs = true;
5755 tombstone_entry entry;
5756 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5757 s->mtime = entry.mtime;
5758 s->zone_short_id = entry.zone_short_id;
5759 s->pg_ver = entry.pg_ver;
5760 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
5761 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5762 } else {
5763 s->mtime = real_time();
5764 }
5765 return 0;
5766 }
5767 if (r < 0)
5768 return r;
5769
5770 s->exists = true;
5771 s->has_attrs = true;
5772 s->accounted_size = s->size;
5773
5774 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5775 if (iter != s->attrset.end()) {
5776 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5777 bufferlist& bletag = iter->second;
5778 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5779 bufferlist newbl;
5780 bletag.splice(0, bletag.length() - 1, &newbl);
5781 bletag.claim(newbl);
5782 }
5783 }
5784
5785 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
5786 const bool compressed = (iter != s->attrset.end());
5787 if (compressed) {
5788 // use uncompressed size for accounted_size
5789 try {
5790 RGWCompressionInfo info;
5791 auto p = iter->second.cbegin();
5792 decode(info, p);
5793 s->accounted_size = info.orig_size;
5794 } catch (buffer::error&) {
5795 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
5796 return -EIO;
5797 }
5798 }
5799
5800 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5801 if (iter != s->attrset.end()) {
5802 bufferlist bl = iter->second;
5803 bufferlist::iterator it = bl.begin();
5804 it.copy(bl.length(), s->shadow_obj);
5805 s->shadow_obj[bl.length()] = '\0';
5806 }
5807 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
5808 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5809 if (ttiter != s->attrset.end()) {
5810 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5811 }
5812
5813 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5814 if (manifest_bl.length()) {
5815 auto miter = manifest_bl.cbegin();
5816 try {
5817 decode(s->manifest, miter);
5818 s->has_manifest = true;
5819 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
5820 broken due to old bugs */
5821 s->size = s->manifest.get_obj_size();
5822 if (!compressed)
5823 s->accounted_size = s->size;
5824 } catch (buffer::error& err) {
5825 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
5826 return -EIO;
5827 }
5828 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
5829 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
5830 s->manifest.has_explicit_objs()) {
5831 RGWObjManifest::obj_iterator mi;
5832 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
5833 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
5834 }
5835 }
5836
5837 if (!s->obj_tag.length()) {
5838 /*
5839 * Uh oh, something's wrong, object with manifest should have tag. Let's
5840 * create one out of the manifest, would be unique
5841 */
5842 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
5843 s->fake_tag = true;
5844 }
5845 }
5846 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5847 if (aiter != s->attrset.end()) {
5848 bufferlist& pg_ver_bl = aiter->second;
5849 if (pg_ver_bl.length()) {
5850 auto pgbl = pg_ver_bl.cbegin();
5851 try {
5852 decode(s->pg_ver, pgbl);
5853 } catch (buffer::error& err) {
5854 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5855 }
5856 }
5857 }
5858 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5859 if (aiter != s->attrset.end()) {
5860 bufferlist& zone_short_id_bl = aiter->second;
5861 if (zone_short_id_bl.length()) {
5862 auto zbl = zone_short_id_bl.cbegin();
5863 try {
5864 decode(s->zone_short_id, zbl);
5865 } catch (buffer::error& err) {
5866 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5867 }
5868 }
5869 }
5870 if (s->obj_tag.length())
5871 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
5872 else
5873 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5874
5875 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5876 * it exist, and not only if is_olh() returns true
5877 */
5878 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5879 if (iter != s->attrset.end()) {
5880 s->olh_tag = iter->second;
5881 }
5882
5883 if (is_olh(s->attrset)) {
5884 s->is_olh = true;
5885
5886 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
5887
5888 if (need_follow_olh) {
5889 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
5890 } else if (obj.key.have_null_instance() && !s->has_manifest) {
5891 // read null version, and the head object only have olh info
5892 s->exists = false;
5893 return -ENOENT;
5894 }
5895 }
5896
5897 return 0;
5898 }
5899
5900 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
5901 bool follow_olh, bool assume_noent)
5902 {
5903 int ret;
5904
5905 do {
5906 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
5907 } while (ret == -EAGAIN);
5908
5909 return ret;
5910 }
5911
5912 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
5913 {
5914 RGWObjState *astate;
5915 int r = get_state(&astate, true);
5916 if (r < 0) {
5917 return r;
5918 }
5919
5920 *pmanifest = &astate->manifest;
5921
5922 return 0;
5923 }
5924
5925 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
5926 {
5927 RGWObjState *state;
5928 int r = source->get_state(&state, true);
5929 if (r < 0)
5930 return r;
5931 if (!state->exists)
5932 return -ENOENT;
5933 if (!state->get_attr(name, dest))
5934 return -ENODATA;
5935
5936 return 0;
5937 }
5938
5939
5940 int RGWRados::Object::Stat::stat_async()
5941 {
5942 RGWObjectCtx& ctx = source->get_ctx();
5943 rgw_obj& obj = source->get_obj();
5944 RGWRados *store = source->get_store();
5945
5946 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
5947 result.obj = obj;
5948 if (s->has_attrs) {
5949 state.ret = 0;
5950 result.size = s->size;
5951 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5952 result.attrs = s->attrset;
5953 result.has_manifest = s->has_manifest;
5954 result.manifest = s->manifest;
5955 return 0;
5956 }
5957
5958 string oid;
5959 string loc;
5960 get_obj_bucket_and_oid_loc(obj, oid, loc);
5961
5962 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
5963 if (r < 0) {
5964 return r;
5965 }
5966
5967 librados::ObjectReadOperation op;
5968 op.stat2(&result.size, &result.mtime, NULL);
5969 op.getxattrs(&result.attrs, NULL);
5970 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
5971 state.io_ctx.locator_set_key(loc);
5972 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5973 if (r < 0) {
5974 ldout(store->ctx(), 5) << __func__
5975 << ": ERROR: aio_operate() returned ret=" << r
5976 << dendl;
5977 return r;
5978 }
5979
5980 return 0;
5981 }
5982
5983
5984 int RGWRados::Object::Stat::wait()
5985 {
5986 if (!state.completion) {
5987 return state.ret;
5988 }
5989
5990 state.completion->wait_for_safe();
5991 state.ret = state.completion->get_return_value();
5992 state.completion->release();
5993
5994 if (state.ret != 0) {
5995 return state.ret;
5996 }
5997
5998 return finish();
5999 }
6000
6001 int RGWRados::Object::Stat::finish()
6002 {
6003 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
6004 if (iter != result.attrs.end()) {
6005 bufferlist& bl = iter->second;
6006 auto biter = bl.cbegin();
6007 try {
6008 decode(result.manifest, biter);
6009 } catch (buffer::error& err) {
6010 RGWRados *store = source->get_store();
6011 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
6012 return -EIO;
6013 }
6014 result.has_manifest = true;
6015 }
6016
6017 return 0;
6018 }
6019
6020 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
6021 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6022 ObjectOperation& op, RGWObjState **pstate)
6023 {
6024 if (!rctx)
6025 return 0;
6026
6027 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
6028 if (r < 0)
6029 return r;
6030
6031 return append_atomic_test(*pstate, op);
6032 }
6033
6034 int RGWRados::append_atomic_test(const RGWObjState* state,
6035 librados::ObjectOperation& op)
6036 {
6037 if (!state->is_atomic) {
6038 ldout(cct, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
6039 return 0;
6040 }
6041
6042 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
6043 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
6044 } else {
6045 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
6046 }
6047 return 0;
6048 }
6049
6050 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
6051 {
6052 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
6053 }
6054
6055 void RGWRados::Object::invalidate_state()
6056 {
6057 ctx.invalidate(obj);
6058 }
6059
6060 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
6061 const char *if_match, const char *if_nomatch, bool removal_op,
6062 bool modify_tail)
6063 {
6064 int r = get_state(&state, false);
6065 if (r < 0)
6066 return r;
6067
6068 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
6069 if_match != NULL || if_nomatch != NULL) &&
6070 (!state->fake_tag);
6071
6072 if (!state->is_atomic) {
6073 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
6074
6075 if (reset_obj) {
6076 op.create(false);
6077 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
6078 }
6079
6080 return 0;
6081 }
6082
6083 if (need_guard) {
6084 /* first verify that the object wasn't replaced under */
6085 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
6086 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
6087 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
6088 }
6089
6090 if (if_match) {
6091 if (strcmp(if_match, "*") == 0) {
6092 // test the object is existing
6093 if (!state->exists) {
6094 return -ERR_PRECONDITION_FAILED;
6095 }
6096 } else {
6097 bufferlist bl;
6098 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
6099 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
6100 return -ERR_PRECONDITION_FAILED;
6101 }
6102 }
6103 }
6104
6105 if (if_nomatch) {
6106 if (strcmp(if_nomatch, "*") == 0) {
6107 // test the object is NOT existing
6108 if (state->exists) {
6109 return -ERR_PRECONDITION_FAILED;
6110 }
6111 } else {
6112 bufferlist bl;
6113 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
6114 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
6115 return -ERR_PRECONDITION_FAILED;
6116 }
6117 }
6118 }
6119 }
6120
6121 if (reset_obj) {
6122 if (state->exists) {
6123 op.create(false);
6124 store->remove_rgw_head_obj(op);
6125 } else {
6126 op.create(true);
6127 }
6128 }
6129
6130 if (removal_op) {
6131 /* the object is being removed, no need to update its tag */
6132 return 0;
6133 }
6134
6135 if (ptag) {
6136 state->write_tag = *ptag;
6137 } else {
6138 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
6139 }
6140 bufferlist bl;
6141 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
6142
6143 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
6144
6145 op.setxattr(RGW_ATTR_ID_TAG, bl);
6146 if (modify_tail) {
6147 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
6148 }
6149
6150 return 0;
6151 }
6152
6153 /**
6154 * Set an attr on an object.
6155 * bucket: name of the bucket holding the object
6156 * obj: name of the object to set the attr on
6157 * name: the attr to set
6158 * bl: the contents of the attr
6159 * Returns: 0 on success, -ERR# otherwise.
6160 */
6161 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
6162 {
6163 map<string, bufferlist> attrs;
6164 attrs[name] = bl;
6165 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
6166 }
6167
6168 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
6169 map<string, bufferlist>& attrs,
6170 map<string, bufferlist>* rmattrs)
6171 {
6172 rgw_obj obj = src_obj;
6173 if (obj.key.instance == "null") {
6174 obj.key.instance.clear();
6175 }
6176
6177 rgw_rados_ref ref;
6178 int r = get_obj_head_ref(bucket_info, obj, &ref);
6179 if (r < 0) {
6180 return r;
6181 }
6182 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
6183
6184 ObjectWriteOperation op;
6185 RGWObjState *state = NULL;
6186
6187 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
6188 if (r < 0)
6189 return r;
6190
6191 // ensure null version object exist
6192 if (src_obj.key.instance == "null" && !state->has_manifest) {
6193 return -ENOENT;
6194 }
6195
6196 map<string, bufferlist>::iterator iter;
6197 if (rmattrs) {
6198 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6199 const string& name = iter->first;
6200 op.rmxattr(name.c_str());
6201 }
6202 }
6203
6204 const rgw_bucket& bucket = obj.bucket;
6205
6206 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6207 const string& name = iter->first;
6208 bufferlist& bl = iter->second;
6209
6210 if (!bl.length())
6211 continue;
6212
6213 op.setxattr(name.c_str(), bl);
6214
6215 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
6216 real_time ts;
6217 try {
6218 decode(ts, bl);
6219
6220 rgw_obj_index_key obj_key;
6221 obj.key.get_index_key(&obj_key);
6222
6223 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
6224 } catch (buffer::error& err) {
6225 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
6226 }
6227 }
6228 }
6229
6230 if (!op.size())
6231 return 0;
6232
6233 RGWObjectCtx obj_ctx(this);
6234
6235 bufferlist bl;
6236 RGWRados::Bucket bop(this, bucket_info);
6237 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
6238
6239 if (state) {
6240 string tag;
6241 append_rand_alpha(cct, tag, tag, 32);
6242 state->write_tag = tag;
6243 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
6244
6245 if (r < 0)
6246 return r;
6247
6248 bl.append(tag.c_str(), tag.size() + 1);
6249 op.setxattr(RGW_ATTR_ID_TAG, bl);
6250 }
6251
6252
6253 real_time mtime = real_clock::now();
6254 struct timespec mtime_ts = real_clock::to_timespec(mtime);
6255 op.mtime2(&mtime_ts);
6256 r = ref.ioctx.operate(ref.obj.oid, &op);
6257 if (state) {
6258 if (r >= 0) {
6259 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
6260 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
6261 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
6262 string etag = rgw_bl_str(etag_bl);
6263 string content_type = rgw_bl_str(content_type_bl);
6264 string storage_class;
6265 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
6266 if (iter != attrs.end()) {
6267 storage_class = rgw_bl_str(iter->second);
6268 }
6269 uint64_t epoch = ref.ioctx.get_last_version();
6270 int64_t poolid = ref.ioctx.get_id();
6271 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
6272 mtime, etag, content_type, storage_class, &acl_bl,
6273 RGWObjCategory::Main, NULL);
6274 } else {
6275 int ret = index_op.cancel();
6276 if (ret < 0) {
6277 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
6278 }
6279 }
6280 }
6281 if (r < 0)
6282 return r;
6283
6284 if (state) {
6285 state->obj_tag.swap(bl);
6286 if (rmattrs) {
6287 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6288 state->attrset.erase(iter->first);
6289 }
6290 }
6291
6292 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6293 state->attrset[iter->first] = iter->second;
6294 }
6295
6296 auto iter = state->attrset.find(RGW_ATTR_ID_TAG);
6297 if (iter != state->attrset.end()) {
6298 iter->second = state->obj_tag;
6299 }
6300 }
6301
6302 return 0;
6303 }
6304
6305 int RGWRados::Object::Read::prepare()
6306 {
6307 RGWRados *store = source->get_store();
6308 CephContext *cct = store->ctx();
6309
6310 bufferlist etag;
6311
6312 map<string, bufferlist>::iterator iter;
6313
6314 RGWObjState *astate;
6315 int r = source->get_state(&astate, true);
6316 if (r < 0)
6317 return r;
6318
6319 if (!astate->exists) {
6320 return -ENOENT;
6321 }
6322
6323 const RGWBucketInfo& bucket_info = source->get_bucket_info();
6324
6325 state.obj = astate->obj;
6326 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
6327
6328 state.cur_pool = state.head_obj.pool;
6329 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
6330
6331 r = store->get_obj_head_ioctx(bucket_info, state.obj, state.cur_ioctx);
6332 if (r < 0) {
6333 return r;
6334 }
6335 if (params.target_obj) {
6336 *params.target_obj = state.obj;
6337 }
6338 if (params.attrs) {
6339 *params.attrs = astate->attrset;
6340 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
6341 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
6342 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
6343 }
6344 }
6345 }
6346
6347 /* Convert all times go GMT to make them compatible */
6348 if (conds.mod_ptr || conds.unmod_ptr) {
6349 obj_time_weight src_weight;
6350 src_weight.init(astate);
6351 src_weight.high_precision = conds.high_precision_time;
6352
6353 obj_time_weight dest_weight;
6354 dest_weight.high_precision = conds.high_precision_time;
6355
6356 if (conds.mod_ptr) {
6357 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
6358 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
6359 if (!(dest_weight < src_weight)) {
6360 return -ERR_NOT_MODIFIED;
6361 }
6362 }
6363
6364 if (conds.unmod_ptr) {
6365 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
6366 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
6367 if (dest_weight < src_weight) {
6368 return -ERR_PRECONDITION_FAILED;
6369 }
6370 }
6371 }
6372 if (conds.if_match || conds.if_nomatch) {
6373 r = get_attr(RGW_ATTR_ETAG, etag);
6374 if (r < 0)
6375 return r;
6376
6377
6378
6379 if (conds.if_match) {
6380 string if_match_str = rgw_string_unquote(conds.if_match);
6381 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
6382 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
6383 return -ERR_PRECONDITION_FAILED;
6384 }
6385 }
6386
6387 if (conds.if_nomatch) {
6388 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
6389 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
6390 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
6391 return -ERR_NOT_MODIFIED;
6392 }
6393 }
6394 }
6395
6396 if (params.obj_size)
6397 *params.obj_size = astate->size;
6398 if (params.lastmod)
6399 *params.lastmod = astate->mtime;
6400
6401 return 0;
6402 }
6403
6404 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
6405 {
6406 if (ofs < 0) {
6407 ofs += obj_size;
6408 if (ofs < 0)
6409 ofs = 0;
6410 end = obj_size - 1;
6411 } else if (end < 0) {
6412 end = obj_size - 1;
6413 }
6414
6415 if (obj_size > 0) {
6416 if (ofs >= (off_t)obj_size) {
6417 return -ERANGE;
6418 }
6419 if (end >= (off_t)obj_size) {
6420 end = obj_size - 1;
6421 }
6422 }
6423 return 0;
6424 }
6425
6426 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
6427 {
6428 RGWRados *store = target->get_store();
6429 BucketShard *bs;
6430 int r;
6431
6432 #define NUM_RESHARD_RETRIES 10
6433 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
6434 int ret = get_bucket_shard(&bs);
6435 if (ret < 0) {
6436 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6437 return ret;
6438 }
6439 r = call(bs);
6440 if (r != -ERR_BUSY_RESHARDING) {
6441 break;
6442 }
6443 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6444 string new_bucket_id;
6445 r = store->block_while_resharding(bs, &new_bucket_id,
6446 target->bucket_info, null_yield);
6447 if (r == -ERR_BUSY_RESHARDING) {
6448 continue;
6449 }
6450 if (r < 0) {
6451 return r;
6452 }
6453 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6454 i = 0; /* resharding is finished, make sure we can retry */
6455 r = target->update_bucket_id(new_bucket_id);
6456 if (r < 0) {
6457 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
6458 return r;
6459 }
6460 invalidate_bs();
6461 } // for loop
6462
6463 if (r < 0) {
6464 return r;
6465 }
6466
6467 if (pbs) {
6468 *pbs = bs;
6469 }
6470
6471 return 0;
6472 }
6473
6474 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
6475 {
6476 if (blind) {
6477 return 0;
6478 }
6479 RGWRados *store = target->get_store();
6480
6481 if (write_tag && write_tag->length()) {
6482 optag = string(write_tag->c_str(), write_tag->length());
6483 } else {
6484 if (optag.empty()) {
6485 append_rand_alpha(store->ctx(), optag, optag, 32);
6486 }
6487 }
6488
6489 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
6490 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
6491 });
6492
6493 if (r < 0) {
6494 return r;
6495 }
6496 prepared = true;
6497
6498 return 0;
6499 }
6500
6501 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
6502 uint64_t size, uint64_t accounted_size,
6503 ceph::real_time& ut, const string& etag,
6504 const string& content_type, const string& storage_class,
6505 bufferlist *acl_bl,
6506 RGWObjCategory category,
6507 list<rgw_obj_index_key> *remove_objs, const string *user_data,
6508 bool appendable)
6509 {
6510 if (blind) {
6511 return 0;
6512 }
6513 RGWRados *store = target->get_store();
6514 BucketShard *bs;
6515
6516 int ret = get_bucket_shard(&bs);
6517 if (ret < 0) {
6518 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6519 return ret;
6520 }
6521
6522 rgw_bucket_dir_entry ent;
6523 obj.key.get_index_key(&ent.key);
6524 ent.meta.size = size;
6525 ent.meta.accounted_size = accounted_size;
6526 ent.meta.mtime = ut;
6527 ent.meta.etag = etag;
6528 ent.meta.storage_class = storage_class;
6529 if (user_data)
6530 ent.meta.user_data = *user_data;
6531
6532 ACLOwner owner;
6533 if (acl_bl && acl_bl->length()) {
6534 int ret = store->decode_policy(*acl_bl, &owner);
6535 if (ret < 0) {
6536 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
6537 }
6538 }
6539 ent.meta.owner = owner.get_id().to_str();
6540 ent.meta.owner_display_name = owner.get_display_name();
6541 ent.meta.content_type = content_type;
6542 ent.meta.appendable = appendable;
6543
6544 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
6545
6546 if (target->bucket_info.datasync_flag_enabled()) {
6547 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6548 if (r < 0) {
6549 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6550 }
6551 }
6552
6553 return ret;
6554 }
6555
6556 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
6557 real_time& removed_mtime,
6558 list<rgw_obj_index_key> *remove_objs)
6559 {
6560 if (blind) {
6561 return 0;
6562 }
6563 RGWRados *store = target->get_store();
6564 BucketShard *bs;
6565
6566 int ret = get_bucket_shard(&bs);
6567 if (ret < 0) {
6568 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6569 return ret;
6570 }
6571
6572 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
6573
6574 if (target->bucket_info.datasync_flag_enabled()) {
6575 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6576 if (r < 0) {
6577 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6578 }
6579 }
6580
6581 return ret;
6582 }
6583
6584
6585 int RGWRados::Bucket::UpdateIndex::cancel()
6586 {
6587 if (blind) {
6588 return 0;
6589 }
6590 RGWRados *store = target->get_store();
6591 BucketShard *bs;
6592
6593 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
6594 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6595 });
6596
6597 /*
6598 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6599 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6600 * have no way to tell that they're all caught up
6601 */
6602 if (target->bucket_info.datasync_flag_enabled()) {
6603 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6604 if (r < 0) {
6605 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6606 }
6607 }
6608
6609 return ret;
6610 }
6611
6612 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
6613 {
6614 RGWRados *store = source->get_store();
6615 CephContext *cct = store->ctx();
6616
6617 rgw_raw_obj read_obj;
6618 uint64_t read_ofs = ofs;
6619 uint64_t len, read_len;
6620 bool reading_from_head = true;
6621 ObjectReadOperation op;
6622
6623 bool merge_bl = false;
6624 bufferlist *pbl = &bl;
6625 bufferlist read_bl;
6626 uint64_t max_chunk_size;
6627
6628 RGWObjState *astate;
6629 int r = source->get_state(&astate, true);
6630 if (r < 0)
6631 return r;
6632
6633 if (astate->size == 0) {
6634 end = 0;
6635 } else if (end >= (int64_t)astate->size) {
6636 end = astate->size - 1;
6637 }
6638
6639 if (end < 0)
6640 len = 0;
6641 else
6642 len = end - ofs + 1;
6643
6644 if (astate->has_manifest && astate->manifest.has_tail()) {
6645 /* now get the relevant object part */
6646 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
6647
6648 uint64_t stripe_ofs = iter.get_stripe_ofs();
6649 read_obj = iter.get_location().get_raw_obj(store);
6650 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
6651 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6652 reading_from_head = (read_obj == state.head_obj);
6653 } else {
6654 read_obj = state.head_obj;
6655 }
6656
6657 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
6658 if (r < 0) {
6659 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
6660 return r;
6661 }
6662
6663 if (len > max_chunk_size)
6664 len = max_chunk_size;
6665
6666
6667 read_len = len;
6668
6669 if (reading_from_head) {
6670 /* only when reading from the head object do we need to do the atomic test */
6671 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
6672 if (r < 0)
6673 return r;
6674
6675 if (astate && astate->prefetch_data) {
6676 if (!ofs && astate->data.length() >= len) {
6677 bl = astate->data;
6678 return bl.length();
6679 }
6680
6681 if (ofs < astate->data.length()) {
6682 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
6683 astate->data.copy(ofs, copy_len, bl);
6684 read_len -= copy_len;
6685 read_ofs += copy_len;
6686 if (!read_len)
6687 return bl.length();
6688
6689 merge_bl = true;
6690 pbl = &read_bl;
6691 }
6692 }
6693 }
6694
6695 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
6696 op.read(read_ofs, read_len, pbl, NULL);
6697
6698 if (state.cur_pool != read_obj.pool) {
6699 auto iter = state.io_ctxs.find(read_obj.pool);
6700 if (iter == state.io_ctxs.end()) {
6701 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
6702 r = store->open_pool_ctx(read_obj.pool, *state.cur_ioctx, false);
6703 if (r < 0) {
6704 ldout(cct, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
6705 return r;
6706 }
6707 } else {
6708 state.cur_ioctx = &iter->second;
6709 }
6710 state.cur_pool = read_obj.pool;
6711 }
6712
6713 state.cur_ioctx->locator_set_key(read_obj.loc);
6714
6715 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
6716 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
6717
6718 if (r < 0) {
6719 return r;
6720 }
6721
6722 if (merge_bl) {
6723 bl.append(read_bl);
6724 }
6725
6726 return bl.length();
6727 }
6728
6729 struct get_obj_data {
6730 RGWRados* store;
6731 RGWGetDataCB* client_cb;
6732 rgw::Aio* aio;
6733 uint64_t offset; // next offset to write to client
6734 rgw::AioResultList completed; // completed read results, sorted by offset
6735
6736 get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio, uint64_t offset)
6737 : store(store), client_cb(cb), aio(aio), offset(offset) {}
6738
6739 int flush(rgw::AioResultList&& results) {
6740 int r = rgw::check_for_errors(results);
6741 if (r < 0) {
6742 return r;
6743 }
6744
6745 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6746 results.sort(cmp); // merge() requires results to be sorted first
6747 completed.merge(results, cmp); // merge results in sorted order
6748
6749 while (!completed.empty() && completed.front().id == offset) {
6750 auto bl = std::move(completed.front().data);
6751 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
6752
6753 offset += bl.length();
6754 int r = client_cb->handle_data(bl, 0, bl.length());
6755 if (r < 0) {
6756 return r;
6757 }
6758 }
6759 return 0;
6760 }
6761
6762 void cancel() {
6763 // wait for all completions to drain and ignore the results
6764 aio->drain();
6765 }
6766
6767 int drain() {
6768 auto c = aio->wait();
6769 while (!c.empty()) {
6770 int r = flush(std::move(c));
6771 if (r < 0) {
6772 cancel();
6773 return r;
6774 }
6775 c = aio->wait();
6776 }
6777 return flush(std::move(c));
6778 }
6779 };
6780
6781 static int _get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6782 off_t read_ofs, off_t len, bool is_head_obj,
6783 RGWObjState *astate, void *arg)
6784 {
6785 struct get_obj_data *d = (struct get_obj_data *)arg;
6786
6787 return d->store->get_obj_iterate_cb(read_obj, obj_ofs, read_ofs, len,
6788 is_head_obj, astate, arg);
6789 }
6790
6791 int RGWRados::get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6792 off_t read_ofs, off_t len, bool is_head_obj,
6793 RGWObjState *astate, void *arg)
6794 {
6795 ObjectReadOperation op;
6796 struct get_obj_data *d = (struct get_obj_data *)arg;
6797 string oid, key;
6798
6799 if (is_head_obj) {
6800 /* only when reading from the head object do we need to do the atomic test */
6801 int r = append_atomic_test(astate, op);
6802 if (r < 0)
6803 return r;
6804
6805 if (astate &&
6806 obj_ofs < astate->data.length()) {
6807 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
6808
6809 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
6810 if (r < 0)
6811 return r;
6812
6813 len -= chunk_len;
6814 d->offset += chunk_len;
6815 read_ofs += chunk_len;
6816 obj_ofs += chunk_len;
6817 if (!len)
6818 return 0;
6819 }
6820 }
6821
6822 auto obj = d->store->svc.rados->obj(read_obj);
6823 int r = obj.open();
6824 if (r < 0) {
6825 ldout(cct, 4) << "failed to open rados context for " << read_obj << dendl;
6826 return r;
6827 }
6828
6829 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
6830 op.read(read_ofs, len, nullptr, nullptr);
6831
6832 const uint64_t cost = len;
6833 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
6834
6835 auto completed = d->aio->submit(obj, &op, cost, id);
6836
6837 return d->flush(std::move(completed));
6838 }
6839
6840 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
6841 {
6842 RGWRados *store = source->get_store();
6843 CephContext *cct = store->ctx();
6844 RGWObjectCtx& obj_ctx = source->get_ctx();
6845 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6846 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
6847
6848 rgw::AioThrottle aio(window_size);
6849 get_obj_data data(store, cb, &aio, ofs);
6850
6851 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj,
6852 ofs, end, chunk_size, _get_obj_iterate_cb, &data);
6853 if (r < 0) {
6854 ldout(cct, 0) << "iterate_obj() failed with " << r << dendl;
6855 data.cancel(); // drain completions without writing back to client
6856 return r;
6857 }
6858
6859 return data.drain();
6860 }
6861
6862 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
6863 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6864 off_t ofs, off_t end, uint64_t max_chunk_size,
6865 iterate_obj_cb cb, void *arg)
6866 {
6867 rgw_raw_obj head_obj;
6868 rgw_raw_obj read_obj;
6869 uint64_t read_ofs = ofs;
6870 uint64_t len;
6871 bool reading_from_head = true;
6872 RGWObjState *astate = NULL;
6873
6874 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6875
6876 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
6877 if (r < 0) {
6878 return r;
6879 }
6880
6881 if (end < 0)
6882 len = 0;
6883 else
6884 len = end - ofs + 1;
6885
6886 if (astate->has_manifest) {
6887 /* now get the relevant object stripe */
6888 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
6889
6890 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
6891
6892 for (; iter != obj_end && ofs <= end; ++iter) {
6893 off_t stripe_ofs = iter.get_stripe_ofs();
6894 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6895
6896 while (ofs < next_stripe_ofs && ofs <= end) {
6897 read_obj = iter.get_location().get_raw_obj(this);
6898 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
6899 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6900
6901 if (read_len > max_chunk_size) {
6902 read_len = max_chunk_size;
6903 }
6904
6905 reading_from_head = (read_obj == head_obj);
6906 r = cb(read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
6907 if (r < 0) {
6908 return r;
6909 }
6910
6911 len -= read_len;
6912 ofs += read_len;
6913 }
6914 }
6915 } else {
6916 while (ofs <= end) {
6917 read_obj = head_obj;
6918 uint64_t read_len = std::min(len, max_chunk_size);
6919
6920 r = cb(read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
6921 if (r < 0) {
6922 return r;
6923 }
6924
6925 len -= read_len;
6926 ofs += read_len;
6927 }
6928 }
6929
6930 return 0;
6931 }
6932
6933 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
6934 {
6935 rgw_rados_ref ref;
6936 int r = get_obj_head_ref(bucket_info, obj, &ref);
6937 if (r < 0) {
6938 return r;
6939 }
6940
6941 return ref.ioctx.operate(ref.obj.oid, op);
6942 }
6943
6944 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
6945 {
6946 rgw_rados_ref ref;
6947 int r = get_obj_head_ref(bucket_info, obj, &ref);
6948 if (r < 0) {
6949 return r;
6950 }
6951
6952 bufferlist outbl;
6953
6954 return ref.ioctx.operate(ref.obj.oid, op, &outbl);
6955 }
6956
6957 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
6958 {
6959 ObjectWriteOperation op;
6960
6961 ceph_assert(olh_obj.key.instance.empty());
6962
6963 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6964
6965 if (!state.exists) {
6966 op.create(true);
6967 } else {
6968 op.assert_exists();
6969 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6970 op.mtime2(&mtime_ts);
6971 }
6972
6973 /*
6974 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6975 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6976 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6977 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6978 * log will reflect that.
6979 *
6980 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6981 * is used for object data instance, olh_tag for olh instance.
6982 */
6983 if (has_tag) {
6984 /* guard against racing writes */
6985 bucket_index_guard_olh_op(state, op);
6986 }
6987
6988 if (!has_tag) {
6989 /* obj tag */
6990 string obj_tag;
6991 gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
6992
6993 bufferlist bl;
6994 bl.append(obj_tag.c_str(), obj_tag.size());
6995 op.setxattr(RGW_ATTR_ID_TAG, bl);
6996
6997 state.attrset[RGW_ATTR_ID_TAG] = bl;
6998 state.obj_tag = bl;
6999
7000 /* olh tag */
7001 string olh_tag;
7002 gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
7003
7004 bufferlist olh_bl;
7005 olh_bl.append(olh_tag.c_str(), olh_tag.size());
7006 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
7007
7008 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
7009 state.olh_tag = olh_bl;
7010 state.is_olh = true;
7011
7012 bufferlist verbl;
7013 op.setxattr(RGW_ATTR_OLH_VER, verbl);
7014 }
7015
7016 bufferlist bl;
7017 RGWOLHPendingInfo pending_info;
7018 pending_info.time = real_clock::now();
7019 encode(pending_info, bl);
7020
7021 #define OLH_PENDING_TAG_LEN 32
7022 /* tag will start with current time epoch, this so that entries are sorted by time */
7023 char buf[32];
7024 utime_t ut(pending_info.time);
7025 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
7026 *op_tag = buf;
7027
7028 string s;
7029 gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
7030
7031 op_tag->append(s);
7032
7033 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7034 attr_name.append(*op_tag);
7035
7036 op.setxattr(attr_name.c_str(), bl);
7037
7038 int ret = obj_operate(bucket_info, olh_obj, &op);
7039 if (ret < 0) {
7040 return ret;
7041 }
7042
7043 state.exists = true;
7044 state.attrset[attr_name] = bl;
7045
7046 return 0;
7047 }
7048
7049 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
7050 {
7051 int ret;
7052
7053 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
7054 if (ret == -EEXIST) {
7055 ret = -ECANCELED;
7056 }
7057
7058 return ret;
7059 }
7060
7061 int RGWRados::guard_reshard(BucketShard *bs,
7062 const rgw_obj& obj_instance,
7063 const RGWBucketInfo& bucket_info,
7064 std::function<int(BucketShard *)> call)
7065 {
7066 rgw_obj obj;
7067 const rgw_obj *pobj = &obj_instance;
7068 int r;
7069
7070 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
7071 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
7072 if (r < 0) {
7073 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
7074 return r;
7075 }
7076 r = call(bs);
7077 if (r != -ERR_BUSY_RESHARDING) {
7078 break;
7079 }
7080 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
7081 string new_bucket_id;
7082 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield);
7083 if (r == -ERR_BUSY_RESHARDING) {
7084 continue;
7085 }
7086 if (r < 0) {
7087 return r;
7088 }
7089 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
7090 i = 0; /* resharding is finished, make sure we can retry */
7091
7092 obj = *pobj;
7093 obj.bucket.update_bucket_id(new_bucket_id);
7094 pobj = &obj;
7095 } // for loop
7096
7097 if (r < 0) {
7098 return r;
7099 }
7100
7101 return 0;
7102 }
7103
7104 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
7105 string *new_bucket_id,
7106 const RGWBucketInfo& bucket_info,
7107 optional_yield y)
7108 {
7109 int ret = 0;
7110 cls_rgw_bucket_instance_entry entry;
7111
7112 // since we want to run this recovery code from two distinct places,
7113 // let's just put it in a lambda so we can easily re-use; if the
7114 // lambda successfully fetches a new bucket id, it sets
7115 // new_bucket_id and returns 0, otherwise it returns a negative
7116 // error code
7117 auto fetch_new_bucket_id =
7118 [this, bucket_info](const std::string& log_tag,
7119 std::string* new_bucket_id) -> int {
7120 RGWBucketInfo fresh_bucket_info = bucket_info;
7121 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr);
7122 if (ret < 0) {
7123 ldout(cct, 0) << __func__ <<
7124 " ERROR: failed to refresh bucket info after reshard at " <<
7125 log_tag << ": " << cpp_strerror(-ret) << dendl;
7126 return ret;
7127 }
7128 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
7129 return 0;
7130 };
7131
7132 constexpr int num_retries = 10;
7133 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
7134 ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry);
7135 if (ret == -ENOENT) {
7136 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
7137 } else if (ret < 0) {
7138 ldout(cct, 0) << __func__ <<
7139 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
7140 dendl;
7141 return ret;
7142 }
7143
7144 if (!entry.resharding_in_progress()) {
7145 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
7146 new_bucket_id);
7147 }
7148
7149 ldout(cct, 20) << "NOTICE: reshard still in progress; " <<
7150 (i < num_retries ? "retrying" : "too many retries") << dendl;
7151
7152 if (i == num_retries) {
7153 break;
7154 }
7155
7156 // If bucket is erroneously marked as resharding (e.g., crash or
7157 // other error) then fix it. If we can take the bucket reshard
7158 // lock then it means no other resharding should be taking place,
7159 // and we're free to clear the flags.
7160 {
7161 // since we expect to do this rarely, we'll do our work in a
7162 // block and erase our work after each try
7163
7164 RGWObjectCtx obj_ctx(this);
7165 const rgw_bucket& b = bs->bucket;
7166 std::string bucket_id = b.get_key();
7167 RGWBucketReshardLock reshard_lock(this, bucket_info, true);
7168 ret = reshard_lock.lock();
7169 if (ret < 0) {
7170 ldout(cct, 20) << __func__ <<
7171 " INFO: failed to take reshard lock for bucket " <<
7172 bucket_id << "; expected if resharding underway" << dendl;
7173 } else {
7174 ldout(cct, 10) << __func__ <<
7175 " INFO: was able to take reshard lock for bucket " <<
7176 bucket_id << dendl;
7177 ret = RGWBucketReshard::clear_resharding(this, bucket_info);
7178 if (ret < 0) {
7179 reshard_lock.unlock();
7180 ldout(cct, 0) << __func__ <<
7181 " ERROR: failed to clear resharding flags for bucket " <<
7182 bucket_id << dendl;
7183 } else {
7184 reshard_lock.unlock();
7185 ldout(cct, 5) << __func__ <<
7186 " INFO: apparently successfully cleared resharding flags for "
7187 "bucket " << bucket_id << dendl;
7188 continue; // if we apparently succeed immediately test again
7189 } // if clear resharding succeeded
7190 } // if taking of lock succeeded
7191 } // block to encapsulate recovery from incomplete reshard
7192
7193 ret = reshard_wait->wait(y);
7194 if (ret < 0) {
7195 ldout(cct, 0) << __func__ <<
7196 " ERROR: bucket is still resharding, please retry" << dendl;
7197 return ret;
7198 }
7199 } // for loop
7200
7201 ldout(cct, 0) << __func__ <<
7202 " ERROR: bucket is still resharding, please retry" << dendl;
7203 return -ERR_BUSY_RESHARDING;
7204 }
7205
7206 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
7207 bool delete_marker,
7208 const string& op_tag,
7209 struct rgw_bucket_dir_entry_meta *meta,
7210 uint64_t olh_epoch,
7211 real_time unmod_since, bool high_precision_time,
7212 rgw_zone_set *_zones_trace, bool log_data_change)
7213 {
7214 rgw_rados_ref ref;
7215 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7216 if (r < 0) {
7217 return r;
7218 }
7219
7220 rgw_zone_set zones_trace;
7221 if (_zones_trace) {
7222 zones_trace = *_zones_trace;
7223 }
7224 zones_trace.insert(svc.zone->get_zone().id);
7225
7226 BucketShard bs(this);
7227
7228 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
7229 r = guard_reshard(&bs, obj_instance, bucket_info,
7230 [&](BucketShard *bs) -> int {
7231 librados::ObjectWriteOperation op;
7232 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7233 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
7234 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
7235 unmod_since, high_precision_time,
7236 svc.zone->get_zone().log_data, zones_trace);
7237 });
7238 if (r < 0) {
7239 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
7240 return r;
7241 }
7242
7243 if (log_data_change && bucket_info.datasync_flag_enabled()) {
7244 data_log->add_entry(bs.bucket, bs.shard_id);
7245 }
7246
7247 return 0;
7248 }
7249
7250 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
7251 {
7252 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
7253 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
7254 }
7255
7256 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
7257 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7258 {
7259 rgw_rados_ref ref;
7260 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7261 if (r < 0) {
7262 return r;
7263 }
7264
7265 rgw_zone_set zones_trace;
7266 if (_zones_trace) {
7267 zones_trace = *_zones_trace;
7268 }
7269 zones_trace.insert(svc.zone->get_zone().id);
7270
7271 BucketShard bs(this);
7272
7273 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
7274 r = guard_reshard(&bs, obj_instance, bucket_info,
7275 [&](BucketShard *bs) -> int {
7276 librados::ObjectWriteOperation op;
7277 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7278 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
7279 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
7280 });
7281 if (r < 0) {
7282 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
7283 return r;
7284 }
7285
7286 return 0;
7287 }
7288
7289 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
7290 const rgw_obj& obj_instance, uint64_t ver_marker,
7291 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
7292 bool *is_truncated)
7293 {
7294 rgw_rados_ref ref;
7295 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7296 if (r < 0) {
7297 return r;
7298 }
7299
7300 BucketShard bs(this);
7301 int ret =
7302 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7303 if (ret < 0) {
7304 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7305 return ret;
7306 }
7307
7308 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7309
7310 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7311
7312 ret = guard_reshard(&bs, obj_instance, bucket_info,
7313 [&](BucketShard *bs) -> int {
7314 ObjectReadOperation op;
7315 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7316 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
7317 key, ver_marker, olh_tag, log, is_truncated);
7318 });
7319 if (ret < 0) {
7320 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7321 return ret;
7322 }
7323
7324 return 0;
7325 }
7326
7327 // a multisite sync bug resulted in the OLH head attributes being overwritten by
7328 // the attributes from another zone, causing link_olh() to fail endlessly due to
7329 // olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7330 // attributes from the bucket index. see http://tracker.ceph.com/issues/37792
7331 int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info,
7332 const rgw_obj& obj)
7333 {
7334 // fetch the current olh entry from the bucket index
7335 rgw_bucket_olh_entry olh;
7336 int r = bi_get_olh(bucket_info, obj, &olh);
7337 if (r < 0) {
7338 ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
7339 return r;
7340 }
7341 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
7342 return 0;
7343 }
7344
7345 ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag
7346 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
7347
7348 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7349 ObjectWriteOperation op;
7350 // assert this is the same olh tag we think we're fixing
7351 bucket_index_guard_olh_op(*state, op);
7352 // preserve existing mtime
7353 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
7354 op.mtime2(&mtime_ts);
7355 {
7356 bufferlist bl;
7357 bl.append(olh.tag.c_str(), olh.tag.size());
7358 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
7359 }
7360 {
7361 RGWOLHInfo info;
7362 info.target = rgw_obj(bucket_info.bucket, olh.key);
7363 info.removed = olh.delete_marker;
7364 bufferlist bl;
7365 encode(info, bl);
7366 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7367 }
7368 rgw_rados_ref ref;
7369 r = get_obj_head_ref(bucket_info, obj, &ref);
7370 if (r < 0) {
7371 return r;
7372 }
7373 r = ref.ioctx.operate(ref.obj.oid, &op);
7374 if (r < 0) {
7375 ldout(cct, 0) << "repair_olh failed to write olh attributes with "
7376 << cpp_strerror(r) << dendl;
7377 return r;
7378 }
7379 return 0;
7380 }
7381
7382 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
7383 {
7384 rgw_rados_ref ref;
7385 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7386 if (r < 0) {
7387 return r;
7388 }
7389
7390 BucketShard bs(this);
7391 int ret =
7392 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7393 if (ret < 0) {
7394 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7395 return ret;
7396 }
7397
7398 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7399
7400 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7401
7402 ret = guard_reshard(&bs, obj_instance, bucket_info,
7403 [&](BucketShard *pbs) -> int {
7404 ObjectWriteOperation op;
7405 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7406 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
7407 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
7408 });
7409 if (ret < 0) {
7410 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7411 return ret;
7412 }
7413
7414 return 0;
7415 }
7416
7417 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
7418 {
7419 rgw_rados_ref ref;
7420 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7421 if (r < 0) {
7422 return r;
7423 }
7424
7425 BucketShard bs(this);
7426
7427 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7428
7429 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7430
7431 int ret = guard_reshard(&bs, obj_instance, bucket_info,
7432 [&](BucketShard *pbs) -> int {
7433 ObjectWriteOperation op;
7434 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7435 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
7436 });
7437 if (ret < 0) {
7438 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
7439 return ret;
7440 }
7441
7442 return 0;
7443 }
7444
7445 static int decode_olh_info(CephContext* cct, const bufferlist& bl, RGWOLHInfo *olh)
7446 {
7447 try {
7448 auto biter = bl.cbegin();
7449 decode(*olh, biter);
7450 return 0;
7451 } catch (buffer::error& err) {
7452 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
7453 return -EIO;
7454 }
7455 }
7456
7457 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7458 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
7459 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7460 {
7461 if (log.empty()) {
7462 return 0;
7463 }
7464
7465 librados::ObjectWriteOperation op;
7466
7467 uint64_t last_ver = log.rbegin()->first;
7468 *plast_ver = last_ver;
7469
7470 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
7471
7472 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
7473 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GTE, last_ver);
7474
7475 bufferlist ver_bl;
7476 string last_ver_s = to_string(last_ver);
7477 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
7478 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
7479
7480 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
7481 op.mtime2(&mtime_ts);
7482
7483 bool need_to_link = false;
7484 uint64_t link_epoch = 0;
7485 cls_rgw_obj_key key;
7486 bool delete_marker = false;
7487 list<cls_rgw_obj_key> remove_instances;
7488 bool need_to_remove = false;
7489
7490 // decode current epoch and instance
7491 auto olh_ver = state.attrset.find(RGW_ATTR_OLH_VER);
7492 if (olh_ver != state.attrset.end()) {
7493 std::string str = olh_ver->second.to_str();
7494 std::string err;
7495 link_epoch = strict_strtoll(str.c_str(), 10, &err);
7496 }
7497 auto olh_info = state.attrset.find(RGW_ATTR_OLH_INFO);
7498 if (olh_info != state.attrset.end()) {
7499 RGWOLHInfo info;
7500 int r = decode_olh_info(cct, olh_info->second, &info);
7501 if (r < 0) {
7502 return r;
7503 }
7504 info.target.key.get_index_key(&key);
7505 delete_marker = info.removed;
7506 }
7507
7508 for (iter = log.begin(); iter != log.end(); ++iter) {
7509 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7510 for (; viter != iter->second.end(); ++viter) {
7511 rgw_bucket_olh_log_entry& entry = *viter;
7512
7513 ldout(cct, 20) << "olh_log_entry: epoch=" << iter->first << " op=" << (int)entry.op
7514 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7515 << (entry.delete_marker ? "(delete)" : "") << dendl;
7516 switch (entry.op) {
7517 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7518 remove_instances.push_back(entry.key);
7519 break;
7520 case CLS_RGW_OLH_OP_LINK_OLH:
7521 // only overwrite a link of the same epoch if its key sorts before
7522 if (link_epoch < iter->first || key.instance.empty() ||
7523 key.instance > entry.key.instance) {
7524 ldout(cct, 20) << "apply_olh_log applying key=" << entry.key << " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7525 << " over current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7526 need_to_link = true;
7527 need_to_remove = false;
7528 key = entry.key;
7529 delete_marker = entry.delete_marker;
7530 } else {
7531 ldout(cct, 20) << "apply_olh skipping key=" << entry.key<< " epoch=" << iter->first << " delete_marker=" << entry.delete_marker
7532 << " before current=" << key << " epoch=" << link_epoch << " delete_marker=" << delete_marker << dendl;
7533 }
7534 break;
7535 case CLS_RGW_OLH_OP_UNLINK_OLH:
7536 need_to_remove = true;
7537 need_to_link = false;
7538 break;
7539 default:
7540 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7541 return -EIO;
7542 }
7543 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7544 attr_name.append(entry.op_tag);
7545 op.rmxattr(attr_name.c_str());
7546 }
7547 }
7548
7549 rgw_rados_ref ref;
7550 int r = get_obj_head_ref(bucket_info, obj, &ref);
7551 if (r < 0) {
7552 return r;
7553 }
7554
7555 const rgw_bucket& bucket = obj.bucket;
7556
7557 if (need_to_link) {
7558 rgw_obj target(bucket, key);
7559 RGWOLHInfo info;
7560 info.target = target;
7561 info.removed = delete_marker;
7562 bufferlist bl;
7563 encode(info, bl);
7564 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7565 }
7566
7567 /* first remove object instances */
7568 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7569 liter != remove_instances.end(); ++liter) {
7570 cls_rgw_obj_key& key = *liter;
7571 rgw_obj obj_instance(bucket, key);
7572 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7573 if (ret < 0 && ret != -ENOENT) {
7574 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7575 return ret;
7576 }
7577 }
7578
7579 /* update olh object */
7580 r = ref.ioctx.operate(ref.obj.oid, &op);
7581 if (r == -ECANCELED) {
7582 r = 0;
7583 }
7584 if (r < 0) {
7585 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7586 return r;
7587 }
7588
7589 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
7590 if (r < 0) {
7591 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7592 return r;
7593 }
7594
7595 if (need_to_remove) {
7596 ObjectWriteOperation rm_op;
7597
7598 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
7599 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7600 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7601 rm_op.remove();
7602
7603 r = ref.ioctx.operate(ref.obj.oid, &rm_op);
7604 if (r == -ECANCELED) {
7605 return 0; /* someone else won this race */
7606 } else {
7607 /*
7608 * only clear if was successful, otherwise we might clobber pending operations on this object
7609 */
7610 r = bucket_index_clear_olh(bucket_info, state, obj);
7611 if (r < 0) {
7612 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7613 return r;
7614 }
7615 }
7616 }
7617
7618 return 0;
7619 }
7620
7621 /*
7622 * read olh log and apply it
7623 */
7624 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7625 {
7626 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7627 bool is_truncated;
7628 uint64_t ver_marker = 0;
7629
7630 do {
7631 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7632 if (ret < 0) {
7633 return ret;
7634 }
7635 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7636 if (ret < 0) {
7637 return ret;
7638 }
7639 } while (is_truncated);
7640
7641 return 0;
7642 }
7643
7644 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
7645 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
7646 rgw_zone_set *zones_trace, bool log_data_change)
7647 {
7648 string op_tag;
7649
7650 rgw_obj olh_obj = target_obj;
7651 olh_obj.key.instance.clear();
7652
7653 RGWObjState *state = NULL;
7654
7655 int ret = 0;
7656 int i;
7657
7658 #define MAX_ECANCELED_RETRY 100
7659 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7660 if (ret == -ECANCELED) {
7661 obj_ctx.invalidate(olh_obj);
7662 }
7663
7664 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
7665 if (ret < 0) {
7666 return ret;
7667 }
7668
7669 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7670 if (ret < 0) {
7671 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7672 if (ret == -ECANCELED) {
7673 continue;
7674 }
7675 return ret;
7676 }
7677 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
7678 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7679 zones_trace, log_data_change);
7680 if (ret < 0) {
7681 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7682 if (ret == -ECANCELED) {
7683 // the bucket index rejected the link_olh() due to olh tag mismatch;
7684 // attempt to reconstruct olh head attributes based on the bucket index
7685 int r2 = repair_olh(state, bucket_info, olh_obj);
7686 if (r2 < 0 && r2 != -ECANCELED) {
7687 return r2;
7688 }
7689 continue;
7690 }
7691 return ret;
7692 }
7693 break;
7694 }
7695
7696 if (i == MAX_ECANCELED_RETRY) {
7697 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7698 return -EIO;
7699 }
7700
7701 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7702 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7703 ret = 0;
7704 }
7705 if (ret < 0) {
7706 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7707 return ret;
7708 }
7709
7710 return 0;
7711 }
7712
7713 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
7714 uint64_t olh_epoch, rgw_zone_set *zones_trace)
7715 {
7716 string op_tag;
7717
7718 rgw_obj olh_obj = target_obj;
7719 olh_obj.key.instance.clear();
7720
7721 RGWObjState *state = NULL;
7722
7723 int ret = 0;
7724 int i;
7725
7726 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7727 if (ret == -ECANCELED) {
7728 obj_ctx.invalidate(olh_obj);
7729 }
7730
7731 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
7732 if (ret < 0)
7733 return ret;
7734
7735 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7736 if (ret < 0) {
7737 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7738 if (ret == -ECANCELED) {
7739 continue;
7740 }
7741 return ret;
7742 }
7743
7744 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7745
7746 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7747 if (ret < 0) {
7748 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7749 if (ret == -ECANCELED) {
7750 continue;
7751 }
7752 return ret;
7753 }
7754 break;
7755 }
7756
7757 if (i == MAX_ECANCELED_RETRY) {
7758 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7759 return -EIO;
7760 }
7761
7762 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7763 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7764 return 0;
7765 }
7766 if (ret < 0) {
7767 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7768 return ret;
7769 }
7770
7771 return 0;
7772 }
7773
7774 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7775 {
7776 #define OBJ_INSTANCE_LEN 32
7777 char buf[OBJ_INSTANCE_LEN + 1];
7778
7779 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7780 no underscore for instance name due to the way we encode the raw keys */
7781
7782 target_key->set_instance(buf);
7783 }
7784
7785 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7786 {
7787 gen_rand_obj_instance_name(&target_obj->key);
7788 }
7789
7790 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7791 {
7792 map<string, bufferlist> attrset;
7793
7794 ObjectReadOperation op;
7795 op.getxattrs(&attrset, NULL);
7796
7797 int r = obj_operate(bucket_info, obj, &op);
7798 if (r < 0) {
7799 return r;
7800 }
7801
7802 auto iter = attrset.find(RGW_ATTR_OLH_INFO);
7803 if (iter == attrset.end()) { /* not an olh */
7804 return -EINVAL;
7805 }
7806
7807 return decode_olh_info(cct, iter->second, olh);
7808 }
7809
7810 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
7811 map<string, bufferlist> *rm_pending_entries)
7812 {
7813 map<string, bufferlist>::iterator iter = pending_entries.begin();
7814
7815 real_time now = real_clock::now();
7816
7817 while (iter != pending_entries.end()) {
7818 auto biter = iter->second.cbegin();
7819 RGWOLHPendingInfo pending_info;
7820 try {
7821 decode(pending_info, biter);
7822 } catch (buffer::error& err) {
7823 /* skipping bad entry, we could remove it but it might hide a bug */
7824 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7825 ++iter;
7826 continue;
7827 }
7828
7829 map<string, bufferlist>::iterator cur_iter = iter;
7830 ++iter;
7831 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7832 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7833 pending_entries.erase(cur_iter);
7834 } else {
7835 /* entries names are sorted by time (rounded to a second) */
7836 break;
7837 }
7838 }
7839 }
7840
7841 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7842 {
7843 rgw_rados_ref ref;
7844 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
7845 if (r < 0) {
7846 return r;
7847 }
7848
7849 // trim no more than 1000 entries per osd op
7850 constexpr int max_entries = 1000;
7851
7852 auto i = pending_attrs.begin();
7853 while (i != pending_attrs.end()) {
7854 ObjectWriteOperation op;
7855 bucket_index_guard_olh_op(state, op);
7856
7857 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7858 op.rmxattr(i->first.c_str());
7859 }
7860
7861 r = ref.ioctx.operate(ref.obj.oid, &op);
7862 if (r == -ENOENT || r == -ECANCELED) {
7863 /* raced with some other change, shouldn't sweat about it */
7864 return 0;
7865 }
7866 if (r < 0) {
7867 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7868 return r;
7869 }
7870 }
7871 return 0;
7872 }
7873
7874 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7875 {
7876 map<string, bufferlist> pending_entries;
7877 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7878
7879 map<string, bufferlist> rm_pending_entries;
7880 check_pending_olh_entries(pending_entries, &rm_pending_entries);
7881
7882 if (!rm_pending_entries.empty()) {
7883 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
7884 if (ret < 0) {
7885 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7886 return ret;
7887 }
7888 }
7889 if (!pending_entries.empty()) {
7890 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7891
7892 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7893 if (ret < 0) {
7894 return ret;
7895 }
7896 }
7897
7898 auto iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7899 if (iter == state->attrset.end()) {
7900 return -EINVAL;
7901 }
7902
7903 RGWOLHInfo olh;
7904 int ret = decode_olh_info(cct, iter->second, &olh);
7905 if (ret < 0) {
7906 return ret;
7907 }
7908
7909 if (olh.removed) {
7910 return -ENOENT;
7911 }
7912
7913 *target = olh.target;
7914
7915 return 0;
7916 }
7917
7918 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7919 map<string, bufferlist> *attrs, bufferlist *first_chunk,
7920 RGWObjVersionTracker *objv_tracker)
7921 {
7922 rgw_rados_ref ref;
7923 int r = get_raw_obj_ref(obj, &ref);
7924 if (r < 0) {
7925 return r;
7926 }
7927
7928 map<string, bufferlist> unfiltered_attrset;
7929 uint64_t size = 0;
7930 struct timespec mtime_ts;
7931
7932 ObjectReadOperation op;
7933 if (objv_tracker) {
7934 objv_tracker->prepare_op_for_read(&op);
7935 }
7936 if (attrs) {
7937 op.getxattrs(&unfiltered_attrset, NULL);
7938 }
7939 if (psize || pmtime) {
7940 op.stat2(&size, &mtime_ts, NULL);
7941 }
7942 if (first_chunk) {
7943 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7944 }
7945 bufferlist outbl;
7946 r = ref.ioctx.operate(ref.obj.oid, &op, &outbl);
7947
7948 if (epoch) {
7949 *epoch = ref.ioctx.get_last_version();
7950 }
7951
7952 if (r < 0)
7953 return r;
7954
7955 if (psize)
7956 *psize = size;
7957 if (pmtime)
7958 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7959 if (attrs) {
7960 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7961 }
7962
7963 return 0;
7964 }
7965
7966 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
7967 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7968 {
7969 vector<rgw_bucket_dir_header> headers;
7970 map<int, string> bucket_instance_ids;
7971 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7972 if (r < 0) {
7973 return r;
7974 }
7975
7976 ceph_assert(headers.size() == bucket_instance_ids.size());
7977
7978 auto iter = headers.begin();
7979 map<int, string>::iterator viter = bucket_instance_ids.begin();
7980 BucketIndexShardsManager ver_mgr;
7981 BucketIndexShardsManager master_ver_mgr;
7982 BucketIndexShardsManager marker_mgr;
7983 char buf[64];
7984 for(; iter != headers.end(); ++iter, ++viter) {
7985 accumulate_raw_stats(*iter, stats);
7986 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7987 ver_mgr.add(viter->first, string(buf));
7988 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7989 master_ver_mgr.add(viter->first, string(buf));
7990 if (shard_id >= 0) {
7991 *max_marker = iter->max_marker;
7992 } else {
7993 marker_mgr.add(viter->first, iter->max_marker);
7994 }
7995 if (syncstopped != NULL)
7996 *syncstopped = iter->syncstopped;
7997 }
7998 ver_mgr.to_string(bucket_ver);
7999 master_ver_mgr.to_string(master_ver);
8000 if (shard_id < 0) {
8001 marker_mgr.to_string(max_marker);
8002 }
8003 return 0;
8004 }
8005
8006 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
8007 map<int, string>& markers)
8008 {
8009 vector<rgw_bucket_dir_header> headers;
8010 map<int, string> bucket_instance_ids;
8011 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
8012 if (r < 0)
8013 return r;
8014
8015 ceph_assert(headers.size() == bucket_instance_ids.size());
8016
8017 auto iter = headers.begin();
8018 map<int, string>::iterator viter = bucket_instance_ids.begin();
8019
8020 for(; iter != headers.end(); ++iter, ++viter) {
8021 if (shard_id >= 0) {
8022 markers[shard_id] = iter->max_marker;
8023 } else {
8024 markers[viter->first] = iter->max_marker;
8025 }
8026 }
8027 return 0;
8028 }
8029
8030 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
8031 RGWGetBucketStats_CB *cb;
8032 uint32_t pendings;
8033 map<RGWObjCategory, RGWStorageStats> stats;
8034 int ret_code;
8035 bool should_cb;
8036 Mutex lock;
8037
8038 public:
8039 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
8040 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
8041 lock("RGWGetBucketStatsContext") {}
8042
8043 void handle_response(int r, rgw_bucket_dir_header& header) override {
8044 Mutex::Locker l(lock);
8045 if (should_cb) {
8046 if ( r >= 0) {
8047 accumulate_raw_stats(header, stats);
8048 } else {
8049 ret_code = r;
8050 }
8051
8052 // Are we all done?
8053 if (--pendings == 0) {
8054 if (!ret_code) {
8055 cb->set_response(&stats);
8056 }
8057 cb->handle_response(ret_code);
8058 cb->put();
8059 }
8060 }
8061 }
8062
8063 void unset_cb() {
8064 Mutex::Locker l(lock);
8065 should_cb = false;
8066 }
8067 };
8068
8069 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
8070 {
8071 int num_aio = 0;
8072 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
8073 ceph_assert(get_ctx);
8074 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
8075 if (r < 0) {
8076 ctx->put();
8077 if (num_aio) {
8078 get_ctx->unset_cb();
8079 }
8080 }
8081 get_ctx->put();
8082 return r;
8083 }
8084
8085 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
8086 RGWGetUserStats_CB *cb;
8087
8088 public:
8089 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
8090 : cb(cb) {}
8091
8092 void handle_response(int r, cls_user_header& header) override {
8093 const cls_user_stats& hs = header.stats;
8094 if (r >= 0) {
8095 RGWStorageStats stats;
8096
8097 stats.size = hs.total_bytes;
8098 stats.size_rounded = hs.total_bytes_rounded;
8099 stats.num_objects = hs.total_entries;
8100
8101 cb->set_response(stats);
8102 }
8103
8104 cb->handle_response(r);
8105
8106 cb->put();
8107 }
8108 };
8109
8110 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
8111 {
8112 string user_str = user.to_str();
8113
8114 cls_user_header header;
8115 int r = cls_user_get_header(user_str, &header);
8116 if (r < 0)
8117 return r;
8118
8119 const cls_user_stats& hs = header.stats;
8120
8121 stats.size = hs.total_bytes;
8122 stats.size_rounded = hs.total_bytes_rounded;
8123 stats.num_objects = hs.total_entries;
8124
8125 return 0;
8126 }
8127
8128 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
8129 {
8130 string user_str = user.to_str();
8131
8132 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
8133 int r = cls_user_get_header_async(user_str, get_ctx);
8134 if (r < 0) {
8135 ctx->put();
8136 delete get_ctx;
8137 return r;
8138 }
8139
8140 return 0;
8141 }
8142
8143 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
8144 {
8145 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
8146 }
8147
8148 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
8149 {
8150 if (!bucket.oid.empty()) {
8151 obj.init(svc.zone->get_zone_params().domain_root, bucket.oid);
8152 } else {
8153 string oid;
8154 get_bucket_meta_oid(bucket, oid);
8155 obj.init(svc.zone->get_zone_params().domain_root, oid);
8156 }
8157 }
8158
8159 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
8160 real_time *pmtime, map<string, bufferlist> *pattrs)
8161 {
8162 size_t pos = meta_key.find(':');
8163 if (pos == string::npos) {
8164 return -EINVAL;
8165 }
8166 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
8167 rgw_bucket_instance_key_to_oid(oid);
8168
8169 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
8170 }
8171
8172 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
8173 real_time *pmtime, map<string, bufferlist> *pattrs)
8174 {
8175 string oid;
8176 if (bucket.oid.empty()) {
8177 get_bucket_meta_oid(bucket, oid);
8178 } else {
8179 oid = bucket.oid;
8180 }
8181
8182 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
8183 }
8184
8185 int RGWRados::get_bucket_instance_from_oid(RGWSysObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
8186 real_time *pmtime, map<string, bufferlist> *pattrs,
8187 rgw_cache_entry_info *cache_info,
8188 boost::optional<obj_version> refresh_version)
8189 {
8190 auto& domain_root = svc.zone->get_zone_params().domain_root;
8191
8192 ldout(cct, 20) << "reading from " << domain_root << ":" << oid << dendl;
8193
8194 bufferlist epbl;
8195
8196 int ret = rgw_get_system_obj(this, obj_ctx, domain_root,
8197 oid, epbl, &info.objv_tracker, pmtime, pattrs,
8198 cache_info, refresh_version);
8199 if (ret < 0) {
8200 return ret;
8201 }
8202
8203 auto iter = epbl.cbegin();
8204 try {
8205 decode(info, iter);
8206 } catch (buffer::error& err) {
8207 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
8208 return -EIO;
8209 }
8210 info.bucket.oid = oid;
8211 return 0;
8212 }
8213
8214 int RGWRados::get_bucket_entrypoint_info(RGWSysObjectCtx& obj_ctx,
8215 const string& tenant_name,
8216 const string& bucket_name,
8217 RGWBucketEntryPoint& entry_point,
8218 RGWObjVersionTracker *objv_tracker,
8219 real_time *pmtime,
8220 map<string, bufferlist> *pattrs,
8221 rgw_cache_entry_info *cache_info,
8222 boost::optional<obj_version> refresh_version)
8223 {
8224 bufferlist bl;
8225 string bucket_entry;
8226
8227 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
8228 int ret = rgw_get_system_obj(this, obj_ctx, svc.zone->get_zone_params().domain_root,
8229 bucket_entry, bl, objv_tracker, pmtime, pattrs,
8230 cache_info, refresh_version);
8231 if (ret < 0) {
8232 return ret;
8233 }
8234
8235 auto iter = bl.cbegin();
8236 try {
8237 decode(entry_point, iter);
8238 } catch (buffer::error& err) {
8239 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
8240 return -EIO;
8241 }
8242 return 0;
8243 }
8244
8245 int RGWRados::convert_old_bucket_info(RGWSysObjectCtx& obj_ctx,
8246 const string& tenant_name,
8247 const string& bucket_name)
8248 {
8249 RGWBucketEntryPoint entry_point;
8250 real_time ep_mtime;
8251 RGWObjVersionTracker ot;
8252 map<string, bufferlist> attrs;
8253 RGWBucketInfo info;
8254
8255 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
8256
8257 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
8258 if (ret < 0) {
8259 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
8260 return ret;
8261 }
8262
8263 if (!entry_point.has_bucket_info) {
8264 /* already converted! */
8265 return 0;
8266 }
8267
8268 info = entry_point.old_bucket_info;
8269 info.bucket.oid = bucket_name;
8270 info.ep_objv = ot.read_version;
8271
8272 ot.generate_new_write_ver(cct);
8273
8274 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
8275 if (ret < 0) {
8276 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
8277 return ret;
8278 }
8279
8280 return 0;
8281 }
8282
8283 int RGWRados::_get_bucket_info(RGWSysObjectCtx& obj_ctx,
8284 const string& tenant,
8285 const string& bucket_name,
8286 RGWBucketInfo& info,
8287 real_time *pmtime,
8288 map<string, bufferlist> *pattrs,
8289 boost::optional<obj_version> refresh_version)
8290 {
8291 string bucket_entry;
8292 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
8293
8294
8295 if (auto e = binfo_cache->find(bucket_entry)) {
8296 if (refresh_version &&
8297 e->info.objv_tracker.read_version.compare(&(*refresh_version))) {
8298 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
8299 << "a failure that should be debugged. I am a nice machine, "
8300 << "so I will try to recover." << dendl;
8301 binfo_cache->invalidate(bucket_entry);
8302 } else {
8303 info = e->info;
8304 if (pattrs)
8305 *pattrs = e->attrs;
8306 if (pmtime)
8307 *pmtime = e->mtime;
8308 return 0;
8309 }
8310 }
8311
8312 bucket_info_entry e;
8313 RGWBucketEntryPoint entry_point;
8314 real_time ep_mtime;
8315 RGWObjVersionTracker ot;
8316 rgw_cache_entry_info entry_cache_info;
8317 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
8318 entry_point, &ot, &ep_mtime, pattrs,
8319 &entry_cache_info, refresh_version);
8320 if (ret < 0) {
8321 /* only init these fields */
8322 info.bucket.tenant = tenant;
8323 info.bucket.name = bucket_name;
8324 return ret;
8325 }
8326
8327 if (entry_point.has_bucket_info) {
8328 info = entry_point.old_bucket_info;
8329 info.bucket.oid = bucket_name;
8330 info.bucket.tenant = tenant;
8331 info.ep_objv = ot.read_version;
8332 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
8333 return 0;
8334 }
8335
8336 /* data is in the bucket instance object, we need to get attributes from there, clear everything
8337 * that we got
8338 */
8339 if (pattrs) {
8340 pattrs->clear();
8341 }
8342
8343 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
8344
8345
8346 /* read bucket instance info */
8347
8348 string oid;
8349 get_bucket_meta_oid(entry_point.bucket, oid);
8350
8351 rgw_cache_entry_info cache_info;
8352
8353 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
8354 &cache_info, refresh_version);
8355 e.info.ep_objv = ot.read_version;
8356 info = e.info;
8357 if (ret < 0) {
8358 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
8359 info.bucket.tenant = tenant;
8360 info.bucket.name = bucket_name;
8361 // XXX and why return anything in case of an error anyway?
8362 return ret;
8363 }
8364
8365 if (pmtime)
8366 *pmtime = e.mtime;
8367 if (pattrs)
8368 *pattrs = e.attrs;
8369
8370 /* chain to both bucket entry point and bucket instance */
8371 if (!binfo_cache->put(svc.cache, bucket_entry, &e, {&entry_cache_info, &cache_info})) {
8372 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
8373 }
8374
8375 if (refresh_version &&
8376 refresh_version->compare(&info.objv_tracker.read_version)) {
8377 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
8378 << "have gone squirrelly. An administrator may have forced a "
8379 << "change; otherwise there is a problem somewhere." << dendl;
8380 }
8381
8382 return 0;
8383 }
8384
8385 int RGWRados::get_bucket_info(RGWSysObjectCtx& obj_ctx,
8386 const string& tenant, const string& bucket_name,
8387 RGWBucketInfo& info,
8388 real_time *pmtime, map<string, bufferlist> *pattrs)
8389 {
8390 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
8391 pattrs, boost::none);
8392 }
8393
8394 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
8395 ceph::real_time *pmtime,
8396 map<string, bufferlist> *pattrs)
8397 {
8398 RGWSysObjectCtx obj_ctx = svc.sysobj->init_obj_ctx();
8399
8400 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
8401 info, pmtime, pattrs, info.objv_tracker.read_version);
8402 }
8403
8404 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
8405 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
8406 map<string, bufferlist> *pattrs)
8407 {
8408 bufferlist epbl;
8409 encode(entry_point, epbl);
8410 string bucket_entry;
8411 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
8412 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
8413 }
8414
8415 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
8416 real_time mtime, map<string, bufferlist> *pattrs)
8417 {
8418 info.has_instance_obj = true;
8419 bufferlist bl;
8420
8421 encode(info, bl);
8422
8423 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
8424 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
8425 if (ret == -EEXIST) {
8426 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
8427 * bucket operation on this specific bucket (e.g., being synced from the master), but
8428 * since bucket instace meta object is unique for this specific bucket instace, we don't
8429 * need to return an error.
8430 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
8431 * master, creating a bucket, sending bucket creation to the master, we create the bucket
8432 * locally, while in the sync thread we sync the new bucket.
8433 */
8434 ret = 0;
8435 }
8436 return ret;
8437 }
8438
8439 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
8440 map<string, bufferlist> *pattrs, bool create_entry_point)
8441 {
8442 bool create_head = !info.has_instance_obj || create_entry_point;
8443
8444 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
8445 if (ret < 0) {
8446 return ret;
8447 }
8448
8449 if (!create_head)
8450 return 0; /* done! */
8451
8452 RGWBucketEntryPoint entry_point;
8453 entry_point.bucket = info.bucket;
8454 entry_point.owner = info.owner;
8455 entry_point.creation_time = info.creation_time;
8456 entry_point.linked = true;
8457 RGWObjVersionTracker ot;
8458 if (pep_objv && !pep_objv->tag.empty()) {
8459 ot.write_version = *pep_objv;
8460 } else {
8461 ot.generate_new_write_ver(cct);
8462 if (pep_objv) {
8463 *pep_objv = ot.write_version;
8464 }
8465 }
8466 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
8467 if (ret < 0)
8468 return ret;
8469
8470 return 0;
8471 }
8472
8473 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
8474 {
8475 auto obj_ctx = svc.sysobj->init_obj_ctx();
8476
8477 map<string, RGWBucketEnt>::iterator iter;
8478 for (iter = m.begin(); iter != m.end(); ++iter) {
8479 RGWBucketEnt& ent = iter->second;
8480 rgw_bucket& bucket = ent.bucket;
8481 ent.count = 0;
8482 ent.size = 0;
8483 ent.size_rounded = 0;
8484
8485 vector<rgw_bucket_dir_header> headers;
8486
8487 RGWBucketInfo bucket_info;
8488 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
8489 if (ret < 0) {
8490 return ret;
8491 }
8492
8493 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
8494 if (r < 0)
8495 return r;
8496
8497 auto hiter = headers.begin();
8498 for (; hiter != headers.end(); ++hiter) {
8499 RGWObjCategory category = main_category;
8500 auto iter = (hiter->stats).find(category);
8501 if (iter != hiter->stats.end()) {
8502 struct rgw_bucket_category_stats& stats = iter->second;
8503 ent.count += stats.num_entries;
8504 ent.size += stats.total_size;
8505 ent.size_rounded += stats.total_size_rounded;
8506 }
8507 }
8508
8509 // fill in placement_rule from the bucket instance for use in swift's
8510 // per-storage policy statistics
8511 ent.placement_rule = std::move(bucket_info.placement_rule);
8512 }
8513
8514 return m.size();
8515 }
8516
8517 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
8518 {
8519 rgw_rados_ref ref;
8520 int r = get_raw_obj_ref(obj, &ref);
8521 if (r < 0) {
8522 return r;
8523 }
8524 librados::Rados *rad = get_rados_handle();
8525 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
8526
8527 r = ref.ioctx.aio_append(ref.obj.oid, completion, bl, size);
8528 completion->release();
8529 return r;
8530 }
8531
8532 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
8533 {
8534 librados::IoCtx& io_ctx = ctx.io_ctx;
8535 librados::NObjectIterator& iter = ctx.iter;
8536
8537 int r = open_pool_ctx(pool, io_ctx, false);
8538 if (r < 0)
8539 return r;
8540
8541 iter = io_ctx.nobjects_begin();
8542
8543 return 0;
8544 }
8545
8546 int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
8547 {
8548 librados::IoCtx& io_ctx = ctx.io_ctx;
8549 librados::NObjectIterator& iter = ctx.iter;
8550
8551 int r = open_pool_ctx(pool, io_ctx, false);
8552 if (r < 0)
8553 return r;
8554
8555 librados::ObjectCursor oc;
8556 if (!oc.from_str(cursor)) {
8557 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
8558 return -EINVAL;
8559 }
8560
8561 try {
8562 iter = io_ctx.nobjects_begin(oc);
8563 return 0;
8564 } catch (const std::system_error& e) {
8565 r = -e.code().value();
8566 ldout(cct, 10) << "nobjects_begin threw " << e.what()
8567 << ", returning " << r << dendl;
8568 return r;
8569 } catch (const std::exception& e) {
8570 ldout(cct, 10) << "nobjects_begin threw " << e.what()
8571 << ", returning -5" << dendl;
8572 return -EIO;
8573 }
8574 }
8575
8576 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
8577 {
8578 return ctx.iter.get_cursor().to_str();
8579 }
8580
8581 static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
8582 vector<rgw_bucket_dir_entry>& objs,
8583 bool *is_truncated, RGWAccessListFilter *filter)
8584 {
8585 librados::IoCtx& io_ctx = ctx.io_ctx;
8586 librados::NObjectIterator& iter = ctx.iter;
8587
8588 if (iter == io_ctx.nobjects_end())
8589 return -ENOENT;
8590
8591 uint32_t i;
8592
8593 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
8594 rgw_bucket_dir_entry e;
8595
8596 string oid = iter->get_oid();
8597 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
8598
8599 // fill it in with initial values; we may correct later
8600 if (filter && !filter->filter(oid, oid))
8601 continue;
8602
8603 e.key = oid;
8604 objs.push_back(e);
8605 }
8606
8607 if (is_truncated)
8608 *is_truncated = (iter != io_ctx.nobjects_end());
8609
8610 return objs.size();
8611 }
8612
8613 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
8614 bool *is_truncated, RGWAccessListFilter *filter)
8615 {
8616 // catch exceptions from NObjectIterator::operator++()
8617 try {
8618 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
8619 } catch (const std::system_error& e) {
8620 int r = -e.code().value();
8621 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
8622 << ", returning " << r << dendl;
8623 return r;
8624 } catch (const std::exception& e) {
8625 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
8626 << ", returning -5" << dendl;
8627 return -EIO;
8628 }
8629 }
8630
8631 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
8632 {
8633 if (!ctx->initialized) {
8634 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
8635 if (r < 0) {
8636 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
8637 return r;
8638 }
8639 ctx->initialized = true;
8640 }
8641 return 0;
8642 }
8643
8644 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
8645 RGWListRawObjsCtx& ctx, list<string>& oids,
8646 bool *is_truncated)
8647 {
8648 if (!ctx.initialized) {
8649 return -EINVAL;
8650 }
8651 RGWAccessListFilterPrefix filter(prefix_filter);
8652 vector<rgw_bucket_dir_entry> objs;
8653 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
8654 if (r < 0) {
8655 if(r != -ENOENT)
8656 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
8657 return r;
8658 }
8659
8660 vector<rgw_bucket_dir_entry>::iterator iter;
8661 for (iter = objs.begin(); iter != objs.end(); ++iter) {
8662 oids.push_back(iter->key.name);
8663 }
8664
8665 return oids.size();
8666 }
8667
8668 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
8669 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
8670 bool *is_truncated)
8671 {
8672 if (!ctx.initialized) {
8673 int r = list_raw_objects_init(pool, string(), &ctx);
8674 if (r < 0) {
8675 return r;
8676 }
8677 }
8678
8679 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
8680 }
8681
8682 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
8683 {
8684 return pool_iterate_get_cursor(ctx.iter_ctx);
8685 }
8686
8687 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
8688 std::list<rgw_bi_log_entry>& result, bool *truncated)
8689 {
8690 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
8691 result.clear();
8692
8693 librados::IoCtx index_ctx;
8694 map<int, string> oids;
8695 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
8696 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
8697 if (r < 0)
8698 return r;
8699
8700 BucketIndexShardsManager marker_mgr;
8701 bool has_shards = (oids.size() > 1 || shard_id >= 0);
8702 // If there are multiple shards for the bucket index object, the marker
8703 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
8704 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
8705 // only contain one record, and the key is the bucket instance id.
8706 r = marker_mgr.from_string(marker, shard_id);
8707 if (r < 0)
8708 return r;
8709
8710 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
8711 if (r < 0)
8712 return r;
8713
8714 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
8715 map<int, list<rgw_bi_log_entry>::iterator> vends;
8716 if (truncated) {
8717 *truncated = false;
8718 }
8719 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
8720 for (; miter != bi_log_lists.end(); ++miter) {
8721 int shard_id = miter->first;
8722 vcurrents[shard_id] = miter->second.entries.begin();
8723 vends[shard_id] = miter->second.entries.end();
8724 if (truncated) {
8725 *truncated = (*truncated || miter->second.truncated);
8726 }
8727 }
8728
8729 size_t total = 0;
8730 bool has_more = true;
8731 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
8732 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
8733 while (total < max && has_more) {
8734 has_more = false;
8735
8736 viter = vcurrents.begin();
8737 eiter = vends.begin();
8738
8739 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
8740 assert (eiter != vends.end());
8741
8742 int shard_id = viter->first;
8743 list<rgw_bi_log_entry>::iterator& liter = viter->second;
8744
8745 if (liter == eiter->second){
8746 continue;
8747 }
8748 rgw_bi_log_entry& entry = *(liter);
8749 if (has_shards) {
8750 char buf[16];
8751 snprintf(buf, sizeof(buf), "%d", shard_id);
8752 string tmp_id;
8753 build_bucket_index_marker(buf, entry.id, &tmp_id);
8754 entry.id.swap(tmp_id);
8755 }
8756 marker_mgr.add(shard_id, entry.id);
8757 result.push_back(entry);
8758 total++;
8759 has_more = true;
8760 ++liter;
8761 }
8762 }
8763
8764 if (truncated) {
8765 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
8766 assert (eiter != vends.end());
8767 *truncated = (*truncated || (viter->second != eiter->second));
8768 }
8769 }
8770
8771 // Refresh marker, if there are multiple shards, the output will look like
8772 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
8773 // if there is no sharding, the simply marker (without oid) is returned
8774 if (has_shards) {
8775 marker_mgr.to_string(&marker);
8776 } else {
8777 if (!result.empty()) {
8778 marker = result.rbegin()->id;
8779 }
8780 }
8781
8782 return 0;
8783 }
8784
8785 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
8786 {
8787 librados::IoCtx index_ctx;
8788 map<int, string> bucket_objs;
8789
8790 BucketIndexShardsManager start_marker_mgr;
8791 BucketIndexShardsManager end_marker_mgr;
8792
8793 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8794 if (r < 0) {
8795 return r;
8796 }
8797
8798 r = start_marker_mgr.from_string(start_marker, shard_id);
8799 if (r < 0) {
8800 return r;
8801 }
8802
8803 r = end_marker_mgr.from_string(end_marker, shard_id);
8804 if (r < 0) {
8805 return r;
8806 }
8807
8808 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
8809 cct->_conf->rgw_bucket_index_max_aio)();
8810 }
8811
8812 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
8813 {
8814 librados::IoCtx index_ctx;
8815 map<int, string> bucket_objs;
8816 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8817 if (r < 0)
8818 return r;
8819
8820 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8821 }
8822
8823 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
8824 {
8825 librados::IoCtx index_ctx;
8826 map<int, string> bucket_objs;
8827 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8828 if (r < 0)
8829 return r;
8830
8831 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8832 }
8833
8834 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8835 rgw_bucket_dir_entry *dirent)
8836 {
8837 rgw_cls_bi_entry bi_entry;
8838 int r = bi_get(bucket_info, obj, BIIndexType::Instance, &bi_entry);
8839 if (r < 0 && r != -ENOENT) {
8840 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8841 }
8842 if (r < 0) {
8843 return r;
8844 }
8845 auto iter = bi_entry.data.cbegin();
8846 try {
8847 decode(*dirent, iter);
8848 } catch (buffer::error& err) {
8849 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8850 return -EIO;
8851 }
8852
8853 return 0;
8854 }
8855
8856 int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8857 rgw_bucket_olh_entry *olh)
8858 {
8859 rgw_cls_bi_entry bi_entry;
8860 int r = bi_get(bucket_info, obj, BIIndexType::OLH, &bi_entry);
8861 if (r < 0 && r != -ENOENT) {
8862 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8863 }
8864 if (r < 0) {
8865 return r;
8866 }
8867 auto iter = bi_entry.data.cbegin();
8868 try {
8869 decode(*olh, iter);
8870 } catch (buffer::error& err) {
8871 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8872 return -EIO;
8873 }
8874
8875 return 0;
8876 }
8877
8878 int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8879 BIIndexType index_type, rgw_cls_bi_entry *entry)
8880 {
8881 BucketShard bs(this);
8882 int ret = bs.init(bucket_info, obj);
8883 if (ret < 0) {
8884 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8885 return ret;
8886 }
8887
8888 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
8889
8890 return cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
8891 }
8892
8893 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
8894 {
8895 cls_rgw_bi_put(op, bs.bucket_obj, entry);
8896 }
8897
8898 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
8899 {
8900 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
8901 if (ret < 0)
8902 return ret;
8903
8904 return 0;
8905 }
8906
8907 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
8908 {
8909 BucketShard bs(this);
8910 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
8911 if (ret < 0) {
8912 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8913 return ret;
8914 }
8915
8916 return bi_put(bs, entry);
8917 }
8918
8919 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8920 {
8921 rgw_obj obj(bucket, obj_name);
8922 BucketShard bs(this);
8923 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
8924 if (ret < 0) {
8925 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8926 return ret;
8927 }
8928
8929 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
8930 if (ret == -ENOENT) {
8931 *is_truncated = false;
8932 }
8933 if (ret < 0)
8934 return ret;
8935
8936 return 0;
8937 }
8938
8939 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8940 {
8941 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
8942 if (ret < 0)
8943 return ret;
8944
8945 return 0;
8946 }
8947
8948 int RGWRados::bi_remove(BucketShard& bs)
8949 {
8950 int ret = bs.index_ctx.remove(bs.bucket_obj);
8951 if (ret == -ENOENT) {
8952 ret = 0;
8953 }
8954 if (ret < 0) {
8955 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
8956 return ret;
8957 }
8958
8959 return 0;
8960 }
8961
8962 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8963 {
8964 BucketShard bs(this);
8965 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
8966 if (ret < 0) {
8967 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8968 return ret;
8969 }
8970
8971 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8972 }
8973
8974 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
8975 {
8976 return gc_pool_ctx.operate(oid, op);
8977 }
8978
8979 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op, AioCompletion **pc)
8980 {
8981 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
8982 int r = gc_pool_ctx.aio_operate(oid, c, op);
8983 if (!pc) {
8984 c->release();
8985 } else {
8986 *pc = c;
8987 }
8988 return r;
8989 }
8990
8991 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
8992 {
8993 return gc_pool_ctx.operate(oid, op, pbl);
8994 }
8995
8996 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
8997 {
8998 return gc->list(index, marker, max, expired_only, result, truncated);
8999 }
9000
9001 int RGWRados::process_gc(bool expired_only)
9002 {
9003 return gc->process(expired_only);
9004 }
9005
9006 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
9007 {
9008 return lc->list_lc_progress(marker, max_entries, progress_map);
9009 }
9010
9011 int RGWRados::process_lc()
9012 {
9013 return lc->process();
9014 }
9015
9016 bool RGWRados::process_expire_objects()
9017 {
9018 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
9019 }
9020
9021 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
9022 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
9023 {
9024 rgw_zone_set zones_trace;
9025 if (_zones_trace) {
9026 zones_trace = *_zones_trace;
9027 }
9028 zones_trace.insert(svc.zone->get_zone().id);
9029
9030 ObjectWriteOperation o;
9031 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
9032 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
9033 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
9034 return bs.index_ctx.operate(bs.bucket_obj, &o);
9035 }
9036
9037 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
9038 int64_t pool, uint64_t epoch,
9039 rgw_bucket_dir_entry& ent, RGWObjCategory category,
9040 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
9041 {
9042 ObjectWriteOperation o;
9043 rgw_bucket_dir_entry_meta dir_meta;
9044 dir_meta = ent.meta;
9045 dir_meta.category = category;
9046
9047 rgw_zone_set zones_trace;
9048 if (_zones_trace) {
9049 zones_trace = *_zones_trace;
9050 }
9051 zones_trace.insert(svc.zone->get_zone().id);
9052
9053 rgw_bucket_entry_ver ver;
9054 ver.pool = pool;
9055 ver.epoch = epoch;
9056 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
9057 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
9058 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
9059 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
9060 complete_op_data *arg;
9061 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
9062 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
9063 librados::AioCompletion *completion = arg->rados_completion;
9064 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
9065 completion->release(); /* can't reference arg here, as it might have already been released */
9066 return ret;
9067 }
9068
9069 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
9070 int64_t pool, uint64_t epoch,
9071 rgw_bucket_dir_entry& ent, RGWObjCategory category,
9072 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
9073 {
9074 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
9075 }
9076
9077 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
9078 int64_t pool, uint64_t epoch,
9079 rgw_obj& obj,
9080 real_time& removed_mtime,
9081 list<rgw_obj_index_key> *remove_objs,
9082 uint16_t bilog_flags,
9083 rgw_zone_set *zones_trace)
9084 {
9085 rgw_bucket_dir_entry ent;
9086 ent.meta.mtime = removed_mtime;
9087 obj.key.get_index_key(&ent.key);
9088 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
9089 ent, RGWObjCategory::None, remove_objs,
9090 bilog_flags, zones_trace);
9091 }
9092
9093 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
9094 {
9095 rgw_bucket_dir_entry ent;
9096 obj.key.get_index_key(&ent.key);
9097 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
9098 -1 /* pool id */, 0, ent,
9099 RGWObjCategory::None, NULL, bilog_flags,
9100 zones_trace);
9101 }
9102
9103 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
9104 {
9105 librados::IoCtx index_ctx;
9106 map<int, string> bucket_objs;
9107 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
9108 if (r < 0)
9109 return r;
9110
9111 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
9112 }
9113
9114
9115 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
9116 int shard_id,
9117 const rgw_obj_index_key& start,
9118 const string& prefix,
9119 uint32_t num_entries,
9120 bool list_versions,
9121 map<string, rgw_bucket_dir_entry>& m,
9122 bool *is_truncated,
9123 rgw_obj_index_key *last_entry,
9124 bool (*force_check_filter)(const string& name))
9125 {
9126 ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
9127 " start " << start.name << "[" << start.instance << "] num_entries " <<
9128 num_entries << dendl;
9129
9130 librados::IoCtx index_ctx;
9131 // key - oid (for different shards if there is any)
9132 // value - list result for the corresponding oid (shard), it is filled by
9133 // the AIO callback
9134 map<int, string> oids;
9135 map<int, struct rgw_cls_list_ret> list_results;
9136 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
9137 if (r < 0)
9138 return r;
9139
9140 cls_rgw_obj_key start_key(start.name, start.instance);
9141 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
9142 list_versions, oids, list_results,
9143 cct->_conf->rgw_bucket_index_max_aio)();
9144 if (r < 0)
9145 return r;
9146
9147 // Create a list of iterators that are used to iterate each shard
9148 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents;
9149 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends;
9150 vector<string> vnames;
9151 vcurrents.reserve(list_results.size());
9152 vends.reserve(list_results.size());
9153 vnames.reserve(list_results.size());
9154 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9155 *is_truncated = false;
9156 for (; iter != list_results.end(); ++iter) {
9157 vcurrents.push_back(iter->second.dir.m.begin());
9158 vends.push_back(iter->second.dir.m.end());
9159 vnames.push_back(oids[iter->first]);
9160 *is_truncated = (*is_truncated || iter->second.is_truncated);
9161 }
9162
9163 // Create a map to track the next candidate entry from each shard, if the entry
9164 // from a specified shard is selected/erased, the next entry from that shard will
9165 // be inserted for next round selection
9166 map<string, size_t> candidates;
9167 for (size_t i = 0; i < vcurrents.size(); ++i) {
9168 if (vcurrents[i] != vends[i]) {
9169 candidates[vcurrents[i]->first] = i;
9170 }
9171 }
9172
9173 map<string, bufferlist> updates;
9174 uint32_t count = 0;
9175 int pos = -1;
9176 while (count < num_entries && !candidates.empty()) {
9177 r = 0;
9178 // Select the next one
9179 pos = candidates.begin()->second;
9180 const string& name = vcurrents[pos]->first;
9181 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
9182
9183 bool force_check = force_check_filter &&
9184 force_check_filter(dirent.key.name);
9185 if ((!dirent.exists && !dirent.is_delete_marker()) ||
9186 !dirent.pending_map.empty() ||
9187 force_check) {
9188 /* there are uncommitted ops. We need to check the current state,
9189 * and if the tags are old we need to do cleanup as well. */
9190 librados::IoCtx sub_ctx;
9191 sub_ctx.dup(index_ctx);
9192 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
9193 updates[vnames[pos]]);
9194 if (r < 0 && r != -ENOENT) {
9195 return r;
9196 }
9197 } else {
9198 r = 0;
9199 }
9200 if (r >= 0) {
9201 ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
9202 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
9203 m[name] = std::move(dirent);
9204 ++count;
9205 }
9206
9207 // Refresh the candidates map
9208 candidates.erase(candidates.begin());
9209 ++vcurrents[pos];
9210 if (vcurrents[pos] != vends[pos]) {
9211 candidates[vcurrents[pos]->first] = pos;
9212 }
9213 }
9214
9215 // Suggest updates if there is any
9216 map<string, bufferlist>::iterator miter = updates.begin();
9217 for (; miter != updates.end(); ++miter) {
9218 if (miter->second.length()) {
9219 ObjectWriteOperation o;
9220 cls_rgw_suggest_changes(o, miter->second);
9221 // we don't care if we lose suggested updates, send them off blindly
9222 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9223 index_ctx.aio_operate(miter->first, c, &o);
9224 c->release();
9225 }
9226 }
9227
9228 // Check if all the returned entries are consumed or not
9229 for (size_t i = 0; i < vcurrents.size(); ++i) {
9230 if (vcurrents[i] != vends[i]) {
9231 *is_truncated = true;
9232 break;
9233 }
9234 }
9235
9236 if (pos >= 0)
9237 *last_entry = std::move((--vcurrents[pos])->first);
9238
9239 return 0;
9240 }
9241
9242
9243 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
9244 int shard_id,
9245 const rgw_obj_index_key& start,
9246 const string& prefix,
9247 uint32_t num_entries,
9248 bool list_versions,
9249 std::vector<rgw_bucket_dir_entry>& ent_list,
9250 bool *is_truncated,
9251 rgw_obj_index_key *last_entry,
9252 bool (*force_check_filter)(const string& name)) {
9253 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
9254 " start " << start.name << "[" << start.instance <<
9255 "] num_entries " << num_entries << dendl;
9256
9257 static MultipartMetaFilter multipart_meta_filter;
9258
9259 *is_truncated = false;
9260 librados::IoCtx index_ctx;
9261
9262 map<int, string> oids;
9263 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
9264 if (r < 0)
9265 return r;
9266 const uint32_t num_shards = oids.size();
9267
9268 rgw_obj_index_key marker = start;
9269 uint32_t current_shard;
9270 if (shard_id >= 0) {
9271 current_shard = shard_id;
9272 } else if (start.empty()) {
9273 current_shard = 0u;
9274 } else {
9275 // at this point we have a marker (start) that has something in
9276 // it, so we need to get to the bucket shard index, so we can
9277 // start reading from there
9278
9279 std::string key;
9280 // test whether object name is a multipart meta name
9281 if(! multipart_meta_filter.filter(start.name, key)) {
9282 // if multipart_meta_filter fails, must be "regular" (i.e.,
9283 // unadorned) and the name is the key
9284 key = start.name;
9285 }
9286
9287 // now convert the key (oid) to an rgw_obj_key since that will
9288 // separate out the namespace, name, and instance
9289 rgw_obj_key obj_key;
9290 bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key);
9291 if (!parsed) {
9292 ldout(cct, 0) <<
9293 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9294 "start marker: '" << start << "'" << dendl;
9295 return -EINVAL;
9296 } else if (obj_key.name.empty()) {
9297 // if the name is empty that means the object name came in with
9298 // a namespace only, and therefore we need to start our scan at
9299 // the first bucket index shard
9300 current_shard = 0u;
9301 } else {
9302 // so now we have the key used to compute the bucket index shard
9303 // and can extract the specific shard from it
9304 current_shard = rgw_bucket_shard_index(obj_key.name, num_shards);
9305 }
9306 }
9307
9308 uint32_t count = 0u;
9309 map<string, bufferlist> updates;
9310 rgw_obj_index_key last_added_entry;
9311 while (count <= num_entries &&
9312 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
9313 current_shard < num_shards)) {
9314 const std::string& oid = oids[current_shard];
9315 rgw_cls_list_ret result;
9316
9317 librados::ObjectReadOperation op;
9318 cls_rgw_bucket_list_op(op, marker, prefix, num_entries,
9319 list_versions, &result);
9320 r = index_ctx.operate(oid, &op, nullptr);
9321 if (r < 0)
9322 return r;
9323
9324 for (auto& entry : result.dir.m) {
9325 rgw_bucket_dir_entry& dirent = entry.second;
9326
9327 bool force_check = force_check_filter &&
9328 force_check_filter(dirent.key.name);
9329 if ((!dirent.exists && !dirent.is_delete_marker()) ||
9330 !dirent.pending_map.empty() ||
9331 force_check) {
9332 /* there are uncommitted ops. We need to check the current state,
9333 * and if the tags are old we need to do cleanup as well. */
9334 librados::IoCtx sub_ctx;
9335 sub_ctx.dup(index_ctx);
9336 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
9337 if (r < 0 && r != -ENOENT) {
9338 return r;
9339 }
9340 } else {
9341 r = 0;
9342 }
9343
9344 // at this point either r >=0 or r == -ENOENT
9345 if (r >= 0) { // i.e., if r != -ENOENT
9346 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
9347 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
9348
9349 if (count < num_entries) {
9350 marker = last_added_entry = dirent.key; // double assign
9351 ent_list.emplace_back(std::move(dirent));
9352 ++count;
9353 } else {
9354 *is_truncated = true;
9355 goto check_updates;
9356 }
9357 } else { // r == -ENOENT
9358 // in the case of -ENOENT, make sure we're advancing marker
9359 // for possible next call to CLSRGWIssueBucketList
9360 marker = dirent.key;
9361 }
9362 } // entry for loop
9363
9364 if (!result.is_truncated) {
9365 // if we reached the end of the shard read next shard
9366 ++current_shard;
9367 marker = rgw_obj_index_key();
9368 }
9369 } // shard loop
9370
9371 check_updates:
9372
9373 // suggest updates if there is any
9374 map<string, bufferlist>::iterator miter = updates.begin();
9375 for (; miter != updates.end(); ++miter) {
9376 if (miter->second.length()) {
9377 ObjectWriteOperation o;
9378 cls_rgw_suggest_changes(o, miter->second);
9379 // we don't care if we lose suggested updates, send them off blindly
9380 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9381 index_ctx.aio_operate(miter->first, c, &o);
9382 c->release();
9383 }
9384 }
9385
9386 if (last_entry && !ent_list.empty()) {
9387 *last_entry = last_added_entry;
9388 }
9389
9390 return 0;
9391 } // RGWRados::cls_bucket_list_unordered
9392
9393
9394 int RGWRados::cls_obj_usage_log_add(const string& oid,
9395 rgw_usage_log_info& info)
9396 {
9397 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9398
9399 rgw_rados_ref ref;
9400 int r = get_raw_obj_ref(obj, &ref);
9401 if (r < 0) {
9402 return r;
9403 }
9404
9405 ObjectWriteOperation op;
9406 cls_rgw_usage_log_add(op, info);
9407
9408 r = ref.ioctx.operate(ref.obj.oid, &op);
9409 return r;
9410 }
9411
9412 int RGWRados::cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket,
9413 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
9414 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
9415 bool *is_truncated)
9416 {
9417 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9418
9419 rgw_rados_ref ref;
9420 int r = get_raw_obj_ref(obj, &ref);
9421 if (r < 0) {
9422 return r;
9423 }
9424
9425 *is_truncated = false;
9426
9427 r = cls_rgw_usage_log_read(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch,
9428 max_entries, read_iter, usage, is_truncated);
9429
9430 return r;
9431 }
9432
9433 int RGWRados::cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket,
9434 uint64_t start_epoch, uint64_t end_epoch)
9435 {
9436 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9437
9438 rgw_rados_ref ref;
9439 int r = get_raw_obj_ref(obj, &ref);
9440 if (r < 0) {
9441 return r;
9442 }
9443
9444 r = cls_rgw_usage_log_trim(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch);
9445 return r;
9446 }
9447
9448 int RGWRados::cls_obj_usage_log_clear(string& oid)
9449 {
9450 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9451
9452 rgw_rados_ref ref;
9453 int r = get_raw_obj_ref(obj, &ref);
9454 if (r < 0) {
9455 return r;
9456 }
9457 librados::ObjectWriteOperation op;
9458 cls_rgw_usage_log_clear(op);
9459 r = ref.ioctx.operate(ref.obj.oid, &op);
9460 return r;
9461 }
9462
9463
9464 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
9465 {
9466 librados::IoCtx index_ctx;
9467 string dir_oid;
9468
9469 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
9470
9471 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
9472 if (r < 0)
9473 return r;
9474
9475 bufferlist updates;
9476
9477 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
9478 rgw_bucket_dir_entry entry;
9479 entry.key = *iter;
9480 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
9481 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9482 updates.append(CEPH_RGW_REMOVE | suggest_flag);
9483 encode(entry, updates);
9484 }
9485
9486 bufferlist out;
9487
9488 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
9489
9490 return r;
9491 }
9492
9493 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
9494 const RGWBucketInfo& bucket_info,
9495 rgw_bucket_dir_entry& list_state,
9496 rgw_bucket_dir_entry& object,
9497 bufferlist& suggested_updates)
9498 {
9499 const rgw_bucket& bucket = bucket_info.bucket;
9500 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
9501
9502 std::string loc;
9503
9504 rgw_obj obj(bucket, list_state.key);
9505
9506 string oid;
9507 get_obj_bucket_and_oid_loc(obj, oid, loc);
9508
9509 if (loc != list_state.locator) {
9510 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
9511 }
9512
9513 io_ctx.locator_set_key(list_state.locator);
9514
9515 RGWObjState *astate = NULL;
9516 RGWObjectCtx rctx(this);
9517 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
9518 if (r < 0)
9519 return r;
9520
9521 list_state.pending_map.clear(); // we don't need this and it inflates size
9522 if (!astate->exists) {
9523 /* object doesn't exist right now -- hopefully because it's
9524 * marked as !exists and got deleted */
9525 if (list_state.exists) {
9526 /* FIXME: what should happen now? Work out if there are any
9527 * non-bad ways this could happen (there probably are, but annoying
9528 * to handle!) */
9529 }
9530 // encode a suggested removal of that key
9531 list_state.ver.epoch = io_ctx.get_last_version();
9532 list_state.ver.pool = io_ctx.get_id();
9533 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
9534 return -ENOENT;
9535 }
9536
9537 string etag;
9538 string content_type;
9539 ACLOwner owner;
9540
9541 object.meta.size = astate->size;
9542 object.meta.accounted_size = astate->accounted_size;
9543 object.meta.mtime = astate->mtime;
9544
9545 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
9546 if (iter != astate->attrset.end()) {
9547 etag = rgw_bl_str(iter->second);
9548 }
9549 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
9550 if (iter != astate->attrset.end()) {
9551 content_type = rgw_bl_str(iter->second);
9552 }
9553 iter = astate->attrset.find(RGW_ATTR_ACL);
9554 if (iter != astate->attrset.end()) {
9555 r = decode_policy(iter->second, &owner);
9556 if (r < 0) {
9557 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
9558 }
9559 }
9560
9561 if (astate->has_manifest) {
9562 RGWObjManifest::obj_iterator miter;
9563 RGWObjManifest& manifest = astate->manifest;
9564 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
9565 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
9566 rgw_obj loc;
9567 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
9568
9569 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
9570 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
9571 r = delete_obj_index(loc, astate->mtime);
9572 if (r < 0) {
9573 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
9574 }
9575 }
9576 }
9577 }
9578
9579 object.meta.etag = etag;
9580 object.meta.content_type = content_type;
9581 object.meta.owner = owner.get_id().to_str();
9582 object.meta.owner_display_name = owner.get_display_name();
9583
9584 // encode suggested updates
9585 list_state.ver.pool = io_ctx.get_id();
9586 list_state.ver.epoch = astate->epoch;
9587 list_state.meta.size = object.meta.size;
9588 list_state.meta.accounted_size = object.meta.accounted_size;
9589 list_state.meta.mtime = object.meta.mtime;
9590 list_state.meta.category = main_category;
9591 list_state.meta.etag = etag;
9592 list_state.meta.content_type = content_type;
9593 if (astate->obj_tag.length() > 0)
9594 list_state.tag = astate->obj_tag.c_str();
9595 list_state.meta.owner = owner.get_id().to_str();
9596 list_state.meta.owner_display_name = owner.get_display_name();
9597
9598 list_state.exists = true;
9599 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
9600 return 0;
9601 }
9602
9603 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
9604 {
9605 librados::IoCtx index_ctx;
9606 map<int, string> oids;
9607 map<int, struct rgw_cls_list_ret> list_results;
9608 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
9609 if (r < 0)
9610 return r;
9611
9612 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
9613 if (r < 0)
9614 return r;
9615
9616 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9617 for(; iter != list_results.end(); ++iter) {
9618 headers.push_back(std::move(iter->second.dir.header));
9619 }
9620 return 0;
9621 }
9622
9623 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
9624 {
9625 librados::IoCtx index_ctx;
9626 map<int, string> bucket_objs;
9627 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
9628 if (r < 0)
9629 return r;
9630
9631 map<int, string>::iterator iter = bucket_objs.begin();
9632 for (; iter != bucket_objs.end(); ++iter) {
9633 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
9634 if (r < 0) {
9635 ctx->put();
9636 break;
9637 } else {
9638 (*num_aio)++;
9639 }
9640 }
9641 return r;
9642 }
9643
9644 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
9645 {
9646 string buckets_obj_id;
9647 rgw_get_buckets_obj(user_id, buckets_obj_id);
9648 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9649
9650 rgw_rados_ref ref;
9651 int r = get_raw_obj_ref(obj, &ref);
9652 if (r < 0) {
9653 return r;
9654 }
9655
9656 librados::ObjectReadOperation op;
9657 int rc;
9658 ::cls_user_get_header(op, header, &rc);
9659 bufferlist ibl;
9660 r = ref.ioctx.operate(ref.obj.oid, &op, &ibl);
9661 if (r < 0)
9662 return r;
9663 if (rc < 0)
9664 return rc;
9665
9666 return 0;
9667 }
9668
9669 int RGWRados::cls_user_reset_stats(const string& user_id)
9670 {
9671 string buckets_obj_id;
9672 rgw_get_buckets_obj(user_id, buckets_obj_id);
9673 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9674
9675 rgw_rados_ref ref;
9676 int r = get_raw_obj_ref(obj, &ref);
9677 if (r < 0) {
9678 return r;
9679 }
9680
9681 librados::ObjectWriteOperation op;
9682 ::cls_user_reset_stats(op);
9683 return ref.ioctx.operate(ref.obj.oid, &op);
9684 }
9685
9686 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
9687 {
9688 string buckets_obj_id;
9689 rgw_get_buckets_obj(user_id, buckets_obj_id);
9690 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9691
9692 rgw_rados_ref ref;
9693 int r = get_raw_obj_ref(obj, &ref);
9694 if (r < 0) {
9695 return r;
9696 }
9697
9698 r = ::cls_user_get_header_async(ref.ioctx, ref.obj.oid, ctx);
9699 if (r < 0)
9700 return r;
9701
9702 return 0;
9703 }
9704
9705 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj,
9706 const RGWBucketInfo& bucket_info)
9707 {
9708 vector<rgw_bucket_dir_header> headers;
9709 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
9710 if (r < 0) {
9711 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
9712 return r;
9713 }
9714
9715 cls_user_bucket_entry entry;
9716
9717 bucket_info.bucket.convert(&entry.bucket);
9718
9719 for (const auto& hiter : headers) {
9720 for (const auto& iter : hiter.stats) {
9721 if (RGWObjCategory::Main == iter.first ||
9722 RGWObjCategory::MultiMeta == iter.first) {
9723 const struct rgw_bucket_category_stats& header_stats = iter.second;
9724 entry.size += header_stats.total_size;
9725 entry.size_rounded += header_stats.total_size_rounded;
9726 entry.count += header_stats.num_entries;
9727 }
9728 }
9729 }
9730
9731 list<cls_user_bucket_entry> entries;
9732 entries.push_back(entry);
9733
9734 r = cls_user_update_buckets(user_obj, entries, false);
9735 if (r < 0) {
9736 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
9737 return r;
9738 }
9739
9740 return 0;
9741 }
9742
9743 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
9744 {
9745 vector<rgw_bucket_dir_header> headers;
9746 RGWBucketInfo bucket_info;
9747 auto obj_ctx = svc.sysobj->init_obj_ctx();
9748 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
9749 if (ret < 0) {
9750 return ret;
9751 }
9752
9753 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
9754 if (ret < 0) {
9755 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
9756 return ret;
9757 }
9758
9759 bucket.convert(&entry.bucket);
9760
9761 for (const auto& hiter : headers) {
9762 for (const auto& iter : hiter.stats) {
9763 const struct rgw_bucket_category_stats& header_stats = iter.second;
9764 entry.size += header_stats.total_size;
9765 entry.size_rounded += header_stats.total_size_rounded;
9766 entry.count += header_stats.num_entries;
9767 }
9768 }
9769
9770 return 0;
9771 }
9772
9773 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
9774 const string& in_marker,
9775 const string& end_marker,
9776 const int max_entries,
9777 list<cls_user_bucket_entry>& entries,
9778 string * const out_marker,
9779 bool * const truncated)
9780 {
9781 rgw_rados_ref ref;
9782 int r = get_raw_obj_ref(obj, &ref);
9783 if (r < 0) {
9784 return r;
9785 }
9786
9787 librados::ObjectReadOperation op;
9788 int rc;
9789
9790 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
9791 bufferlist ibl;
9792 r = ref.ioctx.operate(ref.obj.oid, &op, &ibl);
9793 if (r < 0)
9794 return r;
9795 if (rc < 0)
9796 return rc;
9797
9798 return 0;
9799 }
9800
9801 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
9802 {
9803 rgw_rados_ref ref;
9804 int r = get_raw_obj_ref(obj, &ref);
9805 if (r < 0) {
9806 return r;
9807 }
9808
9809 librados::ObjectWriteOperation op;
9810 cls_user_set_buckets(op, entries, add);
9811 r = ref.ioctx.operate(ref.obj.oid, &op);
9812 if (r < 0)
9813 return r;
9814
9815 return 0;
9816 }
9817
9818 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
9819 {
9820 string buckets_obj_id;
9821 rgw_get_buckets_obj(user_id, buckets_obj_id);
9822 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9823 return cls_user_complete_stats_sync(obj);
9824 }
9825
9826 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
9827 {
9828 rgw_rados_ref ref;
9829 int r = get_raw_obj_ref(obj, &ref);
9830 if (r < 0) {
9831 return r;
9832 }
9833
9834 librados::ObjectWriteOperation op;
9835 ::cls_user_complete_stats_sync(op);
9836 r = ref.ioctx.operate(ref.obj.oid, &op);
9837 if (r < 0)
9838 return r;
9839
9840 return 0;
9841 }
9842
9843 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
9844 {
9845 list<cls_user_bucket_entry> l;
9846 l.push_back(entry);
9847
9848 return cls_user_update_buckets(obj, l, true);
9849 }
9850
9851 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
9852 {
9853 rgw_rados_ref ref;
9854 int r = get_system_obj_ref(obj, &ref);
9855 if (r < 0) {
9856 return r;
9857 }
9858
9859 librados::ObjectWriteOperation op;
9860 ::cls_user_remove_bucket(op, bucket);
9861 r = ref.ioctx.operate(ref.obj.oid, &op);
9862 if (r < 0)
9863 return r;
9864
9865 return 0;
9866 }
9867
9868 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
9869 RGWQuotaInfo& bucket_quota)
9870 {
9871 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
9872 return 0;
9873 }
9874
9875 bool need_resharding = false;
9876 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
9877 uint32_t suggested_num_shards;
9878
9879 const uint64_t max_objs_per_shard =
9880 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9881 int ret =
9882 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
9883 bucket_info.owner, bucket, bucket_quota,
9884 1, need_resharding, &suggested_num_shards);
9885 if (ret < 0) {
9886 return ret;
9887 }
9888
9889 if (need_resharding) {
9890 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
9891 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
9892 dendl;
9893 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
9894 }
9895
9896 return ret;
9897 }
9898
9899 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
9900 {
9901 RGWReshard reshard(this);
9902
9903 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
9904
9905 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
9906 if (new_num_shards <= num_source_shards) {
9907 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
9908 return 0;
9909 }
9910
9911 cls_rgw_reshard_entry entry;
9912 entry.time = real_clock::now();
9913 entry.tenant = bucket_info.owner.tenant;
9914 entry.bucket_name = bucket_info.bucket.name;
9915 entry.bucket_id = bucket_info.bucket.bucket_id;
9916 entry.old_num_shards = num_source_shards;
9917 entry.new_num_shards = new_num_shards;
9918
9919 return reshard.add(entry);
9920 }
9921
9922 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
9923 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only)
9924 {
9925 // if we only check size, then num_objs will set to 0
9926 if(check_size_only)
9927 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size);
9928
9929 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
9930 }
9931
9932 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
9933 uint32_t num_shards,
9934 map<int, string>& bucket_objects,
9935 int shard_id) {
9936 if (!num_shards) {
9937 bucket_objects[0] = bucket_oid_base;
9938 } else {
9939 char buf[bucket_oid_base.size() + 32];
9940 if (shard_id < 0) {
9941 for (uint32_t i = 0; i < num_shards; ++i) {
9942 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
9943 bucket_objects[i] = buf;
9944 }
9945 } else {
9946 if ((uint32_t)shard_id > num_shards) {
9947 return;
9948 }
9949 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
9950 bucket_objects[shard_id] = buf;
9951 }
9952 }
9953 }
9954
9955 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
9956 {
9957 const rgw_bucket& bucket = bucket_info.bucket;
9958 string plain_id = bucket.name + ":" + bucket.bucket_id;
9959 if (!bucket_info.num_shards) {
9960 (*result)[0] = plain_id;
9961 } else {
9962 char buf[16];
9963 if (shard_id < 0) {
9964 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
9965 snprintf(buf, sizeof(buf), ":%d", i);
9966 (*result)[i] = plain_id + buf;
9967 }
9968 } else {
9969 if ((uint32_t)shard_id > bucket_info.num_shards) {
9970 return;
9971 }
9972 snprintf(buf, sizeof(buf), ":%d", shard_id);
9973 (*result)[shard_id] = plain_id + buf;
9974 }
9975 }
9976 }
9977
9978 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
9979 int *shard_id)
9980 {
9981 int r = 0;
9982 switch (bucket_info.bucket_index_shard_hash_type) {
9983 case RGWBucketInfo::MOD:
9984 if (!bucket_info.num_shards) {
9985 if (shard_id) {
9986 *shard_id = -1;
9987 }
9988 } else {
9989 uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
9990 if (shard_id) {
9991 *shard_id = (int)sid;
9992 }
9993 }
9994 break;
9995 default:
9996 r = -ENOTSUP;
9997 }
9998 return r;
9999 }
10000
10001 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
10002 int shard_id, string *bucket_obj)
10003 {
10004 if (!num_shards) {
10005 // By default with no sharding, we use the bucket oid as itself
10006 (*bucket_obj) = bucket_oid_base;
10007 } else {
10008 char buf[bucket_oid_base.size() + 32];
10009 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
10010 (*bucket_obj) = buf;
10011 }
10012 }
10013
10014 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
10015 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
10016 {
10017 int r = 0;
10018 switch (hash_type) {
10019 case RGWBucketInfo::MOD:
10020 if (!num_shards) {
10021 // By default with no sharding, we use the bucket oid as itself
10022 (*bucket_obj) = bucket_oid_base;
10023 if (shard_id) {
10024 *shard_id = -1;
10025 }
10026 } else {
10027 uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
10028 char buf[bucket_oid_base.size() + 32];
10029 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
10030 (*bucket_obj) = buf;
10031 if (shard_id) {
10032 *shard_id = (int)sid;
10033 }
10034 }
10035 break;
10036 default:
10037 r = -ENOTSUP;
10038 }
10039 return r;
10040 }
10041
10042 uint64_t RGWRados::instance_id()
10043 {
10044 return get_rados_handle()->get_instance_id();
10045 }
10046
10047 uint64_t RGWRados::next_bucket_id()
10048 {
10049 Mutex::Locker l(bucket_id_lock);
10050 return ++max_bucket_id;
10051 }
10052
10053 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
10054 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
10055 {
10056 RGWRados *store = new RGWRados;
10057
10058 if ((*store).set_use_cache(use_cache)
10059 .set_run_gc_thread(use_gc_thread)
10060 .set_run_lc_thread(use_lc_thread)
10061 .set_run_quota_threads(quota_threads)
10062 .set_run_sync_thread(run_sync_thread)
10063 .set_run_reshard_thread(run_reshard_thread)
10064 .initialize(cct) < 0) {
10065 delete store;
10066 return NULL;
10067 }
10068
10069 return store;
10070 }
10071
10072 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
10073 {
10074 RGWRados *store = NULL;
10075 store = new RGWRados;
10076
10077 store->set_context(cct);
10078
10079 int ret = store->init_svc(true);
10080 if (ret < 0) {
10081 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
10082 return nullptr;
10083 }
10084
10085 if (store->init_rados() < 0) {
10086 delete store;
10087 return nullptr;
10088 }
10089
10090 return store;
10091 }
10092
10093 void RGWStoreManager::close_storage(RGWRados *store)
10094 {
10095 if (!store)
10096 return;
10097
10098 store->finalize();
10099
10100 delete store;
10101 }
10102
10103 librados::Rados* RGWRados::get_rados_handle()
10104 {
10105 return &rados;
10106 }
10107
10108 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
10109 {
10110 rgw_rados_ref ref;
10111 int ret = get_raw_obj_ref(obj, &ref);
10112 if (ret < 0) {
10113 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
10114 return ret;
10115 }
10116
10117 ObjectWriteOperation op;
10118 list<string> prefixes;
10119 cls_rgw_remove_obj(op, prefixes);
10120
10121 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
10122 ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
10123 if (ret < 0) {
10124 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
10125 c->release();
10126 return ret;
10127 }
10128
10129 handles.push_back(c);
10130
10131 return 0;
10132 }
10133
10134 int RGWRados::delete_obj_aio(const rgw_obj& obj,
10135 RGWBucketInfo& bucket_info, RGWObjState *astate,
10136 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
10137 {
10138 rgw_rados_ref ref;
10139 int ret = get_obj_head_ref(bucket_info, obj, &ref);
10140 if (ret < 0) {
10141 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
10142 return ret;
10143 }
10144
10145 if (keep_index_consistent) {
10146 RGWRados::Bucket bop(this, bucket_info);
10147 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
10148
10149 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
10150 if (ret < 0) {
10151 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
10152 return ret;
10153 }
10154 }
10155
10156 ObjectWriteOperation op;
10157 list<string> prefixes;
10158 cls_rgw_remove_obj(op, prefixes);
10159
10160 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
10161 ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
10162 if (ret < 0) {
10163 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
10164 c->release();
10165 return ret;
10166 }
10167
10168 handles.push_back(c);
10169
10170 if (keep_index_consistent) {
10171 ret = delete_obj_index(obj, astate->mtime);
10172 if (ret < 0) {
10173 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
10174 return ret;
10175 }
10176 }
10177 return ret;
10178 }
10179
10180 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
10181 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
10182 if (value != attrs.end()) {
10183 auto bliter = value->second.cbegin();
10184 try {
10185 decode(cs_info, bliter);
10186 } catch (buffer::error& err) {
10187 return -EIO;
10188 }
10189 if (cs_info.blocks.size() == 0) {
10190 return -EIO;
10191 }
10192 if (cs_info.compression_type != "none")
10193 need_decompress = true;
10194 else
10195 need_decompress = false;
10196 return 0;
10197 } else {
10198 need_decompress = false;
10199 return 0;
10200 }
10201 }
10202
10203 bool RGWRados::call(std::string_view command, const cmdmap_t& cmdmap,
10204 std::string_view format, bufferlist& out)
10205 {
10206 if (command == "cache list"sv) {
10207 std::optional<std::string> filter;
10208 if (auto i = cmdmap.find("filter"); i != cmdmap.cend()) {
10209 filter = boost::get<std::string>(i->second);
10210 }
10211 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
10212 if (f) {
10213 f->open_array_section("cache_entries");
10214 call_list(filter, f.get());
10215 f->close_section();
10216 f->flush(out);
10217 return true;
10218 } else {
10219 out.append("Unable to create Formatter.\n");
10220 return false;
10221 }
10222 } else if (command == "cache inspect"sv) {
10223 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
10224 if (f) {
10225 const auto& target = boost::get<std::string>(cmdmap.at("target"));
10226 if (call_inspect(target, f.get())) {
10227 f->flush(out);
10228 return true;
10229 } else {
10230 out.append("Unable to find entry "s + target + ".\n");
10231 return false;
10232 }
10233 } else {
10234 out.append("Unable to create Formatter.\n");
10235 return false;
10236 }
10237 } else if (command == "cache erase"sv) {
10238 const auto& target = boost::get<std::string>(cmdmap.at("target"));
10239 if (call_erase(target)) {
10240 return true;
10241 } else {
10242 out.append("Unable to find entry "s + target + ".\n");
10243 return false;
10244 }
10245 } else if (command == "cache zap"sv) {
10246 call_zap();
10247 return true;
10248 }
10249 return false;
10250 }
10251
10252 void RGWRados::call_list(const std::optional<std::string>& s,
10253 ceph::Formatter *f)
10254 {
10255 if (!svc.cache) {
10256 return;
10257 }
10258 svc.cache->call_list(s, f);
10259 }
10260
10261 bool RGWRados::call_inspect(const std::string& s, Formatter *f)
10262 {
10263 if (!svc.cache) {
10264 return false;
10265 }
10266 return svc.cache->call_inspect(s, f);
10267 }
10268
10269 bool RGWRados::call_erase(const std::string& s) {
10270 if (!svc.cache) {
10271 return false;
10272 }
10273 return svc.cache->call_erase(s);
10274 }
10275
10276 void RGWRados::call_zap() {
10277 if (svc.cache) {
10278 return;
10279 }
10280 svc.cache->call_zap();
10281 }
10282
10283 string RGWRados::get_mfa_oid(const rgw_user& user)
10284 {
10285 return string("user:") + user.to_str();
10286 }
10287
10288 int RGWRados::get_mfa_ref(const rgw_user& user, rgw_rados_ref *ref)
10289 {
10290 string oid = get_mfa_oid(user);
10291 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10292 return get_system_obj_ref(obj, ref);
10293 }
10294
10295 int RGWRados::check_mfa(const rgw_user& user, const string& otp_id, const string& pin)
10296 {
10297 rgw_rados_ref ref;
10298
10299 int r = get_mfa_ref(user, &ref);
10300 if (r < 0) {
10301 return r;
10302 }
10303
10304 rados::cls::otp::otp_check_t result;
10305
10306 r = rados::cls::otp::OTP::check(cct, ref.ioctx, ref.obj.oid, otp_id, pin, &result);
10307 if (r < 0)
10308 return r;
10309
10310 ldout(cct, 20) << "OTP check, otp_id=" << otp_id << " result=" << (int)result.result << dendl;
10311
10312 return (result.result == rados::cls::otp::OTP_CHECK_SUCCESS ? 0 : -EACCES);
10313 }
10314
10315 void RGWRados::prepare_mfa_write(librados::ObjectWriteOperation *op,
10316 RGWObjVersionTracker *objv_tracker,
10317 const ceph::real_time& mtime)
10318 {
10319 RGWObjVersionTracker ot;
10320
10321 if (objv_tracker) {
10322 ot = *objv_tracker;
10323 }
10324
10325 if (ot.write_version.tag.empty()) {
10326 if (ot.read_version.tag.empty()) {
10327 ot.generate_new_write_ver(cct);
10328 } else {
10329 ot.write_version = ot.read_version;
10330 ot.write_version.ver++;
10331 }
10332 }
10333
10334 ot.prepare_op_for_write(op);
10335 struct timespec mtime_ts = real_clock::to_timespec(mtime);
10336 op->mtime2(&mtime_ts);
10337 }
10338
10339 int RGWRados::create_mfa(const rgw_user& user, const rados::cls::otp::otp_info_t& config,
10340 RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime)
10341 {
10342 rgw_rados_ref ref;
10343
10344 int r = get_mfa_ref(user, &ref);
10345 if (r < 0) {
10346 return r;
10347 }
10348
10349 librados::ObjectWriteOperation op;
10350 prepare_mfa_write(&op, objv_tracker, mtime);
10351 rados::cls::otp::OTP::create(&op, config);
10352 r = ref.ioctx.operate(ref.obj.oid, &op);
10353 if (r < 0) {
10354 ldout(cct, 20) << "OTP create, otp_id=" << config.id << " result=" << (int)r << dendl;
10355 return r;
10356 }
10357
10358 return 0;
10359 }
10360
10361 int RGWRados::remove_mfa(const rgw_user& user, const string& id,
10362 RGWObjVersionTracker *objv_tracker,
10363 const ceph::real_time& mtime)
10364 {
10365 rgw_rados_ref ref;
10366
10367 int r = get_mfa_ref(user, &ref);
10368 if (r < 0) {
10369 return r;
10370 }
10371
10372 librados::ObjectWriteOperation op;
10373 prepare_mfa_write(&op, objv_tracker, mtime);
10374 rados::cls::otp::OTP::remove(&op, id);
10375 r = ref.ioctx.operate(ref.obj.oid, &op);
10376 if (r < 0) {
10377 ldout(cct, 20) << "OTP remove, otp_id=" << id << " result=" << (int)r << dendl;
10378 return r;
10379 }
10380
10381 return 0;
10382 }
10383
10384 int RGWRados::get_mfa(const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result)
10385 {
10386 rgw_rados_ref ref;
10387
10388 int r = get_mfa_ref(user, &ref);
10389 if (r < 0) {
10390 return r;
10391 }
10392
10393 r = rados::cls::otp::OTP::get(nullptr, ref.ioctx, ref.obj.oid, id, result);
10394 if (r < 0) {
10395 return r;
10396 }
10397
10398 return 0;
10399 }
10400
10401 int RGWRados::list_mfa(const rgw_user& user, list<rados::cls::otp::otp_info_t> *result)
10402 {
10403 rgw_rados_ref ref;
10404
10405 int r = get_mfa_ref(user, &ref);
10406 if (r < 0) {
10407 return r;
10408 }
10409
10410 r = rados::cls::otp::OTP::get_all(nullptr, ref.ioctx, ref.obj.oid, result);
10411 if (r < 0) {
10412 return r;
10413 }
10414
10415 return 0;
10416 }
10417
10418 int RGWRados::otp_get_current_time(const rgw_user& user, ceph::real_time *result)
10419 {
10420 rgw_rados_ref ref;
10421
10422 int r = get_mfa_ref(user, &ref);
10423 if (r < 0) {
10424 return r;
10425 }
10426
10427 r = rados::cls::otp::OTP::get_current_time(ref.ioctx, ref.obj.oid, result);
10428 if (r < 0) {
10429 return r;
10430 }
10431
10432 return 0;
10433 }
10434
10435 int RGWRados::set_mfa(const string& oid, const list<rados::cls::otp::otp_info_t>& entries,
10436 bool reset_obj, RGWObjVersionTracker *objv_tracker,
10437 const real_time& mtime)
10438 {
10439 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10440 rgw_rados_ref ref;
10441 int r = get_system_obj_ref(obj, &ref);
10442 if (r < 0) {
10443 return r;
10444 }
10445
10446 librados::ObjectWriteOperation op;
10447 if (reset_obj) {
10448 op.remove();
10449 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
10450 op.create(false);
10451 }
10452 prepare_mfa_write(&op, objv_tracker, mtime);
10453 rados::cls::otp::OTP::set(&op, entries);
10454 r = ref.ioctx.operate(ref.obj.oid, &op);
10455 if (r < 0) {
10456 ldout(cct, 20) << "OTP set entries.size()=" << entries.size() << " result=" << (int)r << dendl;
10457 return r;
10458 }
10459
10460 return 0;
10461 }
10462
10463 int RGWRados::list_mfa(const string& oid, list<rados::cls::otp::otp_info_t> *result,
10464 RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime)
10465 {
10466 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10467 rgw_rados_ref ref;
10468 int r = get_system_obj_ref(obj, &ref);
10469 if (r < 0) {
10470 return r;
10471 }
10472 librados::ObjectReadOperation op;
10473 struct timespec mtime_ts;
10474 if (pmtime) {
10475 op.stat2(nullptr, &mtime_ts, nullptr);
10476 }
10477 objv_tracker->prepare_op_for_read(&op);
10478 r = rados::cls::otp::OTP::get_all(&op, ref.ioctx, ref.obj.oid, result);
10479 if (r < 0) {
10480 return r;
10481 }
10482 if (pmtime) {
10483 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
10484 }
10485
10486 return 0;
10487 }