]> git.proxmox.com Git - ceph.git/blob - ceph/src/rgw/rgw_rados.cc
import ceph nautilus 14.2.2
[ceph.git] / ceph / src / rgw / rgw_rados.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3
4 #include "include/compat.h"
5 #include <errno.h>
6 #include <stdlib.h>
7 #include <sys/types.h>
8 #include <boost/algorithm/string.hpp>
9 #include <string_view>
10
11 #include <boost/container/flat_set.hpp>
12 #include <boost/format.hpp>
13 #include <boost/optional.hpp>
14 #include <boost/utility/in_place_factory.hpp>
15
16 #include "common/ceph_json.h"
17
18 #include "common/errno.h"
19 #include "common/Formatter.h"
20 #include "common/Throttle.h"
21
22 #include "rgw_rados.h"
23 #include "rgw_zone.h"
24 #include "rgw_cache.h"
25 #include "rgw_acl.h"
26 #include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
27 #include "rgw_aio_throttle.h"
28 #include "rgw_bucket.h"
29 #include "rgw_rest_conn.h"
30 #include "rgw_cr_rados.h"
31 #include "rgw_cr_rest.h"
32 #include "rgw_putobj_processor.h"
33
34 #include "cls/rgw/cls_rgw_ops.h"
35 #include "cls/rgw/cls_rgw_client.h"
36 #include "cls/rgw/cls_rgw_const.h"
37 #include "cls/refcount/cls_refcount_client.h"
38 #include "cls/version/cls_version_client.h"
39 #include "cls/log/cls_log_client.h"
40 #include "cls/timeindex/cls_timeindex_client.h"
41 #include "cls/lock/cls_lock_client.h"
42 #include "cls/user/cls_user_client.h"
43 #include "cls/otp/cls_otp_client.h"
44 #include "osd/osd_types.h"
45
46 #include "rgw_tools.h"
47 #include "rgw_coroutine.h"
48 #include "rgw_compression.h"
49
50 #undef fork // fails to compile RGWPeriod::fork() below
51
52 #include "common/Clock.h"
53
54 using namespace librados;
55
56 #include <string>
57 #include <iostream>
58 #include <vector>
59 #include <atomic>
60 #include <list>
61 #include <map>
62 #include "include/random.h"
63
64 #include "rgw_gc.h"
65 #include "rgw_lc.h"
66
67 #include "rgw_object_expirer_core.h"
68 #include "rgw_sync.h"
69 #include "rgw_sync_counters.h"
70 #include "rgw_sync_trace.h"
71 #include "rgw_data_sync.h"
72 #include "rgw_realm_watcher.h"
73 #include "rgw_reshard.h"
74
75 #include "services/svc_zone.h"
76 #include "services/svc_zone_utils.h"
77 #include "services/svc_quota.h"
78 #include "services/svc_sync_modules.h"
79 #include "services/svc_sys_obj.h"
80 #include "services/svc_sys_obj_cache.h"
81
82 #include "compressor/Compressor.h"
83
84 #ifdef WITH_LTTNG
85 #define TRACEPOINT_DEFINE
86 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
87 #include "tracing/rgw_rados.h"
88 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
89 #undef TRACEPOINT_DEFINE
90 #else
91 #define tracepoint(...)
92 #endif
93
94 #define dout_context g_ceph_context
95 #define dout_subsys ceph_subsys_rgw
96
97
/*
 * File-local string constants.
 * NOTE(review): none of the four strings below is referenced in the part of
 * the file reviewed here; their meaning is inferred from their names only.
 */
static string shadow_ns = "shadow";
static string dir_oid_prefix = ".dir.";
static string default_bucket_index_pool_suffix = "rgw.buckets.index";
static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";

static string log_lock_name = "rgw_log_lock";
static RGWObjCategory main_category = RGWObjCategory::Main;
#define RGW_USAGE_OBJ_PREFIX "usage."

/* repeats the identical dout_subsys definition made earlier in this file */
#define dout_subsys ceph_subsys_rgw

/* NOTE(review): presumably the suffix of multipart-upload meta objects -- confirm at the use sites */
const std::string MP_META_SUFFIX = ".meta";
110
111
112 static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
113 const rgw_placement_rule& head_placement_rule,
114 const rgw_obj& obj, rgw_pool *pool)
115 {
116 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
117 RGWZonePlacementInfo placement;
118 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
119 return false;
120 }
121
122 if (!obj.in_extra_data) {
123 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
124 } else {
125 *pool = placement.get_data_extra_pool();
126 }
127 }
128
129 return true;
130 }
131
132 static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
133 const rgw_placement_rule& head_placement_rule,
134 const rgw_obj& obj, rgw_raw_obj *raw_obj)
135 {
136 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
137
138 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
139 }
140
141 rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
142 {
143 if (!is_raw) {
144 rgw_raw_obj r;
145 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
146 return r;
147 }
148 return raw_obj;
149 }
150
151 rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
152 {
153 if (!is_raw) {
154 rgw_raw_obj r;
155 store->obj_to_raw(placement_rule, obj, &r);
156 return r;
157 }
158 return raw_obj;
159 }
160
161 int rgw_init_ioctx(librados::Rados *rados, const rgw_pool& pool, IoCtx& ioctx, bool create)
162 {
163 int r = rados->ioctx_create(pool.name.c_str(), ioctx);
164 if (r == -ENOENT && create) {
165 r = rados->pool_create(pool.name.c_str());
166 if (r == -ERANGE) {
167 dout(0)
168 << __func__
169 << " ERROR: librados::Rados::pool_create returned " << cpp_strerror(-r)
170 << " (this can be due to a pool or placement group misconfiguration, e.g."
171 << " pg_num < pgp_num or mon_max_pg_per_osd exceeded)"
172 << dendl;
173 }
174 if (r < 0 && r != -EEXIST) {
175 return r;
176 }
177
178 r = rados->ioctx_create(pool.name.c_str(), ioctx);
179 if (r < 0) {
180 return r;
181 }
182
183 r = ioctx.application_enable(pg_pool_t::APPLICATION_NAME_RGW, false);
184 if (r < 0 && r != -EOPNOTSUPP) {
185 return r;
186 }
187 } else if (r < 0) {
188 return r;
189 }
190 if (!pool.ns.empty()) {
191 ioctx.set_namespace(pool.ns);
192 }
193 return 0;
194 }
195
196 void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
197 {
198 obj_version *check_objv = version_for_check();
199
200 if (check_objv) {
201 cls_version_check(*op, *check_objv, VER_COND_EQ);
202 }
203
204 cls_version_read(*op, &read_version);
205 }
206
207 void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
208 {
209 obj_version *check_objv = version_for_check();
210 obj_version *modify_version = version_for_write();
211
212 if (check_objv) {
213 cls_version_check(*op, *check_objv, VER_COND_EQ);
214 }
215
216 if (modify_version) {
217 cls_version_set(*op, *modify_version);
218 } else {
219 cls_version_inc(*op);
220 }
221 }
222
/*
 * Advance the iterator to the next stripe of the object.
 *
 * - Explicit manifests: step to the next explicit entry.
 * - Rule-based manifests: no-op when already at the end of the object or
 *   when there are no rules; otherwise move from the head to the first tail
 *   stripe, or from one tail stripe to the next, switching part / rule where
 *   a part boundary is crossed.
 *
 * update_location() is called whenever the position changed.
 */
void RGWObjManifest::obj_iterator::operator++()
{
  if (manifest->explicit_objs) {
    ++explicit_iter;

    update_explicit_pos();

    update_location();
    return;
  }

  uint64_t obj_size = manifest->get_obj_size();
  uint64_t head_size = manifest->get_head_size();

  /* already at the end of the object: nothing to advance to */
  if (ofs == obj_size) {
    return;
  }

  if (manifest->rules.empty()) {
    return;
  }

  /* are we still pointing at the head? */
  if (ofs < head_size) {
    rule_iter = manifest->rules.begin();
    RGWObjManifestRule *rule = &rule_iter->second;
    /* jump to the end of the head (or of the object, if it is shorter) */
    ofs = std::min(head_size, obj_size);
    stripe_ofs = ofs;
    cur_stripe = 1;
    stripe_size = std::min(obj_size - ofs, rule->stripe_max_size);
    if (rule->part_size > 0) {
      stripe_size = std::min(stripe_size, rule->part_size);
    }
    update_location();
    return;
  }

  RGWObjManifestRule *rule = &rule_iter->second;

  /* tail: move forward by one full stripe of the current rule */
  stripe_ofs += rule->stripe_max_size;
  cur_stripe++;
  dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;

  if (rule->part_size > 0) {
    /* multi part, multi stripes object */

    dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;

    if (stripe_ofs >= part_ofs + rule->part_size) {
      /* moved to the next part */
      cur_stripe = 0;
      part_ofs += rule->part_size;
      stripe_ofs = part_ofs;

      bool last_rule = (next_rule_iter == manifest->rules.end());
      /* move to the next rule? */
      if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
        rule_iter = next_rule_iter;
        /*
         * NOTE(review): next_rule_iter is unchanged since the check above, so
         * last_rule is still false here and next_rule_iter is always advanced.
         */
        last_rule = (next_rule_iter == manifest->rules.end());
        if (!last_rule) {
          ++next_rule_iter;
        }
        cur_part_id = rule_iter->second.start_part_num;
      } else {
        cur_part_id++;
      }

      rule = &rule_iter->second;
    }

    /* a stripe never extends past the end of its part */
    stripe_size = std::min(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
  }

  cur_override_prefix = rule->override_prefix;

  ofs = stripe_ofs;
  /* clamp to the end of the object; the resulting stripe is empty */
  if (ofs > obj_size) {
    ofs = obj_size;
    stripe_ofs = ofs;
    stripe_size = 0;
  }

  dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
  update_location();
}
308
309 int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m,
310 const rgw_placement_rule& head_placement_rule,
311 const rgw_placement_rule *tail_placement_rule,
312 const rgw_bucket& _b, const rgw_obj& _obj)
313 {
314 manifest = _m;
315
316 if (!tail_placement_rule) {
317 manifest->set_tail_placement(head_placement_rule, _b);
318 } else {
319 rgw_placement_rule new_tail_rule = *tail_placement_rule;
320 new_tail_rule.inherit_from(head_placement_rule);
321 manifest->set_tail_placement(new_tail_rule, _b);
322 }
323
324 manifest->set_head(head_placement_rule, _obj, 0);
325 last_ofs = 0;
326
327 if (manifest->get_prefix().empty()) {
328 char buf[33];
329 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
330
331 string oid_prefix = ".";
332 oid_prefix.append(buf);
333 oid_prefix.append("_");
334
335 manifest->set_prefix(oid_prefix);
336 }
337
338 bool found = manifest->get_rule(0, &rule);
339 if (!found) {
340 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
341 return -EIO;
342 }
343
344 uint64_t head_size = manifest->get_head_size();
345
346 if (head_size > 0) {
347 cur_stripe_size = head_size;
348 } else {
349 cur_stripe_size = rule.stripe_max_size;
350 }
351
352 cur_part_id = rule.start_part_num;
353
354 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
355
356 // Normal object which not generated through copy operation
357 manifest->set_tail_instance(_obj.key.instance);
358
359 manifest->update_iterators();
360
361 return 0;
362 }
363
364 int RGWObjManifest::generator::create_next(uint64_t ofs)
365 {
366 if (ofs < last_ofs) /* only going forward */
367 return -EINVAL;
368
369 uint64_t max_head_size = manifest->get_max_head_size();
370
371 if (ofs < max_head_size) {
372 manifest->set_head_size(ofs);
373 }
374
375 if (ofs >= max_head_size) {
376 manifest->set_head_size(max_head_size);
377 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
378 cur_stripe_size = rule.stripe_max_size;
379
380 if (cur_part_id == 0 && max_head_size > 0) {
381 cur_stripe++;
382 }
383 }
384
385 last_ofs = ofs;
386 manifest->set_obj_size(ofs);
387
388 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
389
390 manifest->update_iterators();
391
392 return 0;
393 }
394
395 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
396 {
397 return begin_iter;
398 }
399
400 const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
401 {
402 return end_iter;
403 }
404
405 RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
406 {
407 if (ofs > obj_size) {
408 ofs = obj_size;
409 }
410 RGWObjManifest::obj_iterator iter(this);
411 iter.seek(ofs);
412 return iter;
413 }
414
415 int RGWObjManifest::append(RGWObjManifest& m, const RGWZoneGroup& zonegroup,
416 const RGWZoneParams& zone_params)
417 {
418 if (explicit_objs || m.explicit_objs) {
419 return append_explicit(m, zonegroup, zone_params);
420 }
421
422 if (rules.empty()) {
423 *this = m;
424 return 0;
425 }
426
427 string override_prefix;
428
429 if (prefix.empty()) {
430 prefix = m.prefix;
431 }
432
433 if (prefix != m.prefix) {
434 override_prefix = m.prefix;
435 }
436
437 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
438 if (miter == m.rules.end()) {
439 return append_explicit(m, zonegroup, zone_params);
440 }
441
442 for (; miter != m.rules.end(); ++miter) {
443 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
444
445 RGWObjManifestRule& rule = last_rule->second;
446
447 if (rule.part_size == 0) {
448 rule.part_size = obj_size - rule.start_ofs;
449 }
450
451 RGWObjManifestRule& next_rule = miter->second;
452 if (!next_rule.part_size) {
453 next_rule.part_size = m.obj_size - next_rule.start_ofs;
454 }
455
456 string rule_prefix = prefix;
457 if (!rule.override_prefix.empty()) {
458 rule_prefix = rule.override_prefix;
459 }
460
461 string next_rule_prefix = m.prefix;
462 if (!next_rule.override_prefix.empty()) {
463 next_rule_prefix = next_rule.override_prefix;
464 }
465
466 if (rule.part_size != next_rule.part_size ||
467 rule.stripe_max_size != next_rule.stripe_max_size ||
468 rule_prefix != next_rule_prefix) {
469 if (next_rule_prefix != prefix) {
470 append_rules(m, miter, &next_rule_prefix);
471 } else {
472 append_rules(m, miter, NULL);
473 }
474 break;
475 }
476
477 uint64_t expected_part_num = rule.start_part_num + 1;
478 if (rule.part_size > 0) {
479 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
480 }
481
482 if (expected_part_num != next_rule.start_part_num) {
483 append_rules(m, miter, NULL);
484 break;
485 }
486 }
487
488 set_obj_size(obj_size + m.obj_size);
489
490 return 0;
491 }
492
493 int RGWObjManifest::append(RGWObjManifest& m, RGWSI_Zone *zone_svc)
494 {
495 return append(m, zone_svc->get_zonegroup(), zone_svc->get_zone_params());
496 }
497
498 void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
499 string *override_prefix)
500 {
501 for (; miter != m.rules.end(); ++miter) {
502 RGWObjManifestRule rule = miter->second;
503 rule.start_ofs += obj_size;
504 if (override_prefix)
505 rule.override_prefix = *override_prefix;
506 rules[rule.start_ofs] = rule;
507 }
508 }
509
510 void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
511 {
512 if (explicit_objs) {
513 return;
514 }
515 obj_iterator iter = obj_begin();
516
517 while (iter != obj_end()) {
518 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
519 const rgw_obj_select& os = iter.get_location();
520 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
521 part.loc_ofs = 0;
522
523 uint64_t ofs = iter.get_stripe_ofs();
524
525 if (ofs == 0) {
526 part.loc = obj;
527 } else {
528 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
529 }
530 ++iter;
531 uint64_t next_ofs = iter.get_stripe_ofs();
532
533 part.size = next_ofs - ofs;
534 }
535
536 explicit_objs = true;
537 rules.clear();
538 prefix.clear();
539 }
540
541 int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
542 {
543 if (!explicit_objs) {
544 convert_to_explicit(zonegroup, zone_params);
545 }
546 if (!m.explicit_objs) {
547 m.convert_to_explicit(zonegroup, zone_params);
548 }
549 map<uint64_t, RGWObjManifestPart>::iterator iter;
550 uint64_t base = obj_size;
551 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
552 RGWObjManifestPart& part = iter->second;
553 objs[base + iter->first] = part;
554 }
555 obj_size += m.obj_size;
556
557 return 0;
558 }
559
560 bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
561 {
562 if (rules.empty()) {
563 return false;
564 }
565
566 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
567 if (iter != rules.begin()) {
568 --iter;
569 }
570
571 *rule = iter->second;
572
573 return true;
574 }
575
576 void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
577 {
578 write_version.ver = 1;
579 #define TAG_LEN 24
580
581 write_version.tag.clear();
582 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
583 }
584
585 class RGWMetaNotifierManager : public RGWCoroutinesManager {
586 RGWRados *store;
587 RGWHTTPManager http_manager;
588
589 public:
590 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
591 http_manager(store->ctx(), completion_mgr) {
592 http_manager.start();
593 }
594
595 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
596 rgw_http_param_pair pairs[] = { { "type", "metadata" },
597 { "notify", NULL },
598 { NULL, NULL } };
599
600 list<RGWCoroutinesStack *> stacks;
601 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
602 RGWRESTConn *conn = iter->second;
603 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
604 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
605
606 stacks.push_back(stack);
607 }
608 return run(stacks);
609 }
610 };
611
612 class RGWDataNotifierManager : public RGWCoroutinesManager {
613 RGWRados *store;
614 RGWHTTPManager http_manager;
615
616 public:
617 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
618 http_manager(store->ctx(), completion_mgr) {
619 http_manager.start();
620 }
621
622 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
623 rgw_http_param_pair pairs[] = { { "type", "data" },
624 { "notify", NULL },
625 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
626 { NULL, NULL } };
627
628 list<RGWCoroutinesStack *> stacks;
629 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
630 RGWRESTConn *conn = iter->second;
631 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
632 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
633
634 stacks.push_back(stack);
635 }
636 return run(stacks);
637 }
638 };
639
640 /* class RGWRadosThread */
641
642 void RGWRadosThread::start()
643 {
644 worker = new Worker(cct, this);
645 worker->create(thread_name.c_str());
646 }
647
648 void RGWRadosThread::stop()
649 {
650 down_flag = true;
651 stop_process();
652 if (worker) {
653 worker->signal();
654 worker->join();
655 }
656 delete worker;
657 worker = NULL;
658 }
659
660 void *RGWRadosThread::Worker::entry() {
661 uint64_t msec = processor->interval_msec();
662 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
663
664 do {
665 utime_t start = ceph_clock_now();
666 int r = processor->process();
667 if (r < 0) {
668 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
669 }
670
671 if (processor->going_down())
672 break;
673
674 utime_t end = ceph_clock_now();
675 end -= start;
676
677 uint64_t cur_msec = processor->interval_msec();
678 if (cur_msec != msec) { /* was it reconfigured? */
679 msec = cur_msec;
680 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
681 }
682
683 if (cur_msec > 0) {
684 if (interval <= end)
685 continue; // next round
686
687 utime_t wait_time = interval;
688 wait_time -= end;
689
690 wait_interval(wait_time);
691 } else {
692 wait();
693 }
694 } while (!processor->going_down());
695
696 return NULL;
697 }
698
699 class RGWMetaNotifier : public RGWRadosThread {
700 RGWMetaNotifierManager notify_mgr;
701 RGWMetadataLog *const log;
702
703 uint64_t interval_msec() override {
704 return cct->_conf->rgw_md_notify_interval_msec;
705 }
706 void stop_process() override {
707 notify_mgr.stop();
708 }
709 public:
710 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
711 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
712
713 int process() override;
714 };
715
716 int RGWMetaNotifier::process()
717 {
718 set<int> shards;
719
720 log->read_clear_modified(shards);
721
722 if (shards.empty()) {
723 return 0;
724 }
725
726 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
727 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
728 }
729
730 notify_mgr.notify_all(store->svc.zone->get_zone_conn_map(), shards);
731
732 return 0;
733 }
734
735 class RGWDataNotifier : public RGWRadosThread {
736 RGWDataNotifierManager notify_mgr;
737
738 uint64_t interval_msec() override {
739 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
740 }
741 void stop_process() override {
742 notify_mgr.stop();
743 }
744 public:
745 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
746
747 int process() override;
748 };
749
750 int RGWDataNotifier::process()
751 {
752 if (!store->data_log) {
753 return 0;
754 }
755
756 map<int, set<string> > shards;
757
758 store->data_log->read_clear_modified(shards);
759
760 if (shards.empty()) {
761 return 0;
762 }
763
764 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
765 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
766 }
767
768 notify_mgr.notify_all(store->svc.zone->get_zone_data_notify_to_map(), shards);
769
770 return 0;
771 }
772
773 class RGWSyncProcessorThread : public RGWRadosThread {
774 public:
775 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
776 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
777 ~RGWSyncProcessorThread() override {}
778 int init() override = 0 ;
779 int process() override = 0;
780 };
781
782 class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
783 {
784 RGWMetaSyncStatusManager sync;
785
786 uint64_t interval_msec() override {
787 return 0; /* no interval associated, it'll run once until stopped */
788 }
789 void stop_process() override {
790 sync.stop();
791 }
792 public:
793 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
794 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
795
796 void wakeup_sync_shards(set<int>& shard_ids) {
797 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
798 sync.wakeup(*iter);
799 }
800 }
801 RGWMetaSyncStatusManager* get_manager() { return &sync; }
802
803 int init() override {
804 int ret = sync.init();
805 if (ret < 0) {
806 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
807 return ret;
808 }
809 return 0;
810 }
811
812 int process() override {
813 sync.run();
814 return 0;
815 }
816 };
817
818 class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
819 {
820 PerfCountersRef counters;
821 RGWDataSyncStatusManager sync;
822 bool initialized;
823
824 uint64_t interval_msec() override {
825 if (initialized) {
826 return 0; /* no interval associated, it'll run once until stopped */
827 } else {
828 #define DATA_SYNC_INIT_WAIT_SEC 20
829 return DATA_SYNC_INIT_WAIT_SEC * 1000;
830 }
831 }
832 void stop_process() override {
833 sync.stop();
834 }
835 public:
836 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
837 const RGWZone* source_zone)
838 : RGWSyncProcessorThread(_store, "data-sync"),
839 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
840 sync(_store, async_rados, source_zone->id, counters.get()),
841 initialized(false) {}
842
843 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
844 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
845 sync.wakeup(iter->first, iter->second);
846 }
847 }
848 RGWDataSyncStatusManager* get_manager() { return &sync; }
849
850 int init() override {
851 return 0;
852 }
853
854 int process() override {
855 while (!initialized) {
856 if (going_down()) {
857 return 0;
858 }
859 int ret = sync.init();
860 if (ret >= 0) {
861 initialized = true;
862 break;
863 }
864 /* we'll be back! */
865 return 0;
866 }
867 sync.run();
868 return 0;
869 }
870 };
871
872 class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
873 {
874 RGWCoroutinesManager crs;
875 RGWRados *store;
876 rgw::BucketTrimManager *bucket_trim;
877 RGWHTTPManager http;
878 const utime_t trim_interval;
879
880 uint64_t interval_msec() override { return 0; }
881 void stop_process() override { crs.stop(); }
882 public:
883 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
884 int interval)
885 : RGWSyncProcessorThread(store, "sync-log-trim"),
886 crs(store->ctx(), store->get_cr_registry()), store(store),
887 bucket_trim(bucket_trim),
888 http(store->ctx(), crs.get_completion_mgr()),
889 trim_interval(interval, 0)
890 {}
891
892 int init() override {
893 return http.start();
894 }
895 int process() override {
896 list<RGWCoroutinesStack*> stacks;
897 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
898 meta->call(create_meta_log_trim_cr(this, store, &http,
899 cct->_conf->rgw_md_log_max_shards,
900 trim_interval));
901 stacks.push_back(meta);
902
903 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
904 data->call(create_data_log_trim_cr(store, &http,
905 cct->_conf->rgw_data_log_num_shards,
906 trim_interval));
907 stacks.push_back(data);
908
909 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
910 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
911 stacks.push_back(bucket);
912
913 crs.run(stacks);
914 return 0;
915 }
916
917 // implements DoutPrefixProvider
918 CephContext *get_cct() const override { return store->ctx(); }
919 unsigned get_subsys() const
920 {
921 return dout_subsys;
922 }
923
924 std::ostream& gen_prefix(std::ostream& out) const
925 {
926 return out << "sync log trim: ";
927 }
928
929 };
930
931 void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
932 {
933 Mutex::Locker l(meta_sync_thread_lock);
934 if (meta_sync_processor_thread) {
935 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
936 }
937 }
938
939 void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
940 {
941 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
942 Mutex::Locker l(data_sync_thread_lock);
943 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
944 if (iter == data_sync_processor_threads.end()) {
945 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
946 return;
947 }
948
949 RGWDataSyncProcessorThread *thread = iter->second;
950 ceph_assert(thread);
951 thread->wakeup_sync_shards(shard_ids);
952 }
953
954 RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
955 {
956 Mutex::Locker l(meta_sync_thread_lock);
957 if (meta_sync_processor_thread) {
958 return meta_sync_processor_thread->get_manager();
959 }
960 return nullptr;
961 }
962
963 RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
964 {
965 Mutex::Locker l(data_sync_thread_lock);
966 auto thread = data_sync_processor_threads.find(source_zone);
967 if (thread == data_sync_processor_threads.end()) {
968 return nullptr;
969 }
970 return thread->second->get_manager();
971 }
972
973 int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
974 {
975 IoCtx ioctx;
976 int r = open_pool_ctx(pool, ioctx);
977 if (r < 0) {
978 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
979 return r;
980 }
981
982 bool requires;
983 r = ioctx.pool_requires_alignment2(&requires);
984 if (r < 0) {
985 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
986 << r << dendl;
987 return r;
988 }
989
990 if (!requires) {
991 *alignment = 0;
992 return 0;
993 }
994
995 uint64_t align;
996 r = ioctx.pool_required_alignment2(&align);
997 if (r < 0) {
998 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
999 << r << dendl;
1000 return r;
1001 }
1002 if (align != 0) {
1003 ldout(cct, 20) << "required alignment=" << align << dendl;
1004 }
1005 *alignment = align;
1006 return 0;
1007 }
1008
1009 void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
1010 {
1011 if (alignment == 0) {
1012 *max_size = size;
1013 return;
1014 }
1015
1016 if (size <= alignment) {
1017 *max_size = alignment;
1018 return;
1019 }
1020
1021 *max_size = size - (size % alignment);
1022 }
1023
1024 int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment)
1025 {
1026 uint64_t alignment;
1027 int r = get_required_alignment(pool, &alignment);
1028 if (r < 0) {
1029 return r;
1030 }
1031
1032 if (palignment) {
1033 *palignment = alignment;
1034 }
1035
1036 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
1037
1038 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
1039
1040 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
1041
1042 return 0;
1043 }
1044
1045 int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
1046 uint64_t *max_chunk_size, uint64_t *palignment)
1047 {
1048 rgw_pool pool;
1049 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
1050 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
1051 return -EIO;
1052 }
1053 return get_max_chunk_size(pool, max_chunk_size, palignment);
1054 }
1055
1056 class RGWIndexCompletionManager;
1057
1058 struct complete_op_data {
1059 Mutex lock{"complete_op_data"};
1060 AioCompletion *rados_completion{nullptr};
1061 int manager_shard_id{-1};
1062 RGWIndexCompletionManager *manager{nullptr};
1063 rgw_obj obj;
1064 RGWModifyOp op;
1065 string tag;
1066 rgw_bucket_entry_ver ver;
1067 cls_rgw_obj_key key;
1068 rgw_bucket_dir_entry_meta dir_meta;
1069 list<cls_rgw_obj_key> remove_objs;
1070 bool log_op;
1071 uint16_t bilog_op;
1072 rgw_zone_set zones_trace;
1073
1074 bool stopped{false};
1075
1076 void stop() {
1077 Mutex::Locker l(lock);
1078 stopped = true;
1079 }
1080 };
1081
1082 class RGWIndexCompletionThread : public RGWRadosThread {
1083 RGWRados *store;
1084
1085 uint64_t interval_msec() override {
1086 return 0;
1087 }
1088
1089 list<complete_op_data *> completions;
1090
1091 Mutex completions_lock;
1092 public:
1093 RGWIndexCompletionThread(RGWRados *_store)
1094 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
1095
1096 int process() override;
1097
1098 void add_completion(complete_op_data *completion) {
1099 {
1100 Mutex::Locker l(completions_lock);
1101 completions.push_back(completion);
1102 }
1103
1104 signal();
1105 }
1106 };
1107
1108 int RGWIndexCompletionThread::process()
1109 {
1110 list<complete_op_data *> comps;
1111
1112 {
1113 Mutex::Locker l(completions_lock);
1114 completions.swap(comps);
1115 }
1116
1117 for (auto c : comps) {
1118 std::unique_ptr<complete_op_data> up{c};
1119
1120 if (going_down()) {
1121 continue;
1122 }
1123 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
1124
1125 RGWRados::BucketShard bs(store);
1126 RGWBucketInfo bucket_info;
1127
1128 int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
1129 if (r < 0) {
1130 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
1131 /* not much to do */
1132 continue;
1133 }
1134
1135 r = store->guard_reshard(&bs, c->obj, bucket_info,
1136 [&](RGWRados::BucketShard *bs) -> int {
1137 librados::ObjectWriteOperation o;
1138 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
1139 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
1140 c->log_op, c->bilog_op, &c->zones_trace);
1141 return bs->index_ctx.operate(bs->bucket_obj, &o);
1142 });
1143 if (r < 0) {
1144 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
1145 /* ignoring error, can't do anything about it */
1146 continue;
1147 }
1148 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
1149 if (r < 0) {
1150 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
1151 }
1152 }
1153
1154 return 0;
1155 }
1156
1157 class RGWIndexCompletionManager {
1158 RGWRados *store{nullptr};
1159 vector<Mutex *> locks;
1160 vector<set<complete_op_data *> > completions;
1161
1162 RGWIndexCompletionThread *completion_thread{nullptr};
1163
1164 int num_shards;
1165
1166 std::atomic<int> cur_shard {0};
1167
1168
1169 public:
1170 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
1171 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
1172
1173 for (int i = 0; i < num_shards; i++) {
1174 char buf[64];
1175 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
1176 locks.push_back(new Mutex(buf));
1177 }
1178
1179 completions.resize(num_shards);
1180 }
1181 ~RGWIndexCompletionManager() {
1182 stop();
1183
1184 for (auto l : locks) {
1185 delete l;
1186 }
1187 }
1188
1189 int next_shard() {
1190 int result = cur_shard % num_shards;
1191 cur_shard++;
1192 return result;
1193 }
1194
1195 void create_completion(const rgw_obj& obj,
1196 RGWModifyOp op, string& tag,
1197 rgw_bucket_entry_ver& ver,
1198 const cls_rgw_obj_key& key,
1199 rgw_bucket_dir_entry_meta& dir_meta,
1200 list<cls_rgw_obj_key> *remove_objs, bool log_op,
1201 uint16_t bilog_op,
1202 rgw_zone_set *zones_trace,
1203 complete_op_data **result);
1204 bool handle_completion(completion_t cb, complete_op_data *arg);
1205
1206 int start() {
1207 completion_thread = new RGWIndexCompletionThread(store);
1208 int ret = completion_thread->init();
1209 if (ret < 0) {
1210 return ret;
1211 }
1212 completion_thread->start();
1213 return 0;
1214 }
1215 void stop() {
1216 if (completion_thread) {
1217 completion_thread->stop();
1218 delete completion_thread;
1219 }
1220
1221 for (int i = 0; i < num_shards; ++i) {
1222 Mutex::Locker l(*locks[i]);
1223 for (auto c : completions[i]) {
1224 c->stop();
1225 }
1226 }
1227 completions.clear();
1228 }
1229 };
1230
1231 static void obj_complete_cb(completion_t cb, void *arg)
1232 {
1233 complete_op_data *completion = (complete_op_data *)arg;
1234 completion->lock.Lock();
1235 if (completion->stopped) {
1236 completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
1237 delete completion;
1238 return;
1239 }
1240 bool need_delete = completion->manager->handle_completion(cb, completion);
1241 completion->lock.Unlock();
1242 if (need_delete) {
1243 delete completion;
1244 }
1245 }
1246
1247
1248 void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
1249 RGWModifyOp op, string& tag,
1250 rgw_bucket_entry_ver& ver,
1251 const cls_rgw_obj_key& key,
1252 rgw_bucket_dir_entry_meta& dir_meta,
1253 list<cls_rgw_obj_key> *remove_objs, bool log_op,
1254 uint16_t bilog_op,
1255 rgw_zone_set *zones_trace,
1256 complete_op_data **result)
1257 {
1258 complete_op_data *entry = new complete_op_data;
1259
1260 int shard_id = next_shard();
1261
1262 entry->manager_shard_id = shard_id;
1263 entry->manager = this;
1264 entry->obj = obj;
1265 entry->op = op;
1266 entry->tag = tag;
1267 entry->ver = ver;
1268 entry->key = key;
1269 entry->dir_meta = dir_meta;
1270 entry->log_op = log_op;
1271 entry->bilog_op = bilog_op;
1272
1273 if (remove_objs) {
1274 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
1275 entry->remove_objs.push_back(*iter);
1276 }
1277 }
1278
1279 if (zones_trace) {
1280 entry->zones_trace = *zones_trace;
1281 } else {
1282 entry->zones_trace.insert(store->svc.zone->get_zone().id);
1283 }
1284
1285 *result = entry;
1286
1287 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
1288
1289 Mutex::Locker l(*locks[shard_id]);
1290 completions[shard_id].insert(entry);
1291 }
1292
1293 bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
1294 {
1295 int shard_id = arg->manager_shard_id;
1296 {
1297 Mutex::Locker l(*locks[shard_id]);
1298
1299 auto& comps = completions[shard_id];
1300
1301 auto iter = comps.find(arg);
1302 if (iter == comps.end()) {
1303 return true;
1304 }
1305
1306 comps.erase(iter);
1307 }
1308
1309 int r = rados_aio_get_return_value(cb);
1310 if (r != -ERR_BUSY_RESHARDING) {
1311 return true;
1312 }
1313 completion_thread->add_completion(arg);
1314 return false;
1315 }
1316
1317 void RGWRados::finalize()
1318 {
1319 cct->get_admin_socket()->unregister_commands(this);
1320 if (run_sync_thread) {
1321 Mutex::Locker l(meta_sync_thread_lock);
1322 meta_sync_processor_thread->stop();
1323
1324 Mutex::Locker dl(data_sync_thread_lock);
1325 for (auto iter : data_sync_processor_threads) {
1326 RGWDataSyncProcessorThread *thread = iter.second;
1327 thread->stop();
1328 }
1329 if (sync_log_trimmer) {
1330 sync_log_trimmer->stop();
1331 }
1332 }
1333 if (async_rados) {
1334 async_rados->stop();
1335 }
1336 if (run_sync_thread) {
1337 delete meta_sync_processor_thread;
1338 meta_sync_processor_thread = NULL;
1339 Mutex::Locker dl(data_sync_thread_lock);
1340 for (auto iter : data_sync_processor_threads) {
1341 RGWDataSyncProcessorThread *thread = iter.second;
1342 delete thread;
1343 }
1344 data_sync_processor_threads.clear();
1345 delete sync_log_trimmer;
1346 sync_log_trimmer = nullptr;
1347 bucket_trim = boost::none;
1348 }
1349 if (meta_notifier) {
1350 meta_notifier->stop();
1351 delete meta_notifier;
1352 }
1353 if (data_notifier) {
1354 data_notifier->stop();
1355 delete data_notifier;
1356 }
1357 delete data_log;
1358 delete sync_tracer;
1359 if (async_rados) {
1360 delete async_rados;
1361 }
1362
1363 delete lc;
1364 lc = NULL;
1365
1366 delete gc;
1367 gc = NULL;
1368
1369 delete obj_expirer;
1370 obj_expirer = NULL;
1371
1372 RGWQuotaHandler::free_handler(quota_handler);
1373 if (cr_registry) {
1374 cr_registry->put();
1375 }
1376
1377 svc.shutdown();
1378
1379 delete meta_mgr;
1380 delete binfo_cache;
1381 delete obj_tombstone_cache;
1382
1383 if (reshard_wait.get()) {
1384 reshard_wait->stop();
1385 reshard_wait.reset();
1386 }
1387
1388 if (run_reshard_thread) {
1389 reshard->stop_processor();
1390 }
1391 delete reshard;
1392 delete index_completion_manager;
1393 }
1394
1395 /**
1396 * Initialize the RADOS instance and prepare to do other ops
1397 * Returns 0 on success, -ERR# on failure.
1398 */
1399 int RGWRados::init_rados()
1400 {
1401 int ret = 0;
1402 auto admin_socket = cct->get_admin_socket();
1403 for (auto cmd : admin_commands) {
1404 int r = admin_socket->register_command(cmd[0], cmd[1], this,
1405 cmd[2]);
1406 if (r < 0) {
1407 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
1408 << ")" << dendl;
1409 return r;
1410 }
1411 }
1412
1413 auto handles = std::vector<librados::Rados>{static_cast<size_t>(cct->_conf->rgw_num_rados_handles)};
1414
1415 for (auto& r : handles) {
1416 ret = r.init_with_context(cct);
1417 if (ret < 0) {
1418 return ret;
1419 }
1420 ret = r.connect();
1421 if (ret < 0) {
1422 return ret;
1423 }
1424 }
1425
1426 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1427 new RGWCoroutinesManagerRegistry(cct)};
1428 ret = crs->hook_to_admin_command("cr dump");
1429 if (ret < 0) {
1430 return ret;
1431 }
1432
1433 meta_mgr = new RGWMetadataManager(cct, this);
1434 data_log = new RGWDataChangesLog(cct, this);
1435 cr_registry = crs.release();
1436
1437 std::swap(handles, rados);
1438 return ret;
1439 }
1440
1441 int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
1442 {
1443 map<string,string> metadata = meta;
1444 metadata["num_handles"] = stringify(rados.size());
1445 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1446 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1447 metadata["zone_name"] = svc.zone->zone_name();
1448 metadata["zone_id"] = svc.zone->zone_id();
1449 string name = cct->_conf->name.get_id();
1450 if (name.compare(0, 4, "rgw.") == 0) {
1451 name = name.substr(4);
1452 }
1453 int ret = rados[0].service_daemon_register(daemon_type, name, metadata);
1454 if (ret < 0) {
1455 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1456 return ret;
1457 }
1458
1459 return 0;
1460 }
1461
1462 int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
1463 {
1464 int ret = rados[0].service_daemon_update_status(move(status));
1465 if (ret < 0) {
1466 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1467 return ret;
1468 }
1469
1470 return 0;
1471 }
1472
1473 /**
1474 * Initialize the RADOS instance and prepare to do other ops
1475 * Returns 0 on success, -ERR# on failure.
 *
 * Last initialization stage: initialize() calls it after init_svc() and
 * init_rados(). It creates the sync module instance, opens the internal
 * pool contexts, and then creates (and, depending on the run_* / use_*
 * flags, starts) the GC, object expirer, async rados processor, metadata
 * and data sync threads, notifiers, lifecycle, quota handler, caches,
 * reshard and the index completion manager.
 *
 * NOTE(review): the early returns leave already-created objects in place;
 * presumably the caller is expected to run finalize() -- confirm.
1476 */
1477 int RGWRados::init_complete()
1478 {
1479 int ret;
1480
1481 /*
1482 * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
1483 */
1484 auto& zone_public_config = svc.zone->get_zone();
1485 ret = svc.sync_modules->get_manager()->create_instance(cct, zone_public_config.tier_type, svc.zone->get_zone_params().tier_config, &sync_module);
1486 if (ret < 0) {
1487 lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
// -ENOENT gets an extra hint listing the registered module names
1488 if (ret == -ENOENT) {
1489 lderr(cct) << "ERROR: " << zone_public_config.tier_type
1490 << " sync module does not exist. valid sync modules: "
1491 << svc.sync_modules->get_manager()->get_registered_module_names()
1492 << dendl;
1493 }
1494 return ret;
1495 }
1496
1497 period_puller.reset(new RGWPeriodPuller(this));
1498 period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
1499 svc.zone->get_current_period()));
1500
// open the IoCtx of each internal pool; the first failure aborts
1501 ret = open_root_pool_ctx();
1502 if (ret < 0)
1503 return ret;
1504
1505 ret = open_gc_pool_ctx();
1506 if (ret < 0)
1507 return ret;
1508
1509 ret = open_lc_pool_ctx();
1510 if (ret < 0)
1511 return ret;
1512
1513 ret = open_objexp_pool_ctx();
1514 if (ret < 0)
1515 return ret;
1516
1517 ret = open_reshard_pool_ctx();
1518 if (ret < 0)
1519 return ret;
1520
1521 pools_initialized = true;
1522
// GC and object expirer are always created; their processors only run
// when use_gc_thread is set
1523 gc = new RGWGC();
1524 gc->initialize(cct, this);
1525
1526 obj_expirer = new RGWObjectExpirer(this);
1527
1528 if (use_gc_thread) {
1529 gc->start_processor();
1530 obj_expirer->start_processor();
1531 }
1532
1533 auto& current_period = svc.zone->get_current_period();
1534 auto& zonegroup = svc.zone->get_zonegroup();
1535 auto& zone_params = svc.zone->get_zone_params();
1536 auto& zone = svc.zone->get_zone();
1537
1538 /* no point of running sync thread if we don't have a master zone configured
1539 or there is no rest_master_conn */
1540 if (zonegroup.master_zone.empty() || !svc.zone->get_master_conn()
1541 || current_period.get_id().empty()) {
1542 run_sync_thread = false;
1543 }
1544
1545 if (run_sync_thread) {
1546 // initialize the log period history
1547 meta_mgr->init_oldest_log_period();
1548 }
1549
1550 async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
1551 async_rados->start();
1552
1553 ret = meta_mgr->init(current_period.get_id());
1554 if (ret < 0) {
1555 lderr(cct) << "ERROR: failed to initialize metadata log: "
1556 << cpp_strerror(-ret) << dendl;
1557 return ret;
1558 }
1559
// the metadata notifier is only created on the metadata master
1560 if (svc.zone->is_meta_master()) {
1561 auto md_log = meta_mgr->get_log(current_period.get_id());
1562 meta_notifier = new RGWMetaNotifier(this, md_log);
1563 meta_notifier->start();
1564 }
1565
1566 /* init it anyway, might run sync through radosgw-admin explicitly */
1567 sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
1568 sync_tracer->init(this);
1569 ret = sync_tracer->hook_to_admin_command();
1570 if (ret < 0) {
1571 return ret;
1572 }
1573
1574 if (run_sync_thread) {
// warn (without failing) about zonegroup placement targets that this
// zone has no pools for
1575 for (const auto &pt: zonegroup.placement_targets) {
1576 if (zone_params.placement_pools.find(pt.second.name)
1577 == zone_params.placement_pools.end()){
1578 ldout(cct, 0) << "WARNING: This zone does not contain the placement target "
1579 << pt.second.name << " present in zonegroup" << dendl;
1580 }
1581 }
// meta_sync_thread_lock stays held until the end of this if-block
1582 Mutex::Locker l(meta_sync_thread_lock);
1583 meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
1584 ret = meta_sync_processor_thread->init();
1585 if (ret < 0) {
1586 ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
1587 return ret;
1588 }
1589 meta_sync_processor_thread->start();
1590
1591 // configure the bucket trim manager
1592 rgw::BucketTrimConfig config;
1593 rgw::configure_bucket_trim(cct, config);
1594
1595 bucket_trim.emplace(this, config);
1596 ret = bucket_trim->init();
1597 if (ret < 0) {
1598 ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
1599 return ret;
1600 }
1601 data_log->set_observer(&*bucket_trim);
1602
// one data sync thread per source zone, keyed by the zone id
1603 Mutex::Locker dl(data_sync_thread_lock);
1604 for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
1605 ldout(cct, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
1606 auto *thread = new RGWDataSyncProcessorThread(this, async_rados, source_zone);
1607 ret = thread->init();
1608 if (ret < 0) {
1609 ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
// NOTE(review): `thread` is not stored in the map yet at this point, so
// it looks like it is leaked on this path -- confirm
1610 return ret;
1611 }
1612 thread->start();
1613 data_sync_processor_threads[source_zone->id] = thread;
1614 }
// the sync log trimmer is only created for a positive trim interval
1615 auto interval = cct->_conf->rgw_sync_log_trim_interval;
1616 if (interval > 0) {
1617 sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
1618 ret = sync_log_trimmer->init();
1619 if (ret < 0) {
1620 ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
1621 return ret;
1622 }
1623 sync_log_trimmer->start();
1624 }
1625 }
1626 data_notifier = new RGWDataNotifier(this);
1627 data_notifier->start();
1628
1629 lc = new RGWLC();
1630 lc->initialize(cct, this);
1631
1632 if (use_lc_thread)
1633 lc->start_processor();
1634
1635 quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);
1636
// a non-zero rgw_override_bucket_index_max_shards wins over the zone's
// setting; the result is capped at get_max_bucket_shards()
1637 bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
1638 zone.bucket_index_max_shards);
1639 if (bucket_index_max_shards > get_max_bucket_shards()) {
1640 bucket_index_max_shards = get_max_bucket_shards();
1641 ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
1642 << get_max_bucket_shards() << dendl;
1643 }
1644 ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;
1645
1646 binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
1647 binfo_cache->init(svc.cache);
1648
1649 bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */
1650
1651 if (need_tombstone_cache) {
1652 obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
1653 }
1654
1655 reshard_wait = std::make_shared<RGWReshardWait>();
1656
1657 reshard = new RGWReshard(this);
1658
1659 /* only the master zone in the zonegroup reshards buckets */
1660 run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
1661 if (run_reshard_thread) {
1662 reshard->start_processor();
1663 }
1664
// created last; the result of its start() is this function's return value
1665 index_completion_manager = new RGWIndexCompletionManager(this);
1666 ret = index_completion_manager->start();
1667
1668 return ret;
1669 }
1670
1671 int RGWRados::init_svc(bool raw)
1672 {
1673 if (raw) {
1674 return svc.init_raw(cct, use_cache);
1675 }
1676
1677 return svc.init(cct, use_cache);
1678 }
1679
1680 /**
1681 * Initialize the RADOS instance and prepare to do other ops
1682 * Returns 0 on success, -ERR# on failure.
1683 */
1684 int RGWRados::initialize()
1685 {
1686 int ret;
1687
1688 inject_notify_timeout_probability =
1689 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1690 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
1691
1692 ret = init_svc(false);
1693 if (ret < 0) {
1694 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
1695 return ret;
1696 }
1697
1698 host_id = svc.zone_utils->gen_host_id();
1699
1700 ret = init_rados();
1701 if (ret < 0)
1702 return ret;
1703
1704 return init_complete();
1705 }
1706
1707 /**
1708 * Open the pool used as root for this gateway
1709 * Returns: 0 on success, -ERR# otherwise.
1710 */
1711 int RGWRados::open_root_pool_ctx()
1712 {
1713 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true);
1714 }
1715
1716 int RGWRados::open_gc_pool_ctx()
1717 {
1718 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true);
1719 }
1720
1721 int RGWRados::open_lc_pool_ctx()
1722 {
1723 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true);
1724 }
1725
1726 int RGWRados::open_objexp_pool_ctx()
1727 {
1728 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true);
1729 }
1730
1731 int RGWRados::open_reshard_pool_ctx()
1732 {
1733 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true);
1734 }
1735
1736 int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx)
1737 {
1738 constexpr bool create = true; // create the pool if it doesn't exist
1739 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
1740 }
1741
1742 void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
1743 string *marker) {
1744 if (marker) {
1745 *marker = shard_id_str;
1746 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
1747 marker->append(shard_marker);
1748 }
1749 }
1750
1751 int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
1752 {
1753 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
1754
1755 if (!explicit_pool.empty()) {
1756 return open_pool_ctx(explicit_pool, index_ctx);
1757 }
1758
1759 auto& zonegroup = svc.zone->get_zonegroup();
1760 auto& zone_params = svc.zone->get_zone_params();
1761
1762 const rgw_placement_rule *rule = &bucket_info.placement_rule;
1763 if (rule->empty()) {
1764 rule = &zonegroup.default_placement;
1765 }
1766 auto iter = zone_params.placement_pools.find(rule->name);
1767 if (iter == zone_params.placement_pools.end()) {
1768 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
1769 return -EINVAL;
1770 }
1771
1772 int r = open_pool_ctx(iter->second.index_pool, index_ctx);
1773 if (r < 0)
1774 return r;
1775
1776 return 0;
1777 }
1778
1779 /**** logs ****/
1780
1781 struct log_list_state {
1782 string prefix;
1783 librados::IoCtx io_ctx;
1784 librados::NObjectIterator obit;
1785 };
1786
1787 int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
1788 {
1789 log_list_state *state = new log_list_state;
1790 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1791 if (r < 0) {
1792 delete state;
1793 return r;
1794 }
1795 state->prefix = prefix;
1796 state->obit = state->io_ctx.nobjects_begin();
1797 *handle = (RGWAccessHandle)state;
1798 return 0;
1799 }
1800
1801 int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1802 {
1803 log_list_state *state = static_cast<log_list_state *>(handle);
1804 while (true) {
1805 if (state->obit == state->io_ctx.nobjects_end()) {
1806 delete state;
1807 return -ENOENT;
1808 }
1809 if (state->prefix.length() &&
1810 state->obit->get_oid().find(state->prefix) != 0) {
1811 state->obit++;
1812 continue;
1813 }
1814 *name = state->obit->get_oid();
1815 state->obit++;
1816 break;
1817 }
1818 return 0;
1819 }
1820
1821 int RGWRados::log_remove(const string& name)
1822 {
1823 librados::IoCtx io_ctx;
1824 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
1825 if (r < 0)
1826 return r;
1827 return io_ctx.remove(name);
1828 }
1829
1830 struct log_show_state {
1831 librados::IoCtx io_ctx;
1832 bufferlist bl;
1833 bufferlist::const_iterator p;
1834 string name;
1835 uint64_t pos;
1836 bool eof;
1837 log_show_state() : pos(0), eof(false) {}
1838 };
1839
1840 int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
1841 {
1842 log_show_state *state = new log_show_state;
1843 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
1844 if (r < 0) {
1845 delete state;
1846 return r;
1847 }
1848 state->name = name;
1849 *handle = (RGWAccessHandle)state;
1850 return 0;
1851 }
1852
1853 int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
1854 {
1855 log_show_state *state = static_cast<log_show_state *>(handle);
1856 off_t off = state->p.get_off();
1857
1858 ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
1859 << " off " << off
1860 << " eof " << (int)state->eof
1861 << dendl;
1862 // read some?
1863 unsigned chunk = 1024*1024;
1864 if ((state->bl.length() - off) < chunk/2 && !state->eof) {
1865 bufferlist more;
1866 int r = state->io_ctx.read(state->name, more, chunk, state->pos);
1867 if (r < 0)
1868 return r;
1869 state->pos += r;
1870 bufferlist old;
1871 try {
1872 old.substr_of(state->bl, off, state->bl.length() - off);
1873 } catch (buffer::error& err) {
1874 return -EINVAL;
1875 }
1876 state->bl.clear();
1877 state->bl.claim(old);
1878 state->bl.claim_append(more);
1879 state->p = state->bl.cbegin();
1880 if ((unsigned)r < chunk)
1881 state->eof = true;
1882 ldout(cct, 10) << " read " << r << dendl;
1883 }
1884
1885 if (state->p.end())
1886 return 0; // end of file
1887 try {
1888 decode(*entry, state->p);
1889 }
1890 catch (const buffer::error &e) {
1891 return -EINVAL;
1892 }
1893 return 1;
1894 }
1895
1896 /**
1897 * usage_log_hash: get usage log key hash, based on name and index
1898 *
1899 * Get the usage object name. Since a user may have more than 1
1900 * object holding that info (multiple shards), we use index to
1901 * specify that shard number. Once index exceeds max shards it
1902 * wraps.
1903 * If name is not being set, results for all users will be returned
1904 * and index will wrap only after total shards number.
1905 *
1906 * @param cct [in] ceph context
1907 * @param name [in] user name
1908 * @param hash [out] hash value
1909 * @param index [in] shard index number
1910 */
1911 static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1912 {
1913 uint32_t val = index;
1914
1915 if (!name.empty()) {
1916 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
1917 val %= max_user_shards;
1918 val += ceph_str_hash_linux(name.c_str(), name.size());
1919 }
1920 char buf[17];
1921 int max_shards = cct->_conf->rgw_usage_max_shards;
1922 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1923 hash = buf;
1924 }
1925
1926 int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
1927 {
1928 uint32_t index = 0;
1929
1930 map<string, rgw_usage_log_info> log_objs;
1931
1932 string hash;
1933 string last_user;
1934
1935 /* restructure usage map, zone by object hash */
1936 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1937 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1938 const rgw_user_bucket& ub = iter->first;
1939 RGWUsageBatch& info = iter->second;
1940
1941 if (ub.user.empty()) {
1942 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
1943 continue;
1944 }
1945
1946 if (ub.user != last_user) {
1947 /* index *should* be random, but why waste extra cycles
1948 in most cases max user shards is not going to exceed 1,
1949 so just incrementing it */
1950 usage_log_hash(cct, ub.user, hash, index++);
1951 }
1952 last_user = ub.user;
1953 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1954
1955 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1956 v.push_back(miter->second);
1957 }
1958 }
1959
1960 map<string, rgw_usage_log_info>::iterator liter;
1961
1962 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
1963 int r = cls_obj_usage_log_add(liter->first, liter->second);
1964 if (r < 0)
1965 return r;
1966 }
1967 return 0;
1968 }
1969
1970 int RGWRados::read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
1971 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1972 rgw_usage_log_entry>& usage)
1973 {
1974 uint32_t num = max_entries;
1975 string hash, first_hash;
1976 string user_str = user.to_str();
1977 usage_log_hash(cct, user_str, first_hash, 0);
1978
1979 if (usage_iter.index) {
1980 usage_log_hash(cct, user_str, hash, usage_iter.index);
1981 } else {
1982 hash = first_hash;
1983 }
1984
1985 usage.clear();
1986
1987 do {
1988 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1989 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1990
1991 int ret = cls_obj_usage_log_read(hash, user_str, bucket_name, start_epoch, end_epoch, num,
1992 usage_iter.read_iter, ret_usage, is_truncated);
1993 if (ret == -ENOENT)
1994 goto next;
1995
1996 if (ret < 0)
1997 return ret;
1998
1999 num -= ret_usage.size();
2000
2001 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
2002 usage[iter->first].aggregate(iter->second);
2003 }
2004
2005 next:
2006 if (!*is_truncated) {
2007 usage_iter.read_iter.clear();
2008 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
2009 }
2010 } while (num && !*is_truncated && hash != first_hash);
2011 return 0;
2012 }
2013
2014 int RGWRados::trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
2015 {
2016 uint32_t index = 0;
2017 string hash, first_hash;
2018 string user_str = user.to_str();
2019 usage_log_hash(cct, user_str, first_hash, index);
2020
2021 hash = first_hash;
2022 do {
2023 int ret = cls_obj_usage_log_trim(hash, user_str, bucket_name, start_epoch, end_epoch);
2024
2025 if (ret < 0 && ret != -ENOENT)
2026 return ret;
2027
2028 usage_log_hash(cct, user_str, hash, ++index);
2029 } while (hash != first_hash);
2030
2031 return 0;
2032 }
2033
2034
2035 int RGWRados::clear_usage()
2036 {
2037 auto max_shards = cct->_conf->rgw_usage_max_shards;
2038 int ret=0;
2039 for (unsigned i=0; i < max_shards; i++){
2040 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
2041 ret = cls_obj_usage_log_clear(oid);
2042 if (ret < 0){
2043 ldout(cct,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
2044 return ret;
2045 }
2046 }
2047 return ret;
2048 }
2049
2050 int RGWRados::key_to_shard_id(const string& key, int max_shards)
2051 {
2052 return rgw_shard_id(key, max_shards);
2053 }
2054
2055 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
2056 {
2057 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
2058 char buf[16];
2059 if (shard_id) {
2060 *shard_id = val % max_shards;
2061 }
2062 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
2063 name = prefix + buf;
2064 }
2065
2066 void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
2067 {
2068 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
2069 val ^= ceph_str_hash_linux(section.c_str(), section.size());
2070 char buf[16];
2071 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
2072 name = prefix + buf;
2073 }
2074
2075 void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
2076 {
2077 char buf[16];
2078 snprintf(buf, sizeof(buf), "%u", shard_id);
2079 name = prefix + buf;
2080
2081 }
2082
2083 void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
2084 {
2085 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
2086 }
2087
2088 int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
2089 {
2090 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx, true);
2091
2092 }
2093
2094 int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
2095 {
2096 librados::IoCtx io_ctx;
2097
2098 int r = time_log_add_init(io_ctx);
2099 if (r < 0) {
2100 return r;
2101 }
2102
2103 ObjectWriteOperation op;
2104 utime_t t(ut);
2105 cls_log_add(op, t, section, key, bl);
2106
2107 return io_ctx.operate(oid, &op);
2108 }
2109
2110 int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
2111 librados::AioCompletion *completion, bool monotonic_inc)
2112 {
2113 librados::IoCtx io_ctx;
2114
2115 int r = time_log_add_init(io_ctx);
2116 if (r < 0) {
2117 return r;
2118 }
2119
2120 ObjectWriteOperation op;
2121 cls_log_add(op, entries, monotonic_inc);
2122
2123 if (!completion) {
2124 r = io_ctx.operate(oid, &op);
2125 } else {
2126 r = io_ctx.aio_operate(oid, completion, &op);
2127 }
2128 return r;
2129 }
2130
2131 int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
2132 int max_entries, list<cls_log_entry>& entries,
2133 const string& marker,
2134 string *out_marker,
2135 bool *truncated)
2136 {
2137 librados::IoCtx io_ctx;
2138
2139 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2140 if (r < 0)
2141 return r;
2142 librados::ObjectReadOperation op;
2143
2144 utime_t st(start_time);
2145 utime_t et(end_time);
2146
2147 cls_log_list(op, st, et, marker, max_entries, entries,
2148 out_marker, truncated);
2149
2150 bufferlist obl;
2151
2152 int ret = io_ctx.operate(oid, &op, &obl);
2153 if (ret < 0)
2154 return ret;
2155
2156 return 0;
2157 }
2158
2159 int RGWRados::time_log_info(const string& oid, cls_log_header *header)
2160 {
2161 librados::IoCtx io_ctx;
2162
2163 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2164 if (r < 0)
2165 return r;
2166 librados::ObjectReadOperation op;
2167
2168 cls_log_info(op, header);
2169
2170 bufferlist obl;
2171
2172 int ret = io_ctx.operate(oid, &op, &obl);
2173 if (ret < 0)
2174 return ret;
2175
2176 return 0;
2177 }
2178
2179 int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
2180 {
2181 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2182 if (r < 0)
2183 return r;
2184
2185 librados::ObjectReadOperation op;
2186
2187 cls_log_info(op, header);
2188
2189 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
2190 if (ret < 0)
2191 return ret;
2192
2193 return 0;
2194 }
2195
2196 int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
2197 const string& from_marker, const string& to_marker,
2198 librados::AioCompletion *completion)
2199 {
2200 librados::IoCtx io_ctx;
2201
2202 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
2203 if (r < 0)
2204 return r;
2205
2206 utime_t st(start_time);
2207 utime_t et(end_time);
2208
2209 ObjectWriteOperation op;
2210 cls_log_trim(op, st, et, from_marker, to_marker);
2211
2212 if (!completion) {
2213 r = io_ctx.operate(oid, &op);
2214 } else {
2215 r = io_ctx.aio_operate(oid, completion, &op);
2216 }
2217 return r;
2218 }
2219
2220 string RGWRados::objexp_hint_get_shardname(int shard_num)
2221 {
2222 char buf[32];
2223 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
2224
2225 string objname("obj_delete_at_hint.");
2226 return objname + buf;
2227 }
2228
2229 int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
2230 {
2231 string obj_key = key.name + key.instance;
2232 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
2233 return rgw_bucket_shard_index(obj_key, num_shards);
2234 }
2235
2236 static string objexp_hint_get_keyext(const string& tenant_name,
2237 const string& bucket_name,
2238 const string& bucket_id,
2239 const rgw_obj_key& obj_key)
2240 {
2241 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
2242 ":" + obj_key.name + ":" + obj_key.instance;
2243 }
2244
2245 int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
2246 const string& tenant_name,
2247 const string& bucket_name,
2248 const string& bucket_id,
2249 const rgw_obj_index_key& obj_key)
2250 {
2251 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
2252 bucket_id, obj_key);
2253 objexp_hint_entry he = {
2254 .tenant = tenant_name,
2255 .bucket_name = bucket_name,
2256 .bucket_id = bucket_id,
2257 .obj_key = obj_key,
2258 .exp_time = delete_at };
2259 bufferlist hebl;
2260 encode(he, hebl);
2261 ObjectWriteOperation op;
2262 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
2263
2264 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
2265 return objexp_pool_ctx.operate(shard_name, &op);
2266 }
2267
2268 void RGWRados::objexp_get_shard(int shard_num,
2269 string& shard) /* out */
2270 {
2271 shard = objexp_hint_get_shardname(shard_num);
2272 }
2273
2274 int RGWRados::objexp_hint_list(const string& oid,
2275 const ceph::real_time& start_time,
2276 const ceph::real_time& end_time,
2277 const int max_entries,
2278 const string& marker,
2279 list<cls_timeindex_entry>& entries, /* out */
2280 string *out_marker, /* out */
2281 bool *truncated) /* out */
2282 {
2283 librados::ObjectReadOperation op;
2284 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
2285 out_marker, truncated);
2286
2287 bufferlist obl;
2288 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
2289
2290 if ((ret < 0 ) && (ret != -ENOENT)) {
2291 return ret;
2292 }
2293
2294 if ((ret == -ENOENT) && truncated) {
2295 *truncated = false;
2296 }
2297
2298 return 0;
2299 }
2300
2301 int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
2302 objexp_hint_entry& hint_entry) /* out */
2303 {
2304 try {
2305 auto iter = ti_entry.value.cbegin();
2306 decode(hint_entry, iter);
2307 } catch (buffer::error& err) {
2308 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
2309 }
2310
2311 return 0;
2312 }
2313
2314 int RGWRados::objexp_hint_trim(const string& oid,
2315 const ceph::real_time& start_time,
2316 const ceph::real_time& end_time,
2317 const string& from_marker,
2318 const string& to_marker)
2319 {
2320 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
2321 from_marker, to_marker);
2322 if ((ret < 0 ) && (ret != -ENOENT)) {
2323 return ret;
2324 }
2325
2326 return 0;
2327 }
2328
2329 int RGWRados::lock_exclusive(const rgw_pool& pool, const string& oid, timespan& duration,
2330 string& zone_id, string& owner_id) {
2331 librados::IoCtx io_ctx;
2332
2333 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
2334 if (r < 0) {
2335 return r;
2336 }
2337 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
2338 utime_t ut(msec / 1000, msec % 1000);
2339
2340 rados::cls::lock::Lock l(log_lock_name);
2341 l.set_duration(ut);
2342 l.set_cookie(owner_id);
2343 l.set_tag(zone_id);
2344 l.set_may_renew(true);
2345
2346 return l.lock_exclusive(&io_ctx, oid);
2347 }
2348
2349 int RGWRados::unlock(const rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
2350 librados::IoCtx io_ctx;
2351
2352 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
2353 if (r < 0) {
2354 return r;
2355 }
2356
2357 rados::cls::lock::Lock l(log_lock_name);
2358 l.set_tag(zone_id);
2359 l.set_cookie(owner_id);
2360
2361 return l.unlock(&io_ctx, oid);
2362 }
2363
2364 int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
2365 {
2366 auto i = bl.cbegin();
2367 RGWAccessControlPolicy policy(cct);
2368 try {
2369 policy.decode_owner(i);
2370 } catch (buffer::error& err) {
2371 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
2372 return -EIO;
2373 }
2374 *owner = policy.get_owner();
2375 return 0;
2376 }
2377
2378 int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
2379 {
2380 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
2381 if (aiter == attrset.end())
2382 return -EIO;
2383
2384 bufferlist& bl = aiter->second;
2385 auto iter = bl.cbegin();
2386 try {
2387 policy->decode(iter);
2388 } catch (buffer::error& err) {
2389 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
2390 return -EIO;
2391 }
2392 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
2393 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
2394 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
2395 s3policy->to_xml(*_dout);
2396 *_dout << dendl;
2397 }
2398 return 0;
2399 }
2400
2401
2402 int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
2403 {
2404 rgw_bucket bucket = bucket_info.bucket;
2405 bucket.update_bucket_id(new_bucket_id);
2406
2407 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
2408
2409 bucket_info.objv_tracker.clear();
2410 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
2411 if (ret < 0) {
2412 return ret;
2413 }
2414
2415 return 0;
2416 }
2417
2418
2419 /**
2420 * Get ordered listing of the objects in a bucket.
2421 *
2422 * max: maximum number of results to return
2423 * bucket: bucket to list contents of
2424 * prefix: only return results that match this prefix
2425 * delim: do not include results that match this string.
2426 * Any skipped results will have the matching portion of their name
2427 * inserted in common_prefixes with a "true" mark.
2428 * marker: if filled in, begin the listing with this object.
2429 * end_marker: if filled in, end the listing with this object.
2430 * result: the objects are put in here.
2431 * common_prefixes: if delim is filled in, any matching prefixes are
2432 * placed here.
2433 * is_truncated: if number of objects in the bucket is bigger than
2434 * max, then truncated.
2435 */
/* Return a copy of delim with the byte 0xFF (char(255)) appended. The
 * ordered listing uses this as a marker that sorts after keys sharing the
 * delimiter-terminated prefix. Callers are expected to pass a non-empty
 * delimiter. */
static inline std::string after_delim(std::string_view delim)
{
  // precondition (not enforced): ! delim.empty()
  std::string upper_bound(delim);
  upper_bound.push_back(char(255));
  return upper_bound;
}
2443
2444 int RGWRados::Bucket::List::list_objects_ordered(
2445 int64_t max,
2446 vector<rgw_bucket_dir_entry> *result,
2447 map<string, bool> *common_prefixes,
2448 bool *is_truncated)
2449 {
2450 RGWRados *store = target->get_store();
2451 CephContext *cct = store->ctx();
2452 int shard_id = target->get_shard_id();
2453
2454 int count = 0;
2455 bool truncated = true;
2456 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
2457
2458 result->clear();
2459
2460 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
2461 rgw_obj_index_key cur_marker;
2462 marker_obj.get_index_key(&cur_marker);
2463
2464 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
2465 params.ns);
2466 rgw_obj_index_key cur_end_marker;
2467 end_marker_obj.get_index_key(&cur_end_marker);
2468 const bool cur_end_marker_valid = !params.end_marker.empty();
2469
2470 rgw_obj_key prefix_obj(params.prefix);
2471 prefix_obj.ns = params.ns;
2472 string cur_prefix = prefix_obj.get_index_key_name();
2473 string after_delim_s; /* needed in !params.delim.empty() AND later */
2474
2475 if (!params.delim.empty()) {
2476 after_delim_s = after_delim(params.delim);
2477 /* if marker points at a common prefix, fast forward it into its
2478 * upper bound string */
2479 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
2480 if (delim_pos >= 0) {
2481 string s = cur_marker.name.substr(0, delim_pos);
2482 s.append(after_delim_s);
2483 cur_marker = s;
2484 }
2485 }
2486
2487 string skip_after_delim;
2488 while (truncated && count <= max) {
2489 if (skip_after_delim > cur_marker.name) {
2490 cur_marker = skip_after_delim;
2491
2492 ldout(cct, 20) << "setting cur_marker="
2493 << cur_marker.name
2494 << "[" << cur_marker.instance << "]"
2495 << dendl;
2496 }
2497 std::map<string, rgw_bucket_dir_entry> ent_map;
2498 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
2499 shard_id,
2500 cur_marker,
2501 cur_prefix,
2502 read_ahead + 1 - count,
2503 params.list_versions,
2504 ent_map,
2505 &truncated,
2506 &cur_marker);
2507 if (r < 0)
2508 return r;
2509
2510 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
2511 rgw_bucket_dir_entry& entry = eiter->second;
2512 rgw_obj_index_key index_key = entry.key;
2513
2514 rgw_obj_key obj(index_key);
2515
2516 /* note that parse_raw_oid() here will not set the correct
2517 * object's instance, as rgw_obj_index_key encodes that
2518 * separately. We don't need to set the instance because it's
2519 * not needed for the checks here and we end up using the raw
2520 * entry for the return vector
2521 */
2522 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2523 if (!valid) {
2524 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
2525 continue;
2526 }
2527
2528 bool check_ns = (obj.ns == params.ns);
2529 if (!params.list_versions && !entry.is_visible()) {
2530 continue;
2531 }
2532
2533 if (params.enforce_ns && !check_ns) {
2534 if (!params.ns.empty()) {
2535 /* we've iterated past the namespace we're searching -- done now */
2536 truncated = false;
2537 goto done;
2538 }
2539
2540 /* we're not looking at the namespace this object is in, next! */
2541 continue;
2542 }
2543
2544 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2545 truncated = false;
2546 goto done;
2547 }
2548
2549 if (count < max) {
2550 params.marker = index_key;
2551 next_marker = index_key;
2552 }
2553
2554 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2555 continue;
2556
2557 if (params.prefix.size() &&
2558 (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
2559 continue;
2560
2561 if (!params.delim.empty()) {
2562 int delim_pos = obj.name.find(params.delim, params.prefix.size());
2563
2564 if (delim_pos >= 0) {
2565 /* extract key -with trailing delimiter- for CommonPrefix */
2566 string prefix_key =
2567 obj.name.substr(0, delim_pos + params.delim.length());
2568
2569 if (common_prefixes &&
2570 common_prefixes->find(prefix_key) == common_prefixes->end()) {
2571 if (count >= max) {
2572 truncated = true;
2573 goto done;
2574 }
2575 next_marker = prefix_key;
2576 (*common_prefixes)[prefix_key] = true;
2577
2578 int marker_delim_pos = cur_marker.name.find(
2579 params.delim, cur_prefix.size());
2580
2581 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
2582 skip_after_delim.append(after_delim_s);
2583
2584 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
2585
2586 count++;
2587 }
2588
2589 continue;
2590 }
2591 }
2592
2593 if (count >= max) {
2594 truncated = true;
2595 goto done;
2596 }
2597
2598 result->emplace_back(std::move(entry));
2599 count++;
2600 }
2601 }
2602
2603 done:
2604 if (is_truncated)
2605 *is_truncated = truncated;
2606
2607 return 0;
2608 } // list_objects_ordered
2609
2610
2611 /**
2612 * Get listing of the objects in a bucket and allow the results to be out
2613 * of order.
2614 *
2615 * Even though there are key differences with the ordered counterpart,
2616 * the parameters are the same to maintain some compatability.
2617 *
2618 * max: maximum number of results to return
2619 * bucket: bucket to list contents of
2620 * prefix: only return results that match this prefix
2621 * delim: should not be set; if it is we should have indicated an error
2622 * marker: if filled in, begin the listing with this object.
2623 * end_marker: if filled in, end the listing with this object.
2624 * result: the objects are put in here.
2625 * common_prefixes: this is never filled with an unordered list; the param
2626 * is maintained for compatibility
2627 * is_truncated: if number of objects in the bucket is bigger than max, then
2628 * truncated.
2629 */
2630 int RGWRados::Bucket::List::list_objects_unordered(int64_t max,
2631 vector<rgw_bucket_dir_entry> *result,
2632 map<string, bool> *common_prefixes,
2633 bool *is_truncated)
2634 {
2635 RGWRados *store = target->get_store();
2636 CephContext *cct = store->ctx();
2637 int shard_id = target->get_shard_id();
2638
2639 int count = 0;
2640 bool truncated = true;
2641
2642 // read a few extra in each call to cls_bucket_list_unordered in
2643 // case some are filtered out due to namespace matching, versioning,
2644 // filtering, etc.
2645 const int64_t max_read_ahead = 100;
2646 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2647
2648 result->clear();
2649
2650 rgw_obj_key marker_obj(params.marker.name,
2651 params.marker.instance,
2652 params.ns);
2653 rgw_obj_index_key cur_marker;
2654 marker_obj.get_index_key(&cur_marker);
2655
2656 rgw_obj_key end_marker_obj(params.end_marker.name,
2657 params.end_marker.instance,
2658 params.ns);
2659 rgw_obj_index_key cur_end_marker;
2660 end_marker_obj.get_index_key(&cur_end_marker);
2661 const bool cur_end_marker_valid = !params.end_marker.empty();
2662
2663 rgw_obj_key prefix_obj(params.prefix);
2664 prefix_obj.ns = params.ns;
2665 string cur_prefix = prefix_obj.get_index_key_name();
2666
2667 while (truncated && count <= max) {
2668 std::vector<rgw_bucket_dir_entry> ent_list;
2669 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
2670 shard_id,
2671 cur_marker,
2672 cur_prefix,
2673 read_ahead,
2674 params.list_versions,
2675 ent_list,
2676 &truncated,
2677 &cur_marker);
2678 if (r < 0)
2679 return r;
2680
2681 // NB: while regions of ent_list will be sorted, we have no
2682 // guarantee that all items will be sorted since they can cross
2683 // shard boundaries
2684
2685 for (auto& entry : ent_list) {
2686 rgw_obj_index_key index_key = entry.key;
2687 rgw_obj_key obj(index_key);
2688
2689 /* note that parse_raw_oid() here will not set the correct
2690 * object's instance, as rgw_obj_index_key encodes that
2691 * separately. We don't need to set the instance because it's
2692 * not needed for the checks here and we end up using the raw
2693 * entry for the return vector
2694 */
2695 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2696 if (!valid) {
2697 ldout(cct, 0) << "ERROR: could not parse object name: " <<
2698 obj.name << dendl;
2699 continue;
2700 }
2701
2702 if (!params.list_versions && !entry.is_visible()) {
2703 continue;
2704 }
2705
2706 if (params.enforce_ns && obj.ns != params.ns) {
2707 continue;
2708 }
2709
2710 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2711 // we're not guaranteed items will come in order, so we have
2712 // to loop through all
2713 continue;
2714 }
2715
2716 if (count < max) {
2717 params.marker.set(index_key);
2718 next_marker.set(index_key);
2719 }
2720
2721 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2722 continue;
2723
2724 if (params.prefix.size() &&
2725 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
2726 continue;
2727
2728 if (count >= max) {
2729 truncated = true;
2730 goto done;
2731 }
2732
2733 result->emplace_back(std::move(entry));
2734 count++;
2735 } // for (auto& entry : ent_list)
2736 } // while (truncated && count <= max)
2737
2738 done:
2739 if (is_truncated)
2740 *is_truncated = truncated;
2741
2742 return 0;
2743 } // list_objects_unordered
2744
2745
2746 /**
2747 * create a rados pool, associated meta info
2748 * returns 0 on success, -ERR# otherwise.
2749 */
2750 int RGWRados::create_pool(const rgw_pool& pool)
2751 {
2752 librados::IoCtx io_ctx;
2753 constexpr bool create = true;
2754 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
2755 }
2756
2757 int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
2758 {
2759 librados::IoCtx index_ctx;
2760
2761 string dir_oid = dir_oid_prefix;
2762 int r = open_bucket_index_ctx(bucket_info, index_ctx);
2763 if (r < 0) {
2764 return r;
2765 }
2766
2767 dir_oid.append(bucket_info.bucket.bucket_id);
2768
2769 map<int, string> bucket_objs;
2770 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
2771
2772 return CLSRGWIssueBucketIndexInit(index_ctx,
2773 bucket_objs,
2774 cct->_conf->rgw_bucket_index_max_aio)();
2775 }
2776
2777 int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
2778 {
2779 librados::IoCtx index_ctx;
2780
2781 std::string dir_oid = dir_oid_prefix;
2782 int r = open_bucket_index_ctx(bucket_info, index_ctx);
2783 if (r < 0) {
2784 return r;
2785 }
2786
2787 dir_oid.append(bucket_info.bucket.bucket_id);
2788
2789 std::map<int, std::string> bucket_objs;
2790 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
2791
2792 return CLSRGWIssueBucketIndexClean(index_ctx,
2793 bucket_objs,
2794 cct->_conf->rgw_bucket_index_max_aio)();
2795 }
2796
2797 void RGWRados::create_bucket_id(string *bucket_id)
2798 {
2799 uint64_t iid = instance_id();
2800 uint64_t bid = next_bucket_id();
2801 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2802 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2803 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
2804 *bucket_id = buf;
2805 }
2806
2807 int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
2808 const string& zonegroup_id,
2809 const rgw_placement_rule& placement_rule,
2810 const string& swift_ver_location,
2811 const RGWQuotaInfo * pquota_info,
2812 map<std::string, bufferlist>& attrs,
2813 RGWBucketInfo& info,
2814 obj_version *pobjv,
2815 obj_version *pep_objv,
2816 real_time creation_time,
2817 rgw_bucket *pmaster_bucket,
2818 uint32_t *pmaster_num_shards,
2819 bool exclusive)
2820 {
2821 #define MAX_CREATE_RETRIES 20 /* need to bound retries */
2822 rgw_placement_rule selected_placement_rule;
2823 RGWZonePlacementInfo rule_info;
2824
2825 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2826 int ret = 0;
2827 ret = svc.zone->select_bucket_placement(owner, zonegroup_id, placement_rule,
2828 &selected_placement_rule, &rule_info);
2829 if (ret < 0)
2830 return ret;
2831
2832 if (!pmaster_bucket) {
2833 create_bucket_id(&bucket.marker);
2834 bucket.bucket_id = bucket.marker;
2835 } else {
2836 bucket.marker = pmaster_bucket->marker;
2837 bucket.bucket_id = pmaster_bucket->bucket_id;
2838 }
2839
2840 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2841
2842 if (pobjv) {
2843 objv_tracker.write_version = *pobjv;
2844 } else {
2845 objv_tracker.generate_new_write_ver(cct);
2846 }
2847
2848 info.bucket = bucket;
2849 info.owner = owner.user_id;
2850 info.zonegroup = zonegroup_id;
2851 info.placement_rule = selected_placement_rule;
2852 info.index_type = rule_info.index_type;
2853 info.swift_ver_location = swift_ver_location;
2854 info.swift_versioning = (!swift_ver_location.empty());
2855 if (pmaster_num_shards) {
2856 info.num_shards = *pmaster_num_shards;
2857 } else {
2858 info.num_shards = bucket_index_max_shards;
2859 }
2860 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
2861 info.requester_pays = false;
2862 if (real_clock::is_zero(creation_time)) {
2863 info.creation_time = ceph::real_clock::now();
2864 } else {
2865 info.creation_time = creation_time;
2866 }
2867 if (pquota_info) {
2868 info.quota = *pquota_info;
2869 }
2870
2871 int r = init_bucket_index(info, info.num_shards);
2872 if (r < 0) {
2873 return r;
2874 }
2875
2876 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
2877 if (ret == -EEXIST) {
2878 librados::IoCtx index_ctx;
2879 map<int, string> bucket_objs;
2880 int r = open_bucket_index(info, index_ctx, bucket_objs);
2881 if (r < 0)
2882 return r;
2883
2884 /* we need to reread the info and return it, caller will have a use for it */
2885 RGWObjVersionTracker instance_ver = info.objv_tracker;
2886 info.objv_tracker.clear();
2887 auto obj_ctx = svc.sysobj->init_obj_ctx();
2888 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
2889 if (r < 0) {
2890 if (r == -ENOENT) {
2891 continue;
2892 }
2893 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
2894 return r;
2895 }
2896
2897 /* only remove it if it's a different bucket instance */
2898 if (info.bucket.bucket_id != bucket.bucket_id) {
2899 /* remove bucket meta instance */
2900 r = rgw_bucket_instance_remove_entry(this,
2901 bucket.get_key(),
2902 &instance_ver);
2903 if (r < 0)
2904 return r;
2905
2906 /* remove bucket index objects asynchronously by best effort */
2907 (void) CLSRGWIssueBucketIndexClean(index_ctx,
2908 bucket_objs,
2909 cct->_conf->rgw_bucket_index_max_aio)();
2910 }
2911 /* ret == -ENOENT here */
2912 }
2913 return ret;
2914 }
2915
2916 /* this is highly unlikely */
2917 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
2918 return -ENOENT;
2919 }
2920
2921 bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
2922 {
2923 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2924 }
2925
2926 bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2927 {
2928 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
2929
2930 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
2931 }
2932
2933 int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
2934 {
2935 string oid, key;
2936 get_obj_bucket_and_oid_loc(obj, oid, key);
2937
2938 rgw_pool pool;
2939 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2940 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2941 return -EIO;
2942 }
2943
2944 int r = open_pool_ctx(pool, *ioctx);
2945 if (r < 0) {
2946 return r;
2947 }
2948
2949 ioctx->locator_set_key(key);
2950
2951 return 0;
2952 }
2953
2954 int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
2955 {
2956 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
2957
2958 rgw_pool pool;
2959 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2960 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2961 return -EIO;
2962 }
2963
2964 int r = open_pool_ctx(pool, ref->ioctx);
2965 if (r < 0) {
2966 return r;
2967 }
2968
2969 ref->ioctx.locator_set_key(ref->obj.loc);
2970
2971 return 0;
2972 }
2973
2974 int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
2975 {
2976 ref->obj = obj;
2977
2978 int r;
2979
2980 if (ref->obj.oid.empty()) {
2981 ref->obj.oid = obj.pool.to_str();
2982 ref->obj.pool = svc.zone->get_zone_params().domain_root;
2983 }
2984 r = open_pool_ctx(ref->obj.pool, ref->ioctx);
2985 if (r < 0)
2986 return r;
2987
2988 ref->ioctx.locator_set_key(ref->obj.loc);
2989
2990 return 0;
2991 }
2992
2993 int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
2994 {
2995 return get_raw_obj_ref(obj, ref);
2996 }
2997
2998 /*
2999 * fixes an issue where head objects were supposed to have a locator created, but ended
3000 * up without one
3001 */
3002 int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
3003 {
3004 const rgw_bucket& bucket = bucket_info.bucket;
3005 string oid;
3006 string locator;
3007
3008 rgw_obj obj(bucket, key);
3009
3010 get_obj_bucket_and_oid_loc(obj, oid, locator);
3011
3012 if (locator.empty()) {
3013 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
3014 return 0;
3015 }
3016
3017 librados::IoCtx ioctx;
3018
3019 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
3020 if (ret < 0) {
3021 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
3022 return ret;
3023 }
3024 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
3025
3026 uint64_t size;
3027 bufferlist data;
3028
3029 struct timespec mtime_ts;
3030 map<string, bufferlist> attrs;
3031 librados::ObjectReadOperation op;
3032 op.getxattrs(&attrs, NULL);
3033 op.stat2(&size, &mtime_ts, NULL);
3034 #define HEAD_SIZE 512 * 1024
3035 op.read(0, HEAD_SIZE, &data, NULL);
3036
3037 ret = ioctx.operate(oid, &op, NULL);
3038 if (ret < 0) {
3039 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
3040 return ret;
3041 }
3042
3043 if (size > HEAD_SIZE) {
3044 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
3045 return -EIO;
3046 }
3047
3048 if (size != data.length()) {
3049 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
3050 return -EIO;
3051 }
3052
3053 if (copy_obj) {
3054 librados::ObjectWriteOperation wop;
3055
3056 wop.mtime2(&mtime_ts);
3057
3058 map<string, bufferlist>::iterator iter;
3059 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3060 wop.setxattr(iter->first.c_str(), iter->second);
3061 }
3062
3063 wop.write(0, data);
3064
3065 ioctx.locator_set_key(locator);
3066 ioctx.operate(oid, &wop);
3067 }
3068
3069 if (remove_bad) {
3070 ioctx.locator_set_key(string());
3071
3072 ret = ioctx.remove(oid);
3073 if (ret < 0) {
3074 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
3075 return ret;
3076 }
3077 }
3078
3079 return 0;
3080 }
3081
3082 int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
3083 const string& src_oid, const string& src_locator,
3084 librados::IoCtx& dst_ioctx,
3085 const string& dst_oid, const string& dst_locator)
3086 {
3087
3088 #define COPY_BUF_SIZE (4 * 1024 * 1024)
3089 bool done = false;
3090 uint64_t chunk_size = COPY_BUF_SIZE;
3091 uint64_t ofs = 0;
3092 int ret = 0;
3093 real_time mtime;
3094 struct timespec mtime_ts;
3095 uint64_t size;
3096
3097 if (src_oid == dst_oid && src_locator == dst_locator) {
3098 return 0;
3099 }
3100
3101 src_ioctx.locator_set_key(src_locator);
3102 dst_ioctx.locator_set_key(dst_locator);
3103
3104 do {
3105 bufferlist data;
3106 ObjectReadOperation rop;
3107 ObjectWriteOperation wop;
3108
3109 if (ofs == 0) {
3110 rop.stat2(&size, &mtime_ts, NULL);
3111 mtime = real_clock::from_timespec(mtime_ts);
3112 }
3113 rop.read(ofs, chunk_size, &data, NULL);
3114 ret = src_ioctx.operate(src_oid, &rop, NULL);
3115 if (ret < 0) {
3116 goto done_err;
3117 }
3118
3119 if (data.length() == 0) {
3120 break;
3121 }
3122
3123 if (ofs == 0) {
3124 wop.create(true); /* make it exclusive */
3125 wop.mtime2(&mtime_ts);
3126 mtime = real_clock::from_timespec(mtime_ts);
3127 }
3128 wop.write(ofs, data);
3129 ret = dst_ioctx.operate(dst_oid, &wop);
3130 if (ret < 0) {
3131 goto done_err;
3132 }
3133 ofs += data.length();
3134 done = data.length() != chunk_size;
3135 } while (!done);
3136
3137 if (ofs != size) {
3138 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
3139 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
3140 ret = -EIO;
3141 goto done_err;
3142 }
3143
3144 src_ioctx.remove(src_oid);
3145
3146 return 0;
3147
3148 done_err:
3149 // TODO: clean up dst_oid if we created it
3150 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
3151 return ret;
3152 }
3153
3154 /*
3155 * fixes an issue where head objects were supposed to have a locator created, but ended
3156 * up without one
3157 */
3158 int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
3159 {
3160 const rgw_bucket& bucket = bucket_info.bucket;
3161 rgw_obj obj(bucket, key);
3162
3163 if (need_fix) {
3164 *need_fix = false;
3165 }
3166
3167 rgw_rados_ref ref;
3168 int r = get_obj_head_ref(bucket_info, obj, &ref);
3169 if (r < 0) {
3170 return r;
3171 }
3172
3173 RGWObjState *astate = NULL;
3174 RGWObjectCtx rctx(this);
3175 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
3176 if (r < 0)
3177 return r;
3178
3179 if (astate->has_manifest) {
3180 RGWObjManifest::obj_iterator miter;
3181 RGWObjManifest& manifest = astate->manifest;
3182 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
3183 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
3184 rgw_obj loc;
3185 string oid;
3186 string locator;
3187
3188 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
3189
3190 if (loc.key.ns.empty()) {
3191 /* continue, we're only interested in tail objects */
3192 continue;
3193 }
3194
3195 get_obj_bucket_and_oid_loc(loc, oid, locator);
3196 ref.ioctx.locator_set_key(locator);
3197
3198 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
3199
3200 r = ref.ioctx.stat(oid, NULL, NULL);
3201 if (r != -ENOENT) {
3202 continue;
3203 }
3204
3205 string bad_loc;
3206 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
3207
3208 /* create a new ioctx with the bad locator */
3209 librados::IoCtx src_ioctx;
3210 src_ioctx.dup(ref.ioctx);
3211 src_ioctx.locator_set_key(bad_loc);
3212
3213 r = src_ioctx.stat(oid, NULL, NULL);
3214 if (r != 0) {
3215 /* cannot find a broken part */
3216 continue;
3217 }
3218 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
3219 if (need_fix) {
3220 *need_fix = true;
3221 }
3222 if (fix) {
3223 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
3224 if (r < 0) {
3225 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
3226 }
3227 }
3228 }
3229 }
3230
3231 return 0;
3232 }
3233
3234 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
3235 const rgw_obj& obj,
3236 RGWBucketInfo* bucket_info_out)
3237 {
3238 bucket = _bucket;
3239
3240 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
3241
3242 RGWBucketInfo bucket_info;
3243 RGWBucketInfo* bucket_info_p =
3244 bucket_info_out ? bucket_info_out : &bucket_info;
3245
3246 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
3247 if (ret < 0) {
3248 return ret;
3249 }
3250
3251 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
3252 if (ret < 0) {
3253 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3254 return ret;
3255 }
3256 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3257
3258 return 0;
3259 }
3260
3261 int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
3262 int sid,
3263 RGWBucketInfo* bucket_info_out)
3264 {
3265 bucket = _bucket;
3266 shard_id = sid;
3267
3268 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
3269
3270 RGWBucketInfo bucket_info;
3271 RGWBucketInfo* bucket_info_p =
3272 bucket_info_out ? bucket_info_out : &bucket_info;
3273 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
3274 if (ret < 0) {
3275 return ret;
3276 }
3277
3278 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj);
3279 if (ret < 0) {
3280 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3281 return ret;
3282 }
3283 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3284
3285 return 0;
3286 }
3287
3288 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info,
3289 const rgw_obj& obj)
3290 {
3291 bucket = bucket_info.bucket;
3292
3293 int ret = store->open_bucket_index_shard(bucket_info, index_ctx,
3294 obj.get_hash_object(), &bucket_obj,
3295 &shard_id);
3296 if (ret < 0) {
3297 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3298 return ret;
3299 }
3300 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3301
3302 return 0;
3303 }
3304
3305 int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
3306 {
3307 bucket = bucket_info.bucket;
3308 shard_id = sid;
3309
3310 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
3311 if (ret < 0) {
3312 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3313 return ret;
3314 }
3315 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3316
3317 return 0;
3318 }
3319
3320
3321 /* Execute @handler on last item in bucket listing for bucket specified
3322 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
3323 * to objects matching these criterias. */
3324 int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
3325 const std::string& obj_prefix,
3326 const std::string& obj_delim,
3327 std::function<int(const rgw_bucket_dir_entry&)> handler)
3328 {
3329 RGWRados::Bucket target(this, bucket_info);
3330 RGWRados::Bucket::List list_op(&target);
3331
3332 list_op.params.prefix = obj_prefix;
3333 list_op.params.delim = obj_delim;
3334
3335 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
3336 << ", obj_prefix=" << obj_prefix
3337 << ", obj_delim=" << obj_delim
3338 << dendl;
3339
3340 bool is_truncated = false;
3341
3342 boost::optional<rgw_bucket_dir_entry> last_entry;
3343 /* We need to rewind to the last object in a listing. */
3344 do {
3345 /* List bucket entries in chunks. */
3346 static constexpr int MAX_LIST_OBJS = 100;
3347 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
3348
3349 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
3350 &is_truncated);
3351 if (ret < 0) {
3352 return ret;
3353 } else if (!entries.empty()) {
3354 last_entry = entries.back();
3355 }
3356 } while (is_truncated);
3357
3358 if (last_entry) {
3359 return handler(*last_entry);
3360 }
3361
3362 /* Empty listing - no items we can run handler on. */
3363 return 0;
3364 }
3365
3366
3367 int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
3368 const rgw_user& user,
3369 RGWBucketInfo& bucket_info,
3370 rgw_obj& obj)
3371 {
3372 if (! swift_versioning_enabled(bucket_info)) {
3373 return 0;
3374 }
3375
3376 obj_ctx.set_atomic(obj);
3377
3378 RGWObjState * state = nullptr;
3379 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
3380 if (r < 0) {
3381 return r;
3382 }
3383
3384 if (!state->exists) {
3385 return 0;
3386 }
3387
3388 const string& src_name = obj.get_oid();
3389 char buf[src_name.size() + 32];
3390 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
3391 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
3392 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
3393
3394 RGWBucketInfo dest_bucket_info;
3395
3396 auto sysobj_ctx = svc.sysobj->init_obj_ctx();
3397
3398 r = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
3399 if (r < 0) {
3400 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
3401 if (r == -ENOENT) {
3402 return -ERR_PRECONDITION_FAILED;
3403 }
3404 return r;
3405 }
3406
3407 if (dest_bucket_info.owner != bucket_info.owner) {
3408 return -ERR_PRECONDITION_FAILED;
3409 }
3410
3411 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
3412
3413 if (dest_bucket_info.versioning_enabled()){
3414 gen_rand_obj_instance_name(&dest_obj);
3415 }
3416
3417 obj_ctx.set_atomic(dest_obj);
3418
3419 string no_zone;
3420
3421 r = copy_obj(obj_ctx,
3422 user,
3423 NULL, /* req_info *info */
3424 no_zone,
3425 dest_obj,
3426 obj,
3427 dest_bucket_info,
3428 bucket_info,
3429 bucket_info.placement_rule,
3430 NULL, /* time_t *src_mtime */
3431 NULL, /* time_t *mtime */
3432 NULL, /* const time_t *mod_ptr */
3433 NULL, /* const time_t *unmod_ptr */
3434 false, /* bool high_precision_time */
3435 NULL, /* const char *if_match */
3436 NULL, /* const char *if_nomatch */
3437 RGWRados::ATTRSMOD_NONE,
3438 true, /* bool copy_if_newer */
3439 state->attrset,
3440 RGWObjCategory::Main,
3441 0, /* uint64_t olh_epoch */
3442 real_time(), /* time_t delete_at */
3443 NULL, /* string *version_id */
3444 NULL, /* string *ptag */
3445 NULL, /* string *petag */
3446 NULL, /* void (*progress_cb)(off_t, void *) */
3447 NULL); /* void *progress_data */
3448 if (r == -ECANCELED || r == -ENOENT) {
3449 /* Has already been overwritten, meaning another rgw process already
3450 * copied it out */
3451 return 0;
3452 }
3453
3454 return r;
3455 }
3456
3457 int RGWRados::swift_versioning_restore(RGWSysObjectCtx& sysobj_ctx,
3458 RGWObjectCtx& obj_ctx,
3459 const rgw_user& user,
3460 RGWBucketInfo& bucket_info,
3461 rgw_obj& obj,
3462 bool& restored) /* out */
3463 {
3464 if (! swift_versioning_enabled(bucket_info)) {
3465 return 0;
3466 }
3467
3468 /* Bucket info of the bucket that stores previous versions of our object. */
3469 RGWBucketInfo archive_binfo;
3470
3471 int ret = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant,
3472 bucket_info.swift_ver_location, archive_binfo,
3473 nullptr, nullptr);
3474 if (ret < 0) {
3475 return ret;
3476 }
3477
3478 /* Abort the operation if the bucket storing our archive belongs to someone
3479 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3480 * into consideration. For we can live with that.
3481 *
3482 * TODO: delegate this check to un upper layer and compare with ACLs. */
3483 if (bucket_info.owner != archive_binfo.owner) {
3484 return -EPERM;
3485 }
3486
3487 /* This code will be executed on latest version of the object. */
3488 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
3489 std::string no_zone;
3490
3491 /* We don't support object versioning of Swift API on those buckets that
3492 * are already versioned using the S3 mechanism. This affects also bucket
3493 * storing archived objects. Otherwise the delete operation would create
3494 * a deletion marker. */
3495 if (archive_binfo.versioned()) {
3496 restored = false;
3497 return -ERR_PRECONDITION_FAILED;
3498 }
3499
3500 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3501 * irrelevant and may be safely skipped. */
3502 std::map<std::string, ceph::bufferlist> no_attrs;
3503
3504 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
3505
3506 if (bucket_info.versioning_enabled()){
3507 gen_rand_obj_instance_name(&obj);
3508 }
3509
3510 obj_ctx.set_atomic(archive_obj);
3511 obj_ctx.set_atomic(obj);
3512
3513 int ret = copy_obj(obj_ctx,
3514 user,
3515 nullptr, /* req_info *info */
3516 no_zone,
3517 obj, /* dest obj */
3518 archive_obj, /* src obj */
3519 bucket_info, /* dest bucket info */
3520 archive_binfo, /* src bucket info */
3521 bucket_info.placement_rule, /* placement_rule */
3522 nullptr, /* time_t *src_mtime */
3523 nullptr, /* time_t *mtime */
3524 nullptr, /* const time_t *mod_ptr */
3525 nullptr, /* const time_t *unmod_ptr */
3526 false, /* bool high_precision_time */
3527 nullptr, /* const char *if_match */
3528 nullptr, /* const char *if_nomatch */
3529 RGWRados::ATTRSMOD_NONE,
3530 true, /* bool copy_if_newer */
3531 no_attrs,
3532 RGWObjCategory::Main,
3533 0, /* uint64_t olh_epoch */
3534 real_time(), /* time_t delete_at */
3535 nullptr, /* string *version_id */
3536 nullptr, /* string *ptag */
3537 nullptr, /* string *petag */
3538 nullptr, /* void (*progress_cb)(off_t, void *) */
3539 nullptr); /* void *progress_data */
3540 if (ret == -ECANCELED || ret == -ENOENT) {
3541 /* Has already been overwritten, meaning another rgw process already
3542 * copied it out */
3543 return 0;
3544 } else if (ret < 0) {
3545 return ret;
3546 } else {
3547 restored = true;
3548 }
3549
3550 /* Need to remove the archived copy. */
3551 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
3552 archive_binfo.versioning_status());
3553
3554 return ret;
3555 };
3556
3557 const std::string& obj_name = obj.get_oid();
3558 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
3559 % obj_name);
3560
3561 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
3562 handler);
3563 }
3564
/*
 * Write the head object's metadata (xattrs, optional data, manifest) with a
 * single rados write operation and record the change in the bucket index.
 *
 * @size and @accounted_size are forwarded to the index completion.
 * @attrs are set as xattrs; entries with an empty value are skipped and an
 * existing RGW_ATTR_MANIFEST entry is erased from the map when meta.manifest
 * is set.
 * @assume_noent is forwarded to get_state(); when it is set and the write
 * fails with -EEXIST, the cached state is invalidated and -EEXIST is
 * returned without cancelling the index operation.
 * @modify_tail is forwarded to prepare_atomic_modification().
 * @_index_op must point to a RGWRados::Bucket::UpdateIndex.
 *
 * Returns 0 on success or a negative error code. meta.canceled is set to
 * false on success and to true when the index operation was cancelled.
 */
3565 int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
3566 map<string, bufferlist>& attrs,
3567 bool assume_noent, bool modify_tail,
3568 void *_index_op)
3569 {
3570 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
3571 RGWRados *store = target->get_store();
3572
3573 ObjectWriteOperation op;
/* req_id is declared only under WITH_LTTNG; in this function it is used
 * solely as the argument of the tracepoint() calls below.
 * NOTE(review): presumably tracepoint() compiles away without WITH_LTTNG -
 * confirm. */
3574 #ifdef WITH_LTTNG
3575 const struct req_state* s = get_req_state();
3576 string req_id;
3577 if (!s) {
3578 // fake req_id
3579 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
3580 } else {
3581 req_id = s->req_id;
3582 }
3583 #endif
3584
3585 RGWObjState *state;
3586 int r = target->get_state(&state, false, assume_noent);
3587 if (r < 0)
3588 return r;
3589
3590 rgw_obj& obj = target->get_obj();
3591
3592 if (obj.get_oid().empty()) {
3593 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
3594 return -EIO;
3595 }
3596
3597 rgw_rados_ref ref;
3598 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
3599 if (r < 0)
3600 return r;
3601
/* remember the olh flag now; state is invalidated before the flag is
 * needed for the last time */
3602 bool is_olh = state->is_olh;
3603
3604 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
3605
/* use the caller's tag if given, else the index op's tag when it has one */
3606 const string *ptag = meta.ptag;
3607 if (!ptag && !index_op->get_optag()->empty()) {
3608 ptag = index_op->get_optag();
3609 }
3610 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
3611 if (r < 0)
3612 return r;
3613
/* no explicit mtime requested: stamp the object with the current time */
3614 if (real_clock::is_zero(meta.set_mtime)) {
3615 meta.set_mtime = real_clock::now();
3616 }
3617
3618 if (state->is_olh) {
3619 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3620 }
3621
3622 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3623 op.mtime2(&mtime_ts);
3624
/* NOTE(review): despite the wording of the comment below, nothing is
 * removed in this branch; the data is replaced through write_full(). */
3625 if (meta.data) {
3626 /* if we want to overwrite the data, we also want to overwrite the
3627 xattrs, so just remove the object */
3628 op.write_full(*meta.data);
3629 }
3630
/* values picked up from @attrs / the manifest for the index completion */
3631 string etag;
3632 string content_type;
3633 bufferlist acl_bl;
3634 string storage_class;
3635
3636 map<string, bufferlist>::iterator iter;
3637 if (meta.rmattrs) {
3638 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3639 const string& name = iter->first;
3640 op.rmxattr(name.c_str());
3641 }
3642 }
3643
3644 if (meta.manifest) {
3645 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3646
3647 /* remove existing manifest attr */
3648 iter = attrs.find(RGW_ATTR_MANIFEST);
3649 if (iter != attrs.end())
3650 attrs.erase(iter);
3651
3652 bufferlist bl;
3653 encode(*meta.manifest, bl);
3654 op.setxattr(RGW_ATTR_MANIFEST, bl);
3655 }
3656
/* queue every non-empty attr and remember etag, content type and acl */
3657 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3658 const string& name = iter->first;
3659 bufferlist& bl = iter->second;
3660
3661 if (!bl.length())
3662 continue;
3663
3664 op.setxattr(name.c_str(), bl);
3665
3666 if (name.compare(RGW_ATTR_ETAG) == 0) {
3667 etag = rgw_bl_str(bl);
3668 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
3669 content_type = rgw_bl_str(bl);
3670 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3671 acl_bl = bl;
3672 }
3673 }
/* attrs the caller did not provide: pg version and source zone */
3674 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3675 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3676 }
3677
3678 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3679 bufferlist bl;
3680 encode(store->svc.zone->get_zone_short_id(), bl);
3681 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3682 }
3683
3684 if (!storage_class.empty()) {
3685 bufferlist bl;
3686 bl.append(storage_class);
3687 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3688 }
3689
/* nothing was queued on the operation: report success without writing */
3690 if (!op.size())
3691 return 0;
3692
3693 uint64_t epoch;
3694 int64_t poolid;
3695 bool orig_exists;
3696 uint64_t orig_size;
3697
/* orig_exists / orig_size feed the quota cache update at the end */
3698 if (!reset_obj) { //Multipart upload, it has immutable head.
3699 orig_exists = false;
3700 orig_size = 0;
3701 } else {
3702 orig_exists = state->exists;
3703 orig_size = state->accounted_size;
3704 }
3705
3706 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3707 !obj.key.instance.empty();
3708
3709 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3710
3711 if (versioned_op) {
3712 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3713 }
3714
/* prepare the index entry unless the index op is already prepared */
3715 if (!index_op->is_prepared()) {
3716 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
3717 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
3718 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
3719 if (r < 0)
3720 return r;
3721 }
3722
/* apply the accumulated write operation to the head object */
3723 tracepoint(rgw_rados, operate_enter, req_id.c_str());
3724 r = ref.ioctx.operate(ref.obj.oid, &op);
3725 tracepoint(rgw_rados, operate_exit, req_id.c_str());
3726 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3727 or -ENOENT if was removed, or -EEXIST if it did not exist
3728 before and now it does */
3729 if (r == -EEXIST && assume_noent) {
3730 target->invalidate_state();
3731 return r;
3732 }
3733 goto done_cancel;
3734 }
3735
/* version and pool id of the write just performed, for the index entry */
3736 epoch = ref.ioctx.get_last_version();
3737 poolid = ref.ioctx.get_id();
3738
/* a failure here is only logged: r is overwritten by the index completion
 * below */
3739 r = target->complete_atomic_modification();
3740 if (r < 0) {
3741 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
3742 }
3743
3744 tracepoint(rgw_rados, complete_enter, req_id.c_str());
3745 r = index_op->complete(poolid, epoch, size, accounted_size,
3746 meta.set_mtime, etag, content_type,
3747 storage_class, &acl_bl,
3748 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3749 tracepoint(rgw_rados, complete_exit, req_id.c_str());
3750 if (r < 0)
3751 goto done_cancel;
3752
/* report the mtime that was actually written, if the caller asked for it */
3753 if (meta.mtime) {
3754 *meta.mtime = meta.set_mtime;
3755 }
3756
3757 /* note that index_op was using state so we couldn't invalidate it earlier */
3758 target->invalidate_state();
3759 state = NULL;
3760
3761 if (versioned_op && meta.olh_epoch) {
3762 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, meta.zones_trace);
3763 if (r < 0) {
3764 return r;
3765 }
3766 }
3767
/* an expiration time was requested: register an expiration hint */
3768 if (!real_clock::is_zero(meta.delete_at)) {
3769 rgw_obj_index_key obj_key;
3770 obj.key.get_index_key(&obj_key);
3771
3772 r = store->objexp_hint_add(meta.delete_at,
3773 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
3774 if (r < 0) {
3775 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
3776 /* ignoring error, nothing we can do at this point */
3777 }
3778 }
3779 meta.canceled = false;
3780
3781 /* update quota cache */
/* the two calls differ only in the added size: 0 when
 * meta.completeMultipart is set, accounted_size otherwise */
3782 if (meta.completeMultipart){
3783 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3784 0, orig_size);
3785 }
3786 else {
3787 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3788 accounted_size, orig_size);
3789 }
3790 return 0;
3791
/* failure path after the index op was prepared: cancel the index op, then
 * translate r according to the conditional-write parameters */
3792 done_cancel:
3793 int ret = index_op->cancel();
3794 if (ret < 0) {
3795 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
3796 }
3797
3798 meta.canceled = true;
3799
3800 /* we lost in a race. There are a few options:
3801 * - existing object was rewritten (ECANCELED)
3802 * - non existing object was created (EEXIST)
3803 * - object was removed (ENOENT)
3804 * should treat it as a success
3805 */
3806 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3807 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3808 r = 0;
3809 }
3810 } else {
3811 if (meta.if_match != NULL) {
3812 // only overwrite existing object
3813 if (strcmp(meta.if_match, "*") == 0) {
3814 if (r == -ENOENT) {
3815 r = -ERR_PRECONDITION_FAILED;
3816 } else if (r == -ECANCELED) {
3817 r = 0;
3818 }
3819 }
3820 }
3821
3822 if (meta.if_nomatch != NULL) {
3823 // only create a new object
3824 if (strcmp(meta.if_nomatch, "*") == 0) {
3825 if (r == -EEXIST) {
3826 r = -ERR_PRECONDITION_FAILED;
3827 } else if (r == -ENOENT) {
3828 r = 0;
3829 }
3830 }
3831 }
3832 }
3833
3834 return r;
3835 }
3836
3837 int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
3838 map<string, bufferlist>& attrs)
3839 {
3840 RGWBucketInfo& bucket_info = target->get_bucket_info();
3841
3842 RGWRados::Bucket bop(target->get_store(), bucket_info);
3843 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
3844 index_op.set_zones_trace(meta.zones_trace);
3845
3846 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3847 int r;
3848 if (assume_noent) {
3849 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
3850 if (r == -EEXIST) {
3851 assume_noent = false;
3852 }
3853 }
3854 if (!assume_noent) {
3855 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
3856 }
3857 return r;
3858 }
3859
3860 class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
3861 {
3862 CephContext* cct;
3863 rgw_obj obj;
3864 rgw::putobj::DataProcessor *filter;
3865 boost::optional<RGWPutObj_Compress>& compressor;
3866 boost::optional<rgw::putobj::ChunkProcessor> buffering;
3867 CompressorRef& plugin;
3868 rgw::putobj::ObjectProcessor *processor;
3869 void (*progress_cb)(off_t, void *);
3870 void *progress_data;
3871 bufferlist extra_data_bl;
3872 uint64_t extra_data_left{0};
3873 bool need_to_process_attrs{true};
3874 uint64_t data_len{0};
3875 map<string, bufferlist> src_attrs;
3876 uint64_t ofs{0};
3877 uint64_t lofs{0}; /* logical ofs */
3878 std::function<int(const map<string, bufferlist>&)> attrs_handler;
3879 public:
3880 RGWRadosPutObj(CephContext* cct,
3881 CompressorRef& plugin,
3882 boost::optional<RGWPutObj_Compress>& compressor,
3883 rgw::putobj::ObjectProcessor *p,
3884 void (*_progress_cb)(off_t, void *),
3885 void *_progress_data,
3886 std::function<int(const map<string, bufferlist>&)> _attrs_handler) :
3887 cct(cct),
3888 filter(p),
3889 compressor(compressor),
3890 plugin(plugin),
3891 processor(p),
3892 progress_cb(_progress_cb),
3893 progress_data(_progress_data),
3894 attrs_handler(_attrs_handler) {}
3895
3896 int process_attrs(void) {
3897 if (extra_data_bl.length()) {
3898 JSONParser jp;
3899 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3900 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3901 return -EIO;
3902 }
3903
3904 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3905
3906 src_attrs.erase(RGW_ATTR_COMPRESSION);
3907 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
3908
3909 // filter out olh attributes
3910 auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
3911 while (iter != src_attrs.end()) {
3912 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3913 break;
3914 }
3915 iter = src_attrs.erase(iter);
3916 }
3917 }
3918
3919 int ret = attrs_handler(src_attrs);
3920 if (ret < 0) {
3921 return ret;
3922 }
3923
3924 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3925 //do not compress if object is encrypted
3926 compressor = boost::in_place(cct, plugin, filter);
3927 // add a filter that buffers data so we don't try to compress tiny blocks.
3928 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3929 // compression ratio
3930 constexpr unsigned buffer_size = 512 * 1024;
3931 buffering = boost::in_place(&*compressor, buffer_size);
3932 filter = &*buffering;
3933 }
3934
3935 need_to_process_attrs = false;
3936
3937 return 0;
3938 }
3939
3940 int handle_data(bufferlist& bl, bool *pause) override {
3941 if (progress_cb) {
3942 progress_cb(data_len, progress_data);
3943 }
3944 if (extra_data_left) {
3945 uint64_t extra_len = bl.length();
3946 if (extra_len > extra_data_left)
3947 extra_len = extra_data_left;
3948
3949 bufferlist extra;
3950 bl.splice(0, extra_len, &extra);
3951 extra_data_bl.append(extra);
3952
3953 extra_data_left -= extra_len;
3954 if (extra_data_left == 0) {
3955 int res = process_attrs();
3956 if (res < 0)
3957 return res;
3958 }
3959 ofs += extra_len;
3960 if (bl.length() == 0) {
3961 return 0;
3962 }
3963 }
3964 if (need_to_process_attrs) {
3965 /* need to call process_attrs() even if we don't get any attrs,
3966 * need it to call attrs_handler().
3967 */
3968 int res = process_attrs();
3969 if (res < 0) {
3970 return res;
3971 }
3972 }
3973
3974 ceph_assert(uint64_t(ofs) >= extra_data_len);
3975
3976 uint64_t size = bl.length();
3977 ofs += size;
3978
3979 const uint64_t lofs = data_len;
3980 data_len += size;
3981
3982 return filter->process(std::move(bl), lofs);
3983 }
3984
3985 int flush() {
3986 return filter->process({}, data_len);
3987 }
3988
3989 bufferlist& get_extra_data() { return extra_data_bl; }
3990
3991 map<string, bufferlist>& get_attrs() { return src_attrs; }
3992
3993 void set_extra_data_len(uint64_t len) override {
3994 extra_data_left = len;
3995 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
3996 }
3997
3998 uint64_t get_data_len() {
3999 return data_len;
4000 }
4001 };
4002
4003 /*
4004 * prepare attrset depending on attrs_mod.
4005 */
4006 static void set_copy_attrs(map<string, bufferlist>& src_attrs,
4007 map<string, bufferlist>& attrs,
4008 RGWRados::AttrsMod attrs_mod)
4009 {
4010 switch (attrs_mod) {
4011 case RGWRados::ATTRSMOD_NONE:
4012 attrs = src_attrs;
4013 break;
4014 case RGWRados::ATTRSMOD_REPLACE:
4015 if (!attrs[RGW_ATTR_ETAG].length()) {
4016 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
4017 }
4018 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
4019 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
4020 if (ttiter != src_attrs.end()) {
4021 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
4022 }
4023 }
4024 break;
4025 case RGWRados::ATTRSMOD_MERGE:
4026 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
4027 if (attrs.find(it->first) == attrs.end()) {
4028 attrs[it->first] = it->second;
4029 }
4030 }
4031 break;
4032 }
4033 }
4034
4035 int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj)
4036 {
4037 map<string, bufferlist> attrset;
4038
4039 real_time mtime;
4040 uint64_t obj_size;
4041 RGWObjectCtx rctx(this);
4042
4043 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
4044 RGWRados::Object::Read read_op(&op_target);
4045
4046 read_op.params.attrs = &attrset;
4047 read_op.params.lastmod = &mtime;
4048 read_op.params.obj_size = &obj_size;
4049
4050 int ret = read_op.prepare();
4051 if (ret < 0)
4052 return ret;
4053
4054 attrset.erase(RGW_ATTR_ID_TAG);
4055 attrset.erase(RGW_ATTR_TAIL_TAG);
4056
4057 return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
4058 read_op, obj_size - 1, obj, NULL, mtime, attrset,
4059 0, real_time(), NULL);
4060 }
4061
4062 struct obj_time_weight {
4063 real_time mtime;
4064 uint32_t zone_short_id;
4065 uint64_t pg_ver;
4066 bool high_precision;
4067
4068 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
4069
4070 bool compare_low_precision(const obj_time_weight& rhs) {
4071 struct timespec l = ceph::real_clock::to_timespec(mtime);
4072 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
4073 l.tv_nsec = 0;
4074 r.tv_nsec = 0;
4075 if (l > r) {
4076 return false;
4077 }
4078 if (l < r) {
4079 return true;
4080 }
4081 if (!zone_short_id || !rhs.zone_short_id) {
4082 /* don't compare zone ids, if one wasn't provided */
4083 return false;
4084 }
4085 if (zone_short_id != rhs.zone_short_id) {
4086 return (zone_short_id < rhs.zone_short_id);
4087 }
4088 return (pg_ver < rhs.pg_ver);
4089
4090 }
4091
4092 bool operator<(const obj_time_weight& rhs) {
4093 if (!high_precision || !rhs.high_precision) {
4094 return compare_low_precision(rhs);
4095 }
4096 if (mtime > rhs.mtime) {
4097 return false;
4098 }
4099 if (mtime < rhs.mtime) {
4100 return true;
4101 }
4102 if (!zone_short_id || !rhs.zone_short_id) {
4103 /* don't compare zone ids, if one wasn't provided */
4104 return false;
4105 }
4106 if (zone_short_id != rhs.zone_short_id) {
4107 return (zone_short_id < rhs.zone_short_id);
4108 }
4109 return (pg_ver < rhs.pg_ver);
4110 }
4111
4112 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
4113 mtime = _mtime;
4114 zone_short_id = _short_id;
4115 pg_ver = _pg_ver;
4116 }
4117
4118 void init(RGWObjState *state) {
4119 mtime = state->mtime;
4120 zone_short_id = state->zone_short_id;
4121 pg_ver = state->pg_ver;
4122 }
4123 };
4124
4125 inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
4126 out << o.mtime;
4127
4128 if (o.zone_short_id != 0 || o.pg_ver != 0) {
4129 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
4130 }
4131
4132 return out;
4133 }
4134
4135 class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
4136 bufferlist extra_data;
4137 public:
4138 RGWGetExtraDataCB() {}
4139 int handle_data(bufferlist& bl, bool *pause) override {
4140 int bl_len = (int)bl.length();
4141 if (extra_data.length() < extra_data_len) {
4142 off_t max = extra_data_len - extra_data.length();
4143 if (max > bl_len) {
4144 max = bl_len;
4145 }
4146 bl.splice(0, max, &extra_data);
4147 }
4148 return bl_len;
4149 }
4150
4151 bufferlist& get_extra_data() {
4152 return extra_data;
4153 }
4154 };
4155
4156 int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
4157 const rgw_user& user_id,
4158 req_info *info,
4159 const string& source_zone,
4160 rgw_obj& src_obj,
4161 RGWBucketInfo& src_bucket_info,
4162 real_time *src_mtime,
4163 uint64_t *psize,
4164 const real_time *mod_ptr,
4165 const real_time *unmod_ptr,
4166 bool high_precision_time,
4167 const char *if_match,
4168 const char *if_nomatch,
4169 map<string, bufferlist> *pattrs,
4170 map<string, string> *pheaders,
4171 string *version_id,
4172 string *ptag,
4173 string *petag)
4174 {
4175 /* source is in a different zonegroup, copy from there */
4176
4177 RGWRESTStreamRWRequest *in_stream_req;
4178 string tag;
4179 map<string, bufferlist> src_attrs;
4180 append_rand_alpha(cct, tag, tag, 32);
4181 obj_time_weight set_mtime_weight;
4182 set_mtime_weight.high_precision = high_precision_time;
4183
4184 RGWRESTConn *conn;
4185 if (source_zone.empty()) {
4186 if (src_bucket_info.zonegroup.empty()) {
4187 /* source is in the master zonegroup */
4188 conn = svc.zone->get_master_conn();
4189 } else {
4190 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
4191 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
4192 if (iter == zonegroup_conn_map.end()) {
4193 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
4194 return -ENOENT;
4195 }
4196 conn = iter->second;
4197 }
4198 } else {
4199 auto& zone_conn_map = svc.zone->get_zone_conn_map();
4200 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
4201 if (iter == zone_conn_map.end()) {
4202 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
4203 return -ENOENT;
4204 }
4205 conn = iter->second;
4206 }
4207
4208 RGWGetExtraDataCB cb;
4209 map<string, string> req_headers;
4210 real_time set_mtime;
4211
4212 const real_time *pmod = mod_ptr;
4213
4214 obj_time_weight dest_mtime_weight;
4215
4216 constexpr bool prepend_meta = true;
4217 constexpr bool get_op = true;
4218 constexpr bool rgwx_stat = true;
4219 constexpr bool sync_manifest = true;
4220 constexpr bool skip_decrypt = true;
4221 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
4222 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
4223 prepend_meta, get_op, rgwx_stat,
4224 sync_manifest, skip_decrypt,
4225 true, &cb, &in_stream_req);
4226 if (ret < 0) {
4227 return ret;
4228 }
4229
4230 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, nullptr, pheaders);
4231 if (ret < 0) {
4232 return ret;
4233 }
4234
4235 bufferlist& extra_data_bl = cb.get_extra_data();
4236 if (extra_data_bl.length()) {
4237 JSONParser jp;
4238 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
4239 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
4240 return -EIO;
4241 }
4242
4243 JSONDecoder::decode_json("attrs", src_attrs, &jp);
4244
4245 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
4246 }
4247
4248 if (src_mtime) {
4249 *src_mtime = set_mtime;
4250 }
4251
4252 if (petag) {
4253 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
4254 if (iter != src_attrs.end()) {
4255 bufferlist& etagbl = iter->second;
4256 *petag = etagbl.to_str();
4257 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
4258 *petag = petag->substr(0, petag->size() - 1);
4259 }
4260 }
4261 }
4262
4263 if (pattrs) {
4264 *pattrs = std::move(src_attrs);
4265 }
4266
4267 return 0;
4268 }
4269
4270 int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
4271 const rgw_user& user_id,
4272 req_info *info,
4273 const string& source_zone,
4274 const rgw_obj& dest_obj,
4275 const rgw_obj& src_obj,
4276 RGWBucketInfo& dest_bucket_info,
4277 RGWBucketInfo& src_bucket_info,
4278 std::optional<rgw_placement_rule> dest_placement_rule,
4279 real_time *src_mtime,
4280 real_time *mtime,
4281 const real_time *mod_ptr,
4282 const real_time *unmod_ptr,
4283 bool high_precision_time,
4284 const char *if_match,
4285 const char *if_nomatch,
4286 AttrsMod attrs_mod,
4287 bool copy_if_newer,
4288 map<string, bufferlist>& attrs,
4289 RGWObjCategory category,
4290 std::optional<uint64_t> olh_epoch,
4291 real_time delete_at,
4292 string *ptag,
4293 string *petag,
4294 void (*progress_cb)(off_t, void *),
4295 void *progress_data,
4296 rgw_zone_set *zones_trace,
4297 std::optional<uint64_t>* bytes_transferred)
4298 {
4299 /* source is in a different zonegroup, copy from there */
4300
4301 RGWRESTStreamRWRequest *in_stream_req;
4302 string tag;
4303 int i;
4304 append_rand_alpha(cct, tag, tag, 32);
4305 obj_time_weight set_mtime_weight;
4306 set_mtime_weight.high_precision = high_precision_time;
4307 int ret;
4308
4309 rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
4310 using namespace rgw::putobj;
4311 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
4312 AtomicObjectProcessor processor(&aio, this, dest_bucket_info, ptail_rule, user_id,
4313 obj_ctx, dest_obj, olh_epoch, tag);
4314 RGWRESTConn *conn;
4315 auto& zone_conn_map = svc.zone->get_zone_conn_map();
4316 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
4317 if (source_zone.empty()) {
4318 if (dest_bucket_info.zonegroup.empty()) {
4319 /* source is in the master zonegroup */
4320 conn = svc.zone->get_master_conn();
4321 } else {
4322 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
4323 if (iter == zonegroup_conn_map.end()) {
4324 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
4325 return -ENOENT;
4326 }
4327 conn = iter->second;
4328 }
4329 } else {
4330 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
4331 if (iter == zone_conn_map.end()) {
4332 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
4333 return -ENOENT;
4334 }
4335 conn = iter->second;
4336 }
4337
4338 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
4339
4340 boost::optional<RGWPutObj_Compress> compressor;
4341 CompressorRef plugin;
4342
4343 rgw_placement_rule dest_rule;
4344 RGWRadosPutObj cb(cct, plugin, compressor, &processor, progress_cb, progress_data,
4345 [&](const map<string, bufferlist>& obj_attrs) {
4346 if (!ptail_rule) {
4347 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
4348 if (iter != obj_attrs.end()) {
4349 dest_rule.storage_class = iter->second.to_str();
4350 dest_rule.inherit_from(dest_bucket_info.placement_rule);
4351 processor.set_tail_placement(std::move(dest_rule));
4352 ptail_rule = &dest_rule;
4353 } else {
4354 ptail_rule = &dest_bucket_info.placement_rule;
4355 }
4356 }
4357 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
4358 if (compression_type != "none") {
4359 plugin = Compressor::create(cct, compression_type);
4360 if (!plugin) {
4361 ldout(cct, 1) << "Cannot load plugin for compression type "
4362 << compression_type << dendl;
4363 }
4364 }
4365
4366 int ret = processor.prepare();
4367 if (ret < 0) {
4368 return ret;
4369 }
4370 return 0;
4371 });
4372
4373 string etag;
4374 real_time set_mtime;
4375 uint64_t expected_size = 0;
4376
4377 RGWObjState *dest_state = NULL;
4378
4379 const real_time *pmod = mod_ptr;
4380
4381 obj_time_weight dest_mtime_weight;
4382
4383 if (copy_if_newer) {
4384 /* need to get mtime for destination */
4385 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
4386 if (ret < 0)
4387 goto set_err_state;
4388
4389 if (!real_clock::is_zero(dest_state->mtime)) {
4390 dest_mtime_weight.init(dest_state);
4391 pmod = &dest_mtime_weight.mtime;
4392 }
4393 }
4394
4395 static constexpr bool prepend_meta = true;
4396 static constexpr bool get_op = true;
4397 static constexpr bool rgwx_stat = false;
4398 static constexpr bool sync_manifest = true;
4399 static constexpr bool skip_decrypt = true;
4400 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
4401 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
4402 prepend_meta, get_op, rgwx_stat,
4403 sync_manifest, skip_decrypt,
4404 true,
4405 &cb, &in_stream_req);
4406 if (ret < 0) {
4407 goto set_err_state;
4408 }
4409
4410 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
4411 &expected_size, nullptr, nullptr);
4412 if (ret < 0) {
4413 goto set_err_state;
4414 }
4415 ret = cb.flush();
4416 if (ret < 0) {
4417 goto set_err_state;
4418 }
4419 if (cb.get_data_len() != expected_size) {
4420 ret = -EIO;
4421 ldout(cct, 0) << "ERROR: object truncated during fetching, expected "
4422 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
4423 goto set_err_state;
4424 }
4425 if (compressor && compressor->is_compressed()) {
4426 bufferlist tmp;
4427 RGWCompressionInfo cs_info;
4428 cs_info.compression_type = plugin->get_type_name();
4429 cs_info.orig_size = cb.get_data_len();
4430 cs_info.blocks = move(compressor->get_compression_blocks());
4431 encode(cs_info, tmp);
4432 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
4433 }
4434
4435 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4436 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
4437 } else {
4438 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
4439 if (iter != cb.get_attrs().end()) {
4440 try {
4441 decode(delete_at, iter->second);
4442 } catch (buffer::error& err) {
4443 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
4444 }
4445 }
4446 }
4447
4448 if (src_mtime) {
4449 *src_mtime = set_mtime;
4450 }
4451
4452 if (petag) {
4453 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
4454 if (iter != cb.get_attrs().end()) {
4455 *petag = iter->second.to_str();
4456 }
4457 }
4458
4459 //erase the append attr
4460 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
4461
4462 if (source_zone.empty()) {
4463 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
4464 } else {
4465 attrs = cb.get_attrs();
4466 }
4467
4468 if (copy_if_newer) {
4469 uint64_t pg_ver = 0;
4470 auto i = attrs.find(RGW_ATTR_PG_VER);
4471 if (i != attrs.end() && i->second.length() > 0) {
4472 auto iter = i->second.cbegin();
4473 try {
4474 decode(pg_ver, iter);
4475 } catch (buffer::error& err) {
4476 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
4477 /* non critical error */
4478 }
4479 }
4480 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
4481 }
4482
4483 #define MAX_COMPLETE_RETRY 100
4484 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
4485 bool canceled = false;
4486 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
4487 attrs, delete_at, nullptr, nullptr, nullptr,
4488 zones_trace, &canceled);
4489 if (ret < 0) {
4490 goto set_err_state;
4491 }
4492 if (copy_if_newer && canceled) {
4493 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
4494 obj_ctx.invalidate(dest_obj); /* object was overwritten */
4495 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
4496 if (ret < 0) {
4497 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
4498 goto set_err_state;
4499 }
4500 dest_mtime_weight.init(dest_state);
4501 dest_mtime_weight.high_precision = high_precision_time;
4502 if (!dest_state->exists ||
4503 dest_mtime_weight < set_mtime_weight) {
4504 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4505 continue;
4506 } else {
4507 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4508 }
4509 }
4510 break;
4511 }
4512
4513 if (i == MAX_COMPLETE_RETRY) {
4514 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
4515 ret = -EIO;
4516 goto set_err_state;
4517 }
4518
4519 if (bytes_transferred) {
4520 *bytes_transferred = cb.get_data_len();
4521 }
4522 return 0;
4523 set_err_state:
4524 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
4525 // we may have already fetched during sync of OP_ADD, but were waiting
4526 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4527 if (olh_epoch && *olh_epoch > 0) {
4528 constexpr bool log_data_change = true;
4529 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
4530 *olh_epoch, real_time(), false, zones_trace, log_data_change);
4531 } else {
4532 // we already have the latest copy
4533 ret = 0;
4534 }
4535 }
4536 return ret;
4537 }
4538
4539
4540 int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
4541 map<string, bufferlist>& src_attrs,
4542 RGWRados::Object::Read& read_op,
4543 const rgw_user& user_id,
4544 rgw_obj& dest_obj,
4545 real_time *mtime)
4546 {
4547 string etag;
4548
4549 RGWRESTStreamS3PutObj *out_stream_req;
4550
4551 auto rest_master_conn = svc.zone->get_master_conn();
4552
4553 int ret = rest_master_conn->put_obj_async(user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
4554 if (ret < 0) {
4555 return ret;
4556 }
4557
4558 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
4559 if (ret < 0) {
4560 delete out_stream_req;
4561 return ret;
4562 }
4563
4564 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
4565 if (ret < 0)
4566 return ret;
4567
4568 return 0;
4569 }
4570
4571 /**
4572 * Copy an object.
4573 * dest_obj: the object to copy into
4574 * src_obj: the object to copy from
4575 * attrs: usage depends on attrs_mod parameter
4576 * attrs_mod: the modification mode of the attrs, may have the following values:
4577 * ATTRSMOD_NONE - the attributes of the source object will be
4578 * copied without modifications, attrs parameter is ignored;
4579 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4580 * parameter, source object attributes are not copied;
4581 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4582 * are overwritten by values contained in attrs parameter.
4583 * err: stores any errors resulting from the get of the original object
4584 * Returns: 0 on success, -ERR# otherwise.
4585 */
4586 int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4587 const rgw_user& user_id,
4588 req_info *info,
4589 const string& source_zone,
4590 rgw_obj& dest_obj,
4591 rgw_obj& src_obj,
4592 RGWBucketInfo& dest_bucket_info,
4593 RGWBucketInfo& src_bucket_info,
4594 const rgw_placement_rule& dest_placement,
4595 real_time *src_mtime,
4596 real_time *mtime,
4597 const real_time *mod_ptr,
4598 const real_time *unmod_ptr,
4599 bool high_precision_time,
4600 const char *if_match,
4601 const char *if_nomatch,
4602 AttrsMod attrs_mod,
4603 bool copy_if_newer,
4604 map<string, bufferlist>& attrs,
4605 RGWObjCategory category,
4606 uint64_t olh_epoch,
4607 real_time delete_at,
4608 string *version_id,
4609 string *ptag,
4610 string *petag,
4611 void (*progress_cb)(off_t, void *),
4612 void *progress_data)
4613 {
4614 int ret;
4615 uint64_t obj_size;
4616 rgw_obj shadow_obj = dest_obj;
4617 string shadow_oid;
4618
4619 bool remote_src;
4620 bool remote_dest;
4621
4622 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
4623 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
4624
4625 auto& zonegroup = svc.zone->get_zonegroup();
4626
4627 remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
4628 remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
4629
4630 if (remote_src && remote_dest) {
4631 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
4632 return -EINVAL;
4633 }
4634
4635 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
4636
4637 if (remote_src || !source_zone.empty()) {
4638 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
4639 dest_obj, src_obj, dest_bucket_info, src_bucket_info,
4640 dest_placement, src_mtime, mtime, mod_ptr,
4641 unmod_ptr, high_precision_time,
4642 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
4643 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data);
4644 }
4645
4646 map<string, bufferlist> src_attrs;
4647 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
4648 RGWRados::Object::Read read_op(&src_op_target);
4649
4650 read_op.conds.mod_ptr = mod_ptr;
4651 read_op.conds.unmod_ptr = unmod_ptr;
4652 read_op.conds.high_precision_time = high_precision_time;
4653 read_op.conds.if_match = if_match;
4654 read_op.conds.if_nomatch = if_nomatch;
4655 read_op.params.attrs = &src_attrs;
4656 read_op.params.lastmod = src_mtime;
4657 read_op.params.obj_size = &obj_size;
4658
4659 ret = read_op.prepare();
4660 if (ret < 0) {
4661 return ret;
4662 }
4663 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4664 // Current implementation does not follow S3 spec and even
4665 // may result in data corruption silently when copying
4666 // multipart objects acorss pools. So reject COPY operations
4667 //on encrypted objects before it is fully functional.
4668 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
4669 << " has not been implemented." << dendl;
4670 return -ERR_NOT_IMPLEMENTED;
4671 }
4672
4673 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4674 src_attrs.erase(RGW_ATTR_DELETE_AT);
4675
4676 set_copy_attrs(src_attrs, attrs, attrs_mod);
4677 attrs.erase(RGW_ATTR_ID_TAG);
4678 attrs.erase(RGW_ATTR_PG_VER);
4679 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4680 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4681 if (cmp != src_attrs.end())
4682 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4683
4684 RGWObjManifest manifest;
4685 RGWObjState *astate = NULL;
4686
4687 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
4688 if (ret < 0) {
4689 return ret;
4690 }
4691
4692 vector<rgw_raw_obj> ref_objs;
4693
4694 if (remote_dest) {
4695 /* dest is in a different zonegroup, copy it there */
4696 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
4697 }
4698 uint64_t max_chunk_size;
4699
4700 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
4701 if (ret < 0) {
4702 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
4703 return ret;
4704 }
4705
4706 rgw_pool src_pool;
4707 rgw_pool dest_pool;
4708
4709 const rgw_placement_rule *src_rule{nullptr};
4710
4711 if (astate->has_manifest) {
4712 src_rule = &astate->manifest.get_tail_placement().placement_rule;
4713 ldout(cct, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
4714 }
4715
4716 if (!src_rule || src_rule->empty()) {
4717 src_rule = &src_bucket_info.placement_rule;
4718 }
4719
4720 if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
4721 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
4722 return -EIO;
4723 }
4724
4725 if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
4726 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
4727 return -EIO;
4728 }
4729
4730 ldout(cct, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
4731 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4732
4733 bool copy_data = !astate->has_manifest ||
4734 (*src_rule != dest_placement) ||
4735 (src_pool != dest_pool);
4736
4737 bool copy_first = false;
4738 if (astate->has_manifest) {
4739 if (!astate->manifest.has_tail()) {
4740 copy_data = true;
4741 } else {
4742 uint64_t head_size = astate->manifest.get_head_size();
4743
4744 if (head_size > 0) {
4745 if (head_size > max_chunk_size) {
4746 copy_data = true;
4747 } else {
4748 copy_first = true;
4749 }
4750 }
4751 }
4752 }
4753
4754 if (petag) {
4755 const auto iter = attrs.find(RGW_ATTR_ETAG);
4756 if (iter != attrs.end()) {
4757 *petag = iter->second.to_str();
4758 }
4759 }
4760
4761 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
4762 attrs.erase(RGW_ATTR_TAIL_TAG);
4763 return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
4764 mtime, real_time(), attrs, olh_epoch, delete_at, petag);
4765 }
4766
4767 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
4768
4769 if (copy_first) { // we need to copy first chunk, not increase refcount
4770 ++miter;
4771 }
4772
4773 rgw_rados_ref ref;
4774 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
4775 if (ret < 0) {
4776 return ret;
4777 }
4778
4779 bufferlist first_chunk;
4780
4781 bool copy_itself = (dest_obj == src_obj);
4782 RGWObjManifest *pmanifest;
4783 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
4784
4785 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
4786 RGWRados::Object::Write write_op(&dest_op_target);
4787
4788 string tag;
4789
4790 if (ptag) {
4791 tag = *ptag;
4792 }
4793
4794 if (tag.empty()) {
4795 append_rand_alpha(cct, tag, tag, 32);
4796 }
4797
4798 if (!copy_itself) {
4799 attrs.erase(RGW_ATTR_TAIL_TAG);
4800 manifest = astate->manifest;
4801 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4802 if (tail_placement.bucket.name.empty()) {
4803 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
4804 }
4805 string ref_tag;
4806 for (; miter != astate->manifest.obj_end(); ++miter) {
4807 ObjectWriteOperation op;
4808 ref_tag = tag + '\0';
4809 cls_refcount_get(op, ref_tag, true);
4810 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
4811 ref.ioctx.locator_set_key(loc.loc);
4812
4813 ret = ref.ioctx.operate(loc.oid, &op);
4814 if (ret < 0) {
4815 goto done_ret;
4816 }
4817
4818 ref_objs.push_back(loc);
4819 }
4820
4821 pmanifest = &manifest;
4822 } else {
4823 pmanifest = &astate->manifest;
4824 /* don't send the object's tail for garbage collection */
4825 astate->keep_tail = true;
4826 }
4827
4828 if (copy_first) {
4829 ret = read_op.read(0, max_chunk_size, first_chunk);
4830 if (ret < 0) {
4831 goto done_ret;
4832 }
4833
4834 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
4835 } else {
4836 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
4837 }
4838
4839 write_op.meta.data = &first_chunk;
4840 write_op.meta.manifest = pmanifest;
4841 write_op.meta.ptag = &tag;
4842 write_op.meta.owner = dest_bucket_info.owner;
4843 write_op.meta.mtime = mtime;
4844 write_op.meta.flags = PUT_OBJ_CREATE;
4845 write_op.meta.category = category;
4846 write_op.meta.olh_epoch = olh_epoch;
4847 write_op.meta.delete_at = delete_at;
4848 write_op.meta.modify_tail = !copy_itself;
4849
4850 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
4851 if (ret < 0) {
4852 goto done_ret;
4853 }
4854
4855 return 0;
4856
4857 done_ret:
4858 if (!copy_itself) {
4859 vector<rgw_raw_obj>::iterator riter;
4860
4861 /* rollback reference */
4862 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4863 ObjectWriteOperation op;
4864 cls_refcount_put(op, tag, true);
4865
4866 ref.ioctx.locator_set_key(riter->loc);
4867
4868 int r = ref.ioctx.operate(riter->oid, &op);
4869 if (r < 0) {
4870 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
4871 }
4872 }
4873 }
4874 return ret;
4875 }
4876
4877
4878 int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
4879 RGWBucketInfo& dest_bucket_info,
4880 const rgw_placement_rule& dest_placement,
4881 RGWRados::Object::Read& read_op, off_t end,
4882 const rgw_obj& dest_obj,
4883 real_time *mtime,
4884 real_time set_mtime,
4885 map<string, bufferlist>& attrs,
4886 uint64_t olh_epoch,
4887 real_time delete_at,
4888 string *petag)
4889 {
4890 string tag;
4891 append_rand_alpha(cct, tag, tag, 32);
4892
4893 rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
4894 using namespace rgw::putobj;
4895 AtomicObjectProcessor processor(&aio, this, dest_bucket_info, &dest_placement,
4896 dest_bucket_info.owner, obj_ctx,
4897 dest_obj, olh_epoch, tag);
4898 int ret = processor.prepare();
4899 if (ret < 0)
4900 return ret;
4901
4902 off_t ofs = 0;
4903
4904 do {
4905 bufferlist bl;
4906 ret = read_op.read(ofs, end, bl);
4907 if (ret < 0) {
4908 ldout(cct, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
4909 return ret;
4910 }
4911
4912 uint64_t read_len = ret;
4913 ret = processor.process(std::move(bl), ofs);
4914 if (ret < 0) {
4915 return ret;
4916 }
4917
4918 ofs += read_len;
4919 } while (ofs <= end);
4920
4921 // flush
4922 ret = processor.process({}, ofs);
4923 if (ret < 0) {
4924 return ret;
4925 }
4926
4927 string etag;
4928 auto iter = attrs.find(RGW_ATTR_ETAG);
4929 if (iter != attrs.end()) {
4930 bufferlist& bl = iter->second;
4931 etag = bl.to_str();
4932 if (petag) {
4933 *petag = etag;
4934 }
4935 }
4936
4937 uint64_t accounted_size;
4938 {
4939 bool compressed{false};
4940 RGWCompressionInfo cs_info;
4941 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4942 if (ret < 0) {
4943 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
4944 return ret;
4945 }
4946 // pass original size if compressed
4947 accounted_size = compressed ? cs_info.orig_size : ofs;
4948 }
4949
4950 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
4951 nullptr, nullptr, nullptr, nullptr, nullptr);
4952 }
4953
4954 int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
4955 RGWBucketInfo& bucket_info,
4956 rgw_obj& obj,
4957 const rgw_placement_rule& placement_rule,
4958 const real_time& mtime,
4959 uint64_t olh_epoch)
4960 {
4961 map<string, bufferlist> attrs;
4962 real_time read_mtime;
4963 uint64_t obj_size;
4964
4965 RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
4966 RGWRados::Object::Read read_op(&op_target);
4967
4968 read_op.params.attrs = &attrs;
4969 read_op.params.lastmod = &read_mtime;
4970 read_op.params.obj_size = &obj_size;
4971
4972 int ret = read_op.prepare();
4973 if (ret < 0) {
4974 return ret;
4975 }
4976
4977 if (read_mtime != mtime) {
4978 /* raced */
4979 return -ECANCELED;
4980 }
4981
4982 ret = copy_obj_data(obj_ctx,
4983 bucket_info,
4984 placement_rule,
4985 read_op,
4986 obj_size - 1,
4987 obj,
4988 nullptr /* pmtime */,
4989 mtime,
4990 attrs,
4991 olh_epoch,
4992 real_time(),
4993 nullptr /* petag */);
4994 if (ret < 0) {
4995 return ret;
4996 }
4997
4998 return 0;
4999 }
5000
5001 int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
5002 {
5003 std::vector<rgw_bucket_dir_entry> ent_list;
5004 rgw_obj_index_key marker;
5005 string prefix;
5006 bool is_truncated;
5007
5008 do {
5009 constexpr uint NUM_ENTRIES = 1000u;
5010 int r = cls_bucket_list_unordered(bucket_info,
5011 RGW_NO_SHARD,
5012 marker,
5013 prefix,
5014 NUM_ENTRIES,
5015 true,
5016 ent_list,
5017 &is_truncated,
5018 &marker);
5019 if (r < 0)
5020 return r;
5021
5022 string ns;
5023 for (auto const& dirent : ent_list) {
5024 rgw_obj_key obj;
5025
5026 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
5027 return -ENOTEMPTY;
5028 }
5029 } while (is_truncated);
5030
5031 return 0;
5032 }
5033
5034 /**
5035 * Delete a bucket.
5036 * bucket: the name of the bucket to delete
5037 * Returns 0 on success, -ERR# otherwise.
5038 */
5039 int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
5040 {
5041 const rgw_bucket& bucket = bucket_info.bucket;
5042 librados::IoCtx index_ctx;
5043 map<int, string> bucket_objs;
5044 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5045 if (r < 0)
5046 return r;
5047
5048 if (check_empty) {
5049 r = check_bucket_empty(bucket_info);
5050 if (r < 0) {
5051 return r;
5052 }
5053 }
5054
5055 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
5056 if (r < 0)
5057 return r;
5058
5059 /* if the bucket is not synced we can remove the meta file */
5060 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
5061 RGWObjVersionTracker objv_tracker;
5062 r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker);
5063 if (r < 0) {
5064 return r;
5065 }
5066
5067 /* remove bucket index objects asynchronously by best effort */
5068 (void) CLSRGWIssueBucketIndexClean(index_ctx,
5069 bucket_objs,
5070 cct->_conf->rgw_bucket_index_max_aio)();
5071 }
5072
5073 return 0;
5074 }
5075
5076 int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
5077 {
5078 RGWBucketInfo info;
5079 map<string, bufferlist> attrs;
5080 auto obj_ctx = svc.sysobj->init_obj_ctx();
5081 int r;
5082 if (bucket.bucket_id.empty()) {
5083 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
5084 } else {
5085 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
5086 }
5087 if (r < 0) {
5088 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
5089 return r;
5090 }
5091
5092 info.owner = owner.get_id();
5093
5094 r = put_bucket_instance_info(info, false, real_time(), &attrs);
5095 if (r < 0) {
5096 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
5097 return r;
5098 }
5099
5100 return 0;
5101 }
5102
5103
5104 int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
5105 {
5106 int ret = 0;
5107
5108 vector<rgw_bucket>::iterator iter;
5109
5110 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
5111 rgw_bucket& bucket = *iter;
5112 if (enabled)
5113 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
5114 else
5115 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
5116
5117 RGWBucketInfo info;
5118 map<string, bufferlist> attrs;
5119 auto obj_ctx = svc.sysobj->init_obj_ctx();
5120 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
5121 if (r < 0) {
5122 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
5123 ret = r;
5124 continue;
5125 }
5126 if (enabled) {
5127 info.flags &= ~BUCKET_SUSPENDED;
5128 } else {
5129 info.flags |= BUCKET_SUSPENDED;
5130 }
5131
5132 r = put_bucket_instance_info(info, false, real_time(), &attrs);
5133 if (r < 0) {
5134 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
5135 ret = r;
5136 continue;
5137 }
5138 }
5139 return ret;
5140 }
5141
5142 int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
5143 {
5144 RGWBucketInfo bucket_info;
5145 auto obj_ctx = svc.sysobj->init_obj_ctx();
5146 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
5147 if (ret < 0) {
5148 return ret;
5149 }
5150
5151 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
5152 return 0;
5153 }
5154
5155 int RGWRados::Object::complete_atomic_modification()
5156 {
5157 if (!state->has_manifest || state->keep_tail)
5158 return 0;
5159
5160 cls_rgw_obj_chain chain;
5161 store->update_gc_chain(obj, state->manifest, &chain);
5162
5163 if (chain.empty()) {
5164 return 0;
5165 }
5166
5167 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
5168 return store->gc->send_chain(chain, tag, false); // do it async
5169 }
5170
5171 void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
5172 {
5173 RGWObjManifest::obj_iterator iter;
5174 rgw_raw_obj raw_head;
5175 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
5176 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
5177 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
5178 if (mobj == raw_head)
5179 continue;
5180 cls_rgw_obj_key key(mobj.oid);
5181 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
5182 }
5183 }
5184
5185 int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
5186 {
5187 return gc->send_chain(chain, tag, sync);
5188 }
5189
5190 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
5191 librados::IoCtx& index_ctx,
5192 string& bucket_oid)
5193 {
5194 const rgw_bucket& bucket = bucket_info.bucket;
5195 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5196 if (r < 0)
5197 return r;
5198
5199 if (bucket.bucket_id.empty()) {
5200 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
5201 return -EIO;
5202 }
5203
5204 bucket_oid = dir_oid_prefix;
5205 bucket_oid.append(bucket.bucket_id);
5206
5207 return 0;
5208 }
5209
5210 int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
5211 librados::IoCtx& index_ctx,
5212 string& bucket_oid_base) {
5213 const rgw_bucket& bucket = bucket_info.bucket;
5214 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5215 if (r < 0)
5216 return r;
5217
5218 if (bucket.bucket_id.empty()) {
5219 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
5220 return -EIO;
5221 }
5222
5223 bucket_oid_base = dir_oid_prefix;
5224 bucket_oid_base.append(bucket.bucket_id);
5225
5226 return 0;
5227
5228 }
5229
5230 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
5231 librados::IoCtx& index_ctx,
5232 map<int, string>& bucket_objs,
5233 int shard_id,
5234 map<int, string> *bucket_instance_ids) {
5235 string bucket_oid_base;
5236 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5237 if (ret < 0) {
5238 return ret;
5239 }
5240
5241 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
5242 if (bucket_instance_ids) {
5243 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
5244 }
5245 return 0;
5246 }
5247
5248 template<typename T>
5249 int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5250 map<int, string>& oids, map<int, T>& bucket_objs,
5251 int shard_id, map<int, string> *bucket_instance_ids)
5252 {
5253 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
5254 if (ret < 0)
5255 return ret;
5256
5257 map<int, string>::const_iterator iter = oids.begin();
5258 for (; iter != oids.end(); ++iter) {
5259 bucket_objs[iter->first] = T();
5260 }
5261 return 0;
5262 }
5263
5264 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5265 const string& obj_key, string *bucket_obj, int *shard_id)
5266 {
5267 string bucket_oid_base;
5268 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5269 if (ret < 0)
5270 return ret;
5271
5272 RGWObjectCtx obj_ctx(this);
5273
5274 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
5275 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
5276 if (ret < 0) {
5277 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
5278 return ret;
5279 }
5280 return 0;
5281 }
5282
5283 int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5284 int shard_id, string *bucket_obj)
5285 {
5286 string bucket_oid_base;
5287 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5288 if (ret < 0)
5289 return ret;
5290
5291 RGWObjectCtx obj_ctx(this);
5292
5293 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
5294 shard_id, bucket_obj);
5295 return 0;
5296 }
5297
5298 static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
5299 map<RGWObjCategory, RGWStorageStats>& stats)
5300 {
5301 for (const auto& pair : header.stats) {
5302 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
5303 const rgw_bucket_category_stats& header_stats = pair.second;
5304
5305 RGWStorageStats& s = stats[category];
5306
5307 s.category = category;
5308 s.size += header_stats.total_size;
5309 s.size_rounded += header_stats.total_size_rounded;
5310 s.size_utilized += header_stats.actual_size;
5311 s.num_objects += header_stats.num_entries;
5312 }
5313 }
5314
5315 int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
5316 map<RGWObjCategory, RGWStorageStats> *existing_stats,
5317 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
5318 {
5319 librados::IoCtx index_ctx;
5320 // key - bucket index object id
5321 // value - bucket index check OP returned result with the given bucket index object (shard)
5322 map<int, string> oids;
5323 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
5324
5325 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
5326 if (ret < 0) {
5327 return ret;
5328 }
5329
5330 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
5331 if (ret < 0) {
5332 return ret;
5333 }
5334
5335 // Aggregate results (from different shards if there is any)
5336 map<int, struct rgw_cls_check_index_ret>::iterator iter;
5337 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
5338 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
5339 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
5340 }
5341
5342 return 0;
5343 }
5344
5345 int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
5346 {
5347 librados::IoCtx index_ctx;
5348 map<int, string> bucket_objs;
5349
5350 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5351 if (r < 0) {
5352 return r;
5353 }
5354
5355 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5356 }
5357
5358 int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
5359 {
5360 librados::IoCtx index_ctx;
5361 map<int, string> bucket_objs;
5362
5363 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5364 if (r < 0) {
5365 return r;
5366 }
5367
5368 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
5369 }
5370
5371 int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
5372 {
5373 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5374 std::string oid, key;
5375 get_obj_bucket_and_oid_loc(obj, oid, key);
5376 if (!rctx)
5377 return 0;
5378
5379 RGWObjState *state = NULL;
5380
5381 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
5382 if (r < 0)
5383 return r;
5384
5385 if (!state->is_atomic) {
5386 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
5387 return -EINVAL;
5388 }
5389
5390 string tag;
5391
5392 if (state->tail_tag.length() > 0) {
5393 tag = state->tail_tag.c_str();
5394 } else if (state->obj_tag.length() > 0) {
5395 tag = state->obj_tag.c_str();
5396 } else {
5397 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
5398 return -EINVAL;
5399 }
5400
5401 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
5402
5403 return gc->defer_chain(tag, false);
5404 }
5405
5406 void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
5407 {
5408 list<string> prefixes;
5409 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
5410 cls_rgw_remove_obj(op, prefixes);
5411 }
5412
5413 void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
5414 {
5415 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
5416 }
5417
5418 void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
5419 {
5420 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
5421 }
5422
5423
5424 /**
5425 * Delete an object.
5426 * bucket: name of the bucket storing the object
5427 * obj: name of the object to delete
5428 * Returns: 0 on success, -ERR# otherwise.
5429 */
5430 int RGWRados::Object::Delete::delete_obj()
5431 {
5432 RGWRados *store = target->get_store();
5433 rgw_obj& src_obj = target->get_obj();
5434 const string& instance = src_obj.key.instance;
5435 rgw_obj obj = src_obj;
5436
5437 if (instance == "null") {
5438 obj.key.instance.clear();
5439 }
5440
5441 bool explicit_marker_version = (!params.marker_version_id.empty());
5442
5443 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
5444 if (instance.empty() || explicit_marker_version) {
5445 rgw_obj marker = obj;
5446
5447 if (!params.marker_version_id.empty()) {
5448 if (params.marker_version_id != "null") {
5449 marker.key.set_instance(params.marker_version_id);
5450 }
5451 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
5452 store->gen_rand_obj_instance_name(&marker);
5453 }
5454
5455 result.version_id = marker.key.instance;
5456 if (result.version_id.empty())
5457 result.version_id = "null";
5458 result.delete_marker = true;
5459
5460 struct rgw_bucket_dir_entry_meta meta;
5461
5462 meta.owner = params.obj_owner.get_id().to_str();
5463 meta.owner_display_name = params.obj_owner.get_display_name();
5464
5465 if (real_clock::is_zero(params.mtime)) {
5466 meta.mtime = real_clock::now();
5467 } else {
5468 meta.mtime = params.mtime;
5469 }
5470
5471 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
5472 if (r < 0) {
5473 return r;
5474 }
5475 } else {
5476 rgw_bucket_dir_entry dirent;
5477
5478 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
5479 if (r < 0) {
5480 return r;
5481 }
5482 result.delete_marker = dirent.is_delete_marker();
5483 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
5484 if (r < 0) {
5485 return r;
5486 }
5487 result.version_id = instance;
5488 }
5489
5490 BucketShard *bs;
5491 int r = target->get_bucket_shard(&bs);
5492 if (r < 0) {
5493 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
5494 return r;
5495 }
5496
5497 if (target->bucket_info.datasync_flag_enabled()) {
5498 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
5499 if (r < 0) {
5500 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
5501 return r;
5502 }
5503 }
5504
5505 return 0;
5506 }
5507
5508 rgw_rados_ref ref;
5509 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
5510 if (r < 0) {
5511 return r;
5512 }
5513
5514 RGWObjState *state;
5515 r = target->get_state(&state, false);
5516 if (r < 0)
5517 return r;
5518
5519 ObjectWriteOperation op;
5520
5521 if (!real_clock::is_zero(params.unmod_since)) {
5522 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
5523 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
5524 if (!params.high_precision_time) {
5525 ctime.tv_nsec = 0;
5526 unmod.tv_nsec = 0;
5527 }
5528
5529 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
5530 if (ctime > unmod) {
5531 return -ERR_PRECONDITION_FAILED;
5532 }
5533
5534 /* only delete object if mtime is less than or equal to params.unmod_since */
5535 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
5536 }
5537 uint64_t obj_accounted_size = state->accounted_size;
5538
5539 if (!real_clock::is_zero(params.expiration_time)) {
5540 bufferlist bl;
5541 real_time delete_at;
5542
5543 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5544 try {
5545 auto iter = bl.cbegin();
5546 decode(delete_at, iter);
5547 } catch (buffer::error& err) {
5548 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
5549 return -EIO;
5550 }
5551
5552 if (params.expiration_time != delete_at) {
5553 return -ERR_PRECONDITION_FAILED;
5554 }
5555 } else {
5556 return -ERR_PRECONDITION_FAILED;
5557 }
5558 }
5559
5560 if (!state->exists) {
5561 target->invalidate_state();
5562 return -ENOENT;
5563 }
5564
5565 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
5566 if (r < 0)
5567 return r;
5568
5569 RGWBucketInfo& bucket_info = target->get_bucket_info();
5570
5571 RGWRados::Bucket bop(store, bucket_info);
5572 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5573
5574 index_op.set_zones_trace(params.zones_trace);
5575 index_op.set_bilog_flags(params.bilog_flags);
5576
5577 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
5578 if (r < 0)
5579 return r;
5580
5581 store->remove_rgw_head_obj(op);
5582 r = ref.ioctx.operate(ref.obj.oid, &op);
5583
5584 /* raced with another operation, object state is indeterminate */
5585 const bool need_invalidate = (r == -ECANCELED);
5586
5587 int64_t poolid = ref.ioctx.get_id();
5588 if (r >= 0) {
5589 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5590 if (obj_tombstone_cache) {
5591 tombstone_entry entry{*state};
5592 obj_tombstone_cache->add(obj, entry);
5593 }
5594 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
5595
5596 int ret = target->complete_atomic_modification();
5597 if (ret < 0) {
5598 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
5599 }
5600 /* other than that, no need to propagate error */
5601 } else {
5602 int ret = index_op.cancel();
5603 if (ret < 0) {
5604 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
5605 }
5606 }
5607
5608 if (need_invalidate) {
5609 target->invalidate_state();
5610 }
5611
5612 if (r < 0)
5613 return r;
5614
5615 /* update quota cache */
5616 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
5617
5618 return 0;
5619 }
5620
5621 int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
5622 const RGWBucketInfo& bucket_info,
5623 const rgw_obj& obj,
5624 int versioning_status,
5625 uint16_t bilog_flags,
5626 const real_time& expiration_time,
5627 rgw_zone_set *zones_trace)
5628 {
5629 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5630 RGWRados::Object::Delete del_op(&del_target);
5631
5632 del_op.params.bucket_owner = bucket_info.owner;
5633 del_op.params.versioning_status = versioning_status;
5634 del_op.params.bilog_flags = bilog_flags;
5635 del_op.params.expiration_time = expiration_time;
5636 del_op.params.zones_trace = zones_trace;
5637
5638 return del_op.delete_obj();
5639 }
5640
5641 int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
5642 {
5643 rgw_rados_ref ref;
5644 int r = get_raw_obj_ref(obj, &ref);
5645 if (r < 0) {
5646 return r;
5647 }
5648
5649 ObjectWriteOperation op;
5650
5651 op.remove();
5652 r = ref.ioctx.operate(ref.obj.oid, &op);
5653 if (r < 0)
5654 return r;
5655
5656 return 0;
5657 }
5658
5659 int RGWRados::delete_obj_index(const rgw_obj& obj)
5660 {
5661 std::string oid, key;
5662 get_obj_bucket_and_oid_loc(obj, oid, key);
5663
5664 auto obj_ctx = svc.sysobj->init_obj_ctx();
5665
5666 RGWBucketInfo bucket_info;
5667 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
5668 if (ret < 0) {
5669 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
5670 return ret;
5671 }
5672
5673 RGWRados::Bucket bop(this, bucket_info);
5674 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5675
5676 real_time removed_mtime;
5677 int r = index_op.complete_del(-1 /* pool */, 0, removed_mtime, NULL);
5678
5679 return r;
5680 }
5681
5682 static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
5683 {
5684 string tag;
5685
5686 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
5687 if (mi != manifest.obj_end()) {
5688 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5689 ++mi;
5690 tag = mi.get_location().get_raw_obj(store).oid;
5691 tag.append("_");
5692 }
5693
5694 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5695 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5696 MD5 hash;
5697 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
5698
5699 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5700 if (iter != attrset.end()) {
5701 bufferlist& bl = iter->second;
5702 hash.Update((const unsigned char *)bl.c_str(), bl.length());
5703 }
5704
5705 hash.Final(md5);
5706 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5707 tag.append(md5_str);
5708
5709 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
5710
5711 tag_bl.append(tag.c_str(), tag.size() + 1);
5712 }
5713
5714 static bool is_olh(map<string, bufferlist>& attrs)
5715 {
5716 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5717 return (iter != attrs.end());
5718 }
5719
5720 static bool has_olh_tag(map<string, bufferlist>& attrs)
5721 {
5722 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5723 return (iter != attrs.end());
5724 }
5725
5726 int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5727 RGWObjState *olh_state, RGWObjState **target_state)
5728 {
5729 ceph_assert(olh_state->is_olh);
5730
5731 rgw_obj target;
5732 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
5733 if (r < 0) {
5734 return r;
5735 }
5736 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
5737 if (r < 0) {
5738 return r;
5739 }
5740
5741 return 0;
5742 }
5743
5744 int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5745 RGWObjState **state, bool follow_olh, bool assume_noent)
5746 {
5747 if (obj.empty()) {
5748 return -EINVAL;
5749 }
5750
5751 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5752
5753 RGWObjState *s = rctx->get_state(obj);
5754 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
5755 *state = s;
5756 if (s->has_attrs) {
5757 if (s->is_olh && need_follow_olh) {
5758 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
5759 }
5760 return 0;
5761 }
5762
5763 s->obj = obj;
5764
5765 rgw_raw_obj raw_obj;
5766 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5767
5768 int r = -ENOENT;
5769
5770 if (!assume_noent) {
5771 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
5772 }
5773
5774 if (r == -ENOENT) {
5775 s->exists = false;
5776 s->has_attrs = true;
5777 tombstone_entry entry;
5778 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5779 s->mtime = entry.mtime;
5780 s->zone_short_id = entry.zone_short_id;
5781 s->pg_ver = entry.pg_ver;
5782 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
5783 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5784 } else {
5785 s->mtime = real_time();
5786 }
5787 return 0;
5788 }
5789 if (r < 0)
5790 return r;
5791
5792 s->exists = true;
5793 s->has_attrs = true;
5794 s->accounted_size = s->size;
5795
5796 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5797 if (iter != s->attrset.end()) {
5798 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5799 bufferlist& bletag = iter->second;
5800 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5801 bufferlist newbl;
5802 bletag.splice(0, bletag.length() - 1, &newbl);
5803 bletag.claim(newbl);
5804 }
5805 }
5806
5807 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
5808 const bool compressed = (iter != s->attrset.end());
5809 if (compressed) {
5810 // use uncompressed size for accounted_size
5811 try {
5812 RGWCompressionInfo info;
5813 auto p = iter->second.cbegin();
5814 decode(info, p);
5815 s->accounted_size = info.orig_size;
5816 } catch (buffer::error&) {
5817 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
5818 return -EIO;
5819 }
5820 }
5821
5822 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5823 if (iter != s->attrset.end()) {
5824 bufferlist bl = iter->second;
5825 bufferlist::iterator it = bl.begin();
5826 it.copy(bl.length(), s->shadow_obj);
5827 s->shadow_obj[bl.length()] = '\0';
5828 }
5829 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
5830 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5831 if (ttiter != s->attrset.end()) {
5832 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5833 }
5834
5835 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5836 if (manifest_bl.length()) {
5837 auto miter = manifest_bl.cbegin();
5838 try {
5839 decode(s->manifest, miter);
5840 s->has_manifest = true;
5841 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
5842 broken due to old bugs */
5843 s->size = s->manifest.get_obj_size();
5844 if (!compressed)
5845 s->accounted_size = s->size;
5846 } catch (buffer::error& err) {
5847 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
5848 return -EIO;
5849 }
5850 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
5851 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
5852 s->manifest.has_explicit_objs()) {
5853 RGWObjManifest::obj_iterator mi;
5854 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
5855 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
5856 }
5857 }
5858
5859 if (!s->obj_tag.length()) {
5860 /*
5861 * Uh oh, something's wrong, object with manifest should have tag. Let's
5862 * create one out of the manifest, would be unique
5863 */
5864 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
5865 s->fake_tag = true;
5866 }
5867 }
5868 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5869 if (aiter != s->attrset.end()) {
5870 bufferlist& pg_ver_bl = aiter->second;
5871 if (pg_ver_bl.length()) {
5872 auto pgbl = pg_ver_bl.cbegin();
5873 try {
5874 decode(s->pg_ver, pgbl);
5875 } catch (buffer::error& err) {
5876 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5877 }
5878 }
5879 }
5880 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5881 if (aiter != s->attrset.end()) {
5882 bufferlist& zone_short_id_bl = aiter->second;
5883 if (zone_short_id_bl.length()) {
5884 auto zbl = zone_short_id_bl.cbegin();
5885 try {
5886 decode(s->zone_short_id, zbl);
5887 } catch (buffer::error& err) {
5888 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5889 }
5890 }
5891 }
5892 if (s->obj_tag.length())
5893 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
5894 else
5895 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5896
5897 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5898 * it exist, and not only if is_olh() returns true
5899 */
5900 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5901 if (iter != s->attrset.end()) {
5902 s->olh_tag = iter->second;
5903 }
5904
5905 if (is_olh(s->attrset)) {
5906 s->is_olh = true;
5907
5908 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
5909
5910 if (need_follow_olh) {
5911 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
5912 } else if (obj.key.have_null_instance() && !s->has_manifest) {
5913 // read null version, and the head object only have olh info
5914 s->exists = false;
5915 return -ENOENT;
5916 }
5917 }
5918
5919 return 0;
5920 }
5921
5922 int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
5923 bool follow_olh, bool assume_noent)
5924 {
5925 int ret;
5926
5927 do {
5928 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
5929 } while (ret == -EAGAIN);
5930
5931 return ret;
5932 }
5933
5934 int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
5935 {
5936 RGWObjState *astate;
5937 int r = get_state(&astate, true);
5938 if (r < 0) {
5939 return r;
5940 }
5941
5942 *pmanifest = &astate->manifest;
5943
5944 return 0;
5945 }
5946
5947 int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
5948 {
5949 RGWObjState *state;
5950 int r = source->get_state(&state, true);
5951 if (r < 0)
5952 return r;
5953 if (!state->exists)
5954 return -ENOENT;
5955 if (!state->get_attr(name, dest))
5956 return -ENODATA;
5957
5958 return 0;
5959 }
5960
5961
5962 int RGWRados::Object::Stat::stat_async()
5963 {
5964 RGWObjectCtx& ctx = source->get_ctx();
5965 rgw_obj& obj = source->get_obj();
5966 RGWRados *store = source->get_store();
5967
5968 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
5969 result.obj = obj;
5970 if (s->has_attrs) {
5971 state.ret = 0;
5972 result.size = s->size;
5973 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5974 result.attrs = s->attrset;
5975 result.has_manifest = s->has_manifest;
5976 result.manifest = s->manifest;
5977 return 0;
5978 }
5979
5980 string oid;
5981 string loc;
5982 get_obj_bucket_and_oid_loc(obj, oid, loc);
5983
5984 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
5985 if (r < 0) {
5986 return r;
5987 }
5988
5989 librados::ObjectReadOperation op;
5990 op.stat2(&result.size, &result.mtime, NULL);
5991 op.getxattrs(&result.attrs, NULL);
5992 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
5993 state.io_ctx.locator_set_key(loc);
5994 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5995 if (r < 0) {
5996 ldout(store->ctx(), 5) << __func__
5997 << ": ERROR: aio_operate() returned ret=" << r
5998 << dendl;
5999 return r;
6000 }
6001
6002 return 0;
6003 }
6004
6005
6006 int RGWRados::Object::Stat::wait()
6007 {
6008 if (!state.completion) {
6009 return state.ret;
6010 }
6011
6012 state.completion->wait_for_safe();
6013 state.ret = state.completion->get_return_value();
6014 state.completion->release();
6015
6016 if (state.ret != 0) {
6017 return state.ret;
6018 }
6019
6020 return finish();
6021 }
6022
6023 int RGWRados::Object::Stat::finish()
6024 {
6025 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
6026 if (iter != result.attrs.end()) {
6027 bufferlist& bl = iter->second;
6028 auto biter = bl.cbegin();
6029 try {
6030 decode(result.manifest, biter);
6031 } catch (buffer::error& err) {
6032 RGWRados *store = source->get_store();
6033 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
6034 return -EIO;
6035 }
6036 result.has_manifest = true;
6037 }
6038
6039 return 0;
6040 }
6041
6042 int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
6043 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6044 ObjectOperation& op, RGWObjState **pstate)
6045 {
6046 if (!rctx)
6047 return 0;
6048
6049 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
6050 if (r < 0)
6051 return r;
6052
6053 return append_atomic_test(*pstate, op);
6054 }
6055
6056 int RGWRados::append_atomic_test(const RGWObjState* state,
6057 librados::ObjectOperation& op)
6058 {
6059 if (!state->is_atomic) {
6060 ldout(cct, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
6061 return 0;
6062 }
6063
6064 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
6065 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
6066 } else {
6067 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
6068 }
6069 return 0;
6070 }
6071
6072 int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
6073 {
6074 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
6075 }
6076
6077 void RGWRados::Object::invalidate_state()
6078 {
6079 ctx.invalidate(obj);
6080 }
6081
6082 int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
6083 const char *if_match, const char *if_nomatch, bool removal_op,
6084 bool modify_tail)
6085 {
6086 int r = get_state(&state, false);
6087 if (r < 0)
6088 return r;
6089
6090 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
6091 if_match != NULL || if_nomatch != NULL) &&
6092 (!state->fake_tag);
6093
6094 if (!state->is_atomic) {
6095 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
6096
6097 if (reset_obj) {
6098 op.create(false);
6099 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
6100 }
6101
6102 return 0;
6103 }
6104
6105 if (need_guard) {
6106 /* first verify that the object wasn't replaced under */
6107 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
6108 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
6109 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
6110 }
6111
6112 if (if_match) {
6113 if (strcmp(if_match, "*") == 0) {
6114 // test the object is existing
6115 if (!state->exists) {
6116 return -ERR_PRECONDITION_FAILED;
6117 }
6118 } else {
6119 bufferlist bl;
6120 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
6121 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
6122 return -ERR_PRECONDITION_FAILED;
6123 }
6124 }
6125 }
6126
6127 if (if_nomatch) {
6128 if (strcmp(if_nomatch, "*") == 0) {
6129 // test the object is NOT existing
6130 if (state->exists) {
6131 return -ERR_PRECONDITION_FAILED;
6132 }
6133 } else {
6134 bufferlist bl;
6135 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
6136 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
6137 return -ERR_PRECONDITION_FAILED;
6138 }
6139 }
6140 }
6141 }
6142
6143 if (reset_obj) {
6144 if (state->exists) {
6145 op.create(false);
6146 store->remove_rgw_head_obj(op);
6147 } else {
6148 op.create(true);
6149 }
6150 }
6151
6152 if (removal_op) {
6153 /* the object is being removed, no need to update its tag */
6154 return 0;
6155 }
6156
6157 if (ptag) {
6158 state->write_tag = *ptag;
6159 } else {
6160 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
6161 }
6162 bufferlist bl;
6163 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
6164
6165 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
6166
6167 op.setxattr(RGW_ATTR_ID_TAG, bl);
6168 if (modify_tail) {
6169 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
6170 }
6171
6172 return 0;
6173 }
6174
6175 /**
6176 * Set an attr on an object.
6177 * bucket: name of the bucket holding the object
6178 * obj: name of the object to set the attr on
6179 * name: the attr to set
6180 * bl: the contents of the attr
6181 * Returns: 0 on success, -ERR# otherwise.
6182 */
6183 int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
6184 {
6185 map<string, bufferlist> attrs;
6186 attrs[name] = bl;
6187 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
6188 }
6189
6190 int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj,
6191 map<string, bufferlist>& attrs,
6192 map<string, bufferlist>* rmattrs)
6193 {
6194 rgw_rados_ref ref;
6195 int r = get_obj_head_ref(bucket_info, obj, &ref);
6196 if (r < 0) {
6197 return r;
6198 }
6199 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
6200
6201 ObjectWriteOperation op;
6202 RGWObjState *state = NULL;
6203
6204 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
6205 if (r < 0)
6206 return r;
6207
6208 map<string, bufferlist>::iterator iter;
6209 if (rmattrs) {
6210 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6211 const string& name = iter->first;
6212 op.rmxattr(name.c_str());
6213 }
6214 }
6215
6216 const rgw_bucket& bucket = obj.bucket;
6217
6218 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6219 const string& name = iter->first;
6220 bufferlist& bl = iter->second;
6221
6222 if (!bl.length())
6223 continue;
6224
6225 op.setxattr(name.c_str(), bl);
6226
6227 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
6228 real_time ts;
6229 try {
6230 decode(ts, bl);
6231
6232 rgw_obj_index_key obj_key;
6233 obj.key.get_index_key(&obj_key);
6234
6235 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
6236 } catch (buffer::error& err) {
6237 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
6238 }
6239 }
6240 }
6241
6242 if (!op.size())
6243 return 0;
6244
6245 RGWObjectCtx obj_ctx(this);
6246
6247 bufferlist bl;
6248 RGWRados::Bucket bop(this, bucket_info);
6249 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
6250
6251 if (state) {
6252 string tag;
6253 append_rand_alpha(cct, tag, tag, 32);
6254 state->write_tag = tag;
6255 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
6256
6257 if (r < 0)
6258 return r;
6259
6260 bl.append(tag.c_str(), tag.size() + 1);
6261 op.setxattr(RGW_ATTR_ID_TAG, bl);
6262 }
6263
6264
6265 real_time mtime = real_clock::now();
6266 struct timespec mtime_ts = real_clock::to_timespec(mtime);
6267 op.mtime2(&mtime_ts);
6268 r = ref.ioctx.operate(ref.obj.oid, &op);
6269 if (state) {
6270 if (r >= 0) {
6271 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
6272 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
6273 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
6274 string etag = rgw_bl_str(etag_bl);
6275 string content_type = rgw_bl_str(content_type_bl);
6276 string storage_class;
6277 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
6278 if (iter != attrs.end()) {
6279 storage_class = rgw_bl_str(iter->second);
6280 }
6281 uint64_t epoch = ref.ioctx.get_last_version();
6282 int64_t poolid = ref.ioctx.get_id();
6283 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
6284 mtime, etag, content_type, storage_class, &acl_bl,
6285 RGWObjCategory::Main, NULL);
6286 } else {
6287 int ret = index_op.cancel();
6288 if (ret < 0) {
6289 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
6290 }
6291 }
6292 }
6293 if (r < 0)
6294 return r;
6295
6296 if (state) {
6297 state->obj_tag.swap(bl);
6298 if (rmattrs) {
6299 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6300 state->attrset.erase(iter->first);
6301 }
6302 }
6303 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6304 state->attrset[iter->first] = iter->second;
6305 }
6306 }
6307
6308 return 0;
6309 }
6310
6311 int RGWRados::Object::Read::prepare()
6312 {
6313 RGWRados *store = source->get_store();
6314 CephContext *cct = store->ctx();
6315
6316 bufferlist etag;
6317
6318 map<string, bufferlist>::iterator iter;
6319
6320 RGWObjState *astate;
6321 int r = source->get_state(&astate, true);
6322 if (r < 0)
6323 return r;
6324
6325 if (!astate->exists) {
6326 return -ENOENT;
6327 }
6328
6329 const RGWBucketInfo& bucket_info = source->get_bucket_info();
6330
6331 state.obj = astate->obj;
6332 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
6333
6334 state.cur_pool = state.head_obj.pool;
6335 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
6336
6337 r = store->get_obj_head_ioctx(bucket_info, state.obj, state.cur_ioctx);
6338 if (r < 0) {
6339 return r;
6340 }
6341 if (params.attrs) {
6342 *params.attrs = astate->attrset;
6343 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
6344 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
6345 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
6346 }
6347 }
6348 }
6349
6350 /* Convert all times go GMT to make them compatible */
6351 if (conds.mod_ptr || conds.unmod_ptr) {
6352 obj_time_weight src_weight;
6353 src_weight.init(astate);
6354 src_weight.high_precision = conds.high_precision_time;
6355
6356 obj_time_weight dest_weight;
6357 dest_weight.high_precision = conds.high_precision_time;
6358
6359 if (conds.mod_ptr) {
6360 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
6361 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
6362 if (!(dest_weight < src_weight)) {
6363 return -ERR_NOT_MODIFIED;
6364 }
6365 }
6366
6367 if (conds.unmod_ptr) {
6368 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
6369 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
6370 if (dest_weight < src_weight) {
6371 return -ERR_PRECONDITION_FAILED;
6372 }
6373 }
6374 }
6375 if (conds.if_match || conds.if_nomatch) {
6376 r = get_attr(RGW_ATTR_ETAG, etag);
6377 if (r < 0)
6378 return r;
6379
6380
6381
6382 if (conds.if_match) {
6383 string if_match_str = rgw_string_unquote(conds.if_match);
6384 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
6385 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
6386 return -ERR_PRECONDITION_FAILED;
6387 }
6388 }
6389
6390 if (conds.if_nomatch) {
6391 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
6392 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
6393 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
6394 return -ERR_NOT_MODIFIED;
6395 }
6396 }
6397 }
6398
6399 if (params.obj_size)
6400 *params.obj_size = astate->size;
6401 if (params.lastmod)
6402 *params.lastmod = astate->mtime;
6403
6404 return 0;
6405 }
6406
6407 int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
6408 {
6409 if (ofs < 0) {
6410 ofs += obj_size;
6411 if (ofs < 0)
6412 ofs = 0;
6413 end = obj_size - 1;
6414 } else if (end < 0) {
6415 end = obj_size - 1;
6416 }
6417
6418 if (obj_size > 0) {
6419 if (ofs >= (off_t)obj_size) {
6420 return -ERANGE;
6421 }
6422 if (end >= (off_t)obj_size) {
6423 end = obj_size - 1;
6424 }
6425 }
6426 return 0;
6427 }
6428
6429 int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
6430 {
6431 RGWRados *store = target->get_store();
6432 BucketShard *bs;
6433 int r;
6434
6435 #define NUM_RESHARD_RETRIES 10
6436 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
6437 int ret = get_bucket_shard(&bs);
6438 if (ret < 0) {
6439 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6440 return ret;
6441 }
6442 r = call(bs);
6443 if (r != -ERR_BUSY_RESHARDING) {
6444 break;
6445 }
6446 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6447 string new_bucket_id;
6448 r = store->block_while_resharding(bs, &new_bucket_id,
6449 target->bucket_info, null_yield);
6450 if (r == -ERR_BUSY_RESHARDING) {
6451 continue;
6452 }
6453 if (r < 0) {
6454 return r;
6455 }
6456 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6457 i = 0; /* resharding is finished, make sure we can retry */
6458 r = target->update_bucket_id(new_bucket_id);
6459 if (r < 0) {
6460 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
6461 return r;
6462 }
6463 invalidate_bs();
6464 } // for loop
6465
6466 if (r < 0) {
6467 return r;
6468 }
6469
6470 if (pbs) {
6471 *pbs = bs;
6472 }
6473
6474 return 0;
6475 }
6476
6477 int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
6478 {
6479 if (blind) {
6480 return 0;
6481 }
6482 RGWRados *store = target->get_store();
6483
6484 if (write_tag && write_tag->length()) {
6485 optag = string(write_tag->c_str(), write_tag->length());
6486 } else {
6487 if (optag.empty()) {
6488 append_rand_alpha(store->ctx(), optag, optag, 32);
6489 }
6490 }
6491
6492 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
6493 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
6494 });
6495
6496 if (r < 0) {
6497 return r;
6498 }
6499 prepared = true;
6500
6501 return 0;
6502 }
6503
6504 int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
6505 uint64_t size, uint64_t accounted_size,
6506 ceph::real_time& ut, const string& etag,
6507 const string& content_type, const string& storage_class,
6508 bufferlist *acl_bl,
6509 RGWObjCategory category,
6510 list<rgw_obj_index_key> *remove_objs, const string *user_data,
6511 bool appendable)
6512 {
6513 if (blind) {
6514 return 0;
6515 }
6516 RGWRados *store = target->get_store();
6517 BucketShard *bs;
6518
6519 int ret = get_bucket_shard(&bs);
6520 if (ret < 0) {
6521 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6522 return ret;
6523 }
6524
6525 rgw_bucket_dir_entry ent;
6526 obj.key.get_index_key(&ent.key);
6527 ent.meta.size = size;
6528 ent.meta.accounted_size = accounted_size;
6529 ent.meta.mtime = ut;
6530 ent.meta.etag = etag;
6531 ent.meta.storage_class = storage_class;
6532 if (user_data)
6533 ent.meta.user_data = *user_data;
6534
6535 ACLOwner owner;
6536 if (acl_bl && acl_bl->length()) {
6537 int ret = store->decode_policy(*acl_bl, &owner);
6538 if (ret < 0) {
6539 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
6540 }
6541 }
6542 ent.meta.owner = owner.get_id().to_str();
6543 ent.meta.owner_display_name = owner.get_display_name();
6544 ent.meta.content_type = content_type;
6545 ent.meta.appendable = appendable;
6546
6547 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
6548
6549 if (target->bucket_info.datasync_flag_enabled()) {
6550 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6551 if (r < 0) {
6552 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6553 }
6554 }
6555
6556 return ret;
6557 }
6558
6559 int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
6560 real_time& removed_mtime,
6561 list<rgw_obj_index_key> *remove_objs)
6562 {
6563 if (blind) {
6564 return 0;
6565 }
6566 RGWRados *store = target->get_store();
6567 BucketShard *bs;
6568
6569 int ret = get_bucket_shard(&bs);
6570 if (ret < 0) {
6571 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6572 return ret;
6573 }
6574
6575 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
6576
6577 if (target->bucket_info.datasync_flag_enabled()) {
6578 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6579 if (r < 0) {
6580 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6581 }
6582 }
6583
6584 return ret;
6585 }
6586
6587
6588 int RGWRados::Bucket::UpdateIndex::cancel()
6589 {
6590 if (blind) {
6591 return 0;
6592 }
6593 RGWRados *store = target->get_store();
6594 BucketShard *bs;
6595
6596 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
6597 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6598 });
6599
6600 /*
6601 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6602 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6603 * have no way to tell that they're all caught up
6604 */
6605 if (target->bucket_info.datasync_flag_enabled()) {
6606 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6607 if (r < 0) {
6608 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6609 }
6610 }
6611
6612 return ret;
6613 }
6614
6615 int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
6616 {
6617 RGWRados *store = source->get_store();
6618 CephContext *cct = store->ctx();
6619
6620 rgw_raw_obj read_obj;
6621 uint64_t read_ofs = ofs;
6622 uint64_t len, read_len;
6623 bool reading_from_head = true;
6624 ObjectReadOperation op;
6625
6626 bool merge_bl = false;
6627 bufferlist *pbl = &bl;
6628 bufferlist read_bl;
6629 uint64_t max_chunk_size;
6630
6631 RGWObjState *astate;
6632 int r = source->get_state(&astate, true);
6633 if (r < 0)
6634 return r;
6635
6636 if (astate->size == 0) {
6637 end = 0;
6638 } else if (end >= (int64_t)astate->size) {
6639 end = astate->size - 1;
6640 }
6641
6642 if (end < 0)
6643 len = 0;
6644 else
6645 len = end - ofs + 1;
6646
6647 if (astate->has_manifest && astate->manifest.has_tail()) {
6648 /* now get the relevant object part */
6649 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
6650
6651 uint64_t stripe_ofs = iter.get_stripe_ofs();
6652 read_obj = iter.get_location().get_raw_obj(store);
6653 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
6654 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6655 reading_from_head = (read_obj == state.head_obj);
6656 } else {
6657 read_obj = state.head_obj;
6658 }
6659
6660 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
6661 if (r < 0) {
6662 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
6663 return r;
6664 }
6665
6666 if (len > max_chunk_size)
6667 len = max_chunk_size;
6668
6669
6670 read_len = len;
6671
6672 if (reading_from_head) {
6673 /* only when reading from the head object do we need to do the atomic test */
6674 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
6675 if (r < 0)
6676 return r;
6677
6678 if (astate && astate->prefetch_data) {
6679 if (!ofs && astate->data.length() >= len) {
6680 bl = astate->data;
6681 return bl.length();
6682 }
6683
6684 if (ofs < astate->data.length()) {
6685 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
6686 astate->data.copy(ofs, copy_len, bl);
6687 read_len -= copy_len;
6688 read_ofs += copy_len;
6689 if (!read_len)
6690 return bl.length();
6691
6692 merge_bl = true;
6693 pbl = &read_bl;
6694 }
6695 }
6696 }
6697
6698 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
6699 op.read(read_ofs, read_len, pbl, NULL);
6700
6701 if (state.cur_pool != read_obj.pool) {
6702 auto iter = state.io_ctxs.find(read_obj.pool);
6703 if (iter == state.io_ctxs.end()) {
6704 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
6705 r = store->open_pool_ctx(read_obj.pool, *state.cur_ioctx);
6706 if (r < 0) {
6707 ldout(cct, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
6708 return r;
6709 }
6710 } else {
6711 state.cur_ioctx = &iter->second;
6712 }
6713 state.cur_pool = read_obj.pool;
6714 }
6715
6716 state.cur_ioctx->locator_set_key(read_obj.loc);
6717
6718 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
6719 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
6720
6721 if (r < 0) {
6722 return r;
6723 }
6724
6725 if (merge_bl) {
6726 bl.append(read_bl);
6727 }
6728
6729 return bl.length();
6730 }
6731
6732 struct get_obj_data {
6733 RGWRados* store;
6734 RGWGetDataCB* client_cb;
6735 rgw::Aio* aio;
6736 uint64_t offset; // next offset to write to client
6737 rgw::AioResultList completed; // completed read results, sorted by offset
6738
6739 get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio, uint64_t offset)
6740 : store(store), client_cb(cb), aio(aio), offset(offset) {}
6741
6742 int flush(rgw::AioResultList&& results) {
6743 int r = rgw::check_for_errors(results);
6744 if (r < 0) {
6745 return r;
6746 }
6747
6748 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6749 results.sort(cmp); // merge() requires results to be sorted first
6750 completed.merge(results, cmp); // merge results in sorted order
6751
6752 while (!completed.empty() && completed.front().id == offset) {
6753 auto bl = std::move(completed.front().data);
6754 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
6755
6756 offset += bl.length();
6757 int r = client_cb->handle_data(bl, 0, bl.length());
6758 if (r < 0) {
6759 return r;
6760 }
6761 }
6762 return 0;
6763 }
6764
6765 void cancel() {
6766 // wait for all completions to drain and ignore the results
6767 aio->drain();
6768 }
6769
6770 int drain() {
6771 auto c = aio->wait();
6772 while (!c.empty()) {
6773 int r = flush(std::move(c));
6774 if (r < 0) {
6775 cancel();
6776 return r;
6777 }
6778 c = aio->wait();
6779 }
6780 return flush(std::move(c));
6781 }
6782 };
6783
6784 static int _get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6785 off_t read_ofs, off_t len, bool is_head_obj,
6786 RGWObjState *astate, void *arg)
6787 {
6788 struct get_obj_data *d = (struct get_obj_data *)arg;
6789
6790 return d->store->get_obj_iterate_cb(read_obj, obj_ofs, read_ofs, len,
6791 is_head_obj, astate, arg);
6792 }
6793
6794 int RGWRados::get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6795 off_t read_ofs, off_t len, bool is_head_obj,
6796 RGWObjState *astate, void *arg)
6797 {
6798 ObjectReadOperation op;
6799 struct get_obj_data *d = (struct get_obj_data *)arg;
6800 string oid, key;
6801
6802 if (is_head_obj) {
6803 /* only when reading from the head object do we need to do the atomic test */
6804 int r = append_atomic_test(astate, op);
6805 if (r < 0)
6806 return r;
6807
6808 if (astate &&
6809 obj_ofs < astate->data.length()) {
6810 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
6811
6812 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
6813 if (r < 0)
6814 return r;
6815
6816 len -= chunk_len;
6817 d->offset += chunk_len;
6818 read_ofs += chunk_len;
6819 obj_ofs += chunk_len;
6820 if (!len)
6821 return 0;
6822 }
6823 }
6824
6825 auto obj = d->store->svc.rados->obj(read_obj);
6826 int r = obj.open();
6827 if (r < 0) {
6828 ldout(cct, 4) << "failed to open rados context for " << read_obj << dendl;
6829 return r;
6830 }
6831
6832 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
6833 op.read(read_ofs, len, nullptr, nullptr);
6834
6835 const uint64_t cost = len;
6836 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
6837
6838 auto completed = d->aio->submit(obj, &op, cost, id);
6839
6840 return d->flush(std::move(completed));
6841 }
6842
6843 int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
6844 {
6845 RGWRados *store = source->get_store();
6846 CephContext *cct = store->ctx();
6847 RGWObjectCtx& obj_ctx = source->get_ctx();
6848 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6849 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
6850
6851 rgw::AioThrottle aio(window_size);
6852 get_obj_data data(store, cb, &aio, ofs);
6853
6854 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj,
6855 ofs, end, chunk_size, _get_obj_iterate_cb, &data);
6856 if (r < 0) {
6857 ldout(cct, 0) << "iterate_obj() failed with " << r << dendl;
6858 data.cancel(); // drain completions without writing back to client
6859 return r;
6860 }
6861
6862 return data.drain();
6863 }
6864
6865 int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
6866 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6867 off_t ofs, off_t end, uint64_t max_chunk_size,
6868 iterate_obj_cb cb, void *arg)
6869 {
6870 rgw_raw_obj head_obj;
6871 rgw_raw_obj read_obj;
6872 uint64_t read_ofs = ofs;
6873 uint64_t len;
6874 bool reading_from_head = true;
6875 RGWObjState *astate = NULL;
6876
6877 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6878
6879 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
6880 if (r < 0) {
6881 return r;
6882 }
6883
6884 if (end < 0)
6885 len = 0;
6886 else
6887 len = end - ofs + 1;
6888
6889 if (astate->has_manifest) {
6890 /* now get the relevant object stripe */
6891 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
6892
6893 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
6894
6895 for (; iter != obj_end && ofs <= end; ++iter) {
6896 off_t stripe_ofs = iter.get_stripe_ofs();
6897 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6898
6899 while (ofs < next_stripe_ofs && ofs <= end) {
6900 read_obj = iter.get_location().get_raw_obj(this);
6901 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
6902 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6903
6904 if (read_len > max_chunk_size) {
6905 read_len = max_chunk_size;
6906 }
6907
6908 reading_from_head = (read_obj == head_obj);
6909 r = cb(read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
6910 if (r < 0) {
6911 return r;
6912 }
6913
6914 len -= read_len;
6915 ofs += read_len;
6916 }
6917 }
6918 } else {
6919 while (ofs <= end) {
6920 read_obj = head_obj;
6921 uint64_t read_len = std::min(len, max_chunk_size);
6922
6923 r = cb(read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
6924 if (r < 0) {
6925 return r;
6926 }
6927
6928 len -= read_len;
6929 ofs += read_len;
6930 }
6931 }
6932
6933 return 0;
6934 }
6935
6936 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
6937 {
6938 rgw_rados_ref ref;
6939 int r = get_obj_head_ref(bucket_info, obj, &ref);
6940 if (r < 0) {
6941 return r;
6942 }
6943
6944 return ref.ioctx.operate(ref.obj.oid, op);
6945 }
6946
6947 int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
6948 {
6949 rgw_rados_ref ref;
6950 int r = get_obj_head_ref(bucket_info, obj, &ref);
6951 if (r < 0) {
6952 return r;
6953 }
6954
6955 bufferlist outbl;
6956
6957 return ref.ioctx.operate(ref.obj.oid, op, &outbl);
6958 }
6959
6960 int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
6961 {
6962 ObjectWriteOperation op;
6963
6964 ceph_assert(olh_obj.key.instance.empty());
6965
6966 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6967
6968 if (!state.exists) {
6969 op.create(true);
6970 } else {
6971 op.assert_exists();
6972 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6973 op.mtime2(&mtime_ts);
6974 }
6975
6976 /*
6977 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6978 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6979 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6980 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6981 * log will reflect that.
6982 *
6983 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6984 * is used for object data instance, olh_tag for olh instance.
6985 */
6986 if (has_tag) {
6987 /* guard against racing writes */
6988 bucket_index_guard_olh_op(state, op);
6989 }
6990
6991 if (!has_tag) {
6992 /* obj tag */
6993 string obj_tag;
6994 gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
6995
6996 bufferlist bl;
6997 bl.append(obj_tag.c_str(), obj_tag.size());
6998 op.setxattr(RGW_ATTR_ID_TAG, bl);
6999
7000 state.attrset[RGW_ATTR_ID_TAG] = bl;
7001 state.obj_tag = bl;
7002
7003 /* olh tag */
7004 string olh_tag;
7005 gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
7006
7007 bufferlist olh_bl;
7008 olh_bl.append(olh_tag.c_str(), olh_tag.size());
7009 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
7010
7011 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
7012 state.olh_tag = olh_bl;
7013 state.is_olh = true;
7014
7015 bufferlist verbl;
7016 op.setxattr(RGW_ATTR_OLH_VER, verbl);
7017 }
7018
7019 bufferlist bl;
7020 RGWOLHPendingInfo pending_info;
7021 pending_info.time = real_clock::now();
7022 encode(pending_info, bl);
7023
7024 #define OLH_PENDING_TAG_LEN 32
7025 /* tag will start with current time epoch, this so that entries are sorted by time */
7026 char buf[32];
7027 utime_t ut(pending_info.time);
7028 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
7029 *op_tag = buf;
7030
7031 string s;
7032 gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
7033
7034 op_tag->append(s);
7035
7036 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7037 attr_name.append(*op_tag);
7038
7039 op.setxattr(attr_name.c_str(), bl);
7040
7041 int ret = obj_operate(bucket_info, olh_obj, &op);
7042 if (ret < 0) {
7043 return ret;
7044 }
7045
7046 state.exists = true;
7047 state.attrset[attr_name] = bl;
7048
7049 return 0;
7050 }
7051
7052 int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
7053 {
7054 int ret;
7055
7056 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
7057 if (ret == -EEXIST) {
7058 ret = -ECANCELED;
7059 }
7060
7061 return ret;
7062 }
7063
7064 int RGWRados::guard_reshard(BucketShard *bs,
7065 const rgw_obj& obj_instance,
7066 const RGWBucketInfo& bucket_info,
7067 std::function<int(BucketShard *)> call)
7068 {
7069 rgw_obj obj;
7070 const rgw_obj *pobj = &obj_instance;
7071 int r;
7072
7073 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
7074 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
7075 if (r < 0) {
7076 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
7077 return r;
7078 }
7079 r = call(bs);
7080 if (r != -ERR_BUSY_RESHARDING) {
7081 break;
7082 }
7083 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
7084 string new_bucket_id;
7085 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield);
7086 if (r == -ERR_BUSY_RESHARDING) {
7087 continue;
7088 }
7089 if (r < 0) {
7090 return r;
7091 }
7092 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
7093 i = 0; /* resharding is finished, make sure we can retry */
7094
7095 obj = *pobj;
7096 obj.bucket.update_bucket_id(new_bucket_id);
7097 pobj = &obj;
7098 } // for loop
7099
7100 if (r < 0) {
7101 return r;
7102 }
7103
7104 return 0;
7105 }
7106
7107 int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
7108 string *new_bucket_id,
7109 const RGWBucketInfo& bucket_info,
7110 optional_yield y)
7111 {
7112 int ret = 0;
7113 cls_rgw_bucket_instance_entry entry;
7114
7115 // since we want to run this recovery code from two distinct places,
7116 // let's just put it in a lambda so we can easily re-use; if the
7117 // lambda successfully fetches a new bucket id, it sets
7118 // new_bucket_id and returns 0, otherwise it returns a negative
7119 // error code
7120 auto fetch_new_bucket_id =
7121 [this, bucket_info](const std::string& log_tag,
7122 std::string* new_bucket_id) -> int {
7123 RGWBucketInfo fresh_bucket_info = bucket_info;
7124 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr);
7125 if (ret < 0) {
7126 ldout(cct, 0) << __func__ <<
7127 " ERROR: failed to refresh bucket info after reshard at " <<
7128 log_tag << ": " << cpp_strerror(-ret) << dendl;
7129 return ret;
7130 }
7131 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
7132 return 0;
7133 };
7134
7135 constexpr int num_retries = 10;
7136 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
7137 ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry);
7138 if (ret == -ENOENT) {
7139 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
7140 } else if (ret < 0) {
7141 ldout(cct, 0) << __func__ <<
7142 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
7143 dendl;
7144 return ret;
7145 }
7146
7147 if (!entry.resharding_in_progress()) {
7148 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
7149 new_bucket_id);
7150 }
7151
7152 ldout(cct, 20) << "NOTICE: reshard still in progress; " <<
7153 (i < num_retries ? "retrying" : "too many retries") << dendl;
7154
7155 if (i == num_retries) {
7156 break;
7157 }
7158
7159 // If bucket is erroneously marked as resharding (e.g., crash or
7160 // other error) then fix it. If we can take the bucket reshard
7161 // lock then it means no other resharding should be taking place,
7162 // and we're free to clear the flags.
7163 {
7164 // since we expect to do this rarely, we'll do our work in a
7165 // block and erase our work after each try
7166
7167 RGWObjectCtx obj_ctx(this);
7168 const rgw_bucket& b = bs->bucket;
7169 std::string bucket_id = b.get_key();
7170 RGWBucketReshardLock reshard_lock(this, bucket_info, true);
7171 ret = reshard_lock.lock();
7172 if (ret < 0) {
7173 ldout(cct, 20) << __func__ <<
7174 " INFO: failed to take reshard lock for bucket " <<
7175 bucket_id << "; expected if resharding underway" << dendl;
7176 } else {
7177 ldout(cct, 10) << __func__ <<
7178 " INFO: was able to take reshard lock for bucket " <<
7179 bucket_id << dendl;
7180 ret = RGWBucketReshard::clear_resharding(this, bucket_info);
7181 if (ret < 0) {
7182 reshard_lock.unlock();
7183 ldout(cct, 0) << __func__ <<
7184 " ERROR: failed to clear resharding flags for bucket " <<
7185 bucket_id << dendl;
7186 } else {
7187 reshard_lock.unlock();
7188 ldout(cct, 5) << __func__ <<
7189 " INFO: apparently successfully cleared resharding flags for "
7190 "bucket " << bucket_id << dendl;
7191 continue; // if we apparently succeed immediately test again
7192 } // if clear resharding succeeded
7193 } // if taking of lock succeeded
7194 } // block to encapsulate recovery from incomplete reshard
7195
7196 ret = reshard_wait->wait(y);
7197 if (ret < 0) {
7198 ldout(cct, 0) << __func__ <<
7199 " ERROR: bucket is still resharding, please retry" << dendl;
7200 return ret;
7201 }
7202 } // for loop
7203
7204 ldout(cct, 0) << __func__ <<
7205 " ERROR: bucket is still resharding, please retry" << dendl;
7206 return -ERR_BUSY_RESHARDING;
7207 }
7208
7209 int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
7210 bool delete_marker,
7211 const string& op_tag,
7212 struct rgw_bucket_dir_entry_meta *meta,
7213 uint64_t olh_epoch,
7214 real_time unmod_since, bool high_precision_time,
7215 rgw_zone_set *_zones_trace, bool log_data_change)
7216 {
7217 rgw_rados_ref ref;
7218 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7219 if (r < 0) {
7220 return r;
7221 }
7222
7223 rgw_zone_set zones_trace;
7224 if (_zones_trace) {
7225 zones_trace = *_zones_trace;
7226 }
7227 zones_trace.insert(svc.zone->get_zone().id);
7228
7229 BucketShard bs(this);
7230
7231 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
7232 r = guard_reshard(&bs, obj_instance, bucket_info,
7233 [&](BucketShard *bs) -> int {
7234 librados::ObjectWriteOperation op;
7235 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7236 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
7237 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
7238 unmod_since, high_precision_time,
7239 svc.zone->get_zone().log_data, zones_trace);
7240 });
7241 if (r < 0) {
7242 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
7243 return r;
7244 }
7245
7246 if (log_data_change && bucket_info.datasync_flag_enabled()) {
7247 data_log->add_entry(bs.bucket, bs.shard_id);
7248 }
7249
7250 return 0;
7251 }
7252
7253 void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
7254 {
7255 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
7256 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
7257 }
7258
7259 int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
7260 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7261 {
7262 rgw_rados_ref ref;
7263 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7264 if (r < 0) {
7265 return r;
7266 }
7267
7268 rgw_zone_set zones_trace;
7269 if (_zones_trace) {
7270 zones_trace = *_zones_trace;
7271 }
7272 zones_trace.insert(svc.zone->get_zone().id);
7273
7274 BucketShard bs(this);
7275
7276 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
7277 r = guard_reshard(&bs, obj_instance, bucket_info,
7278 [&](BucketShard *bs) -> int {
7279 librados::ObjectWriteOperation op;
7280 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7281 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
7282 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
7283 });
7284 if (r < 0) {
7285 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
7286 return r;
7287 }
7288
7289 return 0;
7290 }
7291
7292 int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
7293 const rgw_obj& obj_instance, uint64_t ver_marker,
7294 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
7295 bool *is_truncated)
7296 {
7297 rgw_rados_ref ref;
7298 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7299 if (r < 0) {
7300 return r;
7301 }
7302
7303 BucketShard bs(this);
7304 int ret =
7305 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7306 if (ret < 0) {
7307 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7308 return ret;
7309 }
7310
7311 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7312
7313 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7314
7315 ret = guard_reshard(&bs, obj_instance, bucket_info,
7316 [&](BucketShard *bs) -> int {
7317 ObjectReadOperation op;
7318 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7319 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
7320 key, ver_marker, olh_tag, log, is_truncated);
7321 });
7322 if (ret < 0) {
7323 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7324 return ret;
7325 }
7326
7327 return 0;
7328 }
7329
7330 // a multisite sync bug resulted in the OLH head attributes being overwritten by
7331 // the attributes from another zone, causing link_olh() to fail endlessly due to
7332 // olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7333 // attributes from the bucket index. see http://tracker.ceph.com/issues/37792
7334 int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info,
7335 const rgw_obj& obj)
7336 {
7337 // fetch the current olh entry from the bucket index
7338 rgw_bucket_olh_entry olh;
7339 int r = bi_get_olh(bucket_info, obj, &olh);
7340 if (r < 0) {
7341 ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
7342 return r;
7343 }
7344 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
7345 return 0;
7346 }
7347
7348 ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag
7349 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
7350
7351 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7352 ObjectWriteOperation op;
7353 // assert this is the same olh tag we think we're fixing
7354 bucket_index_guard_olh_op(*state, op);
7355 // preserve existing mtime
7356 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
7357 op.mtime2(&mtime_ts);
7358 {
7359 bufferlist bl;
7360 bl.append(olh.tag.c_str(), olh.tag.size());
7361 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
7362 }
7363 {
7364 RGWOLHInfo info;
7365 info.target = rgw_obj(bucket_info.bucket, olh.key);
7366 info.removed = olh.delete_marker;
7367 bufferlist bl;
7368 encode(info, bl);
7369 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7370 }
7371 rgw_rados_ref ref;
7372 r = get_obj_head_ref(bucket_info, obj, &ref);
7373 if (r < 0) {
7374 return r;
7375 }
7376 r = ref.ioctx.operate(ref.obj.oid, &op);
7377 if (r < 0) {
7378 ldout(cct, 0) << "repair_olh failed to write olh attributes with "
7379 << cpp_strerror(r) << dendl;
7380 return r;
7381 }
7382 return 0;
7383 }
7384
7385 int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
7386 {
7387 rgw_rados_ref ref;
7388 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7389 if (r < 0) {
7390 return r;
7391 }
7392
7393 BucketShard bs(this);
7394 int ret =
7395 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7396 if (ret < 0) {
7397 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7398 return ret;
7399 }
7400
7401 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7402
7403 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7404
7405 ret = guard_reshard(&bs, obj_instance, bucket_info,
7406 [&](BucketShard *pbs) -> int {
7407 ObjectWriteOperation op;
7408 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7409 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
7410 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
7411 });
7412 if (ret < 0) {
7413 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7414 return ret;
7415 }
7416
7417 return 0;
7418 }
7419
7420 int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
7421 {
7422 rgw_rados_ref ref;
7423 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7424 if (r < 0) {
7425 return r;
7426 }
7427
7428 BucketShard bs(this);
7429
7430 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7431
7432 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7433
7434 int ret = guard_reshard(&bs, obj_instance, bucket_info,
7435 [&](BucketShard *pbs) -> int {
7436 ObjectWriteOperation op;
7437 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7438 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
7439 });
7440 if (ret < 0) {
7441 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
7442 return ret;
7443 }
7444
7445 return 0;
7446 }
7447
7448 int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7449 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
7450 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7451 {
7452 if (log.empty()) {
7453 return 0;
7454 }
7455
7456 librados::ObjectWriteOperation op;
7457
7458 uint64_t last_ver = log.rbegin()->first;
7459 *plast_ver = last_ver;
7460
7461 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
7462
7463 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
7464 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
7465
7466 bufferlist ver_bl;
7467 string last_ver_s = to_string(last_ver);
7468 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
7469 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
7470
7471 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
7472 op.mtime2(&mtime_ts);
7473
7474 bool need_to_link = false;
7475 cls_rgw_obj_key key;
7476 bool delete_marker = false;
7477 list<cls_rgw_obj_key> remove_instances;
7478 bool need_to_remove = false;
7479
7480 for (iter = log.begin(); iter != log.end(); ++iter) {
7481 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7482 for (; viter != iter->second.end(); ++viter) {
7483 rgw_bucket_olh_log_entry& entry = *viter;
7484
7485 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
7486 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7487 << (entry.delete_marker ? "(delete)" : "") << dendl;
7488 switch (entry.op) {
7489 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7490 remove_instances.push_back(entry.key);
7491 break;
7492 case CLS_RGW_OLH_OP_LINK_OLH:
7493 need_to_link = true;
7494 need_to_remove = false;
7495 key = entry.key;
7496 delete_marker = entry.delete_marker;
7497 break;
7498 case CLS_RGW_OLH_OP_UNLINK_OLH:
7499 need_to_remove = true;
7500 need_to_link = false;
7501 break;
7502 default:
7503 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7504 return -EIO;
7505 }
7506 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7507 attr_name.append(entry.op_tag);
7508 op.rmxattr(attr_name.c_str());
7509 }
7510 }
7511
7512 rgw_rados_ref ref;
7513 int r = get_obj_head_ref(bucket_info, obj, &ref);
7514 if (r < 0) {
7515 return r;
7516 }
7517
7518 const rgw_bucket& bucket = obj.bucket;
7519
7520 if (need_to_link) {
7521 rgw_obj target(bucket, key);
7522 RGWOLHInfo info;
7523 info.target = target;
7524 info.removed = delete_marker;
7525 bufferlist bl;
7526 encode(info, bl);
7527 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7528 }
7529
7530 /* first remove object instances */
7531 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7532 liter != remove_instances.end(); ++liter) {
7533 cls_rgw_obj_key& key = *liter;
7534 rgw_obj obj_instance(bucket, key);
7535 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7536 if (ret < 0 && ret != -ENOENT) {
7537 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7538 return ret;
7539 }
7540 }
7541
7542 /* update olh object */
7543 r = ref.ioctx.operate(ref.obj.oid, &op);
7544 if (r == -ECANCELED) {
7545 r = 0;
7546 }
7547 if (r < 0) {
7548 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7549 return r;
7550 }
7551
7552 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
7553 if (r < 0) {
7554 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7555 return r;
7556 }
7557
7558 if (need_to_remove) {
7559 ObjectWriteOperation rm_op;
7560
7561 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
7562 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7563 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7564 rm_op.remove();
7565
7566 r = ref.ioctx.operate(ref.obj.oid, &rm_op);
7567 if (r == -ECANCELED) {
7568 return 0; /* someone else won this race */
7569 } else {
7570 /*
7571 * only clear if was successful, otherwise we might clobber pending operations on this object
7572 */
7573 r = bucket_index_clear_olh(bucket_info, state, obj);
7574 if (r < 0) {
7575 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7576 return r;
7577 }
7578 }
7579 }
7580
7581 return 0;
7582 }
7583
7584 /*
7585 * read olh log and apply it
7586 */
7587 int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7588 {
7589 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7590 bool is_truncated;
7591 uint64_t ver_marker = 0;
7592
7593 do {
7594 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7595 if (ret < 0) {
7596 return ret;
7597 }
7598 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7599 if (ret < 0) {
7600 return ret;
7601 }
7602 } while (is_truncated);
7603
7604 return 0;
7605 }
7606
7607 int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
7608 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
7609 rgw_zone_set *zones_trace, bool log_data_change)
7610 {
7611 string op_tag;
7612
7613 rgw_obj olh_obj = target_obj;
7614 olh_obj.key.instance.clear();
7615
7616 RGWObjState *state = NULL;
7617
7618 int ret = 0;
7619 int i;
7620
7621 #define MAX_ECANCELED_RETRY 100
7622 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7623 if (ret == -ECANCELED) {
7624 obj_ctx.invalidate(olh_obj);
7625 }
7626
7627 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
7628 if (ret < 0) {
7629 return ret;
7630 }
7631
7632 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7633 if (ret < 0) {
7634 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7635 if (ret == -ECANCELED) {
7636 continue;
7637 }
7638 return ret;
7639 }
7640 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
7641 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7642 zones_trace, log_data_change);
7643 if (ret < 0) {
7644 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7645 if (ret == -ECANCELED) {
7646 // the bucket index rejected the link_olh() due to olh tag mismatch;
7647 // attempt to reconstruct olh head attributes based on the bucket index
7648 int r2 = repair_olh(state, bucket_info, olh_obj);
7649 if (r2 < 0 && r2 != -ECANCELED) {
7650 return r2;
7651 }
7652 continue;
7653 }
7654 return ret;
7655 }
7656 break;
7657 }
7658
7659 if (i == MAX_ECANCELED_RETRY) {
7660 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7661 return -EIO;
7662 }
7663
7664 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7665 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7666 ret = 0;
7667 }
7668 if (ret < 0) {
7669 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7670 return ret;
7671 }
7672
7673 return 0;
7674 }
7675
7676 int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
7677 uint64_t olh_epoch, rgw_zone_set *zones_trace)
7678 {
7679 string op_tag;
7680
7681 rgw_obj olh_obj = target_obj;
7682 olh_obj.key.instance.clear();
7683
7684 RGWObjState *state = NULL;
7685
7686 int ret = 0;
7687 int i;
7688
7689 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7690 if (ret == -ECANCELED) {
7691 obj_ctx.invalidate(olh_obj);
7692 }
7693
7694 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
7695 if (ret < 0)
7696 return ret;
7697
7698 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7699 if (ret < 0) {
7700 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7701 if (ret == -ECANCELED) {
7702 continue;
7703 }
7704 return ret;
7705 }
7706
7707 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7708
7709 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7710 if (ret < 0) {
7711 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7712 if (ret == -ECANCELED) {
7713 continue;
7714 }
7715 return ret;
7716 }
7717 break;
7718 }
7719
7720 if (i == MAX_ECANCELED_RETRY) {
7721 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7722 return -EIO;
7723 }
7724
7725 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7726 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7727 return 0;
7728 }
7729 if (ret < 0) {
7730 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7731 return ret;
7732 }
7733
7734 return 0;
7735 }
7736
7737 void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7738 {
7739 #define OBJ_INSTANCE_LEN 32
7740 char buf[OBJ_INSTANCE_LEN + 1];
7741
7742 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7743 no underscore for instance name due to the way we encode the raw keys */
7744
7745 target_key->set_instance(buf);
7746 }
7747
7748 void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7749 {
7750 gen_rand_obj_instance_name(&target_obj->key);
7751 }
7752
7753 int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7754 {
7755 map<string, bufferlist> unfiltered_attrset;
7756
7757 ObjectReadOperation op;
7758 op.getxattrs(&unfiltered_attrset, NULL);
7759
7760 bufferlist outbl;
7761 int r = obj_operate(bucket_info, obj, &op);
7762
7763 if (r < 0) {
7764 return r;
7765 }
7766 map<string, bufferlist> attrset;
7767
7768 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
7769
7770 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
7771 if (iter == attrset.end()) { /* not an olh */
7772 return -EINVAL;
7773 }
7774
7775 try {
7776 auto biter = iter->second.cbegin();
7777 decode(*olh, biter);
7778 } catch (buffer::error& err) {
7779 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
7780 return -EIO;
7781 }
7782
7783 return 0;
7784 }
7785
7786 void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
7787 map<string, bufferlist> *rm_pending_entries)
7788 {
7789 map<string, bufferlist>::iterator iter = pending_entries.begin();
7790
7791 real_time now = real_clock::now();
7792
7793 while (iter != pending_entries.end()) {
7794 auto biter = iter->second.cbegin();
7795 RGWOLHPendingInfo pending_info;
7796 try {
7797 decode(pending_info, biter);
7798 } catch (buffer::error& err) {
7799 /* skipping bad entry, we could remove it but it might hide a bug */
7800 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7801 ++iter;
7802 continue;
7803 }
7804
7805 map<string, bufferlist>::iterator cur_iter = iter;
7806 ++iter;
7807 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7808 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7809 pending_entries.erase(cur_iter);
7810 } else {
7811 /* entries names are sorted by time (rounded to a second) */
7812 break;
7813 }
7814 }
7815 }
7816
7817 int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7818 {
7819 rgw_rados_ref ref;
7820 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
7821 if (r < 0) {
7822 return r;
7823 }
7824
7825 // trim no more than 1000 entries per osd op
7826 constexpr int max_entries = 1000;
7827
7828 auto i = pending_attrs.begin();
7829 while (i != pending_attrs.end()) {
7830 ObjectWriteOperation op;
7831 bucket_index_guard_olh_op(state, op);
7832
7833 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7834 op.rmxattr(i->first.c_str());
7835 }
7836
7837 r = ref.ioctx.operate(ref.obj.oid, &op);
7838 if (r == -ENOENT || r == -ECANCELED) {
7839 /* raced with some other change, shouldn't sweat about it */
7840 return 0;
7841 }
7842 if (r < 0) {
7843 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7844 return r;
7845 }
7846 }
7847 return 0;
7848 }
7849
7850 int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7851 {
7852 map<string, bufferlist> pending_entries;
7853 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7854
7855 map<string, bufferlist> rm_pending_entries;
7856 check_pending_olh_entries(pending_entries, &rm_pending_entries);
7857
7858 if (!rm_pending_entries.empty()) {
7859 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
7860 if (ret < 0) {
7861 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7862 return ret;
7863 }
7864 }
7865 if (!pending_entries.empty()) {
7866 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7867
7868 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7869 if (ret < 0) {
7870 return ret;
7871 }
7872 }
7873
7874 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
7875 ceph_assert(iter != state->attrset.end());
7876 RGWOLHInfo olh;
7877 try {
7878 auto biter = iter->second.cbegin();
7879 decode(olh, biter);
7880 } catch (buffer::error& err) {
7881 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
7882 return -EIO;
7883 }
7884
7885 if (olh.removed) {
7886 return -ENOENT;
7887 }
7888
7889 *target = olh.target;
7890
7891 return 0;
7892 }
7893
7894 int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7895 map<string, bufferlist> *attrs, bufferlist *first_chunk,
7896 RGWObjVersionTracker *objv_tracker)
7897 {
7898 rgw_rados_ref ref;
7899 int r = get_raw_obj_ref(obj, &ref);
7900 if (r < 0) {
7901 return r;
7902 }
7903
7904 map<string, bufferlist> unfiltered_attrset;
7905 uint64_t size = 0;
7906 struct timespec mtime_ts;
7907
7908 ObjectReadOperation op;
7909 if (objv_tracker) {
7910 objv_tracker->prepare_op_for_read(&op);
7911 }
7912 if (attrs) {
7913 op.getxattrs(&unfiltered_attrset, NULL);
7914 }
7915 if (psize || pmtime) {
7916 op.stat2(&size, &mtime_ts, NULL);
7917 }
7918 if (first_chunk) {
7919 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7920 }
7921 bufferlist outbl;
7922 r = ref.ioctx.operate(ref.obj.oid, &op, &outbl);
7923
7924 if (epoch) {
7925 *epoch = ref.ioctx.get_last_version();
7926 }
7927
7928 if (r < 0)
7929 return r;
7930
7931 if (psize)
7932 *psize = size;
7933 if (pmtime)
7934 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7935 if (attrs) {
7936 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7937 }
7938
7939 return 0;
7940 }
7941
7942 int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
7943 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7944 {
7945 vector<rgw_bucket_dir_header> headers;
7946 map<int, string> bucket_instance_ids;
7947 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7948 if (r < 0) {
7949 return r;
7950 }
7951
7952 ceph_assert(headers.size() == bucket_instance_ids.size());
7953
7954 auto iter = headers.begin();
7955 map<int, string>::iterator viter = bucket_instance_ids.begin();
7956 BucketIndexShardsManager ver_mgr;
7957 BucketIndexShardsManager master_ver_mgr;
7958 BucketIndexShardsManager marker_mgr;
7959 char buf[64];
7960 for(; iter != headers.end(); ++iter, ++viter) {
7961 accumulate_raw_stats(*iter, stats);
7962 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7963 ver_mgr.add(viter->first, string(buf));
7964 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7965 master_ver_mgr.add(viter->first, string(buf));
7966 if (shard_id >= 0) {
7967 *max_marker = iter->max_marker;
7968 } else {
7969 marker_mgr.add(viter->first, iter->max_marker);
7970 }
7971 if (syncstopped != NULL)
7972 *syncstopped = iter->syncstopped;
7973 }
7974 ver_mgr.to_string(bucket_ver);
7975 master_ver_mgr.to_string(master_ver);
7976 if (shard_id < 0) {
7977 marker_mgr.to_string(max_marker);
7978 }
7979 return 0;
7980 }
7981
7982 int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
7983 map<int, string>& markers)
7984 {
7985 vector<rgw_bucket_dir_header> headers;
7986 map<int, string> bucket_instance_ids;
7987 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7988 if (r < 0)
7989 return r;
7990
7991 ceph_assert(headers.size() == bucket_instance_ids.size());
7992
7993 auto iter = headers.begin();
7994 map<int, string>::iterator viter = bucket_instance_ids.begin();
7995
7996 for(; iter != headers.end(); ++iter, ++viter) {
7997 if (shard_id >= 0) {
7998 markers[shard_id] = iter->max_marker;
7999 } else {
8000 markers[viter->first] = iter->max_marker;
8001 }
8002 }
8003 return 0;
8004 }
8005
8006 class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
8007 RGWGetBucketStats_CB *cb;
8008 uint32_t pendings;
8009 map<RGWObjCategory, RGWStorageStats> stats;
8010 int ret_code;
8011 bool should_cb;
8012 Mutex lock;
8013
8014 public:
8015 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
8016 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
8017 lock("RGWGetBucketStatsContext") {}
8018
8019 void handle_response(int r, rgw_bucket_dir_header& header) override {
8020 Mutex::Locker l(lock);
8021 if (should_cb) {
8022 if ( r >= 0) {
8023 accumulate_raw_stats(header, stats);
8024 } else {
8025 ret_code = r;
8026 }
8027
8028 // Are we all done?
8029 if (--pendings == 0) {
8030 if (!ret_code) {
8031 cb->set_response(&stats);
8032 }
8033 cb->handle_response(ret_code);
8034 cb->put();
8035 }
8036 }
8037 }
8038
8039 void unset_cb() {
8040 Mutex::Locker l(lock);
8041 should_cb = false;
8042 }
8043 };
8044
8045 int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
8046 {
8047 int num_aio = 0;
8048 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
8049 ceph_assert(get_ctx);
8050 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
8051 if (r < 0) {
8052 ctx->put();
8053 if (num_aio) {
8054 get_ctx->unset_cb();
8055 }
8056 }
8057 get_ctx->put();
8058 return r;
8059 }
8060
8061 class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
8062 RGWGetUserStats_CB *cb;
8063
8064 public:
8065 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
8066 : cb(cb) {}
8067
8068 void handle_response(int r, cls_user_header& header) override {
8069 const cls_user_stats& hs = header.stats;
8070 if (r >= 0) {
8071 RGWStorageStats stats;
8072
8073 stats.size = hs.total_bytes;
8074 stats.size_rounded = hs.total_bytes_rounded;
8075 stats.num_objects = hs.total_entries;
8076
8077 cb->set_response(stats);
8078 }
8079
8080 cb->handle_response(r);
8081
8082 cb->put();
8083 }
8084 };
8085
8086 int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
8087 {
8088 string user_str = user.to_str();
8089
8090 cls_user_header header;
8091 int r = cls_user_get_header(user_str, &header);
8092 if (r < 0)
8093 return r;
8094
8095 const cls_user_stats& hs = header.stats;
8096
8097 stats.size = hs.total_bytes;
8098 stats.size_rounded = hs.total_bytes_rounded;
8099 stats.num_objects = hs.total_entries;
8100
8101 return 0;
8102 }
8103
8104 int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
8105 {
8106 string user_str = user.to_str();
8107
8108 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
8109 int r = cls_user_get_header_async(user_str, get_ctx);
8110 if (r < 0) {
8111 ctx->put();
8112 delete get_ctx;
8113 return r;
8114 }
8115
8116 return 0;
8117 }
8118
8119 void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
8120 {
8121 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
8122 }
8123
8124 void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
8125 {
8126 if (!bucket.oid.empty()) {
8127 obj.init(svc.zone->get_zone_params().domain_root, bucket.oid);
8128 } else {
8129 string oid;
8130 get_bucket_meta_oid(bucket, oid);
8131 obj.init(svc.zone->get_zone_params().domain_root, oid);
8132 }
8133 }
8134
8135 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
8136 real_time *pmtime, map<string, bufferlist> *pattrs)
8137 {
8138 size_t pos = meta_key.find(':');
8139 if (pos == string::npos) {
8140 return -EINVAL;
8141 }
8142 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
8143 rgw_bucket_instance_key_to_oid(oid);
8144
8145 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
8146 }
8147
8148 int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
8149 real_time *pmtime, map<string, bufferlist> *pattrs)
8150 {
8151 string oid;
8152 if (bucket.oid.empty()) {
8153 get_bucket_meta_oid(bucket, oid);
8154 } else {
8155 oid = bucket.oid;
8156 }
8157
8158 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
8159 }
8160
8161 int RGWRados::get_bucket_instance_from_oid(RGWSysObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
8162 real_time *pmtime, map<string, bufferlist> *pattrs,
8163 rgw_cache_entry_info *cache_info,
8164 boost::optional<obj_version> refresh_version)
8165 {
8166 auto& domain_root = svc.zone->get_zone_params().domain_root;
8167
8168 ldout(cct, 20) << "reading from " << domain_root << ":" << oid << dendl;
8169
8170 bufferlist epbl;
8171
8172 int ret = rgw_get_system_obj(this, obj_ctx, domain_root,
8173 oid, epbl, &info.objv_tracker, pmtime, pattrs,
8174 cache_info, refresh_version);
8175 if (ret < 0) {
8176 return ret;
8177 }
8178
8179 auto iter = epbl.cbegin();
8180 try {
8181 decode(info, iter);
8182 } catch (buffer::error& err) {
8183 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
8184 return -EIO;
8185 }
8186 info.bucket.oid = oid;
8187 return 0;
8188 }
8189
8190 int RGWRados::get_bucket_entrypoint_info(RGWSysObjectCtx& obj_ctx,
8191 const string& tenant_name,
8192 const string& bucket_name,
8193 RGWBucketEntryPoint& entry_point,
8194 RGWObjVersionTracker *objv_tracker,
8195 real_time *pmtime,
8196 map<string, bufferlist> *pattrs,
8197 rgw_cache_entry_info *cache_info,
8198 boost::optional<obj_version> refresh_version)
8199 {
8200 bufferlist bl;
8201 string bucket_entry;
8202
8203 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
8204 int ret = rgw_get_system_obj(this, obj_ctx, svc.zone->get_zone_params().domain_root,
8205 bucket_entry, bl, objv_tracker, pmtime, pattrs,
8206 cache_info, refresh_version);
8207 if (ret < 0) {
8208 return ret;
8209 }
8210
8211 auto iter = bl.cbegin();
8212 try {
8213 decode(entry_point, iter);
8214 } catch (buffer::error& err) {
8215 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
8216 return -EIO;
8217 }
8218 return 0;
8219 }
8220
8221 int RGWRados::convert_old_bucket_info(RGWSysObjectCtx& obj_ctx,
8222 const string& tenant_name,
8223 const string& bucket_name)
8224 {
8225 RGWBucketEntryPoint entry_point;
8226 real_time ep_mtime;
8227 RGWObjVersionTracker ot;
8228 map<string, bufferlist> attrs;
8229 RGWBucketInfo info;
8230
8231 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
8232
8233 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
8234 if (ret < 0) {
8235 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
8236 return ret;
8237 }
8238
8239 if (!entry_point.has_bucket_info) {
8240 /* already converted! */
8241 return 0;
8242 }
8243
8244 info = entry_point.old_bucket_info;
8245 info.bucket.oid = bucket_name;
8246 info.ep_objv = ot.read_version;
8247
8248 ot.generate_new_write_ver(cct);
8249
8250 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
8251 if (ret < 0) {
8252 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
8253 return ret;
8254 }
8255
8256 return 0;
8257 }
8258
8259 int RGWRados::_get_bucket_info(RGWSysObjectCtx& obj_ctx,
8260 const string& tenant,
8261 const string& bucket_name,
8262 RGWBucketInfo& info,
8263 real_time *pmtime,
8264 map<string, bufferlist> *pattrs,
8265 boost::optional<obj_version> refresh_version)
8266 {
8267 string bucket_entry;
8268 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
8269
8270
8271 if (auto e = binfo_cache->find(bucket_entry)) {
8272 if (refresh_version &&
8273 e->info.objv_tracker.read_version.compare(&(*refresh_version))) {
8274 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
8275 << "a failure that should be debugged. I am a nice machine, "
8276 << "so I will try to recover." << dendl;
8277 binfo_cache->invalidate(bucket_entry);
8278 } else {
8279 info = e->info;
8280 if (pattrs)
8281 *pattrs = e->attrs;
8282 if (pmtime)
8283 *pmtime = e->mtime;
8284 return 0;
8285 }
8286 }
8287
8288 bucket_info_entry e;
8289 RGWBucketEntryPoint entry_point;
8290 real_time ep_mtime;
8291 RGWObjVersionTracker ot;
8292 rgw_cache_entry_info entry_cache_info;
8293 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
8294 entry_point, &ot, &ep_mtime, pattrs,
8295 &entry_cache_info, refresh_version);
8296 if (ret < 0) {
8297 /* only init these fields */
8298 info.bucket.tenant = tenant;
8299 info.bucket.name = bucket_name;
8300 return ret;
8301 }
8302
8303 if (entry_point.has_bucket_info) {
8304 info = entry_point.old_bucket_info;
8305 info.bucket.oid = bucket_name;
8306 info.bucket.tenant = tenant;
8307 info.ep_objv = ot.read_version;
8308 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
8309 return 0;
8310 }
8311
8312 /* data is in the bucket instance object, we need to get attributes from there, clear everything
8313 * that we got
8314 */
8315 if (pattrs) {
8316 pattrs->clear();
8317 }
8318
8319 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
8320
8321
8322 /* read bucket instance info */
8323
8324 string oid;
8325 get_bucket_meta_oid(entry_point.bucket, oid);
8326
8327 rgw_cache_entry_info cache_info;
8328
8329 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
8330 &cache_info, refresh_version);
8331 e.info.ep_objv = ot.read_version;
8332 info = e.info;
8333 if (ret < 0) {
8334 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
8335 info.bucket.tenant = tenant;
8336 info.bucket.name = bucket_name;
8337 // XXX and why return anything in case of an error anyway?
8338 return ret;
8339 }
8340
8341 if (pmtime)
8342 *pmtime = e.mtime;
8343 if (pattrs)
8344 *pattrs = e.attrs;
8345
8346 /* chain to both bucket entry point and bucket instance */
8347 if (!binfo_cache->put(svc.cache, bucket_entry, &e, {&entry_cache_info, &cache_info})) {
8348 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
8349 }
8350
8351 if (refresh_version &&
8352 refresh_version->compare(&info.objv_tracker.read_version)) {
8353 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
8354 << "have gone squirrelly. An administrator may have forced a "
8355 << "change; otherwise there is a problem somewhere." << dendl;
8356 }
8357
8358 return 0;
8359 }
8360
8361 int RGWRados::get_bucket_info(RGWSysObjectCtx& obj_ctx,
8362 const string& tenant, const string& bucket_name,
8363 RGWBucketInfo& info,
8364 real_time *pmtime, map<string, bufferlist> *pattrs)
8365 {
8366 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
8367 pattrs, boost::none);
8368 }
8369
8370 int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
8371 ceph::real_time *pmtime,
8372 map<string, bufferlist> *pattrs)
8373 {
8374 RGWSysObjectCtx obj_ctx = svc.sysobj->init_obj_ctx();
8375
8376 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
8377 info, pmtime, pattrs, info.objv_tracker.read_version);
8378 }
8379
8380 int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
8381 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
8382 map<string, bufferlist> *pattrs)
8383 {
8384 bufferlist epbl;
8385 encode(entry_point, epbl);
8386 string bucket_entry;
8387 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
8388 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
8389 }
8390
8391 int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
8392 real_time mtime, map<string, bufferlist> *pattrs)
8393 {
8394 info.has_instance_obj = true;
8395 bufferlist bl;
8396
8397 encode(info, bl);
8398
8399 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
8400 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
8401 if (ret == -EEXIST) {
8402 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
8403 * bucket operation on this specific bucket (e.g., being synced from the master), but
8404 * since bucket instace meta object is unique for this specific bucket instace, we don't
8405 * need to return an error.
8406 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
8407 * master, creating a bucket, sending bucket creation to the master, we create the bucket
8408 * locally, while in the sync thread we sync the new bucket.
8409 */
8410 ret = 0;
8411 }
8412 return ret;
8413 }
8414
8415 int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
8416 map<string, bufferlist> *pattrs, bool create_entry_point)
8417 {
8418 bool create_head = !info.has_instance_obj || create_entry_point;
8419
8420 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
8421 if (ret < 0) {
8422 return ret;
8423 }
8424
8425 if (!create_head)
8426 return 0; /* done! */
8427
8428 RGWBucketEntryPoint entry_point;
8429 entry_point.bucket = info.bucket;
8430 entry_point.owner = info.owner;
8431 entry_point.creation_time = info.creation_time;
8432 entry_point.linked = true;
8433 RGWObjVersionTracker ot;
8434 if (pep_objv && !pep_objv->tag.empty()) {
8435 ot.write_version = *pep_objv;
8436 } else {
8437 ot.generate_new_write_ver(cct);
8438 if (pep_objv) {
8439 *pep_objv = ot.write_version;
8440 }
8441 }
8442 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
8443 if (ret < 0)
8444 return ret;
8445
8446 return 0;
8447 }
8448
8449 int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
8450 {
8451 auto obj_ctx = svc.sysobj->init_obj_ctx();
8452
8453 map<string, RGWBucketEnt>::iterator iter;
8454 for (iter = m.begin(); iter != m.end(); ++iter) {
8455 RGWBucketEnt& ent = iter->second;
8456 rgw_bucket& bucket = ent.bucket;
8457 ent.count = 0;
8458 ent.size = 0;
8459 ent.size_rounded = 0;
8460
8461 vector<rgw_bucket_dir_header> headers;
8462
8463 RGWBucketInfo bucket_info;
8464 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
8465 if (ret < 0) {
8466 return ret;
8467 }
8468
8469 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
8470 if (r < 0)
8471 return r;
8472
8473 auto hiter = headers.begin();
8474 for (; hiter != headers.end(); ++hiter) {
8475 RGWObjCategory category = main_category;
8476 auto iter = (hiter->stats).find(category);
8477 if (iter != hiter->stats.end()) {
8478 struct rgw_bucket_category_stats& stats = iter->second;
8479 ent.count += stats.num_entries;
8480 ent.size += stats.total_size;
8481 ent.size_rounded += stats.total_size_rounded;
8482 }
8483 }
8484
8485 // fill in placement_rule from the bucket instance for use in swift's
8486 // per-storage policy statistics
8487 ent.placement_rule = std::move(bucket_info.placement_rule);
8488 }
8489
8490 return m.size();
8491 }
8492
8493 int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
8494 {
8495 rgw_rados_ref ref;
8496 int r = get_raw_obj_ref(obj, &ref);
8497 if (r < 0) {
8498 return r;
8499 }
8500 librados::Rados *rad = get_rados_handle();
8501 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
8502
8503 r = ref.ioctx.aio_append(ref.obj.oid, completion, bl, size);
8504 completion->release();
8505 return r;
8506 }
8507
8508 int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
8509 {
8510 librados::IoCtx& io_ctx = ctx.io_ctx;
8511 librados::NObjectIterator& iter = ctx.iter;
8512
8513 int r = open_pool_ctx(pool, io_ctx);
8514 if (r < 0)
8515 return r;
8516
8517 iter = io_ctx.nobjects_begin();
8518
8519 return 0;
8520 }
8521
8522 int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
8523 {
8524 librados::IoCtx& io_ctx = ctx.io_ctx;
8525 librados::NObjectIterator& iter = ctx.iter;
8526
8527 int r = open_pool_ctx(pool, io_ctx);
8528 if (r < 0)
8529 return r;
8530
8531 librados::ObjectCursor oc;
8532 if (!oc.from_str(cursor)) {
8533 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
8534 return -EINVAL;
8535 }
8536
8537 try {
8538 iter = io_ctx.nobjects_begin(oc);
8539 return 0;
8540 } catch (const std::system_error& e) {
8541 r = -e.code().value();
8542 ldout(cct, 10) << "nobjects_begin threw " << e.what()
8543 << ", returning " << r << dendl;
8544 return r;
8545 } catch (const std::exception& e) {
8546 ldout(cct, 10) << "nobjects_begin threw " << e.what()
8547 << ", returning -5" << dendl;
8548 return -EIO;
8549 }
8550 }
8551
8552 string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
8553 {
8554 return ctx.iter.get_cursor().to_str();
8555 }
8556
8557 static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
8558 vector<rgw_bucket_dir_entry>& objs,
8559 bool *is_truncated, RGWAccessListFilter *filter)
8560 {
8561 librados::IoCtx& io_ctx = ctx.io_ctx;
8562 librados::NObjectIterator& iter = ctx.iter;
8563
8564 if (iter == io_ctx.nobjects_end())
8565 return -ENOENT;
8566
8567 uint32_t i;
8568
8569 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
8570 rgw_bucket_dir_entry e;
8571
8572 string oid = iter->get_oid();
8573 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
8574
8575 // fill it in with initial values; we may correct later
8576 if (filter && !filter->filter(oid, oid))
8577 continue;
8578
8579 e.key = oid;
8580 objs.push_back(e);
8581 }
8582
8583 if (is_truncated)
8584 *is_truncated = (iter != io_ctx.nobjects_end());
8585
8586 return objs.size();
8587 }
8588
8589 int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
8590 bool *is_truncated, RGWAccessListFilter *filter)
8591 {
8592 // catch exceptions from NObjectIterator::operator++()
8593 try {
8594 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
8595 } catch (const std::system_error& e) {
8596 int r = -e.code().value();
8597 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
8598 << ", returning " << r << dendl;
8599 return r;
8600 } catch (const std::exception& e) {
8601 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
8602 << ", returning -5" << dendl;
8603 return -EIO;
8604 }
8605 }
8606
8607 int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
8608 {
8609 if (!ctx->initialized) {
8610 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
8611 if (r < 0) {
8612 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
8613 return r;
8614 }
8615 ctx->initialized = true;
8616 }
8617 return 0;
8618 }
8619
8620 int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
8621 RGWListRawObjsCtx& ctx, list<string>& oids,
8622 bool *is_truncated)
8623 {
8624 if (!ctx.initialized) {
8625 return -EINVAL;
8626 }
8627 RGWAccessListFilterPrefix filter(prefix_filter);
8628 vector<rgw_bucket_dir_entry> objs;
8629 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
8630 if (r < 0) {
8631 if(r != -ENOENT)
8632 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
8633 return r;
8634 }
8635
8636 vector<rgw_bucket_dir_entry>::iterator iter;
8637 for (iter = objs.begin(); iter != objs.end(); ++iter) {
8638 oids.push_back(iter->key.name);
8639 }
8640
8641 return oids.size();
8642 }
8643
8644 int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
8645 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
8646 bool *is_truncated)
8647 {
8648 if (!ctx.initialized) {
8649 int r = list_raw_objects_init(pool, string(), &ctx);
8650 if (r < 0) {
8651 return r;
8652 }
8653 }
8654
8655 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
8656 }
8657
8658 string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
8659 {
8660 return pool_iterate_get_cursor(ctx.iter_ctx);
8661 }
8662
8663 int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
8664 std::list<rgw_bi_log_entry>& result, bool *truncated)
8665 {
8666 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
8667 result.clear();
8668
8669 librados::IoCtx index_ctx;
8670 map<int, string> oids;
8671 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
8672 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
8673 if (r < 0)
8674 return r;
8675
8676 BucketIndexShardsManager marker_mgr;
8677 bool has_shards = (oids.size() > 1 || shard_id >= 0);
8678 // If there are multiple shards for the bucket index object, the marker
8679 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
8680 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
8681 // only contain one record, and the key is the bucket instance id.
8682 r = marker_mgr.from_string(marker, shard_id);
8683 if (r < 0)
8684 return r;
8685
8686 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
8687 if (r < 0)
8688 return r;
8689
8690 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
8691 map<int, list<rgw_bi_log_entry>::iterator> vends;
8692 if (truncated) {
8693 *truncated = false;
8694 }
8695 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
8696 for (; miter != bi_log_lists.end(); ++miter) {
8697 int shard_id = miter->first;
8698 vcurrents[shard_id] = miter->second.entries.begin();
8699 vends[shard_id] = miter->second.entries.end();
8700 if (truncated) {
8701 *truncated = (*truncated || miter->second.truncated);
8702 }
8703 }
8704
8705 size_t total = 0;
8706 bool has_more = true;
8707 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
8708 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
8709 while (total < max && has_more) {
8710 has_more = false;
8711
8712 viter = vcurrents.begin();
8713 eiter = vends.begin();
8714
8715 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
8716 assert (eiter != vends.end());
8717
8718 int shard_id = viter->first;
8719 list<rgw_bi_log_entry>::iterator& liter = viter->second;
8720
8721 if (liter == eiter->second){
8722 continue;
8723 }
8724 rgw_bi_log_entry& entry = *(liter);
8725 if (has_shards) {
8726 char buf[16];
8727 snprintf(buf, sizeof(buf), "%d", shard_id);
8728 string tmp_id;
8729 build_bucket_index_marker(buf, entry.id, &tmp_id);
8730 entry.id.swap(tmp_id);
8731 }
8732 marker_mgr.add(shard_id, entry.id);
8733 result.push_back(entry);
8734 total++;
8735 has_more = true;
8736 ++liter;
8737 }
8738 }
8739
8740 if (truncated) {
8741 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
8742 assert (eiter != vends.end());
8743 *truncated = (*truncated || (viter->second != eiter->second));
8744 }
8745 }
8746
8747 // Refresh marker, if there are multiple shards, the output will look like
8748 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
8749 // if there is no sharding, the simply marker (without oid) is returned
8750 if (has_shards) {
8751 marker_mgr.to_string(&marker);
8752 } else {
8753 if (!result.empty()) {
8754 marker = result.rbegin()->id;
8755 }
8756 }
8757
8758 return 0;
8759 }
8760
8761 int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
8762 {
8763 librados::IoCtx index_ctx;
8764 map<int, string> bucket_objs;
8765
8766 BucketIndexShardsManager start_marker_mgr;
8767 BucketIndexShardsManager end_marker_mgr;
8768
8769 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8770 if (r < 0) {
8771 return r;
8772 }
8773
8774 r = start_marker_mgr.from_string(start_marker, shard_id);
8775 if (r < 0) {
8776 return r;
8777 }
8778
8779 r = end_marker_mgr.from_string(end_marker, shard_id);
8780 if (r < 0) {
8781 return r;
8782 }
8783
8784 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
8785 cct->_conf->rgw_bucket_index_max_aio)();
8786 }
8787
8788 int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
8789 {
8790 librados::IoCtx index_ctx;
8791 map<int, string> bucket_objs;
8792 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8793 if (r < 0)
8794 return r;
8795
8796 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8797 }
8798
8799 int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
8800 {
8801 librados::IoCtx index_ctx;
8802 map<int, string> bucket_objs;
8803 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8804 if (r < 0)
8805 return r;
8806
8807 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8808 }
8809
8810 int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8811 rgw_bucket_dir_entry *dirent)
8812 {
8813 rgw_cls_bi_entry bi_entry;
8814 int r = bi_get(bucket_info, obj, BIIndexType::Instance, &bi_entry);
8815 if (r < 0 && r != -ENOENT) {
8816 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8817 }
8818 if (r < 0) {
8819 return r;
8820 }
8821 auto iter = bi_entry.data.cbegin();
8822 try {
8823 decode(*dirent, iter);
8824 } catch (buffer::error& err) {
8825 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8826 return -EIO;
8827 }
8828
8829 return 0;
8830 }
8831
8832 int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8833 rgw_bucket_olh_entry *olh)
8834 {
8835 rgw_cls_bi_entry bi_entry;
8836 int r = bi_get(bucket_info, obj, BIIndexType::OLH, &bi_entry);
8837 if (r < 0 && r != -ENOENT) {
8838 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8839 }
8840 if (r < 0) {
8841 return r;
8842 }
8843 auto iter = bi_entry.data.cbegin();
8844 try {
8845 decode(*olh, iter);
8846 } catch (buffer::error& err) {
8847 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8848 return -EIO;
8849 }
8850
8851 return 0;
8852 }
8853
8854 int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8855 BIIndexType index_type, rgw_cls_bi_entry *entry)
8856 {
8857 BucketShard bs(this);
8858 int ret = bs.init(bucket_info, obj);
8859 if (ret < 0) {
8860 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8861 return ret;
8862 }
8863
8864 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
8865
8866 return cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
8867 }
8868
8869 void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
8870 {
8871 cls_rgw_bi_put(op, bs.bucket_obj, entry);
8872 }
8873
8874 int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
8875 {
8876 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
8877 if (ret < 0)
8878 return ret;
8879
8880 return 0;
8881 }
8882
8883 int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
8884 {
8885 BucketShard bs(this);
8886 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
8887 if (ret < 0) {
8888 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8889 return ret;
8890 }
8891
8892 return bi_put(bs, entry);
8893 }
8894
8895 int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8896 {
8897 rgw_obj obj(bucket, obj_name);
8898 BucketShard bs(this);
8899 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
8900 if (ret < 0) {
8901 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8902 return ret;
8903 }
8904
8905 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
8906 if (ret == -ENOENT) {
8907 *is_truncated = false;
8908 }
8909 if (ret < 0)
8910 return ret;
8911
8912 return 0;
8913 }
8914
8915 int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8916 {
8917 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
8918 if (ret < 0)
8919 return ret;
8920
8921 return 0;
8922 }
8923
8924 int RGWRados::bi_remove(BucketShard& bs)
8925 {
8926 int ret = bs.index_ctx.remove(bs.bucket_obj);
8927 if (ret == -ENOENT) {
8928 ret = 0;
8929 }
8930 if (ret < 0) {
8931 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
8932 return ret;
8933 }
8934
8935 return 0;
8936 }
8937
8938 int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8939 {
8940 BucketShard bs(this);
8941 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
8942 if (ret < 0) {
8943 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8944 return ret;
8945 }
8946
8947 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8948 }
8949
8950 int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
8951 {
8952 return gc_pool_ctx.operate(oid, op);
8953 }
8954
8955 int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op, AioCompletion **pc)
8956 {
8957 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
8958 int r = gc_pool_ctx.aio_operate(oid, c, op);
8959 if (!pc) {
8960 c->release();
8961 } else {
8962 *pc = c;
8963 }
8964 return r;
8965 }
8966
8967 int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
8968 {
8969 return gc_pool_ctx.operate(oid, op, pbl);
8970 }
8971
8972 int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
8973 {
8974 return gc->list(index, marker, max, expired_only, result, truncated);
8975 }
8976
8977 int RGWRados::process_gc(bool expired_only)
8978 {
8979 return gc->process(expired_only);
8980 }
8981
8982 int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
8983 {
8984 return lc->list_lc_progress(marker, max_entries, progress_map);
8985 }
8986
8987 int RGWRados::process_lc()
8988 {
8989 return lc->process();
8990 }
8991
8992 bool RGWRados::process_expire_objects()
8993 {
8994 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
8995 }
8996
8997 int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
8998 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
8999 {
9000 rgw_zone_set zones_trace;
9001 if (_zones_trace) {
9002 zones_trace = *_zones_trace;
9003 }
9004 zones_trace.insert(svc.zone->get_zone().id);
9005
9006 ObjectWriteOperation o;
9007 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
9008 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
9009 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
9010 return bs.index_ctx.operate(bs.bucket_obj, &o);
9011 }
9012
9013 int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
9014 int64_t pool, uint64_t epoch,
9015 rgw_bucket_dir_entry& ent, RGWObjCategory category,
9016 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
9017 {
9018 ObjectWriteOperation o;
9019 rgw_bucket_dir_entry_meta dir_meta;
9020 dir_meta = ent.meta;
9021 dir_meta.category = category;
9022
9023 rgw_zone_set zones_trace;
9024 if (_zones_trace) {
9025 zones_trace = *_zones_trace;
9026 }
9027 zones_trace.insert(svc.zone->get_zone().id);
9028
9029 rgw_bucket_entry_ver ver;
9030 ver.pool = pool;
9031 ver.epoch = epoch;
9032 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
9033 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
9034 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
9035 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
9036 complete_op_data *arg;
9037 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
9038 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
9039 librados::AioCompletion *completion = arg->rados_completion;
9040 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
9041 completion->release(); /* can't reference arg here, as it might have already been released */
9042 return ret;
9043 }
9044
9045 int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
9046 int64_t pool, uint64_t epoch,
9047 rgw_bucket_dir_entry& ent, RGWObjCategory category,
9048 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
9049 {
9050 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
9051 }
9052
9053 int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
9054 int64_t pool, uint64_t epoch,
9055 rgw_obj& obj,
9056 real_time& removed_mtime,
9057 list<rgw_obj_index_key> *remove_objs,
9058 uint16_t bilog_flags,
9059 rgw_zone_set *zones_trace)
9060 {
9061 rgw_bucket_dir_entry ent;
9062 ent.meta.mtime = removed_mtime;
9063 obj.key.get_index_key(&ent.key);
9064 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
9065 ent, RGWObjCategory::None, remove_objs,
9066 bilog_flags, zones_trace);
9067 }
9068
9069 int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
9070 {
9071 rgw_bucket_dir_entry ent;
9072 obj.key.get_index_key(&ent.key);
9073 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
9074 -1 /* pool id */, 0, ent,
9075 RGWObjCategory::None, NULL, bilog_flags,
9076 zones_trace);
9077 }
9078
9079 int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
9080 {
9081 librados::IoCtx index_ctx;
9082 map<int, string> bucket_objs;
9083 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
9084 if (r < 0)
9085 return r;
9086
9087 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
9088 }
9089
9090
9091 int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
9092 int shard_id,
9093 const rgw_obj_index_key& start,
9094 const string& prefix,
9095 uint32_t num_entries,
9096 bool list_versions,
9097 map<string, rgw_bucket_dir_entry>& m,
9098 bool *is_truncated,
9099 rgw_obj_index_key *last_entry,
9100 bool (*force_check_filter)(const string& name))
9101 {
9102 ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
9103 " start " << start.name << "[" << start.instance << "] num_entries " <<
9104 num_entries << dendl;
9105
9106 librados::IoCtx index_ctx;
9107 // key - oid (for different shards if there is any)
9108 // value - list result for the corresponding oid (shard), it is filled by
9109 // the AIO callback
9110 map<int, string> oids;
9111 map<int, struct rgw_cls_list_ret> list_results;
9112 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
9113 if (r < 0)
9114 return r;
9115
9116 cls_rgw_obj_key start_key(start.name, start.instance);
9117 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
9118 list_versions, oids, list_results,
9119 cct->_conf->rgw_bucket_index_max_aio)();
9120 if (r < 0)
9121 return r;
9122
9123 // Create a list of iterators that are used to iterate each shard
9124 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents;
9125 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends;
9126 vector<string> vnames;
9127 vcurrents.reserve(list_results.size());
9128 vends.reserve(list_results.size());
9129 vnames.reserve(list_results.size());
9130 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9131 *is_truncated = false;
9132 for (; iter != list_results.end(); ++iter) {
9133 vcurrents.push_back(iter->second.dir.m.begin());
9134 vends.push_back(iter->second.dir.m.end());
9135 vnames.push_back(oids[iter->first]);
9136 *is_truncated = (*is_truncated || iter->second.is_truncated);
9137 }
9138
9139 // Create a map to track the next candidate entry from each shard, if the entry
9140 // from a specified shard is selected/erased, the next entry from that shard will
9141 // be inserted for next round selection
9142 map<string, size_t> candidates;
9143 for (size_t i = 0; i < vcurrents.size(); ++i) {
9144 if (vcurrents[i] != vends[i]) {
9145 candidates[vcurrents[i]->first] = i;
9146 }
9147 }
9148
9149 map<string, bufferlist> updates;
9150 uint32_t count = 0;
9151 while (count < num_entries && !candidates.empty()) {
9152 r = 0;
9153 // Select the next one
9154 int pos = candidates.begin()->second;
9155 const string& name = vcurrents[pos]->first;
9156 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
9157
9158 bool force_check = force_check_filter &&
9159 force_check_filter(dirent.key.name);
9160 if ((!dirent.exists && !dirent.is_delete_marker()) ||
9161 !dirent.pending_map.empty() ||
9162 force_check) {
9163 /* there are uncommitted ops. We need to check the current state,
9164 * and if the tags are old we need to do cleanup as well. */
9165 librados::IoCtx sub_ctx;
9166 sub_ctx.dup(index_ctx);
9167 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
9168 updates[vnames[pos]]);
9169 if (r < 0 && r != -ENOENT) {
9170 return r;
9171 }
9172 }
9173 if (r >= 0) {
9174 ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
9175 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
9176 m[name] = std::move(dirent);
9177 ++count;
9178 }
9179
9180 // Refresh the candidates map
9181 candidates.erase(candidates.begin());
9182 ++vcurrents[pos];
9183 if (vcurrents[pos] != vends[pos]) {
9184 candidates[vcurrents[pos]->first] = pos;
9185 }
9186 }
9187
9188 // Suggest updates if there is any
9189 map<string, bufferlist>::iterator miter = updates.begin();
9190 for (; miter != updates.end(); ++miter) {
9191 if (miter->second.length()) {
9192 ObjectWriteOperation o;
9193 cls_rgw_suggest_changes(o, miter->second);
9194 // we don't care if we lose suggested updates, send them off blindly
9195 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9196 index_ctx.aio_operate(miter->first, c, &o);
9197 c->release();
9198 }
9199 }
9200
9201 // Check if all the returned entries are consumed or not
9202 for (size_t i = 0; i < vcurrents.size(); ++i) {
9203 if (vcurrents[i] != vends[i]) {
9204 *is_truncated = true;
9205 break;
9206 }
9207 }
9208 if (!m.empty())
9209 *last_entry = m.rbegin()->first;
9210
9211 return 0;
9212 }
9213
9214
9215 int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
9216 int shard_id,
9217 const rgw_obj_index_key& start,
9218 const string& prefix,
9219 uint32_t num_entries,
9220 bool list_versions,
9221 std::vector<rgw_bucket_dir_entry>& ent_list,
9222 bool *is_truncated,
9223 rgw_obj_index_key *last_entry,
9224 bool (*force_check_filter)(const string& name)) {
9225 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
9226 " start " << start.name << "[" << start.instance <<
9227 "] num_entries " << num_entries << dendl;
9228
9229 static MultipartMetaFilter multipart_meta_filter;
9230
9231 *is_truncated = false;
9232 librados::IoCtx index_ctx;
9233
9234 map<int, string> oids;
9235 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
9236 if (r < 0)
9237 return r;
9238 const uint32_t num_shards = oids.size();
9239
9240 rgw_obj_index_key marker = start;
9241 uint32_t current_shard;
9242 if (shard_id >= 0) {
9243 current_shard = shard_id;
9244 } else if (start.empty()) {
9245 current_shard = 0u;
9246 } else {
9247 // at this point we have a marker (start) that has something in
9248 // it, so we need to get to the bucket shard index, so we can
9249 // start reading from there
9250
9251 std::string key;
9252 // test whether object name is a multipart meta name
9253 if(! multipart_meta_filter.filter(start.name, key)) {
9254 // if multipart_meta_filter fails, must be "regular" (i.e.,
9255 // unadorned) and the name is the key
9256 key = start.name;
9257 }
9258
9259 // now convert the key (oid) to an rgw_obj_key since that will
9260 // separate out the namespace, name, and instance
9261 rgw_obj_key obj_key;
9262 bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key);
9263 if (!parsed) {
9264 ldout(cct, 0) <<
9265 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9266 "start marker: '" << start << "'" << dendl;
9267 return -EINVAL;
9268 } else if (obj_key.name.empty()) {
9269 // if the name is empty that means the object name came in with
9270 // a namespace only, and therefore we need to start our scan at
9271 // the first bucket index shard
9272 current_shard = 0u;
9273 } else {
9274 // so now we have the key used to compute the bucket index shard
9275 // and can extract the specific shard from it
9276 current_shard = rgw_bucket_shard_index(obj_key.name, num_shards);
9277 }
9278 }
9279
9280 uint32_t count = 0u;
9281 map<string, bufferlist> updates;
9282 rgw_obj_index_key last_added_entry;
9283 while (count <= num_entries &&
9284 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
9285 current_shard < num_shards)) {
9286 const std::string& oid = oids[current_shard];
9287 rgw_cls_list_ret result;
9288
9289 librados::ObjectReadOperation op;
9290 cls_rgw_bucket_list_op(op, marker, prefix, num_entries,
9291 list_versions, &result);
9292 r = index_ctx.operate(oid, &op, nullptr);
9293 if (r < 0)
9294 return r;
9295
9296 for (auto& entry : result.dir.m) {
9297 rgw_bucket_dir_entry& dirent = entry.second;
9298
9299 bool force_check = force_check_filter &&
9300 force_check_filter(dirent.key.name);
9301 if ((!dirent.exists && !dirent.is_delete_marker()) ||
9302 !dirent.pending_map.empty() ||
9303 force_check) {
9304 /* there are uncommitted ops. We need to check the current state,
9305 * and if the tags are old we need to do cleanup as well. */
9306 librados::IoCtx sub_ctx;
9307 sub_ctx.dup(index_ctx);
9308 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
9309 if (r < 0 && r != -ENOENT) {
9310 return r;
9311 }
9312 }
9313
9314 // at this point either r >=0 or r == -ENOENT
9315 if (r >= 0) { // i.e., if r != -ENOENT
9316 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
9317 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
9318
9319 if (count < num_entries) {
9320 marker = last_added_entry = dirent.key; // double assign
9321 ent_list.emplace_back(std::move(dirent));
9322 ++count;
9323 } else {
9324 *is_truncated = true;
9325 goto check_updates;
9326 }
9327 } else { // r == -ENOENT
9328 // in the case of -ENOENT, make sure we're advancing marker
9329 // for possible next call to CLSRGWIssueBucketList
9330 marker = dirent.key;
9331 }
9332 } // entry for loop
9333
9334 if (!result.is_truncated) {
9335 // if we reached the end of the shard read next shard
9336 ++current_shard;
9337 marker = rgw_obj_index_key();
9338 }
9339 } // shard loop
9340
9341 check_updates:
9342
9343 // suggest updates if there is any
9344 map<string, bufferlist>::iterator miter = updates.begin();
9345 for (; miter != updates.end(); ++miter) {
9346 if (miter->second.length()) {
9347 ObjectWriteOperation o;
9348 cls_rgw_suggest_changes(o, miter->second);
9349 // we don't care if we lose suggested updates, send them off blindly
9350 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9351 index_ctx.aio_operate(miter->first, c, &o);
9352 c->release();
9353 }
9354 }
9355
9356 if (last_entry && !ent_list.empty()) {
9357 *last_entry = last_added_entry;
9358 }
9359
9360 return 0;
9361 } // RGWRados::cls_bucket_list_unordered
9362
9363
9364 int RGWRados::cls_obj_usage_log_add(const string& oid,
9365 rgw_usage_log_info& info)
9366 {
9367 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9368
9369 rgw_rados_ref ref;
9370 int r = get_raw_obj_ref(obj, &ref);
9371 if (r < 0) {
9372 return r;
9373 }
9374
9375 ObjectWriteOperation op;
9376 cls_rgw_usage_log_add(op, info);
9377
9378 r = ref.ioctx.operate(ref.obj.oid, &op);
9379 return r;
9380 }
9381
9382 int RGWRados::cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket,
9383 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
9384 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
9385 bool *is_truncated)
9386 {
9387 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9388
9389 rgw_rados_ref ref;
9390 int r = get_raw_obj_ref(obj, &ref);
9391 if (r < 0) {
9392 return r;
9393 }
9394
9395 *is_truncated = false;
9396
9397 r = cls_rgw_usage_log_read(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch,
9398 max_entries, read_iter, usage, is_truncated);
9399
9400 return r;
9401 }
9402
9403 int RGWRados::cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket,
9404 uint64_t start_epoch, uint64_t end_epoch)
9405 {
9406 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9407
9408 rgw_rados_ref ref;
9409 int r = get_raw_obj_ref(obj, &ref);
9410 if (r < 0) {
9411 return r;
9412 }
9413
9414 r = cls_rgw_usage_log_trim(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch);
9415 return r;
9416 }
9417
9418 int RGWRados::cls_obj_usage_log_clear(string& oid)
9419 {
9420 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9421
9422 rgw_rados_ref ref;
9423 int r = get_raw_obj_ref(obj, &ref);
9424 if (r < 0) {
9425 return r;
9426 }
9427 librados::ObjectWriteOperation op;
9428 cls_rgw_usage_log_clear(op);
9429 r = ref.ioctx.operate(ref.obj.oid, &op);
9430 return r;
9431 }
9432
9433
9434 int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
9435 {
9436 librados::IoCtx index_ctx;
9437 string dir_oid;
9438
9439 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
9440
9441 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
9442 if (r < 0)
9443 return r;
9444
9445 bufferlist updates;
9446
9447 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
9448 rgw_bucket_dir_entry entry;
9449 entry.key = *iter;
9450 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
9451 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9452 updates.append(CEPH_RGW_REMOVE | suggest_flag);
9453 encode(entry, updates);
9454 }
9455
9456 bufferlist out;
9457
9458 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
9459
9460 return r;
9461 }
9462
9463 int RGWRados::check_disk_state(librados::IoCtx io_ctx,
9464 const RGWBucketInfo& bucket_info,
9465 rgw_bucket_dir_entry& list_state,
9466 rgw_bucket_dir_entry& object,
9467 bufferlist& suggested_updates)
9468 {
9469 const rgw_bucket& bucket = bucket_info.bucket;
9470 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
9471
9472 std::string loc;
9473
9474 rgw_obj obj(bucket, list_state.key);
9475
9476 string oid;
9477 get_obj_bucket_and_oid_loc(obj, oid, loc);
9478
9479 if (loc != list_state.locator) {
9480 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
9481 }
9482
9483 io_ctx.locator_set_key(list_state.locator);
9484
9485 RGWObjState *astate = NULL;
9486 RGWObjectCtx rctx(this);
9487 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
9488 if (r < 0)
9489 return r;
9490
9491 list_state.pending_map.clear(); // we don't need this and it inflates size
9492 if (!astate->exists) {
9493 /* object doesn't exist right now -- hopefully because it's
9494 * marked as !exists and got deleted */
9495 if (list_state.exists) {
9496 /* FIXME: what should happen now? Work out if there are any
9497 * non-bad ways this could happen (there probably are, but annoying
9498 * to handle!) */
9499 }
9500 // encode a suggested removal of that key
9501 list_state.ver.epoch = io_ctx.get_last_version();
9502 list_state.ver.pool = io_ctx.get_id();
9503 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
9504 return -ENOENT;
9505 }
9506
9507 string etag;
9508 string content_type;
9509 ACLOwner owner;
9510
9511 object.meta.size = astate->size;
9512 object.meta.accounted_size = astate->accounted_size;
9513 object.meta.mtime = astate->mtime;
9514
9515 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
9516 if (iter != astate->attrset.end()) {
9517 etag = rgw_bl_str(iter->second);
9518 }
9519 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
9520 if (iter != astate->attrset.end()) {
9521 content_type = rgw_bl_str(iter->second);
9522 }
9523 iter = astate->attrset.find(RGW_ATTR_ACL);
9524 if (iter != astate->attrset.end()) {
9525 r = decode_policy(iter->second, &owner);
9526 if (r < 0) {
9527 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
9528 }
9529 }
9530
9531 if (astate->has_manifest) {
9532 RGWObjManifest::obj_iterator miter;
9533 RGWObjManifest& manifest = astate->manifest;
9534 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
9535 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
9536 rgw_obj loc;
9537 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
9538
9539 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
9540 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
9541 r = delete_obj_index(loc);
9542 if (r < 0) {
9543 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
9544 }
9545 }
9546 }
9547 }
9548
9549 object.meta.etag = etag;
9550 object.meta.content_type = content_type;
9551 object.meta.owner = owner.get_id().to_str();
9552 object.meta.owner_display_name = owner.get_display_name();
9553
9554 // encode suggested updates
9555 list_state.ver.pool = io_ctx.get_id();
9556 list_state.ver.epoch = astate->epoch;
9557 list_state.meta.size = object.meta.size;
9558 list_state.meta.accounted_size = object.meta.accounted_size;
9559 list_state.meta.mtime = object.meta.mtime;
9560 list_state.meta.category = main_category;
9561 list_state.meta.etag = etag;
9562 list_state.meta.content_type = content_type;
9563 if (astate->obj_tag.length() > 0)
9564 list_state.tag = astate->obj_tag.c_str();
9565 list_state.meta.owner = owner.get_id().to_str();
9566 list_state.meta.owner_display_name = owner.get_display_name();
9567
9568 list_state.exists = true;
9569 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
9570 return 0;
9571 }
9572
9573 int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
9574 {
9575 librados::IoCtx index_ctx;
9576 map<int, string> oids;
9577 map<int, struct rgw_cls_list_ret> list_results;
9578 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
9579 if (r < 0)
9580 return r;
9581
9582 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
9583 if (r < 0)
9584 return r;
9585
9586 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9587 for(; iter != list_results.end(); ++iter) {
9588 headers.push_back(std::move(iter->second.dir.header));
9589 }
9590 return 0;
9591 }
9592
9593 int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
9594 {
9595 librados::IoCtx index_ctx;
9596 map<int, string> bucket_objs;
9597 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
9598 if (r < 0)
9599 return r;
9600
9601 map<int, string>::iterator iter = bucket_objs.begin();
9602 for (; iter != bucket_objs.end(); ++iter) {
9603 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
9604 if (r < 0) {
9605 ctx->put();
9606 break;
9607 } else {
9608 (*num_aio)++;
9609 }
9610 }
9611 return r;
9612 }
9613
9614 int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
9615 {
9616 string buckets_obj_id;
9617 rgw_get_buckets_obj(user_id, buckets_obj_id);
9618 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9619
9620 rgw_rados_ref ref;
9621 int r = get_raw_obj_ref(obj, &ref);
9622 if (r < 0) {
9623 return r;
9624 }
9625
9626 librados::ObjectReadOperation op;
9627 int rc;
9628 ::cls_user_get_header(op, header, &rc);
9629 bufferlist ibl;
9630 r = ref.ioctx.operate(ref.obj.oid, &op, &ibl);
9631 if (r < 0)
9632 return r;
9633 if (rc < 0)
9634 return rc;
9635
9636 return 0;
9637 }
9638
9639 int RGWRados::cls_user_reset_stats(const string& user_id)
9640 {
9641 string buckets_obj_id;
9642 rgw_get_buckets_obj(user_id, buckets_obj_id);
9643 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9644
9645 rgw_rados_ref ref;
9646 int r = get_raw_obj_ref(obj, &ref);
9647 if (r < 0) {
9648 return r;
9649 }
9650
9651 librados::ObjectWriteOperation op;
9652 ::cls_user_reset_stats(op);
9653 return ref.ioctx.operate(ref.obj.oid, &op);
9654 }
9655
9656 int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
9657 {
9658 string buckets_obj_id;
9659 rgw_get_buckets_obj(user_id, buckets_obj_id);
9660 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9661
9662 rgw_rados_ref ref;
9663 int r = get_raw_obj_ref(obj, &ref);
9664 if (r < 0) {
9665 return r;
9666 }
9667
9668 r = ::cls_user_get_header_async(ref.ioctx, ref.obj.oid, ctx);
9669 if (r < 0)
9670 return r;
9671
9672 return 0;
9673 }
9674
9675 int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj,
9676 const RGWBucketInfo& bucket_info)
9677 {
9678 vector<rgw_bucket_dir_header> headers;
9679 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
9680 if (r < 0) {
9681 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
9682 return r;
9683 }
9684
9685 cls_user_bucket_entry entry;
9686
9687 bucket_info.bucket.convert(&entry.bucket);
9688
9689 for (const auto& hiter : headers) {
9690 for (const auto& iter : hiter.stats) {
9691 if (RGWObjCategory::Main == iter.first ||
9692 RGWObjCategory::MultiMeta == iter.first) {
9693 const struct rgw_bucket_category_stats& header_stats = iter.second;
9694 entry.size += header_stats.total_size;
9695 entry.size_rounded += header_stats.total_size_rounded;
9696 entry.count += header_stats.num_entries;
9697 }
9698 }
9699 }
9700
9701 list<cls_user_bucket_entry> entries;
9702 entries.push_back(entry);
9703
9704 r = cls_user_update_buckets(user_obj, entries, false);
9705 if (r < 0) {
9706 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
9707 return r;
9708 }
9709
9710 return 0;
9711 }
9712
9713 int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
9714 {
9715 vector<rgw_bucket_dir_header> headers;
9716 RGWBucketInfo bucket_info;
9717 auto obj_ctx = svc.sysobj->init_obj_ctx();
9718 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
9719 if (ret < 0) {
9720 return ret;
9721 }
9722
9723 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
9724 if (ret < 0) {
9725 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
9726 return ret;
9727 }
9728
9729 bucket.convert(&entry.bucket);
9730
9731 for (const auto& hiter : headers) {
9732 for (const auto& iter : hiter.stats) {
9733 const struct rgw_bucket_category_stats& header_stats = iter.second;
9734 entry.size += header_stats.total_size;
9735 entry.size_rounded += header_stats.total_size_rounded;
9736 entry.count += header_stats.num_entries;
9737 }
9738 }
9739
9740 return 0;
9741 }
9742
9743 int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
9744 const string& in_marker,
9745 const string& end_marker,
9746 const int max_entries,
9747 list<cls_user_bucket_entry>& entries,
9748 string * const out_marker,
9749 bool * const truncated)
9750 {
9751 rgw_rados_ref ref;
9752 int r = get_raw_obj_ref(obj, &ref);
9753 if (r < 0) {
9754 return r;
9755 }
9756
9757 librados::ObjectReadOperation op;
9758 int rc;
9759
9760 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
9761 bufferlist ibl;
9762 r = ref.ioctx.operate(ref.obj.oid, &op, &ibl);
9763 if (r < 0)
9764 return r;
9765 if (rc < 0)
9766 return rc;
9767
9768 return 0;
9769 }
9770
9771 int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
9772 {
9773 rgw_rados_ref ref;
9774 int r = get_raw_obj_ref(obj, &ref);
9775 if (r < 0) {
9776 return r;
9777 }
9778
9779 librados::ObjectWriteOperation op;
9780 cls_user_set_buckets(op, entries, add);
9781 r = ref.ioctx.operate(ref.obj.oid, &op);
9782 if (r < 0)
9783 return r;
9784
9785 return 0;
9786 }
9787
9788 int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
9789 {
9790 string buckets_obj_id;
9791 rgw_get_buckets_obj(user_id, buckets_obj_id);
9792 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
9793 return cls_user_complete_stats_sync(obj);
9794 }
9795
9796 int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
9797 {
9798 rgw_rados_ref ref;
9799 int r = get_raw_obj_ref(obj, &ref);
9800 if (r < 0) {
9801 return r;
9802 }
9803
9804 librados::ObjectWriteOperation op;
9805 ::cls_user_complete_stats_sync(op);
9806 r = ref.ioctx.operate(ref.obj.oid, &op);
9807 if (r < 0)
9808 return r;
9809
9810 return 0;
9811 }
9812
9813 int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
9814 {
9815 list<cls_user_bucket_entry> l;
9816 l.push_back(entry);
9817
9818 return cls_user_update_buckets(obj, l, true);
9819 }
9820
9821 int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
9822 {
9823 rgw_rados_ref ref;
9824 int r = get_system_obj_ref(obj, &ref);
9825 if (r < 0) {
9826 return r;
9827 }
9828
9829 librados::ObjectWriteOperation op;
9830 ::cls_user_remove_bucket(op, bucket);
9831 r = ref.ioctx.operate(ref.obj.oid, &op);
9832 if (r < 0)
9833 return r;
9834
9835 return 0;
9836 }
9837
9838 int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
9839 RGWQuotaInfo& bucket_quota)
9840 {
9841 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
9842 return 0;
9843 }
9844
9845 bool need_resharding = false;
9846 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
9847 uint32_t suggested_num_shards;
9848
9849 const uint64_t max_objs_per_shard =
9850 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9851 int ret =
9852 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
9853 bucket_info.owner, bucket, bucket_quota,
9854 1, need_resharding, &suggested_num_shards);
9855 if (ret < 0) {
9856 return ret;
9857 }
9858
9859 if (need_resharding) {
9860 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
9861 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
9862 dendl;
9863 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
9864 }
9865
9866 return ret;
9867 }
9868
9869 int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
9870 {
9871 RGWReshard reshard(this);
9872
9873 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
9874
9875 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
9876 if (new_num_shards <= num_source_shards) {
9877 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
9878 return 0;
9879 }
9880
9881 cls_rgw_reshard_entry entry;
9882 entry.time = real_clock::now();
9883 entry.tenant = bucket_info.owner.tenant;
9884 entry.bucket_name = bucket_info.bucket.name;
9885 entry.bucket_id = bucket_info.bucket.bucket_id;
9886 entry.old_num_shards = num_source_shards;
9887 entry.new_num_shards = new_num_shards;
9888
9889 return reshard.add(entry);
9890 }
9891
9892 int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
9893 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only)
9894 {
9895 // if we only check size, then num_objs will set to 0
9896 if(check_size_only)
9897 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size);
9898
9899 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
9900 }
9901
9902 void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
9903 uint32_t num_shards,
9904 map<int, string>& bucket_objects,
9905 int shard_id) {
9906 if (!num_shards) {
9907 bucket_objects[0] = bucket_oid_base;
9908 } else {
9909 char buf[bucket_oid_base.size() + 32];
9910 if (shard_id < 0) {
9911 for (uint32_t i = 0; i < num_shards; ++i) {
9912 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
9913 bucket_objects[i] = buf;
9914 }
9915 } else {
9916 if ((uint32_t)shard_id > num_shards) {
9917 return;
9918 }
9919 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
9920 bucket_objects[shard_id] = buf;
9921 }
9922 }
9923 }
9924
9925 void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
9926 {
9927 const rgw_bucket& bucket = bucket_info.bucket;
9928 string plain_id = bucket.name + ":" + bucket.bucket_id;
9929 if (!bucket_info.num_shards) {
9930 (*result)[0] = plain_id;
9931 } else {
9932 char buf[16];
9933 if (shard_id < 0) {
9934 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
9935 snprintf(buf, sizeof(buf), ":%d", i);
9936 (*result)[i] = plain_id + buf;
9937 }
9938 } else {
9939 if ((uint32_t)shard_id > bucket_info.num_shards) {
9940 return;
9941 }
9942 snprintf(buf, sizeof(buf), ":%d", shard_id);
9943 (*result)[shard_id] = plain_id + buf;
9944 }
9945 }
9946 }
9947
9948 int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
9949 int *shard_id)
9950 {
9951 int r = 0;
9952 switch (bucket_info.bucket_index_shard_hash_type) {
9953 case RGWBucketInfo::MOD:
9954 if (!bucket_info.num_shards) {
9955 if (shard_id) {
9956 *shard_id = -1;
9957 }
9958 } else {
9959 uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
9960 if (shard_id) {
9961 *shard_id = (int)sid;
9962 }
9963 }
9964 break;
9965 default:
9966 r = -ENOTSUP;
9967 }
9968 return r;
9969 }
9970
9971 void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
9972 int shard_id, string *bucket_obj)
9973 {
9974 if (!num_shards) {
9975 // By default with no sharding, we use the bucket oid as itself
9976 (*bucket_obj) = bucket_oid_base;
9977 } else {
9978 char buf[bucket_oid_base.size() + 32];
9979 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
9980 (*bucket_obj) = buf;
9981 }
9982 }
9983
9984 int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
9985 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
9986 {
9987 int r = 0;
9988 switch (hash_type) {
9989 case RGWBucketInfo::MOD:
9990 if (!num_shards) {
9991 // By default with no sharding, we use the bucket oid as itself
9992 (*bucket_obj) = bucket_oid_base;
9993 if (shard_id) {
9994 *shard_id = -1;
9995 }
9996 } else {
9997 uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
9998 char buf[bucket_oid_base.size() + 32];
9999 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
10000 (*bucket_obj) = buf;
10001 if (shard_id) {
10002 *shard_id = (int)sid;
10003 }
10004 }
10005 break;
10006 default:
10007 r = -ENOTSUP;
10008 }
10009 return r;
10010 }
10011
10012 uint64_t RGWRados::instance_id()
10013 {
10014 return get_rados_handle()->get_instance_id();
10015 }
10016
10017 uint64_t RGWRados::next_bucket_id()
10018 {
10019 Mutex::Locker l(bucket_id_lock);
10020 return ++max_bucket_id;
10021 }
10022
10023 RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
10024 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
10025 {
10026 RGWRados *store = new RGWRados;
10027
10028 if ((*store).set_use_cache(use_cache)
10029 .set_run_gc_thread(use_gc_thread)
10030 .set_run_lc_thread(use_lc_thread)
10031 .set_run_quota_threads(quota_threads)
10032 .set_run_sync_thread(run_sync_thread)
10033 .set_run_reshard_thread(run_reshard_thread)
10034 .initialize(cct) < 0) {
10035 delete store;
10036 return NULL;
10037 }
10038
10039 return store;
10040 }
10041
10042 RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
10043 {
10044 RGWRados *store = NULL;
10045 store = new RGWRados;
10046
10047 store->set_context(cct);
10048
10049 int ret = store->init_svc(true);
10050 if (ret < 0) {
10051 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
10052 return nullptr;
10053 }
10054
10055 if (store->init_rados() < 0) {
10056 delete store;
10057 return nullptr;
10058 }
10059
10060 return store;
10061 }
10062
10063 void RGWStoreManager::close_storage(RGWRados *store)
10064 {
10065 if (!store)
10066 return;
10067
10068 store->finalize();
10069
10070 delete store;
10071 }
10072
10073 librados::Rados* RGWRados::get_rados_handle()
10074 {
10075 if (rados.size() == 1) {
10076 return &rados[0];
10077 } else {
10078 handle_lock.get_read();
10079 pthread_t id = pthread_self();
10080 std::map<pthread_t, int>:: iterator it = rados_map.find(id);
10081
10082 if (it != rados_map.end()) {
10083 handle_lock.put_read();
10084 return &rados[it->second];
10085 } else {
10086 handle_lock.put_read();
10087 handle_lock.get_write();
10088 const uint32_t handle = next_rados_handle;
10089 rados_map[id] = handle;
10090 if (++next_rados_handle == rados.size()) {
10091 next_rados_handle = 0;
10092 }
10093 handle_lock.put_write();
10094 return &rados[handle];
10095 }
10096 }
10097 }
10098
10099 int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
10100 {
10101 rgw_rados_ref ref;
10102 int ret = get_raw_obj_ref(obj, &ref);
10103 if (ret < 0) {
10104 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
10105 return ret;
10106 }
10107
10108 ObjectWriteOperation op;
10109 list<string> prefixes;
10110 cls_rgw_remove_obj(op, prefixes);
10111
10112 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
10113 ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
10114 if (ret < 0) {
10115 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
10116 c->release();
10117 return ret;
10118 }
10119
10120 handles.push_back(c);
10121
10122 return 0;
10123 }
10124
10125 int RGWRados::delete_obj_aio(const rgw_obj& obj,
10126 RGWBucketInfo& bucket_info, RGWObjState *astate,
10127 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
10128 {
10129 rgw_rados_ref ref;
10130 int ret = get_obj_head_ref(bucket_info, obj, &ref);
10131 if (ret < 0) {
10132 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
10133 return ret;
10134 }
10135
10136 if (keep_index_consistent) {
10137 RGWRados::Bucket bop(this, bucket_info);
10138 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
10139
10140 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
10141 if (ret < 0) {
10142 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
10143 return ret;
10144 }
10145 }
10146
10147 ObjectWriteOperation op;
10148 list<string> prefixes;
10149 cls_rgw_remove_obj(op, prefixes);
10150
10151 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
10152 ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
10153 if (ret < 0) {
10154 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
10155 c->release();
10156 return ret;
10157 }
10158
10159 handles.push_back(c);
10160
10161 if (keep_index_consistent) {
10162 ret = delete_obj_index(obj);
10163 if (ret < 0) {
10164 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
10165 return ret;
10166 }
10167 }
10168 return ret;
10169 }
10170
10171 int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
10172 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
10173 if (value != attrs.end()) {
10174 auto bliter = value->second.cbegin();
10175 try {
10176 decode(cs_info, bliter);
10177 } catch (buffer::error& err) {
10178 return -EIO;
10179 }
10180 if (cs_info.blocks.size() == 0) {
10181 return -EIO;
10182 }
10183 if (cs_info.compression_type != "none")
10184 need_decompress = true;
10185 else
10186 need_decompress = false;
10187 return 0;
10188 } else {
10189 need_decompress = false;
10190 return 0;
10191 }
10192 }
10193
10194 bool RGWRados::call(std::string_view command, const cmdmap_t& cmdmap,
10195 std::string_view format, bufferlist& out)
10196 {
10197 if (command == "cache list"sv) {
10198 std::optional<std::string> filter;
10199 if (auto i = cmdmap.find("filter"); i != cmdmap.cend()) {
10200 filter = boost::get<std::string>(i->second);
10201 }
10202 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
10203 if (f) {
10204 f->open_array_section("cache_entries");
10205 call_list(filter, f.get());
10206 f->close_section();
10207 f->flush(out);
10208 return true;
10209 } else {
10210 out.append("Unable to create Formatter.\n");
10211 return false;
10212 }
10213 } else if (command == "cache inspect"sv) {
10214 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
10215 if (f) {
10216 const auto& target = boost::get<std::string>(cmdmap.at("target"));
10217 if (call_inspect(target, f.get())) {
10218 f->flush(out);
10219 return true;
10220 } else {
10221 out.append("Unable to find entry "s + target + ".\n");
10222 return false;
10223 }
10224 } else {
10225 out.append("Unable to create Formatter.\n");
10226 return false;
10227 }
10228 } else if (command == "cache erase"sv) {
10229 const auto& target = boost::get<std::string>(cmdmap.at("target"));
10230 if (call_erase(target)) {
10231 return true;
10232 } else {
10233 out.append("Unable to find entry "s + target + ".\n");
10234 return false;
10235 }
10236 } else if (command == "cache zap"sv) {
10237 call_zap();
10238 return true;
10239 }
10240 return false;
10241 }
10242
10243 void RGWRados::call_list(const std::optional<std::string>& s,
10244 ceph::Formatter *f)
10245 {
10246 if (!svc.cache) {
10247 return;
10248 }
10249 svc.cache->call_list(s, f);
10250 }
10251
10252 bool RGWRados::call_inspect(const std::string& s, Formatter *f)
10253 {
10254 if (!svc.cache) {
10255 return false;
10256 }
10257 return svc.cache->call_inspect(s, f);
10258 }
10259
10260 bool RGWRados::call_erase(const std::string& s) {
10261 if (!svc.cache) {
10262 return false;
10263 }
10264 return svc.cache->call_erase(s);
10265 }
10266
10267 void RGWRados::call_zap() {
10268 if (svc.cache) {
10269 return;
10270 }
10271 svc.cache->call_zap();
10272 }
10273
10274 string RGWRados::get_mfa_oid(const rgw_user& user)
10275 {
10276 return string("user:") + user.to_str();
10277 }
10278
10279 int RGWRados::get_mfa_ref(const rgw_user& user, rgw_rados_ref *ref)
10280 {
10281 string oid = get_mfa_oid(user);
10282 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10283 return get_system_obj_ref(obj, ref);
10284 }
10285
10286 int RGWRados::check_mfa(const rgw_user& user, const string& otp_id, const string& pin)
10287 {
10288 rgw_rados_ref ref;
10289
10290 int r = get_mfa_ref(user, &ref);
10291 if (r < 0) {
10292 return r;
10293 }
10294
10295 rados::cls::otp::otp_check_t result;
10296
10297 r = rados::cls::otp::OTP::check(cct, ref.ioctx, ref.obj.oid, otp_id, pin, &result);
10298 if (r < 0)
10299 return r;
10300
10301 ldout(cct, 20) << "OTP check, otp_id=" << otp_id << " result=" << (int)result.result << dendl;
10302
10303 return (result.result == rados::cls::otp::OTP_CHECK_SUCCESS ? 0 : -EACCES);
10304 }
10305
10306 void RGWRados::prepare_mfa_write(librados::ObjectWriteOperation *op,
10307 RGWObjVersionTracker *objv_tracker,
10308 const ceph::real_time& mtime)
10309 {
10310 RGWObjVersionTracker ot;
10311
10312 if (objv_tracker) {
10313 ot = *objv_tracker;
10314 }
10315
10316 if (ot.write_version.tag.empty()) {
10317 if (ot.read_version.tag.empty()) {
10318 ot.generate_new_write_ver(cct);
10319 } else {
10320 ot.write_version = ot.read_version;
10321 ot.write_version.ver++;
10322 }
10323 }
10324
10325 ot.prepare_op_for_write(op);
10326 struct timespec mtime_ts = real_clock::to_timespec(mtime);
10327 op->mtime2(&mtime_ts);
10328 }
10329
10330 int RGWRados::create_mfa(const rgw_user& user, const rados::cls::otp::otp_info_t& config,
10331 RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime)
10332 {
10333 rgw_rados_ref ref;
10334
10335 int r = get_mfa_ref(user, &ref);
10336 if (r < 0) {
10337 return r;
10338 }
10339
10340 librados::ObjectWriteOperation op;
10341 prepare_mfa_write(&op, objv_tracker, mtime);
10342 rados::cls::otp::OTP::create(&op, config);
10343 r = ref.ioctx.operate(ref.obj.oid, &op);
10344 if (r < 0) {
10345 ldout(cct, 20) << "OTP create, otp_id=" << config.id << " result=" << (int)r << dendl;
10346 return r;
10347 }
10348
10349 return 0;
10350 }
10351
10352 int RGWRados::remove_mfa(const rgw_user& user, const string& id,
10353 RGWObjVersionTracker *objv_tracker,
10354 const ceph::real_time& mtime)
10355 {
10356 rgw_rados_ref ref;
10357
10358 int r = get_mfa_ref(user, &ref);
10359 if (r < 0) {
10360 return r;
10361 }
10362
10363 librados::ObjectWriteOperation op;
10364 prepare_mfa_write(&op, objv_tracker, mtime);
10365 rados::cls::otp::OTP::remove(&op, id);
10366 r = ref.ioctx.operate(ref.obj.oid, &op);
10367 if (r < 0) {
10368 ldout(cct, 20) << "OTP remove, otp_id=" << id << " result=" << (int)r << dendl;
10369 return r;
10370 }
10371
10372 return 0;
10373 }
10374
10375 int RGWRados::get_mfa(const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result)
10376 {
10377 rgw_rados_ref ref;
10378
10379 int r = get_mfa_ref(user, &ref);
10380 if (r < 0) {
10381 return r;
10382 }
10383
10384 r = rados::cls::otp::OTP::get(nullptr, ref.ioctx, ref.obj.oid, id, result);
10385 if (r < 0) {
10386 return r;
10387 }
10388
10389 return 0;
10390 }
10391
10392 int RGWRados::list_mfa(const rgw_user& user, list<rados::cls::otp::otp_info_t> *result)
10393 {
10394 rgw_rados_ref ref;
10395
10396 int r = get_mfa_ref(user, &ref);
10397 if (r < 0) {
10398 return r;
10399 }
10400
10401 r = rados::cls::otp::OTP::get_all(nullptr, ref.ioctx, ref.obj.oid, result);
10402 if (r < 0) {
10403 return r;
10404 }
10405
10406 return 0;
10407 }
10408
10409 int RGWRados::otp_get_current_time(const rgw_user& user, ceph::real_time *result)
10410 {
10411 rgw_rados_ref ref;
10412
10413 int r = get_mfa_ref(user, &ref);
10414 if (r < 0) {
10415 return r;
10416 }
10417
10418 r = rados::cls::otp::OTP::get_current_time(ref.ioctx, ref.obj.oid, result);
10419 if (r < 0) {
10420 return r;
10421 }
10422
10423 return 0;
10424 }
10425
10426 int RGWRados::set_mfa(const string& oid, const list<rados::cls::otp::otp_info_t>& entries,
10427 bool reset_obj, RGWObjVersionTracker *objv_tracker,
10428 const real_time& mtime)
10429 {
10430 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10431 rgw_rados_ref ref;
10432 int r = get_system_obj_ref(obj, &ref);
10433 if (r < 0) {
10434 return r;
10435 }
10436
10437 librados::ObjectWriteOperation op;
10438 if (reset_obj) {
10439 op.remove();
10440 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
10441 op.create(false);
10442 }
10443 prepare_mfa_write(&op, objv_tracker, mtime);
10444 rados::cls::otp::OTP::set(&op, entries);
10445 r = ref.ioctx.operate(ref.obj.oid, &op);
10446 if (r < 0) {
10447 ldout(cct, 20) << "OTP set entries.size()=" << entries.size() << " result=" << (int)r << dendl;
10448 return r;
10449 }
10450
10451 return 0;
10452 }
10453
10454 int RGWRados::list_mfa(const string& oid, list<rados::cls::otp::otp_info_t> *result,
10455 RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime)
10456 {
10457 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10458 rgw_rados_ref ref;
10459 int r = get_system_obj_ref(obj, &ref);
10460 if (r < 0) {
10461 return r;
10462 }
10463 librados::ObjectReadOperation op;
10464 struct timespec mtime_ts;
10465 if (pmtime) {
10466 op.stat2(nullptr, &mtime_ts, nullptr);
10467 }
10468 objv_tracker->prepare_op_for_read(&op);
10469 r = rados::cls::otp::OTP::get_all(&op, ref.ioctx, ref.obj.oid, result);
10470 if (r < 0) {
10471 return r;
10472 }
10473 if (pmtime) {
10474 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
10475 }
10476
10477 return 0;
10478 }