]> git.proxmox.com Git - ceph.git/blame - ceph/src/rgw/rgw_rados.cc
buildsys: auto-determine current version for makefile
[ceph.git] / ceph / src / rgw / rgw_rados.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3
31f18b77 4#include "include/compat.h"
7c673cae
FG
5#include <errno.h>
6#include <stdlib.h>
7#include <sys/types.h>
8#include <boost/algorithm/string.hpp>
11fdf7f2 9#include <string_view>
7c673cae 10
11fdf7f2 11#include <boost/container/flat_set.hpp>
7c673cae
FG
12#include <boost/format.hpp>
13#include <boost/optional.hpp>
14#include <boost/utility/in_place_factory.hpp>
15
16#include "common/ceph_json.h"
7c673cae
FG
17
18#include "common/errno.h"
19#include "common/Formatter.h"
20#include "common/Throttle.h"
7c673cae
FG
21
22#include "rgw_rados.h"
11fdf7f2 23#include "rgw_zone.h"
7c673cae
FG
24#include "rgw_cache.h"
25#include "rgw_acl.h"
26#include "rgw_acl_s3.h" /* for dumping s3policy in debug log */
11fdf7f2 27#include "rgw_aio_throttle.h"
7c673cae
FG
28#include "rgw_bucket.h"
29#include "rgw_rest_conn.h"
30#include "rgw_cr_rados.h"
31#include "rgw_cr_rest.h"
11fdf7f2 32#include "rgw_putobj_processor.h"
7c673cae
FG
33
34#include "cls/rgw/cls_rgw_ops.h"
7c673cae
FG
35#include "cls/rgw/cls_rgw_client.h"
36#include "cls/rgw/cls_rgw_const.h"
37#include "cls/refcount/cls_refcount_client.h"
38#include "cls/version/cls_version_client.h"
39#include "cls/log/cls_log_client.h"
7c673cae
FG
40#include "cls/timeindex/cls_timeindex_client.h"
41#include "cls/lock/cls_lock_client.h"
42#include "cls/user/cls_user_client.h"
11fdf7f2 43#include "cls/otp/cls_otp_client.h"
c07f9fc5 44#include "osd/osd_types.h"
7c673cae
FG
45
46#include "rgw_tools.h"
47#include "rgw_coroutine.h"
48#include "rgw_compression.h"
49
7c673cae
FG
50#undef fork // fails to compile RGWPeriod::fork() below
51
52#include "common/Clock.h"
53
7c673cae
FG
54using namespace librados;
55
56#include <string>
57#include <iostream>
58#include <vector>
59#include <atomic>
60#include <list>
61#include <map>
11fdf7f2 62#include "include/random.h"
7c673cae
FG
63
64#include "rgw_gc.h"
65#include "rgw_lc.h"
66
67#include "rgw_object_expirer_core.h"
68#include "rgw_sync.h"
81eedcae 69#include "rgw_sync_counters.h"
11fdf7f2 70#include "rgw_sync_trace.h"
7c673cae
FG
71#include "rgw_data_sync.h"
72#include "rgw_realm_watcher.h"
31f18b77 73#include "rgw_reshard.h"
7c673cae 74
11fdf7f2
TL
75#include "services/svc_zone.h"
76#include "services/svc_zone_utils.h"
77#include "services/svc_quota.h"
78#include "services/svc_sync_modules.h"
79#include "services/svc_sys_obj.h"
80#include "services/svc_sys_obj_cache.h"
81
7c673cae
FG
82#include "compressor/Compressor.h"
83
11fdf7f2
TL
84#ifdef WITH_LTTNG
85#define TRACEPOINT_DEFINE
86#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
87#include "tracing/rgw_rados.h"
88#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
89#undef TRACEPOINT_DEFINE
90#else
91#define tracepoint(...)
92#endif
93
7c673cae
FG
94#define dout_context g_ceph_context
95#define dout_subsys ceph_subsys_rgw
96
7c673cae 97
7c673cae
FG
// File-local string constants and defaults.
// NOTE(review): the consumers of these names are outside this excerpt;
// confirm usage before changing any of the values.
98static string shadow_ns = "shadow";
99static string dir_oid_prefix = ".dir.";
7c673cae
FG
// Default pool-name suffixes (presumably appended to a zone name -- TODO confirm).
100static string default_bucket_index_pool_suffix = "rgw.buckets.index";
101static string default_storage_extra_pool_suffix = "rgw.buckets.non-ec";
11fdf7f2 102
7c673cae 103static string log_lock_name = "rgw_log_lock";
11fdf7f2 104static RGWObjCategory main_category = RGWObjCategory::Main;
// Prefix literal for usage objects (exact oid layout is not visible here).
7c673cae 105#define RGW_USAGE_OBJ_PREFIX "usage."
7c673cae
FG
106
107#define dout_subsys ceph_subsys_rgw
108
11fdf7f2
TL
109const std::string MP_META_SUFFIX = ".meta";
110
7c673cae
FG
111
112static bool rgw_get_obj_data_pool(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
113 const rgw_placement_rule& head_placement_rule,
114 const rgw_obj& obj, rgw_pool *pool)
7c673cae 115{
11fdf7f2 116 if (!zone_params.get_head_data_pool(head_placement_rule, obj, pool)) {
7c673cae 117 RGWZonePlacementInfo placement;
11fdf7f2 118 if (!zone_params.get_placement(zonegroup.default_placement.name, &placement)) {
7c673cae
FG
119 return false;
120 }
121
122 if (!obj.in_extra_data) {
11fdf7f2 123 *pool = placement.get_data_pool(zonegroup.default_placement.storage_class);
7c673cae 124 } else {
31f18b77 125 *pool = placement.get_data_extra_pool();
7c673cae
FG
126 }
127 }
128
129 return true;
130}
131
132static bool rgw_obj_to_raw(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params,
11fdf7f2
TL
133 const rgw_placement_rule& head_placement_rule,
134 const rgw_obj& obj, rgw_raw_obj *raw_obj)
7c673cae
FG
135{
136 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
137
11fdf7f2 138 return rgw_get_obj_data_pool(zonegroup, zone_params, head_placement_rule, obj, &raw_obj->pool);
7c673cae
FG
139}
140
141rgw_raw_obj rgw_obj_select::get_raw_obj(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params) const
142{
143 if (!is_raw) {
144 rgw_raw_obj r;
145 rgw_obj_to_raw(zonegroup, zone_params, placement_rule, obj, &r);
146 return r;
147 }
148 return raw_obj;
149}
150
151rgw_raw_obj rgw_obj_select::get_raw_obj(RGWRados *store) const
152{
153 if (!is_raw) {
154 rgw_raw_obj r;
155 store->obj_to_raw(placement_rule, obj, &r);
156 return r;
157 }
158 return raw_obj;
159}
160
11fdf7f2
TL
161void RGWObjVersionTracker::prepare_op_for_read(ObjectReadOperation *op)
162{
163 obj_version *check_objv = version_for_check();
7c673cae 164
11fdf7f2
TL
165 if (check_objv) {
166 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae
FG
167 }
168
11fdf7f2 169 cls_version_read(*op, &read_version);
7c673cae
FG
170}
171
11fdf7f2
TL
172void RGWObjVersionTracker::prepare_op_for_write(ObjectWriteOperation *op)
173{
174 obj_version *check_objv = version_for_check();
175 obj_version *modify_version = version_for_write();
7c673cae 176
11fdf7f2
TL
177 if (check_objv) {
178 cls_version_check(*op, *check_objv, VER_COND_EQ);
7c673cae 179 }
7c673cae 180
11fdf7f2
TL
181 if (modify_version) {
182 cls_version_set(*op, *modify_version);
183 } else {
184 cls_version_inc(*op);
7c673cae 185 }
7c673cae
FG
186}
187
// Advance the iterator to the next stripe described by the manifest.
//
// Explicit manifests just step explicit_iter. For rule-based manifests the
// iterator leaves the head (if still inside it), otherwise moves forward by
// one stripe, crossing part and rule boundaries as needed. The position is
// clamped to the object size, and update_location() is called on every
// path that moves the iterator.
11fdf7f2 188void RGWObjManifest::obj_iterator::operator++()
7c673cae 189{
11fdf7f2
TL
// explicit object list: step the map iterator and refresh derived state
190 if (manifest->explicit_objs) {
191 ++explicit_iter;
7c673cae 192
11fdf7f2 193 update_explicit_pos();
7c673cae 194
11fdf7f2
TL
195 update_location();
196 return;
7c673cae
FG
197 }
198
11fdf7f2
TL
199 uint64_t obj_size = manifest->get_obj_size();
200 uint64_t head_size = manifest->get_head_size();
201
// already at the end of the object: nothing to advance
202 if (ofs == obj_size) {
203 return;
7c673cae
FG
204 }
205
11fdf7f2
TL
// no rules: there is no stripe layout to walk
206 if (manifest->rules.empty()) {
207 return;
7c673cae
FG
208 }
209
11fdf7f2
TL
210 /* are we still pointing at the head? */
211 if (ofs < head_size) {
// jump past the head to stripe 1 of the first rule; the stripe size is
// bounded by the remaining object size, stripe_max_size and (if set) part_size
212 rule_iter = manifest->rules.begin();
213 RGWObjManifestRule *rule = &rule_iter->second;
214 ofs = std::min(head_size, obj_size);
215 stripe_ofs = ofs;
216 cur_stripe = 1;
217 stripe_size = std::min(obj_size - ofs, rule->stripe_max_size);
218 if (rule->part_size > 0) {
219 stripe_size = std::min(stripe_size, rule->part_size);
220 }
221 update_location();
222 return;
7c673cae
FG
223 }
224
11fdf7f2 225 RGWObjManifestRule *rule = &rule_iter->second;
7c673cae 226
11fdf7f2
TL
// regular step: advance by one full stripe under the current rule
227 stripe_ofs += rule->stripe_max_size;
228 cur_stripe++;
229 dout(20) << "RGWObjManifest::operator++(): rule->part_size=" << rule->part_size << " rules.size()=" << manifest->rules.size() << dendl;
7c673cae 230
11fdf7f2
TL
231 if (rule->part_size > 0) {
232 /* multi part, multi stripes object */
7c673cae 233
11fdf7f2 234 dout(20) << "RGWObjManifest::operator++(): stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
7c673cae 235
11fdf7f2
TL
236 if (stripe_ofs >= part_ofs + rule->part_size) {
237 /* moved to the next part */
// stripes restart at 0 inside the new part, which begins right after the old one
238 cur_stripe = 0;
239 part_ofs += rule->part_size;
240 stripe_ofs = part_ofs;
7c673cae 241
11fdf7f2
TL
242 bool last_rule = (next_rule_iter == manifest->rules.end());
243 /* move to the next rule? */
244 if (!last_rule && stripe_ofs >= next_rule_iter->second.start_ofs) {
// the new part starts at/after the next rule's start offset: switch rules
// and take the part number from that rule
245 rule_iter = next_rule_iter;
246 last_rule = (next_rule_iter == manifest->rules.end());
247 if (!last_rule) {
248 ++next_rule_iter;
249 }
250 cur_part_id = rule_iter->second.start_part_num;
251 } else {
252 cur_part_id++;
253 }
7c673cae 254
11fdf7f2
TL
255 rule = &rule_iter->second;
256 }
7c673cae 257
// stripe may be cut short by the end of the current part
11fdf7f2 258 stripe_size = std::min(rule->part_size - (stripe_ofs - part_ofs), rule->stripe_max_size);
7c673cae 259 }
7c673cae 260
11fdf7f2 261 cur_override_prefix = rule->override_prefix;
7c673cae 262
11fdf7f2
TL
// clamp to the object size; past-the-end yields an empty stripe
263 ofs = stripe_ofs;
264 if (ofs > obj_size) {
265 ofs = obj_size;
266 stripe_ofs = ofs;
267 stripe_size = 0;
268 }
7c673cae 269
11fdf7f2
TL
270 dout(20) << "RGWObjManifest::operator++(): result: ofs=" << ofs << " stripe_ofs=" << stripe_ofs << " part_ofs=" << part_ofs << " rule->part_size=" << rule->part_size << dendl;
271 update_location();
7c673cae
FG
272}
273
11fdf7f2
TL
274int RGWObjManifest::generator::create_begin(CephContext *cct, RGWObjManifest *_m,
275 const rgw_placement_rule& head_placement_rule,
276 const rgw_placement_rule *tail_placement_rule,
277 const rgw_bucket& _b, const rgw_obj& _obj)
7c673cae 278{
11fdf7f2 279 manifest = _m;
7c673cae 280
11fdf7f2
TL
281 if (!tail_placement_rule) {
282 manifest->set_tail_placement(head_placement_rule, _b);
283 } else {
284 rgw_placement_rule new_tail_rule = *tail_placement_rule;
285 new_tail_rule.inherit_from(head_placement_rule);
286 manifest->set_tail_placement(new_tail_rule, _b);
7c673cae
FG
287 }
288
11fdf7f2
TL
289 manifest->set_head(head_placement_rule, _obj, 0);
290 last_ofs = 0;
7c673cae 291
11fdf7f2
TL
292 if (manifest->get_prefix().empty()) {
293 char buf[33];
294 gen_rand_alphanumeric(cct, buf, sizeof(buf) - 1);
295
296 string oid_prefix = ".";
297 oid_prefix.append(buf);
298 oid_prefix.append("_");
7c673cae 299
11fdf7f2 300 manifest->set_prefix(oid_prefix);
7c673cae
FG
301 }
302
11fdf7f2
TL
303 bool found = manifest->get_rule(0, &rule);
304 if (!found) {
305 derr << "ERROR: manifest->get_rule() could not find rule" << dendl;
306 return -EIO;
7c673cae
FG
307 }
308
11fdf7f2
TL
309 uint64_t head_size = manifest->get_head_size();
310
311 if (head_size > 0) {
312 cur_stripe_size = head_size;
313 } else {
314 cur_stripe_size = rule.stripe_max_size;
7c673cae 315 }
11fdf7f2
TL
316
317 cur_part_id = rule.start_part_num;
7c673cae 318
11fdf7f2 319 manifest->get_implicit_location(cur_part_id, cur_stripe, 0, NULL, &cur_obj);
7c673cae 320
11fdf7f2
TL
321 // Normal object which not generated through copy operation
322 manifest->set_tail_instance(_obj.key.instance);
7c673cae 323
11fdf7f2 324 manifest->update_iterators();
7c673cae 325
11fdf7f2 326 return 0;
7c673cae
FG
327}
328
11fdf7f2 329int RGWObjManifest::generator::create_next(uint64_t ofs)
7c673cae 330{
11fdf7f2
TL
331 if (ofs < last_ofs) /* only going forward */
332 return -EINVAL;
7c673cae 333
11fdf7f2
TL
334 uint64_t max_head_size = manifest->get_max_head_size();
335
336 if (ofs < max_head_size) {
337 manifest->set_head_size(ofs);
7c673cae 338 }
7c673cae 339
11fdf7f2
TL
340 if (ofs >= max_head_size) {
341 manifest->set_head_size(max_head_size);
342 cur_stripe = (ofs - max_head_size) / rule.stripe_max_size;
343 cur_stripe_size = rule.stripe_max_size;
7c673cae 344
11fdf7f2
TL
345 if (cur_part_id == 0 && max_head_size > 0) {
346 cur_stripe++;
7c673cae
FG
347 }
348 }
349
11fdf7f2
TL
350 last_ofs = ofs;
351 manifest->set_obj_size(ofs);
7c673cae 352
11fdf7f2 353 manifest->get_implicit_location(cur_part_id, cur_stripe, ofs, NULL, &cur_obj);
7c673cae 354
11fdf7f2 355 manifest->update_iterators();
7c673cae 356
11fdf7f2 357 return 0;
7c673cae
FG
358}
359
11fdf7f2 360const RGWObjManifest::obj_iterator& RGWObjManifest::obj_begin()
7c673cae 361{
11fdf7f2
TL
362 return begin_iter;
363}
7c673cae 364
11fdf7f2
TL
365const RGWObjManifest::obj_iterator& RGWObjManifest::obj_end()
366{
367 return end_iter;
7c673cae
FG
368}
369
11fdf7f2 370RGWObjManifest::obj_iterator RGWObjManifest::obj_find(uint64_t ofs)
7c673cae 371{
11fdf7f2
TL
372 if (ofs > obj_size) {
373 ofs = obj_size;
7c673cae 374 }
11fdf7f2
TL
375 RGWObjManifest::obj_iterator iter(this);
376 iter.seek(ofs);
377 return iter;
7c673cae
FG
378}
379
11fdf7f2
TL
380int RGWObjManifest::append(RGWObjManifest& m, const RGWZoneGroup& zonegroup,
381 const RGWZoneParams& zone_params)
7c673cae 382{
11fdf7f2
TL
383 if (explicit_objs || m.explicit_objs) {
384 return append_explicit(m, zonegroup, zone_params);
7c673cae
FG
385 }
386
11fdf7f2
TL
387 if (rules.empty()) {
388 *this = m;
389 return 0;
7c673cae
FG
390 }
391
11fdf7f2 392 string override_prefix;
7c673cae 393
11fdf7f2
TL
394 if (prefix.empty()) {
395 prefix = m.prefix;
7c673cae
FG
396 }
397
11fdf7f2
TL
398 if (prefix != m.prefix) {
399 override_prefix = m.prefix;
400 }
7c673cae 401
11fdf7f2
TL
402 map<uint64_t, RGWObjManifestRule>::iterator miter = m.rules.begin();
403 if (miter == m.rules.end()) {
404 return append_explicit(m, zonegroup, zone_params);
7c673cae
FG
405 }
406
11fdf7f2
TL
407 for (; miter != m.rules.end(); ++miter) {
408 map<uint64_t, RGWObjManifestRule>::reverse_iterator last_rule = rules.rbegin();
7c673cae 409
11fdf7f2 410 RGWObjManifestRule& rule = last_rule->second;
7c673cae 411
11fdf7f2
TL
412 if (rule.part_size == 0) {
413 rule.part_size = obj_size - rule.start_ofs;
414 }
7c673cae 415
11fdf7f2
TL
416 RGWObjManifestRule& next_rule = miter->second;
417 if (!next_rule.part_size) {
418 next_rule.part_size = m.obj_size - next_rule.start_ofs;
419 }
7c673cae 420
11fdf7f2
TL
421 string rule_prefix = prefix;
422 if (!rule.override_prefix.empty()) {
423 rule_prefix = rule.override_prefix;
7c673cae 424 }
7c673cae 425
11fdf7f2
TL
426 string next_rule_prefix = m.prefix;
427 if (!next_rule.override_prefix.empty()) {
428 next_rule_prefix = next_rule.override_prefix;
429 }
7c673cae 430
11fdf7f2
TL
431 if (rule.part_size != next_rule.part_size ||
432 rule.stripe_max_size != next_rule.stripe_max_size ||
433 rule_prefix != next_rule_prefix) {
434 if (next_rule_prefix != prefix) {
435 append_rules(m, miter, &next_rule_prefix);
436 } else {
437 append_rules(m, miter, NULL);
438 }
439 break;
7c673cae 440 }
7c673cae 441
11fdf7f2
TL
442 uint64_t expected_part_num = rule.start_part_num + 1;
443 if (rule.part_size > 0) {
444 expected_part_num = rule.start_part_num + (obj_size + next_rule.start_ofs - rule.start_ofs) / rule.part_size;
445 }
7c673cae 446
11fdf7f2
TL
447 if (expected_part_num != next_rule.start_part_num) {
448 append_rules(m, miter, NULL);
449 break;
450 }
224ce89b 451 }
7c673cae 452
11fdf7f2 453 set_obj_size(obj_size + m.obj_size);
7c673cae
FG
454
455 return 0;
456}
457
11fdf7f2 458int RGWObjManifest::append(RGWObjManifest& m, RGWSI_Zone *zone_svc)
7c673cae 459{
11fdf7f2 460 return append(m, zone_svc->get_zonegroup(), zone_svc->get_zone_params());
7c673cae
FG
461}
462
11fdf7f2
TL
463void RGWObjManifest::append_rules(RGWObjManifest& m, map<uint64_t, RGWObjManifestRule>::iterator& miter,
464 string *override_prefix)
7c673cae 465{
11fdf7f2
TL
466 for (; miter != m.rules.end(); ++miter) {
467 RGWObjManifestRule rule = miter->second;
468 rule.start_ofs += obj_size;
469 if (override_prefix)
470 rule.override_prefix = *override_prefix;
471 rules[rule.start_ofs] = rule;
7c673cae 472 }
7c673cae
FG
473}
474
11fdf7f2 475void RGWObjManifest::convert_to_explicit(const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
7c673cae 476{
11fdf7f2
TL
477 if (explicit_objs) {
478 return;
7c673cae 479 }
11fdf7f2 480 obj_iterator iter = obj_begin();
7c673cae 481
11fdf7f2
TL
482 while (iter != obj_end()) {
483 RGWObjManifestPart& part = objs[iter.get_stripe_ofs()];
484 const rgw_obj_select& os = iter.get_location();
485 const rgw_raw_obj& raw_loc = os.get_raw_obj(zonegroup, zone_params);
486 part.loc_ofs = 0;
7c673cae 487
11fdf7f2 488 uint64_t ofs = iter.get_stripe_ofs();
7c673cae 489
11fdf7f2
TL
490 if (ofs == 0) {
491 part.loc = obj;
492 } else {
493 rgw_raw_obj_to_obj(tail_placement.bucket, raw_loc, &part.loc);
494 }
495 ++iter;
496 uint64_t next_ofs = iter.get_stripe_ofs();
7c673cae 497
11fdf7f2 498 part.size = next_ofs - ofs;
7c673cae 499 }
7c673cae 500
11fdf7f2
TL
501 explicit_objs = true;
502 rules.clear();
503 prefix.clear();
7c673cae
FG
504}
505
11fdf7f2 506int RGWObjManifest::append_explicit(RGWObjManifest& m, const RGWZoneGroup& zonegroup, const RGWZoneParams& zone_params)
7c673cae 507{
11fdf7f2
TL
508 if (!explicit_objs) {
509 convert_to_explicit(zonegroup, zone_params);
7c673cae 510 }
11fdf7f2
TL
511 if (!m.explicit_objs) {
512 m.convert_to_explicit(zonegroup, zone_params);
7c673cae 513 }
11fdf7f2
TL
514 map<uint64_t, RGWObjManifestPart>::iterator iter;
515 uint64_t base = obj_size;
516 for (iter = m.objs.begin(); iter != m.objs.end(); ++iter) {
517 RGWObjManifestPart& part = iter->second;
518 objs[base + iter->first] = part;
7c673cae 519 }
11fdf7f2 520 obj_size += m.obj_size;
7c673cae
FG
521
522 return 0;
523}
524
11fdf7f2
TL
525bool RGWObjManifest::get_rule(uint64_t ofs, RGWObjManifestRule *rule)
526{
527 if (rules.empty()) {
528 return false;
7c673cae
FG
529 }
530
11fdf7f2
TL
531 map<uint64_t, RGWObjManifestRule>::iterator iter = rules.upper_bound(ofs);
532 if (iter != rules.begin()) {
533 --iter;
534 }
3a9019d9 535
11fdf7f2 536 *rule = iter->second;
7c673cae 537
11fdf7f2 538 return true;
7c673cae
FG
539}
540
11fdf7f2 541void RGWObjVersionTracker::generate_new_write_ver(CephContext *cct)
7c673cae 542{
11fdf7f2
TL
543 write_version.ver = 1;
544#define TAG_LEN 24
7c673cae 545
11fdf7f2
TL
546 write_version.tag.clear();
547 append_rand_alpha(cct, write_version.tag, write_version.tag, TAG_LEN);
7c673cae
FG
548}
549
7c673cae
FG
550class RGWMetaNotifierManager : public RGWCoroutinesManager {
551 RGWRados *store;
552 RGWHTTPManager http_manager;
553
554public:
555 RGWMetaNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
556 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 557 http_manager.start();
7c673cae
FG
558 }
559
560 int notify_all(map<string, RGWRESTConn *>& conn_map, set<int>& shards) {
561 rgw_http_param_pair pairs[] = { { "type", "metadata" },
562 { "notify", NULL },
563 { NULL, NULL } };
564
565 list<RGWCoroutinesStack *> stacks;
566 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
567 RGWRESTConn *conn = iter->second;
568 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
569 stack->call(new RGWPostRESTResourceCR<set<int>, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
570
571 stacks.push_back(stack);
572 }
573 return run(stacks);
574 }
575};
576
577class RGWDataNotifierManager : public RGWCoroutinesManager {
578 RGWRados *store;
579 RGWHTTPManager http_manager;
580
581public:
582 RGWDataNotifierManager(RGWRados *_store) : RGWCoroutinesManager(_store->ctx(), _store->get_cr_registry()), store(_store),
583 http_manager(store->ctx(), completion_mgr) {
11fdf7f2 584 http_manager.start();
7c673cae
FG
585 }
586
587 int notify_all(map<string, RGWRESTConn *>& conn_map, map<int, set<string> >& shards) {
588 rgw_http_param_pair pairs[] = { { "type", "data" },
589 { "notify", NULL },
11fdf7f2 590 { "source-zone", store->svc.zone->get_zone_params().get_id().c_str() },
7c673cae
FG
591 { NULL, NULL } };
592
593 list<RGWCoroutinesStack *> stacks;
594 for (map<string, RGWRESTConn *>::iterator iter = conn_map.begin(); iter != conn_map.end(); ++iter) {
595 RGWRESTConn *conn = iter->second;
596 RGWCoroutinesStack *stack = new RGWCoroutinesStack(store->ctx(), this);
597 stack->call(new RGWPostRESTResourceCR<map<int, set<string> >, int>(store->ctx(), conn, &http_manager, "/admin/log", pairs, shards, NULL));
598
599 stacks.push_back(stack);
600 }
601 return run(stacks);
602 }
603};
604
11fdf7f2
TL
605/* class RGWRadosThread */
606
7c673cae
FG
607void RGWRadosThread::start()
608{
609 worker = new Worker(cct, this);
610 worker->create(thread_name.c_str());
611}
612
613void RGWRadosThread::stop()
614{
615 down_flag = true;
616 stop_process();
617 if (worker) {
31f18b77 618 worker->signal();
7c673cae
FG
619 worker->join();
620 }
621 delete worker;
622 worker = NULL;
623}
624
625void *RGWRadosThread::Worker::entry() {
626 uint64_t msec = processor->interval_msec();
627 utime_t interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
628
629 do {
630 utime_t start = ceph_clock_now();
631 int r = processor->process();
632 if (r < 0) {
633 dout(0) << "ERROR: processor->process() returned error r=" << r << dendl;
634 }
635
636 if (processor->going_down())
637 break;
638
639 utime_t end = ceph_clock_now();
640 end -= start;
641
642 uint64_t cur_msec = processor->interval_msec();
643 if (cur_msec != msec) { /* was it reconfigured? */
644 msec = cur_msec;
645 interval = utime_t(msec / 1000, (msec % 1000) * 1000000);
646 }
647
648 if (cur_msec > 0) {
649 if (interval <= end)
650 continue; // next round
651
652 utime_t wait_time = interval;
653 wait_time -= end;
654
31f18b77 655 wait_interval(wait_time);
7c673cae 656 } else {
31f18b77 657 wait();
7c673cae
FG
658 }
659 } while (!processor->going_down());
660
661 return NULL;
662}
663
664class RGWMetaNotifier : public RGWRadosThread {
665 RGWMetaNotifierManager notify_mgr;
666 RGWMetadataLog *const log;
667
668 uint64_t interval_msec() override {
669 return cct->_conf->rgw_md_notify_interval_msec;
670 }
1adf2230
AA
671 void stop_process() override {
672 notify_mgr.stop();
673 }
7c673cae
FG
674public:
675 RGWMetaNotifier(RGWRados *_store, RGWMetadataLog* log)
676 : RGWRadosThread(_store, "meta-notifier"), notify_mgr(_store), log(log) {}
677
678 int process() override;
679};
680
681int RGWMetaNotifier::process()
682{
683 set<int> shards;
684
685 log->read_clear_modified(shards);
686
687 if (shards.empty()) {
688 return 0;
689 }
690
691 for (set<int>::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
692 ldout(cct, 20) << __func__ << "(): notifying mdlog change, shard_id=" << *iter << dendl;
693 }
694
11fdf7f2 695 notify_mgr.notify_all(store->svc.zone->get_zone_conn_map(), shards);
7c673cae
FG
696
697 return 0;
698}
699
700class RGWDataNotifier : public RGWRadosThread {
701 RGWDataNotifierManager notify_mgr;
702
703 uint64_t interval_msec() override {
11fdf7f2 704 return cct->_conf.get_val<int64_t>("rgw_data_notify_interval_msec");
7c673cae 705 }
1adf2230
AA
706 void stop_process() override {
707 notify_mgr.stop();
708 }
7c673cae
FG
709public:
710 RGWDataNotifier(RGWRados *_store) : RGWRadosThread(_store, "data-notifier"), notify_mgr(_store) {}
711
712 int process() override;
713};
714
715int RGWDataNotifier::process()
716{
717 if (!store->data_log) {
718 return 0;
719 }
720
721 map<int, set<string> > shards;
722
723 store->data_log->read_clear_modified(shards);
724
725 if (shards.empty()) {
726 return 0;
727 }
728
729 for (map<int, set<string> >::iterator iter = shards.begin(); iter != shards.end(); ++iter) {
730 ldout(cct, 20) << __func__ << "(): notifying datalog change, shard_id=" << iter->first << ": " << iter->second << dendl;
731 }
732
11fdf7f2 733 notify_mgr.notify_all(store->svc.zone->get_zone_data_notify_to_map(), shards);
7c673cae
FG
734
735 return 0;
736}
737
738class RGWSyncProcessorThread : public RGWRadosThread {
739public:
740 RGWSyncProcessorThread(RGWRados *_store, const string& thread_name = "radosgw") : RGWRadosThread(_store, thread_name) {}
741 RGWSyncProcessorThread(RGWRados *_store) : RGWRadosThread(_store) {}
742 ~RGWSyncProcessorThread() override {}
743 int init() override = 0 ;
744 int process() override = 0;
745};
746
747class RGWMetaSyncProcessorThread : public RGWSyncProcessorThread
748{
749 RGWMetaSyncStatusManager sync;
750
751 uint64_t interval_msec() override {
752 return 0; /* no interval associated, it'll run once until stopped */
753 }
754 void stop_process() override {
755 sync.stop();
756 }
757public:
758 RGWMetaSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados)
759 : RGWSyncProcessorThread(_store, "meta-sync"), sync(_store, async_rados) {}
760
761 void wakeup_sync_shards(set<int>& shard_ids) {
762 for (set<int>::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
763 sync.wakeup(*iter);
764 }
765 }
766 RGWMetaSyncStatusManager* get_manager() { return &sync; }
767
768 int init() override {
769 int ret = sync.init();
770 if (ret < 0) {
771 ldout(store->ctx(), 0) << "ERROR: sync.init() returned " << ret << dendl;
772 return ret;
773 }
774 return 0;
775 }
776
777 int process() override {
778 sync.run();
779 return 0;
780 }
781};
782
783class RGWDataSyncProcessorThread : public RGWSyncProcessorThread
784{
81eedcae 785 PerfCountersRef counters;
7c673cae
FG
786 RGWDataSyncStatusManager sync;
787 bool initialized;
788
789 uint64_t interval_msec() override {
790 if (initialized) {
791 return 0; /* no interval associated, it'll run once until stopped */
792 } else {
793#define DATA_SYNC_INIT_WAIT_SEC 20
794 return DATA_SYNC_INIT_WAIT_SEC * 1000;
795 }
796 }
797 void stop_process() override {
798 sync.stop();
799 }
800public:
801 RGWDataSyncProcessorThread(RGWRados *_store, RGWAsyncRadosProcessor *async_rados,
81eedcae 802 const RGWZone* source_zone)
b32b8144 803 : RGWSyncProcessorThread(_store, "data-sync"),
81eedcae
TL
804 counters(sync_counters::build(store->ctx(), std::string("data-sync-from-") + source_zone->name)),
805 sync(_store, async_rados, source_zone->id, counters.get()),
7c673cae
FG
806 initialized(false) {}
807
808 void wakeup_sync_shards(map<int, set<string> >& shard_ids) {
809 for (map<int, set<string> >::iterator iter = shard_ids.begin(); iter != shard_ids.end(); ++iter) {
810 sync.wakeup(iter->first, iter->second);
811 }
812 }
813 RGWDataSyncStatusManager* get_manager() { return &sync; }
814
815 int init() override {
816 return 0;
817 }
818
819 int process() override {
820 while (!initialized) {
821 if (going_down()) {
822 return 0;
823 }
824 int ret = sync.init();
825 if (ret >= 0) {
826 initialized = true;
827 break;
828 }
829 /* we'll be back! */
830 return 0;
831 }
832 sync.run();
833 return 0;
834 }
835};
836
11fdf7f2 837class RGWSyncLogTrimThread : public RGWSyncProcessorThread, DoutPrefixProvider
7c673cae
FG
838{
839 RGWCoroutinesManager crs;
840 RGWRados *store;
b32b8144 841 rgw::BucketTrimManager *bucket_trim;
7c673cae
FG
842 RGWHTTPManager http;
843 const utime_t trim_interval;
844
845 uint64_t interval_msec() override { return 0; }
846 void stop_process() override { crs.stop(); }
847public:
b32b8144
FG
848 RGWSyncLogTrimThread(RGWRados *store, rgw::BucketTrimManager *bucket_trim,
849 int interval)
7c673cae
FG
850 : RGWSyncProcessorThread(store, "sync-log-trim"),
851 crs(store->ctx(), store->get_cr_registry()), store(store),
b32b8144 852 bucket_trim(bucket_trim),
7c673cae
FG
853 http(store->ctx(), crs.get_completion_mgr()),
854 trim_interval(interval, 0)
855 {}
856
857 int init() override {
11fdf7f2 858 return http.start();
7c673cae
FG
859 }
860 int process() override {
861 list<RGWCoroutinesStack*> stacks;
862 auto meta = new RGWCoroutinesStack(store->ctx(), &crs);
11fdf7f2 863 meta->call(create_meta_log_trim_cr(this, store, &http,
7c673cae
FG
864 cct->_conf->rgw_md_log_max_shards,
865 trim_interval));
866 stacks.push_back(meta);
867
868 auto data = new RGWCoroutinesStack(store->ctx(), &crs);
869 data->call(create_data_log_trim_cr(store, &http,
870 cct->_conf->rgw_data_log_num_shards,
871 trim_interval));
872 stacks.push_back(data);
873
b32b8144
FG
874 auto bucket = new RGWCoroutinesStack(store->ctx(), &crs);
875 bucket->call(bucket_trim->create_bucket_trim_cr(&http));
876 stacks.push_back(bucket);
877
7c673cae
FG
878 crs.run(stacks);
879 return 0;
880 }
11fdf7f2
TL
881
882 // implements DoutPrefixProvider
883 CephContext *get_cct() const override { return store->ctx(); }
884 unsigned get_subsys() const
885 {
886 return dout_subsys;
887 }
888
889 std::ostream& gen_prefix(std::ostream& out) const
890 {
891 return out << "sync log trim: ";
892 }
893
7c673cae
FG
894};
895
896void RGWRados::wakeup_meta_sync_shards(set<int>& shard_ids)
897{
898 Mutex::Locker l(meta_sync_thread_lock);
899 if (meta_sync_processor_thread) {
900 meta_sync_processor_thread->wakeup_sync_shards(shard_ids);
901 }
902}
903
904void RGWRados::wakeup_data_sync_shards(const string& source_zone, map<int, set<string> >& shard_ids)
905{
906 ldout(ctx(), 20) << __func__ << ": source_zone=" << source_zone << ", shard_ids=" << shard_ids << dendl;
907 Mutex::Locker l(data_sync_thread_lock);
908 map<string, RGWDataSyncProcessorThread *>::iterator iter = data_sync_processor_threads.find(source_zone);
909 if (iter == data_sync_processor_threads.end()) {
910 ldout(ctx(), 10) << __func__ << ": couldn't find sync thread for zone " << source_zone << ", skipping async data sync processing" << dendl;
911 return;
912 }
913
914 RGWDataSyncProcessorThread *thread = iter->second;
11fdf7f2 915 ceph_assert(thread);
7c673cae
FG
916 thread->wakeup_sync_shards(shard_ids);
917}
918
919RGWMetaSyncStatusManager* RGWRados::get_meta_sync_manager()
920{
921 Mutex::Locker l(meta_sync_thread_lock);
922 if (meta_sync_processor_thread) {
923 return meta_sync_processor_thread->get_manager();
924 }
925 return nullptr;
926}
927
928RGWDataSyncStatusManager* RGWRados::get_data_sync_manager(const std::string& source_zone)
929{
930 Mutex::Locker l(data_sync_thread_lock);
931 auto thread = data_sync_processor_threads.find(source_zone);
932 if (thread == data_sync_processor_threads.end()) {
933 return nullptr;
934 }
935 return thread->second->get_manager();
936}
937
938int RGWRados::get_required_alignment(const rgw_pool& pool, uint64_t *alignment)
939{
940 IoCtx ioctx;
494da23a 941 int r = open_pool_ctx(pool, ioctx, false);
7c673cae
FG
942 if (r < 0) {
943 ldout(cct, 0) << "ERROR: open_pool_ctx() returned " << r << dendl;
944 return r;
945 }
946
947 bool requires;
948 r = ioctx.pool_requires_alignment2(&requires);
949 if (r < 0) {
950 ldout(cct, 0) << "ERROR: ioctx.pool_requires_alignment2() returned "
951 << r << dendl;
952 return r;
953 }
954
955 if (!requires) {
956 *alignment = 0;
957 return 0;
958 }
959
960 uint64_t align;
961 r = ioctx.pool_required_alignment2(&align);
962 if (r < 0) {
963 ldout(cct, 0) << "ERROR: ioctx.pool_required_alignment2() returned "
964 << r << dendl;
965 return r;
966 }
967 if (align != 0) {
968 ldout(cct, 20) << "required alignment=" << align << dendl;
969 }
970 *alignment = align;
971 return 0;
972}
973
11fdf7f2
TL
974void RGWRados::get_max_aligned_size(uint64_t size, uint64_t alignment, uint64_t *max_size)
975{
976 if (alignment == 0) {
977 *max_size = size;
978 return;
979 }
980
981 if (size <= alignment) {
982 *max_size = alignment;
983 return;
984 }
985
986 *max_size = size - (size % alignment);
987}
988
989int RGWRados::get_max_chunk_size(const rgw_pool& pool, uint64_t *max_chunk_size, uint64_t *palignment)
7c673cae 990{
11fdf7f2 991 uint64_t alignment;
7c673cae
FG
992 int r = get_required_alignment(pool, &alignment);
993 if (r < 0) {
994 return r;
995 }
996
11fdf7f2
TL
997 if (palignment) {
998 *palignment = alignment;
7c673cae
FG
999 }
1000
11fdf7f2 1001 uint64_t config_chunk_size = cct->_conf->rgw_max_chunk_size;
7c673cae 1002
11fdf7f2 1003 get_max_aligned_size(config_chunk_size, alignment, max_chunk_size);
7c673cae
FG
1004
1005 ldout(cct, 20) << "max_chunk_size=" << *max_chunk_size << dendl;
1006
1007 return 0;
1008}
1009
11fdf7f2
TL
1010int RGWRados::get_max_chunk_size(const rgw_placement_rule& placement_rule, const rgw_obj& obj,
1011 uint64_t *max_chunk_size, uint64_t *palignment)
7c673cae
FG
1012{
1013 rgw_pool pool;
1014 if (!get_obj_data_pool(placement_rule, obj, &pool)) {
1015 ldout(cct, 0) << "ERROR: failed to get data pool for object " << obj << dendl;
1016 return -EIO;
1017 }
11fdf7f2 1018 return get_max_chunk_size(pool, max_chunk_size, palignment);
7c673cae
FG
1019}
1020
31f18b77
FG
1021class RGWIndexCompletionManager;
1022
1023struct complete_op_data {
1024 Mutex lock{"complete_op_data"};
1025 AioCompletion *rados_completion{nullptr};
1026 int manager_shard_id{-1};
1027 RGWIndexCompletionManager *manager{nullptr};
1028 rgw_obj obj;
1029 RGWModifyOp op;
1030 string tag;
1031 rgw_bucket_entry_ver ver;
1032 cls_rgw_obj_key key;
1033 rgw_bucket_dir_entry_meta dir_meta;
1034 list<cls_rgw_obj_key> remove_objs;
1035 bool log_op;
1036 uint16_t bilog_op;
1037 rgw_zone_set zones_trace;
1038
1039 bool stopped{false};
1040
1041 void stop() {
1042 Mutex::Locker l(lock);
1043 stopped = true;
1044 }
1045};
1046
1047class RGWIndexCompletionThread : public RGWRadosThread {
1048 RGWRados *store;
1049
1050 uint64_t interval_msec() override {
1051 return 0;
1052 }
1053
1054 list<complete_op_data *> completions;
1055
1056 Mutex completions_lock;
1057public:
1058 RGWIndexCompletionThread(RGWRados *_store)
1059 : RGWRadosThread(_store, "index-complete"), store(_store), completions_lock("RGWIndexCompletionThread::completions_lock") {}
1060
1061 int process() override;
1062
1063 void add_completion(complete_op_data *completion) {
1064 {
1065 Mutex::Locker l(completions_lock);
1066 completions.push_back(completion);
1067 }
1068
1069 signal();
1070 }
1071};
1072
1073int RGWIndexCompletionThread::process()
1074{
1075 list<complete_op_data *> comps;
1076
1077 {
1078 Mutex::Locker l(completions_lock);
1079 completions.swap(comps);
1080 }
1081
1082 for (auto c : comps) {
1083 std::unique_ptr<complete_op_data> up{c};
1084
1085 if (going_down()) {
1086 continue;
1087 }
1088 ldout(store->ctx(), 20) << __func__ << "(): handling completion for key=" << c->key << dendl;
1089
1090 RGWRados::BucketShard bs(store);
f64942e4 1091 RGWBucketInfo bucket_info;
31f18b77 1092
f64942e4 1093 int r = bs.init(c->obj.bucket, c->obj, &bucket_info);
31f18b77
FG
1094 if (r < 0) {
1095 ldout(cct, 0) << "ERROR: " << __func__ << "(): failed to initialize BucketShard, obj=" << c->obj << " r=" << r << dendl;
1096 /* not much to do */
1097 continue;
1098 }
1099
f64942e4
AA
1100 r = store->guard_reshard(&bs, c->obj, bucket_info,
1101 [&](RGWRados::BucketShard *bs) -> int {
1102 librados::ObjectWriteOperation o;
1103 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
1104 cls_rgw_bucket_complete_op(o, c->op, c->tag, c->ver, c->key, c->dir_meta, &c->remove_objs,
1105 c->log_op, c->bilog_op, &c->zones_trace);
1106 return bs->index_ctx.operate(bs->bucket_obj, &o);
31f18b77
FG
1107 });
1108 if (r < 0) {
1109 ldout(cct, 0) << "ERROR: " << __func__ << "(): bucket index completion failed, obj=" << c->obj << " r=" << r << dendl;
1110 /* ignoring error, can't do anything about it */
1111 continue;
1112 }
1113 r = store->data_log->add_entry(bs.bucket, bs.shard_id);
1114 if (r < 0) {
1115 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
1116 }
1117 }
1118
1119 return 0;
1120}
1121
1122class RGWIndexCompletionManager {
1123 RGWRados *store{nullptr};
1124 vector<Mutex *> locks;
1125 vector<set<complete_op_data *> > completions;
1126
1127 RGWIndexCompletionThread *completion_thread{nullptr};
1128
1129 int num_shards;
1130
1131 std::atomic<int> cur_shard {0};
1132
1133
1134public:
1135 RGWIndexCompletionManager(RGWRados *_store) : store(_store) {
1136 num_shards = store->ctx()->_conf->rgw_thread_pool_size;
1137
1138 for (int i = 0; i < num_shards; i++) {
1139 char buf[64];
1140 snprintf(buf, sizeof(buf), "RGWIndexCompletionManager::lock::%d", i);
1141 locks.push_back(new Mutex(buf));
1142 }
1143
1144 completions.resize(num_shards);
1145 }
1146 ~RGWIndexCompletionManager() {
1147 stop();
1148
1149 for (auto l : locks) {
1150 delete l;
1151 }
1152 }
1153
1154 int next_shard() {
1155 int result = cur_shard % num_shards;
1156 cur_shard++;
1157 return result;
1158 }
1159
1160 void create_completion(const rgw_obj& obj,
1161 RGWModifyOp op, string& tag,
1162 rgw_bucket_entry_ver& ver,
1163 const cls_rgw_obj_key& key,
1164 rgw_bucket_dir_entry_meta& dir_meta,
1165 list<cls_rgw_obj_key> *remove_objs, bool log_op,
1166 uint16_t bilog_op,
1167 rgw_zone_set *zones_trace,
1168 complete_op_data **result);
1169 bool handle_completion(completion_t cb, complete_op_data *arg);
1170
1171 int start() {
1172 completion_thread = new RGWIndexCompletionThread(store);
1173 int ret = completion_thread->init();
1174 if (ret < 0) {
1175 return ret;
1176 }
1177 completion_thread->start();
1178 return 0;
1179 }
1180 void stop() {
1181 if (completion_thread) {
1182 completion_thread->stop();
1183 delete completion_thread;
1184 }
1185
1186 for (int i = 0; i < num_shards; ++i) {
1187 Mutex::Locker l(*locks[i]);
1188 for (auto c : completions[i]) {
31f18b77
FG
1189 c->stop();
1190 }
1191 }
1192 completions.clear();
1193 }
1194};
1195
/*
 * Callback attached to the rados completion created in
 * RGWIndexCompletionManager::create_completion(); arg is the
 * complete_op_data entry.
 *
 * A stopped entry is freed right away. Otherwise the manager decides, via
 * handle_completion(), whether the entry is finished (freed here) or has
 * been handed to the completion thread (not freed here).
 */
static void obj_complete_cb(completion_t cb, void *arg)
{
  complete_op_data *completion = (complete_op_data *)arg;
  completion->lock.Lock();
  if (completion->stopped) {
    completion->lock.Unlock(); /* can drop lock, no one else is referencing us */
    delete completion;
    return;
  }
  bool need_delete = completion->manager->handle_completion(cb, completion);
  /* unlock before the delete below so the mutex is not destroyed while held */
  completion->lock.Unlock();
  if (need_delete) {
    delete completion;
  }
}
1211
1212
1213void RGWIndexCompletionManager::create_completion(const rgw_obj& obj,
1214 RGWModifyOp op, string& tag,
1215 rgw_bucket_entry_ver& ver,
1216 const cls_rgw_obj_key& key,
1217 rgw_bucket_dir_entry_meta& dir_meta,
1218 list<cls_rgw_obj_key> *remove_objs, bool log_op,
1219 uint16_t bilog_op,
1220 rgw_zone_set *zones_trace,
1221 complete_op_data **result)
1222{
1223 complete_op_data *entry = new complete_op_data;
1224
1225 int shard_id = next_shard();
1226
1227 entry->manager_shard_id = shard_id;
1228 entry->manager = this;
1229 entry->obj = obj;
1230 entry->op = op;
1231 entry->tag = tag;
1232 entry->ver = ver;
1233 entry->key = key;
1234 entry->dir_meta = dir_meta;
1235 entry->log_op = log_op;
1236 entry->bilog_op = bilog_op;
1237
1238 if (remove_objs) {
1239 for (auto iter = remove_objs->begin(); iter != remove_objs->end(); ++iter) {
1240 entry->remove_objs.push_back(*iter);
1241 }
1242 }
1243
1244 if (zones_trace) {
1245 entry->zones_trace = *zones_trace;
1246 } else {
11fdf7f2 1247 entry->zones_trace.insert(store->svc.zone->get_zone().id);
31f18b77
FG
1248 }
1249
1250 *result = entry;
1251
1252 entry->rados_completion = librados::Rados::aio_create_completion(entry, NULL, obj_complete_cb);
1253
1254 Mutex::Locker l(*locks[shard_id]);
1255 completions[shard_id].insert(entry);
1256}
1257
1258bool RGWIndexCompletionManager::handle_completion(completion_t cb, complete_op_data *arg)
1259{
1260 int shard_id = arg->manager_shard_id;
1261 {
1262 Mutex::Locker l(*locks[shard_id]);
1263
1264 auto& comps = completions[shard_id];
1265
1266 auto iter = comps.find(arg);
1267 if (iter == comps.end()) {
1268 return true;
1269 }
1270
1271 comps.erase(iter);
1272 }
1273
1274 int r = rados_aio_get_return_value(cb);
1275 if (r != -ERR_BUSY_RESHARDING) {
1276 return true;
1277 }
1278 completion_thread->add_completion(arg);
1279 return false;
1280}
1281
7c673cae
FG
/**
 * Tear down what init_rados()/init_complete() set up: stop the background
 * threads, then delete the helper objects and shut down the services.
 */
void RGWRados::finalize()
{
  cct->get_admin_socket()->unregister_commands(this);
  /* first pass over the sync threads: only stop them; they are deleted further down */
  if (run_sync_thread) {
    Mutex::Locker l(meta_sync_thread_lock);
    meta_sync_processor_thread->stop();

    Mutex::Locker dl(data_sync_thread_lock);
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      thread->stop();
    }
    if (sync_log_trimmer) {
      sync_log_trimmer->stop();
    }
  }
  if (async_rados) {
    async_rados->stop();
  }
  /* second pass: the sync threads were stopped above, now free them */
  if (run_sync_thread) {
    delete meta_sync_processor_thread;
    meta_sync_processor_thread = NULL;
    Mutex::Locker dl(data_sync_thread_lock);
    for (auto iter : data_sync_processor_threads) {
      RGWDataSyncProcessorThread *thread = iter.second;
      delete thread;
    }
    data_sync_processor_threads.clear();
    delete sync_log_trimmer;
    sync_log_trimmer = nullptr;
    bucket_trim = boost::none;
  }
  if (meta_notifier) {
    meta_notifier->stop();
    delete meta_notifier;
  }
  if (data_notifier) {
    data_notifier->stop();
    delete data_notifier;
  }
  delete data_log;
  delete sync_tracer;
  if (async_rados) {
    delete async_rados;
  }

  delete lc;
  lc = NULL;

  delete gc;
  gc = NULL;

  delete obj_expirer;
  obj_expirer = NULL;

  RGWQuotaHandler::free_handler(quota_handler);
  if (cr_registry) {
    /* released with put() rather than delete */
    cr_registry->put();
  }

  svc.shutdown();

  delete meta_mgr;
  delete binfo_cache;
  delete obj_tombstone_cache;

  if (reshard_wait.get()) {
    reshard_wait->stop();
    reshard_wait.reset();
  }

  if (run_reshard_thread) {
    reshard->stop_processor();
  }
  delete reshard;
  delete index_completion_manager;
}
1359
1360/**
1361 * Initialize the RADOS instance and prepare to do other ops
1362 * Returns 0 on success, -ERR# on failure.
1363 */
1364int RGWRados::init_rados()
1365{
1366 int ret = 0;
1367 auto admin_socket = cct->get_admin_socket();
1368 for (auto cmd : admin_commands) {
1369 int r = admin_socket->register_command(cmd[0], cmd[1], this,
1370 cmd[2]);
1371 if (r < 0) {
1372 lderr(cct) << "ERROR: fail to register admin socket command (r=" << r
1373 << ")" << dendl;
1374 return r;
1375 }
7c673cae 1376 }
7c673cae 1377
494da23a
TL
1378 ret = rados.init_with_context(cct);
1379 if (ret < 0) {
1380 return ret;
1381 }
1382 ret = rados.connect();
1383 if (ret < 0) {
1384 return ret;
7c673cae 1385 }
11fdf7f2
TL
1386
1387 auto crs = std::unique_ptr<RGWCoroutinesManagerRegistry>{
1388 new RGWCoroutinesManagerRegistry(cct)};
1389 ret = crs->hook_to_admin_command("cr dump");
1390 if (ret < 0) {
1391 return ret;
7c673cae
FG
1392 }
1393
11fdf7f2
TL
1394 meta_mgr = new RGWMetadataManager(cct, this);
1395 data_log = new RGWDataChangesLog(cct, this);
1396 cr_registry = crs.release();
11fdf7f2 1397 return ret;
7c673cae
FG
1398}
1399
11fdf7f2 1400int RGWRados::register_to_service_map(const string& daemon_type, const map<string, string>& meta)
7c673cae 1401{
11fdf7f2 1402 map<string,string> metadata = meta;
494da23a 1403 metadata["num_handles"] = "1"s;
11fdf7f2
TL
1404 metadata["zonegroup_id"] = svc.zone->get_zonegroup().get_id();
1405 metadata["zonegroup_name"] = svc.zone->get_zonegroup().get_name();
1406 metadata["zone_name"] = svc.zone->zone_name();
1407 metadata["zone_id"] = svc.zone->zone_id();
1408 string name = cct->_conf->name.get_id();
1409 if (name.compare(0, 4, "rgw.") == 0) {
1410 name = name.substr(4);
7c673cae 1411 }
494da23a 1412 int ret = rados.service_daemon_register(daemon_type, name, metadata);
11fdf7f2
TL
1413 if (ret < 0) {
1414 ldout(cct, 0) << "ERROR: service_daemon_register() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1415 return ret;
7c673cae
FG
1416 }
1417
1418 return 0;
1419}
1420
11fdf7f2 1421int RGWRados::update_service_map(std::map<std::string, std::string>&& status)
7c673cae 1422{
494da23a 1423 int ret = rados.service_daemon_update_status(move(status));
11fdf7f2
TL
1424 if (ret < 0) {
1425 ldout(cct, 0) << "ERROR: service_daemon_update_status() returned ret=" << ret << ": " << cpp_strerror(-ret) << dendl;
1426 return ret;
1427 }
1428
1429 return 0;
7c673cae
FG
1430}
1431
1432/**
1433 * Initialize the RADOS instance and prepare to do other ops
1434 * Returns 0 on success, -ERR# on failure.
1435 */
int RGWRados::init_complete()
{
  int ret;

  /*
   * create sync module instance even if we don't run sync thread, might need it for radosgw-admin
   */
  auto& zone_public_config = svc.zone->get_zone();
  ret = svc.sync_modules->get_manager()->create_instance(cct, zone_public_config.tier_type, svc.zone->get_zone_params().tier_config, &sync_module);
  if (ret < 0) {
    lderr(cct) << "ERROR: failed to init sync module instance, ret=" << ret << dendl;
    if (ret == -ENOENT) {
      /* unknown tier type: list the registered modules to help the operator */
      lderr(cct) << "ERROR: " << zone_public_config.tier_type
                 << " sync module does not exist. valid sync modules: "
                 << svc.sync_modules->get_manager()->get_registered_module_names()
                 << dendl;
    }
    return ret;
  }

  period_puller.reset(new RGWPeriodPuller(this));
  period_history.reset(new RGWPeriodHistory(cct, period_puller.get(),
                                            svc.zone->get_current_period()));

  /* open the fixed pools; any failure aborts initialization */
  ret = open_root_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_gc_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_lc_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_objexp_pool_ctx();
  if (ret < 0)
    return ret;

  ret = open_reshard_pool_ctx();
  if (ret < 0)
    return ret;

  pools_initialized = true;

  /* gc and the object expirer are always created; their processors start only with use_gc_thread */
  gc = new RGWGC();
  gc->initialize(cct, this);

  obj_expirer = new RGWObjectExpirer(this);

  if (use_gc_thread) {
    gc->start_processor();
    obj_expirer->start_processor();
  }

  auto& current_period = svc.zone->get_current_period();
  auto& zonegroup = svc.zone->get_zonegroup();
  auto& zone_params = svc.zone->get_zone_params();
  auto& zone = svc.zone->get_zone();

  /* no point of running sync thread if we don't have a master zone configured
    or there is no rest_master_conn */
  if (zonegroup.master_zone.empty() || !svc.zone->get_master_conn()
      || current_period.get_id().empty()) {
    run_sync_thread = false;
  }

  if (run_sync_thread) {
    // initialize the log period history
    meta_mgr->init_oldest_log_period();
  }

  async_rados = new RGWAsyncRadosProcessor(this, cct->_conf->rgw_num_async_rados_threads);
  async_rados->start();

  ret = meta_mgr->init(current_period.get_id());
  if (ret < 0) {
    lderr(cct) << "ERROR: failed to initialize metadata log: "
               << cpp_strerror(-ret) << dendl;
    return ret;
  }

  /* the metadata notifier is only created on the metadata master */
  if (svc.zone->is_meta_master()) {
    auto md_log = meta_mgr->get_log(current_period.get_id());
    meta_notifier = new RGWMetaNotifier(this, md_log);
    meta_notifier->start();
  }

  /* init it anyway, might run sync through radosgw-admin explicitly */
  sync_tracer = new RGWSyncTraceManager(cct, cct->_conf->rgw_sync_trace_history_size);
  sync_tracer->init(this);
  ret = sync_tracer->hook_to_admin_command();
  if (ret < 0) {
    return ret;
  }

  if (run_sync_thread) {
    /* warn about zonegroup placement targets that this zone has no pools for */
    for (const auto &pt: zonegroup.placement_targets) {
      if (zone_params.placement_pools.find(pt.second.name)
          == zone_params.placement_pools.end()){
        ldout(cct, 0) << "WARNING: This zone does not contain the placement target "
                      << pt.second.name << " present in zonegroup" << dendl;
      }
    }
    Mutex::Locker l(meta_sync_thread_lock);
    meta_sync_processor_thread = new RGWMetaSyncProcessorThread(this, async_rados);
    ret = meta_sync_processor_thread->init();
    if (ret < 0) {
      ldout(cct, 0) << "ERROR: failed to initialize meta sync thread" << dendl;
      return ret;
    }
    meta_sync_processor_thread->start();

    // configure the bucket trim manager
    rgw::BucketTrimConfig config;
    rgw::configure_bucket_trim(cct, config);

    bucket_trim.emplace(this, config);
    ret = bucket_trim->init();
    if (ret < 0) {
      ldout(cct, 0) << "ERROR: failed to start bucket trim manager" << dendl;
      return ret;
    }
    data_log->set_observer(&*bucket_trim);

    /* one data sync thread per source zone, stored by zone id */
    Mutex::Locker dl(data_sync_thread_lock);
    for (auto source_zone : svc.zone->get_data_sync_source_zones()) {
      ldout(cct, 5) << "starting data sync thread for zone " << source_zone->name << dendl;
      auto *thread = new RGWDataSyncProcessorThread(this, async_rados, source_zone);
      ret = thread->init();
      if (ret < 0) {
        /* NOTE(review): 'thread' is not yet stored in the map here, so it is not freed on this path */
        ldout(cct, 0) << "ERROR: failed to initialize data sync thread" << dendl;
        return ret;
      }
      thread->start();
      data_sync_processor_threads[source_zone->id] = thread;
    }
    /* the log trimmer is optional: a non-positive interval disables it */
    auto interval = cct->_conf->rgw_sync_log_trim_interval;
    if (interval > 0) {
      sync_log_trimmer = new RGWSyncLogTrimThread(this, &*bucket_trim, interval);
      ret = sync_log_trimmer->init();
      if (ret < 0) {
        ldout(cct, 0) << "ERROR: failed to initialize sync log trim thread" << dendl;
        return ret;
      }
      sync_log_trimmer->start();
    }
  }
  data_notifier = new RGWDataNotifier(this);
  data_notifier->start();

  lc = new RGWLC();
  lc->initialize(cct, this);

  if (use_lc_thread)
    lc->start_processor();

  quota_handler = RGWQuotaHandler::generate_handler(this, quota_threads);

  /* a non-zero config override wins over the zone setting; the result is capped below */
  bucket_index_max_shards = (cct->_conf->rgw_override_bucket_index_max_shards ? cct->_conf->rgw_override_bucket_index_max_shards :
                             zone.bucket_index_max_shards);
  if (bucket_index_max_shards > get_max_bucket_shards()) {
    bucket_index_max_shards = get_max_bucket_shards();
    ldout(cct, 1) << __func__ << " bucket index max shards is too large, reset to value: "
                  << get_max_bucket_shards() << dendl;
  }
  ldout(cct, 20) << __func__ << " bucket index max shards: " << bucket_index_max_shards << dendl;

  binfo_cache = new RGWChainedCacheImpl<bucket_info_entry>;
  binfo_cache->init(svc.cache);

  bool need_tombstone_cache = !svc.zone->get_zone_data_notify_to_map().empty(); /* have zones syncing from us */

  if (need_tombstone_cache) {
    obj_tombstone_cache = new tombstone_cache_t(cct->_conf->rgw_obj_tombstone_cache_size);
  }

  reshard_wait = std::make_shared<RGWReshardWait>();

  reshard = new RGWReshard(this);

  /* only the master zone in the zonegroup reshards buckets */
  run_reshard_thread = run_reshard_thread && (zonegroup.master_zone == zone.id);
  if (run_reshard_thread) {
    reshard->start_processor();
  }

  /* created last; its start() result is what this function returns */
  index_completion_manager = new RGWIndexCompletionManager(this);
  ret = index_completion_manager->start();

  return ret;
}
1629
11fdf7f2
TL
1630int RGWRados::init_svc(bool raw)
1631{
1632 if (raw) {
1633 return svc.init_raw(cct, use_cache);
1634 }
1635
1636 return svc.init(cct, use_cache);
1637}
1638
7c673cae
FG
1639/**
1640 * Initialize the RADOS instance and prepare to do other ops
1641 * Returns 0 on success, -ERR# on failure.
1642 */
1643int RGWRados::initialize()
1644{
1645 int ret;
1646
11fdf7f2
TL
1647 inject_notify_timeout_probability =
1648 cct->_conf.get_val<double>("rgw_inject_notify_timeout_probability");
1649 max_notify_retries = cct->_conf.get_val<uint64_t>("rgw_max_notify_retries");
7c673cae 1650
11fdf7f2 1651 ret = init_svc(false);
7c673cae 1652 if (ret < 0) {
11fdf7f2 1653 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
7c673cae
FG
1654 return ret;
1655 }
7c673cae 1656
11fdf7f2 1657 host_id = svc.zone_utils->gen_host_id();
7c673cae 1658
11fdf7f2
TL
1659 ret = init_rados();
1660 if (ret < 0)
1661 return ret;
1662
1663 return init_complete();
7c673cae
FG
1664}
1665
1666/**
1667 * Open the pool used as root for this gateway
1668 * Returns: 0 on success, -ERR# otherwise.
1669 */
1670int RGWRados::open_root_pool_ctx()
1671{
494da23a 1672 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().domain_root, root_pool_ctx, true, true);
7c673cae
FG
1673}
1674
1675int RGWRados::open_gc_pool_ctx()
1676{
494da23a 1677 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().gc_pool, gc_pool_ctx, true, true);
7c673cae
FG
1678}
1679
1680int RGWRados::open_lc_pool_ctx()
1681{
494da23a 1682 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().lc_pool, lc_pool_ctx, true, true);
7c673cae
FG
1683}
1684
1685int RGWRados::open_objexp_pool_ctx()
1686{
494da23a 1687 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, objexp_pool_ctx, true, true);
7c673cae
FG
1688}
1689
31f18b77
FG
1690int RGWRados::open_reshard_pool_ctx()
1691{
494da23a 1692 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().reshard_pool, reshard_pool_ctx, true, true);
7c673cae
FG
1693}
1694
494da23a
TL
1695int RGWRados::open_pool_ctx(const rgw_pool& pool, librados::IoCtx& io_ctx,
1696 bool mostly_omap)
7c673cae 1697{
28e407b8 1698 constexpr bool create = true; // create the pool if it doesn't exist
494da23a 1699 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create, mostly_omap);
7c673cae
FG
1700}
1701
1702void RGWRados::build_bucket_index_marker(const string& shard_id_str, const string& shard_marker,
1703 string *marker) {
1704 if (marker) {
1705 *marker = shard_id_str;
1706 marker->append(BucketIndexShardsManager::KEY_VALUE_SEPARATOR);
1707 marker->append(shard_marker);
1708 }
1709}
1710
1711int RGWRados::open_bucket_index_ctx(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx)
1712{
3a9019d9
FG
1713 const rgw_pool& explicit_pool = bucket_info.bucket.explicit_placement.index_pool;
1714
1715 if (!explicit_pool.empty()) {
494da23a 1716 return open_pool_ctx(explicit_pool, index_ctx, false);
3a9019d9
FG
1717 }
1718
11fdf7f2
TL
1719 auto& zonegroup = svc.zone->get_zonegroup();
1720 auto& zone_params = svc.zone->get_zone_params();
1721
1722 const rgw_placement_rule *rule = &bucket_info.placement_rule;
7c673cae
FG
1723 if (rule->empty()) {
1724 rule = &zonegroup.default_placement;
1725 }
11fdf7f2 1726 auto iter = zone_params.placement_pools.find(rule->name);
7c673cae
FG
1727 if (iter == zone_params.placement_pools.end()) {
1728 ldout(cct, 0) << "could not find placement rule " << *rule << " within zonegroup " << dendl;
1729 return -EINVAL;
1730 }
1731
494da23a 1732 int r = open_pool_ctx(iter->second.index_pool, index_ctx, true);
7c673cae
FG
1733 if (r < 0)
1734 return r;
1735
1736 return 0;
1737}
1738
7c673cae
FG
1739/**** logs ****/
1740
/* Iteration state behind the handle used by log_list_init()/log_list_next(). */
struct log_list_state {
  string prefix;                    /* only object names starting with this are reported; empty = all */
  librados::IoCtx io_ctx;           /* io context opened on the zone's log pool */
  librados::NObjectIterator obit;   /* current position in the pool's object listing */
};
1746
1747int RGWRados::log_list_init(const string& prefix, RGWAccessHandle *handle)
1748{
1749 log_list_state *state = new log_list_state;
11fdf7f2 1750 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1751 if (r < 0) {
1752 delete state;
1753 return r;
1754 }
1755 state->prefix = prefix;
1756 state->obit = state->io_ctx.nobjects_begin();
1757 *handle = (RGWAccessHandle)state;
1758 return 0;
1759}
1760
1761int RGWRados::log_list_next(RGWAccessHandle handle, string *name)
1762{
1763 log_list_state *state = static_cast<log_list_state *>(handle);
1764 while (true) {
1765 if (state->obit == state->io_ctx.nobjects_end()) {
1766 delete state;
1767 return -ENOENT;
1768 }
1769 if (state->prefix.length() &&
1770 state->obit->get_oid().find(state->prefix) != 0) {
1771 state->obit++;
1772 continue;
1773 }
1774 *name = state->obit->get_oid();
1775 state->obit++;
1776 break;
1777 }
1778 return 0;
1779}
1780
1781int RGWRados::log_remove(const string& name)
1782{
1783 librados::IoCtx io_ctx;
11fdf7f2 1784 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
1785 if (r < 0)
1786 return r;
1787 return io_ctx.remove(name);
1788}
1789
1790struct log_show_state {
1791 librados::IoCtx io_ctx;
1792 bufferlist bl;
11fdf7f2 1793 bufferlist::const_iterator p;
7c673cae
FG
1794 string name;
1795 uint64_t pos;
1796 bool eof;
1797 log_show_state() : pos(0), eof(false) {}
1798};
1799
1800int RGWRados::log_show_init(const string& name, RGWAccessHandle *handle)
1801{
1802 log_show_state *state = new log_show_state;
11fdf7f2 1803 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, state->io_ctx);
7c673cae
FG
1804 if (r < 0) {
1805 delete state;
1806 return r;
1807 }
1808 state->name = name;
1809 *handle = (RGWAccessHandle)state;
1810 return 0;
1811}
1812
/**
 * Decode the next entry of the log object opened with log_show_init().
 *
 * Data is read from the object in chunks of up to 1 MiB and buffered in the
 * handle; a refill happens when less than half a chunk is left undecoded.
 *
 * @param handle  handle from log_show_init()
 * @param entry   [out] decoded entry
 * @return 1 when an entry was decoded, 0 at end of file, -EINVAL on a
 *         buffer/decode error, or the negative error from the read
 */
int RGWRados::log_show_next(RGWAccessHandle handle, rgw_log_entry *entry)
{
  log_show_state *state = static_cast<log_show_state *>(handle);
  off_t off = state->p.get_off();

  ldout(cct, 10) << "log_show_next pos " << state->pos << " bl " << state->bl.length()
                 << " off " << off
                 << " eof " << (int)state->eof
                 << dendl;
  // read some?
  unsigned chunk = 1024*1024;
  if ((state->bl.length() - off) < chunk/2 && !state->eof) {
    bufferlist more;
    int r = state->io_ctx.read(state->name, more, chunk, state->pos);
    if (r < 0)
      return r;
    state->pos += r;
    // keep only the undecoded tail of the current buffer...
    bufferlist old;
    try {
      old.substr_of(state->bl, off, state->bl.length() - off);
    } catch (buffer::error& err) {
      return -EINVAL;
    }
    // ...and rebuild the buffer as tail + newly read data, restarting the iterator
    state->bl.clear();
    state->bl.claim(old);
    state->bl.claim_append(more);
    state->p = state->bl.cbegin();
    // a short read means the end of the object was reached
    if ((unsigned)r < chunk)
      state->eof = true;
    ldout(cct, 10) << " read " << r << dendl;
  }

  if (state->p.end())
    return 0; // end of file
  try {
    decode(*entry, state->p);
  }
  catch (const buffer::error &e) {
    return -EINVAL;
  }
  return 1;
}
1855
1856/**
1857 * usage_log_hash: get usage log key hash, based on name and index
1858 *
1859 * Get the usage object name. Since a user may have more than 1
1860 * object holding that info (multiple shards), we use index to
1861 * specify that shard number. Once index exceeds max shards it
1862 * wraps.
1863 * If name is not being set, results for all users will be returned
1864 * and index will wrap only after total shards number.
1865 *
1866 * @param cct [in] ceph context
1867 * @param name [in] user name
1868 * @param hash [out] hash value
1869 * @param index [in] shard index number
1870 */
1871static void usage_log_hash(CephContext *cct, const string& name, string& hash, uint32_t index)
1872{
1873 uint32_t val = index;
1874
1875 if (!name.empty()) {
c07f9fc5 1876 int max_user_shards = cct->_conf->rgw_usage_max_user_shards;
7c673cae
FG
1877 val %= max_user_shards;
1878 val += ceph_str_hash_linux(name.c_str(), name.size());
1879 }
1880 char buf[17];
c07f9fc5 1881 int max_shards = cct->_conf->rgw_usage_max_shards;
7c673cae
FG
1882 snprintf(buf, sizeof(buf), RGW_USAGE_OBJ_PREFIX "%u", (unsigned)(val % max_shards));
1883 hash = buf;
1884}
1885
1886int RGWRados::log_usage(map<rgw_user_bucket, RGWUsageBatch>& usage_info)
1887{
1888 uint32_t index = 0;
1889
1890 map<string, rgw_usage_log_info> log_objs;
1891
1892 string hash;
1893 string last_user;
1894
1895 /* restructure usage map, zone by object hash */
1896 map<rgw_user_bucket, RGWUsageBatch>::iterator iter;
1897 for (iter = usage_info.begin(); iter != usage_info.end(); ++iter) {
1898 const rgw_user_bucket& ub = iter->first;
1899 RGWUsageBatch& info = iter->second;
1900
1901 if (ub.user.empty()) {
1902 ldout(cct, 0) << "WARNING: RGWRados::log_usage(): user name empty (bucket=" << ub.bucket << "), skipping" << dendl;
1903 continue;
1904 }
1905
1906 if (ub.user != last_user) {
1907 /* index *should* be random, but why waste extra cycles
1908 in most cases max user shards is not going to exceed 1,
1909 so just incrementing it */
1910 usage_log_hash(cct, ub.user, hash, index++);
1911 }
1912 last_user = ub.user;
1913 vector<rgw_usage_log_entry>& v = log_objs[hash].entries;
1914
1915 for (auto miter = info.m.begin(); miter != info.m.end(); ++miter) {
1916 v.push_back(miter->second);
1917 }
1918 }
1919
1920 map<string, rgw_usage_log_info>::iterator liter;
1921
1922 for (liter = log_objs.begin(); liter != log_objs.end(); ++liter) {
1923 int r = cls_obj_usage_log_add(liter->first, liter->second);
1924 if (r < 0)
1925 return r;
1926 }
1927 return 0;
1928}
1929
11fdf7f2
TL
1930int RGWRados::read_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch,
1931 uint32_t max_entries, bool *is_truncated, RGWUsageIter& usage_iter, map<rgw_user_bucket,
1932 rgw_usage_log_entry>& usage)
7c673cae
FG
1933{
1934 uint32_t num = max_entries;
1935 string hash, first_hash;
1936 string user_str = user.to_str();
1937 usage_log_hash(cct, user_str, first_hash, 0);
1938
1939 if (usage_iter.index) {
1940 usage_log_hash(cct, user_str, hash, usage_iter.index);
1941 } else {
1942 hash = first_hash;
1943 }
1944
1945 usage.clear();
1946
1947 do {
1948 map<rgw_user_bucket, rgw_usage_log_entry> ret_usage;
1949 map<rgw_user_bucket, rgw_usage_log_entry>::iterator iter;
1950
11fdf7f2 1951 int ret = cls_obj_usage_log_read(hash, user_str, bucket_name, start_epoch, end_epoch, num,
7c673cae
FG
1952 usage_iter.read_iter, ret_usage, is_truncated);
1953 if (ret == -ENOENT)
1954 goto next;
1955
1956 if (ret < 0)
1957 return ret;
1958
1959 num -= ret_usage.size();
1960
1961 for (iter = ret_usage.begin(); iter != ret_usage.end(); ++iter) {
1962 usage[iter->first].aggregate(iter->second);
1963 }
1964
1965next:
1966 if (!*is_truncated) {
1967 usage_iter.read_iter.clear();
1968 usage_log_hash(cct, user_str, hash, ++usage_iter.index);
1969 }
1970 } while (num && !*is_truncated && hash != first_hash);
1971 return 0;
1972}
1973
11fdf7f2 1974int RGWRados::trim_usage(const rgw_user& user, const string& bucket_name, uint64_t start_epoch, uint64_t end_epoch)
7c673cae
FG
1975{
1976 uint32_t index = 0;
1977 string hash, first_hash;
1978 string user_str = user.to_str();
1979 usage_log_hash(cct, user_str, first_hash, index);
1980
1981 hash = first_hash;
7c673cae 1982 do {
11fdf7f2 1983 int ret = cls_obj_usage_log_trim(hash, user_str, bucket_name, start_epoch, end_epoch);
7c673cae 1984
b32b8144 1985 if (ret < 0 && ret != -ENOENT)
7c673cae
FG
1986 return ret;
1987
7c673cae
FG
1988 usage_log_hash(cct, user_str, hash, ++index);
1989 } while (hash != first_hash);
1990
1991 return 0;
1992}
1993
11fdf7f2
TL
1994
1995int RGWRados::clear_usage()
1996{
1997 auto max_shards = cct->_conf->rgw_usage_max_shards;
1998 int ret=0;
1999 for (unsigned i=0; i < max_shards; i++){
2000 string oid = RGW_USAGE_OBJ_PREFIX + to_string(i);
2001 ret = cls_obj_usage_log_clear(oid);
2002 if (ret < 0){
2003 ldout(cct,0) << "usage clear on oid="<< oid << "failed with ret=" << ret << dendl;
2004 return ret;
2005 }
2006 }
2007 return ret;
2008}
2009
7c673cae
FG
/** Map a key to a shard id; thin wrapper that forwards to rgw_shard_id(). */
int RGWRados::key_to_shard_id(const string& key, int max_shards)
{
  return rgw_shard_id(key, max_shards);
}
2014
2015void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& key, string& name, int *shard_id)
2016{
2017 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
2018 char buf[16];
2019 if (shard_id) {
2020 *shard_id = val % max_shards;
2021 }
2022 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
2023 name = prefix + buf;
2024}
2025
2026void RGWRados::shard_name(const string& prefix, unsigned max_shards, const string& section, const string& key, string& name)
2027{
2028 uint32_t val = ceph_str_hash_linux(key.c_str(), key.size());
2029 val ^= ceph_str_hash_linux(section.c_str(), section.size());
2030 char buf[16];
2031 snprintf(buf, sizeof(buf), "%u", (unsigned)(val % max_shards));
2032 name = prefix + buf;
2033}
2034
2035void RGWRados::shard_name(const string& prefix, unsigned shard_id, string& name)
2036{
2037 char buf[16];
2038 snprintf(buf, sizeof(buf), "%u", shard_id);
2039 name = prefix + buf;
2040
2041}
2042
2043void RGWRados::time_log_prepare_entry(cls_log_entry& entry, const real_time& ut, const string& section, const string& key, bufferlist& bl)
2044{
2045 cls_log_add_prepare_entry(entry, utime_t(ut), section, key, bl);
2046}
2047
2048int RGWRados::time_log_add_init(librados::IoCtx& io_ctx)
2049{
11fdf7f2 2050 return rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx, true);
7c673cae
FG
2051
2052}
2053
2054int RGWRados::time_log_add(const string& oid, const real_time& ut, const string& section, const string& key, bufferlist& bl)
2055{
2056 librados::IoCtx io_ctx;
2057
2058 int r = time_log_add_init(io_ctx);
2059 if (r < 0) {
2060 return r;
2061 }
2062
2063 ObjectWriteOperation op;
2064 utime_t t(ut);
2065 cls_log_add(op, t, section, key, bl);
2066
2067 return io_ctx.operate(oid, &op);
2068}
2069
2070int RGWRados::time_log_add(const string& oid, list<cls_log_entry>& entries,
2071 librados::AioCompletion *completion, bool monotonic_inc)
2072{
2073 librados::IoCtx io_ctx;
2074
2075 int r = time_log_add_init(io_ctx);
2076 if (r < 0) {
2077 return r;
2078 }
2079
2080 ObjectWriteOperation op;
2081 cls_log_add(op, entries, monotonic_inc);
2082
2083 if (!completion) {
2084 r = io_ctx.operate(oid, &op);
2085 } else {
2086 r = io_ctx.aio_operate(oid, completion, &op);
2087 }
2088 return r;
2089}
2090
2091int RGWRados::time_log_list(const string& oid, const real_time& start_time, const real_time& end_time,
2092 int max_entries, list<cls_log_entry>& entries,
2093 const string& marker,
2094 string *out_marker,
2095 bool *truncated)
2096{
2097 librados::IoCtx io_ctx;
2098
11fdf7f2 2099 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
2100 if (r < 0)
2101 return r;
2102 librados::ObjectReadOperation op;
2103
2104 utime_t st(start_time);
2105 utime_t et(end_time);
2106
2107 cls_log_list(op, st, et, marker, max_entries, entries,
2108 out_marker, truncated);
2109
2110 bufferlist obl;
2111
2112 int ret = io_ctx.operate(oid, &op, &obl);
2113 if (ret < 0)
2114 return ret;
2115
2116 return 0;
2117}
2118
2119int RGWRados::time_log_info(const string& oid, cls_log_header *header)
2120{
2121 librados::IoCtx io_ctx;
2122
11fdf7f2 2123 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
2124 if (r < 0)
2125 return r;
2126 librados::ObjectReadOperation op;
2127
2128 cls_log_info(op, header);
2129
2130 bufferlist obl;
2131
2132 int ret = io_ctx.operate(oid, &op, &obl);
2133 if (ret < 0)
2134 return ret;
2135
2136 return 0;
2137}
2138
2139int RGWRados::time_log_info_async(librados::IoCtx& io_ctx, const string& oid, cls_log_header *header, librados::AioCompletion *completion)
2140{
11fdf7f2 2141 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
2142 if (r < 0)
2143 return r;
2144
2145 librados::ObjectReadOperation op;
2146
2147 cls_log_info(op, header);
2148
2149 int ret = io_ctx.aio_operate(oid, completion, &op, NULL);
2150 if (ret < 0)
2151 return ret;
2152
2153 return 0;
2154}
2155
2156int RGWRados::time_log_trim(const string& oid, const real_time& start_time, const real_time& end_time,
2157 const string& from_marker, const string& to_marker,
2158 librados::AioCompletion *completion)
2159{
2160 librados::IoCtx io_ctx;
2161
11fdf7f2 2162 int r = rgw_init_ioctx(get_rados_handle(), svc.zone->get_zone_params().log_pool, io_ctx);
7c673cae
FG
2163 if (r < 0)
2164 return r;
2165
2166 utime_t st(start_time);
2167 utime_t et(end_time);
2168
2169 ObjectWriteOperation op;
2170 cls_log_trim(op, st, et, from_marker, to_marker);
2171
2172 if (!completion) {
2173 r = io_ctx.operate(oid, &op);
2174 } else {
2175 r = io_ctx.aio_operate(oid, completion, &op);
2176 }
2177 return r;
2178}
2179
2180string RGWRados::objexp_hint_get_shardname(int shard_num)
2181{
2182 char buf[32];
2183 snprintf(buf, sizeof(buf), "%010u", (unsigned)shard_num);
2184
2185 string objname("obj_delete_at_hint.");
2186 return objname + buf;
2187}
2188
7c673cae
FG
2189int RGWRados::objexp_key_shard(const rgw_obj_index_key& key)
2190{
2191 string obj_key = key.name + key.instance;
2192 int num_shards = cct->_conf->rgw_objexp_hints_num_shards;
1adf2230 2193 return rgw_bucket_shard_index(obj_key, num_shards);
7c673cae
FG
2194}
2195
2196static string objexp_hint_get_keyext(const string& tenant_name,
2197 const string& bucket_name,
2198 const string& bucket_id,
2199 const rgw_obj_key& obj_key)
2200{
2201 return tenant_name + (tenant_name.empty() ? "" : ":") + bucket_name + ":" + bucket_id +
2202 ":" + obj_key.name + ":" + obj_key.instance;
2203}
2204
2205int RGWRados::objexp_hint_add(const ceph::real_time& delete_at,
2206 const string& tenant_name,
2207 const string& bucket_name,
2208 const string& bucket_id,
2209 const rgw_obj_index_key& obj_key)
2210{
2211 const string keyext = objexp_hint_get_keyext(tenant_name, bucket_name,
2212 bucket_id, obj_key);
2213 objexp_hint_entry he = {
2214 .tenant = tenant_name,
2215 .bucket_name = bucket_name,
2216 .bucket_id = bucket_id,
2217 .obj_key = obj_key,
2218 .exp_time = delete_at };
2219 bufferlist hebl;
11fdf7f2 2220 encode(he, hebl);
7c673cae
FG
2221 ObjectWriteOperation op;
2222 cls_timeindex_add(op, utime_t(delete_at), keyext, hebl);
2223
2224 string shard_name = objexp_hint_get_shardname(objexp_key_shard(obj_key));
2225 return objexp_pool_ctx.operate(shard_name, &op);
2226}
2227
2228void RGWRados::objexp_get_shard(int shard_num,
2229 string& shard) /* out */
2230{
2231 shard = objexp_hint_get_shardname(shard_num);
2232}
2233
2234int RGWRados::objexp_hint_list(const string& oid,
2235 const ceph::real_time& start_time,
2236 const ceph::real_time& end_time,
2237 const int max_entries,
2238 const string& marker,
2239 list<cls_timeindex_entry>& entries, /* out */
2240 string *out_marker, /* out */
2241 bool *truncated) /* out */
2242{
2243 librados::ObjectReadOperation op;
2244 cls_timeindex_list(op, utime_t(start_time), utime_t(end_time), marker, max_entries, entries,
2245 out_marker, truncated);
2246
2247 bufferlist obl;
2248 int ret = objexp_pool_ctx.operate(oid, &op, &obl);
2249
2250 if ((ret < 0 ) && (ret != -ENOENT)) {
2251 return ret;
2252 }
2253
2254 if ((ret == -ENOENT) && truncated) {
2255 *truncated = false;
2256 }
2257
2258 return 0;
2259}
2260
2261int RGWRados::objexp_hint_parse(cls_timeindex_entry &ti_entry, /* in */
2262 objexp_hint_entry& hint_entry) /* out */
2263{
2264 try {
11fdf7f2
TL
2265 auto iter = ti_entry.value.cbegin();
2266 decode(hint_entry, iter);
7c673cae
FG
2267 } catch (buffer::error& err) {
2268 ldout(cct, 0) << "ERROR: couldn't decode avail_pools" << dendl;
2269 }
2270
2271 return 0;
2272}
2273
2274int RGWRados::objexp_hint_trim(const string& oid,
2275 const ceph::real_time& start_time,
2276 const ceph::real_time& end_time,
2277 const string& from_marker,
2278 const string& to_marker)
2279{
2280 int ret = cls_timeindex_trim(objexp_pool_ctx, oid, utime_t(start_time), utime_t(end_time),
2281 from_marker, to_marker);
2282 if ((ret < 0 ) && (ret != -ENOENT)) {
2283 return ret;
2284 }
2285
2286 return 0;
2287}
2288
11fdf7f2 2289int RGWRados::lock_exclusive(const rgw_pool& pool, const string& oid, timespan& duration,
7c673cae
FG
2290 string& zone_id, string& owner_id) {
2291 librados::IoCtx io_ctx;
2292
2293 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
2294 if (r < 0) {
2295 return r;
2296 }
2297 uint64_t msec = std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
2298 utime_t ut(msec / 1000, msec % 1000);
2299
2300 rados::cls::lock::Lock l(log_lock_name);
2301 l.set_duration(ut);
2302 l.set_cookie(owner_id);
2303 l.set_tag(zone_id);
f64942e4 2304 l.set_may_renew(true);
7c673cae
FG
2305
2306 return l.lock_exclusive(&io_ctx, oid);
2307}
2308
11fdf7f2 2309int RGWRados::unlock(const rgw_pool& pool, const string& oid, string& zone_id, string& owner_id) {
7c673cae
FG
2310 librados::IoCtx io_ctx;
2311
2312 int r = rgw_init_ioctx(get_rados_handle(), pool, io_ctx);
2313 if (r < 0) {
2314 return r;
2315 }
2316
2317 rados::cls::lock::Lock l(log_lock_name);
2318 l.set_tag(zone_id);
2319 l.set_cookie(owner_id);
2320
2321 return l.unlock(&io_ctx, oid);
2322}
2323
2324int RGWRados::decode_policy(bufferlist& bl, ACLOwner *owner)
2325{
11fdf7f2 2326 auto i = bl.cbegin();
7c673cae
FG
2327 RGWAccessControlPolicy policy(cct);
2328 try {
2329 policy.decode_owner(i);
2330 } catch (buffer::error& err) {
2331 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
2332 return -EIO;
2333 }
2334 *owner = policy.get_owner();
2335 return 0;
2336}
2337
2338int rgw_policy_from_attrset(CephContext *cct, map<string, bufferlist>& attrset, RGWAccessControlPolicy *policy)
2339{
2340 map<string, bufferlist>::iterator aiter = attrset.find(RGW_ATTR_ACL);
2341 if (aiter == attrset.end())
2342 return -EIO;
2343
2344 bufferlist& bl = aiter->second;
11fdf7f2 2345 auto iter = bl.cbegin();
7c673cae
FG
2346 try {
2347 policy->decode(iter);
2348 } catch (buffer::error& err) {
2349 ldout(cct, 0) << "ERROR: could not decode policy, caught buffer::error" << dendl;
2350 return -EIO;
2351 }
11fdf7f2 2352 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 15>()) {
7c673cae
FG
2353 RGWAccessControlPolicy_S3 *s3policy = static_cast<RGWAccessControlPolicy_S3 *>(policy);
2354 ldout(cct, 15) << __func__ << " Read AccessControlPolicy";
2355 s3policy->to_xml(*_dout);
2356 *_dout << dendl;
2357 }
2358 return 0;
2359}
2360
2361
31f18b77
FG
2362int RGWRados::Bucket::update_bucket_id(const string& new_bucket_id)
2363{
2364 rgw_bucket bucket = bucket_info.bucket;
2365 bucket.update_bucket_id(new_bucket_id);
2366
11fdf7f2 2367 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
31f18b77 2368
81eedcae 2369 bucket_info.objv_tracker.clear();
31f18b77
FG
2370 int ret = store->get_bucket_instance_info(obj_ctx, bucket, bucket_info, nullptr, nullptr);
2371 if (ret < 0) {
2372 return ret;
2373 }
2374
2375 return 0;
2376}
2377
1adf2230
AA
2378
2379/**
2380 * Get ordered listing of the objects in a bucket.
7c673cae
FG
2381 *
2382 * max: maximum number of results to return
2383 * bucket: bucket to list contents of
2384 * prefix: only return results that match this prefix
2385 * delim: do not include results that match this string.
2386 * Any skipped results will have the matching portion of their name
2387 * inserted in common_prefixes with a "true" mark.
2388 * marker: if filled in, begin the listing with this object.
2389 * end_marker: if filled in, end the listing with this object.
2390 * result: the objects are put in here.
11fdf7f2
TL
2391 * common_prefixes: if delim is filled in, any matching prefixes are
2392 * placed here.
2393 * is_truncated: if number of objects in the bucket is bigger than
2394 * max, then truncated.
7c673cae 2395 */
11fdf7f2
TL
/**
 * Return @p delim with a single char(255) appended.
 *
 * Callers use the result as an upper-bound marker for skipping past a
 * common prefix; they are expected to pass a non-empty delimiter.
 */
static inline std::string after_delim(std::string_view delim)
{
  // assert: ! delim.empty()
  std::string upper_bound(delim);
  upper_bound.push_back(char(255));
  return upper_bound;
}
2403
2404int RGWRados::Bucket::List::list_objects_ordered(
2405 int64_t max,
2406 vector<rgw_bucket_dir_entry> *result,
2407 map<string, bool> *common_prefixes,
2408 bool *is_truncated)
7c673cae
FG
2409{
2410 RGWRados *store = target->get_store();
2411 CephContext *cct = store->ctx();
2412 int shard_id = target->get_shard_id();
2413
2414 int count = 0;
2415 bool truncated = true;
2416 int read_ahead = std::max(cct->_conf->rgw_list_bucket_min_readahead,max);
2417
2418 result->clear();
2419
2420 rgw_obj_key marker_obj(params.marker.name, params.marker.instance, params.ns);
7c673cae
FG
2421 rgw_obj_index_key cur_marker;
2422 marker_obj.get_index_key(&cur_marker);
2423
3efd9988
FG
2424 rgw_obj_key end_marker_obj(params.end_marker.name, params.end_marker.instance,
2425 params.ns);
2426 rgw_obj_index_key cur_end_marker;
2427 end_marker_obj.get_index_key(&cur_end_marker);
7c673cae
FG
2428 const bool cur_end_marker_valid = !params.end_marker.empty();
2429
2430 rgw_obj_key prefix_obj(params.prefix);
2431 prefix_obj.ns = params.ns;
2432 string cur_prefix = prefix_obj.get_index_key_name();
11fdf7f2 2433 string after_delim_s; /* needed in !params.delim.empty() AND later */
7c673cae
FG
2434
2435 if (!params.delim.empty()) {
81eedcae 2436 after_delim_s = after_delim(params.delim);
11fdf7f2
TL
2437 /* if marker points at a common prefix, fast forward it into its
2438 * upper bound string */
224ce89b 2439 int delim_pos = cur_marker.name.find(params.delim, cur_prefix.size());
7c673cae
FG
2440 if (delim_pos >= 0) {
2441 string s = cur_marker.name.substr(0, delim_pos);
11fdf7f2 2442 s.append(after_delim_s);
7c673cae
FG
2443 cur_marker = s;
2444 }
2445 }
1adf2230 2446
7c673cae
FG
2447 string skip_after_delim;
2448 while (truncated && count <= max) {
2449 if (skip_after_delim > cur_marker.name) {
2450 cur_marker = skip_after_delim;
11fdf7f2
TL
2451
2452 ldout(cct, 20) << "setting cur_marker="
2453 << cur_marker.name
2454 << "[" << cur_marker.instance << "]"
2455 << dendl;
7c673cae
FG
2456 }
2457 std::map<string, rgw_bucket_dir_entry> ent_map;
1adf2230
AA
2458 int r = store->cls_bucket_list_ordered(target->get_bucket_info(),
2459 shard_id,
2460 cur_marker,
2461 cur_prefix,
2462 read_ahead + 1 - count,
2463 params.list_versions,
2464 ent_map,
2465 &truncated,
2466 &cur_marker);
7c673cae
FG
2467 if (r < 0)
2468 return r;
2469
1adf2230 2470 for (auto eiter = ent_map.begin(); eiter != ent_map.end(); ++eiter) {
7c673cae
FG
2471 rgw_bucket_dir_entry& entry = eiter->second;
2472 rgw_obj_index_key index_key = entry.key;
2473
2474 rgw_obj_key obj(index_key);
2475
1adf2230
AA
2476 /* note that parse_raw_oid() here will not set the correct
2477 * object's instance, as rgw_obj_index_key encodes that
2478 * separately. We don't need to set the instance because it's
2479 * not needed for the checks here and we end up using the raw
2480 * entry for the return vector
7c673cae
FG
2481 */
2482 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2483 if (!valid) {
2484 ldout(cct, 0) << "ERROR: could not parse object name: " << obj.name << dendl;
2485 continue;
2486 }
11fdf7f2 2487
7c673cae
FG
2488 bool check_ns = (obj.ns == params.ns);
2489 if (!params.list_versions && !entry.is_visible()) {
2490 continue;
2491 }
2492
2493 if (params.enforce_ns && !check_ns) {
2494 if (!params.ns.empty()) {
2495 /* we've iterated past the namespace we're searching -- done now */
2496 truncated = false;
2497 goto done;
2498 }
2499
2500 /* we're not looking at the namespace this object is in, next! */
2501 continue;
2502 }
2503
2504 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2505 truncated = false;
2506 goto done;
2507 }
2508
2509 if (count < max) {
2510 params.marker = index_key;
2511 next_marker = index_key;
2512 }
2513
2514 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2515 continue;
2516
1adf2230
AA
2517 if (params.prefix.size() &&
2518 (obj.name.compare(0, params.prefix.size(), params.prefix) != 0))
7c673cae
FG
2519 continue;
2520
2521 if (!params.delim.empty()) {
2522 int delim_pos = obj.name.find(params.delim, params.prefix.size());
2523
2524 if (delim_pos >= 0) {
11fdf7f2
TL
2525 /* extract key -with trailing delimiter- for CommonPrefix */
2526 string prefix_key =
2527 obj.name.substr(0, delim_pos + params.delim.length());
7c673cae
FG
2528
2529 if (common_prefixes &&
2530 common_prefixes->find(prefix_key) == common_prefixes->end()) {
2531 if (count >= max) {
2532 truncated = true;
2533 goto done;
2534 }
2535 next_marker = prefix_key;
2536 (*common_prefixes)[prefix_key] = true;
2537
11fdf7f2
TL
2538 int marker_delim_pos = cur_marker.name.find(
2539 params.delim, cur_prefix.size());
224ce89b
WB
2540
2541 skip_after_delim = cur_marker.name.substr(0, marker_delim_pos);
11fdf7f2 2542 skip_after_delim.append(after_delim_s);
7c673cae
FG
2543
2544 ldout(cct, 20) << "skip_after_delim=" << skip_after_delim << dendl;
2545
2546 count++;
2547 }
2548
2549 continue;
2550 }
2551 }
2552
2553 if (count >= max) {
2554 truncated = true;
2555 goto done;
2556 }
2557
2558 result->emplace_back(std::move(entry));
2559 count++;
2560 }
7c673cae
FG
2561 }
2562
2563done:
2564 if (is_truncated)
2565 *is_truncated = truncated;
2566
2567 return 0;
1adf2230
AA
2568} // list_objects_ordered
2569
2570
2571/**
2572 * Get listing of the objects in a bucket and allow the results to be out
2573 * of order.
2574 *
2575 * Even though there are key differences with the ordered counterpart,
2576 * the parameters are the same to maintain some compatibility.
2577 *
2578 * max: maximum number of results to return
2579 * bucket: bucket to list contents of
2580 * prefix: only return results that match this prefix
2581 * delim: should not be set; if it is we should have indicated an error
2582 * marker: if filled in, begin the listing with this object.
2583 * end_marker: if filled in, end the listing with this object.
2584 * result: the objects are put in here.
2585 * common_prefixes: this is never filled with an unordered list; the param
2586 * is maintained for compatibility
2587 * is_truncated: if number of objects in the bucket is bigger than max, then
2588 * truncated.
2589 */
2590int RGWRados::Bucket::List::list_objects_unordered(int64_t max,
2591 vector<rgw_bucket_dir_entry> *result,
2592 map<string, bool> *common_prefixes,
2593 bool *is_truncated)
2594{
2595 RGWRados *store = target->get_store();
2596 CephContext *cct = store->ctx();
2597 int shard_id = target->get_shard_id();
2598
2599 int count = 0;
2600 bool truncated = true;
2601
2602 // read a few extra in each call to cls_bucket_list_unordered in
2603 // case some are filtered out due to namespace matching, versioning,
2604 // filtering, etc.
2605 const int64_t max_read_ahead = 100;
2606 const uint32_t read_ahead = uint32_t(max + std::min(max, max_read_ahead));
2607
2608 result->clear();
2609
11fdf7f2
TL
2610 rgw_obj_key marker_obj(params.marker.name,
2611 params.marker.instance,
2612 params.ns);
1adf2230
AA
2613 rgw_obj_index_key cur_marker;
2614 marker_obj.get_index_key(&cur_marker);
2615
11fdf7f2
TL
2616 rgw_obj_key end_marker_obj(params.end_marker.name,
2617 params.end_marker.instance,
1adf2230
AA
2618 params.ns);
2619 rgw_obj_index_key cur_end_marker;
2620 end_marker_obj.get_index_key(&cur_end_marker);
2621 const bool cur_end_marker_valid = !params.end_marker.empty();
2622
2623 rgw_obj_key prefix_obj(params.prefix);
2624 prefix_obj.ns = params.ns;
2625 string cur_prefix = prefix_obj.get_index_key_name();
2626
2627 while (truncated && count <= max) {
2628 std::vector<rgw_bucket_dir_entry> ent_list;
2629 int r = store->cls_bucket_list_unordered(target->get_bucket_info(),
2630 shard_id,
2631 cur_marker,
2632 cur_prefix,
2633 read_ahead,
2634 params.list_versions,
2635 ent_list,
2636 &truncated,
2637 &cur_marker);
2638 if (r < 0)
2639 return r;
2640
2641 // NB: while regions of ent_list will be sorted, we have no
2642 // guarantee that all items will be sorted since they can cross
2643 // shard boundaries
2644
2645 for (auto& entry : ent_list) {
2646 rgw_obj_index_key index_key = entry.key;
2647 rgw_obj_key obj(index_key);
2648
2649 /* note that parse_raw_oid() here will not set the correct
2650 * object's instance, as rgw_obj_index_key encodes that
2651 * separately. We don't need to set the instance because it's
2652 * not needed for the checks here and we end up using the raw
2653 * entry for the return vector
2654 */
2655 bool valid = rgw_obj_key::parse_raw_oid(index_key.name, &obj);
2656 if (!valid) {
2657 ldout(cct, 0) << "ERROR: could not parse object name: " <<
2658 obj.name << dendl;
2659 continue;
2660 }
2661
2662 if (!params.list_versions && !entry.is_visible()) {
2663 continue;
2664 }
2665
2666 if (params.enforce_ns && obj.ns != params.ns) {
2667 continue;
2668 }
2669
2670 if (cur_end_marker_valid && cur_end_marker <= index_key) {
2671 // we're not guaranteed items will come in order, so we have
2672 // to loop through all
2673 continue;
2674 }
2675
2676 if (count < max) {
11fdf7f2
TL
2677 params.marker.set(index_key);
2678 next_marker.set(index_key);
1adf2230
AA
2679 }
2680
2681 if (params.filter && !params.filter->filter(obj.name, index_key.name))
2682 continue;
2683
2684 if (params.prefix.size() &&
2685 (0 != obj.name.compare(0, params.prefix.size(), params.prefix)))
2686 continue;
2687
2688 if (count >= max) {
2689 truncated = true;
2690 goto done;
2691 }
2692
2693 result->emplace_back(std::move(entry));
2694 count++;
2695 } // for (auto& entry : ent_list)
2696 } // while (truncated && count <= max)
2697
2698done:
2699 if (is_truncated)
2700 *is_truncated = truncated;
2701
2702 return 0;
2703} // list_objects_unordered
2704
7c673cae
FG
2705
2706/**
2707 * create a rados pool, associated meta info
2708 * returns 0 on success, -ERR# otherwise.
2709 */
2710int RGWRados::create_pool(const rgw_pool& pool)
2711{
c07f9fc5 2712 librados::IoCtx io_ctx;
28e407b8
AA
2713 constexpr bool create = true;
2714 return rgw_init_ioctx(get_rados_handle(), pool, io_ctx, create);
7c673cae
FG
2715}
2716
2717int RGWRados::init_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
2718{
f64942e4 2719 librados::IoCtx index_ctx;
7c673cae 2720
f64942e4 2721 string dir_oid = dir_oid_prefix;
7c673cae 2722 int r = open_bucket_index_ctx(bucket_info, index_ctx);
31f18b77 2723 if (r < 0) {
7c673cae 2724 return r;
31f18b77 2725 }
7c673cae 2726
7c673cae
FG
2727 dir_oid.append(bucket_info.bucket.bucket_id);
2728
2729 map<int, string> bucket_objs;
2730 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
2731
f64942e4
AA
2732 return CLSRGWIssueBucketIndexInit(index_ctx,
2733 bucket_objs,
2734 cct->_conf->rgw_bucket_index_max_aio)();
2735}
2736
2737int RGWRados::clean_bucket_index(RGWBucketInfo& bucket_info, int num_shards)
2738{
2739 librados::IoCtx index_ctx;
2740
2741 std::string dir_oid = dir_oid_prefix;
2742 int r = open_bucket_index_ctx(bucket_info, index_ctx);
2743 if (r < 0) {
2744 return r;
2745 }
2746
2747 dir_oid.append(bucket_info.bucket.bucket_id);
2748
2749 std::map<int, std::string> bucket_objs;
2750 get_bucket_index_objects(dir_oid, num_shards, bucket_objs);
2751
2752 return CLSRGWIssueBucketIndexClean(index_ctx,
2753 bucket_objs,
2754 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
2755}
2756
2757void RGWRados::create_bucket_id(string *bucket_id)
2758{
2759 uint64_t iid = instance_id();
2760 uint64_t bid = next_bucket_id();
11fdf7f2
TL
2761 char buf[svc.zone->get_zone_params().get_id().size() + 48];
2762 snprintf(buf, sizeof(buf), "%s.%" PRIu64 ".%" PRIu64,
2763 svc.zone->get_zone_params().get_id().c_str(), iid, bid);
7c673cae
FG
2764 *bucket_id = buf;
2765}
2766
11fdf7f2 2767int RGWRados::create_bucket(const RGWUserInfo& owner, rgw_bucket& bucket,
7c673cae 2768 const string& zonegroup_id,
11fdf7f2 2769 const rgw_placement_rule& placement_rule,
7c673cae
FG
2770 const string& swift_ver_location,
2771 const RGWQuotaInfo * pquota_info,
2772 map<std::string, bufferlist>& attrs,
2773 RGWBucketInfo& info,
2774 obj_version *pobjv,
2775 obj_version *pep_objv,
2776 real_time creation_time,
2777 rgw_bucket *pmaster_bucket,
2778 uint32_t *pmaster_num_shards,
2779 bool exclusive)
2780{
2781#define MAX_CREATE_RETRIES 20 /* need to bound retries */
11fdf7f2 2782 rgw_placement_rule selected_placement_rule;
7c673cae
FG
2783 RGWZonePlacementInfo rule_info;
2784
2785 for (int i = 0; i < MAX_CREATE_RETRIES; i++) {
2786 int ret = 0;
11fdf7f2
TL
2787 ret = svc.zone->select_bucket_placement(owner, zonegroup_id, placement_rule,
2788 &selected_placement_rule, &rule_info);
7c673cae
FG
2789 if (ret < 0)
2790 return ret;
2791
2792 if (!pmaster_bucket) {
2793 create_bucket_id(&bucket.marker);
2794 bucket.bucket_id = bucket.marker;
2795 } else {
2796 bucket.marker = pmaster_bucket->marker;
2797 bucket.bucket_id = pmaster_bucket->bucket_id;
2798 }
2799
2800 RGWObjVersionTracker& objv_tracker = info.objv_tracker;
2801
2802 if (pobjv) {
2803 objv_tracker.write_version = *pobjv;
2804 } else {
2805 objv_tracker.generate_new_write_ver(cct);
2806 }
2807
2808 info.bucket = bucket;
2809 info.owner = owner.user_id;
2810 info.zonegroup = zonegroup_id;
11fdf7f2 2811 info.placement_rule = selected_placement_rule;
7c673cae
FG
2812 info.index_type = rule_info.index_type;
2813 info.swift_ver_location = swift_ver_location;
2814 info.swift_versioning = (!swift_ver_location.empty());
2815 if (pmaster_num_shards) {
2816 info.num_shards = *pmaster_num_shards;
2817 } else {
2818 info.num_shards = bucket_index_max_shards;
2819 }
2820 info.bucket_index_shard_hash_type = RGWBucketInfo::MOD;
2821 info.requester_pays = false;
2822 if (real_clock::is_zero(creation_time)) {
2823 info.creation_time = ceph::real_clock::now();
2824 } else {
2825 info.creation_time = creation_time;
2826 }
2827 if (pquota_info) {
2828 info.quota = *pquota_info;
2829 }
2830
2831 int r = init_bucket_index(info, info.num_shards);
11fdf7f2
TL
2832 if (r < 0) {
2833 return r;
2834 }
7c673cae 2835
11fdf7f2
TL
2836 ret = put_linked_bucket_info(info, exclusive, ceph::real_time(), pep_objv, &attrs, true);
2837 if (ret == -EEXIST) {
2838 librados::IoCtx index_ctx;
2839 map<int, string> bucket_objs;
2840 int r = open_bucket_index(info, index_ctx, bucket_objs);
2841 if (r < 0)
2842 return r;
7c673cae 2843
11fdf7f2
TL
2844 /* we need to reread the info and return it, caller will have a use for it */
2845 RGWObjVersionTracker instance_ver = info.objv_tracker;
2846 info.objv_tracker.clear();
2847 auto obj_ctx = svc.sysobj->init_obj_ctx();
2848 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, NULL);
2849 if (r < 0) {
2850 if (r == -ENOENT) {
2851 continue;
2852 }
2853 ldout(cct, 0) << "get_bucket_info returned " << r << dendl;
2854 return r;
2855 }
7c673cae 2856
11fdf7f2
TL
2857 /* only remove it if it's a different bucket instance */
2858 if (info.bucket.bucket_id != bucket.bucket_id) {
2859 /* remove bucket meta instance */
2860 r = rgw_bucket_instance_remove_entry(this,
2861 bucket.get_key(),
2862 &instance_ver);
2863 if (r < 0)
2864 return r;
7c673cae 2865
11fdf7f2
TL
2866 /* remove bucket index objects asynchronously by best effort */
2867 (void) CLSRGWIssueBucketIndexClean(index_ctx,
2868 bucket_objs,
2869 cct->_conf->rgw_bucket_index_max_aio)();
2870 }
2871 /* ret == -ENOENT here */
2872 }
7c673cae 2873 return ret;
7c673cae
FG
2874 }
2875
11fdf7f2
TL
2876 /* this is highly unlikely */
2877 ldout(cct, 0) << "ERROR: could not create bucket, continuously raced with bucket creation and removal" << dendl;
2878 return -ENOENT;
7c673cae
FG
2879}
2880
11fdf7f2 2881bool RGWRados::get_obj_data_pool(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_pool *pool)
7c673cae 2882{
11fdf7f2
TL
2883 return rgw_get_obj_data_pool(svc.zone->get_zonegroup(), svc.zone->get_zone_params(), placement_rule, obj, pool);
2884}
c07f9fc5 2885
11fdf7f2
TL
2886bool RGWRados::obj_to_raw(const rgw_placement_rule& placement_rule, const rgw_obj& obj, rgw_raw_obj *raw_obj)
2887{
2888 get_obj_bucket_and_oid_loc(obj, raw_obj->oid, raw_obj->loc);
c07f9fc5 2889
11fdf7f2 2890 return get_obj_data_pool(placement_rule, obj, &raw_obj->pool);
7c673cae
FG
2891}
2892
2893int RGWRados::get_obj_head_ioctx(const RGWBucketInfo& bucket_info, const rgw_obj& obj, librados::IoCtx *ioctx)
2894{
2895 string oid, key;
2896 get_obj_bucket_and_oid_loc(obj, oid, key);
2897
2898 rgw_pool pool;
2899 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2900 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2901 return -EIO;
2902 }
2903
494da23a 2904 int r = open_pool_ctx(pool, *ioctx, false);
7c673cae
FG
2905 if (r < 0) {
2906 return r;
2907 }
2908
2909 ioctx->locator_set_key(key);
2910
2911 return 0;
2912}
2913
2914int RGWRados::get_obj_head_ref(const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_rados_ref *ref)
2915{
11fdf7f2 2916 get_obj_bucket_and_oid_loc(obj, ref->obj.oid, ref->obj.loc);
7c673cae
FG
2917
2918 rgw_pool pool;
2919 if (!get_obj_data_pool(bucket_info.placement_rule, obj, &pool)) {
2920 ldout(cct, 0) << "ERROR: cannot get data pool for obj=" << obj << ", probably misconfiguration" << dendl;
2921 return -EIO;
2922 }
2923
494da23a 2924 int r = open_pool_ctx(pool, ref->ioctx, false);
7c673cae
FG
2925 if (r < 0) {
2926 return r;
2927 }
2928
11fdf7f2 2929 ref->ioctx.locator_set_key(ref->obj.loc);
7c673cae
FG
2930
2931 return 0;
2932}
2933
224ce89b 2934int RGWRados::get_raw_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2935{
11fdf7f2 2936 ref->obj = obj;
7c673cae
FG
2937
2938 int r;
2939
11fdf7f2
TL
2940 if (ref->obj.oid.empty()) {
2941 ref->obj.oid = obj.pool.to_str();
2942 ref->obj.pool = svc.zone->get_zone_params().domain_root;
7c673cae 2943 }
494da23a 2944 r = open_pool_ctx(ref->obj.pool, ref->ioctx, false);
7c673cae
FG
2945 if (r < 0)
2946 return r;
2947
11fdf7f2 2948 ref->ioctx.locator_set_key(ref->obj.loc);
7c673cae
FG
2949
2950 return 0;
2951}
2952
224ce89b 2953int RGWRados::get_system_obj_ref(const rgw_raw_obj& obj, rgw_rados_ref *ref)
7c673cae 2954{
224ce89b 2955 return get_raw_obj_ref(obj, ref);
7c673cae
FG
2956}
2957
2958/*
2959 * fixes an issue where head objects were supposed to have a locator created, but ended
2960 * up without one
2961 */
2962int RGWRados::fix_head_obj_locator(const RGWBucketInfo& bucket_info, bool copy_obj, bool remove_bad, rgw_obj_key& key)
2963{
2964 const rgw_bucket& bucket = bucket_info.bucket;
2965 string oid;
2966 string locator;
2967
2968 rgw_obj obj(bucket, key);
2969
2970 get_obj_bucket_and_oid_loc(obj, oid, locator);
2971
2972 if (locator.empty()) {
2973 ldout(cct, 20) << "object does not have a locator, nothing to fix" << dendl;
2974 return 0;
2975 }
2976
2977 librados::IoCtx ioctx;
2978
2979 int ret = get_obj_head_ioctx(bucket_info, obj, &ioctx);
2980 if (ret < 0) {
2981 cerr << "ERROR: get_obj_head_ioctx() returned ret=" << ret << std::endl;
2982 return ret;
2983 }
2984 ioctx.locator_set_key(string()); /* override locator for this object, use empty locator */
2985
2986 uint64_t size;
2987 bufferlist data;
2988
2989 struct timespec mtime_ts;
2990 map<string, bufferlist> attrs;
2991 librados::ObjectReadOperation op;
2992 op.getxattrs(&attrs, NULL);
2993 op.stat2(&size, &mtime_ts, NULL);
2994#define HEAD_SIZE 512 * 1024
2995 op.read(0, HEAD_SIZE, &data, NULL);
2996
2997 ret = ioctx.operate(oid, &op, NULL);
2998 if (ret < 0) {
2999 lderr(cct) << "ERROR: ioctx.operate(oid=" << oid << ") returned ret=" << ret << dendl;
3000 return ret;
3001 }
3002
3003 if (size > HEAD_SIZE) {
3004 lderr(cct) << "ERROR: returned object size (" << size << ") > HEAD_SIZE (" << HEAD_SIZE << ")" << dendl;
3005 return -EIO;
3006 }
3007
3008 if (size != data.length()) {
3009 lderr(cct) << "ERROR: returned object size (" << size << ") != data.length() (" << data.length() << ")" << dendl;
3010 return -EIO;
3011 }
3012
3013 if (copy_obj) {
3014 librados::ObjectWriteOperation wop;
3015
3016 wop.mtime2(&mtime_ts);
3017
3018 map<string, bufferlist>::iterator iter;
3019 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3020 wop.setxattr(iter->first.c_str(), iter->second);
3021 }
3022
3023 wop.write(0, data);
3024
3025 ioctx.locator_set_key(locator);
3026 ioctx.operate(oid, &wop);
3027 }
3028
3029 if (remove_bad) {
3030 ioctx.locator_set_key(string());
3031
3032 ret = ioctx.remove(oid);
3033 if (ret < 0) {
3034 lderr(cct) << "ERROR: failed to remove original bad object" << dendl;
3035 return ret;
3036 }
3037 }
3038
3039 return 0;
3040}
3041
3042int RGWRados::move_rados_obj(librados::IoCtx& src_ioctx,
3043 const string& src_oid, const string& src_locator,
3044 librados::IoCtx& dst_ioctx,
3045 const string& dst_oid, const string& dst_locator)
3046{
3047
3048#define COPY_BUF_SIZE (4 * 1024 * 1024)
3049 bool done = false;
3050 uint64_t chunk_size = COPY_BUF_SIZE;
3051 uint64_t ofs = 0;
3052 int ret = 0;
3053 real_time mtime;
3054 struct timespec mtime_ts;
3055 uint64_t size;
3056
3057 if (src_oid == dst_oid && src_locator == dst_locator) {
3058 return 0;
3059 }
3060
3061 src_ioctx.locator_set_key(src_locator);
3062 dst_ioctx.locator_set_key(dst_locator);
3063
3064 do {
3065 bufferlist data;
3066 ObjectReadOperation rop;
3067 ObjectWriteOperation wop;
3068
3069 if (ofs == 0) {
3070 rop.stat2(&size, &mtime_ts, NULL);
3071 mtime = real_clock::from_timespec(mtime_ts);
3072 }
3073 rop.read(ofs, chunk_size, &data, NULL);
3074 ret = src_ioctx.operate(src_oid, &rop, NULL);
3075 if (ret < 0) {
3076 goto done_err;
3077 }
3078
3079 if (data.length() == 0) {
3080 break;
3081 }
3082
3083 if (ofs == 0) {
3084 wop.create(true); /* make it exclusive */
3085 wop.mtime2(&mtime_ts);
3086 mtime = real_clock::from_timespec(mtime_ts);
3087 }
3088 wop.write(ofs, data);
3089 ret = dst_ioctx.operate(dst_oid, &wop);
11fdf7f2
TL
3090 if (ret < 0) {
3091 goto done_err;
3092 }
7c673cae
FG
3093 ofs += data.length();
3094 done = data.length() != chunk_size;
3095 } while (!done);
3096
3097 if (ofs != size) {
3098 lderr(cct) << "ERROR: " << __func__ << ": copying " << src_oid << " -> " << dst_oid
3099 << ": expected " << size << " bytes to copy, ended up with " << ofs << dendl;
3100 ret = -EIO;
3101 goto done_err;
3102 }
3103
3104 src_ioctx.remove(src_oid);
3105
3106 return 0;
3107
3108done_err:
11fdf7f2 3109 // TODO: clean up dst_oid if we created it
7c673cae
FG
3110 lderr(cct) << "ERROR: failed to copy " << src_oid << " -> " << dst_oid << dendl;
3111 return ret;
3112}
3113
3114/*
3115 * fixes an issue where head objects were supposed to have a locator created, but ended
3116 * up without one
3117 */
3118int RGWRados::fix_tail_obj_locator(const RGWBucketInfo& bucket_info, rgw_obj_key& key, bool fix, bool *need_fix)
3119{
3120 const rgw_bucket& bucket = bucket_info.bucket;
3121 rgw_obj obj(bucket, key);
3122
3123 if (need_fix) {
3124 *need_fix = false;
3125 }
3126
3127 rgw_rados_ref ref;
3128 int r = get_obj_head_ref(bucket_info, obj, &ref);
3129 if (r < 0) {
3130 return r;
3131 }
3132
3133 RGWObjState *astate = NULL;
3134 RGWObjectCtx rctx(this);
3135 r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
3136 if (r < 0)
3137 return r;
3138
3139 if (astate->has_manifest) {
3140 RGWObjManifest::obj_iterator miter;
3141 RGWObjManifest& manifest = astate->manifest;
3142 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
3143 rgw_raw_obj raw_loc = miter.get_location().get_raw_obj(this);
3144 rgw_obj loc;
3145 string oid;
3146 string locator;
3147
3148 rgw_raw_obj_to_obj(manifest.get_tail_placement().bucket, raw_loc, &loc);
3149
3150 if (loc.key.ns.empty()) {
3151 /* continue, we're only interested in tail objects */
3152 continue;
3153 }
3154
3155 get_obj_bucket_and_oid_loc(loc, oid, locator);
3156 ref.ioctx.locator_set_key(locator);
3157
3158 ldout(cct, 20) << __func__ << ": key=" << key << " oid=" << oid << " locator=" << locator << dendl;
3159
3160 r = ref.ioctx.stat(oid, NULL, NULL);
3161 if (r != -ENOENT) {
3162 continue;
3163 }
3164
3165 string bad_loc;
3166 prepend_bucket_marker(bucket, loc.key.name, bad_loc);
3167
3168 /* create a new ioctx with the bad locator */
3169 librados::IoCtx src_ioctx;
3170 src_ioctx.dup(ref.ioctx);
3171 src_ioctx.locator_set_key(bad_loc);
3172
3173 r = src_ioctx.stat(oid, NULL, NULL);
3174 if (r != 0) {
3175 /* cannot find a broken part */
3176 continue;
3177 }
3178 ldout(cct, 20) << __func__ << ": found bad object part: " << loc << dendl;
3179 if (need_fix) {
3180 *need_fix = true;
3181 }
3182 if (fix) {
3183 r = move_rados_obj(src_ioctx, oid, bad_loc, ref.ioctx, oid, locator);
3184 if (r < 0) {
3185 lderr(cct) << "ERROR: copy_rados_obj() on oid=" << oid << " returned r=" << r << dendl;
3186 }
3187 }
3188 }
3189 }
3190
3191 return 0;
3192}
3193
f64942e4
AA
3194int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
3195 const rgw_obj& obj,
3196 RGWBucketInfo* bucket_info_out)
7c673cae
FG
3197{
3198 bucket = _bucket;
3199
11fdf7f2 3200 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
3201
3202 RGWBucketInfo bucket_info;
f64942e4
AA
3203 RGWBucketInfo* bucket_info_p =
3204 bucket_info_out ? bucket_info_out : &bucket_info;
3205
3206 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
7c673cae
FG
3207 if (ret < 0) {
3208 return ret;
3209 }
3210
f64942e4 3211 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, obj.get_hash_object(), &bucket_obj, &shard_id);
7c673cae
FG
3212 if (ret < 0) {
3213 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3214 return ret;
3215 }
3216 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3217
3218 return 0;
3219}
3220
f64942e4
AA
3221int RGWRados::BucketShard::init(const rgw_bucket& _bucket,
3222 int sid,
3223 RGWBucketInfo* bucket_info_out)
7c673cae
FG
3224{
3225 bucket = _bucket;
3226 shard_id = sid;
3227
11fdf7f2 3228 auto obj_ctx = store->svc.sysobj->init_obj_ctx();
7c673cae
FG
3229
3230 RGWBucketInfo bucket_info;
f64942e4
AA
3231 RGWBucketInfo* bucket_info_p =
3232 bucket_info_out ? bucket_info_out : &bucket_info;
3233 int ret = store->get_bucket_instance_info(obj_ctx, bucket, *bucket_info_p, NULL, NULL);
7c673cae
FG
3234 if (ret < 0) {
3235 return ret;
3236 }
3237
f64942e4 3238 ret = store->open_bucket_index_shard(*bucket_info_p, index_ctx, shard_id, &bucket_obj);
7c673cae
FG
3239 if (ret < 0) {
3240 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3241 return ret;
3242 }
3243 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3244
3245 return 0;
3246}
3247
a8e16298
TL
3248int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info,
3249 const rgw_obj& obj)
3250{
3251 bucket = bucket_info.bucket;
3252
3253 int ret = store->open_bucket_index_shard(bucket_info, index_ctx,
3254 obj.get_hash_object(), &bucket_obj,
3255 &shard_id);
3256 if (ret < 0) {
3257 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3258 return ret;
3259 }
3260 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3261
3262 return 0;
3263}
3264
b32b8144
FG
3265int RGWRados::BucketShard::init(const RGWBucketInfo& bucket_info, int sid)
3266{
3267 bucket = bucket_info.bucket;
3268 shard_id = sid;
3269
3270 int ret = store->open_bucket_index_shard(bucket_info, index_ctx, shard_id, &bucket_obj);
3271 if (ret < 0) {
3272 ldout(store->ctx(), 0) << "ERROR: open_bucket_index_shard() returned ret=" << ret << dendl;
3273 return ret;
3274 }
3275 ldout(store->ctx(), 20) << " bucket index object: " << bucket_obj << dendl;
3276
3277 return 0;
3278}
3279
7c673cae
FG
3280
3281/* Execute @handler on last item in bucket listing for bucket specified
3282 * in @bucket_info. @obj_prefix and @obj_delim narrow down the listing
3283 * to objects matching these criterias. */
3284int RGWRados::on_last_entry_in_listing(RGWBucketInfo& bucket_info,
3285 const std::string& obj_prefix,
3286 const std::string& obj_delim,
3287 std::function<int(const rgw_bucket_dir_entry&)> handler)
3288{
3289 RGWRados::Bucket target(this, bucket_info);
3290 RGWRados::Bucket::List list_op(&target);
3291
3292 list_op.params.prefix = obj_prefix;
3293 list_op.params.delim = obj_delim;
3294
3295 ldout(cct, 20) << "iterating listing for bucket=" << bucket_info.bucket.name
3296 << ", obj_prefix=" << obj_prefix
3297 << ", obj_delim=" << obj_delim
3298 << dendl;
3299
3300 bool is_truncated = false;
3301
3302 boost::optional<rgw_bucket_dir_entry> last_entry;
3303 /* We need to rewind to the last object in a listing. */
3304 do {
3305 /* List bucket entries in chunks. */
3306 static constexpr int MAX_LIST_OBJS = 100;
3307 std::vector<rgw_bucket_dir_entry> entries(MAX_LIST_OBJS);
3308
3309 int ret = list_op.list_objects(MAX_LIST_OBJS, &entries, nullptr,
3310 &is_truncated);
3311 if (ret < 0) {
3312 return ret;
3313 } else if (!entries.empty()) {
3314 last_entry = entries.back();
3315 }
3316 } while (is_truncated);
3317
3318 if (last_entry) {
3319 return handler(*last_entry);
3320 }
3321
3322 /* Empty listing - no items we can run handler on. */
3323 return 0;
3324}
3325
3326
3327int RGWRados::swift_versioning_copy(RGWObjectCtx& obj_ctx,
3328 const rgw_user& user,
3329 RGWBucketInfo& bucket_info,
3330 rgw_obj& obj)
3331{
3332 if (! swift_versioning_enabled(bucket_info)) {
3333 return 0;
3334 }
3335
11fdf7f2 3336 obj_ctx.set_atomic(obj);
7c673cae
FG
3337
3338 RGWObjState * state = nullptr;
3339 int r = get_obj_state(&obj_ctx, bucket_info, obj, &state, false);
3340 if (r < 0) {
3341 return r;
3342 }
3343
3344 if (!state->exists) {
3345 return 0;
3346 }
3347
7c673cae
FG
3348 const string& src_name = obj.get_oid();
3349 char buf[src_name.size() + 32];
3350 struct timespec ts = ceph::real_clock::to_timespec(state->mtime);
3351 snprintf(buf, sizeof(buf), "%03x%s/%lld.%06ld", (int)src_name.size(),
3352 src_name.c_str(), (long long)ts.tv_sec, ts.tv_nsec / 1000);
3353
3354 RGWBucketInfo dest_bucket_info;
3355
11fdf7f2
TL
3356 auto sysobj_ctx = svc.sysobj->init_obj_ctx();
3357
3358 r = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant, bucket_info.swift_ver_location, dest_bucket_info, NULL, NULL);
7c673cae
FG
3359 if (r < 0) {
3360 ldout(cct, 10) << "failed to read dest bucket info: r=" << r << dendl;
3361 if (r == -ENOENT) {
3362 return -ERR_PRECONDITION_FAILED;
3363 }
3364 return r;
3365 }
3366
3367 if (dest_bucket_info.owner != bucket_info.owner) {
3368 return -ERR_PRECONDITION_FAILED;
3369 }
3370
3371 rgw_obj dest_obj(dest_bucket_info.bucket, buf);
11fdf7f2
TL
3372
3373 if (dest_bucket_info.versioning_enabled()){
3374 gen_rand_obj_instance_name(&dest_obj);
3375 }
3376
3377 obj_ctx.set_atomic(dest_obj);
7c673cae
FG
3378
3379 string no_zone;
3380
3381 r = copy_obj(obj_ctx,
3382 user,
7c673cae
FG
3383 NULL, /* req_info *info */
3384 no_zone,
3385 dest_obj,
3386 obj,
3387 dest_bucket_info,
3388 bucket_info,
11fdf7f2 3389 bucket_info.placement_rule,
7c673cae
FG
3390 NULL, /* time_t *src_mtime */
3391 NULL, /* time_t *mtime */
3392 NULL, /* const time_t *mod_ptr */
3393 NULL, /* const time_t *unmod_ptr */
3394 false, /* bool high_precision_time */
3395 NULL, /* const char *if_match */
3396 NULL, /* const char *if_nomatch */
3397 RGWRados::ATTRSMOD_NONE,
3398 true, /* bool copy_if_newer */
3399 state->attrset,
11fdf7f2 3400 RGWObjCategory::Main,
7c673cae
FG
3401 0, /* uint64_t olh_epoch */
3402 real_time(), /* time_t delete_at */
3403 NULL, /* string *version_id */
3404 NULL, /* string *ptag */
3405 NULL, /* string *petag */
7c673cae
FG
3406 NULL, /* void (*progress_cb)(off_t, void *) */
3407 NULL); /* void *progress_data */
3408 if (r == -ECANCELED || r == -ENOENT) {
3409 /* Has already been overwritten, meaning another rgw process already
3410 * copied it out */
3411 return 0;
3412 }
3413
3414 return r;
3415}
3416
11fdf7f2
TL
3417int RGWRados::swift_versioning_restore(RGWSysObjectCtx& sysobj_ctx,
3418 RGWObjectCtx& obj_ctx,
7c673cae
FG
3419 const rgw_user& user,
3420 RGWBucketInfo& bucket_info,
3421 rgw_obj& obj,
3422 bool& restored) /* out */
3423{
3424 if (! swift_versioning_enabled(bucket_info)) {
3425 return 0;
3426 }
3427
3428 /* Bucket info of the bucket that stores previous versions of our object. */
3429 RGWBucketInfo archive_binfo;
3430
11fdf7f2 3431 int ret = get_bucket_info(sysobj_ctx, bucket_info.bucket.tenant,
7c673cae
FG
3432 bucket_info.swift_ver_location, archive_binfo,
3433 nullptr, nullptr);
3434 if (ret < 0) {
3435 return ret;
3436 }
3437
3438 /* Abort the operation if the bucket storing our archive belongs to someone
3439 * else. This is a limitation in comparison to Swift as we aren't taking ACLs
3440 * into consideration. For we can live with that.
3441 *
3442 * TODO: delegate this check to un upper layer and compare with ACLs. */
3443 if (bucket_info.owner != archive_binfo.owner) {
3444 return -EPERM;
3445 }
3446
3447 /* This code will be executed on latest version of the object. */
3448 const auto handler = [&](const rgw_bucket_dir_entry& entry) -> int {
7c673cae
FG
3449 std::string no_zone;
3450
3451 /* We don't support object versioning of Swift API on those buckets that
3452 * are already versioned using the S3 mechanism. This affects also bucket
3453 * storing archived objects. Otherwise the delete operation would create
3454 * a deletion marker. */
3455 if (archive_binfo.versioned()) {
3456 restored = false;
3457 return -ERR_PRECONDITION_FAILED;
3458 }
3459
3460 /* We are requesting ATTRSMOD_NONE so the attr attribute is perfectly
3461 * irrelevant and may be safely skipped. */
3462 std::map<std::string, ceph::bufferlist> no_attrs;
3463
3464 rgw_obj archive_obj(archive_binfo.bucket, entry.key);
11fdf7f2
TL
3465
3466 if (bucket_info.versioning_enabled()){
3467 gen_rand_obj_instance_name(&obj);
3468 }
3469
3470 obj_ctx.set_atomic(archive_obj);
3471 obj_ctx.set_atomic(obj);
7c673cae
FG
3472
3473 int ret = copy_obj(obj_ctx,
3474 user,
7c673cae
FG
3475 nullptr, /* req_info *info */
3476 no_zone,
3477 obj, /* dest obj */
3478 archive_obj, /* src obj */
3479 bucket_info, /* dest bucket info */
3480 archive_binfo, /* src bucket info */
11fdf7f2 3481 bucket_info.placement_rule, /* placement_rule */
7c673cae
FG
3482 nullptr, /* time_t *src_mtime */
3483 nullptr, /* time_t *mtime */
3484 nullptr, /* const time_t *mod_ptr */
3485 nullptr, /* const time_t *unmod_ptr */
3486 false, /* bool high_precision_time */
3487 nullptr, /* const char *if_match */
3488 nullptr, /* const char *if_nomatch */
3489 RGWRados::ATTRSMOD_NONE,
3490 true, /* bool copy_if_newer */
3491 no_attrs,
11fdf7f2 3492 RGWObjCategory::Main,
7c673cae
FG
3493 0, /* uint64_t olh_epoch */
3494 real_time(), /* time_t delete_at */
3495 nullptr, /* string *version_id */
3496 nullptr, /* string *ptag */
3497 nullptr, /* string *petag */
7c673cae
FG
3498 nullptr, /* void (*progress_cb)(off_t, void *) */
3499 nullptr); /* void *progress_data */
3500 if (ret == -ECANCELED || ret == -ENOENT) {
3501 /* Has already been overwritten, meaning another rgw process already
3502 * copied it out */
3503 return 0;
3504 } else if (ret < 0) {
3505 return ret;
3506 } else {
3507 restored = true;
3508 }
3509
3510 /* Need to remove the archived copy. */
3511 ret = delete_obj(obj_ctx, archive_binfo, archive_obj,
3512 archive_binfo.versioning_status());
3513
3514 return ret;
3515 };
3516
3517 const std::string& obj_name = obj.get_oid();
3518 const auto prefix = boost::str(boost::format("%03x%s") % obj_name.size()
3519 % obj_name);
3520
3521 return on_last_entry_in_listing(archive_binfo, prefix, std::string(),
3522 handler);
3523}
3524
7c673cae 3525int RGWRados::Object::Write::_do_write_meta(uint64_t size, uint64_t accounted_size,
181888fb
FG
3526 map<string, bufferlist>& attrs,
3527 bool assume_noent, bool modify_tail,
7c673cae
FG
3528 void *_index_op)
3529{
3530 RGWRados::Bucket::UpdateIndex *index_op = static_cast<RGWRados::Bucket::UpdateIndex *>(_index_op);
7c673cae
FG
3531 RGWRados *store = target->get_store();
3532
3533 ObjectWriteOperation op;
11fdf7f2
TL
3534#ifdef WITH_LTTNG
3535 const struct req_state* s = get_req_state();
3536 string req_id;
3537 if (!s) {
3538 // fake req_id
3539 req_id = store->svc.zone_utils->unique_id(store->get_new_req_id());
3540 } else {
3541 req_id = s->req_id;
3542 }
3543#endif
7c673cae
FG
3544
3545 RGWObjState *state;
3546 int r = target->get_state(&state, false, assume_noent);
3547 if (r < 0)
3548 return r;
3549
3550 rgw_obj& obj = target->get_obj();
3551
3552 if (obj.get_oid().empty()) {
3553 ldout(store->ctx(), 0) << "ERROR: " << __func__ << "(): cannot write object with empty name" << dendl;
3554 return -EIO;
3555 }
3556
224ce89b 3557 rgw_rados_ref ref;
7c673cae
FG
3558 r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
3559 if (r < 0)
3560 return r;
3561
3562 bool is_olh = state->is_olh;
3563
3564 bool reset_obj = (meta.flags & PUT_OBJ_CREATE) != 0;
3565
3566 const string *ptag = meta.ptag;
3567 if (!ptag && !index_op->get_optag()->empty()) {
3568 ptag = index_op->get_optag();
3569 }
181888fb 3570 r = target->prepare_atomic_modification(op, reset_obj, ptag, meta.if_match, meta.if_nomatch, false, modify_tail);
7c673cae
FG
3571 if (r < 0)
3572 return r;
3573
3574 if (real_clock::is_zero(meta.set_mtime)) {
3575 meta.set_mtime = real_clock::now();
3576 }
3577
3578 if (state->is_olh) {
3579 op.setxattr(RGW_ATTR_OLH_ID_TAG, state->olh_tag);
3580 }
3581
3582 struct timespec mtime_ts = real_clock::to_timespec(meta.set_mtime);
3583 op.mtime2(&mtime_ts);
3584
3585 if (meta.data) {
3586 /* if we want to overwrite the data, we also want to overwrite the
3587 xattrs, so just remove the object */
3588 op.write_full(*meta.data);
3589 }
3590
3591 string etag;
3592 string content_type;
3593 bufferlist acl_bl;
11fdf7f2 3594 string storage_class;
7c673cae
FG
3595
3596 map<string, bufferlist>::iterator iter;
3597 if (meta.rmattrs) {
3598 for (iter = meta.rmattrs->begin(); iter != meta.rmattrs->end(); ++iter) {
3599 const string& name = iter->first;
3600 op.rmxattr(name.c_str());
3601 }
3602 }
3603
3604 if (meta.manifest) {
11fdf7f2
TL
3605 storage_class = meta.manifest->get_tail_placement().placement_rule.storage_class;
3606
7c673cae
FG
3607 /* remove existing manifest attr */
3608 iter = attrs.find(RGW_ATTR_MANIFEST);
3609 if (iter != attrs.end())
3610 attrs.erase(iter);
3611
3612 bufferlist bl;
11fdf7f2 3613 encode(*meta.manifest, bl);
7c673cae
FG
3614 op.setxattr(RGW_ATTR_MANIFEST, bl);
3615 }
3616
3617 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
3618 const string& name = iter->first;
3619 bufferlist& bl = iter->second;
3620
3621 if (!bl.length())
3622 continue;
3623
3624 op.setxattr(name.c_str(), bl);
3625
3626 if (name.compare(RGW_ATTR_ETAG) == 0) {
11fdf7f2 3627 etag = rgw_bl_str(bl);
7c673cae 3628 } else if (name.compare(RGW_ATTR_CONTENT_TYPE) == 0) {
11fdf7f2 3629 content_type = rgw_bl_str(bl);
7c673cae
FG
3630 } else if (name.compare(RGW_ATTR_ACL) == 0) {
3631 acl_bl = bl;
3632 }
3633 }
3634 if (attrs.find(RGW_ATTR_PG_VER) == attrs.end()) {
3635 cls_rgw_obj_store_pg_ver(op, RGW_ATTR_PG_VER);
3636 }
3637
3638 if (attrs.find(RGW_ATTR_SOURCE_ZONE) == attrs.end()) {
3639 bufferlist bl;
11fdf7f2 3640 encode(store->svc.zone->get_zone_short_id(), bl);
7c673cae
FG
3641 op.setxattr(RGW_ATTR_SOURCE_ZONE, bl);
3642 }
3643
11fdf7f2
TL
3644 if (!storage_class.empty()) {
3645 bufferlist bl;
3646 bl.append(storage_class);
3647 op.setxattr(RGW_ATTR_STORAGE_CLASS, bl);
3648 }
3649
7c673cae
FG
3650 if (!op.size())
3651 return 0;
3652
3653 uint64_t epoch;
3654 int64_t poolid;
224ce89b
WB
3655 bool orig_exists;
3656 uint64_t orig_size;
3657
3658 if (!reset_obj) { //Multipart upload, it has immutable head.
3659 orig_exists = false;
3660 orig_size = 0;
3661 } else {
3662 orig_exists = state->exists;
3663 orig_size = state->accounted_size;
3664 }
7c673cae 3665
91327a77
AA
3666 bool versioned_target = (meta.olh_epoch && *meta.olh_epoch > 0) ||
3667 !obj.key.instance.empty();
7c673cae
FG
3668
3669 bool versioned_op = (target->versioning_enabled() || is_olh || versioned_target);
3670
3671 if (versioned_op) {
3672 index_op->set_bilog_flags(RGW_BILOG_FLAG_VERSIONED_OP);
3673 }
3674
3675 if (!index_op->is_prepared()) {
11fdf7f2 3676 tracepoint(rgw_rados, prepare_enter, req_id.c_str());
7c673cae 3677 r = index_op->prepare(CLS_RGW_OP_ADD, &state->write_tag);
11fdf7f2 3678 tracepoint(rgw_rados, prepare_exit, req_id.c_str());
7c673cae
FG
3679 if (r < 0)
3680 return r;
3681 }
3682
11fdf7f2
TL
3683 tracepoint(rgw_rados, operate_enter, req_id.c_str());
3684 r = ref.ioctx.operate(ref.obj.oid, &op);
3685 tracepoint(rgw_rados, operate_exit, req_id.c_str());
7c673cae
FG
3686 if (r < 0) { /* we can expect to get -ECANCELED if object was replaced under,
3687 or -ENOENT if was removed, or -EEXIST if it did not exist
3688 before and now it does */
3689 if (r == -EEXIST && assume_noent) {
3690 target->invalidate_state();
3691 return r;
3692 }
3693 goto done_cancel;
3694 }
3695
3696 epoch = ref.ioctx.get_last_version();
3697 poolid = ref.ioctx.get_id();
3698
3699 r = target->complete_atomic_modification();
3700 if (r < 0) {
3701 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned r=" << r << dendl;
3702 }
3703
11fdf7f2 3704 tracepoint(rgw_rados, complete_enter, req_id.c_str());
7c673cae 3705 r = index_op->complete(poolid, epoch, size, accounted_size,
11fdf7f2
TL
3706 meta.set_mtime, etag, content_type,
3707 storage_class, &acl_bl,
3708 meta.category, meta.remove_objs, meta.user_data, meta.appendable);
3709 tracepoint(rgw_rados, complete_exit, req_id.c_str());
7c673cae
FG
3710 if (r < 0)
3711 goto done_cancel;
3712
3713 if (meta.mtime) {
3714 *meta.mtime = meta.set_mtime;
3715 }
3716
3717 /* note that index_op was using state so we couldn't invalidate it earlier */
3718 target->invalidate_state();
3719 state = NULL;
3720
91327a77
AA
3721 if (versioned_op && meta.olh_epoch) {
3722 r = store->set_olh(target->get_ctx(), target->get_bucket_info(), obj, false, NULL, *meta.olh_epoch, real_time(), false, meta.zones_trace);
7c673cae
FG
3723 if (r < 0) {
3724 return r;
3725 }
3726 }
3727
3728 if (!real_clock::is_zero(meta.delete_at)) {
3729 rgw_obj_index_key obj_key;
3730 obj.key.get_index_key(&obj_key);
3731
3732 r = store->objexp_hint_add(meta.delete_at,
3733 obj.bucket.tenant, obj.bucket.name, obj.bucket.bucket_id, obj_key);
3734 if (r < 0) {
3735 ldout(store->ctx(), 0) << "ERROR: objexp_hint_add() returned r=" << r << ", object will not get removed" << dendl;
3736 /* ignoring error, nothing we can do at this point */
3737 }
3738 }
3739 meta.canceled = false;
3740
3741 /* update quota cache */
3efd9988
FG
3742 if (meta.completeMultipart){
3743 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3744 0, orig_size);
3745 }
3746 else {
3747 store->quota_handler->update_stats(meta.owner, obj.bucket, (orig_exists ? 0 : 1),
3748 accounted_size, orig_size);
3749 }
7c673cae
FG
3750 return 0;
3751
3752done_cancel:
3753 int ret = index_op->cancel();
3754 if (ret < 0) {
3755 ldout(store->ctx(), 0) << "ERROR: index_op.cancel()() returned ret=" << ret << dendl;
3756 }
3757
3758 meta.canceled = true;
3759
3760 /* we lost in a race. There are a few options:
3761 * - existing object was rewritten (ECANCELED)
3762 * - non existing object was created (EEXIST)
3763 * - object was removed (ENOENT)
3764 * should treat it as a success
3765 */
3766 if (meta.if_match == NULL && meta.if_nomatch == NULL) {
3767 if (r == -ECANCELED || r == -ENOENT || r == -EEXIST) {
3768 r = 0;
3769 }
3770 } else {
3771 if (meta.if_match != NULL) {
3772 // only overwrite existing object
3773 if (strcmp(meta.if_match, "*") == 0) {
3774 if (r == -ENOENT) {
3775 r = -ERR_PRECONDITION_FAILED;
3776 } else if (r == -ECANCELED) {
3777 r = 0;
3778 }
3779 }
3780 }
3781
3782 if (meta.if_nomatch != NULL) {
3783 // only create a new object
3784 if (strcmp(meta.if_nomatch, "*") == 0) {
3785 if (r == -EEXIST) {
3786 r = -ERR_PRECONDITION_FAILED;
3787 } else if (r == -ENOENT) {
3788 r = 0;
3789 }
3790 }
3791 }
3792 }
3793
3794 return r;
3795}
3796
3797int RGWRados::Object::Write::write_meta(uint64_t size, uint64_t accounted_size,
3798 map<string, bufferlist>& attrs)
3799{
3800 RGWBucketInfo& bucket_info = target->get_bucket_info();
3801
3802 RGWRados::Bucket bop(target->get_store(), bucket_info);
3803 RGWRados::Bucket::UpdateIndex index_op(&bop, target->get_obj());
31f18b77
FG
3804 index_op.set_zones_trace(meta.zones_trace);
3805
7c673cae
FG
3806 bool assume_noent = (meta.if_match == NULL && meta.if_nomatch == NULL);
3807 int r;
3808 if (assume_noent) {
181888fb 3809 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
3810 if (r == -EEXIST) {
3811 assume_noent = false;
3812 }
3813 }
3814 if (!assume_noent) {
181888fb 3815 r = _do_write_meta(size, accounted_size, attrs, assume_noent, meta.modify_tail, (void *)&index_op);
7c673cae
FG
3816 }
3817 return r;
3818}
3819
11fdf7f2 3820class RGWRadosPutObj : public RGWHTTPStreamRWRequest::ReceiveCB
7c673cae
FG
3821{
3822 CephContext* cct;
3823 rgw_obj obj;
11fdf7f2 3824 rgw::putobj::DataProcessor *filter;
7c673cae 3825 boost::optional<RGWPutObj_Compress>& compressor;
11fdf7f2 3826 boost::optional<rgw::putobj::ChunkProcessor> buffering;
7c673cae 3827 CompressorRef& plugin;
11fdf7f2 3828 rgw::putobj::ObjectProcessor *processor;
7c673cae
FG
3829 void (*progress_cb)(off_t, void *);
3830 void *progress_data;
3831 bufferlist extra_data_bl;
11fdf7f2
TL
3832 uint64_t extra_data_left{0};
3833 bool need_to_process_attrs{true};
3834 uint64_t data_len{0};
7c673cae 3835 map<string, bufferlist> src_attrs;
11fdf7f2
TL
3836 uint64_t ofs{0};
3837 uint64_t lofs{0}; /* logical ofs */
3838 std::function<int(const map<string, bufferlist>&)> attrs_handler;
7c673cae
FG
3839public:
3840 RGWRadosPutObj(CephContext* cct,
3841 CompressorRef& plugin,
3842 boost::optional<RGWPutObj_Compress>& compressor,
11fdf7f2 3843 rgw::putobj::ObjectProcessor *p,
7c673cae 3844 void (*_progress_cb)(off_t, void *),
11fdf7f2
TL
3845 void *_progress_data,
3846 std::function<int(const map<string, bufferlist>&)> _attrs_handler) :
7c673cae
FG
3847 cct(cct),
3848 filter(p),
3849 compressor(compressor),
3850 plugin(plugin),
3851 processor(p),
7c673cae
FG
3852 progress_cb(_progress_cb),
3853 progress_data(_progress_data),
11fdf7f2 3854 attrs_handler(_attrs_handler) {}
7c673cae
FG
3855
3856 int process_attrs(void) {
3857 if (extra_data_bl.length()) {
3858 JSONParser jp;
3859 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
3860 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
3861 return -EIO;
3862 }
3863
3864 JSONDecoder::decode_json("attrs", src_attrs, &jp);
3865
3866 src_attrs.erase(RGW_ATTR_COMPRESSION);
3867 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
a8e16298
TL
3868
3869 // filter out olh attributes
3870 auto iter = src_attrs.lower_bound(RGW_ATTR_OLH_PREFIX);
3871 while (iter != src_attrs.end()) {
3872 if (!boost::algorithm::starts_with(iter->first, RGW_ATTR_OLH_PREFIX)) {
3873 break;
3874 }
3875 iter = src_attrs.erase(iter);
3876 }
7c673cae
FG
3877 }
3878
11fdf7f2
TL
3879 int ret = attrs_handler(src_attrs);
3880 if (ret < 0) {
3881 return ret;
3882 }
3883
7c673cae
FG
3884 if (plugin && src_attrs.find(RGW_ATTR_CRYPT_MODE) == src_attrs.end()) {
3885 //do not compress if object is encrypted
3886 compressor = boost::in_place(cct, plugin, filter);
11fdf7f2
TL
3887 // add a filter that buffers data so we don't try to compress tiny blocks.
3888 // libcurl reads in 16k at a time, and we need at least 64k to get a good
3889 // compression ratio
28e407b8
AA
3890 constexpr unsigned buffer_size = 512 * 1024;
3891 buffering = boost::in_place(&*compressor, buffer_size);
3892 filter = &*buffering;
7c673cae 3893 }
11fdf7f2
TL
3894
3895 need_to_process_attrs = false;
3896
7c673cae
FG
3897 return 0;
3898 }
3899
11fdf7f2 3900 int handle_data(bufferlist& bl, bool *pause) override {
7c673cae 3901 if (progress_cb) {
11fdf7f2 3902 progress_cb(data_len, progress_data);
7c673cae 3903 }
b32b8144 3904 if (extra_data_left) {
11fdf7f2 3905 uint64_t extra_len = bl.length();
b32b8144
FG
3906 if (extra_len > extra_data_left)
3907 extra_len = extra_data_left;
7c673cae
FG
3908
3909 bufferlist extra;
3910 bl.splice(0, extra_len, &extra);
3911 extra_data_bl.append(extra);
3912
b32b8144
FG
3913 extra_data_left -= extra_len;
3914 if (extra_data_left == 0) {
7c673cae
FG
3915 int res = process_attrs();
3916 if (res < 0)
3917 return res;
3918 }
11fdf7f2 3919 ofs += extra_len;
7c673cae
FG
3920 if (bl.length() == 0) {
3921 return 0;
3922 }
3923 }
11fdf7f2
TL
3924 if (need_to_process_attrs) {
3925 /* need to call process_attrs() even if we don't get any attrs,
3926 * need it to call attrs_handler().
3927 */
3928 int res = process_attrs();
3929 if (res < 0) {
3930 return res;
3931 }
3932 }
7c673cae 3933
11fdf7f2 3934 ceph_assert(uint64_t(ofs) >= extra_data_len);
7c673cae 3935
11fdf7f2
TL
3936 uint64_t size = bl.length();
3937 ofs += size;
7c673cae 3938
11fdf7f2
TL
3939 const uint64_t lofs = data_len;
3940 data_len += size;
7c673cae 3941
11fdf7f2 3942 return filter->process(std::move(bl), lofs);
7c673cae
FG
3943 }
3944
28e407b8 3945 int flush() {
11fdf7f2 3946 return filter->process({}, data_len);
28e407b8
AA
3947 }
3948
7c673cae
FG
3949 bufferlist& get_extra_data() { return extra_data_bl; }
3950
3951 map<string, bufferlist>& get_attrs() { return src_attrs; }
3952
3953 void set_extra_data_len(uint64_t len) override {
b32b8144 3954 extra_data_left = len;
11fdf7f2 3955 RGWHTTPStreamRWRequest::ReceiveCB::set_extra_data_len(len);
7c673cae
FG
3956 }
3957
3958 uint64_t get_data_len() {
3959 return data_len;
3960 }
7c673cae
FG
3961};
3962
3963/*
3964 * prepare attrset depending on attrs_mod.
3965 */
3966static void set_copy_attrs(map<string, bufferlist>& src_attrs,
3967 map<string, bufferlist>& attrs,
3968 RGWRados::AttrsMod attrs_mod)
3969{
3970 switch (attrs_mod) {
3971 case RGWRados::ATTRSMOD_NONE:
3972 attrs = src_attrs;
3973 break;
3974 case RGWRados::ATTRSMOD_REPLACE:
3975 if (!attrs[RGW_ATTR_ETAG].length()) {
3976 attrs[RGW_ATTR_ETAG] = src_attrs[RGW_ATTR_ETAG];
3977 }
181888fb
FG
3978 if (!attrs[RGW_ATTR_TAIL_TAG].length()) {
3979 auto ttiter = src_attrs.find(RGW_ATTR_TAIL_TAG);
3980 if (ttiter != src_attrs.end()) {
3981 attrs[RGW_ATTR_TAIL_TAG] = src_attrs[RGW_ATTR_TAIL_TAG];
3982 }
3983 }
7c673cae
FG
3984 break;
3985 case RGWRados::ATTRSMOD_MERGE:
3986 for (map<string, bufferlist>::iterator it = src_attrs.begin(); it != src_attrs.end(); ++it) {
3987 if (attrs.find(it->first) == attrs.end()) {
3988 attrs[it->first] = it->second;
3989 }
3990 }
3991 break;
3992 }
3993}
3994
11fdf7f2 3995int RGWRados::rewrite_obj(RGWBucketInfo& dest_bucket_info, const rgw_obj& obj)
7c673cae
FG
3996{
3997 map<string, bufferlist> attrset;
3998
3999 real_time mtime;
4000 uint64_t obj_size;
4001 RGWObjectCtx rctx(this);
4002
4003 RGWRados::Object op_target(this, dest_bucket_info, rctx, obj);
4004 RGWRados::Object::Read read_op(&op_target);
4005
4006 read_op.params.attrs = &attrset;
4007 read_op.params.lastmod = &mtime;
4008 read_op.params.obj_size = &obj_size;
4009
4010 int ret = read_op.prepare();
4011 if (ret < 0)
4012 return ret;
4013
4014 attrset.erase(RGW_ATTR_ID_TAG);
181888fb 4015 attrset.erase(RGW_ATTR_TAIL_TAG);
7c673cae 4016
11fdf7f2
TL
4017 return copy_obj_data(rctx, dest_bucket_info, dest_bucket_info.placement_rule,
4018 read_op, obj_size - 1, obj, NULL, mtime, attrset,
4019 0, real_time(), NULL);
7c673cae
FG
4020}
4021
4022struct obj_time_weight {
4023 real_time mtime;
4024 uint32_t zone_short_id;
4025 uint64_t pg_ver;
4026 bool high_precision;
4027
4028 obj_time_weight() : zone_short_id(0), pg_ver(0), high_precision(false) {}
4029
4030 bool compare_low_precision(const obj_time_weight& rhs) {
4031 struct timespec l = ceph::real_clock::to_timespec(mtime);
4032 struct timespec r = ceph::real_clock::to_timespec(rhs.mtime);
4033 l.tv_nsec = 0;
4034 r.tv_nsec = 0;
4035 if (l > r) {
4036 return false;
4037 }
4038 if (l < r) {
4039 return true;
4040 }
11fdf7f2
TL
4041 if (!zone_short_id || !rhs.zone_short_id) {
4042 /* don't compare zone ids, if one wasn't provided */
4043 return false;
4044 }
7c673cae
FG
4045 if (zone_short_id != rhs.zone_short_id) {
4046 return (zone_short_id < rhs.zone_short_id);
4047 }
4048 return (pg_ver < rhs.pg_ver);
4049
4050 }
4051
4052 bool operator<(const obj_time_weight& rhs) {
4053 if (!high_precision || !rhs.high_precision) {
4054 return compare_low_precision(rhs);
4055 }
4056 if (mtime > rhs.mtime) {
4057 return false;
4058 }
4059 if (mtime < rhs.mtime) {
4060 return true;
4061 }
11fdf7f2
TL
4062 if (!zone_short_id || !rhs.zone_short_id) {
4063 /* don't compare zone ids, if one wasn't provided */
4064 return false;
4065 }
7c673cae
FG
4066 if (zone_short_id != rhs.zone_short_id) {
4067 return (zone_short_id < rhs.zone_short_id);
4068 }
4069 return (pg_ver < rhs.pg_ver);
4070 }
4071
4072 void init(const real_time& _mtime, uint32_t _short_id, uint64_t _pg_ver) {
4073 mtime = _mtime;
4074 zone_short_id = _short_id;
4075 pg_ver = _pg_ver;
4076 }
4077
4078 void init(RGWObjState *state) {
4079 mtime = state->mtime;
4080 zone_short_id = state->zone_short_id;
4081 pg_ver = state->pg_ver;
4082 }
4083};
4084
4085inline ostream& operator<<(ostream& out, const obj_time_weight &o) {
4086 out << o.mtime;
4087
4088 if (o.zone_short_id != 0 || o.pg_ver != 0) {
4089 out << "[zid=" << o.zone_short_id << ", pgv=" << o.pg_ver << "]";
4090 }
4091
4092 return out;
4093}
4094
11fdf7f2 4095class RGWGetExtraDataCB : public RGWHTTPStreamRWRequest::ReceiveCB {
7c673cae
FG
4096 bufferlist extra_data;
4097public:
4098 RGWGetExtraDataCB() {}
11fdf7f2
TL
4099 int handle_data(bufferlist& bl, bool *pause) override {
4100 int bl_len = (int)bl.length();
7c673cae
FG
4101 if (extra_data.length() < extra_data_len) {
4102 off_t max = extra_data_len - extra_data.length();
4103 if (max > bl_len) {
4104 max = bl_len;
4105 }
4106 bl.splice(0, max, &extra_data);
4107 }
4108 return bl_len;
4109 }
4110
4111 bufferlist& get_extra_data() {
4112 return extra_data;
4113 }
4114};
4115
4116int RGWRados::stat_remote_obj(RGWObjectCtx& obj_ctx,
4117 const rgw_user& user_id,
7c673cae
FG
4118 req_info *info,
4119 const string& source_zone,
4120 rgw_obj& src_obj,
4121 RGWBucketInfo& src_bucket_info,
4122 real_time *src_mtime,
4123 uint64_t *psize,
4124 const real_time *mod_ptr,
4125 const real_time *unmod_ptr,
4126 bool high_precision_time,
4127 const char *if_match,
4128 const char *if_nomatch,
4129 map<string, bufferlist> *pattrs,
11fdf7f2 4130 map<string, string> *pheaders,
7c673cae
FG
4131 string *version_id,
4132 string *ptag,
4133 string *petag)
4134{
4135 /* source is in a different zonegroup, copy from there */
4136
4137 RGWRESTStreamRWRequest *in_stream_req;
4138 string tag;
4139 map<string, bufferlist> src_attrs;
4140 append_rand_alpha(cct, tag, tag, 32);
4141 obj_time_weight set_mtime_weight;
4142 set_mtime_weight.high_precision = high_precision_time;
4143
4144 RGWRESTConn *conn;
4145 if (source_zone.empty()) {
4146 if (src_bucket_info.zonegroup.empty()) {
4147 /* source is in the master zonegroup */
11fdf7f2 4148 conn = svc.zone->get_master_conn();
7c673cae 4149 } else {
11fdf7f2 4150 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
7c673cae
FG
4151 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
4152 if (iter == zonegroup_conn_map.end()) {
4153 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
4154 return -ENOENT;
4155 }
4156 conn = iter->second;
4157 }
4158 } else {
11fdf7f2 4159 auto& zone_conn_map = svc.zone->get_zone_conn_map();
7c673cae
FG
4160 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
4161 if (iter == zone_conn_map.end()) {
4162 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
4163 return -ENOENT;
4164 }
4165 conn = iter->second;
4166 }
4167
4168 RGWGetExtraDataCB cb;
7c673cae
FG
4169 map<string, string> req_headers;
4170 real_time set_mtime;
4171
4172 const real_time *pmod = mod_ptr;
4173
4174 obj_time_weight dest_mtime_weight;
4175
181888fb
FG
4176 constexpr bool prepend_meta = true;
4177 constexpr bool get_op = true;
4178 constexpr bool rgwx_stat = true;
4179 constexpr bool sync_manifest = true;
4180 constexpr bool skip_decrypt = true;
7c673cae
FG
4181 int ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
4182 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 4183 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
4184 sync_manifest, skip_decrypt,
4185 true, &cb, &in_stream_req);
7c673cae
FG
4186 if (ret < 0) {
4187 return ret;
4188 }
4189
11fdf7f2 4190 ret = conn->complete_request(in_stream_req, nullptr, &set_mtime, psize, nullptr, pheaders);
7c673cae
FG
4191 if (ret < 0) {
4192 return ret;
4193 }
4194
4195 bufferlist& extra_data_bl = cb.get_extra_data();
4196 if (extra_data_bl.length()) {
4197 JSONParser jp;
4198 if (!jp.parse(extra_data_bl.c_str(), extra_data_bl.length())) {
4199 ldout(cct, 0) << "failed to parse response extra data. len=" << extra_data_bl.length() << " data=" << extra_data_bl.c_str() << dendl;
4200 return -EIO;
4201 }
4202
4203 JSONDecoder::decode_json("attrs", src_attrs, &jp);
4204
4205 src_attrs.erase(RGW_ATTR_MANIFEST); // not interested in original object layout
4206 }
4207
4208 if (src_mtime) {
4209 *src_mtime = set_mtime;
4210 }
4211
4212 if (petag) {
4213 map<string, bufferlist>::iterator iter = src_attrs.find(RGW_ATTR_ETAG);
4214 if (iter != src_attrs.end()) {
4215 bufferlist& etagbl = iter->second;
4216 *petag = etagbl.to_str();
11fdf7f2
TL
4217 while (petag->size() > 0 && (*petag)[petag->size() - 1] == '\0') {
4218 *petag = petag->substr(0, petag->size() - 1);
4219 }
7c673cae
FG
4220 }
4221 }
4222
4223 if (pattrs) {
11fdf7f2 4224 *pattrs = std::move(src_attrs);
7c673cae
FG
4225 }
4226
4227 return 0;
4228}
4229
4230int RGWRados::fetch_remote_obj(RGWObjectCtx& obj_ctx,
4231 const rgw_user& user_id,
7c673cae
FG
4232 req_info *info,
4233 const string& source_zone,
11fdf7f2
TL
4234 const rgw_obj& dest_obj,
4235 const rgw_obj& src_obj,
7c673cae
FG
4236 RGWBucketInfo& dest_bucket_info,
4237 RGWBucketInfo& src_bucket_info,
11fdf7f2 4238 std::optional<rgw_placement_rule> dest_placement_rule,
7c673cae
FG
4239 real_time *src_mtime,
4240 real_time *mtime,
4241 const real_time *mod_ptr,
4242 const real_time *unmod_ptr,
4243 bool high_precision_time,
4244 const char *if_match,
4245 const char *if_nomatch,
4246 AttrsMod attrs_mod,
4247 bool copy_if_newer,
4248 map<string, bufferlist>& attrs,
4249 RGWObjCategory category,
11fdf7f2 4250 std::optional<uint64_t> olh_epoch,
7c673cae 4251 real_time delete_at,
7c673cae 4252 string *ptag,
11fdf7f2 4253 string *petag,
7c673cae 4254 void (*progress_cb)(off_t, void *),
31f18b77 4255 void *progress_data,
81eedcae
TL
4256 rgw_zone_set *zones_trace,
4257 std::optional<uint64_t>* bytes_transferred)
7c673cae
FG
4258{
4259 /* source is in a different zonegroup, copy from there */
4260
4261 RGWRESTStreamRWRequest *in_stream_req;
4262 string tag;
4263 int i;
4264 append_rand_alpha(cct, tag, tag, 32);
4265 obj_time_weight set_mtime_weight;
4266 set_mtime_weight.high_precision = high_precision_time;
11fdf7f2 4267 int ret;
7c673cae 4268
11fdf7f2
TL
4269 rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
4270 using namespace rgw::putobj;
4271 const rgw_placement_rule *ptail_rule = (dest_placement_rule ? &(*dest_placement_rule) : nullptr);
4272 AtomicObjectProcessor processor(&aio, this, dest_bucket_info, ptail_rule, user_id,
4273 obj_ctx, dest_obj, olh_epoch, tag);
7c673cae 4274 RGWRESTConn *conn;
11fdf7f2
TL
4275 auto& zone_conn_map = svc.zone->get_zone_conn_map();
4276 auto& zonegroup_conn_map = svc.zone->get_zonegroup_conn_map();
7c673cae
FG
4277 if (source_zone.empty()) {
4278 if (dest_bucket_info.zonegroup.empty()) {
4279 /* source is in the master zonegroup */
11fdf7f2 4280 conn = svc.zone->get_master_conn();
7c673cae
FG
4281 } else {
4282 map<string, RGWRESTConn *>::iterator iter = zonegroup_conn_map.find(src_bucket_info.zonegroup);
4283 if (iter == zonegroup_conn_map.end()) {
4284 ldout(cct, 0) << "could not find zonegroup connection to zonegroup: " << source_zone << dendl;
4285 return -ENOENT;
4286 }
4287 conn = iter->second;
4288 }
4289 } else {
4290 map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
4291 if (iter == zone_conn_map.end()) {
4292 ldout(cct, 0) << "could not find zone connection to zone: " << source_zone << dendl;
11fdf7f2 4293 return -ENOENT;
7c673cae 4294 }
11fdf7f2 4295 conn = iter->second;
7c673cae
FG
4296 }
4297
11fdf7f2
TL
4298 string obj_name = dest_obj.bucket.name + "/" + dest_obj.get_oid();
4299
7c673cae
FG
4300 boost::optional<RGWPutObj_Compress> compressor;
4301 CompressorRef plugin;
4302
11fdf7f2
TL
4303 rgw_placement_rule dest_rule;
4304 RGWRadosPutObj cb(cct, plugin, compressor, &processor, progress_cb, progress_data,
4305 [&](const map<string, bufferlist>& obj_attrs) {
4306 if (!ptail_rule) {
4307 auto iter = obj_attrs.find(RGW_ATTR_STORAGE_CLASS);
4308 if (iter != obj_attrs.end()) {
4309 dest_rule.storage_class = iter->second.to_str();
4310 dest_rule.inherit_from(dest_bucket_info.placement_rule);
4311 processor.set_tail_placement(std::move(dest_rule));
4312 ptail_rule = &dest_rule;
4313 } else {
4314 ptail_rule = &dest_bucket_info.placement_rule;
4315 }
4316 }
4317 const auto& compression_type = svc.zone->get_zone_params().get_compression_type(*ptail_rule);
4318 if (compression_type != "none") {
4319 plugin = Compressor::create(cct, compression_type);
4320 if (!plugin) {
4321 ldout(cct, 1) << "Cannot load plugin for compression type "
4322 << compression_type << dendl;
4323 }
4324 }
4325
4326 int ret = processor.prepare();
4327 if (ret < 0) {
4328 return ret;
4329 }
4330 return 0;
4331 });
7c673cae
FG
4332
4333 string etag;
7c673cae 4334 real_time set_mtime;
81eedcae 4335 uint64_t expected_size = 0;
7c673cae
FG
4336
4337 RGWObjState *dest_state = NULL;
4338
4339 const real_time *pmod = mod_ptr;
4340
4341 obj_time_weight dest_mtime_weight;
4342
4343 if (copy_if_newer) {
4344 /* need to get mtime for destination */
4345 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
4346 if (ret < 0)
4347 goto set_err_state;
4348
4349 if (!real_clock::is_zero(dest_state->mtime)) {
4350 dest_mtime_weight.init(dest_state);
4351 pmod = &dest_mtime_weight.mtime;
4352 }
4353 }
4354
181888fb
FG
4355 static constexpr bool prepend_meta = true;
4356 static constexpr bool get_op = true;
4357 static constexpr bool rgwx_stat = false;
4358 static constexpr bool sync_manifest = true;
4359 static constexpr bool skip_decrypt = true;
7c673cae
FG
4360 ret = conn->get_obj(user_id, info, src_obj, pmod, unmod_ptr,
4361 dest_mtime_weight.zone_short_id, dest_mtime_weight.pg_ver,
181888fb 4362 prepend_meta, get_op, rgwx_stat,
11fdf7f2
TL
4363 sync_manifest, skip_decrypt,
4364 true,
4365 &cb, &in_stream_req);
7c673cae
FG
4366 if (ret < 0) {
4367 goto set_err_state;
4368 }
4369
81eedcae
TL
4370 ret = conn->complete_request(in_stream_req, &etag, &set_mtime,
4371 &expected_size, nullptr, nullptr);
7c673cae
FG
4372 if (ret < 0) {
4373 goto set_err_state;
4374 }
28e407b8
AA
4375 ret = cb.flush();
4376 if (ret < 0) {
4377 goto set_err_state;
4378 }
81eedcae
TL
4379 if (cb.get_data_len() != expected_size) {
4380 ret = -EIO;
4381 ldout(cct, 0) << "ERROR: object truncated during fetching, expected "
4382 << expected_size << " bytes but received " << cb.get_data_len() << dendl;
4383 goto set_err_state;
4384 }
7c673cae
FG
4385 if (compressor && compressor->is_compressed()) {
4386 bufferlist tmp;
4387 RGWCompressionInfo cs_info;
4388 cs_info.compression_type = plugin->get_type_name();
4389 cs_info.orig_size = cb.get_data_len();
4390 cs_info.blocks = move(compressor->get_compression_blocks());
11fdf7f2 4391 encode(cs_info, tmp);
7c673cae
FG
4392 cb.get_attrs()[RGW_ATTR_COMPRESSION] = tmp;
4393 }
4394
4395 if (source_zone.empty()) { /* need to preserve expiration if copy in the same zonegroup */
4396 cb.get_attrs().erase(RGW_ATTR_DELETE_AT);
4397 } else {
4398 map<string, bufferlist>::iterator iter = cb.get_attrs().find(RGW_ATTR_DELETE_AT);
4399 if (iter != cb.get_attrs().end()) {
4400 try {
11fdf7f2 4401 decode(delete_at, iter->second);
7c673cae
FG
4402 } catch (buffer::error& err) {
4403 ldout(cct, 0) << "ERROR: failed to decode delete_at field in intra zone copy" << dendl;
4404 }
4405 }
4406 }
4407
4408 if (src_mtime) {
4409 *src_mtime = set_mtime;
4410 }
4411
4412 if (petag) {
4413 const auto iter = cb.get_attrs().find(RGW_ATTR_ETAG);
4414 if (iter != cb.get_attrs().end()) {
11fdf7f2 4415 *petag = iter->second.to_str();
7c673cae
FG
4416 }
4417 }
4418
11fdf7f2
TL
4419 //erase the append attr
4420 cb.get_attrs().erase(RGW_ATTR_APPEND_PART_NUM);
4421
7c673cae
FG
4422 if (source_zone.empty()) {
4423 set_copy_attrs(cb.get_attrs(), attrs, attrs_mod);
4424 } else {
4425 attrs = cb.get_attrs();
4426 }
4427
4428 if (copy_if_newer) {
4429 uint64_t pg_ver = 0;
4430 auto i = attrs.find(RGW_ATTR_PG_VER);
4431 if (i != attrs.end() && i->second.length() > 0) {
11fdf7f2 4432 auto iter = i->second.cbegin();
7c673cae 4433 try {
11fdf7f2 4434 decode(pg_ver, iter);
7c673cae
FG
4435 } catch (buffer::error& err) {
4436 ldout(ctx(), 0) << "ERROR: failed to decode pg ver attribute, ignoring" << dendl;
4437 /* non critical error */
4438 }
4439 }
11fdf7f2 4440 set_mtime_weight.init(set_mtime, svc.zone->get_zone_short_id(), pg_ver);
7c673cae
FG
4441 }
4442
4443#define MAX_COMPLETE_RETRY 100
4444 for (i = 0; i < MAX_COMPLETE_RETRY; i++) {
11fdf7f2
TL
4445 bool canceled = false;
4446 ret = processor.complete(cb.get_data_len(), etag, mtime, set_mtime,
4447 attrs, delete_at, nullptr, nullptr, nullptr,
4448 zones_trace, &canceled);
7c673cae
FG
4449 if (ret < 0) {
4450 goto set_err_state;
4451 }
11fdf7f2 4452 if (copy_if_newer && canceled) {
7c673cae 4453 ldout(cct, 20) << "raced with another write of obj: " << dest_obj << dendl;
11fdf7f2 4454 obj_ctx.invalidate(dest_obj); /* object was overwritten */
7c673cae
FG
4455 ret = get_obj_state(&obj_ctx, dest_bucket_info, dest_obj, &dest_state, false);
4456 if (ret < 0) {
4457 ldout(cct, 0) << "ERROR: " << __func__ << ": get_err_state() returned ret=" << ret << dendl;
4458 goto set_err_state;
4459 }
4460 dest_mtime_weight.init(dest_state);
4461 dest_mtime_weight.high_precision = high_precision_time;
4462 if (!dest_state->exists ||
4463 dest_mtime_weight < set_mtime_weight) {
4464 ldout(cct, 20) << "retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4465 continue;
4466 } else {
4467 ldout(cct, 20) << "not retrying writing object mtime=" << set_mtime << " dest_state->mtime=" << dest_state->mtime << " dest_state->exists=" << dest_state->exists << dendl;
4468 }
4469 }
4470 break;
4471 }
4472
4473 if (i == MAX_COMPLETE_RETRY) {
4474 ldout(cct, 0) << "ERROR: retried object completion too many times, something is wrong!" << dendl;
4475 ret = -EIO;
4476 goto set_err_state;
4477 }
4478
81eedcae
TL
4479 if (bytes_transferred) {
4480 *bytes_transferred = cb.get_data_len();
4481 }
7c673cae
FG
4482 return 0;
4483set_err_state:
4484 if (copy_if_newer && ret == -ERR_NOT_MODIFIED) {
91327a77
AA
4485 // we may have already fetched during sync of OP_ADD, but were waiting
4486 // for OP_LINK_OLH to call set_olh() with a real olh_epoch
4487 if (olh_epoch && *olh_epoch > 0) {
4488 constexpr bool log_data_change = true;
4489 ret = set_olh(obj_ctx, dest_bucket_info, dest_obj, false, nullptr,
4490 *olh_epoch, real_time(), false, zones_trace, log_data_change);
4491 } else {
4492 // we already have the latest copy
4493 ret = 0;
4494 }
7c673cae 4495 }
7c673cae
FG
4496 return ret;
4497}
4498
4499
4500int RGWRados::copy_obj_to_remote_dest(RGWObjState *astate,
4501 map<string, bufferlist>& src_attrs,
4502 RGWRados::Object::Read& read_op,
4503 const rgw_user& user_id,
4504 rgw_obj& dest_obj,
4505 real_time *mtime)
4506{
4507 string etag;
4508
11fdf7f2 4509 RGWRESTStreamS3PutObj *out_stream_req;
7c673cae 4510
11fdf7f2
TL
4511 auto rest_master_conn = svc.zone->get_master_conn();
4512
4513 int ret = rest_master_conn->put_obj_async(user_id, dest_obj, astate->size, src_attrs, true, &out_stream_req);
7c673cae 4514 if (ret < 0) {
7c673cae
FG
4515 return ret;
4516 }
4517
4518 ret = read_op.iterate(0, astate->size - 1, out_stream_req->get_out_cb());
224ce89b
WB
4519 if (ret < 0) {
4520 delete out_stream_req;
7c673cae 4521 return ret;
224ce89b 4522 }
7c673cae
FG
4523
4524 ret = rest_master_conn->complete_request(out_stream_req, etag, mtime);
4525 if (ret < 0)
4526 return ret;
4527
4528 return 0;
4529}
4530
4531/**
4532 * Copy an object.
4533 * dest_obj: the object to copy into
4534 * src_obj: the object to copy from
4535 * attrs: usage depends on attrs_mod parameter
4536 * attrs_mod: the modification mode of the attrs, may have the following values:
4537 * ATTRSMOD_NONE - the attributes of the source object will be
4538 * copied without modifications, attrs parameter is ignored;
4539 * ATTRSMOD_REPLACE - new object will have the attributes provided by attrs
4540 * parameter, source object attributes are not copied;
4541 * ATTRSMOD_MERGE - any conflicting meta keys on the source object's attributes
4542 * are overwritten by values contained in attrs parameter.
4543 * err: stores any errors resulting from the get of the original object
4544 * Returns: 0 on success, -ERR# otherwise.
4545 */
4546int RGWRados::copy_obj(RGWObjectCtx& obj_ctx,
4547 const rgw_user& user_id,
7c673cae
FG
4548 req_info *info,
4549 const string& source_zone,
4550 rgw_obj& dest_obj,
4551 rgw_obj& src_obj,
4552 RGWBucketInfo& dest_bucket_info,
4553 RGWBucketInfo& src_bucket_info,
11fdf7f2 4554 const rgw_placement_rule& dest_placement,
7c673cae
FG
4555 real_time *src_mtime,
4556 real_time *mtime,
4557 const real_time *mod_ptr,
4558 const real_time *unmod_ptr,
4559 bool high_precision_time,
4560 const char *if_match,
4561 const char *if_nomatch,
4562 AttrsMod attrs_mod,
4563 bool copy_if_newer,
4564 map<string, bufferlist>& attrs,
4565 RGWObjCategory category,
4566 uint64_t olh_epoch,
4567 real_time delete_at,
4568 string *version_id,
4569 string *ptag,
11fdf7f2 4570 string *petag,
7c673cae
FG
4571 void (*progress_cb)(off_t, void *),
4572 void *progress_data)
4573{
4574 int ret;
4575 uint64_t obj_size;
4576 rgw_obj shadow_obj = dest_obj;
4577 string shadow_oid;
4578
4579 bool remote_src;
4580 bool remote_dest;
4581
4582 append_rand_alpha(cct, dest_obj.get_oid(), shadow_oid, 32);
4583 shadow_obj.init_ns(dest_obj.bucket, shadow_oid, shadow_ns);
4584
11fdf7f2
TL
4585 auto& zonegroup = svc.zone->get_zonegroup();
4586
4587 remote_dest = !zonegroup.equals(dest_bucket_info.zonegroup);
4588 remote_src = !zonegroup.equals(src_bucket_info.zonegroup);
7c673cae
FG
4589
4590 if (remote_src && remote_dest) {
4591 ldout(cct, 0) << "ERROR: can't copy object when both src and dest buckets are remote" << dendl;
4592 return -EINVAL;
4593 }
4594
4595 ldout(cct, 5) << "Copy object " << src_obj.bucket << ":" << src_obj.get_oid() << " => " << dest_obj.bucket << ":" << dest_obj.get_oid() << dendl;
4596
4597 if (remote_src || !source_zone.empty()) {
11fdf7f2
TL
4598 return fetch_remote_obj(obj_ctx, user_id, info, source_zone,
4599 dest_obj, src_obj, dest_bucket_info, src_bucket_info,
4600 dest_placement, src_mtime, mtime, mod_ptr,
7c673cae
FG
4601 unmod_ptr, high_precision_time,
4602 if_match, if_nomatch, attrs_mod, copy_if_newer, attrs, category,
11fdf7f2 4603 olh_epoch, delete_at, ptag, petag, progress_cb, progress_data);
7c673cae
FG
4604 }
4605
4606 map<string, bufferlist> src_attrs;
4607 RGWRados::Object src_op_target(this, src_bucket_info, obj_ctx, src_obj);
4608 RGWRados::Object::Read read_op(&src_op_target);
4609
4610 read_op.conds.mod_ptr = mod_ptr;
4611 read_op.conds.unmod_ptr = unmod_ptr;
4612 read_op.conds.high_precision_time = high_precision_time;
4613 read_op.conds.if_match = if_match;
4614 read_op.conds.if_nomatch = if_nomatch;
4615 read_op.params.attrs = &src_attrs;
4616 read_op.params.lastmod = src_mtime;
4617 read_op.params.obj_size = &obj_size;
7c673cae
FG
4618
4619 ret = read_op.prepare();
4620 if (ret < 0) {
4621 return ret;
4622 }
94b18763
FG
4623 if (src_attrs.count(RGW_ATTR_CRYPT_MODE)) {
4624 // Current implementation does not follow S3 spec and even
4625 // may result in data corruption silently when copying
4626 // multipart objects acorss pools. So reject COPY operations
4627 //on encrypted objects before it is fully functional.
4628 ldout(cct, 0) << "ERROR: copy op for encrypted object " << src_obj
4629 << " has not been implemented." << dendl;
4630 return -ERR_NOT_IMPLEMENTED;
4631 }
7c673cae
FG
4632
4633 src_attrs[RGW_ATTR_ACL] = attrs[RGW_ATTR_ACL];
4634 src_attrs.erase(RGW_ATTR_DELETE_AT);
4635
4636 set_copy_attrs(src_attrs, attrs, attrs_mod);
4637 attrs.erase(RGW_ATTR_ID_TAG);
4638 attrs.erase(RGW_ATTR_PG_VER);
4639 attrs.erase(RGW_ATTR_SOURCE_ZONE);
4640 map<string, bufferlist>::iterator cmp = src_attrs.find(RGW_ATTR_COMPRESSION);
4641 if (cmp != src_attrs.end())
4642 attrs[RGW_ATTR_COMPRESSION] = cmp->second;
4643
4644 RGWObjManifest manifest;
4645 RGWObjState *astate = NULL;
4646
4647 ret = get_obj_state(&obj_ctx, src_bucket_info, src_obj, &astate);
4648 if (ret < 0) {
4649 return ret;
4650 }
4651
4652 vector<rgw_raw_obj> ref_objs;
4653
4654 if (remote_dest) {
4655 /* dest is in a different zonegroup, copy it there */
4656 return copy_obj_to_remote_dest(astate, attrs, read_op, user_id, dest_obj, mtime);
4657 }
4658 uint64_t max_chunk_size;
4659
4660 ret = get_max_chunk_size(dest_bucket_info.placement_rule, dest_obj, &max_chunk_size);
4661 if (ret < 0) {
4662 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for bucket " << dest_obj.bucket << dendl;
4663 return ret;
4664 }
4665
4666 rgw_pool src_pool;
4667 rgw_pool dest_pool;
11fdf7f2
TL
4668
4669 const rgw_placement_rule *src_rule{nullptr};
4670
4671 if (astate->has_manifest) {
4672 src_rule = &astate->manifest.get_tail_placement().placement_rule;
4673 ldout(cct, 20) << __func__ << "(): manifest src_rule=" << src_rule->to_str() << dendl;
4674 }
4675
4676 if (!src_rule || src_rule->empty()) {
4677 src_rule = &src_bucket_info.placement_rule;
4678 }
4679
4680 if (!get_obj_data_pool(*src_rule, src_obj, &src_pool)) {
7c673cae
FG
4681 ldout(cct, 0) << "ERROR: failed to locate data pool for " << src_obj << dendl;
4682 return -EIO;
4683 }
11fdf7f2
TL
4684
4685 if (!get_obj_data_pool(dest_placement, dest_obj, &dest_pool)) {
7c673cae
FG
4686 ldout(cct, 0) << "ERROR: failed to locate data pool for " << dest_obj << dendl;
4687 return -EIO;
4688 }
4689
11fdf7f2
TL
4690 ldout(cct, 20) << __func__ << "(): src_rule=" << src_rule->to_str() << " src_pool=" << src_pool
4691 << " dest_rule=" << dest_placement.to_str() << " dest_pool=" << dest_pool << dendl;
4692
4693 bool copy_data = !astate->has_manifest ||
4694 (*src_rule != dest_placement) ||
4695 (src_pool != dest_pool);
7c673cae 4696
7c673cae
FG
4697 bool copy_first = false;
4698 if (astate->has_manifest) {
4699 if (!astate->manifest.has_tail()) {
4700 copy_data = true;
4701 } else {
4702 uint64_t head_size = astate->manifest.get_head_size();
4703
4704 if (head_size > 0) {
4705 if (head_size > max_chunk_size) {
4706 copy_data = true;
4707 } else {
4708 copy_first = true;
4709 }
4710 }
4711 }
4712 }
4713
4714 if (petag) {
4715 const auto iter = attrs.find(RGW_ATTR_ETAG);
4716 if (iter != attrs.end()) {
11fdf7f2 4717 *petag = iter->second.to_str();
7c673cae
FG
4718 }
4719 }
4720
4721 if (copy_data) { /* refcounting tail wouldn't work here, just copy the data */
11fdf7f2
TL
4722 attrs.erase(RGW_ATTR_TAIL_TAG);
4723 return copy_obj_data(obj_ctx, dest_bucket_info, dest_placement, read_op, obj_size - 1, dest_obj,
4724 mtime, real_time(), attrs, olh_epoch, delete_at, petag);
7c673cae
FG
4725 }
4726
4727 RGWObjManifest::obj_iterator miter = astate->manifest.obj_begin();
4728
4729 if (copy_first) { // we need to copy first chunk, not increase refcount
4730 ++miter;
4731 }
4732
4733 rgw_rados_ref ref;
4734 ret = get_raw_obj_ref(miter.get_location().get_raw_obj(this), &ref);
4735 if (ret < 0) {
4736 return ret;
4737 }
4738
7c673cae
FG
4739 bufferlist first_chunk;
4740
4741 bool copy_itself = (dest_obj == src_obj);
4742 RGWObjManifest *pmanifest;
31f18b77 4743 ldout(cct, 20) << "dest_obj=" << dest_obj << " src_obj=" << src_obj << " copy_itself=" << (int)copy_itself << dendl;
7c673cae
FG
4744
4745 RGWRados::Object dest_op_target(this, dest_bucket_info, obj_ctx, dest_obj);
4746 RGWRados::Object::Write write_op(&dest_op_target);
4747
4748 string tag;
4749
4750 if (ptag) {
4751 tag = *ptag;
4752 }
4753
4754 if (tag.empty()) {
4755 append_rand_alpha(cct, tag, tag, 32);
4756 }
4757
4758 if (!copy_itself) {
181888fb 4759 attrs.erase(RGW_ATTR_TAIL_TAG);
7c673cae
FG
4760 manifest = astate->manifest;
4761 const rgw_bucket_placement& tail_placement = manifest.get_tail_placement();
4762 if (tail_placement.bucket.name.empty()) {
4763 manifest.set_tail_placement(tail_placement.placement_rule, src_obj.bucket);
4764 }
3efd9988 4765 string ref_tag;
7c673cae
FG
4766 for (; miter != astate->manifest.obj_end(); ++miter) {
4767 ObjectWriteOperation op;
3efd9988
FG
4768 ref_tag = tag + '\0';
4769 cls_refcount_get(op, ref_tag, true);
7c673cae
FG
4770 const rgw_raw_obj& loc = miter.get_location().get_raw_obj(this);
4771 ref.ioctx.locator_set_key(loc.loc);
4772
4773 ret = ref.ioctx.operate(loc.oid, &op);
4774 if (ret < 0) {
4775 goto done_ret;
4776 }
4777
4778 ref_objs.push_back(loc);
4779 }
4780
4781 pmanifest = &manifest;
4782 } else {
4783 pmanifest = &astate->manifest;
4784 /* don't send the object's tail for garbage collection */
4785 astate->keep_tail = true;
4786 }
4787
4788 if (copy_first) {
4789 ret = read_op.read(0, max_chunk_size, first_chunk);
4790 if (ret < 0) {
4791 goto done_ret;
4792 }
4793
4794 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, first_chunk.length());
4795 } else {
4796 pmanifest->set_head(dest_bucket_info.placement_rule, dest_obj, 0);
4797 }
4798
4799 write_op.meta.data = &first_chunk;
4800 write_op.meta.manifest = pmanifest;
4801 write_op.meta.ptag = &tag;
4802 write_op.meta.owner = dest_bucket_info.owner;
4803 write_op.meta.mtime = mtime;
4804 write_op.meta.flags = PUT_OBJ_CREATE;
4805 write_op.meta.category = category;
4806 write_op.meta.olh_epoch = olh_epoch;
4807 write_op.meta.delete_at = delete_at;
181888fb 4808 write_op.meta.modify_tail = !copy_itself;
7c673cae
FG
4809
4810 ret = write_op.write_meta(obj_size, astate->accounted_size, attrs);
4811 if (ret < 0) {
4812 goto done_ret;
4813 }
4814
4815 return 0;
4816
4817done_ret:
4818 if (!copy_itself) {
4819 vector<rgw_raw_obj>::iterator riter;
4820
7c673cae
FG
4821 /* rollback reference */
4822 for (riter = ref_objs.begin(); riter != ref_objs.end(); ++riter) {
4823 ObjectWriteOperation op;
4824 cls_refcount_put(op, tag, true);
4825
4826 ref.ioctx.locator_set_key(riter->loc);
4827
4828 int r = ref.ioctx.operate(riter->oid, &op);
4829 if (r < 0) {
4830 ldout(cct, 0) << "ERROR: cleanup after error failed to drop reference on obj=" << *riter << dendl;
4831 }
4832 }
4833 }
4834 return ret;
4835}
4836
4837
4838int RGWRados::copy_obj_data(RGWObjectCtx& obj_ctx,
4839 RGWBucketInfo& dest_bucket_info,
11fdf7f2 4840 const rgw_placement_rule& dest_placement,
7c673cae 4841 RGWRados::Object::Read& read_op, off_t end,
11fdf7f2 4842 const rgw_obj& dest_obj,
7c673cae
FG
4843 real_time *mtime,
4844 real_time set_mtime,
4845 map<string, bufferlist>& attrs,
7c673cae
FG
4846 uint64_t olh_epoch,
4847 real_time delete_at,
11fdf7f2 4848 string *petag)
7c673cae 4849{
7c673cae
FG
4850 string tag;
4851 append_rand_alpha(cct, tag, tag, 32);
4852
11fdf7f2
TL
4853 rgw::AioThrottle aio(cct->_conf->rgw_put_obj_min_window_size);
4854 using namespace rgw::putobj;
4855 AtomicObjectProcessor processor(&aio, this, dest_bucket_info, &dest_placement,
4856 dest_bucket_info.owner, obj_ctx,
4857 dest_obj, olh_epoch, tag);
4858 int ret = processor.prepare();
7c673cae
FG
4859 if (ret < 0)
4860 return ret;
4861
4862 off_t ofs = 0;
4863
4864 do {
4865 bufferlist bl;
4866 ret = read_op.read(ofs, end, bl);
11fdf7f2
TL
4867 if (ret < 0) {
4868 ldout(cct, 0) << "ERROR: fail to read object data, ret = " << ret << dendl;
4869 return ret;
4870 }
7c673cae
FG
4871
4872 uint64_t read_len = ret;
11fdf7f2
TL
4873 ret = processor.process(std::move(bl), ofs);
4874 if (ret < 0) {
4875 return ret;
4876 }
7c673cae
FG
4877
4878 ofs += read_len;
4879 } while (ofs <= end);
4880
11fdf7f2
TL
4881 // flush
4882 ret = processor.process({}, ofs);
4883 if (ret < 0) {
4884 return ret;
4885 }
4886
7c673cae
FG
4887 string etag;
4888 auto iter = attrs.find(RGW_ATTR_ETAG);
4889 if (iter != attrs.end()) {
4890 bufferlist& bl = iter->second;
11fdf7f2 4891 etag = bl.to_str();
7c673cae 4892 if (petag) {
11fdf7f2 4893 *petag = etag;
7c673cae
FG
4894 }
4895 }
4896
4897 uint64_t accounted_size;
4898 {
4899 bool compressed{false};
4900 RGWCompressionInfo cs_info;
4901 ret = rgw_compression_info_from_attrset(attrs, compressed, cs_info);
4902 if (ret < 0) {
4903 ldout(cct, 0) << "ERROR: failed to read compression info" << dendl;
4904 return ret;
4905 }
4906 // pass original size if compressed
4907 accounted_size = compressed ? cs_info.orig_size : ofs;
4908 }
4909
11fdf7f2
TL
4910 return processor.complete(accounted_size, etag, mtime, set_mtime, attrs, delete_at,
4911 nullptr, nullptr, nullptr, nullptr, nullptr);
7c673cae
FG
4912}
4913
11fdf7f2
TL
4914int RGWRados::transition_obj(RGWObjectCtx& obj_ctx,
4915 RGWBucketInfo& bucket_info,
4916 rgw_obj& obj,
4917 const rgw_placement_rule& placement_rule,
4918 const real_time& mtime,
4919 uint64_t olh_epoch)
7c673cae 4920{
11fdf7f2
TL
4921 map<string, bufferlist> attrs;
4922 real_time read_mtime;
4923 uint64_t obj_size;
7c673cae 4924
11fdf7f2
TL
4925 RGWRados::Object op_target(this, bucket_info, obj_ctx, obj);
4926 RGWRados::Object::Read read_op(&op_target);
7c673cae 4927
11fdf7f2
TL
4928 read_op.params.attrs = &attrs;
4929 read_op.params.lastmod = &read_mtime;
4930 read_op.params.obj_size = &obj_size;
7c673cae 4931
11fdf7f2
TL
4932 int ret = read_op.prepare();
4933 if (ret < 0) {
4934 return ret;
7c673cae
FG
4935 }
4936
11fdf7f2
TL
4937 if (read_mtime != mtime) {
4938 /* raced */
4939 return -ECANCELED;
7c673cae
FG
4940 }
4941
11fdf7f2
TL
4942 ret = copy_obj_data(obj_ctx,
4943 bucket_info,
4944 placement_rule,
4945 read_op,
4946 obj_size - 1,
4947 obj,
4948 nullptr /* pmtime */,
4949 mtime,
4950 attrs,
4951 olh_epoch,
4952 real_time(),
4953 nullptr /* petag */);
4954 if (ret < 0) {
4955 return ret;
7c673cae
FG
4956 }
4957
11fdf7f2 4958 return 0;
7c673cae
FG
4959}
4960
4961int RGWRados::check_bucket_empty(RGWBucketInfo& bucket_info)
4962{
1adf2230 4963 std::vector<rgw_bucket_dir_entry> ent_list;
7c673cae
FG
4964 rgw_obj_index_key marker;
4965 string prefix;
4966 bool is_truncated;
4967
4968 do {
1adf2230
AA
4969 constexpr uint NUM_ENTRIES = 1000u;
4970 int r = cls_bucket_list_unordered(bucket_info,
4971 RGW_NO_SHARD,
4972 marker,
4973 prefix,
4974 NUM_ENTRIES,
4975 true,
4976 ent_list,
4977 &is_truncated,
4978 &marker);
7c673cae
FG
4979 if (r < 0)
4980 return r;
4981
4982 string ns;
1adf2230 4983 for (auto const& dirent : ent_list) {
7c673cae
FG
4984 rgw_obj_key obj;
4985
1adf2230 4986 if (rgw_obj_key::oid_to_key_in_ns(dirent.key.name, &obj, ns))
7c673cae
FG
4987 return -ENOTEMPTY;
4988 }
4989 } while (is_truncated);
1adf2230 4990
7c673cae
FG
4991 return 0;
4992}
4993
4994/**
4995 * Delete a bucket.
4996 * bucket: the name of the bucket to delete
4997 * Returns 0 on success, -ERR# otherwise.
4998 */
4999int RGWRados::delete_bucket(RGWBucketInfo& bucket_info, RGWObjVersionTracker& objv_tracker, bool check_empty)
5000{
5001 const rgw_bucket& bucket = bucket_info.bucket;
5002 librados::IoCtx index_ctx;
5003 map<int, string> bucket_objs;
5004 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5005 if (r < 0)
5006 return r;
5007
5008 if (check_empty) {
5009 r = check_bucket_empty(bucket_info);
5010 if (r < 0) {
5011 return r;
5012 }
5013 }
5014
5015 r = rgw_bucket_delete_bucket_obj(this, bucket.tenant, bucket.name, objv_tracker);
5016 if (r < 0)
5017 return r;
5018
5019 /* if the bucket is not synced we can remove the meta file */
11fdf7f2 5020 if (!svc.zone->is_syncing_bucket_meta(bucket)) {
7c673cae 5021 RGWObjVersionTracker objv_tracker;
f64942e4 5022 r = rgw_bucket_instance_remove_entry(this, bucket.get_key(), &objv_tracker);
7c673cae
FG
5023 if (r < 0) {
5024 return r;
5025 }
f64942e4
AA
5026
5027 /* remove bucket index objects asynchronously by best effort */
5028 (void) CLSRGWIssueBucketIndexClean(index_ctx,
5029 bucket_objs,
5030 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae 5031 }
f64942e4 5032
7c673cae
FG
5033 return 0;
5034}
5035
5036int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
5037{
5038 RGWBucketInfo info;
5039 map<string, bufferlist> attrs;
11fdf7f2 5040 auto obj_ctx = svc.sysobj->init_obj_ctx();
31f18b77
FG
5041 int r;
5042 if (bucket.bucket_id.empty()) {
5043 r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
5044 } else {
5045 r = get_bucket_instance_info(obj_ctx, bucket, info, nullptr, &attrs);
5046 }
7c673cae
FG
5047 if (r < 0) {
5048 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
5049 return r;
5050 }
5051
5052 info.owner = owner.get_id();
5053
5054 r = put_bucket_instance_info(info, false, real_time(), &attrs);
5055 if (r < 0) {
5056 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
5057 return r;
5058 }
5059
5060 return 0;
5061}
5062
5063
5064int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
5065{
5066 int ret = 0;
5067
5068 vector<rgw_bucket>::iterator iter;
5069
5070 for (iter = buckets.begin(); iter != buckets.end(); ++iter) {
5071 rgw_bucket& bucket = *iter;
5072 if (enabled)
5073 ldout(cct, 20) << "enabling bucket name=" << bucket.name << dendl;
5074 else
5075 ldout(cct, 20) << "disabling bucket name=" << bucket.name << dendl;
5076
5077 RGWBucketInfo info;
5078 map<string, bufferlist> attrs;
11fdf7f2 5079 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5080 int r = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, info, NULL, &attrs);
5081 if (r < 0) {
5082 ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
5083 ret = r;
5084 continue;
5085 }
5086 if (enabled) {
5087 info.flags &= ~BUCKET_SUSPENDED;
5088 } else {
5089 info.flags |= BUCKET_SUSPENDED;
5090 }
5091
5092 r = put_bucket_instance_info(info, false, real_time(), &attrs);
5093 if (r < 0) {
5094 ldout(cct, 0) << "NOTICE: put_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
5095 ret = r;
5096 continue;
5097 }
5098 }
5099 return ret;
5100}
5101
5102int RGWRados::bucket_suspended(rgw_bucket& bucket, bool *suspended)
5103{
5104 RGWBucketInfo bucket_info;
11fdf7f2 5105 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5106 int ret = get_bucket_info(obj_ctx, bucket.tenant, bucket.name, bucket_info, NULL);
5107 if (ret < 0) {
5108 return ret;
5109 }
5110
5111 *suspended = ((bucket_info.flags & BUCKET_SUSPENDED) != 0);
5112 return 0;
5113}
5114
5115int RGWRados::Object::complete_atomic_modification()
5116{
5117 if (!state->has_manifest || state->keep_tail)
5118 return 0;
5119
5120 cls_rgw_obj_chain chain;
5121 store->update_gc_chain(obj, state->manifest, &chain);
5122
5123 if (chain.empty()) {
5124 return 0;
5125 }
5126
181888fb 5127 string tag = (state->tail_tag.length() > 0 ? state->tail_tag.to_str() : state->obj_tag.to_str());
7c673cae
FG
5128 return store->gc->send_chain(chain, tag, false); // do it async
5129}
5130
5131void RGWRados::update_gc_chain(rgw_obj& head_obj, RGWObjManifest& manifest, cls_rgw_obj_chain *chain)
5132{
5133 RGWObjManifest::obj_iterator iter;
5134 rgw_raw_obj raw_head;
5135 obj_to_raw(manifest.get_head_placement_rule(), head_obj, &raw_head);
5136 for (iter = manifest.obj_begin(); iter != manifest.obj_end(); ++iter) {
5137 const rgw_raw_obj& mobj = iter.get_location().get_raw_obj(this);
5138 if (mobj == raw_head)
5139 continue;
5140 cls_rgw_obj_key key(mobj.oid);
5141 chain->push_obj(mobj.pool.to_str(), key, mobj.loc);
5142 }
5143}
5144
5145int RGWRados::send_chain_to_gc(cls_rgw_obj_chain& chain, const string& tag, bool sync)
5146{
5147 return gc->send_chain(chain, tag, sync);
5148}
5149
1adf2230
AA
5150int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
5151 librados::IoCtx& index_ctx,
5152 string& bucket_oid)
7c673cae
FG
5153{
5154 const rgw_bucket& bucket = bucket_info.bucket;
5155 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5156 if (r < 0)
5157 return r;
5158
5159 if (bucket.bucket_id.empty()) {
5160 ldout(cct, 0) << "ERROR: empty bucket id for bucket operation" << dendl;
5161 return -EIO;
5162 }
5163
5164 bucket_oid = dir_oid_prefix;
5165 bucket_oid.append(bucket.bucket_id);
5166
5167 return 0;
5168}
5169
1adf2230
AA
5170int RGWRados::open_bucket_index_base(const RGWBucketInfo& bucket_info,
5171 librados::IoCtx& index_ctx,
5172 string& bucket_oid_base) {
7c673cae
FG
5173 const rgw_bucket& bucket = bucket_info.bucket;
5174 int r = open_bucket_index_ctx(bucket_info, index_ctx);
5175 if (r < 0)
5176 return r;
5177
5178 if (bucket.bucket_id.empty()) {
5179 ldout(cct, 0) << "ERROR: empty bucket_id for bucket operation" << dendl;
5180 return -EIO;
5181 }
5182
5183 bucket_oid_base = dir_oid_prefix;
5184 bucket_oid_base.append(bucket.bucket_id);
5185
5186 return 0;
5187
5188}
5189
1adf2230
AA
5190int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info,
5191 librados::IoCtx& index_ctx,
5192 map<int, string>& bucket_objs,
5193 int shard_id,
5194 map<int, string> *bucket_instance_ids) {
7c673cae
FG
5195 string bucket_oid_base;
5196 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5197 if (ret < 0) {
5198 return ret;
5199 }
5200
5201 get_bucket_index_objects(bucket_oid_base, bucket_info.num_shards, bucket_objs, shard_id);
5202 if (bucket_instance_ids) {
5203 get_bucket_instance_ids(bucket_info, shard_id, bucket_instance_ids);
5204 }
5205 return 0;
5206}
5207
5208template<typename T>
5209int RGWRados::open_bucket_index(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5210 map<int, string>& oids, map<int, T>& bucket_objs,
5211 int shard_id, map<int, string> *bucket_instance_ids)
5212{
5213 int ret = open_bucket_index(bucket_info, index_ctx, oids, shard_id, bucket_instance_ids);
5214 if (ret < 0)
5215 return ret;
5216
5217 map<int, string>::const_iterator iter = oids.begin();
5218 for (; iter != oids.end(); ++iter) {
5219 bucket_objs[iter->first] = T();
5220 }
5221 return 0;
5222}
5223
5224int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5225 const string& obj_key, string *bucket_obj, int *shard_id)
5226{
5227 string bucket_oid_base;
5228 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5229 if (ret < 0)
5230 return ret;
5231
5232 RGWObjectCtx obj_ctx(this);
5233
5234 ret = get_bucket_index_object(bucket_oid_base, obj_key, bucket_info.num_shards,
5235 (RGWBucketInfo::BIShardsHashType)bucket_info.bucket_index_shard_hash_type, bucket_obj, shard_id);
5236 if (ret < 0) {
5237 ldout(cct, 10) << "get_bucket_index_object() returned ret=" << ret << dendl;
5238 return ret;
5239 }
5240 return 0;
5241}
5242
5243int RGWRados::open_bucket_index_shard(const RGWBucketInfo& bucket_info, librados::IoCtx& index_ctx,
5244 int shard_id, string *bucket_obj)
5245{
5246 string bucket_oid_base;
5247 int ret = open_bucket_index_base(bucket_info, index_ctx, bucket_oid_base);
5248 if (ret < 0)
5249 return ret;
5250
5251 RGWObjectCtx obj_ctx(this);
5252
5253 get_bucket_index_object(bucket_oid_base, bucket_info.num_shards,
5254 shard_id, bucket_obj);
5255 return 0;
5256}
5257
5258static void accumulate_raw_stats(const rgw_bucket_dir_header& header,
5259 map<RGWObjCategory, RGWStorageStats>& stats)
5260{
5261 for (const auto& pair : header.stats) {
5262 const RGWObjCategory category = static_cast<RGWObjCategory>(pair.first);
5263 const rgw_bucket_category_stats& header_stats = pair.second;
5264
5265 RGWStorageStats& s = stats[category];
5266
5267 s.category = category;
5268 s.size += header_stats.total_size;
5269 s.size_rounded += header_stats.total_size_rounded;
5270 s.size_utilized += header_stats.actual_size;
5271 s.num_objects += header_stats.num_entries;
5272 }
5273}
5274
5275int RGWRados::bucket_check_index(RGWBucketInfo& bucket_info,
5276 map<RGWObjCategory, RGWStorageStats> *existing_stats,
5277 map<RGWObjCategory, RGWStorageStats> *calculated_stats)
5278{
5279 librados::IoCtx index_ctx;
5280 // key - bucket index object id
5281 // value - bucket index check OP returned result with the given bucket index object (shard)
5282 map<int, string> oids;
5283 map<int, struct rgw_cls_check_index_ret> bucket_objs_ret;
31f18b77 5284
7c673cae 5285 int ret = open_bucket_index(bucket_info, index_ctx, oids, bucket_objs_ret);
31f18b77
FG
5286 if (ret < 0) {
5287 return ret;
5288 }
7c673cae
FG
5289
5290 ret = CLSRGWIssueBucketCheck(index_ctx, oids, bucket_objs_ret, cct->_conf->rgw_bucket_index_max_aio)();
31f18b77
FG
5291 if (ret < 0) {
5292 return ret;
5293 }
7c673cae
FG
5294
5295 // Aggregate results (from different shards if there is any)
5296 map<int, struct rgw_cls_check_index_ret>::iterator iter;
5297 for (iter = bucket_objs_ret.begin(); iter != bucket_objs_ret.end(); ++iter) {
5298 accumulate_raw_stats(iter->second.existing_header, *existing_stats);
5299 accumulate_raw_stats(iter->second.calculated_header, *calculated_stats);
5300 }
5301
5302 return 0;
5303}
5304
5305int RGWRados::bucket_rebuild_index(RGWBucketInfo& bucket_info)
5306{
5307 librados::IoCtx index_ctx;
5308 map<int, string> bucket_objs;
31f18b77 5309
7c673cae 5310 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
31f18b77 5311 if (r < 0) {
7c673cae 5312 return r;
31f18b77 5313 }
7c673cae
FG
5314
5315 return CLSRGWIssueBucketRebuild(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
5316}
5317
f64942e4 5318int RGWRados::bucket_set_reshard(const RGWBucketInfo& bucket_info, const cls_rgw_bucket_instance_entry& entry)
31f18b77
FG
5319{
5320 librados::IoCtx index_ctx;
5321 map<int, string> bucket_objs;
5322
5323 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
5324 if (r < 0) {
5325 return r;
5326 }
5327
5328 return CLSRGWIssueSetBucketResharding(index_ctx, bucket_objs, entry, cct->_conf->rgw_bucket_index_max_aio)();
5329}
7c673cae
FG
5330
5331int RGWRados::defer_gc(void *ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj)
5332{
5333 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
5334 std::string oid, key;
5335 get_obj_bucket_and_oid_loc(obj, oid, key);
5336 if (!rctx)
5337 return 0;
5338
5339 RGWObjState *state = NULL;
5340
5341 int r = get_obj_state(rctx, bucket_info, obj, &state, false);
5342 if (r < 0)
5343 return r;
5344
5345 if (!state->is_atomic) {
5346 ldout(cct, 20) << "state for obj=" << obj << " is not atomic, not deferring gc operation" << dendl;
5347 return -EINVAL;
5348 }
5349
181888fb
FG
5350 string tag;
5351
5352 if (state->tail_tag.length() > 0) {
5353 tag = state->tail_tag.c_str();
5354 } else if (state->obj_tag.length() > 0) {
5355 tag = state->obj_tag.c_str();
5356 } else {
7c673cae
FG
5357 ldout(cct, 20) << "state->obj_tag is empty, not deferring gc operation" << dendl;
5358 return -EINVAL;
5359 }
5360
7c673cae
FG
5361 ldout(cct, 0) << "defer chain tag=" << tag << dendl;
5362
5363 return gc->defer_chain(tag, false);
5364}
5365
5366void RGWRados::remove_rgw_head_obj(ObjectWriteOperation& op)
5367{
5368 list<string> prefixes;
5369 prefixes.push_back(RGW_ATTR_OLH_PREFIX);
5370 cls_rgw_remove_obj(op, prefixes);
5371}
5372
5373void RGWRados::cls_obj_check_prefix_exist(ObjectOperation& op, const string& prefix, bool fail_if_exist)
5374{
5375 cls_rgw_obj_check_attrs_prefix(op, prefix, fail_if_exist);
5376}
5377
5378void RGWRados::cls_obj_check_mtime(ObjectOperation& op, const real_time& mtime, bool high_precision_time, RGWCheckMTimeType type)
5379{
5380 cls_rgw_obj_check_mtime(op, mtime, high_precision_time, type);
5381}
5382
5383
5384/**
5385 * Delete an object.
5386 * bucket: name of the bucket storing the object
5387 * obj: name of the object to delete
5388 * Returns: 0 on success, -ERR# otherwise.
5389 */
5390int RGWRados::Object::Delete::delete_obj()
5391{
5392 RGWRados *store = target->get_store();
5393 rgw_obj& src_obj = target->get_obj();
5394 const string& instance = src_obj.key.instance;
5395 rgw_obj obj = src_obj;
5396
5397 if (instance == "null") {
5398 obj.key.instance.clear();
5399 }
5400
5401 bool explicit_marker_version = (!params.marker_version_id.empty());
5402
5403 if (params.versioning_status & BUCKET_VERSIONED || explicit_marker_version) {
5404 if (instance.empty() || explicit_marker_version) {
5405 rgw_obj marker = obj;
5406
5407 if (!params.marker_version_id.empty()) {
5408 if (params.marker_version_id != "null") {
5409 marker.key.set_instance(params.marker_version_id);
5410 }
5411 } else if ((params.versioning_status & BUCKET_VERSIONS_SUSPENDED) == 0) {
5412 store->gen_rand_obj_instance_name(&marker);
5413 }
5414
5415 result.version_id = marker.key.instance;
91327a77
AA
5416 if (result.version_id.empty())
5417 result.version_id = "null";
7c673cae
FG
5418 result.delete_marker = true;
5419
5420 struct rgw_bucket_dir_entry_meta meta;
5421
5422 meta.owner = params.obj_owner.get_id().to_str();
5423 meta.owner_display_name = params.obj_owner.get_display_name();
5424
5425 if (real_clock::is_zero(params.mtime)) {
5426 meta.mtime = real_clock::now();
5427 } else {
5428 meta.mtime = params.mtime;
5429 }
5430
31f18b77 5431 int r = store->set_olh(target->get_ctx(), target->get_bucket_info(), marker, true, &meta, params.olh_epoch, params.unmod_since, params.high_precision_time, params.zones_trace);
7c673cae
FG
5432 if (r < 0) {
5433 return r;
5434 }
5435 } else {
5436 rgw_bucket_dir_entry dirent;
5437
5438 int r = store->bi_get_instance(target->get_bucket_info(), obj, &dirent);
5439 if (r < 0) {
5440 return r;
5441 }
5442 result.delete_marker = dirent.is_delete_marker();
31f18b77 5443 r = store->unlink_obj_instance(target->get_ctx(), target->get_bucket_info(), obj, params.olh_epoch, params.zones_trace);
7c673cae
FG
5444 if (r < 0) {
5445 return r;
5446 }
5447 result.version_id = instance;
5448 }
5449
5450 BucketShard *bs;
5451 int r = target->get_bucket_shard(&bs);
5452 if (r < 0) {
5453 ldout(store->ctx(), 5) << "failed to get BucketShard object: r=" << r << dendl;
5454 return r;
5455 }
5456
c07f9fc5
FG
5457 if (target->bucket_info.datasync_flag_enabled()) {
5458 r = store->data_log->add_entry(bs->bucket, bs->shard_id);
5459 if (r < 0) {
5460 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
5461 return r;
5462 }
7c673cae
FG
5463 }
5464
5465 return 0;
5466 }
5467
5468 rgw_rados_ref ref;
5469 int r = store->get_obj_head_ref(target->get_bucket_info(), obj, &ref);
5470 if (r < 0) {
5471 return r;
5472 }
5473
5474 RGWObjState *state;
5475 r = target->get_state(&state, false);
5476 if (r < 0)
5477 return r;
5478
5479 ObjectWriteOperation op;
5480
5481 if (!real_clock::is_zero(params.unmod_since)) {
5482 struct timespec ctime = ceph::real_clock::to_timespec(state->mtime);
5483 struct timespec unmod = ceph::real_clock::to_timespec(params.unmod_since);
5484 if (!params.high_precision_time) {
5485 ctime.tv_nsec = 0;
5486 unmod.tv_nsec = 0;
5487 }
5488
5489 ldout(store->ctx(), 10) << "If-UnModified-Since: " << params.unmod_since << " Last-Modified: " << ctime << dendl;
5490 if (ctime > unmod) {
5491 return -ERR_PRECONDITION_FAILED;
5492 }
5493
5494 /* only delete object if mtime is less than or equal to params.unmod_since */
5495 store->cls_obj_check_mtime(op, params.unmod_since, params.high_precision_time, CLS_RGW_CHECK_TIME_MTIME_LE);
5496 }
11fdf7f2 5497 uint64_t obj_accounted_size = state->accounted_size;
7c673cae
FG
5498
5499 if (!real_clock::is_zero(params.expiration_time)) {
5500 bufferlist bl;
5501 real_time delete_at;
5502
5503 if (state->get_attr(RGW_ATTR_DELETE_AT, bl)) {
5504 try {
11fdf7f2
TL
5505 auto iter = bl.cbegin();
5506 decode(delete_at, iter);
7c673cae
FG
5507 } catch (buffer::error& err) {
5508 ldout(store->ctx(), 0) << "ERROR: couldn't decode RGW_ATTR_DELETE_AT" << dendl;
5509 return -EIO;
5510 }
5511
5512 if (params.expiration_time != delete_at) {
5513 return -ERR_PRECONDITION_FAILED;
5514 }
5515 } else {
5516 return -ERR_PRECONDITION_FAILED;
5517 }
5518 }
5519
5520 if (!state->exists) {
5521 target->invalidate_state();
5522 return -ENOENT;
5523 }
5524
181888fb 5525 r = target->prepare_atomic_modification(op, false, NULL, NULL, NULL, true, false);
7c673cae
FG
5526 if (r < 0)
5527 return r;
5528
5529 RGWBucketInfo& bucket_info = target->get_bucket_info();
5530
5531 RGWRados::Bucket bop(store, bucket_info);
5532 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
31f18b77
FG
5533
5534 index_op.set_zones_trace(params.zones_trace);
7c673cae
FG
5535 index_op.set_bilog_flags(params.bilog_flags);
5536
7c673cae
FG
5537 r = index_op.prepare(CLS_RGW_OP_DEL, &state->write_tag);
5538 if (r < 0)
5539 return r;
5540
5541 store->remove_rgw_head_obj(op);
11fdf7f2 5542 r = ref.ioctx.operate(ref.obj.oid, &op);
94b18763
FG
5543
5544 /* raced with another operation, object state is indeterminate */
5545 const bool need_invalidate = (r == -ECANCELED);
7c673cae
FG
5546
5547 int64_t poolid = ref.ioctx.get_id();
5548 if (r >= 0) {
5549 tombstone_cache_t *obj_tombstone_cache = store->get_tombstone_cache();
5550 if (obj_tombstone_cache) {
5551 tombstone_entry entry{*state};
5552 obj_tombstone_cache->add(obj, entry);
5553 }
5554 r = index_op.complete_del(poolid, ref.ioctx.get_last_version(), state->mtime, params.remove_objs);
224ce89b 5555
7c673cae
FG
5556 int ret = target->complete_atomic_modification();
5557 if (ret < 0) {
5558 ldout(store->ctx(), 0) << "ERROR: complete_atomic_modification returned ret=" << ret << dendl;
5559 }
5560 /* other than that, no need to propagate error */
224ce89b
WB
5561 } else {
5562 int ret = index_op.cancel();
5563 if (ret < 0) {
5564 ldout(store->ctx(), 0) << "ERROR: index_op.cancel() returned ret=" << ret << dendl;
5565 }
7c673cae
FG
5566 }
5567
5568 if (need_invalidate) {
5569 target->invalidate_state();
5570 }
5571
5572 if (r < 0)
5573 return r;
5574
5575 /* update quota cache */
11fdf7f2 5576 store->quota_handler->update_stats(params.bucket_owner, obj.bucket, -1, 0, obj_accounted_size);
7c673cae
FG
5577
5578 return 0;
5579}
5580
5581int RGWRados::delete_obj(RGWObjectCtx& obj_ctx,
5582 const RGWBucketInfo& bucket_info,
5583 const rgw_obj& obj,
5584 int versioning_status,
5585 uint16_t bilog_flags,
31f18b77
FG
5586 const real_time& expiration_time,
5587 rgw_zone_set *zones_trace)
7c673cae
FG
5588{
5589 RGWRados::Object del_target(this, bucket_info, obj_ctx, obj);
5590 RGWRados::Object::Delete del_op(&del_target);
5591
5592 del_op.params.bucket_owner = bucket_info.owner;
5593 del_op.params.versioning_status = versioning_status;
5594 del_op.params.bilog_flags = bilog_flags;
5595 del_op.params.expiration_time = expiration_time;
31f18b77 5596 del_op.params.zones_trace = zones_trace;
7c673cae
FG
5597
5598 return del_op.delete_obj();
5599}
5600
5601int RGWRados::delete_raw_obj(const rgw_raw_obj& obj)
5602{
5603 rgw_rados_ref ref;
224ce89b 5604 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
5605 if (r < 0) {
5606 return r;
5607 }
5608
5609 ObjectWriteOperation op;
5610
5611 op.remove();
11fdf7f2 5612 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
5613 if (r < 0)
5614 return r;
5615
5616 return 0;
5617}
5618
494da23a 5619int RGWRados::delete_obj_index(const rgw_obj& obj, ceph::real_time mtime)
7c673cae
FG
5620{
5621 std::string oid, key;
5622 get_obj_bucket_and_oid_loc(obj, oid, key);
5623
11fdf7f2 5624 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
5625
5626 RGWBucketInfo bucket_info;
5627 int ret = get_bucket_instance_info(obj_ctx, obj.bucket, bucket_info, NULL, NULL);
5628 if (ret < 0) {
5629 ldout(cct, 0) << "ERROR: " << __func__ << "() get_bucket_instance_info(bucket=" << obj.bucket << ") returned ret=" << ret << dendl;
5630 return ret;
5631 }
5632
5633 RGWRados::Bucket bop(this, bucket_info);
5634 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
5635
494da23a 5636 return index_op.complete_del(-1 /* pool */, 0, mtime, NULL);
7c673cae
FG
5637}
5638
5639static void generate_fake_tag(RGWRados *store, map<string, bufferlist>& attrset, RGWObjManifest& manifest, bufferlist& manifest_bl, bufferlist& tag_bl)
5640{
5641 string tag;
5642
5643 RGWObjManifest::obj_iterator mi = manifest.obj_begin();
5644 if (mi != manifest.obj_end()) {
5645 if (manifest.has_tail()) // first object usually points at the head, let's skip to a more unique part
5646 ++mi;
5647 tag = mi.get_location().get_raw_obj(store).oid;
5648 tag.append("_");
5649 }
5650
5651 unsigned char md5[CEPH_CRYPTO_MD5_DIGESTSIZE];
5652 char md5_str[CEPH_CRYPTO_MD5_DIGESTSIZE * 2 + 1];
5653 MD5 hash;
11fdf7f2 5654 hash.Update((const unsigned char *)manifest_bl.c_str(), manifest_bl.length());
7c673cae
FG
5655
5656 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_ETAG);
5657 if (iter != attrset.end()) {
5658 bufferlist& bl = iter->second;
11fdf7f2 5659 hash.Update((const unsigned char *)bl.c_str(), bl.length());
7c673cae
FG
5660 }
5661
5662 hash.Final(md5);
5663 buf_to_hex(md5, CEPH_CRYPTO_MD5_DIGESTSIZE, md5_str);
5664 tag.append(md5_str);
5665
5666 ldout(store->ctx(), 10) << "generate_fake_tag new tag=" << tag << dendl;
5667
5668 tag_bl.append(tag.c_str(), tag.size() + 1);
5669}
5670
5671static bool is_olh(map<string, bufferlist>& attrs)
5672{
5673 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_INFO);
5674 return (iter != attrs.end());
5675}
5676
5677static bool has_olh_tag(map<string, bufferlist>& attrs)
5678{
5679 map<string, bufferlist>::iterator iter = attrs.find(RGW_ATTR_OLH_ID_TAG);
5680 return (iter != attrs.end());
5681}
5682
5683int RGWRados::get_olh_target_state(RGWObjectCtx& obj_ctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5684 RGWObjState *olh_state, RGWObjState **target_state)
5685{
11fdf7f2 5686 ceph_assert(olh_state->is_olh);
7c673cae
FG
5687
5688 rgw_obj target;
5689 int r = RGWRados::follow_olh(bucket_info, obj_ctx, olh_state, obj, &target); /* might return -EAGAIN */
5690 if (r < 0) {
5691 return r;
5692 }
5693 r = get_obj_state(&obj_ctx, bucket_info, target, target_state, false);
5694 if (r < 0) {
5695 return r;
5696 }
5697
5698 return 0;
5699}
5700
7c673cae
FG
5701int RGWRados::get_obj_state_impl(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
5702 RGWObjState **state, bool follow_olh, bool assume_noent)
5703{
5704 if (obj.empty()) {
5705 return -EINVAL;
5706 }
5707
5708 bool need_follow_olh = follow_olh && obj.key.instance.empty();
5709
11fdf7f2 5710 RGWObjState *s = rctx->get_state(obj);
7c673cae
FG
5711 ldout(cct, 20) << "get_obj_state: rctx=" << (void *)rctx << " obj=" << obj << " state=" << (void *)s << " s->prefetch_data=" << s->prefetch_data << dendl;
5712 *state = s;
5713 if (s->has_attrs) {
5714 if (s->is_olh && need_follow_olh) {
5715 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
5716 }
5717 return 0;
5718 }
5719
5720 s->obj = obj;
5721
5722 rgw_raw_obj raw_obj;
5723 obj_to_raw(bucket_info.placement_rule, obj, &raw_obj);
5724
5725 int r = -ENOENT;
5726
5727 if (!assume_noent) {
5728 r = RGWRados::raw_obj_stat(raw_obj, &s->size, &s->mtime, &s->epoch, &s->attrset, (s->prefetch_data ? &s->data : NULL), NULL);
5729 }
5730
5731 if (r == -ENOENT) {
5732 s->exists = false;
5733 s->has_attrs = true;
5734 tombstone_entry entry;
5735 if (obj_tombstone_cache && obj_tombstone_cache->find(obj, entry)) {
5736 s->mtime = entry.mtime;
5737 s->zone_short_id = entry.zone_short_id;
5738 s->pg_ver = entry.pg_ver;
5739 ldout(cct, 20) << __func__ << "(): found obj in tombstone cache: obj=" << obj
5740 << " mtime=" << s->mtime << " pgv=" << s->pg_ver << dendl;
5741 } else {
5742 s->mtime = real_time();
5743 }
5744 return 0;
5745 }
5746 if (r < 0)
5747 return r;
5748
5749 s->exists = true;
5750 s->has_attrs = true;
5751 s->accounted_size = s->size;
5752
11fdf7f2
TL
5753 auto iter = s->attrset.find(RGW_ATTR_ETAG);
5754 if (iter != s->attrset.end()) {
5755 /* get rid of extra null character at the end of the etag, as we used to store it like that */
5756 bufferlist& bletag = iter->second;
5757 if (bletag.length() > 0 && bletag[bletag.length() - 1] == '\0') {
5758 bufferlist newbl;
5759 bletag.splice(0, bletag.length() - 1, &newbl);
5760 bletag.claim(newbl);
5761 }
5762 }
5763
5764 iter = s->attrset.find(RGW_ATTR_COMPRESSION);
31f18b77
FG
5765 const bool compressed = (iter != s->attrset.end());
5766 if (compressed) {
7c673cae
FG
5767 // use uncompressed size for accounted_size
5768 try {
5769 RGWCompressionInfo info;
11fdf7f2
TL
5770 auto p = iter->second.cbegin();
5771 decode(info, p);
31f18b77 5772 s->accounted_size = info.orig_size;
7c673cae
FG
5773 } catch (buffer::error&) {
5774 dout(0) << "ERROR: could not decode compression info for object: " << obj << dendl;
5775 return -EIO;
5776 }
5777 }
5778
5779 iter = s->attrset.find(RGW_ATTR_SHADOW_OBJ);
5780 if (iter != s->attrset.end()) {
5781 bufferlist bl = iter->second;
5782 bufferlist::iterator it = bl.begin();
5783 it.copy(bl.length(), s->shadow_obj);
5784 s->shadow_obj[bl.length()] = '\0';
5785 }
5786 s->obj_tag = s->attrset[RGW_ATTR_ID_TAG];
181888fb
FG
5787 auto ttiter = s->attrset.find(RGW_ATTR_TAIL_TAG);
5788 if (ttiter != s->attrset.end()) {
5789 s->tail_tag = s->attrset[RGW_ATTR_TAIL_TAG];
5790 }
7c673cae
FG
5791
5792 bufferlist manifest_bl = s->attrset[RGW_ATTR_MANIFEST];
5793 if (manifest_bl.length()) {
11fdf7f2 5794 auto miter = manifest_bl.cbegin();
7c673cae 5795 try {
11fdf7f2 5796 decode(s->manifest, miter);
7c673cae
FG
5797 s->has_manifest = true;
5798 s->manifest.set_head(bucket_info.placement_rule, obj, s->size); /* patch manifest to reflect the head we just read, some manifests might be
5799 broken due to old bugs */
5800 s->size = s->manifest.get_obj_size();
31f18b77
FG
5801 if (!compressed)
5802 s->accounted_size = s->size;
7c673cae
FG
5803 } catch (buffer::error& err) {
5804 ldout(cct, 0) << "ERROR: couldn't decode manifest" << dendl;
5805 return -EIO;
5806 }
5807 ldout(cct, 10) << "manifest: total_size = " << s->manifest.get_obj_size() << dendl;
11fdf7f2
TL
5808 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>() && \
5809 s->manifest.has_explicit_objs()) {
7c673cae
FG
5810 RGWObjManifest::obj_iterator mi;
5811 for (mi = s->manifest.obj_begin(); mi != s->manifest.obj_end(); ++mi) {
5812 ldout(cct, 20) << "manifest: ofs=" << mi.get_ofs() << " loc=" << mi.get_location().get_raw_obj(this) << dendl;
5813 }
5814 }
5815
5816 if (!s->obj_tag.length()) {
5817 /*
5818 * Uh oh, something's wrong, object with manifest should have tag. Let's
5819 * create one out of the manifest, would be unique
5820 */
5821 generate_fake_tag(this, s->attrset, s->manifest, manifest_bl, s->obj_tag);
5822 s->fake_tag = true;
5823 }
5824 }
5825 map<string, bufferlist>::iterator aiter = s->attrset.find(RGW_ATTR_PG_VER);
5826 if (aiter != s->attrset.end()) {
5827 bufferlist& pg_ver_bl = aiter->second;
5828 if (pg_ver_bl.length()) {
11fdf7f2 5829 auto pgbl = pg_ver_bl.cbegin();
7c673cae 5830 try {
11fdf7f2 5831 decode(s->pg_ver, pgbl);
7c673cae
FG
5832 } catch (buffer::error& err) {
5833 ldout(cct, 0) << "ERROR: couldn't decode pg ver attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5834 }
5835 }
5836 }
5837 aiter = s->attrset.find(RGW_ATTR_SOURCE_ZONE);
5838 if (aiter != s->attrset.end()) {
5839 bufferlist& zone_short_id_bl = aiter->second;
5840 if (zone_short_id_bl.length()) {
11fdf7f2 5841 auto zbl = zone_short_id_bl.cbegin();
7c673cae 5842 try {
11fdf7f2 5843 decode(s->zone_short_id, zbl);
7c673cae
FG
5844 } catch (buffer::error& err) {
5845 ldout(cct, 0) << "ERROR: couldn't decode zone short id attr for object " << s->obj << ", non-critical error, ignoring" << dendl;
5846 }
5847 }
5848 }
5849 if (s->obj_tag.length())
31f18b77 5850 ldout(cct, 20) << "get_obj_state: setting s->obj_tag to " << s->obj_tag.c_str() << dendl;
7c673cae
FG
5851 else
5852 ldout(cct, 20) << "get_obj_state: s->obj_tag was set empty" << dendl;
5853
5854 /* an object might not be olh yet, but could have olh id tag, so we should set it anyway if
5855 * it exist, and not only if is_olh() returns true
5856 */
5857 iter = s->attrset.find(RGW_ATTR_OLH_ID_TAG);
5858 if (iter != s->attrset.end()) {
5859 s->olh_tag = iter->second;
5860 }
5861
5862 if (is_olh(s->attrset)) {
5863 s->is_olh = true;
5864
5865 ldout(cct, 20) << __func__ << ": setting s->olh_tag to " << string(s->olh_tag.c_str(), s->olh_tag.length()) << dendl;
5866
5867 if (need_follow_olh) {
5868 return get_olh_target_state(*rctx, bucket_info, obj, s, state);
11fdf7f2
TL
5869 } else if (obj.key.have_null_instance() && !s->has_manifest) {
5870 // read null version, and the head object only have olh info
5871 s->exists = false;
5872 return -ENOENT;
7c673cae
FG
5873 }
5874 }
5875
5876 return 0;
5877}
5878
5879int RGWRados::get_obj_state(RGWObjectCtx *rctx, const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWObjState **state,
5880 bool follow_olh, bool assume_noent)
5881{
5882 int ret;
5883
5884 do {
5885 ret = get_obj_state_impl(rctx, bucket_info, obj, state, follow_olh, assume_noent);
5886 } while (ret == -EAGAIN);
5887
5888 return ret;
5889}
5890
5891int RGWRados::Object::get_manifest(RGWObjManifest **pmanifest)
5892{
5893 RGWObjState *astate;
5894 int r = get_state(&astate, true);
5895 if (r < 0) {
5896 return r;
5897 }
5898
5899 *pmanifest = &astate->manifest;
5900
5901 return 0;
5902}
5903
5904int RGWRados::Object::Read::get_attr(const char *name, bufferlist& dest)
5905{
5906 RGWObjState *state;
5907 int r = source->get_state(&state, true);
5908 if (r < 0)
5909 return r;
5910 if (!state->exists)
5911 return -ENOENT;
5912 if (!state->get_attr(name, dest))
5913 return -ENODATA;
5914
5915 return 0;
5916}
5917
5918
5919int RGWRados::Object::Stat::stat_async()
5920{
5921 RGWObjectCtx& ctx = source->get_ctx();
5922 rgw_obj& obj = source->get_obj();
5923 RGWRados *store = source->get_store();
5924
11fdf7f2 5925 RGWObjState *s = ctx.get_state(obj); /* calling this one directly because otherwise a sync request will be sent */
7c673cae
FG
5926 result.obj = obj;
5927 if (s->has_attrs) {
5928 state.ret = 0;
5929 result.size = s->size;
5930 result.mtime = ceph::real_clock::to_timespec(s->mtime);
5931 result.attrs = s->attrset;
5932 result.has_manifest = s->has_manifest;
5933 result.manifest = s->manifest;
5934 return 0;
5935 }
5936
5937 string oid;
5938 string loc;
5939 get_obj_bucket_and_oid_loc(obj, oid, loc);
5940
5941 int r = store->get_obj_head_ioctx(source->get_bucket_info(), obj, &state.io_ctx);
5942 if (r < 0) {
5943 return r;
5944 }
5945
5946 librados::ObjectReadOperation op;
5947 op.stat2(&result.size, &result.mtime, NULL);
5948 op.getxattrs(&result.attrs, NULL);
5949 state.completion = librados::Rados::aio_create_completion(NULL, NULL, NULL);
5950 state.io_ctx.locator_set_key(loc);
5951 r = state.io_ctx.aio_operate(oid, state.completion, &op, NULL);
5952 if (r < 0) {
5953 ldout(store->ctx(), 5) << __func__
5954 << ": ERROR: aio_operate() returned ret=" << r
5955 << dendl;
5956 return r;
5957 }
5958
5959 return 0;
5960}
5961
5962
5963int RGWRados::Object::Stat::wait()
5964{
5965 if (!state.completion) {
5966 return state.ret;
5967 }
5968
5969 state.completion->wait_for_safe();
5970 state.ret = state.completion->get_return_value();
5971 state.completion->release();
5972
5973 if (state.ret != 0) {
5974 return state.ret;
5975 }
5976
5977 return finish();
5978}
5979
5980int RGWRados::Object::Stat::finish()
5981{
5982 map<string, bufferlist>::iterator iter = result.attrs.find(RGW_ATTR_MANIFEST);
5983 if (iter != result.attrs.end()) {
5984 bufferlist& bl = iter->second;
11fdf7f2 5985 auto biter = bl.cbegin();
7c673cae 5986 try {
11fdf7f2 5987 decode(result.manifest, biter);
7c673cae
FG
5988 } catch (buffer::error& err) {
5989 RGWRados *store = source->get_store();
5990 ldout(store->ctx(), 0) << "ERROR: " << __func__ << ": failed to decode manifest" << dendl;
5991 return -EIO;
5992 }
5993 result.has_manifest = true;
5994 }
5995
5996 return 0;
5997}
5998
7c673cae
FG
5999int RGWRados::append_atomic_test(RGWObjectCtx *rctx,
6000 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
6001 ObjectOperation& op, RGWObjState **pstate)
6002{
6003 if (!rctx)
6004 return 0;
6005
6006 int r = get_obj_state(rctx, bucket_info, obj, pstate, false);
6007 if (r < 0)
6008 return r;
6009
11fdf7f2
TL
6010 return append_atomic_test(*pstate, op);
6011}
7c673cae 6012
11fdf7f2
TL
6013int RGWRados::append_atomic_test(const RGWObjState* state,
6014 librados::ObjectOperation& op)
6015{
7c673cae 6016 if (!state->is_atomic) {
11fdf7f2 6017 ldout(cct, 20) << "state for obj=" << state->obj << " is not atomic, not appending atomic test" << dendl;
7c673cae
FG
6018 return 0;
6019 }
6020
6021 if (state->obj_tag.length() > 0 && !state->fake_tag) {// check for backward compatibility
6022 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
6023 } else {
6024 ldout(cct, 20) << "state->obj_tag is empty, not appending atomic test" << dendl;
6025 }
6026 return 0;
6027}
6028
6029int RGWRados::Object::get_state(RGWObjState **pstate, bool follow_olh, bool assume_noent)
6030{
6031 return store->get_obj_state(&ctx, bucket_info, obj, pstate, follow_olh, assume_noent);
6032}
6033
6034void RGWRados::Object::invalidate_state()
6035{
11fdf7f2 6036 ctx.invalidate(obj);
7c673cae
FG
6037}
6038
6039int RGWRados::Object::prepare_atomic_modification(ObjectWriteOperation& op, bool reset_obj, const string *ptag,
181888fb
FG
6040 const char *if_match, const char *if_nomatch, bool removal_op,
6041 bool modify_tail)
7c673cae
FG
6042{
6043 int r = get_state(&state, false);
6044 if (r < 0)
6045 return r;
6046
6047 bool need_guard = (state->has_manifest || (state->obj_tag.length() != 0) ||
6048 if_match != NULL || if_nomatch != NULL) &&
6049 (!state->fake_tag);
6050
6051 if (!state->is_atomic) {
6052 ldout(store->ctx(), 20) << "prepare_atomic_modification: state is not atomic. state=" << (void *)state << dendl;
6053
6054 if (reset_obj) {
6055 op.create(false);
6056 store->remove_rgw_head_obj(op); // we're not dropping reference here, actually removing object
6057 }
6058
6059 return 0;
6060 }
6061
6062 if (need_guard) {
6063 /* first verify that the object wasn't replaced under */
6064 if (if_nomatch == NULL || strcmp(if_nomatch, "*") != 0) {
6065 op.cmpxattr(RGW_ATTR_ID_TAG, LIBRADOS_CMPXATTR_OP_EQ, state->obj_tag);
6066 // FIXME: need to add FAIL_NOTEXIST_OK for racing deletion
6067 }
6068
6069 if (if_match) {
6070 if (strcmp(if_match, "*") == 0) {
6071 // test the object is existing
6072 if (!state->exists) {
6073 return -ERR_PRECONDITION_FAILED;
6074 }
6075 } else {
6076 bufferlist bl;
6077 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
6078 strncmp(if_match, bl.c_str(), bl.length()) != 0) {
6079 return -ERR_PRECONDITION_FAILED;
6080 }
6081 }
6082 }
6083
6084 if (if_nomatch) {
6085 if (strcmp(if_nomatch, "*") == 0) {
6086 // test the object is NOT existing
6087 if (state->exists) {
6088 return -ERR_PRECONDITION_FAILED;
6089 }
6090 } else {
6091 bufferlist bl;
6092 if (!state->get_attr(RGW_ATTR_ETAG, bl) ||
6093 strncmp(if_nomatch, bl.c_str(), bl.length()) == 0) {
6094 return -ERR_PRECONDITION_FAILED;
6095 }
6096 }
6097 }
6098 }
6099
6100 if (reset_obj) {
6101 if (state->exists) {
6102 op.create(false);
6103 store->remove_rgw_head_obj(op);
6104 } else {
6105 op.create(true);
6106 }
6107 }
6108
6109 if (removal_op) {
6110 /* the object is being removed, no need to update its tag */
6111 return 0;
6112 }
6113
6114 if (ptag) {
6115 state->write_tag = *ptag;
6116 } else {
6117 append_rand_alpha(store->ctx(), state->write_tag, state->write_tag, 32);
6118 }
6119 bufferlist bl;
6120 bl.append(state->write_tag.c_str(), state->write_tag.size() + 1);
6121
6122 ldout(store->ctx(), 10) << "setting object write_tag=" << state->write_tag << dendl;
6123
6124 op.setxattr(RGW_ATTR_ID_TAG, bl);
181888fb
FG
6125 if (modify_tail) {
6126 op.setxattr(RGW_ATTR_TAIL_TAG, bl);
6127 }
7c673cae
FG
6128
6129 return 0;
6130}
6131
7c673cae
FG
6132/**
6133 * Set an attr on an object.
6134 * bucket: name of the bucket holding the object
6135 * obj: name of the object to set the attr on
6136 * name: the attr to set
6137 * bl: the contents of the attr
6138 * Returns: 0 on success, -ERR# otherwise.
6139 */
6140int RGWRados::set_attr(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& obj, const char *name, bufferlist& bl)
6141{
6142 map<string, bufferlist> attrs;
6143 attrs[name] = bl;
6144 return set_attrs(ctx, bucket_info, obj, attrs, NULL);
6145}
6146
494da23a 6147int RGWRados::set_attrs(void *ctx, const RGWBucketInfo& bucket_info, rgw_obj& src_obj,
7c673cae
FG
6148 map<string, bufferlist>& attrs,
6149 map<string, bufferlist>* rmattrs)
6150{
494da23a
TL
6151 rgw_obj obj = src_obj;
6152 if (obj.key.instance == "null") {
6153 obj.key.instance.clear();
6154 }
6155
7c673cae
FG
6156 rgw_rados_ref ref;
6157 int r = get_obj_head_ref(bucket_info, obj, &ref);
6158 if (r < 0) {
6159 return r;
6160 }
6161 RGWObjectCtx *rctx = static_cast<RGWObjectCtx *>(ctx);
6162
6163 ObjectWriteOperation op;
6164 RGWObjState *state = NULL;
6165
6166 r = append_atomic_test(rctx, bucket_info, obj, op, &state);
6167 if (r < 0)
6168 return r;
6169
494da23a
TL
6170 // ensure null version object exist
6171 if (src_obj.key.instance == "null" && !state->has_manifest) {
6172 return -ENOENT;
6173 }
6174
7c673cae
FG
6175 map<string, bufferlist>::iterator iter;
6176 if (rmattrs) {
6177 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6178 const string& name = iter->first;
6179 op.rmxattr(name.c_str());
6180 }
6181 }
6182
6183 const rgw_bucket& bucket = obj.bucket;
6184
6185 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6186 const string& name = iter->first;
6187 bufferlist& bl = iter->second;
6188
6189 if (!bl.length())
6190 continue;
6191
6192 op.setxattr(name.c_str(), bl);
6193
6194 if (name.compare(RGW_ATTR_DELETE_AT) == 0) {
6195 real_time ts;
6196 try {
11fdf7f2 6197 decode(ts, bl);
7c673cae
FG
6198
6199 rgw_obj_index_key obj_key;
6200 obj.key.get_index_key(&obj_key);
6201
6202 objexp_hint_add(ts, bucket.tenant, bucket.name, bucket.bucket_id, obj_key);
6203 } catch (buffer::error& err) {
6204 ldout(cct, 0) << "ERROR: failed to decode " RGW_ATTR_DELETE_AT << " attr" << dendl;
6205 }
6206 }
6207 }
6208
6209 if (!op.size())
6210 return 0;
6211
6212 RGWObjectCtx obj_ctx(this);
6213
6214 bufferlist bl;
6215 RGWRados::Bucket bop(this, bucket_info);
6216 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
6217
6218 if (state) {
6219 string tag;
6220 append_rand_alpha(cct, tag, tag, 32);
6221 state->write_tag = tag;
6222 r = index_op.prepare(CLS_RGW_OP_ADD, &state->write_tag);
6223
6224 if (r < 0)
6225 return r;
6226
6227 bl.append(tag.c_str(), tag.size() + 1);
7c673cae
FG
6228 op.setxattr(RGW_ATTR_ID_TAG, bl);
6229 }
6230
3efd9988
FG
6231
6232 real_time mtime = real_clock::now();
6233 struct timespec mtime_ts = real_clock::to_timespec(mtime);
6234 op.mtime2(&mtime_ts);
11fdf7f2 6235 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
6236 if (state) {
6237 if (r >= 0) {
6238 bufferlist acl_bl = attrs[RGW_ATTR_ACL];
6239 bufferlist etag_bl = attrs[RGW_ATTR_ETAG];
6240 bufferlist content_type_bl = attrs[RGW_ATTR_CONTENT_TYPE];
11fdf7f2
TL
6241 string etag = rgw_bl_str(etag_bl);
6242 string content_type = rgw_bl_str(content_type_bl);
6243 string storage_class;
6244 auto iter = attrs.find(RGW_ATTR_STORAGE_CLASS);
6245 if (iter != attrs.end()) {
6246 storage_class = rgw_bl_str(iter->second);
6247 }
7c673cae
FG
6248 uint64_t epoch = ref.ioctx.get_last_version();
6249 int64_t poolid = ref.ioctx.get_id();
7c673cae 6250 r = index_op.complete(poolid, epoch, state->size, state->accounted_size,
11fdf7f2
TL
6251 mtime, etag, content_type, storage_class, &acl_bl,
6252 RGWObjCategory::Main, NULL);
7c673cae
FG
6253 } else {
6254 int ret = index_op.cancel();
6255 if (ret < 0) {
6256 ldout(cct, 0) << "ERROR: complete_update_index_cancel() returned ret=" << ret << dendl;
6257 }
6258 }
6259 }
6260 if (r < 0)
6261 return r;
6262
6263 if (state) {
6264 state->obj_tag.swap(bl);
6265 if (rmattrs) {
6266 for (iter = rmattrs->begin(); iter != rmattrs->end(); ++iter) {
6267 state->attrset.erase(iter->first);
6268 }
6269 }
6270 for (iter = attrs.begin(); iter != attrs.end(); ++iter) {
6271 state->attrset[iter->first] = iter->second;
6272 }
6273 }
6274
6275 return 0;
6276}
6277
7c673cae
FG
6278int RGWRados::Object::Read::prepare()
6279{
6280 RGWRados *store = source->get_store();
6281 CephContext *cct = store->ctx();
6282
6283 bufferlist etag;
6284
6285 map<string, bufferlist>::iterator iter;
6286
6287 RGWObjState *astate;
6288 int r = source->get_state(&astate, true);
6289 if (r < 0)
6290 return r;
6291
6292 if (!astate->exists) {
6293 return -ENOENT;
6294 }
6295
6296 const RGWBucketInfo& bucket_info = source->get_bucket_info();
6297
6298 state.obj = astate->obj;
6299 store->obj_to_raw(bucket_info.placement_rule, state.obj, &state.head_obj);
6300
11fdf7f2
TL
6301 state.cur_pool = state.head_obj.pool;
6302 state.cur_ioctx = &state.io_ctxs[state.cur_pool];
6303
6304 r = store->get_obj_head_ioctx(bucket_info, state.obj, state.cur_ioctx);
7c673cae
FG
6305 if (r < 0) {
6306 return r;
6307 }
6308 if (params.attrs) {
6309 *params.attrs = astate->attrset;
11fdf7f2 6310 if (cct->_conf->subsys.should_gather<ceph_subsys_rgw, 20>()) {
7c673cae
FG
6311 for (iter = params.attrs->begin(); iter != params.attrs->end(); ++iter) {
6312 ldout(cct, 20) << "Read xattr: " << iter->first << dendl;
6313 }
6314 }
6315 }
6316
6317 /* Convert all times go GMT to make them compatible */
6318 if (conds.mod_ptr || conds.unmod_ptr) {
6319 obj_time_weight src_weight;
6320 src_weight.init(astate);
6321 src_weight.high_precision = conds.high_precision_time;
6322
6323 obj_time_weight dest_weight;
6324 dest_weight.high_precision = conds.high_precision_time;
6325
6326 if (conds.mod_ptr) {
6327 dest_weight.init(*conds.mod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
6328 ldout(cct, 10) << "If-Modified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
6329 if (!(dest_weight < src_weight)) {
6330 return -ERR_NOT_MODIFIED;
6331 }
6332 }
6333
6334 if (conds.unmod_ptr) {
6335 dest_weight.init(*conds.unmod_ptr, conds.mod_zone_id, conds.mod_pg_ver);
6336 ldout(cct, 10) << "If-UnModified-Since: " << dest_weight << " Last-Modified: " << src_weight << dendl;
6337 if (dest_weight < src_weight) {
6338 return -ERR_PRECONDITION_FAILED;
6339 }
6340 }
6341 }
6342 if (conds.if_match || conds.if_nomatch) {
6343 r = get_attr(RGW_ATTR_ETAG, etag);
6344 if (r < 0)
6345 return r;
6346
11fdf7f2
TL
6347
6348
7c673cae
FG
6349 if (conds.if_match) {
6350 string if_match_str = rgw_string_unquote(conds.if_match);
11fdf7f2
TL
6351 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-Match: " << if_match_str << dendl;
6352 if (if_match_str.compare(0, etag.length(), etag.c_str(), etag.length()) != 0) {
7c673cae
FG
6353 return -ERR_PRECONDITION_FAILED;
6354 }
6355 }
6356
6357 if (conds.if_nomatch) {
6358 string if_nomatch_str = rgw_string_unquote(conds.if_nomatch);
11fdf7f2
TL
6359 ldout(cct, 10) << "ETag: " << string(etag.c_str(), etag.length()) << " " << " If-NoMatch: " << if_nomatch_str << dendl;
6360 if (if_nomatch_str.compare(0, etag.length(), etag.c_str(), etag.length()) == 0) {
7c673cae
FG
6361 return -ERR_NOT_MODIFIED;
6362 }
6363 }
6364 }
6365
6366 if (params.obj_size)
6367 *params.obj_size = astate->size;
6368 if (params.lastmod)
6369 *params.lastmod = astate->mtime;
6370
6371 return 0;
6372}
6373
6374int RGWRados::Object::Read::range_to_ofs(uint64_t obj_size, int64_t &ofs, int64_t &end)
6375{
6376 if (ofs < 0) {
6377 ofs += obj_size;
11fdf7f2
TL
6378 if (ofs < 0)
6379 ofs = 0;
6380 end = obj_size - 1;
6381 } else if (end < 0) {
6382 end = obj_size - 1;
7c673cae
FG
6383 }
6384
11fdf7f2
TL
6385 if (obj_size > 0) {
6386 if (ofs >= (off_t)obj_size) {
6387 return -ERANGE;
6388 }
6389 if (end >= (off_t)obj_size) {
6390 end = obj_size - 1;
7c673cae
FG
6391 }
6392 }
7c673cae
FG
6393 return 0;
6394}
6395
31f18b77
FG
6396int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::function<int(BucketShard *)> call)
6397{
6398 RGWRados *store = target->get_store();
6399 BucketShard *bs;
6400 int r;
6401
6402#define NUM_RESHARD_RETRIES 10
6403 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
6404 int ret = get_bucket_shard(&bs);
6405 if (ret < 0) {
6406 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6407 return ret;
6408 }
6409 r = call(bs);
6410 if (r != -ERR_BUSY_RESHARDING) {
6411 break;
6412 }
6413 ldout(store->ctx(), 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
6414 string new_bucket_id;
11fdf7f2
TL
6415 r = store->block_while_resharding(bs, &new_bucket_id,
6416 target->bucket_info, null_yield);
31f18b77
FG
6417 if (r == -ERR_BUSY_RESHARDING) {
6418 continue;
6419 }
6420 if (r < 0) {
6421 return r;
6422 }
6423 ldout(store->ctx(), 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
6424 i = 0; /* resharding is finished, make sure we can retry */
6425 r = target->update_bucket_id(new_bucket_id);
6426 if (r < 0) {
6427 ldout(store->ctx(), 0) << "ERROR: update_bucket_id() new_bucket_id=" << new_bucket_id << " returned r=" << r << dendl;
6428 return r;
6429 }
6430 invalidate_bs();
81eedcae 6431 } // for loop
31f18b77
FG
6432
6433 if (r < 0) {
6434 return r;
6435 }
6436
6437 if (pbs) {
6438 *pbs = bs;
6439 }
6440
6441 return 0;
6442}
6443
7c673cae
FG
6444int RGWRados::Bucket::UpdateIndex::prepare(RGWModifyOp op, const string *write_tag)
6445{
6446 if (blind) {
6447 return 0;
6448 }
6449 RGWRados *store = target->get_store();
7c673cae
FG
6450
6451 if (write_tag && write_tag->length()) {
6452 optag = string(write_tag->c_str(), write_tag->length());
6453 } else {
6454 if (optag.empty()) {
6455 append_rand_alpha(store->ctx(), optag, optag, 32);
6456 }
6457 }
6458
f64942e4
AA
6459 int r = guard_reshard(nullptr, [&](BucketShard *bs) -> int {
6460 return store->cls_obj_prepare_op(*bs, op, optag, obj, bilog_flags, zones_trace);
6461 });
31f18b77 6462
7c673cae
FG
6463 if (r < 0) {
6464 return r;
6465 }
6466 prepared = true;
31f18b77 6467
7c673cae
FG
6468 return 0;
6469}
6470
6471int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch,
6472 uint64_t size, uint64_t accounted_size,
6473 ceph::real_time& ut, const string& etag,
11fdf7f2 6474 const string& content_type, const string& storage_class,
7c673cae
FG
6475 bufferlist *acl_bl,
6476 RGWObjCategory category,
11fdf7f2
TL
6477 list<rgw_obj_index_key> *remove_objs, const string *user_data,
6478 bool appendable)
7c673cae
FG
6479{
6480 if (blind) {
6481 return 0;
6482 }
6483 RGWRados *store = target->get_store();
6484 BucketShard *bs;
31f18b77 6485
7c673cae
FG
6486 int ret = get_bucket_shard(&bs);
6487 if (ret < 0) {
6488 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6489 return ret;
6490 }
6491
6492 rgw_bucket_dir_entry ent;
6493 obj.key.get_index_key(&ent.key);
6494 ent.meta.size = size;
6495 ent.meta.accounted_size = accounted_size;
6496 ent.meta.mtime = ut;
6497 ent.meta.etag = etag;
11fdf7f2 6498 ent.meta.storage_class = storage_class;
7c673cae
FG
6499 if (user_data)
6500 ent.meta.user_data = *user_data;
6501
6502 ACLOwner owner;
6503 if (acl_bl && acl_bl->length()) {
6504 int ret = store->decode_policy(*acl_bl, &owner);
6505 if (ret < 0) {
6506 ldout(store->ctx(), 0) << "WARNING: could not decode policy ret=" << ret << dendl;
6507 }
6508 }
6509 ent.meta.owner = owner.get_id().to_str();
6510 ent.meta.owner_display_name = owner.get_display_name();
6511 ent.meta.content_type = content_type;
11fdf7f2 6512 ent.meta.appendable = appendable;
7c673cae 6513
31f18b77 6514 ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae 6515
c07f9fc5
FG
6516 if (target->bucket_info.datasync_flag_enabled()) {
6517 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6518 if (r < 0) {
6519 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6520 }
7c673cae
FG
6521 }
6522
6523 return ret;
6524}
6525
6526int RGWRados::Bucket::UpdateIndex::complete_del(int64_t poolid, uint64_t epoch,
6527 real_time& removed_mtime,
6528 list<rgw_obj_index_key> *remove_objs)
6529{
6530 if (blind) {
6531 return 0;
6532 }
6533 RGWRados *store = target->get_store();
6534 BucketShard *bs;
31f18b77 6535
7c673cae
FG
6536 int ret = get_bucket_shard(&bs);
6537 if (ret < 0) {
6538 ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl;
6539 return ret;
6540 }
6541
31f18b77 6542 ret = store->cls_obj_complete_del(*bs, optag, poolid, epoch, obj, removed_mtime, remove_objs, bilog_flags, zones_trace);
7c673cae 6543
c07f9fc5
FG
6544 if (target->bucket_info.datasync_flag_enabled()) {
6545 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6546 if (r < 0) {
6547 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6548 }
7c673cae
FG
6549 }
6550
6551 return ret;
6552}
6553
6554
6555int RGWRados::Bucket::UpdateIndex::cancel()
6556{
6557 if (blind) {
6558 return 0;
6559 }
6560 RGWRados *store = target->get_store();
6561 BucketShard *bs;
7c673cae 6562
f64942e4
AA
6563 int ret = guard_reshard(&bs, [&](BucketShard *bs) -> int {
6564 return store->cls_obj_complete_cancel(*bs, optag, obj, bilog_flags, zones_trace);
6565 });
7c673cae
FG
6566
6567 /*
6568 * need to update data log anyhow, so that whoever follows needs to update its internal markers
6569 * for following the specific bucket shard log. Otherwise they end up staying behind, and users
6570 * have no way to tell that they're all caught up
6571 */
c07f9fc5
FG
6572 if (target->bucket_info.datasync_flag_enabled()) {
6573 int r = store->data_log->add_entry(bs->bucket, bs->shard_id);
6574 if (r < 0) {
6575 lderr(store->ctx()) << "ERROR: failed writing data log" << dendl;
6576 }
7c673cae
FG
6577 }
6578
6579 return ret;
6580}
6581
6582int RGWRados::Object::Read::read(int64_t ofs, int64_t end, bufferlist& bl)
6583{
6584 RGWRados *store = source->get_store();
6585 CephContext *cct = store->ctx();
6586
7c673cae
FG
6587 rgw_raw_obj read_obj;
6588 uint64_t read_ofs = ofs;
6589 uint64_t len, read_len;
6590 bool reading_from_head = true;
6591 ObjectReadOperation op;
6592
6593 bool merge_bl = false;
6594 bufferlist *pbl = &bl;
6595 bufferlist read_bl;
6596 uint64_t max_chunk_size;
6597
6598 RGWObjState *astate;
6599 int r = source->get_state(&astate, true);
6600 if (r < 0)
6601 return r;
6602
11fdf7f2
TL
6603 if (astate->size == 0) {
6604 end = 0;
6605 } else if (end >= (int64_t)astate->size) {
6606 end = astate->size - 1;
6607 }
6608
7c673cae
FG
6609 if (end < 0)
6610 len = 0;
6611 else
6612 len = end - ofs + 1;
6613
6614 if (astate->has_manifest && astate->manifest.has_tail()) {
6615 /* now get the relevant object part */
6616 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
6617
6618 uint64_t stripe_ofs = iter.get_stripe_ofs();
6619 read_obj = iter.get_location().get_raw_obj(store);
11fdf7f2 6620 len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6621 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6622 reading_from_head = (read_obj == state.head_obj);
6623 } else {
6624 read_obj = state.head_obj;
6625 }
6626
6627 r = store->get_max_chunk_size(read_obj.pool, &max_chunk_size);
6628 if (r < 0) {
6629 ldout(cct, 0) << "ERROR: failed to get max_chunk_size() for pool " << read_obj.pool << dendl;
6630 return r;
6631 }
6632
6633 if (len > max_chunk_size)
6634 len = max_chunk_size;
6635
6636
7c673cae
FG
6637 read_len = len;
6638
6639 if (reading_from_head) {
6640 /* only when reading from the head object do we need to do the atomic test */
6641 r = store->append_atomic_test(&source->get_ctx(), source->get_bucket_info(), state.obj, op, &astate);
6642 if (r < 0)
6643 return r;
6644
6645 if (astate && astate->prefetch_data) {
6646 if (!ofs && astate->data.length() >= len) {
6647 bl = astate->data;
6648 return bl.length();
6649 }
6650
6651 if (ofs < astate->data.length()) {
11fdf7f2 6652 unsigned copy_len = std::min((uint64_t)astate->data.length() - ofs, len);
7c673cae
FG
6653 astate->data.copy(ofs, copy_len, bl);
6654 read_len -= copy_len;
6655 read_ofs += copy_len;
6656 if (!read_len)
6657 return bl.length();
6658
6659 merge_bl = true;
6660 pbl = &read_bl;
6661 }
6662 }
6663 }
6664
6665 ldout(cct, 20) << "rados->read obj-ofs=" << ofs << " read_ofs=" << read_ofs << " read_len=" << read_len << dendl;
6666 op.read(read_ofs, read_len, pbl, NULL);
6667
11fdf7f2
TL
6668 if (state.cur_pool != read_obj.pool) {
6669 auto iter = state.io_ctxs.find(read_obj.pool);
6670 if (iter == state.io_ctxs.end()) {
6671 state.cur_ioctx = &state.io_ctxs[read_obj.pool];
494da23a 6672 r = store->open_pool_ctx(read_obj.pool, *state.cur_ioctx, false);
11fdf7f2
TL
6673 if (r < 0) {
6674 ldout(cct, 20) << "ERROR: failed to open pool context for pool=" << read_obj.pool << " r=" << r << dendl;
6675 return r;
6676 }
6677 } else {
6678 state.cur_ioctx = &iter->second;
7c673cae 6679 }
11fdf7f2 6680 state.cur_pool = read_obj.pool;
7c673cae
FG
6681 }
6682
11fdf7f2 6683 state.cur_ioctx->locator_set_key(read_obj.loc);
7c673cae 6684
11fdf7f2
TL
6685 r = state.cur_ioctx->operate(read_obj.oid, &op, NULL);
6686 ldout(cct, 20) << "rados->read r=" << r << " bl.length=" << bl.length() << dendl;
7c673cae 6687
7c673cae 6688 if (r < 0) {
7c673cae
FG
6689 return r;
6690 }
7c673cae 6691
11fdf7f2
TL
6692 if (merge_bl) {
6693 bl.append(read_bl);
7c673cae
FG
6694 }
6695
7c673cae
FG
6696 return bl.length();
6697}
6698
11fdf7f2
TL
6699struct get_obj_data {
6700 RGWRados* store;
6701 RGWGetDataCB* client_cb;
6702 rgw::Aio* aio;
6703 uint64_t offset; // next offset to write to client
6704 rgw::AioResultList completed; // completed read results, sorted by offset
7c673cae 6705
11fdf7f2
TL
6706 get_obj_data(RGWRados* store, RGWGetDataCB* cb, rgw::Aio* aio, uint64_t offset)
6707 : store(store), client_cb(cb), aio(aio), offset(offset) {}
7c673cae 6708
11fdf7f2
TL
6709 int flush(rgw::AioResultList&& results) {
6710 int r = rgw::check_for_errors(results);
6711 if (r < 0) {
6712 return r;
7c673cae 6713 }
7c673cae 6714
11fdf7f2
TL
6715 auto cmp = [](const auto& lhs, const auto& rhs) { return lhs.id < rhs.id; };
6716 results.sort(cmp); // merge() requires results to be sorted first
6717 completed.merge(results, cmp); // merge results in sorted order
7c673cae 6718
11fdf7f2
TL
6719 while (!completed.empty() && completed.front().id == offset) {
6720 auto bl = std::move(completed.front().data);
6721 completed.pop_front_and_dispose(std::default_delete<rgw::AioResultEntry>{});
7c673cae 6722
11fdf7f2
TL
6723 offset += bl.length();
6724 int r = client_cb->handle_data(bl, 0, bl.length());
6725 if (r < 0) {
6726 return r;
6727 }
7c673cae 6728 }
11fdf7f2 6729 return 0;
7c673cae
FG
6730 }
6731
11fdf7f2
TL
6732 void cancel() {
6733 // wait for all completions to drain and ignore the results
6734 aio->drain();
7c673cae
FG
6735 }
6736
11fdf7f2
TL
6737 int drain() {
6738 auto c = aio->wait();
6739 while (!c.empty()) {
6740 int r = flush(std::move(c));
7c673cae 6741 if (r < 0) {
11fdf7f2 6742 cancel();
7c673cae
FG
6743 return r;
6744 }
11fdf7f2 6745 c = aio->wait();
7c673cae 6746 }
11fdf7f2 6747 return flush(std::move(c));
7c673cae
FG
6748 }
6749};
6750
11fdf7f2
TL
6751static int _get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6752 off_t read_ofs, off_t len, bool is_head_obj,
6753 RGWObjState *astate, void *arg)
7c673cae
FG
6754{
6755 struct get_obj_data *d = (struct get_obj_data *)arg;
6756
11fdf7f2
TL
6757 return d->store->get_obj_iterate_cb(read_obj, obj_ofs, read_ofs, len,
6758 is_head_obj, astate, arg);
7c673cae
FG
6759}
6760
11fdf7f2
TL
6761int RGWRados::get_obj_iterate_cb(const rgw_raw_obj& read_obj, off_t obj_ofs,
6762 off_t read_ofs, off_t len, bool is_head_obj,
6763 RGWObjState *astate, void *arg)
7c673cae 6764{
7c673cae
FG
6765 ObjectReadOperation op;
6766 struct get_obj_data *d = (struct get_obj_data *)arg;
6767 string oid, key;
7c673cae
FG
6768
6769 if (is_head_obj) {
6770 /* only when reading from the head object do we need to do the atomic test */
11fdf7f2 6771 int r = append_atomic_test(astate, op);
7c673cae
FG
6772 if (r < 0)
6773 return r;
6774
6775 if (astate &&
6776 obj_ofs < astate->data.length()) {
11fdf7f2 6777 unsigned chunk_len = std::min((uint64_t)astate->data.length() - obj_ofs, (uint64_t)len);
7c673cae 6778
7c673cae 6779 r = d->client_cb->handle_data(astate->data, obj_ofs, chunk_len);
7c673cae
FG
6780 if (r < 0)
6781 return r;
6782
7c673cae 6783 len -= chunk_len;
11fdf7f2 6784 d->offset += chunk_len;
7c673cae
FG
6785 read_ofs += chunk_len;
6786 obj_ofs += chunk_len;
6787 if (!len)
6788 return 0;
6789 }
6790 }
6791
11fdf7f2
TL
6792 auto obj = d->store->svc.rados->obj(read_obj);
6793 int r = obj.open();
7c673cae 6794 if (r < 0) {
11fdf7f2
TL
6795 ldout(cct, 4) << "failed to open rados context for " << read_obj << dendl;
6796 return r;
7c673cae
FG
6797 }
6798
11fdf7f2
TL
6799 ldout(cct, 20) << "rados->get_obj_iterate_cb oid=" << read_obj.oid << " obj-ofs=" << obj_ofs << " read_ofs=" << read_ofs << " len=" << len << dendl;
6800 op.read(read_ofs, len, nullptr, nullptr);
7c673cae 6801
11fdf7f2
TL
6802 const uint64_t cost = len;
6803 const uint64_t id = obj_ofs; // use logical object offset for sorting replies
7c673cae 6804
11fdf7f2 6805 auto completed = d->aio->submit(obj, &op, cost, id);
7c673cae 6806
11fdf7f2 6807 return d->flush(std::move(completed));
7c673cae
FG
6808}
6809
6810int RGWRados::Object::Read::iterate(int64_t ofs, int64_t end, RGWGetDataCB *cb)
6811{
6812 RGWRados *store = source->get_store();
6813 CephContext *cct = store->ctx();
7c673cae 6814 RGWObjectCtx& obj_ctx = source->get_ctx();
11fdf7f2
TL
6815 const uint64_t chunk_size = cct->_conf->rgw_get_obj_max_req_size;
6816 const uint64_t window_size = cct->_conf->rgw_get_obj_window_size;
7c673cae 6817
11fdf7f2
TL
6818 rgw::AioThrottle aio(window_size);
6819 get_obj_data data(store, cb, &aio, ofs);
7c673cae 6820
11fdf7f2
TL
6821 int r = store->iterate_obj(obj_ctx, source->get_bucket_info(), state.obj,
6822 ofs, end, chunk_size, _get_obj_iterate_cb, &data);
7c673cae 6823 if (r < 0) {
11fdf7f2
TL
6824 ldout(cct, 0) << "iterate_obj() failed with " << r << dendl;
6825 data.cancel(); // drain completions without writing back to client
6826 return r;
7c673cae
FG
6827 }
6828
11fdf7f2 6829 return data.drain();
7c673cae
FG
6830}
6831
6832int RGWRados::iterate_obj(RGWObjectCtx& obj_ctx,
6833 const RGWBucketInfo& bucket_info, const rgw_obj& obj,
11fdf7f2
TL
6834 off_t ofs, off_t end, uint64_t max_chunk_size,
6835 iterate_obj_cb cb, void *arg)
7c673cae
FG
6836{
6837 rgw_raw_obj head_obj;
6838 rgw_raw_obj read_obj;
6839 uint64_t read_ofs = ofs;
6840 uint64_t len;
6841 bool reading_from_head = true;
6842 RGWObjState *astate = NULL;
6843
6844 obj_to_raw(bucket_info.placement_rule, obj, &head_obj);
6845
6846 int r = get_obj_state(&obj_ctx, bucket_info, obj, &astate, false);
6847 if (r < 0) {
6848 return r;
6849 }
6850
6851 if (end < 0)
6852 len = 0;
6853 else
6854 len = end - ofs + 1;
6855
6856 if (astate->has_manifest) {
6857 /* now get the relevant object stripe */
6858 RGWObjManifest::obj_iterator iter = astate->manifest.obj_find(ofs);
6859
6860 RGWObjManifest::obj_iterator obj_end = astate->manifest.obj_end();
6861
6862 for (; iter != obj_end && ofs <= end; ++iter) {
6863 off_t stripe_ofs = iter.get_stripe_ofs();
6864 off_t next_stripe_ofs = stripe_ofs + iter.get_stripe_size();
6865
6866 while (ofs < next_stripe_ofs && ofs <= end) {
6867 read_obj = iter.get_location().get_raw_obj(this);
11fdf7f2 6868 uint64_t read_len = std::min(len, iter.get_stripe_size() - (ofs - stripe_ofs));
7c673cae
FG
6869 read_ofs = iter.location_ofs() + (ofs - stripe_ofs);
6870
6871 if (read_len > max_chunk_size) {
6872 read_len = max_chunk_size;
6873 }
6874
6875 reading_from_head = (read_obj == head_obj);
11fdf7f2 6876 r = cb(read_obj, ofs, read_ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6877 if (r < 0) {
6878 return r;
6879 }
6880
6881 len -= read_len;
6882 ofs += read_len;
6883 }
6884 }
6885 } else {
6886 while (ofs <= end) {
6887 read_obj = head_obj;
11fdf7f2 6888 uint64_t read_len = std::min(len, max_chunk_size);
7c673cae 6889
11fdf7f2 6890 r = cb(read_obj, ofs, ofs, read_len, reading_from_head, astate, arg);
7c673cae
FG
6891 if (r < 0) {
6892 return r;
6893 }
6894
6895 len -= read_len;
6896 ofs += read_len;
6897 }
6898 }
6899
6900 return 0;
6901}
6902
6903int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectWriteOperation *op)
6904{
6905 rgw_rados_ref ref;
6906 int r = get_obj_head_ref(bucket_info, obj, &ref);
6907 if (r < 0) {
6908 return r;
6909 }
6910
11fdf7f2 6911 return ref.ioctx.operate(ref.obj.oid, op);
7c673cae
FG
6912}
6913
6914int RGWRados::obj_operate(const RGWBucketInfo& bucket_info, const rgw_obj& obj, ObjectReadOperation *op)
6915{
6916 rgw_rados_ref ref;
6917 int r = get_obj_head_ref(bucket_info, obj, &ref);
6918 if (r < 0) {
6919 return r;
6920 }
6921
6922 bufferlist outbl;
6923
11fdf7f2 6924 return ref.ioctx.operate(ref.obj.oid, op, &outbl);
7c673cae
FG
6925}
6926
6927int RGWRados::olh_init_modification_impl(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, string *op_tag)
6928{
6929 ObjectWriteOperation op;
6930
11fdf7f2 6931 ceph_assert(olh_obj.key.instance.empty());
7c673cae
FG
6932
6933 bool has_tag = (state.exists && has_olh_tag(state.attrset));
6934
6935 if (!state.exists) {
6936 op.create(true);
6937 } else {
6938 op.assert_exists();
b32b8144
FG
6939 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
6940 op.mtime2(&mtime_ts);
7c673cae
FG
6941 }
6942
6943 /*
6944 * 3 possible cases: olh object doesn't exist, it exists as an olh, it exists as a regular object.
6945 * If it exists as a regular object we'll need to transform it into an olh. We'll do it in two
6946 * steps, first change its tag and set the olh pending attrs. Once write is done we'll need to
6947 * truncate it, remove extra attrs, and send it to the garbage collection. The bucket index olh
6948 * log will reflect that.
6949 *
6950 * Need to generate separate olh and obj tags, as olh can be colocated with object data. obj_tag
6951 * is used for object data instance, olh_tag for olh instance.
6952 */
6953 if (has_tag) {
6954 /* guard against racing writes */
6955 bucket_index_guard_olh_op(state, op);
6956 }
6957
6958 if (!has_tag) {
6959 /* obj tag */
6960 string obj_tag;
11fdf7f2
TL
6961 gen_rand_alphanumeric_lower(cct, &obj_tag, 32);
6962
7c673cae
FG
6963 bufferlist bl;
6964 bl.append(obj_tag.c_str(), obj_tag.size());
6965 op.setxattr(RGW_ATTR_ID_TAG, bl);
6966
6967 state.attrset[RGW_ATTR_ID_TAG] = bl;
6968 state.obj_tag = bl;
6969
6970 /* olh tag */
6971 string olh_tag;
11fdf7f2
TL
6972 gen_rand_alphanumeric_lower(cct, &olh_tag, 32);
6973
7c673cae
FG
6974 bufferlist olh_bl;
6975 olh_bl.append(olh_tag.c_str(), olh_tag.size());
6976 op.setxattr(RGW_ATTR_OLH_ID_TAG, olh_bl);
6977
6978 state.attrset[RGW_ATTR_OLH_ID_TAG] = olh_bl;
6979 state.olh_tag = olh_bl;
6980 state.is_olh = true;
6981
6982 bufferlist verbl;
6983 op.setxattr(RGW_ATTR_OLH_VER, verbl);
6984 }
6985
6986 bufferlist bl;
6987 RGWOLHPendingInfo pending_info;
6988 pending_info.time = real_clock::now();
11fdf7f2 6989 encode(pending_info, bl);
7c673cae
FG
6990
6991#define OLH_PENDING_TAG_LEN 32
6992 /* tag will start with current time epoch, this so that entries are sorted by time */
6993 char buf[32];
6994 utime_t ut(pending_info.time);
6995 snprintf(buf, sizeof(buf), "%016llx", (unsigned long long)ut.sec());
6996 *op_tag = buf;
6997
6998 string s;
11fdf7f2
TL
6999 gen_rand_alphanumeric_lower(cct, &s, OLH_PENDING_TAG_LEN - op_tag->size());
7000
7c673cae
FG
7001 op_tag->append(s);
7002
7003 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7004 attr_name.append(*op_tag);
7005
7006 op.setxattr(attr_name.c_str(), bl);
7007
11fdf7f2 7008 int ret = obj_operate(bucket_info, olh_obj, &op);
7c673cae
FG
7009 if (ret < 0) {
7010 return ret;
7011 }
7012
7013 state.exists = true;
7014 state.attrset[attr_name] = bl;
7015
7016 return 0;
7017}
7018
7019int RGWRados::olh_init_modification(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj, string *op_tag)
7020{
7021 int ret;
7022
7023 ret = olh_init_modification_impl(bucket_info, state, obj, op_tag);
7024 if (ret == -EEXIST) {
7025 ret = -ECANCELED;
7026 }
7027
7028 return ret;
7029}
7030
f64942e4
AA
7031int RGWRados::guard_reshard(BucketShard *bs,
7032 const rgw_obj& obj_instance,
7033 const RGWBucketInfo& bucket_info,
7034 std::function<int(BucketShard *)> call)
31f18b77
FG
7035{
7036 rgw_obj obj;
7037 const rgw_obj *pobj = &obj_instance;
7038 int r;
7039
7040 for (int i = 0; i < NUM_RESHARD_RETRIES; ++i) {
f64942e4 7041 r = bs->init(pobj->bucket, *pobj, nullptr /* no RGWBucketInfo */);
31f18b77
FG
7042 if (r < 0) {
7043 ldout(cct, 5) << "bs.init() returned ret=" << r << dendl;
7044 return r;
7045 }
7046 r = call(bs);
7047 if (r != -ERR_BUSY_RESHARDING) {
7048 break;
7049 }
7050 ldout(cct, 0) << "NOTICE: resharding operation on bucket index detected, blocking" << dendl;
7051 string new_bucket_id;
11fdf7f2 7052 r = block_while_resharding(bs, &new_bucket_id, bucket_info, null_yield);
31f18b77
FG
7053 if (r == -ERR_BUSY_RESHARDING) {
7054 continue;
7055 }
7056 if (r < 0) {
7057 return r;
7058 }
7059 ldout(cct, 20) << "reshard completion identified, new_bucket_id=" << new_bucket_id << dendl;
7060 i = 0; /* resharding is finished, make sure we can retry */
7061
7062 obj = *pobj;
7063 obj.bucket.update_bucket_id(new_bucket_id);
7064 pobj = &obj;
81eedcae 7065 } // for loop
31f18b77
FG
7066
7067 if (r < 0) {
7068 return r;
7069 }
7070
7071 return 0;
7072}
7073
f64942e4
AA
7074int RGWRados::block_while_resharding(RGWRados::BucketShard *bs,
7075 string *new_bucket_id,
11fdf7f2
TL
7076 const RGWBucketInfo& bucket_info,
7077 optional_yield y)
31f18b77 7078{
11fdf7f2
TL
7079 int ret = 0;
7080 cls_rgw_bucket_instance_entry entry;
7081
81eedcae
TL
7082 // since we want to run this recovery code from two distinct places,
7083 // let's just put it in a lambda so we can easily re-use; if the
7084 // lambda successfully fetches a new bucket id, it sets
7085 // new_bucket_id and returns 0, otherwise it returns a negative
7086 // error code
7087 auto fetch_new_bucket_id =
7088 [this, bucket_info](const std::string& log_tag,
7089 std::string* new_bucket_id) -> int {
7090 RGWBucketInfo fresh_bucket_info = bucket_info;
7091 int ret = try_refresh_bucket_info(fresh_bucket_info, nullptr);
7092 if (ret < 0) {
7093 ldout(cct, 0) << __func__ <<
7094 " ERROR: failed to refresh bucket info after reshard at " <<
7095 log_tag << ": " << cpp_strerror(-ret) << dendl;
7096 return ret;
7097 }
7098 *new_bucket_id = fresh_bucket_info.bucket.bucket_id;
7099 return 0;
7100 };
7101
7102 constexpr int num_retries = 10;
7103 for (int i = 1; i <= num_retries; i++) { // nb: 1-based for loop
11fdf7f2 7104 ret = cls_rgw_get_bucket_resharding(bs->index_ctx, bs->bucket_obj, &entry);
81eedcae
TL
7105 if (ret == -ENOENT) {
7106 return fetch_new_bucket_id("get_bucket_resharding_failed", new_bucket_id);
7107 } else if (ret < 0) {
7108 ldout(cct, 0) << __func__ <<
7109 " ERROR: failed to get bucket resharding : " << cpp_strerror(-ret) <<
7110 dendl;
11fdf7f2
TL
7111 return ret;
7112 }
81eedcae 7113
11fdf7f2 7114 if (!entry.resharding_in_progress()) {
81eedcae
TL
7115 return fetch_new_bucket_id("get_bucket_resharding_succeeded",
7116 new_bucket_id);
11fdf7f2 7117 }
31f18b77 7118
81eedcae
TL
7119 ldout(cct, 20) << "NOTICE: reshard still in progress; " <<
7120 (i < num_retries ? "retrying" : "too many retries") << dendl;
7121
7122 if (i == num_retries) {
11fdf7f2
TL
7123 break;
7124 }
7125
7126 // If bucket is erroneously marked as resharding (e.g., crash or
7127 // other error) then fix it. If we can take the bucket reshard
7128 // lock then it means no other resharding should be taking place,
7129 // and we're free to clear the flags.
7130 {
7131 // since we expect to do this rarely, we'll do our work in a
7132 // block and erase our work after each try
7133
7134 RGWObjectCtx obj_ctx(this);
7135 const rgw_bucket& b = bs->bucket;
7136 std::string bucket_id = b.get_key();
7137 RGWBucketReshardLock reshard_lock(this, bucket_info, true);
7138 ret = reshard_lock.lock();
7139 if (ret < 0) {
7140 ldout(cct, 20) << __func__ <<
7141 " INFO: failed to take reshard lock for bucket " <<
7142 bucket_id << "; expected if resharding underway" << dendl;
7143 } else {
7144 ldout(cct, 10) << __func__ <<
7145 " INFO: was able to take reshard lock for bucket " <<
7146 bucket_id << dendl;
7147 ret = RGWBucketReshard::clear_resharding(this, bucket_info);
7148 if (ret < 0) {
7149 reshard_lock.unlock();
7150 ldout(cct, 0) << __func__ <<
7151 " ERROR: failed to clear resharding flags for bucket " <<
7152 bucket_id << dendl;
7153 } else {
7154 reshard_lock.unlock();
7155 ldout(cct, 5) << __func__ <<
7156 " INFO: apparently successfully cleared resharding flags for "
7157 "bucket " << bucket_id << dendl;
7158 continue; // if we apparently succeed immediately test again
7159 } // if clear resharding succeeded
7160 } // if taking of lock succeeded
7161 } // block to encapsulate recovery from incomplete reshard
7162
7163 ret = reshard_wait->wait(y);
7164 if (ret < 0) {
81eedcae
TL
7165 ldout(cct, 0) << __func__ <<
7166 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2
TL
7167 return ret;
7168 }
81eedcae
TL
7169 } // for loop
7170
7171 ldout(cct, 0) << __func__ <<
7172 " ERROR: bucket is still resharding, please retry" << dendl;
11fdf7f2 7173 return -ERR_BUSY_RESHARDING;
31f18b77
FG
7174}
7175
7c673cae
FG
7176int RGWRados::bucket_index_link_olh(const RGWBucketInfo& bucket_info, RGWObjState& olh_state, const rgw_obj& obj_instance,
7177 bool delete_marker,
7178 const string& op_tag,
7179 struct rgw_bucket_dir_entry_meta *meta,
7180 uint64_t olh_epoch,
91327a77
AA
7181 real_time unmod_since, bool high_precision_time,
7182 rgw_zone_set *_zones_trace, bool log_data_change)
7c673cae
FG
7183{
7184 rgw_rados_ref ref;
7185 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7186 if (r < 0) {
7187 return r;
7188 }
7189
31f18b77
FG
7190 rgw_zone_set zones_trace;
7191 if (_zones_trace) {
7192 zones_trace = *_zones_trace;
7c673cae 7193 }
11fdf7f2 7194 zones_trace.insert(svc.zone->get_zone().id);
7c673cae 7195
31f18b77
FG
7196 BucketShard bs(this);
7197
7c673cae 7198 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
7199 r = guard_reshard(&bs, obj_instance, bucket_info,
7200 [&](BucketShard *bs) -> int {
7201 librados::ObjectWriteOperation op;
7202 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7203 return cls_rgw_bucket_link_olh(bs->index_ctx, op,
7204 bs->bucket_obj, key, olh_state.olh_tag, delete_marker, op_tag, meta, olh_epoch,
7205 unmod_since, high_precision_time,
11fdf7f2 7206 svc.zone->get_zone().log_data, zones_trace);
31f18b77
FG
7207 });
7208 if (r < 0) {
7209 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
7210 return r;
7c673cae
FG
7211 }
7212
91327a77
AA
7213 if (log_data_change && bucket_info.datasync_flag_enabled()) {
7214 data_log->add_entry(bs.bucket, bs.shard_id);
7215 }
7216
7c673cae
FG
7217 return 0;
7218}
7219
7220void RGWRados::bucket_index_guard_olh_op(RGWObjState& olh_state, ObjectOperation& op)
7221{
7222 ldout(cct, 20) << __func__ << "(): olh_state.olh_tag=" << string(olh_state.olh_tag.c_str(), olh_state.olh_tag.length()) << dendl;
7223 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_state.olh_tag);
7224}
7225
7226int RGWRados::bucket_index_unlink_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj_instance,
31f18b77 7227 const string& op_tag, const string& olh_tag, uint64_t olh_epoch, rgw_zone_set *_zones_trace)
7c673cae
FG
7228{
7229 rgw_rados_ref ref;
7230 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7231 if (r < 0) {
7232 return r;
7233 }
7234
31f18b77
FG
7235 rgw_zone_set zones_trace;
7236 if (_zones_trace) {
7237 zones_trace = *_zones_trace;
7c673cae 7238 }
11fdf7f2 7239 zones_trace.insert(svc.zone->get_zone().id);
31f18b77
FG
7240
7241 BucketShard bs(this);
7c673cae
FG
7242
7243 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), obj_instance.key.instance);
f64942e4
AA
7244 r = guard_reshard(&bs, obj_instance, bucket_info,
7245 [&](BucketShard *bs) -> int {
7246 librados::ObjectWriteOperation op;
7247 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7248 return cls_rgw_bucket_unlink_instance(bs->index_ctx, op, bs->bucket_obj, key, op_tag,
11fdf7f2 7249 olh_tag, olh_epoch, svc.zone->get_zone().log_data, zones_trace);
31f18b77
FG
7250 });
7251 if (r < 0) {
7252 ldout(cct, 20) << "cls_rgw_bucket_link_olh() returned r=" << r << dendl;
7253 return r;
7c673cae
FG
7254 }
7255
7256 return 0;
7257}
7258
7259int RGWRados::bucket_index_read_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state,
7260 const rgw_obj& obj_instance, uint64_t ver_marker,
7261 map<uint64_t, vector<rgw_bucket_olh_log_entry> > *log,
7262 bool *is_truncated)
7263{
7264 rgw_rados_ref ref;
7265 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7266 if (r < 0) {
7267 return r;
7268 }
7269
7270 BucketShard bs(this);
f64942e4
AA
7271 int ret =
7272 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
7273 if (ret < 0) {
7274 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7275 return ret;
7276 }
7277
7278 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7279
7280 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7281
f64942e4
AA
7282 ret = guard_reshard(&bs, obj_instance, bucket_info,
7283 [&](BucketShard *bs) -> int {
7284 ObjectReadOperation op;
7285 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7286 return cls_rgw_get_olh_log(bs->index_ctx, bs->bucket_obj, op,
7287 key, ver_marker, olh_tag, log, is_truncated);
7288 });
31f18b77
FG
7289 if (ret < 0) {
7290 ldout(cct, 20) << "cls_rgw_get_olh_log() returned r=" << r << dendl;
7c673cae 7291 return ret;
31f18b77 7292 }
7c673cae
FG
7293
7294 return 0;
7295}
7296
a8e16298
TL
7297// a multisite sync bug resulted in the OLH head attributes being overwritten by
7298// the attributes from another zone, causing link_olh() to fail endlessly due to
7299// olh_tag mismatch. this attempts to detect this case and reconstruct the OLH
7300// attributes from the bucket index. see http://tracker.ceph.com/issues/37792
7301int RGWRados::repair_olh(RGWObjState* state, const RGWBucketInfo& bucket_info,
7302 const rgw_obj& obj)
7303{
7304 // fetch the current olh entry from the bucket index
7305 rgw_bucket_olh_entry olh;
7306 int r = bi_get_olh(bucket_info, obj, &olh);
7307 if (r < 0) {
7308 ldout(cct, 0) << "repair_olh failed to read olh entry for " << obj << dendl;
7309 return r;
7310 }
11fdf7f2 7311 if (olh.tag == rgw_bl_str(state->olh_tag)) { // mismatch already resolved?
a8e16298
TL
7312 return 0;
7313 }
7314
7315 ldout(cct, 4) << "repair_olh setting olh_tag=" << olh.tag
7316 << " key=" << olh.key << " delete_marker=" << olh.delete_marker << dendl;
7317
7318 // rewrite OLH_ID_TAG and OLH_INFO from current olh
7319 ObjectWriteOperation op;
7320 // assert this is the same olh tag we think we're fixing
7321 bucket_index_guard_olh_op(*state, op);
7322 // preserve existing mtime
7323 struct timespec mtime_ts = ceph::real_clock::to_timespec(state->mtime);
7324 op.mtime2(&mtime_ts);
7325 {
7326 bufferlist bl;
7327 bl.append(olh.tag.c_str(), olh.tag.size());
7328 op.setxattr(RGW_ATTR_OLH_ID_TAG, bl);
7329 }
7330 {
7331 RGWOLHInfo info;
7332 info.target = rgw_obj(bucket_info.bucket, olh.key);
7333 info.removed = olh.delete_marker;
7334 bufferlist bl;
7335 encode(info, bl);
7336 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7337 }
7338 rgw_rados_ref ref;
7339 r = get_obj_head_ref(bucket_info, obj, &ref);
7340 if (r < 0) {
7341 return r;
7342 }
11fdf7f2 7343 r = ref.ioctx.operate(ref.obj.oid, &op);
a8e16298
TL
7344 if (r < 0) {
7345 ldout(cct, 0) << "repair_olh failed to write olh attributes with "
7346 << cpp_strerror(r) << dendl;
7347 return r;
7348 }
7349 return 0;
7350}
7351
7c673cae
FG
7352int RGWRados::bucket_index_trim_olh_log(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance, uint64_t ver)
7353{
7354 rgw_rados_ref ref;
7355 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7356 if (r < 0) {
7357 return r;
7358 }
7359
7360 BucketShard bs(this);
f64942e4
AA
7361 int ret =
7362 bs.init(obj_instance.bucket, obj_instance, nullptr /* no RGWBucketInfo */);
7c673cae
FG
7363 if (ret < 0) {
7364 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
7365 return ret;
7366 }
7367
7368 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7369
7370 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7371
f64942e4
AA
7372 ret = guard_reshard(&bs, obj_instance, bucket_info,
7373 [&](BucketShard *pbs) -> int {
7374 ObjectWriteOperation op;
7375 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7376 cls_rgw_trim_olh_log(op, key, ver, olh_tag);
7377 return pbs->index_ctx.operate(pbs->bucket_obj, &op);
31f18b77
FG
7378 });
7379 if (ret < 0) {
7380 ldout(cct, 20) << "cls_rgw_trim_olh_log() returned r=" << ret << dendl;
7c673cae 7381 return ret;
31f18b77 7382 }
7c673cae
FG
7383
7384 return 0;
7385}
7386
7387int RGWRados::bucket_index_clear_olh(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& obj_instance)
7388{
7389 rgw_rados_ref ref;
7390 int r = get_obj_head_ref(bucket_info, obj_instance, &ref);
7391 if (r < 0) {
7392 return r;
7393 }
7394
7395 BucketShard bs(this);
7c673cae
FG
7396
7397 string olh_tag(state.olh_tag.c_str(), state.olh_tag.length());
7398
7399 cls_rgw_obj_key key(obj_instance.key.get_index_key_name(), string());
7400
f64942e4
AA
7401 int ret = guard_reshard(&bs, obj_instance, bucket_info,
7402 [&](BucketShard *pbs) -> int {
7403 ObjectWriteOperation op;
7404 cls_rgw_guard_bucket_resharding(op, -ERR_BUSY_RESHARDING);
7405 return cls_rgw_clear_olh(pbs->index_ctx, op, pbs->bucket_obj, key, olh_tag);
31f18b77 7406 });
7c673cae
FG
7407 if (ret < 0) {
7408 ldout(cct, 5) << "cls_rgw_clear_olh() returned ret=" << ret << dendl;
7409 return ret;
7410 }
7411
7412 return 0;
7413}
7414
7415int RGWRados::apply_olh_log(RGWObjectCtx& obj_ctx, RGWObjState& state, const RGWBucketInfo& bucket_info, const rgw_obj& obj,
7416 bufferlist& olh_tag, map<uint64_t, vector<rgw_bucket_olh_log_entry> >& log,
31f18b77 7417 uint64_t *plast_ver, rgw_zone_set* zones_trace)
7c673cae
FG
7418{
7419 if (log.empty()) {
7420 return 0;
7421 }
7422
7423 librados::ObjectWriteOperation op;
7424
7425 uint64_t last_ver = log.rbegin()->first;
7426 *plast_ver = last_ver;
7427
7428 map<uint64_t, vector<rgw_bucket_olh_log_entry> >::iterator iter = log.begin();
7429
7430 op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
7431 op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_GT, last_ver);
7432
a8e16298
TL
7433 bufferlist ver_bl;
7434 string last_ver_s = to_string(last_ver);
7435 ver_bl.append(last_ver_s.c_str(), last_ver_s.size());
7436 op.setxattr(RGW_ATTR_OLH_VER, ver_bl);
7437
b32b8144
FG
7438 struct timespec mtime_ts = real_clock::to_timespec(state.mtime);
7439 op.mtime2(&mtime_ts);
7440
7c673cae
FG
7441 bool need_to_link = false;
7442 cls_rgw_obj_key key;
7443 bool delete_marker = false;
7444 list<cls_rgw_obj_key> remove_instances;
7445 bool need_to_remove = false;
7446
7447 for (iter = log.begin(); iter != log.end(); ++iter) {
7448 vector<rgw_bucket_olh_log_entry>::iterator viter = iter->second.begin();
7449 for (; viter != iter->second.end(); ++viter) {
7450 rgw_bucket_olh_log_entry& entry = *viter;
7451
7452 ldout(cct, 20) << "olh_log_entry: op=" << (int)entry.op
7453 << " key=" << entry.key.name << "[" << entry.key.instance << "] "
7454 << (entry.delete_marker ? "(delete)" : "") << dendl;
7455 switch (entry.op) {
7456 case CLS_RGW_OLH_OP_REMOVE_INSTANCE:
7457 remove_instances.push_back(entry.key);
7458 break;
7459 case CLS_RGW_OLH_OP_LINK_OLH:
7460 need_to_link = true;
7461 need_to_remove = false;
7462 key = entry.key;
7463 delete_marker = entry.delete_marker;
7464 break;
7465 case CLS_RGW_OLH_OP_UNLINK_OLH:
7466 need_to_remove = true;
7467 need_to_link = false;
7468 break;
7469 default:
7470 ldout(cct, 0) << "ERROR: apply_olh_log: invalid op: " << (int)entry.op << dendl;
7471 return -EIO;
7472 }
7473 string attr_name = RGW_ATTR_OLH_PENDING_PREFIX;
7474 attr_name.append(entry.op_tag);
7475 op.rmxattr(attr_name.c_str());
7476 }
7477 }
7478
7479 rgw_rados_ref ref;
7480 int r = get_obj_head_ref(bucket_info, obj, &ref);
7481 if (r < 0) {
7482 return r;
7483 }
7484
7485 const rgw_bucket& bucket = obj.bucket;
7486
7487 if (need_to_link) {
7488 rgw_obj target(bucket, key);
7489 RGWOLHInfo info;
7490 info.target = target;
7491 info.removed = delete_marker;
7492 bufferlist bl;
11fdf7f2 7493 encode(info, bl);
7c673cae
FG
7494 op.setxattr(RGW_ATTR_OLH_INFO, bl);
7495 }
7496
7497 /* first remove object instances */
7498 for (list<cls_rgw_obj_key>::iterator liter = remove_instances.begin();
7499 liter != remove_instances.end(); ++liter) {
7500 cls_rgw_obj_key& key = *liter;
7501 rgw_obj obj_instance(bucket, key);
31f18b77 7502 int ret = delete_obj(obj_ctx, bucket_info, obj_instance, 0, RGW_BILOG_FLAG_VERSIONED_OP, ceph::real_time(), zones_trace);
7c673cae
FG
7503 if (ret < 0 && ret != -ENOENT) {
7504 ldout(cct, 0) << "ERROR: delete_obj() returned " << ret << " obj_instance=" << obj_instance << dendl;
7505 return ret;
7506 }
7507 }
7508
7509 /* update olh object */
11fdf7f2 7510 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
7511 if (r == -ECANCELED) {
7512 r = 0;
7513 }
7514 if (r < 0) {
7515 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7516 return r;
7517 }
7518
7519 r = bucket_index_trim_olh_log(bucket_info, state, obj, last_ver);
7520 if (r < 0) {
7521 ldout(cct, 0) << "ERROR: could not trim olh log, r=" << r << dendl;
7522 return r;
7523 }
7524
7525 if (need_to_remove) {
7526 ObjectWriteOperation rm_op;
7527
7528 rm_op.cmpxattr(RGW_ATTR_OLH_ID_TAG, CEPH_OSD_CMPXATTR_OP_EQ, olh_tag);
a8e16298 7529 rm_op.cmpxattr(RGW_ATTR_OLH_VER, CEPH_OSD_CMPXATTR_OP_EQ, last_ver);
7c673cae
FG
7530 cls_obj_check_prefix_exist(rm_op, RGW_ATTR_OLH_PENDING_PREFIX, true); /* fail if found one of these, pending modification */
7531 rm_op.remove();
7532
11fdf7f2 7533 r = ref.ioctx.operate(ref.obj.oid, &rm_op);
7c673cae
FG
7534 if (r == -ECANCELED) {
7535 return 0; /* someone else won this race */
7536 } else {
7537 /*
7538 * only clear if was successful, otherwise we might clobber pending operations on this object
7539 */
7540 r = bucket_index_clear_olh(bucket_info, state, obj);
7541 if (r < 0) {
7542 ldout(cct, 0) << "ERROR: could not clear bucket index olh entries r=" << r << dendl;
7543 return r;
7544 }
7545 }
7546 }
7547
7548 return 0;
7549}
7550
7551/*
7552 * read olh log and apply it
7553 */
31f18b77 7554int RGWRados::update_olh(RGWObjectCtx& obj_ctx, RGWObjState *state, const RGWBucketInfo& bucket_info, const rgw_obj& obj, rgw_zone_set *zones_trace)
7c673cae
FG
7555{
7556 map<uint64_t, vector<rgw_bucket_olh_log_entry> > log;
7557 bool is_truncated;
7558 uint64_t ver_marker = 0;
7559
7560 do {
7561 int ret = bucket_index_read_olh_log(bucket_info, *state, obj, ver_marker, &log, &is_truncated);
7562 if (ret < 0) {
7563 return ret;
7564 }
31f18b77 7565 ret = apply_olh_log(obj_ctx, *state, bucket_info, obj, state->olh_tag, log, &ver_marker, zones_trace);
7c673cae
FG
7566 if (ret < 0) {
7567 return ret;
7568 }
7569 } while (is_truncated);
7570
7571 return 0;
7572}
7573
7574int RGWRados::set_olh(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj, bool delete_marker, rgw_bucket_dir_entry_meta *meta,
91327a77
AA
7575 uint64_t olh_epoch, real_time unmod_since, bool high_precision_time,
7576 rgw_zone_set *zones_trace, bool log_data_change)
7c673cae
FG
7577{
7578 string op_tag;
7579
7580 rgw_obj olh_obj = target_obj;
7581 olh_obj.key.instance.clear();
7582
7583 RGWObjState *state = NULL;
7584
7585 int ret = 0;
7586 int i;
31f18b77 7587
7c673cae
FG
7588#define MAX_ECANCELED_RETRY 100
7589 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7590 if (ret == -ECANCELED) {
11fdf7f2 7591 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7592 }
7593
7594 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
7595 if (ret < 0) {
7596 return ret;
7597 }
7598
7599 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7600 if (ret < 0) {
7601 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7602 if (ret == -ECANCELED) {
7603 continue;
7604 }
7605 return ret;
7606 }
91327a77
AA
7607 ret = bucket_index_link_olh(bucket_info, *state, target_obj, delete_marker,
7608 op_tag, meta, olh_epoch, unmod_since, high_precision_time,
7609 zones_trace, log_data_change);
7c673cae
FG
7610 if (ret < 0) {
7611 ldout(cct, 20) << "bucket_index_link_olh() target_obj=" << target_obj << " delete_marker=" << (int)delete_marker << " returned " << ret << dendl;
7612 if (ret == -ECANCELED) {
a8e16298
TL
7613 // the bucket index rejected the link_olh() due to olh tag mismatch;
7614 // attempt to reconstruct olh head attributes based on the bucket index
7615 int r2 = repair_olh(state, bucket_info, olh_obj);
7616 if (r2 < 0 && r2 != -ECANCELED) {
7617 return r2;
7618 }
7c673cae
FG
7619 continue;
7620 }
7621 return ret;
7622 }
7623 break;
7624 }
7625
7626 if (i == MAX_ECANCELED_RETRY) {
7627 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7628 return -EIO;
7629 }
7630
7631 ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7632 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7633 ret = 0;
7634 }
7635 if (ret < 0) {
7636 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7637 return ret;
7638 }
7639
7640 return 0;
7641}
7642
7643int RGWRados::unlink_obj_instance(RGWObjectCtx& obj_ctx, RGWBucketInfo& bucket_info, const rgw_obj& target_obj,
31f18b77 7644 uint64_t olh_epoch, rgw_zone_set *zones_trace)
7c673cae
FG
7645{
7646 string op_tag;
7647
7648 rgw_obj olh_obj = target_obj;
7649 olh_obj.key.instance.clear();
7650
7651 RGWObjState *state = NULL;
7652
7653 int ret = 0;
7654 int i;
7655
7656 for (i = 0; i < MAX_ECANCELED_RETRY; i++) {
7657 if (ret == -ECANCELED) {
11fdf7f2 7658 obj_ctx.invalidate(olh_obj);
7c673cae
FG
7659 }
7660
7661 ret = get_obj_state(&obj_ctx, bucket_info, olh_obj, &state, false); /* don't follow olh */
7662 if (ret < 0)
7663 return ret;
7664
7665 ret = olh_init_modification(bucket_info, *state, olh_obj, &op_tag);
7666 if (ret < 0) {
7667 ldout(cct, 20) << "olh_init_modification() target_obj=" << target_obj << " returned " << ret << dendl;
7668 if (ret == -ECANCELED) {
7669 continue;
7670 }
7671 return ret;
7672 }
7673
7674 string olh_tag(state->olh_tag.c_str(), state->olh_tag.length());
7675
31f18b77 7676 ret = bucket_index_unlink_instance(bucket_info, target_obj, op_tag, olh_tag, olh_epoch, zones_trace);
7c673cae
FG
7677 if (ret < 0) {
7678 ldout(cct, 20) << "bucket_index_unlink_instance() target_obj=" << target_obj << " returned " << ret << dendl;
7679 if (ret == -ECANCELED) {
7680 continue;
7681 }
7682 return ret;
7683 }
7684 break;
7685 }
7686
7687 if (i == MAX_ECANCELED_RETRY) {
7688 ldout(cct, 0) << "ERROR: exceeded max ECANCELED retries, aborting (EIO)" << dendl;
7689 return -EIO;
7690 }
7691
31f18b77 7692 ret = update_olh(obj_ctx, state, bucket_info, olh_obj, zones_trace);
7c673cae
FG
7693 if (ret == -ECANCELED) { /* already did what we needed, no need to retry, raced with another user */
7694 return 0;
7695 }
7696 if (ret < 0) {
7697 ldout(cct, 20) << "update_olh() target_obj=" << target_obj << " returned " << ret << dendl;
7698 return ret;
7699 }
7700
7701 return 0;
7702}
7703
11fdf7f2 7704void RGWRados::gen_rand_obj_instance_name(rgw_obj_key *target_key)
7c673cae
FG
7705{
7706#define OBJ_INSTANCE_LEN 32
7707 char buf[OBJ_INSTANCE_LEN + 1];
7708
7709 gen_rand_alphanumeric_no_underscore(cct, buf, OBJ_INSTANCE_LEN); /* don't want it to get url escaped,
7710 no underscore for instance name due to the way we encode the raw keys */
7711
11fdf7f2 7712 target_key->set_instance(buf);
7c673cae
FG
7713}
7714
11fdf7f2 7715void RGWRados::gen_rand_obj_instance_name(rgw_obj *target_obj)
7c673cae 7716{
11fdf7f2 7717 gen_rand_obj_instance_name(&target_obj->key);
7c673cae
FG
7718}
7719
7720int RGWRados::get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj, RGWOLHInfo *olh)
7721{
7722 map<string, bufferlist> unfiltered_attrset;
7723
7724 ObjectReadOperation op;
7725 op.getxattrs(&unfiltered_attrset, NULL);
7726
7727 bufferlist outbl;
7728 int r = obj_operate(bucket_info, obj, &op);
7729
7730 if (r < 0) {
7731 return r;
7732 }
7733 map<string, bufferlist> attrset;
7734
11fdf7f2 7735 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_OLH_PREFIX, &attrset);
7c673cae
FG
7736
7737 map<string, bufferlist>::iterator iter = attrset.find(RGW_ATTR_OLH_INFO);
7738 if (iter == attrset.end()) { /* not an olh */
7739 return -EINVAL;
7740 }
7741
7742 try {
11fdf7f2
TL
7743 auto biter = iter->second.cbegin();
7744 decode(*olh, biter);
7c673cae
FG
7745 } catch (buffer::error& err) {
7746 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
7747 return -EIO;
7748 }
7749
7750 return 0;
7751}
7752
7753void RGWRados::check_pending_olh_entries(map<string, bufferlist>& pending_entries,
7754 map<string, bufferlist> *rm_pending_entries)
7755{
7756 map<string, bufferlist>::iterator iter = pending_entries.begin();
7757
7758 real_time now = real_clock::now();
7759
7760 while (iter != pending_entries.end()) {
11fdf7f2 7761 auto biter = iter->second.cbegin();
7c673cae
FG
7762 RGWOLHPendingInfo pending_info;
7763 try {
11fdf7f2 7764 decode(pending_info, biter);
7c673cae
FG
7765 } catch (buffer::error& err) {
7766 /* skipping bad entry, we could remove it but it might hide a bug */
7767 ldout(cct, 0) << "ERROR: failed to decode pending entry " << iter->first << dendl;
7768 ++iter;
7769 continue;
7770 }
7771
7772 map<string, bufferlist>::iterator cur_iter = iter;
7773 ++iter;
7774 if (now - pending_info.time >= make_timespan(cct->_conf->rgw_olh_pending_timeout_sec)) {
7775 (*rm_pending_entries)[cur_iter->first] = cur_iter->second;
7776 pending_entries.erase(cur_iter);
7777 } else {
7778 /* entries names are sorted by time (rounded to a second) */
7779 break;
7780 }
7781 }
7782}
7783
7784int RGWRados::remove_olh_pending_entries(const RGWBucketInfo& bucket_info, RGWObjState& state, const rgw_obj& olh_obj, map<string, bufferlist>& pending_attrs)
7785{
7c673cae
FG
7786 rgw_rados_ref ref;
7787 int r = get_obj_head_ref(bucket_info, olh_obj, &ref);
7788 if (r < 0) {
7789 return r;
7790 }
7791
81eedcae
TL
7792 // trim no more than 1000 entries per osd op
7793 constexpr int max_entries = 1000;
7c673cae 7794
81eedcae
TL
7795 auto i = pending_attrs.begin();
7796 while (i != pending_attrs.end()) {
7797 ObjectWriteOperation op;
7798 bucket_index_guard_olh_op(state, op);
7799
7800 for (int n = 0; n < max_entries && i != pending_attrs.end(); ++n, ++i) {
7801 op.rmxattr(i->first.c_str());
7802 }
7803
7804 r = ref.ioctx.operate(ref.obj.oid, &op);
7805 if (r == -ENOENT || r == -ECANCELED) {
7806 /* raced with some other change, shouldn't sweat about it */
7807 return 0;
7808 }
7809 if (r < 0) {
7810 ldout(cct, 0) << "ERROR: could not apply olh update, r=" << r << dendl;
7811 return r;
7812 }
7813 }
7c673cae
FG
7814 return 0;
7815}
7816
7817int RGWRados::follow_olh(const RGWBucketInfo& bucket_info, RGWObjectCtx& obj_ctx, RGWObjState *state, const rgw_obj& olh_obj, rgw_obj *target)
7818{
7819 map<string, bufferlist> pending_entries;
11fdf7f2 7820 rgw_filter_attrset(state->attrset, RGW_ATTR_OLH_PENDING_PREFIX, &pending_entries);
7c673cae
FG
7821
7822 map<string, bufferlist> rm_pending_entries;
7823 check_pending_olh_entries(pending_entries, &rm_pending_entries);
7824
7825 if (!rm_pending_entries.empty()) {
7826 int ret = remove_olh_pending_entries(bucket_info, *state, olh_obj, rm_pending_entries);
7827 if (ret < 0) {
7828 ldout(cct, 20) << "ERROR: rm_pending_entries returned ret=" << ret << dendl;
7829 return ret;
7830 }
7831 }
7832 if (!pending_entries.empty()) {
7833 ldout(cct, 20) << __func__ << "(): found pending entries, need to update_olh() on bucket=" << olh_obj.bucket << dendl;
7834
7835 int ret = update_olh(obj_ctx, state, bucket_info, olh_obj);
7836 if (ret < 0) {
7837 return ret;
7838 }
7839 }
7840
7841 map<string, bufferlist>::iterator iter = state->attrset.find(RGW_ATTR_OLH_INFO);
11fdf7f2 7842 ceph_assert(iter != state->attrset.end());
7c673cae
FG
7843 RGWOLHInfo olh;
7844 try {
11fdf7f2
TL
7845 auto biter = iter->second.cbegin();
7846 decode(olh, biter);
7c673cae
FG
7847 } catch (buffer::error& err) {
7848 ldout(cct, 0) << "ERROR: failed to decode olh info" << dendl;
7849 return -EIO;
7850 }
7851
7852 if (olh.removed) {
7853 return -ENOENT;
7854 }
7855
7856 *target = olh.target;
7857
7858 return 0;
7859}
7860
7861int RGWRados::raw_obj_stat(rgw_raw_obj& obj, uint64_t *psize, real_time *pmtime, uint64_t *epoch,
7862 map<string, bufferlist> *attrs, bufferlist *first_chunk,
7863 RGWObjVersionTracker *objv_tracker)
7864{
7865 rgw_rados_ref ref;
7866 int r = get_raw_obj_ref(obj, &ref);
7867 if (r < 0) {
7868 return r;
7869 }
7870
7871 map<string, bufferlist> unfiltered_attrset;
7872 uint64_t size = 0;
7873 struct timespec mtime_ts;
7874
7875 ObjectReadOperation op;
7876 if (objv_tracker) {
7877 objv_tracker->prepare_op_for_read(&op);
7878 }
7879 if (attrs) {
7880 op.getxattrs(&unfiltered_attrset, NULL);
7881 }
7882 if (psize || pmtime) {
7883 op.stat2(&size, &mtime_ts, NULL);
7884 }
7885 if (first_chunk) {
7886 op.read(0, cct->_conf->rgw_max_chunk_size, first_chunk, NULL);
7887 }
7888 bufferlist outbl;
11fdf7f2 7889 r = ref.ioctx.operate(ref.obj.oid, &op, &outbl);
7c673cae
FG
7890
7891 if (epoch) {
7892 *epoch = ref.ioctx.get_last_version();
7893 }
7894
7895 if (r < 0)
7896 return r;
7897
7898 if (psize)
7899 *psize = size;
7900 if (pmtime)
7901 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
7902 if (attrs) {
11fdf7f2 7903 rgw_filter_attrset(unfiltered_attrset, RGW_ATTR_PREFIX, attrs);
7c673cae
FG
7904 }
7905
7906 return 0;
7907}
7908
7909int RGWRados::get_bucket_stats(RGWBucketInfo& bucket_info, int shard_id, string *bucket_ver, string *master_ver,
c07f9fc5 7910 map<RGWObjCategory, RGWStorageStats>& stats, string *max_marker, bool *syncstopped)
7c673cae 7911{
a8e16298 7912 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7913 map<int, string> bucket_instance_ids;
7914 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7915 if (r < 0) {
7916 return r;
7917 }
7918
11fdf7f2 7919 ceph_assert(headers.size() == bucket_instance_ids.size());
7c673cae 7920
a8e16298 7921 auto iter = headers.begin();
7c673cae
FG
7922 map<int, string>::iterator viter = bucket_instance_ids.begin();
7923 BucketIndexShardsManager ver_mgr;
7924 BucketIndexShardsManager master_ver_mgr;
7925 BucketIndexShardsManager marker_mgr;
7c673cae
FG
7926 char buf[64];
7927 for(; iter != headers.end(); ++iter, ++viter) {
a8e16298
TL
7928 accumulate_raw_stats(*iter, stats);
7929 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->ver);
7c673cae 7930 ver_mgr.add(viter->first, string(buf));
a8e16298 7931 snprintf(buf, sizeof(buf), "%lu", (unsigned long)iter->master_ver);
7c673cae
FG
7932 master_ver_mgr.add(viter->first, string(buf));
7933 if (shard_id >= 0) {
a8e16298 7934 *max_marker = iter->max_marker;
7c673cae 7935 } else {
a8e16298 7936 marker_mgr.add(viter->first, iter->max_marker);
7c673cae 7937 }
c07f9fc5 7938 if (syncstopped != NULL)
a8e16298 7939 *syncstopped = iter->syncstopped;
7c673cae
FG
7940 }
7941 ver_mgr.to_string(bucket_ver);
7942 master_ver_mgr.to_string(master_ver);
7943 if (shard_id < 0) {
7944 marker_mgr.to_string(max_marker);
7945 }
7946 return 0;
7947}
7948
7949int RGWRados::get_bi_log_status(RGWBucketInfo& bucket_info, int shard_id,
7950 map<int, string>& markers)
7951{
a8e16298 7952 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
7953 map<int, string> bucket_instance_ids;
7954 int r = cls_bucket_head(bucket_info, shard_id, headers, &bucket_instance_ids);
7955 if (r < 0)
7956 return r;
7957
11fdf7f2 7958 ceph_assert(headers.size() == bucket_instance_ids.size());
7c673cae 7959
a8e16298 7960 auto iter = headers.begin();
7c673cae
FG
7961 map<int, string>::iterator viter = bucket_instance_ids.begin();
7962
7963 for(; iter != headers.end(); ++iter, ++viter) {
7964 if (shard_id >= 0) {
a8e16298 7965 markers[shard_id] = iter->max_marker;
7c673cae 7966 } else {
a8e16298 7967 markers[viter->first] = iter->max_marker;
7c673cae
FG
7968 }
7969 }
7970 return 0;
7971}
7972
7973class RGWGetBucketStatsContext : public RGWGetDirHeader_CB {
7974 RGWGetBucketStats_CB *cb;
7975 uint32_t pendings;
7976 map<RGWObjCategory, RGWStorageStats> stats;
7977 int ret_code;
7978 bool should_cb;
7979 Mutex lock;
7980
7981public:
7982 RGWGetBucketStatsContext(RGWGetBucketStats_CB *_cb, uint32_t _pendings)
7983 : cb(_cb), pendings(_pendings), stats(), ret_code(0), should_cb(true),
7984 lock("RGWGetBucketStatsContext") {}
7985
7986 void handle_response(int r, rgw_bucket_dir_header& header) override {
7987 Mutex::Locker l(lock);
7988 if (should_cb) {
7989 if ( r >= 0) {
7990 accumulate_raw_stats(header, stats);
7991 } else {
7992 ret_code = r;
7993 }
7994
7995 // Are we all done?
7996 if (--pendings == 0) {
7997 if (!ret_code) {
7998 cb->set_response(&stats);
7999 }
8000 cb->handle_response(ret_code);
8001 cb->put();
8002 }
8003 }
8004 }
8005
8006 void unset_cb() {
8007 Mutex::Locker l(lock);
8008 should_cb = false;
8009 }
8010};
8011
8012int RGWRados::get_bucket_stats_async(RGWBucketInfo& bucket_info, int shard_id, RGWGetBucketStats_CB *ctx)
8013{
8014 int num_aio = 0;
c07f9fc5 8015 RGWGetBucketStatsContext *get_ctx = new RGWGetBucketStatsContext(ctx, bucket_info.num_shards ? : 1);
11fdf7f2 8016 ceph_assert(get_ctx);
7c673cae 8017 int r = cls_bucket_head_async(bucket_info, shard_id, get_ctx, &num_aio);
7c673cae
FG
8018 if (r < 0) {
8019 ctx->put();
8020 if (num_aio) {
8021 get_ctx->unset_cb();
8022 }
8023 }
c07f9fc5 8024 get_ctx->put();
7c673cae
FG
8025 return r;
8026}
8027
8028class RGWGetUserStatsContext : public RGWGetUserHeader_CB {
8029 RGWGetUserStats_CB *cb;
8030
8031public:
8032 explicit RGWGetUserStatsContext(RGWGetUserStats_CB * const cb)
8033 : cb(cb) {}
8034
8035 void handle_response(int r, cls_user_header& header) override {
8036 const cls_user_stats& hs = header.stats;
8037 if (r >= 0) {
8038 RGWStorageStats stats;
8039
8040 stats.size = hs.total_bytes;
8041 stats.size_rounded = hs.total_bytes_rounded;
8042 stats.num_objects = hs.total_entries;
8043
8044 cb->set_response(stats);
8045 }
8046
8047 cb->handle_response(r);
8048
8049 cb->put();
8050 }
8051};
8052
8053int RGWRados::get_user_stats(const rgw_user& user, RGWStorageStats& stats)
8054{
8055 string user_str = user.to_str();
8056
8057 cls_user_header header;
8058 int r = cls_user_get_header(user_str, &header);
8059 if (r < 0)
8060 return r;
8061
8062 const cls_user_stats& hs = header.stats;
8063
8064 stats.size = hs.total_bytes;
8065 stats.size_rounded = hs.total_bytes_rounded;
8066 stats.num_objects = hs.total_entries;
8067
8068 return 0;
8069}
8070
8071int RGWRados::get_user_stats_async(const rgw_user& user, RGWGetUserStats_CB *ctx)
8072{
8073 string user_str = user.to_str();
8074
8075 RGWGetUserStatsContext *get_ctx = new RGWGetUserStatsContext(ctx);
8076 int r = cls_user_get_header_async(user_str, get_ctx);
8077 if (r < 0) {
8078 ctx->put();
8079 delete get_ctx;
8080 return r;
8081 }
8082
8083 return 0;
8084}
8085
8086void RGWRados::get_bucket_meta_oid(const rgw_bucket& bucket, string& oid)
8087{
8088 oid = RGW_BUCKET_INSTANCE_MD_PREFIX + bucket.get_key(':');
8089}
8090
8091void RGWRados::get_bucket_instance_obj(const rgw_bucket& bucket, rgw_raw_obj& obj)
8092{
8093 if (!bucket.oid.empty()) {
11fdf7f2 8094 obj.init(svc.zone->get_zone_params().domain_root, bucket.oid);
7c673cae
FG
8095 } else {
8096 string oid;
8097 get_bucket_meta_oid(bucket, oid);
11fdf7f2 8098 obj.init(svc.zone->get_zone_params().domain_root, oid);
7c673cae
FG
8099 }
8100}
8101
11fdf7f2 8102int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const string& meta_key, RGWBucketInfo& info,
7c673cae
FG
8103 real_time *pmtime, map<string, bufferlist> *pattrs)
8104{
8105 size_t pos = meta_key.find(':');
8106 if (pos == string::npos) {
8107 return -EINVAL;
8108 }
8109 string oid = RGW_BUCKET_INSTANCE_MD_PREFIX + meta_key;
8110 rgw_bucket_instance_key_to_oid(oid);
8111
8112 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
8113}
8114
11fdf7f2 8115int RGWRados::get_bucket_instance_info(RGWSysObjectCtx& obj_ctx, const rgw_bucket& bucket, RGWBucketInfo& info,
7c673cae
FG
8116 real_time *pmtime, map<string, bufferlist> *pattrs)
8117{
8118 string oid;
8119 if (bucket.oid.empty()) {
8120 get_bucket_meta_oid(bucket, oid);
8121 } else {
8122 oid = bucket.oid;
8123 }
8124
8125 return get_bucket_instance_from_oid(obj_ctx, oid, info, pmtime, pattrs);
8126}
8127
11fdf7f2 8128int RGWRados::get_bucket_instance_from_oid(RGWSysObjectCtx& obj_ctx, const string& oid, RGWBucketInfo& info,
7c673cae 8129 real_time *pmtime, map<string, bufferlist> *pattrs,
b32b8144
FG
8130 rgw_cache_entry_info *cache_info,
8131 boost::optional<obj_version> refresh_version)
7c673cae 8132{
11fdf7f2
TL
8133 auto& domain_root = svc.zone->get_zone_params().domain_root;
8134
8135 ldout(cct, 20) << "reading from " << domain_root << ":" << oid << dendl;
7c673cae
FG
8136
8137 bufferlist epbl;
8138
11fdf7f2 8139 int ret = rgw_get_system_obj(this, obj_ctx, domain_root,
b32b8144
FG
8140 oid, epbl, &info.objv_tracker, pmtime, pattrs,
8141 cache_info, refresh_version);
7c673cae
FG
8142 if (ret < 0) {
8143 return ret;
8144 }
8145
11fdf7f2 8146 auto iter = epbl.cbegin();
7c673cae 8147 try {
11fdf7f2 8148 decode(info, iter);
7c673cae
FG
8149 } catch (buffer::error& err) {
8150 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
8151 return -EIO;
8152 }
8153 info.bucket.oid = oid;
8154 return 0;
8155}
8156
11fdf7f2 8157int RGWRados::get_bucket_entrypoint_info(RGWSysObjectCtx& obj_ctx,
7c673cae
FG
8158 const string& tenant_name,
8159 const string& bucket_name,
8160 RGWBucketEntryPoint& entry_point,
8161 RGWObjVersionTracker *objv_tracker,
8162 real_time *pmtime,
8163 map<string, bufferlist> *pattrs,
b32b8144
FG
8164 rgw_cache_entry_info *cache_info,
8165 boost::optional<obj_version> refresh_version)
7c673cae
FG
8166{
8167 bufferlist bl;
8168 string bucket_entry;
8169
8170 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
11fdf7f2 8171 int ret = rgw_get_system_obj(this, obj_ctx, svc.zone->get_zone_params().domain_root,
b32b8144
FG
8172 bucket_entry, bl, objv_tracker, pmtime, pattrs,
8173 cache_info, refresh_version);
7c673cae
FG
8174 if (ret < 0) {
8175 return ret;
8176 }
8177
11fdf7f2 8178 auto iter = bl.cbegin();
7c673cae 8179 try {
11fdf7f2 8180 decode(entry_point, iter);
7c673cae
FG
8181 } catch (buffer::error& err) {
8182 ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
8183 return -EIO;
8184 }
8185 return 0;
8186}
8187
11fdf7f2 8188int RGWRados::convert_old_bucket_info(RGWSysObjectCtx& obj_ctx,
7c673cae
FG
8189 const string& tenant_name,
8190 const string& bucket_name)
8191{
8192 RGWBucketEntryPoint entry_point;
8193 real_time ep_mtime;
8194 RGWObjVersionTracker ot;
8195 map<string, bufferlist> attrs;
8196 RGWBucketInfo info;
8197
8198 ldout(cct, 10) << "RGWRados::convert_old_bucket_info(): bucket=" << bucket_name << dendl;
8199
8200 int ret = get_bucket_entrypoint_info(obj_ctx, tenant_name, bucket_name, entry_point, &ot, &ep_mtime, &attrs);
8201 if (ret < 0) {
8202 ldout(cct, 0) << "ERROR: get_bucket_entrypoint_info() returned " << ret << " bucket=" << bucket_name << dendl;
8203 return ret;
8204 }
8205
8206 if (!entry_point.has_bucket_info) {
8207 /* already converted! */
8208 return 0;
8209 }
8210
8211 info = entry_point.old_bucket_info;
8212 info.bucket.oid = bucket_name;
8213 info.ep_objv = ot.read_version;
8214
8215 ot.generate_new_write_ver(cct);
8216
8217 ret = put_linked_bucket_info(info, false, ep_mtime, &ot.write_version, &attrs, true);
8218 if (ret < 0) {
8219 ldout(cct, 0) << "ERROR: failed to put_linked_bucket_info(): " << ret << dendl;
8220 return ret;
8221 }
8222
8223 return 0;
8224}
8225
11fdf7f2 8226int RGWRados::_get_bucket_info(RGWSysObjectCtx& obj_ctx,
b32b8144
FG
8227 const string& tenant,
8228 const string& bucket_name,
8229 RGWBucketInfo& info,
8230 real_time *pmtime,
8231 map<string, bufferlist> *pattrs,
8232 boost::optional<obj_version> refresh_version)
7c673cae 8233{
7c673cae
FG
8234 string bucket_entry;
8235 rgw_make_bucket_entry_name(tenant, bucket_name, bucket_entry);
8236
b32b8144 8237
11fdf7f2 8238 if (auto e = binfo_cache->find(bucket_entry)) {
b32b8144 8239 if (refresh_version &&
11fdf7f2 8240 e->info.objv_tracker.read_version.compare(&(*refresh_version))) {
b32b8144
FG
8241 lderr(cct) << "WARNING: The bucket info cache is inconsistent. This is "
8242 << "a failure that should be debugged. I am a nice machine, "
8243 << "so I will try to recover." << dendl;
8244 binfo_cache->invalidate(bucket_entry);
11fdf7f2
TL
8245 } else {
8246 info = e->info;
8247 if (pattrs)
8248 *pattrs = e->attrs;
8249 if (pmtime)
8250 *pmtime = e->mtime;
8251 return 0;
b32b8144 8252 }
7c673cae
FG
8253 }
8254
11fdf7f2 8255 bucket_info_entry e;
7c673cae
FG
8256 RGWBucketEntryPoint entry_point;
8257 real_time ep_mtime;
8258 RGWObjVersionTracker ot;
8259 rgw_cache_entry_info entry_cache_info;
b32b8144
FG
8260 int ret = get_bucket_entrypoint_info(obj_ctx, tenant, bucket_name,
8261 entry_point, &ot, &ep_mtime, pattrs,
8262 &entry_cache_info, refresh_version);
7c673cae
FG
8263 if (ret < 0) {
8264 /* only init these fields */
8265 info.bucket.tenant = tenant;
8266 info.bucket.name = bucket_name;
8267 return ret;
8268 }
8269
8270 if (entry_point.has_bucket_info) {
8271 info = entry_point.old_bucket_info;
8272 info.bucket.oid = bucket_name;
8273 info.bucket.tenant = tenant;
8274 info.ep_objv = ot.read_version;
8275 ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
8276 return 0;
8277 }
8278
8279 /* data is in the bucket instance object, we need to get attributes from there, clear everything
8280 * that we got
8281 */
8282 if (pattrs) {
8283 pattrs->clear();
8284 }
8285
8286 ldout(cct, 20) << "rgw_get_bucket_info: bucket instance: " << entry_point.bucket << dendl;
8287
8288
8289 /* read bucket instance info */
8290
8291 string oid;
8292 get_bucket_meta_oid(entry_point.bucket, oid);
8293
8294 rgw_cache_entry_info cache_info;
8295
b32b8144
FG
8296 ret = get_bucket_instance_from_oid(obj_ctx, oid, e.info, &e.mtime, &e.attrs,
8297 &cache_info, refresh_version);
7c673cae
FG
8298 e.info.ep_objv = ot.read_version;
8299 info = e.info;
8300 if (ret < 0) {
b32b8144 8301 lderr(cct) << "ERROR: get_bucket_instance_from_oid failed: " << ret << dendl;
7c673cae
FG
8302 info.bucket.tenant = tenant;
8303 info.bucket.name = bucket_name;
8304 // XXX and why return anything in case of an error anyway?
8305 return ret;
8306 }
8307
8308 if (pmtime)
8309 *pmtime = e.mtime;
8310 if (pattrs)
8311 *pattrs = e.attrs;
8312
7c673cae 8313 /* chain to both bucket entry point and bucket instance */
11fdf7f2 8314 if (!binfo_cache->put(svc.cache, bucket_entry, &e, {&entry_cache_info, &cache_info})) {
7c673cae
FG
8315 ldout(cct, 20) << "couldn't put binfo cache entry, might have raced with data changes" << dendl;
8316 }
8317
b32b8144
FG
8318 if (refresh_version &&
8319 refresh_version->compare(&info.objv_tracker.read_version)) {
8320 lderr(cct) << "WARNING: The OSD has the same version I have. Something may "
8321 << "have gone squirrelly. An administrator may have forced a "
8322 << "change; otherwise there is a problem somewhere." << dendl;
8323 }
8324
7c673cae
FG
8325 return 0;
8326}
8327
11fdf7f2 8328int RGWRados::get_bucket_info(RGWSysObjectCtx& obj_ctx,
b32b8144
FG
8329 const string& tenant, const string& bucket_name,
8330 RGWBucketInfo& info,
8331 real_time *pmtime, map<string, bufferlist> *pattrs)
8332{
8333 return _get_bucket_info(obj_ctx, tenant, bucket_name, info, pmtime,
8334 pattrs, boost::none);
8335}
8336
8337int RGWRados::try_refresh_bucket_info(RGWBucketInfo& info,
8338 ceph::real_time *pmtime,
8339 map<string, bufferlist> *pattrs)
8340{
11fdf7f2 8341 RGWSysObjectCtx obj_ctx = svc.sysobj->init_obj_ctx();
b32b8144
FG
8342
8343 return _get_bucket_info(obj_ctx, info.bucket.tenant, info.bucket.name,
8344 info, pmtime, pattrs, info.objv_tracker.read_version);
8345}
8346
7c673cae
FG
8347int RGWRados::put_bucket_entrypoint_info(const string& tenant_name, const string& bucket_name, RGWBucketEntryPoint& entry_point,
8348 bool exclusive, RGWObjVersionTracker& objv_tracker, real_time mtime,
8349 map<string, bufferlist> *pattrs)
8350{
8351 bufferlist epbl;
11fdf7f2 8352 encode(entry_point, epbl);
7c673cae
FG
8353 string bucket_entry;
8354 rgw_make_bucket_entry_name(tenant_name, bucket_name, bucket_entry);
8355 return rgw_bucket_store_info(this, bucket_entry, epbl, exclusive, pattrs, &objv_tracker, mtime);
8356}
8357
8358int RGWRados::put_bucket_instance_info(RGWBucketInfo& info, bool exclusive,
8359 real_time mtime, map<string, bufferlist> *pattrs)
8360{
8361 info.has_instance_obj = true;
8362 bufferlist bl;
8363
11fdf7f2 8364 encode(info, bl);
7c673cae
FG
8365
8366 string key = info.bucket.get_key(); /* when we go through meta api, we don't use oid directly */
8367 int ret = rgw_bucket_instance_store_info(this, key, bl, exclusive, pattrs, &info.objv_tracker, mtime);
8368 if (ret == -EEXIST) {
8369 /* well, if it's exclusive we shouldn't overwrite it, because we might race with another
8370 * bucket operation on this specific bucket (e.g., being synced from the master), but
8371 * since bucket instace meta object is unique for this specific bucket instace, we don't
8372 * need to return an error.
8373 * A scenario where we'd get -EEXIST here, is in a multi-zone config, we're not on the
8374 * master, creating a bucket, sending bucket creation to the master, we create the bucket
8375 * locally, while in the sync thread we sync the new bucket.
8376 */
8377 ret = 0;
8378 }
8379 return ret;
8380}
8381
8382int RGWRados::put_linked_bucket_info(RGWBucketInfo& info, bool exclusive, real_time mtime, obj_version *pep_objv,
8383 map<string, bufferlist> *pattrs, bool create_entry_point)
8384{
8385 bool create_head = !info.has_instance_obj || create_entry_point;
8386
8387 int ret = put_bucket_instance_info(info, exclusive, mtime, pattrs);
8388 if (ret < 0) {
8389 return ret;
8390 }
8391
8392 if (!create_head)
8393 return 0; /* done! */
8394
8395 RGWBucketEntryPoint entry_point;
8396 entry_point.bucket = info.bucket;
8397 entry_point.owner = info.owner;
8398 entry_point.creation_time = info.creation_time;
8399 entry_point.linked = true;
8400 RGWObjVersionTracker ot;
8401 if (pep_objv && !pep_objv->tag.empty()) {
8402 ot.write_version = *pep_objv;
8403 } else {
8404 ot.generate_new_write_ver(cct);
8405 if (pep_objv) {
8406 *pep_objv = ot.write_version;
8407 }
8408 }
8409 ret = put_bucket_entrypoint_info(info.bucket.tenant, info.bucket.name, entry_point, exclusive, ot, mtime, NULL);
8410 if (ret < 0)
8411 return ret;
8412
8413 return 0;
8414}
8415
7c673cae
FG
8416int RGWRados::update_containers_stats(map<string, RGWBucketEnt>& m)
8417{
11fdf7f2 8418 auto obj_ctx = svc.sysobj->init_obj_ctx();
7c673cae
FG
8419
8420 map<string, RGWBucketEnt>::iterator iter;
8421 for (iter = m.begin(); iter != m.end(); ++iter) {
8422 RGWBucketEnt& ent = iter->second;
8423 rgw_bucket& bucket = ent.bucket;
8424 ent.count = 0;
8425 ent.size = 0;
8426 ent.size_rounded = 0;
8427
a8e16298 8428 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
8429
8430 RGWBucketInfo bucket_info;
8431 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
8432 if (ret < 0) {
8433 return ret;
8434 }
8435
8436 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
8437 if (r < 0)
8438 return r;
8439
a8e16298 8440 auto hiter = headers.begin();
7c673cae
FG
8441 for (; hiter != headers.end(); ++hiter) {
8442 RGWObjCategory category = main_category;
11fdf7f2 8443 auto iter = (hiter->stats).find(category);
a8e16298 8444 if (iter != hiter->stats.end()) {
7c673cae
FG
8445 struct rgw_bucket_category_stats& stats = iter->second;
8446 ent.count += stats.num_entries;
8447 ent.size += stats.total_size;
8448 ent.size_rounded += stats.total_size_rounded;
8449 }
8450 }
3efd9988
FG
8451
8452 // fill in placement_rule from the bucket instance for use in swift's
8453 // per-storage policy statistics
8454 ent.placement_rule = std::move(bucket_info.placement_rule);
7c673cae
FG
8455 }
8456
8457 return m.size();
8458}
8459
8460int RGWRados::append_async(rgw_raw_obj& obj, size_t size, bufferlist& bl)
8461{
8462 rgw_rados_ref ref;
8463 int r = get_raw_obj_ref(obj, &ref);
8464 if (r < 0) {
8465 return r;
8466 }
8467 librados::Rados *rad = get_rados_handle();
8468 librados::AioCompletion *completion = rad->aio_create_completion(NULL, NULL, NULL);
8469
11fdf7f2 8470 r = ref.ioctx.aio_append(ref.obj.oid, completion, bl, size);
7c673cae
FG
8471 completion->release();
8472 return r;
8473}
8474
7c673cae
FG
8475int RGWRados::pool_iterate_begin(const rgw_pool& pool, RGWPoolIterCtx& ctx)
8476{
8477 librados::IoCtx& io_ctx = ctx.io_ctx;
8478 librados::NObjectIterator& iter = ctx.iter;
8479
494da23a 8480 int r = open_pool_ctx(pool, io_ctx, false);
7c673cae
FG
8481 if (r < 0)
8482 return r;
8483
8484 iter = io_ctx.nobjects_begin();
8485
8486 return 0;
8487}
8488
181888fb
FG
8489int RGWRados::pool_iterate_begin(const rgw_pool& pool, const string& cursor, RGWPoolIterCtx& ctx)
8490{
8491 librados::IoCtx& io_ctx = ctx.io_ctx;
8492 librados::NObjectIterator& iter = ctx.iter;
8493
494da23a 8494 int r = open_pool_ctx(pool, io_ctx, false);
181888fb
FG
8495 if (r < 0)
8496 return r;
8497
8498 librados::ObjectCursor oc;
8499 if (!oc.from_str(cursor)) {
8500 ldout(cct, 10) << "failed to parse cursor: " << cursor << dendl;
8501 return -EINVAL;
8502 }
8503
f64942e4
AA
8504 try {
8505 iter = io_ctx.nobjects_begin(oc);
8506 return 0;
8507 } catch (const std::system_error& e) {
8508 r = -e.code().value();
8509 ldout(cct, 10) << "nobjects_begin threw " << e.what()
8510 << ", returning " << r << dendl;
8511 return r;
8512 } catch (const std::exception& e) {
8513 ldout(cct, 10) << "nobjects_begin threw " << e.what()
8514 << ", returning -5" << dendl;
8515 return -EIO;
8516 }
181888fb
FG
8517}
8518
8519string RGWRados::pool_iterate_get_cursor(RGWPoolIterCtx& ctx)
8520{
8521 return ctx.iter.get_cursor().to_str();
8522}
8523
f64942e4
AA
8524static int do_pool_iterate(CephContext* cct, RGWPoolIterCtx& ctx, uint32_t num,
8525 vector<rgw_bucket_dir_entry>& objs,
7c673cae
FG
8526 bool *is_truncated, RGWAccessListFilter *filter)
8527{
8528 librados::IoCtx& io_ctx = ctx.io_ctx;
8529 librados::NObjectIterator& iter = ctx.iter;
8530
8531 if (iter == io_ctx.nobjects_end())
8532 return -ENOENT;
8533
8534 uint32_t i;
8535
8536 for (i = 0; i < num && iter != io_ctx.nobjects_end(); ++i, ++iter) {
8537 rgw_bucket_dir_entry e;
8538
8539 string oid = iter->get_oid();
8540 ldout(cct, 20) << "RGWRados::pool_iterate: got " << oid << dendl;
8541
8542 // fill it in with initial values; we may correct later
8543 if (filter && !filter->filter(oid, oid))
8544 continue;
8545
8546 e.key = oid;
8547 objs.push_back(e);
8548 }
8549
8550 if (is_truncated)
8551 *is_truncated = (iter != io_ctx.nobjects_end());
8552
8553 return objs.size();
8554}
7c673cae 8555
f64942e4
AA
8556int RGWRados::pool_iterate(RGWPoolIterCtx& ctx, uint32_t num, vector<rgw_bucket_dir_entry>& objs,
8557 bool *is_truncated, RGWAccessListFilter *filter)
8558{
8559 // catch exceptions from NObjectIterator::operator++()
8560 try {
8561 return do_pool_iterate(cct, ctx, num, objs, is_truncated, filter);
8562 } catch (const std::system_error& e) {
8563 int r = -e.code().value();
8564 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
8565 << ", returning " << r << dendl;
8566 return r;
8567 } catch (const std::exception& e) {
8568 ldout(cct, 10) << "NObjectIterator threw exception " << e.what()
8569 << ", returning -5" << dendl;
8570 return -EIO;
8571 }
8572}
8573
181888fb 8574int RGWRados::list_raw_objects_init(const rgw_pool& pool, const string& marker, RGWListRawObjsCtx *ctx)
7c673cae 8575{
181888fb
FG
8576 if (!ctx->initialized) {
8577 int r = pool_iterate_begin(pool, marker, ctx->iter_ctx);
7c673cae
FG
8578 if (r < 0) {
8579 ldout(cct, 10) << "failed to list objects pool_iterate_begin() returned r=" << r << dendl;
8580 return r;
8581 }
181888fb 8582 ctx->initialized = true;
7c673cae 8583 }
181888fb
FG
8584 return 0;
8585}
7c673cae 8586
181888fb
FG
8587int RGWRados::list_raw_objects_next(const string& prefix_filter, int max,
8588 RGWListRawObjsCtx& ctx, list<string>& oids,
8589 bool *is_truncated)
8590{
8591 if (!ctx.initialized) {
8592 return -EINVAL;
8593 }
8594 RGWAccessListFilterPrefix filter(prefix_filter);
7c673cae
FG
8595 vector<rgw_bucket_dir_entry> objs;
8596 int r = pool_iterate(ctx.iter_ctx, max, objs, is_truncated, &filter);
8597 if (r < 0) {
8598 if(r != -ENOENT)
8599 ldout(cct, 10) << "failed to list objects pool_iterate returned r=" << r << dendl;
8600 return r;
8601 }
8602
8603 vector<rgw_bucket_dir_entry>::iterator iter;
8604 for (iter = objs.begin(); iter != objs.end(); ++iter) {
8605 oids.push_back(iter->key.name);
8606 }
8607
8608 return oids.size();
8609}
8610
181888fb
FG
8611int RGWRados::list_raw_objects(const rgw_pool& pool, const string& prefix_filter,
8612 int max, RGWListRawObjsCtx& ctx, list<string>& oids,
8613 bool *is_truncated)
8614{
8615 if (!ctx.initialized) {
8616 int r = list_raw_objects_init(pool, string(), &ctx);
8617 if (r < 0) {
8618 return r;
8619 }
8620 }
8621
8622 return list_raw_objects_next(prefix_filter, max, ctx, oids, is_truncated);
8623}
8624
8625string RGWRados::list_raw_objs_get_cursor(RGWListRawObjsCtx& ctx)
8626{
8627 return pool_iterate_get_cursor(ctx.iter_ctx);
8628}
8629
7c673cae
FG
8630int RGWRados::list_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& marker, uint32_t max,
8631 std::list<rgw_bi_log_entry>& result, bool *truncated)
8632{
8633 ldout(cct, 20) << __func__ << ": " << bucket_info.bucket << " marker " << marker << " shard_id=" << shard_id << " max " << max << dendl;
8634 result.clear();
8635
8636 librados::IoCtx index_ctx;
8637 map<int, string> oids;
8638 map<int, cls_rgw_bi_log_list_ret> bi_log_lists;
11fdf7f2 8639 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
7c673cae
FG
8640 if (r < 0)
8641 return r;
8642
8643 BucketIndexShardsManager marker_mgr;
8644 bool has_shards = (oids.size() > 1 || shard_id >= 0);
8645 // If there are multiple shards for the bucket index object, the marker
8646 // should have the pattern '{shard_id_1}#{shard_marker_1},{shard_id_2}#
8647 // {shard_marker_2}...', if there is no sharding, the bi_log_list should
8648 // only contain one record, and the key is the bucket instance id.
8649 r = marker_mgr.from_string(marker, shard_id);
8650 if (r < 0)
8651 return r;
8652
8653 r = CLSRGWIssueBILogList(index_ctx, marker_mgr, max, oids, bi_log_lists, cct->_conf->rgw_bucket_index_max_aio)();
8654 if (r < 0)
8655 return r;
8656
8657 map<int, list<rgw_bi_log_entry>::iterator> vcurrents;
8658 map<int, list<rgw_bi_log_entry>::iterator> vends;
8659 if (truncated) {
8660 *truncated = false;
8661 }
8662 map<int, cls_rgw_bi_log_list_ret>::iterator miter = bi_log_lists.begin();
8663 for (; miter != bi_log_lists.end(); ++miter) {
8664 int shard_id = miter->first;
8665 vcurrents[shard_id] = miter->second.entries.begin();
8666 vends[shard_id] = miter->second.entries.end();
8667 if (truncated) {
8668 *truncated = (*truncated || miter->second.truncated);
8669 }
8670 }
8671
8672 size_t total = 0;
8673 bool has_more = true;
8674 map<int, list<rgw_bi_log_entry>::iterator>::iterator viter;
8675 map<int, list<rgw_bi_log_entry>::iterator>::iterator eiter;
8676 while (total < max && has_more) {
8677 has_more = false;
8678
8679 viter = vcurrents.begin();
8680 eiter = vends.begin();
8681
8682 for (; total < max && viter != vcurrents.end(); ++viter, ++eiter) {
8683 assert (eiter != vends.end());
8684
8685 int shard_id = viter->first;
8686 list<rgw_bi_log_entry>::iterator& liter = viter->second;
8687
8688 if (liter == eiter->second){
8689 continue;
8690 }
8691 rgw_bi_log_entry& entry = *(liter);
8692 if (has_shards) {
8693 char buf[16];
8694 snprintf(buf, sizeof(buf), "%d", shard_id);
8695 string tmp_id;
8696 build_bucket_index_marker(buf, entry.id, &tmp_id);
8697 entry.id.swap(tmp_id);
8698 }
8699 marker_mgr.add(shard_id, entry.id);
8700 result.push_back(entry);
8701 total++;
8702 has_more = true;
8703 ++liter;
8704 }
8705 }
8706
8707 if (truncated) {
8708 for (viter = vcurrents.begin(), eiter = vends.begin(); viter != vcurrents.end(); ++viter, ++eiter) {
8709 assert (eiter != vends.end());
8710 *truncated = (*truncated || (viter->second != eiter->second));
8711 }
8712 }
8713
8714 // Refresh marker, if there are multiple shards, the output will look like
8715 // '{shard_oid_1}#{shard_marker_1},{shard_oid_2}#{shard_marker_2}...',
8716 // if there is no sharding, the simply marker (without oid) is returned
8717 if (has_shards) {
8718 marker_mgr.to_string(&marker);
8719 } else {
8720 if (!result.empty()) {
8721 marker = result.rbegin()->id;
8722 }
8723 }
8724
8725 return 0;
8726}
8727
8728int RGWRados::trim_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id, string& start_marker, string& end_marker)
8729{
8730 librados::IoCtx index_ctx;
8731 map<int, string> bucket_objs;
31f18b77
FG
8732
8733 BucketIndexShardsManager start_marker_mgr;
8734 BucketIndexShardsManager end_marker_mgr;
8735
7c673cae 8736 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
31f18b77 8737 if (r < 0) {
7c673cae 8738 return r;
31f18b77 8739 }
7c673cae 8740
7c673cae 8741 r = start_marker_mgr.from_string(start_marker, shard_id);
31f18b77 8742 if (r < 0) {
7c673cae 8743 return r;
31f18b77
FG
8744 }
8745
7c673cae 8746 r = end_marker_mgr.from_string(end_marker, shard_id);
31f18b77 8747 if (r < 0) {
7c673cae 8748 return r;
31f18b77 8749 }
7c673cae
FG
8750
8751 return CLSRGWIssueBILogTrim(index_ctx, start_marker_mgr, end_marker_mgr, bucket_objs,
31f18b77 8752 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
8753}
8754
c07f9fc5
FG
8755int RGWRados::resync_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
8756{
8757 librados::IoCtx index_ctx;
8758 map<int, string> bucket_objs;
8759 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8760 if (r < 0)
8761 return r;
8762
8763 return CLSRGWIssueResyncBucketBILog(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8764}
8765
8766int RGWRados::stop_bi_log_entries(RGWBucketInfo& bucket_info, int shard_id)
8767{
8768 librados::IoCtx index_ctx;
8769 map<int, string> bucket_objs;
8770 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
8771 if (r < 0)
8772 return r;
8773
8774 return CLSRGWIssueBucketBILogStop(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio)();
8775}
8776
a8e16298
TL
8777int RGWRados::bi_get_instance(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8778 rgw_bucket_dir_entry *dirent)
7c673cae 8779{
a8e16298 8780 rgw_cls_bi_entry bi_entry;
11fdf7f2 8781 int r = bi_get(bucket_info, obj, BIIndexType::Instance, &bi_entry);
a8e16298
TL
8782 if (r < 0 && r != -ENOENT) {
8783 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8784 }
7c673cae
FG
8785 if (r < 0) {
8786 return r;
8787 }
11fdf7f2 8788 auto iter = bi_entry.data.cbegin();
a8e16298 8789 try {
11fdf7f2 8790 decode(*dirent, iter);
a8e16298
TL
8791 } catch (buffer::error& err) {
8792 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8793 return -EIO;
8794 }
8795
8796 return 0;
8797}
7c673cae 8798
a8e16298
TL
8799int RGWRados::bi_get_olh(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8800 rgw_bucket_olh_entry *olh)
8801{
7c673cae 8802 rgw_cls_bi_entry bi_entry;
11fdf7f2 8803 int r = bi_get(bucket_info, obj, BIIndexType::OLH, &bi_entry);
7c673cae
FG
8804 if (r < 0 && r != -ENOENT) {
8805 ldout(cct, 0) << "ERROR: bi_get() returned r=" << r << dendl;
8806 }
8807 if (r < 0) {
8808 return r;
8809 }
11fdf7f2 8810 auto iter = bi_entry.data.cbegin();
7c673cae 8811 try {
a8e16298 8812 decode(*olh, iter);
7c673cae
FG
8813 } catch (buffer::error& err) {
8814 ldout(cct, 0) << "ERROR: failed to decode bi_entry()" << dendl;
8815 return -EIO;
8816 }
8817
8818 return 0;
8819}
8820
a8e16298
TL
8821int RGWRados::bi_get(const RGWBucketInfo& bucket_info, const rgw_obj& obj,
8822 BIIndexType index_type, rgw_cls_bi_entry *entry)
7c673cae
FG
8823{
8824 BucketShard bs(this);
a8e16298 8825 int ret = bs.init(bucket_info, obj);
7c673cae
FG
8826 if (ret < 0) {
8827 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8828 return ret;
8829 }
8830
8831 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
8832
a8e16298 8833 return cls_rgw_bi_get(bs.index_ctx, bs.bucket_obj, index_type, key, entry);
7c673cae
FG
8834}
8835
8836void RGWRados::bi_put(ObjectWriteOperation& op, BucketShard& bs, rgw_cls_bi_entry& entry)
8837{
8838 cls_rgw_bi_put(op, bs.bucket_obj, entry);
8839}
8840
8841int RGWRados::bi_put(BucketShard& bs, rgw_cls_bi_entry& entry)
8842{
8843 int ret = cls_rgw_bi_put(bs.index_ctx, bs.bucket_obj, entry);
8844 if (ret < 0)
8845 return ret;
8846
8847 return 0;
8848}
8849
8850int RGWRados::bi_put(rgw_bucket& bucket, rgw_obj& obj, rgw_cls_bi_entry& entry)
8851{
8852 BucketShard bs(this);
f64942e4 8853 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
8854 if (ret < 0) {
8855 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8856 return ret;
8857 }
8858
8859 return bi_put(bs, entry);
8860}
8861
8862int RGWRados::bi_list(rgw_bucket& bucket, const string& obj_name, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8863{
8864 rgw_obj obj(bucket, obj_name);
8865 BucketShard bs(this);
f64942e4 8866 int ret = bs.init(bucket, obj, nullptr /* no RGWBucketInfo */);
7c673cae
FG
8867 if (ret < 0) {
8868 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8869 return ret;
8870 }
8871
8872 ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, obj_name, marker, max, entries, is_truncated);
31f18b77
FG
8873 if (ret == -ENOENT) {
8874 *is_truncated = false;
8875 }
7c673cae
FG
8876 if (ret < 0)
8877 return ret;
8878
8879 return 0;
8880}
8881
8882int RGWRados::bi_list(BucketShard& bs, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8883{
8884 int ret = cls_rgw_bi_list(bs.index_ctx, bs.bucket_obj, filter_obj, marker, max, entries, is_truncated);
8885 if (ret < 0)
8886 return ret;
8887
8888 return 0;
8889}
8890
8891int RGWRados::bi_remove(BucketShard& bs)
8892{
8893 int ret = bs.index_ctx.remove(bs.bucket_obj);
8894 if (ret == -ENOENT) {
8895 ret = 0;
8896 }
8897 if (ret < 0) {
8898 ldout(cct, 5) << "bs.index_ctx.remove(" << bs.bucket_obj << ") returned ret=" << ret << dendl;
8899 return ret;
8900 }
8901
8902 return 0;
8903}
8904
8905int RGWRados::bi_list(rgw_bucket& bucket, int shard_id, const string& filter_obj, const string& marker, uint32_t max, list<rgw_cls_bi_entry> *entries, bool *is_truncated)
8906{
8907 BucketShard bs(this);
f64942e4 8908 int ret = bs.init(bucket, shard_id, nullptr /* no RGWBucketInfo */);
7c673cae
FG
8909 if (ret < 0) {
8910 ldout(cct, 5) << "bs.init() returned ret=" << ret << dendl;
8911 return ret;
8912 }
8913
8914 return bi_list(bs, filter_obj, marker, max, entries, is_truncated);
8915}
8916
8917int RGWRados::gc_operate(string& oid, librados::ObjectWriteOperation *op)
8918{
8919 return gc_pool_ctx.operate(oid, op);
8920}
8921
11fdf7f2 8922int RGWRados::gc_aio_operate(string& oid, librados::ObjectWriteOperation *op, AioCompletion **pc)
7c673cae
FG
8923{
8924 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
8925 int r = gc_pool_ctx.aio_operate(oid, c, op);
11fdf7f2
TL
8926 if (!pc) {
8927 c->release();
8928 } else {
8929 *pc = c;
8930 }
7c673cae
FG
8931 return r;
8932}
8933
8934int RGWRados::gc_operate(string& oid, librados::ObjectReadOperation *op, bufferlist *pbl)
8935{
8936 return gc_pool_ctx.operate(oid, op, pbl);
8937}
8938
8939int RGWRados::list_gc_objs(int *index, string& marker, uint32_t max, bool expired_only, std::list<cls_rgw_gc_obj_info>& result, bool *truncated)
8940{
8941 return gc->list(index, marker, max, expired_only, result, truncated);
8942}
8943
11fdf7f2 8944int RGWRados::process_gc(bool expired_only)
7c673cae 8945{
11fdf7f2 8946 return gc->process(expired_only);
7c673cae
FG
8947}
8948
8949int RGWRados::list_lc_progress(const string& marker, uint32_t max_entries, map<string, int> *progress_map)
8950{
8951 return lc->list_lc_progress(marker, max_entries, progress_map);
8952}
8953
8954int RGWRados::process_lc()
8955{
8956 return lc->process();
8957}
8958
1adf2230 8959bool RGWRados::process_expire_objects()
7c673cae 8960{
1adf2230 8961 return obj_expirer->inspect_all_shards(utime_t(), ceph_clock_now());
7c673cae
FG
8962}
8963
7c673cae 8964int RGWRados::cls_obj_prepare_op(BucketShard& bs, RGWModifyOp op, string& tag,
31f18b77 8965 rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 8966{
31f18b77
FG
8967 rgw_zone_set zones_trace;
8968 if (_zones_trace) {
8969 zones_trace = *_zones_trace;
8970 }
11fdf7f2 8971 zones_trace.insert(svc.zone->get_zone().id);
1adf2230 8972
7c673cae
FG
8973 ObjectWriteOperation o;
8974 cls_rgw_obj_key key(obj.key.get_index_key_name(), obj.key.instance);
31f18b77 8975 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
11fdf7f2 8976 cls_rgw_bucket_prepare_op(o, op, tag, key, obj.key.get_loc(), svc.zone->get_zone().log_data, bilog_flags, zones_trace);
7c673cae
FG
8977 return bs.index_ctx.operate(bs.bucket_obj, &o);
8978}
8979
31f18b77 8980int RGWRados::cls_obj_complete_op(BucketShard& bs, const rgw_obj& obj, RGWModifyOp op, string& tag,
7c673cae
FG
8981 int64_t pool, uint64_t epoch,
8982 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 8983 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *_zones_trace)
7c673cae 8984{
7c673cae
FG
8985 ObjectWriteOperation o;
8986 rgw_bucket_dir_entry_meta dir_meta;
8987 dir_meta = ent.meta;
8988 dir_meta.category = category;
8989
1adf2230
AA
8990 rgw_zone_set zones_trace;
8991 if (_zones_trace) {
8992 zones_trace = *_zones_trace;
8993 }
11fdf7f2 8994 zones_trace.insert(svc.zone->get_zone().id);
1adf2230 8995
7c673cae
FG
8996 rgw_bucket_entry_ver ver;
8997 ver.pool = pool;
8998 ver.epoch = epoch;
8999 cls_rgw_obj_key key(ent.key.name, ent.key.instance);
31f18b77
FG
9000 cls_rgw_guard_bucket_resharding(o, -ERR_BUSY_RESHARDING);
9001 cls_rgw_bucket_complete_op(o, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 9002 svc.zone->get_zone().log_data, bilog_flags, &zones_trace);
31f18b77
FG
9003 complete_op_data *arg;
9004 index_completion_manager->create_completion(obj, op, tag, ver, key, dir_meta, remove_objs,
11fdf7f2 9005 svc.zone->get_zone().log_data, bilog_flags, &zones_trace, &arg);
31f18b77
FG
9006 librados::AioCompletion *completion = arg->rados_completion;
9007 int ret = bs.index_ctx.aio_operate(bs.bucket_obj, arg->rados_completion, &o);
9008 completion->release(); /* can't reference arg here, as it might have already been released */
7c673cae
FG
9009 return ret;
9010}
9011
31f18b77 9012int RGWRados::cls_obj_complete_add(BucketShard& bs, const rgw_obj& obj, string& tag,
7c673cae
FG
9013 int64_t pool, uint64_t epoch,
9014 rgw_bucket_dir_entry& ent, RGWObjCategory category,
31f18b77 9015 list<rgw_obj_index_key> *remove_objs, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae 9016{
31f18b77 9017 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_ADD, tag, pool, epoch, ent, category, remove_objs, bilog_flags, zones_trace);
7c673cae
FG
9018}
9019
9020int RGWRados::cls_obj_complete_del(BucketShard& bs, string& tag,
9021 int64_t pool, uint64_t epoch,
9022 rgw_obj& obj,
9023 real_time& removed_mtime,
9024 list<rgw_obj_index_key> *remove_objs,
31f18b77
FG
9025 uint16_t bilog_flags,
9026 rgw_zone_set *zones_trace)
7c673cae
FG
9027{
9028 rgw_bucket_dir_entry ent;
9029 ent.meta.mtime = removed_mtime;
9030 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
9031 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_DEL, tag, pool, epoch,
9032 ent, RGWObjCategory::None, remove_objs,
9033 bilog_flags, zones_trace);
7c673cae
FG
9034}
9035
31f18b77 9036int RGWRados::cls_obj_complete_cancel(BucketShard& bs, string& tag, rgw_obj& obj, uint16_t bilog_flags, rgw_zone_set *zones_trace)
7c673cae
FG
9037{
9038 rgw_bucket_dir_entry ent;
9039 obj.key.get_index_key(&ent.key);
11fdf7f2
TL
9040 return cls_obj_complete_op(bs, obj, CLS_RGW_OP_CANCEL, tag,
9041 -1 /* pool id */, 0, ent,
9042 RGWObjCategory::None, NULL, bilog_flags,
9043 zones_trace);
7c673cae
FG
9044}
9045
9046int RGWRados::cls_obj_set_bucket_tag_timeout(RGWBucketInfo& bucket_info, uint64_t timeout)
9047{
9048 librados::IoCtx index_ctx;
9049 map<int, string> bucket_objs;
9050 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs);
9051 if (r < 0)
9052 return r;
9053
9054 return CLSRGWIssueSetTagTimeout(index_ctx, bucket_objs, cct->_conf->rgw_bucket_index_max_aio, timeout)();
9055}
9056
1adf2230
AA
9057
9058int RGWRados::cls_bucket_list_ordered(RGWBucketInfo& bucket_info,
9059 int shard_id,
11fdf7f2 9060 const rgw_obj_index_key& start,
1adf2230
AA
9061 const string& prefix,
9062 uint32_t num_entries,
9063 bool list_versions,
9064 map<string, rgw_bucket_dir_entry>& m,
9065 bool *is_truncated,
9066 rgw_obj_index_key *last_entry,
9067 bool (*force_check_filter)(const string& name))
7c673cae 9068{
1adf2230
AA
9069 ldout(cct, 10) << "cls_bucket_list_ordered " << bucket_info.bucket <<
9070 " start " << start.name << "[" << start.instance << "] num_entries " <<
9071 num_entries << dendl;
7c673cae
FG
9072
9073 librados::IoCtx index_ctx;
9074 // key - oid (for different shards if there is any)
1adf2230
AA
9075 // value - list result for the corresponding oid (shard), it is filled by
9076 // the AIO callback
7c673cae
FG
9077 map<int, string> oids;
9078 map<int, struct rgw_cls_list_ret> list_results;
9079 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
9080 if (r < 0)
9081 return r;
9082
9083 cls_rgw_obj_key start_key(start.name, start.instance);
1adf2230
AA
9084 r = CLSRGWIssueBucketList(index_ctx, start_key, prefix, num_entries,
9085 list_versions, oids, list_results,
9086 cct->_conf->rgw_bucket_index_max_aio)();
7c673cae
FG
9087 if (r < 0)
9088 return r;
9089
9090 // Create a list of iterators that are used to iterate each shard
11fdf7f2
TL
9091 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vcurrents;
9092 vector<map<string, struct rgw_bucket_dir_entry>::iterator> vends;
9093 vector<string> vnames;
9094 vcurrents.reserve(list_results.size());
9095 vends.reserve(list_results.size());
9096 vnames.reserve(list_results.size());
7c673cae
FG
9097 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9098 *is_truncated = false;
9099 for (; iter != list_results.end(); ++iter) {
9100 vcurrents.push_back(iter->second.dir.m.begin());
9101 vends.push_back(iter->second.dir.m.end());
9102 vnames.push_back(oids[iter->first]);
9103 *is_truncated = (*is_truncated || iter->second.is_truncated);
9104 }
9105
9106 // Create a map to track the next candidate entry from each shard, if the entry
9107 // from a specified shard is selected/erased, the next entry from that shard will
9108 // be inserted for next round selection
9109 map<string, size_t> candidates;
9110 for (size_t i = 0; i < vcurrents.size(); ++i) {
9111 if (vcurrents[i] != vends[i]) {
9112 candidates[vcurrents[i]->first] = i;
9113 }
9114 }
9115
9116 map<string, bufferlist> updates;
9117 uint32_t count = 0;
9118 while (count < num_entries && !candidates.empty()) {
9119 r = 0;
9120 // Select the next one
9121 int pos = candidates.begin()->second;
9122 const string& name = vcurrents[pos]->first;
9123 struct rgw_bucket_dir_entry& dirent = vcurrents[pos]->second;
9124
3efd9988
FG
9125 bool force_check = force_check_filter &&
9126 force_check_filter(dirent.key.name);
9127 if ((!dirent.exists && !dirent.is_delete_marker()) ||
9128 !dirent.pending_map.empty() ||
9129 force_check) {
7c673cae
FG
9130 /* there are uncommitted ops. We need to check the current state,
9131 * and if the tags are old we need to do cleanup as well. */
9132 librados::IoCtx sub_ctx;
9133 sub_ctx.dup(index_ctx);
1adf2230
AA
9134 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent,
9135 updates[vnames[pos]]);
7c673cae
FG
9136 if (r < 0 && r != -ENOENT) {
9137 return r;
9138 }
9139 }
9140 if (r >= 0) {
1adf2230
AA
9141 ldout(cct, 10) << "RGWRados::cls_bucket_list_ordered: got " <<
9142 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
7c673cae
FG
9143 m[name] = std::move(dirent);
9144 ++count;
9145 }
9146
9147 // Refresh the candidates map
9148 candidates.erase(candidates.begin());
9149 ++vcurrents[pos];
9150 if (vcurrents[pos] != vends[pos]) {
9151 candidates[vcurrents[pos]->first] = pos;
9152 }
9153 }
9154
9155 // Suggest updates if there is any
9156 map<string, bufferlist>::iterator miter = updates.begin();
9157 for (; miter != updates.end(); ++miter) {
9158 if (miter->second.length()) {
9159 ObjectWriteOperation o;
9160 cls_rgw_suggest_changes(o, miter->second);
9161 // we don't care if we lose suggested updates, send them off blindly
9162 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9163 index_ctx.aio_operate(miter->first, c, &o);
1adf2230 9164 c->release();
7c673cae
FG
9165 }
9166 }
9167
9168 // Check if all the returned entries are consumed or not
9169 for (size_t i = 0; i < vcurrents.size(); ++i) {
1adf2230 9170 if (vcurrents[i] != vends[i]) {
7c673cae 9171 *is_truncated = true;
1adf2230
AA
9172 break;
9173 }
7c673cae
FG
9174 }
9175 if (!m.empty())
9176 *last_entry = m.rbegin()->first;
9177
9178 return 0;
9179}
9180
1adf2230
AA
9181
9182int RGWRados::cls_bucket_list_unordered(RGWBucketInfo& bucket_info,
9183 int shard_id,
11fdf7f2 9184 const rgw_obj_index_key& start,
1adf2230
AA
9185 const string& prefix,
9186 uint32_t num_entries,
9187 bool list_versions,
9188 std::vector<rgw_bucket_dir_entry>& ent_list,
9189 bool *is_truncated,
9190 rgw_obj_index_key *last_entry,
9191 bool (*force_check_filter)(const string& name)) {
9192 ldout(cct, 10) << "cls_bucket_list_unordered " << bucket_info.bucket <<
9193 " start " << start.name << "[" << start.instance <<
9194 "] num_entries " << num_entries << dendl;
9195
11fdf7f2
TL
9196 static MultipartMetaFilter multipart_meta_filter;
9197
1adf2230
AA
9198 *is_truncated = false;
9199 librados::IoCtx index_ctx;
9200
1adf2230
AA
9201 map<int, string> oids;
9202 int r = open_bucket_index(bucket_info, index_ctx, oids, shard_id);
9203 if (r < 0)
9204 return r;
9205 const uint32_t num_shards = oids.size();
9206
11fdf7f2 9207 rgw_obj_index_key marker = start;
1adf2230
AA
9208 uint32_t current_shard;
9209 if (shard_id >= 0) {
9210 current_shard = shard_id;
11fdf7f2 9211 } else if (start.empty()) {
1adf2230
AA
9212 current_shard = 0u;
9213 } else {
11fdf7f2
TL
9214 // at this point we have a marker (start) that has something in
9215 // it, so we need to get to the bucket shard index, so we can
9216 // start reading from there
9217
9218 std::string key;
9219 // test whether object name is a multipart meta name
9220 if(! multipart_meta_filter.filter(start.name, key)) {
9221 // if multipart_meta_filter fails, must be "regular" (i.e.,
9222 // unadorned) and the name is the key
9223 key = start.name;
9224 }
9225
9226 // now convert the key (oid) to an rgw_obj_key since that will
9227 // separate out the namespace, name, and instance
9228 rgw_obj_key obj_key;
9229 bool parsed = rgw_obj_key::parse_raw_oid(key, &obj_key);
9230 if (!parsed) {
9231 ldout(cct, 0) <<
9232 "ERROR: RGWRados::cls_bucket_list_unordered received an invalid "
9233 "start marker: '" << start << "'" << dendl;
9234 return -EINVAL;
9235 } else if (obj_key.name.empty()) {
9236 // if the name is empty that means the object name came in with
9237 // a namespace only, and therefore we need to start our scan at
9238 // the first bucket index shard
9239 current_shard = 0u;
9240 } else {
9241 // so now we have the key used to compute the bucket index shard
9242 // and can extract the specific shard from it
9243 current_shard = rgw_bucket_shard_index(obj_key.name, num_shards);
9244 }
1adf2230
AA
9245 }
9246
9247 uint32_t count = 0u;
9248 map<string, bufferlist> updates;
11fdf7f2 9249 rgw_obj_index_key last_added_entry;
1adf2230
AA
9250 while (count <= num_entries &&
9251 ((shard_id >= 0 && current_shard == uint32_t(shard_id)) ||
9252 current_shard < num_shards)) {
81eedcae
TL
9253 const std::string& oid = oids[current_shard];
9254 rgw_cls_list_ret result;
9255
9256 librados::ObjectReadOperation op;
9257 cls_rgw_bucket_list_op(op, marker, prefix, num_entries,
9258 list_versions, &result);
9259 r = index_ctx.operate(oid, &op, nullptr);
1adf2230
AA
9260 if (r < 0)
9261 return r;
9262
1adf2230
AA
9263 for (auto& entry : result.dir.m) {
9264 rgw_bucket_dir_entry& dirent = entry.second;
9265
9266 bool force_check = force_check_filter &&
9267 force_check_filter(dirent.key.name);
9268 if ((!dirent.exists && !dirent.is_delete_marker()) ||
9269 !dirent.pending_map.empty() ||
9270 force_check) {
9271 /* there are uncommitted ops. We need to check the current state,
9272 * and if the tags are old we need to do cleanup as well. */
9273 librados::IoCtx sub_ctx;
9274 sub_ctx.dup(index_ctx);
9275 r = check_disk_state(sub_ctx, bucket_info, dirent, dirent, updates[oid]);
9276 if (r < 0 && r != -ENOENT) {
9277 return r;
9278 }
9279 }
9280
9281 // at this point either r >=0 or r == -ENOENT
9282 if (r >= 0) { // i.e., if r != -ENOENT
9283 ldout(cct, 10) << "RGWRados::cls_bucket_list_unordered: got " <<
9284 dirent.key.name << "[" << dirent.key.instance << "]" << dendl;
9285
9286 if (count < num_entries) {
11fdf7f2 9287 marker = last_added_entry = dirent.key; // double assign
1adf2230
AA
9288 ent_list.emplace_back(std::move(dirent));
9289 ++count;
9290 } else {
9291 *is_truncated = true;
9292 goto check_updates;
9293 }
9294 } else { // r == -ENOENT
9295 // in the case of -ENOENT, make sure we're advancing marker
9296 // for possible next call to CLSRGWIssueBucketList
11fdf7f2 9297 marker = dirent.key;
1adf2230
AA
9298 }
9299 } // entry for loop
9300
9301 if (!result.is_truncated) {
9302 // if we reached the end of the shard read next shard
9303 ++current_shard;
11fdf7f2 9304 marker = rgw_obj_index_key();
1adf2230
AA
9305 }
9306 } // shard loop
9307
9308check_updates:
11fdf7f2 9309
1adf2230
AA
9310 // suggest updates if there is any
9311 map<string, bufferlist>::iterator miter = updates.begin();
9312 for (; miter != updates.end(); ++miter) {
9313 if (miter->second.length()) {
9314 ObjectWriteOperation o;
9315 cls_rgw_suggest_changes(o, miter->second);
9316 // we don't care if we lose suggested updates, send them off blindly
9317 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
9318 index_ctx.aio_operate(miter->first, c, &o);
9319 c->release();
9320 }
9321 }
9322
9323 if (last_entry && !ent_list.empty()) {
9324 *last_entry = last_added_entry;
9325 }
9326
9327 return 0;
11fdf7f2 9328} // RGWRados::cls_bucket_list_unordered
1adf2230
AA
9329
9330
9331int RGWRados::cls_obj_usage_log_add(const string& oid,
9332 rgw_usage_log_info& info)
7c673cae 9333{
11fdf7f2 9334 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9335
9336 rgw_rados_ref ref;
224ce89b 9337 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9338 if (r < 0) {
9339 return r;
9340 }
9341
9342 ObjectWriteOperation op;
9343 cls_rgw_usage_log_add(op, info);
9344
11fdf7f2 9345 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
9346 return r;
9347}
9348
11fdf7f2
TL
9349int RGWRados::cls_obj_usage_log_read(const string& oid, const string& user, const string& bucket,
9350 uint64_t start_epoch, uint64_t end_epoch, uint32_t max_entries,
9351 string& read_iter, map<rgw_user_bucket, rgw_usage_log_entry>& usage,
9352 bool *is_truncated)
7c673cae 9353{
11fdf7f2 9354 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9355
9356 rgw_rados_ref ref;
224ce89b 9357 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9358 if (r < 0) {
9359 return r;
9360 }
9361
9362 *is_truncated = false;
9363
11fdf7f2 9364 r = cls_rgw_usage_log_read(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch,
7c673cae
FG
9365 max_entries, read_iter, usage, is_truncated);
9366
9367 return r;
9368}
9369
11fdf7f2
TL
9370int RGWRados::cls_obj_usage_log_trim(const string& oid, const string& user, const string& bucket,
9371 uint64_t start_epoch, uint64_t end_epoch)
7c673cae 9372{
11fdf7f2 9373 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
7c673cae
FG
9374
9375 rgw_rados_ref ref;
224ce89b 9376 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9377 if (r < 0) {
9378 return r;
9379 }
9380
11fdf7f2
TL
9381 r = cls_rgw_usage_log_trim(ref.ioctx, ref.obj.oid, user, bucket, start_epoch, end_epoch);
9382 return r;
9383}
9384
9385int RGWRados::cls_obj_usage_log_clear(string& oid)
9386{
9387 rgw_raw_obj obj(svc.zone->get_zone_params().usage_log_pool, oid);
9388
9389 rgw_rados_ref ref;
9390 int r = get_raw_obj_ref(obj, &ref);
9391 if (r < 0) {
9392 return r;
9393 }
9394 librados::ObjectWriteOperation op;
9395 cls_rgw_usage_log_clear(op);
9396 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
9397 return r;
9398}
9399
11fdf7f2 9400
7c673cae
FG
9401int RGWRados::remove_objs_from_index(RGWBucketInfo& bucket_info, list<rgw_obj_index_key>& oid_list)
9402{
9403 librados::IoCtx index_ctx;
9404 string dir_oid;
9405
11fdf7f2 9406 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae
FG
9407
9408 int r = open_bucket_index(bucket_info, index_ctx, dir_oid);
9409 if (r < 0)
9410 return r;
9411
9412 bufferlist updates;
9413
9414 for (auto iter = oid_list.begin(); iter != oid_list.end(); ++iter) {
9415 rgw_bucket_dir_entry entry;
9416 entry.key = *iter;
9417 dout(2) << "RGWRados::remove_objs_from_index bucket=" << bucket_info.bucket << " obj=" << entry.key.name << ":" << entry.key.instance << dendl;
9418 entry.ver.epoch = (uint64_t)-1; // ULLONG_MAX, needed to that objclass doesn't skip out request
9419 updates.append(CEPH_RGW_REMOVE | suggest_flag);
11fdf7f2 9420 encode(entry, updates);
7c673cae
FG
9421 }
9422
9423 bufferlist out;
9424
9425 r = index_ctx.exec(dir_oid, RGW_CLASS, RGW_DIR_SUGGEST_CHANGES, updates, out);
9426
9427 return r;
9428}
9429
9430int RGWRados::check_disk_state(librados::IoCtx io_ctx,
9431 const RGWBucketInfo& bucket_info,
9432 rgw_bucket_dir_entry& list_state,
9433 rgw_bucket_dir_entry& object,
9434 bufferlist& suggested_updates)
9435{
9436 const rgw_bucket& bucket = bucket_info.bucket;
11fdf7f2 9437 uint8_t suggest_flag = (svc.zone->get_zone().log_data ? CEPH_RGW_DIR_SUGGEST_LOG_OP : 0);
7c673cae
FG
9438
9439 std::string loc;
9440
9441 rgw_obj obj(bucket, list_state.key);
9442
9443 string oid;
9444 get_obj_bucket_and_oid_loc(obj, oid, loc);
9445
9446 if (loc != list_state.locator) {
9447 ldout(cct, 0) << "WARNING: generated locator (" << loc << ") is different from listed locator (" << list_state.locator << ")" << dendl;
9448 }
9449
9450 io_ctx.locator_set_key(list_state.locator);
9451
9452 RGWObjState *astate = NULL;
9453 RGWObjectCtx rctx(this);
9454 int r = get_obj_state(&rctx, bucket_info, obj, &astate, false);
9455 if (r < 0)
9456 return r;
9457
9458 list_state.pending_map.clear(); // we don't need this and it inflates size
9459 if (!astate->exists) {
9460 /* object doesn't exist right now -- hopefully because it's
9461 * marked as !exists and got deleted */
9462 if (list_state.exists) {
9463 /* FIXME: what should happen now? Work out if there are any
9464 * non-bad ways this could happen (there probably are, but annoying
9465 * to handle!) */
9466 }
9467 // encode a suggested removal of that key
9468 list_state.ver.epoch = io_ctx.get_last_version();
9469 list_state.ver.pool = io_ctx.get_id();
9470 cls_rgw_encode_suggestion(CEPH_RGW_REMOVE, list_state, suggested_updates);
9471 return -ENOENT;
9472 }
9473
9474 string etag;
9475 string content_type;
9476 ACLOwner owner;
9477
9478 object.meta.size = astate->size;
9479 object.meta.accounted_size = astate->accounted_size;
9480 object.meta.mtime = astate->mtime;
9481
9482 map<string, bufferlist>::iterator iter = astate->attrset.find(RGW_ATTR_ETAG);
9483 if (iter != astate->attrset.end()) {
11fdf7f2 9484 etag = rgw_bl_str(iter->second);
7c673cae
FG
9485 }
9486 iter = astate->attrset.find(RGW_ATTR_CONTENT_TYPE);
9487 if (iter != astate->attrset.end()) {
11fdf7f2 9488 content_type = rgw_bl_str(iter->second);
7c673cae
FG
9489 }
9490 iter = astate->attrset.find(RGW_ATTR_ACL);
9491 if (iter != astate->attrset.end()) {
9492 r = decode_policy(iter->second, &owner);
9493 if (r < 0) {
9494 dout(0) << "WARNING: could not decode policy for object: " << obj << dendl;
9495 }
9496 }
9497
9498 if (astate->has_manifest) {
9499 RGWObjManifest::obj_iterator miter;
9500 RGWObjManifest& manifest = astate->manifest;
9501 for (miter = manifest.obj_begin(); miter != manifest.obj_end(); ++miter) {
9502 const rgw_raw_obj& raw_loc = miter.get_location().get_raw_obj(this);
9503 rgw_obj loc;
9504 rgw_raw_obj_to_obj(manifest.get_obj().bucket, raw_loc, &loc);
9505
9506 if (loc.key.ns == RGW_OBJ_NS_MULTIPART) {
9507 dout(10) << "check_disk_state(): removing manifest part from index: " << loc << dendl;
494da23a 9508 r = delete_obj_index(loc, astate->mtime);
7c673cae
FG
9509 if (r < 0) {
9510 dout(0) << "WARNING: delete_obj_index() returned r=" << r << dendl;
9511 }
9512 }
9513 }
9514 }
9515
9516 object.meta.etag = etag;
9517 object.meta.content_type = content_type;
9518 object.meta.owner = owner.get_id().to_str();
9519 object.meta.owner_display_name = owner.get_display_name();
9520
9521 // encode suggested updates
9522 list_state.ver.pool = io_ctx.get_id();
9523 list_state.ver.epoch = astate->epoch;
9524 list_state.meta.size = object.meta.size;
9525 list_state.meta.accounted_size = object.meta.accounted_size;
9526 list_state.meta.mtime = object.meta.mtime;
9527 list_state.meta.category = main_category;
9528 list_state.meta.etag = etag;
9529 list_state.meta.content_type = content_type;
9530 if (astate->obj_tag.length() > 0)
9531 list_state.tag = astate->obj_tag.c_str();
9532 list_state.meta.owner = owner.get_id().to_str();
9533 list_state.meta.owner_display_name = owner.get_display_name();
9534
9535 list_state.exists = true;
9536 cls_rgw_encode_suggestion(CEPH_RGW_UPDATE | suggest_flag, list_state, suggested_updates);
9537 return 0;
9538}
9539
a8e16298 9540int RGWRados::cls_bucket_head(const RGWBucketInfo& bucket_info, int shard_id, vector<rgw_bucket_dir_header>& headers, map<int, string> *bucket_instance_ids)
7c673cae
FG
9541{
9542 librados::IoCtx index_ctx;
9543 map<int, string> oids;
9544 map<int, struct rgw_cls_list_ret> list_results;
9545 int r = open_bucket_index(bucket_info, index_ctx, oids, list_results, shard_id, bucket_instance_ids);
9546 if (r < 0)
9547 return r;
9548
9549 r = CLSRGWIssueGetDirHeader(index_ctx, oids, list_results, cct->_conf->rgw_bucket_index_max_aio)();
9550 if (r < 0)
9551 return r;
9552
9553 map<int, struct rgw_cls_list_ret>::iterator iter = list_results.begin();
9554 for(; iter != list_results.end(); ++iter) {
a8e16298 9555 headers.push_back(std::move(iter->second.dir.header));
7c673cae
FG
9556 }
9557 return 0;
9558}
9559
9560int RGWRados::cls_bucket_head_async(const RGWBucketInfo& bucket_info, int shard_id, RGWGetDirHeader_CB *ctx, int *num_aio)
9561{
9562 librados::IoCtx index_ctx;
9563 map<int, string> bucket_objs;
9564 int r = open_bucket_index(bucket_info, index_ctx, bucket_objs, shard_id);
9565 if (r < 0)
9566 return r;
9567
9568 map<int, string>::iterator iter = bucket_objs.begin();
9569 for (; iter != bucket_objs.end(); ++iter) {
9570 r = cls_rgw_get_dir_header_async(index_ctx, iter->second, static_cast<RGWGetDirHeader_CB*>(ctx->get()));
9571 if (r < 0) {
9572 ctx->put();
9573 break;
9574 } else {
9575 (*num_aio)++;
9576 }
9577 }
9578 return r;
9579}
9580
9581int RGWRados::cls_user_get_header(const string& user_id, cls_user_header *header)
9582{
9583 string buckets_obj_id;
9584 rgw_get_buckets_obj(user_id, buckets_obj_id);
11fdf7f2 9585 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
7c673cae
FG
9586
9587 rgw_rados_ref ref;
224ce89b 9588 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9589 if (r < 0) {
9590 return r;
9591 }
9592
9593 librados::ObjectReadOperation op;
9594 int rc;
9595 ::cls_user_get_header(op, header, &rc);
9596 bufferlist ibl;
11fdf7f2 9597 r = ref.ioctx.operate(ref.obj.oid, &op, &ibl);
7c673cae
FG
9598 if (r < 0)
9599 return r;
9600 if (rc < 0)
9601 return rc;
9602
9603 return 0;
9604}
9605
94b18763
FG
9606int RGWRados::cls_user_reset_stats(const string& user_id)
9607{
9608 string buckets_obj_id;
9609 rgw_get_buckets_obj(user_id, buckets_obj_id);
11fdf7f2 9610 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
94b18763
FG
9611
9612 rgw_rados_ref ref;
9613 int r = get_raw_obj_ref(obj, &ref);
9614 if (r < 0) {
9615 return r;
9616 }
9617
9618 librados::ObjectWriteOperation op;
9619 ::cls_user_reset_stats(op);
11fdf7f2 9620 return ref.ioctx.operate(ref.obj.oid, &op);
94b18763
FG
9621}
9622
7c673cae
FG
9623int RGWRados::cls_user_get_header_async(const string& user_id, RGWGetUserHeader_CB *ctx)
9624{
9625 string buckets_obj_id;
9626 rgw_get_buckets_obj(user_id, buckets_obj_id);
11fdf7f2 9627 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
7c673cae
FG
9628
9629 rgw_rados_ref ref;
224ce89b 9630 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9631 if (r < 0) {
9632 return r;
9633 }
9634
11fdf7f2 9635 r = ::cls_user_get_header_async(ref.ioctx, ref.obj.oid, ctx);
7c673cae
FG
9636 if (r < 0)
9637 return r;
9638
9639 return 0;
9640}
9641
11fdf7f2
TL
9642int RGWRados::cls_user_sync_bucket_stats(rgw_raw_obj& user_obj,
9643 const RGWBucketInfo& bucket_info)
7c673cae 9644{
a8e16298 9645 vector<rgw_bucket_dir_header> headers;
7c673cae
FG
9646 int r = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
9647 if (r < 0) {
9648 ldout(cct, 20) << "cls_bucket_header() returned " << r << dendl;
9649 return r;
9650 }
9651
9652 cls_user_bucket_entry entry;
9653
9654 bucket_info.bucket.convert(&entry.bucket);
9655
c07f9fc5 9656 for (const auto& hiter : headers) {
a8e16298 9657 for (const auto& iter : hiter.stats) {
11fdf7f2
TL
9658 if (RGWObjCategory::Main == iter.first ||
9659 RGWObjCategory::MultiMeta == iter.first) {
9660 const struct rgw_bucket_category_stats& header_stats = iter.second;
9661 entry.size += header_stats.total_size;
9662 entry.size_rounded += header_stats.total_size_rounded;
9663 entry.count += header_stats.num_entries;
9664 }
7c673cae
FG
9665 }
9666 }
9667
9668 list<cls_user_bucket_entry> entries;
9669 entries.push_back(entry);
9670
9671 r = cls_user_update_buckets(user_obj, entries, false);
9672 if (r < 0) {
9673 ldout(cct, 20) << "cls_user_update_buckets() returned " << r << dendl;
9674 return r;
9675 }
9676
9677 return 0;
9678}
9679
c07f9fc5
FG
9680int RGWRados::cls_user_get_bucket_stats(const rgw_bucket& bucket, cls_user_bucket_entry& entry)
9681{
a8e16298 9682 vector<rgw_bucket_dir_header> headers;
c07f9fc5 9683 RGWBucketInfo bucket_info;
11fdf7f2 9684 auto obj_ctx = svc.sysobj->init_obj_ctx();
c07f9fc5
FG
9685 int ret = get_bucket_instance_info(obj_ctx, bucket, bucket_info, NULL, NULL);
9686 if (ret < 0) {
9687 return ret;
9688 }
9689
9690 ret = cls_bucket_head(bucket_info, RGW_NO_SHARD, headers);
9691 if (ret < 0) {
9692 ldout(cct, 20) << "cls_bucket_header() returned " << ret << dendl;
9693 return ret;
9694 }
9695
9696 bucket.convert(&entry.bucket);
9697
9698 for (const auto& hiter : headers) {
a8e16298 9699 for (const auto& iter : hiter.stats) {
c07f9fc5
FG
9700 const struct rgw_bucket_category_stats& header_stats = iter.second;
9701 entry.size += header_stats.total_size;
9702 entry.size_rounded += header_stats.total_size_rounded;
9703 entry.count += header_stats.num_entries;
9704 }
9705 }
9706
9707 return 0;
9708}
9709
7c673cae
FG
9710int RGWRados::cls_user_list_buckets(rgw_raw_obj& obj,
9711 const string& in_marker,
9712 const string& end_marker,
9713 const int max_entries,
9714 list<cls_user_bucket_entry>& entries,
9715 string * const out_marker,
9716 bool * const truncated)
9717{
9718 rgw_rados_ref ref;
224ce89b 9719 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9720 if (r < 0) {
9721 return r;
9722 }
9723
9724 librados::ObjectReadOperation op;
9725 int rc;
9726
9727 cls_user_bucket_list(op, in_marker, end_marker, max_entries, entries, out_marker, truncated, &rc);
9728 bufferlist ibl;
11fdf7f2 9729 r = ref.ioctx.operate(ref.obj.oid, &op, &ibl);
7c673cae
FG
9730 if (r < 0)
9731 return r;
9732 if (rc < 0)
9733 return rc;
9734
9735 return 0;
9736}
9737
9738int RGWRados::cls_user_update_buckets(rgw_raw_obj& obj, list<cls_user_bucket_entry>& entries, bool add)
9739{
9740 rgw_rados_ref ref;
224ce89b 9741 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9742 if (r < 0) {
9743 return r;
9744 }
9745
9746 librados::ObjectWriteOperation op;
9747 cls_user_set_buckets(op, entries, add);
11fdf7f2 9748 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
9749 if (r < 0)
9750 return r;
9751
9752 return 0;
9753}
9754
9755int RGWRados::complete_sync_user_stats(const rgw_user& user_id)
9756{
9757 string buckets_obj_id;
9758 rgw_get_buckets_obj(user_id, buckets_obj_id);
11fdf7f2 9759 rgw_raw_obj obj(svc.zone->get_zone_params().user_uid_pool, buckets_obj_id);
7c673cae
FG
9760 return cls_user_complete_stats_sync(obj);
9761}
9762
9763int RGWRados::cls_user_complete_stats_sync(rgw_raw_obj& obj)
9764{
9765 rgw_rados_ref ref;
224ce89b 9766 int r = get_raw_obj_ref(obj, &ref);
7c673cae
FG
9767 if (r < 0) {
9768 return r;
9769 }
9770
9771 librados::ObjectWriteOperation op;
9772 ::cls_user_complete_stats_sync(op);
11fdf7f2 9773 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
9774 if (r < 0)
9775 return r;
9776
9777 return 0;
9778}
9779
9780int RGWRados::cls_user_add_bucket(rgw_raw_obj& obj, const cls_user_bucket_entry& entry)
9781{
9782 list<cls_user_bucket_entry> l;
9783 l.push_back(entry);
9784
9785 return cls_user_update_buckets(obj, l, true);
9786}
9787
9788int RGWRados::cls_user_remove_bucket(rgw_raw_obj& obj, const cls_user_bucket& bucket)
9789{
7c673cae 9790 rgw_rados_ref ref;
224ce89b 9791 int r = get_system_obj_ref(obj, &ref);
7c673cae
FG
9792 if (r < 0) {
9793 return r;
9794 }
9795
9796 librados::ObjectWriteOperation op;
9797 ::cls_user_remove_bucket(op, bucket);
11fdf7f2 9798 r = ref.ioctx.operate(ref.obj.oid, &op);
7c673cae
FG
9799 if (r < 0)
9800 return r;
9801
9802 return 0;
9803}
9804
224ce89b 9805int RGWRados::check_bucket_shards(const RGWBucketInfo& bucket_info, const rgw_bucket& bucket,
31f18b77
FG
9806 RGWQuotaInfo& bucket_quota)
9807{
11fdf7f2 9808 if (! cct->_conf.get_val<bool>("rgw_dynamic_resharding")) {
31f18b77
FG
9809 return 0;
9810 }
9811
9812 bool need_resharding = false;
9813 int num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
9814 uint32_t suggested_num_shards;
9815
11fdf7f2
TL
9816 const uint64_t max_objs_per_shard =
9817 cct->_conf.get_val<uint64_t>("rgw_max_objs_per_shard");
9818 int ret =
9819 quota_handler->check_bucket_shards(max_objs_per_shard, num_source_shards,
9820 bucket_info.owner, bucket, bucket_quota,
9821 1, need_resharding, &suggested_num_shards);
31f18b77
FG
9822 if (ret < 0) {
9823 return ret;
9824 }
9825
9826 if (need_resharding) {
224ce89b
WB
9827 ldout(cct, 20) << __func__ << " bucket " << bucket.name << " need resharding " <<
9828 " old num shards " << bucket_info.num_shards << " new num shards " << suggested_num_shards <<
9829 dendl;
31f18b77
FG
9830 return add_bucket_to_reshard(bucket_info, suggested_num_shards);
9831 }
9832
9833 return ret;
9834}
9835
9836int RGWRados::add_bucket_to_reshard(const RGWBucketInfo& bucket_info, uint32_t new_num_shards)
9837{
9838 RGWReshard reshard(this);
9839
9840 uint32_t num_source_shards = (bucket_info.num_shards > 0 ? bucket_info.num_shards : 1);
9841
11fdf7f2 9842 new_num_shards = std::min(new_num_shards, get_max_bucket_shards());
31f18b77
FG
9843 if (new_num_shards <= num_source_shards) {
9844 ldout(cct, 20) << "not resharding bucket name=" << bucket_info.bucket.name << ", orig_num=" << num_source_shards << ", new_num_shards=" << new_num_shards << dendl;
9845 return 0;
9846 }
9847
9848 cls_rgw_reshard_entry entry;
9849 entry.time = real_clock::now();
9850 entry.tenant = bucket_info.owner.tenant;
9851 entry.bucket_name = bucket_info.bucket.name;
9852 entry.bucket_id = bucket_info.bucket.bucket_id;
9853 entry.old_num_shards = num_source_shards;
9854 entry.new_num_shards = new_num_shards;
9855
9856 return reshard.add(entry);
9857}
9858
7c673cae 9859int RGWRados::check_quota(const rgw_user& bucket_owner, rgw_bucket& bucket,
11fdf7f2 9860 RGWQuotaInfo& user_quota, RGWQuotaInfo& bucket_quota, uint64_t obj_size, bool check_size_only)
7c673cae 9861{
11fdf7f2
TL
9862 // if we only check size, then num_objs will set to 0
9863 if(check_size_only)
9864 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 0, obj_size);
9865
7c673cae
FG
9866 return quota_handler->check_quota(bucket_owner, bucket, user_quota, bucket_quota, 1, obj_size);
9867}
9868
9869void RGWRados::get_bucket_index_objects(const string& bucket_oid_base,
1adf2230
AA
9870 uint32_t num_shards,
9871 map<int, string>& bucket_objects,
9872 int shard_id) {
7c673cae
FG
9873 if (!num_shards) {
9874 bucket_objects[0] = bucket_oid_base;
9875 } else {
9876 char buf[bucket_oid_base.size() + 32];
9877 if (shard_id < 0) {
9878 for (uint32_t i = 0; i < num_shards; ++i) {
9879 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), i);
9880 bucket_objects[i] = buf;
9881 }
9882 } else {
9883 if ((uint32_t)shard_id > num_shards) {
9884 return;
9885 }
9886 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
9887 bucket_objects[shard_id] = buf;
9888 }
9889 }
9890}
9891
9892void RGWRados::get_bucket_instance_ids(const RGWBucketInfo& bucket_info, int shard_id, map<int, string> *result)
9893{
9894 const rgw_bucket& bucket = bucket_info.bucket;
9895 string plain_id = bucket.name + ":" + bucket.bucket_id;
9896 if (!bucket_info.num_shards) {
9897 (*result)[0] = plain_id;
9898 } else {
9899 char buf[16];
9900 if (shard_id < 0) {
9901 for (uint32_t i = 0; i < bucket_info.num_shards; ++i) {
9902 snprintf(buf, sizeof(buf), ":%d", i);
9903 (*result)[i] = plain_id + buf;
9904 }
9905 } else {
9906 if ((uint32_t)shard_id > bucket_info.num_shards) {
9907 return;
9908 }
9909 snprintf(buf, sizeof(buf), ":%d", shard_id);
11fdf7f2 9910 (*result)[shard_id] = plain_id + buf;
7c673cae 9911 }
7c673cae 9912 }
7c673cae
FG
9913}
9914
11fdf7f2
TL
9915int RGWRados::get_target_shard_id(const RGWBucketInfo& bucket_info, const string& obj_key,
9916 int *shard_id)
7c673cae 9917{
11fdf7f2
TL
9918 int r = 0;
9919 switch (bucket_info.bucket_index_shard_hash_type) {
9920 case RGWBucketInfo::MOD:
9921 if (!bucket_info.num_shards) {
9922 if (shard_id) {
9923 *shard_id = -1;
9924 }
9925 } else {
9926 uint32_t sid = rgw_bucket_shard_index(obj_key, bucket_info.num_shards);
9927 if (shard_id) {
9928 *shard_id = (int)sid;
9929 }
9930 }
7c673cae
FG
9931 break;
9932 default:
11fdf7f2 9933 r = -ENOTSUP;
7c673cae 9934 }
11fdf7f2 9935 return r;
7c673cae
FG
9936}
9937
11fdf7f2
TL
9938void RGWRados::get_bucket_index_object(const string& bucket_oid_base, uint32_t num_shards,
9939 int shard_id, string *bucket_obj)
9940{
9941 if (!num_shards) {
9942 // By default with no sharding, we use the bucket oid as itself
9943 (*bucket_obj) = bucket_oid_base;
7c673cae 9944 } else {
11fdf7f2
TL
9945 char buf[bucket_oid_base.size() + 32];
9946 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), shard_id);
9947 (*bucket_obj) = buf;
7c673cae 9948 }
7c673cae
FG
9949}
9950
11fdf7f2
TL
9951int RGWRados::get_bucket_index_object(const string& bucket_oid_base, const string& obj_key,
9952 uint32_t num_shards, RGWBucketInfo::BIShardsHashType hash_type, string *bucket_obj, int *shard_id)
7c673cae 9953{
11fdf7f2
TL
9954 int r = 0;
9955 switch (hash_type) {
9956 case RGWBucketInfo::MOD:
9957 if (!num_shards) {
9958 // By default with no sharding, we use the bucket oid as itself
9959 (*bucket_obj) = bucket_oid_base;
9960 if (shard_id) {
9961 *shard_id = -1;
9962 }
9963 } else {
9964 uint32_t sid = rgw_bucket_shard_index(obj_key, num_shards);
9965 char buf[bucket_oid_base.size() + 32];
9966 snprintf(buf, sizeof(buf), "%s.%d", bucket_oid_base.c_str(), sid);
9967 (*bucket_obj) = buf;
9968 if (shard_id) {
9969 *shard_id = (int)sid;
9970 }
9971 }
9972 break;
9973 default:
9974 r = -ENOTSUP;
7c673cae 9975 }
11fdf7f2 9976 return r;
7c673cae
FG
9977}
9978
7c673cae
FG
9979uint64_t RGWRados::instance_id()
9980{
9981 return get_rados_handle()->get_instance_id();
9982}
9983
9984uint64_t RGWRados::next_bucket_id()
9985{
9986 Mutex::Locker l(bucket_id_lock);
9987 return ++max_bucket_id;
9988}
9989
28e407b8
AA
9990RGWRados *RGWStoreManager::init_storage_provider(CephContext *cct, bool use_gc_thread, bool use_lc_thread,
9991 bool quota_threads, bool run_sync_thread, bool run_reshard_thread, bool use_cache)
7c673cae 9992{
11fdf7f2 9993 RGWRados *store = new RGWRados;
7c673cae 9994
11fdf7f2
TL
9995 if ((*store).set_use_cache(use_cache)
9996 .set_run_gc_thread(use_gc_thread)
9997 .set_run_lc_thread(use_lc_thread)
9998 .set_run_quota_threads(quota_threads)
9999 .set_run_sync_thread(run_sync_thread)
10000 .set_run_reshard_thread(run_reshard_thread)
10001 .initialize(cct) < 0) {
7c673cae
FG
10002 delete store;
10003 return NULL;
10004 }
10005
10006 return store;
10007}
10008
10009RGWRados *RGWStoreManager::init_raw_storage_provider(CephContext *cct)
10010{
10011 RGWRados *store = NULL;
10012 store = new RGWRados;
10013
10014 store->set_context(cct);
10015
11fdf7f2
TL
10016 int ret = store->init_svc(true);
10017 if (ret < 0) {
10018 ldout(cct, 0) << "ERROR: failed to init services (ret=" << cpp_strerror(-ret) << ")" << dendl;
10019 return nullptr;
10020 }
10021
7c673cae
FG
10022 if (store->init_rados() < 0) {
10023 delete store;
11fdf7f2 10024 return nullptr;
7c673cae
FG
10025 }
10026
10027 return store;
10028}
10029
10030void RGWStoreManager::close_storage(RGWRados *store)
10031{
10032 if (!store)
10033 return;
10034
10035 store->finalize();
10036
10037 delete store;
10038}
10039
10040librados::Rados* RGWRados::get_rados_handle()
10041{
494da23a 10042 return &rados;
7c673cae
FG
10043}
10044
10045int RGWRados::delete_raw_obj_aio(const rgw_raw_obj& obj, list<librados::AioCompletion *>& handles)
10046{
10047 rgw_rados_ref ref;
10048 int ret = get_raw_obj_ref(obj, &ref);
10049 if (ret < 0) {
10050 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
10051 return ret;
10052 }
10053
10054 ObjectWriteOperation op;
10055 list<string> prefixes;
10056 cls_rgw_remove_obj(op, prefixes);
10057
10058 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
11fdf7f2 10059 ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
7c673cae
FG
10060 if (ret < 0) {
10061 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
10062 c->release();
10063 return ret;
10064 }
10065
10066 handles.push_back(c);
10067
10068 return 0;
10069}
10070
10071int RGWRados::delete_obj_aio(const rgw_obj& obj,
10072 RGWBucketInfo& bucket_info, RGWObjState *astate,
10073 list<librados::AioCompletion *>& handles, bool keep_index_consistent)
10074{
10075 rgw_rados_ref ref;
10076 int ret = get_obj_head_ref(bucket_info, obj, &ref);
10077 if (ret < 0) {
10078 lderr(cct) << "ERROR: failed to get obj ref with ret=" << ret << dendl;
10079 return ret;
10080 }
10081
10082 if (keep_index_consistent) {
10083 RGWRados::Bucket bop(this, bucket_info);
10084 RGWRados::Bucket::UpdateIndex index_op(&bop, obj);
10085
10086 ret = index_op.prepare(CLS_RGW_OP_DEL, &astate->write_tag);
10087 if (ret < 0) {
10088 lderr(cct) << "ERROR: failed to prepare index op with ret=" << ret << dendl;
10089 return ret;
10090 }
10091 }
10092
10093 ObjectWriteOperation op;
10094 list<string> prefixes;
10095 cls_rgw_remove_obj(op, prefixes);
10096
10097 AioCompletion *c = librados::Rados::aio_create_completion(NULL, NULL, NULL);
11fdf7f2 10098 ret = ref.ioctx.aio_operate(ref.obj.oid, c, &op);
7c673cae
FG
10099 if (ret < 0) {
10100 lderr(cct) << "ERROR: AioOperate failed with ret=" << ret << dendl;
10101 c->release();
10102 return ret;
10103 }
10104
10105 handles.push_back(c);
10106
10107 if (keep_index_consistent) {
494da23a 10108 ret = delete_obj_index(obj, astate->mtime);
7c673cae
FG
10109 if (ret < 0) {
10110 lderr(cct) << "ERROR: failed to delete obj index with ret=" << ret << dendl;
10111 return ret;
10112 }
10113 }
10114 return ret;
10115}
10116
10117int rgw_compression_info_from_attrset(map<string, bufferlist>& attrs, bool& need_decompress, RGWCompressionInfo& cs_info) {
10118 map<string, bufferlist>::iterator value = attrs.find(RGW_ATTR_COMPRESSION);
10119 if (value != attrs.end()) {
11fdf7f2 10120 auto bliter = value->second.cbegin();
7c673cae 10121 try {
11fdf7f2 10122 decode(cs_info, bliter);
7c673cae
FG
10123 } catch (buffer::error& err) {
10124 return -EIO;
10125 }
10126 if (cs_info.blocks.size() == 0) {
10127 return -EIO;
10128 }
10129 if (cs_info.compression_type != "none")
10130 need_decompress = true;
10131 else
10132 need_decompress = false;
10133 return 0;
10134 } else {
10135 need_decompress = false;
10136 return 0;
10137 }
10138}
10139
11fdf7f2
TL
10140bool RGWRados::call(std::string_view command, const cmdmap_t& cmdmap,
10141 std::string_view format, bufferlist& out)
3a9019d9 10142{
11fdf7f2
TL
10143 if (command == "cache list"sv) {
10144 std::optional<std::string> filter;
10145 if (auto i = cmdmap.find("filter"); i != cmdmap.cend()) {
3a9019d9
FG
10146 filter = boost::get<std::string>(i->second);
10147 }
10148 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "table"));
10149 if (f) {
10150 f->open_array_section("cache_entries");
10151 call_list(filter, f.get());
10152 f->close_section();
10153 f->flush(out);
10154 return true;
10155 } else {
10156 out.append("Unable to create Formatter.\n");
10157 return false;
10158 }
11fdf7f2 10159 } else if (command == "cache inspect"sv) {
3a9019d9
FG
10160 std::unique_ptr<Formatter> f(ceph::Formatter::create(format, "json-pretty"));
10161 if (f) {
11fdf7f2 10162 const auto& target = boost::get<std::string>(cmdmap.at("target"));
3a9019d9
FG
10163 if (call_inspect(target, f.get())) {
10164 f->flush(out);
10165 return true;
10166 } else {
11fdf7f2 10167 out.append("Unable to find entry "s + target + ".\n");
3a9019d9
FG
10168 return false;
10169 }
10170 } else {
10171 out.append("Unable to create Formatter.\n");
10172 return false;
10173 }
11fdf7f2
TL
10174 } else if (command == "cache erase"sv) {
10175 const auto& target = boost::get<std::string>(cmdmap.at("target"));
3a9019d9
FG
10176 if (call_erase(target)) {
10177 return true;
10178 } else {
11fdf7f2 10179 out.append("Unable to find entry "s + target + ".\n");
3a9019d9
FG
10180 return false;
10181 }
11fdf7f2 10182 } else if (command == "cache zap"sv) {
3a9019d9
FG
10183 call_zap();
10184 return true;
10185 }
10186 return false;
10187}
10188
11fdf7f2
TL
10189void RGWRados::call_list(const std::optional<std::string>& s,
10190 ceph::Formatter *f)
3a9019d9 10191{
11fdf7f2
TL
10192 if (!svc.cache) {
10193 return;
10194 }
10195 svc.cache->call_list(s, f);
3a9019d9
FG
10196}
10197
11fdf7f2 10198bool RGWRados::call_inspect(const std::string& s, Formatter *f)
3a9019d9 10199{
11fdf7f2
TL
10200 if (!svc.cache) {
10201 return false;
10202 }
10203 return svc.cache->call_inspect(s, f);
3a9019d9
FG
10204}
10205
11fdf7f2
TL
10206bool RGWRados::call_erase(const std::string& s) {
10207 if (!svc.cache) {
10208 return false;
10209 }
10210 return svc.cache->call_erase(s);
3a9019d9
FG
10211}
10212
10213void RGWRados::call_zap() {
11fdf7f2
TL
10214 if (svc.cache) {
10215 return;
10216 }
10217 svc.cache->call_zap();
10218}
10219
10220string RGWRados::get_mfa_oid(const rgw_user& user)
10221{
10222 return string("user:") + user.to_str();
10223}
10224
10225int RGWRados::get_mfa_ref(const rgw_user& user, rgw_rados_ref *ref)
10226{
10227 string oid = get_mfa_oid(user);
10228 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10229 return get_system_obj_ref(obj, ref);
10230}
10231
10232int RGWRados::check_mfa(const rgw_user& user, const string& otp_id, const string& pin)
10233{
10234 rgw_rados_ref ref;
10235
10236 int r = get_mfa_ref(user, &ref);
10237 if (r < 0) {
10238 return r;
10239 }
10240
10241 rados::cls::otp::otp_check_t result;
10242
10243 r = rados::cls::otp::OTP::check(cct, ref.ioctx, ref.obj.oid, otp_id, pin, &result);
10244 if (r < 0)
10245 return r;
10246
10247 ldout(cct, 20) << "OTP check, otp_id=" << otp_id << " result=" << (int)result.result << dendl;
10248
10249 return (result.result == rados::cls::otp::OTP_CHECK_SUCCESS ? 0 : -EACCES);
10250}
10251
10252void RGWRados::prepare_mfa_write(librados::ObjectWriteOperation *op,
10253 RGWObjVersionTracker *objv_tracker,
10254 const ceph::real_time& mtime)
10255{
10256 RGWObjVersionTracker ot;
10257
10258 if (objv_tracker) {
10259 ot = *objv_tracker;
10260 }
10261
10262 if (ot.write_version.tag.empty()) {
10263 if (ot.read_version.tag.empty()) {
10264 ot.generate_new_write_ver(cct);
10265 } else {
10266 ot.write_version = ot.read_version;
10267 ot.write_version.ver++;
10268 }
10269 }
10270
10271 ot.prepare_op_for_write(op);
10272 struct timespec mtime_ts = real_clock::to_timespec(mtime);
10273 op->mtime2(&mtime_ts);
10274}
10275
10276int RGWRados::create_mfa(const rgw_user& user, const rados::cls::otp::otp_info_t& config,
10277 RGWObjVersionTracker *objv_tracker, const ceph::real_time& mtime)
10278{
10279 rgw_rados_ref ref;
10280
10281 int r = get_mfa_ref(user, &ref);
10282 if (r < 0) {
10283 return r;
10284 }
10285
10286 librados::ObjectWriteOperation op;
10287 prepare_mfa_write(&op, objv_tracker, mtime);
10288 rados::cls::otp::OTP::create(&op, config);
10289 r = ref.ioctx.operate(ref.obj.oid, &op);
10290 if (r < 0) {
10291 ldout(cct, 20) << "OTP create, otp_id=" << config.id << " result=" << (int)r << dendl;
10292 return r;
10293 }
10294
10295 return 0;
10296}
10297
10298int RGWRados::remove_mfa(const rgw_user& user, const string& id,
10299 RGWObjVersionTracker *objv_tracker,
10300 const ceph::real_time& mtime)
10301{
10302 rgw_rados_ref ref;
10303
10304 int r = get_mfa_ref(user, &ref);
10305 if (r < 0) {
10306 return r;
10307 }
10308
10309 librados::ObjectWriteOperation op;
10310 prepare_mfa_write(&op, objv_tracker, mtime);
10311 rados::cls::otp::OTP::remove(&op, id);
10312 r = ref.ioctx.operate(ref.obj.oid, &op);
10313 if (r < 0) {
10314 ldout(cct, 20) << "OTP remove, otp_id=" << id << " result=" << (int)r << dendl;
10315 return r;
10316 }
10317
10318 return 0;
10319}
10320
10321int RGWRados::get_mfa(const rgw_user& user, const string& id, rados::cls::otp::otp_info_t *result)
10322{
10323 rgw_rados_ref ref;
10324
10325 int r = get_mfa_ref(user, &ref);
10326 if (r < 0) {
10327 return r;
10328 }
10329
10330 r = rados::cls::otp::OTP::get(nullptr, ref.ioctx, ref.obj.oid, id, result);
10331 if (r < 0) {
10332 return r;
10333 }
10334
10335 return 0;
10336}
10337
10338int RGWRados::list_mfa(const rgw_user& user, list<rados::cls::otp::otp_info_t> *result)
10339{
10340 rgw_rados_ref ref;
10341
10342 int r = get_mfa_ref(user, &ref);
10343 if (r < 0) {
10344 return r;
10345 }
10346
10347 r = rados::cls::otp::OTP::get_all(nullptr, ref.ioctx, ref.obj.oid, result);
10348 if (r < 0) {
10349 return r;
10350 }
10351
10352 return 0;
10353}
10354
10355int RGWRados::otp_get_current_time(const rgw_user& user, ceph::real_time *result)
10356{
10357 rgw_rados_ref ref;
10358
10359 int r = get_mfa_ref(user, &ref);
10360 if (r < 0) {
10361 return r;
10362 }
10363
10364 r = rados::cls::otp::OTP::get_current_time(ref.ioctx, ref.obj.oid, result);
10365 if (r < 0) {
10366 return r;
10367 }
10368
10369 return 0;
10370}
10371
10372int RGWRados::set_mfa(const string& oid, const list<rados::cls::otp::otp_info_t>& entries,
10373 bool reset_obj, RGWObjVersionTracker *objv_tracker,
10374 const real_time& mtime)
10375{
10376 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10377 rgw_rados_ref ref;
10378 int r = get_system_obj_ref(obj, &ref);
10379 if (r < 0) {
10380 return r;
10381 }
10382
10383 librados::ObjectWriteOperation op;
10384 if (reset_obj) {
10385 op.remove();
10386 op.set_op_flags2(LIBRADOS_OP_FLAG_FAILOK);
10387 op.create(false);
10388 }
10389 prepare_mfa_write(&op, objv_tracker, mtime);
10390 rados::cls::otp::OTP::set(&op, entries);
10391 r = ref.ioctx.operate(ref.obj.oid, &op);
10392 if (r < 0) {
10393 ldout(cct, 20) << "OTP set entries.size()=" << entries.size() << " result=" << (int)r << dendl;
10394 return r;
10395 }
10396
10397 return 0;
10398}
10399
10400int RGWRados::list_mfa(const string& oid, list<rados::cls::otp::otp_info_t> *result,
10401 RGWObjVersionTracker *objv_tracker, ceph::real_time *pmtime)
10402{
10403 rgw_raw_obj obj(svc.zone->get_zone_params().otp_pool, oid);
10404 rgw_rados_ref ref;
10405 int r = get_system_obj_ref(obj, &ref);
10406 if (r < 0) {
10407 return r;
10408 }
10409 librados::ObjectReadOperation op;
10410 struct timespec mtime_ts;
10411 if (pmtime) {
10412 op.stat2(nullptr, &mtime_ts, nullptr);
10413 }
10414 objv_tracker->prepare_op_for_read(&op);
10415 r = rados::cls::otp::OTP::get_all(&op, ref.ioctx, ref.obj.oid, result);
10416 if (r < 0) {
10417 return r;
10418 }
10419 if (pmtime) {
10420 *pmtime = ceph::real_clock::from_timespec(mtime_ts);
10421 }
10422
10423 return 0;
3a9019d9 10424}