ceph/src/rgw/rgw_sync.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3
   4 #include <boost/optional.hpp>
   5
   6 #include "common/ceph_json.h"
   7 #include "common/RWLock.h"
   8 #include "common/RefCountedObj.h"
   9 #include "common/WorkQueue.h"
  10 #include "common/Throttle.h"
  11 #include "common/admin_socket.h"
  12 #include "common/errno.h"
  13
  14 #include "rgw_common.h"
  15 #include "rgw_rados.h"
  16 #include "rgw_sync.h"
  17 #include "rgw_metadata.h"
  18 #include "rgw_rest_conn.h"
  19 #include "rgw_tools.h"
  20 #include "rgw_cr_rados.h"
  21 #include "rgw_cr_rest.h"
  22 #include "rgw_http_client.h"
  23
  24 #include "cls/lock/cls_lock_client.h"
  25
  26 #include <boost/asio/yield.hpp>
  27
  28 #define dout_subsys ceph_subsys_rgw
  29
  30 #undef dout_prefix
  31 #define dout_prefix (*_dout << "meta sync: ")
  32
  33 static string mdlog_sync_status_oid = "mdlog.sync-status";
  34 static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard";
  35 static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index";
  36
  37 RGWSyncErrorLogger::RGWSyncErrorLogger(RGWRados *_store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) {
  38   for (int i = 0; i < num_shards; i++) {
  39     oids.push_back(get_shard_oid(oid_prefix, i));
  40   }
  41 }
  42 string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) {
  43   char buf[oid_prefix.size() + 16];
  44   snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id);
  45   return string(buf);
  46 }
  47
  48 RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) {
  49   cls_log_entry entry;
  50
  51   rgw_sync_error_info info(source_zone, error_code, message);
  52   bufferlist bl;
  53   ::encode(info, bl);
  54   store->time_log_prepare_entry(entry, real_clock::now(), section, name, bl);
  55
  56   uint32_t shard_id = ++counter % num_shards;
  57
  58
  59   return new RGWRadosTimelogAddCR(store, oids[shard_id], entry);
  60 }
  61
  62 void RGWSyncBackoff::update_wait_time()
  63 {
  64   if (cur_wait == 0) {
  65     cur_wait = 1;
  66   } else {
  67     cur_wait = (cur_wait << 1);
  68   }
  69   if (cur_wait >= max_secs) {
  70     cur_wait = max_secs;
  71   }
  72 }
  73
  74 void RGWSyncBackoff::backoff_sleep()
  75 {
  76   update_wait_time();
  77   sleep(cur_wait);
  78 }
  79
  80 void RGWSyncBackoff::backoff(RGWCoroutine *op)
  81 {
  82   update_wait_time();
  83   op->wait(utime_t(cur_wait, 0));
  84 }
  85
  86 int RGWBackoffControlCR::operate() {
  87   reenter(this) {
  88     // retry the operation until it succeeds
  89     while (true) {
  90       yield {
  91         Mutex::Locker l(lock);
  92         cr = alloc_cr();
  93         cr->get();
  94         call(cr);
  95       }
  96       {
  97         Mutex::Locker l(lock);
  98         cr->put();
  99         cr = NULL;
 100       }
 101       if (retcode >= 0) {
 102         break;
 103       }
 104       if (retcode != -EBUSY && retcode != -EAGAIN) {
 105         ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
 106         if (exit_on_error) {
 107           return set_cr_error(retcode);
 108         }
 109       }
 110       if (reset_backoff) {
 111         backoff.reset();
 112       }
 113       yield backoff.backoff(this);
 114     }
 115
 116     // run an optional finisher
 117     yield call(alloc_finisher_cr());
 118     if (retcode < 0) {
 119       ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
 120       return set_cr_error(retcode);
 121     }
 122     return set_cr_done();
 123   }
 124   return 0;
 125 }
 126
 127 void rgw_mdlog_info::decode_json(JSONObj *obj) {
 128   JSONDecoder::decode_json("num_objects", num_shards, obj);
 129   JSONDecoder::decode_json("period", period, obj);
 130   JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
 131 }
 132
 133 void rgw_mdlog_entry::decode_json(JSONObj *obj) {
 134   JSONDecoder::decode_json("id", id, obj);
 135   JSONDecoder::decode_json("section", section, obj);
 136   JSONDecoder::decode_json("name", name, obj);
 137   utime_t ut;
 138   JSONDecoder::decode_json("timestamp", ut, obj);
 139   timestamp = ut.to_real_time();
 140   JSONDecoder::decode_json("data", log_data, obj);
 141 }
 142
 143 void rgw_mdlog_shard_data::decode_json(JSONObj *obj) {
 144   JSONDecoder::decode_json("marker", marker, obj);
 145   JSONDecoder::decode_json("truncated", truncated, obj);
 146   JSONDecoder::decode_json("entries", entries, obj);
 147 };
 148
 149 int RGWShardCollectCR::operate() {
 150   reenter(this) {
 151     while (spawn_next()) {
 152       current_running++;
 153
 154       while (current_running >= max_concurrent) {
 155         int child_ret;
 156         yield wait_for_child();
 157         if (collect_next(&child_ret)) {
 158           current_running--;
 159           if (child_ret < 0 && child_ret != -ENOENT) {
 160             ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl;
 161             status = child_ret;
 162           }
 163         }
 164       }
 165     }
 166     while (current_running > 0) {
 167       int child_ret;
 168       yield wait_for_child();
 169       if (collect_next(&child_ret)) {
 170         current_running--;
 171         if (child_ret < 0 && child_ret != -ENOENT) {
 172           ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl;
 173           status = child_ret;
 174         }
 175       }
 176     }
 177     if (status < 0) {
 178       return set_cr_error(status);
 179     }
 180     return set_cr_done();
 181   }
 182   return 0;
 183 }
 184
 185 class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR {
 186   RGWMetaSyncEnv *sync_env;
 187
 188   const std::string& period;
 189   int num_shards;
 190   map<int, RGWMetadataLogInfo> *mdlog_info;
 191
 192   int shard_id;
 193 #define READ_MDLOG_MAX_CONCURRENT 10
 194
 195 public:
 196   RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env,
 197                      const std::string& period, int _num_shards,
 198                      map<int, RGWMetadataLogInfo> *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
 199                                                                  sync_env(_sync_env),
 200                                                                  period(period), num_shards(_num_shards),
 201                                                                  mdlog_info(_mdlog_info), shard_id(0) {}
 202   bool spawn_next() override;
 203 };
 204
 205 class RGWListRemoteMDLogCR : public RGWShardCollectCR {
 206   RGWMetaSyncEnv *sync_env;
 207
 208   const std::string& period;
 209   map<int, string> shards;
 210   int max_entries_per_shard;
 211   map<int, rgw_mdlog_shard_data> *result;
 212
 213   map<int, string>::iterator iter;
 214 #define READ_MDLOG_MAX_CONCURRENT 10
 215
 216 public:
 217   RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env,
 218                      const std::string& period, map<int, string>& _shards,
 219                      int _max_entries_per_shard,
 220                      map<int, rgw_mdlog_shard_data> *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
 221                                                                  sync_env(_sync_env), period(period),
 222                                                                  max_entries_per_shard(_max_entries_per_shard),
 223                                                                  result(_result) {
 224     shards.swap(_shards);
 225     iter = shards.begin();
 226   }
 227   bool spawn_next() override;
 228 };
 229
 230 RGWRemoteMetaLog::~RGWRemoteMetaLog()
 231 {
 232   delete error_logger;
 233 }
 234
 235 int RGWRemoteMetaLog::read_log_info(rgw_mdlog_info *log_info)
 236 {
 237   rgw_http_param_pair pairs[] = { { "type", "metadata" },
 238                                   { NULL, NULL } };
 239
 240   int ret = conn->get_json_resource("/admin/log", pairs, *log_info);
 241   if (ret < 0) {
 242     ldout(store->ctx(), 0) << "ERROR: failed to fetch mdlog info" << dendl;
 243     return ret;
 244   }
 245
 246   ldout(store->ctx(), 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;
 247
 248   return 0;
 249 }
 250
 251 int RGWRemoteMetaLog::read_master_log_shards_info(const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
 252 {
 253   if (store->is_meta_master()) {
 254     return 0;
 255   }
 256
 257   rgw_mdlog_info log_info;
 258   int ret = read_log_info(&log_info);
 259   if (ret < 0) {
 260     return ret;
 261   }
 262
 263   return run(new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
 264 }
 265
 266 int RGWRemoteMetaLog::read_master_log_shards_next(const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
 267 {
 268   if (store->is_meta_master()) {
 269     return 0;
 270   }
 271
 272   return run(new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
 273 }
 274
 275 int RGWRemoteMetaLog::init()
 276 {
 277   conn = store->rest_master_conn;
 278
 279   int ret = http_manager.set_threaded();
 280   if (ret < 0) {
 281     ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
 282     return ret;
 283   }
 284
 285   error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
 286
 287   init_sync_env(&sync_env);
 288
 289   return 0;
 290 }
 291
 292 void RGWRemoteMetaLog::finish()
 293 {
 294   going_down = true;
 295   stop();
 296 }
 297
 298 #define CLONE_MAX_ENTRIES 100
 299
 300 int RGWMetaSyncStatusManager::init()
 301 {
 302   if (store->is_meta_master()) {
 303     return 0;
 304   }
 305
 306   if (!store->rest_master_conn) {
 307     lderr(store->ctx()) << "no REST connection to master zone" << dendl;
 308     return -EIO;
 309   }
 310
 311   int r = rgw_init_ioctx(store->get_rados_handle(), store->get_zone_params().log_pool, ioctx, true);
 312   if (r < 0) {
 313     lderr(store->ctx()) << "ERROR: failed to open log pool (" << store->get_zone_params().log_pool << " ret=" << r << dendl;
 314     return r;
 315   }
 316
 317   r = master_log.init();
 318   if (r < 0) {
 319     lderr(store->ctx()) << "ERROR: failed to init remote log, r=" << r << dendl;
 320     return r;
 321   }
 322
 323   RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
 324
 325   rgw_meta_sync_status sync_status;
 326   r = read_sync_status(&sync_status);
 327   if (r < 0 && r != -ENOENT) {
 328     lderr(store->ctx()) << "ERROR: failed to read sync status, r=" << r << dendl;
 329     return r;
 330   }
 331
 332   int num_shards = sync_status.sync_info.num_shards;
 333
 334   for (int i = 0; i < num_shards; i++) {
 335     shard_objs[i] = rgw_raw_obj(store->get_zone_params().log_pool, sync_env.shard_obj_name(i));
 336   }
 337
 338   RWLock::WLocker wl(ts_to_shard_lock);
 339   for (int i = 0; i < num_shards; i++) {
 340     clone_markers.push_back(string());
 341     utime_shard ut;
 342     ut.shard_id = i;
 343     ts_to_shard[ut] = i;
 344   }
 345
 346   return 0;
 347 }
 348
 349 void RGWMetaSyncEnv::init(CephContext *_cct, RGWRados *_store, RGWRESTConn *_conn,
 350                           RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
 351                           RGWSyncErrorLogger *_error_logger) {
 352   cct = _cct;
 353   store = _store;
 354   conn = _conn;
 355   async_rados = _async_rados;
 356   http_manager = _http_manager;
 357   error_logger = _error_logger;
 358 }
 359
 360 string RGWMetaSyncEnv::status_oid()
 361 {
 362   return mdlog_sync_status_oid;
 363 }
 364
 365 string RGWMetaSyncEnv::shard_obj_name(int shard_id)
 366 {
 367   char buf[mdlog_sync_status_shard_prefix.size() + 16];
 368   snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_status_shard_prefix.c_str(), shard_id);
 369
 370   return string(buf);
 371 }
 372
 373 class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
 374   RGWRados *store;
 375   RGWMetadataLog *mdlog;
 376   int shard_id;
 377   string *marker;
 378   int max_entries;
 379   list<cls_log_entry> *entries;
 380   bool *truncated;
 381
 382 protected:
 383   int _send_request() override {
 384     real_time from_time;
 385     real_time end_time;
 386
 387     void *handle;
 388
 389     mdlog->init_list_entries(shard_id, from_time, end_time, *marker, &handle);
 390
 391     int ret = mdlog->list_entries(handle, max_entries, *entries, marker, truncated);
 392
 393     mdlog->complete_list_entries(handle);
 394
 395     return ret;
 396   }
 397 public:
 398   RGWAsyncReadMDLogEntries(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
 399                            RGWMetadataLog* mdlog, int _shard_id,
 400                            string* _marker, int _max_entries,
 401                            list<cls_log_entry> *_entries, bool *_truncated)
 402     : RGWAsyncRadosRequest(caller, cn), store(_store), mdlog(mdlog),
 403       shard_id(_shard_id), marker(_marker), max_entries(_max_entries),
 404       entries(_entries), truncated(_truncated) {}
 405 };
 406
 407 class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine {
 408   RGWMetaSyncEnv *sync_env;
 409   RGWMetadataLog *const mdlog;
 410   int shard_id;
 411   string marker;
 412   string *pmarker;
 413   int max_entries;
 414   list<cls_log_entry> *entries;
 415   bool *truncated;
 416
 417   RGWAsyncReadMDLogEntries *req;
 418
 419 public:
 420   RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
 421                         int _shard_id, string*_marker, int _max_entries,
 422                         list<cls_log_entry> *_entries, bool *_truncated)
 423     : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
 424       shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries),
 425       entries(_entries), truncated(_truncated) {}
 426
 427   ~RGWReadMDLogEntriesCR() override {
 428     if (req) {
 429       req->finish();
 430     }
 431   }
 432
 433   int send_request() override {
 434     marker = *pmarker;
 435     req = new RGWAsyncReadMDLogEntries(this, stack->create_completion_notifier(),
 436                                        sync_env->store, mdlog, shard_id, &marker,
 437                                        max_entries, entries, truncated);
 438     sync_env->async_rados->queue(req);
 439     return 0;
 440   }
 441
 442   int request_complete() override {
 443     int ret = req->get_ret_status();
 444     if (ret >= 0 && !entries->empty()) {
 445      *pmarker = marker;
 446     }
 447     return req->get_ret_status();
 448   }
 449 };
 450
 451
 452 class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine {
 453   RGWMetaSyncEnv *env;
 454   RGWRESTReadResource *http_op;
 455
 456   const std::string& period;
 457   int shard_id;
 458   RGWMetadataLogInfo *shard_info;
 459
 460 public:
 461   RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period,
 462                                 int _shard_id, RGWMetadataLogInfo *_shard_info)
 463     : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL),
 464       period(period), shard_id(_shard_id), shard_info(_shard_info) {}
 465
 466   int operate() override {
 467     auto store = env->store;
 468     RGWRESTConn *conn = store->rest_master_conn;
 469     reenter(this) {
 470       yield {
 471         char buf[16];
 472         snprintf(buf, sizeof(buf), "%d", shard_id);
 473         rgw_http_param_pair pairs[] = { { "type" , "metadata" },
 474                                         { "id", buf },
 475                                         { "period", period.c_str() },
 476                                         { "info" , NULL },
 477                                         { NULL, NULL } };
 478
 479         string p = "/admin/log/";
 480
 481         http_op = new RGWRESTReadResource(conn, p, pairs, NULL,
 482                                           env->http_manager);
 483
 484         http_op->set_user_info((void *)stack);
 485
 486         int ret = http_op->aio_read();
 487         if (ret < 0) {
 488           ldout(store->ctx(), 0) << "ERROR: failed to read from " << p << dendl;
 489           log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
 490           http_op->put();
 491           return set_cr_error(ret);
 492         }
 493
 494         return io_block(0);
 495       }
 496       yield {
 497         int ret = http_op->wait(shard_info);
 498         http_op->put();
 499         if (ret < 0) {
 500           return set_cr_error(ret);
 501         }
 502         return set_cr_done();
 503       }
 504     }
 505     return 0;
 506   }
 507 };
 508
 509 class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine {
 510   RGWMetaSyncEnv *sync_env;
 511   RGWRESTReadResource *http_op;
 512
 513   const std::string& period;
 514   int shard_id;
 515   string marker;
 516   uint32_t max_entries;
 517   rgw_mdlog_shard_data *result;
 518
 519 public:
 520   RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period,
 521                             int _shard_id, const string& _marker, uint32_t _max_entries,
 522                             rgw_mdlog_shard_data *_result)
 523     : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL),
 524       period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
 525
 526   int send_request() override {
 527     RGWRESTConn *conn = sync_env->conn;
 528     RGWRados *store = sync_env->store;
 529
 530     char buf[32];
 531     snprintf(buf, sizeof(buf), "%d", shard_id);
 532
 533     char max_entries_buf[32];
 534     snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
 535
 536     const char *marker_key = (marker.empty() ? "" : "marker");
 537
 538     rgw_http_param_pair pairs[] = { { "type", "metadata" },
 539       { "id", buf },
 540       { "period", period.c_str() },
 541       { "max-entries", max_entries_buf },
 542       { marker_key, marker.c_str() },
 543       { NULL, NULL } };
 544
 545     string p = "/admin/log/";
 546
 547     http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
 548     http_op->set_user_info((void *)stack);
 549
 550     int ret = http_op->aio_read();
 551     if (ret < 0) {
 552       ldout(store->ctx(), 0) << "ERROR: failed to read from " << p << dendl;
 553       log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
 554       http_op->put();
 555       return ret;
 556     }
 557
 558     return 0;
 559   }
 560
 561   int request_complete() override {
 562     int ret = http_op->wait(result);
 563     http_op->put();
 564     if (ret < 0 && ret != -ENOENT) {
 565       ldout(sync_env->store->ctx(), 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl;
 566       return ret;
 567     }
 568     return 0;
 569   }
 570 };
 571
 572 bool RGWReadRemoteMDLogInfoCR::spawn_next() {
 573   if (shard_id >= num_shards) {
 574     return false;
 575   }
 576   spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false);
 577   shard_id++;
 578   return true;
 579 }
 580
 581 bool RGWListRemoteMDLogCR::spawn_next() {
 582   if (iter == shards.end()) {
 583     return false;
 584   }
 585
 586   spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
 587   ++iter;
 588   return true;
 589 }
 590
 591 class RGWInitSyncStatusCoroutine : public RGWCoroutine {
 592   RGWMetaSyncEnv *sync_env;
 593
 594   rgw_meta_sync_info status;
 595   vector<RGWMetadataLogInfo> shards_info;
 596   boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
 597   boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
 598 public:
 599   RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
 600                              const rgw_meta_sync_info &status)
 601     : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env),
 602       status(status), shards_info(status.num_shards),
 603       lease_cr(nullptr), lease_stack(nullptr) {}
 604
 605   ~RGWInitSyncStatusCoroutine() override {
 606     if (lease_cr) {
 607       lease_cr->abort();
 608     }
 609   }
 610
 611   int operate() override {
 612     int ret;
 613     reenter(this) {
 614       yield {
 615         set_status("acquiring sync lock");
 616         uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
 617         string lock_name = "sync_lock";
 618         RGWRados *store = sync_env->store;
 619         lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
 620                                                 rgw_raw_obj(store->get_zone_params().log_pool, sync_env->status_oid()),
 621                                                 lock_name, lock_duration, this));
 622         lease_stack.reset(spawn(lease_cr.get(), false));
 623       }
 624       while (!lease_cr->is_locked()) {
 625         if (lease_cr->is_done()) {
 626           ldout(cct, 5) << "lease cr failed, done early " << dendl;
 627           set_status("lease lock failed, early abort");
 628           return set_cr_error(lease_cr->get_ret_status());
 629         }
 630         set_sleeping(true);
 631         yield;
 632       }
 633       yield {
 634         set_status("writing sync status");
 635         RGWRados *store = sync_env->store;
 636         call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados, store,
 637                                                            rgw_raw_obj(store->get_zone_params().log_pool, sync_env->status_oid()),
 638                                                            status));
 639       }
 640
 641       if (retcode < 0) {
 642         set_status("failed to write sync status");
 643         ldout(cct, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
 644         yield lease_cr->go_down();
 645         return set_cr_error(retcode);
 646       }
 647       /* fetch current position in logs */
 648       set_status("fetching remote log position");
 649       yield {
 650         for (int i = 0; i < (int)status.num_shards; i++) {
 651           spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i,
 652                                                   &shards_info[i]), false);
 653         }
 654       }
 655
 656       drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
 657
 658       yield {
 659         set_status("updating sync status");
 660         for (int i = 0; i < (int)status.num_shards; i++) {
 661           rgw_meta_sync_marker marker;
 662           RGWMetadataLogInfo& info = shards_info[i];
 663           marker.next_step_marker = info.marker;
 664           marker.timestamp = info.last_update;
 665           RGWRados *store = sync_env->store;
 666           spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados,
 667                                                                 store,
 668                                                                 rgw_raw_obj(store->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
 669                                                                 marker), true);
 670         }
 671       }
 672       yield {
 673         set_status("changing sync state: build full sync maps");
 674         status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
 675         RGWRados *store = sync_env->store;
 676         call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados, store,
 677                                                            rgw_raw_obj(store->get_zone_params().log_pool, sync_env->status_oid()),
 678                                                            status));
 679       }
 680       set_status("drop lock lease");
 681       yield lease_cr->go_down();
 682       while (collect(&ret, NULL)) {
 683         if (ret < 0) {
 684           return set_cr_error(ret);
 685         }
 686         yield;
 687       }
 688       drain_all();
 689       return set_cr_done();
 690     }
 691     return 0;
 692   }
 693 };
 694
 695 class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
 696   static constexpr int MAX_CONCURRENT_SHARDS = 16;
 697
 698   RGWMetaSyncEnv *env;
 699   const int num_shards;
 700   int shard_id{0};
 701   map<uint32_t, rgw_meta_sync_marker>& markers;
 702
 703  public:
 704   RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
 705                              map<uint32_t, rgw_meta_sync_marker>& markers)
 706     : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
 707       env(env), num_shards(num_shards), markers(markers)
 708   {}
 709   bool spawn_next() override;
 710 };
 711
 712 bool RGWReadSyncStatusMarkersCR::spawn_next()
 713 {
 714   if (shard_id >= num_shards) {
 715     return false;
 716   }
 717   using CR = RGWSimpleRadosReadCR<rgw_meta_sync_marker>;
 718   rgw_raw_obj obj{env->store->get_zone_params().log_pool,
 719                   env->shard_obj_name(shard_id)};
 720   spawn(new CR(env->async_rados, env->store, obj, &markers[shard_id]), false);
 721   shard_id++;
 722   return true;
 723 }
 724
 725 class RGWReadSyncStatusCoroutine : public RGWCoroutine {
 726   RGWMetaSyncEnv *sync_env;
 727   rgw_meta_sync_status *sync_status;
 728
 729 public:
 730   RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
 731                              rgw_meta_sync_status *_status)
 732     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
 733   {}
 734   int operate() override;
 735 };
 736
 737 int RGWReadSyncStatusCoroutine::operate()
 738 {
 739   reenter(this) {
 740     // read sync info
 741     using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
 742     yield {
 743       bool empty_on_enoent = false; // fail on ENOENT
 744       rgw_raw_obj obj{sync_env->store->get_zone_params().log_pool,
 745                       sync_env->status_oid()};
 746       call(new ReadInfoCR(sync_env->async_rados, sync_env->store, obj,
 747                           &sync_status->sync_info, empty_on_enoent));
 748     }
 749     if (retcode < 0) {
 750       ldout(sync_env->cct, 4) << "failed to read sync status info with "
 751           << cpp_strerror(retcode) << dendl;
 752       return set_cr_error(retcode);
 753     }
 754     // read shard markers
 755     using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
 756     yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
 757                                  sync_status->sync_markers));
 758     if (retcode < 0) {
 759       ldout(sync_env->cct, 4) << "failed to read sync status markers with "
 760           << cpp_strerror(retcode) << dendl;
 761       return set_cr_error(retcode);
 762     }
 763     return set_cr_done();
 764   }
 765   return 0;
 766 }
 767
 768 class RGWFetchAllMetaCR : public RGWCoroutine {
 769   RGWMetaSyncEnv *sync_env;
 770
 771   int num_shards;
 772
 773
 774   int ret_status;
 775
 776   list<string> sections;
 777   list<string>::iterator sections_iter;
 778   list<string> result;
 779   list<string>::iterator iter;
 780
 781   std::unique_ptr<RGWShardedOmapCRManager> entries_index;
 782
 783   boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
 784   boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
 785   bool lost_lock;
 786   bool failed;
 787
 788   map<uint32_t, rgw_meta_sync_marker>& markers;
 789
 790 public:
 791   RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards,
 792                     map<uint32_t, rgw_meta_sync_marker>& _markers) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
 793                                                       num_shards(_num_shards),
 794                                                       ret_status(0), lease_cr(nullptr), lease_stack(nullptr),
 795                                                       lost_lock(false), failed(false), markers(_markers) {
 796   }
 797
 798   ~RGWFetchAllMetaCR() override {
 799   }
 800
 801   void append_section_from_set(set<string>& all_sections, const string& name) {
 802     set<string>::iterator iter = all_sections.find(name);
 803     if (iter != all_sections.end()) {
 804       sections.emplace_back(std::move(*iter));
 805       all_sections.erase(iter);
 806     }
 807   }
 808   /*
 809    * meta sync should go in the following order: user, bucket.instance, bucket
 810    * then whatever other sections exist (if any)
 811    */
 812   void rearrange_sections() {
 813     set<string> all_sections;
 814     std::move(sections.begin(), sections.end(),
 815               std::inserter(all_sections, all_sections.end()));
 816     sections.clear();
 817
 818     append_section_from_set(all_sections, "user");
 819     append_section_from_set(all_sections, "bucket.instance");
 820     append_section_from_set(all_sections, "bucket");
 821
 822     std::move(all_sections.begin(), all_sections.end(),
 823               std::back_inserter(sections));
 824   }
 825
 826   int operate() override {
 827     RGWRESTConn *conn = sync_env->conn;
 828
 829     reenter(this) {
 830       yield {
 831         set_status(string("acquiring lock (") + sync_env->status_oid() + ")");
 832         uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
 833         string lock_name = "sync_lock";
 834         lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados,
 835                                                 sync_env->store,
 836                                                 rgw_raw_obj(sync_env->store->get_zone_params().log_pool, sync_env->status_oid()),
 837                                                 lock_name, lock_duration, this));
 838         lease_stack = spawn(lease_cr.get(), false);
 839       }
 840       while (!lease_cr->is_locked()) {
 841         if (lease_cr->is_done()) {
 842           ldout(cct, 5) << "lease cr failed, done early " << dendl;
 843           set_status("failed acquiring lock");
 844           return set_cr_error(lease_cr->get_ret_status());
 845         }
 846         set_sleeping(true);
 847         yield;
 848       }
 849       entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards,
 850                                                       sync_env->store->get_zone_params().log_pool,
 851                                                       mdlog_sync_full_sync_index_prefix));
 852       yield {
 853         call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
 854                                        "/admin/metadata", NULL, &sections));
 855       }
 856       if (get_ret_status() < 0) {
 857         ldout(cct, 0) << "ERROR: failed to fetch metadata sections" << dendl;
 858         yield entries_index->finish();
 859         yield lease_cr->go_down();
 860         drain_all();
 861         return set_cr_error(get_ret_status());
 862       }
 863       rearrange_sections();
 864       sections_iter = sections.begin();
 865       for (; sections_iter != sections.end(); ++sections_iter) {
 866         yield {
 867           string entrypoint = string("/admin/metadata/") + *sections_iter;
 868           /* FIXME: need a better scaling solution here, requires streaming output */
 869           call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
 870                                        entrypoint, NULL, &result));
 871         }
 872         if (get_ret_status() < 0) {
 873           ldout(cct, 0) << "ERROR: failed to fetch metadata section: " << *sections_iter << dendl;
 874           yield entries_index->finish();
 875           yield lease_cr->go_down();
 876           drain_all();
 877           return set_cr_error(get_ret_status());
 878         }
 879         iter = result.begin();
 880         for (; iter != result.end(); ++iter) {
 881           if (!lease_cr->is_locked()) {
 882             lost_lock = true;
 883             break;
 884           }
 885           yield; // allow entries_index consumer to make progress
 886
 887           ldout(cct, 20) << "list metadata: section=" << *sections_iter << " key=" << *iter << dendl;
 888           string s = *sections_iter + ":" + *iter;
 889           int shard_id;
 890           RGWRados *store = sync_env->store;
 891           int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id);
 892           if (ret < 0) {
 893             ldout(cct, 0) << "ERROR: could not determine shard id for " << *sections_iter << ":" << *iter << dendl;
 894             ret_status = ret;
 895             break;
 896           }
 897           if (!entries_index->append(s, shard_id)) {
 898             break;
 899           }
 900         }
 901       }
 902       yield {
 903         if (!entries_index->finish()) {
 904           failed = true;
 905         }
 906       }
 907       if (!failed) {
 908         for (map<uint32_t, rgw_meta_sync_marker>::iterator iter = markers.begin(); iter != markers.end(); ++iter) {
 909           int shard_id = (int)iter->first;
 910           rgw_meta_sync_marker& marker = iter->second;
 911           marker.total_entries = entries_index->get_total_entries(shard_id);
 912           spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados, sync_env->store,
 913                                                                 rgw_raw_obj(sync_env->store->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)),
 914                                                                 marker), true);
 915         }
 916       }
 917
 918       drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
 919
 920       yield lease_cr->go_down();
 921
 922       int ret;
 923       while (collect(&ret, NULL)) {
 924         if (ret < 0) {
 925           return set_cr_error(ret);
 926         }
 927         yield;
 928       }
 929       drain_all();
 930       if (failed) {
 931         yield return set_cr_error(-EIO);
 932       }
 933       if (lost_lock) {
 934         yield return set_cr_error(-EBUSY);
 935       }
 936
 937       if (ret_status < 0) {
 938         yield return set_cr_error(ret_status);
 939       }
 940
 941       yield return set_cr_done();
 942     }
 943     return 0;
 944   }
 945 };
 946
 947 static string full_sync_index_shard_oid(int shard_id)
 948 {
 949   char buf[mdlog_sync_full_sync_index_prefix.size() + 16];
 950   snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_full_sync_index_prefix.c_str(), shard_id);
 951   return string(buf);
 952 }
 953
 954 class RGWReadRemoteMetadataCR : public RGWCoroutine {
 955   RGWMetaSyncEnv *sync_env;
 956
 957   RGWRESTReadResource *http_op;
 958
 959   string section;
 960   string key;
 961
 962   bufferlist *pbl;
 963
 964 public:
 965   RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env,
 966                                                       const string& _section, const string& _key, bufferlist *_pbl) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
 967                                                       http_op(NULL),
 968                                                       section(_section),
 969                                                       key(_key),
 970                                                       pbl(_pbl) {
 971   }
 972
 973   int operate() override {
 974     RGWRESTConn *conn = sync_env->conn;
 975     reenter(this) {
 976       yield {
 977         rgw_http_param_pair pairs[] = { { "key" , key.c_str()},
 978                                         { NULL, NULL } };
 979
 980         string p = string("/admin/metadata/") + section + "/" + key;
 981
 982         http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
 983
 984         http_op->set_user_info((void *)stack);
 985
 986         int ret = http_op->aio_read();
 987         if (ret < 0) {
 988           ldout(sync_env->cct, 0) << "ERROR: failed to fetch mdlog data" << dendl;
 989           log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
 990           http_op->put();
 991           return set_cr_error(ret);
 992         }
 993
 994         return io_block(0);
 995       }
 996       yield {
 997         int ret = http_op->wait_bl(pbl);
 998         http_op->put();
 999         if (ret < 0) {
1000           return set_cr_error(ret);
1001         }
1002         return set_cr_done();
1003       }
1004     }
1005     return 0;
1006   }
1007 };
1008
1009 class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest {
1010   RGWRados *store;
1011   string raw_key;
1012   bufferlist bl;
1013 protected:
1014   int _send_request() override {
1015     int ret = store->meta_mgr->put(raw_key, bl, RGWMetadataHandler::APPLY_ALWAYS);
1016     if (ret < 0) {
1017       ldout(store->ctx(), 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl;
1018       return ret;
1019     }
1020     return 0;
1021   }
1022 public:
1023   RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
1024                        const string& _raw_key,
1025                        bufferlist& _bl) : RGWAsyncRadosRequest(caller, cn), store(_store),
1026                                           raw_key(_raw_key), bl(_bl) {}
1027 };
1028
1029
1030 class RGWMetaStoreEntryCR : public RGWSimpleCoroutine {
1031   RGWMetaSyncEnv *sync_env;
1032   string raw_key;
1033   bufferlist bl;
1034
1035   RGWAsyncMetaStoreEntry *req;
1036
1037 public:
1038   RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env,
1039                        const string& _raw_key,
1040                        bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
1041                                           raw_key(_raw_key), bl(_bl), req(NULL) {
1042   }
1043
1044   ~RGWMetaStoreEntryCR() override {
1045     if (req) {
1046       req->finish();
1047     }
1048   }
1049
1050   int send_request() override {
1051     req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(),
1052                                    sync_env->store, raw_key, bl);
1053     sync_env->async_rados->queue(req);
1054     return 0;
1055   }
1056
1057   int request_complete() override {
1058     return req->get_ret_status();
1059   }
1060 };
1061
1062 class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest {
1063   RGWRados *store;
1064   string raw_key;
1065 protected:
1066   int _send_request() override {
1067     int ret = store->meta_mgr->remove(raw_key);
1068     if (ret < 0) {
1069       ldout(store->ctx(), 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl;
1070       return ret;
1071     }
1072     return 0;
1073   }
1074 public:
1075   RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
1076                        const string& _raw_key) : RGWAsyncRadosRequest(caller, cn), store(_store),
1077                                           raw_key(_raw_key) {}
1078 };
1079
1080
1081 class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine {
1082   RGWMetaSyncEnv *sync_env;
1083   string raw_key;
1084
1085   RGWAsyncMetaRemoveEntry *req;
1086
1087 public:
1088   RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env,
1089                        const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
1090                                           raw_key(_raw_key), req(NULL) {
1091   }
1092
1093   ~RGWMetaRemoveEntryCR() override {
1094     if (req) {
1095       req->finish();
1096     }
1097   }
1098
1099   int send_request() override {
1100     req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(),
1101                                    sync_env->store, raw_key);
1102     sync_env->async_rados->queue(req);
1103     return 0;
1104   }
1105
1106   int request_complete() override {
1107     int r = req->get_ret_status();
1108     if (r == -ENOENT) {
1109       r = 0;
1110     }
1111     return r;
1112   }
1113 };
1114
1115 #define META_SYNC_UPDATE_MARKER_WINDOW 10
1116
1117 class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
1118   RGWMetaSyncEnv *sync_env;
1119
1120   string marker_oid;
1121   rgw_meta_sync_marker sync_marker;
1122
1123
1124 public:
1125   RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env,
1126                          const string& _marker_oid,
1127                          const rgw_meta_sync_marker& _marker) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW),
1128                                                                 sync_env(_sync_env),
1129                                                                 marker_oid(_marker_oid),
1130                                                                 sync_marker(_marker) {}
1131
1132   RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
1133     sync_marker.marker = new_marker;
1134     if (index_pos > 0) {
1135       sync_marker.pos = index_pos;
1136     }
1137
1138     if (!real_clock::is_zero(timestamp)) {
1139       sync_marker.timestamp = timestamp;
1140     }
1141
1142     ldout(sync_env->cct, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
1143     RGWRados *store = sync_env->store;
1144     return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados,
1145                                                            store,
1146                                                            rgw_raw_obj(store->get_zone_params().log_pool, marker_oid),
1147                                                            sync_marker);
1148   }
1149 };
1150
1151 int RGWMetaSyncSingleEntryCR::operate() {
1152   reenter(this) {
1153 #define NUM_TRANSIENT_ERROR_RETRIES 10
1154
1155     if (error_injection &&
1156         rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) {
1157       ldout(sync_env->cct, 0) << __FILE__ << ":" << __LINE__ << ": injecting meta sync error on key=" << raw_key << dendl;
1158       return set_cr_error(-EIO);
1159     }
1160
1161     if (op_status != MDLOG_STATUS_COMPLETE) {
1162       ldout(sync_env->cct, 20) << "skipping pending operation" << dendl;
1163       yield call(marker_tracker->finish(entry_marker));
1164       if (retcode < 0) {
1165         return set_cr_error(retcode);
1166       }
1167       return set_cr_done();
1168     }
1169     for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
1170       yield {
1171         pos = raw_key.find(':');
1172         section = raw_key.substr(0, pos);
1173         key = raw_key.substr(pos + 1);
1174         ldout(sync_env->cct, 20) << "fetching remote metadata: " << section << ":" << key << (tries == 0 ? "" : " (retry)") << dendl;
1175         call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl));
1176       }
1177
1178       sync_status = retcode;
1179
1180       if (sync_status == -ENOENT) {
1181         /* FIXME: do we need to remove the entry from the local zone? */
1182         break;
1183       }
1184
1185       if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
1186         ldout(sync_env->cct, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl;
1187         continue;
1188       }
1189
1190       if (sync_status < 0) {
1191         ldout(sync_env->cct, 10) << *this << ": failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << dendl;
1192         log_error() << "failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl;
1193         yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), section, key, -sync_status,
1194                                                         string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status)));
1195         return set_cr_error(sync_status);
1196       }
1197
1198       break;
1199     }
1200
1201     retcode = 0;
1202     for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
1203       if (sync_status != -ENOENT) {
1204           yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl));
1205       } else {
1206           yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
1207       }
1208       if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
1209         ldout(sync_env->cct, 20) << *this << ": failed to store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl;
1210         continue;
1211       }
1212       break;
1213     }
1214
1215     sync_status = retcode;
1216
1217     if (sync_status == 0 && marker_tracker) {
1218       /* update marker */
1219       yield call(marker_tracker->finish(entry_marker));
1220       sync_status = retcode;
1221     }
1222     if (sync_status < 0) {
1223       return set_cr_error(sync_status);
1224     }
1225     return set_cr_done();
1226   }
1227   return 0;
1228 }
1229
1230 class RGWCloneMetaLogCoroutine : public RGWCoroutine {
1231   RGWMetaSyncEnv *sync_env;
1232   RGWMetadataLog *mdlog;
1233
1234   const std::string& period;
1235   int shard_id;
1236   string marker;
1237   bool truncated = false;
1238   string *new_marker;
1239
1240   int max_entries = CLONE_MAX_ENTRIES;
1241
1242   RGWRESTReadResource *http_op = nullptr;
1243   boost::intrusive_ptr<RGWMetadataLogInfoCompletion> completion;
1244
1245   RGWMetadataLogInfo shard_info;
1246   rgw_mdlog_shard_data data;
1247
1248 public:
1249   RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
1250                            const std::string& period, int _id,
1251                            const string& _marker, string *_new_marker)
1252     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
1253       period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) {
1254     if (new_marker) {
1255       *new_marker = marker;
1256     }
1257   }
1258   ~RGWCloneMetaLogCoroutine() override {
1259     if (http_op) {
1260       http_op->put();
1261     }
1262     if (completion) {
1263       completion->cancel();
1264     }
1265   }
1266
1267   int operate() override;
1268
1269   int state_init();
1270   int state_read_shard_status();
1271   int state_read_shard_status_complete();
1272   int state_send_rest_request();
1273   int state_receive_rest_response();
1274   int state_store_mdlog_entries();
1275   int state_store_mdlog_entries_complete();
1276 };
1277
1278 class RGWMetaSyncShardCR : public RGWCoroutine {
1279   RGWMetaSyncEnv *sync_env;
1280
1281   const rgw_pool& pool;
1282   const std::string& period; //< currently syncing period id
1283   const epoch_t realm_epoch; //< realm_epoch of period
1284   RGWMetadataLog* mdlog; //< log of syncing period
1285   uint32_t shard_id;
1286   rgw_meta_sync_marker& sync_marker;
1287   boost::optional<rgw_meta_sync_marker> temp_marker; //< for pending updates
1288   string marker;
1289   string max_marker;
1290   const std::string& period_marker; //< max marker stored in next period
1291
1292   map<string, bufferlist> entries;
1293   map<string, bufferlist>::iterator iter;
1294
1295   string oid;
1296
1297   RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr;
1298
1299   list<cls_log_entry> log_entries;
1300   list<cls_log_entry>::iterator log_iter;
1301   bool truncated = false;
1302
1303   string mdlog_marker;
1304   string raw_key;
1305   rgw_mdlog_entry mdlog_entry;
1306
1307   Mutex inc_lock;
1308   Cond inc_cond;
1309
1310   boost::asio::coroutine incremental_cr;
1311   boost::asio::coroutine full_cr;
1312
1313   boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
1314   boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
1315
1316   bool lost_lock = false;
1317
1318   bool *reset_backoff;
1319
1320   // hold a reference to the cr stack while it's in the map
1321   using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
1322   map<StackRef, string> stack_to_pos;
1323   map<string, string> pos_to_prev;
1324
1325   bool can_adjust_marker = false;
1326   bool done_with_period = false;
1327
1328   int total_entries = 0;
1329
1330 public:
1331   RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
1332                      const std::string& period, epoch_t realm_epoch,
1333                      RGWMetadataLog* mdlog, uint32_t _shard_id,
1334                      rgw_meta_sync_marker& _marker,
1335                      const std::string& period_marker, bool *_reset_backoff)
1336     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
1337       period(period), realm_epoch(realm_epoch), mdlog(mdlog),
1338       shard_id(_shard_id), sync_marker(_marker),
1339       period_marker(period_marker), inc_lock("RGWMetaSyncShardCR::inc_lock"),
1340       reset_backoff(_reset_backoff) {
1341     *reset_backoff = false;
1342   }
1343
1344   ~RGWMetaSyncShardCR() override {
1345     delete marker_tracker;
1346     if (lease_cr) {
1347       lease_cr->abort();
1348     }
1349   }
1350
1351   void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) {
1352     delete marker_tracker;
1353     marker_tracker = mt;
1354   }
1355
1356   int operate() override {
1357     int r;
1358     while (true) {
1359       switch (sync_marker.state) {
1360       case rgw_meta_sync_marker::FullSync:
1361         r  = full_sync();
1362         if (r < 0) {
1363           ldout(sync_env->cct, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl;
1364           return set_cr_error(r);
1365         }
1366         return 0;
1367       case rgw_meta_sync_marker::IncrementalSync:
1368         r  = incremental_sync();
1369         if (r < 0) {
1370           ldout(sync_env->cct, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl;
1371           return set_cr_error(r);
1372         }
1373         return 0;
1374       }
1375     }
1376     /* unreachable */
1377     return 0;
1378   }
1379
1380   void collect_children()
1381   {
1382     int child_ret;
1383     RGWCoroutinesStack *child;
1384     while (collect_next(&child_ret, &child)) {
1385       auto iter = stack_to_pos.find(child);
1386       if (iter == stack_to_pos.end()) {
1387         /* some other stack that we don't care about */
1388         continue;
1389       }
1390
1391       string& pos = iter->second;
1392
1393       if (child_ret < 0) {
1394         ldout(sync_env->cct, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
1395       }
1396
1397       map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
1398       assert(prev_iter != pos_to_prev.end());
1399
1400       /*
1401        * we should get -EAGAIN for transient errors, for which we want to retry, so we don't
1402        * update the marker and abort. We'll get called again for these. Permanent errors will be
1403        * handled by marking the entry at the error log shard, so that we retry on it separately
1404        */
1405       if (child_ret == -EAGAIN) {
1406         can_adjust_marker = false;
1407       }
1408
1409       if (pos_to_prev.size() == 1) {
1410         if (can_adjust_marker) {
1411           sync_marker.marker = pos;
1412         }
1413         pos_to_prev.erase(prev_iter);
1414       } else {
1415         assert(pos_to_prev.size() > 1);
1416         pos_to_prev.erase(prev_iter);
1417         prev_iter = pos_to_prev.begin();
1418         if (can_adjust_marker) {
1419           sync_marker.marker = prev_iter->second;
1420         }
1421       }
1422
1423       ldout(sync_env->cct, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl;
1424       stack_to_pos.erase(iter);
1425     }
1426   }
1427
1428   int full_sync() {
1429 #define OMAP_GET_MAX_ENTRIES 100
1430     int max_entries = OMAP_GET_MAX_ENTRIES;
1431     reenter(&full_cr) {
1432       set_status("full_sync");
1433       oid = full_sync_index_shard_oid(shard_id);
1434       can_adjust_marker = true;
1435       /* grab lock */
1436       yield {
1437         uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
1438         string lock_name = "sync_lock";
1439         RGWRados *store = sync_env->store;
1440         lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
1441                                                 rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1442                                                 lock_name, lock_duration, this));
1443         lease_stack.reset(spawn(lease_cr.get(), false));
1444         lost_lock = false;
1445       }
1446       while (!lease_cr->is_locked()) {
1447         if (lease_cr->is_done()) {
1448           ldout(cct, 5) << "lease cr failed, done early " << dendl;
1449           drain_all();
1450           return lease_cr->get_ret_status();
1451         }
1452         set_sleeping(true);
1453         yield;
1454       }
1455
1456       /* lock succeeded, a retry now should avoid previous backoff status */
1457       *reset_backoff = true;
1458
1459       /* prepare marker tracker */
1460       set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
1461                                                          sync_env->shard_obj_name(shard_id),
1462                                                          sync_marker));
1463
1464       marker = sync_marker.marker;
1465
1466       total_entries = sync_marker.pos;
1467
1468       /* sync! */
1469       do {
1470         if (!lease_cr->is_locked()) {
1471           lost_lock = true;
1472           break;
1473         }
1474         yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid),
1475                                              marker, &entries, max_entries));
1476         if (retcode < 0) {
1477           ldout(sync_env->cct, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl;
1478           yield lease_cr->go_down();
1479           drain_all();
1480           return retcode;
1481         }
1482         iter = entries.begin();
1483         for (; iter != entries.end(); ++iter) {
1484           ldout(sync_env->cct, 20) << __func__ << ": full sync: " << iter->first << dendl;
1485           total_entries++;
1486           if (!marker_tracker->start(iter->first, total_entries, real_time())) {
1487             ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << iter->first << ". Duplicate entry?" << dendl;
1488           } else {
1489             // fetch remote and write locally
1490             yield {
1491               RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, iter->first, iter->first, MDLOG_STATUS_COMPLETE, marker_tracker), false);
1492               // stack_to_pos holds a reference to the stack
1493               stack_to_pos[stack] = iter->first;
1494               pos_to_prev[iter->first] = marker;
1495             }
1496           }
1497           marker = iter->first;
1498         }
1499         collect_children();
1500       } while ((int)entries.size() == max_entries && can_adjust_marker);
1501
1502       while (num_spawned() > 1) {
1503         yield wait_for_child();
1504         collect_children();
1505       }
1506
1507       if (!lost_lock) {
1508         /* update marker to reflect we're done with full sync */
1509         if (can_adjust_marker) {
1510           // apply updates to a temporary marker, or operate() will send us
1511           // to incremental_sync() after we yield
1512           temp_marker = sync_marker;
1513           temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
1514           temp_marker->marker = std::move(temp_marker->next_step_marker);
1515           temp_marker->next_step_marker.clear();
1516           temp_marker->realm_epoch = realm_epoch;
1517           ldout(sync_env->cct, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
1518
1519           using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
1520           yield call(new WriteMarkerCR(sync_env->async_rados, sync_env->store,
1521                                        rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1522                                        *temp_marker));
1523         }
1524
1525         if (retcode < 0) {
1526           ldout(sync_env->cct, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl;
1527           return retcode;
1528         }
1529       }
1530
1531       /*
1532        * if we reached here, it means that lost_lock is true, otherwise the state
1533        * change in the previous block will prevent us from reaching here
1534        */
1535
1536       yield lease_cr->go_down();
1537
1538       lease_cr.reset();
1539
1540       drain_all();
1541
1542       if (!can_adjust_marker) {
1543         return -EAGAIN;
1544       }
1545
1546       if (lost_lock) {
1547         return -EBUSY;
1548       }
1549
1550       // apply the sync marker update
1551       assert(temp_marker);
1552       sync_marker = std::move(*temp_marker);
1553       temp_marker = boost::none;
1554       // must not yield after this point!
1555     }
1556     return 0;
1557   }
1558
1559
1560   int incremental_sync() {
1561     reenter(&incremental_cr) {
1562       set_status("incremental_sync");
1563       can_adjust_marker = true;
1564       /* grab lock */
1565       if (!lease_cr) { /* could have had  a lease_cr lock from previous state */
1566         yield {
1567           uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
1568           string lock_name = "sync_lock";
1569           RGWRados *store = sync_env->store;
1570           lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store,
1571                                                    rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1572                                                    lock_name, lock_duration, this));
1573           lease_stack.reset(spawn(lease_cr.get(), false));
1574           lost_lock = false;
1575         }
1576         while (!lease_cr->is_locked()) {
1577           if (lease_cr->is_done()) {
1578             ldout(cct, 5) << "lease cr failed, done early " << dendl;
1579             drain_all();
1580             return lease_cr->get_ret_status();
1581           }
1582           set_sleeping(true);
1583           yield;
1584         }
1585       }
1586       // if the period has advanced, we can't use the existing marker
1587       if (sync_marker.realm_epoch < realm_epoch) {
1588         ldout(sync_env->cct, 4) << "clearing marker=" << sync_marker.marker
1589             << " from old realm_epoch=" << sync_marker.realm_epoch
1590             << " (now " << realm_epoch << ')' << dendl;
1591         sync_marker.realm_epoch = realm_epoch;
1592         sync_marker.marker.clear();
1593       }
1594       mdlog_marker = sync_marker.marker;
1595       set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
1596                                                          sync_env->shard_obj_name(shard_id),
1597                                                          sync_marker));
1598
1599       /*
1600        * mdlog_marker: the remote sync marker positiion
1601        * sync_marker: the local sync marker position
1602        * max_marker: the max mdlog position that we fetched
1603        * marker: the current position we try to sync
1604        * period_marker: the last marker before the next period begins (optional)
1605        */
1606       marker = max_marker = sync_marker.marker;
1607       /* inc sync */
1608       do {
1609         if (!lease_cr->is_locked()) {
1610           lost_lock = true;
1611           break;
1612         }
1613 #define INCREMENTAL_MAX_ENTRIES 100
1614         ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
1615         if (!period_marker.empty() && period_marker <= mdlog_marker) {
1616           ldout(cct, 10) << "mdlog_marker past period_marker=" << period_marker << dendl;
1617           done_with_period = true;
1618           break;
1619         }
1620         if (mdlog_marker <= max_marker) {
1621           /* we're at the tip, try to bring more entries */
1622           ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl;
1623           yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog,
1624                                                   period, shard_id,
1625                                                   mdlog_marker, &mdlog_marker));
1626         }
1627         if (retcode < 0) {
1628           ldout(sync_env->cct, 10) << *this << ": failed to fetch more log entries, retcode=" << retcode << dendl;
1629           yield lease_cr->go_down();
1630           drain_all();
1631           *reset_backoff = false; // back off and try again later
1632           return retcode;
1633         }
1634         *reset_backoff = true; /* if we got to this point, all systems function */
1635         ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << dendl;
1636         if (mdlog_marker > max_marker) {
1637           marker = max_marker;
1638           yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id,
1639                                                &max_marker, INCREMENTAL_MAX_ENTRIES,
1640                                                &log_entries, &truncated));
1641           if (retcode < 0) {
1642             ldout(sync_env->cct, 10) << *this << ": failed to list mdlog entries, retcode=" << retcode << dendl;
1643             yield lease_cr->go_down();
1644             drain_all();
1645             *reset_backoff = false; // back off and try again later
1646             return retcode;
1647           }
1648           for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
1649             if (!period_marker.empty() && period_marker <= log_iter->id) {
1650               done_with_period = true;
1651               if (period_marker < log_iter->id) {
1652                 ldout(cct, 10) << "found key=" << log_iter->id
1653                     << " past period_marker=" << period_marker << dendl;
1654                 break;
1655               }
1656               ldout(cct, 10) << "found key at period_marker=" << period_marker << dendl;
1657               // sync this entry, then return control to RGWMetaSyncCR
1658             }
1659             if (!mdlog_entry.convert_from(*log_iter)) {
1660               ldout(sync_env->cct, 0) << __func__ << ":" << __LINE__ << ": ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry" << dendl;
1661               continue;
1662             }
1663             ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << dendl;
1664             if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) {
1665               ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl;
1666             } else {
1667               raw_key = log_iter->section + ":" + log_iter->name;
1668               yield {
1669                 RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker), false);
1670                 assert(stack);
1671                 // stack_to_pos holds a reference to the stack
1672                 stack_to_pos[stack] = log_iter->id;
1673                 pos_to_prev[log_iter->id] = marker;
1674               }
1675             }
1676             marker = log_iter->id;
1677           }
1678         }
1679         collect_children();
1680         ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
1681         if (done_with_period) {
1682           // return control to RGWMetaSyncCR and advance to the next period
1683           ldout(sync_env->cct, 10) << *this << ": done with period" << dendl;
1684           break;
1685         }
1686         if (mdlog_marker == max_marker && can_adjust_marker) {
1687 #define INCREMENTAL_INTERVAL 20
1688           yield wait(utime_t(INCREMENTAL_INTERVAL, 0));
1689         }
1690       } while (can_adjust_marker);
1691
1692       while (num_spawned() > 1) {
1693         yield wait_for_child();
1694         collect_children();
1695       }
1696
1697       yield lease_cr->go_down();
1698
1699       drain_all();
1700
1701       if (lost_lock) {
1702         return -EBUSY;
1703       }
1704
1705       if (!can_adjust_marker) {
1706         return -EAGAIN;
1707       }
1708
1709       return set_cr_done();
1710     }
1711     /* TODO */
1712     return 0;
1713   }
1714 };
1715
1716 class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
1717 {
1718   RGWMetaSyncEnv *sync_env;
1719
1720   const rgw_pool& pool;
1721   const std::string& period;
1722   epoch_t realm_epoch;
1723   RGWMetadataLog* mdlog;
1724   uint32_t shard_id;
1725   rgw_meta_sync_marker sync_marker;
1726   const std::string period_marker;
1727
1728   static constexpr bool exit_on_error = false; // retry on all errors
1729 public:
1730   RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
1731                             const std::string& period, epoch_t realm_epoch,
1732                             RGWMetadataLog* mdlog, uint32_t _shard_id,
1733                             const rgw_meta_sync_marker& _marker,
1734                             std::string&& period_marker)
1735     : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
1736       pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
1737       shard_id(_shard_id), sync_marker(_marker),
1738       period_marker(std::move(period_marker)) {}
1739
1740   RGWCoroutine *alloc_cr() override {
1741     return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
1742                                   shard_id, sync_marker, period_marker, backoff_ptr());
1743   }
1744
1745   RGWCoroutine *alloc_finisher_cr() override {
1746     RGWRados *store = sync_env->store;
1747     return new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->async_rados, store,
1748                                                           rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1749                                                           &sync_marker);
1750   }
1751 };
1752
1753 class RGWMetaSyncCR : public RGWCoroutine {
1754   RGWMetaSyncEnv *sync_env;
1755   const rgw_pool& pool;
1756   RGWPeriodHistory::Cursor cursor; //< sync position in period history
1757   RGWPeriodHistory::Cursor next; //< next period in history
1758   rgw_meta_sync_status sync_status;
1759
1760   std::mutex mutex; //< protect access to shard_crs
1761
1762   // TODO: it should be enough to hold a reference on the stack only, as calling
1763   // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
1764   // already completed
1765   using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
1766   using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
1767   using RefPair = std::pair<ControlCRRef, StackRef>;
1768   map<int, RefPair> shard_crs;
1769   int ret{0};
1770
1771 public:
1772   RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, RGWPeriodHistory::Cursor cursor,
1773                 const rgw_meta_sync_status& _sync_status)
1774     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
1775       pool(sync_env->store->get_zone_params().log_pool),
1776       cursor(cursor), sync_status(_sync_status) {}
1777
1778   int operate() override {
1779     reenter(this) {
1780       // loop through one period at a time
1781       for (;;) {
1782         if (cursor == sync_env->store->period_history->get_current()) {
1783           next = RGWPeriodHistory::Cursor{};
1784           if (cursor) {
1785             ldout(cct, 10) << "RGWMetaSyncCR on current period="
1786                 << cursor.get_period().get_id() << dendl;
1787           } else {
1788             ldout(cct, 10) << "RGWMetaSyncCR with no period" << dendl;
1789           }
1790         } else {
1791           next = cursor;
1792           next.next();
1793           ldout(cct, 10) << "RGWMetaSyncCR on period="
1794               << cursor.get_period().get_id() << ", next="
1795               << next.get_period().get_id() << dendl;
1796         }
1797
1798         yield {
1799           // get the mdlog for the current period (may be empty)
1800           auto& period_id = sync_status.sync_info.period;
1801           auto realm_epoch = sync_status.sync_info.realm_epoch;
1802           auto mdlog = sync_env->store->meta_mgr->get_log(period_id);
1803
1804           // prevent wakeup() from accessing shard_crs while we're spawning them
1805           std::lock_guard<std::mutex> lock(mutex);
1806
1807           // sync this period on each shard
1808           for (const auto& m : sync_status.sync_markers) {
1809             uint32_t shard_id = m.first;
1810             auto& marker = m.second;
1811
1812             std::string period_marker;
1813             if (next) {
1814               // read the maximum marker from the next period's sync status
1815               period_marker = next.get_period().get_sync_status()[shard_id];
1816               if (period_marker.empty()) {
1817                 // no metadata changes have occurred on this shard, skip it
1818                 ldout(cct, 10) << "RGWMetaSyncCR: skipping shard " << shard_id
1819                     << " with empty period marker" << dendl;
1820                 continue;
1821               }
1822             }
1823
1824             using ShardCR = RGWMetaSyncShardControlCR;
1825             auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
1826                                   mdlog, shard_id, marker,
1827                                   std::move(period_marker));
1828             auto stack = spawn(cr, false);
1829             shard_crs[shard_id] = RefPair{cr, stack};
1830           }
1831         }
1832         // wait for each shard to complete
1833         while (ret == 0 && num_spawned() > 0) {
1834           yield wait_for_child();
1835           collect(&ret, nullptr);
1836         }
1837         drain_all();
1838         {
1839           // drop shard cr refs under lock
1840           std::lock_guard<std::mutex> lock(mutex);
1841           shard_crs.clear();
1842         }
1843         if (ret < 0) {
1844           return set_cr_error(ret);
1845         }
1846         // advance to the next period
1847         assert(next);
1848         cursor = next;
1849
1850         // write the updated sync info
1851         sync_status.sync_info.period = cursor.get_period().get_id();
1852         sync_status.sync_info.realm_epoch = cursor.get_epoch();
1853         yield call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados,
1854                                                                  sync_env->store,
1855                                                                  rgw_raw_obj(pool, sync_env->status_oid()),
1856                                                                  sync_status.sync_info));
1857       }
1858     }
1859     return 0;
1860   }
1861
1862   void wakeup(int shard_id) {
1863     std::lock_guard<std::mutex> lock(mutex);
1864     auto iter = shard_crs.find(shard_id);
1865     if (iter == shard_crs.end()) {
1866       return;
1867     }
1868     iter->second.first->wakeup();
1869   }
1870 };
1871
1872 void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
1873   env->cct = store->ctx();
1874   env->store = store;
1875   env->conn = conn;
1876   env->async_rados = async_rados;
1877   env->http_manager = &http_manager;
1878   env->error_logger = error_logger;
1879 }
1880
1881 int RGWRemoteMetaLog::read_sync_status(rgw_meta_sync_status *sync_status)
1882 {
1883   if (store->is_meta_master()) {
1884     return 0;
1885   }
1886   // cannot run concurrently with run_sync(), so run in a separate manager
1887   RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
1888   RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
1889   int ret = http_manager.set_threaded();
1890   if (ret < 0) {
1891     ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
1892     return ret;
1893   }
1894   RGWMetaSyncEnv sync_env_local = sync_env;
1895   sync_env_local.http_manager = &http_manager;
1896   ret = crs.run(new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
1897   http_manager.stop();
1898   return ret;
1899 }
1900
1901 int RGWRemoteMetaLog::init_sync_status()
1902 {
1903   if (store->is_meta_master()) {
1904     return 0;
1905   }
1906
1907   rgw_mdlog_info mdlog_info;
1908   int r = read_log_info(&mdlog_info);
1909   if (r < 0) {
1910     lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
1911     return r;
1912   }
1913
1914   rgw_meta_sync_info sync_info;
1915   sync_info.num_shards = mdlog_info.num_shards;
1916   auto cursor = store->period_history->get_current();
1917   if (cursor) {
1918     sync_info.period = cursor.get_period().get_id();
1919     sync_info.realm_epoch = cursor.get_epoch();
1920   }
1921
1922   return run(new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
1923 }
1924
1925 int RGWRemoteMetaLog::store_sync_info(const rgw_meta_sync_info& sync_info)
1926 {
1927   return run(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(async_rados, store,
1928                                                            rgw_raw_obj(store->get_zone_params().log_pool, sync_env.status_oid()),
1929                                                            sync_info));
1930 }
1931
1932 // return a cursor to the period at our sync position
1933 static RGWPeriodHistory::Cursor get_period_at(RGWRados* store,
1934                                               const rgw_meta_sync_info& info)
1935 {
1936   if (info.period.empty()) {
1937     // return an empty cursor with error=0
1938     return RGWPeriodHistory::Cursor{};
1939   }
1940
1941   // look for an existing period in our history
1942   auto cursor = store->period_history->lookup(info.realm_epoch);
1943   if (cursor) {
1944     // verify that the period ids match
1945     auto& existing = cursor.get_period().get_id();
1946     if (existing != info.period) {
1947       lderr(store->ctx()) << "ERROR: sync status period=" << info.period
1948           << " does not match period=" << existing
1949           << " in history at realm epoch=" << info.realm_epoch << dendl;
1950       return RGWPeriodHistory::Cursor{-EEXIST};
1951     }
1952     return cursor;
1953   }
1954
1955   // read the period from rados or pull it from the master
1956   RGWPeriod period;
1957   int r = store->period_puller->pull(info.period, period);
1958   if (r < 0) {
1959     lderr(store->ctx()) << "ERROR: failed to read period id "
1960         << info.period << ": " << cpp_strerror(r) << dendl;
1961     return RGWPeriodHistory::Cursor{r};
1962   }
1963   // attach the period to our history
1964   cursor = store->period_history->attach(std::move(period));
1965   if (!cursor) {
1966     r = cursor.get_error();
1967     lderr(store->ctx()) << "ERROR: failed to read period history back to "
1968         << info.period << ": " << cpp_strerror(r) << dendl;
1969   }
1970   return cursor;
1971 }
1972
1973 int RGWRemoteMetaLog::run_sync()
1974 {
1975   if (store->is_meta_master()) {
1976     return 0;
1977   }
1978
1979   int r = 0;
1980
1981   // get shard count and oldest log period from master
1982   rgw_mdlog_info mdlog_info;
1983   for (;;) {
1984     if (going_down) {
1985       ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
1986       return 0;
1987     }
1988     r = read_log_info(&mdlog_info);
1989     if (r == -EIO || r == -ENOENT) {
1990       // keep retrying if master isn't alive or hasn't initialized the log
1991       ldout(store->ctx(), 10) << __func__ << "(): waiting for master.." << dendl;
1992       backoff.backoff_sleep();
1993       continue;
1994     }
1995     backoff.reset();
1996     if (r < 0) {
1997       lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
1998       return r;
1999     }
2000     break;
2001   }
2002
2003   rgw_meta_sync_status sync_status;
2004   do {
2005     if (going_down) {
2006       ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
2007       return 0;
2008     }
2009     r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
2010     if (r < 0 && r != -ENOENT) {
2011       ldout(store->ctx(), 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
2012       return r;
2013     }
2014
2015     if (!mdlog_info.period.empty()) {
2016       // restart sync if the remote has a period, but:
2017       // a) our status does not, or
2018       // b) our sync period comes before the remote's oldest log period
2019       if (sync_status.sync_info.period.empty() ||
2020           sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) {
2021         sync_status.sync_info.state = rgw_meta_sync_info::StateInit;
2022         ldout(store->ctx(), 1) << "epoch=" << sync_status.sync_info.realm_epoch
2023            << " in sync status comes before remote's oldest mdlog epoch="
2024            << mdlog_info.realm_epoch << ", restarting sync" << dendl;
2025       }
2026     }
2027
2028     if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) {
2029       ldout(store->ctx(), 20) << __func__ << "(): init" << dendl;
2030       sync_status.sync_info.num_shards = mdlog_info.num_shards;
2031       auto cursor = store->period_history->get_current();
2032       if (cursor) {
2033         // run full sync, then start incremental from the current period/epoch
2034         sync_status.sync_info.period = cursor.get_period().get_id();
2035         sync_status.sync_info.realm_epoch = cursor.get_epoch();
2036       }
2037       r = run(new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info));
2038       if (r == -EBUSY) {
2039         backoff.backoff_sleep();
2040         continue;
2041       }
2042       backoff.reset();
2043       if (r < 0) {
2044         ldout(store->ctx(), 0) << "ERROR: failed to init sync status r=" << r << dendl;
2045         return r;
2046       }
2047     }
2048   } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit);
2049
2050   auto num_shards = sync_status.sync_info.num_shards;
2051   if (num_shards != mdlog_info.num_shards) {
2052     lderr(store->ctx()) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
2053     return -EINVAL;
2054   }
2055
2056   RGWPeriodHistory::Cursor cursor;
2057   do {
2058     r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
2059     if (r < 0 && r != -ENOENT) {
2060       ldout(store->ctx(), 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
2061       return r;
2062     }
2063
2064     switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) {
2065       case rgw_meta_sync_info::StateBuildingFullSyncMaps:
2066         ldout(store->ctx(), 20) << __func__ << "(): building full sync maps" << dendl;
2067         r = run(new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers));
2068         if (r == -EBUSY || r == -EAGAIN) {
2069           backoff.backoff_sleep();
2070           continue;
2071         }
2072         backoff.reset();
2073         if (r < 0) {
2074           ldout(store->ctx(), 0) << "ERROR: failed to fetch all metadata keys" << dendl;
2075           return r;
2076         }
2077
2078         sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
2079         r = store_sync_info(sync_status.sync_info);
2080         if (r < 0) {
2081           ldout(store->ctx(), 0) << "ERROR: failed to update sync status" << dendl;
2082           return r;
2083         }
2084         /* fall through */
2085       case rgw_meta_sync_info::StateSync:
2086         ldout(store->ctx(), 20) << __func__ << "(): sync" << dendl;
2087         // find our position in the period history (if any)
2088         cursor = get_period_at(store, sync_status.sync_info);
2089         r = cursor.get_error();
2090         if (r < 0) {
2091           return r;
2092         }
2093         meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status);
2094         r = run(meta_sync_cr);
2095         if (r < 0) {
2096           ldout(store->ctx(), 0) << "ERROR: failed to fetch all metadata keys" << dendl;
2097           return r;
2098         }
2099         break;
2100       default:
2101         ldout(store->ctx(), 0) << "ERROR: bad sync state!" << dendl;
2102         return -EIO;
2103     }
2104   } while (!going_down);
2105
2106   return 0;
2107 }
2108
2109 void RGWRemoteMetaLog::wakeup(int shard_id)
2110 {
2111   if (!meta_sync_cr) {
2112     return;
2113   }
2114   meta_sync_cr->wakeup(shard_id);
2115 }
2116
2117 int RGWCloneMetaLogCoroutine::operate()
2118 {
2119   reenter(this) {
2120     do {
2121       yield {
2122         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl;
2123         return state_init();
2124       }
2125       yield {
2126         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl;
2127         return state_read_shard_status();
2128       }
2129       yield {
2130         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl;
2131         return state_read_shard_status_complete();
2132       }
2133       yield {
2134         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl;
2135         return state_send_rest_request();
2136       }
2137       yield {
2138         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl;
2139         return state_receive_rest_response();
2140       }
2141       yield {
2142         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl;
2143         return state_store_mdlog_entries();
2144       }
2145     } while (truncated);
2146     yield {
2147       ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl;
2148       return state_store_mdlog_entries_complete();
2149     }
2150   }
2151
2152   return 0;
2153 }
2154
2155 int RGWCloneMetaLogCoroutine::state_init()
2156 {
2157   data = rgw_mdlog_shard_data();
2158
2159   return 0;
2160 }
2161
2162 int RGWCloneMetaLogCoroutine::state_read_shard_status()
2163 {
2164   const bool add_ref = false; // default constructs with refs=1
2165
2166   completion.reset(new RGWMetadataLogInfoCompletion(
2167     [this](int ret, const cls_log_header& header) {
2168       if (ret < 0) {
2169         ldout(cct, 1) << "ERROR: failed to read mdlog info with "
2170             << cpp_strerror(ret) << dendl;
2171       } else {
2172         shard_info.marker = header.max_marker;
2173         shard_info.last_update = header.max_time.to_real_time();
2174       }
2175       // wake up parent stack
2176       stack->get_completion_mgr()->complete(nullptr, stack);
2177     }), add_ref);
2178
2179   int ret = mdlog->get_info_async(shard_id, completion.get());
2180   if (ret < 0) {
2181     ldout(cct, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl;
2182     return set_cr_error(ret);
2183   }
2184
2185   return io_block(0);
2186 }
2187
2188 int RGWCloneMetaLogCoroutine::state_read_shard_status_complete()
2189 {
2190   completion.reset();
2191
2192   ldout(cct, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl;
2193
2194   marker = shard_info.marker;
2195
2196   return 0;
2197 }
2198
2199 int RGWCloneMetaLogCoroutine::state_send_rest_request()
2200 {
2201   RGWRESTConn *conn = sync_env->conn;
2202
2203   char buf[32];
2204   snprintf(buf, sizeof(buf), "%d", shard_id);
2205
2206   char max_entries_buf[32];
2207   snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries);
2208
2209   const char *marker_key = (marker.empty() ? "" : "marker");
2210
2211   rgw_http_param_pair pairs[] = { { "type", "metadata" },
2212                                   { "id", buf },
2213                                   { "period", period.c_str() },
2214                                   { "max-entries", max_entries_buf },
2215                                   { marker_key, marker.c_str() },
2216                                   { NULL, NULL } };
2217
2218   http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager);
2219
2220   http_op->set_user_info((void *)stack);
2221
2222   int ret = http_op->aio_read();
2223   if (ret < 0) {
2224     ldout(cct, 0) << "ERROR: failed to fetch mdlog data" << dendl;
2225     log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
2226     http_op->put();
2227     http_op = NULL;
2228     return ret;
2229   }
2230
2231   return io_block(0);
2232 }
2233
2234 int RGWCloneMetaLogCoroutine::state_receive_rest_response()
2235 {
2236   int ret = http_op->wait(&data);
2237   if (ret < 0) {
2238     error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl;
2239     ldout(cct, 5) << "failed to wait for op, ret=" << ret << dendl;
2240     http_op->put();
2241     http_op = NULL;
2242     return set_cr_error(ret);
2243   }
2244   http_op->put();
2245   http_op = NULL;
2246
2247   ldout(cct, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl;
2248
2249   truncated = ((int)data.entries.size() == max_entries);
2250
2251   if (data.entries.empty()) {
2252     if (new_marker) {
2253       *new_marker = marker;
2254     }
2255     return set_cr_done();
2256   }
2257
2258   if (new_marker) {
2259     *new_marker = data.entries.back().id;
2260   }
2261
2262   return 0;
2263 }
2264
2265
2266 int RGWCloneMetaLogCoroutine::state_store_mdlog_entries()
2267 {
2268   list<cls_log_entry> dest_entries;
2269
2270   vector<rgw_mdlog_entry>::iterator iter;
2271   for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) {
2272     rgw_mdlog_entry& entry = *iter;
2273     ldout(cct, 20) << "entry: name=" << entry.name << dendl;
2274
2275     cls_log_entry dest_entry;
2276     dest_entry.id = entry.id;
2277     dest_entry.section = entry.section;
2278     dest_entry.name = entry.name;
2279     dest_entry.timestamp = utime_t(entry.timestamp);
2280
2281     ::encode(entry.log_data, dest_entry.data);
2282
2283     dest_entries.push_back(dest_entry);
2284
2285     marker = entry.id;
2286   }
2287
2288   RGWAioCompletionNotifier *cn = stack->create_completion_notifier();
2289
2290   int ret = mdlog->store_entries_in_shard(dest_entries, shard_id, cn->completion());
2291   if (ret < 0) {
2292     cn->put();
2293     ldout(cct, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl;
2294     return set_cr_error(ret);
2295   }
2296   return io_block(0);
2297 }
2298
2299 int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
2300 {
2301   return set_cr_done();
2302 }
2303
2304
2305 // TODO: move into rgw_sync_trim.cc
2306 #undef dout_prefix
2307 #define dout_prefix (*_dout << "meta trim: ")
2308
2309 /// purge all log shards for the given mdlog
2310 class PurgeLogShardsCR : public RGWShardCollectCR {
2311   RGWRados *const store;
2312   const RGWMetadataLog* mdlog;
2313   const int num_shards;
2314   rgw_raw_obj obj;
2315   int i{0};
2316
2317   static constexpr int max_concurrent = 16;
2318
2319  public:
2320   PurgeLogShardsCR(RGWRados *store, const RGWMetadataLog* mdlog,
2321                    const rgw_pool& pool, int num_shards)
2322     : RGWShardCollectCR(store->ctx(), max_concurrent),
2323       store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
2324   {}
2325
2326   bool spawn_next() override {
2327     if (i == num_shards) {
2328       return false;
2329     }
2330     mdlog->get_shard_oid(i++, obj.oid);
2331     spawn(new RGWRadosRemoveCR(store, obj), false);
2332     return true;
2333   }
2334 };
2335
2336 using Cursor = RGWPeriodHistory::Cursor;
2337
2338 /// purge mdlogs from the oldest up to (but not including) the given realm_epoch
2339 class PurgePeriodLogsCR : public RGWCoroutine {
2340   RGWRados *const store;
2341   RGWMetadataManager *const metadata;
2342   RGWObjVersionTracker objv;
2343   Cursor cursor;
2344   epoch_t realm_epoch;
2345   epoch_t *last_trim_epoch; //< update last trim on success
2346
2347  public:
2348   PurgePeriodLogsCR(RGWRados *store, epoch_t realm_epoch, epoch_t *last_trim)
2349     : RGWCoroutine(store->ctx()), store(store), metadata(store->meta_mgr),
2350       realm_epoch(realm_epoch), last_trim_epoch(last_trim)
2351   {}
2352
2353   int operate();
2354 };
2355
2356 int PurgePeriodLogsCR::operate()
2357 {
2358   reenter(this) {
2359     // read our current oldest log period
2360     yield call(metadata->read_oldest_log_period_cr(&cursor, &objv));
2361     if (retcode < 0) {
2362       return set_cr_error(retcode);
2363     }
2364     assert(cursor);
2365     ldout(cct, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
2366         << " period=" << cursor.get_period().get_id() << dendl;
2367
2368     // trim -up to- the given realm_epoch
2369     while (cursor.get_epoch() < realm_epoch) {
2370       ldout(cct, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
2371           << " period=" << cursor.get_period().get_id() << dendl;
2372       yield {
2373         const auto mdlog = metadata->get_log(cursor.get_period().get_id());
2374         const auto& pool = store->get_zone_params().log_pool;
2375         auto num_shards = cct->_conf->rgw_md_log_max_shards;
2376         call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
2377       }
2378       if (retcode < 0) {
2379         ldout(cct, 1) << "failed to remove log shards: "
2380             << cpp_strerror(retcode) << dendl;
2381         return set_cr_error(retcode);
2382       }
2383       ldout(cct, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
2384           << " period=" << cursor.get_period().get_id() << dendl;
2385
2386       // update our mdlog history
2387       yield call(metadata->trim_log_period_cr(cursor, &objv));
2388       if (retcode == -ENOENT) {
2389         // must have raced to update mdlog history. return success and allow the
2390         // winner to continue purging
2391         ldout(cct, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
2392             << " period=" << cursor.get_period().get_id() << dendl;
2393         return set_cr_done();
2394       } else if (retcode < 0) {
2395         ldout(cct, 1) << "failed to remove log shards for realm_epoch="
2396             << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
2397             << " with: " << cpp_strerror(retcode) << dendl;
2398         return set_cr_error(retcode);
2399       }
2400
2401       if (*last_trim_epoch < cursor.get_epoch()) {
2402         *last_trim_epoch = cursor.get_epoch();
2403       }
2404
2405       assert(cursor.has_next()); // get_current() should always come after
2406       cursor.next();
2407     }
2408     return set_cr_done();
2409   }
2410   return 0;
2411 }
2412
2413 namespace {
2414
2415 using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
2416
2417 /// construct a RGWRESTConn for each zone in the realm
2418 template <typename Zonegroups>
2419 connection_map make_peer_connections(RGWRados *store,
2420                                      const Zonegroups& zonegroups)
2421 {
2422   connection_map connections;
2423   for (auto& g : zonegroups) {
2424     for (auto& z : g.second.zones) {
2425       std::unique_ptr<RGWRESTConn> conn{
2426         new RGWRESTConn(store->ctx(), store, z.first, z.second.endpoints)};
2427       connections.emplace(z.first, std::move(conn));
2428     }
2429   }
2430   return connections;
2431 }
2432
2433 /// return the marker that it's safe to trim up to
2434 const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
2435 {
2436   return m.state == m.FullSync ? m.next_step_marker : m.marker;
2437 }
2438
2439 /// comparison operator for take_min_status()
2440 bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
2441 {
2442   // sort by stable marker
2443   return get_stable_marker(lhs) < get_stable_marker(rhs);
2444 }
2445
2446 /// populate the status with the minimum stable marker of each shard for any
2447 /// peer whose realm_epoch matches the minimum realm_epoch in the input
2448 template <typename Iter>
2449 int take_min_status(CephContext *cct, Iter first, Iter last,
2450                     rgw_meta_sync_status *status)
2451 {
2452   if (first == last) {
2453     return -EINVAL;
2454   }
2455   const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
2456
2457   status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
2458   for (auto p = first; p != last; ++p) {
2459     // validate peer's shard count
2460     if (p->sync_markers.size() != num_shards) {
2461       ldout(cct, 1) << "take_min_status got peer status with "
2462           << p->sync_markers.size() << " shards, expected "
2463           << num_shards << dendl;
2464       return -EINVAL;
2465     }
2466     if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
2467       // earlier epoch, take its entire status
2468       *status = std::move(*p);
2469     } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
2470       // same epoch, take any earlier markers
2471       auto m = status->sync_markers.begin();
2472       for (auto& shard : p->sync_markers) {
2473         if (shard.second < m->second) {
2474           m->second = std::move(shard.second);
2475         }
2476         ++m;
2477       }
2478     }
2479   }
2480   return 0;
2481 }
2482
2483 struct TrimEnv {
2484   RGWRados *const store;
2485   RGWHTTPManager *const http;
2486   int num_shards;
2487   const std::string& zone;
2488   Cursor current; //< cursor to current period
2489   epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
2490
2491   TrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
2492     : store(store), http(http), num_shards(num_shards),
2493       zone(store->get_zone_params().get_id()),
2494       current(store->period_history->get_current())
2495   {}
2496 };
2497
2498 struct MasterTrimEnv : public TrimEnv {
2499   connection_map connections; //< peer connections
2500   std::vector<rgw_meta_sync_status> peer_status; //< sync status for each peer
2501   /// last trim marker for each shard, only applies to current period's mdlog
2502   std::vector<std::string> last_trim_markers;
2503
2504   MasterTrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
2505     : TrimEnv(store, http, num_shards),
2506       last_trim_markers(num_shards)
2507   {
2508     auto& period = current.get_period();
2509     connections = make_peer_connections(store, period.get_map().zonegroups);
2510     connections.erase(zone);
2511     peer_status.resize(connections.size());
2512   }
2513 };
2514
2515 struct PeerTrimEnv : public TrimEnv {
2516   /// last trim timestamp for each shard, only applies to current period's mdlog
2517   std::vector<ceph::real_time> last_trim_timestamps;
2518
2519   PeerTrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
2520     : TrimEnv(store, http, num_shards),
2521       last_trim_timestamps(num_shards)
2522   {}
2523
2524   void set_num_shards(int num_shards) {
2525     this->num_shards = num_shards;
2526     last_trim_timestamps.resize(num_shards);
2527   }
2528 };
2529
2530 } // anonymous namespace
2531
2532
2533 /// spawn a trim cr for each shard that needs it, while limiting the number
2534 /// of concurrent shards
2535 class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
2536  private:
2537   static constexpr int MAX_CONCURRENT_SHARDS = 16;
2538
2539   MasterTrimEnv& env;
2540   RGWMetadataLog *mdlog;
2541   int shard_id{0};
2542   std::string oid;
2543   const rgw_meta_sync_status& sync_status;
2544
2545  public:
2546   MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
2547                                const rgw_meta_sync_status& sync_status)
2548     : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
2549       env(env), mdlog(mdlog), sync_status(sync_status)
2550   {}
2551
2552   bool spawn_next() override;
2553 };
2554
2555 bool MetaMasterTrimShardCollectCR::spawn_next()
2556 {
2557   while (shard_id < env.num_shards) {
2558     auto m = sync_status.sync_markers.find(shard_id);
2559     if (m == sync_status.sync_markers.end()) {
2560       shard_id++;
2561       continue;
2562     }
2563     auto& stable = get_stable_marker(m->second);
2564     auto& last_trim = env.last_trim_markers[shard_id];
2565
2566     if (stable <= last_trim) {
2567       // already trimmed
2568       ldout(cct, 20) << "skipping log shard " << shard_id
2569           << " at marker=" << stable
2570           << " last_trim=" << last_trim
2571           << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
2572       shard_id++;
2573       continue;
2574     }
2575
2576     mdlog->get_shard_oid(shard_id, oid);
2577
2578     ldout(cct, 10) << "trimming log shard " << shard_id
2579         << " at marker=" << stable
2580         << " last_trim=" << last_trim
2581         << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
2582     spawn(new RGWSyncLogTrimCR(env.store, oid, stable, &last_trim), false);
2583     shard_id++;
2584     return true;
2585   }
2586   return false;
2587 }
2588
2589 /// spawn rest requests to read each peer's sync status
2590 class MetaMasterStatusCollectCR : public RGWShardCollectCR {
2591   static constexpr int MAX_CONCURRENT_SHARDS = 16;
2592
2593   MasterTrimEnv& env;
2594   connection_map::iterator c;
2595   std::vector<rgw_meta_sync_status>::iterator s;
2596  public:
2597   MetaMasterStatusCollectCR(MasterTrimEnv& env)
2598     : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
2599       env(env), c(env.connections.begin()), s(env.peer_status.begin())
2600   {}
2601
2602   bool spawn_next() override {
2603     if (c == env.connections.end()) {
2604       return false;
2605     }
2606     static rgw_http_param_pair params[] = {
2607       { "type", "metadata" },
2608       { "status", nullptr },
2609       { nullptr, nullptr }
2610     };
2611
2612     ldout(cct, 20) << "query sync status from " << c->first << dendl;
2613     auto conn = c->second.get();
2614     using StatusCR = RGWReadRESTResourceCR<rgw_meta_sync_status>;
2615     spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s),
2616           false);
2617     ++c;
2618     ++s;
2619     return true;
2620   }
2621 };
2622
2623 class MetaMasterTrimCR : public RGWCoroutine {
2624   MasterTrimEnv& env;
2625   rgw_meta_sync_status min_status; //< minimum sync status of all peers
2626   int ret{0};
2627
2628  public:
2629   MetaMasterTrimCR(MasterTrimEnv& env)
2630     : RGWCoroutine(env.store->ctx()), env(env)
2631   {}
2632
2633   int operate();
2634 };
2635
2636 int MetaMasterTrimCR::operate()
2637 {
2638   reenter(this) {
2639     // TODO: detect this and fail before we spawn the trim thread?
2640     if (env.connections.empty()) {
2641       ldout(cct, 4) << "no peers, exiting" << dendl;
2642       return set_cr_done();
2643     }
2644
2645     ldout(cct, 10) << "fetching sync status for zone " << env.zone << dendl;
2646     // query mdlog sync status from peers
2647     yield call(new MetaMasterStatusCollectCR(env));
2648
2649     // must get a successful reply from all peers to consider trimming
2650     if (ret < 0) {
2651       ldout(cct, 4) << "failed to fetch sync status from all peers" << dendl;
2652       return set_cr_error(ret);
2653     }
2654
2655     // determine the minimum epoch and markers
2656     ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
2657                           env.peer_status.end(), &min_status);
2658     if (ret < 0) {
2659       ldout(cct, 4) << "failed to calculate min sync status from peers" << dendl;
2660       return set_cr_error(ret);
2661     }
2662     yield {
2663       auto store = env.store;
2664       auto epoch = min_status.sync_info.realm_epoch;
2665       ldout(cct, 4) << "realm epoch min=" << epoch
2666           << " current=" << env.current.get_epoch()<< dendl;
2667       if (epoch > env.last_trim_epoch + 1) {
2668         // delete any prior mdlog periods
2669         spawn(new PurgePeriodLogsCR(store, epoch, &env.last_trim_epoch), true);
2670       } else {
2671         ldout(cct, 10) << "mdlogs already purged up to realm_epoch "
2672             << env.last_trim_epoch << dendl;
2673       }
2674
2675       // if realm_epoch == current, trim mdlog based on markers
2676       if (epoch == env.current.get_epoch()) {
2677         auto mdlog = store->meta_mgr->get_log(env.current.get_period().get_id());
2678         spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
2679       }
2680     }
2681     // ignore any errors during purge/trim because we want to hold the lock open
2682     return set_cr_done();
2683   }
2684   return 0;
2685 }
2686
2687
2688 /// read the first entry of the master's mdlog shard and trim to that position
2689 class MetaPeerTrimShardCR : public RGWCoroutine {
2690   RGWMetaSyncEnv& env;
2691   RGWMetadataLog *mdlog;
2692   const std::string& period_id;
2693   const int shard_id;
2694   RGWMetadataLogInfo info;
2695   ceph::real_time stable; //< safe timestamp to trim, according to master
2696   ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
2697   rgw_mdlog_shard_data result; //< result from master's mdlog listing
2698
2699  public:
2700   MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
2701                       const std::string& period_id, int shard_id,
2702                       ceph::real_time *last_trim)
2703     : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
2704       period_id(period_id), shard_id(shard_id), last_trim(last_trim)
2705   {}
2706
2707   int operate() override;
2708 };
2709
2710 int MetaPeerTrimShardCR::operate()
2711 {
2712   reenter(this) {
2713     // query master's first mdlog entry for this shard
2714     yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
2715                                              "", 1, &result));
2716     if (retcode < 0) {
2717       ldout(cct, 5) << "failed to read first entry from master's mdlog shard "
2718           << shard_id << " for period " << period_id
2719           << ": " << cpp_strerror(retcode) << dendl;
2720       return set_cr_error(retcode);
2721     }
2722     if (result.entries.empty()) {
2723       // if there are no mdlog entries, we don't have a timestamp to compare. we
2724       // can't just trim everything, because there could be racing updates since
2725       // this empty reply. query the mdlog shard info to read its max timestamp,
2726       // then retry the listing to make sure it's still empty before trimming to
2727       // that
2728       ldout(cct, 10) << "empty master mdlog shard " << shard_id
2729           << ", reading last timestamp from shard info" << dendl;
2730       // read the mdlog shard info for the last timestamp
2731       using ShardInfoCR = RGWReadRemoteMDLogShardInfoCR;
2732       yield call(new ShardInfoCR(&env, period_id, shard_id, &info));
2733       if (retcode < 0) {
2734         ldout(cct, 5) << "failed to read info from master's mdlog shard "
2735             << shard_id << " for period " << period_id
2736             << ": " << cpp_strerror(retcode) << dendl;
2737         return set_cr_error(retcode);
2738       }
2739       if (ceph::real_clock::is_zero(info.last_update)) {
2740         return set_cr_done(); // nothing to trim
2741       }
2742       ldout(cct, 10) << "got mdlog shard info with last update="
2743           << info.last_update << dendl;
2744       // re-read the master's first mdlog entry to make sure it hasn't changed
2745       yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
2746                                                "", 1, &result));
2747       if (retcode < 0) {
2748         ldout(cct, 5) << "failed to read first entry from master's mdlog shard "
2749             << shard_id << " for period " << period_id
2750             << ": " << cpp_strerror(retcode) << dendl;
2751         return set_cr_error(retcode);
2752       }
2753       // if the mdlog is still empty, trim to max marker
2754       if (result.entries.empty()) {
2755         stable = info.last_update;
2756       } else {
2757         stable = result.entries.front().timestamp;
2758
2759         // can only trim -up to- master's first timestamp, so subtract a second.
2760         // (this is why we use timestamps instead of markers for the peers)
2761         stable -= std::chrono::seconds(1);
2762       }
2763     } else {
2764       stable = result.entries.front().timestamp;
2765       stable -= std::chrono::seconds(1);
2766     }
2767
2768     if (stable <= *last_trim) {
2769       ldout(cct, 10) << "skipping log shard " << shard_id
2770           << " at timestamp=" << stable
2771           << " last_trim=" << *last_trim << dendl;
2772       return set_cr_done();
2773     }
2774
2775     ldout(cct, 10) << "trimming log shard " << shard_id
2776         << " at timestamp=" << stable
2777         << " last_trim=" << *last_trim << dendl;
2778     yield {
2779       std::string oid;
2780       mdlog->get_shard_oid(shard_id, oid);
2781       call(new RGWRadosTimelogTrimCR(env.store, oid, real_time{}, stable, "", ""));
2782     }
2783     if (retcode < 0 && retcode != -ENODATA) {
2784       ldout(cct, 1) << "failed to trim mdlog shard " << shard_id
2785           << ": " << cpp_strerror(retcode) << dendl;
2786       return set_cr_error(retcode);
2787     }
2788     *last_trim = stable;
2789     return set_cr_done();
2790   }
2791   return 0;
2792 }
2793
2794 class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
2795   static constexpr int MAX_CONCURRENT_SHARDS = 16;
2796
2797   PeerTrimEnv& env;
2798   RGWMetadataLog *mdlog;
2799   const std::string& period_id;
2800   RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
2801   int shard_id{0};
2802
2803  public:
2804   MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
2805     : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
2806       env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
2807   {
2808     meta_env.init(cct, env.store, env.store->rest_master_conn,
2809                   env.store->get_async_rados(), env.http, nullptr);
2810   }
2811
2812   bool spawn_next() override;
2813 };
2814
2815 bool MetaPeerTrimShardCollectCR::spawn_next()
2816 {
2817   if (shard_id >= env.num_shards) {
2818     return false;
2819   }
2820   auto& last_trim = env.last_trim_timestamps[shard_id];
2821   spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
2822         false);
2823   shard_id++;
2824   return true;
2825 }
2826
2827 class MetaPeerTrimCR : public RGWCoroutine {
2828   PeerTrimEnv& env;
2829   rgw_mdlog_info mdlog_info; //< master's mdlog info
2830
2831  public:
2832   MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}
2833
2834   int operate();
2835 };
2836
2837 int MetaPeerTrimCR::operate()
2838 {
2839   reenter(this) {
2840     ldout(cct, 10) << "fetching master mdlog info" << dendl;
2841     yield {
2842       // query mdlog_info from master for oldest_log_period
2843       rgw_http_param_pair params[] = {
2844         { "type", "metadata" },
2845         { nullptr, nullptr }
2846       };
2847
2848       using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
2849       call(new LogInfoCR(cct, env.store->rest_master_conn, env.http,
2850                          "/admin/log/", params, &mdlog_info));
2851     }
2852     if (retcode < 0) {
2853       ldout(cct, 4) << "failed to read mdlog info from master" << dendl;
2854       return set_cr_error(retcode);
2855     }
2856     // use master's shard count instead
2857     env.set_num_shards(mdlog_info.num_shards);
2858
2859     if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
2860       // delete any prior mdlog periods
2861       yield call(new PurgePeriodLogsCR(env.store, mdlog_info.realm_epoch,
2862                                        &env.last_trim_epoch));
2863     } else {
2864       ldout(cct, 10) << "mdlogs already purged through realm_epoch "
2865           << env.last_trim_epoch << dendl;
2866     }
2867
2868     // if realm_epoch == current, trim mdlog based on master's markers
2869     if (mdlog_info.realm_epoch == env.current.get_epoch()) {
2870       yield {
2871         auto meta_mgr = env.store->meta_mgr;
2872         auto mdlog = meta_mgr->get_log(env.current.get_period().get_id());
2873         call(new MetaPeerTrimShardCollectCR(env, mdlog));
2874         // ignore any errors during purge/trim because we want to hold the lock open
2875       }
2876     }
2877     return set_cr_done();
2878   }
2879   return 0;
2880 }
2881
2882 class MetaTrimPollCR : public RGWCoroutine {
2883   RGWRados *const store;
2884   const utime_t interval; //< polling interval
2885   const rgw_raw_obj obj;
2886   const std::string name{"meta_trim"}; //< lock name
2887   const std::string cookie;
2888
2889  protected:
2890   /// allocate the coroutine to run within the lease
2891   virtual RGWCoroutine* alloc_cr() = 0;
2892
2893  public:
2894   MetaTrimPollCR(RGWRados *store, utime_t interval)
2895     : RGWCoroutine(store->ctx()), store(store), interval(interval),
2896       obj(store->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
2897       cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
2898   {}
2899
2900   int operate();
2901 };
2902
2903 int MetaTrimPollCR::operate()
2904 {
2905   reenter(this) {
2906     for (;;) {
2907       set_status("sleeping");
2908       wait(interval);
2909
2910       // prevent others from trimming for our entire wait interval
2911       set_status("acquiring trim lock");
2912       yield call(new RGWSimpleRadosLockCR(store->get_async_rados(), store,
2913                                           obj, name, cookie, interval.sec()));
2914       if (retcode < 0) {
2915         ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
2916         continue;
2917       }
2918
2919       set_status("trimming");
2920       yield call(alloc_cr());
2921
2922       if (retcode < 0) {
2923         // on errors, unlock so other gateways can try
2924         set_status("unlocking");
2925         yield call(new RGWSimpleRadosUnlockCR(store->get_async_rados(), store,
2926                                               obj, name, cookie));
2927       }
2928     }
2929   }
2930   return 0;
2931 }
2932
2933 class MetaMasterTrimPollCR : public MetaTrimPollCR  {
2934   MasterTrimEnv env; //< trim state to share between calls
2935   RGWCoroutine* alloc_cr() override {
2936     return new MetaMasterTrimCR(env);
2937   }
2938  public:
2939   MetaMasterTrimPollCR(RGWRados *store, RGWHTTPManager *http,
2940                        int num_shards, utime_t interval)
2941     : MetaTrimPollCR(store, interval),
2942       env(store, http, num_shards)
2943   {}
2944 };
2945
2946 class MetaPeerTrimPollCR : public MetaTrimPollCR {
2947   PeerTrimEnv env; //< trim state to share between calls
2948   RGWCoroutine* alloc_cr() override {
2949     return new MetaPeerTrimCR(env);
2950   }
2951  public:
2952   MetaPeerTrimPollCR(RGWRados *store, RGWHTTPManager *http,
2953                      int num_shards, utime_t interval)
2954     : MetaTrimPollCR(store, interval),
2955       env(store, http, num_shards)
2956   {}
2957 };
2958
2959 RGWCoroutine* create_meta_log_trim_cr(RGWRados *store, RGWHTTPManager *http,
2960                                       int num_shards, utime_t interval)
2961 {
2962   if (store->is_meta_master()) {
2963     return new MetaMasterTrimPollCR(store, http, num_shards, interval);
2964   }
2965   return new MetaPeerTrimPollCR(store, http, num_shards, interval);
2966 }
2967
2968
2969 struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
2970   MetaMasterAdminTrimCR(RGWRados *store, RGWHTTPManager *http, int num_shards)
2971     : MasterTrimEnv(store, http, num_shards),
2972       MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
2973   {}
2974 };
2975
2976 struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
2977   MetaPeerAdminTrimCR(RGWRados *store, RGWHTTPManager *http, int num_shards)
2978     : PeerTrimEnv(store, http, num_shards),
2979       MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
2980   {}
2981 };
2982
2983 RGWCoroutine* create_admin_meta_log_trim_cr(RGWRados *store,
2984                                             RGWHTTPManager *http,
2985                                             int num_shards)
2986 {
2987   if (store->is_meta_master()) {
2988     return new MetaMasterAdminTrimCR(store, http, num_shards);
2989   }
2990   return new MetaPeerAdminTrimCR(store, http, num_shards);
2991 }