ceph/src/rgw/rgw_sync.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3
   4 #include <boost/optional.hpp>
   5
   6 #include "common/ceph_json.h"
   7 #include "common/RWLock.h"
   8 #include "common/RefCountedObj.h"
   9 #include "common/WorkQueue.h"
  10 #include "common/Throttle.h"
  11 #include "common/admin_socket.h"
  12 #include "common/errno.h"
  13
  14 #include "rgw_common.h"
  15 #include "rgw_rados.h"
  16 #include "rgw_sync.h"
  17 #include "rgw_metadata.h"
  18 #include "rgw_rest_conn.h"
  19 #include "rgw_tools.h"
  20 #include "rgw_cr_rados.h"
  21 #include "rgw_cr_rest.h"
  22 #include "rgw_http_client.h"
  23
  24 #include "cls/lock/cls_lock_client.h"
  25
  26 #include <boost/asio/yield.hpp>
  27
  28 #define dout_subsys ceph_subsys_rgw
  29
  30 #undef dout_prefix
  31 #define dout_prefix (*_dout << "meta sync: ")
  32
  33 static string mdlog_sync_status_oid = "mdlog.sync-status";
  34 static string mdlog_sync_status_shard_prefix = "mdlog.sync-status.shard";
  35 static string mdlog_sync_full_sync_index_prefix = "meta.full-sync.index";
  36
  37 RGWSyncErrorLogger::RGWSyncErrorLogger(RGWRados *_store, const string &oid_prefix, int _num_shards) : store(_store), num_shards(_num_shards) {
  38   for (int i = 0; i < num_shards; i++) {
  39     oids.push_back(get_shard_oid(oid_prefix, i));
  40   }
  41 }
  42 string RGWSyncErrorLogger::get_shard_oid(const string& oid_prefix, int shard_id) {
  43   char buf[oid_prefix.size() + 16];
  44   snprintf(buf, sizeof(buf), "%s.%d", oid_prefix.c_str(), shard_id);
  45   return string(buf);
  46 }
  47
  48 RGWCoroutine *RGWSyncErrorLogger::log_error_cr(const string& source_zone, const string& section, const string& name, uint32_t error_code, const string& message) {
  49   cls_log_entry entry;
  50
  51   rgw_sync_error_info info(source_zone, error_code, message);
  52   bufferlist bl;
  53   ::encode(info, bl);
  54   store->time_log_prepare_entry(entry, real_clock::now(), section, name, bl);
  55
  56   uint32_t shard_id = ++counter % num_shards;
  57
  58
  59   return new RGWRadosTimelogAddCR(store, oids[shard_id], entry);
  60 }
  61
  62 void RGWSyncBackoff::update_wait_time()
  63 {
  64   if (cur_wait == 0) {
  65     cur_wait = 1;
  66   } else {
  67     cur_wait = (cur_wait << 1);
  68   }
  69   if (cur_wait >= max_secs) {
  70     cur_wait = max_secs;
  71   }
  72 }
  73
  74 void RGWSyncBackoff::backoff_sleep()
  75 {
  76   update_wait_time();
  77   sleep(cur_wait);
  78 }
  79
  80 void RGWSyncBackoff::backoff(RGWCoroutine *op)
  81 {
  82   update_wait_time();
  83   op->wait(utime_t(cur_wait, 0));
  84 }
  85
  86 int RGWBackoffControlCR::operate() {
  87   reenter(this) {
  88     // retry the operation until it succeeds
  89     while (true) {
  90       yield {
  91         Mutex::Locker l(lock);
  92         cr = alloc_cr();
  93         cr->get();
  94         call(cr);
  95       }
  96       {
  97         Mutex::Locker l(lock);
  98         cr->put();
  99         cr = NULL;
 100       }
 101       if (retcode >= 0) {
 102         break;
 103       }
 104       if (retcode != -EBUSY && retcode != -EAGAIN) {
 105         ldout(cct, 0) << "ERROR: RGWBackoffControlCR called coroutine returned " << retcode << dendl;
 106         if (exit_on_error) {
 107           return set_cr_error(retcode);
 108         }
 109       }
 110       if (reset_backoff) {
 111         backoff.reset();
 112       }
 113       yield backoff.backoff(this);
 114     }
 115
 116     // run an optional finisher
 117     yield call(alloc_finisher_cr());
 118     if (retcode < 0) {
 119       ldout(cct, 0) << "ERROR: call to finisher_cr() failed: retcode=" << retcode << dendl;
 120       return set_cr_error(retcode);
 121     }
 122     return set_cr_done();
 123   }
 124   return 0;
 125 }
 126
 127 void rgw_mdlog_info::decode_json(JSONObj *obj) {
 128   JSONDecoder::decode_json("num_objects", num_shards, obj);
 129   JSONDecoder::decode_json("period", period, obj);
 130   JSONDecoder::decode_json("realm_epoch", realm_epoch, obj);
 131 }
 132
 133 void rgw_mdlog_entry::decode_json(JSONObj *obj) {
 134   JSONDecoder::decode_json("id", id, obj);
 135   JSONDecoder::decode_json("section", section, obj);
 136   JSONDecoder::decode_json("name", name, obj);
 137   utime_t ut;
 138   JSONDecoder::decode_json("timestamp", ut, obj);
 139   timestamp = ut.to_real_time();
 140   JSONDecoder::decode_json("data", log_data, obj);
 141 }
 142
 143 void rgw_mdlog_shard_data::decode_json(JSONObj *obj) {
 144   JSONDecoder::decode_json("marker", marker, obj);
 145   JSONDecoder::decode_json("truncated", truncated, obj);
 146   JSONDecoder::decode_json("entries", entries, obj);
 147 };
 148
 149 int RGWShardCollectCR::operate() {
 150   reenter(this) {
 151     while (spawn_next()) {
 152       current_running++;
 153
 154       while (current_running >= max_concurrent) {
 155         int child_ret;
 156         yield wait_for_child();
 157         if (collect_next(&child_ret)) {
 158           current_running--;
 159           if (child_ret < 0 && child_ret != -ENOENT) {
 160             ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl;
 161             status = child_ret;
 162           }
 163         }
 164       }
 165     }
 166     while (current_running > 0) {
 167       int child_ret;
 168       yield wait_for_child();
 169       if (collect_next(&child_ret)) {
 170         current_running--;
 171         if (child_ret < 0 && child_ret != -ENOENT) {
 172           ldout(cct, 10) << __func__ << ": failed to fetch log status, ret=" << child_ret << dendl;
 173           status = child_ret;
 174         }
 175       }
 176     }
 177     if (status < 0) {
 178       return set_cr_error(status);
 179     }
 180     return set_cr_done();
 181   }
 182   return 0;
 183 }
 184
 185 class RGWReadRemoteMDLogInfoCR : public RGWShardCollectCR {
 186   RGWMetaSyncEnv *sync_env;
 187
 188   const std::string& period;
 189   int num_shards;
 190   map<int, RGWMetadataLogInfo> *mdlog_info;
 191
 192   int shard_id;
 193 #define READ_MDLOG_MAX_CONCURRENT 10
 194
 195 public:
 196   RGWReadRemoteMDLogInfoCR(RGWMetaSyncEnv *_sync_env,
 197                      const std::string& period, int _num_shards,
 198                      map<int, RGWMetadataLogInfo> *_mdlog_info) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
 199                                                                  sync_env(_sync_env),
 200                                                                  period(period), num_shards(_num_shards),
 201                                                                  mdlog_info(_mdlog_info), shard_id(0) {}
 202   bool spawn_next() override;
 203 };
 204
 205 class RGWListRemoteMDLogCR : public RGWShardCollectCR {
 206   RGWMetaSyncEnv *sync_env;
 207
 208   const std::string& period;
 209   map<int, string> shards;
 210   int max_entries_per_shard;
 211   map<int, rgw_mdlog_shard_data> *result;
 212
 213   map<int, string>::iterator iter;
 214 #define READ_MDLOG_MAX_CONCURRENT 10
 215
 216 public:
 217   RGWListRemoteMDLogCR(RGWMetaSyncEnv *_sync_env,
 218                      const std::string& period, map<int, string>& _shards,
 219                      int _max_entries_per_shard,
 220                      map<int, rgw_mdlog_shard_data> *_result) : RGWShardCollectCR(_sync_env->cct, READ_MDLOG_MAX_CONCURRENT),
 221                                                                  sync_env(_sync_env), period(period),
 222                                                                  max_entries_per_shard(_max_entries_per_shard),
 223                                                                  result(_result) {
 224     shards.swap(_shards);
 225     iter = shards.begin();
 226   }
 227   bool spawn_next() override;
 228 };
 229
 230 RGWRemoteMetaLog::~RGWRemoteMetaLog()
 231 {
 232   delete error_logger;
 233 }
 234
 235 int RGWRemoteMetaLog::read_log_info(rgw_mdlog_info *log_info)
 236 {
 237   rgw_http_param_pair pairs[] = { { "type", "metadata" },
 238                                   { NULL, NULL } };
 239
 240   int ret = conn->get_json_resource("/admin/log", pairs, *log_info);
 241   if (ret < 0) {
 242     ldout(store->ctx(), 0) << "ERROR: failed to fetch mdlog info" << dendl;
 243     return ret;
 244   }
 245
 246   ldout(store->ctx(), 20) << "remote mdlog, num_shards=" << log_info->num_shards << dendl;
 247
 248   return 0;
 249 }
 250
 251 int RGWRemoteMetaLog::read_master_log_shards_info(const string &master_period, map<int, RGWMetadataLogInfo> *shards_info)
 252 {
 253   if (store->is_meta_master()) {
 254     return 0;
 255   }
 256
 257   rgw_mdlog_info log_info;
 258   int ret = read_log_info(&log_info);
 259   if (ret < 0) {
 260     return ret;
 261   }
 262
 263   return run(new RGWReadRemoteMDLogInfoCR(&sync_env, master_period, log_info.num_shards, shards_info));
 264 }
 265
 266 int RGWRemoteMetaLog::read_master_log_shards_next(const string& period, map<int, string> shard_markers, map<int, rgw_mdlog_shard_data> *result)
 267 {
 268   if (store->is_meta_master()) {
 269     return 0;
 270   }
 271
 272   return run(new RGWListRemoteMDLogCR(&sync_env, period, shard_markers, 1, result));
 273 }
 274
 275 int RGWRemoteMetaLog::init()
 276 {
 277   conn = store->rest_master_conn;
 278
 279   int ret = http_manager.set_threaded();
 280   if (ret < 0) {
 281     ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
 282     return ret;
 283   }
 284
 285   error_logger = new RGWSyncErrorLogger(store, RGW_SYNC_ERROR_LOG_SHARD_PREFIX, ERROR_LOGGER_SHARDS);
 286
 287   init_sync_env(&sync_env);
 288
 289   return 0;
 290 }
 291
 292 void RGWRemoteMetaLog::finish()
 293 {
 294   going_down = true;
 295   stop();
 296 }
 297
 298 #define CLONE_MAX_ENTRIES 100
 299
 300 int RGWMetaSyncStatusManager::init()
 301 {
 302   if (store->is_meta_master()) {
 303     return 0;
 304   }
 305
 306   if (!store->rest_master_conn) {
 307     lderr(store->ctx()) << "no REST connection to master zone" << dendl;
 308     return -EIO;
 309   }
 310
 311   int r = rgw_init_ioctx(store->get_rados_handle(), store->get_zone_params().log_pool, ioctx, true);
 312   if (r < 0) {
 313     lderr(store->ctx()) << "ERROR: failed to open log pool (" << store->get_zone_params().log_pool << " ret=" << r << dendl;
 314     return r;
 315   }
 316
 317   r = master_log.init();
 318   if (r < 0) {
 319     lderr(store->ctx()) << "ERROR: failed to init remote log, r=" << r << dendl;
 320     return r;
 321   }
 322
 323   RGWMetaSyncEnv& sync_env = master_log.get_sync_env();
 324
 325   rgw_meta_sync_status sync_status;
 326   r = read_sync_status(&sync_status);
 327   if (r < 0 && r != -ENOENT) {
 328     lderr(store->ctx()) << "ERROR: failed to read sync status, r=" << r << dendl;
 329     return r;
 330   }
 331
 332   int num_shards = sync_status.sync_info.num_shards;
 333
 334   for (int i = 0; i < num_shards; i++) {
 335     shard_objs[i] = rgw_raw_obj(store->get_zone_params().log_pool, sync_env.shard_obj_name(i));
 336   }
 337
 338   RWLock::WLocker wl(ts_to_shard_lock);
 339   for (int i = 0; i < num_shards; i++) {
 340     clone_markers.push_back(string());
 341     utime_shard ut;
 342     ut.shard_id = i;
 343     ts_to_shard[ut] = i;
 344   }
 345
 346   return 0;
 347 }
 348
 349 void RGWMetaSyncEnv::init(CephContext *_cct, RGWRados *_store, RGWRESTConn *_conn,
 350                           RGWAsyncRadosProcessor *_async_rados, RGWHTTPManager *_http_manager,
 351                           RGWSyncErrorLogger *_error_logger) {
 352   cct = _cct;
 353   store = _store;
 354   conn = _conn;
 355   async_rados = _async_rados;
 356   http_manager = _http_manager;
 357   error_logger = _error_logger;
 358 }
 359
 360 string RGWMetaSyncEnv::status_oid()
 361 {
 362   return mdlog_sync_status_oid;
 363 }
 364
 365 string RGWMetaSyncEnv::shard_obj_name(int shard_id)
 366 {
 367   char buf[mdlog_sync_status_shard_prefix.size() + 16];
 368   snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_status_shard_prefix.c_str(), shard_id);
 369
 370   return string(buf);
 371 }
 372
 373 class RGWAsyncReadMDLogEntries : public RGWAsyncRadosRequest {
 374   RGWRados *store;
 375   RGWMetadataLog *mdlog;
 376   int shard_id;
 377   string *marker;
 378   int max_entries;
 379   list<cls_log_entry> *entries;
 380   bool *truncated;
 381
 382 protected:
 383   int _send_request() override {
 384     real_time from_time;
 385     real_time end_time;
 386
 387     void *handle;
 388
 389     mdlog->init_list_entries(shard_id, from_time, end_time, *marker, &handle);
 390
 391     int ret = mdlog->list_entries(handle, max_entries, *entries, marker, truncated);
 392
 393     mdlog->complete_list_entries(handle);
 394
 395     return ret;
 396   }
 397 public:
 398   RGWAsyncReadMDLogEntries(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
 399                            RGWMetadataLog* mdlog, int _shard_id,
 400                            string* _marker, int _max_entries,
 401                            list<cls_log_entry> *_entries, bool *_truncated)
 402     : RGWAsyncRadosRequest(caller, cn), store(_store), mdlog(mdlog),
 403       shard_id(_shard_id), marker(_marker), max_entries(_max_entries),
 404       entries(_entries), truncated(_truncated) {}
 405 };
 406
 407 class RGWReadMDLogEntriesCR : public RGWSimpleCoroutine {
 408   RGWMetaSyncEnv *sync_env;
 409   RGWMetadataLog *const mdlog;
 410   int shard_id;
 411   string marker;
 412   string *pmarker;
 413   int max_entries;
 414   list<cls_log_entry> *entries;
 415   bool *truncated;
 416
 417   RGWAsyncReadMDLogEntries *req{nullptr};
 418
 419 public:
 420   RGWReadMDLogEntriesCR(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
 421                         int _shard_id, string*_marker, int _max_entries,
 422                         list<cls_log_entry> *_entries, bool *_truncated)
 423     : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
 424       shard_id(_shard_id), pmarker(_marker), max_entries(_max_entries),
 425       entries(_entries), truncated(_truncated) {}
 426
 427   ~RGWReadMDLogEntriesCR() override {
 428     if (req) {
 429       req->finish();
 430     }
 431   }
 432
 433   int send_request() override {
 434     marker = *pmarker;
 435     req = new RGWAsyncReadMDLogEntries(this, stack->create_completion_notifier(),
 436                                        sync_env->store, mdlog, shard_id, &marker,
 437                                        max_entries, entries, truncated);
 438     sync_env->async_rados->queue(req);
 439     return 0;
 440   }
 441
 442   int request_complete() override {
 443     int ret = req->get_ret_status();
 444     if (ret >= 0 && !entries->empty()) {
 445      *pmarker = marker;
 446     }
 447     return req->get_ret_status();
 448   }
 449 };
 450
 451
 452 class RGWReadRemoteMDLogShardInfoCR : public RGWCoroutine {
 453   RGWMetaSyncEnv *env;
 454   RGWRESTReadResource *http_op;
 455
 456   const std::string& period;
 457   int shard_id;
 458   RGWMetadataLogInfo *shard_info;
 459
 460 public:
 461   RGWReadRemoteMDLogShardInfoCR(RGWMetaSyncEnv *env, const std::string& period,
 462                                 int _shard_id, RGWMetadataLogInfo *_shard_info)
 463     : RGWCoroutine(env->store->ctx()), env(env), http_op(NULL),
 464       period(period), shard_id(_shard_id), shard_info(_shard_info) {}
 465
 466   int operate() override {
 467     auto store = env->store;
 468     RGWRESTConn *conn = store->rest_master_conn;
 469     reenter(this) {
 470       yield {
 471         char buf[16];
 472         snprintf(buf, sizeof(buf), "%d", shard_id);
 473         rgw_http_param_pair pairs[] = { { "type" , "metadata" },
 474                                         { "id", buf },
 475                                         { "period", period.c_str() },
 476                                         { "info" , NULL },
 477                                         { NULL, NULL } };
 478
 479         string p = "/admin/log/";
 480
 481         http_op = new RGWRESTReadResource(conn, p, pairs, NULL,
 482                                           env->http_manager);
 483
 484         http_op->set_user_info((void *)stack);
 485
 486         int ret = http_op->aio_read();
 487         if (ret < 0) {
 488           ldout(store->ctx(), 0) << "ERROR: failed to read from " << p << dendl;
 489           log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
 490           http_op->put();
 491           return set_cr_error(ret);
 492         }
 493
 494         return io_block(0);
 495       }
 496       yield {
 497         int ret = http_op->wait(shard_info);
 498         http_op->put();
 499         if (ret < 0) {
 500           return set_cr_error(ret);
 501         }
 502         return set_cr_done();
 503       }
 504     }
 505     return 0;
 506   }
 507 };
 508
 509 class RGWListRemoteMDLogShardCR : public RGWSimpleCoroutine {
 510   RGWMetaSyncEnv *sync_env;
 511   RGWRESTReadResource *http_op;
 512
 513   const std::string& period;
 514   int shard_id;
 515   string marker;
 516   uint32_t max_entries;
 517   rgw_mdlog_shard_data *result;
 518
 519 public:
 520   RGWListRemoteMDLogShardCR(RGWMetaSyncEnv *env, const std::string& period,
 521                             int _shard_id, const string& _marker, uint32_t _max_entries,
 522                             rgw_mdlog_shard_data *_result)
 523     : RGWSimpleCoroutine(env->store->ctx()), sync_env(env), http_op(NULL),
 524       period(period), shard_id(_shard_id), marker(_marker), max_entries(_max_entries), result(_result) {}
 525
 526   int send_request() override {
 527     RGWRESTConn *conn = sync_env->conn;
 528     RGWRados *store = sync_env->store;
 529
 530     char buf[32];
 531     snprintf(buf, sizeof(buf), "%d", shard_id);
 532
 533     char max_entries_buf[32];
 534     snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", (int)max_entries);
 535
 536     const char *marker_key = (marker.empty() ? "" : "marker");
 537
 538     rgw_http_param_pair pairs[] = { { "type", "metadata" },
 539       { "id", buf },
 540       { "period", period.c_str() },
 541       { "max-entries", max_entries_buf },
 542       { marker_key, marker.c_str() },
 543       { NULL, NULL } };
 544
 545     string p = "/admin/log/";
 546
 547     http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
 548     http_op->set_user_info((void *)stack);
 549
 550     int ret = http_op->aio_read();
 551     if (ret < 0) {
 552       ldout(store->ctx(), 0) << "ERROR: failed to read from " << p << dendl;
 553       log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
 554       http_op->put();
 555       return ret;
 556     }
 557
 558     return 0;
 559   }
 560
 561   int request_complete() override {
 562     int ret = http_op->wait(result);
 563     http_op->put();
 564     if (ret < 0 && ret != -ENOENT) {
 565       ldout(sync_env->store->ctx(), 0) << "ERROR: failed to list remote mdlog shard, ret=" << ret << dendl;
 566       return ret;
 567     }
 568     return 0;
 569   }
 570 };
 571
 572 bool RGWReadRemoteMDLogInfoCR::spawn_next() {
 573   if (shard_id >= num_shards) {
 574     return false;
 575   }
 576   spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, period, shard_id, &(*mdlog_info)[shard_id]), false);
 577   shard_id++;
 578   return true;
 579 }
 580
 581 bool RGWListRemoteMDLogCR::spawn_next() {
 582   if (iter == shards.end()) {
 583     return false;
 584   }
 585
 586   spawn(new RGWListRemoteMDLogShardCR(sync_env, period, iter->first, iter->second, max_entries_per_shard, &(*result)[iter->first]), false);
 587   ++iter;
 588   return true;
 589 }
 590
 591 class RGWInitSyncStatusCoroutine : public RGWCoroutine {
 592   RGWMetaSyncEnv *sync_env;
 593
 594   rgw_meta_sync_info status;
 595   vector<RGWMetadataLogInfo> shards_info;
 596   boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
 597   boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
 598 public:
 599   RGWInitSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
 600                              const rgw_meta_sync_info &status)
 601     : RGWCoroutine(_sync_env->store->ctx()), sync_env(_sync_env),
 602       status(status), shards_info(status.num_shards),
 603       lease_cr(nullptr), lease_stack(nullptr) {}
 604
 605   ~RGWInitSyncStatusCoroutine() override {
 606     if (lease_cr) {
 607       lease_cr->abort();
 608     }
 609   }
 610
 611   int operate() override {
 612     int ret;
 613     reenter(this) {
 614       yield {
 615         set_status("acquiring sync lock");
 616         uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
 617         string lock_name = "sync_lock";
 618         RGWRados *store = sync_env->store;
 619         lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
 620                                                 rgw_raw_obj(store->get_zone_params().log_pool, sync_env->status_oid()),
 621                                                 lock_name, lock_duration, this));
 622         lease_stack.reset(spawn(lease_cr.get(), false));
 623       }
 624       while (!lease_cr->is_locked()) {
 625         if (lease_cr->is_done()) {
 626           ldout(cct, 5) << "lease cr failed, done early " << dendl;
 627           set_status("lease lock failed, early abort");
 628           return set_cr_error(lease_cr->get_ret_status());
 629         }
 630         set_sleeping(true);
 631         yield;
 632       }
 633       yield {
 634         set_status("writing sync status");
 635         RGWRados *store = sync_env->store;
 636         call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados, store,
 637                                                            rgw_raw_obj(store->get_zone_params().log_pool, sync_env->status_oid()),
 638                                                            status));
 639       }
 640
 641       if (retcode < 0) {
 642         set_status("failed to write sync status");
 643         ldout(cct, 0) << "ERROR: failed to write sync status, retcode=" << retcode << dendl;
 644         yield lease_cr->go_down();
 645         return set_cr_error(retcode);
 646       }
 647       /* fetch current position in logs */
 648       set_status("fetching remote log position");
 649       yield {
 650         for (int i = 0; i < (int)status.num_shards; i++) {
 651           spawn(new RGWReadRemoteMDLogShardInfoCR(sync_env, status.period, i,
 652                                                   &shards_info[i]), false);
 653         }
 654       }
 655
 656       drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
 657
 658       yield {
 659         set_status("updating sync status");
 660         for (int i = 0; i < (int)status.num_shards; i++) {
 661           rgw_meta_sync_marker marker;
 662           RGWMetadataLogInfo& info = shards_info[i];
 663           marker.next_step_marker = info.marker;
 664           marker.timestamp = info.last_update;
 665           RGWRados *store = sync_env->store;
 666           spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados,
 667                                                                 store,
 668                                                                 rgw_raw_obj(store->get_zone_params().log_pool, sync_env->shard_obj_name(i)),
 669                                                                 marker), true);
 670         }
 671       }
 672       yield {
 673         set_status("changing sync state: build full sync maps");
 674         status.state = rgw_meta_sync_info::StateBuildingFullSyncMaps;
 675         RGWRados *store = sync_env->store;
 676         call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados, store,
 677                                                            rgw_raw_obj(store->get_zone_params().log_pool, sync_env->status_oid()),
 678                                                            status));
 679       }
 680       set_status("drop lock lease");
 681       yield lease_cr->go_down();
 682       while (collect(&ret, NULL)) {
 683         if (ret < 0) {
 684           return set_cr_error(ret);
 685         }
 686         yield;
 687       }
 688       drain_all();
 689       return set_cr_done();
 690     }
 691     return 0;
 692   }
 693 };
 694
 695 class RGWReadSyncStatusMarkersCR : public RGWShardCollectCR {
 696   static constexpr int MAX_CONCURRENT_SHARDS = 16;
 697
 698   RGWMetaSyncEnv *env;
 699   const int num_shards;
 700   int shard_id{0};
 701   map<uint32_t, rgw_meta_sync_marker>& markers;
 702
 703  public:
 704   RGWReadSyncStatusMarkersCR(RGWMetaSyncEnv *env, int num_shards,
 705                              map<uint32_t, rgw_meta_sync_marker>& markers)
 706     : RGWShardCollectCR(env->cct, MAX_CONCURRENT_SHARDS),
 707       env(env), num_shards(num_shards), markers(markers)
 708   {}
 709   bool spawn_next() override;
 710 };
 711
 712 bool RGWReadSyncStatusMarkersCR::spawn_next()
 713 {
 714   if (shard_id >= num_shards) {
 715     return false;
 716   }
 717   using CR = RGWSimpleRadosReadCR<rgw_meta_sync_marker>;
 718   rgw_raw_obj obj{env->store->get_zone_params().log_pool,
 719                   env->shard_obj_name(shard_id)};
 720   spawn(new CR(env->async_rados, env->store, obj, &markers[shard_id]), false);
 721   shard_id++;
 722   return true;
 723 }
 724
 725 class RGWReadSyncStatusCoroutine : public RGWCoroutine {
 726   RGWMetaSyncEnv *sync_env;
 727   rgw_meta_sync_status *sync_status;
 728
 729 public:
 730   RGWReadSyncStatusCoroutine(RGWMetaSyncEnv *_sync_env,
 731                              rgw_meta_sync_status *_status)
 732     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), sync_status(_status)
 733   {}
 734   int operate() override;
 735 };
 736
 737 int RGWReadSyncStatusCoroutine::operate()
 738 {
 739   reenter(this) {
 740     // read sync info
 741     using ReadInfoCR = RGWSimpleRadosReadCR<rgw_meta_sync_info>;
 742     yield {
 743       bool empty_on_enoent = false; // fail on ENOENT
 744       rgw_raw_obj obj{sync_env->store->get_zone_params().log_pool,
 745                       sync_env->status_oid()};
 746       call(new ReadInfoCR(sync_env->async_rados, sync_env->store, obj,
 747                           &sync_status->sync_info, empty_on_enoent));
 748     }
 749     if (retcode < 0) {
 750       ldout(sync_env->cct, 4) << "failed to read sync status info with "
 751           << cpp_strerror(retcode) << dendl;
 752       return set_cr_error(retcode);
 753     }
 754     // read shard markers
 755     using ReadMarkersCR = RGWReadSyncStatusMarkersCR;
 756     yield call(new ReadMarkersCR(sync_env, sync_status->sync_info.num_shards,
 757                                  sync_status->sync_markers));
 758     if (retcode < 0) {
 759       ldout(sync_env->cct, 4) << "failed to read sync status markers with "
 760           << cpp_strerror(retcode) << dendl;
 761       return set_cr_error(retcode);
 762     }
 763     return set_cr_done();
 764   }
 765   return 0;
 766 }
 767
 768 class RGWFetchAllMetaCR : public RGWCoroutine {
 769   RGWMetaSyncEnv *sync_env;
 770
 771   int num_shards;
 772
 773
 774   int ret_status;
 775
 776   list<string> sections;
 777   list<string>::iterator sections_iter;
 778   list<string> result;
 779   list<string>::iterator iter;
 780
 781   std::unique_ptr<RGWShardedOmapCRManager> entries_index;
 782
 783   boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
 784   boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
 785   bool lost_lock;
 786   bool failed;
 787
 788   map<uint32_t, rgw_meta_sync_marker>& markers;
 789
 790 public:
 791   RGWFetchAllMetaCR(RGWMetaSyncEnv *_sync_env, int _num_shards,
 792                     map<uint32_t, rgw_meta_sync_marker>& _markers) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
 793                                                       num_shards(_num_shards),
 794                                                       ret_status(0), lease_cr(nullptr), lease_stack(nullptr),
 795                                                       lost_lock(false), failed(false), markers(_markers) {
 796   }
 797
 798   ~RGWFetchAllMetaCR() override {
 799   }
 800
 801   void append_section_from_set(set<string>& all_sections, const string& name) {
 802     set<string>::iterator iter = all_sections.find(name);
 803     if (iter != all_sections.end()) {
 804       sections.emplace_back(std::move(*iter));
 805       all_sections.erase(iter);
 806     }
 807   }
 808   /*
 809    * meta sync should go in the following order: user, bucket.instance, bucket
 810    * then whatever other sections exist (if any)
 811    */
 812   void rearrange_sections() {
 813     set<string> all_sections;
 814     std::move(sections.begin(), sections.end(),
 815               std::inserter(all_sections, all_sections.end()));
 816     sections.clear();
 817
 818     append_section_from_set(all_sections, "user");
 819     append_section_from_set(all_sections, "bucket.instance");
 820     append_section_from_set(all_sections, "bucket");
 821
 822     std::move(all_sections.begin(), all_sections.end(),
 823               std::back_inserter(sections));
 824   }
 825
 826   int operate() override {
 827     RGWRESTConn *conn = sync_env->conn;
 828
 829     reenter(this) {
 830       yield {
 831         set_status(string("acquiring lock (") + sync_env->status_oid() + ")");
 832         uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
 833         string lock_name = "sync_lock";
 834         lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados,
 835                                                 sync_env->store,
 836                                                 rgw_raw_obj(sync_env->store->get_zone_params().log_pool, sync_env->status_oid()),
 837                                                 lock_name, lock_duration, this));
 838         lease_stack.reset(spawn(lease_cr.get(), false));
 839       }
 840       while (!lease_cr->is_locked()) {
 841         if (lease_cr->is_done()) {
 842           ldout(cct, 5) << "lease cr failed, done early " << dendl;
 843           set_status("failed acquiring lock");
 844           return set_cr_error(lease_cr->get_ret_status());
 845         }
 846         set_sleeping(true);
 847         yield;
 848       }
 849       entries_index.reset(new RGWShardedOmapCRManager(sync_env->async_rados, sync_env->store, this, num_shards,
 850                                                       sync_env->store->get_zone_params().log_pool,
 851                                                       mdlog_sync_full_sync_index_prefix));
 852       yield {
 853         call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
 854                                        "/admin/metadata", NULL, &sections));
 855       }
 856       if (get_ret_status() < 0) {
 857         ldout(cct, 0) << "ERROR: failed to fetch metadata sections" << dendl;
 858         yield entries_index->finish();
 859         yield lease_cr->go_down();
 860         drain_all();
 861         return set_cr_error(get_ret_status());
 862       }
 863       rearrange_sections();
 864       sections_iter = sections.begin();
 865       for (; sections_iter != sections.end(); ++sections_iter) {
 866         yield {
 867           string entrypoint = string("/admin/metadata/") + *sections_iter;
 868           /* FIXME: need a better scaling solution here, requires streaming output */
 869           call(new RGWReadRESTResourceCR<list<string> >(cct, conn, sync_env->http_manager,
 870                                        entrypoint, NULL, &result));
 871         }
 872         if (get_ret_status() < 0) {
 873           ldout(cct, 0) << "ERROR: failed to fetch metadata section: " << *sections_iter << dendl;
 874           yield entries_index->finish();
 875           yield lease_cr->go_down();
 876           drain_all();
 877           return set_cr_error(get_ret_status());
 878         }
 879         iter = result.begin();
 880         for (; iter != result.end(); ++iter) {
 881           if (!lease_cr->is_locked()) {
 882             lost_lock = true;
 883             break;
 884           }
 885           yield; // allow entries_index consumer to make progress
 886
 887           ldout(cct, 20) << "list metadata: section=" << *sections_iter << " key=" << *iter << dendl;
 888           string s = *sections_iter + ":" + *iter;
 889           int shard_id;
 890           RGWRados *store = sync_env->store;
 891           int ret = store->meta_mgr->get_log_shard_id(*sections_iter, *iter, &shard_id);
 892           if (ret < 0) {
 893             ldout(cct, 0) << "ERROR: could not determine shard id for " << *sections_iter << ":" << *iter << dendl;
 894             ret_status = ret;
 895             break;
 896           }
 897           if (!entries_index->append(s, shard_id)) {
 898             break;
 899           }
 900         }
 901       }
 902       yield {
 903         if (!entries_index->finish()) {
 904           failed = true;
 905         }
 906       }
 907       if (!failed) {
 908         for (map<uint32_t, rgw_meta_sync_marker>::iterator iter = markers.begin(); iter != markers.end(); ++iter) {
 909           int shard_id = (int)iter->first;
 910           rgw_meta_sync_marker& marker = iter->second;
 911           marker.total_entries = entries_index->get_total_entries(shard_id);
 912           spawn(new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados, sync_env->store,
 913                                                                 rgw_raw_obj(sync_env->store->get_zone_params().log_pool, sync_env->shard_obj_name(shard_id)),
 914                                                                 marker), true);
 915         }
 916       }
 917
 918       drain_all_but_stack(lease_stack.get()); /* the lease cr still needs to run */
 919
 920       yield lease_cr->go_down();
 921
 922       int ret;
 923       while (collect(&ret, NULL)) {
 924         if (ret < 0) {
 925           return set_cr_error(ret);
 926         }
 927         yield;
 928       }
 929       drain_all();
 930       if (failed) {
 931         yield return set_cr_error(-EIO);
 932       }
 933       if (lost_lock) {
 934         yield return set_cr_error(-EBUSY);
 935       }
 936
 937       if (ret_status < 0) {
 938         yield return set_cr_error(ret_status);
 939       }
 940
 941       yield return set_cr_done();
 942     }
 943     return 0;
 944   }
 945 };
 946
 947 static string full_sync_index_shard_oid(int shard_id)
 948 {
 949   char buf[mdlog_sync_full_sync_index_prefix.size() + 16];
 950   snprintf(buf, sizeof(buf), "%s.%d", mdlog_sync_full_sync_index_prefix.c_str(), shard_id);
 951   return string(buf);
 952 }
 953
 954 class RGWReadRemoteMetadataCR : public RGWCoroutine {
 955   RGWMetaSyncEnv *sync_env;
 956
 957   RGWRESTReadResource *http_op;
 958
 959   string section;
 960   string key;
 961
 962   bufferlist *pbl;
 963
 964 public:
 965   RGWReadRemoteMetadataCR(RGWMetaSyncEnv *_sync_env,
 966                                                       const string& _section, const string& _key, bufferlist *_pbl) : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
 967                                                       http_op(NULL),
 968                                                       section(_section),
 969                                                       key(_key),
 970                                                       pbl(_pbl) {
 971   }
 972
 973   int operate() override {
 974     RGWRESTConn *conn = sync_env->conn;
 975     reenter(this) {
 976       yield {
 977         rgw_http_param_pair pairs[] = { { "key" , key.c_str()},
 978                                         { NULL, NULL } };
 979
 980         string p = string("/admin/metadata/") + section + "/" + key;
 981
 982         http_op = new RGWRESTReadResource(conn, p, pairs, NULL, sync_env->http_manager);
 983
 984         http_op->set_user_info((void *)stack);
 985
 986         int ret = http_op->aio_read();
 987         if (ret < 0) {
 988           ldout(sync_env->cct, 0) << "ERROR: failed to fetch mdlog data" << dendl;
 989           log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
 990           http_op->put();
 991           return set_cr_error(ret);
 992         }
 993
 994         return io_block(0);
 995       }
 996       yield {
 997         int ret = http_op->wait_bl(pbl);
 998         http_op->put();
 999         if (ret < 0) {
1000           return set_cr_error(ret);
1001         }
1002         return set_cr_done();
1003       }
1004     }
1005     return 0;
1006   }
1007 };
1008
1009 class RGWAsyncMetaStoreEntry : public RGWAsyncRadosRequest {
1010   RGWRados *store;
1011   string raw_key;
1012   bufferlist bl;
1013 protected:
1014   int _send_request() override {
1015     int ret = store->meta_mgr->put(raw_key, bl, RGWMetadataHandler::APPLY_ALWAYS);
1016     if (ret < 0) {
1017       ldout(store->ctx(), 0) << "ERROR: can't store key: " << raw_key << " ret=" << ret << dendl;
1018       return ret;
1019     }
1020     return 0;
1021   }
1022 public:
1023   RGWAsyncMetaStoreEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
1024                        const string& _raw_key,
1025                        bufferlist& _bl) : RGWAsyncRadosRequest(caller, cn), store(_store),
1026                                           raw_key(_raw_key), bl(_bl) {}
1027 };
1028
1029
1030 class RGWMetaStoreEntryCR : public RGWSimpleCoroutine {
1031   RGWMetaSyncEnv *sync_env;
1032   string raw_key;
1033   bufferlist bl;
1034
1035   RGWAsyncMetaStoreEntry *req;
1036
1037 public:
1038   RGWMetaStoreEntryCR(RGWMetaSyncEnv *_sync_env,
1039                        const string& _raw_key,
1040                        bufferlist& _bl) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
1041                                           raw_key(_raw_key), bl(_bl), req(NULL) {
1042   }
1043
1044   ~RGWMetaStoreEntryCR() override {
1045     if (req) {
1046       req->finish();
1047     }
1048   }
1049
1050   int send_request() override {
1051     req = new RGWAsyncMetaStoreEntry(this, stack->create_completion_notifier(),
1052                                    sync_env->store, raw_key, bl);
1053     sync_env->async_rados->queue(req);
1054     return 0;
1055   }
1056
1057   int request_complete() override {
1058     return req->get_ret_status();
1059   }
1060 };
1061
1062 class RGWAsyncMetaRemoveEntry : public RGWAsyncRadosRequest {
1063   RGWRados *store;
1064   string raw_key;
1065 protected:
1066   int _send_request() override {
1067     int ret = store->meta_mgr->remove(raw_key);
1068     if (ret < 0) {
1069       ldout(store->ctx(), 0) << "ERROR: can't remove key: " << raw_key << " ret=" << ret << dendl;
1070       return ret;
1071     }
1072     return 0;
1073   }
1074 public:
1075   RGWAsyncMetaRemoveEntry(RGWCoroutine *caller, RGWAioCompletionNotifier *cn, RGWRados *_store,
1076                        const string& _raw_key) : RGWAsyncRadosRequest(caller, cn), store(_store),
1077                                           raw_key(_raw_key) {}
1078 };
1079
1080
1081 class RGWMetaRemoveEntryCR : public RGWSimpleCoroutine {
1082   RGWMetaSyncEnv *sync_env;
1083   string raw_key;
1084
1085   RGWAsyncMetaRemoveEntry *req;
1086
1087 public:
1088   RGWMetaRemoveEntryCR(RGWMetaSyncEnv *_sync_env,
1089                        const string& _raw_key) : RGWSimpleCoroutine(_sync_env->cct), sync_env(_sync_env),
1090                                           raw_key(_raw_key), req(NULL) {
1091   }
1092
1093   ~RGWMetaRemoveEntryCR() override {
1094     if (req) {
1095       req->finish();
1096     }
1097   }
1098
1099   int send_request() override {
1100     req = new RGWAsyncMetaRemoveEntry(this, stack->create_completion_notifier(),
1101                                    sync_env->store, raw_key);
1102     sync_env->async_rados->queue(req);
1103     return 0;
1104   }
1105
1106   int request_complete() override {
1107     int r = req->get_ret_status();
1108     if (r == -ENOENT) {
1109       r = 0;
1110     }
1111     return r;
1112   }
1113 };
1114
1115 #define META_SYNC_UPDATE_MARKER_WINDOW 10
1116
1117 class RGWMetaSyncShardMarkerTrack : public RGWSyncShardMarkerTrack<string, string> {
1118   RGWMetaSyncEnv *sync_env;
1119
1120   string marker_oid;
1121   rgw_meta_sync_marker sync_marker;
1122
1123
1124 public:
1125   RGWMetaSyncShardMarkerTrack(RGWMetaSyncEnv *_sync_env,
1126                          const string& _marker_oid,
1127                          const rgw_meta_sync_marker& _marker) : RGWSyncShardMarkerTrack(META_SYNC_UPDATE_MARKER_WINDOW),
1128                                                                 sync_env(_sync_env),
1129                                                                 marker_oid(_marker_oid),
1130                                                                 sync_marker(_marker) {}
1131
1132   RGWCoroutine *store_marker(const string& new_marker, uint64_t index_pos, const real_time& timestamp) override {
1133     sync_marker.marker = new_marker;
1134     if (index_pos > 0) {
1135       sync_marker.pos = index_pos;
1136     }
1137
1138     if (!real_clock::is_zero(timestamp)) {
1139       sync_marker.timestamp = timestamp;
1140     }
1141
1142     ldout(sync_env->cct, 20) << __func__ << "(): updating marker marker_oid=" << marker_oid << " marker=" << new_marker << " realm_epoch=" << sync_marker.realm_epoch << dendl;
1143     RGWRados *store = sync_env->store;
1144     return new RGWSimpleRadosWriteCR<rgw_meta_sync_marker>(sync_env->async_rados,
1145                                                            store,
1146                                                            rgw_raw_obj(store->get_zone_params().log_pool, marker_oid),
1147                                                            sync_marker);
1148   }
1149 };
1150
1151 int RGWMetaSyncSingleEntryCR::operate() {
1152   reenter(this) {
1153 #define NUM_TRANSIENT_ERROR_RETRIES 10
1154
1155     if (error_injection &&
1156         rand() % 10000 < cct->_conf->rgw_sync_meta_inject_err_probability * 10000.0) {
1157       ldout(sync_env->cct, 0) << __FILE__ << ":" << __LINE__ << ": injecting meta sync error on key=" << raw_key << dendl;
1158       return set_cr_error(-EIO);
1159     }
1160
1161     if (op_status != MDLOG_STATUS_COMPLETE) {
1162       ldout(sync_env->cct, 20) << "skipping pending operation" << dendl;
1163       yield call(marker_tracker->finish(entry_marker));
1164       if (retcode < 0) {
1165         return set_cr_error(retcode);
1166       }
1167       return set_cr_done();
1168     }
1169     for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
1170       yield {
1171         pos = raw_key.find(':');
1172         section = raw_key.substr(0, pos);
1173         key = raw_key.substr(pos + 1);
1174         ldout(sync_env->cct, 20) << "fetching remote metadata: " << section << ":" << key << (tries == 0 ? "" : " (retry)") << dendl;
1175         call(new RGWReadRemoteMetadataCR(sync_env, section, key, &md_bl));
1176       }
1177
1178       sync_status = retcode;
1179
1180       if (sync_status == -ENOENT) {
1181         /* FIXME: do we need to remove the entry from the local zone? */
1182         break;
1183       }
1184
1185       if ((sync_status == -EAGAIN || sync_status == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
1186         ldout(sync_env->cct, 20) << *this << ": failed to fetch remote metadata: " << section << ":" << key << ", will retry" << dendl;
1187         continue;
1188       }
1189
1190       if (sync_status < 0) {
1191         ldout(sync_env->cct, 10) << *this << ": failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << dendl;
1192         log_error() << "failed to send read remote metadata entry: section=" << section << " key=" << key << " status=" << sync_status << std::endl;
1193         yield call(sync_env->error_logger->log_error_cr(sync_env->conn->get_remote_id(), section, key, -sync_status,
1194                                                         string("failed to read remote metadata entry: ") + cpp_strerror(-sync_status)));
1195         return set_cr_error(sync_status);
1196       }
1197
1198       break;
1199     }
1200
1201     retcode = 0;
1202     for (tries = 0; tries < NUM_TRANSIENT_ERROR_RETRIES; tries++) {
1203       if (sync_status != -ENOENT) {
1204           yield call(new RGWMetaStoreEntryCR(sync_env, raw_key, md_bl));
1205       } else {
1206           yield call(new RGWMetaRemoveEntryCR(sync_env, raw_key));
1207       }
1208       if ((retcode == -EAGAIN || retcode == -ECANCELED) && (tries < NUM_TRANSIENT_ERROR_RETRIES - 1)) {
1209         ldout(sync_env->cct, 20) << *this << ": failed to store metadata: " << section << ":" << key << ", got retcode=" << retcode << dendl;
1210         continue;
1211       }
1212       break;
1213     }
1214
1215     sync_status = retcode;
1216
1217     if (sync_status == 0 && marker_tracker) {
1218       /* update marker */
1219       yield call(marker_tracker->finish(entry_marker));
1220       sync_status = retcode;
1221     }
1222     if (sync_status < 0) {
1223       return set_cr_error(sync_status);
1224     }
1225     return set_cr_done();
1226   }
1227   return 0;
1228 }
1229
1230 class RGWCloneMetaLogCoroutine : public RGWCoroutine {
1231   RGWMetaSyncEnv *sync_env;
1232   RGWMetadataLog *mdlog;
1233
1234   const std::string& period;
1235   int shard_id;
1236   string marker;
1237   bool truncated = false;
1238   string *new_marker;
1239
1240   int max_entries = CLONE_MAX_ENTRIES;
1241
1242   RGWRESTReadResource *http_op = nullptr;
1243   boost::intrusive_ptr<RGWMetadataLogInfoCompletion> completion;
1244
1245   RGWMetadataLogInfo shard_info;
1246   rgw_mdlog_shard_data data;
1247
1248 public:
1249   RGWCloneMetaLogCoroutine(RGWMetaSyncEnv *_sync_env, RGWMetadataLog* mdlog,
1250                            const std::string& period, int _id,
1251                            const string& _marker, string *_new_marker)
1252     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), mdlog(mdlog),
1253       period(period), shard_id(_id), marker(_marker), new_marker(_new_marker) {
1254     if (new_marker) {
1255       *new_marker = marker;
1256     }
1257   }
1258   ~RGWCloneMetaLogCoroutine() override {
1259     if (http_op) {
1260       http_op->put();
1261     }
1262     if (completion) {
1263       completion->cancel();
1264     }
1265   }
1266
1267   int operate() override;
1268
1269   int state_init();
1270   int state_read_shard_status();
1271   int state_read_shard_status_complete();
1272   int state_send_rest_request();
1273   int state_receive_rest_response();
1274   int state_store_mdlog_entries();
1275   int state_store_mdlog_entries_complete();
1276 };
1277
1278 class RGWMetaSyncShardCR : public RGWCoroutine {
1279   RGWMetaSyncEnv *sync_env;
1280
1281   const rgw_pool& pool;
1282   const std::string& period; //< currently syncing period id
1283   const epoch_t realm_epoch; //< realm_epoch of period
1284   RGWMetadataLog* mdlog; //< log of syncing period
1285   uint32_t shard_id;
1286   rgw_meta_sync_marker& sync_marker;
1287   boost::optional<rgw_meta_sync_marker> temp_marker; //< for pending updates
1288   string marker;
1289   string max_marker;
1290   const std::string& period_marker; //< max marker stored in next period
1291
1292   map<string, bufferlist> entries;
1293   map<string, bufferlist>::iterator iter;
1294
1295   string oid;
1296
1297   RGWMetaSyncShardMarkerTrack *marker_tracker = nullptr;
1298
1299   list<cls_log_entry> log_entries;
1300   list<cls_log_entry>::iterator log_iter;
1301   bool truncated = false;
1302
1303   string mdlog_marker;
1304   string raw_key;
1305   rgw_mdlog_entry mdlog_entry;
1306
1307   Mutex inc_lock;
1308   Cond inc_cond;
1309
1310   boost::asio::coroutine incremental_cr;
1311   boost::asio::coroutine full_cr;
1312
1313   boost::intrusive_ptr<RGWContinuousLeaseCR> lease_cr;
1314   boost::intrusive_ptr<RGWCoroutinesStack> lease_stack;
1315
1316   bool lost_lock = false;
1317
1318   bool *reset_backoff;
1319
1320   // hold a reference to the cr stack while it's in the map
1321   using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
1322   map<StackRef, string> stack_to_pos;
1323   map<string, string> pos_to_prev;
1324
1325   bool can_adjust_marker = false;
1326   bool done_with_period = false;
1327
1328   int total_entries = 0;
1329
1330 public:
1331   RGWMetaSyncShardCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
1332                      const std::string& period, epoch_t realm_epoch,
1333                      RGWMetadataLog* mdlog, uint32_t _shard_id,
1334                      rgw_meta_sync_marker& _marker,
1335                      const std::string& period_marker, bool *_reset_backoff)
1336     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env), pool(_pool),
1337       period(period), realm_epoch(realm_epoch), mdlog(mdlog),
1338       shard_id(_shard_id), sync_marker(_marker),
1339       period_marker(period_marker), inc_lock("RGWMetaSyncShardCR::inc_lock"),
1340       reset_backoff(_reset_backoff) {
1341     *reset_backoff = false;
1342   }
1343
1344   ~RGWMetaSyncShardCR() override {
1345     delete marker_tracker;
1346     if (lease_cr) {
1347       lease_cr->abort();
1348     }
1349   }
1350
1351   void set_marker_tracker(RGWMetaSyncShardMarkerTrack *mt) {
1352     delete marker_tracker;
1353     marker_tracker = mt;
1354   }
1355
1356   int operate() override {
1357     int r;
1358     while (true) {
1359       switch (sync_marker.state) {
1360       case rgw_meta_sync_marker::FullSync:
1361         r  = full_sync();
1362         if (r < 0) {
1363           ldout(sync_env->cct, 10) << "sync: full_sync: shard_id=" << shard_id << " r=" << r << dendl;
1364           return set_cr_error(r);
1365         }
1366         return 0;
1367       case rgw_meta_sync_marker::IncrementalSync:
1368         r  = incremental_sync();
1369         if (r < 0) {
1370           ldout(sync_env->cct, 10) << "sync: incremental_sync: shard_id=" << shard_id << " r=" << r << dendl;
1371           return set_cr_error(r);
1372         }
1373         return 0;
1374       }
1375     }
1376     /* unreachable */
1377     return 0;
1378   }
1379
1380   void collect_children()
1381   {
1382     int child_ret;
1383     RGWCoroutinesStack *child;
1384     while (collect_next(&child_ret, &child)) {
1385       auto iter = stack_to_pos.find(child);
1386       if (iter == stack_to_pos.end()) {
1387         /* some other stack that we don't care about */
1388         continue;
1389       }
1390
1391       string& pos = iter->second;
1392
1393       if (child_ret < 0) {
1394         ldout(sync_env->cct, 0) << *this << ": child operation stack=" << child << " entry=" << pos << " returned " << child_ret << dendl;
1395       }
1396
1397       map<string, string>::iterator prev_iter = pos_to_prev.find(pos);
1398       assert(prev_iter != pos_to_prev.end());
1399
1400       /*
1401        * we should get -EAGAIN for transient errors, for which we want to retry, so we don't
1402        * update the marker and abort. We'll get called again for these. Permanent errors will be
1403        * handled by marking the entry at the error log shard, so that we retry on it separately
1404        */
1405       if (child_ret == -EAGAIN) {
1406         can_adjust_marker = false;
1407       }
1408
1409       if (pos_to_prev.size() == 1) {
1410         if (can_adjust_marker) {
1411           sync_marker.marker = pos;
1412         }
1413         pos_to_prev.erase(prev_iter);
1414       } else {
1415         assert(pos_to_prev.size() > 1);
1416         pos_to_prev.erase(prev_iter);
1417         prev_iter = pos_to_prev.begin();
1418         if (can_adjust_marker) {
1419           sync_marker.marker = prev_iter->second;
1420         }
1421       }
1422
1423       ldout(sync_env->cct, 4) << *this << ": adjusting marker pos=" << sync_marker.marker << dendl;
1424       stack_to_pos.erase(iter);
1425     }
1426   }
1427
1428   int full_sync() {
1429 #define OMAP_GET_MAX_ENTRIES 100
1430     int max_entries = OMAP_GET_MAX_ENTRIES;
1431     reenter(&full_cr) {
1432       set_status("full_sync");
1433       oid = full_sync_index_shard_oid(shard_id);
1434       can_adjust_marker = true;
1435       /* grab lock */
1436       yield {
1437         uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
1438         string lock_name = "sync_lock";
1439         RGWRados *store = sync_env->store;
1440         lease_cr.reset(new RGWContinuousLeaseCR(sync_env->async_rados, store,
1441                                                 rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1442                                                 lock_name, lock_duration, this));
1443         lease_stack.reset(spawn(lease_cr.get(), false));
1444         lost_lock = false;
1445       }
1446       while (!lease_cr->is_locked()) {
1447         if (lease_cr->is_done()) {
1448           ldout(cct, 5) << "lease cr failed, done early " << dendl;
1449           drain_all();
1450           return lease_cr->get_ret_status();
1451         }
1452         set_sleeping(true);
1453         yield;
1454       }
1455
1456       /* lock succeeded, a retry now should avoid previous backoff status */
1457       *reset_backoff = true;
1458
1459       /* prepare marker tracker */
1460       set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
1461                                                          sync_env->shard_obj_name(shard_id),
1462                                                          sync_marker));
1463
1464       marker = sync_marker.marker;
1465
1466       total_entries = sync_marker.pos;
1467
1468       /* sync! */
1469       do {
1470         if (!lease_cr->is_locked()) {
1471           lost_lock = true;
1472           break;
1473         }
1474         yield call(new RGWRadosGetOmapKeysCR(sync_env->store, rgw_raw_obj(pool, oid),
1475                                              marker, &entries, max_entries));
1476         if (retcode < 0) {
1477           ldout(sync_env->cct, 0) << "ERROR: " << __func__ << "(): RGWRadosGetOmapKeysCR() returned ret=" << retcode << dendl;
1478           yield lease_cr->go_down();
1479           drain_all();
1480           return retcode;
1481         }
1482         iter = entries.begin();
1483         for (; iter != entries.end(); ++iter) {
1484           ldout(sync_env->cct, 20) << __func__ << ": full sync: " << iter->first << dendl;
1485           total_entries++;
1486           if (!marker_tracker->start(iter->first, total_entries, real_time())) {
1487             ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << iter->first << ". Duplicate entry?" << dendl;
1488           } else {
1489             // fetch remote and write locally
1490             yield {
1491               RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, iter->first, iter->first, MDLOG_STATUS_COMPLETE, marker_tracker), false);
1492               // stack_to_pos holds a reference to the stack
1493               stack_to_pos[stack] = iter->first;
1494               pos_to_prev[iter->first] = marker;
1495             }
1496           }
1497           marker = iter->first;
1498         }
1499         collect_children();
1500       } while ((int)entries.size() == max_entries && can_adjust_marker);
1501
1502       while (num_spawned() > 1) {
1503         yield wait_for_child();
1504         collect_children();
1505       }
1506
1507       if (!lost_lock) {
1508         /* update marker to reflect we're done with full sync */
1509         if (can_adjust_marker) {
1510           // apply updates to a temporary marker, or operate() will send us
1511           // to incremental_sync() after we yield
1512           temp_marker = sync_marker;
1513           temp_marker->state = rgw_meta_sync_marker::IncrementalSync;
1514           temp_marker->marker = std::move(temp_marker->next_step_marker);
1515           temp_marker->next_step_marker.clear();
1516           temp_marker->realm_epoch = realm_epoch;
1517           ldout(sync_env->cct, 4) << *this << ": saving marker pos=" << temp_marker->marker << " realm_epoch=" << realm_epoch << dendl;
1518
1519           using WriteMarkerCR = RGWSimpleRadosWriteCR<rgw_meta_sync_marker>;
1520           yield call(new WriteMarkerCR(sync_env->async_rados, sync_env->store,
1521                                        rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1522                                        *temp_marker));
1523         }
1524
1525         if (retcode < 0) {
1526           ldout(sync_env->cct, 0) << "ERROR: failed to set sync marker: retcode=" << retcode << dendl;
1527           yield lease_cr->go_down();
1528           drain_all();
1529           return retcode;
1530         }
1531       }
1532
1533       /*
1534        * if we reached here, it means that lost_lock is true, otherwise the state
1535        * change in the previous block will prevent us from reaching here
1536        */
1537
1538       yield lease_cr->go_down();
1539
1540       lease_cr.reset();
1541
1542       drain_all();
1543
1544       if (!can_adjust_marker) {
1545         return -EAGAIN;
1546       }
1547
1548       if (lost_lock) {
1549         return -EBUSY;
1550       }
1551
1552       // apply the sync marker update
1553       assert(temp_marker);
1554       sync_marker = std::move(*temp_marker);
1555       temp_marker = boost::none;
1556       // must not yield after this point!
1557     }
1558     return 0;
1559   }
1560
1561
1562   int incremental_sync() {
1563     reenter(&incremental_cr) {
1564       set_status("incremental_sync");
1565       can_adjust_marker = true;
1566       /* grab lock */
1567       if (!lease_cr) { /* could have had  a lease_cr lock from previous state */
1568         yield {
1569           uint32_t lock_duration = cct->_conf->rgw_sync_lease_period;
1570           string lock_name = "sync_lock";
1571           RGWRados *store = sync_env->store;
1572           lease_cr.reset( new RGWContinuousLeaseCR(sync_env->async_rados, store,
1573                                                    rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1574                                                    lock_name, lock_duration, this));
1575           lease_stack.reset(spawn(lease_cr.get(), false));
1576           lost_lock = false;
1577         }
1578         while (!lease_cr->is_locked()) {
1579           if (lease_cr->is_done()) {
1580             ldout(cct, 5) << "lease cr failed, done early " << dendl;
1581             drain_all();
1582             return lease_cr->get_ret_status();
1583           }
1584           set_sleeping(true);
1585           yield;
1586         }
1587       }
1588       // if the period has advanced, we can't use the existing marker
1589       if (sync_marker.realm_epoch < realm_epoch) {
1590         ldout(sync_env->cct, 4) << "clearing marker=" << sync_marker.marker
1591             << " from old realm_epoch=" << sync_marker.realm_epoch
1592             << " (now " << realm_epoch << ')' << dendl;
1593         sync_marker.realm_epoch = realm_epoch;
1594         sync_marker.marker.clear();
1595       }
1596       mdlog_marker = sync_marker.marker;
1597       set_marker_tracker(new RGWMetaSyncShardMarkerTrack(sync_env,
1598                                                          sync_env->shard_obj_name(shard_id),
1599                                                          sync_marker));
1600
1601       /*
1602        * mdlog_marker: the remote sync marker positiion
1603        * sync_marker: the local sync marker position
1604        * max_marker: the max mdlog position that we fetched
1605        * marker: the current position we try to sync
1606        * period_marker: the last marker before the next period begins (optional)
1607        */
1608       marker = max_marker = sync_marker.marker;
1609       /* inc sync */
1610       do {
1611         if (!lease_cr->is_locked()) {
1612           lost_lock = true;
1613           break;
1614         }
1615 #define INCREMENTAL_MAX_ENTRIES 100
1616         ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
1617         if (!period_marker.empty() && period_marker <= mdlog_marker) {
1618           ldout(cct, 10) << "mdlog_marker past period_marker=" << period_marker << dendl;
1619           done_with_period = true;
1620           break;
1621         }
1622         if (mdlog_marker <= max_marker) {
1623           /* we're at the tip, try to bring more entries */
1624           ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " syncing mdlog for shard_id=" << shard_id << dendl;
1625           yield call(new RGWCloneMetaLogCoroutine(sync_env, mdlog,
1626                                                   period, shard_id,
1627                                                   mdlog_marker, &mdlog_marker));
1628         }
1629         if (retcode < 0) {
1630           ldout(sync_env->cct, 10) << *this << ": failed to fetch more log entries, retcode=" << retcode << dendl;
1631           yield lease_cr->go_down();
1632           drain_all();
1633           *reset_backoff = false; // back off and try again later
1634           return retcode;
1635         }
1636         *reset_backoff = true; /* if we got to this point, all systems function */
1637         ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " sync_marker.marker=" << sync_marker.marker << dendl;
1638         if (mdlog_marker > max_marker) {
1639           marker = max_marker;
1640           yield call(new RGWReadMDLogEntriesCR(sync_env, mdlog, shard_id,
1641                                                &max_marker, INCREMENTAL_MAX_ENTRIES,
1642                                                &log_entries, &truncated));
1643           if (retcode < 0) {
1644             ldout(sync_env->cct, 10) << *this << ": failed to list mdlog entries, retcode=" << retcode << dendl;
1645             yield lease_cr->go_down();
1646             drain_all();
1647             *reset_backoff = false; // back off and try again later
1648             return retcode;
1649           }
1650           for (log_iter = log_entries.begin(); log_iter != log_entries.end() && !done_with_period; ++log_iter) {
1651             if (!period_marker.empty() && period_marker <= log_iter->id) {
1652               done_with_period = true;
1653               if (period_marker < log_iter->id) {
1654                 ldout(cct, 10) << "found key=" << log_iter->id
1655                     << " past period_marker=" << period_marker << dendl;
1656                 break;
1657               }
1658               ldout(cct, 10) << "found key at period_marker=" << period_marker << dendl;
1659               // sync this entry, then return control to RGWMetaSyncCR
1660             }
1661             if (!mdlog_entry.convert_from(*log_iter)) {
1662               ldout(sync_env->cct, 0) << __func__ << ":" << __LINE__ << ": ERROR: failed to convert mdlog entry, shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << " ... skipping entry" << dendl;
1663               continue;
1664             }
1665             ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " log_entry: " << log_iter->id << ":" << log_iter->section << ":" << log_iter->name << ":" << log_iter->timestamp << dendl;
1666             if (!marker_tracker->start(log_iter->id, 0, log_iter->timestamp.to_real_time())) {
1667               ldout(sync_env->cct, 0) << "ERROR: cannot start syncing " << log_iter->id << ". Duplicate entry?" << dendl;
1668             } else {
1669               raw_key = log_iter->section + ":" + log_iter->name;
1670               yield {
1671                 RGWCoroutinesStack *stack = spawn(new RGWMetaSyncSingleEntryCR(sync_env, raw_key, log_iter->id, mdlog_entry.log_data.status, marker_tracker), false);
1672                 assert(stack);
1673                 // stack_to_pos holds a reference to the stack
1674                 stack_to_pos[stack] = log_iter->id;
1675                 pos_to_prev[log_iter->id] = marker;
1676               }
1677             }
1678             marker = log_iter->id;
1679           }
1680         }
1681         collect_children();
1682         ldout(sync_env->cct, 20) << __func__ << ":" << __LINE__ << ": shard_id=" << shard_id << " mdlog_marker=" << mdlog_marker << " max_marker=" << max_marker << " sync_marker.marker=" << sync_marker.marker << " period_marker=" << period_marker << dendl;
1683         if (done_with_period) {
1684           // return control to RGWMetaSyncCR and advance to the next period
1685           ldout(sync_env->cct, 10) << *this << ": done with period" << dendl;
1686           break;
1687         }
1688         if (mdlog_marker == max_marker && can_adjust_marker) {
1689 #define INCREMENTAL_INTERVAL 20
1690           yield wait(utime_t(INCREMENTAL_INTERVAL, 0));
1691         }
1692       } while (can_adjust_marker);
1693
1694       while (num_spawned() > 1) {
1695         yield wait_for_child();
1696         collect_children();
1697       }
1698
1699       yield lease_cr->go_down();
1700
1701       drain_all();
1702
1703       if (lost_lock) {
1704         return -EBUSY;
1705       }
1706
1707       if (!can_adjust_marker) {
1708         return -EAGAIN;
1709       }
1710
1711       return set_cr_done();
1712     }
1713     /* TODO */
1714     return 0;
1715   }
1716 };
1717
1718 class RGWMetaSyncShardControlCR : public RGWBackoffControlCR
1719 {
1720   RGWMetaSyncEnv *sync_env;
1721
1722   const rgw_pool& pool;
1723   const std::string& period;
1724   epoch_t realm_epoch;
1725   RGWMetadataLog* mdlog;
1726   uint32_t shard_id;
1727   rgw_meta_sync_marker sync_marker;
1728   const std::string period_marker;
1729
1730   static constexpr bool exit_on_error = false; // retry on all errors
1731 public:
1732   RGWMetaSyncShardControlCR(RGWMetaSyncEnv *_sync_env, const rgw_pool& _pool,
1733                             const std::string& period, epoch_t realm_epoch,
1734                             RGWMetadataLog* mdlog, uint32_t _shard_id,
1735                             const rgw_meta_sync_marker& _marker,
1736                             std::string&& period_marker)
1737     : RGWBackoffControlCR(_sync_env->cct, exit_on_error), sync_env(_sync_env),
1738       pool(_pool), period(period), realm_epoch(realm_epoch), mdlog(mdlog),
1739       shard_id(_shard_id), sync_marker(_marker),
1740       period_marker(std::move(period_marker)) {}
1741
1742   RGWCoroutine *alloc_cr() override {
1743     return new RGWMetaSyncShardCR(sync_env, pool, period, realm_epoch, mdlog,
1744                                   shard_id, sync_marker, period_marker, backoff_ptr());
1745   }
1746
1747   RGWCoroutine *alloc_finisher_cr() override {
1748     RGWRados *store = sync_env->store;
1749     return new RGWSimpleRadosReadCR<rgw_meta_sync_marker>(sync_env->async_rados, store,
1750                                                           rgw_raw_obj(pool, sync_env->shard_obj_name(shard_id)),
1751                                                           &sync_marker);
1752   }
1753 };
1754
1755 class RGWMetaSyncCR : public RGWCoroutine {
1756   RGWMetaSyncEnv *sync_env;
1757   const rgw_pool& pool;
1758   RGWPeriodHistory::Cursor cursor; //< sync position in period history
1759   RGWPeriodHistory::Cursor next; //< next period in history
1760   rgw_meta_sync_status sync_status;
1761
1762   std::mutex mutex; //< protect access to shard_crs
1763
1764   // TODO: it should be enough to hold a reference on the stack only, as calling
1765   // RGWCoroutinesStack::wakeup() doesn't refer to the RGWCoroutine if it has
1766   // already completed
1767   using ControlCRRef = boost::intrusive_ptr<RGWMetaSyncShardControlCR>;
1768   using StackRef = boost::intrusive_ptr<RGWCoroutinesStack>;
1769   using RefPair = std::pair<ControlCRRef, StackRef>;
1770   map<int, RefPair> shard_crs;
1771   int ret{0};
1772
1773 public:
1774   RGWMetaSyncCR(RGWMetaSyncEnv *_sync_env, RGWPeriodHistory::Cursor cursor,
1775                 const rgw_meta_sync_status& _sync_status)
1776     : RGWCoroutine(_sync_env->cct), sync_env(_sync_env),
1777       pool(sync_env->store->get_zone_params().log_pool),
1778       cursor(cursor), sync_status(_sync_status) {}
1779
1780   int operate() override {
1781     reenter(this) {
1782       // loop through one period at a time
1783       for (;;) {
1784         if (cursor == sync_env->store->period_history->get_current()) {
1785           next = RGWPeriodHistory::Cursor{};
1786           if (cursor) {
1787             ldout(cct, 10) << "RGWMetaSyncCR on current period="
1788                 << cursor.get_period().get_id() << dendl;
1789           } else {
1790             ldout(cct, 10) << "RGWMetaSyncCR with no period" << dendl;
1791           }
1792         } else {
1793           next = cursor;
1794           next.next();
1795           ldout(cct, 10) << "RGWMetaSyncCR on period="
1796               << cursor.get_period().get_id() << ", next="
1797               << next.get_period().get_id() << dendl;
1798         }
1799
1800         yield {
1801           // get the mdlog for the current period (may be empty)
1802           auto& period_id = sync_status.sync_info.period;
1803           auto realm_epoch = sync_status.sync_info.realm_epoch;
1804           auto mdlog = sync_env->store->meta_mgr->get_log(period_id);
1805
1806           // prevent wakeup() from accessing shard_crs while we're spawning them
1807           std::lock_guard<std::mutex> lock(mutex);
1808
1809           // sync this period on each shard
1810           for (const auto& m : sync_status.sync_markers) {
1811             uint32_t shard_id = m.first;
1812             auto& marker = m.second;
1813
1814             std::string period_marker;
1815             if (next) {
1816               // read the maximum marker from the next period's sync status
1817               period_marker = next.get_period().get_sync_status()[shard_id];
1818               if (period_marker.empty()) {
1819                 // no metadata changes have occurred on this shard, skip it
1820                 ldout(cct, 10) << "RGWMetaSyncCR: skipping shard " << shard_id
1821                     << " with empty period marker" << dendl;
1822                 continue;
1823               }
1824             }
1825
1826             using ShardCR = RGWMetaSyncShardControlCR;
1827             auto cr = new ShardCR(sync_env, pool, period_id, realm_epoch,
1828                                   mdlog, shard_id, marker,
1829                                   std::move(period_marker));
1830             auto stack = spawn(cr, false);
1831             shard_crs[shard_id] = RefPair{cr, stack};
1832           }
1833         }
1834         // wait for each shard to complete
1835         while (ret == 0 && num_spawned() > 0) {
1836           yield wait_for_child();
1837           collect(&ret, nullptr);
1838         }
1839         drain_all();
1840         {
1841           // drop shard cr refs under lock
1842           std::lock_guard<std::mutex> lock(mutex);
1843           shard_crs.clear();
1844         }
1845         if (ret < 0) {
1846           return set_cr_error(ret);
1847         }
1848         // advance to the next period
1849         assert(next);
1850         cursor = next;
1851
1852         // write the updated sync info
1853         sync_status.sync_info.period = cursor.get_period().get_id();
1854         sync_status.sync_info.realm_epoch = cursor.get_epoch();
1855         yield call(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(sync_env->async_rados,
1856                                                                  sync_env->store,
1857                                                                  rgw_raw_obj(pool, sync_env->status_oid()),
1858                                                                  sync_status.sync_info));
1859       }
1860     }
1861     return 0;
1862   }
1863
1864   void wakeup(int shard_id) {
1865     std::lock_guard<std::mutex> lock(mutex);
1866     auto iter = shard_crs.find(shard_id);
1867     if (iter == shard_crs.end()) {
1868       return;
1869     }
1870     iter->second.first->wakeup();
1871   }
1872 };
1873
1874 void RGWRemoteMetaLog::init_sync_env(RGWMetaSyncEnv *env) {
1875   env->cct = store->ctx();
1876   env->store = store;
1877   env->conn = conn;
1878   env->async_rados = async_rados;
1879   env->http_manager = &http_manager;
1880   env->error_logger = error_logger;
1881 }
1882
1883 int RGWRemoteMetaLog::read_sync_status(rgw_meta_sync_status *sync_status)
1884 {
1885   if (store->is_meta_master()) {
1886     return 0;
1887   }
1888   // cannot run concurrently with run_sync(), so run in a separate manager
1889   RGWCoroutinesManager crs(store->ctx(), store->get_cr_registry());
1890   RGWHTTPManager http_manager(store->ctx(), crs.get_completion_mgr());
1891   int ret = http_manager.set_threaded();
1892   if (ret < 0) {
1893     ldout(store->ctx(), 0) << "failed in http_manager.set_threaded() ret=" << ret << dendl;
1894     return ret;
1895   }
1896   RGWMetaSyncEnv sync_env_local = sync_env;
1897   sync_env_local.http_manager = &http_manager;
1898   ret = crs.run(new RGWReadSyncStatusCoroutine(&sync_env_local, sync_status));
1899   http_manager.stop();
1900   return ret;
1901 }
1902
1903 int RGWRemoteMetaLog::init_sync_status()
1904 {
1905   if (store->is_meta_master()) {
1906     return 0;
1907   }
1908
1909   rgw_mdlog_info mdlog_info;
1910   int r = read_log_info(&mdlog_info);
1911   if (r < 0) {
1912     lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
1913     return r;
1914   }
1915
1916   rgw_meta_sync_info sync_info;
1917   sync_info.num_shards = mdlog_info.num_shards;
1918   auto cursor = store->period_history->get_current();
1919   if (cursor) {
1920     sync_info.period = cursor.get_period().get_id();
1921     sync_info.realm_epoch = cursor.get_epoch();
1922   }
1923
1924   return run(new RGWInitSyncStatusCoroutine(&sync_env, sync_info));
1925 }
1926
1927 int RGWRemoteMetaLog::store_sync_info(const rgw_meta_sync_info& sync_info)
1928 {
1929   return run(new RGWSimpleRadosWriteCR<rgw_meta_sync_info>(async_rados, store,
1930                                                            rgw_raw_obj(store->get_zone_params().log_pool, sync_env.status_oid()),
1931                                                            sync_info));
1932 }
1933
1934 // return a cursor to the period at our sync position
1935 static RGWPeriodHistory::Cursor get_period_at(RGWRados* store,
1936                                               const rgw_meta_sync_info& info)
1937 {
1938   if (info.period.empty()) {
1939     // return an empty cursor with error=0
1940     return RGWPeriodHistory::Cursor{};
1941   }
1942
1943   // look for an existing period in our history
1944   auto cursor = store->period_history->lookup(info.realm_epoch);
1945   if (cursor) {
1946     // verify that the period ids match
1947     auto& existing = cursor.get_period().get_id();
1948     if (existing != info.period) {
1949       lderr(store->ctx()) << "ERROR: sync status period=" << info.period
1950           << " does not match period=" << existing
1951           << " in history at realm epoch=" << info.realm_epoch << dendl;
1952       return RGWPeriodHistory::Cursor{-EEXIST};
1953     }
1954     return cursor;
1955   }
1956
1957   // read the period from rados or pull it from the master
1958   RGWPeriod period;
1959   int r = store->period_puller->pull(info.period, period);
1960   if (r < 0) {
1961     lderr(store->ctx()) << "ERROR: failed to read period id "
1962         << info.period << ": " << cpp_strerror(r) << dendl;
1963     return RGWPeriodHistory::Cursor{r};
1964   }
1965   // attach the period to our history
1966   cursor = store->period_history->attach(std::move(period));
1967   if (!cursor) {
1968     r = cursor.get_error();
1969     lderr(store->ctx()) << "ERROR: failed to read period history back to "
1970         << info.period << ": " << cpp_strerror(r) << dendl;
1971   }
1972   return cursor;
1973 }
1974
1975 int RGWRemoteMetaLog::run_sync()
1976 {
1977   if (store->is_meta_master()) {
1978     return 0;
1979   }
1980
1981   int r = 0;
1982
1983   // get shard count and oldest log period from master
1984   rgw_mdlog_info mdlog_info;
1985   for (;;) {
1986     if (going_down) {
1987       ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
1988       return 0;
1989     }
1990     r = read_log_info(&mdlog_info);
1991     if (r == -EIO || r == -ENOENT) {
1992       // keep retrying if master isn't alive or hasn't initialized the log
1993       ldout(store->ctx(), 10) << __func__ << "(): waiting for master.." << dendl;
1994       backoff.backoff_sleep();
1995       continue;
1996     }
1997     backoff.reset();
1998     if (r < 0) {
1999       lderr(store->ctx()) << "ERROR: fail to fetch master log info (r=" << r << ")" << dendl;
2000       return r;
2001     }
2002     break;
2003   }
2004
2005   rgw_meta_sync_status sync_status;
2006   do {
2007     if (going_down) {
2008       ldout(store->ctx(), 1) << __func__ << "(): going down" << dendl;
2009       return 0;
2010     }
2011     r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
2012     if (r < 0 && r != -ENOENT) {
2013       ldout(store->ctx(), 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
2014       return r;
2015     }
2016
2017     if (!mdlog_info.period.empty()) {
2018       // restart sync if the remote has a period, but:
2019       // a) our status does not, or
2020       // b) our sync period comes before the remote's oldest log period
2021       if (sync_status.sync_info.period.empty() ||
2022           sync_status.sync_info.realm_epoch < mdlog_info.realm_epoch) {
2023         sync_status.sync_info.state = rgw_meta_sync_info::StateInit;
2024         ldout(store->ctx(), 1) << "epoch=" << sync_status.sync_info.realm_epoch
2025            << " in sync status comes before remote's oldest mdlog epoch="
2026            << mdlog_info.realm_epoch << ", restarting sync" << dendl;
2027       }
2028     }
2029
2030     if (sync_status.sync_info.state == rgw_meta_sync_info::StateInit) {
2031       ldout(store->ctx(), 20) << __func__ << "(): init" << dendl;
2032       sync_status.sync_info.num_shards = mdlog_info.num_shards;
2033       auto cursor = store->period_history->get_current();
2034       if (cursor) {
2035         // run full sync, then start incremental from the current period/epoch
2036         sync_status.sync_info.period = cursor.get_period().get_id();
2037         sync_status.sync_info.realm_epoch = cursor.get_epoch();
2038       }
2039       r = run(new RGWInitSyncStatusCoroutine(&sync_env, sync_status.sync_info));
2040       if (r == -EBUSY) {
2041         backoff.backoff_sleep();
2042         continue;
2043       }
2044       backoff.reset();
2045       if (r < 0) {
2046         ldout(store->ctx(), 0) << "ERROR: failed to init sync status r=" << r << dendl;
2047         return r;
2048       }
2049     }
2050   } while (sync_status.sync_info.state == rgw_meta_sync_info::StateInit);
2051
2052   auto num_shards = sync_status.sync_info.num_shards;
2053   if (num_shards != mdlog_info.num_shards) {
2054     lderr(store->ctx()) << "ERROR: can't sync, mismatch between num shards, master num_shards=" << mdlog_info.num_shards << " local num_shards=" << num_shards << dendl;
2055     return -EINVAL;
2056   }
2057
2058   RGWPeriodHistory::Cursor cursor;
2059   do {
2060     r = run(new RGWReadSyncStatusCoroutine(&sync_env, &sync_status));
2061     if (r < 0 && r != -ENOENT) {
2062       ldout(store->ctx(), 0) << "ERROR: failed to fetch sync status r=" << r << dendl;
2063       return r;
2064     }
2065
2066     switch ((rgw_meta_sync_info::SyncState)sync_status.sync_info.state) {
2067       case rgw_meta_sync_info::StateBuildingFullSyncMaps:
2068         ldout(store->ctx(), 20) << __func__ << "(): building full sync maps" << dendl;
2069         r = run(new RGWFetchAllMetaCR(&sync_env, num_shards, sync_status.sync_markers));
2070         if (r == -EBUSY || r == -EAGAIN) {
2071           backoff.backoff_sleep();
2072           continue;
2073         }
2074         backoff.reset();
2075         if (r < 0) {
2076           ldout(store->ctx(), 0) << "ERROR: failed to fetch all metadata keys" << dendl;
2077           return r;
2078         }
2079
2080         sync_status.sync_info.state = rgw_meta_sync_info::StateSync;
2081         r = store_sync_info(sync_status.sync_info);
2082         if (r < 0) {
2083           ldout(store->ctx(), 0) << "ERROR: failed to update sync status" << dendl;
2084           return r;
2085         }
2086         /* fall through */
2087       case rgw_meta_sync_info::StateSync:
2088         ldout(store->ctx(), 20) << __func__ << "(): sync" << dendl;
2089         // find our position in the period history (if any)
2090         cursor = get_period_at(store, sync_status.sync_info);
2091         r = cursor.get_error();
2092         if (r < 0) {
2093           return r;
2094         }
2095         meta_sync_cr = new RGWMetaSyncCR(&sync_env, cursor, sync_status);
2096         r = run(meta_sync_cr);
2097         if (r < 0) {
2098           ldout(store->ctx(), 0) << "ERROR: failed to fetch all metadata keys" << dendl;
2099           return r;
2100         }
2101         break;
2102       default:
2103         ldout(store->ctx(), 0) << "ERROR: bad sync state!" << dendl;
2104         return -EIO;
2105     }
2106   } while (!going_down);
2107
2108   return 0;
2109 }
2110
2111 void RGWRemoteMetaLog::wakeup(int shard_id)
2112 {
2113   if (!meta_sync_cr) {
2114     return;
2115   }
2116   meta_sync_cr->wakeup(shard_id);
2117 }
2118
2119 int RGWCloneMetaLogCoroutine::operate()
2120 {
2121   reenter(this) {
2122     do {
2123       yield {
2124         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": init request" << dendl;
2125         return state_init();
2126       }
2127       yield {
2128         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status" << dendl;
2129         return state_read_shard_status();
2130       }
2131       yield {
2132         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": reading shard status complete" << dendl;
2133         return state_read_shard_status_complete();
2134       }
2135       yield {
2136         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": sending rest request" << dendl;
2137         return state_send_rest_request();
2138       }
2139       yield {
2140         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": receiving rest response" << dendl;
2141         return state_receive_rest_response();
2142       }
2143       yield {
2144         ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries" << dendl;
2145         return state_store_mdlog_entries();
2146       }
2147     } while (truncated);
2148     yield {
2149       ldout(cct, 20) << __func__ << ": shard_id=" << shard_id << ": storing mdlog entries complete" << dendl;
2150       return state_store_mdlog_entries_complete();
2151     }
2152   }
2153
2154   return 0;
2155 }
2156
2157 int RGWCloneMetaLogCoroutine::state_init()
2158 {
2159   data = rgw_mdlog_shard_data();
2160
2161   return 0;
2162 }
2163
2164 int RGWCloneMetaLogCoroutine::state_read_shard_status()
2165 {
2166   const bool add_ref = false; // default constructs with refs=1
2167
2168   completion.reset(new RGWMetadataLogInfoCompletion(
2169     [this](int ret, const cls_log_header& header) {
2170       if (ret < 0) {
2171         ldout(cct, 1) << "ERROR: failed to read mdlog info with "
2172             << cpp_strerror(ret) << dendl;
2173       } else {
2174         shard_info.marker = header.max_marker;
2175         shard_info.last_update = header.max_time.to_real_time();
2176       }
2177       // wake up parent stack
2178       stack->get_completion_mgr()->complete(nullptr, stack);
2179     }), add_ref);
2180
2181   int ret = mdlog->get_info_async(shard_id, completion.get());
2182   if (ret < 0) {
2183     ldout(cct, 0) << "ERROR: mdlog->get_info_async() returned ret=" << ret << dendl;
2184     return set_cr_error(ret);
2185   }
2186
2187   return io_block(0);
2188 }
2189
2190 int RGWCloneMetaLogCoroutine::state_read_shard_status_complete()
2191 {
2192   completion.reset();
2193
2194   ldout(cct, 20) << "shard_id=" << shard_id << " marker=" << shard_info.marker << " last_update=" << shard_info.last_update << dendl;
2195
2196   marker = shard_info.marker;
2197
2198   return 0;
2199 }
2200
2201 int RGWCloneMetaLogCoroutine::state_send_rest_request()
2202 {
2203   RGWRESTConn *conn = sync_env->conn;
2204
2205   char buf[32];
2206   snprintf(buf, sizeof(buf), "%d", shard_id);
2207
2208   char max_entries_buf[32];
2209   snprintf(max_entries_buf, sizeof(max_entries_buf), "%d", max_entries);
2210
2211   const char *marker_key = (marker.empty() ? "" : "marker");
2212
2213   rgw_http_param_pair pairs[] = { { "type", "metadata" },
2214                                   { "id", buf },
2215                                   { "period", period.c_str() },
2216                                   { "max-entries", max_entries_buf },
2217                                   { marker_key, marker.c_str() },
2218                                   { NULL, NULL } };
2219
2220   http_op = new RGWRESTReadResource(conn, "/admin/log", pairs, NULL, sync_env->http_manager);
2221
2222   http_op->set_user_info((void *)stack);
2223
2224   int ret = http_op->aio_read();
2225   if (ret < 0) {
2226     ldout(cct, 0) << "ERROR: failed to fetch mdlog data" << dendl;
2227     log_error() << "failed to send http operation: " << http_op->to_str() << " ret=" << ret << std::endl;
2228     http_op->put();
2229     http_op = NULL;
2230     return ret;
2231   }
2232
2233   return io_block(0);
2234 }
2235
2236 int RGWCloneMetaLogCoroutine::state_receive_rest_response()
2237 {
2238   int ret = http_op->wait(&data);
2239   if (ret < 0) {
2240     error_stream << "http operation failed: " << http_op->to_str() << " status=" << http_op->get_http_status() << std::endl;
2241     ldout(cct, 5) << "failed to wait for op, ret=" << ret << dendl;
2242     http_op->put();
2243     http_op = NULL;
2244     return set_cr_error(ret);
2245   }
2246   http_op->put();
2247   http_op = NULL;
2248
2249   ldout(cct, 20) << "remote mdlog, shard_id=" << shard_id << " num of shard entries: " << data.entries.size() << dendl;
2250
2251   truncated = ((int)data.entries.size() == max_entries);
2252
2253   if (data.entries.empty()) {
2254     if (new_marker) {
2255       *new_marker = marker;
2256     }
2257     return set_cr_done();
2258   }
2259
2260   if (new_marker) {
2261     *new_marker = data.entries.back().id;
2262   }
2263
2264   return 0;
2265 }
2266
2267
2268 int RGWCloneMetaLogCoroutine::state_store_mdlog_entries()
2269 {
2270   list<cls_log_entry> dest_entries;
2271
2272   vector<rgw_mdlog_entry>::iterator iter;
2273   for (iter = data.entries.begin(); iter != data.entries.end(); ++iter) {
2274     rgw_mdlog_entry& entry = *iter;
2275     ldout(cct, 20) << "entry: name=" << entry.name << dendl;
2276
2277     cls_log_entry dest_entry;
2278     dest_entry.id = entry.id;
2279     dest_entry.section = entry.section;
2280     dest_entry.name = entry.name;
2281     dest_entry.timestamp = utime_t(entry.timestamp);
2282
2283     ::encode(entry.log_data, dest_entry.data);
2284
2285     dest_entries.push_back(dest_entry);
2286
2287     marker = entry.id;
2288   }
2289
2290   RGWAioCompletionNotifier *cn = stack->create_completion_notifier();
2291
2292   int ret = mdlog->store_entries_in_shard(dest_entries, shard_id, cn->completion());
2293   if (ret < 0) {
2294     cn->put();
2295     ldout(cct, 10) << "failed to store md log entries shard_id=" << shard_id << " ret=" << ret << dendl;
2296     return set_cr_error(ret);
2297   }
2298   return io_block(0);
2299 }
2300
2301 int RGWCloneMetaLogCoroutine::state_store_mdlog_entries_complete()
2302 {
2303   return set_cr_done();
2304 }
2305
2306
2307 // TODO: move into rgw_sync_trim.cc
2308 #undef dout_prefix
2309 #define dout_prefix (*_dout << "meta trim: ")
2310
2311 /// purge all log shards for the given mdlog
2312 class PurgeLogShardsCR : public RGWShardCollectCR {
2313   RGWRados *const store;
2314   const RGWMetadataLog* mdlog;
2315   const int num_shards;
2316   rgw_raw_obj obj;
2317   int i{0};
2318
2319   static constexpr int max_concurrent = 16;
2320
2321  public:
2322   PurgeLogShardsCR(RGWRados *store, const RGWMetadataLog* mdlog,
2323                    const rgw_pool& pool, int num_shards)
2324     : RGWShardCollectCR(store->ctx(), max_concurrent),
2325       store(store), mdlog(mdlog), num_shards(num_shards), obj(pool, "")
2326   {}
2327
2328   bool spawn_next() override {
2329     if (i == num_shards) {
2330       return false;
2331     }
2332     mdlog->get_shard_oid(i++, obj.oid);
2333     spawn(new RGWRadosRemoveCR(store, obj), false);
2334     return true;
2335   }
2336 };
2337
2338 using Cursor = RGWPeriodHistory::Cursor;
2339
2340 /// purge mdlogs from the oldest up to (but not including) the given realm_epoch
2341 class PurgePeriodLogsCR : public RGWCoroutine {
2342   RGWRados *const store;
2343   RGWMetadataManager *const metadata;
2344   RGWObjVersionTracker objv;
2345   Cursor cursor;
2346   epoch_t realm_epoch;
2347   epoch_t *last_trim_epoch; //< update last trim on success
2348
2349  public:
2350   PurgePeriodLogsCR(RGWRados *store, epoch_t realm_epoch, epoch_t *last_trim)
2351     : RGWCoroutine(store->ctx()), store(store), metadata(store->meta_mgr),
2352       realm_epoch(realm_epoch), last_trim_epoch(last_trim)
2353   {}
2354
2355   int operate();
2356 };
2357
2358 int PurgePeriodLogsCR::operate()
2359 {
2360   reenter(this) {
2361     // read our current oldest log period
2362     yield call(metadata->read_oldest_log_period_cr(&cursor, &objv));
2363     if (retcode < 0) {
2364       return set_cr_error(retcode);
2365     }
2366     assert(cursor);
2367     ldout(cct, 20) << "oldest log realm_epoch=" << cursor.get_epoch()
2368         << " period=" << cursor.get_period().get_id() << dendl;
2369
2370     // trim -up to- the given realm_epoch
2371     while (cursor.get_epoch() < realm_epoch) {
2372       ldout(cct, 4) << "purging log shards for realm_epoch=" << cursor.get_epoch()
2373           << " period=" << cursor.get_period().get_id() << dendl;
2374       yield {
2375         const auto mdlog = metadata->get_log(cursor.get_period().get_id());
2376         const auto& pool = store->get_zone_params().log_pool;
2377         auto num_shards = cct->_conf->rgw_md_log_max_shards;
2378         call(new PurgeLogShardsCR(store, mdlog, pool, num_shards));
2379       }
2380       if (retcode < 0) {
2381         ldout(cct, 1) << "failed to remove log shards: "
2382             << cpp_strerror(retcode) << dendl;
2383         return set_cr_error(retcode);
2384       }
2385       ldout(cct, 10) << "removed log shards for realm_epoch=" << cursor.get_epoch()
2386           << " period=" << cursor.get_period().get_id() << dendl;
2387
2388       // update our mdlog history
2389       yield call(metadata->trim_log_period_cr(cursor, &objv));
2390       if (retcode == -ENOENT) {
2391         // must have raced to update mdlog history. return success and allow the
2392         // winner to continue purging
2393         ldout(cct, 10) << "already removed log shards for realm_epoch=" << cursor.get_epoch()
2394             << " period=" << cursor.get_period().get_id() << dendl;
2395         return set_cr_done();
2396       } else if (retcode < 0) {
2397         ldout(cct, 1) << "failed to remove log shards for realm_epoch="
2398             << cursor.get_epoch() << " period=" << cursor.get_period().get_id()
2399             << " with: " << cpp_strerror(retcode) << dendl;
2400         return set_cr_error(retcode);
2401       }
2402
2403       if (*last_trim_epoch < cursor.get_epoch()) {
2404         *last_trim_epoch = cursor.get_epoch();
2405       }
2406
2407       assert(cursor.has_next()); // get_current() should always come after
2408       cursor.next();
2409     }
2410     return set_cr_done();
2411   }
2412   return 0;
2413 }
2414
2415 namespace {
2416
2417 using connection_map = std::map<std::string, std::unique_ptr<RGWRESTConn>>;
2418
2419 /// construct a RGWRESTConn for each zone in the realm
2420 template <typename Zonegroups>
2421 connection_map make_peer_connections(RGWRados *store,
2422                                      const Zonegroups& zonegroups)
2423 {
2424   connection_map connections;
2425   for (auto& g : zonegroups) {
2426     for (auto& z : g.second.zones) {
2427       std::unique_ptr<RGWRESTConn> conn{
2428         new RGWRESTConn(store->ctx(), store, z.first, z.second.endpoints)};
2429       connections.emplace(z.first, std::move(conn));
2430     }
2431   }
2432   return connections;
2433 }
2434
2435 /// return the marker that it's safe to trim up to
2436 const std::string& get_stable_marker(const rgw_meta_sync_marker& m)
2437 {
2438   return m.state == m.FullSync ? m.next_step_marker : m.marker;
2439 }
2440
2441 /// comparison operator for take_min_status()
2442 bool operator<(const rgw_meta_sync_marker& lhs, const rgw_meta_sync_marker& rhs)
2443 {
2444   // sort by stable marker
2445   return get_stable_marker(lhs) < get_stable_marker(rhs);
2446 }
2447
2448 /// populate the status with the minimum stable marker of each shard for any
2449 /// peer whose realm_epoch matches the minimum realm_epoch in the input
2450 template <typename Iter>
2451 int take_min_status(CephContext *cct, Iter first, Iter last,
2452                     rgw_meta_sync_status *status)
2453 {
2454   if (first == last) {
2455     return -EINVAL;
2456   }
2457   const size_t num_shards = cct->_conf->rgw_md_log_max_shards;
2458
2459   status->sync_info.realm_epoch = std::numeric_limits<epoch_t>::max();
2460   for (auto p = first; p != last; ++p) {
2461     // validate peer's shard count
2462     if (p->sync_markers.size() != num_shards) {
2463       ldout(cct, 1) << "take_min_status got peer status with "
2464           << p->sync_markers.size() << " shards, expected "
2465           << num_shards << dendl;
2466       return -EINVAL;
2467     }
2468     if (p->sync_info.realm_epoch < status->sync_info.realm_epoch) {
2469       // earlier epoch, take its entire status
2470       *status = std::move(*p);
2471     } else if (p->sync_info.realm_epoch == status->sync_info.realm_epoch) {
2472       // same epoch, take any earlier markers
2473       auto m = status->sync_markers.begin();
2474       for (auto& shard : p->sync_markers) {
2475         if (shard.second < m->second) {
2476           m->second = std::move(shard.second);
2477         }
2478         ++m;
2479       }
2480     }
2481   }
2482   return 0;
2483 }
2484
2485 struct TrimEnv {
2486   RGWRados *const store;
2487   RGWHTTPManager *const http;
2488   int num_shards;
2489   const std::string& zone;
2490   Cursor current; //< cursor to current period
2491   epoch_t last_trim_epoch{0}; //< epoch of last mdlog that was purged
2492
2493   TrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
2494     : store(store), http(http), num_shards(num_shards),
2495       zone(store->get_zone_params().get_id()),
2496       current(store->period_history->get_current())
2497   {}
2498 };
2499
2500 struct MasterTrimEnv : public TrimEnv {
2501   connection_map connections; //< peer connections
2502   std::vector<rgw_meta_sync_status> peer_status; //< sync status for each peer
2503   /// last trim marker for each shard, only applies to current period's mdlog
2504   std::vector<std::string> last_trim_markers;
2505
2506   MasterTrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
2507     : TrimEnv(store, http, num_shards),
2508       last_trim_markers(num_shards)
2509   {
2510     auto& period = current.get_period();
2511     connections = make_peer_connections(store, period.get_map().zonegroups);
2512     connections.erase(zone);
2513     peer_status.resize(connections.size());
2514   }
2515 };
2516
2517 struct PeerTrimEnv : public TrimEnv {
2518   /// last trim timestamp for each shard, only applies to current period's mdlog
2519   std::vector<ceph::real_time> last_trim_timestamps;
2520
2521   PeerTrimEnv(RGWRados *store, RGWHTTPManager *http, int num_shards)
2522     : TrimEnv(store, http, num_shards),
2523       last_trim_timestamps(num_shards)
2524   {}
2525
2526   void set_num_shards(int num_shards) {
2527     this->num_shards = num_shards;
2528     last_trim_timestamps.resize(num_shards);
2529   }
2530 };
2531
2532 } // anonymous namespace
2533
2534
2535 /// spawn a trim cr for each shard that needs it, while limiting the number
2536 /// of concurrent shards
2537 class MetaMasterTrimShardCollectCR : public RGWShardCollectCR {
2538  private:
2539   static constexpr int MAX_CONCURRENT_SHARDS = 16;
2540
2541   MasterTrimEnv& env;
2542   RGWMetadataLog *mdlog;
2543   int shard_id{0};
2544   std::string oid;
2545   const rgw_meta_sync_status& sync_status;
2546
2547  public:
2548   MetaMasterTrimShardCollectCR(MasterTrimEnv& env, RGWMetadataLog *mdlog,
2549                                const rgw_meta_sync_status& sync_status)
2550     : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
2551       env(env), mdlog(mdlog), sync_status(sync_status)
2552   {}
2553
2554   bool spawn_next() override;
2555 };
2556
2557 bool MetaMasterTrimShardCollectCR::spawn_next()
2558 {
2559   while (shard_id < env.num_shards) {
2560     auto m = sync_status.sync_markers.find(shard_id);
2561     if (m == sync_status.sync_markers.end()) {
2562       shard_id++;
2563       continue;
2564     }
2565     auto& stable = get_stable_marker(m->second);
2566     auto& last_trim = env.last_trim_markers[shard_id];
2567
2568     if (stable <= last_trim) {
2569       // already trimmed
2570       ldout(cct, 20) << "skipping log shard " << shard_id
2571           << " at marker=" << stable
2572           << " last_trim=" << last_trim
2573           << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
2574       shard_id++;
2575       continue;
2576     }
2577
2578     mdlog->get_shard_oid(shard_id, oid);
2579
2580     ldout(cct, 10) << "trimming log shard " << shard_id
2581         << " at marker=" << stable
2582         << " last_trim=" << last_trim
2583         << " realm_epoch=" << sync_status.sync_info.realm_epoch << dendl;
2584     spawn(new RGWSyncLogTrimCR(env.store, oid, stable, &last_trim), false);
2585     shard_id++;
2586     return true;
2587   }
2588   return false;
2589 }
2590
2591 /// spawn rest requests to read each peer's sync status
2592 class MetaMasterStatusCollectCR : public RGWShardCollectCR {
2593   static constexpr int MAX_CONCURRENT_SHARDS = 16;
2594
2595   MasterTrimEnv& env;
2596   connection_map::iterator c;
2597   std::vector<rgw_meta_sync_status>::iterator s;
2598  public:
2599   MetaMasterStatusCollectCR(MasterTrimEnv& env)
2600     : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
2601       env(env), c(env.connections.begin()), s(env.peer_status.begin())
2602   {}
2603
2604   bool spawn_next() override {
2605     if (c == env.connections.end()) {
2606       return false;
2607     }
2608     static rgw_http_param_pair params[] = {
2609       { "type", "metadata" },
2610       { "status", nullptr },
2611       { nullptr, nullptr }
2612     };
2613
2614     ldout(cct, 20) << "query sync status from " << c->first << dendl;
2615     auto conn = c->second.get();
2616     using StatusCR = RGWReadRESTResourceCR<rgw_meta_sync_status>;
2617     spawn(new StatusCR(cct, conn, env.http, "/admin/log/", params, &*s),
2618           false);
2619     ++c;
2620     ++s;
2621     return true;
2622   }
2623 };
2624
2625 class MetaMasterTrimCR : public RGWCoroutine {
2626   MasterTrimEnv& env;
2627   rgw_meta_sync_status min_status; //< minimum sync status of all peers
2628   int ret{0};
2629
2630  public:
2631   MetaMasterTrimCR(MasterTrimEnv& env)
2632     : RGWCoroutine(env.store->ctx()), env(env)
2633   {}
2634
2635   int operate();
2636 };
2637
2638 int MetaMasterTrimCR::operate()
2639 {
2640   reenter(this) {
2641     // TODO: detect this and fail before we spawn the trim thread?
2642     if (env.connections.empty()) {
2643       ldout(cct, 4) << "no peers, exiting" << dendl;
2644       return set_cr_done();
2645     }
2646
2647     ldout(cct, 10) << "fetching sync status for zone " << env.zone << dendl;
2648     // query mdlog sync status from peers
2649     yield call(new MetaMasterStatusCollectCR(env));
2650
2651     // must get a successful reply from all peers to consider trimming
2652     if (ret < 0) {
2653       ldout(cct, 4) << "failed to fetch sync status from all peers" << dendl;
2654       return set_cr_error(ret);
2655     }
2656
2657     // determine the minimum epoch and markers
2658     ret = take_min_status(env.store->ctx(), env.peer_status.begin(),
2659                           env.peer_status.end(), &min_status);
2660     if (ret < 0) {
2661       ldout(cct, 4) << "failed to calculate min sync status from peers" << dendl;
2662       return set_cr_error(ret);
2663     }
2664     yield {
2665       auto store = env.store;
2666       auto epoch = min_status.sync_info.realm_epoch;
2667       ldout(cct, 4) << "realm epoch min=" << epoch
2668           << " current=" << env.current.get_epoch()<< dendl;
2669       if (epoch > env.last_trim_epoch + 1) {
2670         // delete any prior mdlog periods
2671         spawn(new PurgePeriodLogsCR(store, epoch, &env.last_trim_epoch), true);
2672       } else {
2673         ldout(cct, 10) << "mdlogs already purged up to realm_epoch "
2674             << env.last_trim_epoch << dendl;
2675       }
2676
2677       // if realm_epoch == current, trim mdlog based on markers
2678       if (epoch == env.current.get_epoch()) {
2679         auto mdlog = store->meta_mgr->get_log(env.current.get_period().get_id());
2680         spawn(new MetaMasterTrimShardCollectCR(env, mdlog, min_status), true);
2681       }
2682     }
2683     // ignore any errors during purge/trim because we want to hold the lock open
2684     return set_cr_done();
2685   }
2686   return 0;
2687 }
2688
2689
2690 /// read the first entry of the master's mdlog shard and trim to that position
2691 class MetaPeerTrimShardCR : public RGWCoroutine {
2692   RGWMetaSyncEnv& env;
2693   RGWMetadataLog *mdlog;
2694   const std::string& period_id;
2695   const int shard_id;
2696   RGWMetadataLogInfo info;
2697   ceph::real_time stable; //< safe timestamp to trim, according to master
2698   ceph::real_time *last_trim; //< last trimmed timestamp, updated on trim
2699   rgw_mdlog_shard_data result; //< result from master's mdlog listing
2700
2701  public:
2702   MetaPeerTrimShardCR(RGWMetaSyncEnv& env, RGWMetadataLog *mdlog,
2703                       const std::string& period_id, int shard_id,
2704                       ceph::real_time *last_trim)
2705     : RGWCoroutine(env.store->ctx()), env(env), mdlog(mdlog),
2706       period_id(period_id), shard_id(shard_id), last_trim(last_trim)
2707   {}
2708
2709   int operate() override;
2710 };
2711
2712 int MetaPeerTrimShardCR::operate()
2713 {
2714   reenter(this) {
2715     // query master's first mdlog entry for this shard
2716     yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
2717                                              "", 1, &result));
2718     if (retcode < 0) {
2719       ldout(cct, 5) << "failed to read first entry from master's mdlog shard "
2720           << shard_id << " for period " << period_id
2721           << ": " << cpp_strerror(retcode) << dendl;
2722       return set_cr_error(retcode);
2723     }
2724     if (result.entries.empty()) {
2725       // if there are no mdlog entries, we don't have a timestamp to compare. we
2726       // can't just trim everything, because there could be racing updates since
2727       // this empty reply. query the mdlog shard info to read its max timestamp,
2728       // then retry the listing to make sure it's still empty before trimming to
2729       // that
2730       ldout(cct, 10) << "empty master mdlog shard " << shard_id
2731           << ", reading last timestamp from shard info" << dendl;
2732       // read the mdlog shard info for the last timestamp
2733       using ShardInfoCR = RGWReadRemoteMDLogShardInfoCR;
2734       yield call(new ShardInfoCR(&env, period_id, shard_id, &info));
2735       if (retcode < 0) {
2736         ldout(cct, 5) << "failed to read info from master's mdlog shard "
2737             << shard_id << " for period " << period_id
2738             << ": " << cpp_strerror(retcode) << dendl;
2739         return set_cr_error(retcode);
2740       }
2741       if (ceph::real_clock::is_zero(info.last_update)) {
2742         return set_cr_done(); // nothing to trim
2743       }
2744       ldout(cct, 10) << "got mdlog shard info with last update="
2745           << info.last_update << dendl;
2746       // re-read the master's first mdlog entry to make sure it hasn't changed
2747       yield call(new RGWListRemoteMDLogShardCR(&env, period_id, shard_id,
2748                                                "", 1, &result));
2749       if (retcode < 0) {
2750         ldout(cct, 5) << "failed to read first entry from master's mdlog shard "
2751             << shard_id << " for period " << period_id
2752             << ": " << cpp_strerror(retcode) << dendl;
2753         return set_cr_error(retcode);
2754       }
2755       // if the mdlog is still empty, trim to max marker
2756       if (result.entries.empty()) {
2757         stable = info.last_update;
2758       } else {
2759         stable = result.entries.front().timestamp;
2760
2761         // can only trim -up to- master's first timestamp, so subtract a second.
2762         // (this is why we use timestamps instead of markers for the peers)
2763         stable -= std::chrono::seconds(1);
2764       }
2765     } else {
2766       stable = result.entries.front().timestamp;
2767       stable -= std::chrono::seconds(1);
2768     }
2769
2770     if (stable <= *last_trim) {
2771       ldout(cct, 10) << "skipping log shard " << shard_id
2772           << " at timestamp=" << stable
2773           << " last_trim=" << *last_trim << dendl;
2774       return set_cr_done();
2775     }
2776
2777     ldout(cct, 10) << "trimming log shard " << shard_id
2778         << " at timestamp=" << stable
2779         << " last_trim=" << *last_trim << dendl;
2780     yield {
2781       std::string oid;
2782       mdlog->get_shard_oid(shard_id, oid);
2783       call(new RGWRadosTimelogTrimCR(env.store, oid, real_time{}, stable, "", ""));
2784     }
2785     if (retcode < 0 && retcode != -ENODATA) {
2786       ldout(cct, 1) << "failed to trim mdlog shard " << shard_id
2787           << ": " << cpp_strerror(retcode) << dendl;
2788       return set_cr_error(retcode);
2789     }
2790     *last_trim = stable;
2791     return set_cr_done();
2792   }
2793   return 0;
2794 }
2795
2796 class MetaPeerTrimShardCollectCR : public RGWShardCollectCR {
2797   static constexpr int MAX_CONCURRENT_SHARDS = 16;
2798
2799   PeerTrimEnv& env;
2800   RGWMetadataLog *mdlog;
2801   const std::string& period_id;
2802   RGWMetaSyncEnv meta_env; //< for RGWListRemoteMDLogShardCR
2803   int shard_id{0};
2804
2805  public:
2806   MetaPeerTrimShardCollectCR(PeerTrimEnv& env, RGWMetadataLog *mdlog)
2807     : RGWShardCollectCR(env.store->ctx(), MAX_CONCURRENT_SHARDS),
2808       env(env), mdlog(mdlog), period_id(env.current.get_period().get_id())
2809   {
2810     meta_env.init(cct, env.store, env.store->rest_master_conn,
2811                   env.store->get_async_rados(), env.http, nullptr);
2812   }
2813
2814   bool spawn_next() override;
2815 };
2816
2817 bool MetaPeerTrimShardCollectCR::spawn_next()
2818 {
2819   if (shard_id >= env.num_shards) {
2820     return false;
2821   }
2822   auto& last_trim = env.last_trim_timestamps[shard_id];
2823   spawn(new MetaPeerTrimShardCR(meta_env, mdlog, period_id, shard_id, &last_trim),
2824         false);
2825   shard_id++;
2826   return true;
2827 }
2828
2829 class MetaPeerTrimCR : public RGWCoroutine {
2830   PeerTrimEnv& env;
2831   rgw_mdlog_info mdlog_info; //< master's mdlog info
2832
2833  public:
2834   MetaPeerTrimCR(PeerTrimEnv& env) : RGWCoroutine(env.store->ctx()), env(env) {}
2835
2836   int operate();
2837 };
2838
2839 int MetaPeerTrimCR::operate()
2840 {
2841   reenter(this) {
2842     ldout(cct, 10) << "fetching master mdlog info" << dendl;
2843     yield {
2844       // query mdlog_info from master for oldest_log_period
2845       rgw_http_param_pair params[] = {
2846         { "type", "metadata" },
2847         { nullptr, nullptr }
2848       };
2849
2850       using LogInfoCR = RGWReadRESTResourceCR<rgw_mdlog_info>;
2851       call(new LogInfoCR(cct, env.store->rest_master_conn, env.http,
2852                          "/admin/log/", params, &mdlog_info));
2853     }
2854     if (retcode < 0) {
2855       ldout(cct, 4) << "failed to read mdlog info from master" << dendl;
2856       return set_cr_error(retcode);
2857     }
2858     // use master's shard count instead
2859     env.set_num_shards(mdlog_info.num_shards);
2860
2861     if (mdlog_info.realm_epoch > env.last_trim_epoch + 1) {
2862       // delete any prior mdlog periods
2863       yield call(new PurgePeriodLogsCR(env.store, mdlog_info.realm_epoch,
2864                                        &env.last_trim_epoch));
2865     } else {
2866       ldout(cct, 10) << "mdlogs already purged through realm_epoch "
2867           << env.last_trim_epoch << dendl;
2868     }
2869
2870     // if realm_epoch == current, trim mdlog based on master's markers
2871     if (mdlog_info.realm_epoch == env.current.get_epoch()) {
2872       yield {
2873         auto meta_mgr = env.store->meta_mgr;
2874         auto mdlog = meta_mgr->get_log(env.current.get_period().get_id());
2875         call(new MetaPeerTrimShardCollectCR(env, mdlog));
2876         // ignore any errors during purge/trim because we want to hold the lock open
2877       }
2878     }
2879     return set_cr_done();
2880   }
2881   return 0;
2882 }
2883
2884 class MetaTrimPollCR : public RGWCoroutine {
2885   RGWRados *const store;
2886   const utime_t interval; //< polling interval
2887   const rgw_raw_obj obj;
2888   const std::string name{"meta_trim"}; //< lock name
2889   const std::string cookie;
2890
2891  protected:
2892   /// allocate the coroutine to run within the lease
2893   virtual RGWCoroutine* alloc_cr() = 0;
2894
2895  public:
2896   MetaTrimPollCR(RGWRados *store, utime_t interval)
2897     : RGWCoroutine(store->ctx()), store(store), interval(interval),
2898       obj(store->get_zone_params().log_pool, RGWMetadataLogHistory::oid),
2899       cookie(RGWSimpleRadosLockCR::gen_random_cookie(cct))
2900   {}
2901
2902   int operate();
2903 };
2904
2905 int MetaTrimPollCR::operate()
2906 {
2907   reenter(this) {
2908     for (;;) {
2909       set_status("sleeping");
2910       wait(interval);
2911
2912       // prevent others from trimming for our entire wait interval
2913       set_status("acquiring trim lock");
2914       yield call(new RGWSimpleRadosLockCR(store->get_async_rados(), store,
2915                                           obj, name, cookie, interval.sec()));
2916       if (retcode < 0) {
2917         ldout(cct, 4) << "failed to lock: " << cpp_strerror(retcode) << dendl;
2918         continue;
2919       }
2920
2921       set_status("trimming");
2922       yield call(alloc_cr());
2923
2924       if (retcode < 0) {
2925         // on errors, unlock so other gateways can try
2926         set_status("unlocking");
2927         yield call(new RGWSimpleRadosUnlockCR(store->get_async_rados(), store,
2928                                               obj, name, cookie));
2929       }
2930     }
2931   }
2932   return 0;
2933 }
2934
2935 class MetaMasterTrimPollCR : public MetaTrimPollCR  {
2936   MasterTrimEnv env; //< trim state to share between calls
2937   RGWCoroutine* alloc_cr() override {
2938     return new MetaMasterTrimCR(env);
2939   }
2940  public:
2941   MetaMasterTrimPollCR(RGWRados *store, RGWHTTPManager *http,
2942                        int num_shards, utime_t interval)
2943     : MetaTrimPollCR(store, interval),
2944       env(store, http, num_shards)
2945   {}
2946 };
2947
2948 class MetaPeerTrimPollCR : public MetaTrimPollCR {
2949   PeerTrimEnv env; //< trim state to share between calls
2950   RGWCoroutine* alloc_cr() override {
2951     return new MetaPeerTrimCR(env);
2952   }
2953  public:
2954   MetaPeerTrimPollCR(RGWRados *store, RGWHTTPManager *http,
2955                      int num_shards, utime_t interval)
2956     : MetaTrimPollCR(store, interval),
2957       env(store, http, num_shards)
2958   {}
2959 };
2960
2961 RGWCoroutine* create_meta_log_trim_cr(RGWRados *store, RGWHTTPManager *http,
2962                                       int num_shards, utime_t interval)
2963 {
2964   if (store->is_meta_master()) {
2965     return new MetaMasterTrimPollCR(store, http, num_shards, interval);
2966   }
2967   return new MetaPeerTrimPollCR(store, http, num_shards, interval);
2968 }
2969
2970
2971 struct MetaMasterAdminTrimCR : private MasterTrimEnv, public MetaMasterTrimCR {
2972   MetaMasterAdminTrimCR(RGWRados *store, RGWHTTPManager *http, int num_shards)
2973     : MasterTrimEnv(store, http, num_shards),
2974       MetaMasterTrimCR(*static_cast<MasterTrimEnv*>(this))
2975   {}
2976 };
2977
2978 struct MetaPeerAdminTrimCR : private PeerTrimEnv, public MetaPeerTrimCR {
2979   MetaPeerAdminTrimCR(RGWRados *store, RGWHTTPManager *http, int num_shards)
2980     : PeerTrimEnv(store, http, num_shards),
2981       MetaPeerTrimCR(*static_cast<PeerTrimEnv*>(this))
2982   {}
2983 };
2984
2985 RGWCoroutine* create_admin_meta_log_trim_cr(RGWRados *store,
2986                                             RGWHTTPManager *http,
2987                                             int num_shards)
2988 {
2989   if (store->is_meta_master()) {
2990     return new MetaMasterAdminTrimCR(store, http, num_shards);
2991   }
2992   return new MetaPeerAdminTrimCR(store, http, num_shards);
2993 }