+bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
+{
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+
+ RWLock::RLocker pg_map_locker{pg_map_lock};
+ if (pg_map.size() < max_pgs_per_osd) {
+ return false;
+ }
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (is_mon_create) {
+ pending_creates_from_mon++;
+ } else {
+ bool is_primary = osdmap->get_pg_acting_rank(pgid.pgid, whoami) == 0;
+ pending_creates_from_osd.emplace(pgid.pgid, is_primary);
+ }
+ dout(5) << __func__ << " withhold creation of pg " << pgid
+ << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
+ return true;
+}
+
+// to re-trigger a peering, we have to twiddle the pg mapping a little bit,
+// see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
+// to up set if pg_temp is empty. so an empty pg_temp won't work.
+static vector<int32_t> twiddle(const vector<int>& acting) {
+ if (acting.size() > 1) {
+ return {acting[0]};
+ } else {
+ vector<int32_t> twiddled(acting.begin(), acting.end());
+ twiddled.push_back(-1);
+ return twiddled;
+ }
+}
+
+void OSD::resume_creating_pg()
+{
+ bool do_sub_pg_creates = false;
+ bool have_pending_creates = false;
+ {
+ const auto max_pgs_per_osd =
+ (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
+ cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
+ RWLock::RLocker l(pg_map_lock);
+ if (max_pgs_per_osd <= pg_map.size()) {
+ // this could happen if admin decreases this setting before a PG is removed
+ return;
+ }
+ unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
+ lock_guard<mutex> pending_creates_locker{pending_creates_lock};
+ if (pending_creates_from_mon > 0) {
+ do_sub_pg_creates = true;
+ if (pending_creates_from_mon >= spare_pgs) {
+ spare_pgs = pending_creates_from_mon = 0;
+ } else {
+ spare_pgs -= pending_creates_from_mon;
+ pending_creates_from_mon = 0;
+ }
+ }
+ auto pg = pending_creates_from_osd.cbegin();
+ while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
+ dout(20) << __func__ << " pg " << pg->first << dendl;
+ vector<int> acting;
+ osdmap->pg_to_up_acting_osds(pg->first, nullptr, nullptr, &acting, nullptr);
+ service.queue_want_pg_temp(pg->first, twiddle(acting), true);
+ pg = pending_creates_from_osd.erase(pg);
+ do_sub_pg_creates = true;
+ spare_pgs--;
+ }
+ have_pending_creates = (pending_creates_from_mon > 0 ||
+ !pending_creates_from_osd.empty());
+ }
+
+ bool do_renew_subs = false;
+ if (do_sub_pg_creates) {
+ if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
+ dout(4) << __func__ << ": resolicit pg creates from mon since "
+ << last_pg_create_epoch << dendl;
+ do_renew_subs = true;
+ }
+ }
+ version_t start = osdmap->get_epoch() + 1;
+ if (have_pending_creates) {
+ // don't miss any new osdmap deleting PGs
+ if (monc->sub_want("osdmap", start, 0)) {
+ dout(4) << __func__ << ": resolicit osdmap from mon since "
+ << start << dendl;
+ do_renew_subs = true;
+ }
+ } else if (do_sub_pg_creates) {
+ // no need to subscribe the osdmap continuously anymore
+ // once the pgtemp and/or mon_subscribe(pg_creates) is sent
+ if (monc->sub_want_increment("osdmap", start, CEPH_SUBSCRIBE_ONETIME)) {
+ dout(4) << __func__ << ": re-subscribe osdmap(onetime) since"
+ << start << dendl;
+ do_renew_subs = true;
+ }
+ }
+
+ if (do_renew_subs) {
+ monc->renew_subs();
+ }
+
+ service.send_pg_temp();
+}