/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright 2019 ScyllaDB
 */

#include <boost/container/small_vector.hpp>
#include <boost/intrusive/parent_from_member.hpp>
#include <seastar/core/fair_queue.hh>
#include <seastar/core/future.hh>
#include <seastar/core/shared_ptr.hh>
#include <seastar/core/circular_buffer.hh>
#include <seastar/util/noncopyable_function.hh>
#include <seastar/core/reactor.hh>
#include <seastar/core/metrics.hh>
#include <queue>
#include <chrono>
#include <unordered_set>

#include "fmt/format.h"
#include "fmt/ostream.h"

namespace seastar {

static_assert(sizeof(fair_queue_ticket) == sizeof(uint64_t), "unexpected fair_queue_ticket size");
static_assert(sizeof(fair_queue_entry) <= 3 * sizeof(void*), "unexpected fair_queue_entry::_hook size");
static_assert(sizeof(fair_queue_entry::container_list_t) == 2 * sizeof(void*), "unexpected priority_class::_queue size");

fair_queue_ticket::fair_queue_ticket(uint32_t weight, uint32_t size) noexcept
    : _weight(weight)
    , _size(size)
{}

float fair_queue_ticket::normalize(fair_queue_ticket denominator) const noexcept {
    return float(_weight) / denominator._weight + float(_size) / denominator._size;
}

fair_queue_ticket fair_queue_ticket::operator+(fair_queue_ticket desc) const noexcept {
    return fair_queue_ticket(_weight + desc._weight, _size + desc._size);
}

fair_queue_ticket& fair_queue_ticket::operator+=(fair_queue_ticket desc) noexcept {
    _weight += desc._weight;
    _size += desc._size;
    return *this;
}

fair_queue_ticket fair_queue_ticket::operator-(fair_queue_ticket desc) const noexcept {
    return fair_queue_ticket(_weight - desc._weight, _size - desc._size);
}

fair_queue_ticket& fair_queue_ticket::operator-=(fair_queue_ticket desc) noexcept {
    _weight -= desc._weight;
    _size -= desc._size;
    return *this;
}

fair_queue_ticket::operator bool() const noexcept {
    return (_weight > 0) || (_size > 0);
}

bool fair_queue_ticket::is_non_zero() const noexcept {
    return (_weight > 0) && (_size > 0);
}

bool fair_queue_ticket::operator==(const fair_queue_ticket& o) const noexcept {
    return _weight == o._weight && _size == o._size;
}

std::ostream& operator<<(std::ostream& os, fair_queue_ticket t) {
    return os << t._weight << ":" << t._size;
}

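// Per-component difference that saturates at zero rather than letting the
// unsigned subtraction wrap around.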
fair_queue_ticket wrapping_difference(const fair_queue_ticket& a, const fair_queue_ticket& b) noexcept {
    return fair_queue_ticket(std::max<int32_t>(a._weight - b._weight, 0),
            std::max<int32_t>(a._size - b._size, 0));
}

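// fair_group is the arbiter shared by the per-shard fair_queues of one device.
// Rough summary of the math below (the authoritative definitions live in
// fair_queue.hh): _cost_capacity scales cfg.{weight,size}_rate down to the
// token bucket's rate resolution, and ticket_capacity() converts a request
// ticket into fixed-point capacity tokens via fair_queue_ticket::normalize(),
// i.e. roughly (weight / weight_rate + size / size_rate) * fixed_point_factor.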
fair_group::fair_group(config cfg)
    : _cost_capacity(cfg.weight_rate / token_bucket_t::rate_cast(std::chrono::seconds(1)).count(), cfg.size_rate / token_bucket_t::rate_cast(std::chrono::seconds(1)).count())
    , _token_bucket(cfg.rate_factor * fixed_point_factor,
                    std::max<capacity_t>(cfg.rate_factor * fixed_point_factor * token_bucket_t::rate_cast(cfg.rate_limit_duration).count(), ticket_capacity(fair_queue_ticket(cfg.limit_min_weight, cfg.limit_min_size))),
                    ticket_capacity(fair_queue_ticket(cfg.min_weight, cfg.min_size))
      )
{
    assert(_cost_capacity.is_non_zero());
    seastar_logger.info("Created fair group {}, capacity rate {}, limit {}, rate {} (factor {}), threshold {}", cfg.label,
            _cost_capacity, _token_bucket.limit(), _token_bucket.rate(), cfg.rate_factor, _token_bucket.threshold());

    if (cfg.rate_factor * fixed_point_factor > _token_bucket.max_rate) {
        throw std::runtime_error("Fair-group rate_factor is too large");
    }

    if (ticket_capacity(fair_queue_ticket(cfg.min_weight, cfg.min_size)) > _token_bucket.threshold()) {
        throw std::runtime_error("Fair-group replenisher limit is lower than threshold");
    }
}

auto fair_group::grab_capacity(capacity_t cap) noexcept -> capacity_t {
    assert(cap <= _token_bucket.limit());
    return _token_bucket.grab(cap);
}

void fair_group::release_capacity(capacity_t cap) noexcept {
    _token_bucket.release(cap);
}

void fair_group::replenish_capacity(clock_type::time_point now) noexcept {
    _token_bucket.replenish(now);
}

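// Replenish the shared token bucket using this queue's local timestamp, but
// only once at least threshold() worth of tokens has accumulated, so that
// frequent polls don't touch the shared bucket for negligible amounts.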
void fair_group::maybe_replenish_capacity(clock_type::time_point& local_ts) noexcept {
    auto now = clock_type::now();
    auto extra = _token_bucket.accumulated_in(now - local_ts);

    if (extra >= _token_bucket.threshold()) {
        local_ts = now;
        replenish_capacity(now);
    }
}

auto fair_group::capacity_deficiency(capacity_t from) const noexcept -> capacity_t {
    return _token_bucket.deficiency(from);
}

auto fair_group::ticket_capacity(fair_queue_ticket t) const noexcept -> capacity_t {
    return t.normalize(_cost_capacity) * fixed_point_factor;
}

// Priority class, to be used with a given fair_queue
class fair_queue::priority_class_data {
    friend class fair_queue;
    uint32_t _shares = 0;
    capacity_t _accumulated = 0;
    capacity_t _pure_accumulated = 0;
    fair_queue_entry::container_list_t _queue;
    bool _queued = false;
    bool _plugged = true;

public:
    explicit priority_class_data(uint32_t shares) noexcept : _shares(std::max(shares, 1u)) {}
    priority_class_data(const priority_class_data&) = delete;
    priority_class_data(priority_class_data&&) = delete;

    void update_shares(uint32_t shares) noexcept {
        _shares = (std::max(shares, 1u));
    }
};

bool fair_queue::class_compare::operator() (const priority_class_ptr& lhs, const priority_class_ptr& rhs) const noexcept {
    return lhs->_accumulated > rhs->_accumulated;
}

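// Rough usage sketch (illustrative only -- in practice the reactor's io_queue
// drives this, and the exact fair_queue_entry construction lives in
// fair_queue.hh):
//
//     fair_queue fq(group, std::move(fq_cfg));
//     fq.register_priority_class(cid, /* shares = */ 100);
//     fq.queue(cid, entry);                       // entry carries a fair_queue_ticket
//     fq.dispatch_requests([] (fair_queue_entry& e) { /* submit e to the device */ });
//     ...
//     fq.notify_request_finished(ticket);         // give the capacity back on completion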
fair_queue::fair_queue(fair_group& group, config cfg)
    : _config(std::move(cfg))
    , _group(group)
    , _group_replenish(clock_type::now())
{
}

fair_queue::fair_queue(fair_queue&& other)
    : _config(std::move(other._config))
    , _group(other._group)
    , _group_replenish(std::move(other._group_replenish))
    , _resources_executing(std::exchange(other._resources_executing, fair_queue_ticket{}))
    , _resources_queued(std::exchange(other._resources_queued, fair_queue_ticket{}))
    , _requests_executing(std::exchange(other._requests_executing, 0))
    , _requests_queued(std::exchange(other._requests_queued, 0))
    , _handles(std::move(other._handles))
    , _priority_classes(std::move(other._priority_classes))
    , _last_accumulated(other._last_accumulated)
{
}

fair_queue::~fair_queue() {
    for (const auto& fq : _priority_classes) {
        assert(!fq);
    }
}

void fair_queue::push_priority_class(priority_class_data& pc) noexcept {
    assert(pc._plugged && !pc._queued);
    _handles.assert_enough_capacity();
    _handles.push(&pc);
    pc._queued = true;
}

void fair_queue::push_priority_class_from_idle(priority_class_data& pc) noexcept {
    if (!pc._queued) {
        // Don't let the newcomer monopolize the disk for more than tau
        // duration. For this, estimate how many capacity units can be
        // accumulated with the current class shares per rate resolution
        // and scale it up to tau.
        capacity_t max_deviation = fair_group::fixed_point_factor / pc._shares * fair_group::token_bucket_t::rate_cast(_config.tau).count();
        // On start this deviation can go negative, so rather than introduce
        // an extra branch for that short corner case, use signed arithmetic
        // and make sure the _accumulated value doesn't grow past the signed
        // maximum (see the overflow check in dispatch_requests()).
        pc._accumulated = std::max<signed_capacity_t>(_last_accumulated - max_deviation, pc._accumulated);
        _handles.assert_enough_capacity();
        _handles.push(&pc);
        pc._queued = true;
    }
}

void fair_queue::pop_priority_class(priority_class_data& pc) noexcept {
    assert(pc._plugged && pc._queued);
    pc._queued = false;
    _handles.pop();
}

void fair_queue::plug_priority_class(priority_class_data& pc) noexcept {
    assert(!pc._plugged && !pc._queued);
    pc._plugged = true;
    if (!pc._queue.empty()) {
        push_priority_class_from_idle(pc);
    }
}

void fair_queue::plug_class(class_id cid) noexcept {
    plug_priority_class(*_priority_classes[cid]);
}

void fair_queue::unplug_priority_class(priority_class_data& pc) noexcept {
    assert(pc._plugged);
    if (pc._queued) {
        pop_priority_class(pc);
    }
    pc._plugged = false;
}

void fair_queue::unplug_class(class_id cid) noexcept {
    unplug_priority_class(*_priority_classes[cid]);
}

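// Grabbing capacity for the entry at the head of a class has three outcomes:
//  - grabbed:      the reservation (fresh, or a pending one being reused) covers
//                  this entry, so it can be dispatched now
//  - pending:      the group bucket is short of tokens; remember the bucket
//                  position we are waiting for and retry later
//  - cant_preempt: an earlier, smaller reservation is still pending, so this
//                  entry steps aside for now (see dispatch_requests())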
auto fair_queue::grab_pending_capacity(const fair_queue_entry& ent) noexcept -> grab_result {
    _group.maybe_replenish_capacity(_group_replenish);

    if (_group.capacity_deficiency(_pending->head)) {
        return grab_result::pending;
    }

    capacity_t cap = _group.ticket_capacity(ent._ticket);
    if (cap > _pending->cap) {
        return grab_result::cant_preempt;
    }

    if (cap < _pending->cap) {
        _group.release_capacity(_pending->cap - cap); // FIXME -- replenish right at once?
    }

    _pending.reset();
    return grab_result::grabbed;
}

auto fair_queue::grab_capacity(const fair_queue_entry& ent) noexcept -> grab_result {
    if (_pending) {
        return grab_pending_capacity(ent);
    }

    capacity_t cap = _group.ticket_capacity(ent._ticket);
    capacity_t want_head = _group.grab_capacity(cap);
    if (_group.capacity_deficiency(want_head)) {
        _pending.emplace(want_head, cap);
        return grab_result::pending;
    }

    return grab_result::grabbed;
}

void fair_queue::register_priority_class(class_id id, uint32_t shares) {
    if (id >= _priority_classes.size()) {
        _priority_classes.resize(id + 1);
    } else {
        assert(!_priority_classes[id]);
    }

    _handles.reserve(_nr_classes + 1);
    _priority_classes[id] = std::make_unique<priority_class_data>(shares);
    _nr_classes++;
}

void fair_queue::unregister_priority_class(class_id id) {
    auto& pclass = _priority_classes[id];
    assert(pclass && pclass->_queue.empty());
    pclass.reset();
    _nr_classes--;
}

void fair_queue::update_shares_for_class(class_id id, uint32_t shares) {
    assert(id < _priority_classes.size());
    auto& pc = _priority_classes[id];
    assert(pc);
    pc->update_shares(shares);
}

size_t fair_queue::waiters() const {
    return _requests_queued;
}

size_t fair_queue::requests_currently_executing() const {
    return _requests_executing;
}

fair_queue_ticket fair_queue::resources_currently_waiting() const {
    return _resources_queued;
}

fair_queue_ticket fair_queue::resources_currently_executing() const {
    return _resources_executing;
}

void fair_queue::queue(class_id id, fair_queue_entry& ent) noexcept {
    priority_class_data& pc = *_priority_classes[id];
    // We need to return a future in this function on which the caller can wait.
    // Since we don't know which queue will execute the next request - ours or
    // someone else's - we need a separate promise at this point.
    if (pc._plugged) {
        push_priority_class_from_idle(pc);
    }
    pc._queue.push_back(ent);
    _resources_queued += ent._ticket;
    _requests_queued++;
}

void fair_queue::notify_request_finished(fair_queue_ticket desc) noexcept {
    _resources_executing -= desc;
    _requests_executing--;
    _group.release_capacity(_group.ticket_capacity(desc));
}

void fair_queue::notify_request_cancelled(fair_queue_entry& ent) noexcept {
    _resources_queued -= ent._ticket;
    ent._ticket = fair_queue_ticket();
}

fair_queue::clock_type::time_point fair_queue::next_pending_aio() const noexcept {
    if (_pending) {
        /*
         * We expect the disk to release the ticket within some time,
         * but it's ... OK if it doesn't -- the pending wait still
         * needs the head rover value to be ahead of the needed value.
         *
         * It may happen that the capacity gets released before we think
         * it will; in this case we will wait for the full value again,
         * which is sub-optimal. The expectation is that we think the disk
         * works faster than it really does.
         */
        auto over = _group.capacity_deficiency(_pending->head);
        auto ticks = _group.capacity_duration(over);
        return std::chrono::steady_clock::now() + std::chrono::duration_cast<std::chrono::microseconds>(ticks);
    }

    return std::chrono::steady_clock::time_point::max();
}

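// Dispatch queued entries in accumulated-cost order until the group bucket runs
// dry or this shard has dispatched its slice of the group capacity
// (maximum_capacity() / smp::count). Every dispatched request charges its class
// roughly req_cap / shares cost units, so classes with more shares accumulate
// cost more slowly and therefore get picked more often.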
void fair_queue::dispatch_requests(std::function<void(fair_queue_entry&)> cb) {
    capacity_t dispatched = 0;
    boost::container::small_vector<priority_class_ptr, 2> preempt;

    while (!_handles.empty() && (dispatched < _group.maximum_capacity() / smp::count)) {
        priority_class_data& h = *_handles.top();
        if (h._queue.empty()) {
            pop_priority_class(h);
            continue;
        }

        auto& req = h._queue.front();
        auto gr = grab_capacity(req);
        if (gr == grab_result::pending) {
            break;
        }

        if (gr == grab_result::cant_preempt) {
            pop_priority_class(h);
            preempt.emplace_back(&h);
            continue;
        }

        _last_accumulated = std::max(h._accumulated, _last_accumulated);
        pop_priority_class(h);
        h._queue.pop_front();

        _resources_executing += req._ticket;
        _resources_queued -= req._ticket;
        _requests_executing++;
        _requests_queued--;

        // Usually the cost of a request is tens to hundreds of thousands. However, for
        // an unrestricted queue it can be as low as 2k. With large enough shares this
        // can translate into a zero cost which, in turn, would make the
        // class show no progress and monopolize the queue.
        auto req_cap = _group.ticket_capacity(req._ticket);
        auto req_cost = std::max(req_cap / h._shares, (capacity_t)1);
        // signed overflow check to make push_priority_class_from_idle math work
        if (h._accumulated >= std::numeric_limits<signed_capacity_t>::max() - req_cost) {
            for (auto& pc : _priority_classes) {
                if (pc) {
                    if (pc->_queued) {
                        pc->_accumulated -= h._accumulated;
                    } else { // this includes h
                        pc->_accumulated = 0;
                    }
                }
            }
            _last_accumulated = 0;
        }
        h._accumulated += req_cost;
        h._pure_accumulated += req_cap;

        dispatched += _group.ticket_capacity(req._ticket);
        cb(req);

        if (h._plugged && !h._queue.empty()) {
            push_priority_class(h);
        }
    }

    for (auto&& h : preempt) {
        push_priority_class(*h);
    }
}

std::vector<seastar::metrics::impl::metric_definition_impl> fair_queue::metrics(class_id c) {
    namespace sm = seastar::metrics;
    priority_class_data& pc = *_priority_classes[c];
    return std::vector<sm::impl::metric_definition_impl>({
            sm::make_counter("consumption",
                    [&pc] { return fair_group::capacity_tokens(pc._pure_accumulated); },
                    sm::description("Accumulated disk capacity units consumed by this class; an increment per-second rate indicates full utilization")),
            sm::make_counter("adjusted_consumption",
                    [&pc] { return fair_group::capacity_tokens(pc._accumulated); },
                    sm::description("Consumed disk capacity units adjusted for class shares and idling preemption")),
    });
}

}