Skip to content

Commit

Permalink
io: Adjust IO latency goal on fair-queue level
Browse files Browse the repository at this point in the history
The IO latency goal is the duration within which all submitted IOs must
complete. This duration is used to calculate the token-bucket limit --
the maximum amount of tokens in it -- so that not more than the relevant
number of requests get dispatched by the fair-queue.

Next, the token-bucket limit is used by the IO-queue layer to
estimate the maximum IO lengths for both reads and writes.

And finally, this value can be increased by the reactor configuration
code if it "thinks" that the io-properties values are too low to allow
even for a single request.

Throughout all the associated maths: io-properties -> latency goal ->
token bucket limit -> IO request lengths -- floating point precision
loss occurs and causes off-by-a-few errors when the io-properties values
are too low.  As a result the IO request length calculations cannot
find an acceptable value, throw, and the seastar initialization aborts.

A safer approach is to increase the last element in this chain -- the
token bucket limit value -- so that it can accommodate at least some
desired request length. After that the IO-queue level can detect that the
limit increased, estimate the corresponding latency goal, and warn the
user that an auto-increase took place. Rounding errors don't matter here.

fixes: scylladb#1135
tests: unit(dev), manual checks of faulty io-properties found so far

v2:
- set 64k as minimal limit claim

Signed-off-by: Pavel Emelyanov <[email protected]>
Message-Id: <[email protected]>
  • Loading branch information
xemul authored and avikivity committed Jul 13, 2022
1 parent 32bf061 commit 6d4a0cb
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 15 deletions.
17 changes: 16 additions & 1 deletion include/seastar/core/fair_queue.hh
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ public:
*/

static constexpr float fixed_point_factor = float(1 << 24);
using token_bucket_t = internal::shared_token_bucket<capacity_t, std::milli, internal::capped_release::yes>;
using rate_resolution = std::milli;
using token_bucket_t = internal::shared_token_bucket<capacity_t, rate_resolution, internal::capped_release::yes>;

private:

Expand Down Expand Up @@ -221,8 +222,17 @@ public:

struct config {
sstring label = "";
/*
* There are two "min" values that can be configured. The former one
* is the minimal weight:size pair that the upper layer is going to
* submit. However, it can submit _larger_ values, and the fair queue
* must accept those as large as the latter pair (but it can accept
* even larger values, of course)
*/
unsigned min_weight = 0;
unsigned min_size = 0;
unsigned limit_min_weight = 0;
unsigned limit_min_size = 0;
unsigned long weight_rate;
unsigned long size_rate;
float rate_factor = 1.0;
Expand All @@ -242,6 +252,11 @@ public:

capacity_t capacity_deficiency(capacity_t from) const noexcept;
capacity_t ticket_capacity(fair_queue_ticket ticket) const noexcept;

std::chrono::duration<double> rate_limit_duration() const noexcept {
std::chrono::duration<double, rate_resolution> dur((double)_token_bucket.limit() / _token_bucket.rate());
return std::chrono::duration_cast<std::chrono::duration<double>>(dur);
}
};

/// \brief Fair queuing class
Expand Down
1 change: 1 addition & 0 deletions include/seastar/core/io_queue.hh
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ public:
bool duplex = false;
float rate_factor = 1.0;
std::chrono::duration<double> rate_limit_duration = std::chrono::milliseconds(1);
size_t block_count_limit_min = 1;
};

io_queue(io_group_ptr group, internal::io_sink& sink);
Expand Down
2 changes: 1 addition & 1 deletion src/core/fair_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ fair_queue_ticket wrapping_difference(const fair_queue_ticket& a, const fair_que
fair_group::fair_group(config cfg)
: _cost_capacity(cfg.weight_rate / token_bucket_t::rate_cast(std::chrono::seconds(1)).count(), cfg.size_rate / token_bucket_t::rate_cast(std::chrono::seconds(1)).count())
, _token_bucket(cfg.rate_factor * fixed_point_factor,
cfg.rate_factor * fixed_point_factor * token_bucket_t::rate_cast(cfg.rate_limit_duration).count(),
std::max<capacity_t>(cfg.rate_factor * fixed_point_factor * token_bucket_t::rate_cast(cfg.rate_limit_duration).count(), ticket_capacity(fair_queue_ticket(cfg.limit_min_weight, cfg.limit_min_size))),
ticket_capacity(fair_queue_ticket(cfg.min_weight, cfg.min_size))
)
{
Expand Down
10 changes: 10 additions & 0 deletions src/core/io_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -427,21 +427,31 @@ fair_group::config io_group::make_fair_group_config(const io_queue::config& qcfg
cfg.label = fmt::format("io-queue-{}", qcfg.devid);
cfg.min_weight = std::min(io_queue::read_request_base_count, qcfg.disk_req_write_to_read_multiplier);
cfg.min_size = std::min(io_queue::read_request_base_count, qcfg.disk_blocks_write_to_read_multiplier);
cfg.limit_min_weight = std::max(io_queue::read_request_base_count, qcfg.disk_req_write_to_read_multiplier);
cfg.limit_min_size = std::max(io_queue::read_request_base_count, qcfg.disk_blocks_write_to_read_multiplier) * qcfg.block_count_limit_min;
cfg.weight_rate = qcfg.req_count_rate;
cfg.size_rate = qcfg.blocks_count_rate;
cfg.rate_factor = qcfg.rate_factor;
cfg.rate_limit_duration = qcfg.rate_limit_duration;
return cfg;
}

// Report the latency goal the fair-group actually ended up with. The fair-queue
// may have bumped the token-bucket limit above what the configured goal implies
// (see fair_group ctor); if the effective goal exceeds the configured one by
// more than 10%, log at warn level, otherwise at info.
static void maybe_warn_latency_goal_auto_adjust(const fair_group& fg, const io_queue::config& cfg) noexcept {
    const auto effective_goal = fg.rate_limit_duration();
    log_level lvl = log_level::info;
    if (effective_goal > 1.1 * cfg.rate_limit_duration) {
        lvl = log_level::warn;
    }
    seastar_logger.log(lvl, "IO queue uses {:.2f}ms latency goal for device {}", effective_goal.count() * 1000, cfg.devid);
}

io_group::io_group(io_queue::config io_cfg)
: _config(std::move(io_cfg))
, _allocated_on(this_shard_id())
{
auto fg_cfg = make_fair_group_config(_config);
_fgs.push_back(std::make_unique<fair_group>(fg_cfg));
maybe_warn_latency_goal_auto_adjust(*_fgs.back(), io_cfg);
if (_config.duplex) {
_fgs.push_back(std::make_unique<fair_group>(fg_cfg));
maybe_warn_latency_goal_auto_adjust(*_fgs.back(), io_cfg);
}

/*
Expand Down
19 changes: 6 additions & 13 deletions src/core/reactor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3793,25 +3793,13 @@ class disk_config_params {
cfg.devid = devid;

if (!_capacity) {
std::chrono::duration<double> tick = latency_goal();

if (p.read_bytes_rate != std::numeric_limits<uint64_t>::max()) {
cfg.blocks_count_rate = (io_queue::read_request_base_count * (unsigned long)per_io_group(p.read_bytes_rate, nr_groups)) >> io_queue::block_size_shift;
cfg.disk_blocks_write_to_read_multiplier = (io_queue::read_request_base_count * p.read_bytes_rate) / p.write_bytes_rate;
auto bw_lat = std::chrono::duration<double>(4.0 / (p.rate_factor * (std::min(p.read_bytes_rate, p.write_bytes_rate) >> io_queue::block_size_shift)));
if (bw_lat > tick) {
tick = bw_lat;
seastar_logger.warn("Bandwidth is too low for {}, using {:.3f}ms IO latency goal", p.mountpoint, tick.count() * 1000);
}
}
if (p.read_req_rate != std::numeric_limits<uint64_t>::max()) {
cfg.req_count_rate = io_queue::read_request_base_count * (unsigned long)per_io_group(p.read_req_rate, nr_groups);
cfg.disk_req_write_to_read_multiplier = (io_queue::read_request_base_count * p.read_req_rate) / p.write_req_rate;
auto iops_lat = std::chrono::duration<double>(2.0 / (p.rate_factor * std::min(p.read_req_rate, p.write_req_rate)));
if (iops_lat > tick) {
tick = iops_lat;
seastar_logger.warn("IOPS is too low for {}, using {:.3f}ms IO latency goal", p.mountpoint, tick.count() * 1000);
}
}
if (p.read_saturation_length != std::numeric_limits<uint64_t>::max()) {
cfg.disk_read_saturation_length = p.read_saturation_length;
Expand All @@ -3822,7 +3810,12 @@ class disk_config_params {
cfg.mountpoint = p.mountpoint;
cfg.duplex = p.duplex;
cfg.rate_factor = p.rate_factor;
cfg.rate_limit_duration = tick;
cfg.rate_limit_duration = latency_goal();
// Block count limit should not be less than the minimal IO size on the device
// On the other hand, even this is not good enough -- in the worst case the
// scheduler will self-tune to allow for the single 64k request, while it would
// be better to sacrifice some IO latency, but allow for larger concurrency
cfg.block_count_limit_min = (64 << 10) >> io_queue::block_size_shift;
} else {
// For backwards compatibility
cfg.capacity = *_capacity;
Expand Down

0 comments on commit 6d4a0cb

Please sign in to comment.