Skip to content

Commit

Permalink
io: Adjust IO latency goal on fair-queue level
Browse files Browse the repository at this point in the history
The IO latency goal is the duration within which all submitted IOs must
complete. This duration is used to calculate the token-bucket limit --
the maximum amount of tokens in it -- so that not more than the relevant
number of requests get dispatched by the fair-queue.

Next, the token-bucket limit is used by the IO-queue layer to
estimate the maximum IO lengths for both reads and writes.

And finally, this value can be increased by the reactor configuration
code if it "thinks" that the io-properties values are too low to allow
even for a single request.

Throughout all the associated maths: io-properties -> latency goal ->
token bucket limit -> IO request lengths -- floating point precision
loss occurs and causes off-by-a-few errors when the io-properties values
are too low.  As a result the IO request length calculations cannot
find an acceptable value, throw, and the seastar initialization aborts.

A safer approach is to increase the last element in this chain -- the
token bucket limit value -- so that it can accommodate at least some
desired request length. After that the IO-queue level can detect that the
limit increased, estimate the corresponding latency goal, and warn the
user that an auto-increase took place. Rounding errors don't matter here.

fixes: scylladb#1135
tests: unit(dev), manual checks of faulty io-properties found so far

v2:
- set 64k as minimal limit claim

Signed-off-by: Pavel Emelyanov <[email protected]>
Message-Id: <[email protected]>
  • Loading branch information
xemul authored and avikivity committed Jul 13, 2022
1 parent 32bf061 commit 6d4a0cb
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 15 deletions.
17 changes: 16 additions & 1 deletion include/seastar/core/fair_queue.hh
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,8 @@ public:
*/

static constexpr float fixed_point_factor = float(1 << 24);
using token_bucket_t = internal::shared_token_bucket<capacity_t, std::milli, internal::capped_release::yes>;
using rate_resolution = std::milli;
using token_bucket_t = internal::shared_token_bucket<capacity_t, rate_resolution, internal::capped_release::yes>;

private:

Expand Down Expand Up @@ -221,8 +222,17 @@ public:

struct config {
sstring label = "";
/*
* There are two "min" values that can be configured. The former one
* is the minimal weight:size pair that the upper layer is going to
* submit. However, it can submit _larger_ values, and the fair queue
* must accept those as large as the latter pair (but it can accept
* even larger values, of course)
*/
unsigned min_weight = 0;
unsigned min_size = 0;
unsigned limit_min_weight = 0;
unsigned limit_min_size = 0;
unsigned long weight_rate;
unsigned long size_rate;
float rate_factor = 1.0;
Expand All @@ -242,6 +252,11 @@ public:

capacity_t capacity_deficiency(capacity_t from) const noexcept;
capacity_t ticket_capacity(fair_queue_ticket ticket) const noexcept;

std::chrono::duration<double> rate_limit_duration() const noexcept {
std::chrono::duration<double, rate_resolution> dur((double)_token_bucket.limit() / _token_bucket.rate());
return std::chrono::duration_cast<std::chrono::duration<double>>(dur);
}
};

/// \brief Fair queuing class
Expand Down
1 change: 1 addition & 0 deletions include/seastar/core/io_queue.hh
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ public:
bool duplex = false;
float rate_factor = 1.0;
std::chrono::duration<double> rate_limit_duration = std::chrono::milliseconds(1);
size_t block_count_limit_min = 1;
};

io_queue(io_group_ptr group, internal::io_sink& sink);
Expand Down
2 changes: 1 addition & 1 deletion src/core/fair_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ fair_queue_ticket wrapping_difference(const fair_queue_ticket& a, const fair_que
fair_group::fair_group(config cfg)
: _cost_capacity(cfg.weight_rate / token_bucket_t::rate_cast(std::chrono::seconds(1)).count(), cfg.size_rate / token_bucket_t::rate_cast(std::chrono::seconds(1)).count())
, _token_bucket(cfg.rate_factor * fixed_point_factor,
cfg.rate_factor * fixed_point_factor * token_bucket_t::rate_cast(cfg.rate_limit_duration).count(),
std::max<capacity_t>(cfg.rate_factor * fixed_point_factor * token_bucket_t::rate_cast(cfg.rate_limit_duration).count(), ticket_capacity(fair_queue_ticket(cfg.limit_min_weight, cfg.limit_min_size))),
ticket_capacity(fair_queue_ticket(cfg.min_weight, cfg.min_size))
)
{
Expand Down
10 changes: 10 additions & 0 deletions src/core/io_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -427,21 +427,31 @@ fair_group::config io_group::make_fair_group_config(const io_queue::config& qcfg
cfg.label = fmt::format("io-queue-{}", qcfg.devid);
cfg.min_weight = std::min(io_queue::read_request_base_count, qcfg.disk_req_write_to_read_multiplier);
cfg.min_size = std::min(io_queue::read_request_base_count, qcfg.disk_blocks_write_to_read_multiplier);
cfg.limit_min_weight = std::max(io_queue::read_request_base_count, qcfg.disk_req_write_to_read_multiplier);
cfg.limit_min_size = std::max(io_queue::read_request_base_count, qcfg.disk_blocks_write_to_read_multiplier) * qcfg.block_count_limit_min;
cfg.weight_rate = qcfg.req_count_rate;
cfg.size_rate = qcfg.blocks_count_rate;
cfg.rate_factor = qcfg.rate_factor;
cfg.rate_limit_duration = qcfg.rate_limit_duration;
return cfg;
}

// Report the latency goal the fair-group actually ended up with. The fair-queue
// may have bumped the token-bucket limit above what the configured goal implies
// (see fair_group ctor); if the effective goal exceeds the configured one by
// more than 10%, log at warn level, otherwise at info.
static void maybe_warn_latency_goal_auto_adjust(const fair_group& fg, const io_queue::config& cfg) noexcept {
    const auto effective_goal = fg.rate_limit_duration();
    log_level lvl = log_level::info;
    if (effective_goal > 1.1 * cfg.rate_limit_duration) {
        lvl = log_level::warn;
    }
    seastar_logger.log(lvl, "IO queue uses {:.2f}ms latency goal for device {}", effective_goal.count() * 1000, cfg.devid);
}

io_group::io_group(io_queue::config io_cfg)
: _config(std::move(io_cfg))
, _allocated_on(this_shard_id())
{
auto fg_cfg = make_fair_group_config(_config);
_fgs.push_back(std::make_unique<fair_group>(fg_cfg));
maybe_warn_latency_goal_auto_adjust(*_fgs.back(), io_cfg);
if (_config.duplex) {
_fgs.push_back(std::make_unique<fair_group>(fg_cfg));
maybe_warn_latency_goal_auto_adjust(*_fgs.back(), io_cfg);
}

/*
Expand Down
19 changes: 6 additions & 13 deletions src/core/reactor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3793,25 +3793,13 @@ class disk_config_params {
cfg.devid = devid;

if (!_capacity) {
std::chrono::duration<double> tick = latency_goal();

if (p.read_bytes_rate != std::numeric_limits<uint64_t>::max()) {
cfg.blocks_count_rate = (io_queue::read_request_base_count * (unsigned long)per_io_group(p.read_bytes_rate, nr_groups)) >> io_queue::block_size_shift;
cfg.disk_blocks_write_to_read_multiplier = (io_queue::read_request_base_count * p.read_bytes_rate) / p.write_bytes_rate;
auto bw_lat = std::chrono::duration<double>(4.0 / (p.rate_factor * (std::min(p.read_bytes_rate, p.write_bytes_rate) >> io_queue::block_size_shift)));
if (bw_lat > tick) {
tick = bw_lat;
seastar_logger.warn("Bandwidth is too low for {}, using {:.3f}ms IO latency goal", p.mountpoint, tick.count() * 1000);
}
}
if (p.read_req_rate != std::numeric_limits<uint64_t>::max()) {
cfg.req_count_rate = io_queue::read_request_base_count * (unsigned long)per_io_group(p.read_req_rate, nr_groups);
cfg.disk_req_write_to_read_multiplier = (io_queue::read_request_base_count * p.read_req_rate) / p.write_req_rate;
auto iops_lat = std::chrono::duration<double>(2.0 / (p.rate_factor * std::min(p.read_req_rate, p.write_req_rate)));
if (iops_lat > tick) {
tick = iops_lat;
seastar_logger.warn("IOPS is too low for {}, using {:.3f}ms IO latency goal", p.mountpoint, tick.count() * 1000);
}
}
if (p.read_saturation_length != std::numeric_limits<uint64_t>::max()) {
cfg.disk_read_saturation_length = p.read_saturation_length;
Expand All @@ -3822,7 +3810,12 @@ class disk_config_params {
cfg.mountpoint = p.mountpoint;
cfg.duplex = p.duplex;
cfg.rate_factor = p.rate_factor;
cfg.rate_limit_duration = tick;
cfg.rate_limit_duration = latency_goal();
// Block count limit should not be less than the minimal IO size on the device
// On the other hand, even this is not good enough -- in the worst case the
// scheduler will self-tune to allow for the single 64k request, while it would
// be better to sacrifice some IO latency, but allow for larger concurrency
cfg.block_count_limit_min = (64 << 10) >> io_queue::block_size_shift;
} else {
// For backwards compatibility
cfg.capacity = *_capacity;
Expand Down

0 comments on commit 6d4a0cb

Please sign in to comment.