Skip to content

Commit 6d29a68

Browse files
authored
Revert "Authority overload monitor" (MystenLabs#16047)
Reverts MystenLabs#15981 due to a config issue.
1 parent b4e5fe8 commit 6d29a68

File tree

11 files changed

+3
-445
lines changed

11 files changed

+3
-445
lines changed

crates/sui-config/src/node.rs

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -687,22 +687,6 @@ pub struct TransactionKeyValueStoreWriteConfig {
687687
#[derive(Clone, Debug, Deserialize, Serialize)]
688688
pub struct OverloadThresholdConfig {
689689
pub max_txn_age_in_queue: Duration,
690-
691-
// The interval of checking overload signal.
692-
pub overload_monitor_interval: Duration,
693-
694-
// The execution queueing latency when entering load shedding mode.
695-
pub execution_queue_latency_soft_limit: Duration,
696-
697-
// The execution queueing latency when entering aggressive load shedding mode.
698-
pub execution_queue_latency_hard_limit: Duration,
699-
700-
// The maximum percentage of transactions to shed in load shedding mode.
701-
pub max_load_shedding_percentage: u32,
702-
703-
// When in aggressive load shedding mode, the minimum percentage of
704-
// transactions to shed.
705-
pub min_load_shedding_percentage_above_hard_limit: u32,
706690
// TODO: Move other thresholds here as well, including `MAX_TM_QUEUE_LENGTH`
707691
// and `MAX_PER_OBJECT_QUEUE_LENGTH`.
708692
}
@@ -711,11 +695,6 @@ impl Default for OverloadThresholdConfig {
711695
fn default() -> Self {
712696
Self {
713697
max_txn_age_in_queue: Duration::from_secs(1), // 1 second
714-
overload_monitor_interval: Duration::from_secs(10),
715-
execution_queue_latency_soft_limit: Duration::from_secs(1),
716-
execution_queue_latency_hard_limit: Duration::from_secs(10),
717-
max_load_shedding_percentage: 95,
718-
min_load_shedding_percentage_above_hard_limit: 50,
719698
}
720699
}
721700
}

crates/sui-core/src/authority.rs

Lines changed: 2 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,7 @@ use crate::epoch::committee_store::CommitteeStore;
140140
use crate::execution_driver::execution_process;
141141
use crate::in_mem_execution_cache::{ExecutionCache, ExecutionCacheRead, ExecutionCacheWrite};
142142
use crate::metrics::LatencyObserver;
143-
use crate::metrics::RateTracker;
144143
use crate::module_cache_metrics::ResolverMetrics;
145-
use crate::overload_monitor::{overload_monitor, AuthorityOverloadInfo};
146144
use crate::stake_aggregator::StakeAggregator;
147145
use crate::state_accumulator::{StateAccumulator, WrappedObject};
148146
use crate::subscription_handler::SubscriptionHandler;
@@ -239,9 +237,6 @@ pub struct AuthorityMetrics {
239237
pub(crate) skipped_consensus_txns: IntCounter,
240238
pub(crate) skipped_consensus_txns_cache_hit: IntCounter,
241239

242-
pub(crate) authority_overload_status: IntGauge,
243-
pub(crate) authority_load_shedding_percentage: IntGauge,
244-
245240
/// Post processing metrics
246241
post_processing_total_events_emitted: IntCounter,
247242
post_processing_total_tx_indexed: IntCounter,
@@ -274,18 +269,6 @@ pub struct AuthorityMetrics {
274269
// Tracks recent average txn queueing delay between when it is ready for execution
275270
// until it starts executing.
276271
pub execution_queueing_latency: LatencyObserver,
277-
278-
// Tracks the rate at which transactions become ready for execution in transaction manager.
279-
// The need for the Mutex is that the tracker is updated in transaction manager and read
280-
// in the overload_monitor. There should be low mutex contention because
281-
// transaction manager is single threaded and the read rate in overload_monitor is
282-
// low. In the case where transaction manager becomes multi-threaded, we can
283-
// create one rate tracker per thread.
284-
pub txn_ready_rate_tracker: Arc<Mutex<RateTracker>>,
285-
286-
// Tracks the rate at which transactions start execution in execution driver.
287-
// Similar reason for using a Mutex here as to `txn_ready_rate_tracker`.
288-
pub execution_rate_tracker: Arc<Mutex<RateTracker>>,
289272
}
290273

291274
// Override default Prom buckets for positive numbers in 0-50k range
@@ -470,16 +453,6 @@ impl AuthorityMetrics {
470453
registry,
471454
)
472455
.unwrap(),
473-
authority_overload_status: register_int_gauge_with_registry!(
474-
"authority_overload_status",
475-
"Whether authority is current experiencing overload and enters load shedding mode.",
476-
registry)
477-
.unwrap(),
478-
authority_load_shedding_percentage: register_int_gauge_with_registry!(
479-
"authority_load_shedding_percentage",
480-
"The percentage of transactions is shed when the authority is in load shedding mode.",
481-
registry)
482-
.unwrap(),
483456
transaction_manager_object_cache_misses: register_int_counter_with_registry!(
484457
"transaction_manager_object_cache_misses",
485458
"Number of object-availability cache misses in TransactionManager",
@@ -645,8 +618,6 @@ impl AuthorityMetrics {
645618
registry
646619
).unwrap(),
647620
execution_queueing_latency: LatencyObserver::new(),
648-
txn_ready_rate_tracker: Arc::new(Mutex::new(RateTracker::new(Duration::from_secs(10)))),
649-
execution_rate_tracker: Arc::new(Mutex::new(RateTracker::new(Duration::from_secs(10)))),
650621
}
651622
}
652623
}
@@ -713,9 +684,6 @@ pub struct AuthorityState {
713684

714685
/// Config for when we consider the node overloaded.
715686
overload_threshold_config: OverloadThresholdConfig,
716-
717-
/// Current overload status in this authority. Updated periodically.
718-
pub overload_info: AuthorityOverloadInfo,
719687
}
720688

721689
/// The authority state encapsulates all state, drives execution, and ensures safety.
@@ -2484,21 +2452,17 @@ impl AuthorityState {
24842452
transaction_deny_config,
24852453
certificate_deny_config,
24862454
debug_dump_config,
2487-
overload_threshold_config: overload_threshold_config.clone(),
2488-
overload_info: AuthorityOverloadInfo::default(),
2455+
overload_threshold_config,
24892456
});
24902457

24912458
// Start a task to execute ready certificates.
24922459
let authority_state = Arc::downgrade(&state);
24932460
spawn_monitored_task!(execution_process(
24942461
authority_state,
24952462
rx_ready_certificates,
2496-
rx_execution_shutdown,
2463+
rx_execution_shutdown
24972464
));
24982465

2499-
let authority_state = Arc::downgrade(&state);
2500-
spawn_monitored_task!(overload_monitor(authority_state, overload_threshold_config));
2501-
25022466
// TODO: This doesn't belong to the constructor of AuthorityState.
25032467
state
25042468
.create_owner_index_if_empty(genesis_objects, &epoch_store)

crates/sui-core/src/consensus_throughput_calculator.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,6 @@ mod tests {
469469
}
470470

471471
#[test]
472-
#[cfg_attr(msim, ignore)]
473472
pub fn test_consensus_throughput_calculator() {
474473
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
475474
let max_observation_points: NonZeroU64 = NonZeroU64::new(3).unwrap();
@@ -514,7 +513,6 @@ mod tests {
514513
}
515514

516515
#[test]
517-
#[cfg_attr(msim, ignore)]
518516
pub fn test_throughput_calculator_same_timestamp_observations() {
519517
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
520518
let max_observation_points: NonZeroU64 = NonZeroU64::new(2).unwrap();
@@ -545,7 +543,6 @@ mod tests {
545543
}
546544

547545
#[test]
548-
#[cfg_attr(msim, ignore)]
549546
pub fn test_consensus_throughput_profiler() {
550547
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
551548
let throughput_profile_update_interval: TimestampSecs = 5;
@@ -613,7 +610,6 @@ mod tests {
613610
}
614611

615612
#[test]
616-
#[cfg_attr(msim, ignore)]
617613
pub fn test_consensus_throughput_profiler_update_interval() {
618614
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
619615
let throughput_profile_update_interval: TimestampSecs = 5;
@@ -666,7 +662,6 @@ mod tests {
666662
}
667663

668664
#[test]
669-
#[cfg_attr(msim, ignore)]
670665
pub fn test_consensus_throughput_profiler_cool_down() {
671666
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
672667
let throughput_profile_update_window: TimestampSecs = 3;

crates/sui-core/src/execution_driver.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,8 +104,6 @@ pub async fn execution_process(
104104
}
105105
}
106106

107-
authority.metrics.execution_rate_tracker.lock().record();
108-
109107
// Certificate execution can take significant time, so run it in a separate task.
110108
spawn_monitored_task!(async move {
111109
let _scope = monitored_scope("ExecutionDriver::task");

crates/sui-core/src/lib.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ pub mod in_mem_execution_cache;
2222
pub mod metrics;
2323
pub mod module_cache_metrics;
2424
pub mod mysticeti_adapter;
25-
mod overload_monitor;
2625
pub(crate) mod post_consensus_tx_reorder;
2726
pub mod quorum_driver;
2827
pub mod safe_client;

0 commit comments

Comments
 (0)