Skip to content

Commit

Permalink
Revert "Authority overload monitor" (MystenLabs#16047)
Browse files Browse the repository at this point in the history
Reverts MystenLabs#15981

Config issue
  • Loading branch information
halfprice authored Feb 1, 2024
1 parent b4e5fe8 commit 6d29a68
Show file tree
Hide file tree
Showing 11 changed files with 3 additions and 445 deletions.
21 changes: 0 additions & 21 deletions crates/sui-config/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -687,22 +687,6 @@ pub struct TransactionKeyValueStoreWriteConfig {
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct OverloadThresholdConfig {
pub max_txn_age_in_queue: Duration,

// The interval of checking overload signal.
pub overload_monitor_interval: Duration,

// The execution queueing latency when entering load shedding mode.
pub execution_queue_latency_soft_limit: Duration,

// The execution queueing latency when entering aggressive load shedding mode.
pub execution_queue_latency_hard_limit: Duration,

// The maximum percentage of transactions to shed in load shedding mode.
pub max_load_shedding_percentage: u32,

// When in aggressive load shedding mode, the the minimum percentage of
// transactions to shed.
pub min_load_shedding_percentage_above_hard_limit: u32,
// TODO: Move other thresholds here as well, including `MAX_TM_QUEUE_LENGTH`
// and `MAX_PER_OBJECT_QUEUE_LENGTH`.
}
Expand All @@ -711,11 +695,6 @@ impl Default for OverloadThresholdConfig {
fn default() -> Self {
Self {
max_txn_age_in_queue: Duration::from_secs(1), // 1 second
overload_monitor_interval: Duration::from_secs(10),
execution_queue_latency_soft_limit: Duration::from_secs(1),
execution_queue_latency_hard_limit: Duration::from_secs(10),
max_load_shedding_percentage: 95,
min_load_shedding_percentage_above_hard_limit: 50,
}
}
}
Expand Down
40 changes: 2 additions & 38 deletions crates/sui-core/src/authority.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,7 @@ use crate::epoch::committee_store::CommitteeStore;
use crate::execution_driver::execution_process;
use crate::in_mem_execution_cache::{ExecutionCache, ExecutionCacheRead, ExecutionCacheWrite};
use crate::metrics::LatencyObserver;
use crate::metrics::RateTracker;
use crate::module_cache_metrics::ResolverMetrics;
use crate::overload_monitor::{overload_monitor, AuthorityOverloadInfo};
use crate::stake_aggregator::StakeAggregator;
use crate::state_accumulator::{StateAccumulator, WrappedObject};
use crate::subscription_handler::SubscriptionHandler;
Expand Down Expand Up @@ -239,9 +237,6 @@ pub struct AuthorityMetrics {
pub(crate) skipped_consensus_txns: IntCounter,
pub(crate) skipped_consensus_txns_cache_hit: IntCounter,

pub(crate) authority_overload_status: IntGauge,
pub(crate) authority_load_shedding_percentage: IntGauge,

/// Post processing metrics
post_processing_total_events_emitted: IntCounter,
post_processing_total_tx_indexed: IntCounter,
Expand Down Expand Up @@ -274,18 +269,6 @@ pub struct AuthorityMetrics {
// Tracks recent average txn queueing delay between when it is ready for execution
// until it starts executing.
pub execution_queueing_latency: LatencyObserver,

// Tracks the rate of transactions become ready for execution in transaction manager.
// The need for the Mutex is that the tracker is updated in transaction manager and read
// in the overload_monitor. There should be low mutex contention because
// transaction manager is single threaded and the read rate in overload_monitor is
// low. In the case where transaction manager becomes multi-threaded, we can
// create one rate tracker per thread.
pub txn_ready_rate_tracker: Arc<Mutex<RateTracker>>,

// Tracks the rate of transactions starts execution in execution driver.
// Similar reason for using a Mutex here as to `txn_ready_rate_tracker`.
pub execution_rate_tracker: Arc<Mutex<RateTracker>>,
}

// Override default Prom buckets for positive numbers in 0-50k range
Expand Down Expand Up @@ -470,16 +453,6 @@ impl AuthorityMetrics {
registry,
)
.unwrap(),
authority_overload_status: register_int_gauge_with_registry!(
"authority_overload_status",
"Whether authority is current experiencing overload and enters load shedding mode.",
registry)
.unwrap(),
authority_load_shedding_percentage: register_int_gauge_with_registry!(
"authority_load_shedding_percentage",
"The percentage of transactions is shed when the authority is in load shedding mode.",
registry)
.unwrap(),
transaction_manager_object_cache_misses: register_int_counter_with_registry!(
"transaction_manager_object_cache_misses",
"Number of object-availability cache misses in TransactionManager",
Expand Down Expand Up @@ -645,8 +618,6 @@ impl AuthorityMetrics {
registry
).unwrap(),
execution_queueing_latency: LatencyObserver::new(),
txn_ready_rate_tracker: Arc::new(Mutex::new(RateTracker::new(Duration::from_secs(10)))),
execution_rate_tracker: Arc::new(Mutex::new(RateTracker::new(Duration::from_secs(10)))),
}
}
}
Expand Down Expand Up @@ -713,9 +684,6 @@ pub struct AuthorityState {

/// Config for when we consider the node overloaded.
overload_threshold_config: OverloadThresholdConfig,

/// Current overload status in this authority. Updated periodically.
pub overload_info: AuthorityOverloadInfo,
}

/// The authority state encapsulates all state, drives execution, and ensures safety.
Expand Down Expand Up @@ -2484,21 +2452,17 @@ impl AuthorityState {
transaction_deny_config,
certificate_deny_config,
debug_dump_config,
overload_threshold_config: overload_threshold_config.clone(),
overload_info: AuthorityOverloadInfo::default(),
overload_threshold_config,
});

// Start a task to execute ready certificates.
let authority_state = Arc::downgrade(&state);
spawn_monitored_task!(execution_process(
authority_state,
rx_ready_certificates,
rx_execution_shutdown,
rx_execution_shutdown
));

let authority_state = Arc::downgrade(&state);
spawn_monitored_task!(overload_monitor(authority_state, overload_threshold_config));

// TODO: This doesn't belong to the constructor of AuthorityState.
state
.create_owner_index_if_empty(genesis_objects, &epoch_store)
Expand Down
5 changes: 0 additions & 5 deletions crates/sui-core/src/consensus_throughput_calculator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -469,7 +469,6 @@ mod tests {
}

#[test]
#[cfg_attr(msim, ignore)]
pub fn test_consensus_throughput_calculator() {
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
let max_observation_points: NonZeroU64 = NonZeroU64::new(3).unwrap();
Expand Down Expand Up @@ -514,7 +513,6 @@ mod tests {
}

#[test]
#[cfg_attr(msim, ignore)]
pub fn test_throughput_calculator_same_timestamp_observations() {
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
let max_observation_points: NonZeroU64 = NonZeroU64::new(2).unwrap();
Expand Down Expand Up @@ -545,7 +543,6 @@ mod tests {
}

#[test]
#[cfg_attr(msim, ignore)]
pub fn test_consensus_throughput_profiler() {
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
let throughput_profile_update_interval: TimestampSecs = 5;
Expand Down Expand Up @@ -613,7 +610,6 @@ mod tests {
}

#[test]
#[cfg_attr(msim, ignore)]
pub fn test_consensus_throughput_profiler_update_interval() {
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
let throughput_profile_update_interval: TimestampSecs = 5;
Expand Down Expand Up @@ -666,7 +662,6 @@ mod tests {
}

#[test]
#[cfg_attr(msim, ignore)]
pub fn test_consensus_throughput_profiler_cool_down() {
let metrics = Arc::new(AuthorityMetrics::new(&Registry::new()));
let throughput_profile_update_window: TimestampSecs = 3;
Expand Down
2 changes: 0 additions & 2 deletions crates/sui-core/src/execution_driver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,6 @@ pub async fn execution_process(
}
}

authority.metrics.execution_rate_tracker.lock().record();

// Certificate execution can take significant time, so run it in a separate task.
spawn_monitored_task!(async move {
let _scope = monitored_scope("ExecutionDriver::task");
Expand Down
1 change: 0 additions & 1 deletion crates/sui-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ pub mod in_mem_execution_cache;
pub mod metrics;
pub mod module_cache_metrics;
pub mod mysticeti_adapter;
mod overload_monitor;
pub(crate) mod post_consensus_tx_reorder;
pub mod quorum_driver;
pub mod safe_client;
Expand Down
Loading

0 comments on commit 6d29a68

Please sign in to comment.