Postpone flushing aggregated counters to maintenance.
aterentic-ethernal committed Jun 7, 2024
1 parent 4be858c commit fe408af
Showing 11 changed files with 303 additions and 230 deletions.
4 changes: 2 additions & 2 deletions .gitignore
@@ -3,6 +3,6 @@
.env
.DS_Store
config*.yaml
avail_light_store
avail_path*
avail_*
debug.plist
/identity.toml
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,10 @@
# Changelog

## v1.9.0
## 1.9.1

- Postpone flushing aggregated counters to maintenance step

## [v1.9.0](https://github.com/availproject/avail-light/releases/tag/v1.9.0) - 2024-04-06

- Add metric aggregation on client side in order to decrease the telemetry server load
- Add `avail.light.starts` metric counter which allows measuring number of restarts
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "avail-light"
version = "1.9.0"
version = "1.9.1"
authors = ["Avail Team"]
default-run = "avail-light"
edition = "2021"
6 changes: 4 additions & 2 deletions src/bin/avail-light.rs
@@ -254,7 +254,6 @@ async fn run(shutdown: Controller<String>) -> Result<()> {
result
},
)));
ot_metrics.count(MetricCounter::Starts).await;

info!("Waiting for first finalized header...");
let block_header = match shutdown
@@ -400,6 +399,7 @@ async fn run(shutdown: Controller<String>) -> Result<()> {
replication_factor: cfg.replication_factor,
query_timeout: cfg.query_timeout,
pruning_interval: cfg.store_pruning_interval,
telemetry_flush_interval: cfg.ot_flush_block_interval,
};

tokio::task::spawn(shutdown.with_cancel(avail_light::maintenance::run(
@@ -434,13 +434,15 @@ async fn run(shutdown: Controller<String>) -> Result<()> {
db.clone(),
light_network_client,
(&cfg).into(),
ot_metrics,
ot_metrics.clone(),
state.clone(),
channels,
shutdown.clone(),
)));
}

ot_metrics.count(MetricCounter::Starts).await;

Ok(())
}

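In the main binary, the `Starts` counter is now emitted at the very end of `run`, after the light client task has been spawned with a cloned metrics handle. Below is a minimal sketch of that sharing pattern; it assumes `ot_metrics` is an `Arc`-wrapped handle implementing the `Metrics` trait, the import paths are assumed, and the task body is a placeholder:

```rust
use std::sync::Arc;

use avail_light::telemetry::{MetricCounter, Metrics}; // paths assumed for illustration

async fn spawn_and_count(ot_metrics: Arc<dyn Metrics + Send + Sync>) {
    // A clone of the handle is moved into the spawned light client task...
    let task_metrics = ot_metrics.clone();
    tokio::spawn(async move {
        // ...which keeps counting and recording per-block metrics (placeholder body).
        task_metrics.count(MetricCounter::SessionBlockCounter).await;
    });

    // ...while the original handle emits `Starts` once setup has finished,
    // matching the reordering in this commit.
    ot_metrics.count(MetricCounter::Starts).await;
}
```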
13 changes: 5 additions & 8 deletions src/fat_client.rs
@@ -84,7 +84,7 @@ pub async fn process_block(
metrics.count(MetricCounter::SessionBlockCounter).await;
metrics
.record(MetricValue::TotalBlockNumber(header.number))
.await?;
.await;

let block_number = header.number;
let header_hash: H256 = Encode::using_encoded(header, blake2_256).into();
@@ -168,7 +168,7 @@ pub async fn process_block(
.record(MetricValue::RPCCallDuration(
partition_rpc_retrieve_time_elapsed.as_secs_f64(),
))
.await?;
.await;

if rpc_fetched.len() >= dimensions.cols().get().into() {
let data_cells = rpc_fetched
@@ -224,12 +224,9 @@ pub async fn run(
};

if let Some(seconds) = cfg.block_processing_delay.sleep_duration(received_at) {
if let Err(error) = metrics
metrics
.record(MetricValue::BlockProcessingDelay(seconds.as_secs_f64()))
.await
{
error!("Cannot record block processing delay: {}", error);
}
.await;
info!("Sleeping for {seconds:?} seconds");
tokio::time::sleep(seconds).await;
}
@@ -382,7 +379,7 @@ mod tests {

let mut mock_metrics = telemetry::MockMetrics::new();
mock_metrics.expect_count().returning(|_| ());
mock_metrics.expect_record().returning(|_| Ok(()));
mock_metrics.expect_record().returning(|_| ());

process_block(
&mock_client,
23 changes: 10 additions & 13 deletions src/light_client.rs
@@ -52,7 +52,7 @@ pub async fn process_block(
metrics.count(MetricCounter::SessionBlockCounter).await;
metrics
.record(MetricValue::TotalBlockNumber(header.number))
.await?;
.await;

let block_number = header.number;
let header_hash: H256 = Encode::using_encoded(&header, blake2_256).into();
@@ -109,30 +109,30 @@ pub async fn process_block(

metrics
.record(MetricValue::DHTFetched(fetch_stats.dht_fetched))
.await?;
.await;

metrics
.record(MetricValue::DHTFetchedPercentage(
fetch_stats.dht_fetched_percentage,
))
.await?;
.await;

metrics
.record(MetricValue::DHTFetchDuration(
fetch_stats.dht_fetch_duration,
))
.await?;
.await;

if let Some(rpc_fetched) = fetch_stats.rpc_fetched {
metrics
.record(MetricValue::NodeRPCFetched(rpc_fetched))
.await?;
.await;
}

if let Some(rpc_fetch_duration) = fetch_stats.rpc_fetch_duration {
metrics
.record(MetricValue::NodeRPCFetchDuration(rpc_fetch_duration))
.await?;
.await;
}
(positions.len(), fetched.len(), unfetched.len())
},
@@ -158,7 +158,7 @@ pub async fn process_block(
);
metrics
.record(MetricValue::BlockConfidence(confidence))
.await?;
.await;

// push latest mined block's header into column family specified
// for keeping block headers, to be used
@@ -210,12 +210,9 @@ pub async fn run(
};

if let Some(seconds) = cfg.block_processing_delay.sleep_duration(received_at) {
if let Err(error) = metrics
metrics
.record(MetricValue::BlockProcessingDelay(seconds.as_secs_f64()))
.await
{
error!("Cannot record block processing delay: {}", error);
}
.await;
info!("Sleeping for {seconds:?} seconds");
tokio::time::sleep(seconds).await;
}
@@ -354,7 +351,7 @@ mod tests {

let mut mock_metrics = telemetry::MockMetrics::new();
mock_metrics.expect_count().returning(|_| ());
mock_metrics.expect_record().returning(|_| Ok(()));
mock_metrics.expect_record().returning(|_| ());
process_block(
db,
&mock_network_client,
25 changes: 16 additions & 9 deletions src/maintenance.rs
@@ -1,9 +1,7 @@
use color_eyre::{eyre::WrapErr, Result};
use std::sync::Arc;
use tokio::sync::broadcast;
#[cfg(not(feature = "kademlia-rocksdb"))]
use tracing::error;
use tracing::{debug, info};
use tracing::{debug, error, info};

use crate::{
network::p2p::Client as P2pClient,
@@ -18,6 +16,7 @@ pub struct StaticConfigParams {
pub replication_factor: u16,
pub query_timeout: u32,
pub pruning_interval: u32,
pub telemetry_flush_interval: u32,
}

pub async fn process_block(
@@ -35,6 +34,14 @@ pub async fn process_block(
}
}

if block_number % static_config_params.telemetry_flush_interval == 0 {
info!(block_number, "Flushing metrics...");
match metrics.flush().await {
Ok(()) => info!(block_number, "Flushing metrics finished"),
Err(error) => error!(block_number, "Flushing metrics failed: {error:#}"),
}
}

p2p_client
.shrink_kademlia_map()
.await
@@ -52,24 +59,24 @@ pub async fn process_block(
debug!("Connected peers: {:?}", connected_peers);

let peers_num_metric = MetricValue::ConnectedPeersNum(peers_num);
metrics.record(peers_num_metric).await?;
metrics.record(peers_num_metric).await;

metrics
.record(MetricValue::BlockConfidenceTreshold(
.record(MetricValue::BlockConfidenceThreshold(
static_config_params.block_confidence_treshold,
))
.await?;
.await;
metrics
.record(MetricValue::ReplicationFactor(
static_config_params.replication_factor,
))
.await?;
.await;
metrics
.record(MetricValue::QueryTimeout(
static_config_params.query_timeout,
))
.await?;
metrics.record(MetricValue::HealthCheck()).await?;
.await;
metrics.record(MetricValue::HealthCheck()).await;

info!(block_number, map_size, "Maintenance completed");
Ok(())
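The new maintenance-side gate above flushes the aggregated metrics once every `telemetry_flush_interval` blocks. As a standalone illustration, here is roughly the same check extracted into a helper; the helper, its name, and the import path are hypothetical (the commit inlines this logic in `maintenance::process_block`), and the sketch adds a zero-interval guard since the check as written assumes `ot_flush_block_interval` is configured non-zero (`block_number % 0` panics in Rust):

```rust
use avail_light::telemetry::Metrics; // path assumed; the trait is defined in src/telemetry/mod.rs
use tracing::{error, info};

// Hypothetical helper, not part of the commit.
pub async fn maybe_flush_metrics(
    metrics: &impl Metrics,
    block_number: u32,
    flush_interval: u32,
) {
    // Guard added for illustration: `block_number % 0` would panic at runtime.
    if flush_interval != 0 && block_number % flush_interval == 0 {
        info!(block_number, "Flushing metrics...");
        match metrics.flush().await {
            Ok(()) => info!(block_number, "Flushing metrics finished"),
            Err(error) => error!(block_number, "Flushing metrics failed: {error:#}"),
        }
    }
}
```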
11 changes: 9 additions & 2 deletions src/telemetry/mod.rs
@@ -12,6 +12,7 @@ use crate::types::Origin;

pub mod otlp;

#[derive(Debug)]
pub enum MetricCounter {
Starts,
SessionBlockCounter,
@@ -39,6 +40,10 @@ impl Display for MetricCounter {
}

impl MetricCounter {
fn is_buffered(&self) -> bool {
!matches!(self, MetricCounter::Starts)
}

fn is_allowed(&self, origin: &Origin) -> bool {
match (origin, self) {
(Origin::External, MetricCounter::Starts) => true,
@@ -70,6 +75,7 @@ impl MetricCounter {
}
}

#[derive(Clone, Debug)]
pub enum MetricValue {
TotalBlockNumber(u32),
DHTFetched(f64),
@@ -78,7 +84,7 @@ pub enum MetricValue {
NodeRPCFetched(f64),
NodeRPCFetchDuration(f64),
BlockConfidence(f64),
BlockConfidenceTreshold(f64),
BlockConfidenceThreshold(f64),
RPCCallDuration(f64),
DHTPutDuration(f64),
DHTPutSuccess(f64),
@@ -116,5 +122,6 @@ impl MetricValue {
#[async_trait]
pub trait Metrics {
async fn count(&self, counter: MetricCounter);
async fn record(&self, value: MetricValue) -> Result<()>;
async fn record(&self, value: MetricValue);
async fn flush(&self) -> Result<()>;
}
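The implementation of this revised trait lives in `src/telemetry/otlp.rs`, which is not rendered on this page. The sketch below only illustrates the shape the new API implies, using simplified stand-in types: buffered `count` calls and `record` calls accumulate in memory and cannot fail, while aggregation and the fallible export are deferred to `flush`, which the maintenance step now drives on a block interval.

```rust
use std::collections::HashMap;
use std::sync::Mutex;

use async_trait::async_trait;
use color_eyre::Result;

// Simplified stand-ins for the real enums in src/telemetry/mod.rs.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum MetricCounter {
    Starts,
    SessionBlockCounter,
}

impl MetricCounter {
    // Mirrors the new `is_buffered`: everything except `Starts` is aggregated locally.
    fn is_buffered(&self) -> bool {
        !matches!(self, MetricCounter::Starts)
    }
}

#[derive(Clone, Debug)]
pub enum MetricValue {
    BlockConfidence(f64),
}

#[async_trait]
pub trait Metrics {
    async fn count(&self, counter: MetricCounter);
    async fn record(&self, value: MetricValue);
    async fn flush(&self) -> Result<()>;
}

// Hypothetical buffering implementation, for illustration only.
#[derive(Default)]
pub struct BufferedMetrics {
    counters: Mutex<HashMap<MetricCounter, u64>>,
    values: Mutex<Vec<MetricValue>>,
}

#[async_trait]
impl Metrics for BufferedMetrics {
    async fn count(&self, counter: MetricCounter) {
        if counter.is_buffered() {
            // Aggregated locally; reported only on the next `flush`.
            *self.counters.lock().unwrap().entry(counter).or_insert(0) += 1;
        } else {
            // A non-buffered counter such as `Starts` would be submitted immediately here.
        }
    }

    async fn record(&self, value: MetricValue) {
        // Infallible: the value is only appended to an in-memory buffer.
        self.values.lock().unwrap().push(value);
    }

    async fn flush(&self) -> Result<()> {
        // Buffered counters and values are drained here; aggregation and the
        // fallible export to the telemetry backend would happen at this point.
        let _counters = std::mem::take(&mut *self.counters.lock().unwrap());
        let _values = std::mem::take(&mut *self.values.lock().unwrap());
        Ok(())
    }
}
```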