Skip to content

Commit

Permalink
[forge] prometheus integration
Browse files Browse the repository at this point in the history
  • Loading branch information
rustielin committed Jul 27, 2022
1 parent 29f8575 commit 81c1403
Show file tree
Hide file tree
Showing 8 changed files with 188 additions and 6 deletions.
14 changes: 14 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 2 additions & 5 deletions terraform/helm/k8s-metrics/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,8 @@ autoscaler:
repo: k8s.gcr.io/autoscaling/cluster-autoscaler
tag: v1.21.0
resources:
limits:
cpu: 100m
memory: 600Mi
requests:
cpu: 100m
memory: 600Mi
cpu: 1
memory: 1Gi
serviceAccount:
annotations:
1 change: 1 addition & 0 deletions testsuite/forge/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ hyper = { version = "0.14.18", features = ["full"] }
hyper-tls = "0.5.0"
k8s-openapi = { version = "0.11.0", default-features = false, features = ["v1_15"] }
kube = "0.51.0"
prometheus-http-query = "0.5.2"
rand = "0.7.3"
rayon = "1.5.2"
regex = "1.5.5"
Expand Down
1 change: 1 addition & 0 deletions testsuite/forge/src/backend/k8s/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use std::{convert::TryInto, num::NonZeroUsize};
pub mod chaos;
mod cluster_helper;
pub mod node;
pub mod prometheus;
mod swarm;

pub use cluster_helper::*;
Expand Down
124 changes: 124 additions & 0 deletions testsuite/forge/src/backend/k8s/prometheus.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
// Copyright (c) Aptos
// SPDX-License-Identifier: Apache-2.0

use crate::Result;

use anyhow::bail;
use aptos_logger::info;
use prometheus_http_query::{response::PromqlResult, Client as PrometheusClient};
use std::{collections::BTreeMap, convert::TryFrom};

pub fn get_prometheus_client() -> Result<PrometheusClient> {
let prom_url =
std::env::var("PROMETHEUS_URL").unwrap_or_else(|_| "http://127.0.0.1:9090".to_string());
info!("Attempting to create prometheus client with: {} ", prom_url);
match PrometheusClient::try_from(prom_url) {
Ok(c) => Ok(c),
Err(e) => bail!("Failed to create client {}", e),
}
}

fn construct_query_with_extra_labels(query: &str, labels_map: BTreeMap<String, String>) -> String {
// edit the query string to insert swarm metadata
let mut new_query = query.to_string();
let mut label_start_idx = query.find('{').unwrap_or(query.len());
if label_start_idx == query.len() {
// add a new curly and insert after it
new_query.insert_str(query.len(), "{}");
label_start_idx += 1;
} else {
// add a comma prefix to the existing labels and insert before it
label_start_idx += 1;
new_query.insert(label_start_idx, ',');
}

let mut labels_strs = vec![];
for (k, v) in labels_map {
labels_strs.push(format!(r#"{}="{}""#, k, v));
}

let labels = labels_strs.join(",");

// assume no collisions in Forge namespace
new_query.insert_str(label_start_idx, &labels);
new_query
}

pub async fn query_with_metadata(
prom_client: &PrometheusClient,
query: &str,
time: Option<i64>,
timeout: Option<i64>,
labels_map: BTreeMap<String, String>,
) -> Result<PromqlResult> {
let new_query = construct_query_with_extra_labels(query, labels_map);
match prom_client.query(&new_query, time, timeout).await {
Ok(r) => Ok(r),
Err(e) => bail!(e),
}
}

#[cfg(test)]
mod tests {
use prometheus_http_query::Error as PrometheusError;
use std::time::{SystemTime, UNIX_EPOCH};

use super::*;

#[tokio::test]
async fn test_query_prometheus() {
let client = get_prometheus_client().unwrap();

// try a simple instant query
// if it fails to connect to a prometheus instance, skip the test
let query = r#"rate(container_cpu_usage_seconds_total{pod=~".*validator.*", container="validator"}[1m])"#;
let response = client.query(query, None, None).await;
match response {
Ok(pres) => println!("{:?}", pres),
Err(PrometheusError::Client(e)) => {
println!("Skipping test. Failed to create prometheus client: {}", e);
return;
}
Err(e) => panic!("Expected PromqlResult: {}", e),
}

// try a range query
let start = SystemTime::now();
let since_the_epoch = start
.duration_since(UNIX_EPOCH)
.expect("Time went backwards")
.as_secs();
let start_timestamp: i64 = (since_the_epoch - 60) as i64;
let end_timestamp: i64 = since_the_epoch as i64;
let step = 15.0;

let response = client
.query_range(query, start_timestamp, end_timestamp, step, None)
.await;
match response {
Ok(pres) => println!("{:?}", pres),
_ => panic!("Expected PromqlResult"),
}
}

#[test]
fn test_create_query() {
// test when no existing labels
let original_query = "aptos_connections";
let mut labels_map = BTreeMap::new();
labels_map.insert("a".to_string(), "a".to_string());
labels_map.insert("some_label".to_string(), "blabla".to_string());
let expected_query = r#"aptos_connections{a="a",some_label="blabla"}"#;
let new_query = construct_query_with_extra_labels(original_query, labels_map);
assert_eq!(expected_query, new_query);

// test when existing labels
let original_query = r#"aptos_connections{abc="123",def="456"}"#;
let mut labels_map = BTreeMap::new();
labels_map.insert("a".to_string(), "a".to_string());
labels_map.insert("some_label".to_string(), "blabla".to_string());
let expected_query = r#"aptos_connections{a="a",some_label="blabla",abc="123",def="456"}"#;
let new_query = construct_query_with_extra_labels(original_query, labels_map);
assert_eq!(expected_query, new_query);
}
}
28 changes: 27 additions & 1 deletion testsuite/forge/src/backend/k8s/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
use crate::{
chaos, create_k8s_client,
node::{K8sNode, REST_API_HAPROXY_SERVICE_PORT, REST_API_SERVICE_PORT},
prometheus::{self, query_with_metadata},
query_sequence_numbers, set_validator_image_tag, uninstall_testnet_resources, ChainInfo,
FullNode, Node, Result, Swarm, SwarmChaos, Validator, Version,
};
Expand All @@ -21,8 +22,9 @@ use kube::{
api::{Api, ListParams},
client::Client as K8sClient,
};
use prometheus_http_query::{response::PromqlResult, Client as PrometheusClient};
use std::{
collections::{HashMap, HashSet},
collections::{BTreeMap, HashMap, HashSet},
convert::TryFrom,
env,
net::TcpListener,
Expand All @@ -47,6 +49,7 @@ pub struct K8sSwarm {
kube_namespace: String,
keep: bool,
chaoses: HashSet<SwarmChaos>,
prom_client: Option<PrometheusClient>,
}

impl K8sSwarm {
Expand Down Expand Up @@ -82,6 +85,14 @@ impl K8sSwarm {
versions.insert(base_version, base_image_tag.to_string());
versions.insert(cur_version, image_tag.to_string());

let prom_client = match prometheus::get_prometheus_client() {
Ok(p) => Some(p),
Err(e) => {
info!("Could not build prometheus client: {}", e);
None
}
};

Ok(K8sSwarm {
validators,
fullnodes,
Expand All @@ -92,6 +103,7 @@ impl K8sSwarm {
kube_namespace: kube_namespace.to_string(),
keep,
chaoses: HashSet::new(),
prom_client,
})
}

Expand Down Expand Up @@ -226,6 +238,20 @@ impl Swarm for K8sSwarm {
}
Ok(())
}

async fn query_metrics(
&self,
query: &str,
time: Option<i64>,
timeout: Option<i64>,
) -> Result<PromqlResult> {
if let Some(c) = &self.prom_client {
let mut labels_map = BTreeMap::new();
labels_map.insert("namespace".to_string(), self.kube_namespace.clone());
return query_with_metadata(c, query, time, timeout, labels_map).await;
}
bail!("No prom client");
}
}

/// Amount of time to wait for genesis to complete
Expand Down
10 changes: 10 additions & 0 deletions testsuite/forge/src/backend/local/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use aptos_sdk::{
PeerId,
},
};
use prometheus_http_query::response::PromqlResult;
use std::{
collections::HashMap,
fs, mem,
Expand Down Expand Up @@ -532,4 +533,13 @@ impl Swarm for LocalSwarm {
fn remove_chaos(&mut self, _chaos: SwarmChaos) -> Result<()> {
todo!()
}

async fn query_metrics(
&self,
_query: &str,
_time: Option<i64>,
_timeout: Option<i64>,
) -> Result<PromqlResult> {
todo!()
}
}
9 changes: 9 additions & 0 deletions testsuite/forge/src/interface/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use aptos_config::config::NodeConfig;
use aptos_rest_client::Client as RestClient;
use aptos_sdk::types::PeerId;
use futures::future::try_join_all;
use prometheus_http_query::response::PromqlResult;
use std::time::{Duration, Instant};
use tokio::runtime::Runtime;

Expand Down Expand Up @@ -67,6 +68,14 @@ pub trait Swarm: Sync {
/// Injects all types of chaos
fn inject_chaos(&mut self, chaos: SwarmChaos) -> Result<()>;
fn remove_chaos(&mut self, chaos: SwarmChaos) -> Result<()>;

// Get prometheus metrics from the swarm
async fn query_metrics(
&self,
query: &str,
time: Option<i64>,
timeout: Option<i64>,
) -> Result<PromqlResult>;
}

impl<T: ?Sized> SwarmExt for T where T: Swarm {}
Expand Down

0 comments on commit 81c1403

Please sign in to comment.