Merge pull request ceph#43669 from rhcs-dashboard/grafana-unit-tests

monitoring/grafana: Grafana query tester Reviewed-by: Aashish Sharma <[email protected]> Reviewed-by: Alfonso Martínez <[email protected]> Reviewed-by: Ernesto Puerta <[email protected]> Reviewed-by: Pere Diaz Bou <[email protected]>
mitsu-ko · Nov 16, 2021 · 788b810 · 788b810
2 parents e43a888 + 44d3e4c
commit 788b810
Show file tree

Hide file tree

Showing 14 changed files with 571 additions and 3 deletions.
diff --git a/monitoring/grafana/dashboards/.pylintrc b/monitoring/grafana/dashboards/.pylintrc
@@ -0,0 +1 @@
+../../../src/pybind/mgr/dashboard/.pylintrc
diff --git a/monitoring/grafana/dashboards/CMakeLists.txt b/monitoring/grafana/dashboards/CMakeLists.txt
@@ -12,7 +12,9 @@ endif()
 
 if(WITH_GRAFANA)
   include(AddCephTest)
-  add_tox_test(grafana TOX_ENVS grafonnet-check)
+  add_tox_test(grafana-check TOX_ENVS grafonnet-check)
+  add_tox_test(grafana-query-test TOX_ENVS promql-query-test)
+  add_tox_test(grafana-lint TOX_ENVS lint)
   set(ver 0.1.0)
   set(name grafonnet-lib)
   include(ExternalProject)
@@ -30,7 +32,7 @@ if(WITH_GRAFANA)
     ${name})
   ExternalProject_Get_Property(${name} SOURCE_DIR)
   set_property(
-    TEST run-tox-grafana
+    TEST run-tox-grafana-check run-tox-grafana-query-test run-tox-grafana-lint
     APPEND
     PROPERTY ENVIRONMENT
     GRAFONNET_PATH=${SOURCE_DIR}/grafonnet)

diff --git a/monitoring/grafana/dashboards/requirements-lint.txt b/monitoring/grafana/dashboards/requirements-lint.txt
@@ -0,0 +1,18 @@
+attrs==21.2.0
+behave==1.2.6
+py==1.10.0
+pyparsing==2.4.7
+PyYAML==6.0
+types-PyYAML==6.0.0
+typing-extensions==3.10.0.2
+termcolor==1.1.0
+types-termcolor==1.1.2
+dataclasses==0.6
+types-dataclasses==0.6.1
+six==1.16.0
+toml==0.10.2
+pylint==2.6.0
+isort==5.10.0
+mypy==0.910
+mypy-extensions==0.4.3
+prettytable==2.4.0
diff --git a/monitoring/grafana/dashboards/tests/__init__.py b/monitoring/grafana/dashboards/tests/__init__.py
@@ -0,0 +1,187 @@
+import re
+import subprocess
+import sys
+import tempfile
+from dataclasses import asdict, dataclass, field
+from typing import Any, List
+
+import yaml
+
+
+@dataclass
+class InputSeries:
+    """One entry of `Test.input_series`: a series selector plus its samples."""
+    # Series in '<metric name>{<label name>=<label value>, ...}' notation.
+    series: str = ''
+    # Samples of the series, passed through verbatim as a string.
+    values: str = ''
+
+@dataclass
+class ExprSample:
+    """One entry of `PromqlExprTest.exp_samples`: an expected query result."""
+    # Expected sample in '<metric name>{<label name>=<label value>, ...}' notation.
+    labels: str = ''
+    # Expected value of the sample; -1 is only the placeholder default.
+    value: float = -1
+
+@dataclass
+class PromqlExprTest:
+    """One entry of `Test.promql_expr_test`: a query and the samples it must yield."""
+    # Query/expression under test.
+    expr: str = ''
+    # Time at which the expression is evaluated, as '<number><unit>'.
+    eval_time: str = '1m'
+    # Samples the expression is expected to produce.
+    exp_samples: List[ExprSample] = field(default_factory=list)
+
+@dataclass
+class Test:
+    """One entry of `TestFile.tests`: input series plus the expressions run on them."""
+    # Interval of the input series, as '<number><unit>'.
+    interval: str = '1m'
+    # Series fed to the queries.
+    input_series: List[InputSeries] = field(default_factory=list)
+    # Expressions evaluated against `input_series`.
+    promql_expr_test: List[PromqlExprTest] = field(default_factory=list)
+
+
+@dataclass
+class TestFile:
+    """
+    Root of the document that `PromqlTest.run_promtool` serialises with
+    `asdict` + `yaml.dump`; the field names therefore end up as YAML keys.
+    """
+    # Evaluation interval, as '<number><unit>'.
+    evaluation_interval: str = '1m'
+    # Tests contained in the file.
+    tests: List[Test] = field(default_factory=list)
+
+
+class PromqlTest:
+    """
+    Base class to provide prometheus query test capabilities. After setting up
+    the query test with its input and expected output it's expected to run promtool.
+
+    https://prometheus.io/docs/prometheus/latest/configuration/unit_testing_rules/#test-yml
+
+    The workflow of testing would be something like:
+
+        # add prometheus query to test
+        self.set_expression('bonding_slaves > 0')
+
+        # add some prometheus input series
+        self.add_series('bonding_slaves{master="bond0"}', '2')
+        self.add_series('bonding_slaves{master="bond1"}', '3')
+        self.add_series('node_network_receive_bytes{instance="127.0.0.1",
+            device="eth1"}', "10 100 230 22")
+
+        # expected output of the query
+        self.add_exp_samples('bonding_slaves{master="bond0"}', 2)
+        self.add_exp_samples('bonding_slaves{master="bond1"}', 3)
+
+        # at last, always call promtool with:
+        self.assertTrue(self.run_promtool())
+        # assertTrue means it expect promtool to succeed
+    """
+
+    def __init__(self):
+        self.test_output_file = tempfile.NamedTemporaryFile('w+')
+
+        self.test_file = TestFile()
+        self.test = Test()
+        self.promql_expr_test = PromqlExprTest()
+        self.test.promql_expr_test.append(self.promql_expr_test)
+        self.test_file.tests.append(self.test)
+
+        self.variables = {}
+
+    def __del__(self):
+        self.test_output_file.close()
+
+
+    def set_evaluation_interval(self, interval: int, unit: str = 'm') -> None:
+        """
+        Set the evaluation interval of the time series
+
+        Args:
+            interval (int): number of units.
+            unit (str): unit type: 'ms', 's', 'm', etc...
+        """
+        self.test_file.evaluation_interval = f'{interval}{unit}'
+
+    def set_interval(self, interval: int, unit: str = 'm') -> None:
+        """
+        Set the duration of the time series
+
+        Args:
+            interval (int): number of units.
+            unit (str): unit type: 'ms', 's', 'm', etc...
+        """
+        self.test.interval = f'{interval}{unit}'
+
+    def set_expression(self, expr: str) -> None:
+        """
+        Set the prometheus expression/query used to filter data.
+
+        Args:
+             expr(str): expression/query.
+        """
+        self.promql_expr_test.expr = expr
+
+    def add_series(self, series: str, values: str) -> None:
+        """
+        Add a series to the input.
+
+        Args:
+             series(str): Prometheus series.
+             Notation: '<metric name>{<label name>=<label value>, ...}'
+             values(str): Value of the series.
+        """
+        input_series = InputSeries(series=series, values=values)
+        self.test.input_series.append(input_series)
+
+    def set_eval_time(self, eval_time: int, unit: str = 'm') -> None:
+        """
+        Set the time when the expression will be evaluated
+
+        Args:
+             interval (int): number of units.
+             unit (str): unit type: 'ms', 's', 'm', etc...
+        """
+        self.promql_expr_test.eval_time = f'{eval_time}{unit}'
+
+    def add_exp_samples(self, sample: str, values: Any) -> None:
+        """
+        Add an expected sample/output of the query given the series/input
+
+        Args:
+             sample(str): Expected sample.
+             Notation: '<metric name>{<label name>=<label value>, ...}'
+             values(Any): Value of the sample.
+        """
+        expr_sample = ExprSample(labels=sample, value=values)
+        self.promql_expr_test.exp_samples.append(expr_sample)
+
+    def set_variable(self, variable: str, value: str):
+        """
+        If a query makes use of grafonnet variables, for example
+        '$osd_hosts', you should change this to a real value. Example:
+
+
+        > self.set_expression('bonding_slaves{master="$osd_hosts"} > 0')
+        > self.set_variable('osd_hosts', '127.0.0.1')
+        > print(self.query)
+        > bonding_slaves{master="127.0.0.1"} > 0
+
+        Args:
+             variable(str): Variable name
+             value(str): Value to replace variable with
+
+        """
+        self.variables[variable] = value
+
+    def run_promtool(self):
+        """
+        Run promtool to test the query after setting up the input, output
+        and extra parameters.
+
+        Returns:
+             bool: True if successful, False otherwise.
+        """
+
+        for variable, value in self.variables.items():
+            expr = self.promql_expr_test.expr
+            new_expr = re.sub(r'\${0}'.format(variable), str(value), expr)
+            self.set_expression(new_expr)
+
+        test_as_dict = asdict(self.test_file)
+        yaml.dump(test_as_dict, self.test_output_file)
+
+        args = f'promtool test rules {self.test_output_file.name}'.split()
+        try:
+            subprocess.run(args, check=True)
+            return True
+        except subprocess.CalledProcessError as process_error:
+            print(yaml.dump(test_as_dict))
+            print(process_error.stderr)
+            return False
diff --git a/monitoring/grafana/dashboards/tests/features/__init__.py b/monitoring/grafana/dashboards/tests/features/__init__.py
diff --git a/monitoring/grafana/dashboards/tests/features/ceph-cluster.feature b/monitoring/grafana/dashboards/tests/features/ceph-cluster.feature
@@ -0,0 +1,10 @@
+Feature: Ceph Cluster Dashboard
+
+# Each `values` cell lists the samples of one input series. No interval or
+# evaluation time is set here, so the defaults of the test harness apply.
+# The expected 300 is presumably 100 + 200, i.e. the panel query summing both
+# series at evaluation time - confirm against the `PG States` panel definition.
+Scenario: "Test total PG States"
+  Given the following series:
+    | metrics | values |
+    | ceph_pg_total{foo="var"} | 10 100 |
+    | ceph_pg_total{foo="bar"} | 20 200 |
+  Then Grafana panel `PG States` with legend `Total` shows:
+    | metrics | values |
+    | {} | 300 |
diff --git a/monitoring/grafana/dashboards/tests/features/environment.py b/monitoring/grafana/dashboards/tests/features/environment.py
@@ -0,0 +1,135 @@
+# type: ignore[no-redef]
+# pylint: disable=E0611,W0613,E0102
+import copy
+
+from behave import given, then, when
+from prettytable import PrettyTable
+
+from tests import PromqlTest
+from tests.util import get_dashboards_data, resolve_time_and_unit
+
+
+class GlobalContext:
+    def __init__(self):
+        self.tested_queries_count = 0
+        self.promql_expr_test = None
+        self.data = get_dashboards_data()
+        self.query_map = self.data['queries']
+
+    def reset_promql_test(self):
+        self.promql_expr_test = PromqlTest()
+        self.promql_expr_test.variables = copy.copy(self.data['variables'])
+
+    def print_query_stats(self):
+        total = len(self.query_map)
+        table = PrettyTable()
+        table.field_names = ['Name', 'Queries', 'Tested', 'Cover']
+
+        def percent(tested, total):
+            return str(round((tested / total) * 100, 2)) + '%'
+
+        def file_name(path):
+            return path.split('/')[-1]
+
+        total = 0
+        tested = 0
+        for path, stat in self.data['stats'].items():
+            assert stat['total']
+            table.add_row([file_name(path), stat['total'], stat['tested'],
+                                     percent(stat['tested'], stat['total'])])
+            total += stat['total']
+            tested += stat['tested']
+
+        assert total
+        table.add_row(['Total', total, tested, percent(tested, total)])
+        print(table)
+
+
+# Single instance shared by every hook and step below. Constructing it calls
+# get_dashboards_data() (see GlobalContext.__init__), so that happens when
+# this module is imported.
+global_context = GlobalContext()
+
+# Behave function overloading
+# ===========================
+
+
+def before_scenario(context, scenario):
+    """Give the upcoming scenario a fresh PromqlTest; both arguments are unused."""
+    global_context.reset_promql_test()
+
+
+def after_scenario(context, scenario):
+    """
+    Run promtool on what the scenario's steps configured and fail with an
+    AssertionError when run_promtool() reports failure. Both arguments are
+    unused.
+    """
+    assert global_context.promql_expr_test.run_promtool()
+
+
+def after_all(context):
+    """Print the query coverage table; `context` is unused."""
+    global_context.print_query_stats()
+
+
+@given("the following series")
+def step_impl(context):
+    for row in context.table:
+        metric = row['metrics']
+        value = row['values']
+        global_context.promql_expr_test.add_series(metric, value)
+
+
+@when('evaluation interval is `{interval}`')
+def step_impl(context, interval):
+    interval_without_unit, unit = resolve_time_and_unit(interval)
+    if interval_without_unit is None:
+        raise ValueError(f'Invalid interval time: {interval_without_unit}. ' +
+                           'A valid time looks like "1m" where you have a number plus a unit')
+    global_context.promql_expr_test.set_evaluation_interval(interval_without_unit, unit)
+
+
+@when('interval is `{interval}`')
+def step_impl(context, interval):
+    interval_without_unit, unit = resolve_time_and_unit(interval)
+    if interval_without_unit is None:
+        raise ValueError(f'Invalid interval time: {interval_without_unit}. ' +
+                           'A valid time looks like "1m" where you have a number plus a unit')
+    global_context.promql_expr_test.set_interval(interval_without_unit, unit)
+
+
+@when('evaluation time is `{eval_time}`')
+def step_impl(context, eval_time):
+    eval_time_without_unit, unit = resolve_time_and_unit(eval_time)
+    if eval_time_without_unit is None:
+        raise ValueError(f'Invalid evalution time: {eval_time}. ' +
+                           'A valid time looks like "1m" where you have a number plus a unit')
+    global_context.promql_expr_test.set_eval_time(eval_time_without_unit, unit)
+
+
+@when('variable `{variable}` is `{value}`')
+def step_impl(context, variable, value):
+    """Record `value` as the replacement for `variable` in the current test."""
+    global_context.promql_expr_test.set_variable(variable, value)
+
+
+@then('Grafana panel `{panel_name}` with legend `{legend}` shows')
+def step_impl(context, panel_name, legend):
+    """
+    This step can have an empty legend. As 'behave' doesn't provide a way
+    to say it's empty we use EMPTY to mark as empty.
+    """
+    if legend == "EMPTY":
+        legend = ''
+    query_id = panel_name + '-' + legend
+    if query_id not in global_context.query_map:
+        raise KeyError((f'Query with legend {legend} in panel "{panel_name}"'
+                           'couldn\'t be found'))
+
+    expr = global_context.query_map[query_id]['query']
+    global_context.promql_expr_test.set_expression(expr)
+    for row in context.table:
+        metric = row['metrics']
+        value = row['values']
+        global_context.promql_expr_test.add_exp_samples(metric, float(value))
+    path = global_context.query_map[query_id]['path']
+    global_context.data['stats'][path]['tested'] += 1
+
+
+@then('query `{query}` produces')
+def step_impl(context, query):
+    global_context.promql_expr_test.set_expression(query)
+    for row in context.table:
+        metric = row['metrics']
+        value = row['values']
+        global_context.promql_expr_test.add_exp_samples(metric, float(value))