Skip to content

Commit

Permalink
mgr/cephadm: Adding --storage.tsdb.retention.size prometheus option
Browse files Browse the repository at this point in the history
fixes: https://tracker.ceph.com/issues/57422

Signed-off-by: Redouane Kachach <[email protected]>
  • Loading branch information
rkachach committed Sep 8, 2022
1 parent 7c73866 commit 4da92c5
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 14 deletions.
16 changes: 10 additions & 6 deletions doc/cephadm/services/monitoring.rst
Original file line number Diff line number Diff line change
Expand Up @@ -341,13 +341,16 @@ and the metrics will not be visible in Prometheus.
Setting up Prometheus
-----------------------

Setting Prometheus Retention Time
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Setting Prometheus Retention Size and Time
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Cephadm provides the option to set the Prometheus TSDB retention time using
a ``retention_time`` field in the Prometheus service spec. The value defaults
to 15 days (15d). If you would like a different value, such as 1 year (1y) you
can apply a service spec similar to:
Cephadm can configure Prometheus TSDB retention by specifying ``retention_time``
and ``retention_size`` values in the Prometheus service spec.
The retention time value defaults to 15 days (15d). Users can set a different value and unit; the
supported units (case-sensitive) are: 'y', 'w', 'd', 'h', 'm' and 's'. The retention size value defaults
to 0 (disabled). The supported units (case-sensitive) in this case are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'.

In the following example spec we set the retention time to 1 year and the size to 1GB.

.. code-block:: yaml
Expand All @@ -356,6 +359,7 @@ can apply a service spec similar to:
count: 1
spec:
retention_time: "1y"
retention_size: "1GB"
.. note::

Expand Down
2 changes: 2 additions & 0 deletions src/cephadm/cephadm
Original file line number Diff line number Diff line change
Expand Up @@ -2639,7 +2639,9 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
if daemon_type == 'prometheus':
config = get_parm(ctx.config_json)
retention_time = config.get('retention_time', '15d')
retention_size = config.get('retention_size', '0') # default to disabled
r += [f'--storage.tsdb.retention.time={retention_time}']
r += [f'--storage.tsdb.retention.size={retention_size}']
scheme = 'http'
host = get_fqdn()
r += [f'--web.external-url={scheme}://{host}:{port}']
Expand Down
15 changes: 14 additions & 1 deletion src/pybind/mgr/cephadm/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from ceph.deployment.service_spec import \
ServiceSpec, PlacementSpec, \
HostPlacementSpec, IngressSpec, \
TunedProfileSpec
TunedProfileSpec, PrometheusSpec
from ceph.utils import str_to_datetime, datetime_to_str, datetime_now
from cephadm.serve import CephadmServe
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
Expand Down Expand Up @@ -2533,6 +2533,19 @@ def _apply(self, spec: GenericSpec) -> str:
# should only refresh if a change has been detected
self._trigger_preview_refresh(specs=[cast(DriveGroupSpec, spec)])

if spec.service_type == 'prometheus':
spec = cast(PrometheusSpec, spec)
if spec.retention_time:
valid_units = ['y', 'w', 'd', 'h', 'm', 's']
m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_time)
if not m:
raise OrchestratorError(f"Invalid retention time. Valid units are: {', '.join(valid_units)}")
if spec.retention_size:
valid_units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_size)
if not m:
raise OrchestratorError(f"Invalid retention size. Valid units are: {', '.join(valid_units)}")

return self._apply_service_spec(cast(ServiceSpec, spec))

@handle_orch_error
Expand Down
9 changes: 8 additions & 1 deletion src/pybind/mgr/cephadm/services/monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,12 @@ def generate_config(
except AttributeError:
retention_time = '15d'

try:
retention_size = spec.retention_size if spec.retention_size else '0'
except AttributeError:
# default to disabled
retention_size = '0'

t = self.mgr.get('mgr_map').get('services', {}).get('prometheus', None)
sd_port = self.mgr.service_discovery_port
srv_end_point = ''
Expand Down Expand Up @@ -332,7 +338,8 @@ def generate_config(
'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context),
'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert()
},
'retention_time': retention_time
'retention_time': retention_time,
'retention_size': retention_size
}

# include alerts, if present in the container
Expand Down
60 changes: 59 additions & 1 deletion src/pybind/mgr/cephadm/tests/test_cephadm.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from ceph.deployment.service_spec import ServiceSpec, PlacementSpec, RGWSpec, \
NFSServiceSpec, IscsiServiceSpec, HostPlacementSpec, CustomContainerSpec, MDSSpec, \
CustomConfig
CustomConfig, PrometheusSpec
from ceph.deployment.drive_selection.selector import DriveSelection
from ceph.deployment.inventory import Devices, Device
from ceph.utils import datetime_to_str, datetime_now
Expand Down Expand Up @@ -1508,6 +1508,64 @@ def test_apply_save(self, spec: ServiceSpec, meth, cephadm_module: CephadmOrches
with with_service(cephadm_module, spec, meth, 'test'):
pass

@pytest.mark.parametrize(
    "spec, raise_exception, msg",
    [
        # Valid retention_time values (valid units: 'y', 'w', 'd', 'h', 'm', 's').
        # Surrounding whitespace is accepted because PrometheusSpec strips it.
        (PrometheusSpec(retention_time='1y'), False, ''),
        (PrometheusSpec(retention_time=' 10w '), False, ''),
        (PrometheusSpec(retention_time=' 1348d'), False, ''),
        (PrometheusSpec(retention_time='2000h '), False, ''),
        (PrometheusSpec(retention_time='173847m'), False, ''),
        (PrometheusSpec(retention_time='200s'), False, ''),
        (PrometheusSpec(retention_time=' '), False, ''),  # default value will be used
        # Invalid retention_time values
        (PrometheusSpec(retention_time='100k'), True, '^Invalid retention time'),  # invalid unit
        (PrometheusSpec(retention_time='10'), True, '^Invalid retention time'),  # no unit
        (PrometheusSpec(retention_time='100.00y'), True, '^Invalid retention time'),  # invalid value and valid unit
        (PrometheusSpec(retention_time='100.00k'), True, '^Invalid retention time'),  # invalid value and invalid unit
        (PrometheusSpec(retention_time='---'), True, '^Invalid retention time'),  # invalid value
        # Valid retention_size values (valid units: 'B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')
        (PrometheusSpec(retention_size='123456789B'), False, ''),
        (PrometheusSpec(retention_size=' 200KB'), False, ''),
        (PrometheusSpec(retention_size='99999MB '), False, ''),
        (PrometheusSpec(retention_size=' 10GB '), False, ''),
        (PrometheusSpec(retention_size='100TB'), False, ''),
        (PrometheusSpec(retention_size='500PB'), False, ''),
        (PrometheusSpec(retention_size='200EB'), False, ''),
        (PrometheusSpec(retention_size=' '), False, ''),  # default value will be used
        # Invalid retention_size values
        (PrometheusSpec(retention_size='100b'), True, '^Invalid retention size'),  # invalid unit (case sensitive)
        (PrometheusSpec(retention_size='333kb'), True, '^Invalid retention size'),  # invalid unit (case sensitive)
        (PrometheusSpec(retention_size='2000'), True, '^Invalid retention size'),  # no unit
        (PrometheusSpec(retention_size='200.00PB'), True, '^Invalid retention size'),  # invalid value and valid unit
        (PrometheusSpec(retention_size='400.B'), True, '^Invalid retention size'),  # invalid value and valid unit
        (PrometheusSpec(retention_size='10.000s'), True, '^Invalid retention size'),  # invalid value and invalid unit
        (PrometheusSpec(retention_size='...'), True, '^Invalid retention size'),  # invalid value
        # valid retention_size and valid retention_time
        (PrometheusSpec(retention_time='1y', retention_size='100GB'), False, ''),
        # invalid retention_time and valid retention_size
        (PrometheusSpec(retention_time='1j', retention_size='100GB'), True, '^Invalid retention time'),
        # valid retention_time and invalid retention_size
        (PrometheusSpec(retention_time='1y', retention_size='100gb'), True, '^Invalid retention size'),
        # invalid retention_time and invalid retention_size: the time is
        # validated first, so its error is the one that surfaces
        (PrometheusSpec(retention_time='1i', retention_size='100gb'), True, '^Invalid retention time'),
    ])
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_apply_prometheus(self, spec: PrometheusSpec, raise_exception: bool, msg: str, cephadm_module: CephadmOrchestrator):
    """Check that ``_apply`` validates Prometheus retention settings.

    :param spec: Prometheus service spec carrying the retention values under test.
    :param raise_exception: whether applying ``spec`` is expected to fail.
    :param msg: regex the ``OrchestratorError`` message must match when a
        failure is expected (unused otherwise).
    :param cephadm_module: orchestrator fixture the spec is applied to.
    """
    with with_host(cephadm_module, 'test'):
        if raise_exception:
            with pytest.raises(OrchestratorError, match=msg):
                cephadm_module._apply(spec)
        else:
            cephadm_module._apply(spec)

@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_mds_config_purge(self, cephadm_module: CephadmOrchestrator):
spec = MDSSpec('mds', service_id='fsname', config={'test': 'foo'})
Expand Down
10 changes: 6 additions & 4 deletions src/pybind/mgr/cephadm/tests/test_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
NodeExporterService, LokiService, PromtailService
from cephadm.module import CephadmOrchestrator
from ceph.deployment.service_spec import IscsiServiceSpec, MonitoringSpec, AlertManagerSpec, \
ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, TracingSpec
ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, TracingSpec, PrometheusSpec
from cephadm.tests.fixtures import with_host, with_service, _run_cephadm, async_side_effect

from orchestrator import OrchestratorError
Expand Down Expand Up @@ -392,7 +392,7 @@ def test_prometheus_config(self, _run_cephadm, cephadm_module: CephadmOrchestrat

with with_host(cephadm_module, 'test'):
with with_service(cephadm_module, MonitoringSpec('node-exporter')) as _, \
with_service(cephadm_module, MonitoringSpec('prometheus')) as _:
with_service(cephadm_module, PrometheusSpec('prometheus')) as _:

y = dedent("""
# This file is generated by cephadm.
Expand Down Expand Up @@ -431,7 +431,9 @@ def test_prometheus_config(self, _run_cephadm, cephadm_module: CephadmOrchestrat
'--tcp-ports', '9095'
],
stdin=json.dumps({"files": {"prometheus.yml": y, "root_cert.pem": '',
"/etc/prometheus/alerting/custom_alerts.yml": ""}, 'retention_time': '15d'}),
"/etc/prometheus/alerting/custom_alerts.yml": ""},
'retention_time': '15d',
'retention_size': '0'}),
image='')

@patch("cephadm.serve.CephadmServe._run_cephadm")
Expand Down Expand Up @@ -536,7 +538,7 @@ def test_grafana_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator)
cephadm_module.set_store("test/grafana_crt", "c")
cephadm_module.set_store("test/grafana_key", "k")
with with_service(
cephadm_module, MonitoringSpec("prometheus")
cephadm_module, PrometheusSpec("prometheus")
) as _, with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service(
cephadm_module, GrafanaSpec("grafana")
) as _:
Expand Down
4 changes: 3 additions & 1 deletion src/python-common/ceph/deployment/service_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,6 +1272,7 @@ def __init__(self,
networks: Optional[List[str]] = None,
port: Optional[int] = None,
retention_time: Optional[str] = None,
retention_size: Optional[str] = None,
extra_container_args: Optional[List[str]] = None,
custom_configs: Optional[List[CustomConfig]] = None,
):
Expand All @@ -1282,7 +1283,8 @@ def __init__(self,
preview_only=preview_only, config=config, networks=networks, port=port,
extra_container_args=extra_container_args, custom_configs=custom_configs)

self.retention_time = retention_time
self.retention_time = retention_time.strip() if retention_time else None
self.retention_size = retention_size.strip() if retention_size else None


yaml.add_representer(PrometheusSpec, ServiceSpec.yaml_representer)
Expand Down

0 comments on commit 4da92c5

Please sign in to comment.