diff --git a/doc/cephadm/services/monitoring.rst b/doc/cephadm/services/monitoring.rst
index 157332564e5d1..0f67d3f0044e6 100644
--- a/doc/cephadm/services/monitoring.rst
+++ b/doc/cephadm/services/monitoring.rst
@@ -341,13 +341,16 @@ and the metrics will not be visible in Prometheus.
 Setting up Prometheus
 -----------------------

-Setting Prometheus Retention Time
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Setting Prometheus Retention Size and Time
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Cephadm provides the option to set the Prometheus TDSB retention time using
-a ``retention_time`` field in the Prometheus service spec. The value defaults
-to 15 days (15d). If you would like a different value, such as 1 year (1y) you
-can apply a service spec similar to:
+Cephadm can configure Prometheus TSDB retention by specifying ``retention_time``
+and ``retention_size`` values in the Prometheus service spec.
+The retention time value defaults to 15 days (15d). Users can set a different value/unit where
+supported units are: 'y', 'w', 'd', 'h', 'm' and 's'. The retention size value defaults
+to 0 (disabled). Supported units in this case are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'.
+
+In the following example spec we set the retention time to 1 year and the size to 1GB.

 .. code-block:: yaml

@@ -356,6 +359,7 @@ can apply a service spec similar to:
       count: 1
     spec:
       retention_time: "1y"
+      retention_size: "1GB"

 .. note::

diff --git a/src/cephadm/cephadm b/src/cephadm/cephadm
index fa03cb6441c22..6515a5fafbf79 100755
--- a/src/cephadm/cephadm
+++ b/src/cephadm/cephadm
@@ -2639,7 +2639,9 @@ def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
         if daemon_type == 'prometheus':
             config = get_parm(ctx.config_json)
             retention_time = config.get('retention_time', '15d')
+            retention_size = config.get('retention_size', '0')  # default to disabled
             r += [f'--storage.tsdb.retention.time={retention_time}']
+            r += [f'--storage.tsdb.retention.size={retention_size}']
             scheme = 'http'
             host = get_fqdn()
             r += [f'--web.external-url={scheme}://{host}:{port}']
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index e3d90f13aebd2..d0a268f0c4c38 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -28,7 +28,7 @@
 from ceph.deployment.service_spec import \
     ServiceSpec, PlacementSpec, \
     HostPlacementSpec, IngressSpec, \
-    TunedProfileSpec
+    TunedProfileSpec, PrometheusSpec
 from ceph.utils import str_to_datetime, datetime_to_str, datetime_now
 from cephadm.serve import CephadmServe
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
@@ -2533,6 +2533,19 @@ def _apply(self, spec: GenericSpec) -> str:
             # should only refresh if a change has been detected
             self._trigger_preview_refresh(specs=[cast(DriveGroupSpec, spec)])

+        if spec.service_type == 'prometheus':
+            spec = cast(PrometheusSpec, spec)
+            if spec.retention_time:
+                valid_units = ['y', 'w', 'd', 'h', 'm', 's']
+                m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_time)
+                if not m:
+                    raise OrchestratorError(f"Invalid retention time. Valid units are: {', '.join(valid_units)}")
+            if spec.retention_size:
+                valid_units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']
+                m = re.search(rf"^(\d+)({'|'.join(valid_units)})$", spec.retention_size)
+                if not m:
+                    raise OrchestratorError(f"Invalid retention size. Valid units are: {', '.join(valid_units)}")
+
         return self._apply_service_spec(cast(ServiceSpec, spec))

     @handle_orch_error
diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py
index f111e00bb2def..5942a92597bfa 100644
--- a/src/pybind/mgr/cephadm/services/monitoring.py
+++ b/src/pybind/mgr/cephadm/services/monitoring.py
@@ -298,6 +298,12 @@ def generate_config(
         except AttributeError:
             retention_time = '15d'

+        try:
+            retention_size = spec.retention_size if spec.retention_size else '0'
+        except AttributeError:
+            # default to disabled
+            retention_size = '0'
+
         t = self.mgr.get('mgr_map').get('services', {}).get('prometheus', None)
         sd_port = self.mgr.service_discovery_port
         srv_end_point = ''
@@ -332,7 +338,8 @@ def generate_config(
                 'prometheus.yml': self.mgr.template.render('services/prometheus/prometheus.yml.j2', context),
                 'root_cert.pem': self.mgr.http_server.service_discovery.ssl_certs.get_root_cert()
             },
-            'retention_time': retention_time
+            'retention_time': retention_time,
+            'retention_size': retention_size
         }

         # include alerts, if present in the container
diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py
index 354ee338a5cfb..ccf3270c1d63e 100644
--- a/src/pybind/mgr/cephadm/tests/test_cephadm.py
+++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py
@@ -17,7 +17,7 @@

 from ceph.deployment.service_spec import ServiceSpec, PlacementSpec, RGWSpec, \
     NFSServiceSpec, IscsiServiceSpec, HostPlacementSpec, CustomContainerSpec, MDSSpec, \
-    CustomConfig
+    CustomConfig, PrometheusSpec
 from ceph.deployment.drive_selection.selector import DriveSelection
 from ceph.deployment.inventory import Devices, Device
 from ceph.utils import datetime_to_str, datetime_now
@@ -1508,6 +1508,64 @@ def test_apply_save(self, spec: ServiceSpec, meth, cephadm_module: CephadmOrches
         with with_service(cephadm_module, spec, meth, 'test'):
             pass

+    @pytest.mark.parametrize(
+        "spec, raise_exception, msg",
+        [
+            # Valid retention_time values (valid units: 'y', 'w', 'd', 'h', 'm', 's')
+            (PrometheusSpec(retention_time='1y'), False, ''),
+            (PrometheusSpec(retention_time=' 10w '), False, ''),
+            (PrometheusSpec(retention_time=' 1348d'), False, ''),
+            (PrometheusSpec(retention_time='2000h '), False, ''),
+            (PrometheusSpec(retention_time='173847m'), False, ''),
+            (PrometheusSpec(retention_time='200s'), False, ''),
+            (PrometheusSpec(retention_time=' '), False, ''),  # default value will be used
+
+            # Invalid retention_time values
+            (PrometheusSpec(retention_time='100k'), True, '^Invalid retention time'),  # invalid unit
+            (PrometheusSpec(retention_time='10'), True, '^Invalid retention time'),  # no unit
+            (PrometheusSpec(retention_time='100.00y'), True, '^Invalid retention time'),  # invalid value and valid unit
+            (PrometheusSpec(retention_time='100.00k'), True, '^Invalid retention time'),  # invalid value and invalid unit
+            (PrometheusSpec(retention_time='---'), True, '^Invalid retention time'),  # invalid value
+
+            # Valid retention_size values (valid units: 'B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')
+            (PrometheusSpec(retention_size='123456789B'), False, ''),
+            (PrometheusSpec(retention_size=' 200KB'), False, ''),
+            (PrometheusSpec(retention_size='99999MB '), False, ''),
+            (PrometheusSpec(retention_size=' 10GB '), False, ''),
+            (PrometheusSpec(retention_size='100TB'), False, ''),
+            (PrometheusSpec(retention_size='500PB'), False, ''),
+            (PrometheusSpec(retention_size='200EB'), False, ''),
+            (PrometheusSpec(retention_size=' '), False, ''),  # default value will be used
+
+            # Invalid retention_size values
+            (PrometheusSpec(retention_size='100b'), True, '^Invalid retention size'),  # invalid unit (case sensitive)
+            (PrometheusSpec(retention_size='333kb'), True, '^Invalid retention size'),  # invalid unit (case sensitive)
+            (PrometheusSpec(retention_size='2000'), True, '^Invalid retention size'),  # no unit
+            (PrometheusSpec(retention_size='200.00PB'), True, '^Invalid retention size'),  # invalid value and valid unit
+            (PrometheusSpec(retention_size='400.B'), True, '^Invalid retention size'),  # invalid value and invalid unit
+            (PrometheusSpec(retention_size='10.000s'), True, '^Invalid retention size'),  # invalid value and invalid unit
+            (PrometheusSpec(retention_size='...'), True, '^Invalid retention size'),  # invalid value
+
+            # valid retention_size and valid retention_time
+            (PrometheusSpec(retention_time='1y', retention_size='100GB'), False, ''),
+            # invalid retention_time and valid retention_size
+            (PrometheusSpec(retention_time='1j', retention_size='100GB'), True, '^Invalid retention time'),
+            # valid retention_time and invalid retention_size
+            (PrometheusSpec(retention_time='1y', retention_size='100gb'), True, '^Invalid retention size'),
+            # valid retention_time and invalid retention_size
+            (PrometheusSpec(retention_time='1y', retention_size='100gb'), True, '^Invalid retention size'),
+            # invalid retention_time and invalid retention_size
+            (PrometheusSpec(retention_time='1i', retention_size='100gb'), True, '^Invalid retention time'),
+        ])
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+    def test_apply_prometheus(self, spec: PrometheusSpec, raise_exception: bool, msg: str, cephadm_module: CephadmOrchestrator):
+        with with_host(cephadm_module, 'test'):
+            if not raise_exception:
+                cephadm_module._apply(spec)
+            else:
+                with pytest.raises(OrchestratorError, match=msg):
+                    cephadm_module._apply(spec)
+
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
     def test_mds_config_purge(self, cephadm_module: CephadmOrchestrator):
         spec = MDSSpec('mds', service_id='fsname', config={'test': 'foo'})
diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py
index 22e58317af0d7..8655a7119a8cc 100644
--- a/src/pybind/mgr/cephadm/tests/test_services.py
+++ b/src/pybind/mgr/cephadm/tests/test_services.py
@@ -17,7 +17,7 @@
     NodeExporterService, LokiService, PromtailService
 from cephadm.module import CephadmOrchestrator
 from ceph.deployment.service_spec import IscsiServiceSpec, MonitoringSpec, AlertManagerSpec, \
-    ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, TracingSpec
+    ServiceSpec, RGWSpec, GrafanaSpec, SNMPGatewaySpec, IngressSpec, PlacementSpec, TracingSpec, PrometheusSpec
 from cephadm.tests.fixtures import with_host, with_service, _run_cephadm, async_side_effect

 from orchestrator import OrchestratorError
@@ -392,7 +392,7 @@ def test_prometheus_config(self, _run_cephadm, cephadm_module: CephadmOrchestrat

         with with_host(cephadm_module, 'test'):
             with with_service(cephadm_module, MonitoringSpec('node-exporter')) as _, \
-                    with_service(cephadm_module, MonitoringSpec('prometheus')) as _:
+                    with_service(cephadm_module, PrometheusSpec('prometheus')) as _:

                 y = dedent("""
                 # This file is generated by cephadm.
@@ -431,7 +431,9 @@ def test_prometheus_config(self, _run_cephadm, cephadm_module: CephadmOrchestrat
                         '--tcp-ports', '9095'
                     ],
                     stdin=json.dumps({"files": {"prometheus.yml": y, "root_cert.pem": '',
-                                                "/etc/prometheus/alerting/custom_alerts.yml": ""}, 'retention_time': '15d'}),
+                                                "/etc/prometheus/alerting/custom_alerts.yml": ""},
+                                      'retention_time': '15d',
+                                      'retention_size': '0'}),
                     image='')

     @patch("cephadm.serve.CephadmServe._run_cephadm")
@@ -536,7 +538,7 @@ def test_grafana_config(self, _run_cephadm, cephadm_module: CephadmOrchestrator)
         cephadm_module.set_store("test/grafana_crt", "c")
         cephadm_module.set_store("test/grafana_key", "k")
         with with_service(
-            cephadm_module, MonitoringSpec("prometheus")
+            cephadm_module, PrometheusSpec("prometheus")
         ) as _, with_service(cephadm_module, ServiceSpec("mgr")) as _, with_service(
             cephadm_module, GrafanaSpec("grafana")
         ) as _:
diff --git a/src/python-common/ceph/deployment/service_spec.py b/src/python-common/ceph/deployment/service_spec.py
index 9ff800f42eaf1..16db5ed7cc4e4 100644
--- a/src/python-common/ceph/deployment/service_spec.py
+++ b/src/python-common/ceph/deployment/service_spec.py
@@ -1272,6 +1272,7 @@ def __init__(self,
                  networks: Optional[List[str]] = None,
                  port: Optional[int] = None,
                  retention_time: Optional[str] = None,
+                 retention_size: Optional[str] = None,
                  extra_container_args: Optional[List[str]] = None,
                  custom_configs: Optional[List[CustomConfig]] = None,
                  ):
@@ -1282,7 +1283,8 @@ def __init__(self,
             preview_only=preview_only, config=config, networks=networks, port=port,
             extra_container_args=extra_container_args, custom_configs=custom_configs)

-        self.retention_time = retention_time
+        self.retention_time = retention_time.strip() if retention_time else None
+        self.retention_size = retention_size.strip() if retention_size else None


 yaml.add_representer(PrometheusSpec, ServiceSpec.yaml_representer)
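
The retention validation that this change adds to CephadmOrchestrator._apply() is easy to exercise on its own when reviewing the accepted unit lists. The snippet below is only an illustrative sketch, not part of the patch: the helper name valid_retention is made up here, and the whitespace stripping that the patch performs in PrometheusSpec.__init__ is folded into the helper for brevity.

    import re

    # Illustrative sketch only: mirrors the regex check added to
    # CephadmOrchestrator._apply(); valid_retention is not a cephadm function.
    def valid_retention(value: str, valid_units: list) -> bool:
        # Accept an integer immediately followed by exactly one allowed unit.
        return re.search(rf"^(\d+)({'|'.join(valid_units)})$", value.strip()) is not None

    time_units = ['y', 'w', 'd', 'h', 'm', 's']
    size_units = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB']

    print(valid_retention('1y', time_units))      # True
    print(valid_retention('100k', time_units))    # False: unknown unit
    print(valid_retention(' 10GB ', size_units))  # True: whitespace stripped, as PrometheusSpec.__init__ does
    print(valid_retention('100gb', size_units))   # False: units are case sensitive

A value that is empty or whitespace-only is stripped down to an empty string by PrometheusSpec.__init__, so the checks in _apply() are skipped and the defaults (15d retention time, retention size disabled) take effect; that is what the whitespace-only test cases above rely on.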