
Commit

Adjust DS: add alert rules. Implement uploading the alert rules to Prometheus.
Pandas886 committed Apr 12, 2023
1 parent 3768dc1 commit 6d97388
Showing 12 changed files with 127 additions and 20 deletions.
@@ -42,7 +42,7 @@
import java.util.function.Predicate;
import java.util.stream.Collectors;

import static org.dromara.cloudeon.utils.Constant.AdminUserId;
import static org.dromara.cloudeon.utils.Constant.*;

/**
* Cluster service related APIs
@@ -858,15 +858,15 @@ public ResultDTO<String> getDashboardUrl(Integer serviceInstanceId) {
StackServiceEntity stackServiceEntity = stackServiceRepository.findById(serviceInstanceEntity.getStackServiceId()).get();

// If the monitor service is not installed, prompt the user to install it first
ServiceInstanceEntity monitorServiceInstance = serviceInstanceRepository.findEntityByClusterIdAndStackServiceName(serviceInstanceEntity.getClusterId(), "MONITOR");
ServiceInstanceEntity monitorServiceInstance = serviceInstanceRepository.findEntityByClusterIdAndStackServiceName(serviceInstanceEntity.getClusterId(), MONITOR_SERVICE_NAME);
if (monitorServiceInstance == null) {
return ResultDTO.success("Please install the Monitor service first");
}

// Build the full URL from the stack service's dashboard and the Grafana address
Integer monitorServiceInstanceId = monitorServiceInstance.getId();
String grafanaHttpPort = serviceInstanceConfigRepository.findByServiceInstanceIdAndName(monitorServiceInstanceId, "grafana.http.port").getValue();
ServiceRoleInstanceEntity grafana = roleInstanceRepository.findByServiceInstanceIdAndServiceRoleName(monitorServiceInstanceId, "GRAFANA").get(0);
ServiceRoleInstanceEntity grafana = roleInstanceRepository.findByServiceInstanceIdAndServiceRoleName(monitorServiceInstanceId, MONITOR_ROLE_GRAFANA).get(0);
Integer grafanaNodeId = grafana.getNodeId();
ClusterNodeEntity grafanaNodeEntity = clusterNodeRepository.findById(grafanaNodeId).get();
String dashboardUid = stackServiceEntity.getDashboardUid();
@@ -26,6 +26,8 @@
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.dromara.cloudeon.utils.Constant.MONITOR_SERVICE_NAME;

@NoArgsConstructor
public class RegisterPrometheusScrapyTask extends BaseCloudeonTask {
private static final String PROMETHEUS_DIR = "prometheus";
@@ -53,7 +55,7 @@ public void internalExecute() {
File renderDirFile = new File(renderDir);

// Look up the Monitor service installed in the current cluster
ServiceInstanceEntity monitorServiceInstance = serviceInstanceRepository.findEntityByClusterIdAndStackServiceName(serviceInstanceEntity.getClusterId(), "MONITOR");
ServiceInstanceEntity monitorServiceInstance = serviceInstanceRepository.findEntityByClusterIdAndStackServiceName(serviceInstanceEntity.getClusterId(), MONITOR_SERVICE_NAME);

// Skip if the prometheus directory does not exist
// todo: skip if the monitor service is not installed
@@ -1,14 +1,32 @@
package org.dromara.cloudeon.service;

import cn.hutool.core.io.FileUtil;
import cn.hutool.extra.spring.SpringUtil;
import freemarker.template.Template;
import freemarker.template.TemplateException;
import lombok.extern.slf4j.Slf4j;
import org.apache.sshd.client.session.ClientSession;
import org.apache.sshd.sftp.client.SftpClientFactory;
import org.apache.sshd.sftp.client.fs.SftpFileSystem;
import org.dromara.cloudeon.config.CloudeonConfigProp;
import org.dromara.cloudeon.dao.ClusterAlertRuleRepository;
import org.dromara.cloudeon.dao.ClusterNodeRepository;
import org.dromara.cloudeon.dao.ServiceInstanceRepository;
import org.dromara.cloudeon.dao.ServiceRoleInstanceRepository;
import org.dromara.cloudeon.entity.ClusterAlertRuleEntity;
import org.dromara.cloudeon.entity.ClusterNodeEntity;
import org.dromara.cloudeon.entity.ServiceInstanceEntity;
import org.dromara.cloudeon.entity.ServiceRoleInstanceEntity;
import org.dromara.cloudeon.utils.Constant;
import org.dromara.cloudeon.utils.SshUtils;
import org.slf4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.web.servlet.view.freemarker.FreeMarkerConfigurer;

import javax.annotation.Resource;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.StringWriter;
import java.util.HashMap;
@@ -17,15 +35,38 @@
import java.util.function.Consumer;
import java.util.stream.Collectors;

import static org.dromara.cloudeon.utils.Constant.*;

@Service
public class AlertService {
@Resource
private FreeMarkerConfigurer freeMarkerConfigurer;

@Resource
private CloudeonConfigProp cloudeonConfigProp;

@Resource
private ServiceInstanceRepository serviceInstanceRepository;

@Resource
private ServiceRoleInstanceRepository roleInstanceRepository;

@Resource
private ClusterAlertRuleRepository clusterAlertRuleRepository;

public void upgradeMonitorAlertRule(Integer clusterId) {
@Resource
private ClusterNodeRepository clusterNodeRepository;

public void upgradeMonitorAlertRule(Integer clusterId, Logger log) {
String workHome = cloudeonConfigProp.getWorkHome();
// Create the local alert rule resource working directory, e.g. ${workHome}/alert-rule/1/
String alertRuleOutputPath = workHome + File.separator + Constant.ALERT_RULE_RESOURCE_DIR + File.separator + clusterId;
log.info("开始集群告警规则资源文件生成:" + alertRuleOutputPath);

if (!FileUtil.exist(alertRuleOutputPath)) {
log.info("目录{}不存在,创建目录...", alertRuleOutputPath);
FileUtil.mkdir(alertRuleOutputPath);
}
List<ClusterAlertRuleEntity> alertRuleEntityList = clusterAlertRuleRepository.findByClusterId(clusterId);
Map<String, List<ClusterAlertRuleEntity>> map = alertRuleEntityList.stream().collect(Collectors.groupingBy(ClusterAlertRuleEntity::getStackServiceName));
// Generate an alert rule file for each service
@@ -34,21 +75,44 @@ public void upgradeMonitorAlertRule(Integer clusterId) {
public void accept(Map.Entry<String, List<ClusterAlertRuleEntity>> stringListEntry) {
String serviceName = stringListEntry.getKey();
try {
String outputFileName = serviceName + ".yml";
String outPutFile = alertRuleOutputPath + File.separator + outputFileName;
Template template = freeMarkerConfigurer.getConfiguration().getTemplate("alert-rule-template.ftl");
// Render the template
Map<String, Object> data = new HashMap<>();
data.put("serviceName", serviceName);
data.put("ruleList", stringListEntry.getValue());
StringWriter writer = new StringWriter();
template.process(data, writer);
String result = writer.toString();
System.out.println(result);
FileWriter out = new FileWriter(outPutFile);
template.process(data, out);
log.info("完成服务告警规则资源文件生成:" + outPutFile);
out.close();
} catch (IOException | TemplateException e) {
e.printStackTrace();
}
}
});

// Find the node running the Prometheus role and upload all of this cluster's alert rule files from the local directory
ServiceInstanceEntity monitorServiceInstance = serviceInstanceRepository.findEntityByClusterIdAndStackServiceName(clusterId, MONITOR_SERVICE_NAME);
ServiceRoleInstanceEntity prometheus = roleInstanceRepository.findByServiceInstanceIdAndServiceRoleName(monitorServiceInstance.getId(), MONITOR_ROLE_PROMETHEUS).get(0);
ClusterNodeEntity prometheusNode = clusterNodeRepository.findById(prometheus.getNodeId()).get();
// Open an SSH connection
ClientSession clientSession = SshUtils.openConnectionByPassword(prometheusNode.getIp(), prometheusNode.getSshPort(), prometheusNode.getSshUser(), prometheusNode.getSshPassword());
SftpFileSystem sftp;
try {
sftp = SftpClientFactory.instance().createSftpFileSystem(clientSession);
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException("打开sftp失败:" + e);
}
String remoteConfDirPath = "/opt/edp/" + monitorServiceInstance.getServiceName() + "/conf/rule/";
log.info("拷贝本地配置目录:" + alertRuleOutputPath + " 到节点" + prometheusNode.getIp() + "的:" + remoteConfDirPath);
try {
SshUtils.uploadLocalDirToRemote(remoteConfDirPath, alertRuleOutputPath, sftp);
} catch (IOException e) {
e.printStackTrace();
throw new RuntimeException(e);
}

}
}
@@ -16,11 +16,15 @@ public class Constant {

public static final String K8S_DIR = "k8s";
public static final String K8S_RESOURCE_DIR = "k8s-resource";
public static final String ALERT_RULE_RESOURCE_DIR = "alert-rule";
public static final String YARN_SERVICE_NAME = "YARN";
public static final String HBASE_SERVICE_NAME = "HBASE";
public static final String SPARK_SERVICE_NAME = "SPARK";
public static final String HDFS_SERVICE_NAME = "HDFS";
public static final String HIVE_SERVICE_NAME = "HIVE";
public static final String ZOOKEEPER_SERVICE_NAME = "ZOOKEEPER";
public static final String MONITOR_SERVICE_NAME = "MONITOR";
public static final String MONITOR_ROLE_GRAFANA = "GRAFANA";
public static final String MONITOR_ROLE_PROMETHEUS = "PROMETHEUS";

}
@@ -2,14 +2,14 @@ groups:
- name: ${serviceName}
rules:
<#list ruleList as rule>
- alert: ${rule.ruleName}
expr: ${rule.promql}
- alert: "${rule.ruleName}"
expr: "${rule.promql}"
labels:
alertLevel: ${rule.alertLevel}
clusterId: ${rule.clusterId}
serviceRoleName: ${rule.stackRoleName}
serviceName: ${rule.stackServiceName}
annotations:
alertAdvice: ${rule.alertAdvice}
alertInfo: ${rule.alertInfo}
alertAdvice: "${rule.alertAdvice}"
alertInfo: "${rule.alertInfo}"
</#list>
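For reference, a rough sketch of what this template could render for one of the DolphinScheduler rules defined further down in this commit (clusterId 1 and stackServiceName DOLPHINSCHEDULER are assumed values here, not taken from the diff); the added double quotes around ruleName, promql and the annotation fields presumably keep free-form rule names, PromQL and annotation text from breaking the YAML:

groups:
- name: DOLPHINSCHEDULER
  rules:
  - alert: "dolphinscheduler master process liveness"
    expr: "up{job='masterserver'} != 1"
    labels:
      alertLevel: exception
      clusterId: 1
      serviceRoleName: DS_MASTER_SERVER
      serviceName: DOLPHINSCHEDULER
    annotations:
      alertAdvice: "Restart the process"
      alertInfo: "Instance {{ $labels.instance }} of job {{ $labels.job }} raised an alert"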
@@ -4,6 +4,7 @@
import freemarker.template.Configuration;
import freemarker.template.Template;
import freemarker.template.TemplateException;
import lombok.extern.slf4j.Slf4j;
import org.dromara.cloudeon.dao.AlertQuotaRepository;
import org.dromara.cloudeon.dao.ClusterInfoRepository;
import org.dromara.cloudeon.dto.PrometheusAlertRule;
@@ -23,7 +24,7 @@
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

@Slf4j
@RunWith(SpringRunner.class)
@SpringBootTest
public class AlertRuleTest {
@@ -35,7 +36,7 @@ public class AlertRuleTest {
@Test
public void dod() {

alertService.upgradeMonitorAlertRule(2);
alertService.upgradeMonitorAlertRule(1,log);
}


25 changes: 25 additions & 0 deletions cloudeon-stack/EDP-1.0.0/dolphinscheduler/alert-rule.yaml
@@ -0,0 +1,25 @@
rules:
  - alert: dolphinscheduler master process liveness
    promql: up{job='masterserver'} != 1
    alertLevel: exception
    serviceRoleName: DS_MASTER_SERVER
    alertAdvice: Restart the process
    alertInfo: "Instance {{ $labels.instance }} of job {{ $labels.job }} raised an alert"
  - alert: dolphinscheduler worker process liveness
    promql: up{job='workerserver'} != 1
    alertLevel: exception
    serviceRoleName: DS_WORKER_SERVER
    alertAdvice: Restart the process
    alertInfo: "Instance {{ $labels.instance }} of job {{ $labels.job }} raised an alert"
  - alert: dolphinscheduler api process liveness
    promql: up{job='apiserver'} != 1
    alertLevel: exception
    serviceRoleName: DS_API_SERVER
    alertAdvice: Restart the process
    alertInfo: "Instance {{ $labels.instance }} of job {{ $labels.job }} raised an alert"
  - alert: dolphinscheduler alert process liveness
    promql: up{job='alertserver'} != 1
    alertLevel: exception
    serviceRoleName: DS_ALERT_SERVER
    alertAdvice: Restart the process
    alertInfo: "Instance {{ $labels.instance }} of job {{ $labels.job }} raised an alert"
@@ -39,8 +39,12 @@ resource.aws.s3.endpoint=http://localhost:9000

# if resource.storage.type=HDFS, the user must have the permission to create directories under the HDFS root path
resource.hdfs.root.user=hdfs
<#assign hdfs=dependencies.HDFS />
<#if hdfs.serviceRoles['HDFS_NAMENODE']?size gt 1>
<#assign fs_default_uri = "hdfs://" + hdfs.conf['nameservices']>
</#if>
# if resource.storage.type=S3, the value like: s3a://dolphinscheduler; if resource.storage.type=HDFS and namenode HA is enabled, you need to copy core-site.xml and hdfs-site.xml to conf dir
resource.hdfs.fs.defaultFS=hdfs://mycluster:8020
resource.hdfs.fs.defaultFS=${fs_default_uri}

# whether to startup kerberos
hadoop.security.authentication.startup.state=false
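As a rough illustration of the HDFS snippet above (the nameservice value is assumed, not taken from the diff): with more than one HDFS_NAMENODE role and the HDFS configuration key nameservices set to mycluster, the block would render to:

resource.hdfs.fs.defaultFS=hdfs://mycluster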
@@ -8,10 +8,16 @@ export SPRING_PROFILES_ACTIVE=$DATABASE
export SPRING_CACHE_TYPE=none
export SPRING_JACKSON_TIME_ZONE=UTC
export MASTER_FETCH_COMMAND_NUM=10

<#--handle dependent.zookeeper-->
<#if dependencies.ZOOKEEPER??>
<#assign zookeeper=dependencies.ZOOKEEPER quorum=[]>
<#list zookeeper.serviceRoles['ZOOKEEPER_SERVER'] as role>
<#assign quorum += [role.hostname + ":" + zookeeper.conf["zookeeper.client.port"]]>
</#list>
</#if>
# Registry center configuration, determines the type and link of the registry center
export REGISTRY_TYPE="zookeeper"
export REGISTRY_ZOOKEEPER_CONNECT_STRING="localhost:2181"
export REGISTRY_ZOOKEEPER_CONNECT_STRING="${quorum?join(",")}"

# Tasks related configurations, need to change the configuration if you use the related tasks.
export HADOOP_HOME=/opt/soft/hadoop
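As a rough illustration of the registry block above (hostnames assumed, not taken from the diff): with three ZOOKEEPER_SERVER roles on node1, node2 and node3 and zookeeper.client.port set to 2181, the template would render to:

export REGISTRY_TYPE="zookeeper"
export REGISTRY_ZOOKEEPER_CONNECT_STRING="node1:2181,node2:2181,node3:2181"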
@@ -55,7 +55,7 @@ registry:

worker:
# worker listener port
listen-port: 1234
listen-port: ${conf['worker.server.listern.port']}
# worker execute thread number to limit task instances in parallel
exec-threads: 100
# worker heartbeat interval
2 changes: 1 addition & 1 deletion cloudeon-stack/EDP-1.0.0/monitor/render/prometheus.yml.ftl
@@ -19,7 +19,7 @@ alerting:
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "/opt/edp/${service.serviceName}/conf/rule/rules*.yml"
- "/opt/edp/${service.serviceName}/conf/rule/*.yml"
<#assign node_exporters=[]>
<#list serviceRoles['NODEEXPORTER'] as role>
<#assign node_exporters += ["'"+role.hostname + ":" + conf["nodeexporter.http.port"]+"'"]>
1 change: 1 addition & 0 deletions cloudeon-stack/EDP-1.0.0/monitor/service-info.yaml
@@ -14,6 +14,7 @@ dashboard:

persistencePaths:
- /opt/edp/${service.serviceName}/conf
- /opt/edp/${service.serviceName}/conf/rule
- /opt/edp/${service.serviceName}/conf/discovery_configs
- /opt/edp/${service.serviceName}/log
- /opt/edp/${service.serviceName}/data
