Skip to content

Commit

Permalink
Add declare incident step and model (#5047)
Browse files Browse the repository at this point in the history
Related to grafana/oncall-private#2831

## Checklist

- [ ] Unit, integration, and e2e (if applicable) tests updated
- [ ] Documentation added (or `pr:no public docs` PR label added if not
required)
- [ ] Added the relevant release notes label (see labels prefixed w/
`release:`). These labels dictate how your PR will
    show up in the autogenerated release notes.

---------

Co-authored-by: Matias Bordese <[email protected]>
Co-authored-by: Dominik <[email protected]>
  • Loading branch information
3 people authored Oct 2, 2024
1 parent 612c0e5 commit 70b7273
Show file tree
Hide file tree
Showing 27 changed files with 1,009 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class Meta:
"to_time",
"num_alerts_in_window",
"num_minutes_in_window",
"severity",
"custom_webhook",
"notify_schedule",
"notify_to_group",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@
from apps.alerts.models.escalation_policy import EscalationPolicy
from apps.alerts.tasks import (
custom_webhook_result,
declare_incident,
notify_all_task,
notify_group_task,
notify_user_task,
resolve_by_last_step_task,
)
from apps.alerts.utils import is_declare_incident_step_enabled
from apps.schedules.ical_utils import list_users_to_notify_from_ical
from apps.user_management.models import User

Expand All @@ -40,6 +42,7 @@ class EscalationPolicySnapshot:
"notify_schedule",
"notify_to_group",
"notify_to_team_members",
"severity",
"escalation_counter",
"passed_last_time",
"pause_escalation",
Expand Down Expand Up @@ -71,6 +74,7 @@ def __init__(
passed_last_time,
pause_escalation,
notify_to_team_members=None,
severity=None,
):
self.id = id
self.order = order
Expand All @@ -86,6 +90,7 @@ def __init__(
self.notify_schedule = notify_schedule
self.notify_to_group = notify_to_group
self.notify_to_team_members = notify_to_team_members
self.severity = severity
self.escalation_counter = escalation_counter # used for STEP_REPEAT_ESCALATION_N_TIMES
self.passed_last_time = passed_last_time # used for building escalation plan
self.pause_escalation = pause_escalation # used for STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW
Expand Down Expand Up @@ -133,6 +138,7 @@ def execute(self, alert_group: "AlertGroup", reason) -> StepExecutionResultData:
EscalationPolicy.STEP_NOTIFY_IF_NUM_ALERTS_IN_TIME_WINDOW: self._escalation_step_notify_if_num_alerts_in_time_window,
EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS: self._escalation_step_notify_multiple_users,
EscalationPolicy.STEP_NOTIFY_MULTIPLE_USERS_IMPORTANT: self._escalation_step_notify_multiple_users,
EscalationPolicy.STEP_DECLARE_INCIDENT: self._escalation_step_declare_incident,
None: self._escalation_step_not_configured,
}
result = action_map[self.step](alert_group, reason)
Expand Down Expand Up @@ -407,6 +413,32 @@ def _escalation_step_notify_team_members(self, alert_group: "AlertGroup", reason

self._execute_tasks(tasks)

def _escalation_step_declare_incident(self, alert_group: "AlertGroup", _reason: str) -> None:
    """
    Run the "Declare Incident" escalation step for the given alert group.

    When the step is not enabled for the alert group's organization, an
    escalation-failed log record is saved and nothing else happens.
    Otherwise a single `declare_incident` task signature is built for the
    alert group and handed over to `self._execute_tasks`.
    """
    organization = alert_group.channel.organization
    if not is_declare_incident_step_enabled(organization=organization):
        # Record why the step was skipped so it shows up in the alert group log.
        failure_record = AlertGroupLogRecord(
            type=AlertGroupLogRecord.TYPE_ESCALATION_FAILED,
            alert_group=alert_group,
            reason="Declare Incident step is not enabled",
            escalation_policy=self.escalation_policy,
            escalation_error_code=AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
            escalation_policy_step=self.step,
        )
        failure_record.save()
        return

    task_kwargs = {
        "escalation_policy_pk": self.id,
        "severity": self.severity,
    }
    self._execute_tasks(
        [
            declare_incident.signature(
                args=(alert_group.pk,),
                kwargs=task_kwargs,
                immutable=True,
            )
        ]
    )

def _escalation_step_notify_if_time(self, alert_group: "AlertGroup", _reason: str) -> StepExecutionResultData:
eta = None

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Generated by Django 4.2.15 on 2024-09-25 20:57

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Schema changes backing the "Declare Incident" escalation step:
    #   * adds `severity` to EscalationPolicy,
    #   * extends the EscalationPolicy `step` choices with (19, 'Declare Incident'),
    #   * creates the DeclaredIncident model,
    #   * adds the nullable AlertGroup.declared_incident foreign key.

    dependencies = [
        ('user_management', '0022_alter_team_unique_together'),
        ('alerts', '0058_alter_alertgroup_reason_to_skip_escalation'),
    ]

    operations = [
        # Nullable severity value stored on the escalation policy.
        migrations.AddField(
            model_name='escalationpolicy',
            name='severity',
            field=models.CharField(default=None, max_length=512, null=True),
        ),
        # Same field definition as before plus the new choice 19 'Declare Incident'.
        migrations.AlterField(
            model_name='escalationpolicy',
            name='step',
            field=models.IntegerField(choices=[(0, 'Wait'), (1, 'Notify User'), (2, 'Notify Whole Channel'), (3, 'Repeat Escalation (5 times max)'), (4, 'Resolve'), (5, 'Notify Group'), (6, 'Notify Schedule'), (7, 'Notify User (Important)'), (8, 'Notify Group (Important)'), (9, 'Notify Schedule (Important)'), (10, 'Trigger Outgoing Webhook'), (11, 'Notify User (next each time)'), (12, 'Continue escalation only if time is from'), (13, 'Notify multiple Users'), (14, 'Notify multiple Users (Important)'), (15, 'Continue escalation if >X alerts per Y minutes'), (16, 'Trigger Webhook'), (17, 'Notify all users in a Team'), (18, 'Notify all users in a Team (Important)'), (19, 'Declare Incident')], default=None, null=True),
        ),
        # New table for declared incidents; tied to an organization (CASCADE)
        # and optionally to a channel filter (SET_NULL).
        migrations.CreateModel(
            name='DeclaredIncident',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('incident_id', models.CharField(db_index=True, max_length=50)),
                ('created_at', models.DateTimeField(auto_now_add=True)),
                ('is_active', models.BooleanField(default=True)),
                ('channel_filter', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='declared_incidents', to='alerts.channelfilter')),
                ('organization', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='declared_incidents', to='user_management.organization')),
            ],
        ),
        # Optional link from an alert group to a declared incident; the link is
        # nulled (not cascaded) when the incident row is deleted.
        migrations.AddField(
            model_name='alertgroup',
            name='declared_incident',
            field=models.ForeignKey(default=None, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='attached_alert_groups', to='alerts.declaredincident'),
        ),
    ]
1 change: 1 addition & 0 deletions engine/apps/alerts/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .alert_receive_channel_connection import AlertReceiveChannelConnection # noqa: F401
from .channel_filter import ChannelFilter # noqa: F401
from .custom_button import CustomButton # noqa: F401
from .declared_incident import DeclaredIncident # noqa: F401
from .escalation_chain import EscalationChain # noqa: F401
from .escalation_policy import EscalationPolicy # noqa: F401
from .grafana_alerting_contact_point import GrafanaAlertingContactPoint # noqa: F401
Expand Down
11 changes: 11 additions & 0 deletions engine/apps/alerts/models/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
AlertGroupLogRecord,
AlertReceiveChannel,
BundledNotification,
DeclaredIncident,
ResolutionNote,
ResolutionNoteSlackMessage,
)
Expand Down Expand Up @@ -205,6 +206,7 @@ class AlertGroup(AlertGroupSlackRenderingMixin, EscalationSnapshotMixin, models.
slack_messages: "RelatedManager['SlackMessage']"
users: "RelatedManager['User']"
labels: "RelatedManager['AlertGroupAssociatedLabel']"
declared_incident: typing.Optional["DeclaredIncident"]

objects: models.Manager["AlertGroup"] = AlertGroupQuerySet.as_manager()

Expand Down Expand Up @@ -420,8 +422,17 @@ def status(self) -> int:
# https://code.djangoproject.com/ticket/28545
is_open_for_grouping = models.BooleanField(default=None, null=True, blank=True)

# TODO: rework usages of this field to use the declared_incident (DeclaredIncident) relation instead
grafana_incident_id = models.CharField(max_length=100, null=True, default=None)

declared_incident = models.ForeignKey(
"alerts.DeclaredIncident",
on_delete=models.SET_NULL,
null=True,
default=None,
related_name="attached_alert_groups",
)

@staticmethod
def get_silenced_state_filter():
"""
Expand Down
68 changes: 62 additions & 6 deletions engine/apps/alerts/models/alert_group_log_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,24 @@

from apps.alerts import tasks
from apps.alerts.constants import ActionSource
from apps.alerts.incident_appearance.renderers.constants import DEFAULT_BACKUP_TITLE
from apps.alerts.utils import render_relative_timeline
from apps.slack.slack_formatter import SlackFormatter
from common.utils import clean_markup

if typing.TYPE_CHECKING:
from apps.alerts.models import AlertGroup, CustomButton, EscalationPolicy, Invitation
from apps.user_management.models import User
from apps.user_management.models import Organization, User

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


class RelatedIncidentData(typing.TypedDict):
incident_link: typing.Optional[str]
incident_title: str


class AlertGroupLogRecord(models.Model):
alert_group: "AlertGroup"
author: typing.Optional["User"]
Expand Down Expand Up @@ -161,7 +167,9 @@ class AlertGroupLogRecord(models.Model):
ERROR_ESCALATION_TRIGGER_CUSTOM_WEBHOOK_ERROR,
ERROR_ESCALATION_NOTIFY_TEAM_MEMBERS_STEP_IS_NOT_CONFIGURED,
ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED,
) = range(20)
ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED,
ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED,
) = range(22)

type = models.IntegerField(choices=TYPE_CHOICES)

Expand Down Expand Up @@ -225,16 +233,24 @@ class AlertGroupLogRecord(models.Model):
escalation_policy_step = models.IntegerField(null=True, default=None)
step_specific_info = JSONField(null=True, default=None)

STEP_SPECIFIC_INFO_KEYS = ["schedule_name", "custom_button_name", "usergroup_handle", "source_integration_name"]
STEP_SPECIFIC_INFO_KEYS = [
"schedule_name",
"custom_button_name",
"usergroup_handle",
"source_integration_name",
"incident_link",
"incident_title",
]

def render_log_line_json(self):
time = humanize.naturaldelta(self.alert_group.started_at - self.created_at)
created_at = DateTimeField().to_representation(self.created_at)
organization = self.alert_group.channel.organization
author = self.author.short(organization) if self.author is not None else None
related_incident = self.render_incident_data_from_step_info(organization, self.get_step_specific_info())

sf = SlackFormatter(organization)
action = sf.format(self.rendered_log_line_action(substitute_author_with_tag=True))
action = sf.format(self.rendered_log_line_action(substitute_with_tag=True))
action = clean_markup(action)

result = {
Expand All @@ -244,6 +260,7 @@ def render_log_line_json(self):
"type": self.type,
"created_at": created_at,
"author": author,
"incident": related_incident,
}
return result

Expand All @@ -258,7 +275,7 @@ def rendered_incident_log_line(self, for_slack=False, html=False):
result += self.rendered_log_line_action(for_slack=for_slack, html=html)
return result

def rendered_log_line_action(self, for_slack=False, html=False, substitute_author_with_tag=False):
def rendered_log_line_action(self, for_slack=False, html=False, substitute_with_tag=False):
from apps.alerts.models import EscalationPolicy

result = ""
Expand All @@ -276,7 +293,7 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho
elif self.action_source == ActionSource.BACKSYNC:
author_name = "source integration " + step_specific_info.get("source_integration_name", "")
elif self.author:
if substitute_author_with_tag:
if substitute_with_tag:
author_name = "{{author}}"
elif for_slack:
author_name = self.author.get_username_with_slack_verbal()
Expand Down Expand Up @@ -382,6 +399,21 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho
result += f'triggered step "Notify on-call from Schedule {schedule_name}{important_text}"'
elif escalation_policy_step == EscalationPolicy.STEP_REPEAT_ESCALATION_N_TIMES:
result += "escalation started from the beginning"
elif escalation_policy_step == EscalationPolicy.STEP_DECLARE_INCIDENT:
organization = self.alert_group.channel.organization
incident_data = self.render_incident_data_from_step_info(organization, step_specific_info)
incident_link = incident_data["incident_link"]
incident_title = incident_data["incident_title"]

result += self.reason
if html:
result += f": <a href='{incident_link}'>{incident_title}</a>"
elif for_slack:
result += f": <{incident_link}|{incident_title}>"
elif substitute_with_tag:
result += ": {{related_incident}}"
else:
result += f": {incident_title}"
else:
result += f'triggered step "{EscalationPolicy.get_step_display_name(escalation_policy_step)}"'
elif self.type == AlertGroupLogRecord.TYPE_SILENCE:
Expand Down Expand Up @@ -594,8 +626,32 @@ def rendered_log_line_action(self, for_slack=False, html=False, substitute_autho
result += f"failed to notify User Group{usergroup_handle_text} in Slack"
elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_TRIGGER_WEBHOOK_IS_DISABLED:
result += 'skipped escalation step "Trigger Outgoing Webhook" because it is disabled'
elif (
self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_DECLARE_INCIDENT_STEP_IS_NOT_ENABLED
):
result += 'skipped escalation step "Declare Incident": step is not enabled'
elif self.escalation_error_code == AlertGroupLogRecord.ERROR_ESCALATION_INCIDENT_COULD_NOT_BE_DECLARED:
result += "failed to declare an Incident"
if self.reason:
result += f": {self.reason}"
return result

def render_incident_data_from_step_info(
    self, organization: "Organization", step_specific_info: dict
) -> RelatedIncidentData | None:
    """
    Build the incident link/title pair from a log record's step-specific info.

    Returns None when `step_specific_info` is empty or lacks either the
    "incident_title" or the "incident_id" key. Otherwise returns a dict with:
      - "incident_link": URL built by `get_incident_url` for the stored
        incident id, or None when that id is falsy;
      - "incident_title": the stored title, or DEFAULT_BACKUP_TITLE when the
        stored title is falsy.
    """
    from apps.alerts.models.declared_incident import get_incident_url

    if not step_specific_info:
        return None
    if "incident_title" not in step_specific_info or "incident_id" not in step_specific_info:
        return None

    incident_id = step_specific_info["incident_id"]
    link = get_incident_url(organization, incident_id) if incident_id else None
    title = step_specific_info["incident_title"] or DEFAULT_BACKUP_TITLE
    return {"incident_link": link, "incident_title": title}

def get_step_specific_info(self):
step_specific_info = None
# in some cases step_specific_info was saved with using json.dumps
Expand Down
38 changes: 38 additions & 0 deletions engine/apps/alerts/models/declared_incident.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import typing
from urllib.parse import urljoin

from django.db import models

if typing.TYPE_CHECKING:
from django.db.models.manager import RelatedManager

from apps.alerts.models import AlertGroup, ChannelFilter
from apps.user_management.models import Organization


def get_incident_url(organization, incident_id) -> str:
    """
    Return the URL of an incident page in the organization's Grafana instance.

    The incident path is appended to `organization.grafana_url`. The base URL is
    normalized to end with "/" first: `urljoin` resolves a relative path against
    the base's "directory", so a base such as "https://host/grafana" (no trailing
    slash) would otherwise lose its last path segment and produce
    "https://host/a/grafana-incident-app/...".

    An empty/None `grafana_url` is passed to `urljoin` unchanged.
    """
    base_url = organization.grafana_url
    if base_url and not base_url.endswith("/"):
        base_url += "/"
    return urljoin(base_url, f"a/grafana-incident-app/incidents/{incident_id}")


class DeclaredIncident(models.Model):
    """
    A declared incident belonging to an organization.

    Alert groups reference it through their `declared_incident` foreign key
    (reverse accessor: `attached_alert_groups`).
    """

    # Type hints for related objects / reverse managers (typing only).
    attached_alert_groups: "RelatedManager['AlertGroup']"
    channel_filter: typing.Optional["ChannelFilter"]
    organization: "Organization"

    # Identifier used to build the incident URL (see get_incident_link); indexed for lookups.
    incident_id = models.CharField(db_index=True, max_length=50)
    # Owning organization; rows are deleted together with the organization.
    organization = models.ForeignKey(
        "user_management.Organization",
        on_delete=models.CASCADE,
        related_name="declared_incidents",
    )
    # Optional channel filter; set to NULL when the channel filter is deleted.
    channel_filter = models.ForeignKey(
        "alerts.ChannelFilter",
        on_delete=models.SET_NULL,
        null=True,
        related_name="declared_incidents",
    )
    # Set automatically when the row is first created.
    created_at = models.DateTimeField(auto_now_add=True)
    # NOTE(review): presumably tracks whether the incident is still open — confirm with the code that updates it.
    is_active = models.BooleanField(default=True)

    def get_incident_link(self) -> str:
        """Return the URL of this incident, built from the organization and `incident_id`."""
        return get_incident_url(self.organization, self.incident_id)
Loading

0 comments on commit 70b7273

Please sign in to comment.