Skip to content

Automated Test: monitor-incident-refactor-after #329

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 171 additions & 0 deletions src/sentry/monitors/logic/incident_occurrence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from __future__ import annotations

import logging
import uuid
from collections import Counter
from collections.abc import Mapping, Sequence
from datetime import datetime, timezone
from typing import TYPE_CHECKING

from django.utils.text import get_text_list
from django.utils.translation import gettext_lazy as _

from sentry.issues.grouptype import MonitorIncidentType
from sentry.monitors.models import (
CheckInStatus,
MonitorCheckIn,
MonitorEnvironment,
MonitorIncident,
)
from sentry.monitors.types import SimpleCheckIn

if TYPE_CHECKING:
from django.utils.functional import _StrPromise

logger = logging.getLogger(__name__)


def create_incident_occurrence(
    failed_checkins: Sequence[SimpleCheckIn],
    failed_checkin: MonitorCheckIn,
    incident: MonitorIncident,
    received: datetime | None,
) -> None:
    """
    Build an IssueOccurrence for a monitor incident and publish it (plus its
    accompanying event payload) to kafka.

    No-op when the failing check-in has no associated monitor environment.
    """
    # Imported lazily to avoid import cycles with the issues platform.
    from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
    from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka

    monitor_env = failed_checkin.monitor_environment

    if monitor_env is None:
        return

    now = datetime.now(timezone.utc)

    # Last successful check-in timestamp is surfaced in the evidence display;
    # "Never" when no successful check-in exists for this environment.
    last_successful_checkin = monitor_env.get_last_successful_checkin()
    last_successful_checkin_timestamp = (
        last_successful_checkin.date_added.isoformat() if last_successful_checkin else "Never"
    )

    occurrence = IssueOccurrence(
        id=uuid.uuid4().hex,
        resource_id=None,
        project_id=monitor_env.monitor.project_id,
        event_id=uuid.uuid4().hex,
        fingerprint=[incident.grouphash],
        type=MonitorIncidentType,
        issue_title=f"Monitor failure: {monitor_env.monitor.name}",
        subtitle="Your monitor has reached its failure threshold.",
        evidence_display=[
            IssueEvidence(
                name="Failure reason",
                value=str(get_failure_reason(failed_checkins)),
                important=True,
            ),
            IssueEvidence(
                name="Environment",
                value=monitor_env.get_environment().name,
                important=False,
            ),
            IssueEvidence(
                name="Last successful check-in",
                value=last_successful_checkin_timestamp,
                important=False,
            ),
        ],
        evidence_data={},
        culprit="",
        detection_time=now,
        level="error",
        assignee=monitor_env.monitor.owner_actor,
    )

    trace_id = failed_checkin.trace_id.hex if failed_checkin.trace_id else None

    event_data = {
        "contexts": {"monitor": get_monitor_environment_context(monitor_env)},
        "environment": monitor_env.get_environment().name,
        "event_id": occurrence.event_id,
        "fingerprint": [incident.grouphash],
        "platform": "other",
        "project_id": monitor_env.monitor.project_id,
        # We set this to the time that the checkin that triggered the occurrence was written to relay if available
        "received": (received if received else now).isoformat(),
        "sdk": None,
        "tags": {
            "monitor.id": str(monitor_env.monitor.guid),
            "monitor.slug": str(monitor_env.monitor.slug),
            "monitor.incident": str(incident.id),
        },
        "timestamp": now.isoformat(),
    }

    if trace_id:
        event_data["contexts"]["trace"] = {"trace_id": trace_id, "span_id": None}

    produce_occurrence_to_kafka(
        payload_type=PayloadType.OCCURRENCE,
        occurrence=occurrence,
        event_data=event_data,
    )


# Short nouns for each failing check-in status, used when rendering
# pluralized counts (e.g. "2 missed check-ins").
HUMAN_FAILURE_STATUS_MAP: Mapping[int, _StrPromise] = {
    CheckInStatus.ERROR: _("error"),
    CheckInStatus.MISSED: _("missed"),
    CheckInStatus.TIMEOUT: _("timeout"),
}

# Exists due to the vowel differences (A vs An) in the statuses
SINGULAR_HUMAN_FAILURE_MAP: Mapping[int, _StrPromise] = {
    CheckInStatus.ERROR: _("An error check-in was detected"),
    CheckInStatus.MISSED: _("A missed check-in was detected"),
    CheckInStatus.TIMEOUT: _("A timeout check-in was detected"),
}


def get_failure_reason(failed_checkins: Sequence[SimpleCheckIn]):
    """
    Builds a human readable string from a list of failed check-ins.

    "3 missed check-ins detected"
    "2 missed check-ins, 1 timeout check-in and 1 error check-in were detected"
    "A failed check-in was detected"
    """
    # Tally only statuses we know how to describe; unknown statuses are
    # dropped from the summary.
    status_counts = Counter(
        checkin["status"]
        for checkin in failed_checkins
        if checkin["status"] in HUMAN_FAILURE_STATUS_MAP
    )

    # All statuses fell outside the map (or the list was empty): fall back to
    # a generic message rather than rendering a garbled " check-ins detected".
    if not status_counts:
        return _("A failed check-in was detected")

    # Singular phrasing handles the "A" vs "An" vowel difference.
    if sum(status_counts.values()) == 1:
        return SINGULAR_HUMAN_FAILURE_MAP[next(iter(status_counts))]

    human_status = get_text_list(
        [
            "%(count)d %(status)s" % {"count": count, "status": HUMAN_FAILURE_STATUS_MAP[status]}
            for status, count in status_counts.items()
        ],
        last_word=_("and"),
    )

    return _("%(problem_checkins)s check-ins detected") % {"problem_checkins": human_status}
Comment on lines +130 to +156

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Edge case & minor issues in get_failure_reason.

  1. Empty status_counts: If all check-in statuses are outside HUMAN_FAILURE_STATUS_MAP, sum(status_counts.values()) is 0, which falls through to line 148 and produces a garbled message like " check-ins detected". Consider handling this case explicitly.

  2. Typo (line 132): "humam readible" → "human readable".

  3. Ruff RUF015 (line 146): list(status_counts.keys())[0]next(iter(status_counts)).

  4. Redundant .keys() (line 142): in HUMAN_FAILURE_STATUS_MAP.keys()in HUMAN_FAILURE_STATUS_MAP.

Proposed fixes
 def get_failure_reason(failed_checkins: Sequence[SimpleCheckIn]):
     """
-    Builds a humam readible string from a list of failed check-ins.
+    Builds a human readable string from a list of failed check-ins.
 
     "3 missed check-ins detected"
     "2 missed check-ins, 1 timeout check-in and 1 error check-in were detected"
@@ -139,12 +139,15 @@
     status_counts = Counter(
         checkin["status"]
         for checkin in failed_checkins
-        if checkin["status"] in HUMAN_FAILURE_STATUS_MAP.keys()
+        if checkin["status"] in HUMAN_FAILURE_STATUS_MAP
     )
 
-    if sum(status_counts.values()) == 1:
-        return SINGULAR_HUMAN_FAILURE_MAP[list(status_counts.keys())[0]]
+    if not status_counts:
+        return _("A check-in failure was detected")
+
+    if sum(status_counts.values()) == 1:
+        return SINGULAR_HUMAN_FAILURE_MAP[next(iter(status_counts))]
🧰 Tools
🪛 Ruff (0.14.14)

[warning] 146-146: Prefer next(iter(status_counts.keys())) over single element slice

Replace with next(iter(status_counts.keys()))

(RUF015)

🤖 Prompt for AI Agents
In `@src/sentry/monitors/logic/incident_occurrence.py` around lines 130 - 156, In
get_failure_reason, fix the docstring typo ("humam readible" → "human
readable"), change the generator membership check to "if checkin['status'] in
HUMAN_FAILURE_STATUS_MAP" (drop .keys()), replace list(status_counts.keys())[0]
with next(iter(status_counts)) to satisfy RUF015, and add an explicit branch
that returns a sensible default (e.g., _("A failed check-in was detected") or
similar) when status_counts is empty before building human_status; keep
references to status_counts, HUMAN_FAILURE_STATUS_MAP,
SINGULAR_HUMAN_FAILURE_MAP, and get_text_list when implementing these changes.



def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
    """
    Build the "monitor" event context for a monitor environment.

    Returns a dict with the monitor's identity, its config (with
    ``schedule_type`` translated to its display label), and display values
    for the environment status and monitor type.
    """
    # Copy so translating schedule_type does not mutate the stored config.
    config = monitor_environment.monitor.config.copy()
    if "schedule_type" in config:
        config["schedule_type"] = monitor_environment.monitor.get_schedule_type_display()

    return {
        "id": str(monitor_environment.monitor.guid),
        "slug": str(monitor_environment.monitor.slug),
        "name": monitor_environment.monitor.name,
        # Return the modified copy — previously the original (untranslated)
        # config was returned, silently discarding the display translation.
        "config": config,
        "status": monitor_environment.get_status_display(),
        "type": monitor_environment.monitor.get_type_display(),
    }
Comment on lines +159 to +171

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Bug: modified config copy is never returned.

Line 160 creates a local copy of config and line 162 potentially updates schedule_type on that copy, but line 168 returns the original monitor_environment.monitor.config instead of the modified config.

🐛 Proposed fix
     return {
         "id": str(monitor_environment.monitor.guid),
         "slug": str(monitor_environment.monitor.slug),
         "name": monitor_environment.monitor.name,
-        "config": monitor_environment.monitor.config,
+        "config": config,
         "status": monitor_environment.get_status_display(),
         "type": monitor_environment.monitor.get_type_display(),
     }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
config = monitor_environment.monitor.config.copy()
if "schedule_type" in config:
config["schedule_type"] = monitor_environment.monitor.get_schedule_type_display()
return {
"id": str(monitor_environment.monitor.guid),
"slug": str(monitor_environment.monitor.slug),
"name": monitor_environment.monitor.name,
"config": monitor_environment.monitor.config,
"status": monitor_environment.get_status_display(),
"type": monitor_environment.monitor.get_type_display(),
}
def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
config = monitor_environment.monitor.config.copy()
if "schedule_type" in config:
config["schedule_type"] = monitor_environment.monitor.get_schedule_type_display()
return {
"id": str(monitor_environment.monitor.guid),
"slug": str(monitor_environment.monitor.slug),
"name": monitor_environment.monitor.name,
"config": config,
"status": monitor_environment.get_status_display(),
"type": monitor_environment.monitor.get_type_display(),
}
🤖 Prompt for AI Agents
In `@src/sentry/monitors/logic/incident_occurrence.py` around lines 159 - 171, The
function get_monitor_environment_context modifies a local copy named config
(from monitor_environment.monitor.config.copy()) but returns the original
monitor_environment.monitor.config instead of that modified copy; update the
returned dict to use the local config variable (config) so that the adjusted
"schedule_type" (and any other edits) are included; locate
get_monitor_environment_context and replace the "config":
monitor_environment.monitor.config entry with "config": config.

104 changes: 104 additions & 0 deletions src/sentry/monitors/logic/incidents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

import logging
from datetime import datetime
from typing import cast

from sentry.monitors.logic.incident_occurrence import create_incident_occurrence
from sentry.monitors.models import CheckInStatus, MonitorCheckIn, MonitorIncident, MonitorStatus
from sentry.monitors.types import SimpleCheckIn

logger = logging.getLogger(__name__)


def try_incident_threshold(
    failed_checkin: MonitorCheckIn,
    failure_issue_threshold: int,
    received: datetime | None,
) -> bool:
    """
    Evaluate whether a failed check-in pushes its monitor environment over
    the failure threshold, transitioning it into an incident state and
    producing issue occurrences for the failing check-ins.

    Returns True when the failure was processed as part of an incident,
    False when no incident handling took place.
    """
    # Imported lazily to avoid an import cycle with sentry.signals.
    from sentry.signals import monitor_environment_failed

    monitor_env = failed_checkin.monitor_environment

    if monitor_env is None:
        return False

    # check to see if we need to update the status
    if monitor_env.status in [MonitorStatus.OK, MonitorStatus.ACTIVE]:
        if failure_issue_threshold == 1:
            # Threshold of one: the failing check-in alone decides.
            previous_checkins: list[SimpleCheckIn] = [
                {
                    "id": failed_checkin.id,
                    "date_added": failed_checkin.date_added,
                    "status": failed_checkin.status,
                }
            ]
        else:
            previous_checkins = cast(
                list[SimpleCheckIn],
                # Using .values for performance reasons
                MonitorCheckIn.objects.filter(
                    monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
                )
                .order_by("-date_added")
                .values("id", "date_added", "status"),
            )

        # reverse the list after slicing in order to start with oldest check-in
        previous_checkins = list(reversed(previous_checkins[:failure_issue_threshold]))

        # If we have any successful check-ins within the threshold of
        # commits we have NOT reached an incident state
        if any(checkin["status"] == CheckInStatus.OK for checkin in previous_checkins):
            return False

        # change monitor status + update fingerprint timestamp
        monitor_env.status = MonitorStatus.ERROR
        monitor_env.save(update_fields=("status",))

        starting_checkin = previous_checkins[0]

        # get_or_create keeps this idempotent against a still-open incident
        # (resolving_checkin=None) for the same environment.
        incident: MonitorIncident | None
        incident, _ = MonitorIncident.objects.get_or_create(
            monitor_environment=monitor_env,
            resolving_checkin=None,
            defaults={
                "monitor": monitor_env.monitor,
                "starting_checkin_id": starting_checkin["id"],
                "starting_timestamp": starting_checkin["date_added"],
            },
        )

    elif monitor_env.status == MonitorStatus.ERROR:
        # if monitor environment has a failed status, use the failed
        # check-in and send occurrence
        previous_checkins = [
            {
                "id": failed_checkin.id,
                "date_added": failed_checkin.date_added,
                "status": failed_checkin.status,
            }
        ]

        # get the active incident from the monitor environment
        incident = monitor_env.active_incident

        # The incident may have been concurrently resolved (or the data may
        # be inconsistent). With no incident there is nothing to attach an
        # occurrence to, so report that no failure was processed rather than
        # firing the failure signal and returning True.
        if incident is None:
            logger.warning(
                "monitors.incidents.missing_active_incident",
                extra={"monitor_environment_id": monitor_env.id},
            )
            return False
    else:
        # don't send occurrence for other statuses
        return False

    # Only create an occurrence if:
    # - We have an active incident and fingerprint
    # - The monitor and env are not muted
    if not monitor_env.monitor.is_muted and not monitor_env.is_muted and incident:
        checkins = MonitorCheckIn.objects.filter(id__in=[c["id"] for c in previous_checkins])
        for checkin in checkins:
            create_incident_occurrence(
                previous_checkins,
                checkin,
                incident,
                received=received,
            )

    monitor_environment_failed.send(monitor_environment=monitor_env, sender=type(monitor_env))

    return True
Loading