Skip to content

Automated Test: monitor-incident-refactor-after #311

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 171 additions & 0 deletions src/sentry/monitors/logic/incident_occurrence.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from __future__ import annotations

import logging
import uuid
from collections import Counter
from collections.abc import Mapping, Sequence
from datetime import datetime, timezone
from typing import TYPE_CHECKING

from django.utils.text import get_text_list
from django.utils.translation import gettext_lazy as _

from sentry.issues.grouptype import MonitorIncidentType
from sentry.monitors.models import (
CheckInStatus,
MonitorCheckIn,
MonitorEnvironment,
MonitorIncident,
)
from sentry.monitors.types import SimpleCheckIn

if TYPE_CHECKING:
from django.utils.functional import _StrPromise

logger = logging.getLogger(__name__)


def create_incident_occurrence(
    failed_checkins: Sequence[SimpleCheckIn],
    failed_checkin: MonitorCheckIn,
    incident: MonitorIncident,
    received: datetime | None,
) -> None:
    """
    Construct an issue occurrence for a monitor incident and publish it,
    together with its event payload, to kafka.

    No-op when the failing check-in has no associated monitor environment.
    """
    # Imported lazily to avoid import cycles with the issues machinery.
    from sentry.issues.issue_occurrence import IssueEvidence, IssueOccurrence
    from sentry.issues.producer import PayloadType, produce_occurrence_to_kafka

    monitor_env = failed_checkin.monitor_environment
    if monitor_env is None:
        return

    now = datetime.now(timezone.utc)
    monitor = monitor_env.monitor

    # Surface the most recent successful check-in in the evidence display.
    last_successful_checkin = monitor_env.get_last_successful_checkin()
    last_successful_display = (
        last_successful_checkin.date_added.isoformat() if last_successful_checkin else "Never"
    )

    evidence = [
        IssueEvidence(
            name="Failure reason",
            value=str(get_failure_reason(failed_checkins)),
            important=True,
        ),
        IssueEvidence(
            name="Environment",
            value=monitor_env.get_environment().name,
            important=False,
        ),
        IssueEvidence(
            name="Last successful check-in",
            value=last_successful_display,
            important=False,
        ),
    ]

    occurrence = IssueOccurrence(
        id=uuid.uuid4().hex,
        resource_id=None,
        project_id=monitor.project_id,
        event_id=uuid.uuid4().hex,
        fingerprint=[incident.grouphash],
        type=MonitorIncidentType,
        issue_title=f"Monitor failure: {monitor.name}",
        subtitle="Your monitor has reached its failure threshold.",
        evidence_display=evidence,
        evidence_data={},
        culprit="",
        detection_time=now,
        level="error",
        assignee=monitor.owner_actor,
    )

    event_data = {
        "contexts": {"monitor": get_monitor_environment_context(monitor_env)},
        "environment": monitor_env.get_environment().name,
        "event_id": occurrence.event_id,
        "fingerprint": [incident.grouphash],
        "platform": "other",
        "project_id": monitor.project_id,
        # Prefer the relay-received time of the triggering check-in when
        # available; otherwise fall back to the detection time.
        "received": (received or now).isoformat(),
        "sdk": None,
        "tags": {
            "monitor.id": str(monitor.guid),
            "monitor.slug": str(monitor.slug),
            "monitor.incident": str(incident.id),
        },
        "timestamp": now.isoformat(),
    }

    # Attach a trace context when the failing check-in carries a trace id.
    if failed_checkin.trace_id:
        event_data["contexts"]["trace"] = {
            "trace_id": failed_checkin.trace_id.hex,
            "span_id": None,
        }

    produce_occurrence_to_kafka(
        payload_type=PayloadType.OCCURRENCE,
        occurrence=occurrence,
        event_data=event_data,
    )


# Maps failed check-in statuses to short, localized, human readable nouns
# used when summarizing multiple failures (e.g. "2 missed check-ins").
HUMAN_FAILURE_STATUS_MAP: Mapping[int, _StrPromise] = {
    CheckInStatus.ERROR: _("error"),
    CheckInStatus.MISSED: _("missed"),
    CheckInStatus.TIMEOUT: _("timeout"),
}

# Full singular sentences per failure status.
# Exists due to the vowel differences (A vs An) in the statuses
SINGULAR_HUMAN_FAILURE_MAP: Mapping[int, _StrPromise] = {
    CheckInStatus.ERROR: _("An error check-in was detected"),
    CheckInStatus.MISSED: _("A missed check-in was detected"),
    CheckInStatus.TIMEOUT: _("A timeout check-in was detected"),
}


def get_failure_reason(failed_checkins: Sequence[SimpleCheckIn]):
    """
    Builds a human readable string from a list of failed check-ins.

    "3 missed check-ins detected"
    "2 missed check-ins, 1 timeout check-in and 1 error check-in were detected"
    "A failed check-in was detected"
    """
    # Tally only the statuses we know how to describe; anything outside the
    # failure map is ignored.
    status_counts = Counter(
        checkin["status"]
        for checkin in failed_checkins
        if checkin["status"] in HUMAN_FAILURE_STATUS_MAP
    )

    # A single failure uses the singular phrasing ("A missed ..." / "An error ...").
    if sum(status_counts.values()) == 1:
        return SINGULAR_HUMAN_FAILURE_MAP[next(iter(status_counts))]

    # e.g. "2 missed, 1 timeout and 1 error", joined with a localized "and".
    human_status = get_text_list(
        [
            "%(count)d %(status)s" % {"count": count, "status": HUMAN_FAILURE_STATUS_MAP[status]}
            for status, count in status_counts.items()
        ],
        last_word=_("and"),
    )

    return _("%(problem_checkins)s check-ins detected") % {"problem_checkins": human_status}
Comment on lines +130 to +156

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Minor issues: typo in docstring and static analysis findings.

  1. Line 132: Typo — "humam readible" → "human readable".
  2. Line 142: Idiomatic Python — in HUMAN_FAILURE_STATUS_MAP suffices (.keys() is implicit).
  3. Line 146: Per Ruff RUF015, prefer next(iter(status_counts.keys())) over list(status_counts.keys())[0] to avoid materializing the full list for a single element.
Proposed fixes
-    """
-    Builds a humam readible string from a list of failed check-ins.
+    """
+    Builds a human readable string from a list of failed check-ins.
-        if checkin["status"] in HUMAN_FAILURE_STATUS_MAP.keys()
+        if checkin["status"] in HUMAN_FAILURE_STATUS_MAP
-        return SINGULAR_HUMAN_FAILURE_MAP[list(status_counts.keys())[0]]
+        return SINGULAR_HUMAN_FAILURE_MAP[next(iter(status_counts))]
🧰 Tools
🪛 Ruff (0.14.14)

[warning] 146-146: Prefer next(iter(status_counts.keys())) over single element slice

Replace with next(iter(status_counts.keys()))

(RUF015)

🤖 Prompt for AI Agents
In `@src/sentry/monitors/logic/incident_occurrence.py` around lines 130 - 156, Fix
three issues in get_failure_reason: correct the docstring typo to "human
readable"; change the membership check when building status_counts to "if
checkin['status'] in HUMAN_FAILURE_STATUS_MAP" (remove .keys()); and replace
list(status_counts.keys())[0] with next(iter(status_counts.keys())) (or
next(iter(status_counts))) when selecting the single status for
SINGULAR_HUMAN_FAILURE_MAP to avoid creating a full list.



def get_monitor_environment_context(monitor_environment: MonitorEnvironment):
    """
    Build the "monitor" event context dict for a monitor environment.

    The monitor config is copied so the human readable schedule type display
    can be substituted in without mutating the persisted config.
    """
    monitor = monitor_environment.monitor

    config = monitor.config.copy()
    if "schedule_type" in config:
        # Replace the stored enum value with its display name.
        config["schedule_type"] = monitor.get_schedule_type_display()

    return {
        "id": str(monitor.guid),
        "slug": str(monitor.slug),
        "name": monitor.name,
        # Bug fix: return the modified copy — previously the original config
        # was returned and the schedule_type substitution was discarded.
        "config": config,
        "status": monitor_environment.get_status_display(),
        "type": monitor.get_type_display(),
    }
Comment on lines +159 to +171

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Bug: modified config copy is never used — original config is returned instead.

Line 160 creates a local copy of config and line 162 mutates schedule_type on that copy, but line 168 returns monitor_environment.monitor.config (the original, unmodified dict). The schedule_type display transformation is silently discarded.

🐛 Proposed fix
     return {
         "id": str(monitor_environment.monitor.guid),
         "slug": str(monitor_environment.monitor.slug),
         "name": monitor_environment.monitor.name,
-        "config": monitor_environment.monitor.config,
+        "config": config,
         "status": monitor_environment.get_status_display(),
         "type": monitor_environment.monitor.get_type_display(),
     }
🤖 Prompt for AI Agents
In `@src/sentry/monitors/logic/incident_occurrence.py` around lines 159 - 171, In
get_monitor_environment_context(MonitorEnvironment), the function copies
monitor.config to local variable config and mutates schedule_type but then
returns monitor_environment.monitor.config (the original); fix by returning the
modified local config (or assign the modified copy back) in the returned dict
under the "config" key so the schedule_type display change is preserved.

104 changes: 104 additions & 0 deletions src/sentry/monitors/logic/incidents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from __future__ import annotations

import logging
from datetime import datetime
from typing import cast

from sentry.monitors.logic.incident_occurrence import create_incident_occurrence
from sentry.monitors.models import CheckInStatus, MonitorCheckIn, MonitorIncident, MonitorStatus
from sentry.monitors.types import SimpleCheckIn

logger = logging.getLogger(__name__)


def try_incident_threshold(
    failed_checkin: MonitorCheckIn,
    failure_issue_threshold: int,
    received: datetime | None,
) -> bool:
    """
    Determine whether a failed check-in pushes its monitor environment over
    the failure threshold. When it does, mark the environment as in an
    incident (creating a MonitorIncident if needed) and dispatch an issue
    occurrence per failing check-in.

    Returns True when the environment is in (or has just entered) an
    incident state — even when occurrences are suppressed due to muting —
    and False otherwise (no environment, threshold not reached, or a
    non-failure status).
    """
    from sentry.signals import monitor_environment_failed

    monitor_env = failed_checkin.monitor_environment

    if monitor_env is None:
        return False

    # check to see if we need to update the status
    if monitor_env.status in [MonitorStatus.OK, MonitorStatus.ACTIVE]:
        if failure_issue_threshold == 1:
            # Threshold of one: the failing check-in alone is enough.
            previous_checkins: list[SimpleCheckIn] = [
                {
                    "id": failed_checkin.id,
                    "date_added": failed_checkin.date_added,
                    "status": failed_checkin.status,
                }
            ]
        else:
            previous_checkins = cast(
                list[SimpleCheckIn],
                # Using .values for performance reasons
                MonitorCheckIn.objects.filter(
                    monitor_environment=monitor_env, date_added__lte=failed_checkin.date_added
                )
                .order_by("-date_added")
                .values("id", "date_added", "status"),
            )

        # reverse the list after slicing in order to start with oldest check-in
        previous_checkins = list(reversed(previous_checkins[:failure_issue_threshold]))

        # If we have any successful check-ins within the threshold of
        # commits we have NOT reached an incident state
        if any(checkin["status"] == CheckInStatus.OK for checkin in previous_checkins):
            return False

        # change monitor status + update fingerprint timestamp
        monitor_env.status = MonitorStatus.ERROR
        monitor_env.save(update_fields=("status",))

        starting_checkin = previous_checkins[0]

        # Idempotently create the incident anchored at the oldest failing
        # check-in (an unresolved incident has resolving_checkin=None).
        incident: MonitorIncident | None
        incident, _ = MonitorIncident.objects.get_or_create(
            monitor_environment=monitor_env,
            resolving_checkin=None,
            defaults={
                "monitor": monitor_env.monitor,
                "starting_checkin_id": starting_checkin["id"],
                "starting_timestamp": starting_checkin["date_added"],
            },
        )

    elif monitor_env.status == MonitorStatus.ERROR:
        # if monitor environment has a failed status, use the failed
        # check-in and send occurrence
        previous_checkins = [
            {
                "id": failed_checkin.id,
                "date_added": failed_checkin.date_added,
                "status": failed_checkin.status,
            }
        ]

        # get the active incident from the monitor environment
        incident = monitor_env.active_incident
    else:
        # don't send occurrence for other statuses
        return False

    # Only create an occurrence if:
    # - We have an active incident and fingerprint
    # - The monitor and env are not muted
    if not monitor_env.monitor.is_muted and not monitor_env.is_muted and incident:
        checkins = MonitorCheckIn.objects.filter(id__in=[c["id"] for c in previous_checkins])
        for checkin in checkins:
            create_incident_occurrence(
                previous_checkins,
                checkin,
                incident,
                received=received,
            )

    monitor_environment_failed.send(monitor_environment=monitor_env, sender=type(monitor_env))

    return True
Loading