# File: social-media/deploy/observability/prometheus/rules/socialize-alerts.yml
# (Original listing metadata: YAML, 98 lines, 3.7 KiB)

# Prometheus alerting rules for the socialize-api service.
# Every rule carries `service: socialize-api` plus a `severity` label
# (critical / warning / info) that downstream routing can key on.
groups:
  - name: socialize-preprod
    rules:
      # Fires when no request-count series for the API exists at all.
      # absent() returns a 1-valued sample only if the selector matches nothing.
      - alert: SocializeApiTelemetryMissing
        expr: absent(http_server_request_duration_seconds_count{service_name="socialize-api"})
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API telemetry is missing
          description: >-
            No API request telemetry has been received for 5 minutes.
            The API or telemetry pipeline may be down.

      # Share of requests answered with a 5xx status over the last 5 minutes.
      # clamp_min() keeps the denominator at or above 0.001 so the ratio stays
      # well-defined when there is no traffic.
      - alert: SocializeApiHighErrorRate
        expr: |
          (
            sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api", http_response_status_code=~"5.."}[5m]))
            /
            clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api"}[5m])), 0.001)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API 5xx rate is high
          description: More than 5% of API requests are returning 5xx responses over 5 minutes.

      # p95 request duration, computed from the histogram buckets aggregated
      # by `le` across all series of the service.
      - alert: SocializeApiHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name="socialize-api"}[5m]))
          ) > 2
        for: 10m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize API p95 latency is high
          description: API p95 latency has been above 2 seconds for 10 minutes.

      # Total of the four core activity counters over 12 hours.
      # Each term falls back to vector(0): sum() over no series yields an empty
      # vector, and adding an empty vector makes the whole expression empty,
      # which would silence this alert exactly when a counter has no data.
      - alert: SocializeCoreUsageQuiet
        expr: |
          (
              (sum(increase(socialize_content_items_created_total[12h])) or vector(0))
            + (sum(increase(socialize_comments_created_total[12h])) or vector(0))
            + (sum(increase(socialize_approval_decisions_submitted_total[12h])) or vector(0))
            + (sum(increase(socialize_feedback_submitted_total[12h])) or vector(0))
          ) < 1
        for: 30m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize core usage is quiet
          description: >-
            No content, comment, approval, or feedback activity has been
            observed over the last 12 hours.

      # The remaining rules fire immediately (`for: 0m`) on any counter growth
      # inside the window.
      # NOTE(review): increase() only measures growth between samples within
      # the range, so a series whose first sample is already non-zero
      # contributes nothing. Confirm the application exports these counters
      # (with their label values) before the first event occurs.

      # Informational: at least one feedback item of type "Bug" was submitted.
      - alert: SocializeFeedbackBugSubmitted
        expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
        for: 0m
        labels:
          severity: info
          service: socialize-api
        annotations:
          summary: New bug feedback submitted
          description: A user submitted bug feedback in the last 15 minutes.

      # Any email delivery recorded with outcome="failure".
      - alert: SocializeEmailDeliveryFailures
        expr: sum(increase(socialize_email_delivery_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Email delivery failures detected
          description: One or more email delivery attempts failed in the last 15 minutes.

      # Any blob storage operation recorded with outcome="failure".
      - alert: SocializeBlobStorageFailures
        expr: sum(increase(socialize_blob_storage_operations_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Blob storage failures detected
          description: One or more blob storage operations failed in the last 15 minutes.

      # Any background job run recorded with outcome="failure" (30-minute window).
      - alert: SocializeBackgroundJobFailures
        expr: sum(increase(socialize_background_job_runs_total{outcome="failure"}[30m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Background job failures detected
          description: One or more background jobs failed in the last 30 minutes.