feat: add preprod observability foundation
This commit is contained in:
97
deploy/observability/prometheus/rules/socialize-alerts.yml
Normal file
97
deploy/observability/prometheus/rules/socialize-alerts.yml
Normal file
@@ -0,0 +1,97 @@
|
||||
---
# Prometheus alerting rules for the Socialize preprod rule group.
# Every rule below sets the label `service: socialize-api`.
groups:
  - name: socialize-preprod
    rules:
      # Fires when no http_server_request_duration_seconds_count series with
      # service_name="socialize-api" exists, sustained for 5 minutes.
      - alert: SocializeApiTelemetryMissing
        expr: absent(http_server_request_duration_seconds_count{service_name="socialize-api"})
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API telemetry is missing
          description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.

      # Ratio of 5xx request rate to total request rate over 5 minutes.
      # clamp_min floors the denominator at 0.001 so an idle API does not
      # produce a division by zero.
      - alert: SocializeApiHighErrorRate
        expr: |
          (
            sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api", http_response_status_code=~"5.."}[5m]))
            /
            clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api"}[5m])), 0.001)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API 5xx rate is high
          description: More than 5% of API requests are returning 5xx responses over 5 minutes.

      # p95 of the request-duration histogram, aggregated across all series
      # by bucket boundary (le), compared against a 2 second threshold.
      - alert: SocializeApiHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name="socialize-api"}[5m]))
          ) > 2
        for: 10m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize API p95 latency is high
          description: API p95 latency has been above 2 seconds for 10 minutes.

      # Sum of the 12h increases of the four core activity counters.
      # Each operand falls back to vector(0): without the fallback, a counter
      # that has no series at all makes the whole `+` chain return an empty
      # result, and the alert could never fire in exactly the "no activity"
      # case it is meant to catch.
      - alert: SocializeCoreUsageQuiet
        expr: |
          (
            (sum(increase(socialize_content_items_created_total[12h])) or vector(0))
            + (sum(increase(socialize_comments_created_total[12h])) or vector(0))
            + (sum(increase(socialize_approval_decisions_submitted_total[12h])) or vector(0))
            + (sum(increase(socialize_feedback_submitted_total[12h])) or vector(0))
          ) < 1
        for: 30m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize core usage is quiet
          description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.

      # Informational: any increase of the feedback counter with
      # feedback_type="Bug" in the last 15 minutes. `for: 0m` means the alert
      # fires on the first evaluation where the condition holds.
      - alert: SocializeFeedbackBugSubmitted
        expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
        for: 0m
        labels:
          severity: info
          service: socialize-api
        annotations:
          summary: New bug feedback submitted
          description: A user submitted bug feedback in the last 15 minutes.

      # Any increase of the email delivery counter with outcome="failure"
      # in the last 15 minutes.
      - alert: SocializeEmailDeliveryFailures
        expr: sum(increase(socialize_email_delivery_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Email delivery failures detected
          description: One or more email delivery attempts failed in the last 15 minutes.

      # Any increase of the blob storage operations counter with
      # outcome="failure" in the last 15 minutes.
      - alert: SocializeBlobStorageFailures
        expr: sum(increase(socialize_blob_storage_operations_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Blob storage failures detected
          description: One or more blob storage operations failed in the last 15 minutes.

      # Any increase of the background job runs counter with
      # outcome="failure" in the last 30 minutes.
      - alert: SocializeBackgroundJobFailures
        expr: sum(increase(socialize_background_job_runs_total{outcome="failure"}[30m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Background job failures detected
          description: One or more background jobs failed in the last 30 minutes.
|
||||
Reference in New Issue
Block a user