# Prometheus-format alerting rules for the socialize-api service (group: socialize-preprod).
groups:
  # Alerting rules for the socialize-api service.
  # Every rule carries a `service: socialize-api` label and a `severity`
  # label (critical / warning / info).
  - name: socialize-preprod
    rules:
      # ---- Availability and HTTP health ----

      # Fires when no request-duration series labelled
      # service_name="socialize-api" has existed for 5 minutes.
      - alert: SocializeApiTelemetryMissing
        expr: absent(http_server_request_duration_seconds_count{service_name="socialize-api"})
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API telemetry is missing
          description: >-
            No API request telemetry has been received for 5 minutes.
            The API or telemetry pipeline may be down.

      # Ratio of 5xx request rate to total request rate over 5 minutes.
      # clamp_min keeps the denominator at >= 0.001 so that zero traffic
      # does not produce a division by zero.
      - alert: SocializeApiHighErrorRate
        expr: |
          (
            sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api", http_response_status_code=~"5.."}[5m]))
            /
            clamp_min(
              sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api"}[5m])),
              0.001
            )
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API 5xx rate is high
          description: More than 5% of API requests are returning 5xx responses over 5 minutes.

      # 95th-percentile request duration, computed from the histogram buckets
      # aggregated across all series (only `le` is kept), compared against 2.
      - alert: SocializeApiHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name="socialize-api"}[5m]))
          ) > 2
        for: 10m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize API p95 latency is high
          description: API p95 latency has been above 2 seconds for 10 minutes.

      # ---- Product usage ----

      # Total of the four core activity counters over the last 12 hours.
      # Each term falls back to 0 via `or vector(0)`: without the fallback, a
      # counter that has no series in the window makes its sum() return an
      # empty vector, the `+` chain then yields no result at all, and the
      # alert can never fire in exactly the "nothing happened" case.
      # NOTE(review): with the fallback this also fires when all four metrics
      # are absent (e.g. telemetry outage) - confirm that is acceptable.
      - alert: SocializeCoreUsageQuiet
        expr: |
          (
            (sum(increase(socialize_content_items_created_total[12h])) or vector(0))
            + (sum(increase(socialize_comments_created_total[12h])) or vector(0))
            + (sum(increase(socialize_approval_decisions_submitted_total[12h])) or vector(0))
            + (sum(increase(socialize_feedback_submitted_total[12h])) or vector(0))
          ) < 1
        for: 30m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize core usage is quiet
          description: >-
            No content, comment, approval, or feedback activity has been observed
            over the last 12 hours.

      # ---- Event-style alerts (for: 0m, fire on the first evaluation) ----
      # NOTE(review): increase() needs at least two samples in the range, so a
      # series that first appears already at 1 may not trigger these rules -
      # confirm the counters are exported at 0 on startup.

      # Any increase of the "Bug" feedback counter within 15 minutes.
      - alert: SocializeFeedbackBugSubmitted
        expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
        for: 0m
        labels:
          severity: info
          service: socialize-api
        annotations:
          summary: New bug feedback submitted
          description: A user submitted bug feedback in the last 15 minutes.

      # Any increase of the failed email delivery counter within 15 minutes.
      - alert: SocializeEmailDeliveryFailures
        expr: sum(increase(socialize_email_delivery_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Email delivery failures detected
          description: One or more email delivery attempts failed in the last 15 minutes.

      # Any increase of the failed blob storage operation counter within 15 minutes.
      - alert: SocializeBlobStorageFailures
        expr: sum(increase(socialize_blob_storage_operations_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Blob storage failures detected
          description: One or more blob storage operations failed in the last 15 minutes.

      # Any increase of the failed background job run counter within 30 minutes.
      - alert: SocializeBackgroundJobFailures
        expr: sum(increase(socialize_background_job_runs_total{outcome="failure"}[30m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Background job failures detected
          description: One or more background jobs failed in the last 30 minutes.