feat: close preprod observability loop
This commit is contained in:
@@ -2,6 +2,12 @@ global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
rule_files:
|
||||
- /etc/prometheus/rules/*.yml
|
||||
|
||||
@@ -15,3 +21,22 @@ scrape_configs:
|
||||
static_configs:
|
||||
- targets:
|
||||
- alloy:12345
|
||||
|
||||
- job_name: preprod-uptime
|
||||
metrics_path: /probe
|
||||
params:
|
||||
module:
|
||||
- http_2xx
|
||||
static_configs:
|
||||
- targets:
|
||||
- http://web/
|
||||
- http://api:8080/health/ready
|
||||
relabel_configs:
|
||||
- source_labels:
|
||||
- __address__
|
||||
target_label: __param_target
|
||||
- source_labels:
|
||||
- __param_target
|
||||
target_label: instance
|
||||
- target_label: __address__
|
||||
replacement: blackbox:9115
|
||||
|
||||
@@ -11,6 +11,16 @@ groups:
|
||||
summary: Socialize API telemetry is missing
|
||||
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
|
||||
|
||||
- alert: SocializePreprodEndpointDown
|
||||
expr: probe_success{job="preprod-uptime"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
service: socialize-preprod
|
||||
annotations:
|
||||
summary: Preprod endpoint is down
|
||||
description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
|
||||
|
||||
- alert: SocializeApiHighErrorRate
|
||||
expr: |
|
||||
(
|
||||
@@ -56,6 +66,26 @@ groups:
|
||||
summary: Socialize core usage is quiet
|
||||
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
|
||||
|
||||
- alert: SocializeContentStaleInApproval
|
||||
expr: socialize_workflow_stale_in_approval > 0
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
service: socialize-api
|
||||
annotations:
|
||||
summary: Content is stale in approval
|
||||
description: One or more content items have been in approval longer than the configured threshold.
|
||||
|
||||
- alert: SocializeNoActiveWorkspaces
|
||||
expr: socialize_workflow_active_workspaces{window="24h"} < 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: info
|
||||
service: socialize-api
|
||||
annotations:
|
||||
summary: No active workspaces in the last 24 hours
|
||||
description: No workspace has content workflow activity in the last 24 hours.
|
||||
|
||||
- alert: SocializeFeedbackBugSubmitted
|
||||
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
|
||||
for: 0m
|
||||
|
||||
Reference in New Issue
Block a user