feat: close preprod observability loop
All checks were successful
deploy-socialize / image (push) Successful in 1m2s
deploy-socialize / deploy (push) Successful in 38s

This commit is contained in:
2026-05-08 15:48:56 -04:00
parent 8bcff96821
commit 986c7efea6
14 changed files with 618 additions and 2 deletions

View File

@@ -2,6 +2,12 @@ global:
scrape_interval: 15s
evaluation_interval: 15s
# Deliver fired alerts to the statically configured Alertmanager target.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load every rule file that matches this glob.
rule_files:
  - /etc/prometheus/rules/*.yml
@@ -15,3 +21,22 @@ scrape_configs:
static_configs:
- targets:
- alloy:12345
# Uptime probing of the preprod endpoints through the exporter at
# blackbox:9115. The entries under static_configs are the URLs to probe,
# not the hosts Prometheus scrapes directly; relabel_configs below rewrites
# each one into a /probe request against the exporter.
- job_name: preprod-uptime
  metrics_path: /probe
  params:
    module:
      - http_2xx
  static_configs:
    - targets:
        - http://web/
        - http://api:8080/health/ready
  relabel_configs:
    # Pass the configured URL to the exporter as the `target` URL parameter.
    - source_labels:
        - __address__
      target_label: __param_target
    # Keep the probed URL as the `instance` label on the resulting series.
    - source_labels:
        - __param_target
      target_label: instance
    # Point the actual scrape at the exporter instead of the probed URL.
    - target_label: __address__
      replacement: blackbox:9115

View File

@@ -11,6 +11,16 @@ groups:
summary: Socialize API telemetry is missing
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
# Fires when a probe in the preprod-uptime job has reported
# probe_success == 0 continuously for 2 minutes.
# NOTE(review): if the probe series is absent altogether (for example the
# exporter itself cannot be scraped) this expression returns no samples and
# the alert stays silent; confirm that case is covered by another rule.
- alert: SocializePreprodEndpointDown
  expr: probe_success{job="preprod-uptime"} == 0
  for: 2m
  labels:
    severity: critical
    service: socialize-preprod
  annotations:
    summary: Preprod endpoint is down
    description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
- alert: SocializeApiHighErrorRate
expr: |
(
@@ -56,6 +66,26 @@ groups:
summary: Socialize core usage is quiet
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
# Fires when socialize_workflow_stale_in_approval has stayed above 0 for
# 30 minutes.
# NOTE(review): presumably the metric counts items past the approval
# threshold; confirm against the API's metric definition.
- alert: SocializeContentStaleInApproval
  expr: socialize_workflow_stale_in_approval > 0
  for: 30m
  labels:
    severity: warning
    service: socialize-api
  annotations:
    summary: Content is stale in approval
    description: One or more content items have been in approval longer than the configured threshold.
# Fires when the window="24h" series of socialize_workflow_active_workspaces
# has stayed below 1 for a full hour. Informational severity only.
- alert: SocializeNoActiveWorkspaces
  expr: socialize_workflow_active_workspaces{window="24h"} < 1
  for: 1h
  labels:
    severity: info
    service: socialize-api
  annotations:
    summary: No active workspaces in the last 24 hours
    description: No workspace has content workflow activity in the last 24 hours.
- alert: SocializeFeedbackBugSubmitted
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
for: 0m