feat: close preprod observability loop

This commit is contained in:
2026-05-08 15:48:56 -04:00
parent 8bcff96821
commit 986c7efea6
14 changed files with 618 additions and 2 deletions

View File

@@ -0,0 +1,22 @@
# Alertmanager configuration for the preprod stack: all alerts are delivered
# to one webhook receiver; only the repeat cadence differs by severity.
global:
  resolve_timeout: 5m

route:
  # Default receiver and grouping for every alert.
  receiver: preprod-webhook
  group_by: [alertname, service]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts go to the same receiver but repeat every 30m, not 4h.
    - receiver: preprod-webhook
      repeat_interval: 30m
      matchers:
        - severity="critical"

receivers:
  - name: preprod-webhook
    webhook_configs:
      # Placeholder for the webhook endpoint; the value comes from the
      # ALERTMANAGER_WEBHOOK_URL environment variable of the alertmanager
      # service.
      - url: ${ALERTMANAGER_WEBHOOK_URL}
        send_resolved: true

View File

@@ -0,0 +1,9 @@
# Blackbox exporter probe modules.
modules:
  # HTTP GET probe, preferring IPv4, that accepts only a 200 response.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      method: GET
      preferred_ip_protocol: ip4
      valid_status_codes: [200]

View File

@@ -0,0 +1,13 @@
# Optional Caddy snippet for exposing Grafana through a protected hostname.
# Generate a password hash with:
# caddy hash-password --plaintext '<password>'
# Site block for the observability hostname: compress responses, require
# HTTP basic auth, then proxy everything to Grafana.
{$OBSERVABILITY_HOST} {
	encode gzip zstd

	# Username and password hash are both read from the environment.
	basicauth {
		{$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
	}

	reverse_proxy grafana:3000
}

View File

@@ -26,6 +26,7 @@ services:
- prometheus
- loki
- tempo
- alertmanager
networks:
- internal
@@ -44,6 +45,31 @@ services:
networks:
- internal
# Alertmanager service. Alertmanager does not expand environment variables in
# its configuration file, and `--config.expand-env` is not one of its
# documented flags, so passing it stops the process at startup. Instead, the
# entrypoint below substitutes the ${ALERTMANAGER_WEBHOOK_URL} placeholder
# into a rendered copy of the read-only config and then execs Alertmanager
# against that copy.
alertmanager:
  image: prom/alertmanager:v0.29.0
  restart: unless-stopped
  # `$$` is Compose's escape for a literal `$`, so the shell (not Compose)
  # evaluates the variables inside the script.
  entrypoint:
    - /bin/sh
    - -ec
    - |
      # Escape characters that are special in a sed replacement (&, |, \).
      url_escaped="$$(printf '%s' "$$ALERTMANAGER_WEBHOOK_URL" | sed 's/[&|\\]/\\&/g')"
      # The rendered file lives on the writable data volume because the
      # mounted config is read-only.
      sed "s|\$${ALERTMANAGER_WEBHOOK_URL}|$$url_escaped|g" \
        /etc/alertmanager/alertmanager.yml > /alertmanager/alertmanager.rendered.yml
      exec /bin/alertmanager \
        --config.file=/alertmanager/alertmanager.rendered.yml \
        --storage.path=/alertmanager
  environment:
    # Falls back to a local discard-port URL when no webhook is configured.
    ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
  volumes:
    - alertmanager-data:/alertmanager
    - ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
  networks:
    - internal
# Blackbox exporter service, started with the mounted probe configuration.
blackbox:
  image: prom/blackbox-exporter:v0.27.0
  restart: unless-stopped
  command: [--config.file=/etc/blackbox_exporter/config.yml]
  networks: [internal]
  volumes:
    # Read-only bind mount of the probe module definitions.
    - ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
loki:
image: grafana/loki:3.7.1
restart: unless-stopped
@@ -84,6 +110,7 @@ services:
- internal
volumes:
alertmanager-data:
grafana-data:
prometheus-data:
loki-data:

View File

@@ -333,6 +333,78 @@
"title": "Operational Events, 1h Rolling",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 20
},
"id": 11,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_content_items",
"legendFormat": "content {{status}}"
},
{
"expr": "socialize_workflow_feedback_reports",
"legendFormat": "feedback {{status}}"
}
],
"title": "Workflow Backlog",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 20
},
"id": 12,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_active_workspaces",
"legendFormat": "active workspaces {{window}}"
},
{
"expr": "socialize_workflow_stale_in_approval",
"legendFormat": "stale in approval"
},
{
"expr": "socialize_workflow_pending_invites",
"legendFormat": "pending invites"
}
],
"title": "Workflow Health",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
@@ -342,7 +414,7 @@
"h": 7,
"w": 24,
"x": 0,
"y": 20
"y": 28
},
"id": 9,
"options": {
@@ -368,7 +440,7 @@
"h": 9,
"w": 24,
"x": 0,
"y": 27
"y": 35
},
"id": 10,
"options": {

View File

@@ -2,6 +2,12 @@ global:
scrape_interval: 15s
evaluation_interval: 15s
# Send alerts to the Alertmanager instance at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
rule_files:
- /etc/prometheus/rules/*.yml
@@ -15,3 +21,22 @@ scrape_configs:
static_configs:
- targets:
- alloy:12345
# Uptime probes: each listed URL is checked through the blackbox exporter
# using its http_2xx module.
- job_name: preprod-uptime
  metrics_path: /probe
  params:
    module: [http_2xx]
  static_configs:
    - targets:
        - http://web/
        - http://api:8080/health/ready
  relabel_configs:
    # Hand the listed URL to the exporter as the `target` query parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Keep the probed URL as the `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the exporter itself instead of the probed URL.
    - target_label: __address__
      replacement: blackbox:9115

View File

@@ -11,6 +11,16 @@ groups:
summary: Socialize API telemetry is missing
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
# Fires when a probed endpoint fails its check, and also when the probe
# cannot run because the scrape of the blackbox exporter is failing. In the
# second case probe_success has no samples, so without the `up` clause the
# alert would stay silent during an exporter outage.
- alert: SocializePreprodEndpointDown
  expr: |
    probe_success{job="preprod-uptime"} == 0
    or
    up{job="preprod-uptime"} == 0
  for: 2m
  labels:
    severity: critical
    service: socialize-preprod
  annotations:
    summary: Preprod endpoint is down
    description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
- alert: SocializeApiHighErrorRate
expr: |
(
@@ -56,6 +66,26 @@ groups:
summary: Socialize core usage is quiet
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
# Warns once the stale-in-approval gauge has stayed above zero for 30 minutes.
- alert: SocializeContentStaleInApproval
  expr: socialize_workflow_stale_in_approval > 0
  for: 30m
  labels:
    service: socialize-api
    severity: warning
  annotations:
    summary: Content is stale in approval
    description: One or more content items have been in approval longer than the configured threshold.
# Informational: the 24h active-workspace gauge has been below 1 for an hour.
- alert: SocializeNoActiveWorkspaces
  expr: socialize_workflow_active_workspaces{window="24h"} < 1
  for: 1h
  labels:
    service: socialize-api
    severity: info
  annotations:
    summary: No active workspaces in the last 24 hours
    description: No workspace has content workflow activity in the last 24 hours.
- alert: SocializeFeedbackBugSubmitted
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
for: 0m