feat: close preprod observability loop
This commit is contained in:
22
deploy/observability/alertmanager/alertmanager.yml
Normal file
22
deploy/observability/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,22 @@
|
||||
# Alertmanager routing for the preprod stack: a single webhook receiver
# handles every alert.
global:
  resolve_timeout: 5m

route:
  receiver: preprod-webhook
  group_by: [alertname, service]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts go to the same receiver but repeat every 30m
    # instead of the 4h default above.
    - receiver: preprod-webhook
      matchers:
        - 'severity="critical"'
      repeat_interval: 30m

receivers:
  - name: preprod-webhook
    webhook_configs:
      - send_resolved: true
        # NOTE(review): this is a placeholder, not a literal URL. Confirm that
        # whatever launches Alertmanager substitutes it before the file is
        # loaded.
        url: ${ALERTMANAGER_WEBHOOK_URL}
|
||||
9
deploy/observability/blackbox/config.yml
Normal file
9
deploy/observability/blackbox/config.yml
Normal file
@@ -0,0 +1,9 @@
|
||||
# Probe modules offered by the blackbox exporter.
modules:
  # HTTP GET probe that prefers IPv4, times out after 5s and accepts only
  # a 200 response.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      method: GET
      preferred_ip_protocol: ip4
      valid_status_codes: [200]
|
||||
13
deploy/observability/caddy/grafana.Caddyfile
Normal file
13
deploy/observability/caddy/grafana.Caddyfile
Normal file
@@ -0,0 +1,13 @@
|
||||
# Optional Caddy site block that publishes Grafana on a dedicated hostname
# behind HTTP basic authentication.
#
# Create the password hash with:
#   caddy hash-password --plaintext '<password>'
#
# NOTE(review): newer Caddy releases spell the directive `basic_auth`; confirm
# the deployed Caddy version before renaming it.

{$OBSERVABILITY_HOST} {
    encode gzip zstd

    basicauth {
        {$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
    }

    reverse_proxy grafana:3000
}
|
||||
@@ -26,6 +26,7 @@ services:
|
||||
- prometheus
|
||||
- loki
|
||||
- tempo
|
||||
- alertmanager
|
||||
networks:
|
||||
- internal
|
||||
|
||||
@@ -44,6 +45,31 @@ services:
|
||||
networks:
|
||||
- internal
|
||||
|
||||
# Alertmanager for the preprod stack.
#
# The mounted alertmanager.yml is treated as a template. Alertmanager reads
# its config file verbatim and its documented flags do not include
# --config.expand-env (passing an unknown flag aborts startup), so the
# ${ALERTMANAGER_WEBHOOK_URL} placeholder is substituted by a small shell
# wrapper before the server is exec'd.
alertmanager:
  image: prom/alertmanager:v0.29.0
  restart: unless-stopped
  entrypoint:
    - /bin/sh
    - -ec
    # "$$" is Compose's escape for a literal "$", so the script below sees
    # ordinary shell variables when the container starts.
    - |
      # Escape characters that are special in a sed replacement (&, |, \).
      url=$$(printf '%s' "$$ALERTMANAGER_WEBHOOK_URL" | sed 's/[&|\\]/\\&/g')
      sed "s|[$$]{ALERTMANAGER_WEBHOOK_URL}|$$url|g" \
        /etc/alertmanager/alertmanager.yml > /tmp/alertmanager.yml
      exec /bin/alertmanager \
        --config.file=/tmp/alertmanager.yml \
        --storage.path=/alertmanager
  environment:
    # Falls back to a loopback URL so the stack still boots when no webhook
    # has been configured.
    ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
  # Writable scratch space for the rendered config.
  tmpfs:
    - /tmp
  volumes:
    - alertmanager-data:/alertmanager
    - ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
  networks:
    - internal
|
||||
|
||||
# Blackbox exporter; loads its probe modules from the read-only mounted
# config file.
blackbox:
  image: prom/blackbox-exporter:v0.27.0
  restart: unless-stopped
  networks: [internal]
  volumes:
    - ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
  command: ["--config.file=/etc/blackbox_exporter/config.yml"]
|
||||
|
||||
loki:
|
||||
image: grafana/loki:3.7.1
|
||||
restart: unless-stopped
|
||||
@@ -84,6 +110,7 @@ services:
|
||||
- internal
|
||||
|
||||
volumes:
|
||||
alertmanager-data:
|
||||
grafana-data:
|
||||
prometheus-data:
|
||||
loki-data:
|
||||
|
||||
@@ -333,6 +333,78 @@
|
||||
"title": "Operational Events, 1h Rolling",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
  "datasource": { "type": "prometheus", "uid": "Prometheus" },
  "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 },
  "id": 11,
  "options": {
    "legend": { "displayMode": "list", "placement": "bottom" },
    "tooltip": { "mode": "multi" }
  },
  "targets": [
    {
      "expr": "socialize_workflow_content_items",
      "legendFormat": "content {{status}}"
    },
    {
      "expr": "socialize_workflow_feedback_reports",
      "legendFormat": "feedback {{status}}"
    }
  ],
  "title": "Workflow Backlog",
  "type": "timeseries"
},
|
||||
{
  "datasource": { "type": "prometheus", "uid": "Prometheus" },
  "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 },
  "id": 12,
  "options": {
    "legend": { "displayMode": "list", "placement": "bottom" },
    "tooltip": { "mode": "multi" }
  },
  "targets": [
    {
      "expr": "socialize_workflow_active_workspaces",
      "legendFormat": "active workspaces {{window}}"
    },
    {
      "expr": "socialize_workflow_stale_in_approval",
      "legendFormat": "stale in approval"
    },
    {
      "expr": "socialize_workflow_pending_invites",
      "legendFormat": "pending invites"
    }
  ],
  "title": "Workflow Health",
  "type": "timeseries"
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
@@ -342,7 +414,7 @@
|
||||
"h": 7,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
"y": 28
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
@@ -368,7 +440,7 @@
|
||||
"h": 9,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 27
|
||||
"y": 35
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
|
||||
@@ -2,6 +2,12 @@ global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Deliver fired alerts to the Alertmanager instance at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
|
||||
|
||||
rule_files:
|
||||
- /etc/prometheus/rules/*.yml
|
||||
|
||||
@@ -15,3 +21,22 @@ scrape_configs:
|
||||
static_configs:
|
||||
- targets:
|
||||
- alloy:12345
|
||||
|
||||
# Uptime probes for preprod. The listed URLs are what gets probed; the
# relabeling below redirects the actual scrape to blackbox:9115.
- job_name: preprod-uptime
  metrics_path: /probe
  params:
    module: [http_2xx]
  static_configs:
    - targets:
        - http://web/
        - http://api:8080/health/ready
  relabel_configs:
    # Hand the configured URL to the exporter as the `target` query parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Keep the probed URL visible as the `instance` label.
    - source_labels: [__param_target]
      target_label: instance
    # Scrape the blackbox exporter rather than the URL itself.
    - target_label: __address__
      replacement: blackbox:9115
|
||||
|
||||
@@ -11,6 +11,16 @@ groups:
|
||||
summary: Socialize API telemetry is missing
|
||||
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
|
||||
|
||||
# Fires once a probe from the preprod-uptime job has reported failure
# (probe_success == 0) for 2 minutes.
- alert: SocializePreprodEndpointDown
  expr: 'probe_success{job="preprod-uptime"} == 0'
  for: 2m
  labels:
    service: socialize-preprod
    severity: critical
  annotations:
    summary: Preprod endpoint is down
    description: "{{ $labels.instance }} has failed blackbox checks for 2 minutes."
|
||||
|
||||
- alert: SocializeApiHighErrorRate
|
||||
expr: |
|
||||
(
|
||||
@@ -56,6 +66,26 @@ groups:
|
||||
summary: Socialize core usage is quiet
|
||||
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
|
||||
|
||||
# Fires when socialize_workflow_stale_in_approval has stayed above zero
# for 30 minutes.
- alert: SocializeContentStaleInApproval
  expr: 'socialize_workflow_stale_in_approval > 0'
  for: 30m
  labels:
    service: socialize-api
    severity: warning
  annotations:
    summary: Content is stale in approval
    description: >-
      One or more content items have been in approval longer than the
      configured threshold.
|
||||
|
||||
# Fires when the 24h active-workspace gauge has been below 1 for an hour.
- alert: SocializeNoActiveWorkspaces
  expr: 'socialize_workflow_active_workspaces{window="24h"} < 1'
  for: 1h
  labels:
    service: socialize-api
    severity: info
  annotations:
    summary: No active workspaces in the last 24 hours
    description: >-
      No workspace has content workflow activity in the last 24 hours.
|
||||
|
||||
- alert: SocializeFeedbackBugSubmitted
|
||||
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
|
||||
for: 0m
|
||||
|
||||
Reference in New Issue
Block a user