feat: add preprod observability foundation
This commit is contained in:
95
deploy/observability/alloy/config.alloy
Normal file
95
deploy/observability/alloy/config.alloy
Normal file
@@ -0,0 +1,95 @@
|
||||
// Alloy's own log output: logfmt-encoded, info level and above.
logging {
  format = "logfmt"
  level  = "info"
}
|
||||
|
||||
// OTLP ingest point for the API. Listens on all interfaces for gRPC (4317)
// and HTTP (4318). Metrics go through the metric_labels transform first;
// traces go directly to the batch processor. No `logs` output is wired here.
otelcol.receiver.otlp "api" {
  http {
    endpoint = "0.0.0.0:4318"
  }

  grpc {
    endpoint = "0.0.0.0:4317"
  }

  output {
    traces  = [otelcol.processor.batch.default.input]
    metrics = [otelcol.processor.transform.metric_labels.input]
  }
}
|
||||
|
||||
// Copies the `service.name` and `deployment.environment` resource attributes
// onto each metric datapoint (context = "datapoint"), then forwards the
// metrics to the batch processor.
otelcol.processor.transform "metric_labels" {
  // Keep processing when a statement errors instead of failing the payload.
  error_mode = "ignore"

  metric_statements {
    context    = "datapoint"
    statements = [
      `set(attributes["service.name"], resource.attributes["service.name"])`,
      `set(attributes["deployment.environment"], resource.attributes["deployment.environment"])`,
    ]
  }

  output {
    metrics = [otelcol.processor.batch.default.input]
  }
}
|
||||
|
||||
// Shared batch stage with no batching attributes overridden. Fans out by
// signal: traces to the Tempo OTLP exporter, metrics to the Prometheus
// exporter.
otelcol.processor.batch "default" {
  output {
    traces  = [otelcol.exporter.otlp.tempo.input]
    metrics = [otelcol.exporter.prometheus.local.input]
  }
}
|
||||
|
||||
// Bridges the OTel metrics pipeline into the Prometheus pipeline by handing
// metrics to the `local` remote_write component.
otelcol.exporter.prometheus "local" {
  forward_to = [prometheus.remote_write.local.receiver]
}
|
||||
|
||||
// Pushes metrics to the `prometheus` host's remote-write endpoint on port 9090.
// NOTE(review): the target Prometheus must have its remote-write receiver
// enabled for this URL to accept samples — confirm in its startup flags.
prometheus.remote_write "local" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}
|
||||
|
||||
// Sends traces to Tempo over OTLP gRPC at tempo:4317.
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"

    // Plaintext connection: TLS is explicitly disabled for this client.
    tls {
      insecure = true
    }
  }
}
|
||||
|
||||
// Discovers containers through the Docker daemon's Unix socket. The resulting
// targets feed loki.source.docker.default.
discovery.docker "linux" {
  host = "unix:///var/run/docker.sock"
}
|
||||
|
||||
// Relabel rule set for Docker log streams. `targets` is intentionally empty:
// only the exported `rules` are consumed (by loki.source.docker.default).
discovery.relabel "docker_logs" {
  targets = []

  // Container name with its leading "/" removed -> `service_name`.
  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "service_name"
  }

  // Compose service label copied as-is -> `compose_service`.
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
    target_label  = "compose_service"
  }
}
|
||||
|
||||
// Reads logs for every container found by discovery.docker.linux, applies the
// docker_logs relabel rules, attaches the static label platform="docker", and
// forwards the entries to the local Loki writer.
loki.source.docker "default" {
  host          = "unix:///var/run/docker.sock"
  targets       = discovery.docker.linux.targets
  relabel_rules = discovery.relabel.docker_logs.rules
  labels        = {"platform" = "docker"}
  forward_to    = [loki.write.local.receiver]
}
|
||||
|
||||
// Ships log entries to Loki's push API on the `loki` host, port 3100.
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}
|
||||
94
deploy/observability/compose.observability.yml
Normal file
94
deploy/observability/compose.observability.yml
Normal file
@@ -0,0 +1,94 @@
|
||||
# Observability overlay for the preprod stack.
# Adds OTLP export settings to the `api` service and defines the Grafana,
# Prometheus, Loki, Tempo and Alloy containers on the `internal` network.
# NOTE(review): `api` has no image/build here, so this file is presumably
# merged over a base Compose file that defines it — confirm.
services:
  api:
    environment:
      OTEL_SERVICE_NAME: socialize-api
      # OTLP over gRPC to the alloy container.
      OTEL_EXPORTER_OTLP_ENDPOINT: http://alloy:4317
      OTEL_EXPORTER_OTLP_PROTOCOL: grpc
      OTEL_RESOURCE_ATTRIBUTES: deployment.environment=preprod
    depends_on:
      alloy:
        condition: service_started

  grafana:
    image: grafana/grafana:13.0.1
    restart: unless-stopped
    environment:
      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
      # No fallback on purpose: a default of "admin" would ship well-known
      # credentials. Compose aborts with this message when the variable is
      # unset or empty.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
      GF_USERS_ALLOW_SIGN_UP: "false"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
    ports:
      # Published on loopback only unless GRAFANA_HTTP_BIND overrides it.
      - "${GRAFANA_HTTP_BIND:-127.0.0.1}:3000:3000"
    depends_on:
      - prometheus
      - loki
      - tempo
    networks:
      - internal

  prometheus:
    image: prom/prometheus:v3.11.3
    restart: unless-stopped
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=${PROMETHEUS_RETENTION:-15d}
      # Needed so Alloy can push metrics to /api/v1/write.
      - --web.enable-remote-write-receiver
    volumes:
      - prometheus-data:/prometheus
      - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./observability/prometheus/rules:/etc/prometheus/rules:ro
    networks:
      - internal

  loki:
    image: grafana/loki:3.7.1
    restart: unless-stopped
    command: -config.file=/etc/loki/local-config.yml
    volumes:
      - loki-data:/loki
      - ./observability/loki/local-config.yml:/etc/loki/local-config.yml:ro
    networks:
      - internal

  tempo:
    image: grafana/tempo:2.10.3
    restart: unless-stopped
    command: -config.file=/etc/tempo.yml
    volumes:
      - tempo-data:/var/tempo
      - ./observability/tempo/tempo.yml:/etc/tempo.yml:ro
    networks:
      - internal

  alloy:
    image: grafana/alloy:v1.16.0
    restart: unless-stopped
    command:
      - run
      - --server.http.listen-addr=0.0.0.0:12345
      - --storage.path=/var/lib/alloy/data
      - /etc/alloy/config.alloy
    volumes:
      - alloy-data:/var/lib/alloy/data
      # NOTE(review): the :ro flag makes the socket file read-only on the
      # mount; it presumably does not restrict what the Docker API allows —
      # treat this container as having Docker daemon access.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./observability/alloy/config.alloy:/etc/alloy/config.alloy:ro
    # Container-to-container only; none of these ports are published on the host.
    expose:
      - "4317"
      - "4318"
      - "12345"
    networks:
      - internal

volumes:
  grafana-data:
  prometheus-data:
  loki-data:
  tempo-data:
  alloy-data:

networks:
  internal:
|
||||
413
deploy/observability/grafana/dashboards/socialize-overview.json
Normal file
413
deploy/observability/grafana/dashboards/socialize-overview.json
Normal file
@@ -0,0 +1,413 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m]))",
|
||||
"legendFormat": "requests/sec"
|
||||
}
|
||||
],
|
||||
"title": "API Requests/sec",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 6,
|
||||
"y": 0
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
  "expr": "(sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\", http_response_status_code=~\"5..\"}[5m])) or vector(0)) / clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])), 0.001)",
  "legendFormat": "5xx rate"
}
|
||||
],
|
||||
"title": "API 5xx Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 12,
|
||||
"y": 0
|
||||
},
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))",
|
||||
"legendFormat": "p95"
|
||||
}
|
||||
],
|
||||
"title": "API p95 Latency",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 4,
|
||||
"w": 6,
|
||||
"x": 18,
|
||||
"y": 0
|
||||
},
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "area",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": [
|
||||
"lastNotNull"
|
||||
],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"targets": [
|
||||
{
  "expr": "sum(ALERTS{alertstate=\"firing\"}) or vector(0)",
  "legendFormat": "firing"
}
|
||||
],
|
||||
"title": "Firing Alerts",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 4
|
||||
},
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])) by (http_request_method, http_route)",
|
||||
"legendFormat": "{{http_request_method}} {{http_route}}"
|
||||
}
|
||||
],
|
||||
"title": "Request Rate By Endpoint",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 4
|
||||
},
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, sum by (le, http_route) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))",
|
||||
"legendFormat": "{{http_route}}"
|
||||
}
|
||||
],
|
||||
"title": "p95 Latency By Endpoint",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 12
|
||||
},
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(socialize_login_attempts_total[24h])) by (outcome)",
|
||||
"legendFormat": "login {{outcome}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_organizations_created_total[24h]))",
|
||||
"legendFormat": "organizations"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_workspaces_created_total[24h]))",
|
||||
"legendFormat": "workspaces"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_content_items_created_total[24h]))",
|
||||
"legendFormat": "content"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_comments_created_total[24h]))",
|
||||
"legendFormat": "comments"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_approval_decisions_submitted_total[24h]))",
|
||||
"legendFormat": "approvals"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_feedback_submitted_total[24h]))",
|
||||
"legendFormat": "feedback"
|
||||
}
|
||||
],
|
||||
"title": "Usage Signals, 24h Rolling",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 12,
|
||||
"x": 12,
|
||||
"y": 12
|
||||
},
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom"
|
||||
},
|
||||
"tooltip": {
|
||||
"mode": "multi"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(socialize_email_delivery_total[1h])) by (outcome, provider)",
|
||||
"legendFormat": "email {{provider}} {{outcome}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_blob_storage_operations_total[1h])) by (operation, outcome)",
|
||||
"legendFormat": "blob {{operation}} {{outcome}}"
|
||||
},
|
||||
{
|
||||
"expr": "sum(increase(socialize_background_job_runs_total[1h])) by (job, outcome)",
|
||||
"legendFormat": "job {{job}} {{outcome}}"
|
||||
}
|
||||
],
|
||||
"title": "Operational Events, 1h Rolling",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 7,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 20
|
||||
},
|
||||
"id": 9,
|
||||
"options": {
|
||||
"showHeader": true
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "ALERTS{alertstate=\"firing\"}",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "{{alertname}}"
|
||||
}
|
||||
],
|
||||
"title": "Firing Alerts",
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "Loki"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 9,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 27
|
||||
},
|
||||
"id": 10,
|
||||
"options": {
|
||||
"dedupStrategy": "none",
|
||||
"enableLogDetails": true,
|
||||
"prettifyLogMessage": false,
|
||||
"showCommonLabels": false,
|
||||
"showLabels": false,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": false
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{platform=\"docker\", compose_service=\"api\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "API Logs",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 39,
|
||||
"tags": [
|
||||
"socialize",
|
||||
"preprod"
|
||||
],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "Socialize Overview",
|
||||
"uid": "socialize-overview",
|
||||
"version": 2,
|
||||
"weekStart": ""
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
# Grafana dashboard provisioning: a single file-based provider that loads
# dashboards from /var/lib/grafana/dashboards into the "Socialize" folder of
# organisation 1.
apiVersion: 1

providers:
  - name: Socialize
    type: file
    orgId: 1
    folder: Socialize
    # Re-scan the dashboards directory every 30 seconds.
    updateIntervalSeconds: 30
    disableDeletion: false
    options:
      path: /var/lib/grafana/dashboards
|
||||
@@ -0,0 +1,26 @@
|
||||
# Grafana datasource provisioning. The `uid` values are fixed so other
# provisioned content can reference these datasources by uid.
apiVersion: 1

datasources:
  # Default datasource.
  - name: Prometheus
    uid: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true

  - name: Loki
    uid: Loki
    type: loki
    access: proxy
    url: http://loki:3100

  - name: Tempo
    uid: Tempo
    type: tempo
    access: proxy
    url: http://tempo:3200
    jsonData:
      # Trace-to-logs links resolve against the Loki datasource.
      tracesToLogsV2:
        datasourceUid: Loki
      # Service map reads its metrics from the Prometheus datasource.
      # NOTE(review): this presumably needs Tempo's metrics generator to be
      # writing service-graph metrics into that Prometheus — confirm.
      serviceMap:
        datasourceUid: Prometheus
|
||||
32
deploy/observability/loki/local-config.yml
Normal file
32
deploy/observability/loki/local-config.yml
Normal file
@@ -0,0 +1,32 @@
|
||||
# Loki configuration: authentication disabled, in-memory ring, replication
# factor 1, with index and chunks stored on the local filesystem under /loki.
# NOTE(review): no compactor/retention settings appear in this file, so log
# data is presumably kept indefinitely — confirm that is intended.
auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /loki
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    # TSDB index, schema v13, one index table per 24h.
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /loki/chunks

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

# Usage reporting is switched off.
analytics:
  reporting_enabled: false
|
||||
17
deploy/observability/prometheus/prometheus.yml
Normal file
17
deploy/observability/prometheus/prometheus.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Prometheus server configuration. Scrapes and rule evaluation both run every
# 15 seconds.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Every *.yml file under /etc/prometheus/rules is loaded as a rule file.
rule_files:
  - /etc/prometheus/rules/*.yml

# Only Prometheus itself and Alloy's HTTP server are scraped here.
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - prometheus:9090

  - job_name: alloy
    static_configs:
      - targets:
          - alloy:12345
|
||||
97
deploy/observability/prometheus/rules/socialize-alerts.yml
Normal file
97
deploy/observability/prometheus/rules/socialize-alerts.yml
Normal file
@@ -0,0 +1,97 @@
|
||||
# Alert rules for the Socialize API. All rules carry
# `service: socialize-api` and a `severity` label.
groups:
  - name: socialize-preprod
    rules:
      # Fires when the request-duration counter for the API has no series.
      - alert: SocializeApiTelemetryMissing
        expr: absent(http_server_request_duration_seconds_count{service_name="socialize-api"})
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API telemetry is missing
          description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.

      # Share of 5xx responses over 5 minutes. clamp_min keeps the
      # denominator away from zero when traffic is idle.
      - alert: SocializeApiHighErrorRate
        expr: |
          (
            sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api", http_response_status_code=~"5.."}[5m]))
            /
            clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api"}[5m])), 0.001)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API 5xx rate is high
          description: More than 5% of API requests are returning 5xx responses over 5 minutes.

      - alert: SocializeApiHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name="socialize-api"}[5m]))
          ) > 2
        for: 10m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize API p95 latency is high
          description: API p95 latency has been above 2 seconds for 10 minutes.

      # Each term falls back to 0 when its counter has no series. Without the
      # fallback, one missing counter makes the whole sum an empty vector and
      # the alert can never fire — which is exactly the "quiet" case.
      - alert: SocializeCoreUsageQuiet
        expr: |
          (
            (sum(increase(socialize_content_items_created_total[12h])) or vector(0))
            + (sum(increase(socialize_comments_created_total[12h])) or vector(0))
            + (sum(increase(socialize_approval_decisions_submitted_total[12h])) or vector(0))
            + (sum(increase(socialize_feedback_submitted_total[12h])) or vector(0))
          ) < 1
        for: 30m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize core usage is quiet
          description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.

      # The rules below are event-style: `for: 0m` fires on the first
      # evaluation where the counter increased within the window.
      - alert: SocializeFeedbackBugSubmitted
        expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
        for: 0m
        labels:
          severity: info
          service: socialize-api
        annotations:
          summary: New bug feedback submitted
          description: A user submitted bug feedback in the last 15 minutes.

      - alert: SocializeEmailDeliveryFailures
        expr: sum(increase(socialize_email_delivery_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Email delivery failures detected
          description: One or more email delivery attempts failed in the last 15 minutes.

      - alert: SocializeBlobStorageFailures
        expr: sum(increase(socialize_blob_storage_operations_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Blob storage failures detected
          description: One or more blob storage operations failed in the last 15 minutes.

      - alert: SocializeBackgroundJobFailures
        expr: sum(increase(socialize_background_job_runs_total{outcome="failure"}[30m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Background job failures detected
          description: One or more background jobs failed in the last 30 minutes.
|
||||
25
deploy/observability/tempo/tempo.yml
Normal file
25
deploy/observability/tempo/tempo.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
# Tempo configuration: HTTP API on 3200, OTLP ingest on 4317 (gRPC) and
# 4318 (HTTP), traces stored on the local filesystem under /var/tempo.
server:
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

storage:
  trace:
    backend: local
    local:
      path: /var/tempo/traces

compactor:
  compaction:
    # Trace blocks are retained for 7 days.
    block_retention: 168h

# The generator derives metrics from ingested spans. A WAL path on its own
# produces nothing: the generated series need a remote-write target, and the
# processors must be switched on in `overrides` below.
metrics_generator:
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      # NOTE(review): requires the remote-write receiver to be enabled on
      # this Prometheus instance — confirm.
      - url: http://prometheus:9090/api/v1/write

overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics]
|
||||
Reference in New Issue
Block a user