feat: add preprod observability foundation

This commit is contained in:
2026-05-08 15:45:31 -04:00
parent 1ca6ab7117
commit 8bcff96821
35 changed files with 1627 additions and 56 deletions

View File

@@ -3,8 +3,9 @@ services:
image: postgres:16
restart: unless-stopped
env_file:
- /etc/socialize/socialize.env
- .deploy.env
- path: /etc/socialize/socialize.env
- path: .deploy.env
required: false
environment:
POSTGRES_DB: ${POSTGRES_DB}
POSTGRES_USER: ${POSTGRES_USER}
@@ -23,8 +24,9 @@ services:
image: git.mapachotes.com/jbourdon/socialize-api:${SOCIALIZE_IMAGE_TAG}
restart: unless-stopped
env_file:
- /etc/socialize/socialize.env
- .deploy.env
- path: /etc/socialize/socialize.env
- path: .deploy.env
required: false
environment:
ASPNETCORE_ENVIRONMENT: ${ASPNETCORE_ENVIRONMENT}
ASPNETCORE_URLS: ${ASPNETCORE_URLS}

View File

@@ -0,0 +1,95 @@
// Alloy's own log output: info level, logfmt-encoded.
logging {
  format = "logfmt"
  level  = "info"
}

// OTLP ingest named "api". Listens on all interfaces for gRPC (4317) and
// HTTP (4318). Metrics are routed through the label-copying transform first;
// traces go straight to the batch processor. No logs output is wired here.
otelcol.receiver.otlp "api" {
  grpc {
    endpoint = "0.0.0.0:4317"
  }

  http {
    endpoint = "0.0.0.0:4318"
  }

  output {
    traces  = [otelcol.processor.batch.default.input]
    metrics = [otelcol.processor.transform.metric_labels.input]
  }
}
// Copies the resource-level service.name and deployment.environment onto
// every metric datapoint, then forwards the metrics to the batch processor.
// Presumably this is what makes them available as per-series labels: the
// dashboards and alert rules in this change filter on service_name.
otelcol.processor.transform "metric_labels" {
  // NOTE(review): "ignore" should let a failing statement be skipped rather
  // than rejecting the data; confirm against the component reference.
  error_mode = "ignore"

  metric_statements {
    context    = "datapoint"
    statements = [
      `set(attributes["service.name"], resource.attributes["service.name"])`,
      `set(attributes["deployment.environment"], resource.attributes["deployment.environment"])`,
    ]
  }

  output {
    metrics = [otelcol.processor.batch.default.input]
  }
}

// Shared batching stage for both signals: traces continue to the Tempo OTLP
// exporter, metrics continue to the Prometheus exporter.
otelcol.processor.batch "default" {
  output {
    traces  = [otelcol.exporter.otlp.tempo.input]
    metrics = [otelcol.exporter.prometheus.local.input]
  }
}
// Metrics leg: the OTLP-to-Prometheus exporter hands its output to the
// remote-write client declared right below it.
otelcol.exporter.prometheus "local" {
  forward_to = [prometheus.remote_write.local.receiver]
}

// Pushes samples to the remote-write endpoint of the host named "prometheus".
prometheus.remote_write "local" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"
  }
}

// Traces leg: exports to tempo:4317 with `insecure = true` set on the
// client's TLS block.
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"

    tls {
      insecure = true
    }
  }
}
// Container discovery through the Docker socket.
discovery.docker "linux" {
  host = "unix:///var/run/docker.sock"
}

// Relabel rule set only: `targets` is intentionally empty because this
// component is referenced solely for its exported `rules` (see
// loki.source.docker below).
discovery.relabel "docker_logs" {
  targets = []

  // The regex captures everything after the leading "/" of the Docker
  // container name and writes it to service_name.
  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "service_name"
  }

  // Exposes the Compose service label as compose_service; the "API Logs"
  // dashboard panel in this change selects on compose_service="api".
  rule {
    source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
    target_label  = "compose_service"
  }
}

// Reads logs of the discovered containers from the same Docker socket,
// stamps them with platform="docker", applies the relabel rules above and
// forwards the entries to the Loki writer.
loki.source.docker "default" {
  host          = "unix:///var/run/docker.sock"
  targets       = discovery.docker.linux.targets
  relabel_rules = discovery.relabel.docker_logs.rules
  labels        = {"platform" = "docker"}
  forward_to    = [loki.write.local.receiver]
}

// Pushes log entries to the push API of the host named "loki".
loki.write "local" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}

View File

@@ -0,0 +1,94 @@
# Observability overlay: points the API's OpenTelemetry exporter at Alloy and
# adds Grafana, Prometheus, Loki, Tempo and Alloy on the `internal` network.
services:
  # Only environment and depends_on are set here, so this is presumably merged
  # over an `api` service defined in another compose file.
  # NOTE(review): `api` must share a network with `alloy` for
  # http://alloy:4317 to resolve; no network is assigned to `api` in this
  # file, so confirm the other compose file handles that.
  api:
    depends_on:
      alloy:
        condition: service_started
    environment:
      OTEL_SERVICE_NAME: socialize-api
      OTEL_EXPORTER_OTLP_ENDPOINT: http://alloy:4317
      OTEL_EXPORTER_OTLP_PROTOCOL: grpc
      OTEL_RESOURCE_ATTRIBUTES: deployment.environment=preprod

  grafana:
    image: grafana/grafana:13.0.1
    restart: unless-stopped
    # NOTE(review): both credentials fall back to "admin" when the variables
    # are unset. The HTTP port binds to 127.0.0.1 by default, but consider
    # making GRAFANA_ADMIN_PASSWORD mandatory.
    environment:
      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
      GF_USERS_ALLOW_SIGN_UP: "false"
    ports:
      # Host bind address is configurable; loopback unless overridden.
      - "${GRAFANA_HTTP_BIND:-127.0.0.1}:3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      # Provisioning files and dashboards are mounted read-only.
      - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
    networks:
      - internal
    depends_on:
      - prometheus
      - loki
      - tempo

  prometheus:
    image: prom/prometheus:v3.11.3
    restart: unless-stopped
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      # Retention defaults to 15d unless PROMETHEUS_RETENTION is set.
      - --storage.tsdb.retention.time=${PROMETHEUS_RETENTION:-15d}
      # Needed because Alloy pushes metrics to /api/v1/write.
      - --web.enable-remote-write-receiver
    volumes:
      - prometheus-data:/prometheus
      - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./observability/prometheus/rules:/etc/prometheus/rules:ro
    networks:
      - internal

  loki:
    image: grafana/loki:3.7.1
    restart: unless-stopped
    command: "-config.file=/etc/loki/local-config.yml"
    volumes:
      - loki-data:/loki
      - ./observability/loki/local-config.yml:/etc/loki/local-config.yml:ro
    networks:
      - internal

  tempo:
    image: grafana/tempo:2.10.3
    restart: unless-stopped
    command: "-config.file=/etc/tempo.yml"
    volumes:
      - tempo-data:/var/tempo
      - ./observability/tempo/tempo.yml:/etc/tempo.yml:ro
    networks:
      - internal

  alloy:
    image: grafana/alloy:v1.16.0
    restart: unless-stopped
    command:
      - run
      # Port 12345 is also the Prometheus scrape target for the `alloy` job.
      - --server.http.listen-addr=0.0.0.0:12345
      - --storage.path=/var/lib/alloy/data
      - /etc/alloy/config.alloy
    volumes:
      - alloy-data:/var/lib/alloy/data
      # The Docker socket is used by config.alloy for container discovery and
      # log collection.
      # NOTE(review): a read-only bind mount does not restrict what can be
      # requested through the Docker API; confirm this exposure is acceptable.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./observability/alloy/config.alloy:/etc/alloy/config.alloy:ro
    # Container-to-container only; nothing is published on the host.
    expose:
      - "4317"
      - "4318"
      - "12345"
    networks:
      - internal

volumes:
  grafana-data: {}
  prometheus-data: {}
  loki-data: {}
  tempo-data: {}
  alloy-data: {}

networks:
  internal: {}

View File

@@ -0,0 +1,413 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m]))",
"legendFormat": "requests/sec"
}
],
"title": "API Requests/sec",
"type": "stat"
},
{
  "datasource": {
    "type": "prometheus",
    "uid": "Prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "percentunit"
    },
    "overrides": []
  },
  "gridPos": {
    "h": 4,
    "w": 6,
    "x": 6,
    "y": 0
  },
  "id": 2,
  "options": {
    "colorMode": "background",
    "graphMode": "area",
    "orientation": "auto",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "textMode": "auto"
  },
  "targets": [
    {
      "expr": "(sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\", http_response_status_code=~\"5..\"}[5m])) or vector(0)) / clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])), 0.001)",
      "legendFormat": "5xx rate"
    }
  ],
  "title": "API 5xx Rate",
  "type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"colorMode": "background",
"graphMode": "area",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))",
"legendFormat": "p95"
}
],
"title": "API p95 Latency",
"type": "stat"
},
{
  "datasource": {
    "type": "prometheus",
    "uid": "Prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "short"
    },
    "overrides": []
  },
  "gridPos": {
    "h": 4,
    "w": 6,
    "x": 18,
    "y": 0
  },
  "id": 4,
  "options": {
    "colorMode": "background",
    "graphMode": "area",
    "orientation": "auto",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "textMode": "auto"
  },
  "targets": [
    {
      "expr": "sum(ALERTS{alertstate=\"firing\"}) or vector(0)",
      "legendFormat": "firing"
    }
  ],
  "title": "Firing Alerts",
  "type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "reqps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])) by (http_request_method, http_route)",
"legendFormat": "{{http_request_method}} {{http_route}}"
}
],
"title": "Request Rate By Endpoint",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"id": 6,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le, http_route) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))",
"legendFormat": "{{http_route}}"
}
],
"title": "p95 Latency By Endpoint",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 12
},
"id": 7,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(increase(socialize_login_attempts_total[24h])) by (outcome)",
"legendFormat": "login {{outcome}}"
},
{
"expr": "sum(increase(socialize_organizations_created_total[24h]))",
"legendFormat": "organizations"
},
{
"expr": "sum(increase(socialize_workspaces_created_total[24h]))",
"legendFormat": "workspaces"
},
{
"expr": "sum(increase(socialize_content_items_created_total[24h]))",
"legendFormat": "content"
},
{
"expr": "sum(increase(socialize_comments_created_total[24h]))",
"legendFormat": "comments"
},
{
"expr": "sum(increase(socialize_approval_decisions_submitted_total[24h]))",
"legendFormat": "approvals"
},
{
"expr": "sum(increase(socialize_feedback_submitted_total[24h]))",
"legendFormat": "feedback"
}
],
"title": "Usage Signals, 24h Rolling",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 12
},
"id": 8,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(increase(socialize_email_delivery_total[1h])) by (outcome, provider)",
"legendFormat": "email {{provider}} {{outcome}}"
},
{
"expr": "sum(increase(socialize_blob_storage_operations_total[1h])) by (operation, outcome)",
"legendFormat": "blob {{operation}} {{outcome}}"
},
{
"expr": "sum(increase(socialize_background_job_runs_total[1h])) by (job, outcome)",
"legendFormat": "job {{job}} {{outcome}}"
}
],
"title": "Operational Events, 1h Rolling",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 20
},
"id": 9,
"options": {
"showHeader": true
},
"targets": [
{
"expr": "ALERTS{alertstate=\"firing\"}",
"format": "table",
"instant": true,
"legendFormat": "{{alertname}}"
}
],
"title": "Firing Alerts",
"type": "table"
},
{
"datasource": {
"type": "loki",
"uid": "Loki"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 27
},
"id": 10,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"expr": "{platform=\"docker\", compose_service=\"api\"}",
"refId": "A"
}
],
"title": "API Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": [
"socialize",
"preprod"
],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Socialize Overview",
"uid": "socialize-overview",
"version": 2,
"weekStart": ""
}

View File

@@ -0,0 +1,11 @@
# Grafana dashboard provisioning: one file-based provider.
apiVersion: 1

providers:
  # Loads dashboards from the directory the compose overlay mounts at
  # /var/lib/grafana/dashboards, into the "Socialize" folder of org 1,
  # with an update interval of 30 seconds.
  - name: Socialize
    type: file
    orgId: 1
    folder: Socialize
    updateIntervalSeconds: 30
    disableDeletion: false
    options:
      path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,26 @@
# Grafana datasource provisioning. The uid values are fixed because the
# provisioned dashboard references datasources by uid ("Prometheus", "Loki").
apiVersion: 1

datasources:
  # Default datasource.
  - name: Prometheus
    uid: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: true

  - name: Loki
    uid: Loki
    type: loki
    url: http://loki:3100
    access: proxy

  # Port 3200 matches http_listen_port in the Tempo config.
  - name: Tempo
    uid: Tempo
    type: tempo
    url: http://tempo:3200
    access: proxy
    jsonData:
      # Trace-to-logs links point at the Loki datasource.
      tracesToLogsV2:
        datasourceUid: Loki
      # The service map is read from the Prometheus datasource.
      serviceMap:
        datasourceUid: Prometheus

View File

@@ -0,0 +1,32 @@
# Loki configuration: authentication disabled, in-memory ring, replication
# factor 1, TSDB index with filesystem object storage under /loki.
# NOTE(review): no retention or compactor settings are present here, whereas
# Prometheus and Tempo in this change both set a retention period. Confirm
# that unbounded log growth on the loki-data volume is acceptable.
auth_enabled: false

server:
  http_listen_port: 3100

common:
  path_prefix: /loki
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    # Quoted so the start date is always read as a string.
    - from: "2024-01-01"
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /loki/chunks

limits_config:
  allow_structured_metadata: true
  volume_enabled: true

analytics:
  reporting_enabled: false

View File

@@ -0,0 +1,17 @@
# Prometheus configuration: 15s scrape and rule-evaluation intervals, rule
# files from the mounted rules directory, and two static scrape jobs.
# There is no alerting/alertmanagers section in this file; firing alerts are
# surfaced through the ALERTS series that the Grafana dashboard queries.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "/etc/prometheus/rules/*.yml"

scrape_configs:
  # Prometheus scraping itself.
  - job_name: prometheus
    static_configs:
      - targets:
          - prometheus:9090

  # Alloy's HTTP server; 12345 matches --server.http.listen-addr in the
  # compose overlay.
  - job_name: alloy
    static_configs:
      - targets:
          - alloy:12345

View File

@@ -0,0 +1,97 @@
# Alert rules for the Socialize preprod API. Every rule carries
# service=socialize-api and a severity label.
#
# NOTE(review): the `increase(...) > 0` rules below can only see growth
# between two samples of an existing series. If a failure counter first
# appears already at a non-zero value, that first event may not register.
# Confirm whether the API pre-initializes these counters.
groups:
  - name: socialize-preprod
    rules:
      # Fires when no request-duration count series exists for the API at all.
      - alert: SocializeApiTelemetryMissing
        expr: absent(http_server_request_duration_seconds_count{service_name="socialize-api"})
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API telemetry is missing
          description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.

      # Ratio of 5xx request rate to total request rate. clamp_min keeps the
      # denominator at or above 0.001.
      - alert: SocializeApiHighErrorRate
        expr: |
          (
            sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api", http_response_status_code=~"5.."}[5m]))
            /
            clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api"}[5m])), 0.001)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API 5xx rate is high
          description: More than 5% of API requests are returning 5xx responses over 5 minutes.

      # p95 over all routes, computed from the request-duration histogram.
      - alert: SocializeApiHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name="socialize-api"}[5m]))
          ) > 2
        for: 10m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize API p95 latency is high
          description: API p95 latency has been above 2 seconds for 10 minutes.

      # Each term falls back to 0 via `or vector(0)`. Without the fallback,
      # a counter with no series makes its sum() an empty vector, the whole
      # addition becomes empty, and the alert can never fire in exactly the
      # situation it is meant to detect (no activity recorded).
      - alert: SocializeCoreUsageQuiet
        expr: |
          (
            (sum(increase(socialize_content_items_created_total[12h])) or vector(0))
            + (sum(increase(socialize_comments_created_total[12h])) or vector(0))
            + (sum(increase(socialize_approval_decisions_submitted_total[12h])) or vector(0))
            + (sum(increase(socialize_feedback_submitted_total[12h])) or vector(0))
          ) < 1
        for: 30m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize core usage is quiet
          description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.

      # Informational: any bug-type feedback within the last 15 minutes.
      - alert: SocializeFeedbackBugSubmitted
        expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
        for: 0m
        labels:
          severity: info
          service: socialize-api
        annotations:
          summary: New bug feedback submitted
          description: A user submitted bug feedback in the last 15 minutes.

      - alert: SocializeEmailDeliveryFailures
        expr: sum(increase(socialize_email_delivery_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Email delivery failures detected
          description: One or more email delivery attempts failed in the last 15 minutes.

      - alert: SocializeBlobStorageFailures
        expr: sum(increase(socialize_blob_storage_operations_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Blob storage failures detected
          description: One or more blob storage operations failed in the last 15 minutes.

      # Uses a 30-minute window, unlike the 15-minute failure rules above.
      - alert: SocializeBackgroundJobFailures
        expr: sum(increase(socialize_background_job_runs_total{outcome="failure"}[30m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Background job failures detected
          description: One or more background jobs failed in the last 30 minutes.

View File

@@ -0,0 +1,25 @@
# Tempo configuration: OTLP ingest over gRPC and HTTP, local trace storage,
# 168h block retention, and a metrics generator that feeds Prometheus.
server:
  # Queried by the Grafana Tempo datasource at http://tempo:3200.
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        # Alloy's OTLP exporter sends traces to tempo:4317.
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"

storage:
  trace:
    backend: local
    local:
      path: /var/tempo/traces

compactor:
  compaction:
    block_retention: 168h

metrics_generator:
  storage:
    path: /var/tempo/generator/wal
    # Without a remote_write target the generator has nowhere to send its
    # metrics. Prometheus accepts writes on this endpoint because the compose
    # overlay starts it with --web.enable-remote-write-receiver.
    remote_write:
      - url: http://prometheus:9090/api/v1/write

# The generator is only active for processors enabled here. These two back
# the service map that the Grafana Tempo datasource reads from Prometheus.
overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics]