From 986c7efea64bb62105ff267f0f8e48e8528d57e5 Mon Sep 17 00:00:00 2001 From: Jonathan Bourdon Date: Fri, 8 May 2026 15:48:56 -0400 Subject: [PATCH] feat: close preprod observability loop --- README.md | 4 + .../ObservabilityRegistration.cs | 1 + .../Observability/SocializeMetrics.cs | 100 +++++++++++ .../WorkflowHealthSamplerService.cs | 102 +++++++++++ .../alertmanager/alertmanager.yml | 22 +++ deploy/observability/blackbox/config.yml | 9 + deploy/observability/caddy/grafana.Caddyfile | 13 ++ .../observability/compose.observability.yml | 27 +++ .../dashboards/socialize-overview.json | 76 +++++++- .../observability/prometheus/prometheus.yml | 25 +++ .../prometheus/rules/socialize-alerts.yml | 30 ++++ docs/FEATURES/observability.md | 14 ++ docs/OPERATIONS/observability-runbook.md | 163 ++++++++++++++++++ .../003-preprod-operations-loop.md | 34 ++++ 14 files changed, 618 insertions(+), 2 deletions(-) create mode 100644 backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs create mode 100644 deploy/observability/alertmanager/alertmanager.yml create mode 100644 deploy/observability/blackbox/config.yml create mode 100644 deploy/observability/caddy/grafana.Caddyfile create mode 100644 docs/OPERATIONS/observability-runbook.md create mode 100644 docs/TASKS/observability/003-preprod-operations-loop.md diff --git a/README.md b/README.md index 3c4d229b..b16e1bc6 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,10 @@ host. Prometheus alert rules are provisioned under `deploy/observability/prometheus/rules/`; notification delivery is intentionally left to the preprod operations environment. +Set `ALERTMANAGER_WEBHOOK_URL` to route alerts to a private notification endpoint. +See `docs/OPERATIONS/observability-runbook.md` for bring-up, alert triage, and +the optional protected Caddy configuration for Grafana. 
+
 
 ## Solution
 
 ```bash
diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs b/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs
index baaab361..3ea0d300 100644
--- a/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs
+++ b/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs
@@ -48,6 +48,7 @@ internal static class ObservabilityRegistration
         }
 
         builder.Services.AddSingleton<SocializeMetrics>();
+        builder.Services.AddHostedService<WorkflowHealthSamplerService>();
         builder.Services
             .AddOpenTelemetry()
             .ConfigureResource(resource => resource.AddService(
diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs b/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs
index c52ad933..97810937 100644
--- a/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs
+++ b/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs
@@ -19,6 +19,8 @@ internal sealed class SocializeMetrics : IDisposable
     private readonly Counter<long> _organizationCreatedCounter;
     private readonly Counter<long> _workspaceCreatedCounter;
    private readonly Counter<long> _workspaceInviteCreatedCounter;
+    private readonly object _workflowHealthLock = new();
+    private WorkflowHealthSnapshot _workflowHealthSnapshot = WorkflowHealthSnapshot.Empty;
 
     public SocializeMetrics()
     {
@@ -58,6 +60,27 @@ internal sealed class SocializeMetrics : IDisposable
         _backgroundJobRunCounter = Meter.CreateCounter<long>(
             "socialize.background_job.runs",
             description: "Background job runs partitioned by job and outcome.");
+
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.content_items",
+            ObserveContentItemCounts,
+            description: "Current content item counts by status.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.feedback_reports",
+            ObserveFeedbackReportCounts,
+            description: "Current feedback report counts by status.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.pending_invites",
+            ObservePendingInviteCount,
+            description: "Current pending workspace invite count.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.stale_in_approval",
+            ObserveStaleApprovalCount,
+            description: "Current count of content items in approval longer than the configured stale threshold.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.active_workspaces",
+            ObserveActiveWorkspaceCounts,
+            description: "Current active workspace counts by observation window.");
     }
 
     public Meter Meter { get; }
@@ -150,9 +173,86 @@ internal sealed class SocializeMetrics : IDisposable
             new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure"));
     }
 
+    public void UpdateWorkflowHealth(WorkflowHealthSnapshot snapshot)
+    {
+        lock (_workflowHealthLock)
+        {
+            _workflowHealthSnapshot = snapshot;
+        }
+    }
+
     public void Dispose()
     {
         Meter.Dispose();
         ActivitySource.Dispose();
     }
+
+    private Measurement<long>[] ObserveContentItemCounts()
+    {
+        WorkflowHealthSnapshot snapshot = GetWorkflowHealthSnapshot();
+        return snapshot.ContentItemsByStatus
+            .Select(pair => new Measurement<long>(
+                pair.Value,
+                new KeyValuePair<string, object?>("status", pair.Key)))
+            .ToArray();
+    }
+
+    private Measurement<long>[] ObserveFeedbackReportCounts()
+    {
+        WorkflowHealthSnapshot snapshot = GetWorkflowHealthSnapshot();
+        return snapshot.FeedbackReportsByStatus
+            .Select(pair => new Measurement<long>(
+                pair.Value,
+                new KeyValuePair<string, object?>("status", pair.Key)))
+            .ToArray();
+    }
+
+    private Measurement<long> ObservePendingInviteCount()
+    {
+        return new Measurement<long>(GetWorkflowHealthSnapshot().PendingInviteCount);
+    }
+
+    private Measurement<long> ObserveStaleApprovalCount()
+    {
+        return new Measurement<long>(GetWorkflowHealthSnapshot().StaleInApprovalCount);
+    }
+
+    private Measurement<long>[] ObserveActiveWorkspaceCounts()
+    {
+        WorkflowHealthSnapshot snapshot = GetWorkflowHealthSnapshot();
+        return
+        [
+            new Measurement<long>(
+                snapshot.ActiveWorkspaces24Hours,
+                new KeyValuePair<string, object?>("window", "24h")),
+            new Measurement<long>(
+                snapshot.ActiveWorkspaces7Days,
+                new KeyValuePair<string, object?>("window", "7d")),
+        ];
+    }
+
+    private WorkflowHealthSnapshot GetWorkflowHealthSnapshot()
+    {
+        lock (_workflowHealthLock)
+        {
+            return _workflowHealthSnapshot;
+        }
+    }
+}
+
+internal sealed record WorkflowHealthSnapshot(
+    IReadOnlyDictionary<string, int> ContentItemsByStatus,
+    IReadOnlyDictionary<string, int> FeedbackReportsByStatus,
+    int PendingInviteCount,
+    int StaleInApprovalCount,
+    int ActiveWorkspaces24Hours,
+    int ActiveWorkspaces7Days)
+{
+    public static WorkflowHealthSnapshot Empty { get; } = new(
+        new Dictionary<string, int>(StringComparer.Ordinal),
+        new Dictionary<string, int>(StringComparer.Ordinal),
+        0,
+        0,
+        0,
+        0);
 }
diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs b/backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs
new file mode 100644
index 00000000..8c0fb66e
--- /dev/null
+++ b/backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs
@@ -0,0 +1,102 @@
+using Microsoft.EntityFrameworkCore;
+using Socialize.Api.Data;
+using Socialize.Api.Modules.Feedback.Data;
+using Socialize.Api.Modules.Workspaces.Data;
+
+namespace Socialize.Api.Infrastructure.Observability;
+
+internal sealed class WorkflowHealthSamplerService(
+    IServiceScopeFactory scopeFactory,
+    SocializeMetrics metrics,
+    ILogger<WorkflowHealthSamplerService> logger)
+    : BackgroundService
+{
+    private static readonly TimeSpan SampleInterval = TimeSpan.FromMinutes(5);
+    private static readonly TimeSpan StaleApprovalThreshold = TimeSpan.FromDays(3);
+
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        await SampleAsync(stoppingToken);
+
+        using PeriodicTimer timer = new(SampleInterval);
+        while (!stoppingToken.IsCancellationRequested)
+        {
+            try
+            {
+                await timer.WaitForNextTickAsync(stoppingToken);
+                await SampleAsync(stoppingToken);
+            }
+            catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
+            {
+                logger.LogDebug(ex, "Workflow health sampler stopped.");
+            }
+        }
+    }
+
+    private async Task SampleAsync(CancellationToken stoppingToken)
+    {
+        try
+        {
+            using IServiceScope scope = scopeFactory.CreateScope();
+            AppDbContext dbContext = scope.ServiceProvider.GetRequiredService<AppDbContext>();
+            DateTimeOffset now = DateTimeOffset.UtcNow;
+            DateTimeOffset staleApprovalCutoff = now.Subtract(StaleApprovalThreshold);
+            DateTimeOffset active24HourCutoff = now.AddHours(-24);
+            DateTimeOffset active7DayCutoff = now.AddDays(-7);
+
+            Dictionary<string, int> contentItemsByStatus = await dbContext.ContentItems
+                .GroupBy(item => item.Status)
+                .Select(group => new { Status = group.Key, Count = group.Count() })
+                .ToDictionaryAsync(group => group.Status, group => group.Count, StringComparer.Ordinal, stoppingToken);
+
+            Dictionary<string, int> feedbackReportsByStatus = await dbContext.FeedbackReports
+                .GroupBy(report => report.Status)
+                .Select(group => new { Status = group.Key, Count = group.Count() })
+                .ToDictionaryAsync(
+                    group => group.Status == FeedbackStatus.WontDo ? "WontDo" : group.Status.ToString(),
+                    group => group.Count,
+                    StringComparer.Ordinal,
+                    stoppingToken);
+
+            int pendingInviteCount = await dbContext.WorkspaceInvites
+                .CountAsync(invite => invite.Status == WorkspaceInviteStatuses.Pending, stoppingToken);
+
+            int staleInApprovalCount = await dbContext.ContentItems
+                .CountAsync(
+                    item => item.Status == "In approval" && item.CreatedAt <= staleApprovalCutoff,
+                    stoppingToken);
+
+            int activeWorkspaces24Hours = await dbContext.ContentItemActivityEntries
+                .Where(entry => entry.CreatedAt >= active24HourCutoff)
+                .Select(entry => entry.WorkspaceId)
+                .Distinct()
+                .CountAsync(stoppingToken);
+
+            int activeWorkspaces7Days = await dbContext.ContentItemActivityEntries
+                .Where(entry => entry.CreatedAt >= active7DayCutoff)
+                .Select(entry => entry.WorkspaceId)
+                .Distinct()
+                .CountAsync(stoppingToken);
+
+            metrics.UpdateWorkflowHealth(new WorkflowHealthSnapshot(
+                contentItemsByStatus,
+                feedbackReportsByStatus,
+                pendingInviteCount,
+                staleInApprovalCount,
+                activeWorkspaces24Hours,
+                activeWorkspaces7Days));
+            metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), true);
+        }
+        catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
+        {
+            logger.LogDebug(ex, "Workflow health sampler stopped.");
+        }
+#pragma warning disable CA1031
+        catch (Exception ex)
+        {
+            metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), false);
+            logger.LogError(ex, "Workflow health sampling failed.");
+        }
+#pragma warning restore CA1031
+    }
+}
diff --git a/deploy/observability/alertmanager/alertmanager.yml b/deploy/observability/alertmanager/alertmanager.yml
new file mode 100644
index 00000000..a9f25d94
--- /dev/null
+++ b/deploy/observability/alertmanager/alertmanager.yml
@@ -0,0 +1,22 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: preprod-webhook
+  group_by:
+    - alertname
+    - service
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  routes:
+    - matchers:
+        - severity="critical"
+      receiver: preprod-webhook
+      repeat_interval: 30m
+
+receivers:
+  - name: preprod-webhook
+    webhook_configs:
+      - url: ${ALERTMANAGER_WEBHOOK_URL}
+        send_resolved: true
diff --git a/deploy/observability/blackbox/config.yml b/deploy/observability/blackbox/config.yml
new file mode 100644
index 00000000..fa8e76f0
--- /dev/null
+++ b/deploy/observability/blackbox/config.yml
@@ -0,0 +1,9 @@
+modules:
+  http_2xx:
+    prober: http
+    timeout: 5s
+    http:
+      method: GET
+      preferred_ip_protocol: ip4
+      valid_status_codes:
+        - 200
diff --git a/deploy/observability/caddy/grafana.Caddyfile b/deploy/observability/caddy/grafana.Caddyfile
new file mode 100644
index 00000000..4eca957d
--- /dev/null
+++ b/deploy/observability/caddy/grafana.Caddyfile
@@ -0,0 +1,13 @@
+# Optional Caddy snippet for exposing Grafana through a protected hostname.
+# Generate a password hash with:
+#   caddy hash-password --plaintext '<password>'
+
+{$OBSERVABILITY_HOST} {
+    encode gzip zstd
+
+    basicauth {
+        {$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
+    }
+
+    reverse_proxy grafana:3000
+}
diff --git a/deploy/observability/compose.observability.yml b/deploy/observability/compose.observability.yml
index 57048803..f42a1038 100644
--- a/deploy/observability/compose.observability.yml
+++ b/deploy/observability/compose.observability.yml
@@ -26,6 +26,7 @@ services:
       - prometheus
       - loki
       - tempo
+      - alertmanager
     networks:
       - internal
 
@@ -44,6 +45,31 @@ services:
     networks:
       - internal
 
+  alertmanager:
+    image: prom/alertmanager:v0.29.0
+    restart: unless-stopped
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+      - --storage.path=/alertmanager
+      - --config.expand-env
+    environment:
+      ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
+    volumes:
+      - alertmanager-data:/alertmanager
+      - ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+    networks:
+      - internal
+
+  blackbox:
+    image: prom/blackbox-exporter:v0.27.0
+    restart: unless-stopped
+    command:
+      - --config.file=/etc/blackbox_exporter/config.yml
+    volumes:
+      - ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
+    networks:
+      - internal
+
   loki:
     image: grafana/loki:3.7.1
     restart: unless-stopped
@@ -84,6 +110,7 @@ services:
       - internal
 
 volumes:
+  alertmanager-data:
   grafana-data:
   prometheus-data:
   loki-data:
diff --git a/deploy/observability/grafana/dashboards/socialize-overview.json b/deploy/observability/grafana/dashboards/socialize-overview.json
index 89599631..fcd9c6a9 100644
--- a/deploy/observability/grafana/dashboards/socialize-overview.json
+++ b/deploy/observability/grafana/dashboards/socialize-overview.json
@@ -333,6 +333,78 @@
       "title": "Operational Events, 1h Rolling",
       "type": "timeseries"
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "Prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 20
+      },
+      "id": 11,
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "targets": [
+        {
+          "expr": "socialize_workflow_content_items",
+          "legendFormat": "content {{status}}"
+        },
+        {
+          "expr": "socialize_workflow_feedback_reports",
+          "legendFormat": "feedback {{status}}"
+        }
+      ],
+      "title": "Workflow Backlog",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "Prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 20
+      },
+      "id": 12,
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "targets": [
+        {
+          "expr": "socialize_workflow_active_workspaces",
+          "legendFormat": "active workspaces {{window}}"
+        },
+        {
+          "expr": "socialize_workflow_stale_in_approval",
+          "legendFormat": "stale in approval"
+        },
+        {
+          "expr": "socialize_workflow_pending_invites",
+          "legendFormat": "pending invites"
+        }
+      ],
+      "title": "Workflow Health",
+      "type": "timeseries"
+    },
     {
       "datasource": {
         "type": "prometheus",
@@ -342,7 +414,7 @@
       "h": 7,
       "w": 24,
       "x": 0,
-      "y": 20
+      "y": 28
     },
     "id": 9,
     "options": {
@@ -368,7 +440,7 @@
       "h": 9,
       "w": 24,
       "x": 0,
-      "y": 27
+      "y": 35
     },
     "id": 10,
     "options": {
diff --git a/deploy/observability/prometheus/prometheus.yml b/deploy/observability/prometheus/prometheus.yml
index a782e6b0..81231d10 100644
--- a/deploy/observability/prometheus/prometheus.yml
+++ b/deploy/observability/prometheus/prometheus.yml
@@ -2,6 +2,12 @@ global:
   scrape_interval: 15s
   evaluation_interval: 15s
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
 rule_files:
   - /etc/prometheus/rules/*.yml
 
@@ -15,3 +21,22 @@ scrape_configs:
     static_configs:
       - targets:
           - alloy:12345
+
+  - job_name: preprod-uptime
+    metrics_path: /probe
+    params:
+      module:
+        - http_2xx
+    static_configs:
+      - targets:
+          - http://web/
+          - http://api:8080/health/ready
+    relabel_configs:
+      - source_labels:
+          - __address__
+        target_label: __param_target
+      - source_labels:
+          - __param_target
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox:9115
diff --git a/deploy/observability/prometheus/rules/socialize-alerts.yml b/deploy/observability/prometheus/rules/socialize-alerts.yml
index 20a08e43..8fd0060c 100644
--- a/deploy/observability/prometheus/rules/socialize-alerts.yml
+++ b/deploy/observability/prometheus/rules/socialize-alerts.yml
@@ -11,6 +11,16 @@ groups:
           summary: Socialize API telemetry is missing
           description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
 
+      - alert: SocializePreprodEndpointDown
+        expr: probe_success{job="preprod-uptime"} == 0
+        for: 2m
+        labels:
+          severity: critical
+          service: socialize-preprod
+        annotations:
+          summary: Preprod endpoint is down
+          description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
+
       - alert: SocializeApiHighErrorRate
         expr: |
           (
@@ -56,6 +66,26 @@ groups:
           summary: Socialize core usage is quiet
           description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
 
+      - alert: SocializeContentStaleInApproval
+        expr: socialize_workflow_stale_in_approval > 0
+        for: 30m
+        labels:
+          severity: warning
+          service: socialize-api
+        annotations:
+          summary: Content is stale in approval
+          description: One or more content items have been in approval longer than the configured threshold.
+
+      - alert: SocializeNoActiveWorkspaces
+        expr: socialize_workflow_active_workspaces{window="24h"} < 1
+        for: 1h
+        labels:
+          severity: info
+          service: socialize-api
+        annotations:
+          summary: No active workspaces in the last 24 hours
+          description: No workspace has had content workflow activity in the last 24 hours.
+
       - alert: SocializeFeedbackBugSubmitted
         expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
         for: 0m
diff --git a/docs/FEATURES/observability.md b/docs/FEATURES/observability.md
index 0af8e6fc..3104907f 100644
--- a/docs/FEATURES/observability.md
+++ b/docs/FEATURES/observability.md
@@ -78,3 +78,17 @@ Initial alerts should cover:
 - email delivery failures
 - blob storage failures
 - background job failures
+
+## Workflow Health Gauges
+
+Database-derived workflow health metrics should be sampled periodically instead of emitted per request.
+
+Initial gauges should cover:
+
+- content item counts by status
+- feedback report counts by status
+- pending workspace invites
+- content stale in approval
+- active workspace counts over 24-hour and 7-day windows
+
+These are operator health signals. They should stay aggregated enough to avoid high-cardinality metric labels.
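+
+As a quick spot check (a sketch, not shipped tooling: it assumes the default
+OpenTelemetry dot-to-underscore metric name translation and the `prometheus`
+service name from the observability overlay), a gauge can be queried in place
+with `promtool`, which ships in the Prometheus image:
+
+```bash
+# Query the current stale-in-approval gauge from inside the compose network.
+# A non-empty result confirms the sampler, OTLP pipeline, and scrape are wired.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  exec prometheus promtool query instant http://localhost:9090 \
+  socialize_workflow_stale_in_approval
+```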
diff --git a/docs/OPERATIONS/observability-runbook.md b/docs/OPERATIONS/observability-runbook.md
new file mode 100644
index 00000000..ee7ef64f
--- /dev/null
+++ b/docs/OPERATIONS/observability-runbook.md
@@ -0,0 +1,163 @@
+# Observability Runbook
+
+## Purpose
+
+This runbook covers preproduction operation of Socialize's self-hosted observability stack.
+
+The goal is to answer:
+
+- Is the app reachable?
+- Is the API healthy?
+- Are errors or latency rising?
+- Are users exercising core workflows?
+- Are emails, blob storage, and background jobs failing?
+- Is work getting stuck?
+
+## Start The Stack
+
+Run from the repository root on the preproduction host:
+
+```bash
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d
+```
+
+Grafana listens on `127.0.0.1:3000` by default. Set `GRAFANA_HTTP_BIND=0.0.0.0`
+only when Grafana is protected by a reverse proxy, VPN, firewall rule, or SSH tunnel.
+
+Set these before exposing Grafana:
+
+```bash
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=<strong password>
+```
+
+## Alert Delivery
+
+Prometheus sends alerts to Alertmanager. Alertmanager sends alerts to the webhook
+configured by:
+
+```bash
+ALERTMANAGER_WEBHOOK_URL=<private webhook endpoint>
+```
+
+If no webhook URL is configured, Alertmanager still starts, but alert delivery points
+to a local discard endpoint.
+
+Critical alerts repeat every 30 minutes. Other alerts repeat every 4 hours.
+
+## Secure Grafana With Caddy
+
+An optional Caddy snippet is available at:
+
+```txt
+deploy/observability/caddy/grafana.Caddyfile
+```
+
+Generate a Caddy password hash:
+
+```bash
+caddy hash-password --plaintext '<password>'
+```
+
+Configure:
+
+```bash
+OBSERVABILITY_HOST=observability.example.com
+GRAFANA_BASIC_AUTH_USER=<user>
+GRAFANA_BASIC_AUTH_HASH=<hash from caddy hash-password>
+```
+
+Keep Grafana private unless the hostname is protected.
+
+## First Bring-Up Checks
+
+1. Confirm containers are running:
+
+```bash
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml ps
+```
+
+2. Check API health:
+
+```bash
+curl -i http://127.0.0.1:8080/health
+curl -i http://127.0.0.1:8080/health/ready
+```
+
+3. Open Grafana and check the `Socialize Overview` dashboard.
+
+4. Generate a few real actions:
+
+- log in
+- create a content item
+- add a comment
+- submit feedback
+- create a workspace invite
+
+5. Confirm metrics appear in the dashboard:
+
+- API request rate
+- usage signals
+- workflow backlog
+- operational events
+
+## Alert Triage
+
+`SocializePreprodEndpointDown`
+
+- Check `docker compose ps`.
+- Check `docker compose logs api web`.
+- Check `/health/ready`.
+
+`SocializeApiTelemetryMissing`
+
+- Check that `api` has `OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317`.
+- Check `docker compose logs alloy`.
+- Check whether the API is receiving traffic.
+
+`SocializeApiHighErrorRate`
+
+- Open the API logs panel.
+- Filter by recent `5xx` requests.
+- Open Tempo traces for slow or failing requests if available.
+
+`SocializeApiHighLatency`
+
+- Check the p95 latency by endpoint panel.
+- Inspect slow traces.
+- Check database health and recent deploy activity.
+
+`SocializeEmailDeliveryFailures`
+
+- Check API logs for Resend failures.
+- Confirm `RESEND_API_KEY` and `RESEND_FROM_EMAIL`.
+- Confirm Resend service status outside this stack if needed.
+
+`SocializeBlobStorageFailures`
+
+- Confirm `./blob-storage` volume permissions on the preprod host.
+- Check local disk space.
+- Check API logs for validation or filesystem errors.
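+
+A minimal sketch of those host-side checks (the `./blob-storage` path and the
+`api` service name are taken from this runbook; adjust them to the actual bind
+mount and compose service names):
+
+```bash
+# Free disk space on the filesystem backing the bind mount.
+df -h .
+# Ownership and permissions on the blob storage directory.
+ls -ld ./blob-storage
+# Recent API log lines that mention blob handling.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  logs --since 1h api | grep -i blob
+```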
+
+`SocializeBackgroundJobFailures`
+
+- Check the operational events panel for the failing job name.
+- Check API logs for the same time window.
+
+`SocializeContentStaleInApproval`
+
+- Use the app to inspect content currently in approval.
+- If follow-up is needed, contact the internal owner or the client's point of contact outside the app.
+
+`SocializeCoreUsageQuiet` or `SocializeNoActiveWorkspaces`
+
+- Confirm whether quiet usage is expected for the period.
+- If not expected, check login events and API reachability.
+
+## Retention Defaults
+
+- Prometheus keeps 15 days of metrics by default, configurable through `PROMETHEUS_RETENTION`.
+- Tempo keeps traces for 168 hours (7 days).
+- Loki uses local filesystem storage for preproduction.
+
+Tune retention before heavy customer usage or long-running demos.
diff --git a/docs/TASKS/observability/003-preprod-operations-loop.md b/docs/TASKS/observability/003-preprod-operations-loop.md
new file mode 100644
index 00000000..6f4a706c
--- /dev/null
+++ b/docs/TASKS/observability/003-preprod-operations-loop.md
@@ -0,0 +1,34 @@
+# Observability 003: Preprod Operations Loop
+
+## Goal
+
+Close the preproduction operations loop by adding alert delivery scaffolding, uptime probes, workflow health gauges, secured Grafana guidance, and an operator runbook.
+
+## Feature Spec
+
+- `docs/FEATURES/observability.md`
+
+## Scope
+
+- Add Alertmanager to the optional observability compose overlay.
+- Add Blackbox Exporter uptime probes for the web container and API readiness endpoint.
+- Add backend database-derived workflow health gauges.
+- Add Prometheus alerts for uptime probes and workflow health.
+- Add an optional Caddy snippet for protected Grafana exposure.
+- Add an operator runbook for bring-up, alert triage, and security defaults.
+
+## Out Of Scope
+
+- Operating the remote preproduction host.
+- Choosing the final alert destination.
+- Client-facing status page.
+- External third-party uptime monitoring.
+
+## Validation
+
+```bash
+dotnet build backend/Socialize.slnx
+dotnet test backend/Socialize.slnx
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
+jq empty deploy/observability/grafana/dashboards/socialize-overview.json
+```
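+
+Once the stack is up, the alert wiring can optionally be checked in place (a
+sketch assuming the `prometheus` and `alertmanager` service names from the
+compose overlay; `promtool` ships in the upstream Prometheus image, and the
+rules path matches the glob referenced by `prometheus.yml`):
+
+```bash
+# Lint the provisioned alert rules inside the Prometheus container.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  exec prometheus promtool check rules /etc/prometheus/rules/socialize-alerts.yml
+# Confirm Alertmanager is up and answering on its health endpoint.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  exec alertmanager wget -qO- http://localhost:9093/-/healthy
+```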