feat: close preprod observability loop

This commit is contained in:
2026-05-08 15:48:56 -04:00
parent 8bcff96821
commit 986c7efea6
14 changed files with 618 additions and 2 deletions

View File

@@ -113,6 +113,10 @@ host. Prometheus alert rules are provisioned under
`deploy/observability/prometheus/rules/`; notification delivery is intentionally `deploy/observability/prometheus/rules/`; notification delivery is intentionally
left to the preprod operations environment. left to the preprod operations environment.
Set `ALERTMANAGER_WEBHOOK_URL` to route alerts to a private notification endpoint.
See `docs/OPERATIONS/observability-runbook.md` for bring-up, alert triage, and
the optional protected Caddy configuration for Grafana.
## Solution ## Solution
```bash ```bash

View File

@@ -48,6 +48,7 @@ internal static class ObservabilityRegistration
} }
builder.Services.AddSingleton<SocializeMetrics>(); builder.Services.AddSingleton<SocializeMetrics>();
builder.Services.AddHostedService<WorkflowHealthSamplerService>();
builder.Services builder.Services
.AddOpenTelemetry() .AddOpenTelemetry()
.ConfigureResource(resource => resource.AddService( .ConfigureResource(resource => resource.AddService(

View File

@@ -19,6 +19,8 @@ internal sealed class SocializeMetrics : IDisposable
private readonly Counter<long> _organizationCreatedCounter; private readonly Counter<long> _organizationCreatedCounter;
private readonly Counter<long> _workspaceCreatedCounter; private readonly Counter<long> _workspaceCreatedCounter;
private readonly Counter<long> _workspaceInviteCreatedCounter; private readonly Counter<long> _workspaceInviteCreatedCounter;
private readonly object _workflowHealthLock = new();
private WorkflowHealthSnapshot _workflowHealthSnapshot = WorkflowHealthSnapshot.Empty;
public SocializeMetrics() public SocializeMetrics()
{ {
@@ -58,6 +60,27 @@ internal sealed class SocializeMetrics : IDisposable
_backgroundJobRunCounter = Meter.CreateCounter<long>( _backgroundJobRunCounter = Meter.CreateCounter<long>(
"socialize.background_job.runs", "socialize.background_job.runs",
description: "Background job runs partitioned by job and outcome."); description: "Background job runs partitioned by job and outcome.");
Meter.CreateObservableGauge(
"socialize.workflow.content_items",
ObserveContentItemCounts,
description: "Current content item counts by status.");
Meter.CreateObservableGauge(
"socialize.workflow.feedback_reports",
ObserveFeedbackReportCounts,
description: "Current feedback report counts by status.");
Meter.CreateObservableGauge(
"socialize.workflow.pending_invites",
ObservePendingInviteCount,
description: "Current pending workspace invite count.");
Meter.CreateObservableGauge(
"socialize.workflow.stale_in_approval",
ObserveStaleApprovalCount,
description: "Current count of content items in approval longer than the configured stale threshold.");
Meter.CreateObservableGauge(
"socialize.workflow.active_workspaces",
ObserveActiveWorkspaceCounts,
description: "Current active workspace counts by observation window.");
} }
public Meter Meter { get; } public Meter Meter { get; }
@@ -150,9 +173,86 @@ internal sealed class SocializeMetrics : IDisposable
new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure")); new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure"));
} }
/// <summary>
/// Publishes a freshly sampled workflow health snapshot so the observable
/// gauges registered in the constructor report it on the next collection.
/// </summary>
/// <param name="snapshot">The complete, immutable snapshot to expose.</param>
public void UpdateWorkflowHealth(WorkflowHealthSnapshot snapshot)
{
// Lock pairs with GetWorkflowHealthSnapshot so readers on the metrics
// collection thread always see a fully published snapshot reference.
lock (_workflowHealthLock)
{
_workflowHealthSnapshot = snapshot;
}
}
public void Dispose() public void Dispose()
{ {
Meter.Dispose(); Meter.Dispose();
ActivitySource.Dispose(); ActivitySource.Dispose();
} }
// Gauge callback: one tagged measurement per content item status bucket,
// read from the most recently published snapshot.
private Measurement<int>[] ObserveContentItemCounts()
{
    WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();
    var measurements = new List<Measurement<int>>(current.ContentItemsByStatus.Count);
    foreach (KeyValuePair<string, int> entry in current.ContentItemsByStatus)
    {
        measurements.Add(new Measurement<int>(
            entry.Value,
            new KeyValuePair<string, object?>("status", entry.Key)));
    }
    return measurements.ToArray();
}
// Gauge callback: one tagged measurement per feedback report status bucket.
private Measurement<int>[] ObserveFeedbackReportCounts()
{
    WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();

    // Local projection keeps the LINQ pipeline below readable.
    static Measurement<int> ToMeasurement(KeyValuePair<string, int> entry) =>
        new(entry.Value, new KeyValuePair<string, object?>("status", entry.Key));

    return current.FeedbackReportsByStatus.Select(ToMeasurement).ToArray();
}
// Gauge callback: current pending workspace invite count (untagged).
private Measurement<int> ObservePendingInviteCount() =>
    new(GetWorkflowHealthSnapshot().PendingInviteCount);
// Gauge callback: count of content items stuck in approval past the
// sampler's stale threshold (untagged).
private Measurement<int> ObserveStaleApprovalCount() =>
    new(GetWorkflowHealthSnapshot().StaleInApprovalCount);
// Gauge callback: active workspace counts, tagged by observation window
// ("24h" and "7d") so both series share one instrument.
private Measurement<int>[] ObserveActiveWorkspaceCounts()
{
    WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();
    var last24Hours = new Measurement<int>(
        current.ActiveWorkspaces24Hours,
        new KeyValuePair<string, object?>("window", "24h"));
    var last7Days = new Measurement<int>(
        current.ActiveWorkspaces7Days,
        new KeyValuePair<string, object?>("window", "7d"));
    return new[] { last24Hours, last7Days };
}
/// <summary>
/// Returns the most recently published snapshot for the gauge callbacks.
/// </summary>
private WorkflowHealthSnapshot GetWorkflowHealthSnapshot()
{
// Lock pairs with UpdateWorkflowHealth; the snapshot record is immutable,
// so returning the reference outside the lock is safe.
lock (_workflowHealthLock)
{
return _workflowHealthSnapshot;
}
}
}
/// <summary>
/// Immutable point-in-time view of database-derived workflow health,
/// produced by the periodic sampler and read by the observable gauges.
/// </summary>
/// <param name="ContentItemsByStatus">Content item counts keyed by status label.</param>
/// <param name="FeedbackReportsByStatus">Feedback report counts keyed by status label.</param>
/// <param name="PendingInviteCount">Current pending workspace invite count.</param>
/// <param name="StaleInApprovalCount">Content items in approval longer than the stale threshold.</param>
/// <param name="ActiveWorkspaces24Hours">Distinct workspaces with activity in the last 24 hours.</param>
/// <param name="ActiveWorkspaces7Days">Distinct workspaces with activity in the last 7 days.</param>
internal sealed record WorkflowHealthSnapshot(
IReadOnlyDictionary<string, int> ContentItemsByStatus,
IReadOnlyDictionary<string, int> FeedbackReportsByStatus,
int PendingInviteCount,
int StaleInApprovalCount,
int ActiveWorkspaces24Hours,
int ActiveWorkspaces7Days)
{
// Zero-valued snapshot used before the first sample completes; Ordinal
// comparers match the sampler's dictionary construction.
public static WorkflowHealthSnapshot Empty { get; } = new(
new Dictionary<string, int>(StringComparer.Ordinal),
new Dictionary<string, int>(StringComparer.Ordinal),
0,
0,
0,
0);
} }

View File

@@ -0,0 +1,102 @@
using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data;
using Socialize.Api.Modules.Feedback.Data;
using Socialize.Api.Modules.Workspaces.Data;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Background service that periodically samples database-derived workflow
/// health (content/feedback status counts, pending invites, stale approvals,
/// active workspaces) and publishes the result to <see cref="SocializeMetrics"/>
/// for the observable gauges to report.
/// </summary>
internal sealed class WorkflowHealthSamplerService(
IServiceScopeFactory scopeFactory,
SocializeMetrics metrics,
ILogger<WorkflowHealthSamplerService> logger)
: BackgroundService
{
// How often the database is sampled; gauges report the last sample between ticks.
private static readonly TimeSpan SampleInterval = TimeSpan.FromMinutes(5);
// Content items in approval longer than this count as "stale".
private static readonly TimeSpan StaleApprovalThreshold = TimeSpan.FromDays(3);
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
// Sample immediately at startup so gauges are populated before the
// first timer tick (otherwise they would report Empty for 5 minutes).
await SampleAsync(stoppingToken);
using PeriodicTimer timer = new(SampleInterval);
while (!stoppingToken.IsCancellationRequested)
{
try
{
await timer.WaitForNextTickAsync(stoppingToken);
await SampleAsync(stoppingToken);
}
// WaitForNextTickAsync throws on cancellation; treat as orderly shutdown,
// then the loop condition exits.
catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
{
logger.LogDebug(ex, "Workflow health sampler stopped.");
}
}
}
// Runs one full sampling pass; swallows (and records) failures so a bad
// sample never kills the hosted service loop.
private async Task SampleAsync(CancellationToken stoppingToken)
{
try
{
// A fresh scope per sample because AppDbContext is scoped while this
// service is a singleton.
using IServiceScope scope = scopeFactory.CreateScope();
AppDbContext dbContext = scope.ServiceProvider.GetRequiredService<AppDbContext>();
DateTimeOffset now = DateTimeOffset.UtcNow;
DateTimeOffset staleApprovalCutoff = now.Subtract(StaleApprovalThreshold);
DateTimeOffset active24HourCutoff = now.AddHours(-24);
DateTimeOffset active7DayCutoff = now.AddDays(-7);
// Content item counts grouped by status; Status appears to be a string
// column here (grouped and keyed directly) — NOTE(review): confirm.
Dictionary<string, int> contentItemsByStatus = await dbContext.ContentItems
.GroupBy(item => item.Status)
.Select(group => new { Status = group.Key, Count = group.Count() })
.ToDictionaryAsync(group => group.Status, group => group.Count, StringComparer.Ordinal, stoppingToken);
// Feedback counts by status. The WontDo special-case presumably guards
// against an enum member whose ToString differs from the desired label —
// TODO confirm; Enum.ToString of a member named WontDo would already be "WontDo".
Dictionary<string, int> feedbackReportsByStatus = await dbContext.FeedbackReports
.GroupBy(report => report.Status)
.Select(group => new { Status = group.Key, Count = group.Count() })
.ToDictionaryAsync(
group => group.Status == FeedbackStatus.WontDo ? "WontDo" : group.Status.ToString(),
group => group.Count,
StringComparer.Ordinal,
stoppingToken);
int pendingInviteCount = await dbContext.WorkspaceInvites
.CountAsync(invite => invite.Status == WorkspaceInviteStatuses.Pending, stoppingToken);
// NOTE(review): "In approval" is a magic status string — verify it matches
// the canonical status constant used by the content module.
int staleInApprovalCount = await dbContext.ContentItems
.CountAsync(
item => item.Status == "In approval" && item.CreatedAt <= staleApprovalCutoff,
stoppingToken);
// Distinct workspaces with any activity entry inside each window.
int activeWorkspaces24Hours = await dbContext.ContentItemActivityEntries
.Where(entry => entry.CreatedAt >= active24HourCutoff)
.Select(entry => entry.WorkspaceId)
.Distinct()
.CountAsync(stoppingToken);
int activeWorkspaces7Days = await dbContext.ContentItemActivityEntries
.Where(entry => entry.CreatedAt >= active7DayCutoff)
.Select(entry => entry.WorkspaceId)
.Distinct()
.CountAsync(stoppingToken);
// Publish atomically so gauges never observe a half-updated sample.
metrics.UpdateWorkflowHealth(new WorkflowHealthSnapshot(
contentItemsByStatus,
feedbackReportsByStatus,
pendingInviteCount,
staleInApprovalCount,
activeWorkspaces24Hours,
activeWorkspaces7Days));
metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), true);
}
// Cancellation mid-query during shutdown is expected; not a failure run.
catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
{
logger.LogDebug(ex, "Workflow health sampler stopped.");
}
// Broad catch is deliberate (CA1031 suppressed): record the failed run and
// keep the sampler alive for the next tick.
#pragma warning disable CA1031
catch (Exception ex)
{
metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), false);
logger.LogError(ex, "Workflow health sampling failed.");
}
#pragma warning restore CA1031
}
}

View File

@@ -0,0 +1,22 @@
global:
resolve_timeout: 5m
route:
receiver: preprod-webhook
group_by:
- alertname
- service
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
routes:
- matchers:
- severity="critical"
receiver: preprod-webhook
repeat_interval: 30m
receivers:
- name: preprod-webhook
webhook_configs:
- url: ${ALERTMANAGER_WEBHOOK_URL}
send_resolved: true

View File

@@ -0,0 +1,9 @@
modules:
http_2xx:
prober: http
timeout: 5s
http:
method: GET
preferred_ip_protocol: ip4
valid_status_codes:
- 200

View File

@@ -0,0 +1,13 @@
# Optional Caddy snippet for exposing Grafana through a protected hostname.
# Generate a password hash with:
# caddy hash-password --plaintext '<password>'
{$OBSERVABILITY_HOST} {
encode gzip zstd
basicauth {
{$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
}
reverse_proxy grafana:3000
}

View File

@@ -26,6 +26,7 @@ services:
- prometheus - prometheus
- loki - loki
- tempo - tempo
- alertmanager
networks: networks:
- internal - internal
@@ -44,6 +45,31 @@ services:
networks: networks:
- internal - internal
alertmanager:
image: prom/alertmanager:v0.29.0
restart: unless-stopped
command:
- --config.file=/etc/alertmanager/alertmanager.yml
- --storage.path=/alertmanager
- --config.expand-env
environment:
ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
volumes:
- alertmanager-data:/alertmanager
- ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
networks:
- internal
blackbox:
image: prom/blackbox-exporter:v0.27.0
restart: unless-stopped
command:
- --config.file=/etc/blackbox_exporter/config.yml
volumes:
- ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
networks:
- internal
loki: loki:
image: grafana/loki:3.7.1 image: grafana/loki:3.7.1
restart: unless-stopped restart: unless-stopped
@@ -84,6 +110,7 @@ services:
- internal - internal
volumes: volumes:
alertmanager-data:
grafana-data: grafana-data:
prometheus-data: prometheus-data:
loki-data: loki-data:

View File

@@ -333,6 +333,78 @@
"title": "Operational Events, 1h Rolling", "title": "Operational Events, 1h Rolling",
"type": "timeseries" "type": "timeseries"
}, },
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 20
},
"id": 11,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_content_items",
"legendFormat": "content {{status}}"
},
{
"expr": "socialize_workflow_feedback_reports",
"legendFormat": "feedback {{status}}"
}
],
"title": "Workflow Backlog",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 20
},
"id": 12,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_active_workspaces",
"legendFormat": "active workspaces {{window}}"
},
{
"expr": "socialize_workflow_stale_in_approval",
"legendFormat": "stale in approval"
},
{
"expr": "socialize_workflow_pending_invites",
"legendFormat": "pending invites"
}
],
"title": "Workflow Health",
"type": "timeseries"
},
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@@ -342,7 +414,7 @@
"h": 7, "h": 7,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 20 "y": 28
}, },
"id": 9, "id": 9,
"options": { "options": {
@@ -368,7 +440,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 27 "y": 35
}, },
"id": 10, "id": 10,
"options": { "options": {

View File

@@ -2,6 +2,12 @@ global:
scrape_interval: 15s scrape_interval: 15s
evaluation_interval: 15s evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
rule_files: rule_files:
- /etc/prometheus/rules/*.yml - /etc/prometheus/rules/*.yml
@@ -15,3 +21,22 @@ scrape_configs:
static_configs: static_configs:
- targets: - targets:
- alloy:12345 - alloy:12345
- job_name: preprod-uptime
metrics_path: /probe
params:
module:
- http_2xx
static_configs:
- targets:
- http://web/
- http://api:8080/health/ready
relabel_configs:
- source_labels:
- __address__
target_label: __param_target
- source_labels:
- __param_target
target_label: instance
- target_label: __address__
replacement: blackbox:9115

View File

@@ -11,6 +11,16 @@ groups:
summary: Socialize API telemetry is missing summary: Socialize API telemetry is missing
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down. description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
- alert: SocializePreprodEndpointDown
expr: probe_success{job="preprod-uptime"} == 0
for: 2m
labels:
severity: critical
service: socialize-preprod
annotations:
summary: Preprod endpoint is down
description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
- alert: SocializeApiHighErrorRate - alert: SocializeApiHighErrorRate
expr: | expr: |
( (
@@ -56,6 +66,26 @@ groups:
summary: Socialize core usage is quiet summary: Socialize core usage is quiet
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours. description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
- alert: SocializeContentStaleInApproval
expr: socialize_workflow_stale_in_approval > 0
for: 30m
labels:
severity: warning
service: socialize-api
annotations:
summary: Content is stale in approval
description: One or more content items have been in approval longer than the configured threshold.
- alert: SocializeNoActiveWorkspaces
expr: socialize_workflow_active_workspaces{window="24h"} < 1
for: 1h
labels:
severity: info
service: socialize-api
annotations:
summary: No active workspaces in the last 24 hours
description: No workspace has had content workflow activity in the last 24 hours.
- alert: SocializeFeedbackBugSubmitted - alert: SocializeFeedbackBugSubmitted
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0 expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
for: 0m for: 0m

View File

@@ -78,3 +78,17 @@ Initial alerts should cover:
- email delivery failures - email delivery failures
- blob storage failures - blob storage failures
- background job failures - background job failures
## Workflow Health Gauges
Database-derived workflow health metrics should be sampled periodically instead of emitted per request.
Initial gauges should cover:
- content item counts by status
- feedback report counts by status
- pending workspace invites
- content stale in approval
- active workspace counts over 24-hour and 7-day windows
These are operator health signals. They should remain aggregated enough to avoid high-cardinality metric labels.

View File

@@ -0,0 +1,163 @@
# Observability Runbook
## Purpose
This runbook is for preproduction operation of Socialize's self-hosted observability stack.
The goal is to answer:
- Is the app reachable?
- Is the API healthy?
- Are errors or latency rising?
- Are users exercising core workflows?
- Are emails, blob storage, and background jobs failing?
- Is work getting stuck?
## Start The Stack
Run from the repository root on the preproduction host:
```bash
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d
```
Grafana listens on `127.0.0.1:3000` by default. Set `GRAFANA_HTTP_BIND=0.0.0.0`
only when Grafana is protected by a reverse proxy, VPN, firewall rule, or SSH tunnel.
Set these before exposing Grafana:
```bash
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=<strong-password>
```
## Alert Delivery
Prometheus sends alerts to Alertmanager. Alertmanager sends alerts to the webhook
configured by:
```bash
ALERTMANAGER_WEBHOOK_URL=<private-alert-webhook-url>
```
If no webhook URL is configured, Alertmanager still starts but alert delivery points
to a local discard endpoint.
Critical alerts repeat every 30 minutes. Other alerts repeat every 4 hours.
## Secure Grafana With Caddy
An optional Caddy snippet is available at:
```txt
deploy/observability/caddy/grafana.Caddyfile
```
Generate a Caddy password hash:
```bash
caddy hash-password --plaintext '<password>'
```
Configure:
```bash
OBSERVABILITY_HOST=observability.example.com
GRAFANA_BASIC_AUTH_USER=<user>
GRAFANA_BASIC_AUTH_HASH=<hash>
```
Keep Grafana private unless the hostname is protected.
## First Bring-Up Checks
1. Confirm containers are running:
```bash
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml ps
```
2. Check API health:
```bash
curl -i http://127.0.0.1:8080/health
curl -i http://127.0.0.1:8080/health/ready
```
3. Open Grafana and check the `Socialize Overview` dashboard.
4. Generate a few real actions:
- log in
- create a content item
- add a comment
- submit feedback
- create a workspace invite
5. Confirm metrics appear in the dashboard:
- API request rate
- usage signals
- workflow backlog
- operational events
## Alert Triage
`SocializePreprodEndpointDown`
- Check `docker compose ps`.
- Check `docker compose logs api web`.
- Check `/health/ready`.
`SocializeApiTelemetryMissing`
- Check that `api` has `OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317`.
- Check `docker compose logs alloy`.
- Check whether the API is receiving traffic.
`SocializeApiHighErrorRate`
- Open the API logs panel.
- Filter by recent `5xx` requests.
- Open Tempo traces for slow or failing requests if available.
`SocializeApiHighLatency`
- Check the p95 latency by endpoint panel.
- Inspect slow traces.
- Check database health and recent deploy activity.
`SocializeEmailDeliveryFailures`
- Check API logs for Resend failures.
- Confirm `RESEND_API_KEY` and `RESEND_FROM_EMAIL`.
- Confirm Resend service status outside this stack if needed.
`SocializeBlobStorageFailures`
- Confirm `./blob-storage` volume permissions on the preprod host.
- Check local disk space.
- Check API logs for validation or filesystem errors.
`SocializeBackgroundJobFailures`
- Check the operational events panel for the failing job name.
- Check API logs for the same time window.
`SocializeContentStaleInApproval`
- Use the app to inspect content currently in approval.
- Contact the relevant internal owner or client contact outside the app if needed.
`SocializeCoreUsageQuiet` or `SocializeNoActiveWorkspaces`
- Confirm whether quiet usage is expected for the period.
- If not expected, check login events and API reachability.
## Retention Defaults
- Prometheus keeps 15 days by default through `PROMETHEUS_RETENTION`.
- Tempo keeps traces for 168 hours.
- Loki uses local filesystem storage for preproduction.
Tune retention before heavy customer usage or long-running demos.

View File

@@ -0,0 +1,34 @@
# Observability 003: Preprod Operations Loop
## Goal
Close the preproduction operations loop by adding alert delivery scaffolding, uptime probes, workflow health gauges, secured Grafana guidance, and an operator runbook.
## Feature Spec
- `docs/FEATURES/observability.md`
## Scope
- Add Alertmanager to the optional observability compose overlay.
- Add Blackbox Exporter uptime probes for the web container and API readiness endpoint.
- Add backend database-derived workflow health gauges.
- Add Prometheus alerts for uptime probes and workflow health.
- Add an optional Caddy snippet for protected Grafana exposure.
- Add an operator runbook for bring-up, alert triage, and security defaults.
## Out Of Scope
- Operating the remote preproduction host.
- Choosing the final alert destination.
- Client-facing status page.
- External third-party uptime monitoring.
## Validation
```bash
dotnet build backend/Socialize.slnx
dotnet test backend/Socialize.slnx
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
jq empty deploy/observability/grafana/dashboards/socialize-overview.json
```