From 986c7efea64bb62105ff267f0f8e48e8528d57e5 Mon Sep 17 00:00:00 2001 From: Jonathan Bourdon Date: Fri, 8 May 2026 15:48:56 -0400 Subject: [PATCH] feat: close preprod observability loop --- README.md | 4 + .../ObservabilityRegistration.cs | 1 + .../Observability/SocializeMetrics.cs | 100 +++++++++++ .../WorkflowHealthSamplerService.cs | 102 +++++++++++ .../alertmanager/alertmanager.yml | 22 +++ deploy/observability/blackbox/config.yml | 9 + deploy/observability/caddy/grafana.Caddyfile | 13 ++ .../observability/compose.observability.yml | 27 +++ .../dashboards/socialize-overview.json | 76 +++++++- .../observability/prometheus/prometheus.yml | 25 +++ .../prometheus/rules/socialize-alerts.yml | 30 ++++ docs/FEATURES/observability.md | 14 ++ docs/OPERATIONS/observability-runbook.md | 163 ++++++++++++++++++ .../003-preprod-operations-loop.md | 34 ++++ 14 files changed, 618 insertions(+), 2 deletions(-) create mode 100644 backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs create mode 100644 deploy/observability/alertmanager/alertmanager.yml create mode 100644 deploy/observability/blackbox/config.yml create mode 100644 deploy/observability/caddy/grafana.Caddyfile create mode 100644 docs/OPERATIONS/observability-runbook.md create mode 100644 docs/TASKS/observability/003-preprod-operations-loop.md diff --git a/README.md b/README.md index 3c4d229b..b16e1bc6 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,10 @@ host. Prometheus alert rules are provisioned under `deploy/observability/prometheus/rules/`; notification delivery is intentionally left to the preprod operations environment. +Set `ALERTMANAGER_WEBHOOK_URL` to route alerts to a private notification endpoint. +See `docs/OPERATIONS/observability-runbook.md` for bring-up, alert triage, and +the optional protected Caddy configuration for Grafana. 
+
 
 ## Solution
 
 ```bash
diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs b/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs
index baaab361..3ea0d300 100644
--- a/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs
+++ b/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs
@@ -48,6 +48,7 @@ internal static class ObservabilityRegistration
         }
 
         builder.Services.AddSingleton<SocializeMetrics>();
+        builder.Services.AddHostedService<WorkflowHealthSamplerService>();
         builder.Services
             .AddOpenTelemetry()
             .ConfigureResource(resource => resource.AddService(
diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs b/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs
index c52ad933..97810937 100644
--- a/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs
+++ b/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs
@@ -19,6 +19,8 @@ internal sealed class SocializeMetrics : IDisposable
     private readonly Counter<long> _organizationCreatedCounter;
     private readonly Counter<long> _workspaceCreatedCounter;
    private readonly Counter<long> _workspaceInviteCreatedCounter;
+    private readonly object _workflowHealthLock = new();
+    private WorkflowHealthSnapshot _workflowHealthSnapshot = WorkflowHealthSnapshot.Empty;
 
     public SocializeMetrics()
     {
@@ -58,6 +60,27 @@ internal sealed class SocializeMetrics : IDisposable
         _backgroundJobRunCounter = Meter.CreateCounter<long>(
             "socialize.background_job.runs",
             description: "Background job runs partitioned by job and outcome.");
+
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.content_items",
+            ObserveContentItemCounts,
+            description: "Current content item counts by status.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.feedback_reports",
+            ObserveFeedbackReportCounts,
+            description: "Current feedback report counts by status.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.pending_invites",
+            ObservePendingInviteCount,
+            description: "Current pending workspace invite count.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.stale_in_approval",
+            ObserveStaleApprovalCount,
+            description: "Current count of content items in approval longer than the configured stale threshold.");
+        Meter.CreateObservableGauge<long>(
+            "socialize.workflow.active_workspaces",
+            ObserveActiveWorkspaceCounts,
+            description: "Current active workspace counts by observation window.");
     }
 
     public Meter Meter { get; }
@@ -150,9 +173,86 @@ internal sealed class SocializeMetrics : IDisposable
             new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure"));
     }
 
+    public void UpdateWorkflowHealth(WorkflowHealthSnapshot snapshot)
+    {
+        lock (_workflowHealthLock)
+        {
+            _workflowHealthSnapshot = snapshot;
+        }
+    }
+
     public void Dispose()
     {
         Meter.Dispose();
         ActivitySource.Dispose();
     }
+
+    private Measurement<long>[] ObserveContentItemCounts()
+    {
+        WorkflowHealthSnapshot snapshot = GetWorkflowHealthSnapshot();
+        return snapshot.ContentItemsByStatus
+            .Select(pair => new Measurement<long>(
+                pair.Value,
+                new KeyValuePair<string, object?>("status", pair.Key)))
+            .ToArray();
+    }
+
+    private Measurement<long>[] ObserveFeedbackReportCounts()
+    {
+        WorkflowHealthSnapshot snapshot = GetWorkflowHealthSnapshot();
+        return snapshot.FeedbackReportsByStatus
+            .Select(pair => new Measurement<long>(
+                pair.Value,
+                new KeyValuePair<string, object?>("status", pair.Key)))
+            .ToArray();
+    }
+
+    private Measurement<long> ObservePendingInviteCount()
+    {
+        return new Measurement<long>(GetWorkflowHealthSnapshot().PendingInviteCount);
+    }
+
+    private Measurement<long> ObserveStaleApprovalCount()
+    {
+        return new Measurement<long>(GetWorkflowHealthSnapshot().StaleInApprovalCount);
+    }
+
+    private Measurement<long>[] ObserveActiveWorkspaceCounts()
+    {
+        WorkflowHealthSnapshot snapshot = GetWorkflowHealthSnapshot();
+        return
+        [
+            new Measurement<long>(
+                snapshot.ActiveWorkspaces24Hours,
+                new KeyValuePair<string, object?>("window", "24h")),
+            new Measurement<long>(
+                snapshot.ActiveWorkspaces7Days,
+                new KeyValuePair<string, object?>("window", "7d")),
+        ];
+    }
+
+    private WorkflowHealthSnapshot GetWorkflowHealthSnapshot()
+    {
+        lock (_workflowHealthLock)
+        {
+            return _workflowHealthSnapshot;
+        }
+    }
+}
+
+internal sealed record WorkflowHealthSnapshot(
+    IReadOnlyDictionary<string, int> ContentItemsByStatus,
+    IReadOnlyDictionary<string, int> FeedbackReportsByStatus,
+    int PendingInviteCount,
+    int StaleInApprovalCount,
+    int ActiveWorkspaces24Hours,
+    int ActiveWorkspaces7Days)
+{
+    public static WorkflowHealthSnapshot Empty { get; } = new(
+        new Dictionary<string, int>(StringComparer.Ordinal),
+        new Dictionary<string, int>(StringComparer.Ordinal),
+        0,
+        0,
+        0,
+        0);
 }
diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs b/backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs
new file mode 100644
index 00000000..8c0fb66e
--- /dev/null
+++ b/backend/src/Socialize.Api/Infrastructure/Observability/WorkflowHealthSamplerService.cs
@@ -0,0 +1,102 @@
+using Microsoft.EntityFrameworkCore;
+using Socialize.Api.Data;
+using Socialize.Api.Modules.Feedback.Data;
+using Socialize.Api.Modules.Workspaces.Data;
+
+namespace Socialize.Api.Infrastructure.Observability;
+
+internal sealed class WorkflowHealthSamplerService(
+    IServiceScopeFactory scopeFactory,
+    SocializeMetrics metrics,
+    ILogger<WorkflowHealthSamplerService> logger)
+    : BackgroundService
+{
+    private static readonly TimeSpan SampleInterval = TimeSpan.FromMinutes(5);
+    private static readonly TimeSpan StaleApprovalThreshold = TimeSpan.FromDays(3);
+
+    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
+    {
+        await SampleAsync(stoppingToken);
+
+        using PeriodicTimer timer = new(SampleInterval);
+        while (!stoppingToken.IsCancellationRequested)
+        {
+            try
+            {
+                await timer.WaitForNextTickAsync(stoppingToken);
+                await SampleAsync(stoppingToken);
+            }
+            catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
+            {
+                logger.LogDebug(ex, "Workflow health sampler stopped.");
+            }
+        }
+    }
+
+    private async Task SampleAsync(CancellationToken stoppingToken)
+    {
+        try
+        {
+            using IServiceScope scope = scopeFactory.CreateScope();
+            AppDbContext dbContext = scope.ServiceProvider.GetRequiredService<AppDbContext>();
+            DateTimeOffset now = DateTimeOffset.UtcNow;
+            DateTimeOffset staleApprovalCutoff = now.Subtract(StaleApprovalThreshold);
+            DateTimeOffset active24HourCutoff = now.AddHours(-24);
+            DateTimeOffset active7DayCutoff = now.AddDays(-7);
+
+            Dictionary<string, int> contentItemsByStatus = await dbContext.ContentItems
+                .GroupBy(item => item.Status)
+                .Select(group => new { Status = group.Key, Count = group.Count() })
+                .ToDictionaryAsync(group => group.Status, group => group.Count, StringComparer.Ordinal, stoppingToken);
+
+            Dictionary<string, int> feedbackReportsByStatus = await dbContext.FeedbackReports
+                .GroupBy(report => report.Status)
+                .Select(group => new { Status = group.Key, Count = group.Count() })
+                .ToDictionaryAsync(
+                    group => group.Status == FeedbackStatus.WontDo ? "WontDo" : group.Status.ToString(),
+                    group => group.Count,
+                    StringComparer.Ordinal,
+                    stoppingToken);
+
+            int pendingInviteCount = await dbContext.WorkspaceInvites
+                .CountAsync(invite => invite.Status == WorkspaceInviteStatuses.Pending, stoppingToken);
+
+            int staleInApprovalCount = await dbContext.ContentItems
+                .CountAsync(
+                    item => item.Status == "In approval" && item.CreatedAt <= staleApprovalCutoff,
+                    stoppingToken);
+
+            int activeWorkspaces24Hours = await dbContext.ContentItemActivityEntries
+                .Where(entry => entry.CreatedAt >= active24HourCutoff)
+                .Select(entry => entry.WorkspaceId)
+                .Distinct()
+                .CountAsync(stoppingToken);
+
+            int activeWorkspaces7Days = await dbContext.ContentItemActivityEntries
+                .Where(entry => entry.CreatedAt >= active7DayCutoff)
+                .Select(entry => entry.WorkspaceId)
+                .Distinct()
+                .CountAsync(stoppingToken);
+
+            metrics.UpdateWorkflowHealth(new WorkflowHealthSnapshot(
+                contentItemsByStatus,
+                feedbackReportsByStatus,
+                pendingInviteCount,
+                staleInApprovalCount,
+                activeWorkspaces24Hours,
+                activeWorkspaces7Days));
+            metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), true);
+        }
+        catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
+        {
+            logger.LogDebug(ex, "Workflow health sampler stopped.");
+        }
+#pragma warning disable CA1031
+        catch (Exception ex)
+        {
+            metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), false);
+            logger.LogError(ex, "Workflow health sampling failed.");
+        }
+#pragma warning restore CA1031
+    }
+}
diff --git a/deploy/observability/alertmanager/alertmanager.yml b/deploy/observability/alertmanager/alertmanager.yml
new file mode 100644
index 00000000..a9f25d94
--- /dev/null
+++ b/deploy/observability/alertmanager/alertmanager.yml
@@ -0,0 +1,22 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: preprod-webhook
+  group_by:
+    - alertname
+    - service
+  group_wait: 30s
+  group_interval: 5m
+  repeat_interval: 4h
+  routes:
+    - matchers:
+        - severity="critical"
+      receiver: preprod-webhook
+      repeat_interval: 30m
+
+receivers:
+  - name: preprod-webhook
+    webhook_configs:
+      - url: ${ALERTMANAGER_WEBHOOK_URL}
+        send_resolved: true
diff --git a/deploy/observability/blackbox/config.yml b/deploy/observability/blackbox/config.yml
new file mode 100644
index 00000000..fa8e76f0
--- /dev/null
+++ b/deploy/observability/blackbox/config.yml
@@ -0,0 +1,9 @@
+modules:
+  http_2xx:
+    prober: http
+    timeout: 5s
+    http:
+      method: GET
+      preferred_ip_protocol: ip4
+      valid_status_codes:
+        - 200
diff --git a/deploy/observability/caddy/grafana.Caddyfile b/deploy/observability/caddy/grafana.Caddyfile
new file mode 100644
index 00000000..4eca957d
--- /dev/null
+++ b/deploy/observability/caddy/grafana.Caddyfile
@@ -0,0 +1,13 @@
+# Optional Caddy snippet for exposing Grafana through a protected hostname.
+# Generate a password hash with:
+#   caddy hash-password --plaintext '<password>'
+
+{$OBSERVABILITY_HOST} {
+    encode gzip zstd
+
+    basicauth {
+        {$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
+    }
+
+    reverse_proxy grafana:3000
+}
diff --git a/deploy/observability/compose.observability.yml b/deploy/observability/compose.observability.yml
index 57048803..f42a1038 100644
--- a/deploy/observability/compose.observability.yml
+++ b/deploy/observability/compose.observability.yml
@@ -26,6 +26,7 @@ services:
       - prometheus
       - loki
       - tempo
+      - alertmanager
     networks:
       - internal
 
@@ -44,6 +45,31 @@ services:
     networks:
       - internal
 
+  alertmanager:
+    image: prom/alertmanager:v0.29.0
+    restart: unless-stopped
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+      - --storage.path=/alertmanager
+      - --config.expand-env
+    environment:
+      ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
+    volumes:
+      - alertmanager-data:/alertmanager
+      - ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+    networks:
+      - internal
+
+  blackbox:
+    image: prom/blackbox-exporter:v0.27.0
+    restart: unless-stopped
+    command:
+      - --config.file=/etc/blackbox_exporter/config.yml
+    volumes:
+      - ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
+    networks:
+      - internal
+
   loki:
     image: grafana/loki:3.7.1
     restart: unless-stopped
@@ -84,6 +110,7 @@ services:
       - internal
 
 volumes:
+  alertmanager-data:
   grafana-data:
   prometheus-data:
   loki-data:
diff --git a/deploy/observability/grafana/dashboards/socialize-overview.json b/deploy/observability/grafana/dashboards/socialize-overview.json
index 89599631..fcd9c6a9 100644
--- a/deploy/observability/grafana/dashboards/socialize-overview.json
+++ b/deploy/observability/grafana/dashboards/socialize-overview.json
@@ -333,6 +333,78 @@
       "title": "Operational Events, 1h Rolling",
       "type": "timeseries"
     },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "Prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 20
+      },
+      "id": 11,
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "targets": [
+        {
+          "expr": "socialize_workflow_content_items",
+          "legendFormat": "content {{status}}"
+        },
+        {
+          "expr": "socialize_workflow_feedback_reports",
+          "legendFormat": "feedback {{status}}"
+        }
+      ],
+      "title": "Workflow Backlog",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "Prometheus"
+      },
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 20
+      },
+      "id": 12,
+      "options": {
+        "legend": {
+          "displayMode": "list",
+          "placement": "bottom"
+        },
+        "tooltip": {
+          "mode": "multi"
+        }
+      },
+      "targets": [
+        {
+          "expr": "socialize_workflow_active_workspaces",
+          "legendFormat": "active workspaces {{window}}"
+        },
+        {
+          "expr": "socialize_workflow_stale_in_approval",
+          "legendFormat": "stale in approval"
+        },
+        {
+          "expr": "socialize_workflow_pending_invites",
+          "legendFormat": "pending invites"
+        }
+      ],
+      "title": "Workflow Health",
+      "type": "timeseries"
+    },
     {
       "datasource": {
         "type": "prometheus",
@@ -342,7 +414,7 @@
       "h": 7,
       "w": 24,
       "x": 0,
-      "y": 20
+      "y": 28
     },
     "id": 9,
     "options": {
@@ -368,7 +440,7 @@
       "h": 9,
       "w": 24,
       "x": 0,
-      "y": 27
+      "y": 35
     },
     "id": 10,
     "options": {
diff --git a/deploy/observability/prometheus/prometheus.yml b/deploy/observability/prometheus/prometheus.yml
index a782e6b0..81231d10 100644
--- a/deploy/observability/prometheus/prometheus.yml
+++ b/deploy/observability/prometheus/prometheus.yml
@@ -2,6 +2,12 @@ global:
   scrape_interval: 15s
   evaluation_interval: 15s
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
 rule_files:
   - /etc/prometheus/rules/*.yml
 
@@ -15,3 +21,22 @@ scrape_configs:
     static_configs:
       - targets:
           - alloy:12345
+
+  - job_name: preprod-uptime
+    metrics_path: /probe
+    params:
+      module:
+        - http_2xx
+    static_configs:
+      - targets:
+          - http://web/
+          - http://api:8080/health/ready
+    relabel_configs:
+      - source_labels:
+          - __address__
+        target_label: __param_target
+      - source_labels:
+          - __param_target
+        target_label: instance
+      - target_label: __address__
+        replacement: blackbox:9115
diff --git a/deploy/observability/prometheus/rules/socialize-alerts.yml b/deploy/observability/prometheus/rules/socialize-alerts.yml
index 20a08e43..8fd0060c 100644
--- a/deploy/observability/prometheus/rules/socialize-alerts.yml
+++ b/deploy/observability/prometheus/rules/socialize-alerts.yml
@@ -11,6 +11,16 @@ groups:
           summary: Socialize API telemetry is missing
           description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
 
+      - alert: SocializePreprodEndpointDown
+        expr: probe_success{job="preprod-uptime"} == 0
+        for: 2m
+        labels:
+          severity: critical
+          service: socialize-preprod
+        annotations:
+          summary: Preprod endpoint is down
+          description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
+
       - alert: SocializeApiHighErrorRate
         expr: |
           (
@@ -56,6 +66,26 @@ groups:
           summary: Socialize core usage is quiet
           description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
 
+      - alert: SocializeContentStaleInApproval
+        expr: socialize_workflow_stale_in_approval > 0
+        for: 30m
+        labels:
+          severity: warning
+          service: socialize-api
+        annotations:
+          summary: Content is stale in approval
+          description: One or more content items have been in approval longer than the configured threshold.
+
+      - alert: SocializeNoActiveWorkspaces
+        expr: socialize_workflow_active_workspaces{window="24h"} < 1
+        for: 1h
+        labels:
+          severity: info
+          service: socialize-api
+        annotations:
+          summary: No active workspaces in the last 24 hours
+          description: No workspace has had content workflow activity in the last 24 hours.
+
       - alert: SocializeFeedbackBugSubmitted
         expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
         for: 0m
diff --git a/docs/FEATURES/observability.md b/docs/FEATURES/observability.md
index 0af8e6fc..3104907f 100644
--- a/docs/FEATURES/observability.md
+++ b/docs/FEATURES/observability.md
@@ -78,3 +78,17 @@ Initial alerts should cover:
 - email delivery failures
 - blob storage failures
 - background job failures
+
+## Workflow Health Gauges
+
+Database-derived workflow health metrics should be sampled periodically instead of emitted per request.
+
+Initial gauges should cover:
+
+- content item counts by status
+- feedback report counts by status
+- pending workspace invites
+- content stale in approval
+- active workspace counts over 24-hour and 7-day windows
+
+These are operator health signals. They should stay aggregated enough to avoid high-cardinality metric labels.
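+
+As a quick spot check (a sketch, not shipped tooling: it assumes the default
+OpenTelemetry dot-to-underscore metric name translation and the `prometheus`
+service name from the observability overlay), a gauge can be queried in place
+with `promtool`, which ships in the Prometheus image:
+
+```bash
+# Query the current stale-in-approval gauge from inside the compose network.
+# A non-empty result confirms the sampler, OTLP pipeline, and scrape are wired.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  exec prometheus promtool query instant http://localhost:9090 \
+  socialize_workflow_stale_in_approval
+```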
diff --git a/docs/OPERATIONS/observability-runbook.md b/docs/OPERATIONS/observability-runbook.md
new file mode 100644
index 00000000..ee7ef64f
--- /dev/null
+++ b/docs/OPERATIONS/observability-runbook.md
@@ -0,0 +1,163 @@
+# Observability Runbook
+
+## Purpose
+
+This runbook covers preproduction operation of Socialize's self-hosted observability stack.
+
+The goal is to answer:
+
+- Is the app reachable?
+- Is the API healthy?
+- Are errors or latency rising?
+- Are users exercising core workflows?
+- Are emails, blob storage, and background jobs failing?
+- Is work getting stuck?
+
+## Start The Stack
+
+Run from the repository root on the preproduction host:
+
+```bash
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d
+```
+
+Grafana listens on `127.0.0.1:3000` by default. Set `GRAFANA_HTTP_BIND=0.0.0.0`
+only when Grafana is protected by a reverse proxy, VPN, firewall rule, or SSH tunnel.
+
+Set these before exposing Grafana:
+
+```bash
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=<strong password>
+```
+
+## Alert Delivery
+
+Prometheus sends alerts to Alertmanager. Alertmanager sends alerts to the webhook
+configured by:
+
+```bash
+ALERTMANAGER_WEBHOOK_URL=<private webhook endpoint>
+```
+
+If no webhook URL is configured, Alertmanager still starts, but alert delivery points
+to a local discard endpoint.
+
+Critical alerts repeat every 30 minutes. Other alerts repeat every 4 hours.
+
+## Secure Grafana With Caddy
+
+An optional Caddy snippet is available at:
+
+```txt
+deploy/observability/caddy/grafana.Caddyfile
+```
+
+Generate a Caddy password hash:
+
+```bash
+caddy hash-password --plaintext '<password>'
+```
+
+Configure:
+
+```bash
+OBSERVABILITY_HOST=observability.example.com
+GRAFANA_BASIC_AUTH_USER=<user>
+GRAFANA_BASIC_AUTH_HASH=<hash from caddy hash-password>
+```
+
+Keep Grafana private unless the hostname is protected.
+
+## First Bring-Up Checks
+
+1. Confirm containers are running:
+
+```bash
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml ps
+```
+
+2. Check API health:
+
+```bash
+curl -i http://127.0.0.1:8080/health
+curl -i http://127.0.0.1:8080/health/ready
+```
+
+3. Open Grafana and check the `Socialize Overview` dashboard.
+
+4. Generate a few real actions:
+
+- log in
+- create a content item
+- add a comment
+- submit feedback
+- create a workspace invite
+
+5. Confirm metrics appear in the dashboard:
+
+- API request rate
+- usage signals
+- workflow backlog
+- operational events
+
+## Alert Triage
+
+`SocializePreprodEndpointDown`
+
+- Check `docker compose ps`.
+- Check `docker compose logs api web`.
+- Check `/health/ready`.
+
+`SocializeApiTelemetryMissing`
+
+- Check that `api` has `OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317`.
+- Check `docker compose logs alloy`.
+- Check whether the API is receiving traffic.
+
+`SocializeApiHighErrorRate`
+
+- Open the API logs panel.
+- Filter by recent `5xx` requests.
+- Open Tempo traces for slow or failing requests if available.
+
+`SocializeApiHighLatency`
+
+- Check the p95 latency by endpoint panel.
+- Inspect slow traces.
+- Check database health and recent deploy activity.
+
+`SocializeEmailDeliveryFailures`
+
+- Check API logs for Resend failures.
+- Confirm `RESEND_API_KEY` and `RESEND_FROM_EMAIL`.
+- Confirm Resend service status outside this stack if needed.
+
+`SocializeBlobStorageFailures`
+
+- Confirm `./blob-storage` volume permissions on the preprod host.
+- Check local disk space.
+- Check API logs for validation or filesystem errors.
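+
+A minimal sketch of those host-side checks (the `./blob-storage` path and the
+`api` service name are taken from this runbook; adjust them to the actual bind
+mount and compose service names):
+
+```bash
+# Free disk space on the filesystem backing the bind mount.
+df -h .
+# Ownership and permissions on the blob storage directory.
+ls -ld ./blob-storage
+# Recent API log lines that mention blob handling.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  logs --since 1h api | grep -i blob
+```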
+
+`SocializeBackgroundJobFailures`
+
+- Check the operational events panel for the failing job name.
+- Check API logs for the same time window.
+
+`SocializeContentStaleInApproval`
+
+- Use the app to inspect content currently in approval.
+- If follow-up is needed, contact the internal owner or the client's point of contact outside the app.
+
+`SocializeCoreUsageQuiet` or `SocializeNoActiveWorkspaces`
+
+- Confirm whether quiet usage is expected for the period.
+- If not expected, check login events and API reachability.
+
+## Retention Defaults
+
+- Prometheus keeps 15 days of metrics by default, configurable through `PROMETHEUS_RETENTION`.
+- Tempo keeps traces for 168 hours (7 days).
+- Loki uses local filesystem storage for preproduction.
+
+Tune retention before heavy customer usage or long-running demos.
diff --git a/docs/TASKS/observability/003-preprod-operations-loop.md b/docs/TASKS/observability/003-preprod-operations-loop.md
new file mode 100644
index 00000000..6f4a706c
--- /dev/null
+++ b/docs/TASKS/observability/003-preprod-operations-loop.md
@@ -0,0 +1,34 @@
+# Observability 003: Preprod Operations Loop
+
+## Goal
+
+Close the preproduction operations loop by adding alert delivery scaffolding, uptime probes, workflow health gauges, secured Grafana guidance, and an operator runbook.
+
+## Feature Spec
+
+- `docs/FEATURES/observability.md`
+
+## Scope
+
+- Add Alertmanager to the optional observability compose overlay.
+- Add Blackbox Exporter uptime probes for the web container and API readiness endpoint.
+- Add backend database-derived workflow health gauges.
+- Add Prometheus alerts for uptime probes and workflow health.
+- Add an optional Caddy snippet for protected Grafana exposure.
+- Add an operator runbook for bring-up, alert triage, and security defaults.
+
+## Out Of Scope
+
+- Operating the remote preproduction host.
+- Choosing the final alert destination.
+- Client-facing status page.
+- External third-party uptime monitoring.
+
+## Validation
+
+```bash
+dotnet build backend/Socialize.slnx
+dotnet test backend/Socialize.slnx
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
+jq empty deploy/observability/grafana/dashboards/socialize-overview.json
+```
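+
+Once the stack is up, the alert wiring can optionally be checked in place (a
+sketch assuming the `prometheus` and `alertmanager` service names from the
+compose overlay; `promtool` ships in the upstream Prometheus image, and the
+rules path matches the glob referenced by `prometheus.yml`):
+
+```bash
+# Lint the provisioned alert rules inside the Prometheus container.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  exec prometheus promtool check rules /etc/prometheus/rules/socialize-alerts.yml
+# Confirm Alertmanager is up and answering on its health endpoint.
+docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml \
+  exec alertmanager wget -qO- http://localhost:9093/-/healthy
+```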