feat: close preprod observability loop

This commit is contained in:
2026-05-08 15:48:56 -04:00
parent 8bcff96821
commit 986c7efea6
14 changed files with 618 additions and 2 deletions

View File

@@ -113,6 +113,10 @@ host. Prometheus alert rules are provisioned under
`deploy/observability/prometheus/rules/`; notification delivery is intentionally `deploy/observability/prometheus/rules/`; notification delivery is intentionally
left to the preprod operations environment. left to the preprod operations environment.
Set `ALERTMANAGER_WEBHOOK_URL` to route alerts to a private notification endpoint.
See `docs/OPERATIONS/observability-runbook.md` for bring-up, alert triage, and
the optional protected Caddy configuration for Grafana.
## Solution ## Solution
```bash ```bash

View File

@@ -48,6 +48,7 @@ internal static class ObservabilityRegistration
} }
builder.Services.AddSingleton<SocializeMetrics>(); builder.Services.AddSingleton<SocializeMetrics>();
builder.Services.AddHostedService<WorkflowHealthSamplerService>();
builder.Services builder.Services
.AddOpenTelemetry() .AddOpenTelemetry()
.ConfigureResource(resource => resource.AddService( .ConfigureResource(resource => resource.AddService(

View File

@@ -19,6 +19,8 @@ internal sealed class SocializeMetrics : IDisposable
private readonly Counter<long> _organizationCreatedCounter; private readonly Counter<long> _organizationCreatedCounter;
private readonly Counter<long> _workspaceCreatedCounter; private readonly Counter<long> _workspaceCreatedCounter;
private readonly Counter<long> _workspaceInviteCreatedCounter; private readonly Counter<long> _workspaceInviteCreatedCounter;
private readonly object _workflowHealthLock = new();
private WorkflowHealthSnapshot _workflowHealthSnapshot = WorkflowHealthSnapshot.Empty;
public SocializeMetrics() public SocializeMetrics()
{ {
@@ -58,6 +60,27 @@ internal sealed class SocializeMetrics : IDisposable
_backgroundJobRunCounter = Meter.CreateCounter<long>( _backgroundJobRunCounter = Meter.CreateCounter<long>(
"socialize.background_job.runs", "socialize.background_job.runs",
description: "Background job runs partitioned by job and outcome."); description: "Background job runs partitioned by job and outcome.");
Meter.CreateObservableGauge(
"socialize.workflow.content_items",
ObserveContentItemCounts,
description: "Current content item counts by status.");
Meter.CreateObservableGauge(
"socialize.workflow.feedback_reports",
ObserveFeedbackReportCounts,
description: "Current feedback report counts by status.");
Meter.CreateObservableGauge(
"socialize.workflow.pending_invites",
ObservePendingInviteCount,
description: "Current pending workspace invite count.");
Meter.CreateObservableGauge(
"socialize.workflow.stale_in_approval",
ObserveStaleApprovalCount,
description: "Current count of content items in approval longer than the configured stale threshold.");
Meter.CreateObservableGauge(
"socialize.workflow.active_workspaces",
ObserveActiveWorkspaceCounts,
description: "Current active workspace counts by observation window.");
} }
public Meter Meter { get; } public Meter Meter { get; }
@@ -150,9 +173,86 @@ internal sealed class SocializeMetrics : IDisposable
new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure")); new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure"));
} }
/// <summary>
/// Publishes a freshly sampled workflow health snapshot so the observable
/// gauges registered in the constructor report it on the next collection.
/// </summary>
/// <param name="snapshot">The complete, immutable snapshot to expose.</param>
public void UpdateWorkflowHealth(WorkflowHealthSnapshot snapshot)
{
// Lock pairs with GetWorkflowHealthSnapshot so readers on the metrics
// collection thread always see a fully published snapshot reference.
lock (_workflowHealthLock)
{
_workflowHealthSnapshot = snapshot;
}
}
public void Dispose() public void Dispose()
{ {
Meter.Dispose(); Meter.Dispose();
ActivitySource.Dispose(); ActivitySource.Dispose();
} }
// Gauge callback: one tagged measurement per content item status bucket,
// read from the most recently published snapshot.
private Measurement<int>[] ObserveContentItemCounts()
{
    WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();
    var measurements = new List<Measurement<int>>(current.ContentItemsByStatus.Count);
    foreach (KeyValuePair<string, int> entry in current.ContentItemsByStatus)
    {
        measurements.Add(new Measurement<int>(
            entry.Value,
            new KeyValuePair<string, object?>("status", entry.Key)));
    }
    return measurements.ToArray();
}
// Gauge callback: one tagged measurement per feedback report status bucket.
private Measurement<int>[] ObserveFeedbackReportCounts()
{
    WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();

    // Local projection keeps the LINQ pipeline below readable.
    static Measurement<int> ToMeasurement(KeyValuePair<string, int> entry) =>
        new(entry.Value, new KeyValuePair<string, object?>("status", entry.Key));

    return current.FeedbackReportsByStatus.Select(ToMeasurement).ToArray();
}
// Gauge callback: current pending workspace invite count (untagged).
private Measurement<int> ObservePendingInviteCount() =>
    new(GetWorkflowHealthSnapshot().PendingInviteCount);
// Gauge callback: count of content items stuck in approval past the
// sampler's stale threshold (untagged).
private Measurement<int> ObserveStaleApprovalCount() =>
    new(GetWorkflowHealthSnapshot().StaleInApprovalCount);
// Gauge callback: active workspace counts, tagged by observation window
// ("24h" and "7d") so both series share one instrument.
private Measurement<int>[] ObserveActiveWorkspaceCounts()
{
    WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();
    var last24Hours = new Measurement<int>(
        current.ActiveWorkspaces24Hours,
        new KeyValuePair<string, object?>("window", "24h"));
    var last7Days = new Measurement<int>(
        current.ActiveWorkspaces7Days,
        new KeyValuePair<string, object?>("window", "7d"));
    return new[] { last24Hours, last7Days };
}
/// <summary>
/// Returns the most recently published snapshot for the gauge callbacks.
/// </summary>
private WorkflowHealthSnapshot GetWorkflowHealthSnapshot()
{
// Lock pairs with UpdateWorkflowHealth; the snapshot record is immutable,
// so returning the reference outside the lock is safe.
lock (_workflowHealthLock)
{
return _workflowHealthSnapshot;
}
}
}
/// <summary>
/// Immutable point-in-time view of database-derived workflow health,
/// produced by the periodic sampler and read by the observable gauges.
/// </summary>
/// <param name="ContentItemsByStatus">Content item counts keyed by status label.</param>
/// <param name="FeedbackReportsByStatus">Feedback report counts keyed by status label.</param>
/// <param name="PendingInviteCount">Current pending workspace invite count.</param>
/// <param name="StaleInApprovalCount">Content items in approval longer than the stale threshold.</param>
/// <param name="ActiveWorkspaces24Hours">Distinct workspaces with activity in the last 24 hours.</param>
/// <param name="ActiveWorkspaces7Days">Distinct workspaces with activity in the last 7 days.</param>
internal sealed record WorkflowHealthSnapshot(
IReadOnlyDictionary<string, int> ContentItemsByStatus,
IReadOnlyDictionary<string, int> FeedbackReportsByStatus,
int PendingInviteCount,
int StaleInApprovalCount,
int ActiveWorkspaces24Hours,
int ActiveWorkspaces7Days)
{
// Zero-valued snapshot used before the first sample completes; Ordinal
// comparers match the sampler's dictionary construction.
public static WorkflowHealthSnapshot Empty { get; } = new(
new Dictionary<string, int>(StringComparer.Ordinal),
new Dictionary<string, int>(StringComparer.Ordinal),
0,
0,
0,
0);
} }

View File

@@ -0,0 +1,102 @@
using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data;
using Socialize.Api.Modules.Feedback.Data;
using Socialize.Api.Modules.Workspaces.Data;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Background service that periodically samples database-derived workflow
/// health (content/feedback status counts, pending invites, stale approvals,
/// active workspaces) and publishes the result to <see cref="SocializeMetrics"/>
/// for the observable gauges to report.
/// </summary>
internal sealed class WorkflowHealthSamplerService(
IServiceScopeFactory scopeFactory,
SocializeMetrics metrics,
ILogger<WorkflowHealthSamplerService> logger)
: BackgroundService
{
// How often the database is sampled; gauges report the last sample between ticks.
private static readonly TimeSpan SampleInterval = TimeSpan.FromMinutes(5);
// Content items in approval longer than this count as "stale".
private static readonly TimeSpan StaleApprovalThreshold = TimeSpan.FromDays(3);
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
// Sample immediately at startup so gauges are populated before the
// first timer tick (otherwise they would report Empty for 5 minutes).
await SampleAsync(stoppingToken);
using PeriodicTimer timer = new(SampleInterval);
while (!stoppingToken.IsCancellationRequested)
{
try
{
await timer.WaitForNextTickAsync(stoppingToken);
await SampleAsync(stoppingToken);
}
// WaitForNextTickAsync throws on cancellation; treat as orderly shutdown,
// then the loop condition exits.
catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
{
logger.LogDebug(ex, "Workflow health sampler stopped.");
}
}
}
// Runs one full sampling pass; swallows (and records) failures so a bad
// sample never kills the hosted service loop.
private async Task SampleAsync(CancellationToken stoppingToken)
{
try
{
// A fresh scope per sample because AppDbContext is scoped while this
// service is a singleton.
using IServiceScope scope = scopeFactory.CreateScope();
AppDbContext dbContext = scope.ServiceProvider.GetRequiredService<AppDbContext>();
DateTimeOffset now = DateTimeOffset.UtcNow;
DateTimeOffset staleApprovalCutoff = now.Subtract(StaleApprovalThreshold);
DateTimeOffset active24HourCutoff = now.AddHours(-24);
DateTimeOffset active7DayCutoff = now.AddDays(-7);
// Content item counts grouped by status; Status appears to be a string
// column here (grouped and keyed directly) — NOTE(review): confirm.
Dictionary<string, int> contentItemsByStatus = await dbContext.ContentItems
.GroupBy(item => item.Status)
.Select(group => new { Status = group.Key, Count = group.Count() })
.ToDictionaryAsync(group => group.Status, group => group.Count, StringComparer.Ordinal, stoppingToken);
// Feedback counts by status. The WontDo special-case presumably guards
// against an enum member whose ToString differs from the desired label —
// TODO confirm; Enum.ToString of a member named WontDo would already be "WontDo".
Dictionary<string, int> feedbackReportsByStatus = await dbContext.FeedbackReports
.GroupBy(report => report.Status)
.Select(group => new { Status = group.Key, Count = group.Count() })
.ToDictionaryAsync(
group => group.Status == FeedbackStatus.WontDo ? "WontDo" : group.Status.ToString(),
group => group.Count,
StringComparer.Ordinal,
stoppingToken);
int pendingInviteCount = await dbContext.WorkspaceInvites
.CountAsync(invite => invite.Status == WorkspaceInviteStatuses.Pending, stoppingToken);
// NOTE(review): "In approval" is a magic status string — verify it matches
// the canonical status constant used by the content module.
int staleInApprovalCount = await dbContext.ContentItems
.CountAsync(
item => item.Status == "In approval" && item.CreatedAt <= staleApprovalCutoff,
stoppingToken);
// Distinct workspaces with any activity entry inside each window.
int activeWorkspaces24Hours = await dbContext.ContentItemActivityEntries
.Where(entry => entry.CreatedAt >= active24HourCutoff)
.Select(entry => entry.WorkspaceId)
.Distinct()
.CountAsync(stoppingToken);
int activeWorkspaces7Days = await dbContext.ContentItemActivityEntries
.Where(entry => entry.CreatedAt >= active7DayCutoff)
.Select(entry => entry.WorkspaceId)
.Distinct()
.CountAsync(stoppingToken);
// Publish atomically so gauges never observe a half-updated sample.
metrics.UpdateWorkflowHealth(new WorkflowHealthSnapshot(
contentItemsByStatus,
feedbackReportsByStatus,
pendingInviteCount,
staleInApprovalCount,
activeWorkspaces24Hours,
activeWorkspaces7Days));
metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), true);
}
// Cancellation mid-query during shutdown is expected; not a failure run.
catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
{
logger.LogDebug(ex, "Workflow health sampler stopped.");
}
// Broad catch is deliberate (CA1031 suppressed): record the failed run and
// keep the sampler alive for the next tick.
#pragma warning disable CA1031
catch (Exception ex)
{
metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), false);
logger.LogError(ex, "Workflow health sampling failed.");
}
#pragma warning restore CA1031
}
}

View File

@@ -0,0 +1,22 @@
global:
resolve_timeout: 5m
route:
receiver: preprod-webhook
group_by:
- alertname
- service
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
routes:
- matchers:
- severity="critical"
receiver: preprod-webhook
repeat_interval: 30m
receivers:
- name: preprod-webhook
webhook_configs:
- url: ${ALERTMANAGER_WEBHOOK_URL}
send_resolved: true

View File

@@ -0,0 +1,9 @@
modules:
http_2xx:
prober: http
timeout: 5s
http:
method: GET
preferred_ip_protocol: ip4
valid_status_codes:
- 200

View File

@@ -0,0 +1,13 @@
# Optional Caddy snippet for exposing Grafana through a protected hostname.
# Generate a password hash with:
# caddy hash-password --plaintext '<password>'
{$OBSERVABILITY_HOST} {
encode gzip zstd
basicauth {
{$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
}
reverse_proxy grafana:3000
}

View File

@@ -26,6 +26,7 @@ services:
- prometheus - prometheus
- loki - loki
- tempo - tempo
- alertmanager
networks: networks:
- internal - internal
@@ -44,6 +45,31 @@ services:
networks: networks:
- internal - internal
alertmanager:
image: prom/alertmanager:v0.29.0
restart: unless-stopped
command:
- --config.file=/etc/alertmanager/alertmanager.yml
- --storage.path=/alertmanager
- --config.expand-env
environment:
ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
volumes:
- alertmanager-data:/alertmanager
- ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
networks:
- internal
blackbox:
image: prom/blackbox-exporter:v0.27.0
restart: unless-stopped
command:
- --config.file=/etc/blackbox_exporter/config.yml
volumes:
- ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
networks:
- internal
loki: loki:
image: grafana/loki:3.7.1 image: grafana/loki:3.7.1
restart: unless-stopped restart: unless-stopped
@@ -84,6 +110,7 @@ services:
- internal - internal
volumes: volumes:
alertmanager-data:
grafana-data: grafana-data:
prometheus-data: prometheus-data:
loki-data: loki-data:

View File

@@ -333,6 +333,78 @@
"title": "Operational Events, 1h Rolling", "title": "Operational Events, 1h Rolling",
"type": "timeseries" "type": "timeseries"
}, },
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 20
},
"id": 11,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_content_items",
"legendFormat": "content {{status}}"
},
{
"expr": "socialize_workflow_feedback_reports",
"legendFormat": "feedback {{status}}"
}
],
"title": "Workflow Backlog",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 20
},
"id": 12,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_active_workspaces",
"legendFormat": "active workspaces {{window}}"
},
{
"expr": "socialize_workflow_stale_in_approval",
"legendFormat": "stale in approval"
},
{
"expr": "socialize_workflow_pending_invites",
"legendFormat": "pending invites"
}
],
"title": "Workflow Health",
"type": "timeseries"
},
{ {
"datasource": { "datasource": {
"type": "prometheus", "type": "prometheus",
@@ -342,7 +414,7 @@
"h": 7, "h": 7,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 20 "y": 28
}, },
"id": 9, "id": 9,
"options": { "options": {
@@ -368,7 +440,7 @@
"h": 9, "h": 9,
"w": 24, "w": 24,
"x": 0, "x": 0,
"y": 27 "y": 35
}, },
"id": 10, "id": 10,
"options": { "options": {

View File

@@ -2,6 +2,12 @@ global:
scrape_interval: 15s scrape_interval: 15s
evaluation_interval: 15s evaluation_interval: 15s
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
rule_files: rule_files:
- /etc/prometheus/rules/*.yml - /etc/prometheus/rules/*.yml
@@ -15,3 +21,22 @@ scrape_configs:
static_configs: static_configs:
- targets: - targets:
- alloy:12345 - alloy:12345
- job_name: preprod-uptime
metrics_path: /probe
params:
module:
- http_2xx
static_configs:
- targets:
- http://web/
- http://api:8080/health/ready
relabel_configs:
- source_labels:
- __address__
target_label: __param_target
- source_labels:
- __param_target
target_label: instance
- target_label: __address__
replacement: blackbox:9115

View File

@@ -11,6 +11,16 @@ groups:
summary: Socialize API telemetry is missing summary: Socialize API telemetry is missing
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down. description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
- alert: SocializePreprodEndpointDown
expr: probe_success{job="preprod-uptime"} == 0
for: 2m
labels:
severity: critical
service: socialize-preprod
annotations:
summary: Preprod endpoint is down
description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
- alert: SocializeApiHighErrorRate - alert: SocializeApiHighErrorRate
expr: | expr: |
( (
@@ -56,6 +66,26 @@ groups:
summary: Socialize core usage is quiet summary: Socialize core usage is quiet
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours. description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
- alert: SocializeContentStaleInApproval
expr: socialize_workflow_stale_in_approval > 0
for: 30m
labels:
severity: warning
service: socialize-api
annotations:
summary: Content is stale in approval
description: One or more content items have been in approval longer than the configured threshold.
- alert: SocializeNoActiveWorkspaces
expr: socialize_workflow_active_workspaces{window="24h"} < 1
for: 1h
labels:
severity: info
service: socialize-api
annotations:
summary: No active workspaces in the last 24 hours
description: No workspace has had content workflow activity in the last 24 hours.
- alert: SocializeFeedbackBugSubmitted - alert: SocializeFeedbackBugSubmitted
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0 expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
for: 0m for: 0m

View File

@@ -78,3 +78,17 @@ Initial alerts should cover:
- email delivery failures - email delivery failures
- blob storage failures - blob storage failures
- background job failures - background job failures
## Workflow Health Gauges
Database-derived workflow health metrics should be sampled periodically instead of emitted per request.
Initial gauges should cover:
- content item counts by status
- feedback report counts by status
- pending workspace invites
- content stale in approval
- active workspace counts over 24-hour and 7-day windows
These are operator health signals. They should remain aggregated enough to avoid high-cardinality metric labels.

View File

@@ -0,0 +1,163 @@
# Observability Runbook
## Purpose
This runbook is for preproduction operation of Socialize's self-hosted observability stack.
The goal is to answer:
- Is the app reachable?
- Is the API healthy?
- Are errors or latency rising?
- Are users exercising core workflows?
- Are emails, blob storage, and background jobs failing?
- Is work getting stuck?
## Start The Stack
Run from the repository root on the preproduction host:
```bash
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d
```
Grafana listens on `127.0.0.1:3000` by default. Set `GRAFANA_HTTP_BIND=0.0.0.0`
only when Grafana is protected by a reverse proxy, VPN, firewall rule, or SSH tunnel.
Set these before exposing Grafana:
```bash
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=<strong-password>
```
## Alert Delivery
Prometheus sends alerts to Alertmanager. Alertmanager sends alerts to the webhook
configured by:
```bash
ALERTMANAGER_WEBHOOK_URL=<private-alert-webhook-url>
```
If no webhook URL is configured, Alertmanager still starts but alert delivery points
to a local discard endpoint.
Critical alerts repeat every 30 minutes. Other alerts repeat every 4 hours.
## Secure Grafana With Caddy
An optional Caddy snippet is available at:
```txt
deploy/observability/caddy/grafana.Caddyfile
```
Generate a Caddy password hash:
```bash
caddy hash-password --plaintext '<password>'
```
Configure:
```bash
OBSERVABILITY_HOST=observability.example.com
GRAFANA_BASIC_AUTH_USER=<user>
GRAFANA_BASIC_AUTH_HASH=<hash>
```
Keep Grafana private unless the hostname is protected.
## First Bring-Up Checks
1. Confirm containers are running:
```bash
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml ps
```
2. Check API health:
```bash
curl -i http://127.0.0.1:8080/health
curl -i http://127.0.0.1:8080/health/ready
```
3. Open Grafana and check the `Socialize Overview` dashboard.
4. Generate a few real actions:
- log in
- create a content item
- add a comment
- submit feedback
- create a workspace invite
5. Confirm metrics appear in the dashboard:
- API request rate
- usage signals
- workflow backlog
- operational events
## Alert Triage
`SocializePreprodEndpointDown`
- Check `docker compose ps`.
- Check `docker compose logs api web`.
- Check `/health/ready`.
`SocializeApiTelemetryMissing`
- Check that `api` has `OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317`.
- Check `docker compose logs alloy`.
- Check whether the API is receiving traffic.
`SocializeApiHighErrorRate`
- Open the API logs panel.
- Filter by recent `5xx` requests.
- Open Tempo traces for slow or failing requests if available.
`SocializeApiHighLatency`
- Check the p95 latency by endpoint panel.
- Inspect slow traces.
- Check database health and recent deploy activity.
`SocializeEmailDeliveryFailures`
- Check API logs for Resend failures.
- Confirm `RESEND_API_KEY` and `RESEND_FROM_EMAIL`.
- Confirm Resend service status outside this stack if needed.
`SocializeBlobStorageFailures`
- Confirm `./blob-storage` volume permissions on the preprod host.
- Check local disk space.
- Check API logs for validation or filesystem errors.
`SocializeBackgroundJobFailures`
- Check the operational events panel for the failing job name.
- Check API logs for the same time window.
`SocializeContentStaleInApproval`
- Use the app to inspect content currently in approval.
- Contact the relevant internal owner or client contact outside the app if needed.
`SocializeCoreUsageQuiet` or `SocializeNoActiveWorkspaces`
- Confirm whether quiet usage is expected for the period.
- If not expected, check login events and API reachability.
## Retention Defaults
- Prometheus keeps 15 days by default through `PROMETHEUS_RETENTION`.
- Tempo keeps traces for 168 hours.
- Loki uses local filesystem storage for preproduction.
Tune retention before heavy customer usage or long-running demos.

View File

@@ -0,0 +1,34 @@
# Observability 003: Preprod Operations Loop
## Goal
Close the preproduction operations loop by adding alert delivery scaffolding, uptime probes, workflow health gauges, secured Grafana guidance, and an operator runbook.
## Feature Spec
- `docs/FEATURES/observability.md`
## Scope
- Add Alertmanager to the optional observability compose overlay.
- Add Blackbox Exporter uptime probes for the web container and API readiness endpoint.
- Add backend database-derived workflow health gauges.
- Add Prometheus alerts for uptime probes and workflow health.
- Add an optional Caddy snippet for protected Grafana exposure.
- Add an operator runbook for bring-up, alert triage, and security defaults.
## Out Of Scope
- Operating the remote preproduction host.
- Choosing the final alert destination.
- Client-facing status page.
- External third-party uptime monitoring.
## Validation
```bash
dotnet build backend/Socialize.slnx
dotnet test backend/Socialize.slnx
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
jq empty deploy/observability/grafana/dashboards/socialize-overview.json
```