feat: close preprod observability loop
This commit is contained in:
@@ -113,6 +113,10 @@ host. Prometheus alert rules are provisioned under
|
|||||||
`deploy/observability/prometheus/rules/`; notification delivery is intentionally
|
`deploy/observability/prometheus/rules/`; notification delivery is intentionally
|
||||||
left to the preprod operations environment.
|
left to the preprod operations environment.
|
||||||
|
|
||||||
|
Set `ALERTMANAGER_WEBHOOK_URL` to route alerts to a private notification endpoint.
|
||||||
|
See `docs/OPERATIONS/observability-runbook.md` for bring-up, alert triage, and
|
||||||
|
the optional protected Caddy configuration for Grafana.
|
||||||
|
|
||||||
## Solution
|
## Solution
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ internal static class ObservabilityRegistration
|
|||||||
}
|
}
|
||||||
|
|
||||||
builder.Services.AddSingleton<SocializeMetrics>();
|
builder.Services.AddSingleton<SocializeMetrics>();
|
||||||
|
builder.Services.AddHostedService<WorkflowHealthSamplerService>();
|
||||||
builder.Services
|
builder.Services
|
||||||
.AddOpenTelemetry()
|
.AddOpenTelemetry()
|
||||||
.ConfigureResource(resource => resource.AddService(
|
.ConfigureResource(resource => resource.AddService(
|
||||||
|
|||||||
@@ -19,6 +19,8 @@ internal sealed class SocializeMetrics : IDisposable
|
|||||||
private readonly Counter<long> _organizationCreatedCounter;
|
private readonly Counter<long> _organizationCreatedCounter;
|
||||||
private readonly Counter<long> _workspaceCreatedCounter;
|
private readonly Counter<long> _workspaceCreatedCounter;
|
||||||
private readonly Counter<long> _workspaceInviteCreatedCounter;
|
private readonly Counter<long> _workspaceInviteCreatedCounter;
|
||||||
|
private readonly object _workflowHealthLock = new();
|
||||||
|
private WorkflowHealthSnapshot _workflowHealthSnapshot = WorkflowHealthSnapshot.Empty;
|
||||||
|
|
||||||
public SocializeMetrics()
|
public SocializeMetrics()
|
||||||
{
|
{
|
||||||
@@ -58,6 +60,27 @@ internal sealed class SocializeMetrics : IDisposable
|
|||||||
_backgroundJobRunCounter = Meter.CreateCounter<long>(
|
_backgroundJobRunCounter = Meter.CreateCounter<long>(
|
||||||
"socialize.background_job.runs",
|
"socialize.background_job.runs",
|
||||||
description: "Background job runs partitioned by job and outcome.");
|
description: "Background job runs partitioned by job and outcome.");
|
||||||
|
|
||||||
|
Meter.CreateObservableGauge(
|
||||||
|
"socialize.workflow.content_items",
|
||||||
|
ObserveContentItemCounts,
|
||||||
|
description: "Current content item counts by status.");
|
||||||
|
Meter.CreateObservableGauge(
|
||||||
|
"socialize.workflow.feedback_reports",
|
||||||
|
ObserveFeedbackReportCounts,
|
||||||
|
description: "Current feedback report counts by status.");
|
||||||
|
Meter.CreateObservableGauge(
|
||||||
|
"socialize.workflow.pending_invites",
|
||||||
|
ObservePendingInviteCount,
|
||||||
|
description: "Current pending workspace invite count.");
|
||||||
|
Meter.CreateObservableGauge(
|
||||||
|
"socialize.workflow.stale_in_approval",
|
||||||
|
ObserveStaleApprovalCount,
|
||||||
|
description: "Current count of content items in approval longer than the configured stale threshold.");
|
||||||
|
Meter.CreateObservableGauge(
|
||||||
|
"socialize.workflow.active_workspaces",
|
||||||
|
ObserveActiveWorkspaceCounts,
|
||||||
|
description: "Current active workspace counts by observation window.");
|
||||||
}
|
}
|
||||||
|
|
||||||
public Meter Meter { get; }
|
public Meter Meter { get; }
|
||||||
@@ -150,9 +173,86 @@ internal sealed class SocializeMetrics : IDisposable
|
|||||||
new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure"));
|
new KeyValuePair<string, object?>("outcome", succeeded ? "success" : "failure"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Replaces the workflow health snapshot that the observable gauges report.
/// </summary>
/// <param name="snapshot">The snapshot to expose from now on.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="snapshot"/> is <see langword="null"/>.
/// </exception>
public void UpdateWorkflowHealth(WorkflowHealthSnapshot snapshot)
{
    // Fail fast at the call site: the gauge callbacks dereference the stored snapshot,
    // so a null here would otherwise surface later as a NullReferenceException while
    // metrics are being collected.
    ArgumentNullException.ThrowIfNull(snapshot);

    lock (_workflowHealthLock)
    {
        _workflowHealthSnapshot = snapshot;
    }
}
|
||||||
|
|
||||||
public void Dispose()
|
public void Dispose()
|
||||||
{
|
{
|
||||||
Meter.Dispose();
|
Meter.Dispose();
|
||||||
ActivitySource.Dispose();
|
ActivitySource.Dispose();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gauge callback that reports the current content item count for each status.
/// </summary>
/// <returns>
/// One measurement per entry in the snapshot's <c>ContentItemsByStatus</c>, tagged with
/// the entry key as <c>status</c>.
/// </returns>
private Measurement<int>[] ObserveContentItemCounts()
{
    IReadOnlyDictionary<string, int> countsByStatus = GetWorkflowHealthSnapshot().ContentItemsByStatus;
    var measurements = new Measurement<int>[countsByStatus.Count];
    int index = 0;

    foreach (KeyValuePair<string, int> entry in countsByStatus)
    {
        measurements[index] = new Measurement<int>(
            entry.Value,
            new KeyValuePair<string, object?>("status", entry.Key));
        index++;
    }

    return measurements;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gauge callback that reports the current feedback report count for each status.
/// </summary>
/// <returns>
/// One measurement per entry in the snapshot's <c>FeedbackReportsByStatus</c>, tagged with
/// the entry key as <c>status</c>.
/// </returns>
private Measurement<int>[] ObserveFeedbackReportCounts()
{
    IReadOnlyDictionary<string, int> countsByStatus = GetWorkflowHealthSnapshot().FeedbackReportsByStatus;
    var measurements = new Measurement<int>[countsByStatus.Count];
    int index = 0;

    foreach (KeyValuePair<string, int> entry in countsByStatus)
    {
        measurements[index] = new Measurement<int>(
            entry.Value,
            new KeyValuePair<string, object?>("status", entry.Key));
        index++;
    }

    return measurements;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gauge callback that reports the pending invite count from the current snapshot.
/// </summary>
/// <returns>A single untagged measurement holding <c>PendingInviteCount</c>.</returns>
private Measurement<int> ObservePendingInviteCount()
{
    int pendingInvites = GetWorkflowHealthSnapshot().PendingInviteCount;
    return new Measurement<int>(pendingInvites);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gauge callback that reports the stale-in-approval count from the current snapshot.
/// </summary>
/// <returns>A single untagged measurement holding <c>StaleInApprovalCount</c>.</returns>
private Measurement<int> ObserveStaleApprovalCount()
{
    int staleInApproval = GetWorkflowHealthSnapshot().StaleInApprovalCount;
    return new Measurement<int>(staleInApproval);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Gauge callback that reports active workspace counts for both observation windows.
/// </summary>
/// <returns>
/// Two measurements, in order: the 24-hour count tagged <c>window=24h</c> and the 7-day
/// count tagged <c>window=7d</c>.
/// </returns>
private Measurement<int>[] ObserveActiveWorkspaceCounts()
{
    WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();

    var last24Hours = new Measurement<int>(
        current.ActiveWorkspaces24Hours,
        new KeyValuePair<string, object?>("window", "24h"));

    var last7Days = new Measurement<int>(
        current.ActiveWorkspaces7Days,
        new KeyValuePair<string, object?>("window", "7d"));

    return new Measurement<int>[] { last24Hours, last7Days };
}
|
||||||
|
|
||||||
|
/// <summary>
/// Reads the current workflow health snapshot under the same lock used to replace it.
/// </summary>
/// <returns>The snapshot most recently stored in <c>_workflowHealthSnapshot</c>.</returns>
private WorkflowHealthSnapshot GetWorkflowHealthSnapshot()
{
    WorkflowHealthSnapshot current;

    lock (_workflowHealthLock)
    {
        current = _workflowHealthSnapshot;
    }

    return current;
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Immutable set of aggregate workflow health counts reported through the workflow gauges.
/// </summary>
/// <param name="ContentItemsByStatus">Content item counts keyed by status.</param>
/// <param name="FeedbackReportsByStatus">Feedback report counts keyed by status.</param>
/// <param name="PendingInviteCount">Number of pending workspace invites.</param>
/// <param name="StaleInApprovalCount">Number of content items counted as stale in approval.</param>
/// <param name="ActiveWorkspaces24Hours">Active workspace count for the 24-hour window.</param>
/// <param name="ActiveWorkspaces7Days">Active workspace count for the 7-day window.</param>
internal sealed record WorkflowHealthSnapshot(
    IReadOnlyDictionary<string, int> ContentItemsByStatus,
    IReadOnlyDictionary<string, int> FeedbackReportsByStatus,
    int PendingInviteCount,
    int StaleInApprovalCount,
    int ActiveWorkspaces24Hours,
    int ActiveWorkspaces7Days)
{
    /// <summary>
    /// Shared snapshot with no status entries and every count set to zero.
    /// </summary>
    public static WorkflowHealthSnapshot Empty { get; } = new(
        ContentItemsByStatus: new Dictionary<string, int>(StringComparer.Ordinal),
        FeedbackReportsByStatus: new Dictionary<string, int>(StringComparer.Ordinal),
        PendingInviteCount: 0,
        StaleInApprovalCount: 0,
        ActiveWorkspaces24Hours: 0,
        ActiveWorkspaces7Days: 0);
}
|
||||||
|
|||||||
@@ -0,0 +1,102 @@
|
|||||||
|
using Microsoft.EntityFrameworkCore;
|
||||||
|
using Socialize.Api.Data;
|
||||||
|
using Socialize.Api.Modules.Feedback.Data;
|
||||||
|
using Socialize.Api.Modules.Workspaces.Data;
|
||||||
|
|
||||||
|
namespace Socialize.Api.Infrastructure.Observability;
|
||||||
|
|
||||||
|
/// <summary>
/// Background service that periodically queries the database for aggregate workflow health
/// counts and publishes them to <see cref="SocializeMetrics"/> as a
/// <see cref="WorkflowHealthSnapshot"/>.
/// </summary>
/// <param name="scopeFactory">Creates the DI scope each sample resolves its <c>AppDbContext</c> from.</param>
/// <param name="metrics">Receives the sampled snapshot and the per-run success/failure record.</param>
/// <param name="logger">Receives shutdown and failure log entries.</param>
internal sealed class WorkflowHealthSamplerService(
    IServiceScopeFactory scopeFactory,
    SocializeMetrics metrics,
    ILogger<WorkflowHealthSamplerService> logger)
    : BackgroundService
{
    // Time between samples after the initial one taken at startup.
    private static readonly TimeSpan SampleInterval = TimeSpan.FromMinutes(5);

    // Age cutoff applied to items whose status is "In approval".
    private static readonly TimeSpan StaleApprovalThreshold = TimeSpan.FromDays(3);

    /// <summary>
    /// Takes one sample immediately, then one per <see cref="SampleInterval"/> until the host
    /// requests shutdown.
    /// </summary>
    /// <param name="stoppingToken">Signalled when the host is stopping.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        await SampleAsync(stoppingToken);

        using PeriodicTimer timer = new(SampleInterval);
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await timer.WaitForNextTickAsync(stoppingToken);
                await SampleAsync(stoppingToken);
            }
            catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
            {
                // Cancellation while waiting for the next tick is the normal shutdown path.
                logger.LogDebug(ex, "Workflow health sampler stopped.");
            }
        }
    }

    /// <summary>
    /// Runs one sampling pass: queries the aggregate counts, publishes the snapshot, and
    /// records the run outcome. Failures are logged and recorded rather than rethrown, so a
    /// failed pass does not end the sampling loop.
    /// </summary>
    /// <param name="stoppingToken">Cancels the in-flight database queries on shutdown.</param>
    private async Task SampleAsync(CancellationToken stoppingToken)
    {
        try
        {
            // Async scope + await using: scoped services resolved here are disposed through
            // DisposeAsync where they support it, instead of being disposed synchronously
            // from this async method.
            await using AsyncServiceScope scope = scopeFactory.CreateAsyncScope();
            AppDbContext dbContext = scope.ServiceProvider.GetRequiredService<AppDbContext>();

            // All cutoffs derive from a single "now" so the windows are mutually consistent.
            DateTimeOffset now = DateTimeOffset.UtcNow;
            DateTimeOffset staleApprovalCutoff = now.Subtract(StaleApprovalThreshold);
            DateTimeOffset active24HourCutoff = now.AddHours(-24);
            DateTimeOffset active7DayCutoff = now.AddDays(-7);

            Dictionary<string, int> contentItemsByStatus = await dbContext.ContentItems
                .GroupBy(item => item.Status)
                .Select(group => new { Status = group.Key, Count = group.Count() })
                .ToDictionaryAsync(group => group.Status, group => group.Count, StringComparer.Ordinal, stoppingToken);

            // NOTE(review): the WontDo special case looks equivalent to ToString() for an
            // enum member of that name — confirm whether it is still needed.
            Dictionary<string, int> feedbackReportsByStatus = await dbContext.FeedbackReports
                .GroupBy(report => report.Status)
                .Select(group => new { Status = group.Key, Count = group.Count() })
                .ToDictionaryAsync(
                    group => group.Status == FeedbackStatus.WontDo ? "WontDo" : group.Status.ToString(),
                    group => group.Count,
                    StringComparer.Ordinal,
                    stoppingToken);

            int pendingInviteCount = await dbContext.WorkspaceInvites
                .CountAsync(invite => invite.Status == WorkspaceInviteStatuses.Pending, stoppingToken);

            // NOTE(review): this filters on CreatedAt, i.e. the item's age, not the time it
            // entered approval. An old item that only just moved to "In approval" is counted
            // as stale immediately — confirm this is intended or switch to a status-change
            // timestamp if the entity has one.
            int staleInApprovalCount = await dbContext.ContentItems
                .CountAsync(
                    item => item.Status == "In approval" && item.CreatedAt <= staleApprovalCutoff,
                    stoppingToken);

            int activeWorkspaces24Hours = await dbContext.ContentItemActivityEntries
                .Where(entry => entry.CreatedAt >= active24HourCutoff)
                .Select(entry => entry.WorkspaceId)
                .Distinct()
                .CountAsync(stoppingToken);

            int activeWorkspaces7Days = await dbContext.ContentItemActivityEntries
                .Where(entry => entry.CreatedAt >= active7DayCutoff)
                .Select(entry => entry.WorkspaceId)
                .Distinct()
                .CountAsync(stoppingToken);

            metrics.UpdateWorkflowHealth(new WorkflowHealthSnapshot(
                contentItemsByStatus,
                feedbackReportsByStatus,
                pendingInviteCount,
                staleInApprovalCount,
                activeWorkspaces24Hours,
                activeWorkspaces7Days));
            metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), true);
        }
        catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
        {
            // Shutdown interrupted a query; not recorded as a failed run.
            logger.LogDebug(ex, "Workflow health sampler stopped.");
        }
#pragma warning disable CA1031
        catch (Exception ex)
        {
            // Deliberate catch-all: the previous snapshot stays published and the next tick
            // retries, while the failure is visible through the job-run counter and the log.
            metrics.RecordBackgroundJobRun(nameof(WorkflowHealthSamplerService), false);
            logger.LogError(ex, "Workflow health sampling failed.");
        }
#pragma warning restore CA1031
    }
}
|
||||||
22
deploy/observability/alertmanager/alertmanager.yml
Normal file
22
deploy/observability/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
global:
|
||||||
|
resolve_timeout: 5m
|
||||||
|
|
||||||
|
route:
|
||||||
|
receiver: preprod-webhook
|
||||||
|
group_by:
|
||||||
|
- alertname
|
||||||
|
- service
|
||||||
|
group_wait: 30s
|
||||||
|
group_interval: 5m
|
||||||
|
repeat_interval: 4h
|
||||||
|
routes:
|
||||||
|
- matchers:
|
||||||
|
- severity="critical"
|
||||||
|
receiver: preprod-webhook
|
||||||
|
repeat_interval: 30m
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
- name: preprod-webhook
|
||||||
|
webhook_configs:
|
||||||
|
- url: ${ALERTMANAGER_WEBHOOK_URL}
|
||||||
|
send_resolved: true
|
||||||
9
deploy/observability/blackbox/config.yml
Normal file
9
deploy/observability/blackbox/config.yml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
modules:
|
||||||
|
http_2xx:
|
||||||
|
prober: http
|
||||||
|
timeout: 5s
|
||||||
|
http:
|
||||||
|
method: GET
|
||||||
|
preferred_ip_protocol: ip4
|
||||||
|
valid_status_codes:
|
||||||
|
- 200
|
||||||
13
deploy/observability/caddy/grafana.Caddyfile
Normal file
13
deploy/observability/caddy/grafana.Caddyfile
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Optional Caddy snippet for exposing Grafana through a protected hostname.
|
||||||
|
# Generate a password hash with:
|
||||||
|
# caddy hash-password --plaintext '<password>'
|
||||||
|
|
||||||
|
{$OBSERVABILITY_HOST} {
|
||||||
|
encode gzip zstd
|
||||||
|
|
||||||
|
basicauth {
|
||||||
|
{$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
|
||||||
|
}
|
||||||
|
|
||||||
|
reverse_proxy grafana:3000
|
||||||
|
}
|
||||||
@@ -26,6 +26,7 @@ services:
|
|||||||
- prometheus
|
- prometheus
|
||||||
- loki
|
- loki
|
||||||
- tempo
|
- tempo
|
||||||
|
- alertmanager
|
||||||
networks:
|
networks:
|
||||||
- internal
|
- internal
|
||||||
|
|
||||||
@@ -44,6 +45,31 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
- internal
|
- internal
|
||||||
|
|
||||||
|
alertmanager:
|
||||||
|
image: prom/alertmanager:v0.29.0
|
||||||
|
restart: unless-stopped
|
||||||
|
command:
|
||||||
|
- --config.file=/etc/alertmanager/alertmanager.yml
|
||||||
|
- --storage.path=/alertmanager
|
||||||
|
# NOTE(review): confirm this Alertmanager release accepts --config.expand-env.
# Upstream Alertmanager has historically not expanded environment variables in its
# config file; if the flag is rejected the container will not start, and the
# ${ALERTMANAGER_WEBHOOK_URL} placeholder in alertmanager.yml will not be substituted.
- --config.expand-env
|
||||||
|
environment:
|
||||||
|
# Falls back to a placeholder URL when the operator has not set a webhook.
ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
|
||||||
|
volumes:
|
||||||
|
- alertmanager-data:/alertmanager
|
||||||
|
- ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
||||||
|
networks:
|
||||||
|
- internal
|
||||||
|
|
||||||
|
blackbox:
|
||||||
|
image: prom/blackbox-exporter:v0.27.0
|
||||||
|
restart: unless-stopped
|
||||||
|
command:
|
||||||
|
- --config.file=/etc/blackbox_exporter/config.yml
|
||||||
|
volumes:
|
||||||
|
- ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
|
||||||
|
networks:
|
||||||
|
- internal
|
||||||
|
|
||||||
loki:
|
loki:
|
||||||
image: grafana/loki:3.7.1
|
image: grafana/loki:3.7.1
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
@@ -84,6 +110,7 @@ services:
|
|||||||
- internal
|
- internal
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
|
alertmanager-data:
|
||||||
grafana-data:
|
grafana-data:
|
||||||
prometheus-data:
|
prometheus-data:
|
||||||
loki-data:
|
loki-data:
|
||||||
|
|||||||
@@ -333,6 +333,78 @@
|
|||||||
"title": "Operational Events, 1h Rolling",
|
"title": "Operational Events, 1h Rolling",
|
||||||
"type": "timeseries"
|
"type": "timeseries"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "Prometheus"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 0,
|
||||||
|
"y": 20
|
||||||
|
},
|
||||||
|
"id": 11,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "socialize_workflow_content_items",
|
||||||
|
"legendFormat": "content {{status}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "socialize_workflow_feedback_reports",
|
||||||
|
"legendFormat": "feedback {{status}}"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Workflow Backlog",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"datasource": {
|
||||||
|
"type": "prometheus",
|
||||||
|
"uid": "Prometheus"
|
||||||
|
},
|
||||||
|
"gridPos": {
|
||||||
|
"h": 8,
|
||||||
|
"w": 12,
|
||||||
|
"x": 12,
|
||||||
|
"y": 20
|
||||||
|
},
|
||||||
|
"id": 12,
|
||||||
|
"options": {
|
||||||
|
"legend": {
|
||||||
|
"displayMode": "list",
|
||||||
|
"placement": "bottom"
|
||||||
|
},
|
||||||
|
"tooltip": {
|
||||||
|
"mode": "multi"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "socialize_workflow_active_workspaces",
|
||||||
|
"legendFormat": "active workspaces {{window}}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "socialize_workflow_stale_in_approval",
|
||||||
|
"legendFormat": "stale in approval"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"expr": "socialize_workflow_pending_invites",
|
||||||
|
"legendFormat": "pending invites"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Workflow Health",
|
||||||
|
"type": "timeseries"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"datasource": {
|
"datasource": {
|
||||||
"type": "prometheus",
|
"type": "prometheus",
|
||||||
@@ -342,7 +414,7 @@
|
|||||||
"h": 7,
|
"h": 7,
|
||||||
"w": 24,
|
"w": 24,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 20
|
"y": 28
|
||||||
},
|
},
|
||||||
"id": 9,
|
"id": 9,
|
||||||
"options": {
|
"options": {
|
||||||
@@ -368,7 +440,7 @@
|
|||||||
"h": 9,
|
"h": 9,
|
||||||
"w": 24,
|
"w": 24,
|
||||||
"x": 0,
|
"x": 0,
|
||||||
"y": 27
|
"y": 35
|
||||||
},
|
},
|
||||||
"id": 10,
|
"id": 10,
|
||||||
"options": {
|
"options": {
|
||||||
|
|||||||
@@ -2,6 +2,12 @@ global:
|
|||||||
scrape_interval: 15s
|
scrape_interval: 15s
|
||||||
evaluation_interval: 15s
|
evaluation_interval: 15s
|
||||||
|
|
||||||
|
alerting:
|
||||||
|
alertmanagers:
|
||||||
|
- static_configs:
|
||||||
|
- targets:
|
||||||
|
- alertmanager:9093
|
||||||
|
|
||||||
rule_files:
|
rule_files:
|
||||||
- /etc/prometheus/rules/*.yml
|
- /etc/prometheus/rules/*.yml
|
||||||
|
|
||||||
@@ -15,3 +21,22 @@ scrape_configs:
|
|||||||
static_configs:
|
static_configs:
|
||||||
- targets:
|
- targets:
|
||||||
- alloy:12345
|
- alloy:12345
|
||||||
|
|
||||||
|
- job_name: preprod-uptime
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module:
|
||||||
|
- http_2xx
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- http://web/
|
||||||
|
- http://api:8080/health/ready
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels:
|
||||||
|
- __address__
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels:
|
||||||
|
- __param_target
|
||||||
|
target_label: instance
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: blackbox:9115
|
||||||
|
|||||||
@@ -11,6 +11,16 @@ groups:
|
|||||||
summary: Socialize API telemetry is missing
|
summary: Socialize API telemetry is missing
|
||||||
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
|
description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.
|
||||||
|
|
||||||
|
- alert: SocializePreprodEndpointDown
|
||||||
|
expr: probe_success{job="preprod-uptime"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
service: socialize-preprod
|
||||||
|
annotations:
|
||||||
|
summary: Preprod endpoint is down
|
||||||
|
description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'
|
||||||
|
|
||||||
- alert: SocializeApiHighErrorRate
|
- alert: SocializeApiHighErrorRate
|
||||||
expr: |
|
expr: |
|
||||||
(
|
(
|
||||||
@@ -56,6 +66,26 @@ groups:
|
|||||||
summary: Socialize core usage is quiet
|
summary: Socialize core usage is quiet
|
||||||
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
|
description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.
|
||||||
|
|
||||||
|
- alert: SocializeContentStaleInApproval
|
||||||
|
expr: socialize_workflow_stale_in_approval > 0
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
service: socialize-api
|
||||||
|
annotations:
|
||||||
|
summary: Content is stale in approval
|
||||||
|
description: One or more content items have been in approval longer than the configured threshold.
|
||||||
|
|
||||||
|
- alert: SocializeNoActiveWorkspaces
|
||||||
|
expr: socialize_workflow_active_workspaces{window="24h"} < 1
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
service: socialize-api
|
||||||
|
annotations:
|
||||||
|
summary: No active workspaces in the last 24 hours
|
||||||
|
description: No workspace has content workflow activity in the last 24 hours.
|
||||||
|
|
||||||
- alert: SocializeFeedbackBugSubmitted
|
- alert: SocializeFeedbackBugSubmitted
|
||||||
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
|
expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
|
||||||
for: 0m
|
for: 0m
|
||||||
|
|||||||
@@ -78,3 +78,17 @@ Initial alerts should cover:
|
|||||||
- email delivery failures
|
- email delivery failures
|
||||||
- blob storage failures
|
- blob storage failures
|
||||||
- background job failures
|
- background job failures
|
||||||
|
|
||||||
|
## Workflow Health Gauges
|
||||||
|
|
||||||
|
Database-derived workflow health metrics should be sampled periodically instead of emitted per request.
|
||||||
|
|
||||||
|
Initial gauges should cover:
|
||||||
|
|
||||||
|
- content item counts by status
|
||||||
|
- feedback report counts by status
|
||||||
|
- pending workspace invites
|
||||||
|
- content stale in approval
|
||||||
|
- active workspace counts over 24-hour and 7-day windows
|
||||||
|
|
||||||
|
These are operator health signals. They should stay aggregate enough to avoid high-cardinality metric labels.
|
||||||
|
|||||||
163
docs/OPERATIONS/observability-runbook.md
Normal file
163
docs/OPERATIONS/observability-runbook.md
Normal file
@@ -0,0 +1,163 @@
|
|||||||
|
# Observability Runbook
|
||||||
|
|
||||||
|
## Purpose
|
||||||
|
|
||||||
|
This runbook is for preproduction operation of Socialize's self-hosted observability stack.
|
||||||
|
|
||||||
|
The goal is to answer:
|
||||||
|
|
||||||
|
- Is the app reachable?
|
||||||
|
- Is the API healthy?
|
||||||
|
- Are errors or latency rising?
|
||||||
|
- Are users exercising core workflows?
|
||||||
|
- Are emails, blob storage, and background jobs failing?
|
||||||
|
- Is work getting stuck?
|
||||||
|
|
||||||
|
## Start The Stack
|
||||||
|
|
||||||
|
Run from the repository root on the preproduction host:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
Grafana listens on `127.0.0.1:3000` by default. Set `GRAFANA_HTTP_BIND=0.0.0.0`
|
||||||
|
only when Grafana is protected by a reverse proxy, VPN, firewall rule, or SSH tunnel.
|
||||||
|
|
||||||
|
Set these before exposing Grafana:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
GRAFANA_ADMIN_USER=admin
|
||||||
|
GRAFANA_ADMIN_PASSWORD=<strong-password>
|
||||||
|
```
|
||||||
|
|
||||||
|
## Alert Delivery
|
||||||
|
|
||||||
|
Prometheus sends alerts to Alertmanager. Alertmanager sends alerts to the webhook
|
||||||
|
configured by:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ALERTMANAGER_WEBHOOK_URL=<private-alert-webhook-url>
|
||||||
|
```
|
||||||
|
|
||||||
|
If no webhook URL is configured, Alertmanager still starts, but the webhook URL falls back
|
||||||
|
to the placeholder `http://127.0.0.1:9/`, so no alert notifications are delivered.
|
||||||
|
|
||||||
|
Critical alerts repeat every 30 minutes. Other alerts repeat every 4 hours.
|
||||||
|
|
||||||
|
## Secure Grafana With Caddy
|
||||||
|
|
||||||
|
An optional Caddy snippet is available at:
|
||||||
|
|
||||||
|
```txt
|
||||||
|
deploy/observability/caddy/grafana.Caddyfile
|
||||||
|
```
|
||||||
|
|
||||||
|
Generate a Caddy password hash:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
caddy hash-password --plaintext '<password>'
|
||||||
|
```
|
||||||
|
|
||||||
|
Configure:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
OBSERVABILITY_HOST=observability.example.com
|
||||||
|
GRAFANA_BASIC_AUTH_USER=<user>
|
||||||
|
GRAFANA_BASIC_AUTH_HASH=<hash>
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep Grafana private unless the hostname is protected.
|
||||||
|
|
||||||
|
## First Bring-Up Checks
|
||||||
|
|
||||||
|
1. Confirm containers are running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml ps
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Check API health:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -i http://127.0.0.1:8080/health
|
||||||
|
curl -i http://127.0.0.1:8080/health/ready
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Open Grafana and check the `Socialize Overview` dashboard.
|
||||||
|
|
||||||
|
4. Generate a few real actions:
|
||||||
|
|
||||||
|
- log in
|
||||||
|
- create a content item
|
||||||
|
- add a comment
|
||||||
|
- submit feedback
|
||||||
|
- create a workspace invite
|
||||||
|
|
||||||
|
5. Confirm metrics appear in the dashboard:
|
||||||
|
|
||||||
|
- API request rate
|
||||||
|
- usage signals
|
||||||
|
- workflow backlog
|
||||||
|
- operational events
|
||||||
|
|
||||||
|
## Alert Triage
|
||||||
|
|
||||||
|
`SocializePreprodEndpointDown`
|
||||||
|
|
||||||
|
- Check `docker compose ps`.
|
||||||
|
- Check `docker compose logs api web`.
|
||||||
|
- Check `/health/ready`.
|
||||||
|
|
||||||
|
`SocializeApiTelemetryMissing`
|
||||||
|
|
||||||
|
- Check that `api` has `OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317`.
|
||||||
|
- Check `docker compose logs alloy`.
|
||||||
|
- Check whether the API is receiving traffic.
|
||||||
|
|
||||||
|
`SocializeApiHighErrorRate`
|
||||||
|
|
||||||
|
- Open the API logs panel.
|
||||||
|
- Filter by recent `5xx` requests.
|
||||||
|
- Open Tempo traces for slow or failing requests if available.
|
||||||
|
|
||||||
|
`SocializeApiHighLatency`
|
||||||
|
|
||||||
|
- Check the p95 latency by endpoint panel.
|
||||||
|
- Inspect slow traces.
|
||||||
|
- Check database health and recent deploy activity.
|
||||||
|
|
||||||
|
`SocializeEmailDeliveryFailures`
|
||||||
|
|
||||||
|
- Check API logs for Resend failures.
|
||||||
|
- Confirm `RESEND_API_KEY` and `RESEND_FROM_EMAIL`.
|
||||||
|
- Confirm Resend service status outside this stack if needed.
|
||||||
|
|
||||||
|
`SocializeBlobStorageFailures`
|
||||||
|
|
||||||
|
- Confirm `./blob-storage` volume permissions on the preprod host.
|
||||||
|
- Check local disk space.
|
||||||
|
- Check API logs for validation or filesystem errors.
|
||||||
|
|
||||||
|
`SocializeBackgroundJobFailures`
|
||||||
|
|
||||||
|
- Check the operational events panel for the failing job name.
|
||||||
|
- Check API logs for the same time window.
|
||||||
|
|
||||||
|
`SocializeContentStaleInApproval`
|
||||||
|
|
||||||
|
- Use the app to inspect content currently in approval.
|
||||||
|
- Contact the relevant internal owner or client contact outside the app if needed.
|
||||||
|
|
||||||
|
`SocializeCoreUsageQuiet` or `SocializeNoActiveWorkspaces`
|
||||||
|
|
||||||
|
- Confirm whether quiet usage is expected for the period.
|
||||||
|
- If not expected, check login events and API reachability.
|
||||||
|
|
||||||
|
## Retention Defaults
|
||||||
|
|
||||||
|
- Prometheus keeps 15 days by default through `PROMETHEUS_RETENTION`.
|
||||||
|
- Tempo keeps traces for 168 hours.
|
||||||
|
- Loki uses local filesystem storage for preproduction.
|
||||||
|
|
||||||
|
Tune retention before heavy customer usage or long-running demos.
|
||||||
34
docs/TASKS/observability/003-preprod-operations-loop.md
Normal file
34
docs/TASKS/observability/003-preprod-operations-loop.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Observability 003: Preprod Operations Loop
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Close the preproduction operations loop by adding alert delivery scaffolding, uptime probes, workflow health gauges, secured Grafana guidance, and an operator runbook.
|
||||||
|
|
||||||
|
## Feature Spec
|
||||||
|
|
||||||
|
- `docs/FEATURES/observability.md`
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
- Add Alertmanager to the optional observability compose overlay.
|
||||||
|
- Add Blackbox Exporter uptime probes for the web container and API readiness endpoint.
|
||||||
|
- Add backend database-derived workflow health gauges.
|
||||||
|
- Add Prometheus alerts for uptime probes and workflow health.
|
||||||
|
- Add an optional Caddy snippet for protected Grafana exposure.
|
||||||
|
- Add an operator runbook for bring-up, alert triage, and security defaults.
|
||||||
|
|
||||||
|
## Out Of Scope
|
||||||
|
|
||||||
|
- Operating the remote preproduction host.
|
||||||
|
- Choosing the final alert destination.
|
||||||
|
- Client-facing status page.
|
||||||
|
- External third-party uptime monitoring.
|
||||||
|
|
||||||
|
## Validation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dotnet build backend/Socialize.slnx
|
||||||
|
dotnet test backend/Socialize.slnx
|
||||||
|
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
|
||||||
|
jq empty deploy/observability/grafana/dashboards/socialize-overview.json
|
||||||
|
```
|
||||||
Reference in New Issue
Block a user