2 Commits

41 changed files with 2243 additions and 56 deletions

View File

@@ -82,6 +82,41 @@ The deploy workflow writes the remote `.env` file and syncs `deploy/compose.yml`
before running the server deploy script.
Use the raw Resend API key value for `RESEND_API_KEY`, without a `Bearer ` prefix.
## Preprod Observability
The optional observability overlay runs a self-hosted Grafana stack for preproduction:
- Grafana `13.0.1`: dashboards
- Prometheus `v3.11.3`: metrics and local alert rules
- Loki `3.7.1`: Docker/container logs
- Tempo `2.10.3`: traces
- Grafana Alloy `v1.16.0`: OTLP receiver and Docker log collector
Start the app with observability:
```bash
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d
```
Grafana is exposed at:
```txt
http://127.0.0.1:3000
```
Default credentials are `admin` / `admin` unless `GRAFANA_ADMIN_USER` and
`GRAFANA_ADMIN_PASSWORD` are set. Set `GRAFANA_HTTP_BIND=0.0.0.0` only when the
preprod network boundary is trusted or protected by a reverse proxy/VPN.
Set a non-default `GRAFANA_ADMIN_PASSWORD` before exposing Grafana outside the
host. Prometheus alert rules are provisioned under
`deploy/observability/prometheus/rules/`; notification delivery is intentionally
left to the preprod operations environment.
Set `ALERTMANAGER_WEBHOOK_URL` to route alerts to a private notification endpoint.
See `docs/OPERATIONS/observability-runbook.md` for bring-up, alert triage, and
the optional protected Caddy configuration for Grafana.
## Solution ## Solution
```bash ```bash

View File

@@ -1,5 +1,6 @@
using System.Text; using System.Text;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Microsoft.AspNetCore.Authentication; using Microsoft.AspNetCore.Authentication;
@@ -20,7 +21,10 @@ internal static class ApplicationRegistration
services.AddHttpContextAccessor(); services.AddHttpContextAccessor();
services.AddHealthChecks() services.AddHealthChecks()
.AddDbContextCheck<AppDbContext>(); .AddCheck("self", () => Microsoft.Extensions.Diagnostics.HealthChecks.HealthCheckResult.Healthy(), tags: ["live"])
.AddDbContextCheck<AppDbContext>("postgres", tags: ["ready"])
.AddCheck<LocalBlobStorageHealthCheck>("local_blob_storage", tags: ["ready"])
.AddCheck<EmailerConfigurationHealthCheck>("emailer_configuration", tags: ["ready"]);
services.AddHttpClient(); services.AddHttpClient();
services.AddScoped<AccessScopeService>(); services.AddScoped<AccessScopeService>();

View File

@@ -1,6 +1,7 @@
using Microsoft.Extensions.Options; using Microsoft.Extensions.Options;
using Socialize.Api.Infrastructure.BlobStorage.Configuration; using Socialize.Api.Infrastructure.BlobStorage.Configuration;
using Socialize.Api.Infrastructure.BlobStorage.Contracts; using Socialize.Api.Infrastructure.BlobStorage.Contracts;
using Socialize.Api.Infrastructure.Observability;
namespace Socialize.Api.Infrastructure.BlobStorage.Services; namespace Socialize.Api.Infrastructure.BlobStorage.Services;
@@ -8,7 +9,8 @@ internal sealed class LocalBlobStorage(
IWebHostEnvironment environment, IWebHostEnvironment environment,
IHttpContextAccessor httpContextAccessor, IHttpContextAccessor httpContextAccessor,
IOptions<LocalBlobStorageOptions> options, IOptions<LocalBlobStorageOptions> options,
ILogger<LocalBlobStorage> logger) ILogger<LocalBlobStorage> logger,
SocializeMetrics metrics)
: IBlobStorage : IBlobStorage
{ {
private const long MaxUploadSize = 10 * 1024 * 1024; private const long MaxUploadSize = 10 * 1024 * 1024;
@@ -30,6 +32,8 @@ internal sealed class LocalBlobStorage(
Stream stream, Stream stream,
string contentType, string contentType,
CancellationToken ct = default) CancellationToken ct = default)
{
try
{ {
stream.Position = 0; stream.Position = 0;
@@ -55,14 +59,33 @@ internal sealed class LocalBlobStorage(
string fileUri = BuildPublicUrl(relativePath); string fileUri = BuildPublicUrl(relativePath);
LogUploadedFile(logger, blobName, containerName, contentType, fileUri, null); LogUploadedFile(logger, blobName, containerName, contentType, fileUri, null);
metrics.RecordBlobStorageOperation("upload", true);
return fileUri; return fileUri;
} }
catch (InvalidOperationException)
{
metrics.RecordBlobStorageOperation("upload", false);
throw;
}
catch (IOException)
{
metrics.RecordBlobStorageOperation("upload", false);
throw;
}
catch (UnauthorizedAccessException)
{
metrics.RecordBlobStorageOperation("upload", false);
throw;
}
}
public async Task<MemoryStream> DownloadFileAsync( public async Task<MemoryStream> DownloadFileAsync(
string containerName, string containerName,
string blobName, string blobName,
CancellationToken ct = default) CancellationToken ct = default)
{
try
{ {
string filePath = Path.Combine(GetRootPath(), GetSafeRelativePath(containerName, blobName)); string filePath = Path.Combine(GetRootPath(), GetSafeRelativePath(containerName, blobName));
@@ -75,9 +98,31 @@ internal sealed class LocalBlobStorage(
await using FileStream fileStream = File.OpenRead(filePath); await using FileStream fileStream = File.OpenRead(filePath);
await fileStream.CopyToAsync(memoryStream, ct); await fileStream.CopyToAsync(memoryStream, ct);
memoryStream.Position = 0; memoryStream.Position = 0;
metrics.RecordBlobStorageOperation("download", true);
return memoryStream; return memoryStream;
} }
catch (InvalidOperationException)
{
metrics.RecordBlobStorageOperation("download", false);
throw;
}
catch (FileNotFoundException)
{
metrics.RecordBlobStorageOperation("download", false);
throw;
}
catch (IOException)
{
metrics.RecordBlobStorageOperation("download", false);
throw;
}
catch (UnauthorizedAccessException)
{
metrics.RecordBlobStorageOperation("download", false);
throw;
}
}
internal string GetRootPath() internal string GetRootPath()
{ {

View File

@@ -1,8 +1,11 @@
using Socialize.Api.Infrastructure.Emailer.Contracts; using Socialize.Api.Infrastructure.Emailer.Contracts;
using Socialize.Api.Infrastructure.Observability;
namespace Socialize.Api.Infrastructure.Emailer.Services; namespace Socialize.Api.Infrastructure.Emailer.Services;
internal class LoggerEmailSender(ILogger<IEmailSender> logger) internal class LoggerEmailSender(
ILogger<IEmailSender> logger,
SocializeMetrics metrics)
: IEmailSender : IEmailSender
{ {
private static readonly Action<ILogger, string, string, string, string, Exception?> LogDevelopmentEmail = private static readonly Action<ILogger, string, string, string, string, Exception?> LogDevelopmentEmail =
@@ -14,6 +17,7 @@ internal class LoggerEmailSender(ILogger<IEmailSender> logger)
public Task SendEmailAsync(string email, string subject, string message) public Task SendEmailAsync(string email, string subject, string message)
{ {
LogDevelopmentEmail(logger, email, subject, Environment.NewLine, message, null); LogDevelopmentEmail(logger, email, subject, Environment.NewLine, message, null);
metrics.RecordEmailDelivery("logger", true);
return Task.CompletedTask; return Task.CompletedTask;
} }

View File

@@ -3,6 +3,7 @@ using System.Text;
using System.Text.Json; using System.Text.Json;
using Socialize.Api.Infrastructure.Emailer.Configuration; using Socialize.Api.Infrastructure.Emailer.Configuration;
using Socialize.Api.Infrastructure.Emailer.Contracts; using Socialize.Api.Infrastructure.Emailer.Contracts;
using Socialize.Api.Infrastructure.Observability;
using Microsoft.Extensions.Options; using Microsoft.Extensions.Options;
namespace Socialize.Api.Infrastructure.Emailer.Services; namespace Socialize.Api.Infrastructure.Emailer.Services;
@@ -11,13 +12,16 @@ internal class ResendEmailSender : IEmailSender
{ {
private static readonly Uri EndpointUri = new("https://api.resend.com/emails"); private static readonly Uri EndpointUri = new("https://api.resend.com/emails");
private readonly HttpClient _httpClient; private readonly HttpClient _httpClient;
private readonly SocializeMetrics _metrics;
private readonly EmailerOptions _options; private readonly EmailerOptions _options;
public ResendEmailSender( public ResendEmailSender(
IHttpClientFactory httpClientFactory, IHttpClientFactory httpClientFactory,
IOptions<EmailerOptions> options) IOptions<EmailerOptions> options,
SocializeMetrics metrics)
{ {
_httpClient = httpClientFactory.CreateClient(); _httpClient = httpClientFactory.CreateClient();
_metrics = metrics;
_options = options.Value; _options = options.Value;
string apiKey = NormalizeApiKey(_options.ApiKey); string apiKey = NormalizeApiKey(_options.ApiKey);
@@ -49,6 +53,8 @@ internal class ResendEmailSender : IEmailSender
string json = JsonSerializer.Serialize(payload); string json = JsonSerializer.Serialize(payload);
using StringContent content = new(json, Encoding.UTF8, "application/json"); using StringContent content = new(json, Encoding.UTF8, "application/json");
try
{
using HttpResponseMessage response = await _httpClient.PostAsync(EndpointUri, content); using HttpResponseMessage response = await _httpClient.PostAsync(EndpointUri, content);
if (!response.IsSuccessStatusCode) if (!response.IsSuccessStatusCode)
@@ -57,6 +63,24 @@ internal class ResendEmailSender : IEmailSender
throw new InvalidOperationException( throw new InvalidOperationException(
$"Resend email failed: {response.StatusCode} - {body}"); $"Resend email failed: {response.StatusCode} - {body}");
} }
_metrics.RecordEmailDelivery("resend", true);
}
catch (HttpRequestException)
{
_metrics.RecordEmailDelivery("resend", false);
throw;
}
catch (TaskCanceledException)
{
_metrics.RecordEmailDelivery("resend", false);
throw;
}
catch (InvalidOperationException)
{
_metrics.RecordEmailDelivery("resend", false);
throw;
}
} }
private static string NormalizeApiKey(string? apiKey) private static string NormalizeApiKey(string? apiKey)

View File

@@ -0,0 +1,29 @@
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.Extensions.Options;
using Socialize.Api.Infrastructure.Emailer.Configuration;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Health check that verifies the emailer has the settings it needs outside of development.
/// </summary>
/// <remarks>
/// In the Development environment the check always reports healthy and does not read the
/// emailer options. Otherwise it reports unhealthy when either the API key or the from
/// address is null, empty, or whitespace.
/// </remarks>
internal sealed class EmailerConfigurationHealthCheck(
    IWebHostEnvironment environment,
    IOptions<EmailerOptions> options)
    : IHealthCheck
{
    /// <summary>Evaluates the emailer configuration synchronously and wraps the result in a completed task.</summary>
    /// <param name="context">Health check context (not used by this check).</param>
    /// <param name="cancellationToken">Cancellation token (not used; the check does no asynchronous work).</param>
    /// <returns>A completed task carrying the healthy/unhealthy result.</returns>
    public Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
        => Task.FromResult(Evaluate());

    /// <summary>Computes the health result for the current environment and options.</summary>
    private HealthCheckResult Evaluate()
    {
        if (environment.IsDevelopment())
        {
            return HealthCheckResult.Healthy("Development email sender logs email instead of delivering it.");
        }

        EmailerOptions emailer = options.Value;
        bool isConfigured =
            !string.IsNullOrWhiteSpace(emailer.ApiKey) &&
            !string.IsNullOrWhiteSpace(emailer.FromEmail);

        return isConfigured
            ? HealthCheckResult.Healthy("Emailer configuration is present.")
            : HealthCheckResult.Unhealthy("Emailer API key or from address is missing.");
    }
}

View File

@@ -0,0 +1,40 @@
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Microsoft.Extensions.Options;
using Socialize.Api.Infrastructure.BlobStorage.Configuration;
using Socialize.Api.Infrastructure.BlobStorage.Services;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Health check that verifies the local blob storage root is configured and writable by
/// creating and deleting a small probe file inside it.
/// </summary>
/// <remarks>
/// Each invocation uses its own uniquely named probe file so that overlapping probes
/// (for example concurrent requests to different health endpoints) never contend for the
/// same file, and the probe file is cleaned up on failure and cancellation paths as well.
/// </remarks>
internal sealed class LocalBlobStorageHealthCheck(
    LocalBlobStorage blobStorage,
    IOptions<LocalBlobStorageOptions> options)
    : IHealthCheck
{
    /// <summary>Prefix of the temporary probe file written into the storage root.</summary>
    private const string ProbeFilePrefix = ".healthcheck";

    /// <summary>Checks that the request path is configured and that the storage root accepts writes.</summary>
    /// <param name="context">Health check context (not used by this check).</param>
    /// <param name="cancellationToken">Cancels the probe write; cancellation propagates to the caller.</param>
    /// <returns>
    /// Unhealthy when the request path is blank or when creating, writing, or deleting the probe
    /// fails with an <see cref="IOException"/> or <see cref="UnauthorizedAccessException"/>;
    /// healthy otherwise.
    /// </returns>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        string rootPath = blobStorage.GetRootPath();
        if (string.IsNullOrWhiteSpace(options.Value.RequestPath))
        {
            return HealthCheckResult.Unhealthy("Local blob storage request path is not configured.");
        }

        // A per-invocation name avoids sharing violations between probes that run at the same time.
        string probePath = Path.Combine(rootPath, $"{ProbeFilePrefix}-{Guid.NewGuid():N}");

        try
        {
            Directory.CreateDirectory(rootPath);
            await File.WriteAllTextAsync(
                probePath,
                DateTimeOffset.UtcNow.ToString("O", System.Globalization.CultureInfo.InvariantCulture),
                cancellationToken);

            // A failure to delete on the success path is still reported as "not writable".
            File.Delete(probePath);
            return HealthCheckResult.Healthy("Local blob storage is writable.");
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            TryDeleteProbe(probePath);
            return HealthCheckResult.Unhealthy("Local blob storage is not writable.", ex);
        }
        catch (OperationCanceledException)
        {
            // Do not leave a partially written probe behind when the check is cancelled.
            TryDeleteProbe(probePath);
            throw;
        }
    }

    /// <summary>Best-effort removal of a probe file; cleanup failures are intentionally ignored.</summary>
    private static void TryDeleteProbe(string probePath)
    {
        try
        {
            File.Delete(probePath);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            // The primary outcome has already been decided by the caller; nothing more to report.
        }
    }
}

View File

@@ -0,0 +1,162 @@
using System.Text.Json;
using Microsoft.AspNetCore.Diagnostics.HealthChecks;
using Microsoft.Extensions.Diagnostics.HealthChecks;
using Npgsql;
using OpenTelemetry.Logs;
using OpenTelemetry.Metrics;
using OpenTelemetry.Resources;
using OpenTelemetry.Trace;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Registration helpers for logging, tracing, metrics, and health-check endpoints.
/// </summary>
/// <remarks>
/// OTLP export is enabled when either <c>OTEL_EXPORTER_OTLP_ENDPOINT</c> or <c>Otlp:Endpoint</c>
/// is present in configuration. When only <c>Otlp:Endpoint</c> is set, its value is applied to
/// each exporter explicitly; otherwise the exporter's own configuration handling is left in charge.
/// </remarks>
internal static class ObservabilityRegistration
{
    private const string DefaultServiceName = "socialize-api";
    private const string OtlpEndpointStandardKey = "OTEL_EXPORTER_OTLP_ENDPOINT";
    private const string OtlpEndpointSectionKey = "Otlp:Endpoint";

    /// <summary>
    /// Configures JSON console logging with activity tracking, registers the application metrics
    /// and the workflow health sampler, and wires OpenTelemetry tracing and metrics.
    /// </summary>
    /// <param name="builder">The application builder to configure.</param>
    /// <returns>The same <paramref name="builder"/> for chaining.</returns>
    public static WebApplicationBuilder AddObservability(this WebApplicationBuilder builder)
    {
        string serviceName = GetConfigurationValue(builder.Configuration, "OTEL_SERVICE_NAME", DefaultServiceName);
        string serviceVersion = typeof(Program).Assembly.GetName().Version?.ToString() ?? "unknown";

        builder.Logging.Configure(options =>
        {
            options.ActivityTrackingOptions =
                ActivityTrackingOptions.TraceId |
                ActivityTrackingOptions.SpanId |
                ActivityTrackingOptions.ParentId;
        });
        builder.Logging.AddJsonConsole(options =>
        {
            options.IncludeScopes = true;
            options.TimestampFormat = "yyyy-MM-ddTHH:mm:ss.fffZ";
            options.UseUtcTimestamp = true;
            options.JsonWriterOptions = new JsonWriterOptions { Indented = false };
        });

        bool otlpEnabled = HasOtlpEndpoint(builder.Configuration);

        // Null unless Otlp:Endpoint is the only endpoint source; see GetOtlpEndpointOverride.
        Uri? otlpEndpointOverride = GetOtlpEndpointOverride(builder.Configuration);

        if (otlpEnabled)
        {
            builder.Logging.AddOpenTelemetry(options =>
            {
                options.IncludeFormattedMessage = true;
                options.IncludeScopes = true;
                options.ParseStateValues = true;
                options.SetResourceBuilder(BuildResource(serviceName, serviceVersion));
                options.AddOtlpExporter(exporter => ApplyOtlpEndpoint(exporter, otlpEndpointOverride));
            });
        }

        builder.Services.AddSingleton<SocializeMetrics>();
        builder.Services.AddHostedService<WorkflowHealthSamplerService>();
        builder.Services
            .AddOpenTelemetry()
            .ConfigureResource(resource => resource.AddService(
                serviceName,
                serviceVersion: serviceVersion))
            .WithTracing(tracing =>
            {
                tracing
                    .AddSource(SocializeMetrics.ActivitySourceName)
                    .AddAspNetCoreInstrumentation(options =>
                    {
                        options.RecordException = true;
                    })
                    .AddHttpClientInstrumentation()
                    .AddNpgsql();
                if (otlpEnabled)
                {
                    tracing.AddOtlpExporter(exporter => ApplyOtlpEndpoint(exporter, otlpEndpointOverride));
                }
            })
            .WithMetrics(metrics =>
            {
                metrics
                    .AddMeter(SocializeMetrics.MeterName)
                    .AddAspNetCoreInstrumentation()
                    .AddHttpClientInstrumentation()
                    .AddRuntimeInstrumentation();
                if (otlpEnabled)
                {
                    metrics.AddOtlpExporter(exporter => ApplyOtlpEndpoint(exporter, otlpEndpointOverride));
                }
            });

        return builder;
    }

    /// <summary>Adds the middleware that opens a per-request logging scope.</summary>
    /// <param name="app">The application pipeline.</param>
    /// <returns>The pipeline with <see cref="RequestLoggingScopeMiddleware"/> added.</returns>
    public static IApplicationBuilder UseObservabilityLoggingScope(this IApplicationBuilder app)
    {
        return app.UseMiddleware<RequestLoggingScopeMiddleware>();
    }

    /// <summary>
    /// Maps <c>/health</c> (all checks), <c>/health/live</c> (checks tagged <c>live</c>) and
    /// <c>/health/ready</c> (checks tagged <c>ready</c>), all using the JSON response writer.
    /// </summary>
    /// <param name="endpoints">The endpoint route builder.</param>
    /// <returns>The same <paramref name="endpoints"/> for chaining.</returns>
    public static IEndpointRouteBuilder MapObservabilityHealthChecks(this IEndpointRouteBuilder endpoints)
    {
        endpoints.MapHealthChecks(
            "/health",
            new HealthCheckOptions { ResponseWriter = WriteHealthResponseAsync });
        endpoints.MapHealthChecks(
            "/health/live",
            new HealthCheckOptions
            {
                Predicate = registration => registration.Tags.Contains("live", StringComparer.Ordinal),
                ResponseWriter = WriteHealthResponseAsync,
            });
        endpoints.MapHealthChecks(
            "/health/ready",
            new HealthCheckOptions
            {
                Predicate = registration => registration.Tags.Contains("ready", StringComparer.Ordinal),
                ResponseWriter = WriteHealthResponseAsync,
            });
        return endpoints;
    }

    /// <summary>Builds the resource description attached to exported log records.</summary>
    private static ResourceBuilder BuildResource(string serviceName, string serviceVersion)
    {
        return ResourceBuilder.CreateDefault().AddService(
            serviceName,
            serviceVersion: serviceVersion);
    }

    /// <summary>Returns true when either supported OTLP endpoint key has a non-blank value.</summary>
    private static bool HasOtlpEndpoint(ConfigurationManager configuration)
    {
        return !string.IsNullOrWhiteSpace(configuration[OtlpEndpointStandardKey]) ||
               !string.IsNullOrWhiteSpace(configuration[OtlpEndpointSectionKey]);
    }

    /// <summary>
    /// Resolves the endpoint to apply explicitly to the OTLP exporters.
    /// </summary>
    /// <returns>
    /// The absolute URI from <c>Otlp:Endpoint</c> when <c>OTEL_EXPORTER_OTLP_ENDPOINT</c> is blank
    /// and the value parses; otherwise <c>null</c>, which leaves the exporter defaults untouched
    /// (the behavior before this override existed).
    /// </returns>
    private static Uri? GetOtlpEndpointOverride(ConfigurationManager configuration)
    {
        if (!string.IsNullOrWhiteSpace(configuration[OtlpEndpointStandardKey]))
        {
            return null;
        }

        string? configured = configuration[OtlpEndpointSectionKey];
        return Uri.TryCreate(configured?.Trim(), UriKind.Absolute, out Uri? endpoint)
            ? endpoint
            : null;
    }

    /// <summary>Applies the explicit endpoint, if any, to an exporter's options.</summary>
    /// <remarks>
    /// NOTE(review): per the OTLP exporter documentation, an endpoint set in code is used as-is
    /// for the HTTP/protobuf protocol (no signal path is appended) — confirm the configured value
    /// if the protocol is ever switched away from the default.
    /// </remarks>
    private static void ApplyOtlpEndpoint(
        OpenTelemetry.Exporter.OtlpExporterOptions exporter,
        Uri? endpoint)
    {
        if (endpoint is not null)
        {
            exporter.Endpoint = endpoint;
        }
    }

    /// <summary>Reads a configuration value, substituting <paramref name="fallback"/> when it is blank.</summary>
    private static string GetConfigurationValue(
        ConfigurationManager configuration,
        string key,
        string fallback)
    {
        string? value = configuration[key];
        return string.IsNullOrWhiteSpace(value) ? fallback : value;
    }

    /// <summary>
    /// Writes the health report as JSON: overall status, per-check name/status/description/duration
    /// (milliseconds), and total duration (milliseconds).
    /// </summary>
    private static async Task WriteHealthResponseAsync(HttpContext context, HealthReport report)
    {
        context.Response.ContentType = "application/json";
        var response = new
        {
            status = report.Status.ToString(),
            checks = report.Entries.Select(entry => new
            {
                name = entry.Key,
                status = entry.Value.Status.ToString(),
                description = entry.Value.Description,
                duration = entry.Value.Duration.TotalMilliseconds,
            }),
            duration = report.TotalDuration.TotalMilliseconds,
        };
        await JsonSerializer.SerializeAsync(
            context.Response.Body,
            response,
            cancellationToken: context.RequestAborted);
    }
}

View File

@@ -0,0 +1,61 @@
using System.Diagnostics;
using Socialize.Api.Infrastructure.Security;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Middleware that wraps the rest of the pipeline in a logging scope carrying trace, request,
/// user, and (when present as GUIDs in the route or query string) entity identifiers.
/// </summary>
internal sealed class RequestLoggingScopeMiddleware(
    RequestDelegate next,
    ILogger<RequestLoggingScopeMiddleware> logger)
{
    /// <summary>Scope key paired with the route/query parameter name it is read from.</summary>
    private static readonly (string ScopeKey, string RequestKey)[] GuidScopeFields =
    [
        ("organization.id", "organizationId"),
        ("workspace.id", "workspaceId"),
        ("client.id", "clientId"),
        ("campaign.id", "campaignId"),
        ("content_item.id", "contentItemId"),
    ];

    /// <summary>Builds the scope for the current request and invokes the next middleware inside it.</summary>
    /// <param name="context">The current HTTP context.</param>
    public async Task InvokeAsync(HttpContext context)
    {
        Activity? activity = Activity.Current;

        Dictionary<string, object?> fields = new();

        // Fall back to the ASP.NET Core trace identifier when no activity is active.
        fields["trace_id"] = activity?.TraceId.ToString() ?? context.TraceIdentifier;
        fields["span_id"] = activity?.SpanId.ToString();
        fields["http.method"] = context.Request.Method;
        fields["url.path"] = context.Request.Path.Value;

        if (context.User.Identity?.IsAuthenticated == true)
        {
            fields["user.id"] = context.User.GetUserId();
            fields["user.email"] = context.User.GetEmail();
        }

        foreach ((string scopeKey, string requestKey) in GuidScopeFields)
        {
            // Only values that parse as a GUID are added; anything else is skipped.
            if (Guid.TryParse(GetRouteOrQueryValue(context, requestKey), out Guid parsed))
            {
                fields[scopeKey] = parsed;
            }
        }

        using (logger.BeginScope(fields))
        {
            await next(context);
        }
    }

    /// <summary>
    /// Returns the route value for <paramref name="key"/> (formatted with the invariant culture)
    /// when one exists, otherwise the query string value, otherwise <c>null</c>.
    /// </summary>
    private static string? GetRouteOrQueryValue(HttpContext context, string key)
    {
        if (context.Request.RouteValues[key] is { } fromRoute)
        {
            return Convert.ToString(fromRoute, System.Globalization.CultureInfo.InvariantCulture);
        }

        if (context.Request.Query.TryGetValue(key, out Microsoft.Extensions.Primitives.StringValues fromQuery))
        {
            return fromQuery.ToString();
        }

        return null;
    }
}

View File

@@ -0,0 +1,258 @@
using System.Diagnostics;
using System.Diagnostics.Metrics;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Owns the application's <see cref="System.Diagnostics.Metrics.Meter"/> and
/// <see cref="System.Diagnostics.ActivitySource"/>, exposes one recording method per business
/// counter, and publishes observable gauges backed by the most recent workflow health snapshot.
/// </summary>
internal sealed class SocializeMetrics : IDisposable
{
    public const string MeterName = "Socialize.Api";
    public const string ActivitySourceName = "Socialize.Api";

    private readonly Counter<long> _approvalDecisionCounter;
    private readonly Counter<long> _backgroundJobRunCounter;
    private readonly Counter<long> _blobStorageOperationCounter;
    private readonly Counter<long> _commentCreatedCounter;
    private readonly Counter<long> _contentItemCreatedCounter;
    private readonly Counter<long> _emailDeliveryCounter;
    private readonly Counter<long> _feedbackSubmittedCounter;
    private readonly Counter<long> _loginAttemptCounter;
    private readonly Counter<long> _organizationCreatedCounter;
    private readonly Counter<long> _workspaceCreatedCounter;
    private readonly Counter<long> _workspaceInviteCreatedCounter;

    // Guards reads and writes of the snapshot reference shared with the gauge callbacks.
    private readonly object _workflowHealthLock = new();
    private WorkflowHealthSnapshot _workflowHealthSnapshot = WorkflowHealthSnapshot.Empty;

    /// <summary>Creates the meter, the activity source, every counter, and every observable gauge.</summary>
    public SocializeMetrics()
    {
        Meter = new Meter(MeterName);
        ActivitySource = new ActivitySource(ActivitySourceName);

        _loginAttemptCounter = NewCounter(
            "socialize.login.attempts",
            "Login attempts partitioned by outcome.");
        _organizationCreatedCounter = NewCounter(
            "socialize.organizations.created",
            "Organizations created.");
        _workspaceCreatedCounter = NewCounter(
            "socialize.workspaces.created",
            "Workspaces created.");
        _contentItemCreatedCounter = NewCounter(
            "socialize.content_items.created",
            "Content items created.");
        _commentCreatedCounter = NewCounter(
            "socialize.comments.created",
            "Comments created.");
        _approvalDecisionCounter = NewCounter(
            "socialize.approval_decisions.submitted",
            "Approval decisions submitted.");
        _feedbackSubmittedCounter = NewCounter(
            "socialize.feedback.submitted",
            "Feedback reports submitted.");
        _workspaceInviteCreatedCounter = NewCounter(
            "socialize.workspace_invites.created",
            "Workspace invites created.");
        _emailDeliveryCounter = NewCounter(
            "socialize.email.delivery",
            "Email delivery attempts partitioned by outcome and provider.");
        _blobStorageOperationCounter = NewCounter(
            "socialize.blob_storage.operations",
            "Blob storage operations partitioned by operation and outcome.");
        _backgroundJobRunCounter = NewCounter(
            "socialize.background_job.runs",
            "Background job runs partitioned by job and outcome.");

        Meter.CreateObservableGauge(
            "socialize.workflow.content_items",
            ObserveContentItemCounts,
            description: "Current content item counts by status.");
        Meter.CreateObservableGauge(
            "socialize.workflow.feedback_reports",
            ObserveFeedbackReportCounts,
            description: "Current feedback report counts by status.");
        Meter.CreateObservableGauge(
            "socialize.workflow.pending_invites",
            ObservePendingInviteCount,
            description: "Current pending workspace invite count.");
        Meter.CreateObservableGauge(
            "socialize.workflow.stale_in_approval",
            ObserveStaleApprovalCount,
            description: "Current count of content items in approval longer than the configured stale threshold.");
        Meter.CreateObservableGauge(
            "socialize.workflow.active_workspaces",
            ObserveActiveWorkspaceCounts,
            description: "Current active workspace counts by observation window.");
    }

    /// <summary>The meter that owns every instrument created by this class.</summary>
    public Meter Meter { get; }

    /// <summary>The activity source named <see cref="ActivitySourceName"/>.</summary>
    public ActivitySource ActivitySource { get; }

    /// <summary>Counts one login attempt tagged with its outcome and reason.</summary>
    public void RecordLoginAttempt(bool succeeded, string reason) =>
        _loginAttemptCounter.Add(1, Tag("outcome", Outcome(succeeded)), Tag("reason", reason));

    /// <summary>Counts one created organization tagged with its id.</summary>
    public void RecordOrganizationCreated(Guid organizationId) =>
        _organizationCreatedCounter.Add(1, Tag("organization.id", organizationId));

    /// <summary>Counts one created workspace tagged with its organization and workspace ids.</summary>
    public void RecordWorkspaceCreated(Guid organizationId, Guid workspaceId) =>
        _workspaceCreatedCounter.Add(
            1,
            Tag("organization.id", organizationId),
            Tag("workspace.id", workspaceId));

    /// <summary>Counts one created content item tagged with its workspace id.</summary>
    public void RecordContentItemCreated(Guid workspaceId) =>
        _contentItemCreatedCounter.Add(1, Tag("workspace.id", workspaceId));

    /// <summary>Counts one created comment tagged with its workspace id and attachment flag.</summary>
    public void RecordCommentCreated(Guid workspaceId, bool hasAttachment) =>
        _commentCreatedCounter.Add(
            1,
            Tag("workspace.id", workspaceId),
            Tag("has_attachment", hasAttachment));

    /// <summary>Counts one submitted approval decision tagged with its workspace id and decision.</summary>
    public void RecordApprovalDecisionSubmitted(Guid workspaceId, string decision) =>
        _approvalDecisionCounter.Add(
            1,
            Tag("workspace.id", workspaceId),
            Tag("decision", decision));

    /// <summary>
    /// Counts one submitted feedback report; the workspace tag is the id's string form, or
    /// <c>"none"</c> when no workspace id is supplied.
    /// </summary>
    public void RecordFeedbackSubmitted(string type, Guid? workspaceId) =>
        _feedbackSubmittedCounter.Add(
            1,
            Tag("feedback.type", type),
            Tag("workspace.id", workspaceId?.ToString() ?? "none"));

    /// <summary>Counts one created workspace invite tagged with its workspace id and role.</summary>
    public void RecordWorkspaceInviteCreated(Guid workspaceId, string role) =>
        _workspaceInviteCreatedCounter.Add(
            1,
            Tag("workspace.id", workspaceId),
            Tag("role", role));

    /// <summary>Counts one email delivery attempt tagged with its provider and outcome.</summary>
    public void RecordEmailDelivery(string provider, bool succeeded) =>
        _emailDeliveryCounter.Add(1, Tag("provider", provider), Tag("outcome", Outcome(succeeded)));

    /// <summary>Counts one blob storage operation tagged with its operation name and outcome.</summary>
    public void RecordBlobStorageOperation(string operation, bool succeeded) =>
        _blobStorageOperationCounter.Add(1, Tag("operation", operation), Tag("outcome", Outcome(succeeded)));

    /// <summary>Counts one background job run tagged with its job name and outcome.</summary>
    public void RecordBackgroundJobRun(string job, bool succeeded) =>
        _backgroundJobRunCounter.Add(1, Tag("job", job), Tag("outcome", Outcome(succeeded)));

    /// <summary>Replaces the snapshot that the workflow gauges report from.</summary>
    public void UpdateWorkflowHealth(WorkflowHealthSnapshot snapshot)
    {
        lock (_workflowHealthLock)
        {
            _workflowHealthSnapshot = snapshot;
        }
    }

    /// <summary>Disposes the meter and then the activity source.</summary>
    public void Dispose()
    {
        Meter.Dispose();
        ActivitySource.Dispose();
    }

    /// <summary>Builds a metric tag.</summary>
    private static KeyValuePair<string, object?> Tag(string key, object? value) => new(key, value);

    /// <summary>Maps a success flag to the <c>outcome</c> tag value.</summary>
    private static string Outcome(bool succeeded) => succeeded ? "success" : "failure";

    /// <summary>Turns a status-to-count map into one measurement per entry, tagged with <c>status</c>.</summary>
    private static Measurement<int>[] ToStatusMeasurements(IReadOnlyDictionary<string, int> countsByStatus)
    {
        Measurement<int>[] measurements = new Measurement<int>[countsByStatus.Count];
        int index = 0;
        foreach (KeyValuePair<string, int> entry in countsByStatus)
        {
            measurements[index] = new Measurement<int>(entry.Value, Tag("status", entry.Key));
            index++;
        }

        return measurements;
    }

    /// <summary>Creates a <c>long</c> counter on <see cref="Meter"/>.</summary>
    private Counter<long> NewCounter(string name, string description) =>
        Meter.CreateCounter<long>(name, description: description);

    private Measurement<int>[] ObserveContentItemCounts() =>
        ToStatusMeasurements(GetWorkflowHealthSnapshot().ContentItemsByStatus);

    private Measurement<int>[] ObserveFeedbackReportCounts() =>
        ToStatusMeasurements(GetWorkflowHealthSnapshot().FeedbackReportsByStatus);

    private Measurement<int> ObservePendingInviteCount() =>
        new(GetWorkflowHealthSnapshot().PendingInviteCount);

    private Measurement<int> ObserveStaleApprovalCount() =>
        new(GetWorkflowHealthSnapshot().StaleInApprovalCount);

    private Measurement<int>[] ObserveActiveWorkspaceCounts()
    {
        WorkflowHealthSnapshot current = GetWorkflowHealthSnapshot();
        return new[]
        {
            new Measurement<int>(current.ActiveWorkspaces24Hours, Tag("window", "24h")),
            new Measurement<int>(current.ActiveWorkspaces7Days, Tag("window", "7d")),
        };
    }

    /// <summary>Reads the current snapshot reference under the lock.</summary>
    private WorkflowHealthSnapshot GetWorkflowHealthSnapshot()
    {
        lock (_workflowHealthLock)
        {
            return _workflowHealthSnapshot;
        }
    }
}
/// <summary>
/// Immutable set of workflow counts captured at one point in time.
/// </summary>
/// <param name="ContentItemsByStatus">Content item counts keyed by status.</param>
/// <param name="FeedbackReportsByStatus">Feedback report counts keyed by status.</param>
/// <param name="PendingInviteCount">Number of pending workspace invites.</param>
/// <param name="StaleInApprovalCount">Number of content items counted as stale in approval.</param>
/// <param name="ActiveWorkspaces24Hours">Number of workspaces active in the 24-hour window.</param>
/// <param name="ActiveWorkspaces7Days">Number of workspaces active in the 7-day window.</param>
internal sealed record WorkflowHealthSnapshot(
    IReadOnlyDictionary<string, int> ContentItemsByStatus,
    IReadOnlyDictionary<string, int> FeedbackReportsByStatus,
    int PendingInviteCount,
    int StaleInApprovalCount,
    int ActiveWorkspaces24Hours,
    int ActiveWorkspaces7Days)
{
    /// <summary>Shared snapshot with empty status maps and all counts at zero.</summary>
    public static WorkflowHealthSnapshot Empty { get; } = new(
        ContentItemsByStatus: new Dictionary<string, int>(StringComparer.Ordinal),
        FeedbackReportsByStatus: new Dictionary<string, int>(StringComparer.Ordinal),
        PendingInviteCount: 0,
        StaleInApprovalCount: 0,
        ActiveWorkspaces24Hours: 0,
        ActiveWorkspaces7Days: 0);
}

View File

@@ -0,0 +1,102 @@
using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data;
using Socialize.Api.Modules.Feedback.Data;
using Socialize.Api.Modules.Workspaces.Data;
namespace Socialize.Api.Infrastructure.Observability;
/// <summary>
/// Background service that samples workflow counts from the database once at start-up and then
/// on every timer tick, and publishes them through <see cref="SocializeMetrics"/>.
/// </summary>
internal sealed class WorkflowHealthSamplerService(
    IServiceScopeFactory scopeFactory,
    SocializeMetrics metrics,
    ILogger<WorkflowHealthSamplerService> logger)
    : BackgroundService
{
    private static readonly TimeSpan SampleInterval = TimeSpan.FromMinutes(5);
    private static readonly TimeSpan StaleApprovalThreshold = TimeSpan.FromDays(3);

    /// <summary>Takes an initial sample, then re-samples on each tick until shutdown is requested.</summary>
    /// <param name="stoppingToken">Signals host shutdown.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        await SampleAsync(stoppingToken);

        using PeriodicTimer ticker = new(SampleInterval);
        while (!stoppingToken.IsCancellationRequested)
        {
            try
            {
                await ticker.WaitForNextTickAsync(stoppingToken);
                await SampleAsync(stoppingToken);
            }
            catch (OperationCanceledException cancelled) when (stoppingToken.IsCancellationRequested)
            {
                // Shutdown was requested; the loop condition ends the service on the next check.
                logger.LogDebug(cancelled, "Workflow health sampler stopped.");
            }
        }
    }

    /// <summary>
    /// Runs one sampling pass in its own service scope, publishes the snapshot, and records the
    /// run outcome. Failures other than shutdown cancellation are logged and not rethrown.
    /// </summary>
    private async Task SampleAsync(CancellationToken stoppingToken)
    {
        const string jobName = nameof(WorkflowHealthSamplerService);

        try
        {
            using IServiceScope samplingScope = scopeFactory.CreateScope();
            AppDbContext db = samplingScope.ServiceProvider.GetRequiredService<AppDbContext>();

            WorkflowHealthSnapshot snapshot = await BuildSnapshotAsync(db, stoppingToken);

            metrics.UpdateWorkflowHealth(snapshot);
            metrics.RecordBackgroundJobRun(jobName, true);
        }
        catch (OperationCanceledException cancelled) when (stoppingToken.IsCancellationRequested)
        {
            logger.LogDebug(cancelled, "Workflow health sampler stopped.");
        }
#pragma warning disable CA1031
        catch (Exception failure)
        {
            metrics.RecordBackgroundJobRun(jobName, false);
            logger.LogError(failure, "Workflow health sampling failed.");
        }
#pragma warning restore CA1031
    }

    /// <summary>
    /// Executes the sampling queries one after another against <paramref name="db"/>, with every
    /// cutoff derived from a single "now" reading.
    /// </summary>
    private static async Task<WorkflowHealthSnapshot> BuildSnapshotAsync(
        AppDbContext db,
        CancellationToken stoppingToken)
    {
        DateTimeOffset sampledAt = DateTimeOffset.UtcNow;
        DateTimeOffset staleCutoff = sampledAt.Subtract(StaleApprovalThreshold);
        DateTimeOffset dayCutoff = sampledAt.AddHours(-24);
        DateTimeOffset weekCutoff = sampledAt.AddDays(-7);

        Dictionary<string, int> contentByStatus = await db.ContentItems
            .GroupBy(item => item.Status)
            .Select(group => new { Status = group.Key, Count = group.Count() })
            .ToDictionaryAsync(
                group => group.Status,
                group => group.Count,
                StringComparer.Ordinal,
                stoppingToken);

        Dictionary<string, int> feedbackByStatus = await db.FeedbackReports
            .GroupBy(report => report.Status)
            .Select(group => new { Status = group.Key, Count = group.Count() })
            .ToDictionaryAsync(
                group => group.Status == FeedbackStatus.WontDo ? "WontDo" : group.Status.ToString(),
                group => group.Count,
                StringComparer.Ordinal,
                stoppingToken);

        int pendingInvites = await db.WorkspaceInvites
            .CountAsync(invite => invite.Status == WorkspaceInviteStatuses.Pending, stoppingToken);

        // NOTE(review): the cutoff is compared against CreatedAt, i.e. item age rather than time
        // spent in the "In approval" status — confirm this matches the gauge's intent.
        int staleInApproval = await db.ContentItems
            .CountAsync(
                item => item.Status == "In approval" && item.CreatedAt <= staleCutoff,
                stoppingToken);

        int activeLastDay = await CountActiveWorkspacesAsync(db, dayCutoff, stoppingToken);
        int activeLastWeek = await CountActiveWorkspacesAsync(db, weekCutoff, stoppingToken);

        return new WorkflowHealthSnapshot(
            contentByStatus,
            feedbackByStatus,
            pendingInvites,
            staleInApproval,
            activeLastDay,
            activeLastWeek);
    }

    /// <summary>Counts distinct workspaces with an activity entry created at or after <paramref name="since"/>.</summary>
    private static Task<int> CountActiveWorkspacesAsync(
        AppDbContext db,
        DateTimeOffset since,
        CancellationToken stoppingToken)
    {
        return db.ContentItemActivityEntries
            .Where(entry => entry.CreatedAt >= since)
            .Select(entry => entry.WorkspaceId)
            .Distinct()
            .CountAsync(stoppingToken);
    }
}

View File

@@ -1,6 +1,7 @@
using FastEndpoints; using FastEndpoints;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.ContentItems.Data; using Socialize.Api.Modules.ContentItems.Data;
using Socialize.Api.Modules.ContentItems.Contracts; using Socialize.Api.Modules.ContentItems.Contracts;
@@ -37,7 +38,8 @@ internal class SubmitApprovalDecisionHandler(
AccessScopeService accessScopeService, AccessScopeService accessScopeService,
ApprovalWorkflowRuntimeService approvalWorkflowRuntimeService, ApprovalWorkflowRuntimeService approvalWorkflowRuntimeService,
IContentItemActivityWriter activityWriter, IContentItemActivityWriter activityWriter,
INotificationEventWriter notificationEventWriter) INotificationEventWriter notificationEventWriter,
SocializeMetrics metrics)
: Endpoint<SubmitApprovalDecisionRequest, ApprovalRequestDto> : Endpoint<SubmitApprovalDecisionRequest, ApprovalRequestDto>
{ {
public override void Configure() public override void Configure()
@@ -157,6 +159,7 @@ internal class SubmitApprovalDecisionHandler(
$$"""{"stage":"{{approval.Stage}}","status":"{{contentItem.Status}}"}"""), $$"""{"stage":"{{approval.Stage}}","status":"{{contentItem.Status}}"}"""),
ct); ct);
} }
metrics.RecordApprovalDecisionSubmitted(approval.WorkspaceId, normalizedDecision);
List<ApprovalDecision> decisions = await dbContext.ApprovalDecisions List<ApprovalDecision> decisions = await dbContext.ApprovalDecisions
.Where(candidate => candidate.ApprovalRequestId == approval.Id) .Where(candidate => candidate.ApprovalRequestId == approval.Id)

View File

@@ -1,7 +1,10 @@
using Socialize.Api.Infrastructure.Observability;
namespace Socialize.Api.Modules.CalendarIntegrations.Services; namespace Socialize.Api.Modules.CalendarIntegrations.Services;
internal sealed class CalendarImportBackgroundService( internal sealed class CalendarImportBackgroundService(
IServiceScopeFactory scopeFactory, IServiceScopeFactory scopeFactory,
SocializeMetrics metrics,
ILogger<CalendarImportBackgroundService> logger) ILogger<CalendarImportBackgroundService> logger)
: BackgroundService : BackgroundService
{ {
@@ -22,6 +25,7 @@ internal sealed class CalendarImportBackgroundService(
using IServiceScope scope = scopeFactory.CreateScope(); using IServiceScope scope = scopeFactory.CreateScope();
CalendarImportSyncService syncService = scope.ServiceProvider.GetRequiredService<CalendarImportSyncService>(); CalendarImportSyncService syncService = scope.ServiceProvider.GetRequiredService<CalendarImportSyncService>();
await syncService.RefreshDueSourcesAsync(stoppingToken); await syncService.RefreshDueSourcesAsync(stoppingToken);
metrics.RecordBackgroundJobRun(nameof(CalendarImportBackgroundService), true);
} }
catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested) catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested)
{ {
@@ -30,6 +34,7 @@ internal sealed class CalendarImportBackgroundService(
#pragma warning disable CA1031 // Background service should log and continue after unexpected sync failures. #pragma warning disable CA1031 // Background service should log and continue after unexpected sync failures.
catch (Exception ex) catch (Exception ex)
{ {
metrics.RecordBackgroundJobRun(nameof(CalendarImportBackgroundService), false);
logger.LogError(ex, "Calendar import background sync failed."); logger.LogError(ex, "Calendar import background sync failed.");
} }
#pragma warning restore CA1031 #pragma warning restore CA1031

View File

@@ -2,6 +2,7 @@ using FastEndpoints;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.BlobStorage.Contracts; using Socialize.Api.Infrastructure.BlobStorage.Contracts;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.ContentItems.Contracts; using Socialize.Api.Modules.ContentItems.Contracts;
using Socialize.Api.Modules.ContentItems.Data; using Socialize.Api.Modules.ContentItems.Data;
@@ -34,7 +35,8 @@ internal class CreateCommentHandler(
AccessScopeService accessScopeService, AccessScopeService accessScopeService,
IBlobStorage blobStorage, IBlobStorage blobStorage,
IContentItemActivityWriter activityWriter, IContentItemActivityWriter activityWriter,
INotificationEventWriter notificationEventWriter) INotificationEventWriter notificationEventWriter,
SocializeMetrics metrics)
: Endpoint<CreateCommentRequest, CommentDto> : Endpoint<CreateCommentRequest, CommentDto>
{ {
public override void Configure() public override void Configure()
@@ -156,6 +158,7 @@ internal class CreateCommentHandler(
dbContext.Comments.Add(comment); dbContext.Comments.Add(comment);
await dbContext.SaveChangesAsync(ct); await dbContext.SaveChangesAsync(ct);
metrics.RecordCommentCreated(comment.WorkspaceId, comment.AttachmentBlobName is not null);
string? authorPortraitUrl = await dbContext.Users string? authorPortraitUrl = await dbContext.Users
.Where(candidate => candidate.Id == comment.AuthorUserId) .Where(candidate => candidate.Id == comment.AuthorUserId)

View File

@@ -1,6 +1,7 @@
using FastEndpoints; using FastEndpoints;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.ContentItems.Contracts; using Socialize.Api.Modules.ContentItems.Contracts;
using Socialize.Api.Modules.Notifications.Contracts; using Socialize.Api.Modules.Notifications.Contracts;
@@ -39,7 +40,8 @@ internal class CreateContentItemHandler(
AppDbContext dbContext, AppDbContext dbContext,
AccessScopeService accessScopeService, AccessScopeService accessScopeService,
IContentItemActivityWriter activityWriter, IContentItemActivityWriter activityWriter,
INotificationEventWriter notificationEventWriter) INotificationEventWriter notificationEventWriter,
SocializeMetrics metrics)
: Endpoint<CreateContentItemRequest, ContentItemDto> : Endpoint<CreateContentItemRequest, ContentItemDto>
{ {
public override void Configure() public override void Configure()
@@ -123,6 +125,7 @@ internal class CreateContentItemHandler(
CreatedAt = DateTimeOffset.UtcNow, CreatedAt = DateTimeOffset.UtcNow,
}); });
await dbContext.SaveChangesAsync(ct); await dbContext.SaveChangesAsync(ct);
metrics.RecordContentItemCreated(item.WorkspaceId);
await activityWriter.WriteAsync( await activityWriter.WriteAsync(
new ContentItemActivityWriteModel( new ContentItemActivityWriteModel(

View File

@@ -1,5 +1,6 @@
using FastEndpoints; using FastEndpoints;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.Feedback.Contracts; using Socialize.Api.Modules.Feedback.Contracts;
using Socialize.Api.Modules.Feedback.Data; using Socialize.Api.Modules.Feedback.Data;
@@ -45,7 +46,8 @@ internal class SubmitFeedbackRequestValidator
internal class SubmitFeedbackHandler( internal class SubmitFeedbackHandler(
AppDbContext dbContext, AppDbContext dbContext,
FeedbackNotificationService notificationService) FeedbackNotificationService notificationService,
SocializeMetrics metrics)
: Endpoint<SubmitFeedbackRequest, FeedbackReportDto> : Endpoint<SubmitFeedbackRequest, FeedbackReportDto>
{ {
public override void Configure() public override void Configure()
@@ -93,6 +95,7 @@ internal class SubmitFeedbackHandler(
dbContext.FeedbackReports.Add(report); dbContext.FeedbackReports.Add(report);
await notificationService.AddNewReportNotificationsAsync(report, ct); await notificationService.AddNewReportNotificationsAsync(report, ct);
await dbContext.SaveChangesAsync(ct); await dbContext.SaveChangesAsync(ct);
metrics.RecordFeedbackSubmitted(report.Type.ToString(), report.WorkspaceId);
await SendAsync(report.ToDto(), StatusCodes.Status201Created, ct); await SendAsync(report.ToDto(), StatusCodes.Status201Created, ct);
} }

View File

@@ -1,5 +1,6 @@
using FastEndpoints; using FastEndpoints;
using Microsoft.Extensions.Options; using Microsoft.Extensions.Options;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.Identity.Data; using Socialize.Api.Modules.Identity.Data;
using Socialize.Api.Modules.Identity.Configuration; using Socialize.Api.Modules.Identity.Configuration;
@@ -21,7 +22,8 @@ internal record LoginResponse(
internal class LoginHandler( internal class LoginHandler(
UserManager userManager, UserManager userManager,
IOptionsSnapshot<JwtOptions> jwtOptions, IOptionsSnapshot<JwtOptions> jwtOptions,
AccessTokenFactory accessTokenFactory) AccessTokenFactory accessTokenFactory,
SocializeMetrics metrics)
: Endpoint<LoginRequest, LoginResponse> : Endpoint<LoginRequest, LoginResponse>
{ {
public override void Configure() public override void Configure()
@@ -40,6 +42,7 @@ internal class LoginHandler(
user ??= await userManager.FindByNameAsync(request.Email); user ??= await userManager.FindByNameAsync(request.Email);
if (user is null) if (user is null)
{ {
metrics.RecordLoginAttempt(false, "unknown_user");
await SendStringAsync( await SendStringAsync(
"Invalid email or password", "Invalid email or password",
401, 401,
@@ -51,6 +54,7 @@ internal class LoginHandler(
bool isPasswordValid = await userManager.CheckPasswordAsync(user, request.Password); bool isPasswordValid = await userManager.CheckPasswordAsync(user, request.Password);
if (!isPasswordValid) if (!isPasswordValid)
{ {
metrics.RecordLoginAttempt(false, "invalid_password");
await SendStringAsync( await SendStringAsync(
"Invalid email or password", "Invalid email or password",
401, 401,
@@ -61,6 +65,7 @@ internal class LoginHandler(
// Check if the email is confirmed // Check if the email is confirmed
if (!user.EmailConfirmed) if (!user.EmailConfirmed)
{ {
metrics.RecordLoginAttempt(false, "email_unconfirmed");
await SendStringAsync( await SendStringAsync(
"Email not verified. Please check your email for verification instructions.", "Email not verified. Please check your email for verification instructions.",
401, 401,
@@ -76,6 +81,7 @@ internal class LoginHandler(
// Generate JWT token // Generate JWT token
string accessToken = await accessTokenFactory.CreateAsync(user); string accessToken = await accessTokenFactory.CreateAsync(user);
metrics.RecordLoginAttempt(true, "success");
await SendOkAsync( await SendOkAsync(
new LoginResponse(accessToken, user.RefreshToken), new LoginResponse(accessToken, user.RefreshToken),

View File

@@ -1,6 +1,7 @@
using FastEndpoints; using FastEndpoints;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.Organizations.Data; using Socialize.Api.Modules.Organizations.Data;
using Socialize.Api.Modules.Organizations.Services; using Socialize.Api.Modules.Organizations.Services;
@@ -21,7 +22,8 @@ internal class CreateOrganizationRequestValidator
} }
internal class CreateOrganizationHandler( internal class CreateOrganizationHandler(
AppDbContext dbContext) AppDbContext dbContext,
SocializeMetrics metrics)
: Endpoint<CreateOrganizationRequest, OrganizationDto> : Endpoint<CreateOrganizationRequest, OrganizationDto>
{ {
public override void Configure() public override void Configure()
@@ -66,6 +68,7 @@ internal class CreateOrganizationHandler(
dbContext.Organizations.Add(organization); dbContext.Organizations.Add(organization);
dbContext.OrganizationMemberships.Add(ownerMembership); dbContext.OrganizationMemberships.Add(ownerMembership);
await dbContext.SaveChangesAsync(ct); await dbContext.SaveChangesAsync(ct);
metrics.RecordOrganizationCreated(organization.Id);
await SendAsync( await SendAsync(
OrganizationDto.FromOrganization( OrganizationDto.FromOrganization(

View File

@@ -1,4 +1,5 @@
using Microsoft.Extensions.Options; using Microsoft.Extensions.Options;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Modules.ReleaseCommunications.Configuration; using Socialize.Api.Modules.ReleaseCommunications.Configuration;
namespace Socialize.Api.Modules.ReleaseCommunications.Services; namespace Socialize.Api.Modules.ReleaseCommunications.Services;
@@ -6,6 +7,7 @@ namespace Socialize.Api.Modules.ReleaseCommunications.Services;
internal sealed class ReleaseUpdateEmailDigestBackgroundService( internal sealed class ReleaseUpdateEmailDigestBackgroundService(
IServiceScopeFactory scopeFactory, IServiceScopeFactory scopeFactory,
IOptions<ReleaseCommunicationEmailOptions> options, IOptions<ReleaseCommunicationEmailOptions> options,
SocializeMetrics metrics,
ILogger<ReleaseUpdateEmailDigestBackgroundService> logger) ILogger<ReleaseUpdateEmailDigestBackgroundService> logger)
: BackgroundService : BackgroundService
{ {
@@ -42,6 +44,7 @@ internal sealed class ReleaseUpdateEmailDigestBackgroundService(
TimeSpan.FromHours(options.Value.DigestIntervalHours), TimeSpan.FromHours(options.Value.DigestIntervalHours),
force: false, force: false,
ct: stoppingToken); ct: stoppingToken);
metrics.RecordBackgroundJobRun(nameof(ReleaseUpdateEmailDigestBackgroundService), true);
if (sentCount > 0 && logger.IsEnabled(LogLevel.Information)) if (sentCount > 0 && logger.IsEnabled(LogLevel.Information))
{ {
logger.LogInformation("Sent {SentCount} release update digest emails.", sentCount); logger.LogInformation("Sent {SentCount} release update digest emails.", sentCount);
@@ -54,6 +57,7 @@ internal sealed class ReleaseUpdateEmailDigestBackgroundService(
#pragma warning disable CA1031 #pragma warning disable CA1031
catch (Exception ex) catch (Exception ex)
{ {
metrics.RecordBackgroundJobRun(nameof(ReleaseUpdateEmailDigestBackgroundService), false);
logger.LogError(ex, "Release update digest service failed."); logger.LogError(ex, "Release update digest service failed.");
} }
#pragma warning restore CA1031 #pragma warning restore CA1031

View File

@@ -1,6 +1,7 @@
using FastEndpoints; using FastEndpoints;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.Workspaces.Data; using Socialize.Api.Modules.Workspaces.Data;
@@ -24,7 +25,8 @@ internal class CreateWorkspaceRequestValidator
internal class CreateWorkspaceHandler( internal class CreateWorkspaceHandler(
AppDbContext dbContext, AppDbContext dbContext,
AccessScopeService accessScopeService) AccessScopeService accessScopeService,
SocializeMetrics metrics)
: Endpoint<CreateWorkspaceRequest, WorkspaceDto> : Endpoint<CreateWorkspaceRequest, WorkspaceDto>
{ {
public override void Configure() public override void Configure()
@@ -65,6 +67,7 @@ internal class CreateWorkspaceHandler(
dbContext.Workspaces.Add(workspace); dbContext.Workspaces.Add(workspace);
await dbContext.SaveChangesAsync(ct); await dbContext.SaveChangesAsync(ct);
metrics.RecordWorkspaceCreated(workspace.OrganizationId, workspace.Id);
WorkspaceDto dto = WorkspaceDto.FromWorkspace(workspace, []); WorkspaceDto dto = WorkspaceDto.FromWorkspace(workspace, []);

View File

@@ -1,6 +1,7 @@
using FastEndpoints; using FastEndpoints;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Socialize.Api.Data; using Socialize.Api.Data;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.Security; using Socialize.Api.Infrastructure.Security;
using Socialize.Api.Modules.Identity.Contracts; using Socialize.Api.Modules.Identity.Contracts;
using Socialize.Api.Modules.Workspaces.Data; using Socialize.Api.Modules.Workspaces.Data;
@@ -31,7 +32,8 @@ internal class CreateWorkspaceInviteRequestValidator
internal class CreateWorkspaceInviteHandler( internal class CreateWorkspaceInviteHandler(
AppDbContext dbContext, AppDbContext dbContext,
AccessScopeService accessScopeService) AccessScopeService accessScopeService,
SocializeMetrics metrics)
: Endpoint<CreateWorkspaceInviteRequest, WorkspaceInviteDto> : Endpoint<CreateWorkspaceInviteRequest, WorkspaceInviteDto>
{ {
public override void Configure() public override void Configure()
@@ -91,6 +93,7 @@ internal class CreateWorkspaceInviteHandler(
dbContext.WorkspaceInvites.Add(invite); dbContext.WorkspaceInvites.Add(invite);
await dbContext.SaveChangesAsync(ct); await dbContext.SaveChangesAsync(ct);
metrics.RecordWorkspaceInviteCreated(invite.WorkspaceId, invite.Role);
await SendAsync( await SendAsync(
new WorkspaceInviteDto( new WorkspaceInviteDto(

View File

@@ -6,6 +6,7 @@ using Socialize;
using Socialize.Api.Infrastructure.BlobStorage.Configuration; using Socialize.Api.Infrastructure.BlobStorage.Configuration;
using Socialize.Api.Infrastructure.BlobStorage.Services; using Socialize.Api.Infrastructure.BlobStorage.Services;
using Socialize.Api.Infrastructure; using Socialize.Api.Infrastructure;
using Socialize.Api.Infrastructure.Observability;
using Socialize.Api.Infrastructure.TestData; using Socialize.Api.Infrastructure.TestData;
using Socialize.Api.Modules.Approvals; using Socialize.Api.Modules.Approvals;
using Socialize.Api.Modules.Assets; using Socialize.Api.Modules.Assets;
@@ -44,6 +45,8 @@ builder.Services.AddCors(options =>
) )
); );
builder.AddObservability();
// Add services to the container. // Add services to the container.
builder.Services.AddWebServices(); builder.Services.AddWebServices();
builder.Services.AddAuthorizationAndAuthentication(builder.Configuration); builder.Services.AddAuthorizationAndAuthentication(builder.Configuration);
@@ -110,6 +113,7 @@ app.UseCors("AllowAll");
app.UseAuthentication(); app.UseAuthentication();
app.UseAuthorization(); app.UseAuthorization();
app.UseObservabilityLoggingScope();
// Initialize and seed the db. // Initialize and seed the db.
await app.UseAppDataAsync(); await app.UseAppDataAsync();
@@ -122,7 +126,7 @@ if (!app.Environment.IsDevelopment())
app.UseHsts(); app.UseHsts();
} }
app.UseHealthChecks("/health"); app.MapObservabilityHealthChecks();
LocalBlobStorageOptions localBlobStorageOptions = app.Services LocalBlobStorageOptions localBlobStorageOptions = app.Services
.GetRequiredService<IOptions<LocalBlobStorageOptions>>() .GetRequiredService<IOptions<LocalBlobStorageOptions>>()

View File

@@ -28,7 +28,13 @@
<PackageReference Include="Microsoft.EntityFrameworkCore.Relational" Version="10.0.0" /> <PackageReference Include="Microsoft.EntityFrameworkCore.Relational" Version="10.0.0" />
<PackageReference Include="Microsoft.Extensions.Diagnostics.HealthChecks.EntityFrameworkCore" <PackageReference Include="Microsoft.Extensions.Diagnostics.HealthChecks.EntityFrameworkCore"
Version="10.0.0" /> Version="10.0.0" />
<PackageReference Include="Npgsql.OpenTelemetry" Version="10.0.2" />
<PackageReference Include="Npgsql.EntityFrameworkCore.PostgreSQL" Version="10.0.0" /> <PackageReference Include="Npgsql.EntityFrameworkCore.PostgreSQL" Version="10.0.0" />
<PackageReference Include="OpenTelemetry.Exporter.OpenTelemetryProtocol" Version="1.15.3" />
<PackageReference Include="OpenTelemetry.Extensions.Hosting" Version="1.15.3" />
<PackageReference Include="OpenTelemetry.Instrumentation.AspNetCore" Version="1.15.2" />
<PackageReference Include="OpenTelemetry.Instrumentation.Http" Version="1.15.1" />
<PackageReference Include="OpenTelemetry.Instrumentation.Runtime" Version="1.15.1" />
<PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.0"> <PackageReference Include="Microsoft.EntityFrameworkCore.Design" Version="10.0.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets> <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets> <PrivateAssets>all</PrivateAssets>

View File

@@ -3,8 +3,9 @@ services:
image: postgres:16 image: postgres:16
restart: unless-stopped restart: unless-stopped
env_file: env_file:
- /etc/socialize/socialize.env - path: /etc/socialize/socialize.env
- .deploy.env - path: .deploy.env
required: false
environment: environment:
POSTGRES_DB: ${POSTGRES_DB} POSTGRES_DB: ${POSTGRES_DB}
POSTGRES_USER: ${POSTGRES_USER} POSTGRES_USER: ${POSTGRES_USER}
@@ -23,8 +24,9 @@ services:
image: git.mapachotes.com/jbourdon/socialize-api:${SOCIALIZE_IMAGE_TAG} image: git.mapachotes.com/jbourdon/socialize-api:${SOCIALIZE_IMAGE_TAG}
restart: unless-stopped restart: unless-stopped
env_file: env_file:
- /etc/socialize/socialize.env - path: /etc/socialize/socialize.env
- .deploy.env - path: .deploy.env
required: false
environment: environment:
ASPNETCORE_ENVIRONMENT: ${ASPNETCORE_ENVIRONMENT} ASPNETCORE_ENVIRONMENT: ${ASPNETCORE_ENVIRONMENT}
ASPNETCORE_URLS: ${ASPNETCORE_URLS} ASPNETCORE_URLS: ${ASPNETCORE_URLS}

View File

@@ -0,0 +1,22 @@
# Alertmanager routing for the preprod observability stack.
# Every alert ends up at the single "preprod-webhook" receiver defined below.
global:
resolve_timeout: 5m
route:
receiver: preprod-webhook
# Alerts that share the same alertname and service labels are grouped into one notification.
group_by:
- alertname
- service
group_wait: 30s
group_interval: 5m
repeat_interval: 4h
routes:
# Critical alerts use the same receiver but are repeated every 30m instead of every 4h.
- matchers:
- severity="critical"
receiver: preprod-webhook
repeat_interval: 30m
receivers:
- name: preprod-webhook
webhook_configs:
# NOTE(review): Alertmanager's config loader is not documented to expand ${VAR}
# placeholders on its own; confirm the deployment substitutes this value before
# Alertmanager reads the file, otherwise the URL stays a literal placeholder.
- url: ${ALERTMANAGER_WEBHOOK_URL}
send_resolved: true

View File

@@ -0,0 +1,95 @@
// Grafana Alloy pipeline for preprod:
//   OTLP in -> (metrics: label transform) -> batch -> Prometheus remote write / Tempo
//   Docker container logs -> Loki
logging {
level = "info"
format = "logfmt"
}
// OTLP receiver listening on gRPC :4317 and HTTP :4318.
// Metrics pass through the label transform first; traces go straight to the batch processor.
otelcol.receiver.otlp "api" {
grpc {
endpoint = "0.0.0.0:4317"
}
http {
endpoint = "0.0.0.0:4318"
}
output {
metrics = [otelcol.processor.transform.metric_labels.input]
traces = [otelcol.processor.batch.default.input]
}
}
// Copies the service.name and deployment.environment resource attributes onto every
// metric datapoint so they are available as datapoint attributes downstream.
// error_mode = "ignore" keeps the pipeline running when a statement fails.
otelcol.processor.transform "metric_labels" {
error_mode = "ignore"
metric_statements {
context = "datapoint"
statements = [
`set(attributes["service.name"], resource.attributes["service.name"])`,
`set(attributes["deployment.environment"], resource.attributes["deployment.environment"])`,
]
}
output {
metrics = [otelcol.processor.batch.default.input]
}
}
// Shared batch stage: metrics continue to the Prometheus exporter, traces to the Tempo exporter.
otelcol.processor.batch "default" {
output {
metrics = [otelcol.exporter.prometheus.local.input]
traces = [otelcol.exporter.otlp.tempo.input]
}
}
// Converts OTLP metrics to Prometheus samples and hands them to the remote-write client.
otelcol.exporter.prometheus "local" {
forward_to = [prometheus.remote_write.local.receiver]
}
// Pushes samples to the Prometheus remote-write endpoint.
prometheus.remote_write "local" {
endpoint {
url = "http://prometheus:9090/api/v1/write"
}
}
// Sends traces to Tempo over OTLP/gRPC without TLS.
otelcol.exporter.otlp "tempo" {
client {
endpoint = "tempo:4317"
tls {
insecure = true
}
}
}
// Discovers containers through the Docker socket.
discovery.docker "linux" {
host = "unix:///var/run/docker.sock"
}
// Rule holder only: targets is empty and loki.source.docker below consumes just the rules.
discovery.relabel "docker_logs" {
targets = []
// The regex captures the container name after its leading "/" for the service_name label.
rule {
source_labels = ["__meta_docker_container_name"]
regex = "/(.*)"
target_label = "service_name"
}
// Exposes the Compose service label as compose_service.
rule {
source_labels = ["__meta_docker_container_label_com_docker_compose_service"]
target_label = "compose_service"
}
}
// Reads logs of the discovered containers, tags them with platform="docker",
// applies the relabel rules above, and forwards them to the Loki writer.
loki.source.docker "default" {
host = "unix:///var/run/docker.sock"
targets = discovery.docker.linux.targets
labels = {"platform" = "docker"}
relabel_rules = discovery.relabel.docker_logs.rules
forward_to = [loki.write.local.receiver]
}
// Pushes log entries to the Loki push API.
loki.write "local" {
endpoint {
url = "http://loki:3100/loki/api/v1/push"
}
}

View File

@@ -0,0 +1,9 @@
# Blackbox exporter probe modules.
modules:
# HTTP GET probe over IPv4 with a 5s timeout.
# Despite the module name, only status 200 is listed as valid below.
http_2xx:
prober: http
timeout: 5s
http:
method: GET
preferred_ip_protocol: ip4
valid_status_codes:
- 200

View File

@@ -0,0 +1,13 @@
# Optional Caddy snippet for exposing Grafana through a protected hostname.
# Generate a password hash with:
# caddy hash-password --plaintext '<password>'
# The site address and the basic-auth user/hash are read from the
# OBSERVABILITY_HOST, GRAFANA_BASIC_AUTH_USER and GRAFANA_BASIC_AUTH_HASH placeholders.
{$OBSERVABILITY_HOST} {
encode gzip zstd
# NOTE(review): newer Caddy releases name this directive basic_auth and treat
# basicauth as a deprecated alias; confirm against the deployed Caddy version.
basicauth {
{$GRAFANA_BASIC_AUTH_USER} {$GRAFANA_BASIC_AUTH_HASH}
}
# Requests that pass basic auth are proxied to Grafana on port 3000.
reverse_proxy grafana:3000
}

View File

@@ -0,0 +1,121 @@
services:
# Overlay for the api service: adds OpenTelemetry exporter settings that send
# OTLP over gRPC to the alloy service, tagged with deployment.environment=preprod.
api:
environment:
OTEL_SERVICE_NAME: socialize-api
OTEL_EXPORTER_OTLP_ENDPOINT: http://alloy:4317
OTEL_EXPORTER_OTLP_PROTOCOL: grpc
OTEL_RESOURCE_ATTRIBUTES: deployment.environment=preprod
# Only waits for the alloy container to be started, not for it to be healthy.
depends_on:
alloy:
condition: service_started
# Grafana UI with provisioning and dashboards mounted read-only from the repository.
grafana:
image: grafana/grafana:13.0.1
restart: unless-stopped
environment:
# Both credentials fall back to "admin" when the variables are unset.
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
GF_USERS_ALLOW_SIGN_UP: "false"
volumes:
- grafana-data:/var/lib/grafana
- ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
- ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
ports:
# Published on 127.0.0.1 unless GRAFANA_HTTP_BIND overrides the bind address.
- "${GRAFANA_HTTP_BIND:-127.0.0.1}:3000:3000"
depends_on:
- prometheus
- loki
- tempo
- alertmanager
networks:
- internal
# Prometheus: stores metrics and evaluates the rules mounted under /etc/prometheus/rules.
prometheus:
image: prom/prometheus:v3.11.3
restart: unless-stopped
command:
- --config.file=/etc/prometheus/prometheus.yml
- --storage.tsdb.path=/prometheus
# Retention defaults to 15d unless PROMETHEUS_RETENTION is set.
- --storage.tsdb.retention.time=${PROMETHEUS_RETENTION:-15d}
# Enables the remote-write receiver endpoint.
- --web.enable-remote-write-receiver
volumes:
- prometheus-data:/prometheus
- ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- ./observability/prometheus/rules:/etc/prometheus/rules:ro
networks:
- internal
alertmanager:
  image: prom/alertmanager:v0.29.0
  restart: unless-stopped
  # Alertmanager has no --config.expand-env flag (that is a Loki flag) and does
  # not expand ${VAR} placeholders in its config file. Passing the flag makes the
  # process exit on an unknown argument, and the webhook URL would stay a literal
  # placeholder. Instead, render the mounted config into the writable storage
  # directory with the webhook URL substituted, then start Alertmanager on the
  # rendered copy. "$$" is Compose escaping for a literal "$" seen by the shell.
  entrypoint:
    - /bin/sh
    - -ec
    - |
      # Escape characters that are special in a sed replacement (&, | and \).
      escaped_url="$$(printf '%s' "$$ALERTMANAGER_WEBHOOK_URL" | sed 's/[&|\\]/\\&/g')"
      sed "s|[\$$]{ALERTMANAGER_WEBHOOK_URL}|$$escaped_url|g" \
        /etc/alertmanager/alertmanager.yml > /alertmanager/alertmanager.rendered.yml
      exec /bin/alertmanager \
        --config.file=/alertmanager/alertmanager.rendered.yml \
        --storage.path=/alertmanager
  environment:
    # Falls back to a local discard-port address so the config stays valid when
    # no webhook is configured.
    ALERTMANAGER_WEBHOOK_URL: ${ALERTMANAGER_WEBHOOK_URL:-http://127.0.0.1:9/}
  volumes:
    - alertmanager-data:/alertmanager
    - ./observability/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
  networks:
    - internal
# Blackbox exporter, configured from the repository's blackbox/config.yml (mounted read-only).
blackbox:
image: prom/blackbox-exporter:v0.27.0
restart: unless-stopped
command:
- --config.file=/etc/blackbox_exporter/config.yml
volumes:
- ./observability/blackbox/config.yml:/etc/blackbox_exporter/config.yml:ro
networks:
- internal
# Loki log store; data lives in the loki-data volume mounted at /loki.
loki:
image: grafana/loki:3.7.1
restart: unless-stopped
command: -config.file=/etc/loki/local-config.yml
volumes:
- loki-data:/loki
- ./observability/loki/local-config.yml:/etc/loki/local-config.yml:ro
networks:
- internal
# Tempo trace store; data lives in the tempo-data volume mounted at /var/tempo.
tempo:
image: grafana/tempo:2.10.3
restart: unless-stopped
command: -config.file=/etc/tempo.yml
volumes:
- tempo-data:/var/tempo
- ./observability/tempo/tempo.yml:/etc/tempo.yml:ro
networks:
- internal
# Grafana Alloy: runs the pipeline defined in /etc/alloy/config.alloy.
alloy:
image: grafana/alloy:v1.16.0
restart: unless-stopped
command:
- run
- --server.http.listen-addr=0.0.0.0:12345
- --storage.path=/var/lib/alloy/data
- /etc/alloy/config.alloy
volumes:
- alloy-data:/var/lib/alloy/data
# The Docker socket is mounted read-only into the container.
- /var/run/docker.sock:/var/run/docker.sock:ro
- ./observability/alloy/config.alloy:/etc/alloy/config.alloy:ro
# Ports are exposed to other containers only; nothing is published on the host.
expose:
- "4317"
- "4318"
- "12345"
networks:
- internal
# Named volumes, one per stateful service in this overlay.
volumes:
alertmanager-data:
grafana-data:
prometheus-data:
loki-data:
tempo-data:
alloy-data:
# Network that every service in this overlay attaches to.
networks:
internal:

View File

@@ -0,0 +1,485 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
},
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m]))",
"legendFormat": "requests/sec"
}
],
"title": "API Requests/sec",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
},
"id": 2,
"options": {
"colorMode": "background",
"graphMode": "area",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\", http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])), 0.001)",
"legendFormat": "5xx rate"
}
],
"title": "API 5xx Rate",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 6,
"x": 12,
"y": 0
},
"id": 3,
"options": {
"colorMode": "background",
"graphMode": "area",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))",
"legendFormat": "p95"
}
],
"title": "API p95 Latency",
"type": "stat"
},
{
  "datasource": {
    "type": "prometheus",
    "uid": "Prometheus"
  },
  "fieldConfig": {
    "defaults": {
      "unit": "short"
    },
    "overrides": []
  },
  "gridPos": {
    "h": 4,
    "w": 6,
    "x": 18,
    "y": 0
  },
  "id": 4,
  "options": {
    "colorMode": "background",
    "graphMode": "area",
    "orientation": "auto",
    "reduceOptions": {
      "calcs": [
        "lastNotNull"
      ],
      "fields": "",
      "values": false
    },
    "textMode": "auto"
  },
  "targets": [
    {
      "expr": "sum(ALERTS{alertstate=\"firing\"}) or vector(0)",
      "legendFormat": "firing"
    }
  ],
  "title": "Firing Alerts",
  "type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "reqps"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
},
"id": 5,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])) by (http_request_method, http_route)",
"legendFormat": "{{http_request_method}} {{http_route}}"
}
],
"title": "Request Rate By Endpoint",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"fieldConfig": {
"defaults": {
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
},
"id": 6,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "histogram_quantile(0.95, sum by (le, http_route) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))",
"legendFormat": "{{http_route}}"
}
],
"title": "p95 Latency By Endpoint",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 12
},
"id": 7,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(increase(socialize_login_attempts_total[24h])) by (outcome)",
"legendFormat": "login {{outcome}}"
},
{
"expr": "sum(increase(socialize_organizations_created_total[24h]))",
"legendFormat": "organizations"
},
{
"expr": "sum(increase(socialize_workspaces_created_total[24h]))",
"legendFormat": "workspaces"
},
{
"expr": "sum(increase(socialize_content_items_created_total[24h]))",
"legendFormat": "content"
},
{
"expr": "sum(increase(socialize_comments_created_total[24h]))",
"legendFormat": "comments"
},
{
"expr": "sum(increase(socialize_approval_decisions_submitted_total[24h]))",
"legendFormat": "approvals"
},
{
"expr": "sum(increase(socialize_feedback_submitted_total[24h]))",
"legendFormat": "feedback"
}
],
"title": "Usage Signals, 24h Rolling",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 12
},
"id": 8,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "sum(increase(socialize_email_delivery_total[1h])) by (outcome, provider)",
"legendFormat": "email {{provider}} {{outcome}}"
},
{
"expr": "sum(increase(socialize_blob_storage_operations_total[1h])) by (operation, outcome)",
"legendFormat": "blob {{operation}} {{outcome}}"
},
{
"expr": "sum(increase(socialize_background_job_runs_total[1h])) by (job, outcome)",
"legendFormat": "job {{job}} {{outcome}}"
}
],
"title": "Operational Events, 1h Rolling",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 20
},
"id": 11,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_content_items",
"legendFormat": "content {{status}}"
},
{
"expr": "socialize_workflow_feedback_reports",
"legendFormat": "feedback {{status}}"
}
],
"title": "Workflow Backlog",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 20
},
"id": 12,
"options": {
"legend": {
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "multi"
}
},
"targets": [
{
"expr": "socialize_workflow_active_workspaces",
"legendFormat": "active workspaces {{window}}"
},
{
"expr": "socialize_workflow_stale_in_approval",
"legendFormat": "stale in approval"
},
{
"expr": "socialize_workflow_pending_invites",
"legendFormat": "pending invites"
}
],
"title": "Workflow Health",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "Prometheus"
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 28
},
"id": 9,
"options": {
"showHeader": true
},
"targets": [
{
"expr": "ALERTS{alertstate=\"firing\"}",
"format": "table",
"instant": true,
"legendFormat": "{{alertname}}"
}
],
"title": "Firing Alerts",
"type": "table"
},
{
"datasource": {
"type": "loki",
"uid": "Loki"
},
"gridPos": {
"h": 9,
"w": 24,
"x": 0,
"y": 35
},
"id": 10,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"expr": "{platform=\"docker\", compose_service=\"api\"}",
"refId": "A"
}
],
"title": "API Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": [
"socialize",
"preprod"
],
"templating": {
"list": []
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Socialize Overview",
"uid": "socialize-overview",
"version": 2,
"weekStart": ""
}

View File

@@ -0,0 +1,11 @@
apiVersion: 1
providers:
- name: Socialize
orgId: 1
folder: Socialize
type: file
disableDeletion: false
updateIntervalSeconds: 30
options:
path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,26 @@
apiVersion: 1
datasources:
- name: Prometheus
uid: Prometheus
type: prometheus
access: proxy
url: http://prometheus:9090
isDefault: true
- name: Loki
uid: Loki
type: loki
access: proxy
url: http://loki:3100
- name: Tempo
uid: Tempo
type: tempo
access: proxy
url: http://tempo:3200
jsonData:
tracesToLogsV2:
datasourceUid: Loki
serviceMap:
datasourceUid: Prometheus

View File

@@ -0,0 +1,32 @@
auth_enabled: false
server:
http_listen_port: 3100
common:
path_prefix: /loki
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
- from: 2024-01-01
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
storage_config:
filesystem:
directory: /loki/chunks
limits_config:
allow_structured_metadata: true
volume_enabled: true
analytics:
reporting_enabled: false

View File

@@ -0,0 +1,42 @@
# Prometheus server configuration for the preprod observability overlay.
# Targets are scraped and rules are evaluated every 15 seconds.
global:
scrape_interval: 15s
evaluation_interval: 15s
# Alerts produced by the rule files are sent to the Alertmanager at alertmanager:9093.
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Every .yml file under /etc/prometheus/rules is loaded as a rule file.
rule_files:
- /etc/prometheus/rules/*.yml
scrape_configs:
# Prometheus scrapes itself.
- job_name: prometheus
static_configs:
- targets:
- prometheus:9090
# Scrapes the Alloy service at alloy:12345.
- job_name: alloy
static_configs:
- targets:
- alloy:12345
# Uptime probes. The URLs under `targets` are the endpoints to probe, not the
# addresses Prometheus scrapes; the relabel rules below redirect the scrape to
# the exporter at blackbox:9115, which is asked to run its `http_2xx` module
# through the /probe path. Alert rules select this job as job="preprod-uptime".
- job_name: preprod-uptime
metrics_path: /probe
params:
module:
- http_2xx
static_configs:
- targets:
- http://web/
- http://api:8080/health/ready
relabel_configs:
# 1. Copy the listed URL into the `target` query parameter of the probe request.
- source_labels:
- __address__
target_label: __param_target
# 2. Keep the probed URL as the `instance` label so results stay per-endpoint.
- source_labels:
- __param_target
target_label: instance
# 3. Point the actual scrape at the exporter instead of the probed URL.
- target_label: __address__
replacement: blackbox:9115

View File

@@ -0,0 +1,127 @@
# Prometheus alert rules for the Socialize preproduction stack.
# Each rule carries a `severity` label (critical, warning, or info) and a
# `service` label.
groups:
  - name: socialize-preprod
    rules:
      # Fires when no API request-duration series exists at all.
      - alert: SocializeApiTelemetryMissing
        expr: absent(http_server_request_duration_seconds_count{service_name="socialize-api"})
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API telemetry is missing
          description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down.

      # Fires per probed endpoint; `instance` is the probed URL of the
      # preprod-uptime scrape job.
      - alert: SocializePreprodEndpointDown
        expr: probe_success{job="preprod-uptime"} == 0
        for: 2m
        labels:
          severity: critical
          service: socialize-preprod
        annotations:
          summary: Preprod endpoint is down
          description: '{{ $labels.instance }} has failed blackbox checks for 2 minutes.'

      # Ratio of 5xx request rate to total request rate over 5 minutes.
      # clamp_min floors the denominator at 0.001 so the ratio never divides by zero.
      - alert: SocializeApiHighErrorRate
        expr: |
          (
            sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api", http_response_status_code=~"5.."}[5m]))
            /
            clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api"}[5m])), 0.001)
          ) > 0.05
        for: 5m
        labels:
          severity: critical
          service: socialize-api
        annotations:
          summary: Socialize API 5xx rate is high
          description: More than 5% of API requests are returning 5xx responses over 5 minutes.

      # p95 over all API requests: buckets are aggregated by `le` only, so this
      # is a service-wide quantile rather than a per-endpoint one.
      - alert: SocializeApiHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name="socialize-api"}[5m]))
          ) > 2
        for: 10m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize API p95 latency is high
          description: API p95 latency has been above 2 seconds for 10 minutes.

      # Sum of the four core usage counters over 12 hours.
      # Each term falls back to 0 with `or vector(0)`: a PromQL binary operation
      # against an empty vector yields an empty result, so without the fallback
      # a single counter with no series would make the whole expression empty
      # and the alert could never fire in exactly the quiet case it targets.
      - alert: SocializeCoreUsageQuiet
        expr: |
          (
            (sum(increase(socialize_content_items_created_total[12h])) or vector(0))
            + (sum(increase(socialize_comments_created_total[12h])) or vector(0))
            + (sum(increase(socialize_approval_decisions_submitted_total[12h])) or vector(0))
            + (sum(increase(socialize_feedback_submitted_total[12h])) or vector(0))
          ) < 1
        for: 30m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Socialize core usage is quiet
          description: No content, comment, approval, or feedback activity has been observed over the last 12 hours.

      # Gauge-based: fires while the stale-in-approval gauge stays above zero.
      - alert: SocializeContentStaleInApproval
        expr: socialize_workflow_stale_in_approval > 0
        for: 30m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Content is stale in approval
          description: One or more content items have been in approval longer than the configured threshold.

      # Uses only the 24h window of the active-workspaces gauge.
      - alert: SocializeNoActiveWorkspaces
        expr: socialize_workflow_active_workspaces{window="24h"} < 1
        for: 1h
        labels:
          severity: info
          service: socialize-api
        annotations:
          summary: No active workspaces in the last 24 hours
          description: No workspace has content workflow activity in the last 24 hours.

      # The remaining rules use `for: 0m`, so they fire on the first evaluation
      # in which the counter shows an increase inside the lookback window.
      - alert: SocializeFeedbackBugSubmitted
        expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0
        for: 0m
        labels:
          severity: info
          service: socialize-api
        annotations:
          summary: New bug feedback submitted
          description: A user submitted bug feedback in the last 15 minutes.

      - alert: SocializeEmailDeliveryFailures
        expr: sum(increase(socialize_email_delivery_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Email delivery failures detected
          description: One or more email delivery attempts failed in the last 15 minutes.

      - alert: SocializeBlobStorageFailures
        expr: sum(increase(socialize_blob_storage_operations_total{outcome="failure"}[15m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Blob storage failures detected
          description: One or more blob storage operations failed in the last 15 minutes.

      - alert: SocializeBackgroundJobFailures
        expr: sum(increase(socialize_background_job_runs_total{outcome="failure"}[30m])) > 0
        for: 0m
        labels:
          severity: warning
          service: socialize-api
        annotations:
          summary: Background job failures detected
          description: One or more background jobs failed in the last 30 minutes.

View File

@@ -0,0 +1,25 @@
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
storage:
trace:
backend: local
local:
path: /var/tempo/traces
compactor:
compaction:
block_retention: 168h
metrics_generator:
storage:
path: /var/tempo/generator/wal

View File

@@ -0,0 +1,94 @@
# Observability
## Status
Draft
## Goal
Give the SaaS operator preproduction visibility into whether Socialize is healthy and whether real users are exercising core workflows.
This feature is operator-facing. It is not a client-facing analytics suite or status page.
## Initial Scope
- structured backend logs suitable for centralized log search
- OpenTelemetry traces and metrics emitted by the API
- self-hosted Grafana observability stack for preproduction
- health, readiness, and liveness endpoints
- aggregate product usage counters for core workflow actions
- dashboards and alerts for app health and adoption signals
## Operational Signals
Health signals should cover:
- API availability
- Postgres connectivity
- request rate, latency, and error rate
- slow endpoints
- outbound HTTP failures
- background service failures
- email delivery failures
- blob storage failures
- authentication failures
Usage signals should cover aggregate counts for:
- login attempts and successful logins
- organizations and workspaces created
- content items created
- comments created
- approval decisions submitted
- feedback reports submitted
- workspace invites created
## Privacy And Safety Rules
- Do not log request bodies, access tokens, refresh tokens, passwords, uploaded file contents, screenshots, or raw customer content.
- Usage metrics are aggregate operational signals, not behavioral tracking.
- User, organization, and workspace identifiers may be included as structured attributes when already available to backend code.
- The first implementation targets preproduction and self-hosted Docker infrastructure only.
## Deployment Shape
The application emits OpenTelemetry over OTLP to a local collector.
The preproduction observability stack runs as an optional Docker Compose overlay with:
- Grafana for dashboards and alert visibility
- Prometheus for metrics and alert rule evaluation
- Alertmanager for alert notification routing
- Blackbox Exporter for uptime probes
- Loki for logs
- Tempo for traces
- Grafana Alloy for log collection and telemetry routing
The normal application compose file must remain usable without the observability overlay.
## Alerting
Preproduction alerting should start with local Prometheus alert rules. Notification routing is a separate operational setup step because the first preproduction target may use email, chat, or a private incident channel.
Initial alerts should cover:
- app telemetry missing
- preprod endpoint down (uptime probe failures)
- high API error rate
- high API p95 latency
- core usage unexpectedly quiet
- content stale in approval
- no active workspaces
- feedback bug reports submitted
- email delivery failures
- blob storage failures
- background job failures
## Workflow Health Gauges
Database-derived workflow health metrics should be sampled periodically instead of emitted per request.
Initial gauges should cover:
- content item counts by status
- feedback report counts by status
- pending workspace invites
- content stale in approval
- active workspace counts over 24-hour and 7-day windows
These are operator health signals. They should stay aggregate enough to avoid high-cardinality metric labels.

View File

@@ -0,0 +1,163 @@
# Observability Runbook
## Purpose
This runbook is for preproduction operation of Socialize's self-hosted observability stack.
The goal is to answer:
- Is the app reachable?
- Is the API healthy?
- Are errors or latency rising?
- Are users exercising core workflows?
- Are emails, blob storage, and background jobs failing?
- Is work getting stuck?
## Start The Stack
Run from the repository root on the preproduction host:
```bash
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d
```
Grafana listens on `127.0.0.1:3000` by default. Set `GRAFANA_HTTP_BIND=0.0.0.0`
only when Grafana is protected by a reverse proxy, VPN, firewall rule, or SSH tunnel.
Set these before exposing Grafana:
```bash
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=<strong-password>
```
## Alert Delivery
Prometheus sends alerts to Alertmanager. Alertmanager sends alerts to the webhook
configured by:
```bash
ALERTMANAGER_WEBHOOK_URL=<private-alert-webhook-url>
```
If no webhook URL is configured, Alertmanager still starts but alert delivery points
to a local discard endpoint.
Critical alerts repeat every 30 minutes. Other alerts repeat every 4 hours.
## Secure Grafana With Caddy
An optional Caddy snippet is available at:
```txt
deploy/observability/caddy/grafana.Caddyfile
```
Generate a Caddy password hash:
```bash
caddy hash-password --plaintext '<password>'
```
Configure:
```bash
OBSERVABILITY_HOST=observability.example.com
GRAFANA_BASIC_AUTH_USER=<user>
GRAFANA_BASIC_AUTH_HASH=<hash>
```
Keep Grafana private unless the hostname is protected.
## First Bring-Up Checks
1. Confirm containers are running:
```bash
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml ps
```
2. Check API health:
```bash
curl -i http://127.0.0.1:8080/health
curl -i http://127.0.0.1:8080/health/ready
```
3. Open Grafana and check the `Socialize Overview` dashboard.
4. Generate a few real actions:
- log in
- create a content item
- add a comment
- submit feedback
- create a workspace invite
5. Confirm metrics appear in the dashboard:
- API request rate
- usage signals
- workflow backlog
- operational events
## Alert Triage
`SocializePreprodEndpointDown`
- Check `docker compose ps`.
- Check `docker compose logs api web`.
- Check `/health/ready`.
`SocializeApiTelemetryMissing`
- Check that `api` has `OTEL_EXPORTER_OTLP_ENDPOINT=http://alloy:4317`.
- Check `docker compose logs alloy`.
- Check whether the API is receiving traffic.
`SocializeApiHighErrorRate`
- Open the API logs panel.
- Filter by recent `5xx` requests.
- Open Tempo traces for slow or failing requests if available.
`SocializeApiHighLatency`
- Check the p95 latency by endpoint panel.
- Inspect slow traces.
- Check database health and recent deploy activity.
`SocializeEmailDeliveryFailures`
- Check API logs for Resend failures.
- Confirm `RESEND_API_KEY` and `RESEND_FROM_EMAIL`.
- Confirm Resend service status outside this stack if needed.
`SocializeBlobStorageFailures`
- Confirm `./blob-storage` volume permissions on the preprod host.
- Check local disk space.
- Check API logs for validation or filesystem errors.
`SocializeBackgroundJobFailures`
- Check the operational events panel for the failing job name.
- Check API logs for the same time window.
`SocializeContentStaleInApproval`
- Use the app to inspect content currently in approval.
- Contact the relevant internal owner or client contact outside the app if needed.
`SocializeCoreUsageQuiet` or `SocializeNoActiveWorkspaces`
- Confirm whether quiet usage is expected for the period.
- If not expected, check login events and API reachability.
## Retention Defaults
- Prometheus keeps 15 days by default through `PROMETHEUS_RETENTION`.
- Tempo keeps traces for 168 hours.
- Loki uses local filesystem storage for preproduction; the provided Loki config sets no retention limit, so log data grows until one is configured.
Tune retention before heavy customer usage or long-running demos.

View File

@@ -0,0 +1,44 @@
# Observability 001: Preprod Foundation
## Goal
Add the first preproduction observability foundation for Socialize so the operator can tell whether the app is healthy and whether core workflows are being used.
## Feature Spec
- `docs/FEATURES/observability.md`
## Scope
- Add backend OpenTelemetry registration for traces and metrics.
- Add structured JSON console logging with request correlation context.
- Add aggregate custom counters for core usage events.
- Expand health endpoints with liveness and readiness checks.
- Add an optional Docker Compose observability overlay for Grafana, Prometheus, Loki, Tempo, and Alloy.
- Add basic Grafana datasource/dashboard provisioning.
## Likely Files
- `backend/src/Socialize.Api/Program.cs`
- `backend/src/Socialize.Api/ApplicationRegistration.cs`
- `backend/src/Socialize.Api/Infrastructure/Observability/*`
- selected backend handlers for usage counters
- `backend/src/Socialize.Api/Socialize.Api.csproj`
- `deploy/observability/*`
- `README.md`
## Out Of Scope
- Client-facing analytics or status page.
- Frontend behavioral analytics.
- Cloud telemetry providers.
- Long-term telemetry retention policy.
- Full product analytics warehouse.
## Validation
```bash
dotnet build backend/Socialize.slnx
dotnet test backend/Socialize.slnx
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
```

View File

@@ -0,0 +1,32 @@
# Observability 002: Alerts And Dashboard Hardening
## Goal
Make the preproduction observability stack actionable by adding alert rules, better operator dashboards, pinned image versions, and operational counters for services that commonly fail silently.
## Feature Spec
- `docs/FEATURES/observability.md`
## Scope
- Pin Grafana, Prometheus, Loki, Tempo, and Alloy image tags in the observability compose overlay.
- Add Prometheus alert rules for API health, error rate, latency, usage silence, feedback bugs, email failures, blob failures, and background job failures.
- Expand the Grafana dashboard with health, usage, operational failure, alert, log, and trace-oriented panels.
- Add backend counters for email delivery, blob storage operations, and background job runs.
- Document alerting and safe Grafana exposure expectations.
## Out Of Scope
- Notification delivery integration for alerts.
- Client-facing status page.
- Cloud observability backends.
- Full product analytics or session tracking.
## Validation
```bash
dotnet build backend/Socialize.slnx
dotnet test backend/Socialize.slnx
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
```

View File

@@ -0,0 +1,34 @@
# Observability 003: Preprod Operations Loop
## Goal
Close the preproduction operations loop by adding alert delivery scaffolding, uptime probes, workflow health gauges, secured Grafana guidance, and an operator runbook.
## Feature Spec
- `docs/FEATURES/observability.md`
## Scope
- Add Alertmanager to the optional observability compose overlay.
- Add Blackbox Exporter uptime probes for the web container and API readiness endpoint.
- Add backend database-derived workflow health gauges.
- Add Prometheus alerts for uptime probes and workflow health.
- Add an optional Caddy snippet for protected Grafana exposure.
- Add an operator runbook for bring-up, alert triage, and security defaults.
## Out Of Scope
- Operating the remote preproduction host.
- Choosing the final alert destination.
- Client-facing status page.
- External third-party uptime monitoring.
## Validation
```bash
dotnet build backend/Socialize.slnx
dotnet test backend/Socialize.slnx
docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config
jq empty deploy/observability/grafana/dashboards/socialize-overview.json
```