From 8bcff96821174423b4c8799a5cbc9b41bc1af71b Mon Sep 17 00:00:00 2001 From: Jonathan Bourdon Date: Fri, 8 May 2026 15:45:31 -0400 Subject: [PATCH] feat: add preprod observability foundation --- README.md | 31 ++ .../Socialize.Api/ApplicationRegistration.cs | 6 +- .../BlobStorage/Services/LocalBlobStorage.cs | 113 +++-- .../Emailer/Services/LoggerEmailSender.cs | 6 +- .../Emailer/Services/ResendEmailSender.cs | 38 +- .../EmailerConfigurationHealthCheck.cs | 29 ++ .../LocalBlobStorageHealthCheck.cs | 40 ++ .../ObservabilityRegistration.cs | 161 +++++++ .../RequestLoggingScopeMiddleware.cs | 61 +++ .../Observability/SocializeMetrics.cs | 158 +++++++ .../Handlers/SubmitApprovalDecision.cs | 5 +- .../CalendarImportBackgroundService.cs | 5 + .../Comments/Handlers/CreateComment.cs | 5 +- .../Handlers/CreateContentItem.cs | 5 +- .../Feedback/Handlers/SubmitFeedback.cs | 5 +- .../Modules/Identity/Handlers/Login.cs | 8 +- .../Handlers/CreateOrganization.cs | 5 +- ...leaseUpdateEmailDigestBackgroundService.cs | 4 + .../Workspaces/Handlers/CreateWorkspace.cs | 5 +- .../Handlers/CreateWorkspaceInvite.cs | 5 +- backend/src/Socialize.Api/Program.cs | 6 +- .../src/Socialize.Api/Socialize.Api.csproj | 6 + deploy/compose.yml | 10 +- deploy/observability/alloy/config.alloy | 95 ++++ .../observability/compose.observability.yml | 94 ++++ .../dashboards/socialize-overview.json | 413 ++++++++++++++++++ .../provisioning/dashboards/dashboards.yml | 11 + .../provisioning/datasources/datasources.yml | 26 ++ deploy/observability/loki/local-config.yml | 32 ++ .../observability/prometheus/prometheus.yml | 17 + .../prometheus/rules/socialize-alerts.yml | 97 ++++ deploy/observability/tempo/tempo.yml | 25 ++ docs/FEATURES/observability.md | 80 ++++ .../001-observability-foundation.md | 44 ++ .../002-alerts-dashboard-hardening.md | 32 ++ 35 files changed, 1627 insertions(+), 56 deletions(-) create mode 100644 backend/src/Socialize.Api/Infrastructure/Observability/EmailerConfigurationHealthCheck.cs 
create mode 100644 backend/src/Socialize.Api/Infrastructure/Observability/LocalBlobStorageHealthCheck.cs create mode 100644 backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs create mode 100644 backend/src/Socialize.Api/Infrastructure/Observability/RequestLoggingScopeMiddleware.cs create mode 100644 backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs create mode 100644 deploy/observability/alloy/config.alloy create mode 100644 deploy/observability/compose.observability.yml create mode 100644 deploy/observability/grafana/dashboards/socialize-overview.json create mode 100644 deploy/observability/grafana/provisioning/dashboards/dashboards.yml create mode 100644 deploy/observability/grafana/provisioning/datasources/datasources.yml create mode 100644 deploy/observability/loki/local-config.yml create mode 100644 deploy/observability/prometheus/prometheus.yml create mode 100644 deploy/observability/prometheus/rules/socialize-alerts.yml create mode 100644 deploy/observability/tempo/tempo.yml create mode 100644 docs/FEATURES/observability.md create mode 100644 docs/TASKS/observability/001-observability-foundation.md create mode 100644 docs/TASKS/observability/002-alerts-dashboard-hardening.md diff --git a/README.md b/README.md index 35a5ad95..3c4d229b 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,37 @@ The deploy workflow writes the remote `.env` file and syncs `deploy/compose.yml` before running the server deploy script. Use the raw Resend API key value for `RESEND_API_KEY`, without a `Bearer ` prefix. 
+## Preprod Observability + +The optional observability overlay runs a self-hosted Grafana stack for preproduction: + +- Grafana `13.0.1`: dashboards +- Prometheus `v3.11.3`: metrics and local alert rules +- Loki `3.7.1`: Docker/container logs +- Tempo `2.10.3`: traces +- Grafana Alloy `v1.16.0`: OTLP receiver and Docker log collector + +Start the app with observability: + +```bash +docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml up -d +``` + +Grafana is exposed at: + +```txt +http://127.0.0.1:3000 +``` + +Default credentials are `admin` / `admin` unless `GRAFANA_ADMIN_USER` and +`GRAFANA_ADMIN_PASSWORD` are set. Set `GRAFANA_HTTP_BIND=0.0.0.0` only when the +preprod network boundary is trusted or protected by a reverse proxy/VPN. + +Set a non-default `GRAFANA_ADMIN_PASSWORD` before exposing Grafana outside the +host. Prometheus alert rules are provisioned under +`deploy/observability/prometheus/rules/`; notification delivery is intentionally +left to the preprod operations environment. 
+ ## Solution ```bash diff --git a/backend/src/Socialize.Api/ApplicationRegistration.cs b/backend/src/Socialize.Api/ApplicationRegistration.cs index d11ba062..75f1013c 100644 --- a/backend/src/Socialize.Api/ApplicationRegistration.cs +++ b/backend/src/Socialize.Api/ApplicationRegistration.cs @@ -1,5 +1,6 @@ using System.Text; using Socialize.Api.Data; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Microsoft.EntityFrameworkCore; using Microsoft.AspNetCore.Authentication; @@ -20,7 +21,10 @@ internal static class ApplicationRegistration services.AddHttpContextAccessor(); services.AddHealthChecks() - .AddDbContextCheck(); + .AddCheck("self", () => Microsoft.Extensions.Diagnostics.HealthChecks.HealthCheckResult.Healthy(), tags: ["live"]) + .AddDbContextCheck("postgres", tags: ["ready"]) + .AddCheck("local_blob_storage", tags: ["ready"]) + .AddCheck("emailer_configuration", tags: ["ready"]); services.AddHttpClient(); services.AddScoped(); diff --git a/backend/src/Socialize.Api/Infrastructure/BlobStorage/Services/LocalBlobStorage.cs b/backend/src/Socialize.Api/Infrastructure/BlobStorage/Services/LocalBlobStorage.cs index 55ce45e6..53b3a630 100644 --- a/backend/src/Socialize.Api/Infrastructure/BlobStorage/Services/LocalBlobStorage.cs +++ b/backend/src/Socialize.Api/Infrastructure/BlobStorage/Services/LocalBlobStorage.cs @@ -1,6 +1,7 @@ using Microsoft.Extensions.Options; using Socialize.Api.Infrastructure.BlobStorage.Configuration; using Socialize.Api.Infrastructure.BlobStorage.Contracts; +using Socialize.Api.Infrastructure.Observability; namespace Socialize.Api.Infrastructure.BlobStorage.Services; @@ -8,7 +9,8 @@ internal sealed class LocalBlobStorage( IWebHostEnvironment environment, IHttpContextAccessor httpContextAccessor, IOptions options, - ILogger logger) + ILogger logger, + SocializeMetrics metrics) : IBlobStorage { private const long MaxUploadSize = 10 * 1024 * 1024; @@ -31,32 +33,51 @@ internal sealed class 
LocalBlobStorage( string contentType, CancellationToken ct = default) { - stream.Position = 0; - - if (stream.Length > MaxUploadSize) + try { - logger.LogError("Blob storage: File size exceeds the maximum allowed size of {MaxUploadSize} bytes.", MaxUploadSize); - throw new InvalidOperationException($"Blob storage: File size exceeds the maximum allowed size of {MaxUploadSize} bytes."); - } + stream.Position = 0; - if (!ContentTypes.IsAllowed(contentType, stream)) + if (stream.Length > MaxUploadSize) + { + logger.LogError("Blob storage: File size exceeds the maximum allowed size of {MaxUploadSize} bytes.", MaxUploadSize); + throw new InvalidOperationException($"Blob storage: File size exceeds the maximum allowed size of {MaxUploadSize} bytes."); + } + + if (!ContentTypes.IsAllowed(contentType, stream)) + { + logger.LogError("Blob storage: Unsupported file type {ContentType}.", contentType); + throw new InvalidOperationException("Unsupported file type."); + } + + string relativePath = GetSafeRelativePath(containerName, blobName); + string filePath = Path.Combine(GetRootPath(), relativePath); + Directory.CreateDirectory(Path.GetDirectoryName(filePath) ?? 
GetRootPath()); + + await using FileStream fileStream = File.Create(filePath); + await stream.CopyToAsync(fileStream, ct); + await File.WriteAllTextAsync(GetContentTypeMetadataPath(filePath), contentType, ct); + + string fileUri = BuildPublicUrl(relativePath); + LogUploadedFile(logger, blobName, containerName, contentType, fileUri, null); + metrics.RecordBlobStorageOperation("upload", true); + + return fileUri; + } + catch (InvalidOperationException) { - logger.LogError("Blob storage: Unsupported file type {ContentType}.", contentType); - throw new InvalidOperationException("Unsupported file type."); + metrics.RecordBlobStorageOperation("upload", false); + throw; + } + catch (IOException) + { + metrics.RecordBlobStorageOperation("upload", false); + throw; + } + catch (UnauthorizedAccessException) + { + metrics.RecordBlobStorageOperation("upload", false); + throw; } - - string relativePath = GetSafeRelativePath(containerName, blobName); - string filePath = Path.Combine(GetRootPath(), relativePath); - Directory.CreateDirectory(Path.GetDirectoryName(filePath) ?? 
GetRootPath()); - - await using FileStream fileStream = File.Create(filePath); - await stream.CopyToAsync(fileStream, ct); - await File.WriteAllTextAsync(GetContentTypeMetadataPath(filePath), contentType, ct); - - string fileUri = BuildPublicUrl(relativePath); - LogUploadedFile(logger, blobName, containerName, contentType, fileUri, null); - - return fileUri; } public async Task DownloadFileAsync( @@ -64,19 +85,43 @@ internal sealed class LocalBlobStorage( string blobName, CancellationToken ct = default) { - string filePath = Path.Combine(GetRootPath(), GetSafeRelativePath(containerName, blobName)); - - if (!File.Exists(filePath)) + try { - throw new FileNotFoundException("Blob storage: Local file was not found.", blobName); + string filePath = Path.Combine(GetRootPath(), GetSafeRelativePath(containerName, blobName)); + + if (!File.Exists(filePath)) + { + throw new FileNotFoundException("Blob storage: Local file was not found.", blobName); + } + + MemoryStream memoryStream = new(); + await using FileStream fileStream = File.OpenRead(filePath); + await fileStream.CopyToAsync(memoryStream, ct); + memoryStream.Position = 0; + metrics.RecordBlobStorageOperation("download", true); + + return memoryStream; + } + catch (InvalidOperationException) + { + metrics.RecordBlobStorageOperation("download", false); + throw; + } + catch (FileNotFoundException) + { + metrics.RecordBlobStorageOperation("download", false); + throw; + } + catch (IOException) + { + metrics.RecordBlobStorageOperation("download", false); + throw; + } + catch (UnauthorizedAccessException) + { + metrics.RecordBlobStorageOperation("download", false); + throw; } - - MemoryStream memoryStream = new(); - await using FileStream fileStream = File.OpenRead(filePath); - await fileStream.CopyToAsync(memoryStream, ct); - memoryStream.Position = 0; - - return memoryStream; } internal string GetRootPath() diff --git a/backend/src/Socialize.Api/Infrastructure/Emailer/Services/LoggerEmailSender.cs 
b/backend/src/Socialize.Api/Infrastructure/Emailer/Services/LoggerEmailSender.cs index 74f729bc..47649e39 100644 --- a/backend/src/Socialize.Api/Infrastructure/Emailer/Services/LoggerEmailSender.cs +++ b/backend/src/Socialize.Api/Infrastructure/Emailer/Services/LoggerEmailSender.cs @@ -1,8 +1,11 @@ using Socialize.Api.Infrastructure.Emailer.Contracts; +using Socialize.Api.Infrastructure.Observability; namespace Socialize.Api.Infrastructure.Emailer.Services; -internal class LoggerEmailSender(ILogger logger) +internal class LoggerEmailSender( + ILogger logger, + SocializeMetrics metrics) : IEmailSender { private static readonly Action LogDevelopmentEmail = @@ -14,6 +17,7 @@ internal class LoggerEmailSender(ILogger logger) public Task SendEmailAsync(string email, string subject, string message) { LogDevelopmentEmail(logger, email, subject, Environment.NewLine, message, null); + metrics.RecordEmailDelivery("logger", true); return Task.CompletedTask; } diff --git a/backend/src/Socialize.Api/Infrastructure/Emailer/Services/ResendEmailSender.cs b/backend/src/Socialize.Api/Infrastructure/Emailer/Services/ResendEmailSender.cs index dc7b0312..b6e5efcf 100644 --- a/backend/src/Socialize.Api/Infrastructure/Emailer/Services/ResendEmailSender.cs +++ b/backend/src/Socialize.Api/Infrastructure/Emailer/Services/ResendEmailSender.cs @@ -3,6 +3,7 @@ using System.Text; using System.Text.Json; using Socialize.Api.Infrastructure.Emailer.Configuration; using Socialize.Api.Infrastructure.Emailer.Contracts; +using Socialize.Api.Infrastructure.Observability; using Microsoft.Extensions.Options; namespace Socialize.Api.Infrastructure.Emailer.Services; @@ -11,13 +12,16 @@ internal class ResendEmailSender : IEmailSender { private static readonly Uri EndpointUri = new("https://api.resend.com/emails"); private readonly HttpClient _httpClient; + private readonly SocializeMetrics _metrics; private readonly EmailerOptions _options; public ResendEmailSender( IHttpClientFactory httpClientFactory, - 
IOptions options) + IOptions options, + SocializeMetrics metrics) { _httpClient = httpClientFactory.CreateClient(); + _metrics = metrics; _options = options.Value; string apiKey = NormalizeApiKey(_options.ApiKey); @@ -49,13 +53,33 @@ internal class ResendEmailSender : IEmailSender string json = JsonSerializer.Serialize(payload); using StringContent content = new(json, Encoding.UTF8, "application/json"); - using HttpResponseMessage response = await _httpClient.PostAsync(EndpointUri, content); - - if (!response.IsSuccessStatusCode) + try { - string body = await response.Content.ReadAsStringAsync(); - throw new InvalidOperationException( - $"Resend email failed: {response.StatusCode} - {body}"); + using HttpResponseMessage response = await _httpClient.PostAsync(EndpointUri, content); + + if (!response.IsSuccessStatusCode) + { + string body = await response.Content.ReadAsStringAsync(); + throw new InvalidOperationException( + $"Resend email failed: {response.StatusCode} - {body}"); + } + + _metrics.RecordEmailDelivery("resend", true); + } + catch (HttpRequestException) + { + _metrics.RecordEmailDelivery("resend", false); + throw; + } + catch (TaskCanceledException) + { + _metrics.RecordEmailDelivery("resend", false); + throw; + } + catch (InvalidOperationException) + { + _metrics.RecordEmailDelivery("resend", false); + throw; } } diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/EmailerConfigurationHealthCheck.cs b/backend/src/Socialize.Api/Infrastructure/Observability/EmailerConfigurationHealthCheck.cs new file mode 100644 index 00000000..4cd3bc12 --- /dev/null +++ b/backend/src/Socialize.Api/Infrastructure/Observability/EmailerConfigurationHealthCheck.cs @@ -0,0 +1,29 @@ +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Options; +using Socialize.Api.Infrastructure.Emailer.Configuration; + +namespace Socialize.Api.Infrastructure.Observability; + +internal sealed class EmailerConfigurationHealthCheck( + 
IWebHostEnvironment environment, + IOptions options) + : IHealthCheck +{ + public Task CheckHealthAsync( + HealthCheckContext context, + CancellationToken cancellationToken = default) + { + if (environment.IsDevelopment()) + { + return Task.FromResult(HealthCheckResult.Healthy("Development email sender logs email instead of delivering it.")); + } + + EmailerOptions value = options.Value; + if (string.IsNullOrWhiteSpace(value.ApiKey) || string.IsNullOrWhiteSpace(value.FromEmail)) + { + return Task.FromResult(HealthCheckResult.Unhealthy("Emailer API key or from address is missing.")); + } + + return Task.FromResult(HealthCheckResult.Healthy("Emailer configuration is present.")); + } +} diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/LocalBlobStorageHealthCheck.cs b/backend/src/Socialize.Api/Infrastructure/Observability/LocalBlobStorageHealthCheck.cs new file mode 100644 index 00000000..9635b3a2 --- /dev/null +++ b/backend/src/Socialize.Api/Infrastructure/Observability/LocalBlobStorageHealthCheck.cs @@ -0,0 +1,40 @@ +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Microsoft.Extensions.Options; +using Socialize.Api.Infrastructure.BlobStorage.Configuration; +using Socialize.Api.Infrastructure.BlobStorage.Services; + +namespace Socialize.Api.Infrastructure.Observability; + +internal sealed class LocalBlobStorageHealthCheck( + LocalBlobStorage blobStorage, + IOptions options) + : IHealthCheck +{ + public async Task CheckHealthAsync( + HealthCheckContext context, + CancellationToken cancellationToken = default) + { + string rootPath = blobStorage.GetRootPath(); + if (string.IsNullOrWhiteSpace(options.Value.RequestPath)) + { + return HealthCheckResult.Unhealthy("Local blob storage request path is not configured."); + } + + try + { + Directory.CreateDirectory(rootPath); + string probePath = Path.Combine(rootPath, ".healthcheck"); + await File.WriteAllTextAsync( + probePath, + DateTimeOffset.UtcNow.ToString("O", 
System.Globalization.CultureInfo.InvariantCulture), + cancellationToken); + File.Delete(probePath); + + return HealthCheckResult.Healthy("Local blob storage is writable."); + } + catch (Exception ex) when (ex is IOException or UnauthorizedAccessException) + { + return HealthCheckResult.Unhealthy("Local blob storage is not writable.", ex); + } + } +} diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs b/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs new file mode 100644 index 00000000..baaab361 --- /dev/null +++ b/backend/src/Socialize.Api/Infrastructure/Observability/ObservabilityRegistration.cs @@ -0,0 +1,161 @@ +using System.Text.Json; +using Microsoft.AspNetCore.Diagnostics.HealthChecks; +using Microsoft.Extensions.Diagnostics.HealthChecks; +using Npgsql; +using OpenTelemetry.Logs; +using OpenTelemetry.Metrics; +using OpenTelemetry.Resources; +using OpenTelemetry.Trace; + +namespace Socialize.Api.Infrastructure.Observability; + +internal static class ObservabilityRegistration +{ + private const string DefaultServiceName = "socialize-api"; + + public static WebApplicationBuilder AddObservability(this WebApplicationBuilder builder) + { + string serviceName = GetConfigurationValue(builder.Configuration, "OTEL_SERVICE_NAME", DefaultServiceName); + string serviceVersion = typeof(Program).Assembly.GetName().Version?.ToString() ?? 
"unknown"; + + builder.Logging.Configure(options => + { + options.ActivityTrackingOptions = + ActivityTrackingOptions.TraceId | + ActivityTrackingOptions.SpanId | + ActivityTrackingOptions.ParentId; + }); + + builder.Logging.AddJsonConsole(options => + { + options.IncludeScopes = true; + options.TimestampFormat = "yyyy-MM-ddTHH:mm:ss.fffZ"; + options.UseUtcTimestamp = true; + options.JsonWriterOptions = new JsonWriterOptions { Indented = false }; + }); + + bool otlpEnabled = HasOtlpEndpoint(builder.Configuration); + if (otlpEnabled) + { + builder.Logging.AddOpenTelemetry(options => + { + options.IncludeFormattedMessage = true; + options.IncludeScopes = true; + options.ParseStateValues = true; + options.SetResourceBuilder(BuildResource(serviceName, serviceVersion)); + options.AddOtlpExporter(); + }); + } + + builder.Services.AddSingleton(); + builder.Services + .AddOpenTelemetry() + .ConfigureResource(resource => resource.AddService( + serviceName, + serviceVersion: serviceVersion)) + .WithTracing(tracing => + { + tracing + .AddSource(SocializeMetrics.ActivitySourceName) + .AddAspNetCoreInstrumentation(options => + { + options.RecordException = true; + }) + .AddHttpClientInstrumentation() + .AddNpgsql(); + + if (otlpEnabled) + { + tracing.AddOtlpExporter(); + } + }) + .WithMetrics(metrics => + { + metrics + .AddMeter(SocializeMetrics.MeterName) + .AddAspNetCoreInstrumentation() + .AddHttpClientInstrumentation() + .AddRuntimeInstrumentation(); + + if (otlpEnabled) + { + metrics.AddOtlpExporter(); + } + }); + + return builder; + } + + public static IApplicationBuilder UseObservabilityLoggingScope(this IApplicationBuilder app) + { + return app.UseMiddleware(); + } + + public static IEndpointRouteBuilder MapObservabilityHealthChecks(this IEndpointRouteBuilder endpoints) + { + endpoints.MapHealthChecks( + "/health", + new HealthCheckOptions { ResponseWriter = WriteHealthResponseAsync }); + endpoints.MapHealthChecks( + "/health/live", + new HealthCheckOptions + { + 
Predicate = registration => registration.Tags.Contains("live", StringComparer.Ordinal), + ResponseWriter = WriteHealthResponseAsync, + }); + endpoints.MapHealthChecks( + "/health/ready", + new HealthCheckOptions + { + Predicate = registration => registration.Tags.Contains("ready", StringComparer.Ordinal), + ResponseWriter = WriteHealthResponseAsync, + }); + + return endpoints; + } + + private static ResourceBuilder BuildResource(string serviceName, string serviceVersion) + { + return ResourceBuilder.CreateDefault().AddService( + serviceName, + serviceVersion: serviceVersion); + } + + private static bool HasOtlpEndpoint(ConfigurationManager configuration) + { + return !string.IsNullOrWhiteSpace(configuration["OTEL_EXPORTER_OTLP_ENDPOINT"]) || + !string.IsNullOrWhiteSpace(configuration["Otlp:Endpoint"]); + } + + private static string GetConfigurationValue( + ConfigurationManager configuration, + string key, + string fallback) + { + string? value = configuration[key]; + return string.IsNullOrWhiteSpace(value) ? 
fallback : value; + } + + private static async Task WriteHealthResponseAsync(HttpContext context, HealthReport report) + { + context.Response.ContentType = "application/json"; + + var response = new + { + status = report.Status.ToString(), + checks = report.Entries.Select(entry => new + { + name = entry.Key, + status = entry.Value.Status.ToString(), + description = entry.Value.Description, + duration = entry.Value.Duration.TotalMilliseconds, + }), + duration = report.TotalDuration.TotalMilliseconds, + }; + + await JsonSerializer.SerializeAsync( + context.Response.Body, + response, + cancellationToken: context.RequestAborted); + } +} diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/RequestLoggingScopeMiddleware.cs b/backend/src/Socialize.Api/Infrastructure/Observability/RequestLoggingScopeMiddleware.cs new file mode 100644 index 00000000..365a405b --- /dev/null +++ b/backend/src/Socialize.Api/Infrastructure/Observability/RequestLoggingScopeMiddleware.cs @@ -0,0 +1,61 @@ +using System.Diagnostics; +using Socialize.Api.Infrastructure.Security; + +namespace Socialize.Api.Infrastructure.Observability; + +internal sealed class RequestLoggingScopeMiddleware( + RequestDelegate next, + ILogger logger) +{ + public async Task InvokeAsync(HttpContext context) + { + Dictionary scope = new() + { + ["trace_id"] = Activity.Current?.TraceId.ToString() ?? 
context.TraceIdentifier, + ["span_id"] = Activity.Current?.SpanId.ToString(), + ["http.method"] = context.Request.Method, + ["url.path"] = context.Request.Path.Value, + }; + + if (context.User.Identity?.IsAuthenticated == true) + { + scope["user.id"] = context.User.GetUserId(); + scope["user.email"] = context.User.GetEmail(); + } + + AddGuidIfPresent(scope, "organization.id", context, "organizationId"); + AddGuidIfPresent(scope, "workspace.id", context, "workspaceId"); + AddGuidIfPresent(scope, "client.id", context, "clientId"); + AddGuidIfPresent(scope, "campaign.id", context, "campaignId"); + AddGuidIfPresent(scope, "content_item.id", context, "contentItemId"); + + using IDisposable? _ = logger.BeginScope(scope); + await next(context); + } + + private static void AddGuidIfPresent( + Dictionary scope, + string scopeKey, + HttpContext context, + string requestKey) + { + string? value = GetRouteOrQueryValue(context, requestKey); + if (Guid.TryParse(value, out Guid id)) + { + scope[scopeKey] = id; + } + } + + private static string? GetRouteOrQueryValue(HttpContext context, string key) + { + object? routeValue = context.Request.RouteValues[key]; + if (routeValue is not null) + { + return Convert.ToString(routeValue, System.Globalization.CultureInfo.InvariantCulture); + } + + return context.Request.Query.TryGetValue(key, out Microsoft.Extensions.Primitives.StringValues queryValue) + ? 
queryValue.ToString() + : null; + } +} diff --git a/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs b/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs new file mode 100644 index 00000000..c52ad933 --- /dev/null +++ b/backend/src/Socialize.Api/Infrastructure/Observability/SocializeMetrics.cs @@ -0,0 +1,158 @@ +using System.Diagnostics; +using System.Diagnostics.Metrics; + +namespace Socialize.Api.Infrastructure.Observability; + +internal sealed class SocializeMetrics : IDisposable +{ + public const string MeterName = "Socialize.Api"; + public const string ActivitySourceName = "Socialize.Api"; + + private readonly Counter _approvalDecisionCounter; + private readonly Counter _backgroundJobRunCounter; + private readonly Counter _blobStorageOperationCounter; + private readonly Counter _commentCreatedCounter; + private readonly Counter _contentItemCreatedCounter; + private readonly Counter _emailDeliveryCounter; + private readonly Counter _feedbackSubmittedCounter; + private readonly Counter _loginAttemptCounter; + private readonly Counter _organizationCreatedCounter; + private readonly Counter _workspaceCreatedCounter; + private readonly Counter _workspaceInviteCreatedCounter; + + public SocializeMetrics() + { + Meter = new Meter(MeterName); + ActivitySource = new ActivitySource(ActivitySourceName); + + _loginAttemptCounter = Meter.CreateCounter( + "socialize.login.attempts", + description: "Login attempts partitioned by outcome."); + _organizationCreatedCounter = Meter.CreateCounter( + "socialize.organizations.created", + description: "Organizations created."); + _workspaceCreatedCounter = Meter.CreateCounter( + "socialize.workspaces.created", + description: "Workspaces created."); + _contentItemCreatedCounter = Meter.CreateCounter( + "socialize.content_items.created", + description: "Content items created."); + _commentCreatedCounter = Meter.CreateCounter( + "socialize.comments.created", + description: "Comments 
created."); + _approvalDecisionCounter = Meter.CreateCounter( + "socialize.approval_decisions.submitted", + description: "Approval decisions submitted."); + _feedbackSubmittedCounter = Meter.CreateCounter( + "socialize.feedback.submitted", + description: "Feedback reports submitted."); + _workspaceInviteCreatedCounter = Meter.CreateCounter( + "socialize.workspace_invites.created", + description: "Workspace invites created."); + _emailDeliveryCounter = Meter.CreateCounter( + "socialize.email.delivery", + description: "Email delivery attempts partitioned by outcome and provider."); + _blobStorageOperationCounter = Meter.CreateCounter( + "socialize.blob_storage.operations", + description: "Blob storage operations partitioned by operation and outcome."); + _backgroundJobRunCounter = Meter.CreateCounter( + "socialize.background_job.runs", + description: "Background job runs partitioned by job and outcome."); + } + + public Meter Meter { get; } + + public ActivitySource ActivitySource { get; } + + public void RecordLoginAttempt(bool succeeded, string reason) + { + _loginAttemptCounter.Add( + 1, + new KeyValuePair("outcome", succeeded ? 
"success" : "failure"), + new KeyValuePair("reason", reason)); + } + + public void RecordOrganizationCreated(Guid organizationId) + { + _organizationCreatedCounter.Add( + 1, + new KeyValuePair("organization.id", organizationId)); + } + + public void RecordWorkspaceCreated(Guid organizationId, Guid workspaceId) + { + _workspaceCreatedCounter.Add( + 1, + new KeyValuePair("organization.id", organizationId), + new KeyValuePair("workspace.id", workspaceId)); + } + + public void RecordContentItemCreated(Guid workspaceId) + { + _contentItemCreatedCounter.Add( + 1, + new KeyValuePair("workspace.id", workspaceId)); + } + + public void RecordCommentCreated(Guid workspaceId, bool hasAttachment) + { + _commentCreatedCounter.Add( + 1, + new KeyValuePair("workspace.id", workspaceId), + new KeyValuePair("has_attachment", hasAttachment)); + } + + public void RecordApprovalDecisionSubmitted(Guid workspaceId, string decision) + { + _approvalDecisionCounter.Add( + 1, + new KeyValuePair("workspace.id", workspaceId), + new KeyValuePair("decision", decision)); + } + + public void RecordFeedbackSubmitted(string type, Guid? workspaceId) + { + _feedbackSubmittedCounter.Add( + 1, + new KeyValuePair("feedback.type", type), + new KeyValuePair("workspace.id", workspaceId?.ToString() ?? "none")); + } + + public void RecordWorkspaceInviteCreated(Guid workspaceId, string role) + { + _workspaceInviteCreatedCounter.Add( + 1, + new KeyValuePair("workspace.id", workspaceId), + new KeyValuePair("role", role)); + } + + public void RecordEmailDelivery(string provider, bool succeeded) + { + _emailDeliveryCounter.Add( + 1, + new KeyValuePair("provider", provider), + new KeyValuePair("outcome", succeeded ? "success" : "failure")); + } + + public void RecordBlobStorageOperation(string operation, bool succeeded) + { + _blobStorageOperationCounter.Add( + 1, + new KeyValuePair("operation", operation), + new KeyValuePair("outcome", succeeded ? 
"success" : "failure")); + } + + public void RecordBackgroundJobRun(string job, bool succeeded) + { + _backgroundJobRunCounter.Add( + 1, + new KeyValuePair("job", job), + new KeyValuePair("outcome", succeeded ? "success" : "failure")); + } + + public void Dispose() + { + Meter.Dispose(); + ActivitySource.Dispose(); + } +} diff --git a/backend/src/Socialize.Api/Modules/Approvals/Handlers/SubmitApprovalDecision.cs b/backend/src/Socialize.Api/Modules/Approvals/Handlers/SubmitApprovalDecision.cs index cca63370..ae1e721e 100644 --- a/backend/src/Socialize.Api/Modules/Approvals/Handlers/SubmitApprovalDecision.cs +++ b/backend/src/Socialize.Api/Modules/Approvals/Handlers/SubmitApprovalDecision.cs @@ -1,6 +1,7 @@ using FastEndpoints; using Microsoft.EntityFrameworkCore; using Socialize.Api.Data; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.ContentItems.Data; using Socialize.Api.Modules.ContentItems.Contracts; @@ -37,7 +38,8 @@ internal class SubmitApprovalDecisionHandler( AccessScopeService accessScopeService, ApprovalWorkflowRuntimeService approvalWorkflowRuntimeService, IContentItemActivityWriter activityWriter, - INotificationEventWriter notificationEventWriter) + INotificationEventWriter notificationEventWriter, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -157,6 +159,7 @@ internal class SubmitApprovalDecisionHandler( $$"""{"stage":"{{approval.Stage}}","status":"{{contentItem.Status}}"}"""), ct); } + metrics.RecordApprovalDecisionSubmitted(approval.WorkspaceId, normalizedDecision); List decisions = await dbContext.ApprovalDecisions .Where(candidate => candidate.ApprovalRequestId == approval.Id) diff --git a/backend/src/Socialize.Api/Modules/CalendarIntegrations/Services/CalendarImportBackgroundService.cs b/backend/src/Socialize.Api/Modules/CalendarIntegrations/Services/CalendarImportBackgroundService.cs index 8bd03c43..ab071647 100644 --- 
a/backend/src/Socialize.Api/Modules/CalendarIntegrations/Services/CalendarImportBackgroundService.cs +++ b/backend/src/Socialize.Api/Modules/CalendarIntegrations/Services/CalendarImportBackgroundService.cs @@ -1,7 +1,10 @@ +using Socialize.Api.Infrastructure.Observability; + namespace Socialize.Api.Modules.CalendarIntegrations.Services; internal sealed class CalendarImportBackgroundService( IServiceScopeFactory scopeFactory, + SocializeMetrics metrics, ILogger logger) : BackgroundService { @@ -22,6 +25,7 @@ internal sealed class CalendarImportBackgroundService( using IServiceScope scope = scopeFactory.CreateScope(); CalendarImportSyncService syncService = scope.ServiceProvider.GetRequiredService(); await syncService.RefreshDueSourcesAsync(stoppingToken); + metrics.RecordBackgroundJobRun(nameof(CalendarImportBackgroundService), true); } catch (OperationCanceledException ex) when (stoppingToken.IsCancellationRequested) { @@ -30,6 +34,7 @@ internal sealed class CalendarImportBackgroundService( #pragma warning disable CA1031 // Background service should log and continue after unexpected sync failures. 
catch (Exception ex) { + metrics.RecordBackgroundJobRun(nameof(CalendarImportBackgroundService), false); logger.LogError(ex, "Calendar import background sync failed."); } #pragma warning restore CA1031 diff --git a/backend/src/Socialize.Api/Modules/Comments/Handlers/CreateComment.cs b/backend/src/Socialize.Api/Modules/Comments/Handlers/CreateComment.cs index b3a97ca0..985ee8b2 100644 --- a/backend/src/Socialize.Api/Modules/Comments/Handlers/CreateComment.cs +++ b/backend/src/Socialize.Api/Modules/Comments/Handlers/CreateComment.cs @@ -2,6 +2,7 @@ using FastEndpoints; using Microsoft.EntityFrameworkCore; using Socialize.Api.Data; using Socialize.Api.Infrastructure.BlobStorage.Contracts; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.ContentItems.Contracts; using Socialize.Api.Modules.ContentItems.Data; @@ -34,7 +35,8 @@ internal class CreateCommentHandler( AccessScopeService accessScopeService, IBlobStorage blobStorage, IContentItemActivityWriter activityWriter, - INotificationEventWriter notificationEventWriter) + INotificationEventWriter notificationEventWriter, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -156,6 +158,7 @@ internal class CreateCommentHandler( dbContext.Comments.Add(comment); await dbContext.SaveChangesAsync(ct); + metrics.RecordCommentCreated(comment.WorkspaceId, comment.AttachmentBlobName is not null); string? 
authorPortraitUrl = await dbContext.Users .Where(candidate => candidate.Id == comment.AuthorUserId) diff --git a/backend/src/Socialize.Api/Modules/ContentItems/Handlers/CreateContentItem.cs b/backend/src/Socialize.Api/Modules/ContentItems/Handlers/CreateContentItem.cs index 54a5836f..25d3d69a 100644 --- a/backend/src/Socialize.Api/Modules/ContentItems/Handlers/CreateContentItem.cs +++ b/backend/src/Socialize.Api/Modules/ContentItems/Handlers/CreateContentItem.cs @@ -1,6 +1,7 @@ using FastEndpoints; using Microsoft.EntityFrameworkCore; using Socialize.Api.Data; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.ContentItems.Contracts; using Socialize.Api.Modules.Notifications.Contracts; @@ -39,7 +40,8 @@ internal class CreateContentItemHandler( AppDbContext dbContext, AccessScopeService accessScopeService, IContentItemActivityWriter activityWriter, - INotificationEventWriter notificationEventWriter) + INotificationEventWriter notificationEventWriter, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -123,6 +125,7 @@ internal class CreateContentItemHandler( CreatedAt = DateTimeOffset.UtcNow, }); await dbContext.SaveChangesAsync(ct); + metrics.RecordContentItemCreated(item.WorkspaceId); await activityWriter.WriteAsync( new ContentItemActivityWriteModel( diff --git a/backend/src/Socialize.Api/Modules/Feedback/Handlers/SubmitFeedback.cs b/backend/src/Socialize.Api/Modules/Feedback/Handlers/SubmitFeedback.cs index 5027a0ae..7067aabd 100644 --- a/backend/src/Socialize.Api/Modules/Feedback/Handlers/SubmitFeedback.cs +++ b/backend/src/Socialize.Api/Modules/Feedback/Handlers/SubmitFeedback.cs @@ -1,5 +1,6 @@ using FastEndpoints; using Socialize.Api.Data; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.Feedback.Contracts; using Socialize.Api.Modules.Feedback.Data; @@ -45,7 +46,8 @@ internal class 
SubmitFeedbackRequestValidator internal class SubmitFeedbackHandler( AppDbContext dbContext, - FeedbackNotificationService notificationService) + FeedbackNotificationService notificationService, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -93,6 +95,7 @@ internal class SubmitFeedbackHandler( dbContext.FeedbackReports.Add(report); await notificationService.AddNewReportNotificationsAsync(report, ct); await dbContext.SaveChangesAsync(ct); + metrics.RecordFeedbackSubmitted(report.Type.ToString(), report.WorkspaceId); await SendAsync(report.ToDto(), StatusCodes.Status201Created, ct); } diff --git a/backend/src/Socialize.Api/Modules/Identity/Handlers/Login.cs b/backend/src/Socialize.Api/Modules/Identity/Handlers/Login.cs index e4ca22e0..ff9a6f84 100644 --- a/backend/src/Socialize.Api/Modules/Identity/Handlers/Login.cs +++ b/backend/src/Socialize.Api/Modules/Identity/Handlers/Login.cs @@ -1,5 +1,6 @@ using FastEndpoints; using Microsoft.Extensions.Options; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.Identity.Data; using Socialize.Api.Modules.Identity.Configuration; @@ -21,7 +22,8 @@ internal record LoginResponse( internal class LoginHandler( UserManager userManager, IOptionsSnapshot jwtOptions, - AccessTokenFactory accessTokenFactory) + AccessTokenFactory accessTokenFactory, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -40,6 +42,7 @@ internal class LoginHandler( user ??= await userManager.FindByNameAsync(request.Email); if (user is null) { + metrics.RecordLoginAttempt(false, "unknown_user"); await SendStringAsync( "Invalid email or password", 401, @@ -51,6 +54,7 @@ internal class LoginHandler( bool isPasswordValid = await userManager.CheckPasswordAsync(user, request.Password); if (!isPasswordValid) { + metrics.RecordLoginAttempt(false, "invalid_password"); await SendStringAsync( "Invalid email or password", 401, @@ -61,6 +65,7 @@ 
internal class LoginHandler( // Check if the email is confirmed if (!user.EmailConfirmed) { + metrics.RecordLoginAttempt(false, "email_unconfirmed"); await SendStringAsync( "Email not verified. Please check your email for verification instructions.", 401, @@ -76,6 +81,7 @@ internal class LoginHandler( // Generate JWT token string accessToken = await accessTokenFactory.CreateAsync(user); + metrics.RecordLoginAttempt(true, "success"); await SendOkAsync( new LoginResponse(accessToken, user.RefreshToken), diff --git a/backend/src/Socialize.Api/Modules/Organizations/Handlers/CreateOrganization.cs b/backend/src/Socialize.Api/Modules/Organizations/Handlers/CreateOrganization.cs index ee930a3e..360c014c 100644 --- a/backend/src/Socialize.Api/Modules/Organizations/Handlers/CreateOrganization.cs +++ b/backend/src/Socialize.Api/Modules/Organizations/Handlers/CreateOrganization.cs @@ -1,6 +1,7 @@ using FastEndpoints; using Microsoft.EntityFrameworkCore; using Socialize.Api.Data; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.Organizations.Data; using Socialize.Api.Modules.Organizations.Services; @@ -21,7 +22,8 @@ internal class CreateOrganizationRequestValidator } internal class CreateOrganizationHandler( - AppDbContext dbContext) + AppDbContext dbContext, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -66,6 +68,7 @@ internal class CreateOrganizationHandler( dbContext.Organizations.Add(organization); dbContext.OrganizationMemberships.Add(ownerMembership); await dbContext.SaveChangesAsync(ct); + metrics.RecordOrganizationCreated(organization.Id); await SendAsync( OrganizationDto.FromOrganization( diff --git a/backend/src/Socialize.Api/Modules/ReleaseCommunications/Services/ReleaseUpdateEmailDigestBackgroundService.cs b/backend/src/Socialize.Api/Modules/ReleaseCommunications/Services/ReleaseUpdateEmailDigestBackgroundService.cs index cbf0f4e4..a36577c7 100644 --- 
a/backend/src/Socialize.Api/Modules/ReleaseCommunications/Services/ReleaseUpdateEmailDigestBackgroundService.cs +++ b/backend/src/Socialize.Api/Modules/ReleaseCommunications/Services/ReleaseUpdateEmailDigestBackgroundService.cs @@ -1,4 +1,5 @@ using Microsoft.Extensions.Options; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Modules.ReleaseCommunications.Configuration; namespace Socialize.Api.Modules.ReleaseCommunications.Services; @@ -6,6 +7,7 @@ namespace Socialize.Api.Modules.ReleaseCommunications.Services; internal sealed class ReleaseUpdateEmailDigestBackgroundService( IServiceScopeFactory scopeFactory, IOptions options, + SocializeMetrics metrics, ILogger logger) : BackgroundService { @@ -42,6 +44,7 @@ internal sealed class ReleaseUpdateEmailDigestBackgroundService( TimeSpan.FromHours(options.Value.DigestIntervalHours), force: false, ct: stoppingToken); + metrics.RecordBackgroundJobRun(nameof(ReleaseUpdateEmailDigestBackgroundService), true); if (sentCount > 0 && logger.IsEnabled(LogLevel.Information)) { logger.LogInformation("Sent {SentCount} release update digest emails.", sentCount); @@ -54,6 +57,7 @@ internal sealed class ReleaseUpdateEmailDigestBackgroundService( #pragma warning disable CA1031 catch (Exception ex) { + metrics.RecordBackgroundJobRun(nameof(ReleaseUpdateEmailDigestBackgroundService), false); logger.LogError(ex, "Release update digest service failed."); } #pragma warning restore CA1031 diff --git a/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspace.cs b/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspace.cs index 9061273e..53ae5546 100644 --- a/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspace.cs +++ b/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspace.cs @@ -1,6 +1,7 @@ using FastEndpoints; using Microsoft.EntityFrameworkCore; using Socialize.Api.Data; +using Socialize.Api.Infrastructure.Observability; using 
Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.Workspaces.Data; @@ -24,7 +25,8 @@ internal class CreateWorkspaceRequestValidator internal class CreateWorkspaceHandler( AppDbContext dbContext, - AccessScopeService accessScopeService) + AccessScopeService accessScopeService, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -65,6 +67,7 @@ internal class CreateWorkspaceHandler( dbContext.Workspaces.Add(workspace); await dbContext.SaveChangesAsync(ct); + metrics.RecordWorkspaceCreated(workspace.OrganizationId, workspace.Id); WorkspaceDto dto = WorkspaceDto.FromWorkspace(workspace, []); diff --git a/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspaceInvite.cs b/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspaceInvite.cs index 04c0fcc6..b426d0fe 100644 --- a/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspaceInvite.cs +++ b/backend/src/Socialize.Api/Modules/Workspaces/Handlers/CreateWorkspaceInvite.cs @@ -1,6 +1,7 @@ using FastEndpoints; using Microsoft.EntityFrameworkCore; using Socialize.Api.Data; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.Security; using Socialize.Api.Modules.Identity.Contracts; using Socialize.Api.Modules.Workspaces.Data; @@ -31,7 +32,8 @@ internal class CreateWorkspaceInviteRequestValidator internal class CreateWorkspaceInviteHandler( AppDbContext dbContext, - AccessScopeService accessScopeService) + AccessScopeService accessScopeService, + SocializeMetrics metrics) : Endpoint { public override void Configure() @@ -91,6 +93,7 @@ internal class CreateWorkspaceInviteHandler( dbContext.WorkspaceInvites.Add(invite); await dbContext.SaveChangesAsync(ct); + metrics.RecordWorkspaceInviteCreated(invite.WorkspaceId, invite.Role); await SendAsync( new WorkspaceInviteDto( diff --git a/backend/src/Socialize.Api/Program.cs b/backend/src/Socialize.Api/Program.cs index c9d927df..a576d3b3 100644 --- 
a/backend/src/Socialize.Api/Program.cs +++ b/backend/src/Socialize.Api/Program.cs @@ -6,6 +6,7 @@ using Socialize; using Socialize.Api.Infrastructure.BlobStorage.Configuration; using Socialize.Api.Infrastructure.BlobStorage.Services; using Socialize.Api.Infrastructure; +using Socialize.Api.Infrastructure.Observability; using Socialize.Api.Infrastructure.TestData; using Socialize.Api.Modules.Approvals; using Socialize.Api.Modules.Assets; @@ -44,6 +45,8 @@ builder.Services.AddCors(options => ) ); +builder.AddObservability(); + // Add services to the container. builder.Services.AddWebServices(); builder.Services.AddAuthorizationAndAuthentication(builder.Configuration); @@ -110,6 +113,7 @@ app.UseCors("AllowAll"); app.UseAuthentication(); app.UseAuthorization(); +app.UseObservabilityLoggingScope(); // Initialize and seed the db. await app.UseAppDataAsync(); @@ -122,7 +126,7 @@ if (!app.Environment.IsDevelopment()) app.UseHsts(); } -app.UseHealthChecks("/health"); +app.MapObservabilityHealthChecks(); LocalBlobStorageOptions localBlobStorageOptions = app.Services .GetRequiredService>() diff --git a/backend/src/Socialize.Api/Socialize.Api.csproj b/backend/src/Socialize.Api/Socialize.Api.csproj index f017c668..f42100e2 100644 --- a/backend/src/Socialize.Api/Socialize.Api.csproj +++ b/backend/src/Socialize.Api/Socialize.Api.csproj @@ -28,7 +28,13 @@ + + + + + + runtime; build; native; contentfiles; analyzers; buildtransitive all diff --git a/deploy/compose.yml b/deploy/compose.yml index 2d33f2b6..adf20b95 100644 --- a/deploy/compose.yml +++ b/deploy/compose.yml @@ -3,8 +3,9 @@ services: image: postgres:16 restart: unless-stopped env_file: - - /etc/socialize/socialize.env - - .deploy.env + - path: /etc/socialize/socialize.env + - path: .deploy.env + required: false environment: POSTGRES_DB: ${POSTGRES_DB} POSTGRES_USER: ${POSTGRES_USER} @@ -23,8 +24,9 @@ services: image: git.mapachotes.com/jbourdon/socialize-api:${SOCIALIZE_IMAGE_TAG} restart: unless-stopped env_file: - - 
/etc/socialize/socialize.env - - .deploy.env + - path: /etc/socialize/socialize.env + - path: .deploy.env + required: false environment: ASPNETCORE_ENVIRONMENT: ${ASPNETCORE_ENVIRONMENT} ASPNETCORE_URLS: ${ASPNETCORE_URLS} diff --git a/deploy/observability/alloy/config.alloy b/deploy/observability/alloy/config.alloy new file mode 100644 index 00000000..c7fc3dbb --- /dev/null +++ b/deploy/observability/alloy/config.alloy @@ -0,0 +1,95 @@ +logging { + level = "info" + format = "logfmt" +} + +otelcol.receiver.otlp "api" { + grpc { + endpoint = "0.0.0.0:4317" + } + + http { + endpoint = "0.0.0.0:4318" + } + + output { + metrics = [otelcol.processor.transform.metric_labels.input] + traces = [otelcol.processor.batch.default.input] + } +} + +otelcol.processor.transform "metric_labels" { + error_mode = "ignore" + + metric_statements { + context = "datapoint" + statements = [ + `set(attributes["service.name"], resource.attributes["service.name"])`, + `set(attributes["deployment.environment"], resource.attributes["deployment.environment"])`, + ] + } + + output { + metrics = [otelcol.processor.batch.default.input] + } +} + +otelcol.processor.batch "default" { + output { + metrics = [otelcol.exporter.prometheus.local.input] + traces = [otelcol.exporter.otlp.tempo.input] + } +} + +otelcol.exporter.prometheus "local" { + forward_to = [prometheus.remote_write.local.receiver] +} + +prometheus.remote_write "local" { + endpoint { + url = "http://prometheus:9090/api/v1/write" + } +} + +otelcol.exporter.otlp "tempo" { + client { + endpoint = "tempo:4317" + + tls { + insecure = true + } + } +} + +discovery.docker "linux" { + host = "unix:///var/run/docker.sock" +} + +discovery.relabel "docker_logs" { + targets = [] + + rule { + source_labels = ["__meta_docker_container_name"] + regex = "/(.*)" + target_label = "service_name" + } + + rule { + source_labels = ["__meta_docker_container_label_com_docker_compose_service"] + target_label = "compose_service" + } +} + +loki.source.docker 
"default" { + host = "unix:///var/run/docker.sock" + targets = discovery.docker.linux.targets + labels = {"platform" = "docker"} + relabel_rules = discovery.relabel.docker_logs.rules + forward_to = [loki.write.local.receiver] +} + +loki.write "local" { + endpoint { + url = "http://loki:3100/loki/api/v1/push" + } +} diff --git a/deploy/observability/compose.observability.yml b/deploy/observability/compose.observability.yml new file mode 100644 index 00000000..57048803 --- /dev/null +++ b/deploy/observability/compose.observability.yml @@ -0,0 +1,94 @@ +services: + api: + environment: + OTEL_SERVICE_NAME: socialize-api + OTEL_EXPORTER_OTLP_ENDPOINT: http://alloy:4317 + OTEL_EXPORTER_OTLP_PROTOCOL: grpc + OTEL_RESOURCE_ATTRIBUTES: deployment.environment=preprod + depends_on: + alloy: + condition: service_started + + grafana: + image: grafana/grafana:13.0.1 + restart: unless-stopped + environment: + GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin} + GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin} + GF_USERS_ALLOW_SIGN_UP: "false" + volumes: + - grafana-data:/var/lib/grafana + - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro + - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro + ports: + - "${GRAFANA_HTTP_BIND:-127.0.0.1}:3000:3000" + depends_on: + - prometheus + - loki + - tempo + networks: + - internal + + prometheus: + image: prom/prometheus:v3.11.3 + restart: unless-stopped + command: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=${PROMETHEUS_RETENTION:-15d} + - --web.enable-remote-write-receiver + volumes: + - prometheus-data:/prometheus + - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./observability/prometheus/rules:/etc/prometheus/rules:ro + networks: + - internal + + loki: + image: grafana/loki:3.7.1 + restart: unless-stopped + command: -config.file=/etc/loki/local-config.yml + volumes: + - 
loki-data:/loki + - ./observability/loki/local-config.yml:/etc/loki/local-config.yml:ro + networks: + - internal + + tempo: + image: grafana/tempo:2.10.3 + restart: unless-stopped + command: -config.file=/etc/tempo.yml + volumes: + - tempo-data:/var/tempo + - ./observability/tempo/tempo.yml:/etc/tempo.yml:ro + networks: + - internal + + alloy: + image: grafana/alloy:v1.16.0 + restart: unless-stopped + command: + - run + - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data + - /etc/alloy/config.alloy + volumes: + - alloy-data:/var/lib/alloy/data + - /var/run/docker.sock:/var/run/docker.sock:ro + - ./observability/alloy/config.alloy:/etc/alloy/config.alloy:ro + expose: + - "4317" + - "4318" + - "12345" + networks: + - internal + +volumes: + grafana-data: + prometheus-data: + loki-data: + tempo-data: + alloy-data: + +networks: + internal: diff --git a/deploy/observability/grafana/dashboards/socialize-overview.json b/deploy/observability/grafana/dashboards/socialize-overview.json new file mode 100644 index 00000000..89599631 --- /dev/null +++ b/deploy/observability/grafana/dashboards/socialize-overview.json @@ -0,0 +1,413 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m]))", + "legendFormat": "requests/sec" + } + ], + "title": "API Requests/sec", + "type": "stat" + }, + 
{ + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\", http_response_status_code=~\"5..\"}[5m])) / clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])), 0.001)", + "legendFormat": "5xx rate" + } + ], + "title": "API 5xx Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))", + "legendFormat": "p95" + } + ], + "title": "API p95 Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\"})", + "legendFormat": 
"firing" + } + ], + "title": "Firing Alerts", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum(rate(http_server_request_duration_seconds_count{service_name=\"socialize-api\"}[5m])) by (http_request_method, http_route)", + "legendFormat": "{{http_request_method}} {{http_route}}" + } + ], + "title": "Request Rate By Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 6, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum by (le, http_route) (rate(http_server_request_duration_seconds_bucket{service_name=\"socialize-api\"}[5m])))", + "legendFormat": "{{http_route}}" + } + ], + "title": "p95 Latency By Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 7, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum(increase(socialize_login_attempts_total[24h])) by (outcome)", + "legendFormat": "login {{outcome}}" + }, + { + "expr": "sum(increase(socialize_organizations_created_total[24h]))", + "legendFormat": "organizations" + }, + { + "expr": "sum(increase(socialize_workspaces_created_total[24h]))", + "legendFormat": "workspaces" + }, + { + "expr": 
"sum(increase(socialize_content_items_created_total[24h]))", + "legendFormat": "content" + }, + { + "expr": "sum(increase(socialize_comments_created_total[24h]))", + "legendFormat": "comments" + }, + { + "expr": "sum(increase(socialize_approval_decisions_submitted_total[24h]))", + "legendFormat": "approvals" + }, + { + "expr": "sum(increase(socialize_feedback_submitted_total[24h]))", + "legendFormat": "feedback" + } + ], + "title": "Usage Signals, 24h Rolling", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 8, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "multi" + } + }, + "targets": [ + { + "expr": "sum(increase(socialize_email_delivery_total[1h])) by (outcome, provider)", + "legendFormat": "email {{provider}} {{outcome}}" + }, + { + "expr": "sum(increase(socialize_blob_storage_operations_total[1h])) by (operation, outcome)", + "legendFormat": "blob {{operation}} {{outcome}}" + }, + { + "expr": "sum(increase(socialize_background_job_runs_total[1h])) by (job, outcome)", + "legendFormat": "job {{job}} {{outcome}}" + } + ], + "title": "Operational Events, 1h Rolling", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "Prometheus" + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 9, + "options": { + "showHeader": true + }, + "targets": [ + { + "expr": "ALERTS{alertstate=\"firing\"}", + "format": "table", + "instant": true, + "legendFormat": "{{alertname}}" + } + ], + "title": "Firing Alerts", + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "Loki" + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 10, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + 
"sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { + "expr": "{platform=\"docker\", compose_service=\"api\"}", + "refId": "A" + } + ], + "title": "API Logs", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "socialize", + "preprod" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Socialize Overview", + "uid": "socialize-overview", + "version": 2, + "weekStart": "" +} diff --git a/deploy/observability/grafana/provisioning/dashboards/dashboards.yml b/deploy/observability/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000..96de9114 --- /dev/null +++ b/deploy/observability/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: Socialize + orgId: 1 + folder: Socialize + type: file + disableDeletion: false + updateIntervalSeconds: 30 + options: + path: /var/lib/grafana/dashboards diff --git a/deploy/observability/grafana/provisioning/datasources/datasources.yml b/deploy/observability/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 00000000..ec1abd44 --- /dev/null +++ b/deploy/observability/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,26 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + uid: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + + - name: Loki + uid: Loki + type: loki + access: proxy + url: http://loki:3100 + + - name: Tempo + uid: Tempo + type: tempo + access: proxy + url: http://tempo:3200 + jsonData: + tracesToLogsV2: + datasourceUid: Loki + serviceMap: + datasourceUid: Prometheus diff --git a/deploy/observability/loki/local-config.yml b/deploy/observability/loki/local-config.yml new file mode 100644 index 00000000..5d5236b5 --- /dev/null +++ b/deploy/observability/loki/local-config.yml @@ -0,0 +1,32 @@ +auth_enabled: false + +server: 
+ http_listen_port: 3100 + +common: + path_prefix: /loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /loki/chunks + +limits_config: + allow_structured_metadata: true + volume_enabled: true + +analytics: + reporting_enabled: false diff --git a/deploy/observability/prometheus/prometheus.yml b/deploy/observability/prometheus/prometheus.yml new file mode 100644 index 00000000..a782e6b0 --- /dev/null +++ b/deploy/observability/prometheus/prometheus.yml @@ -0,0 +1,17 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + +rule_files: + - /etc/prometheus/rules/*.yml + +scrape_configs: + - job_name: prometheus + static_configs: + - targets: + - prometheus:9090 + + - job_name: alloy + static_configs: + - targets: + - alloy:12345 diff --git a/deploy/observability/prometheus/rules/socialize-alerts.yml b/deploy/observability/prometheus/rules/socialize-alerts.yml new file mode 100644 index 00000000..20a08e43 --- /dev/null +++ b/deploy/observability/prometheus/rules/socialize-alerts.yml @@ -0,0 +1,97 @@ +groups: + - name: socialize-preprod + rules: + - alert: SocializeApiTelemetryMissing + expr: absent(http_server_request_duration_seconds_count{service_name="socialize-api"}) + for: 5m + labels: + severity: critical + service: socialize-api + annotations: + summary: Socialize API telemetry is missing + description: No API request telemetry has been received for 5 minutes. The API or telemetry pipeline may be down. 
+ + - alert: SocializeApiHighErrorRate + expr: | + ( + sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api", http_response_status_code=~"5.."}[5m])) + / + clamp_min(sum(rate(http_server_request_duration_seconds_count{service_name="socialize-api"}[5m])), 0.001) + ) > 0.05 + for: 5m + labels: + severity: critical + service: socialize-api + annotations: + summary: Socialize API 5xx rate is high + description: More than 5% of API requests are returning 5xx responses over 5 minutes. + + - alert: SocializeApiHighLatency + expr: | + histogram_quantile( + 0.95, + sum by (le) (rate(http_server_request_duration_seconds_bucket{service_name="socialize-api"}[5m])) + ) > 2 + for: 10m + labels: + severity: warning + service: socialize-api + annotations: + summary: Socialize API p95 latency is high + description: API p95 latency has been above 2 seconds for 10 minutes. + + - alert: SocializeCoreUsageQuiet + expr: | + ( + (sum(increase(socialize_content_items_created_total[12h])) or vector(0)) + + (sum(increase(socialize_comments_created_total[12h])) or vector(0)) + + (sum(increase(socialize_approval_decisions_submitted_total[12h])) or vector(0)) + + (sum(increase(socialize_feedback_submitted_total[12h])) or vector(0)) + ) < 1 + for: 30m + labels: + severity: warning + service: socialize-api + annotations: + summary: Socialize core usage is quiet + description: No content, comment, approval, or feedback activity has been observed over the last 12 hours. + + - alert: SocializeFeedbackBugSubmitted + expr: sum(increase(socialize_feedback_submitted_total{feedback_type="Bug"}[15m])) > 0 + for: 0m + labels: + severity: info + service: socialize-api + annotations: + summary: New bug feedback submitted + description: A user submitted bug feedback in the last 15 minutes. 
+ + - alert: SocializeEmailDeliveryFailures + expr: sum(increase(socialize_email_delivery_total{outcome="failure"}[15m])) > 0 + for: 0m + labels: + severity: warning + service: socialize-api + annotations: + summary: Email delivery failures detected + description: One or more email delivery attempts failed in the last 15 minutes. + + - alert: SocializeBlobStorageFailures + expr: sum(increase(socialize_blob_storage_operations_total{outcome="failure"}[15m])) > 0 + for: 0m + labels: + severity: warning + service: socialize-api + annotations: + summary: Blob storage failures detected + description: One or more blob storage operations failed in the last 15 minutes. + + - alert: SocializeBackgroundJobFailures + expr: sum(increase(socialize_background_job_runs_total{outcome="failure"}[30m])) > 0 + for: 0m + labels: + severity: warning + service: socialize-api + annotations: + summary: Background job failures detected + description: One or more background jobs failed in the last 30 minutes. diff --git a/deploy/observability/tempo/tempo.yml b/deploy/observability/tempo/tempo.yml new file mode 100644 index 00000000..a78f1bef --- /dev/null +++ b/deploy/observability/tempo/tempo.yml @@ -0,0 +1,25 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +storage: + trace: + backend: local + local: + path: /var/tempo/traces + +compactor: + compaction: + block_retention: 168h + +metrics_generator: + storage: + path: /var/tempo/generator/wal diff --git a/docs/FEATURES/observability.md b/docs/FEATURES/observability.md new file mode 100644 index 00000000..0af8e6fc --- /dev/null +++ b/docs/FEATURES/observability.md @@ -0,0 +1,80 @@ +# Observability + +## Status + +Draft + +## Goal + +Give the SaaS operator preproduction visibility into whether Socialize is healthy and whether real users are exercising core workflows. + +This feature is operator-facing. 
It is not a client-facing analytics suite or status page. + +## Initial Scope + +- structured backend logs suitable for centralized log search +- OpenTelemetry traces and metrics emitted by the API +- self-hosted Grafana observability stack for preproduction +- health, readiness, and liveness endpoints +- aggregate product usage counters for core workflow actions +- dashboards and alerts for app health and adoption signals + +## Operational Signals + +Health signals should cover: + +- API availability +- Postgres connectivity +- request rate, latency, and error rate +- slow endpoints +- outbound HTTP failures +- background service failures +- email delivery failures +- blob storage failures +- authentication failures + +Usage signals should cover aggregate counts for: + +- login attempts and successful logins +- organizations and workspaces created +- content items created +- comments created +- approval decisions submitted +- feedback reports submitted +- workspace invites created + +## Privacy And Safety Rules + +- Do not log request bodies, access tokens, refresh tokens, passwords, uploaded file contents, screenshots, or raw customer content. +- Usage metrics are aggregate operational signals, not behavioral tracking. +- User, organization, and workspace identifiers may be included as structured attributes when already available to backend code. +- The first implementation targets preproduction and self-hosted Docker infrastructure only. + +## Deployment Shape + +The application emits OpenTelemetry over OTLP to a local collector. + +The preproduction observability stack runs as an optional Docker Compose overlay with: + +- Grafana for dashboards and alerting +- Prometheus for metrics +- Loki for logs +- Tempo for traces +- Grafana Alloy for log collection and telemetry routing + +The normal application compose file must remain usable without the observability overlay. + +## Alerting + +Preproduction alerting should start with local Prometheus alert rules. 
Notification routing is a separate operational setup step because the first preproduction target may use email, chat, or a private incident channel. + +Initial alerts should cover: + +- app telemetry missing +- high API error rate +- high API p95 latency +- core usage unexpectedly quiet +- feedback bug reports submitted +- email delivery failures +- blob storage failures +- background job failures diff --git a/docs/TASKS/observability/001-observability-foundation.md b/docs/TASKS/observability/001-observability-foundation.md new file mode 100644 index 00000000..d6a53d3a --- /dev/null +++ b/docs/TASKS/observability/001-observability-foundation.md @@ -0,0 +1,44 @@ +# Observability 001: Preprod Foundation + +## Goal + +Add the first preproduction observability foundation for Socialize so the operator can tell whether the app is healthy and whether core workflows are being used. + +## Feature Spec + +- `docs/FEATURES/observability.md` + +## Scope + +- Add backend OpenTelemetry registration for traces and metrics. +- Add structured JSON console logging with request correlation context. +- Add aggregate custom counters for core usage events. +- Expand health endpoints with liveness and readiness checks. +- Add an optional Docker Compose observability overlay for Grafana, Prometheus, Loki, Tempo, and Alloy. +- Add basic Grafana datasource/dashboard provisioning. + +## Likely Files + +- `backend/src/Socialize.Api/Program.cs` +- `backend/src/Socialize.Api/ApplicationRegistration.cs` +- `backend/src/Socialize.Api/Infrastructure/Observability/*` +- selected backend handlers for usage counters +- `backend/src/Socialize.Api/Socialize.Api.csproj` +- `deploy/observability/*` +- `README.md` + +## Out Of Scope + +- Client-facing analytics or status page. +- Frontend behavioral analytics. +- Cloud telemetry providers. +- Long-term telemetry retention policy. +- Full product analytics warehouse. 
+ +## Validation + +```bash +dotnet build backend/Socialize.slnx +dotnet test backend/Socialize.slnx +docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config +``` diff --git a/docs/TASKS/observability/002-alerts-dashboard-hardening.md b/docs/TASKS/observability/002-alerts-dashboard-hardening.md new file mode 100644 index 00000000..f571733e --- /dev/null +++ b/docs/TASKS/observability/002-alerts-dashboard-hardening.md @@ -0,0 +1,32 @@ +# Observability 002: Alerts And Dashboard Hardening + +## Goal + +Make the preproduction observability stack actionable by adding alert rules, better operator dashboards, pinned image versions, and operational counters for services that commonly fail silently. + +## Feature Spec + +- `docs/FEATURES/observability.md` + +## Scope + +- Pin Grafana, Prometheus, Loki, Tempo, and Alloy image tags in the observability compose overlay. +- Add Prometheus alert rules for API health, error rate, latency, usage silence, feedback bugs, email failures, blob failures, and background job failures. +- Expand the Grafana dashboard with health, usage, operational failure, alert, log, and trace-oriented panels. +- Add backend counters for email delivery, blob storage operations, and background job runs. +- Document alerting and safe Grafana exposure expectations. + +## Out Of Scope + +- Notification delivery integration for alerts. +- Client-facing status page. +- Cloud observability backends. +- Full product analytics or session tracking. + +## Validation + +```bash +dotnet build backend/Socialize.slnx +dotnet test backend/Socialize.slnx +docker compose -f deploy/compose.yml -f deploy/observability/compose.observability.yml config +```