-
Notifications
You must be signed in to change notification settings - Fork 0
VPR-141 feat(healthchecks): add /health endpoints and UI dashboard #159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
77f2c9f
9c92454
e2e1665
23fcaa1
a109c2d
a83c9f1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| namespace Viper.Classes | ||
| { | ||
| /// <summary> | ||
| /// Cloudflare's published IPv4/IPv6 networks, used to mark CF as a known | ||
| /// proxy in ForwardedHeadersOptions. Fetched from cloudflare.com at startup | ||
| /// so we automatically pick up rotations; falls back to a hardcoded snapshot | ||
| /// when the fetch fails (CF outage during deploy, sandboxed network, etc). | ||
| /// </summary> | ||
| public static class CloudflareNetworks | ||
| { | ||
| // Snapshot of https://www.cloudflare.com/ips/ (IPv4 CIDRs first, then IPv6). | ||
| // Returned only when the runtime fetch fails. Refresh it if logs show the | ||
| // fallback being used and current Cloudflare IPs are missing from this list. | ||
| private static readonly string[] HardcodedFallback = | ||
| [ | ||
| "173.245.48.0/20", | ||
| "103.21.244.0/22", | ||
| "103.22.200.0/22", | ||
| "103.31.4.0/22", | ||
| "141.101.64.0/18", | ||
| "108.162.192.0/18", | ||
| "190.93.240.0/20", | ||
| "188.114.96.0/20", | ||
| "197.234.240.0/22", | ||
| "198.41.128.0/17", | ||
| "162.158.0.0/15", | ||
| "104.16.0.0/13", | ||
| "104.24.0.0/14", | ||
| "172.64.0.0/13", | ||
| "131.0.72.0/22", | ||
| "2400:cb00::/32", | ||
| "2606:4700::/32", | ||
| "2803:f800::/32", | ||
| "2405:b500::/32", | ||
| "2405:8100::/32", | ||
| "2a06:98c0::/29", | ||
| "2c0f:f248::/32", | ||
| ]; | ||
|
|
||
| /// <summary> | ||
| /// Downloads Cloudflare's published IPv4 and IPv6 ranges and returns them | ||
| /// as CIDR strings. Returns <see cref="HardcodedFallback"/> instead when | ||
| /// the download fails, times out, or yields anything other than a | ||
| /// non-empty list of well-formed CIDR blocks. | ||
| /// </summary> | ||
| /// <param name="logger">Receives an Info entry on success and a Warn entry whenever the fallback is returned.</param> | ||
| /// <returns>The fetched CIDR strings, or the hardcoded snapshot.</returns> | ||
| public static IReadOnlyList<string> FetchOrFallback(NLog.Logger logger) | ||
| { | ||
| try | ||
| { | ||
| using var http = new HttpClient { Timeout = TimeSpan.FromSeconds(5) }; | ||
| // Blocks on the async calls so the method keeps its synchronous signature. | ||
| var v4 = http.GetStringAsync("https://www.cloudflare.com/ips-v4/").GetAwaiter().GetResult(); | ||
| var v6 = http.GetStringAsync("https://www.cloudflare.com/ips-v6/").GetAwaiter().GetResult(); | ||
| var cidrs = (v4 + "\n" + v6) | ||
| .Split('\n', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); | ||
|
|
||
| // A successful response is not proof of a usable list: an empty body or | ||
| // an HTML error/challenge page must not be handed on as proxy networks. | ||
| if (cidrs.Length == 0 || !Array.TrueForAll(cidrs, IsCidr)) | ||
| { | ||
| logger.Warn("Cloudflare IP range response was empty or malformed; using hardcoded fallback ({Count} entries)", HardcodedFallback.Length); | ||
| return HardcodedFallback; | ||
| } | ||
|
|
||
| logger.Info("Fetched {Count} Cloudflare networks from cloudflare.com", cidrs.Length); | ||
| return cidrs; | ||
| } | ||
| catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException) | ||
| { | ||
| logger.Warn(ex, "Failed to fetch Cloudflare IP ranges; using hardcoded fallback ({Count} entries)", HardcodedFallback.Length); | ||
| return HardcodedFallback; | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Returns true when <paramref name="candidate"/> is an IP address, a slash, | ||
| /// and a prefix length that is valid for that address family. | ||
| /// </summary> | ||
| private static bool IsCidr(string candidate) | ||
| { | ||
| var slash = candidate.IndexOf('/'); | ||
| if (slash <= 0 || !System.Net.IPAddress.TryParse(candidate[..slash], out var address)) | ||
| { | ||
| return false; | ||
| } | ||
|
|
||
| var maxPrefix = address.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork ? 32 : 128; | ||
| // NumberStyles.None accepts digits only, so signs and whitespace are rejected. | ||
| return int.TryParse( | ||
| candidate[(slash + 1)..], | ||
| System.Globalization.NumberStyles.None, | ||
| System.Globalization.CultureInfo.InvariantCulture, | ||
| out var prefix) | ||
| && prefix <= maxPrefix; | ||
| } | ||
|
Comment on lines
+40
to
+57
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧹 Nitpick | 🔵 Trivial | 💤 Low value
LGTM. The synchronous blocking via `.GetAwaiter().GetResult()` is acceptable here. One edge case: an exception that is neither `HttpRequestException` nor `TaskCanceledException` (for example a raw `SocketException`) is not matched by the filter, so the fallback would be skipped.
♻️ Broader exception handling
- catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException)
+ catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException or System.Net.Sockets.SocketException)
Or use a general catch that still logs and falls back gracefully:
- catch (Exception ex) when (ex is HttpRequestException or TaskCanceledException)
+ catch (Exception ex)
🤖 Prompt for AI Agents |
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,81 @@ | ||
| using Microsoft.Extensions.Diagnostics.HealthChecks; | ||
|
|
||
| namespace Viper.Classes.HealthChecks | ||
| { | ||
| /// <summary> | ||
| /// Wraps another health check and reuses its most recent result for a | ||
| /// status-dependent period: a Healthy result is cached for the healthy | ||
| /// duration, any other status for the unhealthy duration, so recovery is | ||
| /// noticed on the tighter cycle. A cached result is returned with the | ||
| /// local time of the original probe appended to its description so | ||
| /// operators can tell how stale the reading is. | ||
| /// </summary> | ||
| public class AdaptivePollingHealthCheck : IHealthCheck | ||
| { | ||
| private readonly IHealthCheck _inner; | ||
| private readonly TimeSpan _healthyCacheDuration; | ||
| private readonly TimeSpan _unhealthyCacheDuration; | ||
| // Serializes callers so only one of them runs the inner probe at a time. | ||
| private readonly SemaphoreSlim _semaphore = new(1, 1); | ||
| // Local wall-clock time of the last probe; used only for the displayed stamp. | ||
| private DateTime _lastCheckTime; | ||
| // Environment.TickCount64 at the last probe; used to measure cache age so | ||
| // DST shifts and clock corrections cannot stretch or shrink a cache window. | ||
| private long _lastCheckTicks; | ||
| private HealthCheckResult? _lastResult; | ||
|
|
||
| /// <summary>Creates a caching wrapper around <paramref name="inner"/>.</summary> | ||
| /// <param name="inner">The health check that performs the real probe.</param> | ||
| /// <param name="healthyCacheDuration">How long a Healthy result is reused.</param> | ||
| /// <param name="unhealthyCacheDuration">How long any non-Healthy result is reused.</param> | ||
| /// <exception cref="ArgumentNullException"><paramref name="inner"/> is null.</exception> | ||
| public AdaptivePollingHealthCheck( | ||
| IHealthCheck inner, | ||
| TimeSpan healthyCacheDuration, | ||
| TimeSpan unhealthyCacheDuration) | ||
| { | ||
| // Fail at registration time instead of on the first health request. | ||
| ArgumentNullException.ThrowIfNull(inner); | ||
| _inner = inner; | ||
| _healthyCacheDuration = healthyCacheDuration; | ||
| _unhealthyCacheDuration = unhealthyCacheDuration; | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Returns the cached result while it is still within its cache window; | ||
| /// otherwise runs the inner check, stores its result, and returns it | ||
| /// unmodified. If the inner check throws, nothing is cached. | ||
| /// </summary> | ||
| /// <param name="context">Passed through to the inner check.</param> | ||
| /// <param name="cancellationToken">Cancels the wait for the lock and the inner check.</param> | ||
| public async Task<HealthCheckResult> CheckHealthAsync( | ||
| HealthCheckContext context, | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| await _semaphore.WaitAsync(cancellationToken); | ||
| try | ||
| { | ||
| if (_lastResult is { } cached) | ||
| { | ||
| var elapsed = TimeSpan.FromMilliseconds(Environment.TickCount64 - _lastCheckTicks); | ||
| var cacheDuration = cached.Status == HealthStatus.Healthy | ||
| ? _healthyCacheDuration | ||
| : _unhealthyCacheDuration; | ||
|
|
||
| if (elapsed < cacheDuration) | ||
| { | ||
| return AppendTimestamp(cached, _lastCheckTime); | ||
| } | ||
| } | ||
|
|
||
| var result = await _inner.CheckHealthAsync(context, cancellationToken); | ||
| _lastResult = result; | ||
| _lastCheckTime = DateTime.Now; | ||
| _lastCheckTicks = Environment.TickCount64; | ||
| return result; | ||
| } | ||
| finally | ||
| { | ||
| _semaphore.Release(); | ||
| } | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Copies <paramref name="result"/> with a "Last checked" line added to | ||
| /// its description (or used as the description when there is none). | ||
| /// </summary> | ||
| private static HealthCheckResult AppendTimestamp(HealthCheckResult result, DateTime lastCheckedAt) | ||
| { | ||
| var stamp = $"Last checked: {lastCheckedAt:MMM d, h:mm tt}"; | ||
| var description = string.IsNullOrWhiteSpace(result.Description) | ||
| ? stamp | ||
| : $"{result.Description}\n{stamp}"; | ||
| return new HealthCheckResult( | ||
| result.Status, | ||
| description, | ||
| result.Exception, | ||
| result.Data); | ||
| } | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| using Amazon; | ||
| using Amazon.Runtime; | ||
| using Amazon.SimpleSystemsManagement; | ||
| using Amazon.SimpleSystemsManagement.Model; | ||
| using Microsoft.Extensions.Diagnostics.HealthChecks; | ||
|
|
||
| namespace Viper.Classes.HealthChecks | ||
| { | ||
| /// <summary> | ||
| /// Verifies AWS SSM Parameter Store is reachable with the app's credentials. | ||
| /// Uses a lightweight DescribeParameters probe (MaxResults=1) so the check | ||
| /// does not actually fetch any parameter values. | ||
| /// </summary> | ||
| public class AwsSsmHealthCheck : IHealthCheck | ||
| { | ||
| private readonly RegionEndpoint _region; | ||
| private readonly bool _healthyWhenMissing; | ||
|
|
||
| /// <summary>Creates the SSM reachability check.</summary> | ||
| /// <param name="region"> | ||
| /// AWS region to probe; <c>RegionEndpoint.USWest1</c> is used when null. | ||
| /// </param> | ||
| /// <param name="healthyWhenMissing"> | ||
| /// If true, an <c>AmazonServiceException</c> or <c>AmazonClientException</c> | ||
| /// from the probe is reported as Healthy with a "skipped" description | ||
| /// instead of Unhealthy. Use for Development where local machines may not | ||
| /// have AWS credentials configured. | ||
| /// </param> | ||
| public AwsSsmHealthCheck(RegionEndpoint? region = null, bool healthyWhenMissing = false) | ||
| { | ||
| _healthyWhenMissing = healthyWhenMissing; | ||
| _region = region is null ? RegionEndpoint.USWest1 : region; | ||
| } | ||
|
|
||
| public async Task<HealthCheckResult> CheckHealthAsync( | ||
| HealthCheckContext context, | ||
| CancellationToken cancellationToken = default) | ||
| { | ||
| try | ||
| { | ||
| using var client = new AmazonSimpleSystemsManagementClient(_region); | ||
| await client.DescribeParametersAsync( | ||
| new DescribeParametersRequest { MaxResults = 1 }, | ||
| cancellationToken); | ||
| return HealthCheckResult.Healthy("AWS SSM reachable."); | ||
| } | ||
| catch (AmazonServiceException ex) | ||
| { | ||
| return _healthyWhenMissing | ||
| ? HealthCheckResult.Healthy("AWS SSM not configured (skipped).") | ||
| : HealthCheckResult.Unhealthy($"AWS SSM unreachable: {ex.ErrorCode}."); | ||
|
Comment on lines
+36
to
+46
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🌐 Web query:
💡 Result: Amazon.Extensions.Configuration.SystemsManager primarily uses the AWS SSM API operation GetParametersByPath (with Recursive=true and WithDecryption=true) to load configuration parameters from Parameter Store when a path prefix is specified (the typical usage via AddSystemsManager(path)). It may also use GetParameters (batched) if specific ParameterNames are provided, or GetParameter for Secrets Manager-referenced paths. The corresponding IAM permission is ssm:GetParametersByPath on the resource ARN matching the path prefix (e.g., "arn:aws:ssm:*:*:parameter/myapp/*"). This differs from ssm:DescribeParameters (used by the DescribeParameters API), which only lists metadata about parameters (names, types, etc.) but does not return parameter values. ssm:DescribeParameters requires no resource-level restrictions (often "Resource": "*") and cannot access SecureString values or decrypt them. In contrast, ssm:GetParametersByPath directly retrieves and decrypts values but requires permission on the specific parameter resources matching the path, providing finer-grained control over value access. Citations:
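Use the same SSM operation the configuration loader uses. The health check calls `DescribeParameters`, which depends on the `ssm:DescribeParameters` permission, whereas the configuration loader reads values with `GetParametersByPath`; the check can therefore pass or fail independently of the access the app actually needs. Probe an actual parameter from the configured path instead—either query a known key with `GetParameter` or list the configured prefix with `GetParametersByPath`. 🤖 Prompt for AI Agents |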
||
| } | ||
| catch (AmazonClientException) | ||
| { | ||
| return _healthyWhenMissing | ||
| ? HealthCheckResult.Healthy("AWS SSM not configured (skipped).") | ||
| : HealthCheckResult.Unhealthy("AWS SSM client error (credentials or network)."); | ||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧹 Nitpick | 🔵 Trivial | 💤 Low value
Consider extracting duplicated health-check logic.
The test and prod PowerShell blocks are identical except for the URL. A shared function would reduce maintenance burden.
♻️ Example consolidation
Then call `verifyHealth("https://secure-test.vetmed.ucdavis.edu/2/health")` in each stage.
🤖 Prompt for AI Agents