From 85bcc752b40af108f4ae7c0f119d846b2b2065db Mon Sep 17 00:00:00 2001 From: Christoph Haas Date: Sun, 25 Jan 2026 21:33:18 +0100 Subject: [PATCH] feat: time-window threshold for alerts --- bin/core/src/monitor/alert/server.rs | 48 +++++++++++++++---- client/core/rs/src/entities/server.rs | 31 ++++++++++++ client/core/ts/src/types.ts | 20 +++++++- frontend/public/client/types.d.ts | 18 +++++++ frontend/public/client/types.js | 2 +- .../components/resources/server/config.tsx | 15 ++++++ 6 files changed, 124 insertions(+), 10 deletions(-) diff --git a/bin/core/src/monitor/alert/server.rs b/bin/core/src/monitor/alert/server.rs index 3717682b1..c71697bd4 100644 --- a/bin/core/src/monitor/alert/server.rs +++ b/bin/core/src/monitor/alert/server.rs @@ -33,7 +33,7 @@ type OpenDiskAlertMap = OpenAlertMap; /// Alert buffer to prevent immediate alerts on transient issues struct AlertBuffer { - buffer: Mutex>, + buffer: Mutex>, } impl AlertBuffer { @@ -43,20 +43,42 @@ impl AlertBuffer { } } - /// Check if alert should be opened. Requires two consecutive calls to return true. + /// Check if alert should be opened. + /// If a time-window is configured, only return true if the alert has been + /// triggered for the duration of the window. + /// Otherwise require two consecutive calls to return true. fn ready_to_open( &self, + ts: i64, server_id: String, variant: AlertDataVariant, + window_secs: i64, ) -> bool { let mut lock = self.buffer.lock().unwrap(); - let ready = lock.entry((server_id, variant)).or_default(); - if *ready { - *ready = false; - true + let buffer_val = lock.entry((server_id, variant)).or_default(); + + if window_secs <= 0 { + // No time window configured, use simple check for consecutive calls. + if *buffer_val == 1 { + *buffer_val = 0; + true + } else { + *buffer_val = 1; + false + } } else { - *ready = true; - false + if *buffer_val == 0 { + // First trigger, set timestamp. + *buffer_val = ts; + return false; + } + // Check if time window has elapsed. 
+ let window_ms = window_secs.saturating_mul(1_000); + if ts - *buffer_val >= window_ms { + true + } else { + false + } } } @@ -117,8 +139,10 @@ pub async fn alert_servers( // Only open unreachable alert if not in maintenance and buffer is ready if !in_maintenance && buffer.ready_to_open( + ts, server_status.id.clone(), AlertDataVariant::ServerUnreachable, + 0, // No time window for unreachable ) { let alert = Alert { @@ -196,8 +220,10 @@ pub async fn alert_servers( // Only open version mismatch alert if not in maintenance and buffer is ready if !in_maintenance && buffer.ready_to_open( + ts, server_status.id.clone(), AlertDataVariant::ServerVersionMismatch, + 0, // No time window for version mismatch ) { let alert = Alert { @@ -266,8 +292,10 @@ pub async fn alert_servers( // Only open CPU alert if not in maintenance and buffer is ready if !in_maintenance && buffer.ready_to_open( + ts, server_status.id.clone(), AlertDataVariant::ServerCpu, + server.config.cpu_alert_window_seconds, ) { let alert = Alert { @@ -345,8 +373,10 @@ pub async fn alert_servers( // Only open memory alert if not in maintenance and buffer is ready if !in_maintenance && buffer.ready_to_open( + ts, server_status.id.clone(), AlertDataVariant::ServerMem, + server.config.mem_alert_window_seconds, ) { let alert = Alert { @@ -447,8 +477,10 @@ pub async fn alert_servers( // Only open disk alert if not in maintenance and buffer is ready if !in_maintenance && buffer.ready_to_open( + ts, server_status.id.clone(), AlertDataVariant::ServerDisk, + server.config.disk_alert_window_seconds, ) { let disk = diff --git a/client/core/rs/src/entities/server.rs b/client/core/rs/src/entities/server.rs index c6c50a032..fc6164aea 100644 --- a/client/core/rs/src/entities/server.rs +++ b/client/core/rs/src/entities/server.rs @@ -205,6 +205,30 @@ pub struct ServerConfig { #[partial_default(default_disk_critical())] pub disk_critical: f64, + /// Minimum duration in seconds that CPU usage must stay above the + /// configured 
thresholds before an alert is opened. + /// 0 disables the window: two consecutive checks open the alert. + #[serde(default = "default_alert_window_seconds")] + #[builder(default = "default_alert_window_seconds()")] + #[partial_default(default_alert_window_seconds())] + pub cpu_alert_window_seconds: I64, + + /// Minimum duration in seconds that memory usage must stay above the + /// configured thresholds before an alert is opened. + /// 0 disables the window: two consecutive checks open the alert. + #[serde(default = "default_alert_window_seconds")] + #[builder(default = "default_alert_window_seconds()")] + #[partial_default(default_alert_window_seconds())] + pub mem_alert_window_seconds: I64, + + /// Minimum duration in seconds that disk usage must stay above the + /// configured thresholds before an alert is opened. + /// 0 disables the window: two consecutive checks open the alert. + #[serde(default = "default_alert_window_seconds")] + #[builder(default = "default_alert_window_seconds()")] + #[partial_default(default_alert_window_seconds())] + pub disk_alert_window_seconds: I64, + /// Scheduled maintenance windows during which alerts will be suppressed. 
#[serde(default)] #[builder(default)] @@ -265,6 +289,10 @@ fn default_disk_critical() -> f64 { 95.0 } +fn default_alert_window_seconds() -> i64 { + 0 +} + impl Default for ServerConfig { fn default() -> Self { Self { @@ -289,6 +317,9 @@ impl Default for ServerConfig { mem_critical: default_mem_critical(), disk_warning: default_disk_warning(), disk_critical: default_disk_critical(), + cpu_alert_window_seconds: default_alert_window_seconds(), + mem_alert_window_seconds: default_alert_window_seconds(), + disk_alert_window_seconds: default_alert_window_seconds(), maintenance_windows: Default::default(), } } diff --git a/client/core/ts/src/types.ts b/client/core/ts/src/types.ts index 248fbed4c..aec182f92 100644 --- a/client/core/ts/src/types.ts +++ b/client/core/ts/src/types.ts @@ -1,5 +1,5 @@ /* - Generated by typeshare 1.13.3 + Generated by typeshare 1.13.4 */ export interface MongoIdObj { @@ -2065,6 +2065,24 @@ export interface ServerConfig { disk_warning: number; /** The percentage threshhold which triggers CRITICAL state for DISK. */ disk_critical: number; + /** + * Minimum duration in seconds that CPU usage must stay above the + * configured thresholds before an alert is opened. + * 0 means alerts are opened immediately when thresholds are crossed. + */ + cpu_alert_window_seconds: I64; + /** + * Minimum duration in seconds that memory usage must stay above the + * configured thresholds before an alert is opened. + * 0 means alerts are opened immediately when thresholds are crossed. + */ + mem_alert_window_seconds: I64; + /** + * Minimum duration in seconds that disk usage must stay above the + * configured thresholds before an alert is opened. + * 0 means alerts are opened immediately when thresholds are crossed. + */ + disk_alert_window_seconds: I64; /** Scheduled maintenance windows during which alerts will be suppressed. 
*/ maintenance_windows?: MaintenanceWindow[]; } diff --git a/frontend/public/client/types.d.ts b/frontend/public/client/types.d.ts index b4c3a3b66..24d5214ee 100644 --- a/frontend/public/client/types.d.ts +++ b/frontend/public/client/types.d.ts @@ -2197,6 +2197,24 @@ export interface ServerConfig { disk_warning: number; /** The percentage threshhold which triggers CRITICAL state for DISK. */ disk_critical: number; + /** + * Minimum duration in seconds that CPU usage must stay above the + * configured thresholds before an alert is opened. + * 0 means alerts are opened immediately when thresholds are crossed. + */ + cpu_alert_window_seconds: I64; + /** + * Minimum duration in seconds that memory usage must stay above the + * configured thresholds before an alert is opened. + * 0 means alerts are opened immediately when thresholds are crossed. + */ + mem_alert_window_seconds: I64; + /** + * Minimum duration in seconds that disk usage must stay above the + * configured thresholds before an alert is opened. + * 0 means alerts are opened immediately when thresholds are crossed. + */ + disk_alert_window_seconds: I64; /** Scheduled maintenance windows during which alerts will be suppressed. */ maintenance_windows?: MaintenanceWindow[]; } diff --git a/frontend/public/client/types.js b/frontend/public/client/types.js index aaf7f1622..46e3dfe0d 100644 --- a/frontend/public/client/types.js +++ b/frontend/public/client/types.js @@ -1,5 +1,5 @@ /* - Generated by typeshare 1.13.3 + Generated by typeshare 1.13.4 */ /** The levels of permission that a User or UserGroup can have on a resource. 
*/ export var PermissionLevel; diff --git a/frontend/src/components/resources/server/config.tsx b/frontend/src/components/resources/server/config.tsx index 2b53d7c5d..f63440827 100644 --- a/frontend/src/components/resources/server/config.tsx +++ b/frontend/src/components/resources/server/config.tsx @@ -165,6 +165,11 @@ export const ServerConfig = ({ description: "Send an alert if the CPU usage is above the configured thresholds.", }, + cpu_alert_window_seconds: { + label: "CPU Alert Window (seconds)", + description: + "Only trigger CPU alerts if CPU usage stays above the configured thresholds for at least this many seconds. 0 keeps the existing instantaneous behavior.", + }, cpu_warning: { description: "Send a 'Warning' alert if the CPU usage in % is above these thresholds", @@ -185,6 +190,11 @@ export const ServerConfig = ({ description: "Send an alert if the memory usage is above the configured thresholds.", }, + mem_alert_window_seconds: { + label: "Memory Alert Window (seconds)", + description: + "Only trigger memory alerts if usage stays above the configured thresholds for at least this many seconds. 0 keeps the existing instantaneous behavior.", + }, mem_warning: { label: "Memory Warning", description: @@ -206,6 +216,11 @@ export const ServerConfig = ({ description: "Send an alert if the Disk Usage (for any mounted disk) is above the configured thresholds.", }, + disk_alert_window_seconds: { + label: "Disk Alert Window (seconds)", + description: + "Only trigger disk alerts if usage stays above the configured thresholds for at least this many seconds. 0 keeps the existing instantaneous behavior.", + }, disk_warning: { description: "Send a 'Warning' alert if the disk usage in % is above these thresholds",