Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 40 additions & 8 deletions bin/core/src/monitor/alert/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ type OpenDiskAlertMap = OpenAlertMap<PathBuf>;

/// Alert buffer to prevent immediate alerts on transient issues
struct AlertBuffer {
buffer: Mutex<HashMap<(String, AlertDataVariant), bool>>,
buffer: Mutex<HashMap<(String, AlertDataVariant), i64>>,
}

impl AlertBuffer {
Expand All @@ -43,20 +43,42 @@ impl AlertBuffer {
}
}

/// Check if alert should be opened. Requires two consecutive calls to return true.
/// Check if alert should be opened.
/// If a time-window is configured, only return true if the alert has been
/// triggered for the duration of the window.
/// Otherwise require two consecutive calls to return true.
fn ready_to_open(
&self,
ts: i64,
server_id: String,
variant: AlertDataVariant,
window_secs: i64,
) -> bool {
let mut lock = self.buffer.lock().unwrap();
let ready = lock.entry((server_id, variant)).or_default();
if *ready {
*ready = false;
true
let buffer_val = lock.entry((server_id, variant)).or_default();

if window_secs <= 0 {
// No time window configured, use simple check for consecutive calls.
if *buffer_val == 1 {
*buffer_val = 0;
true
} else {
*buffer_val = 1;
false
}
} else {
*ready = true;
false
if *buffer_val == 0 {
// First trigger, set timestamp.
*buffer_val = ts;
return false;
}
// Check if time window has elapsed.
let window_ms = window_secs.saturating_mul(1_000);
if ts - *buffer_val >= window_ms {
true
} else {
false
}
}
}

Expand Down Expand Up @@ -117,8 +139,10 @@ pub async fn alert_servers(
// Only open unreachable alert if not in maintenance and buffer is ready
if !in_maintenance
&& buffer.ready_to_open(
ts,
server_status.id.clone(),
AlertDataVariant::ServerUnreachable,
0, // No time window for unreachable
)
{
let alert = Alert {
Expand Down Expand Up @@ -196,8 +220,10 @@ pub async fn alert_servers(
// Only open version mismatch alert if not in maintenance and buffer is ready
if !in_maintenance
&& buffer.ready_to_open(
ts,
server_status.id.clone(),
AlertDataVariant::ServerVersionMismatch,
0, // No time window for version mismatch
)
{
let alert = Alert {
Expand Down Expand Up @@ -266,8 +292,10 @@ pub async fn alert_servers(
// Only open CPU alert if not in maintenance and buffer is ready
if !in_maintenance
&& buffer.ready_to_open(
ts,
server_status.id.clone(),
AlertDataVariant::ServerCpu,
server.config.cpu_alert_window_seconds,
)
{
let alert = Alert {
Expand Down Expand Up @@ -345,8 +373,10 @@ pub async fn alert_servers(
// Only open memory alert if not in maintenance and buffer is ready
if !in_maintenance
&& buffer.ready_to_open(
ts,
server_status.id.clone(),
AlertDataVariant::ServerMem,
server.config.mem_alert_window_seconds,
)
{
let alert = Alert {
Expand Down Expand Up @@ -447,8 +477,10 @@ pub async fn alert_servers(
// Only open disk alert if not in maintenance and buffer is ready
if !in_maintenance
&& buffer.ready_to_open(
ts,
server_status.id.clone(),
AlertDataVariant::ServerDisk,
server.config.disk_alert_window_seconds,
)
{
let disk =
Expand Down
31 changes: 31 additions & 0 deletions client/core/rs/src/entities/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,30 @@ pub struct ServerConfig {
#[partial_default(default_disk_critical())]
pub disk_critical: f64,

/// Minimum duration in seconds that CPU usage must stay above the
/// configured thresholds before an alert is opened.
/// 0 means alerts are opened immediately when thresholds are crossed.
#[serde(default = "default_alert_window_seconds")]
#[builder(default = "default_alert_window_seconds()")]
#[partial_default(default_alert_window_seconds())]
pub cpu_alert_window_seconds: I64,

/// Minimum duration in seconds that memory usage must stay above the
/// configured thresholds before an alert is opened.
/// 0 means alerts are opened immediately when thresholds are crossed.
#[serde(default = "default_alert_window_seconds")]
#[builder(default = "default_alert_window_seconds()")]
#[partial_default(default_alert_window_seconds())]
pub mem_alert_window_seconds: I64,

/// Minimum duration in seconds that disk usage must stay above the
/// configured thresholds before an alert is opened.
/// 0 means alerts are opened immediately when thresholds are crossed.
#[serde(default = "default_alert_window_seconds")]
#[builder(default = "default_alert_window_seconds()")]
#[partial_default(default_alert_window_seconds())]
pub disk_alert_window_seconds: I64,

/// Scheduled maintenance windows during which alerts will be suppressed.
#[serde(default)]
#[builder(default)]
Expand Down Expand Up @@ -265,6 +289,10 @@ fn default_disk_critical() -> f64 {
95.0
}

/// Default value, in seconds, for the per-resource alert window settings.
///
/// Returns `0`, i.e. no time window is configured by default.
fn default_alert_window_seconds() -> i64 {
  const NO_WINDOW_SECS: i64 = 0;
  NO_WINDOW_SECS
}

impl Default for ServerConfig {
fn default() -> Self {
Self {
Expand All @@ -289,6 +317,9 @@ impl Default for ServerConfig {
mem_critical: default_mem_critical(),
disk_warning: default_disk_warning(),
disk_critical: default_disk_critical(),
cpu_alert_window_seconds: default_alert_window_seconds(),
mem_alert_window_seconds: default_alert_window_seconds(),
disk_alert_window_seconds: default_alert_window_seconds(),
maintenance_windows: Default::default(),
}
}
Expand Down
20 changes: 19 additions & 1 deletion client/core/ts/src/types.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Generated by typeshare 1.13.3
Generated by typeshare 1.13.4
*/

export interface MongoIdObj {
Expand Down Expand Up @@ -2065,6 +2065,24 @@ export interface ServerConfig {
disk_warning: number;
/** The percentage threshhold which triggers CRITICAL state for DISK. */
disk_critical: number;
/**
* Minimum duration in seconds that CPU usage must stay above the
* configured thresholds before an alert is opened.
* 0 means alerts are opened immediately when thresholds are crossed.
*/
cpu_alert_window_seconds: I64;
/**
* Minimum duration in seconds that memory usage must stay above the
* configured thresholds before an alert is opened.
* 0 means alerts are opened immediately when thresholds are crossed.
*/
mem_alert_window_seconds: I64;
/**
* Minimum duration in seconds that disk usage must stay above the
* configured thresholds before an alert is opened.
* 0 means alerts are opened immediately when thresholds are crossed.
*/
disk_alert_window_seconds: I64;
/** Scheduled maintenance windows during which alerts will be suppressed. */
maintenance_windows?: MaintenanceWindow[];
}
Expand Down
18 changes: 18 additions & 0 deletions frontend/public/client/types.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2197,6 +2197,24 @@ export interface ServerConfig {
disk_warning: number;
/** The percentage threshhold which triggers CRITICAL state for DISK. */
disk_critical: number;
/**
* Minimum duration in seconds that CPU usage must stay above the
* configured thresholds before an alert is opened.
* 0 means alerts are opened immediately when thresholds are crossed.
*/
cpu_alert_window_seconds: I64;
/**
* Minimum duration in seconds that memory usage must stay above the
* configured thresholds before an alert is opened.
* 0 means alerts are opened immediately when thresholds are crossed.
*/
mem_alert_window_seconds: I64;
/**
* Minimum duration in seconds that disk usage must stay above the
* configured thresholds before an alert is opened.
* 0 means alerts are opened immediately when thresholds are crossed.
*/
disk_alert_window_seconds: I64;
/** Scheduled maintenance windows during which alerts will be suppressed. */
maintenance_windows?: MaintenanceWindow[];
}
Expand Down
2 changes: 1 addition & 1 deletion frontend/public/client/types.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Generated by typeshare 1.13.3
Generated by typeshare 1.13.4
*/
/** The levels of permission that a User or UserGroup can have on a resource. */
export var PermissionLevel;
Expand Down
15 changes: 15 additions & 0 deletions frontend/src/components/resources/server/config.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,11 @@ export const ServerConfig = ({
description:
"Send an alert if the CPU usage is above the configured thresholds.",
},
cpu_alert_window_seconds: {
label: "CPU Alert Window (seconds)",
description:
"Only trigger CPU alerts if CPU usage stays above the configured thresholds for at least this many seconds. 0 keeps the existing instantaneous behavior.",
},
cpu_warning: {
description:
"Send a 'Warning' alert if the CPU usage in % is above these thresholds",
Expand All @@ -185,6 +190,11 @@ export const ServerConfig = ({
description:
"Send an alert if the memory usage is above the configured thresholds.",
},
mem_alert_window_seconds: {
label: "Memory Alert Window (seconds)",
description:
"Only trigger memory alerts if usage stays above the configured thresholds for at least this many seconds. 0 keeps the existing instantaneous behavior.",
},
mem_warning: {
label: "Memory Warning",
description:
Expand All @@ -206,6 +216,11 @@ export const ServerConfig = ({
description:
"Send an alert if the Disk Usage (for any mounted disk) is above the configured thresholds.",
},
disk_alert_window_seconds: {
label: "Disk Alert Window (seconds)",
description:
"Only trigger disk alerts if usage stays above the configured thresholds for at least this many seconds. 0 keeps the existing instantaneous behavior.",
},
disk_warning: {
description:
"Send a 'Warning' alert if the disk usage in % is above these thresholds",
Expand Down