Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
8dde0fa
feat(l1): add snap sync observability — metrics, RPC endpoints, monit…
ElFantasma Apr 9, 2026
e363561
fix(l1): add diagnostics for header download phase in snap sync
ElFantasma Apr 9, 2026
b6681a5
feat(l1): dynamic log level on degradation — TRACE for peer comms whe…
ElFantasma Apr 10, 2026
bc7b049
fix(l1): compute live sync diagnostics on query instead of stale snap…
ElFantasma Apr 10, 2026
f20c95f
feat(l1): render arrays of objects as tables in REPL formatter
ElFantasma Apr 10, 2026
4eda5a8
fix(l1): improve monitor failure capture — force dump on failure, det…
ElFantasma Apr 10, 2026
5066523
refactor(l1): rename _check_degradation to _check_alert_conditions
ElFantasma Apr 10, 2026
3ca7e07
fix(l1): create run directory early so snapshot dumps work during deg…
ElFantasma Apr 11, 2026
3bbfdc8
feat(l1): peer_top responsive layout — use full terminal width
ElFantasma Apr 13, 2026
3269543
fix(l1): address review feedback — clippy, metrics pattern, monitor f…
ElFantasma Apr 14, 2026
68c9d9b
chore(l1): remove redundant peer_top.sh — peer_top.py is the canonica…
ElFantasma Apr 14, 2026
14b22c5
refactor(l1): make watched phases configurable in monitor
ElFantasma Apr 14, 2026
e217075
fix(l1): default watched phases to empty — opt-in, not opt-out
ElFantasma Apr 14, 2026
e04782a
Merge branch 'main' into feat/snap-sync-observability-pr
ElFantasma Apr 14, 2026
fc87aa6
feat(l1): add MULTISYNC_WATCHED_PHASES to Makefile for opt-in TRACE
ElFantasma Apr 14, 2026
bc3764e
Merge branch 'main' into feat/snap-sync-observability-pr
ElFantasma Apr 14, 2026
572fa9c
fix(l1): remove internal jargon from code comment
ElFantasma Apr 14, 2026
11b38ff
feat(l1): add snap sync progress metrics and Grafana dashboard
ElFantasma Apr 14, 2026
36e9d3d
feat(l1): enable metrics in multisync containers, expose mainnet port…
ElFantasma Apr 14, 2026
ef39564
fix(l1): fix global declaration order in docker_monitor.py
ElFantasma Apr 14, 2026
9fef0cb
fix(l1): expose pivot_timestamp gauge — Grafana computes age as time(…
ElFantasma Apr 14, 2026
8551e29
feat(l1): add per-phase elapsed + ETA panels to Grafana dashboard
ElFantasma Apr 14, 2026
05a659f
fix(l1): push peer health metrics (eligible, snap, inflight) during s…
ElFantasma Apr 14, 2026
8ff1321
refactor(l1): remove duplicate peer health push from after_sync — dur…
ElFantasma Apr 14, 2026
f7f8670
refactor(l1): remove redundant prometheus pushes from snap_sync — pus…
ElFantasma Apr 14, 2026
0b860ad
feat(l1): expose metrics ports for all three multisync chains (3701/3…
ElFantasma Apr 14, 2026
0507f59
fix(l1): deduplicate dashboard panels, consistent 4-panel rows
ElFantasma Apr 14, 2026
964be80
fix(l1): consistent dashboard layout — Progress+Counts, Rate, ETA, El…
ElFantasma Apr 14, 2026
eb1155b
Merge branch 'main' into feat/snap-sync-observability-pr
ElFantasma Apr 14, 2026
4d29538
feat(l1): add composite Sync Rates Overview chart to dashboard top se…
ElFantasma Apr 14, 2026
54db019
fix(l1): move composite chart legend to bottom for time axis alignment
ElFantasma Apr 14, 2026
866ea60
fix(l1): distinguish watched-phase tracing from real degradation in m…
ElFantasma Apr 15, 2026
905e368
fix(l1): log reason changes even when already in alert state (monitor…
ElFantasma Apr 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions crates/blockchain/metrics/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ pub(crate) async fn get_metrics() -> String {
Err(_) => tracing::error!("Failed to gather METRICS_P2P"),
};

// METRICS_SYNC registers into the default Prometheus registry at init,
// so its metrics are already included in gather_default_metrics() above.

ret_string.push('\n');
if let Some(node_metrics) = METRICS_NODE.get() {
match node_metrics.gather_metrics() {
Expand Down
2 changes: 2 additions & 0 deletions crates/blockchain/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ pub mod process;
pub mod profiling;
#[cfg(feature = "api")]
pub mod rpc;
#[cfg(any(feature = "api", feature = "metrics"))]
pub mod sync;
#[cfg(any(feature = "api", feature = "transactions"))]
pub mod transactions;

Expand Down
205 changes: 205 additions & 0 deletions crates/blockchain/metrics/sync.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
use prometheus::{
IntCounterVec, IntGauge, IntGaugeVec, register_int_counter_vec, register_int_gauge,
register_int_gauge_vec,
};
use std::sync::LazyLock;

// Metrics defined in this module register into the Prometheus default registry.
// The metrics API exposes them via `gather_default_metrics()`.

pub static METRICS_SYNC: LazyLock<MetricsSync> = LazyLock::new(MetricsSync::default);

#[derive(Debug, Clone)]
pub struct MetricsSync {
// --- Current state (gauges) ---
pub stage: IntGauge,
pub pivot_block: IntGauge,
pub eligible_peers: IntGauge,
pub snap_peers: IntGauge,
pub inflight_requests: IntGauge,
pub pivot_age_seconds: IntGauge,
pub pivot_timestamp: IntGauge,
pub phase_start_timestamp: IntGaugeVec,

// --- Progress counters (gauges set from METRICS atomics) ---
// Use rate() in Grafana to derive throughput.
pub headers_downloaded: IntGauge,
pub headers_total: IntGauge,
pub accounts_downloaded: IntGauge,
pub accounts_inserted: IntGauge,
pub storage_downloaded: IntGauge,
pub storage_inserted: IntGauge,
pub state_leaves_healed: IntGauge,
pub storage_leaves_healed: IntGauge,
pub bytecodes_downloaded: IntGauge,
pub bytecodes_total: IntGauge,

// --- Outcome counters (counter vecs) ---
pub pivot_updates: IntCounterVec,
pub storage_requests: IntCounterVec,
pub header_resolution: IntCounterVec,
}

impl Default for MetricsSync {
fn default() -> Self {
Self::new()
}
}

impl MetricsSync {
pub fn new() -> Self {
MetricsSync {
// Current state
stage: register_int_gauge!(
"ethrex_sync_stage",
"Current snap sync stage (0=idle, 1=headers, 2=account_ranges, 3=account_insertion, 4=storage_ranges, 5=storage_insertion, 6=state_healing, 7=storage_healing, 8=bytecodes)"
)
.expect("Failed to create ethrex_sync_stage"),
pivot_block: register_int_gauge!(
"ethrex_sync_pivot_block",
"Current pivot block number"
)
.expect("Failed to create ethrex_sync_pivot_block"),
eligible_peers: register_int_gauge!(
"ethrex_sync_eligible_peers",
"Number of peers eligible for requests"
)
.expect("Failed to create ethrex_sync_eligible_peers"),
snap_peers: register_int_gauge!(
"ethrex_sync_snap_peers",
"Number of connected peers supporting the snap protocol"
)
.expect("Failed to create ethrex_sync_snap_peers"),
inflight_requests: register_int_gauge!(
"ethrex_sync_inflight_requests",
"Total inflight requests across all peers"
)
.expect("Failed to create ethrex_sync_inflight_requests"),
pivot_age_seconds: register_int_gauge!(
"ethrex_sync_pivot_age_seconds",
"Age of the current pivot block in seconds"
)
.expect("Failed to create ethrex_sync_pivot_age_seconds"),
pivot_timestamp: register_int_gauge!(
"ethrex_sync_pivot_timestamp",
"Unix timestamp of the current pivot block (use time() - this for age in Grafana)"
)
.expect("Failed to create ethrex_sync_pivot_timestamp"),

phase_start_timestamp: register_int_gauge_vec!(
"ethrex_sync_phase_start_timestamp",
"Unix timestamp when each phase began (use time() - this for elapsed in Grafana)",
&["phase"]
)
.expect("Failed to create ethrex_sync_phase_start_timestamp"),

// Progress (set periodically from METRICS atomics)
headers_downloaded: register_int_gauge!(
"ethrex_sync_headers_downloaded",
"Headers downloaded so far"
)
.expect("Failed to create ethrex_sync_headers_downloaded"),
headers_total: register_int_gauge!(
"ethrex_sync_headers_total",
"Total headers to download (pivot block number)"
)
.expect("Failed to create ethrex_sync_headers_total"),
accounts_downloaded: register_int_gauge!(
"ethrex_sync_accounts_downloaded",
"Account ranges downloaded from peers"
)
.expect("Failed to create ethrex_sync_accounts_downloaded"),
accounts_inserted: register_int_gauge!(
"ethrex_sync_accounts_inserted",
"Accounts inserted into storage"
)
.expect("Failed to create ethrex_sync_accounts_inserted"),
storage_downloaded: register_int_gauge!(
"ethrex_sync_storage_downloaded",
"Storage leaves downloaded from peers"
)
.expect("Failed to create ethrex_sync_storage_downloaded"),
storage_inserted: register_int_gauge!(
"ethrex_sync_storage_inserted",
"Storage leaves inserted into storage"
)
.expect("Failed to create ethrex_sync_storage_inserted"),
state_leaves_healed: register_int_gauge!(
"ethrex_sync_state_leaves_healed",
"State trie leaves healed"
)
.expect("Failed to create ethrex_sync_state_leaves_healed"),
storage_leaves_healed: register_int_gauge!(
"ethrex_sync_storage_leaves_healed",
"Storage trie leaves healed"
)
.expect("Failed to create ethrex_sync_storage_leaves_healed"),
bytecodes_downloaded: register_int_gauge!(
"ethrex_sync_bytecodes_downloaded",
"Bytecodes downloaded so far"
)
.expect("Failed to create ethrex_sync_bytecodes_downloaded"),
bytecodes_total: register_int_gauge!(
"ethrex_sync_bytecodes_total",
"Total bytecodes to download"
)
.expect("Failed to create ethrex_sync_bytecodes_total"),

// Outcome counters
pivot_updates: register_int_counter_vec!(
"ethrex_sync_pivot_updates_total",
"Total pivot update attempts by outcome",
&["outcome"]
)
.expect("Failed to create ethrex_sync_pivot_updates_total"),
storage_requests: register_int_counter_vec!(
"ethrex_sync_storage_requests_total",
"Total storage range requests by outcome",
&["outcome"]
)
.expect("Failed to create ethrex_sync_storage_requests_total"),
header_resolution: register_int_counter_vec!(
"ethrex_sync_header_resolution_total",
"Total header resolution attempts by outcome",
&["outcome"]
)
.expect("Failed to create ethrex_sync_header_resolution_total"),
}
}

// --- Gauge setters (used by p2p sync code directly) ---

pub fn set_eligible_peers(&self, count: i64) {
self.eligible_peers.set(count);
}

pub fn set_snap_peers(&self, count: i64) {
self.snap_peers.set(count);
}

pub fn set_inflight_requests(&self, count: i64) {
self.inflight_requests.set(count);
}

pub fn set_pivot_age_seconds(&self, age: i64) {
self.pivot_age_seconds.set(age);
}

pub fn set_current_phase(&self, phase: i64) {
self.stage.set(phase);
}

// --- Counter incrementers ---

pub fn inc_pivot_update(&self, outcome: &str) {
self.pivot_updates.with_label_values(&[outcome]).inc();
}

pub fn inc_storage_request(&self, outcome: &str) {
self.storage_requests.with_label_values(&[outcome]).inc();
}

pub fn inc_header_resolution(&self, outcome: &str) {
self.header_resolution.with_label_values(&[outcome]).inc();
}
}
2 changes: 2 additions & 0 deletions crates/networking/p2p/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ pub struct Metrics {
/* Snap Sync */
// Common
pub sync_head_block: AtomicU64,
pub pivot_timestamp: AtomicU64,
pub sync_head_hash: Arc<Mutex<H256>>,
pub current_step: Arc<CurrentStep>,

Expand Down Expand Up @@ -708,6 +709,7 @@ impl Default for Metrics {
/* Snap Sync */
// Common
sync_head_block: AtomicU64::new(0),
pivot_timestamp: AtomicU64::new(0),
sync_head_hash: Arc::new(Mutex::new(H256::default())),
current_step: Arc::new(CurrentStep(AtomicU8::new(0))),

Expand Down
106 changes: 106 additions & 0 deletions crates/networking/p2p/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -390,11 +390,29 @@ pub async fn periodically_show_peer_stats_during_syncing(
phase_elapsed_str,
&phase_metrics(previous_step, &phase_start).await,
);

// Emit final metrics for completed phase
#[cfg(feature = "metrics")]
push_sync_prometheus_metrics(previous_step);
}

// Start new phase
phase_start_time = std::time::Instant::now();

// Record phase start timestamp for Grafana elapsed panels
#[cfg(feature = "metrics")]
{
let (_, phase_name) = phase_info(current_step);
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
ethrex_metrics::sync::METRICS_SYNC
.phase_start_timestamp
.with_label_values(&[phase_name])
.set(now as i64);
}

// Capture metrics at phase start
phase_start = PhaseCounters::capture_current();
prev_interval = phase_start;
Expand All @@ -416,6 +434,22 @@ pub async fn periodically_show_peer_stats_during_syncing(
)
.await;

// Push progress + peer health to Prometheus
#[cfg(feature = "metrics")]
{
push_sync_prometheus_metrics(current_step);
let diag = peer_table.get_peer_diagnostics().await.unwrap_or_default();
let snap_peers = diag
.iter()
.filter(|p| p.capabilities.iter().any(|c| c.starts_with("snap/")))
.count();
let eligible = diag.iter().filter(|p| p.eligible).count();
let inflight: i64 = diag.iter().map(|p| p.inflight_requests).sum();
ethrex_metrics::sync::METRICS_SYNC.set_snap_peers(snap_peers as i64);
ethrex_metrics::sync::METRICS_SYNC.set_eligible_peers(eligible as i64);
ethrex_metrics::sync::METRICS_SYNC.set_inflight_requests(inflight);
}

// Update previous interval counters for next rate calculation
prev_interval = PhaseCounters::capture_current();

Expand Down Expand Up @@ -684,6 +718,78 @@ async fn log_phase_progress(
}
}

/// Push snap sync progress to Prometheus gauges (from METRICS atomics).
/// Called each polling cycle. Rates are NOT computed here — use rate() in Grafana.
#[cfg(feature = "metrics")]
fn push_sync_prometheus_metrics(step: CurrentStepValue) {
use ethrex_metrics::sync::METRICS_SYNC;
use std::sync::atomic::Ordering::Relaxed;

let (phase_num, _) = phase_info(step);
METRICS_SYNC.stage.set(phase_num as i64);
METRICS_SYNC
.pivot_block
.set(METRICS.sync_head_block.load(Relaxed) as i64);

// Push raw pivot timestamp — Grafana computes age via time() - timestamp
let pivot_ts = METRICS.pivot_timestamp.load(Relaxed);
if pivot_ts > 0 {
METRICS_SYNC.pivot_timestamp.set(pivot_ts as i64);
}
// Also update pivot_age_seconds for RPC/peer_top consumers
if pivot_ts > 0 {
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs();
METRICS_SYNC
.pivot_age_seconds
.set(now.saturating_sub(pivot_ts) as i64);
}

match step {
CurrentStepValue::DownloadingHeaders => {
let total = METRICS.sync_head_block.load(Relaxed);
let downloaded = u64::min(METRICS.downloaded_headers.get(), total);
METRICS_SYNC.headers_downloaded.set(downloaded as i64);
METRICS_SYNC.headers_total.set(total as i64);
}
CurrentStepValue::RequestingAccountRanges => {
let downloaded = METRICS.downloaded_account_tries.load(Relaxed);
METRICS_SYNC.accounts_downloaded.set(downloaded as i64);
}
CurrentStepValue::InsertingAccountRanges | CurrentStepValue::InsertingAccountRangesNoDb => {
let total = METRICS.downloaded_account_tries.load(Relaxed);
let inserted = METRICS.account_tries_inserted.load(Relaxed);
METRICS_SYNC.accounts_downloaded.set(total as i64);
METRICS_SYNC.accounts_inserted.set(inserted as i64);
}
CurrentStepValue::RequestingStorageRanges => {
let downloaded = METRICS.storage_leaves_downloaded.get();
METRICS_SYNC.storage_downloaded.set(downloaded as i64);
}
CurrentStepValue::InsertingStorageRanges => {
let inserted = METRICS.storage_leaves_inserted.get();
METRICS_SYNC.storage_inserted.set(inserted as i64);
}
CurrentStepValue::HealingState => {
let healed = METRICS.global_state_trie_leafs_healed.load(Relaxed);
METRICS_SYNC.state_leaves_healed.set(healed as i64);
}
CurrentStepValue::HealingStorage => {
let healed = METRICS.global_storage_tries_leafs_healed.load(Relaxed);
METRICS_SYNC.storage_leaves_healed.set(healed as i64);
}
CurrentStepValue::RequestingBytecodes => {
let total = METRICS.bytecodes_to_download.load(Relaxed);
let downloaded = METRICS.downloaded_bytecodes.load(Relaxed);
METRICS_SYNC.bytecodes_downloaded.set(downloaded as i64);
METRICS_SYNC.bytecodes_total.set(total as i64);
}
CurrentStepValue::None => {}
}
}

fn progress_bar(percentage: f64, width: usize) -> String {
let clamped_percentage = percentage.clamp(0.0, 100.0);
let filled = ((clamped_percentage / 100.0) * width as f64) as usize;
Expand Down
Loading
Loading