Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 2026-04-20
- #2196 Renames 5 metrics to add the `sov_` prefix; update Grafana/Flux dashboards accordingly: `state_db_materialization` → `sov_state_db_materialization`, `nomt_db_stats` → `sov_nomt_db_stats`, `nomt_begin_session` → `sov_nomt_begin_session`, `storage_manager_finalization` → `sov_storage_manager_finalization`, `pruner` → `sov_db_pruner`. Also adds a metric inventory to the `sov-metrics` crate README.
- #2196 The previously-inline metrics emitted by the SDK (`sov_rollup_num_of_in_flight_blobs`, `sov_rollup_blobs_enter_scope`, `sov_rollup_blobs_exit_scope`, `sov_rollup_current_sequence_number`, `sov_rollup_in_progress_batch_size`, `sov_rollup_sequence_number_delta`) now go through dedicated types implementing `Metric`. The blob-sender scope markers intentionally changed their placeholder field from `foo=1` to `marker=1i`; update Grafana/Flux dashboards accordingly. `MetricsTracker::submit_inline` is retained for external SDK users whose downstream code depends on it.
- #2196 *Internal*: `sov_rollup_zkvm` and `sov_rollup_gas_constant` now emit caller-supplied `metadata` as InfluxDB string fields rather than tags. Previously these tags could explode series cardinality when the `bench` / `gas-constant-estimation` features were enabled. The on-the-wire field keys are unchanged; if you were selecting them via `group by` (a tag operation) you'll need to switch to field-based filtering.

# 2026-04-16
- #2746 Removes re-export of `DaSyncState` and `SyncStatus` from sov-modules-api. Please use `sov-rollup-interface` directly.
- #2744 **Manual intervention might be needed**: Adds `serde(deny_unknown_fields)`, which can fail rollup at startup if genesis config is not tidy.
Expand Down
32 changes: 0 additions & 32 deletions crates/full-node/sov-blob-sender/src/in_flight_blob.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
use std::io::Write;

use sov_metrics::Metric;
use sov_modules_api::DaSpec;
use tokio::task::JoinHandle;

Expand All @@ -21,32 +18,3 @@ pub struct InFlightBlobInfo<Da: DaSpec> {
pub was_resurrected: bool,
pub last_known_state: BlobExecutionStatus<Da>,
}

impl<Da: DaSpec> Metric for InFlightBlobInfo<Da> {
fn measurement_name(&self) -> &'static str {
"sov_rollup_in_flight_blobs_snapshot"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
write!(
buffer,
"{} blob_iid=\"{}\",is_batch={},size_in_bytes={}i,was_resurrected={},duration_ms={}i,last_known_state=\"{}\"",
self.measurement_name(),
uuid::Uuid::from_u128(self.blob_iid).as_simple(),
self.is_batch,
self.size_in_bytes,
self.was_resurrected,
self.start_time.elapsed().as_millis(),
serde_json::to_string(&self.last_known_state).unwrap().replace("\\", "\\\\").replace("\"", "\\\""),
)
}
}

pub fn track_num_of_in_flight_blobs(count: u64) {
sov_metrics::track_metrics(|tracker| {
tracker.submit_inline(
"sov_rollup_num_of_in_flight_blobs",
format!("num_of_in_flight_blobs={count}i"),
);
});
}
10 changes: 7 additions & 3 deletions crates/full-node/sov-blob-sender/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
mod db;
mod in_flight_blob;
mod metrics;

use std::collections::HashMap;
use std::path::Path;
Expand All @@ -10,7 +11,10 @@ use std::time::{Duration, SystemTime};
use async_trait::async_trait;
use db::BlobSenderDb;
pub use db::BlobToSend;
use in_flight_blob::{track_num_of_in_flight_blobs, InFlightBlob, InFlightBlobInfo};
use in_flight_blob::{InFlightBlob, InFlightBlobInfo};
use metrics::{
submit_blobs_enter_scope_marker, submit_blobs_exit_scope_marker, track_num_of_in_flight_blobs,
};
use sov_db::ledger_db::LedgerDb;
use sov_modules_api::{DaSpec, EventModuleName, RuntimeEventResponse};
use sov_rollup_interface::common::HexHash;
Expand Down Expand Up @@ -366,11 +370,11 @@ where

let len = infos.len();
sov_metrics::track_metrics(|tracker| {
tracker.submit_inline("sov_rollup_blobs_enter_scope", "foo=1");
submit_blobs_enter_scope_marker(tracker);
for b in infos {
tracker.submit(b);
}
tracker.submit_inline("sov_rollup_blobs_exit_scope", "foo=1");
submit_blobs_exit_scope_marker(tracker);
});

track_num_of_in_flight_blobs(len as u64);
Expand Down
91 changes: 91 additions & 0 deletions crates/full-node/sov-blob-sender/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
use std::io::Write;

use sov_metrics::{write_escaped_field_value, Metric};
use sov_modules_api::DaSpec;

use crate::in_flight_blob::InFlightBlobInfo;

// Line-protocol serialization of one in-flight blob snapshot
// (`sov_rollup_in_flight_blobs_snapshot`), one point per blob.
impl<Da: DaSpec> Metric for InFlightBlobInfo<Da> {
    fn measurement_name(&self) -> &'static str {
        "sov_rollup_in_flight_blobs_snapshot"
    }

    fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
        // Propagate serialization failure instead of panicking: serde_json::Error
        // converts into std::io::Error via its `From` impl, which matches this
        // method's `io::Result` contract. A metrics path should never abort the node.
        let last_known_state_json = serde_json::to_string(&self.last_known_state)?;
        // `last_known_state` is emitted as an escaped JSON string field, so the
        // format string ends with the opening quote; the escaped payload and the
        // closing quote are written afterwards.
        write!(
            buffer,
            "{} blob_iid=\"{}\",is_batch={},size_in_bytes={}i,was_resurrected={},duration_ms={}i,last_known_state=\"",
            self.measurement_name(),
            uuid::Uuid::from_u128(self.blob_iid).as_simple(),
            self.is_batch,
            self.size_in_bytes,
            self.was_resurrected,
            self.start_time.elapsed().as_millis(),
        )?;
        write_escaped_field_value(buffer, &last_known_state_json)?;
        buffer.write_all(b"\"")
    }
}

/// Gauge of the current total of in-flight blobs (blobs handed to the sender but not yet
/// finalized on the DA). Emitted as `sov_rollup_num_of_in_flight_blobs` via
/// [`track_num_of_in_flight_blobs`].
///
/// Growing unboundedly indicates the DA submission pipeline cannot keep up with blob
/// production; correlate with `sov_rollup_in_flight_blobs_snapshot` to see per-blob state.
#[derive(Debug)]
struct InFlightBlobCountMetric {
    /// Number of blobs currently in-flight at the time of submission.
    count: u64,
}

// Emits a single integer field in InfluxDB line protocol:
// `sov_rollup_num_of_in_flight_blobs num_of_in_flight_blobs=<count>i`.
impl Metric for InFlightBlobCountMetric {
    fn measurement_name(&self) -> &'static str {
        "sov_rollup_num_of_in_flight_blobs"
    }

    fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
        let measurement = self.measurement_name();
        let count = self.count;
        // `i` suffix marks an InfluxDB integer field.
        write!(buffer, "{measurement} num_of_in_flight_blobs={count}i")
    }
}

/// Submits the in-flight blob count gauge through the global metrics tracker.
pub(super) fn track_num_of_in_flight_blobs(count: u64) {
    sov_metrics::track_metrics(|tracker| {
        let metric = InFlightBlobCountMetric { count };
        tracker.submit(metric);
    });
}

/// InfluxDB line protocol requires at least one field per point; markers have no payload
/// of their own, so we emit a constant marker field.
///
/// NOTE: downstream Flux dashboards filter on `marker=1i` (this replaced the old `foo=1`
/// placeholder — see the CHANGELOG entry); renaming it is a breaking change for them.
const MARKER_FIELD: &str = "marker=1i";

/// Zero-payload marker metric used to bracket a batch of in-flight blob snapshot
/// submissions (emitted once before and once after the batch; see the call sites in
/// `lib.rs`). Only the measurement name distinguishes enter from exit.
#[derive(Debug)]
struct BlobScopeMarker {
    /// Measurement name for this marker (`sov_rollup_blobs_enter_scope` or
    /// `sov_rollup_blobs_exit_scope`).
    measurement_name: &'static str,
}

// Serializes as `<measurement_name> marker=1i` — the constant field exists only to
// satisfy the line-protocol requirement of at least one field per point.
impl Metric for BlobScopeMarker {
    fn measurement_name(&self) -> &'static str {
        self.measurement_name
    }

    fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
        let name = self.measurement_name();
        write!(buffer, "{name} {MARKER_FIELD}")
    }
}

/// Submits the `sov_rollup_blobs_enter_scope` marker; call just before a batch of
/// in-flight blob snapshots is submitted.
pub(super) fn submit_blobs_enter_scope_marker(tracker: &sov_metrics::MetricsTracker) {
    let marker = BlobScopeMarker {
        measurement_name: "sov_rollup_blobs_enter_scope",
    };
    tracker.submit(marker);
}

/// Submits the `sov_rollup_blobs_exit_scope` marker; call just after a batch of
/// in-flight blob snapshots is submitted.
pub(super) fn submit_blobs_exit_scope_marker(tracker: &sov_metrics::MetricsTracker) {
    let marker = BlobScopeMarker {
        measurement_name: "sov_rollup_blobs_exit_scope",
    };
    tracker.submit(marker);
}
18 changes: 16 additions & 2 deletions crates/full-node/sov-db/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,23 @@ use crate::schema::types::slot_key::{SlotKey, SlotValue};

pub mod nomt;

/// Shape of the state writes performed during one slot's materialization, split into
/// user-space (module state) and kernel-space (kernel state). Emitted as `sov_state_db_materialization`.
///
/// **What healthy looks like:** all fields scale roughly with transaction volume; `max_*` sizes
/// are stable across slots.
///
/// **Diagnostic signals:**
/// - `max_value_size` suddenly jumping → a module is writing a large blob to state (potential
/// unbounded-growth bug; find the module via slot replay).
/// - `cumulative_values_size` climbing without matching transaction volume → state bloat.
/// - `kernel_items` spiking while user workload is flat → kernel-level anomaly worth tracing.
///
/// **Correlate with:** `sov_rollup_slot_execution_time_us` (large materializations slow slot
/// processing) and `sov_nomt_commit_detailed` (downstream commit cost).
#[derive(Debug)]
pub struct StateMaterializationMetrics {
/// How many key-value items have been materialized for user space
/// How many key-value items have been materialized for user space.
pub user_items: usize,
/// How many key-value items have been materialized for kernel space.
pub kernel_items: usize,
Expand Down Expand Up @@ -56,7 +70,7 @@ impl StateMaterializationMetrics {

impl Metric for StateMaterializationMetrics {
fn measurement_name(&self) -> &'static str {
"state_db_materialization"
"sov_state_db_materialization"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand Down
73 changes: 69 additions & 4 deletions crates/full-node/sov-db/src/metrics/nomt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,25 @@ use nomt::Nomt;
use sov_metrics::Metric;
use sov_rollup_interface::reexports::digest;

/// Snapshot of NOMT hash-table occupancy and page-cache effectiveness for one database.
/// The `db` tag disambiguates instances (user / kernel / ledger). Emitted as `sov_nomt_db_stats`.
///
/// **What healthy looks like:** `hash_table_occupied / hash_table_capacity < 0.9`, page cache miss
/// ratio low and steady, page/value fetch times flat.
///
/// **Diagnostic signals:**
/// - Occupancy > 0.9 → NOMT emits a warning log; hash collisions start to degrade lookups
/// and inserts. Remediation: resync the database with a larger `hash_table_capacity`.
/// - `page_cache_misses / page_requests` rising → working set has outgrown the page cache.
/// Remediation: raise the NOMT page-cache size or add RAM.
/// - `avg_page_fetch_time_ns` spiking while miss ratio is flat → underlying disk is saturated
/// (compare with OS-level I/O metrics and other DBs' NOMT stats).
///
/// **Correlate with:** `sov_nomt_commit_detailed` (slow commits often trace back here) and
/// `sov_storage_manager_finalization` (finalization commit_time).
#[derive(Debug)]
pub struct NomtDbMetric {
/// Logical name of the NOMT instance (user/kernel/ledger); used as the InfluxDB `db` tag.
pub db: &'static str,
pub hash_table_capacity: usize,
pub hash_table_occupied: usize,
Expand Down Expand Up @@ -43,7 +60,7 @@ impl NomtDbMetric {

impl Metric for NomtDbMetric {
fn measurement_name(&self) -> &'static str {
"nomt_db_stats"
"sov_nomt_db_stats"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand All @@ -69,16 +86,34 @@ impl Metric for NomtDbMetric {
}
}

/// Per-slot cost of opening a NOMT session and the unfinalized-overlay depth at that moment.
/// Emitted as `sov_nomt_begin_session`.
///
/// **What healthy looks like:** `init_time` low and flat; `overlays` bounded by the finalization
/// window (i.e., stays near the configured `STATE_ROOT_DELAY_BLOCKS`).
///
/// **Diagnostic signals:**
/// - `overlays` monotonically increasing → finalization is falling behind (the node cannot
/// promote unfinalized state to disk fast enough). Check `sov_rollup_runner_da.sync_distance`
/// and `sov_runner_process_stf_changes`.
/// - `init_time` spiking with stable `overlays` → storage-engine contention at session start
/// (often correlated with compaction or heavy commits on the same DB).
///
/// **Correlate with:** `sov_storage_manager_finalization` (finalization latency drives overlays),
/// `sov_nomt_commit_detailed` (commits can block session starts on the same DB).
#[derive(Debug)]
pub struct NomtBeginSessionMetric {
    /// Logical name of the NOMT instance (user/kernel/ledger); InfluxDB `db` tag.
    pub db: &'static str,
    /// Number of unfinalized overlays stacked on top of the on-disk state when the session opened.
    /// This is the "lag to finalization" in slots.
    pub overlays: usize,
    /// Wall-clock time spent opening the session.
    pub init_time: std::time::Duration,
}

impl Metric for NomtBeginSessionMetric {
fn measurement_name(&self) -> &'static str {
"nomt_begin_session"
"sov_nomt_begin_session"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand All @@ -94,16 +129,29 @@ impl Metric for NomtBeginSessionMetric {
}
}

/// Wall-clock breakdown of finalizing one slot in the storage manager (promoting the oldest
/// overlay to on-disk state). Emitted as `sov_storage_manager_finalization`.
///
/// **What healthy looks like:** all three fields sub-second and stable from slot to slot.
///
/// **Diagnostic signals:**
/// - `commit_time` dominates slot time → storage is the rollup's bottleneck. Investigate
/// `sov_nomt_db_stats` (disk/page-cache pressure) and `sov_nomt_commit_detailed`.
/// - `pruning_commit_time` is `Some` and consistently large → pruner backlog; cross-check
/// `sov_db_pruner` throughput and the rollup's retention configuration.
/// - `preparation_time` rising → large overlays are being materialized; see
/// `sov_nomt_begin_session.overlays` and `sov_state_db_materialization`.
#[derive(Debug)]
pub struct StorageManagerFinalizationMetric {
    /// Wall-clock time spent preparing the slot's state for commit, before the commit itself.
    pub preparation_time: std::time::Duration,
    /// Wall-clock time spent committing the finalized state.
    pub commit_time: std::time::Duration,
    /// `None` when pruning did not run this slot; `Some(_)` otherwise.
    pub pruning_commit_time: Option<std::time::Duration>,
}

impl Metric for StorageManagerFinalizationMetric {
fn measurement_name(&self) -> &'static str {
"storage_manager_finalization"
"sov_storage_manager_finalization"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand All @@ -127,8 +175,25 @@ impl Metric for StorageManagerFinalizationMetric {
}
}

/// Throughput and efficiency of one pruner pass over a NOMT database. Emitted as `sov_db_pruner`.
/// The `db` tag distinguishes which database is being pruned.
///
/// **What healthy looks like:** `time` roughly linear in `keys_inspected`; a non-trivial
/// `keys_to_prune / keys_inspected` ratio (the pruner finds work on every pass).
///
/// **Diagnostic signals:**
/// - Sustained `keys_to_prune ≈ 0` with non-zero `keys_inspected` → pruner is scanning but
/// finding nothing: retention config may be wrong, or nothing is eligible for pruning yet.
/// - `keys_inspected` flat while `time` spikes → disk bottleneck on this DB (check
/// `sov_nomt_db_stats` page fetch times).
/// - No emissions at all over long windows → the pruner task may be stuck; confirm the
/// pruner background task is still alive.
///
/// **Correlate with:** `sov_storage_manager_finalization.pruning_commit_time` (the commit
/// cost paired with each inspection pass).
#[derive(Debug)]
pub struct PrunerMetric {
/// Logical name of the NOMT instance being pruned; InfluxDB `db` tag.
pub db: &'static str,
pub keys_inspected: usize,
pub keys_to_prune: usize,
Expand All @@ -137,7 +202,7 @@ pub struct PrunerMetric {

impl Metric for PrunerMetric {
fn measurement_name(&self) -> &'static str {
"pruner"
"sov_db_pruner"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand Down
Loading
Loading