Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 2026-04-20
- #2196 Renames 5 metrics to add the `sov_` prefix; update Grafana/Flux dashboards accordingly: `state_db_materialization` → `sov_state_db_materialization`, `nomt_db_stats` → `sov_nomt_db_stats`, `nomt_begin_session` → `sov_nomt_begin_session`, `storage_manager_finalization` → `sov_storage_manager_finalization`, `pruner` → `sov_db_pruner`. Also adds a metric inventory to the `sov-metrics` crate README.
- #2196 The previously-inline metrics emitted by the SDK (`sov_rollup_num_of_in_flight_blobs`, `sov_rollup_blobs_enter_scope`, `sov_rollup_blobs_exit_scope`, `sov_rollup_current_sequence_number`, `sov_rollup_in_progress_batch_size`, `sov_rollup_sequence_number_delta`) now go through dedicated types implementing `Metric`. The blob-sender scope markers intentionally changed their placeholder field from `foo=1` to `marker=1i`; update Grafana/Flux dashboards accordingly. `MetricsTracker::submit_inline` is retained for external SDK users whose downstream code depends on it.
- #2196 *Internal*: `sov_rollup_zkvm` and `sov_rollup_gas_constant` now emit caller-supplied `metadata` as InfluxDB string fields rather than tags. Previously these tags could explode series cardinality when the `bench` / `gas-constant-estimation` features were enabled. The on-the-wire field keys are unchanged; if you were selecting them via `group by` (a tag operation) you'll need to switch to field-based filtering.
# 2026-04-21
- #2768 *Minor breaking change (code)*: Removed unused `Runtime::resolve_address` method from the native `Runtime` trait in `sov-modules-api`.
The method had no call sites; address resolution continues to happen via `Accounts::resolve_sender_address{_read_only}` directly.
Expand Down
32 changes: 0 additions & 32 deletions crates/full-node/sov-blob-sender/src/in_flight_blob.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
use std::io::Write;

use sov_metrics::Metric;
use sov_modules_api::DaSpec;
use tokio::task::JoinHandle;

Expand All @@ -21,32 +18,3 @@ pub struct InFlightBlobInfo<Da: DaSpec> {
pub was_resurrected: bool,
pub last_known_state: BlobExecutionStatus<Da>,
}

impl<Da: DaSpec> Metric for InFlightBlobInfo<Da> {
    /// Measurement under which per-blob snapshots are reported.
    fn measurement_name(&self) -> &'static str {
        "sov_rollup_in_flight_blobs_snapshot"
    }

    /// Writes one InfluxDB line-protocol point (no tags, no timestamp).
    ///
    /// `last_known_state` is serialized to JSON and embedded as a string
    /// field; backslashes and double quotes are manually escaped so the
    /// quoted field value stays valid line protocol.
    /// NOTE(review): `.unwrap()` here panics on serialization failure even
    /// though the function returns `io::Result` — confirm this is intended.
    fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
        write!(
            buffer,
            "{} blob_iid=\"{}\",is_batch={},size_in_bytes={}i,was_resurrected={},duration_ms={}i,last_known_state=\"{}\"",
            self.measurement_name(),
            // Render the raw u128 id as a canonical simple (no-dash) UUID.
            uuid::Uuid::from_u128(self.blob_iid).as_simple(),
            self.is_batch,
            self.size_in_bytes,
            self.was_resurrected,
            self.start_time.elapsed().as_millis(),
            serde_json::to_string(&self.last_known_state).unwrap().replace("\\", "\\\\").replace("\"", "\\\""),
        )
    }
}

/// Reports the current number of in-flight blobs as a gauge
/// (`sov_rollup_num_of_in_flight_blobs`, integer field).
pub fn track_num_of_in_flight_blobs(count: u64) {
    sov_metrics::track_metrics(|tracker| {
        tracker.submit_inline(
            "sov_rollup_num_of_in_flight_blobs",
            // `i` suffix marks an InfluxDB integer field.
            format!("num_of_in_flight_blobs={count}i"),
        );
    });
}
10 changes: 7 additions & 3 deletions crates/full-node/sov-blob-sender/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
mod db;
mod in_flight_blob;
mod metrics;

use std::collections::HashMap;
use std::path::Path;
Expand All @@ -10,7 +11,10 @@ use std::time::{Duration, SystemTime};
use async_trait::async_trait;
use db::BlobSenderDb;
pub use db::BlobToSend;
use in_flight_blob::{track_num_of_in_flight_blobs, InFlightBlob, InFlightBlobInfo};
use in_flight_blob::{InFlightBlob, InFlightBlobInfo};
use metrics::{
submit_blobs_enter_scope_marker, submit_blobs_exit_scope_marker, track_num_of_in_flight_blobs,
};
use sov_db::ledger_db::LedgerDb;
use sov_modules_api::{DaSpec, EventModuleName, RuntimeEventResponse};
use sov_rollup_interface::common::HexHash;
Expand Down Expand Up @@ -366,11 +370,11 @@ where

let len = infos.len();
sov_metrics::track_metrics(|tracker| {
tracker.submit_inline("sov_rollup_blobs_enter_scope", "foo=1");
submit_blobs_enter_scope_marker(tracker);
for b in infos {
tracker.submit(b);
}
tracker.submit_inline("sov_rollup_blobs_exit_scope", "foo=1");
submit_blobs_exit_scope_marker(tracker);
});

track_num_of_in_flight_blobs(len as u64);
Expand Down
91 changes: 91 additions & 0 deletions crates/full-node/sov-blob-sender/src/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
use std::io::Write;

use sov_metrics::{write_escaped_field_value, Metric};
use sov_modules_api::DaSpec;

use crate::in_flight_blob::InFlightBlobInfo;

impl<Da: DaSpec> Metric for InFlightBlobInfo<Da> {
    /// Measurement under which per-blob snapshots are reported.
    fn measurement_name(&self) -> &'static str {
        "sov_rollup_in_flight_blobs_snapshot"
    }

    /// Writes one InfluxDB line-protocol point (no tags, no timestamp).
    ///
    /// `last_known_state` is serialized to JSON and embedded as a quoted
    /// string field; `write_escaped_field_value` escapes the backslashes and
    /// double quotes inside it so the field value stays valid line protocol.
    fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
        // Propagate serialization failures through the existing `io::Result`
        // instead of panicking the metrics pipeline: a single poisoned metric
        // should surface as an error, not abort the emitting thread.
        let last_known_state_json = serde_json::to_string(&self.last_known_state)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
        write!(
            buffer,
            "{} blob_iid=\"{}\",is_batch={},size_in_bytes={}i,was_resurrected={},duration_ms={}i,last_known_state=\"",
            self.measurement_name(),
            // Render the raw u128 id as a canonical simple (no-dash) UUID.
            uuid::Uuid::from_u128(self.blob_iid).as_simple(),
            self.is_batch,
            self.size_in_bytes,
            self.was_resurrected,
            self.start_time.elapsed().as_millis(),
        )?;
        write_escaped_field_value(buffer, &last_known_state_json)?;
        buffer.write_all(b"\"")
    }
}

/// Gauge of the current total of in-flight blobs (blobs handed to the sender but not yet
/// finalized on the DA). Emitted as `sov_rollup_num_of_in_flight_blobs`.
///
/// Growing unboundedly indicates the DA submission pipeline cannot keep up with blob
/// production; correlate with `sov_rollup_in_flight_blobs_snapshot` to see per-blob state.
#[derive(Debug)]
struct InFlightBlobCountMetric {
    /// Number of blobs currently in-flight at the moment of emission.
    count: u64,
}

impl Metric for InFlightBlobCountMetric {
    fn measurement_name(&self) -> &'static str {
        "sov_rollup_num_of_in_flight_blobs"
    }

    /// Emits `<measurement> num_of_in_flight_blobs=<count>i` — a single
    /// InfluxDB integer field, no tags, no timestamp.
    fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
        let measurement = self.measurement_name();
        write!(buffer, "{measurement} num_of_in_flight_blobs={count}i", count = self.count)
    }
}

/// Reports the current number of in-flight blobs via the global metrics tracker.
///
/// See [`InFlightBlobCountMetric`] for the measurement this emits.
pub(super) fn track_num_of_in_flight_blobs(count: u64) {
    sov_metrics::track_metrics(|tracker| tracker.submit(InFlightBlobCountMetric { count }));
}

/// InfluxDB line protocol requires at least one field per point; markers have no payload
/// of their own, so we emit a constant integer field (`i` suffix = integer).
const MARKER_FIELD: &str = "marker=1i";

/// Zero-payload marker metric delimiting a batch of in-flight blob snapshots.
/// One type serves both the enter- and exit-scope measurements; only the
/// measurement name differs per instance.
#[derive(Debug)]
struct BlobScopeMarker {
    /// InfluxDB measurement name this marker is emitted under.
    measurement_name: &'static str,
}

impl Metric for BlobScopeMarker {
    fn measurement_name(&self) -> &'static str {
        self.measurement_name
    }

    /// A marker carries no data of its own, so the point is just the
    /// measurement name followed by the constant `marker=1i` field.
    fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
        let name = self.measurement_name();
        write!(buffer, "{name} {MARKER_FIELD}")
    }
}

/// Submits the marker that opens a batch of in-flight blob snapshots
/// (`sov_rollup_blobs_enter_scope`).
pub(super) fn submit_blobs_enter_scope_marker(tracker: &sov_metrics::MetricsTracker) {
    let marker = BlobScopeMarker {
        measurement_name: "sov_rollup_blobs_enter_scope",
    };
    tracker.submit(marker);
}

/// Submits the marker that closes a batch of in-flight blob snapshots
/// (`sov_rollup_blobs_exit_scope`).
pub(super) fn submit_blobs_exit_scope_marker(tracker: &sov_metrics::MetricsTracker) {
    let marker = BlobScopeMarker {
        measurement_name: "sov_rollup_blobs_exit_scope",
    };
    tracker.submit(marker);
}
18 changes: 16 additions & 2 deletions crates/full-node/sov-db/src/metrics/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,23 @@ use crate::schema::types::slot_key::{SlotKey, SlotValue};

pub mod nomt;

/// Shape of the state writes performed during one slot's materialization, split into
/// user-space (module state) and kernel-space (kernel state). Emitted as `sov_state_db_materialization`.
///
/// **What healthy looks like:** all fields scale roughly with transaction volume; `max_*` sizes
/// are stable across slots.
///
/// **Diagnostic signals:**
/// - `max_value_size` suddenly jumping → a module is writing a large blob to state (potential
/// unbounded-growth bug; find the module via slot replay).
/// - `cumulative_values_size` climbing without matching transaction volume → state bloat.
/// - `kernel_items` spiking while user workload is flat → kernel-level anomaly worth tracing.
///
/// **Correlate with:** `sov_rollup_slot_execution_time_us` (large materializations slow slot
/// processing) and `sov_nomt_commit_detailed` (downstream commit cost).
#[derive(Debug)]
pub struct StateMaterializationMetrics {
/// How many key-value items have been materialized for user space
/// How many key-value items have been materialized for user space.
pub user_items: usize,
/// How many key-value items have been materialized for kernel space.
pub kernel_items: usize,
Expand Down Expand Up @@ -56,7 +70,7 @@ impl StateMaterializationMetrics {

impl Metric for StateMaterializationMetrics {
fn measurement_name(&self) -> &'static str {
"state_db_materialization"
"sov_state_db_materialization"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand Down
73 changes: 69 additions & 4 deletions crates/full-node/sov-db/src/metrics/nomt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,25 @@ use nomt::Nomt;
use sov_metrics::Metric;
use sov_rollup_interface::reexports::digest;

/// Snapshot of NOMT hash-table occupancy and page-cache effectiveness for one database.
/// The `db` tag disambiguates instances (user / kernel / ledger). Emitted as `sov_nomt_db_stats`.
///
/// **What healthy looks like:** `hash_table_occupied / hash_table_capacity < 0.9`, page cache miss
/// ratio low and steady, page/value fetch times flat.
///
/// **Diagnostic signals:**
/// - Occupancy > 0.9 → NOMT emits a warning log; hash collisions start to degrade lookups
/// and inserts. Remediation: resync the database with a larger `hash_table_capacity`.
/// - `page_cache_misses / page_requests` rising → working set has outgrown the page cache.
/// Remediation: raise the NOMT page-cache size or add RAM.
/// - `avg_page_fetch_time_ns` spiking while miss ratio is flat → underlying disk is saturated
/// (compare with OS-level I/O metrics and other DBs' NOMT stats).
///
/// **Correlate with:** `sov_nomt_commit_detailed` (slow commits often trace back here) and
/// `sov_storage_manager_finalization` (finalization commit_time).
#[derive(Debug)]
pub struct NomtDbMetric {
/// Logical name of the NOMT instance (user/kernel/ledger); used as the InfluxDB `db` tag.
pub db: &'static str,
pub hash_table_capacity: usize,
pub hash_table_occupied: usize,
Expand Down Expand Up @@ -43,7 +60,7 @@ impl NomtDbMetric {

impl Metric for NomtDbMetric {
fn measurement_name(&self) -> &'static str {
"nomt_db_stats"
"sov_nomt_db_stats"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand All @@ -69,16 +86,34 @@ impl Metric for NomtDbMetric {
}
}

/// Per-slot cost of opening a NOMT session and the unfinalized-overlay depth at that moment.
/// Emitted as `sov_nomt_begin_session`.
///
/// **What healthy looks like:** `init_time` low and flat; `overlays` bounded by the finalization
/// window (i.e., stays near the configured `STATE_ROOT_DELAY_BLOCKS`).
///
/// **Diagnostic signals:**
/// - `overlays` monotonically increasing → finalization is falling behind (the node cannot
/// promote unfinalized state to disk fast enough). Check `sov_rollup_runner_da.sync_distance`
/// and `sov_runner_process_stf_changes`.
/// - `init_time` spiking with stable `overlays` → storage-engine contention at session start
/// (often correlated with compaction or heavy commits on the same DB).
///
/// **Correlate with:** `sov_storage_manager_finalization` (finalization latency drives overlays),
/// `sov_nomt_commit_detailed` (commits can block session starts on the same DB).
#[derive(Debug)]
pub struct NomtBeginSessionMetric {
/// Logical name of the NOMT instance (user/kernel/ledger); InfluxDB `db` tag.
pub db: &'static str,
/// Number of unfinalized overlays stacked on top of the on-disk state when the session opened.
/// This is the "lag to finalization" in slots.
pub overlays: usize,
pub init_time: std::time::Duration,
}

impl Metric for NomtBeginSessionMetric {
fn measurement_name(&self) -> &'static str {
"nomt_begin_session"
"sov_nomt_begin_session"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand All @@ -94,16 +129,29 @@ impl Metric for NomtBeginSessionMetric {
}
}

/// Wall-clock breakdown of finalizing one slot in the storage manager (promoting the oldest
/// overlay to on-disk state). Emitted as `sov_storage_manager_finalization`.
///
/// **What healthy looks like:** all three fields sub-second and stable from slot to slot.
///
/// **Diagnostic signals:**
/// - `commit_time` dominates slot time → storage is the rollup's bottleneck. Investigate
/// `sov_nomt_db_stats` (disk/page-cache pressure) and `sov_nomt_commit_detailed`.
/// - `pruning_commit_time` is `Some` and consistently large → pruner backlog; cross-check
/// `sov_db_pruner` throughput and the rollup's retention configuration.
/// - `preparation_time` rising → large overlays are being materialized; see
/// `sov_nomt_begin_session.overlays` and `sov_state_db_materialization`.
#[derive(Debug)]
pub struct StorageManagerFinalizationMetric {
pub preparation_time: std::time::Duration,
pub commit_time: std::time::Duration,
/// `None` when pruning did not run this slot; `Some(_)` otherwise.
pub pruning_commit_time: Option<std::time::Duration>,
}

impl Metric for StorageManagerFinalizationMetric {
fn measurement_name(&self) -> &'static str {
"storage_manager_finalization"
"sov_storage_manager_finalization"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand All @@ -127,8 +175,25 @@ impl Metric for StorageManagerFinalizationMetric {
}
}

/// Throughput and efficiency of one pruner pass over a NOMT database. Emitted as `sov_db_pruner`.
/// The `db` tag distinguishes which database is being pruned.
///
/// **What healthy looks like:** `time` roughly linear in `keys_inspected`; a non-trivial
/// `keys_to_prune / keys_inspected` ratio (the pruner finds work on every pass).
///
/// **Diagnostic signals:**
/// - Sustained `keys_to_prune ≈ 0` with non-zero `keys_inspected` → pruner is scanning but
/// finding nothing: retention config may be wrong, or nothing is eligible for pruning yet.
/// - `keys_inspected` flat while `time` spikes → disk bottleneck on this DB (check
/// `sov_nomt_db_stats` page fetch times).
/// - No emissions at all over long windows → the pruner task may be stuck; confirm the
/// pruner background task is still alive.
///
/// **Correlate with:** `sov_storage_manager_finalization.pruning_commit_time` (the commit
/// cost paired with each inspection pass).
#[derive(Debug)]
pub struct PrunerMetric {
/// Logical name of the NOMT instance being pruned; InfluxDB `db` tag.
pub db: &'static str,
pub keys_inspected: usize,
pub keys_to_prune: usize,
Expand All @@ -137,7 +202,7 @@ pub struct PrunerMetric {

impl Metric for PrunerMetric {
fn measurement_name(&self) -> &'static str {
"pruner"
"sov_db_pruner"
}

fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> {
Expand Down
Loading
Loading