diff --git a/CHANGELOG.md b/CHANGELOG.md index b541706f08..9913e0889d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 2026-04-20 +- #2196 Renames 5 metrics to add the `sov_` prefix; update Grafana/Flux dashboards accordingly: `state_db_materialization` → `sov_state_db_materialization`, `nomt_db_stats` → `sov_nomt_db_stats`, `nomt_begin_session` → `sov_nomt_begin_session`, `storage_manager_finalization` → `sov_storage_manager_finalization`, `pruner` → `sov_db_pruner`. Also adds a metric inventory to the `sov-metrics` crate README. +- #2196 The previously-inline metrics emitted by the SDK (`sov_rollup_num_of_in_flight_blobs`, `sov_rollup_blobs_enter_scope`, `sov_rollup_blobs_exit_scope`, `sov_rollup_current_sequence_number`, `sov_rollup_in_progress_batch_size`, `sov_rollup_sequence_number_delta`) now go through dedicated types implementing `Metric`. The blob-sender scope markers intentionally changed their placeholder field from `foo=1` to `marker=1i`; update Grafana/Flux dashboards accordingly. `MetricsTracker::submit_inline` is retained for external SDK users whose downstream code depends on it. +- #2196 *Internal*: `sov_rollup_zkvm` and `sov_rollup_gas_constant` now emit caller-supplied `metadata` as InfluxDB string fields rather than tags. Previously these tags could explode series cardinality when the `bench` / `gas-constant-estimation` features were enabled. The on-the-wire field keys are unchanged; if you were selecting them via `group by` (a tag operation) you'll need to switch to field-based filtering. # 2026-04-21 - #2768 *Minor breaking change (code)*: Removed unused `Runtime::resolve_address` method from the native `Runtime` trait in `sov-modules-api`. The method had no call sites; address resolution continues to happen via `Accounts::resolve_sender_address{_read_only}` directly. 
diff --git a/crates/full-node/sov-blob-sender/src/in_flight_blob.rs b/crates/full-node/sov-blob-sender/src/in_flight_blob.rs index 0a70974fe7..07cc231024 100644 --- a/crates/full-node/sov-blob-sender/src/in_flight_blob.rs +++ b/crates/full-node/sov-blob-sender/src/in_flight_blob.rs @@ -1,6 +1,3 @@ -use std::io::Write; - -use sov_metrics::Metric; use sov_modules_api::DaSpec; use tokio::task::JoinHandle; @@ -21,32 +18,3 @@ pub struct InFlightBlobInfo { pub was_resurrected: bool, pub last_known_state: BlobExecutionStatus, } - -impl Metric for InFlightBlobInfo { - fn measurement_name(&self) -> &'static str { - "sov_rollup_in_flight_blobs_snapshot" - } - - fn serialize_for_telegraf(&self, buffer: &mut Vec) -> std::io::Result<()> { - write!( - buffer, - "{} blob_iid=\"{}\",is_batch={},size_in_bytes={}i,was_resurrected={},duration_ms={}i,last_known_state=\"{}\"", - self.measurement_name(), - uuid::Uuid::from_u128(self.blob_iid).as_simple(), - self.is_batch, - self.size_in_bytes, - self.was_resurrected, - self.start_time.elapsed().as_millis(), - serde_json::to_string(&self.last_known_state).unwrap().replace("\\", "\\\\").replace("\"", "\\\""), - ) - } -} - -pub fn track_num_of_in_flight_blobs(count: u64) { - sov_metrics::track_metrics(|tracker| { - tracker.submit_inline( - "sov_rollup_num_of_in_flight_blobs", - format!("num_of_in_flight_blobs={count}i"), - ); - }); -} diff --git a/crates/full-node/sov-blob-sender/src/lib.rs b/crates/full-node/sov-blob-sender/src/lib.rs index 8c5788da83..15df5966bf 100644 --- a/crates/full-node/sov-blob-sender/src/lib.rs +++ b/crates/full-node/sov-blob-sender/src/lib.rs @@ -1,5 +1,6 @@ mod db; mod in_flight_blob; +mod metrics; use std::collections::HashMap; use std::path::Path; @@ -10,7 +11,10 @@ use std::time::{Duration, SystemTime}; use async_trait::async_trait; use db::BlobSenderDb; pub use db::BlobToSend; -use in_flight_blob::{track_num_of_in_flight_blobs, InFlightBlob, InFlightBlobInfo}; +use in_flight_blob::{InFlightBlob, 
InFlightBlobInfo}; +use metrics::{ + submit_blobs_enter_scope_marker, submit_blobs_exit_scope_marker, track_num_of_in_flight_blobs, +}; use sov_db::ledger_db::LedgerDb; use sov_modules_api::{DaSpec, EventModuleName, RuntimeEventResponse}; use sov_rollup_interface::common::HexHash; @@ -366,11 +370,11 @@ where let len = infos.len(); sov_metrics::track_metrics(|tracker| { - tracker.submit_inline("sov_rollup_blobs_enter_scope", "foo=1"); + submit_blobs_enter_scope_marker(tracker); for b in infos { tracker.submit(b); } - tracker.submit_inline("sov_rollup_blobs_exit_scope", "foo=1"); + submit_blobs_exit_scope_marker(tracker); }); track_num_of_in_flight_blobs(len as u64); diff --git a/crates/full-node/sov-blob-sender/src/metrics.rs b/crates/full-node/sov-blob-sender/src/metrics.rs new file mode 100644 index 0000000000..76e7f594eb --- /dev/null +++ b/crates/full-node/sov-blob-sender/src/metrics.rs @@ -0,0 +1,91 @@ +use std::io::Write; + +use sov_metrics::{write_escaped_field_value, Metric}; +use sov_modules_api::DaSpec; + +use crate::in_flight_blob::InFlightBlobInfo; + +impl Metric for InFlightBlobInfo { + fn measurement_name(&self) -> &'static str { + "sov_rollup_in_flight_blobs_snapshot" + } + + fn serialize_for_telegraf(&self, buffer: &mut Vec) -> std::io::Result<()> { + let last_known_state_json = serde_json::to_string(&self.last_known_state).unwrap(); + write!( + buffer, + "{} blob_iid=\"{}\",is_batch={},size_in_bytes={}i,was_resurrected={},duration_ms={}i,last_known_state=\"", + self.measurement_name(), + uuid::Uuid::from_u128(self.blob_iid).as_simple(), + self.is_batch, + self.size_in_bytes, + self.was_resurrected, + self.start_time.elapsed().as_millis(), + )?; + write_escaped_field_value(buffer, &last_known_state_json)?; + buffer.write_all(b"\"") + } +} + +/// Gauge of the current total of in-flight blobs (blobs handed to the sender but not yet +/// finalized on the DA). Emitted as `sov_rollup_num_of_in_flight_blobs`. 
+/// +/// Growing unboundedly indicates the DA submission pipeline cannot keep up with blob +/// production; correlate with `sov_rollup_in_flight_blobs_snapshot` to see per-blob state. +#[derive(Debug)] +struct InFlightBlobCountMetric { + /// Number of blobs currently in-flight. + count: u64, +} + +impl Metric for InFlightBlobCountMetric { + fn measurement_name(&self) -> &'static str { + "sov_rollup_num_of_in_flight_blobs" + } + + fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { + write!( + buffer, + "{} num_of_in_flight_blobs={}i", + self.measurement_name(), + self.count, + ) + } +} + +pub(super) fn track_num_of_in_flight_blobs(count: u64) { + sov_metrics::track_metrics(|tracker| { + tracker.submit(InFlightBlobCountMetric { count }); + }); +} + +/// InfluxDB line protocol requires at least one field per point; markers have no payload +/// of their own, so we emit a constant marker field. +const MARKER_FIELD: &str = "marker=1i"; + +#[derive(Debug)] +struct BlobScopeMarker { + measurement_name: &'static str, +} + +impl Metric for BlobScopeMarker { + fn measurement_name(&self) -> &'static str { + self.measurement_name + } + + fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { + write!(buffer, "{} {MARKER_FIELD}", self.measurement_name()) + } +} + +pub(super) fn submit_blobs_enter_scope_marker(tracker: &sov_metrics::MetricsTracker) { + tracker.submit(BlobScopeMarker { + measurement_name: "sov_rollup_blobs_enter_scope", + }); +} + +pub(super) fn submit_blobs_exit_scope_marker(tracker: &sov_metrics::MetricsTracker) { + tracker.submit(BlobScopeMarker { + measurement_name: "sov_rollup_blobs_exit_scope", + }); +} diff --git a/crates/full-node/sov-db/src/metrics/mod.rs b/crates/full-node/sov-db/src/metrics/mod.rs index 62741cd2dd..09c8051608 100644 --- a/crates/full-node/sov-db/src/metrics/mod.rs +++ b/crates/full-node/sov-db/src/metrics/mod.rs @@ -6,9 +6,23 @@ use crate::schema::types::slot_key::{SlotKey, SlotValue}; pub 
mod nomt; +/// Shape of the state writes performed during one slot's materialization, split into +/// user-space (module state) and kernel-space (kernel state). Emitted as `sov_state_db_materialization`. +/// +/// **What healthy looks like:** all fields scale roughly with transaction volume; `max_*` sizes +/// are stable across slots. +/// +/// **Diagnostic signals:** +/// - `max_value_size` suddenly jumping → a module is writing a large blob to state (potential +/// unbounded-growth bug; find the module via slot replay). +/// - `cumulative_values_size` climbing without matching transaction volume → state bloat. +/// - `kernel_items` spiking while user workload is flat → kernel-level anomaly worth tracing. +/// +/// **Correlate with:** `sov_rollup_slot_execution_time_us` (large materializations slow slot +/// processing) and `sov_nomt_commit_detailed` (downstream commit cost). #[derive(Debug)] pub struct StateMaterializationMetrics { - /// How many key-value items have been materialized for user space + /// How many key-value items have been materialized for user space. pub user_items: usize, /// How many key-value items have been materialized for kernel space. pub kernel_items: usize, @@ -56,7 +70,7 @@ impl StateMaterializationMetrics { impl Metric for StateMaterializationMetrics { fn measurement_name(&self) -> &'static str { - "state_db_materialization" + "sov_state_db_materialization" } fn serialize_for_telegraf(&self, buffer: &mut Vec) -> std::io::Result<()> { diff --git a/crates/full-node/sov-db/src/metrics/nomt.rs b/crates/full-node/sov-db/src/metrics/nomt.rs index 19bfe5bb97..483afcfe59 100644 --- a/crates/full-node/sov-db/src/metrics/nomt.rs +++ b/crates/full-node/sov-db/src/metrics/nomt.rs @@ -5,8 +5,25 @@ use nomt::Nomt; use sov_metrics::Metric; use sov_rollup_interface::reexports::digest; +/// Snapshot of NOMT hash-table occupancy and page-cache effectiveness for one database. +/// The `db` tag disambiguates instances (user / kernel / ledger). 
Emitted as `sov_nomt_db_stats`. +/// +/// **What healthy looks like:** `hash_table_occupied / hash_table_capacity < 0.9`, page cache miss +/// ratio low and steady, page/value fetch times flat. +/// +/// **Diagnostic signals:** +/// - Occupancy > 0.9 → NOMT emits a warning log; hash collisions start to degrade lookups +/// and inserts. Remediation: resync the database with a larger `hash_table_capacity`. +/// - `page_cache_misses / page_requests` rising → working set has outgrown the page cache. +/// Remediation: raise the NOMT page-cache size or add RAM. +/// - `avg_page_fetch_time_ns` spiking while miss ratio is flat → underlying disk is saturated +/// (compare with OS-level I/O metrics and other DBs' NOMT stats). +/// +/// **Correlate with:** `sov_nomt_commit_detailed` (slow commits often trace back here) and +/// `sov_storage_manager_finalization` (finalization commit_time). #[derive(Debug)] pub struct NomtDbMetric { + /// Logical name of the NOMT instance (user/kernel/ledger); used as the InfluxDB `db` tag. pub db: &'static str, pub hash_table_capacity: usize, pub hash_table_occupied: usize, @@ -43,7 +60,7 @@ impl NomtDbMetric { impl Metric for NomtDbMetric { fn measurement_name(&self) -> &'static str { - "nomt_db_stats" + "sov_nomt_db_stats" } fn serialize_for_telegraf(&self, buffer: &mut Vec) -> std::io::Result<()> { @@ -69,16 +86,34 @@ impl Metric for NomtDbMetric { } } +/// Per-slot cost of opening a NOMT session and the unfinalized-overlay depth at that moment. +/// Emitted as `sov_nomt_begin_session`. +/// +/// **What healthy looks like:** `init_time` low and flat; `overlays` bounded by the finalization +/// window (i.e., stays near the configured `STATE_ROOT_DELAY_BLOCKS`). +/// +/// **Diagnostic signals:** +/// - `overlays` monotonically increasing → finalization is falling behind (the node cannot +/// promote unfinalized state to disk fast enough). Check `sov_rollup_runner_da.sync_distance` +/// and `sov_runner_process_stf_changes`. 
+/// - `init_time` spiking with stable `overlays` → storage-engine contention at session start +/// (often correlated with compaction or heavy commits on the same DB). +/// +/// **Correlate with:** `sov_storage_manager_finalization` (finalization latency drives overlays), +/// `sov_nomt_commit_detailed` (commits can block session starts on the same DB). #[derive(Debug)] pub struct NomtBeginSessionMetric { + /// Logical name of the NOMT instance (user/kernel/ledger); InfluxDB `db` tag. pub db: &'static str, + /// Number of unfinalized overlays stacked on top of the on-disk state when the session opened. + /// This is the "lag to finalization" in slots. pub overlays: usize, pub init_time: std::time::Duration, } impl Metric for NomtBeginSessionMetric { fn measurement_name(&self) -> &'static str { - "nomt_begin_session" + "sov_nomt_begin_session" } fn serialize_for_telegraf(&self, buffer: &mut Vec) -> std::io::Result<()> { @@ -94,16 +129,29 @@ impl Metric for NomtBeginSessionMetric { } } +/// Wall-clock breakdown of finalizing one slot in the storage manager (promoting the oldest +/// overlay to on-disk state). Emitted as `sov_storage_manager_finalization`. +/// +/// **What healthy looks like:** all three fields sub-second and stable from slot to slot. +/// +/// **Diagnostic signals:** +/// - `commit_time` dominates slot time → storage is the rollup's bottleneck. Investigate +/// `sov_nomt_db_stats` (disk/page-cache pressure) and `sov_nomt_commit_detailed`. +/// - `pruning_commit_time` is `Some` and consistently large → pruner backlog; cross-check +/// `sov_db_pruner` throughput and the rollup's retention configuration. +/// - `preparation_time` rising → large overlays are being materialized; see +/// `sov_nomt_begin_session.overlays` and `sov_state_db_materialization`. 
#[derive(Debug)] pub struct StorageManagerFinalizationMetric { pub preparation_time: std::time::Duration, pub commit_time: std::time::Duration, + /// `None` when pruning did not run this slot; `Some(_)` otherwise. pub pruning_commit_time: Option<std::time::Duration>, } impl Metric for StorageManagerFinalizationMetric { fn measurement_name(&self) -> &'static str { - "storage_manager_finalization" + "sov_storage_manager_finalization" } fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { @@ -127,8 +175,25 @@ } } +/// Throughput and efficiency of one pruner pass over a NOMT database. Emitted as `sov_db_pruner`. +/// The `db` tag distinguishes which database is being pruned. +/// +/// **What healthy looks like:** `time` roughly linear in `keys_inspected`; a non-trivial +/// `keys_to_prune / keys_inspected` ratio (the pruner finds work on every pass). +/// +/// **Diagnostic signals:** +/// - Sustained `keys_to_prune ≈ 0` with non-zero `keys_inspected` → pruner is scanning but +/// finding nothing: retention config may be wrong, or nothing is eligible for pruning yet. +/// - `keys_inspected` flat while `time` spikes → disk bottleneck on this DB (check +/// `sov_nomt_db_stats` page fetch times). +/// - No emissions at all over long windows → the pruner task may be stuck; confirm the +/// pruner background task is still alive. +/// +/// **Correlate with:** `sov_storage_manager_finalization.pruning_commit_time` (the commit +/// cost paired with each inspection pass). #[derive(Debug)] pub struct PrunerMetric { + /// Logical name of the NOMT instance being pruned; InfluxDB `db` tag. 
pub db: &'static str, pub keys_inspected: usize, pub keys_to_prune: usize, @@ -137,7 +202,7 @@ pub struct PrunerMetric { impl Metric for PrunerMetric { fn measurement_name(&self) -> &'static str { - "pruner" + "sov_db_pruner" } fn serialize_for_telegraf(&self, buffer: &mut Vec) -> std::io::Result<()> { diff --git a/crates/full-node/sov-metrics/README.md b/crates/full-node/sov-metrics/README.md index d3f3a710a4..31f3b88ba0 100644 --- a/crates/full-node/sov-metrics/README.md +++ b/crates/full-node/sov-metrics/README.md @@ -133,3 +133,143 @@ fn some_expensive_operation(input: u64) -> u64 { - The `track_metrics` function records all metrics during the function's execution. - Timestamping is handled automatically when metrics are being tracked. + +## Metric inventory + +All measurement names emitted by the SDK are listed below. Every name is prefixed with +`sov_` for discoverability in InfluxDB / Grafana. New metrics should follow the same +convention; prefer the `sov_rollup_`, `sov_nomt_`, `sov_db_`, `sov_evm_`, `sov_sequencer_`, +`sov_hyperlane_`, `sov_proxy_`, or `sov_celestia_adapter_` namespaces that match the +emitting subsystem. + +The "What to look for" column is aimed at both human operators and autonomous agents +diagnosing a running rollup: it names the dominant diagnostic signal in each metric and +points at the metric's usual partners when the signal is ambiguous. Numeric thresholds +are rules of thumb, not hard limits. + +### Runner / STF + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_rollup_runner_da` | gauge | `sov-metrics/src/influxdb/tracker.rs` | `sync_distance` trending up → node falling behind DA tip; `get_block_time_ms` spiking → DA RPC slow or unreachable. | +| `sov_rollup_runner_counts` | counter | `sov-metrics/src/influxdb/tracker.rs` | Rate of `batches`, `transactions`, `proofs_processed` per DA height. Sustained zeros on a public network = sequencer starvation or DA extraction bug. 
| +| `sov_rollup_runner_times_us` | timer | `sov-metrics/src/influxdb/tracker.rs` | Which stage dominates: `apply_slot`, `stf_transition`, `extract_blobs`, `processing_changes`. The bottleneck identifies which subsystem to investigate next. | +| `sov_runner_process_stf_changes` | timer | `sov-metrics/src/influxdb/tracker.rs` | Post-STF pipeline (finalization, ledger materialization, prover dispatch, API push). `sending_stf_to_prover_time` high → prover queue/IO backlog. | +| `sov_rollup_transaction_execution_us` | timer | `sov-metrics/src/influxdb/tracker.rs` | Per-tx latency tagged by `status`/`context`/`call_message`/`sequencer`. Group by tag to find expensive message types or misbehaving sequencers. ⚠️ See cardinality note below. | +| `sov_rollup_slot_execution_time_us` | timer | `sov-metrics/src/influxdb/tracker.rs` | End-to-end slot latency; the top-level SLO for rollup throughput. If this grows, drill into `sov_rollup_runner_times_us` + `sov_storage_manager_finalization`. | +| `sov_rollup_batch_processing` | timer + counter | `sov-metrics/src/influxdb/tracker.rs` | Per-batch cost and tx count; use together to compute per-tx cost trends inside a batch. | +| `sov_rollup_auth_and_process_metrics` | timer | `sov-metrics/src/influx_db_nonnative.rs` | Authentication + processing cost (emitted in non-native / ZK-guest code paths). Budgets ZK cycle consumption. | + +### HTTP / RPC surface + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_rollup_rpc_handlers` | timer + status | `sov-metrics/src/influxdb/tracker.rs` | Slow RPC methods (tag `request_name`) and error rates (tag `status`). Sudden error bursts with low latency = validation failures; slow + errors = downstream dependency. | +| `sov_rollup_http_handlers` | timer + status | `sov-metrics/src/influxdb/tracker.rs` | Same pattern for REST. ⚠️ The `path` tag currently uses raw `request_uri.path()`; paths containing IDs explode cardinality. Normalization is tracked in PR #2753. 
| + +### ZK VM + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_rollup_zkvm` | cycles + memory | `sov-metrics/src/influxdb/tracker.rs` | Cycles / heap usage per named call site in the guest. Use to find hot functions blowing up circuit size. `name` is the only tag (bounded — one per `#[cycle_tracker]`-annotated function); caller-supplied `metadata` is emitted as string fields, so it stays cardinality-safe even when call sites pass hashes or heights. | +| `sov_rollup_zkvm_proving` | timer | `sov-metrics/src/influxdb/tracker.rs` | Proof generation wall-clock per `circuit`. Success/failure ratio via `is_success`. | +| `sov_nomt_prover_compute_state` | counter | `sov-state/src/nomt/prover_storage.rs` | Prover-side state computations, split by `with_witness`. | + +### Runtime / infrastructure + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_rollup_tokio_runtime` | runtime | `sov-metrics/src/influxdb/tracker.rs` | Tokio worker saturation and scheduling. Busy workers pegged near total workers = runtime is CPU-bound; high poll latencies = blocking code on async thread. | +| `sov_rollup_dropped_metrics` | counter | `sov-metrics/src/influxdb/tracker.rs` | **Any non-zero sustained value means the metrics pipeline is backpressured** — Telegraf or the publisher task can't keep up. Other metrics become unreliable until this is zero again. | +| `sov_rollup_rate_limiter` | timer + counter | `sov-metrics/src/influxdb/tracker.rs` | Rate limiter hits by `limiter_type`. Non-zero = clients throttled; cross-check against `sov_rollup_rpc_handlers` / `sov_rollup_http_handlers` 429 responses. | +| `sov_rollup_gas_constant` | counter | `sov-metrics/src/influxdb/gas_constant_estimation.rs` | Empirical gas cost samples keyed by `name`/`constant` (both bounded tags). Only emitted with the `gas-constant-estimation` feature. 
Caller-supplied `metadata` is emitted as string fields, so it stays cardinality-safe even when call sites pass hashes or heights. | + +### Module implementations + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_evm_tx` | timer | `sov-evm/src/metrics.rs` | EVM tx timing breakdown (`fetch_state`, `execution`, `state_commit`, `receipt`, `get_head`). Which stage dominates identifies whether the bottleneck is revm, state I/O, or commit. | +| `sov_evm_db_metrics` | timer + counter | `sov-evm/src/db/metrics.rs` | revm DB access counts and durations per access type (`account`, `code`, `storage`, `block_hash`). High `storage_count` = tx is doing many slot reads. | +| `sov_rollup_value_setter` | timer | `sov-synthetic-load/src/metrics.rs` | Only emitted in synthetic-load benchmarks (`sov-synthetic-load`); tag `context` distinguishes the workload shape. Ignore in production. | +| `sov_hyperlane_rate_limiter_capacity` | gauge | `hyperlane/src/warp/metrics.rs` | Current and max rate-limiter capacity by route, remote domain, and direction. Watch `current_capacity` near zero for throttled bridge traffic. | + +### Storage (sov-db / NOMT) + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_state_db_materialization` | counter + size | `sov-db/src/metrics/mod.rs` | `max_value_size` jumping = a module wrote a large blob; `cumulative_values_size` climbing without matching tx volume = state bloat. Correlate with `sov_rollup_slot_execution_time_us`. | +| `sov_nomt_db_stats` | cache | `sov-db/src/metrics/nomt.rs` | `hash_table_occupied / hash_table_capacity > 0.9` → NOMT warns + perf degrades, need resync with larger capacity. `page_cache_misses / page_requests` high → insufficient RAM/cache. | +| `sov_nomt_begin_session` | timer | `sov-db/src/metrics/nomt.rs` | `overlays` climbing = finalization is lagging. `init_time` spiking on its own = storage engine contention. Tag `db` splits by instance. 
| +| `sov_storage_manager_finalization` | timer | `sov-db/src/metrics/nomt.rs` | `commit_time` dominating slot cost = storage is the bottleneck. `pruning_commit_time` repeatedly large = pruner backlog (see `sov_db_pruner`). | +| `sov_db_pruner` | counter + timer | `sov-db/src/metrics/nomt.rs` | `keys_to_prune` near zero while `keys_inspected` is non-zero = wasted scans (retention misconfigured?). No emissions over long windows = pruner task may be stuck. | +| `sov_nomt_commit_detailed` | timer | `sov-db/src/metrics/nomt.rs` | Breaks NOMT commit into write_user / write_kernel / flat / accessory / ledger phases. Use to attribute slow `sov_storage_manager_finalization.commit_time` to a specific phase. | + +### Sequencer (preferred role) + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_rollup_current_sequence_number` | gauge | `sov-sequencer/src/metrics.rs` | Must increase monotonically. Flat for extended periods while the rollup is live = sequencer stalled. | +| `sov_rollup_in_progress_batch_size` | gauge | `sov-sequencer/src/metrics.rs` | Growing unboundedly = sequencer is accumulating but not producing batches; cross-check `sov_rollup_preferred_sequencer_channel`. | +| `sov_rollup_preferred_sequencer_update_state` | timer + counter | `sov-sequencer/src/metrics.rs` | Main state-update loop cost. `total_message_processing_duration` high = event flood or slow event handlers (see `sov_rollup_preferred_sequencer_executor_event`). | +| `sov_rollup_preferred_sequencer_channel` | timer | `sov-sequencer/src/metrics.rs` | Blocking time on the channel send, tagged by `reason`. High = downstream consumer is slow; consumer identified by the reason tag. | +| `sov_rollup_preferred_sequencer_executor_event` | timer | `sov-sequencer/src/metrics.rs` | Per-event-type handling time; tag `event_type` is low-cardinality so safe to group by. 
| +| `sov_rollup_preferred_sequencer_fetch_batches_to_replay` | timer + counter | `sov-sequencer/src/metrics.rs` | Replay fetch on rebase/restart. Spikes here map to rebase windows; see `STATE_ROOT_DELAY_BLOCKS`. | +| `sov_rollup_preferred_sequencer_slot_numbers` | gauge | `sov-sequencer/src/metrics.rs` | Four slot-number views (`true`, `latest_finalized`, `node_visible`, `seq_visible`). Divergence between `seq_visible` and `node_visible` → sequencer and node disagree on visibility. | +| `sov_rollup_preferred_sequencer_prune` | timer | `sov-sequencer/src/metrics.rs` | Internal sequencer prune (different from DB pruner). Large values here don't directly affect txs but grow memory. | +| `sov_rollup_preferred_sequencer_executor_event_sending` | timer | `sov-sequencer/src/metrics.rs` | Send-side blocking for executor events. `blocked_for_us` non-zero = executor is backpressuring the sequencer. | +| `sov_rollup_nonce_buffer_main_queue_blocked` | timer | `sov-sequencer/src/metrics.rs` | **Only emitted when send blocked** (`blocked_for_us > 0`); presence of this metric = main queue capacity pressure. | +| `sov_rollup_nonce_buffer_main_queue_depth` | gauge | `sov-sequencer/src/metrics.rs` | Instantaneous main queue depth. Compare against configured capacity to spot near-full saturation. | +| `sov_rollup_nonce_buffer_timeout_queue` | timer + gauge | `sov-sequencer/src/metrics.rs` | Timeout queue activity (txs parked waiting for their nonce to become current). Deep queue = out-of-order nonces from clients. | +| `sov_rollup_sequence_number_delta` | gauge | `sov-sequencer/src/metrics.rs` | Gap between expected and observed sequence numbers. Non-zero briefly is normal during rebase; sustained non-zero = desync. | +| `sov_sequencer_cache_warmup_metrics` | gauge | `sov-sequencer/src/preferred/cache_warm_up_executor.rs` | Tx-channel size seen by the cache warm-up executor; use as a sanity check that warm-up is receiving work. 
| + +### Blob sender + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_rollup_in_flight_blobs_snapshot` | snapshot | `sov-blob-sender/src/metrics.rs` | Per-blob lifecycle snapshot. `duration_ms` high with stable `last_known_state` = blob stuck in that state (DA submission hanging or resurrection loop). | +| `sov_rollup_num_of_in_flight_blobs` | gauge | `sov-blob-sender/src/metrics.rs` | Growing unboundedly = DA submission is not keeping up with blob production. | +| `sov_rollup_blobs_enter_scope` | counter | `sov-blob-sender/src/metrics.rs` | Rate of new blobs being handed to the sender. Compare with `exit_scope` for throughput balance. | +| `sov_rollup_blobs_exit_scope` | counter | `sov-blob-sender/src/metrics.rs` | Rate of blobs leaving the sender (success or drop). `enter - exit` over a window ≈ backlog growth. | + +### Celestia adapter + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_celestia_adapter_header_get_by_height` | timer + status | `celestia/src/metrics/client.rs` | Celestia header fetch RPC; `is_success=false` = node down or misconfigured. Elevated latency = bridge/RPC slow. | +| `sov_celestia_adapter_header_network_head` | timer + status | `celestia/src/metrics/client.rs` | Polling for network head; similar failure-mode semantics as `header_get_by_height`. | +| `sov_celestia_adapter_share_get_namespace_data` | timer + status | `celestia/src/metrics/client.rs` | Data-share retrieval per namespace. Failures here often surface upstream as `sov_rollup_runner_da` gaps. | +| `sov_celestia_adapter_state_submit_pay_for_blob` | timer + status | `celestia/src/metrics/client.rs` | PFB submission to Celestia; failures block DA posting entirely. Pair with `sov_rollup_in_flight_blobs_snapshot` to confirm blobs are stuck here vs. elsewhere. 
| +| `sov_celestia_adapter_blob_get_all` | timer + status | `celestia/src/metrics/client.rs` | Blob lookup RPC; failures mean submitted blobs cannot be fetched back from Celestia. | +| `sov_celestia_adapter_state_balance_for_address` | timer + status | `celestia/src/metrics/client.rs` | Balance lookup for the Celestia account; failures block balance-aware health checks and funding diagnostics. | +| `sov_celestia_adapter_header_sync_state` | timer + status | `celestia/src/metrics/client.rs` | Header sync-state RPC; failures or high latency make DA-head visibility unreliable. | +| `sov_celestia_adapter_state_estimate_gas_price` | timer + status | `celestia/src/metrics/client.rs` | Gas-price estimation RPC; failures can prevent cost-aware PFB submission. | +| `sov_celestia_adapter_get_block` | timer | `celestia/src/metrics/full.rs` | Block-level fetch latency (full-node path). `height` and `square_width` are fields (not tags); `square_width` is a useful indicator of on-chain activity. | +| `sov_celestia_adapter_submit_blob` | timer | `celestia/src/metrics/full.rs` | Full-node blob submission path; tag `namespace` is low-cardinality. | +| `sov_celestia_adapter_periodic_data` | gauge | `celestia/src/metrics/full.rs` | Periodic adapter health snapshot: balance, gas price, and sync distance. Low balance or growing sync distance points to operator intervention. | + +### Proxy utilities + +| Name | Kind | Defined in | What to look for | +|---|---|---|---| +| `sov_proxy_latest_height_check` | gauge | `sov-proxy-utils/src/node_check_metric.rs` | Cross-node latest-height health: `nodes_failed` non-zero or `height_diff` widening means the proxy pool is inconsistent. | +| `sov_proxy_root_hash_check` | gauge | `sov-proxy-utils/src/node_check_metric.rs` | Cross-node state-root consistency at a slot. `unique_state_roots > 1` is a consensus-critical disagreement signal. 
| +| `sov_proxy_cluster_update_failure` | counter | `sov-proxy-utils/src/node_discovery_metrics.rs` | Node-discovery refresh failures by `stage`; sustained increments mean proxy membership is stale. | + +### Known cardinality caveats + +Grafana / Flux queries that group by high-cardinality tags can blow out InfluxDB memory. +Known cases today: + +- `sov_rollup_http_handlers` — `path` is the raw `request_uri.path()`; paths containing + IDs (`/blocks/12345`, `/tx/0xabc…`) create one series per ID. Normalization is tracked + in PR #2753. +- `sov_hyperlane_rate_limiter_capacity` — series count is `monitored_routes × enrolled_domains × 2`. + Cardinality is operator-controlled via `WarpExecutionConfig.monitored_route_ids`; a large + monitored list × many enrolled destinations can still pressure InfluxDB. Prefer enumerating + only the routes you actively care about. + +Note: `sov_rollup_zkvm` and `sov_rollup_gas_constant` used to carry caller-supplied +`metadata` as tags. Those are now emitted as string fields instead, so enabling the +`bench` or `gas-constant-estimation` features against a real InfluxDB no longer risks +series explosion. diff --git a/crates/full-node/sov-metrics/src/influxdb/gas_constant_estimation.rs b/crates/full-node/sov-metrics/src/influxdb/gas_constant_estimation.rs index 84afc76aa1..42ba4bedbb 100644 --- a/crates/full-node/sov-metrics/src/influxdb/gas_constant_estimation.rs +++ b/crates/full-node/sov-metrics/src/influxdb/gas_constant_estimation.rs @@ -4,7 +4,7 @@ use std::io::{self, Write}; use tokio::task_local; -use crate::influxdb::safe_telegraf_string; +use crate::influxdb::write_metadata_fields_for_telegraf; use crate::{timestamp, Metric, MetricsTracker}; task_local! { @@ -55,17 +55,15 @@ impl GasConstantTracker { #[derive(Debug)] pub struct GasConstantMetric { - /// Name of the caller site, usually a function or method + /// Name of the caller site, usually a function or method. Emitted as the `name` tag. 
pub name: String, - /// The gas constant tracked + /// The gas constant being tracked. Emitted as the `constant` tag. pub constant: String, - /// A numerical value representing the number of invocations of the gas constant + /// Number of invocations of the gas constant within the caller site. pub num_invocations: i64, - /// Additional metadata to be included in the metrics. The metadata is added as a - /// measurement attribute according to the [influxdb line protocol](https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/) - /// We are parsing the metadata in the `tag_key=tag_value` format of influxdb. - /// This can be used to filter metrics data in telegraf, by querying metrics for some - /// specific metadata. + /// Arbitrary key/value metadata captured from the caller's arguments. + /// Emitted as string **fields** (not tags) so unbounded values like hashes or heights + /// do not cause series-cardinality explosion in InfluxDB. pub metadata: Vec<(String, String)>, } @@ -99,33 +97,18 @@ impl Metric for GasConstantMetric { } fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { + // `name` and `constant` are the only tags (both bounded — one value per annotated + // function and per declared gas constant). `metadata` is emitted as string *fields* + // rather than tags because callers pass unbounded values (hashes, heights, tx ids) + // and using those as tags would explode InfluxDB series cardinality.
write!( buffer, - "{},name={},constant={}", + "{},name={},constant={} num_invocations={}", self.measurement_name(), self.name, self.constant, + self.num_invocations, )?; - - let parsed_metadata = self - .metadata - .iter() - .map(|(key, value)| { - // Replace spaces with underscores to make them compatible with telegraf - // Source: (Special telegraf characters)[`https://docs.influxdata.com/influxdb/cloud/reference/syntax/line-protocol/#special-characters`] - let telegraf_formatted_key = safe_telegraf_string(key); - - format!("{telegraf_formatted_key}={value}") - }) - .collect::<Vec<String>>(); - - if !parsed_metadata.is_empty() { - // We are adding the metadata as measurement tags in the influxdb line protocol. - write!(buffer, ",{}", parsed_metadata.join(","))?; - } - - // Now actual value. Note, leading space is important. - write!(buffer, " num_invocations={}", self.num_invocations)?; - Ok(()) + write_metadata_fields_for_telegraf(buffer, &self.metadata) } } diff --git a/crates/full-node/sov-metrics/src/influxdb/mod.rs b/crates/full-node/sov-metrics/src/influxdb/mod.rs index bf69886750..172856481c 100644 --- a/crates/full-node/sov-metrics/src/influxdb/mod.rs +++ b/crates/full-node/sov-metrics/src/influxdb/mod.rs @@ -168,6 +168,36 @@ pub fn safe_telegraf_string(string: &str) -> String { replace_chars(string, &TELEGRAF_ESCAPED_CHARS) } +/// Writes an InfluxDB line-protocol string-field *value* into `buffer`, escaping `\` +/// and `"` in a single pass. Does not emit the surrounding quotes; callers own the `"..."` +/// wrapping. Allocation-free in the common case (no `\` or `"` in the input).
+pub fn write_escaped_field_value(buffer: &mut Vec<u8>, value: &str) -> std::io::Result<()> { + for chunk in value.split_inclusive(['\\', '"']) { + let (head, trailer) = match chunk.as_bytes().last() { + Some(&b'\\') => (&chunk[..chunk.len() - 1], &b"\\\\"[..]), + Some(&b'"') => (&chunk[..chunk.len() - 1], &b"\\\""[..]), + _ => (chunk, &b""[..]), + }; + buffer.write_all(head.as_bytes())?; + buffer.write_all(trailer)?; + } + Ok(()) +} + +pub(crate) fn write_metadata_fields_for_telegraf( + buffer: &mut Vec<u8>, + metadata: &[(String, String)], +) -> std::io::Result<()> { + for (key, value) in metadata { + let safe_key = safe_telegraf_string(key); + write!(buffer, ",{safe_key}=\"")?; + write_escaped_field_value(buffer, value)?; + buffer.write_all(b"\"")?; + } + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; @@ -180,6 +210,39 @@ use std::str::FromStr; use tokio::sync::watch; + #[test] + fn escaped_field_value_preserves_plain_input() { + let mut buffer = Vec::new(); + write_escaped_field_value(&mut buffer, "plain utf-8 – no escape").unwrap(); + assert_eq!( + std::str::from_utf8(&buffer).unwrap(), + "plain utf-8 – no escape" + ); + } + + #[test] + fn escaped_field_value_escapes_backslash_and_quote() { + let mut buffer = Vec::new(); + write_escaped_field_value(&mut buffer, r#"a\b"c"\"#).unwrap(); + assert_eq!(std::str::from_utf8(&buffer).unwrap(), r#"a\\b\"c\"\\"#); + } + + #[test] + fn metadata_fields_are_written_as_escaped_string_fields() { + let mut buffer = b"my_metric,tag=value count=1".to_vec(); + let metadata = vec![ + ("block height".to_string(), "10".to_string()), + ("path".to_string(), r#"a\b"c"#.to_string()), + ]; + + write_metadata_fields_for_telegraf(&mut buffer, &metadata).unwrap(); + + assert_eq!( + std::str::from_utf8(&buffer).unwrap(), + r#"my_metric,tag=value count=1,block\ height="10",path="a\\b\"c""# + ); + } + /// Starts publisher tasks and checks that tracker pushes all required metrics #[tokio::test(flavor = "multi_thread")] async fn 
test_runner_metrics_published() -> anyhow::Result<()> { diff --git a/crates/full-node/sov-metrics/src/influxdb/tracker.rs b/crates/full-node/sov-metrics/src/influxdb/tracker.rs index 00244a774c..e8e32ae9d2 100644 --- a/crates/full-node/sov-metrics/src/influxdb/tracker.rs +++ b/crates/full-node/sov-metrics/src/influxdb/tracker.rs @@ -6,8 +6,8 @@ use std::sync::OnceLock; use crate::influxdb::KnownMetric; use crate::influxdb::{ - publisher, safe_telegraf_string, Metric, SubmittableMetric, SubmittableMetricKind, - DROPPED_METRICS_COUNT, + publisher, safe_telegraf_string, write_metadata_fields_for_telegraf, Metric, SubmittableMetric, + SubmittableMetricKind, DROPPED_METRICS_COUNT, }; use crate::{MetricsTracker, MonitoringConfig}; @@ -40,6 +40,11 @@ pub fn init_metrics_tracker( impl MetricsTracker { /// Quick way to submit a string metric without dealing with [`Metric`]. + /// + /// Prefer defining a dedicated struct that implements [`Metric`] and calling + /// [`MetricsTracker::submit`] — this keeps the wire format reviewable and avoids the + /// per-call allocation that `ToString` forces. `submit_inline` is retained for + /// external SDK users whose downstream code still depends on it. pub fn submit_inline(&self, measurement: &'static str, rest: impl ToString) { #[derive(Debug)] struct InlineMetric(&'static str, String); @@ -613,9 +618,11 @@ impl KnownMetric for HttpMetrics { /// Representation of cycle count and free heap for a particular chunk of execution inside ZK VM guest. #[derive(Debug)] pub struct ZkVmExecutionChunk { - /// Name of the caller site, usually a function or method + /// Name of the caller site, usually a function or method. Emitted as the `name` tag. pub name: String, - /// Metadata associated with the metric. Usually input values collected from the caller function + /// Arbitrary key/value metadata captured from the caller's arguments. 
+ /// Emitted as string **fields** (not tags) so unbounded values like hashes or heights + /// do not cause series-cardinality explosion in InfluxDB. pub metadata: Vec<(String, String)>, /// A number of ZKVM cycles have been spent on this call. pub cycles_count: u64, @@ -652,28 +659,20 @@ } fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { - // We are adding the metadata as measurmement tags in the influxdb line protocol. - let metadata = self - .metadata - .iter() - .map(|(key, value)| { - // Uses special telegraf formatting - let telegraf_formatted_key = safe_telegraf_string(key); - - format!("{telegraf_formatted_key}={value}") - }) - .collect::<Vec<String>>() - .join(","); - + // `name` is the only tag (bounded — one value per `#[cycle_tracker]`-annotated + // function identifier). `metadata` is emitted as string *fields* rather than tags + // because callers pass unbounded values (hashes, heights, tx ids) and using those + // as tags would explode InfluxDB series cardinality.
write!( buffer, - "{},name={}{metadata} cycles_count={},free_heap_bytes={},memory_used={}", + "{},name={} cycles_count={},free_heap_bytes={},memory_used={}", self.measurement_name(), self.name, self.cycles_count, self.free_heap_bytes, self.memory_used - ) + )?; + write_metadata_fields_for_telegraf(buffer, &self.metadata) } } diff --git a/crates/full-node/sov-metrics/src/lib.rs b/crates/full-node/sov-metrics/src/lib.rs index ebfaf0f8bd..f66d57697b 100644 --- a/crates/full-node/sov-metrics/src/lib.rs +++ b/crates/full-node/sov-metrics/src/lib.rs @@ -15,8 +15,8 @@ pub use influx_db_nonnative::{ #[cfg(feature = "native")] pub use influxdb::{ init_metrics_tracker, safe_telegraf_string, spawn_tokio_runtime_metrics_task, timestamp, - track_metrics, BatchMetrics, BatchOutcome, HttpMetrics, Metric, MetricsTracker, - MonitoringConfig, RateLimiterMetrics, RpcMetrics, RunnerMetrics, + track_metrics, write_escaped_field_value, BatchMetrics, BatchOutcome, HttpMetrics, Metric, + MetricsTracker, MonitoringConfig, RateLimiterMetrics, RpcMetrics, RunnerMetrics, RunnerProcessStfChangesMetrics, SlotProcessingMetrics, TelegrafSocketConfig, TransactionEffect, TransactionProcessingMetrics, UserSpaceSlotProcessingMetrics, ZkCircuit, ZkProvingTime, ZkVmExecutionChunk, diff --git a/crates/full-node/sov-sequencer/src/metrics.rs b/crates/full-node/sov-sequencer/src/metrics.rs index da32361591..b47da95d4d 100644 --- a/crates/full-node/sov-sequencer/src/metrics.rs +++ b/crates/full-node/sov-sequencer/src/metrics.rs @@ -2,21 +2,90 @@ use std::io::Write; use sov_metrics::Metric; +/// Gauge of the latest sequence number produced by the sequencer. +/// Emitted as `sov_rollup_current_sequence_number`. Should increase monotonically; +/// flatlining while the rollup is live indicates the sequencer is stalled. +#[derive(Debug)] +struct CurrentSequenceNumberMetric { + /// Latest sequence number emitted. 
+ sequence_number: u64, +} + +impl Metric for CurrentSequenceNumberMetric { + fn measurement_name(&self) -> &'static str { + "sov_rollup_current_sequence_number" + } + + fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { + write!( + buffer, + "{} current_sequence_number={}", + self.measurement_name(), + self.sequence_number, + ) + } +} + +/// Gauge of the transaction count in the currently-open batch. +/// Emitted as `sov_rollup_in_progress_batch_size`. Growing unboundedly means the sequencer +/// is accumulating txs but not closing the batch; cross-check `sov_rollup_preferred_sequencer_channel`. +#[derive(Debug)] +struct InProgressBatchSizeMetric { + /// Number of transactions in the currently-open batch. + num_txs: u64, +} + +impl Metric for InProgressBatchSizeMetric { + fn measurement_name(&self) -> &'static str { + "sov_rollup_in_progress_batch_size" + } + + fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { + write!( + buffer, + "{} num_txs={}", + self.measurement_name(), + self.num_txs, + ) + } +} + pub fn track_sequence_number(sequence_number: u64) { sov_metrics::track_metrics(|tracker| { - tracker.submit_inline( - "sov_rollup_current_sequence_number", - format!("current_sequence_number={sequence_number}"), - ); + tracker.submit(CurrentSequenceNumberMetric { sequence_number }); }); } pub fn track_in_progress_batch_size(num_txs: u64) { sov_metrics::track_metrics(|tracker| { - tracker.submit_inline( - "sov_rollup_in_progress_batch_size", - format!("num_txs={num_txs}"), - ); + tracker.submit(InProgressBatchSizeMetric { num_txs }); + }); +} + +/// Gauge of the signed gap between the sequencer's next-unassigned sequence number and +/// what the node observes. Emitted as `sov_rollup_sequence_number_delta`. +/// +/// Brief non-zero values during rebase are normal; sustained non-zero = desync between +/// sequencer and node. +#[derive(Debug)] +struct SequenceNumberDeltaMetric { + /// `sequencer_next - node_next`.
Signed because either side can be ahead transiently. + delta: i64, +} + +impl Metric for SequenceNumberDeltaMetric { + fn measurement_name(&self) -> &'static str { + "sov_rollup_sequence_number_delta" + } + + fn serialize_for_telegraf(&self, buffer: &mut Vec<u8>) -> std::io::Result<()> { + write!(buffer, "{} delta={}i", self.measurement_name(), self.delta,) + } +} + +pub fn track_sequence_number_delta(delta: i64) { + sov_metrics::track_metrics(|tracker| { + tracker.submit(SequenceNumberDeltaMetric { delta }); }); } diff --git a/crates/full-node/sov-sequencer/src/preferred/sync_sequencer_state/inner.rs b/crates/full-node/sov-sequencer/src/preferred/sync_sequencer_state/inner.rs index cdecce7124..28e357d594 100644 --- a/crates/full-node/sov-sequencer/src/preferred/sync_sequencer_state/inner.rs +++ b/crates/full-node/sov-sequencer/src/preferred/sync_sequencer_state/inner.rs @@ -1,5 +1,6 @@ use crate::metrics::{ - track_sequence_number, PreferredSequencerChannelMetrics, PreferredSequencerChannelMetricsBatch, + track_sequence_number, track_sequence_number_delta, PreferredSequencerChannelMetrics, + PreferredSequencerChannelMetricsBatch, }; use crate::preferred::block_executor::{ AcceptedTxWithBudgetInfo, RollupBlockExecutor, RollupBlockExecutorError, @@ -225,15 +226,8 @@ where let next_sequence_number_according_to_node = get_next_sequence_number_according_to_node(latest_state_info, &mut runtime); - sov_metrics::track_metrics(|tracker| { - tracker.submit_inline( - "sov_rollup_sequence_number_delta", - format!( - "delta={}i", - (next_sequence_number as i64) - (next_sequence_number_according_to_node as i64) - ), - ); - }); + let delta = (next_sequence_number as i64) - (next_sequence_number_according_to_node as i64); + track_sequence_number_delta(delta); match latest_finalized_sequence_number(latest_state_info, &mut runtime) { Some(num) => { diff --git a/crates/module-system/sov-modules-macros/tests/integration/metrics.rs 
b/crates/module-system/sov-modules-macros/tests/integration/metrics.rs index 327b5c2c98..a37aa76441 100644 --- a/crates/module-system/sov-modules-macros/tests/integration/metrics.rs +++ b/crates/module-system/sov-modules-macros/tests/integration/metrics.rs @@ -57,7 +57,7 @@ async fn test_metrics_macro() { // We have one invocation of the metric here. let mut buf = [0; 1024]; - timeout( + let (bytes_received, _) = timeout( std::time::Duration::from_secs(10), channel.recv_from(&mut buf), ) @@ -65,12 +65,15 @@ async fn test_metrics_macro() { .expect("Timeout while waiting for the UDP channel to receive data") .unwrap(); - let mut parsed_buf = std::str::from_utf8(&buf[..]).unwrap().split(" "); + let received_metric = std::str::from_utf8(&buf[..bytes_received]).unwrap(); + let mut parsed_buf = received_metric.split(' '); assert_eq!( parsed_buf.next().unwrap(), - "sov_rollup_gas_constant,name=test_metrics,constant=test,input=10" + "sov_rollup_gas_constant,name=test_metrics,constant=test" ); - assert_eq!(parsed_buf.next().unwrap(), "num_invocations=1"); + let fields = parsed_buf.next().unwrap(); + assert!(fields.split(',').any(|field| field == "num_invocations=1")); + assert!(fields.split(',').any(|field| field == "input=\"10\"")); } #[track_gas_constants_usage] @@ -118,7 +121,7 @@ async fn test_metrics_macro_without_input() { // We have one invocation of the metric here. 
let mut buf = [0; 1024]; - timeout( + let (bytes_received, _) = timeout( std::time::Duration::from_secs(10), channel.recv_from(&mut buf), ) @@ -126,10 +129,12 @@ async fn test_metrics_macro_without_input() { .expect("Timeout while waiting for the UDP channel to receive data") .unwrap(); - let mut parsed_buf = std::str::from_utf8(&buf[..]).unwrap().split(" "); + let received_metric = std::str::from_utf8(&buf[..bytes_received]).unwrap(); + let mut parsed_buf = received_metric.split(' '); assert_eq!( parsed_buf.next().unwrap(), "sov_rollup_gas_constant,name=test_metrics_without_input,constant=test" ); - assert_eq!(parsed_buf.next().unwrap(), "num_invocations=1"); + let fields = parsed_buf.next().unwrap(); + assert!(fields.split(',').any(|field| field == "num_invocations=1")); }