diff --git a/src/Nethermind/Nethermind.Db.Test/DbTrackerTests.cs b/src/Nethermind/Nethermind.Db.Test/DbTrackerTests.cs index 0a5ced09b314..a49eb00b988b 100644 --- a/src/Nethermind/Nethermind.Db.Test/DbTrackerTests.cs +++ b/src/Nethermind/Nethermind.Db.Test/DbTrackerTests.cs @@ -13,6 +13,7 @@ using Nethermind.Core; using Nethermind.Core.Test; using Nethermind.Core.Test.Builders; +using Nethermind.Db.FullPruning; using Nethermind.Db.Rocks; using Nethermind.Db.Rocks.Config; using Nethermind.Init.Modules; @@ -26,6 +27,26 @@ namespace Nethermind.Db.Test; public class DbTrackerTests { + // Metric keys written by the tests in this fixture; TearDown removes them so test order does not cause flakiness. + private static readonly string[] TouchedMetricKeys = + { + "TestDb", "GoodDb", "ThrowingDb", "SkippedDb", "TrackedDb", "PrunedState", + }; + + [TearDown] + public void TearDown() + { + foreach (string key in TouchedMetricKeys) + { + ((IDictionary)Metrics.DbReads).Remove(key); + ((IDictionary)Metrics.DbWrites).Remove(key); + ((IDictionary)Metrics.DbSize).Remove(key); + ((IDictionary)Metrics.DbMemtableSize).Remove(key); + ((IDictionary)Metrics.DbBlockCacheSize).Remove(key); + ((IDictionary)Metrics.DbIndexFilterSize).Remove(key); + } + } + [Test] public void TestTrackOnlyCreatedDb() { @@ -77,6 +98,123 @@ public void TestUpdateDbMetric(bool isProcessing) Assert.That(Metrics.DbReads["TestDb"], isProcessing ? 
Is.EqualTo(0) : Is.EqualTo(10)); } + [Test] + public void TestSkipMetricsTracking() + { + using IContainer container = new ContainerBuilder() + .AddSingleton() + .AddSingleton(new DbConfig()) + .AddSingleton() + .AddSingleton(new MetricsConfig()) + .AddSingleton(LimboLogs.Instance) + .AddSingleton(NoopMonitoringService.Instance) + .AddDecorator() + .AddSingleton() + .Build(); + + IDbFactory dbFactory = container.Resolve(); + DbMonitoringModule.DbTracker tracker = container.Resolve(); + + DbSettings skipped = new("SkippedDb", "SkippedDb") { SkipMetricsTracking = true }; + DbSettings tracked = new("TrackedDb", "TrackedDb"); + + dbFactory.CreateDb(skipped); + dbFactory.CreateDb(tracked); + + List> entries = tracker.GetAllDbMeta().ToList(); + entries.Should().ContainSingle().Which.Key.Should().Be("TrackedDb"); + } + + [Parallelizable(ParallelScope.None)] + [Test] + public void ExceptionInGatherMetricDoesNotAbortOtherDbs() + { + IMonitoringService monitoringService = Substitute.For(); + Action updateAction = null!; + monitoringService + .When(m => m.AddMetricsUpdateAction(Arg.Any())) + .Do(c => updateAction = (Action)c[0]); + + IDbFactory fakeDbFactory = Substitute.For(); + + using IContainer container = new ContainerBuilder() + .AddSingleton() + .AddSingleton(new DbConfig()) + .AddSingleton() + .AddSingleton(new MetricsConfig()) + .AddSingleton(LimboLogs.Instance) + .AddSingleton(monitoringService) + .AddDecorator() + .AddSingleton(fakeDbFactory) + .Build(); + + ThrowingDb throwingDb = new(); + FakeDb goodDb = new(new IDbMeta.DbMetric { TotalReads = 42 }); + fakeDbFactory.CreateDb(Arg.Is(s => s.DbName == "ThrowingDb")).Returns(throwingDb); + fakeDbFactory.CreateDb(Arg.Is(s => s.DbName == "GoodDb")).Returns(goodDb); + + IDbFactory intercepted = container.Resolve(); + intercepted.CreateDb(new DbSettings("ThrowingDb", "ThrowingDb")); + intercepted.CreateDb(new DbSettings("GoodDb", "GoodDb")); + + Metrics.DbReads["GoodDb"] = 0; + + updateAction!(); + + 
Assert.That(Metrics.DbReads.ContainsKey("GoodDb"), Is.True); + Assert.That(Metrics.DbReads["GoodDb"], Is.EqualTo(42)); + } + + [Parallelizable(ParallelScope.None)] + [Test] + public void FullPruningDbTrackedWrapper_SurvivesPruningCycle() + { + IMonitoringService monitoringService = Substitute.For(); + Action updateAction = null!; + monitoringService + .When(m => m.AddMetricsUpdateAction(Arg.Any())) + .Do(c => updateAction = (Action)c[0]); + + // Inner factory returns innerDbV0 first and innerDbV1 next, each with a distinct size, so we can tell which + // inner DB the FullPruningDb wrapper is currently pointing at. + IDbFactory innerFactory = Substitute.For(); + FakeDb innerDbV0 = new(new IDbMeta.DbMetric { Size = 100 }); + FakeDb innerDbV1 = new(new IDbMeta.DbMetric { Size = 200 }); + innerFactory.CreateDb(Arg.Any()).Returns(innerDbV0, innerDbV1); + + // DbMetricIntervalSeconds = 0 disables the interval guard so we can update twice in a row. + MetricsConfig metricsConfig = new() { DbMetricIntervalSeconds = 0 }; + + using IContainer container = new ContainerBuilder() + .AddSingleton() + .AddSingleton(new DbConfig()) + .AddSingleton() + .AddSingleton(metricsConfig) + .AddSingleton(LimboLogs.Instance) + .AddSingleton(monitoringService) + .Build(); + + DbMonitoringModule.DbTracker tracker = container.Resolve(); + FullPruningDb pruningDb = new(new DbSettings("PrunedState", "PrunedState"), innerFactory); + + // Mirror WorldStateModule's behavior: register the outer wrapper, not the inner DBs. + tracker.AddDb("PrunedState", pruningDb); + + updateAction!(); + Assert.That(Metrics.DbSize["PrunedState"], Is.EqualTo(100)); + + // Trigger and commit a full pruning cycle; pruningDb._currentDb now points to innerDbV1. + pruningDb.TryStartPruning(out IPruningContext context).Should().BeTrue(); + context.Commit(); + context.Dispose(); + + updateAction!(); + + // After pruning, the wrapper delegates GatherMetric() to the new inner DB. No stale entry. 
+ Assert.That(Metrics.DbSize["PrunedState"], Is.EqualTo(200)); + tracker.GetAllDbMeta().Should().ContainSingle().Which.Key.Should().Be("PrunedState"); + } + [Parallelizable(ParallelScope.None)] [Test] public void DoesNotUpdateIfIntervalHasNotPassed() @@ -149,4 +287,9 @@ private class FakeDb(IDbMeta.DbMetric metric) : TestMemDb, IDbMeta internal void SetMetric(IDbMeta.DbMetric metric) => _metric = metric; } + + private class ThrowingDb : TestMemDb, IDbMeta + { + public override IDbMeta.DbMetric GatherMetric() => throw new InvalidOperationException("Simulated GatherMetric failure"); + } } diff --git a/src/Nethermind/Nethermind.Db.Test/Nethermind.Db.Test.csproj b/src/Nethermind/Nethermind.Db.Test/Nethermind.Db.Test.csproj index a6ec51bc5f78..e5d48d084411 100644 --- a/src/Nethermind/Nethermind.Db.Test/Nethermind.Db.Test.csproj +++ b/src/Nethermind/Nethermind.Db.Test/Nethermind.Db.Test.csproj @@ -13,6 +13,7 @@ + diff --git a/src/Nethermind/Nethermind.Db/DbSettings.cs b/src/Nethermind/Nethermind.Db/DbSettings.cs index 2c7feceed718..b2da38d59484 100644 --- a/src/Nethermind/Nethermind.Db/DbSettings.cs +++ b/src/Nethermind/Nethermind.Db/DbSettings.cs @@ -12,6 +12,8 @@ public class DbSettings(string name, string path) public bool DeleteOnStart { get; set; } public bool CanDeleteFolder { get; set; } = true; + /// <summary>When true, this database will not be registered with the metrics tracker.</summary> + public bool SkipMetricsTracking { get; set; } public IMergeOperator? MergeOperator { get; set; } public Dictionary? 
ColumnsMergeOperators { get; set; } diff --git a/src/Nethermind/Nethermind.Db/FullPruning/FullPruningInnerDbFactory.cs b/src/Nethermind/Nethermind.Db/FullPruning/FullPruningInnerDbFactory.cs index 42edb7b34b6b..da4d1ca9a4f8 100644 --- a/src/Nethermind/Nethermind.Db/FullPruning/FullPruningInnerDbFactory.cs +++ b/src/Nethermind/Nethermind.Db/FullPruning/FullPruningInnerDbFactory.cs @@ -63,6 +63,9 @@ private DbSettings GetRocksDbSettings(DbSettings originalSetting) string dbPath = firstDb ? originalSetting.DbPath : _fileSystem.Path.Combine(originalSetting.DbPath, _index.ToString()); DbSettings dbSettings = originalSetting.Clone(dbName, dbPath); dbSettings.CanDeleteFolder = !firstDb; // we cannot delete main db folder, only indexed subfolders + // Inner DBs are tracked via the FullPruningDb wrapper (registered with a stable name) + // so we skip tracking these indexed sub-DBs to avoid stale references after pruning. + dbSettings.SkipMetricsTracking = true; return dbSettings; } diff --git a/src/Nethermind/Nethermind.Init/Modules/DbMonitoringModule.cs b/src/Nethermind/Nethermind.Init/Modules/DbMonitoringModule.cs index d9e4c37be428..5c53705d9ffb 100644 --- a/src/Nethermind/Nethermind.Init/Modules/DbMonitoringModule.cs +++ b/src/Nethermind/Nethermind.Init/Modules/DbMonitoringModule.cs @@ -46,6 +46,7 @@ protected override void Load(ContainerBuilder builder) public class DbTracker { private readonly ConcurrentDictionary _createdDbs = new(); + private readonly HashSet _failingDbs = new(); private readonly int _intervalSec; private readonly Lazy _sharedBlockCache; private long _lastDbMetricsUpdate = 0; @@ -84,14 +85,27 @@ private void UpdateDbMetrics() foreach (KeyValuePair kv in GetAllDbMeta()) { - // Note: At the moment, the metric for a columns db is combined across column. 
- IDbMeta.DbMetric dbMetric = kv.Value.GatherMetric(); - Db.Metrics.DbSize[kv.Key] = dbMetric.Size; - Db.Metrics.DbBlockCacheSize[kv.Key] = dbMetric.CacheSize; - Db.Metrics.DbMemtableSize[kv.Key] = dbMetric.MemtableSize; - Db.Metrics.DbIndexFilterSize[kv.Key] = dbMetric.IndexSize; - Db.Metrics.DbReads[kv.Key] = dbMetric.TotalReads; - Db.Metrics.DbWrites[kv.Key] = dbMetric.TotalWrites; + try + { + // Note: At the moment, the metric for a columns db is combined across columns. + IDbMeta.DbMetric dbMetric = kv.Value.GatherMetric(); + Db.Metrics.DbSize[kv.Key] = dbMetric.Size; + Db.Metrics.DbBlockCacheSize[kv.Key] = dbMetric.CacheSize; + Db.Metrics.DbMemtableSize[kv.Key] = dbMetric.MemtableSize; + Db.Metrics.DbIndexFilterSize[kv.Key] = dbMetric.IndexSize; + Db.Metrics.DbReads[kv.Key] = dbMetric.TotalReads; + Db.Metrics.DbWrites[kv.Key] = dbMetric.TotalWrites; + if (_failingDbs.Remove(kv.Key) && _logger.IsInfo) + _logger.Info($"DB metric collection recovered for '{kv.Key}'"); + } + catch (Exception e) + { + // Remove stale entries so Prometheus does not report old values indefinitely. + RemoveStaleMetricEntry(kv.Key); + // Log only on the first failure of a streak; recovery is logged when GatherMetric succeeds again. + if (_failingDbs.Add(kv.Key) && _logger.IsWarn) + _logger.Warn($"Failed to gather metrics for DB '{kv.Key}': {e.Message}"); + } } Db.Metrics.DbBlockCacheSize["Shared"] = _sharedBlockCache.Value.GetUsage(); @@ -104,12 +118,30 @@ private void UpdateDbMetrics() } } + // Uses IDictionary.Remove so it works for both the default NonBlocking.ConcurrentDictionary + // and the plain Dictionary used under the ZK_EVM compile flag. 
+ private static void RemoveStaleMetricEntry(string name) + { + IDictionary reads = Db.Metrics.DbReads; + IDictionary writes = Db.Metrics.DbWrites; + IDictionary size = Db.Metrics.DbSize; + IDictionary memtable = Db.Metrics.DbMemtableSize; + IDictionary blockCache = Db.Metrics.DbBlockCacheSize; + IDictionary indexFilter = Db.Metrics.DbIndexFilterSize; + reads.Remove(name); + writes.Remove(name); + size.Remove(name); + memtable.Remove(name); + blockCache.Remove(name); + indexFilter.Remove(name); + } + public class DbFactoryInterceptor(DbTracker tracker, IDbFactory baseFactory) : IDbFactory { public IDb CreateDb(DbSettings dbSettings) { IDb db = baseFactory.CreateDb(dbSettings); - if (db is IDbMeta dbMeta) + if (!dbSettings.SkipMetricsTracking && db is IDbMeta dbMeta) { tracker.AddDb(dbSettings.DbName, dbMeta); } @@ -119,7 +151,7 @@ public IDb CreateDb(DbSettings dbSettings) public IColumnsDb CreateColumnsDb(DbSettings dbSettings) where T : struct, Enum { IColumnsDb db = baseFactory.CreateColumnsDb(dbSettings); - if (db is IDbMeta dbMeta) + if (!dbSettings.SkipMetricsTracking && db is IDbMeta dbMeta) { tracker.AddDb(dbSettings.DbName, dbMeta); } diff --git a/src/Nethermind/Nethermind.Init/Modules/WorldStateModule.cs b/src/Nethermind/Nethermind.Init/Modules/WorldStateModule.cs index 149ab2bc9eb9..e124786b73c8 100644 --- a/src/Nethermind/Nethermind.Init/Modules/WorldStateModule.cs +++ b/src/Nethermind/Nethermind.Init/Modules/WorldStateModule.cs @@ -33,12 +33,20 @@ protected override void Load(ContainerBuilder builder) DbSettings stateDbSettings = new(GetTitleDbName(DbNames.State), DbNames.State); IFileSystem fileSystem = ctx.Resolve(); IDbFactory dbFactory = ctx.Resolve(); - return new FullPruningDb( + FullPruningDb db = new( stateDbSettings, dbFactory is not MemDbFactory ? 
new FullPruningInnerDbFactory(dbFactory, fileSystem, stateDbSettings.DbPath) : dbFactory, () => Interlocked.Increment(ref Nethermind.Db.Metrics.StateDbInPruningWrites)); + // Register the outer wrapper so GatherMetric() always reflects the currently active + // inner DB, even across full-pruning cycles. The inner DBs are not tracked: + // - via FullPruningInnerDbFactory they get SkipMetricsTracking = true so the + // DbFactoryInterceptor skips registration. + // - via the MemDbFactory branch they're MemDbs created outside any interceptor and + // therefore never reach the tracker either. + ctx.ResolveOptional()?.AddDb(stateDbSettings.DbName, db); + return db; }) .AddSingleton(ctx =>