Skip to content
75 changes: 75 additions & 0 deletions tests/reconfiguration.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,81 @@ def test_add_node(network, args, from_snapshot=True):
return network


@reqs.description("Adding a node with corrupted ledger file")
def test_add_node_with_corrupted_ledger(network, args):
    """Check that a node joining with a truncated ledger file fails to start.

    Reproduces issue #6612: a node joining with a corrupted (truncated)
    ledger file should fail to start rather than crash unexpectedly.

    The join node is set up from a snapshot without being started, the last
    transaction found in its main ledger directory is cut in half on disk,
    and the node is then started with a short timeout.

    Args:
        network: The test network the new node is created in and joined to.
        args: Test arguments; ``args.package`` and ``args`` itself are
            forwarded to ``network.setup_join_node``.

    Returns:
        The ``network`` object that was passed in.

    Raises:
        AssertionError: If the node with the corrupted ledger starts
            successfully.
    """
    new_node = network.create_node()

    def discard_new_node():
        # Stop the node and drop it from the network's node list. The
        # membership check makes this safe to call even when the node has
        # already been removed elsewhere.
        new_node.stop()
        if new_node in network.nodes:
            network.nodes.remove(new_node)

    # Set up the join node (copies ledger, snapshots, etc.) but do not start it yet
    network.setup_join_node(
        new_node,
        args.package,
        args,
        from_snapshot=True,
        fetch_recent_snapshot=True,
    )

    # The test only makes sense if the node's working directory contains at
    # least one uncommitted ledger file.
    ledger_dir = new_node.remote.get_main_ledger_dir()
    has_uncommitted_file = any(
        name.startswith("ledger_") and not name.endswith(".committed")
        for name in os.listdir(ledger_dir)
    )

    if not has_uncommitted_file:
        LOG.warning("No uncommitted ledger files found, skipping corruption test")
        discard_new_node()
        return network

    # Corrupt the ledger by truncating it in the middle of a transaction, so
    # the transaction size does not match the number of bytes available left
    # to read in the file (as described in issue #6612). The loop keeps
    # overwriting its targets, so after it finishes they refer to the last
    # transaction of the last chunk that was iterated.
    ledger = ccf.ledger.Ledger([ledger_dir], committed_only=False)
    chunk_filename = None
    truncate_offset = None
    for chunk in ledger:
        for tx in chunk:
            offset, next_offset = tx.get_offsets()
            chunk_filename = chunk.filename()
            # Midpoint of the transaction's byte range
            truncate_offset = offset + (next_offset - offset) // 2

    if truncate_offset is None:
        LOG.warning("Could not find a transaction to corrupt, skipping")
        discard_new_node()
        return network

    LOG.info(
        f"Corrupting ledger file {chunk_filename} by truncating at offset {truncate_offset}"
    )
    # The ledger is binary data and the offset is a byte offset, so open the
    # file in binary mode rather than as UTF-8 text.
    with open(chunk_filename, "r+b") as f:
        f.truncate(truncate_offset)

    # Attempt to start the node - it should fail due to the corrupted ledger
    try:
        network.run_join_node(new_node, timeout=3)
    except (RuntimeError, TimeoutError) as e:
        LOG.info(
            f"Node {new_node.local_node_id} with corrupted ledger failed to start, as expected: {e}"
        )
        # Cleanup: run_join_node may have already stopped and removed the node
        # on TimeoutError, but not on RuntimeError
        discard_new_node()
    else:
        # Raise explicitly instead of `assert False`, which is stripped when
        # Python runs with -O and would let the test pass silently.
        raise AssertionError(
            f"Node {new_node.local_node_id} with corrupted ledger unexpectedly started"
        )

    return network


@reqs.description("Test ignore_first_sigterm")
def test_ignore_first_sigterm(network, args):
# Note: host is supplied explicitly to avoid having differently
Expand Down
1 change: 1 addition & 0 deletions tests/suite/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
reconfiguration.test_retire_primary,
e2e_logging.test_rekey,
reconfiguration.test_add_node,
reconfiguration.test_add_node_with_corrupted_ledger,
nodes.test_kill_primary,
nodes.test_commit_view_history,
reconfiguration.test_add_node,
Expand Down