From 6e33bfb2ef9f28c4e5619f245909c52f21ab2697 Mon Sep 17 00:00:00 2001 From: Michal Kuratczyk Date: Tue, 12 May 2026 11:22:02 +0200 Subject: [PATCH 1/2] Handle corrupted indexes Handle two unlikely situations where a corrupted indexes file could prevent startup. --- src/ra_snapshot.erl | 14 ++++++-- test/ra_log_2_SUITE.erl | 74 +++++++++++++++++++++++++++++------------ 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/src/ra_snapshot.erl b/src/ra_snapshot.erl index e0a982dd..941b79e8 100644 --- a/src/ra_snapshot.erl +++ b/src/ra_snapshot.erl @@ -1037,7 +1037,14 @@ indexes(Dir) -> {ok, <>} -> case erlang:crc32(Data) of Crc -> - {ok, binary_to_term(Data)}; + try + case binary_to_term(Data) of + Seq when is_list(Seq) -> {ok, Seq}; + _ -> {error, invalid_format} + end + catch _:_ -> + {error, invalid_format} + end; _ -> {error, checksum_error} end; @@ -1047,7 +1054,10 @@ indexes(Dir) -> %% Backward compatibility: old format without header %% Try to parse as plain term_to_binary data try - {ok, binary_to_term(Bin)} + case binary_to_term(Bin) of + Seq when is_list(Seq) -> {ok, Seq}; + _ -> {error, invalid_format} + end catch _:_ -> {error, invalid_format} diff --git a/test/ra_log_2_SUITE.erl b/test/ra_log_2_SUITE.erl index 718671fe..e2171edd 100644 --- a/test/ra_log_2_SUITE.erl +++ b/test/ra_log_2_SUITE.erl @@ -93,7 +93,7 @@ all_tests() -> snapshot_installation_with_live_indexes, init_with_dangling_symlink, init_after_missing_segments_event, - snapshot_install_with_empty_indexes_file + snapshot_install_with_corrupted_indexes ]. groups() -> @@ -685,7 +685,11 @@ recover_after_snapshot(Config) -> last_written_index_term := {2, 1}}, Overview), ok. -snapshot_install_with_empty_indexes_file(Config) -> +snapshot_install_with_corrupted_indexes(Config) -> + %% Verify that any byte sequence in the indexes file (empty, truncated, + %% garbage, adversarial) never causes startup to abort. The server must + %% always recover gracefully: either use the indexes (if valid) or rebuild + %% them from the snapshot. UId = ?config(uid, Config), MachineConf = {module, ?MODULE, #{}}, LogConf = #{uid => UId, @@ -739,25 +743,53 @@ snapshot_install_with_empty_indexes_file(Config) -> ra_log:close(Log4), IndexesFile = filename:join(SnapDir, <<"indexes">>), - ok = file:write_file(IndexesFile, <<>>), - - application:stop(ra), - start_ra(Config), - timer:sleep(100), - ct:pal("snapshot state ~p", - [ra_log_snapshot_state:read(ra_log_snapshot_state, UId)]), - - - Log5 = ra_log_init(Config, LogConf), - - %% Fetch items 1..10 (should be dropped because of snapshot) - {[], _} = ra_log_take(1, 10, Log5), - - %% Fetch items 11..15 - {[_, _, _, _, _], _} = ra_log_take(11, 16, Log5), - - ra_log:close(Log5), - ok. + {ok, ValidIndexesContent} = file:read_file(IndexesFile), + TruncatedBytes = binary:part(ValidIndexesContent, 0, + max(1, byte_size(ValidIndexesContent) div 2)), + + %% CRC of <<>> is 0, so this 9-byte blob passes the CRC check and then + %% passes <<>> to binary_to_term — this was an uncaught crash before the fix. + CrcOfEmpty = erlang:crc32(<<>>), + Gap1Trigger = <<"RASI", 1, CrcOfEmpty:32/unsigned>>, + + CorruptContents = + [ + <<>>, %% empty + <<"garbage">>, %% random bytes, no RASI magic + binary:copy(<<0>>, 50), %% all-zero bytes + <<"RASI", 99>>, %% valid magic, unknown version + <<"RASI", 1, 0:32, "notterm">>, %% valid header, wrong CRC + Gap1Trigger, %% CRC of <<>> passes, binary_to_term(<<>>) would throw + term_to_binary(some_atom), %% old format, non-list Erlang term + term_to_binary(42), %% old format, integer + term_to_binary(#{}), %% old format, map + term_to_binary([]), %% old format, valid empty ra_seq — must work + TruncatedBytes %% first half of a legitimately written file + ], + + lists:foreach( + fun (Content) -> + ct:pal("Testing indexes file content (~w bytes): ~w", + [byte_size(Content), Content]), + ok = file:write_file(IndexesFile, Content), + + application:stop(ra), + start_ra(Config), + timer:sleep(100), + + Log = ra_log_init(Config, LogConf), + + %% Fetch items 1..10 (should be dropped because of snapshot) + {[], _} = ra_log_take(1, 10, Log), + %% Fetch items 11..15 + {[_, _, _, _, _], _} = ra_log_take(11, 16, Log), + + ra_log:close(Log), + + %% Restore a clean indexes file for the next iteration + ok = file:write_file(IndexesFile, ValidIndexesContent) + end, + CorruptContents). writes_lower_than_snapshot_index_are_dropped(Config) -> logger:set_primary_config(level, debug), From cb163dde1a23c18f03178a3df3a9a953c2357dea Mon Sep 17 00:00:00 2001 From: Michal Kuratczyk Date: Tue, 12 May 2026 12:14:36 +0200 Subject: [PATCH 2/2] Additional logging for corrupted indexes --- src/ra_snapshot.erl | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/ra_snapshot.erl b/src/ra_snapshot.erl index 941b79e8..a18bd00e 100644 --- a/src/ra_snapshot.erl +++ b/src/ra_snapshot.erl @@ -633,7 +633,8 @@ begin_snapshot(#{index := Idx, -spec promote_checkpoint(Idx :: ra_index(), State0 :: state()) -> {boolean(), State :: state(), Effects :: [effect()]}. promote_checkpoint(PromotionIdx, - #?MODULE{module = Mod, + #?MODULE{uid = UId, + module = Mod, sync_server = SyncServer, snapshot_directory = SnapDir, checkpoint_directory = CheckpointDir, @@ -660,7 +661,12 @@ promote_checkpoint(PromotionIdx, Indexes = case indexes(Snapshot) of {ok, Idxs} -> Idxs; - _ -> + {error, Err} -> + ?WARN("ra_snapshot: ~ts: indexes file " + "corrupt during checkpoint " + "promotion (~w), using empty " + "indexes", + [UId, Err]), [] end, EndTime = erlang:monotonic_time(), @@ -1040,9 +1046,15 @@ indexes(Dir) -> try case binary_to_term(Data) of Seq when is_list(Seq) -> {ok, Seq}; - _ -> {error, invalid_format} + Invalid -> + ?WARN("ra_snapshot: indexes file ~ts: CRC passed " + "but decoded term is not a list: ~0p", + [File, Invalid]), + {error, invalid_format} end - catch _:_ -> + catch _:Err -> + ?WARN("ra_snapshot: indexes file ~ts: CRC passed but " + "binary_to_term failed: ~w", [File, Err]), {error, invalid_format} end; _ -> @@ -1056,10 +1068,15 @@ indexes(Dir) -> try case binary_to_term(Bin) of Seq when is_list(Seq) -> {ok, Seq}; - _ -> {error, invalid_format} + Invalid -> + ?WARN("ra_snapshot: indexes file ~ts: old format term " + "is not a list: ~0p", [File, Invalid]), + {error, invalid_format} end catch _:_ -> + ?WARN("ra_snapshot: indexes file ~ts: failed to parse " + "as term (empty or truncated)", [File]), {error, invalid_format} end; {error, enoent} ->