From 2c7837fafcc4b2beb899ef8ee31133c6a7ff4c14 Mon Sep 17 00:00:00 2001 From: LeChristopher Blackwell Date: Sat, 27 Jun 2026 13:17:04 -0700 Subject: [PATCH 1/2] fix: disambiguate colliding qualified_names to prevent silent node loss qualified_name is the graph's stable identity key, built as file::name or file::parent.name. It is not unique per symbol: same-named symbols in one file (e.g. methods of different object literals, or repeated generated helpers) produce the same key. upsert_node inserts with ON CONFLICT(qualified_name) DO UPDATE, so colliding nodes collapsed last-writer-wins and earlier definitions were silently dropped. Deduplicate per file in the batch store functions, which already hold the whole file's node list. The first occurrence keeps its bare key so existing edges (which reference the same parser-computed key) still resolve to it; later collisions are suffixed with :L, which is always unique. Unique nodes keep byte-identical keys, so embeddings and incremental updates are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) --- code_review_graph/graph.py | 48 ++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/code_review_graph/graph.py b/code_review_graph/graph.py index 4ed3c7fc..5e30091c 100644 --- a/code_review_graph/graph.py +++ b/code_review_graph/graph.py @@ -186,10 +186,19 @@ def close(self) -> None: # --- Write operations --- - def upsert_node(self, node: NodeInfo, file_hash: str = "") -> int: - """Insert or update a node. Returns the node ID.""" + def upsert_node( + self, node: NodeInfo, file_hash: str = "", qualified: str | None = None + ) -> int: + """Insert or update a node. Returns the node ID. + + Pass ``qualified`` to override the computed key — used by the batch store + functions to disambiguate same-named symbols in one file (see + ``_qualified_names_for_file``). Without it, collisions would collapse + under ``ON CONFLICT(qualified_name) DO UPDATE`` and silently drop nodes. + """ now = time.time() - qualified = self._make_qualified(node) + if qualified is None: + qualified = self._make_qualified(node) extra = json.dumps(node.extra) if node.extra else "{}" self._conn.execute( @@ -276,8 +285,9 @@ def store_file_nodes_edges( self._begin_immediate() try: self.remove_file_data(file_path) - for node in nodes: - self.upsert_node(node, file_hash=fhash) + qualified_names = self._qualified_names_for_file(nodes) + for node, qualified in zip(nodes, qualified_names): + self.upsert_node(node, file_hash=fhash, qualified=qualified) for edge in edges: self.upsert_edge(edge) self._conn.commit() @@ -294,8 +304,9 @@ def store_file_batch( try: for file_path, nodes, edges, fhash in batch: self.remove_file_data(file_path) - for node in nodes: - self.upsert_node(node, file_hash=fhash) + qualified_names = self._qualified_names_for_file(nodes) + for node, qualified in zip(nodes, qualified_names): + self.upsert_node(node, file_hash=fhash, qualified=qualified) for edge in edges: self.upsert_edge(edge) self._conn.commit() @@ -1300,6 +1311,29 @@ def _make_qualified(self, node: NodeInfo) -> str: return f"{node.file_path}::{node.parent_name}.{node.name}" return f"{node.file_path}::{node.name}" + def _qualified_names_for_file(self, nodes: list[NodeInfo]) -> list[str]: + """Compute collision-free qualified names for one file's nodes. + + The first occurrence of a key keeps its bare form so existing edges + (which reference the same parser-computed key) still resolve to it. + Later same-key symbols are suffixed with their start line — two defs + cannot share a ``line_start``, so this is always unique. + """ + names: list[str] = [] + seen: set[str] = set() + for index, node in enumerate(nodes): + base = self._make_qualified(node) + if base not in seen: + seen.add(base) + names.append(base) + continue + candidate = f"{base}:L{node.line_start}" + if candidate in seen: + candidate = f"{base}:L{node.line_start}#{index}" + seen.add(candidate) + names.append(candidate) + return names + def _row_to_node(self, row: sqlite3.Row) -> GraphNode: return GraphNode( id=row["id"], From a552b0b417b98f079834d9840b38fb239ca9e77c Mon Sep 17 00:00:00 2001 From: LeChristopher Blackwell Date: Sat, 27 Jun 2026 13:26:58 -0700 Subject: [PATCH 2/2] feat: report duplicate symbol names disambiguated during a build Surfaces same-file qualified_name collisions that were previously invisible. Adds GraphStore.find_disambiguated_nodes() (matches the :L marker the batch store appends), threads a disambiguated_nodes list into the full_build and incremental_update results, logs it, and prints a one-line summary after build/update. Counts are unchanged: now that collisions are kept rather than dropped, the reported node total and the stored COUNT(*) already agree. Adds tests/test_collisions.py covering no-drop, the bare-key-first invariant, edge-resolution safety, and the disambiguation report. Co-Authored-By: Claude Opus 4.8 (1M context) --- code_review_graph/cli.py | 12 +++++ code_review_graph/graph.py | 20 ++++++++ code_review_graph/incremental.py | 11 +++++ tests/test_collisions.py | 78 ++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+) create mode 100644 tests/test_collisions.py diff --git a/code_review_graph/cli.py b/code_review_graph/cli.py index 102ad8a8..cf1d2f60 100644 --- a/code_review_graph/cli.py +++ b/code_review_graph/cli.py @@ -88,6 +88,16 @@ def _supports_color() -> bool: return sys.stdout.isatty() +def _print_disambiguated(result: dict, limit: int = 10) -> None: + """Report duplicate symbol names that were disambiguated during a build.""" + dups = result.get("disambiguated_nodes") or [] + if not dups: + return + shown = ", ".join(dups[:limit]) + more = f" (+{len(dups) - limit} more)" if len(dups) > limit else "" + print(f"Disambiguated {len(dups)} duplicate symbol name(s): {shown}{more}") + + def _print_banner() -> None: """Print the startup banner with graph art and available commands.""" color = _supports_color() @@ -994,6 +1004,7 @@ def main() -> None: nodes = result.get("total_nodes", 0) edges = result.get("total_edges", 0) print(f"Full build: {parsed} files, {nodes} nodes, {edges} edges (postprocess={pp})") + _print_disambiguated(result) if result.get("errors"): print(f"Errors: {len(result['errors'])}") @@ -1019,6 +1030,7 @@ def main() -> None: f"{nodes} nodes, {edges} edges" f" (postprocess={pp})" ) + _print_disambiguated(result) # --brief: append a one-line change-impact summary with the same # estimated context-savings approximation that detect-changes uses. diff --git a/code_review_graph/graph.py b/code_review_graph/graph.py index 5e30091c..ba26df53 100644 --- a/code_review_graph/graph.py +++ b/code_review_graph/graph.py @@ -10,6 +10,7 @@ import json import logging import os +import re import sqlite3 import threading import time @@ -25,6 +26,10 @@ logger = logging.getLogger(__name__) +# Suffix appended by _qualified_names_for_file to a 2nd+ same-file collision, +# e.g. "path::getById:L394". This regex recognizes those keys after the fact. +_DISAMBIGUATED_RE = re.compile(r":L\d+(?:#\d+)?$") + # --------------------------------------------------------------------------- # Schema # --------------------------------------------------------------------------- @@ -877,6 +882,21 @@ def get_stats(self) -> GraphStats: last_updated=last_updated, ) + def find_disambiguated_nodes(self) -> list[str]: + """Qualified names suffixed to resolve a same-file name collision. + + When two same-named symbols share a file, the first keeps its bare key + and later ones get a ``:L`` suffix (see _qualified_names_for_file). + Surfacing these makes the otherwise-invisible collisions reportable. + """ + rows = self._conn.execute( + "SELECT qualified_name FROM nodes WHERE qualified_name GLOB '*:L[0-9]*'" + ).fetchall() + return sorted( + r["qualified_name"] for r in rows + if _DISAMBIGUATED_RE.search(r["qualified_name"]) + ) + def get_nodes_by_size( self, min_lines: int = 50, diff --git a/code_review_graph/incremental.py b/code_review_graph/incremental.py index 81dc3026..601de69e 100644 --- a/code_review_graph/incremental.py +++ b/code_review_graph/incremental.py @@ -905,10 +905,18 @@ def full_build( spring_stats = _run_spring_resolver(store) temporal_stats = _run_temporal_resolver(store) + disambiguated = store.find_disambiguated_nodes() + if disambiguated: + logger.info( + "Disambiguated %d duplicate qualified_name(s): %s", + len(disambiguated), ", ".join(disambiguated), + ) + return { "files_parsed": len(files), "total_nodes": total_nodes, "total_edges": total_edges, + "disambiguated_nodes": disambiguated, "errors": errors, "rescript_resolution": rescript_stats, "spring_resolution": spring_stats, @@ -1043,10 +1051,13 @@ def incremental_update( spring_stats = _run_spring_resolver(store) if spring_changed else None temporal_stats = _run_temporal_resolver(store) if spring_changed else None + disambiguated = store.find_disambiguated_nodes() + return { "files_updated": len(all_files), "total_nodes": total_nodes, "total_edges": total_edges, + "disambiguated_nodes": disambiguated, "changed_files": list(changed_files), "dependent_files": list(dependent_files), "errors": errors, diff --git a/tests/test_collisions.py b/tests/test_collisions.py new file mode 100644 index 00000000..90c97a3a --- /dev/null +++ b/tests/test_collisions.py @@ -0,0 +1,78 @@ +"""Same-file qualified_name collisions must not silently drop nodes. + +Same-named symbols in one file (methods of different object literals, repeated +generated helpers) produce the same qualified_name. Before the fix, the insert's +ON CONFLICT(qualified_name) DO UPDATE collapsed them last-writer-wins and the +earlier definitions vanished. The batch store now disambiguates them. +""" + +import tempfile +from pathlib import Path + +from code_review_graph.graph import GraphStore +from code_review_graph.parser import EdgeInfo, NodeInfo + + +def _func(name, line, path="/repo/db.ts", parent=None): + return NodeInfo( + kind="Function", name=name, file_path=path, + line_start=line, line_end=line + 2, language="typescript", + parent_name=parent, + ) + + +class TestQualifiedNameCollisions: + def setup_method(self): + self.tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False) + self.store = GraphStore(self.tmp.name) + + def teardown_method(self): + self.store.close() + Path(self.tmp.name).unlink(missing_ok=True) + + def test_same_name_methods_both_persist(self): + # Two object literals each defining getById -> empty parent_name -> collide. + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("getById", 10), _func("getById", 20)], [] + ) + assert self.store.get_stats().total_nodes == 2 + # First occurrence keeps the bare key; the second carries a line suffix. + first = self.store.get_node("/repo/db.ts::getById") + assert first is not None and first.line_start == 10 + second = self.store.get_node("/repo/db.ts::getById:L20") + assert second is not None and second.line_start == 20 + + def test_three_same_name_module_functions_persist(self): + nodes = [_func("rootRouteImport", n) for n in (5, 10, 15)] + self.store.store_file_nodes_edges("/repo/routeTree.gen.ts", nodes, []) + assert self.store.get_stats().total_nodes == 3 + + def test_edge_to_bare_key_resolves_to_first_def(self): + # An edge targeting the bare key still names a real node (the first def), + # so collisions never dangle existing edges. + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("getById", 10), _func("getById", 20)], [] + ) + edge = EdgeInfo( + kind="CALLS", source="/repo/api.ts::handler", + target="/repo/db.ts::getById", file_path="/repo/api.ts", line=3, + ) + self.store.store_file_nodes_edges( + "/repo/api.ts", [_func("handler", 1, path="/repo/api.ts")], [edge] + ) + target = self.store.get_node("/repo/db.ts::getById") + assert target is not None and target.line_start == 10 + + def test_find_disambiguated_nodes_lists_suffixed_keys(self): + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("getById", 10), _func("getById", 20)], [] + ) + assert self.store.find_disambiguated_nodes() == ["/repo/db.ts::getById:L20"] + + def test_unique_names_keep_bare_keys_unchanged(self): + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("a", 10), _func("b", 20)], [] + ) + assert self.store.find_disambiguated_nodes() == [] + assert self.store.get_node("/repo/db.ts::a") is not None + assert self.store.get_node("/repo/db.ts::b") is not None