From 2c61ef2822072e23861e3dc50328756042fbae32 Mon Sep 17 00:00:00 2001
From: LuizFNJ
Date: Sun, 1 Mar 2026 19:23:14 +0100
Subject: [PATCH] feature: Implement URL extraction, deduplication, and
 structured sources for reports

---
 app/nodes/online_research.py | 76 ++++++++++++++++++++++++++++++++----
 app/state.py                 |  1 +
 2 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/app/nodes/online_research.py b/app/nodes/online_research.py
index de5782f..541b021 100644
--- a/app/nodes/online_research.py
+++ b/app/nodes/online_research.py
@@ -1,6 +1,8 @@
 """Node: Search online sources for fact-checking evidence."""
 
 import logging
+import re
+from urllib.parse import urlparse
 
 from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
@@ -12,6 +14,7 @@ from plugins.registry import get_langchain_tools, get_tools_for_selection
 
 logger = logging.getLogger(__name__)
 
+_URL_PATTERN = re.compile(r"(https?://[^\s'\"<>()]+)", re.IGNORECASE)
 
 _prompt = ChatPromptTemplate.from_messages([
     (
@@ -36,15 +39,15 @@ def search_online(state: AgentState) -> dict:
 
     claim = state["claim"]
     context = state.get("context", {})
-    sources = context.get("sources", [])
+    seed_sources = context.get("sources", [])
     language = state.get("language", "pt")
     doc_context = []
 
-    logger.info("[search_online] Starting — claim='%s' sources=%d language=%s", claim[:80], len(sources), language)
+    logger.info("[search_online] Starting — claim='%s' sources=%d language=%s", claim[:80], len(seed_sources), language)
 
-    if sources:
-        logger.info("[search_online] Loading %d source URLs", len(sources))
-        loader = WebBaseLoader(sources)
+    if seed_sources:
+        logger.info("[search_online] Loading %d source URLs", len(seed_sources))
+        loader = WebBaseLoader(seed_sources)
         load_document = loader.load()
         doc_context = load_document[0].page_content
         logger.info("[search_online] Loaded source content (length=%d chars)", len(doc_context))
@@ -60,22 +63,79 @@
     tools.extend(get_langchain_tools())
     logger.info("[search_online] Agent tools: %s", [t.name for t in tools])
     agent = create_tool_calling_agent(llm, tools, _prompt)
-    executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
+    executor = AgentExecutor(
+        agent=agent,
+        tools=tools,
+        verbose=False,
+        return_intermediate_steps=True,
+    )
 
     result = executor.invoke({
         "claim": claim,
-        "sources": sources,
+        "sources": seed_sources,
         "language": language,
         "context": doc_context,
     })
+    collected_urls = _collect_urls(result, seed_sources)
+    structured_sources = _build_structured_sources(collected_urls)
     output_len = len(str(result.get("output", "")))
     tool_names = [t.name for t in tools]
     logger.info("[search_online] Completed — output length=%d chars", output_len)
 
     return {
         "messages": [result],
+        "sources": structured_sources,
         "reasoning_log": [
             f"[search_online] Researched claim online using tools {tool_names}, "
-            f"collected {output_len} chars of evidence"
+            f"collected {output_len} chars of evidence and {len(collected_urls)} unique URLs"
         ],
     }
+
+
+def _extract_urls(text: str) -> list[str]:
+    """Extract URLs from free text and strip trailing punctuation."""
+    if not text:
+        return []
+    cleaned = []
+    for url in _URL_PATTERN.findall(text):
+        cleaned.append(url.rstrip(".,);:!?"))
+    return cleaned
+
+
+def _collect_urls(result: dict, seed_urls: list[str]) -> list[str]:
+    """Collect and deduplicate source URLs from agent output and tool traces."""
+    ordered: list[str] = []
+    seen: set[str] = set()
+
+    def _add(url: str) -> None:
+        if not url or url in seen:
+            return
+        seen.add(url)
+        ordered.append(url)
+
+    for url in seed_urls or []:
+        _add(url)
+
+    # Defensive fallback: the model sometimes writes URLs directly into its final answer
+    for url in _extract_urls(str(result.get("output", ""))):
+        _add(url)
+
+    for step in result.get("intermediate_steps", []) or []:
+        step_text = str(step)
+        for url in _extract_urls(step_text):
+            _add(url)
+
+    return ordered
+
+
+def _build_structured_sources(urls: list[str]) -> list[dict]:
+    """Convert a URL list into structured source dicts for report output."""
+    sources: list[dict] = []
+    for url in urls:
+        host = urlparse(url).netloc or url
+        sources.append({
+            "title": host,
+            "url": url,
+            "type": "web",
+        })
+    return sources
diff --git a/app/state.py b/app/state.py
index 8fddacf..95956e2 100644
--- a/app/state.py
+++ b/app/state.py
@@ -21,6 +21,7 @@ class AgentState(TypedDict, total=False):
     can_be_fact_checked: bool
     search_type: str
     language: str
+    sources: Optional[list[dict]]
 
     # Gazette pipeline — adaptive search loop fields
     search_strategies: Optional[list[str]]
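
Note for reviewers (not part of the patch): a quick sanity check of the new
helpers, assuming the module is importable as app.nodes.online_research; the
example URLs are made up for illustration.

    from app.nodes.online_research import _extract_urls, _build_structured_sources

    # Trailing punctuation is stripped by rstrip; parentheses never enter a
    # match because the regex character class excludes them outright.
    urls = _extract_urls("See https://example.com/a, and (https://example.com/b).")
    assert urls == ["https://example.com/a", "https://example.com/b"]

    # Each URL becomes a source dict titled by its netloc, ready for reports.
    assert _build_structured_sources(urls)[0] == {
        "title": "example.com",
        "url": "https://example.com/a",
        "type": "web",
    }

As implemented, deduplication preserves first-seen order: seed URLs from the
request, then URLs quoted in the agent's final answer, then URLs found in the
intermediate tool traces, so report sources keep a stable, user-first ordering.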