From 2c61ef2822072e23861e3dc50328756042fbae32 Mon Sep 17 00:00:00 2001
From: LuizFNJ
Date: Sun, 1 Mar 2026 19:23:14 +0100
Subject: [PATCH] feature: Implement URL extraction, deduplication, and
 structured sources for reports

---
 app/nodes/online_research.py | 76 ++++++++++++++++++++++++++++++++----
 app/state.py                 |  1 +
 2 files changed, 69 insertions(+), 8 deletions(-)

diff --git a/app/nodes/online_research.py b/app/nodes/online_research.py
index de5782f..541b021 100644
--- a/app/nodes/online_research.py
+++ b/app/nodes/online_research.py
@@ -1,6 +1,8 @@
 """Node: Search online sources for fact-checking evidence."""
 
 import logging
+import re
+from urllib.parse import urlparse
 
 from langchain_openai import ChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate
@@ -12,6 +14,7 @@ from plugins.registry import get_langchain_tools, get_tools_for_selection
 
 logger = logging.getLogger(__name__)
 
+_URL_PATTERN = re.compile(r"(https?://[^\s'\"<>()]+)", re.IGNORECASE)
 
 _prompt = ChatPromptTemplate.from_messages([
     (
@@ -36,15 +39,15 @@ def search_online(state: AgentState) -> dict:
 
     claim = state["claim"]
     context = state.get("context", {})
-    sources = context.get("sources", [])
+    seed_sources = context.get("sources", [])
     language = state.get("language", "pt")
     doc_context = []
 
-    logger.info("[search_online] Starting — claim='%s' sources=%d language=%s", claim[:80], len(sources), language)
+    logger.info("[search_online] Starting — claim='%s' sources=%d language=%s", claim[:80], len(seed_sources), language)
 
-    if sources:
-        logger.info("[search_online] Loading %d source URLs", len(sources))
-        loader = WebBaseLoader(sources)
+    if seed_sources:
+        logger.info("[search_online] Loading %d source URLs", len(seed_sources))
+        loader = WebBaseLoader(seed_sources)
         load_document = loader.load()
         doc_context = load_document[0].page_content
         logger.info("[search_online] Loaded source content (length=%d chars)", len(doc_context))
@@ -60,22 +63,79 @@
     tools.extend(get_langchain_tools())
     logger.info("[search_online] Agent tools: %s", [t.name for t in tools])
     agent = create_tool_calling_agent(llm, tools, _prompt)
-    executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
+    executor = AgentExecutor(
+        agent=agent,
+        tools=tools,
+        verbose=False,
+        return_intermediate_steps=True,
+    )
 
     result = executor.invoke({
         "claim": claim,
-        "sources": sources,
+        "sources": seed_sources,
         "language": language,
         "context": doc_context,
     })
+    collected_urls = _collect_urls(result, seed_sources)
+    structured_sources = _build_structured_sources(collected_urls)
     output_len = len(str(result.get("output", "")))
     tool_names = [t.name for t in tools]
     logger.info("[search_online] Completed — output length=%d chars", output_len)
 
     return {
         "messages": [result],
+        "sources": structured_sources,
         "reasoning_log": [
             f"[search_online] Researched claim online using tools {tool_names}, "
-            f"collected {output_len} chars of evidence"
+            f"collected {output_len} chars of evidence and {len(collected_urls)} unique URLs"
         ],
     }
+
+
+def _extract_urls(text: str) -> list[str]:
+    """Extract URLs from free text and strip trailing punctuation."""
+    if not text:
+        return []
+    cleaned = []
+    for url in _URL_PATTERN.findall(text):
+        cleaned.append(url.rstrip(".,);:!?"))
+    return cleaned
+
+
+def _collect_urls(result: dict, seed_urls: list[str]) -> list[str]:
+    """Collect and deduplicate source URLs from agent output and tool traces."""
+    ordered: list[str] = []
+    seen: set[str] = set()
+
+    def _add(url: str) -> None:
+        if not url or url in seen:
+            return
+        seen.add(url)
+        ordered.append(url)
+
+    for url in seed_urls or []:
+        _add(url)
+
+    # Defensive fallback: the model sometimes writes URLs directly into its final answer
+    for url in _extract_urls(str(result.get("output", ""))):
+        _add(url)
+
+    for step in result.get("intermediate_steps", []) or []:
+        step_text = str(step)
+        for url in _extract_urls(step_text):
+            _add(url)
+
+    return ordered
+
+
+def _build_structured_sources(urls: list[str]) -> list[dict]:
+    """Convert a URL list into structured source dicts for report output."""
+    sources: list[dict] = []
+    for url in urls:
+        host = urlparse(url).netloc or url
+        sources.append({
+            "title": host,
+            "url": url,
+            "type": "web",
+        })
+    return sources
diff --git a/app/state.py b/app/state.py
index 8fddacf..95956e2 100644
--- a/app/state.py
+++ b/app/state.py
@@ -21,6 +21,7 @@ class AgentState(TypedDict, total=False):
     can_be_fact_checked: bool
     search_type: str
     language: str
+    sources: Optional[list[dict]]
 
     # Gazette pipeline — adaptive search loop fields
     search_strategies: Optional[list[str]]
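
Note for reviewers (not part of the patch): a quick sanity check of the new
helpers, assuming the module is importable as app.nodes.online_research; the
example URLs are made up for illustration.

    from app.nodes.online_research import _extract_urls, _build_structured_sources

    # Trailing punctuation is stripped by rstrip; parentheses never enter a
    # match because the regex character class excludes them outright.
    urls = _extract_urls("See https://example.com/a, and (https://example.com/b).")
    assert urls == ["https://example.com/a", "https://example.com/b"]

    # Each URL becomes a source dict titled by its netloc, ready for reports.
    assert _build_structured_sources(urls)[0] == {
        "title": "example.com",
        "url": "https://example.com/a",
        "type": "web",
    }

As implemented, deduplication preserves first-seen order: seed URLs from the
request, then URLs quoted in the agent's final answer, then URLs found in the
intermediate tool traces, so report sources keep a stable, user-first ordering.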