diff --git a/README.md b/README.md index 8b3208854..952931f09 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # PaperQA2 +## To run the project locally + +In line with the existing [CONTRIBUTING.md](CONTRIBUTING.md) file, executing `uv sync` in the project root is sufficient to start editing and running the project code locally. + ## To run on our infrastructure There is a basic `azure.json` configuration file in `src/paperqa/configs` that provides a simple configuration `paperqa`'s `Settings` object. @@ -14,6 +18,58 @@ For it to work, it requires a `.env` file in the project root directory populate - `OPENALEX_MAILTO` To make use of the configuration, simply create a `Settings` object using its `from_name` class method, passing the stem of the json config as a string, i.e. `Settings.from_name("azure")`. + +## To run the DESTINY repo paper helper + +The following additional environment variables are required: + +- `DESTINY_API_URL` (ATTOW https://destiny-repository-stag-app.proudmeadow-2a76e8ac.swedencentral.azurecontainerapps.io) +- `DESTINY_CLIENT_ID` (ATTOW 96ed941e-15dc-4ec0-b9e7-e4eda99efd2e) +- `DESTINY_AUTHORITY` (ATTOW https://login.microsoftonline.com/f870e5ae-5521-4a94-b9ff-cdde7d36dd35) +- `DESTINY_SCOPES` (ATTOW api://14e3f6c0-b8aa-46c6-98d9-29b0dd2a0f7c/.default as a list, i.e. between double quotes ending with a comma) +- `DESTINY_LOGIN_HINT` (your UCL email address, see Lena's authentication notebook in the Teams channel for more) + +See `test_contribs.py` for an example of running the paper helper.
+ +Otherwise, using this forked version of paper-qa as a local package/dependency should work as follows: + +```python +import os +from dotenv import load_dotenv +from paperqa import Settings +from paperqa.contrib.destiny_paper_helper import DESTINYPaperHelper +from paperqa.settings import IndexSettings + +load_dotenv() + +paper_directory = "~/some-directory" + +settings = Settings.from_name("azure").model_copy( + update={ + "paper_directory": paper_directory, + "index": IndexSettings(paper_directory=paper_directory) + } +) +helper = DESTINYPaperHelper( + settings, + api_url=os.getenv("DESTINY_API_URL"), + client_id=os.getenv("DESTINY_CLIENT_ID"), + authority=os.getenv("DESTINY_AUTHORITY"), + login_hint=os.getenv("DESTINY_LOGIN_HINT"), + scopes=os.getenv("DESTINY_SCOPES").split(","), +) + +question = "What is the progress on climate change intervention research?" + +papers = await helper.fetch_relevant_papers(question) + +docs = await helper.aadd_docs(papers) + +session = await docs.aquery(question, settings=helper.settings) + +print(session.answer) +``` + [![GitHub](https://img.shields.io/badge/GitHub-black?logo=github&logoColor=white)](https://github.com/Future-House/paper-qa) diff --git a/pyproject.toml b/pyproject.toml index 640b45ce8..e4365ad70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ classifiers = [ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python", @@ -28,11 +27,14 @@ classifiers = [ ] dependencies = [ "anyio", + "destiny-sdk>=0.6.0", "fhaviary[llm]>=0.27", # For partial tool concurrency "fhlmi>=0.41.0", # Pin for LiteLLMModel.get_router "html2text", # TODO: evaluate moving to an opt-in dependency "httpx", "httpx-aiohttp", + "luqum>=1.0.0", + "msal>=1.34.0", "numpy", "paper-qa-pypdf", # TODO: after
https://peps.python.org/pep-0771/, make this opt-out if 'pymupdf' extra is specified` "pyalex>=0.19", @@ -56,7 +58,7 @@ maintainers = [ ] name = "paper-qa" readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.12" [project.optional-dependencies] dev = [ diff --git a/src/paperqa/contrib/__init__.py b/src/paperqa/contrib/__init__.py index 5552a5539..bf683f4bf 100644 --- a/src/paperqa/contrib/__init__.py +++ b/src/paperqa/contrib/__init__.py @@ -1,3 +1,5 @@ -from .zotero import ZoteroDB - -__all__ = ["ZoteroDB"] +try: + from .zotero import ZoteroDB + __all__ = ["ZoteroDB"] +except ImportError: + __all__ = [] diff --git a/src/paperqa/contrib/destiny_paper_helper.py b/src/paperqa/contrib/destiny_paper_helper.py new file mode 100644 index 000000000..a7faba2d0 --- /dev/null +++ b/src/paperqa/contrib/destiny_paper_helper.py @@ -0,0 +1,241 @@ +import json +import logging +from pathlib import Path +from typing import Any + +import anyio +import httpx +import httpx_aiohttp +from aviary.message import Message +from destiny_sdk.enhancements import EnhancementType +from destiny_sdk.identifiers import ExternalIdentifierType +from destiny_sdk.references import ReferenceSearchResult, Reference +from lmi import LiteLLMModel +from luqum.parser import parser, ParseSyntaxError +from msal import PublicClientApplication +from pydantic import Field, BaseModel, field_validator +from pydantic.v1 import ValidationError + +from paperqa import Settings, Docs +from paperqa.prompts import DESTINY_search_api_docs + +logger = logging.getLogger(__name__) + +class DESTINYSearchQuery(BaseModel): + query: str = Field(description="The search query to be passed to DESTINY's Search API") + +class FailedToGetRelevantPapersError(Exception): + pass + +class LuceneQuery(BaseModel): + """Validated Lucene query model.""" + + query: str = Field( + description="A valid Lucene query syntax string" + ) + + @field_validator('query') + @classmethod + def validate_lucene_syntax(cls, v: str) -> str: 
+ """Validate that the query is valid Lucene syntax.""" + try: + parser.parse(v) + return v + except ParseSyntaxError as e: + raise ValueError(f"Invalid Lucene syntax: {e}") + +class DESTINYPaperHelper: + def __init__( + self, + settings: Settings, + api_url: str, + client_id: str, + authority: str, + login_hint: str, + scopes: list[str], + search_endpoint: str = "/v1/references/search/", + max_timeout: float = 15.0, + max_attempts: int = 5 + ): + self.settings = settings + Path(settings.paper_directory).mkdir(parents=True, exist_ok=True) + + self.api_url = api_url + self.search_endpoint = search_endpoint + + self.app = PublicClientApplication( + client_id=client_id, + authority=authority, + client_credential=None + ) + # TODO we might not need a token + self.token = self.app.acquire_token_interactive( + login_hint=login_hint, + scopes=scopes + ) + + self.access_token = self.token["access_token"] + + self.max_timeout = max_timeout + self.max_attempts = max_attempts + + self.llm_model = LiteLLMModel( + name=self.settings.llm, + config=self.settings.llm_config + ) + async def fetch_relevant_papers(self, question: str) -> dict[str, Reference]: + """Get relevant papers/references for a given question using an LLM.""" + relevant_references = await self._get_relevant_references(question) + await self.download_papers(relevant_references) + return {str(ref.id):ref for ref in relevant_references} + + async def download_papers(self, references: list[Reference]) -> None: + """Download PDFs of all relevant papers found from the DESTINY repository search.""" + downloaded_references = Path(self.settings.paper_directory).glob("*.pdf") + downloaded_ids = {ref.stem for ref in downloaded_references} + for ref in references: + if str(ref.id) not in downloaded_ids: + await self._download_pdf(ref) + + async def _download_pdf(self, reference: Reference) -> bool: + """Download a single PDF file""" + pdf_urls = self._parse_pdf_urls_from_reference(reference) + + async with 
httpx_aiohttp.HttpxAiohttpClient( + follow_redirects=True, + timeout=self.max_timeout + ) as client: + for url in pdf_urls: + try: + response = await client.get(url) + response.raise_for_status() + async with await anyio.open_file( + f"{self.settings.paper_directory}/{str(reference.id)}.pdf", "wb" + ) as f: + await f.write(response.content) + logger.info(f"Successfully downloaded {str(reference.id)}.pdf") + return True + except httpx.HTTPStatusError as e: + logger.warning( + f"Failed to download the PDF. Status code: {e.response.status_code}, text:" + f" {response.text}" + ) + except httpx.ReadTimeout as e: + logger.warning( + f"Failed to download the {str(reference.id)}.pdf. Timeout reached: {e}" + ) + return False + + def _parse_pdf_urls_from_reference(self, ref: Reference) -> list[str]: + pdf_urls = [] + for enhancement in ref.enhancements: + metadata = enhancement.content + if metadata.enhancement_type is EnhancementType.LOCATION: + # pdf urls are instances of HttpUrl so need to be cast to strings + pdf_urls += [ + str(location.pdf_url) for location in metadata.locations + if location.pdf_url is not None + ] + + return pdf_urls + + async def _get_relevant_references(self, question: str) -> list[Reference]: + """Perform a search using DESTINY's search API using an LLM generated search query.""" + search_query = await self._generate_lucene_search_query(question) + for _ in range(self.max_attempts): + try: + resp = httpx.get( + f"{self.api_url}{self.search_endpoint}?q={search_query.query}", + headers={"Authorization": f"Bearer {self.access_token}"}, + timeout=self.max_timeout + ) + resp.raise_for_status() + search_result = ReferenceSearchResult.model_validate(resp.json()) + references = search_result.references + if not references: + raise ValueError(f"No references found for {search_query.query}") + return references + except ValidationError as e: + print(f"Invalid response format: {e}") + raise e + except ValueError as e: + print(f"Value Error: {e}") + 
additional_context = f"The last search returned no references: {e}. Try creating a different search query to {search_query}." + search_query = await self._generate_lucene_search_query(question, additional_context) + except httpx.HTTPStatusError as e: + print(f"HTTP Status Error: {e}") + additional_context = f"The last search query produced: {e} with response: {resp.json()["detail"]}. Try creating a different search query to {search_query}." + search_query = await self._generate_lucene_search_query(question, additional_context) + raise FailedToGetRelevantPapersError( + f"Received HTTP status errors {self.max_attempts} times. Last search_query: {search_query}" + ) + + # TODO sometimes the model generates a query that passes our validation but fails on the API call + async def _generate_lucene_search_query(self, question: str, additional_context: str = "") -> LuceneQuery: + prompt = f"{additional_context}\n\n" + ( + "You are the helper model that aims to generate a search query in Lucene query syntax retrieve relevant papers" + " for the user's question from the DESTINY Repository." 
+ "User's question:\n" + ) + f"{question}\n\n{DESTINY_search_api_docs}" + + response = await self.llm_model.call_single( + messages=[Message(role="user", content=prompt)], + output_type=LuceneQuery, + temperature=0.1 + ) + + # unsure if using LuceneQuery as output type runs the validation check + # so we explicitly run it here + # TODO we should check if this is redundant + lucene_query = LuceneQuery.model_validate(json.loads(str(response.text))) + + return lucene_query + + def _parse_metadata_from_reference(self, ref: Reference) -> dict[str, Any]: + metadata = {} + + for identifier in ref.identifiers: + if identifier.identifier_type is ExternalIdentifierType.DOI: + metadata["doi"] = identifier.identifier + + for enhancement in ref.enhancements: + content = enhancement.content + + match content.enhancement_type: + case EnhancementType.BIBLIOGRAPHIC: + metadata["authors"] = [author.display_name for author in content.authorship] + metadata["title"] = content.title + case EnhancementType.ABSTRACT: + metadata["abstract"] = content.abstract + + return metadata + + async def aadd_docs( + self, references: dict[str, Reference] | None = None, docs: Docs | None = None + ) -> Docs: + if docs is None: + docs = Docs() + for doc_path in Path(self.settings.paper_directory).rglob( # noqa: ASYNC240 + "*.pdf" + ): + ref = references.get(doc_path.stem) if references is not None else None + if ref: + metadata = self._parse_metadata_from_reference(ref) + # TODO find a way to use bibliographic data + try: + await docs.aadd( + doc_path, + settings=self.settings, + title=metadata.get("title", "Unknown"), + abstract=metadata.get("abstract", "Unknown"), + doi=metadata.get("doi", "Unknown"), + authors=metadata.get("authors", None) + ) + except ValueError as e: + logging.warning(f"Failed to aadd {doc_path} to Docs: {e}") + else: + await docs.aadd(doc_path, settings=self.settings) + return docs + + + + diff --git a/src/paperqa/prompts.py b/src/paperqa/prompts.py index 65335e35f..93ec3bf95 
100644 --- a/src/paperqa/prompts.py +++ b/src/paperqa/prompts.py @@ -220,3 +220,201 @@ "\n\n{context_text}Describe the screenshot," # Allow for empty context_text " or if uncertain on a description please state why:" ) +DESTINY_search_api_docs = """ +### [API Query String Search](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#id7)[#](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#api-query-string-search "Link to this heading") + +The simplest API interface for searching references is the [query string search](https://destiny-repository-prod-app.politesea-556f2857.swedencentral.azurecontainerapps.io/redoc#tag/search/operation/search_references_v1_references_search__get) at /v1/references/search/. This endpoint requires [authentication](https://destiny-evidence.github.io/destiny-repository/procedures/oauth.html). + +#### [Parameters](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#id8)[#](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#parameters "Link to this heading") + +The only required parameter is the query string `q`. Additional optional parameters can be provided to filter, sort, and page through results. + +##### Query String (required)[#](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#query-string-required "Link to this heading") + +The `q` parameter is a query string in the [Lucene syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-query-string-query.html#query-string-syntax). 
+ +At it’s simplest, this can be a simple keyword search, which will search over `title` and `abstract`: + +# Get references with "climate change" anywhere in the title or abstract: +?q=climate change + +# Get references with both "climate change" and "health" anywhere in the title or abstract: +?q=climate change AND health + +Note + +Query parameters must be [URL-encoded](https://www.w3schools.com/tags/ref_urlencode.ASP). For example, spaces must be encoded as `%20` or `+`. Most HTTP client libraries will do this automatically. + +More complex queries can be constructed using the search syntax and the set of [searchable fields](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#search-fields). + +# Get references with "climate", "climatology" etc in the title and either "John Doe" or "Jane Smith" as an author: +?q=title:"climat*" AND authors:("John Doe" OR "Jane Smith") + +# Get references with "adaptation" or "mitigation" in the abstract that haven't yet been classified against the `Intervention` taxonomy: +?q=abstract:(adaptation OR mitigation) AND NOT evaluated_schemes:classification:taxonomy:Intervention + +# Get references with "climate change" in any order and a typoed "health": +?q="change climate"~2 AND helth~ + +##### Start Year and End Year[#](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#start-year-and-end-year "Link to this heading") + +The minimum and maximum publication years (inclusive) for references to return. + +# Get references published from 2015 onwards: +?q=...&start_year=2015 + +# Get references published up to and including 2020: +?q=...&end_year=2020 + +# Get references published from 2015 to 2020: +?q=...&start_year=2015&end_year=2020 + +##### Annotations[#](https://destiny-evidence.github.io/destiny-repository/procedures/search.html#annotations "Link to this heading") + +The `annotation` parameter can be used to filter results based on their annotations. 
+ +These are provided in the format `[/