Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# PaperQA2

## To run the project locally

In line with the existing [CONTRIBUTING.md](CONTRIBUTING.md) file, executing `uv sync` in the project root is sufficient to start editing and running the project code locally.

## To run on our infrastructure

There is a basic `azure.json` configuration file in `src/paperqa/configs` that provides a simple configuration for `paperqa`'s `Settings` object.
Expand All @@ -14,6 +18,58 @@ For it to work, it requires a `.env` file in the project root directory populate
- `OPENALEX_MAILTO`

To make use of the configuration, simply create a `Settings` object using its `from_name` class method, passing the stem of the json config as a string, i.e. `Settings.from_name("azure")`.

## To run the DESTINY repo paper helper

The following additional environment variables are required (ATTOW = "at the time of writing", i.e. the values shown may change):

- `DESTINY_API_URL` (ATTOW https://destiny-repository-stag-app.proudmeadow-2a76e8ac.swedencentral.azurecontainerapps.io)
- `DESTINY_CLIENT_ID` (ATTOW 96ed941e-15dc-4ec0-b9e7-e4eda99efd2e)
- `DESTINY_AUTHORITY` (ATTOW https://login.microsoftonline.com/f870e5ae-5521-4a94-b9ff-cdde7d36dd35)
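- `DESTINY_SCOPES` (ATTOW `api://14e3f6c0-b8aa-46c6-98d9-29b0dd2a0f7c/.default`; written as a comma-separated list between double quotes, ending with a comma, e.g. `DESTINY_SCOPES="api://14e3f6c0-b8aa-46c6-98d9-29b0dd2a0f7c/.default,"`)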
- `DESTINY_LOGIN_HINT` (your UCL email address, see Lena's authentication notebook in the teams channel for more)

See `test_contribs.py` for an example of running the paper helper.

Otherwise, using this forked version of paper-qa as a local package/dependency should work, for example:

```python
import asyncio
import os
from pathlib import Path

from dotenv import load_dotenv
from paperqa import Settings
from paperqa.contrib.destiny_paper_helper import DESTINYPaperHelper
from paperqa.settings import IndexSettings

load_dotenv()

# pathlib does not expand "~" on its own, so expand it here; otherwise the
# helper would create a literal "~" directory in the current working directory.
paper_directory = str(Path("~/some-directory").expanduser())

settings = Settings.from_name("azure").model_copy(
    update={
        "paper_directory": paper_directory,
        "index": IndexSettings(paper_directory=paper_directory),
    }
)


async def main() -> None:
    """Fetch relevant papers from DESTINY, index them and answer a question."""
    helper = DESTINYPaperHelper(
        settings,
        # os.environ[...] fails loudly with the variable name if one is missing.
        api_url=os.environ["DESTINY_API_URL"],
        client_id=os.environ["DESTINY_CLIENT_ID"],
        authority=os.environ["DESTINY_AUTHORITY"],
        login_hint=os.environ["DESTINY_LOGIN_HINT"],
        # DESTINY_SCOPES is comma-separated and may end with a comma:
        # drop the empty entries that a plain split(",") would produce.
        scopes=[
            scope.strip()
            for scope in os.environ["DESTINY_SCOPES"].split(",")
            if scope.strip()
        ],
    )

    question = "What is the progress on climate change intervention research?"

    papers = await helper.fetch_relevant_papers(question)

    docs = await helper.aadd_docs(papers)

    session = await docs.aquery(question, settings=helper.settings)

    print(session.answer)


# In a plain script the coroutine needs an event loop; inside a notebook
# (where a loop is already running) use `await main()` instead.
asyncio.run(main())
```

<!-- pyml disable-num-lines 6 line-length -->

[![GitHub](https://img.shields.io/badge/GitHub-black?logo=github&logoColor=white)](https://github.com/Future-House/paper-qa)
Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,21 @@ classifiers = [
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
"anyio",
"destiny-sdk>=0.6.0",
"fhaviary[llm]>=0.27", # For partial tool concurrency
"fhlmi>=0.41.0", # Pin for LiteLLMModel.get_router
"html2text", # TODO: evaluate moving to an opt-in dependency
"httpx",
"httpx-aiohttp",
"luqum>=1.0.0",
"msal>=1.34.0",
"numpy",
"paper-qa-pypdf", # TODO: after https://peps.python.org/pep-0771/, make this opt-out if 'pymupdf' extra is specified`
"pyalex>=0.19",
Expand All @@ -56,7 +58,7 @@ maintainers = [
]
name = "paper-qa"
readme = "README.md"
requires-python = ">=3.11"
requires-python = ">=3.12"

[project.optional-dependencies]
dev = [
Expand Down
8 changes: 5 additions & 3 deletions src/paperqa/contrib/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from .zotero import ZoteroDB

__all__ = ["ZoteroDB"]
# Export ZoteroDB only when the zotero submodule can be imported; an
# ImportError while importing it leaves the package importable with an empty
# public API instead of failing.
try:
    from .zotero import ZoteroDB
except ImportError:
    __all__ = []
else:
    __all__ = ["ZoteroDB"]
241 changes: 241 additions & 0 deletions src/paperqa/contrib/destiny_paper_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
import json
import logging
from pathlib import Path
from typing import Any

import anyio
import httpx
import httpx_aiohttp
from aviary.message import Message
from destiny_sdk.enhancements import EnhancementType
from destiny_sdk.identifiers import ExternalIdentifierType
from destiny_sdk.references import ReferenceSearchResult, Reference
from lmi import LiteLLMModel
from luqum.parser import parser, ParseSyntaxError
from msal import PublicClientApplication
from pydantic import Field, BaseModel, field_validator
from pydantic.v1 import ValidationError

from paperqa import Settings, Docs
from paperqa.prompts import DESTINY_search_api_docs

logger = logging.getLogger(__name__)

class DESTINYSearchQuery(BaseModel):
    """Model holding a single search query string for DESTINY's Search API."""

    query: str = Field(
        description="The search query to be passed to DESTINY's Search API"
    )

class FailedToGetRelevantPapersError(Exception):
    """Raised when the reference search gives up after all of its attempts."""

class LuceneQuery(BaseModel):
    """Validated Lucene query model."""

    query: str = Field(description="A valid Lucene query syntax string")

    @field_validator("query")
    @classmethod
    def validate_lucene_syntax(cls, v: str) -> str:
        """Validate that the query is valid Lucene syntax.

        Args:
            v: Candidate query string.

        Returns:
            The unchanged query string when luqum's parser accepts it.

        Raises:
            ValueError: If the parser reports a syntax error. The original
                ``ParseSyntaxError`` is attached as the cause.
        """
        try:
            # Only the parse can fail; the parsed tree itself is not needed.
            parser.parse(v)
        except ParseSyntaxError as e:
            raise ValueError(f"Invalid Lucene syntax: {e}") from e
        return v

class DESTINYPaperHelper:
def __init__(
self,
settings: Settings,
api_url: str,
client_id: str,
authority: str,
login_hint: str,
scopes: list[str],
search_endpoint: str = "/v1/references/search/",
max_timeout: float = 15.0,
max_attempts: int = 5
):
self.settings = settings
Path(settings.paper_directory).mkdir(parents=True, exist_ok=True)

self.api_url = api_url
self.search_endpoint = search_endpoint

self.app = PublicClientApplication(
client_id=client_id,
authority=authority,
client_credential=None
)
# TODO we might not need a token
self.token = self.app.acquire_token_interactive(
login_hint=login_hint,
scopes=scopes
)

self.access_token = self.token["access_token"]

self.max_timeout = max_timeout
self.max_attempts = max_attempts

self.llm_model = LiteLLMModel(
name=self.settings.llm,
config=self.settings.llm_config
)
async def fetch_relevant_papers(self, question: str) -> dict[str, Reference]:
"""Get relevant papers/references for a given question using an LLM."""
relevant_references = await self._get_relevant_references(question)
await self.download_papers(relevant_references)
return {str(ref.id):ref for ref in relevant_references}

async def download_papers(self, references: list[Reference]) -> None:
"""Download PDFs of all relevant papers found from the DESTINY repository search."""
downloaded_references = Path(self.settings.paper_directory).glob("*.pdf")
downloaded_ids = {ref.stem for ref in downloaded_references}
for ref in references:
if str(ref.id) not in downloaded_ids:
await self._download_pdf(ref)

async def _download_pdf(self, reference: Reference) -> bool:
"""Download a single PDF file"""
pdf_urls = self._parse_pdf_urls_from_reference(reference)

async with httpx_aiohttp.HttpxAiohttpClient(
follow_redirects=True,
timeout=self.max_timeout
) as client:
for url in pdf_urls:
try:
response = await client.get(url)
response.raise_for_status()
async with await anyio.open_file(
f"{self.settings.paper_directory}/{str(reference.id)}.pdf", "wb"
) as f:
await f.write(response.content)
logger.info(f"Successfully downloaded {str(reference.id)}.pdf")
return True
except httpx.HTTPStatusError as e:
logger.warning(
f"Failed to download the PDF. Status code: {e.response.status_code}, text:"
f" {response.text}"
)
except httpx.ReadTimeout as e:
logger.warning(
f"Failed to download the {str(reference.id)}.pdf. Timeout reached: {e}"
)
return False

def _parse_pdf_urls_from_reference(self, ref: Reference) -> list[str]:
pdf_urls = []
for enhancement in ref.enhancements:
metadata = enhancement.content
if metadata.enhancement_type is EnhancementType.LOCATION:
# pdf urls are instances of HttpUrl so need to be cast to strings
pdf_urls += [
str(location.pdf_url) for location in metadata.locations
if location.pdf_url is not None
]

return pdf_urls

async def _get_relevant_references(self, question: str) -> list[Reference]:
"""Perform a search using DESTINY's search API using an LLM generated search query."""
search_query = await self._generate_lucene_search_query(question)
for _ in range(self.max_attempts):
try:
resp = httpx.get(
f"{self.api_url}{self.search_endpoint}?q={search_query.query}",
headers={"Authorization": f"Bearer {self.access_token}"},
timeout=self.max_timeout
)
resp.raise_for_status()
search_result = ReferenceSearchResult.model_validate(resp.json())
references = search_result.references
if not references:
raise ValueError(f"No references found for {search_query.query}")
return references
except ValidationError as e:
print(f"Invalid response format: {e}")
raise e
except ValueError as e:
print(f"Value Error: {e}")
additional_context = f"The last search returned no references: {e}. Try creating a different search query to {search_query}."
search_query = await self._generate_lucene_search_query(question, additional_context)
except httpx.HTTPStatusError as e:
print(f"HTTP Status Error: {e}")
additional_context = f"The last search query produced: {e} with response: {resp.json()["detail"]}. Try creating a different search query to {search_query}."
search_query = await self._generate_lucene_search_query(question, additional_context)
raise FailedToGetRelevantPapersError(
f"Received HTTP status errors {self.max_attempts} times. Last search_query: {search_query}"
)

# TODO sometimes the model generates a query that passes our validation but fails on the API call
async def _generate_lucene_search_query(self, question: str, additional_context: str = "") -> LuceneQuery:
prompt = f"{additional_context}\n\n" + (
"You are the helper model that aims to generate a search query in Lucene query syntax retrieve relevant papers"
" for the user's question from the DESTINY Repository." + "User's question:\n"
) + f"{question}\n\n{DESTINY_search_api_docs}"

response = await self.llm_model.call_single(
messages=[Message(role="user", content=prompt)],
output_type=LuceneQuery,
temperature=0.1
)

# unsure if using LuceneQuery as output type runs the validation check
# so we explicitly run it here
# TODO we should check if this is redundant
lucene_query = LuceneQuery.model_validate(json.loads(str(response.text)))

return lucene_query

def _parse_metadata_from_reference(self, ref: Reference) -> dict[str, Any]:
metadata = {}

for identifier in ref.identifiers:
if identifier.identifier_type is ExternalIdentifierType.DOI:
metadata["doi"] = identifier.identifier

for enhancement in ref.enhancements:
content = enhancement.content

match content.enhancement_type:
case EnhancementType.BIBLIOGRAPHIC:
metadata["authors"] = [author.display_name for author in content.authorship]
metadata["title"] = content.title
case EnhancementType.ABSTRACT:
metadata["abstract"] = content.abstract

return metadata

async def aadd_docs(
self, references: dict[str, Reference] | None = None, docs: Docs | None = None
) -> Docs:
if docs is None:
docs = Docs()
for doc_path in Path(self.settings.paper_directory).rglob( # noqa: ASYNC240
"*.pdf"
):
ref = references.get(doc_path.stem) if references is not None else None
if ref:
metadata = self._parse_metadata_from_reference(ref)
# TODO find a way to use bibliographic data
try:
await docs.aadd(
doc_path,
settings=self.settings,
title=metadata.get("title", "Unknown"),
abstract=metadata.get("abstract", "Unknown"),
doi=metadata.get("doi", "Unknown"),
authors=metadata.get("authors", None)
)
except ValueError as e:
logging.warning(f"Failed to aadd {doc_path} to Docs: {e}")
else:
await docs.aadd(doc_path, settings=self.settings)
return docs




Loading