Skip to content
203 changes: 203 additions & 0 deletions databusclient/api/download.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import re
from typing import List
from urllib.parse import urlparse

Expand All @@ -12,6 +13,52 @@
get_databus_id_parts_from_file_url,
)

from databusclient.api.utils import compute_sha256_and_length

# compiled regex for SHA-256 hex strings
_SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$")

def _extract_checksum_from_node(node) -> str | None:
Comment thread
Integer-Ctrl marked this conversation as resolved.
"""
Try to extract a 64-char hex checksum from a JSON-LD file node.
Handles these common shapes:
- checksum or sha256sum fields as plain string
- checksum fields as dict with '@value'
- nested values under the allowed keys (lists or '@value' objects)
"""
def find_in_value(v):
if isinstance(v, str):
s = v.strip()
if _SHA256_RE.match(s):
return s
if isinstance(v, dict):
# common JSON-LD value object
if "@value" in v and isinstance(v["@value"], str):
res = find_in_value(v["@value"])
if res:
return res
# try all nested dict values
for vv in v.values():
res = find_in_value(vv)
if res:
return res
if isinstance(v, list):
for item in v:
res = find_in_value(item)
if res:
return res
return None

# Only inspect the explicitly allowed keys to avoid false positives.
for key in ("checksum", "sha256sum", "sha256", "databus:checksum"):
if key in node:
res = find_in_value(node[key])
if res:
return res

return None



# Hosts that require Vault token based authentication. Central source of truth.
VAULT_REQUIRED_HOSTS = {
Expand All @@ -25,13 +72,76 @@ class DownloadAuthError(Exception):



def _extract_checksums_from_jsonld(json_str: str) -> dict:
"""
Parse a JSON-LD string and return a mapping of file URI (and @id) -> checksum.

Uses the existing _extract_checksum_from_node logic to extract checksums
from `Part` nodes. Both the node's `file` and `@id` (if present and a
string) are mapped to the checksum to preserve existing lookup behavior.
"""
try:
jd = json.loads(json_str)
except Exception:
return {}
graph = jd.get("@graph", [])
checksums: dict = {}
for node in graph:
if node.get("@type") == "Part":
expected = _extract_checksum_from_node(node)
if not expected:
continue
file_uri = node.get("file")
if isinstance(file_uri, str):
checksums[file_uri] = expected
node_id = node.get("@id")
if isinstance(node_id, str):
checksums[node_id] = expected
return checksums

Comment thread
Integer-Ctrl marked this conversation as resolved.

def _resolve_checksums_for_urls(file_urls: List[str], databus_key: str | None) -> dict:
"""
Group file URLs by their Version URI, fetch each Version JSON-LD once,
and return a combined url->checksum mapping for the provided URLs.

Best-effort: failures to fetch or parse individual versions are skipped.
"""
versions_map: dict = {}
for file_url in file_urls:
try:
host, accountId, groupId, artifactId, versionId, fileId = get_databus_id_parts_from_file_url(file_url)
except Exception:
continue
if versionId is None:
continue
if host is None or accountId is None or groupId is None or artifactId is None:
continue
version_uri = f"https://{host}/{accountId}/{groupId}/{artifactId}/{versionId}"
versions_map.setdefault(version_uri, []).append(file_url)

checksums: dict = {}
for version_uri, urls_in_version in versions_map.items():
try:
json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
extracted_checksums = _extract_checksums_from_jsonld(json_str)
for url in urls_in_version:
if url in extracted_checksums:
checksums[url] = extracted_checksums[url]
except Exception:
# Best-effort: skip versions we cannot fetch or parse
continue
return checksums

Comment on lines +109 to +141
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Normalize URL schemes when mapping url -> checksum (http/https mismatch breaks validation).
Right now lookups require exact string match; if metadata uses https://... but the input URL is http://... (or vice-versa), validation will be skipped unexpectedly.

Proposed fix (canonicalize)
+def _canonicalize_url(u: str) -> str:
+    return "https://" + u.removeprefix("http://").removeprefix("https://")
@@
     for version_uri, urls_in_version in versions_map.items():
@@
             extracted_checksums = _extract_checksums_from_jsonld(json_str)
             for url in urls_in_version:
-                if url in extracted_checksums:
-                    checksums[url] = extracted_checksums[url]
+                canon = _canonicalize_url(url)
+                if url in extracted_checksums:
+                    checksums[url] = extracted_checksums[url]
+                elif canon in extracted_checksums:
+                    checksums[url] = extracted_checksums[canon]
@@
     for url in urls:
         expected = None
         if checksums and isinstance(checksums, dict):
-            expected = checksums.get(url)
+            expected = checksums.get(url) or checksums.get(_canonicalize_url(url))

Also applies to: 341-345

🤖 Prompt for AI Agents
In @databusclient/api/download.py around lines 103 - 135, In
_resolve_checksums_for_urls, normalize URL schemes before storing and looking up
checksums so http/https mismatches don't skip validation: when building
versions_map and when matching url -> checksum against extracted_checksums (from
_extract_checksums_from_jsonld), canonicalize both the input file_url and the
keys in extracted_checksums to a consistent form (e.g., force https or strip
scheme) and use those canonical URLs for dictionary keys and lookups; ensure
this normalization is applied for the initial grouping
(get_databus_id_parts_from_file_url flow) and the final mapping loop that
assigns checksums.

def _download_file(
url,
localDir,
vault_token_file=None,
databus_key=None,
auth_url=None,
client_id=None,
validate_checksum: bool = False,
expected_checksum: str | None = None,
) -> None:
"""
Download a file from the internet with a progress bar using tqdm.
Expand Down Expand Up @@ -183,6 +293,29 @@ def _download_file(
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
raise IOError("Downloaded size does not match Content-Length header")

# --- 6. Optional checksum validation ---
if validate_checksum:
# reuse compute_sha256_and_length (imported from databusclient.api.utils)
try:
actual, _ = compute_sha256_and_length(filename)
except (OSError, IOError) as e:
print(f"WARNING: error computing checksum for {filename}: {e}")
actual = None

if expected_checksum is None:
print(f"WARNING: no expected checksum available for {filename}; skipping validation")
elif actual is None:
print(f"WARNING: could not compute checksum for {filename}; skipping validation")
else:
if actual.lower() != expected_checksum.lower():
try:
os.remove(filename) # delete corrupted file
except OSError:
pass
raise IOError(
f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}"
)


def _download_files(
urls: List[str],
Expand All @@ -191,6 +324,8 @@ def _download_files(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
checksums: dict | None = None,
) -> None:
"""
Download multiple files from the databus.
Expand All @@ -204,13 +339,18 @@ def _download_files(
- client_id: Client ID for token exchange
"""
for url in urls:
expected = None
if checksums and isinstance(checksums, dict):
expected = checksums.get(url)
_download_file(
url=url,
localDir=localDir,
vault_token_file=vault_token_file,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
expected_checksum=expected,
)


Expand Down Expand Up @@ -358,6 +498,7 @@ def _download_collection(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False
Comment thread
coderabbitai[bot] marked this conversation as resolved.
) -> None:
"""
Download all files in a databus collection.
Expand All @@ -375,13 +516,21 @@ def _download_collection(
file_urls = _get_file_download_urls_from_sparql_query(
endpoint, query, databus_key=databus_key
)

# If checksum validation requested, attempt to build url->checksum mapping
checksums: dict = {}
if validate_checksum:
checksums = _resolve_checksums_for_urls(list(file_urls), databus_key)

_download_files(
list(file_urls),
localDir,
vault_token_file=vault_token,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
checksums=checksums if checksums else None,
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.


Expand All @@ -392,6 +541,7 @@ def _download_version(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
) -> None:
"""
Download all files in a databus artifact version.
Expand All @@ -406,13 +556,22 @@ def _download_version(
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
# build url -> checksum mapping from JSON-LD when available
checksums: dict = {}
try:
checksums = _extract_checksums_from_jsonld(json_str)
except Exception:
checksums = {}

_download_files(
file_urls,
localDir,
vault_token_file=vault_token_file,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
checksums=checksums,
)


Expand All @@ -424,6 +583,7 @@ def _download_artifact(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
) -> None:
"""
Download files in a databus artifact.
Expand All @@ -445,13 +605,22 @@ def _download_artifact(
print(f"Downloading version: {version_uri}")
json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
# extract checksums for this version
checksums: dict = {}
try:
checksums = _extract_checksums_from_jsonld(json_str)
except Exception:
checksums = {}

_download_files(
file_urls,
localDir,
vault_token_file=vault_token_file,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
checksums=checksums,
)


Expand Down Expand Up @@ -527,6 +696,7 @@ def _download_group(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
) -> None:
"""
Download files in a databus group.
Expand All @@ -552,6 +722,7 @@ def _download_group(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)


Expand Down Expand Up @@ -598,6 +769,7 @@ def download(
all_versions=None,
auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token",
client_id="vault-token-exchange",
validate_checksum: bool = False
) -> None:
"""
Download datasets from databus.
Expand Down Expand Up @@ -638,16 +810,34 @@ def download(
databus_key,
auth_url,
client_id,
validate_checksum=validate_checksum,
)
elif file is not None:
print(f"Downloading file: {databusURI}")
# Try to fetch expected checksum from the parent Version metadata
expected = None
if validate_checksum:
try:
version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}"
json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
checks = _extract_checksums_from_jsonld(json_str)
expected = checks.get(databusURI)
if expected is None:
# fallback: try lookup by @id (helper already maps @id too)
expected = checks.get(databusURI)
except Exception as e:
print(f"WARNING: Could not fetch checksum for single file: {e}")

# Call the worker to download the single file (passes expected checksum)
_download_file(
databusURI,
localDir,
vault_token_file=token,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
expected_checksum=expected,
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Comment on lines +823 to 847
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's explore the repository structure
find . -type f -name "*.py" -o -name "*.json" -o -name "*.jsonld" | head -20

Repository: dbpedia/databus-python-client

Length of output: 498


🏁 Script executed:

# Look at the download.py file around the lines in question
sed -n '820,850p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 1635


🏁 Script executed:

# Search for how URIs are used and formatted in the codebase
rg "databusURI|@id|file.*http" --type py -A 2 -B 2 | head -50

Repository: dbpedia/databus-python-client

Length of output: 2944


🏁 Script executed:

# Look for test fixtures with JSON-LD examples
find . -type f \( -name "*.json" -o -name "*.jsonld" -o -name "*fixture*" -o -name "*test*data*" \) | grep -E "\.(json|jsonld)$|fixture|test_data" | head -20

Repository: dbpedia/databus-python-client

Length of output: 55


🏁 Script executed:

# Check for documentation about URI formatting
find . -type f \( -name "*.md" -o -name "*.rst" -o -name "*.txt" \) | xargs grep -l "URI\|http\|checksum" 2>/dev/null | head -10

Repository: dbpedia/databus-python-client

Length of output: 119


🏁 Script executed:

# Look at _extract_checksums_from_jsonld function in detail
sed -n '780,810p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 1693


🏁 Script executed:

# Check test files for actual JSON-LD examples with checksums
rg "checksum|sha256" tests/ --type py -A 3 -B 3 | head -60

Repository: dbpedia/databus-python-client

Length of output: 1821


🏁 Script executed:

# Search for any code that might pass http:// URIs vs https://
rg "http://" databusclient/ --type py -B 2 -A 2

Repository: dbpedia/databus-python-client

Length of output: 2041


🏁 Script executed:

# Look at the entire Part node extraction logic to understand URI handling
sed -n '760,820p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2558


🏁 Script executed:

# Check if there are test cases that would catch protocol mismatches
rg "_extract_checksums" tests/ -A 10 -B 5

Repository: dbpedia/databus-python-client

Length of output: 55


🏁 Script executed:

# Look at the complete _extract_checksums_from_jsonld function
sed -n '750,820p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2844


🏁 Script executed:

# Check if there are any tests that test URI protocol mismatches
rg "http://" tests/ --type py -B 5 -A 5

Repository: dbpedia/databus-python-client

Length of output: 657


🏁 Script executed:

# Look for test cases that specifically test checksum validation
rg "validate_checksum|expected_checksum" tests/ --type py -B 5 -A 5

Repository: dbpedia/databus-python-client

Length of output: 55


🏁 Script executed:

# Check what the actual URIs passed to download look like in practice/tests
rg "download\(" tests/ --type py -B 2 -A 2 | head -40

Repository: dbpedia/databus-python-client

Length of output: 775


🏁 Script executed:

# Look at line 539 mentioned in the original scratchpad to see if there's a pattern
sed -n '535,545p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 322


🏁 Script executed:

# Look at the complete _extract_checksums_from_jsonld function definition
sed -n '680,750p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2371


🏁 Script executed:

# Check if there are any comments explaining the protocol handling
grep -n "http\|protocol" databusclient/api/download.py | head -20

Repository: dbpedia/databus-python-client

Length of output: 985


🏁 Script executed:

# Look for version of the code to understand context better
sed -n '1,50p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 1573


🏁 Script executed:

# Find the _extract_checksums_from_jsonld function
sed -n '600,680p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 3016


🏁 Script executed:

# Check if there's any documentation about URI normalization needs
grep -rn "removeprefix\|protocol" databusclient/ --type py

Repository: dbpedia/databus-python-client

Length of output: 190


🏁 Script executed:

# Look at what URIs are passed into the checksum extraction
sed -n '690,750p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2115


🏁 Script executed:

# Find the complete _extract_checksums_from_jsonld function
rg "_extract_checksums_from_jsonld" databusclient/api/download.py -A 30

Repository: dbpedia/databus-python-client

Length of output: 5363


🏁 Script executed:

# Check utils.py to understand how URIs are handled
cat databusclient/api/utils.py

Repository: dbpedia/databus-python-client

Length of output: 1861


Add inline documentation explaining the protocol normalization workaround.

Lines 831-833 normalize databusURI by removing protocol prefixes and re-adding https://. This is defensive programming for cases where a caller might pass an http:// URI that matches a metadata entry only when normalized to https://. Since _extract_checksums_from_jsonld already maps both file and @id URIs from Part nodes, the protocol normalization is a separate (reasonable) safeguard. However, adding a brief comment explaining the reason would clarify intent:

expected = checks.get(databusURI) or checks.get(
    # Fallback: normalize to https in case metadata uses different protocol
    "https://" + databusURI.removeprefix("http://").removeprefix("https://")
)
🤖 Prompt for AI Agents
In @databusclient/api/download.py around lines 823 - 847, Add a short inline
comment explaining why we normalize databusURI to https when looking up
checksums: in the block that sets expected from checks.get(databusURI) or
checks.get("https://" + databusURI.removeprefix(...)), annotate that this is a
defensive fallback to handle callers passing http:// URIs while metadata entries
may be stored with https://, and reference databusURI, checks, expected and
_extract_checksums_from_jsonld to clarify the relationship.

elif version is not None:
print(f"Downloading version: {databusURI}")
Expand All @@ -658,6 +848,7 @@ def download(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)
elif artifact is not None:
print(
Expand All @@ -671,6 +862,7 @@ def download(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)
elif group is not None and group != "collections":
print(
Expand All @@ -684,6 +876,7 @@ def download(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)
elif account is not None:
print("accountId not supported yet") # TODO
Expand All @@ -702,11 +895,21 @@ def download(
res = _get_file_download_urls_from_sparql_query(
uri_endpoint, databusURI, databus_key=databus_key
)

# If checksum validation requested, try to build url->checksum mapping
checksums: dict = {}
if validate_checksum:
checksums = _resolve_checksums_for_urls(res, databus_key)
if not checksums:
print("WARNING: Checksum validation enabled but no checksums found for query results.")

_download_files(
res,
localDir,
vault_token_file=token,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
checksums=checksums if checksums else None,
)
Loading