Skip to content
144 changes: 144 additions & 0 deletions databusclient/api/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,53 @@
)


def _extract_checksum_from_node(node) -> str | None:
Comment thread
Integer-Ctrl marked this conversation as resolved.
"""
Try to extract a 64-char hex checksum from a JSON-LD file node.
Handles these common shapes:
- checksum or sha256sum fields as plain string
- checksum fields as dict with '@value'
- nested values (recursively search strings for a 64-char hex)
"""
def find_in_value(v):
if isinstance(v, str):
s = v.strip()
if len(s) == 64 and all(c in "0123456789abcdefABCDEF" for c in s):
return s
if isinstance(v, dict):
# common JSON-LD value object
if "@value" in v and isinstance(v["@value"], str):
res = find_in_value(v["@value"])
if res:
return res
# try all nested dict values
for vv in v.values():
res = find_in_value(vv)
if res:
return res
if isinstance(v, list):
for item in v:
res = find_in_value(item)
if res:
return res
return None

# direct keys to try first
for key in ("checksum", "sha256sum", "sha256", "databus:checksum"):
if key in node:
res = find_in_value(node[key])
if res:
return res

# fallback: search all values recursively for a 64-char hex string
for v in node.values():
res = find_in_value(v)
if res:
return res
return None



# Hosts that require Vault token based authentication. Central source of truth.
VAULT_REQUIRED_HOSTS = {
"data.dbpedia.io",
Expand All @@ -32,6 +79,8 @@ def _download_file(
databus_key=None,
auth_url=None,
client_id=None,
validate_checksum: bool = False,
expected_checksum: str | None = None,
) -> None:
"""
Download a file from the internet with a progress bar using tqdm.
Expand Down Expand Up @@ -183,6 +232,26 @@ def _download_file(
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
raise IOError("Downloaded size does not match Content-Length header")

# --- 6. Optional checksum validation ---
if validate_checksum:
# reuse compute_sha256_and_length from webdav extension
try:
from databusclient.extensions.webdav import compute_sha256_and_length

actual, _ = compute_sha256_and_length(filename)
except Exception:
actual = None

if expected_checksum is None:
print(f"WARNING: no expected checksum available for {filename}; skipping validation")
elif actual is None:
print(f"WARNING: could not compute checksum for {filename}; skipping validation")
else:
if actual.lower() != expected_checksum.lower():
raise IOError(
f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}"
)


def _download_files(
urls: List[str],
Expand All @@ -191,6 +260,8 @@ def _download_files(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
checksums: dict | None = None,
) -> None:
"""
Download multiple files from the databus.
Expand All @@ -204,13 +275,18 @@ def _download_files(
- client_id: Client ID for token exchange
"""
for url in urls:
expected = None
if checksums and isinstance(checksums, dict):
expected = checksums.get(url)
_download_file(
url=url,
localDir=localDir,
vault_token_file=vault_token_file,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
expected_checksum=expected,
)


Expand Down Expand Up @@ -358,6 +434,7 @@ def _download_collection(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False
Comment thread
coderabbitai[bot] marked this conversation as resolved.
) -> None:
"""
Download all files in a databus collection.
Expand All @@ -382,6 +459,7 @@ def _download_collection(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.


Expand All @@ -392,6 +470,7 @@ def _download_version(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
) -> None:
"""
Download all files in a databus artifact version.
Expand All @@ -406,13 +485,31 @@ def _download_version(
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
# build url -> checksum mapping from JSON-LD when available
checksums: dict = {}
try:
json_dict = json.loads(json_str)
graph = json_dict.get("@graph", [])
for node in graph:
if node.get("@type") == "Part":
file_uri = node.get("file")
if not isinstance(file_uri, str):
continue
expected = _extract_checksum_from_node(node)
if expected:
checksums[file_uri] = expected
except Exception:
checksums = {}

_download_files(
file_urls,
localDir,
vault_token_file=vault_token_file,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
checksums=checksums,
)


Expand All @@ -424,6 +521,7 @@ def _download_artifact(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
) -> None:
"""
Download files in a databus artifact.
Expand All @@ -445,13 +543,31 @@ def _download_artifact(
print(f"Downloading version: {version_uri}")
json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
# extract checksums for this version
checksums: dict = {}
try:
jd = json.loads(json_str)
graph = jd.get("@graph", [])
for node in graph:
if node.get("@type") == "Part":
file_uri = node.get("file")
if not isinstance(file_uri, str):
continue
expected = _extract_checksum_from_node(node)
if expected:
checksums[file_uri] = expected
except Exception:
checksums = {}

_download_files(
file_urls,
localDir,
vault_token_file=vault_token_file,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
checksums=checksums,
)


Expand Down Expand Up @@ -527,6 +643,7 @@ def _download_group(
databus_key: str = None,
auth_url: str = None,
client_id: str = None,
validate_checksum: bool = False,
) -> None:
"""
Download files in a databus group.
Expand All @@ -552,6 +669,7 @@ def _download_group(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)


Expand Down Expand Up @@ -598,6 +716,7 @@ def download(
all_versions=None,
auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token",
client_id="vault-token-exchange",
validate_checksum: bool = False
) -> None:
"""
Download datasets from databus.
Expand Down Expand Up @@ -638,16 +757,36 @@ def download(
databus_key,
auth_url,
client_id,
validate_checksum=validate_checksum,
)
elif file is not None:
print(f"Downloading file: {databusURI}")
# Try to fetch expected checksum from the parent Version metadata
expected = None
if validate_checksum:
try:
version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}"
json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
json_dict = json.loads(json_str)
graph = json_dict.get("@graph", [])
for node in graph:
if node.get("file") == databusURI or node.get("@id") == databusURI:
expected = _extract_checksum_from_node(node)
if expected:
break
except Exception as e:
print(f"WARNING: Could not fetch checksum for single file: {e}")

# Call the worker to download the single file (passes expected checksum)
_download_file(
databusURI,
localDir,
vault_token_file=token,
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
expected_checksum=expected,
)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Comment on lines +823 to 847
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, let's explore the repository structure
find . -type f -name "*.py" -o -name "*.json" -o -name "*.jsonld" | head -20

Repository: dbpedia/databus-python-client

Length of output: 498


🏁 Script executed:

# Look at the download.py file around the lines in question
sed -n '820,850p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 1635


🏁 Script executed:

# Search for how URIs are used and formatted in the codebase
rg "databusURI|@id|file.*http" --type py -A 2 -B 2 | head -50

Repository: dbpedia/databus-python-client

Length of output: 2944


🏁 Script executed:

# Look for test fixtures with JSON-LD examples
find . -type f \( -name "*.json" -o -name "*.jsonld" -o -name "*fixture*" -o -name "*test*data*" \) | grep -E "\.(json|jsonld)$|fixture|test_data" | head -20

Repository: dbpedia/databus-python-client

Length of output: 55


🏁 Script executed:

# Check for documentation about URI formatting
find . -type f \( -name "*.md" -o -name "*.rst" -o -name "*.txt" \) | xargs grep -l "URI\|http\|checksum" 2>/dev/null | head -10

Repository: dbpedia/databus-python-client

Length of output: 119


🏁 Script executed:

# Look at _extract_checksums_from_jsonld function in detail
sed -n '780,810p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 1693


🏁 Script executed:

# Check test files for actual JSON-LD examples with checksums
rg "checksum|sha256" tests/ --type py -A 3 -B 3 | head -60

Repository: dbpedia/databus-python-client

Length of output: 1821


🏁 Script executed:

# Search for any code that might pass http:// URIs vs https://
rg "http://" databusclient/ --type py -B 2 -A 2

Repository: dbpedia/databus-python-client

Length of output: 2041


🏁 Script executed:

# Look at the entire Part node extraction logic to understand URI handling
sed -n '760,820p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2558


🏁 Script executed:

# Check if there are test cases that would catch protocol mismatches
rg "_extract_checksums" tests/ -A 10 -B 5

Repository: dbpedia/databus-python-client

Length of output: 55


🏁 Script executed:

# Look at the complete _extract_checksums_from_jsonld function
sed -n '750,820p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2844


🏁 Script executed:

# Check if there are any tests that test URI protocol mismatches
rg "http://" tests/ --type py -B 5 -A 5

Repository: dbpedia/databus-python-client

Length of output: 657


🏁 Script executed:

# Look for test cases that specifically test checksum validation
rg "validate_checksum|expected_checksum" tests/ --type py -B 5 -A 5

Repository: dbpedia/databus-python-client

Length of output: 55


🏁 Script executed:

# Check what the actual URIs passed to download look like in practice/tests
rg "download\(" tests/ --type py -B 2 -A 2 | head -40

Repository: dbpedia/databus-python-client

Length of output: 775


🏁 Script executed:

# Look at line 539 mentioned in the original scratchpad to see if there's a pattern
sed -n '535,545p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 322


🏁 Script executed:

# Look at the complete _extract_checksums_from_jsonld function definition
sed -n '680,750p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2371


🏁 Script executed:

# Check if there are any comments explaining the protocol handling
grep -n "http\|protocol" databusclient/api/download.py | head -20

Repository: dbpedia/databus-python-client

Length of output: 985


🏁 Script executed:

# Look for version of the code to understand context better
sed -n '1,50p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 1573


🏁 Script executed:

# Find the _extract_checksums_from_jsonld function
sed -n '600,680p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 3016


🏁 Script executed:

# Check if there's any documentation about URI normalization needs
grep -rn "removeprefix\|protocol" databusclient/ --type py

Repository: dbpedia/databus-python-client

Length of output: 190


🏁 Script executed:

# Look at what URIs are passed into the checksum extraction
sed -n '690,750p' databusclient/api/download.py

Repository: dbpedia/databus-python-client

Length of output: 2115


🏁 Script executed:

# Find the complete _extract_checksums_from_jsonld function
rg "_extract_checksums_from_jsonld" databusclient/api/download.py -A 30

Repository: dbpedia/databus-python-client

Length of output: 5363


🏁 Script executed:

# Check utils.py to understand how URIs are handled
cat databusclient/api/utils.py

Repository: dbpedia/databus-python-client

Length of output: 1861


Add inline documentation explaining the protocol normalization workaround.

Lines 831-833 normalize databusURI by removing protocol prefixes and re-adding https://. This is defensive programming for cases where a caller might pass an http:// URI that matches a metadata entry only when normalized to https://. Since _extract_checksums_from_jsonld already maps both file and @id URIs from Part nodes, the protocol normalization is a separate (reasonable) safeguard. However, adding a brief comment explaining the reason would clarify intent:

expected = checks.get(databusURI) or checks.get(
    # Fallback: normalize to https in case metadata uses different protocol
    "https://" + databusURI.removeprefix("http://").removeprefix("https://")
)
🤖 Prompt for AI Agents
In @databusclient/api/download.py around lines 823 - 847, Add a short inline
comment explaining why we normalize databusURI to https when looking up
checksums: in the block that sets expected from checks.get(databusURI) or
checks.get("https://" + databusURI.removeprefix(...)), annotate that this is a
defensive fallback to handle callers passing http:// URIs while metadata entries
may be stored with https://, and reference databusURI, checks, expected and
_extract_checksums_from_jsonld to clarify the relationship.

elif version is not None:
print(f"Downloading version: {databusURI}")
Expand All @@ -658,6 +797,8 @@ def download(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
expected_checksum=expected,
)
elif artifact is not None:
print(
Expand All @@ -671,6 +812,7 @@ def download(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)
elif group is not None and group != "collections":
print(
Expand All @@ -684,6 +826,7 @@ def download(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)
elif account is not None:
print("accountId not supported yet") # TODO
Expand All @@ -709,4 +852,5 @@ def download(
databus_key=databus_key,
auth_url=auth_url,
client_id=client_id,
validate_checksum=validate_checksum,
)
12 changes: 10 additions & 2 deletions databusclient/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ def deploy(
show_default=True,
help="Client ID for token exchange",
)
@click.option(
"--validate-checksum",
is_flag=True,
help="Validate checksums of downloaded files"
)
def download(
databusuris: List[str],
localdir,
Expand All @@ -167,7 +172,9 @@ def download(
all_versions,
authurl,
clientid,
validate_checksum,
):

"""
Download datasets from databus, optionally using vault access if vault options are provided.
"""
Expand All @@ -181,7 +188,8 @@ def download(
all_versions=all_versions,
auth_url=authurl,
client_id=clientid,
)
validate_checksum=validate_checksum
)
except DownloadAuthError as e:
raise click.ClickException(str(e))

Expand Down Expand Up @@ -214,4 +222,4 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool)


if __name__ == "__main__":
app()
download()
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated