dbpedia · Integer-Ctrl · Jan 13, 2026 · Dec 31, 2025 · Dec 31, 2025 · Dec 31, 2025
diff --git a/databusclient/api/download.py b/databusclient/api/download.py
@@ -1,5 +1,6 @@
 import json
 import os
+import re
 from typing import List
 from urllib.parse import urlparse
 
@@ -12,6 +13,52 @@
     get_databus_id_parts_from_file_url,
 )
 
+from databusclient.api.utils import compute_sha256_and_length
+
+# Pattern anchored at both ends for a SHA-256 digest: exactly 64 hex characters, either case.
+_SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$")
+
+def _extract_checksum_from_node(node) -> str | None:
+    """
+    Return the first SHA-256 hex digest found under a node's checksum keys.
+
+    Only "checksum", "sha256sum", "sha256" and "databus:checksum" are inspected,
+    in that order, to avoid false positives. A value may be a plain string, an
+    '@value' object, or any nesting of dicts and lists around such values. The
+    digest is returned whitespace-stripped; None if `node` is not a dict or no
+    64-character hex string is found.
+    """
+    def find_in_value(v):
+        if isinstance(v, str):
+            s = v.strip()
+            return s if _SHA256_RE.match(s) else None
+        if isinstance(v, dict):
+            # A JSON-LD value object's '@value' wins over sibling keys.
+            if isinstance(v.get("@value"), str):
+                res = find_in_value(v["@value"])
+                if res:
+                    return res
+            children = v.values()
+        elif isinstance(v, list):
+            children = v
+        else:
+            return None
+        for child in children:
+            res = find_in_value(child)
+            if res:
+                return res
+        return None
+
+    # A malformed graph entry (str, list, None, ...) cannot carry checksum keys.
+    if not isinstance(node, dict):
+        return None
+    for key in ("checksum", "sha256sum", "sha256", "databus:checksum"):
+        res = find_in_value(node.get(key))
+        if res:
+            return res
+    return None
+
+
 
 # Hosts that require Vault token based authentication. Central source of truth.
 VAULT_REQUIRED_HOSTS = {
@@ -25,13 +72,76 @@ class DownloadAuthError(Exception):
 
 
 
+def _extract_checksums_from_jsonld(json_str: str) -> dict:
+    """
+    Parse a JSON-LD string and return a mapping of file URI (and @id) -> checksum.
+
+    For each dict node of the top-level `@graph` list whose `@type` is "Part",
+    the digest from _extract_checksum_from_node is stored under the node's
+    `file` and `@id` when those are strings. Unparsable input, or a document
+    that is not an object holding a list graph, yields {}.
+    """
+    try:
+        jd = json.loads(json_str)
+    except (TypeError, ValueError, RecursionError):
+        return {}
+    graph = jd.get("@graph") if isinstance(jd, dict) else None
+    if not isinstance(graph, list):
+        return {}
+    checksums: dict = {}
+    for node in graph:
+        # One stray non-dict entry must not discard the other nodes' checksums.
+        if not isinstance(node, dict) or node.get("@type") != "Part":
+            continue
+        expected = _extract_checksum_from_node(node)
+        for ref in (node.get("file"), node.get("@id")):
+            if expected and isinstance(ref, str):
+                checksums[ref] = expected
+    return checksums
+
+
+def _resolve_checksums_for_urls(file_urls: List[str], databus_key: str | None) -> dict:
+    """
+    Look up expected checksums for Databus file URLs, one fetch per Version.
+
+    URLs are grouped by the Version URI rebuilt from their ID parts; the result
+    maps url -> checksum for the input URLs the fetched JSON-LD covers.
+    Best-effort: URLs without usable ID parts and failing versions are skipped.
+    """
+    urls_by_version: dict = {}
+    for candidate in file_urls:
+        try:
+            parts = get_databus_id_parts_from_file_url(candidate)
+            host, accountId, groupId, artifactId, versionId, _file = parts
+        except Exception:
+            continue
+        if any(p is None for p in (host, accountId, groupId, artifactId, versionId)):
+            continue  # a Version URI needs every component
+        key = f"https://{host}/{accountId}/{groupId}/{artifactId}/{versionId}"
+        urls_by_version.setdefault(key, []).append(candidate)
+
+    resolved: dict = {}
+    for version_uri, members in urls_by_version.items():
+        try:
+            document = fetch_databus_jsonld(version_uri, databus_key=databus_key)
+            known = _extract_checksums_from_jsonld(document)
+            for member in members:
+                if member in known:
+                    resolved[member] = known[member]
+        except Exception:
+            # Best-effort: skip versions we cannot fetch or parse
+            continue
+    return resolved
+
 def _download_file(
     url,
     localDir,
     vault_token_file=None,
     databus_key=None,
     auth_url=None,
     client_id=None,
+    validate_checksum: bool = False,
+    expected_checksum: str | None = None,
 ) -> None:
     """
     Download a file from the internet with a progress bar using tqdm.
@@ -183,6 +293,29 @@ def _download_file(
     if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
         raise IOError("Downloaded size does not match Content-Length header")
 
+    # --- 6. Optional checksum validation ---
+    if validate_checksum:
+        # reuse the shared compute_sha256_and_length helper (databusclient.api.utils)
+        try:
+            actual, _ = compute_sha256_and_length(filename)
+        except (OSError, IOError) as e:
+            print(f"WARNING: error computing checksum for {filename}: {e}")
+            actual = None
+
+        if expected_checksum is None:
+            print(f"WARNING: no expected checksum available for {filename}; skipping validation")
+        elif actual is None:
+            print(f"WARNING: could not compute checksum for {filename}; skipping validation")
+        else:
+            if actual.lower() != expected_checksum.lower():
+                try: 
+                    os.remove(filename)  # delete corrupted file
+                except OSError: 
+                    pass
+                raise IOError(
+                    f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}"
+                )
+
 
 def _download_files(
     urls: List[str],
@@ -191,6 +324,8 @@ def _download_files(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
+    checksums: dict | None = None,
 ) -> None:
     """
     Download multiple files from the databus.
@@ -204,13 +339,18 @@ def _download_files(
     - client_id: Client ID for token exchange
     """
     for url in urls:
+        expected = None
+        if checksums and isinstance(checksums, dict):
+            expected = checksums.get(url)
         _download_file(
             url=url,
             localDir=localDir,
             vault_token_file=vault_token_file,
             databus_key=databus_key,
             auth_url=auth_url,
             client_id=client_id,
+            validate_checksum=validate_checksum,
+            expected_checksum=expected,
         )
 
 
@@ -358,6 +498,7 @@ def _download_collection(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False
 ) -> None:
     """
     Download all files in a databus collection.
@@ -375,13 +516,21 @@ def _download_collection(
     file_urls = _get_file_download_urls_from_sparql_query(
         endpoint, query, databus_key=databus_key
     )
+
+    # If checksum validation requested, attempt to build url->checksum mapping
+    checksums: dict = {}
+    if validate_checksum:
+        checksums = _resolve_checksums_for_urls(list(file_urls), databus_key)
+
     _download_files(
         list(file_urls),
         localDir,
         vault_token_file=vault_token,
         databus_key=databus_key,
         auth_url=auth_url,
         client_id=client_id,
+        validate_checksum=validate_checksum,
+        checksums=checksums if checksums else None,
     )
 
 
@@ -392,6 +541,7 @@ def _download_version(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
 ) -> None:
     """
     Download all files in a databus artifact version.
@@ -406,13 +556,22 @@ def _download_version(
     """
     json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
     file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
+    # build url -> checksum mapping from JSON-LD when available
+    checksums: dict = {}
+    try:
+        checksums = _extract_checksums_from_jsonld(json_str)
+    except Exception:
+        checksums = {}
+
     _download_files(
         file_urls,
         localDir,
         vault_token_file=vault_token_file,
         databus_key=databus_key,
         auth_url=auth_url,
         client_id=client_id,
+        validate_checksum=validate_checksum,
+        checksums=checksums,
     )
 
 
@@ -424,6 +583,7 @@ def _download_artifact(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
 ) -> None:
     """
     Download files in a databus artifact.
@@ -445,13 +605,22 @@ def _download_artifact(
         print(f"Downloading version: {version_uri}")
         json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
         file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
+        # extract checksums for this version
+        checksums: dict = {}
+        try:
+            checksums = _extract_checksums_from_jsonld(json_str)
+        except Exception:
+            checksums = {}
+
         _download_files(
             file_urls,
             localDir,
             vault_token_file=vault_token_file,
             databus_key=databus_key,
             auth_url=auth_url,
             client_id=client_id,
+            validate_checksum=validate_checksum,
+            checksums=checksums,
         )
 
 
@@ -527,6 +696,7 @@ def _download_group(
     databus_key: str = None,
     auth_url: str = None,
     client_id: str = None,
+    validate_checksum: bool = False,
 ) -> None:
     """
     Download files in a databus group.
@@ -552,6 +722,7 @@ def _download_group(
             databus_key=databus_key,
             auth_url=auth_url,
             client_id=client_id,
+            validate_checksum=validate_checksum,
         )
 
 
@@ -598,6 +769,7 @@ def download(
     all_versions=None,
     auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token",
     client_id="vault-token-exchange",
+    validate_checksum: bool = False
 ) -> None:
     """
     Download datasets from databus.
@@ -638,16 +810,34 @@ def download(
                     databus_key,
                     auth_url,
                     client_id,
+                    validate_checksum=validate_checksum,
                 )
             elif file is not None:
                 print(f"Downloading file: {databusURI}")
+                # Try to fetch expected checksum from the parent Version metadata
+                expected = None
+                if validate_checksum:
+                    try:
+                            version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}"
+                            json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
+                            checks = _extract_checksums_from_jsonld(json_str)
+                            expected = checks.get(databusURI)
+                            if expected is None:
+                                # fallback: try lookup by @id (helper already maps @id too)
+                                expected = checks.get(databusURI)
+                    except Exception as e:
+                        print(f"WARNING: Could not fetch checksum for single file: {e}")
+
+                # Call the worker to download the single file (passes expected checksum)
                 _download_file(
                     databusURI,
                     localDir,
                     vault_token_file=token,
                     databus_key=databus_key,
                     auth_url=auth_url,
                     client_id=client_id,
+                    validate_checksum=validate_checksum,
+                    expected_checksum=expected,
                 )
             elif version is not None:
                 print(f"Downloading version: {databusURI}")
@@ -658,6 +848,7 @@ def download(
                     databus_key=databus_key,
                     auth_url=auth_url,
                     client_id=client_id,
+                    validate_checksum=validate_checksum,
                 )
             elif artifact is not None:
                 print(
@@ -671,6 +862,7 @@ def download(
                     databus_key=databus_key,
                     auth_url=auth_url,
                     client_id=client_id,
+                    validate_checksum=validate_checksum,
                 )
             elif group is not None and group != "collections":
                 print(
@@ -684,6 +876,7 @@ def download(
                     databus_key=databus_key,
                     auth_url=auth_url,
                     client_id=client_id,
+                    validate_checksum=validate_checksum,
                 )
             elif account is not None:
                 print("accountId not supported yet")  # TODO
@@ -702,11 +895,21 @@ def download(
             res = _get_file_download_urls_from_sparql_query(
                 uri_endpoint, databusURI, databus_key=databus_key
             )
+
+            # If checksum validation requested, try to build url->checksum mapping
+            checksums: dict = {}
+            if validate_checksum:
+                checksums = _resolve_checksums_for_urls(res, databus_key)
+                if not checksums:
+                    print("WARNING: Checksum validation enabled but no checksums found for query results.")
+
             _download_files(
                 res,
                 localDir,
                 vault_token_file=token,
                 databus_key=databus_key,
                 auth_url=auth_url,
                 client_id=client_id,
+                validate_checksum=validate_checksum,
+                checksums=checksums if checksums else None,
             )