-
Notifications
You must be signed in to change notification settings - Fork 16
feature added: --validate checksum flag #44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
350617c
743c623
e33ab8c
dc51aa9
5875a82
e253b81
278ee5e
ab28258
18250ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,6 @@ | ||
| import json | ||
| import os | ||
| import re | ||
| from typing import List | ||
| from urllib.parse import urlparse | ||
|
|
||
|
|
@@ -12,6 +13,52 @@ | |
| get_databus_id_parts_from_file_url, | ||
| ) | ||
|
|
||
| from databusclient.api.utils import compute_sha256_and_length | ||
|
|
||
| # compiled regex for SHA-256 hex strings | ||
| _SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$") | ||
|
|
||
| def _extract_checksum_from_node(node) -> str | None: | ||
| """ | ||
| Try to extract a 64-char hex checksum from a JSON-LD file node. | ||
| Handles these common shapes: | ||
| - checksum or sha256sum fields as plain string | ||
| - checksum fields as dict with '@value' | ||
| - nested values under the allowed keys (lists or '@value' objects) | ||
| """ | ||
| def find_in_value(v): | ||
| if isinstance(v, str): | ||
| s = v.strip() | ||
| if _SHA256_RE.match(s): | ||
| return s | ||
| if isinstance(v, dict): | ||
| # common JSON-LD value object | ||
| if "@value" in v and isinstance(v["@value"], str): | ||
| res = find_in_value(v["@value"]) | ||
| if res: | ||
| return res | ||
| # try all nested dict values | ||
| for vv in v.values(): | ||
| res = find_in_value(vv) | ||
| if res: | ||
| return res | ||
| if isinstance(v, list): | ||
| for item in v: | ||
| res = find_in_value(item) | ||
| if res: | ||
| return res | ||
| return None | ||
|
|
||
| # Only inspect the explicitly allowed keys to avoid false positives. | ||
| for key in ("checksum", "sha256sum", "sha256", "databus:checksum"): | ||
| if key in node: | ||
| res = find_in_value(node[key]) | ||
| if res: | ||
| return res | ||
|
|
||
| return None | ||
|
|
||
|
|
||
|
|
||
| # Hosts that require Vault token based authentication. Central source of truth. | ||
| VAULT_REQUIRED_HOSTS = { | ||
|
|
@@ -25,13 +72,76 @@ class DownloadAuthError(Exception): | |
|
|
||
|
|
||
|
|
||
| def _extract_checksums_from_jsonld(json_str: str) -> dict: | ||
| """ | ||
| Parse a JSON-LD string and return a mapping of file URI (and @id) -> checksum. | ||
|
|
||
| Uses the existing _extract_checksum_from_node logic to extract checksums | ||
| from `Part` nodes. Both the node's `file` and `@id` (if present and a | ||
| string) are mapped to the checksum to preserve existing lookup behavior. | ||
| """ | ||
| try: | ||
| jd = json.loads(json_str) | ||
| except Exception: | ||
| return {} | ||
| graph = jd.get("@graph", []) | ||
| checksums: dict = {} | ||
| for node in graph: | ||
| if node.get("@type") == "Part": | ||
| expected = _extract_checksum_from_node(node) | ||
| if not expected: | ||
| continue | ||
| file_uri = node.get("file") | ||
| if isinstance(file_uri, str): | ||
| checksums[file_uri] = expected | ||
| node_id = node.get("@id") | ||
| if isinstance(node_id, str): | ||
| checksums[node_id] = expected | ||
| return checksums | ||
|
|
||
|
Integer-Ctrl marked this conversation as resolved.
|
||
|
|
||
| def _resolve_checksums_for_urls(file_urls: List[str], databus_key: str | None) -> dict: | ||
| """ | ||
| Group file URLs by their Version URI, fetch each Version JSON-LD once, | ||
| and return a combined url->checksum mapping for the provided URLs. | ||
|
|
||
| Best-effort: failures to fetch or parse individual versions are skipped. | ||
| """ | ||
| versions_map: dict = {} | ||
| for file_url in file_urls: | ||
| try: | ||
| host, accountId, groupId, artifactId, versionId, fileId = get_databus_id_parts_from_file_url(file_url) | ||
| except Exception: | ||
| continue | ||
| if versionId is None: | ||
| continue | ||
| if host is None or accountId is None or groupId is None or artifactId is None: | ||
| continue | ||
| version_uri = f"https://{host}/{accountId}/{groupId}/{artifactId}/{versionId}" | ||
| versions_map.setdefault(version_uri, []).append(file_url) | ||
|
|
||
| checksums: dict = {} | ||
| for version_uri, urls_in_version in versions_map.items(): | ||
| try: | ||
| json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) | ||
| extracted_checksums = _extract_checksums_from_jsonld(json_str) | ||
| for url in urls_in_version: | ||
| if url in extracted_checksums: | ||
| checksums[url] = extracted_checksums[url] | ||
| except Exception: | ||
| # Best-effort: skip versions we cannot fetch or parse | ||
| continue | ||
| return checksums | ||
|
|
||
|
Comment on lines
+109
to
+141
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Normalize URL schemes when mapping

Proposed fix (canonicalize):

+def _canonicalize_url(u: str) -> str:
+ return "https://" + u.removeprefix("http://").removeprefix("https://")
@@
for version_uri, urls_in_version in versions_map.items():
@@
extracted_checksums = _extract_checksums_from_jsonld(json_str)
for url in urls_in_version:
- if url in extracted_checksums:
- checksums[url] = extracted_checksums[url]
+ canon = _canonicalize_url(url)
+ if url in extracted_checksums:
+ checksums[url] = extracted_checksums[url]
+ elif canon in extracted_checksums:
+ checksums[url] = extracted_checksums[canon]
@@
for url in urls:
expected = None
if checksums and isinstance(checksums, dict):
- expected = checksums.get(url)
+ expected = checksums.get(url) or checksums.get(_canonicalize_url(url))

Also applies to: 341-345

🤖 Prompt for AI Agents |
||
| def _download_file( | ||
| url, | ||
| localDir, | ||
| vault_token_file=None, | ||
| databus_key=None, | ||
| auth_url=None, | ||
| client_id=None, | ||
| validate_checksum: bool = False, | ||
| expected_checksum: str | None = None, | ||
| ) -> None: | ||
| """ | ||
| Download a file from the internet with a progress bar using tqdm. | ||
|
|
@@ -183,6 +293,29 @@ def _download_file( | |
| if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: | ||
| raise IOError("Downloaded size does not match Content-Length header") | ||
|
|
||
| # --- 6. Optional checksum validation --- | ||
| if validate_checksum: | ||
| # reuse compute_sha256_and_length from webdav extension | ||
| try: | ||
| actual, _ = compute_sha256_and_length(filename) | ||
| except (OSError, IOError) as e: | ||
| print(f"WARNING: error computing checksum for {filename}: {e}") | ||
| actual = None | ||
|
|
||
| if expected_checksum is None: | ||
| print(f"WARNING: no expected checksum available for {filename}; skipping validation") | ||
| elif actual is None: | ||
| print(f"WARNING: could not compute checksum for {filename}; skipping validation") | ||
| else: | ||
| if actual.lower() != expected_checksum.lower(): | ||
| try: | ||
| os.remove(filename) # delete corrupted file | ||
| except OSError: | ||
| pass | ||
| raise IOError( | ||
| f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" | ||
| ) | ||
|
|
||
|
|
||
| def _download_files( | ||
| urls: List[str], | ||
|
|
@@ -191,6 +324,8 @@ def _download_files( | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| checksums: dict | None = None, | ||
| ) -> None: | ||
| """ | ||
| Download multiple files from the databus. | ||
|
|
@@ -204,13 +339,18 @@ def _download_files( | |
| - client_id: Client ID for token exchange | ||
| """ | ||
| for url in urls: | ||
| expected = None | ||
| if checksums and isinstance(checksums, dict): | ||
| expected = checksums.get(url) | ||
| _download_file( | ||
| url=url, | ||
| localDir=localDir, | ||
| vault_token_file=vault_token_file, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| expected_checksum=expected, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -358,6 +498,7 @@ def _download_collection( | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| ) -> None: | ||
| """ | ||
| Download all files in a databus collection. | ||
|
|
@@ -375,13 +516,21 @@ def _download_collection( | |
| file_urls = _get_file_download_urls_from_sparql_query( | ||
| endpoint, query, databus_key=databus_key | ||
| ) | ||
|
|
||
| # If checksum validation requested, attempt to build url->checksum mapping | ||
| checksums: dict = {} | ||
| if validate_checksum: | ||
| checksums = _resolve_checksums_for_urls(list(file_urls), databus_key) | ||
|
|
||
| _download_files( | ||
| list(file_urls), | ||
| localDir, | ||
| vault_token_file=vault_token, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| checksums=checksums if checksums else None, | ||
| ) | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
|
|
||
|
|
@@ -392,6 +541,7 @@ def _download_version( | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| ) -> None: | ||
| """ | ||
| Download all files in a databus artifact version. | ||
|
|
@@ -406,13 +556,22 @@ def _download_version( | |
| """ | ||
| json_str = fetch_databus_jsonld(uri, databus_key=databus_key) | ||
| file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) | ||
| # build url -> checksum mapping from JSON-LD when available | ||
| checksums: dict = {} | ||
| try: | ||
| checksums = _extract_checksums_from_jsonld(json_str) | ||
| except Exception: | ||
| checksums = {} | ||
|
|
||
| _download_files( | ||
| file_urls, | ||
| localDir, | ||
| vault_token_file=vault_token_file, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| checksums=checksums, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -424,6 +583,7 @@ def _download_artifact( | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| ) -> None: | ||
| """ | ||
| Download files in a databus artifact. | ||
|
|
@@ -445,13 +605,22 @@ def _download_artifact( | |
| print(f"Downloading version: {version_uri}") | ||
| json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) | ||
| file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) | ||
| # extract checksums for this version | ||
| checksums: dict = {} | ||
| try: | ||
| checksums = _extract_checksums_from_jsonld(json_str) | ||
| except Exception: | ||
| checksums = {} | ||
|
|
||
| _download_files( | ||
| file_urls, | ||
| localDir, | ||
| vault_token_file=vault_token_file, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| checksums=checksums, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -527,6 +696,7 @@ def _download_group( | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| ) -> None: | ||
| """ | ||
| Download files in a databus group. | ||
|
|
@@ -552,6 +722,7 @@ def _download_group( | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -598,6 +769,7 @@ def download( | |
| all_versions=None, | ||
| auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", | ||
| client_id="vault-token-exchange", | ||
| validate_checksum: bool = False | ||
| ) -> None: | ||
| """ | ||
| Download datasets from databus. | ||
|
|
@@ -638,16 +810,34 @@ def download( | |
| databus_key, | ||
| auth_url, | ||
| client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif file is not None: | ||
| print(f"Downloading file: {databusURI}") | ||
| # Try to fetch expected checksum from the parent Version metadata | ||
| expected = None | ||
| if validate_checksum: | ||
| try: | ||
| version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}" | ||
| json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) | ||
| checks = _extract_checksums_from_jsonld(json_str) | ||
| expected = checks.get(databusURI) | ||
| if expected is None: | ||
| # fallback: try lookup by @id (helper already maps @id too) | ||
| expected = checks.get(databusURI) | ||
| except Exception as e: | ||
| print(f"WARNING: Could not fetch checksum for single file: {e}") | ||
|
|
||
| # Call the worker to download the single file (passes expected checksum) | ||
| _download_file( | ||
| databusURI, | ||
| localDir, | ||
| vault_token_file=token, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| expected_checksum=expected, | ||
| ) | ||
|
coderabbitai[bot] marked this conversation as resolved.
Comment on lines
+823
to
847
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: # First, let's explore the repository structure
find . -type f -name "*.py" -o -name "*.json" -o -name "*.jsonld" | head -20Repository: dbpedia/databus-python-client Length of output: 498 🏁 Script executed: # Look at the download.py file around the lines in question
sed -n '820,850p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 1635 🏁 Script executed: # Search for how URIs are used and formatted in the codebase
rg "databusURI|@id|file.*http" --type py -A 2 -B 2 | head -50Repository: dbpedia/databus-python-client Length of output: 2944 🏁 Script executed: # Look for test fixtures with JSON-LD examples
find . -type f \( -name "*.json" -o -name "*.jsonld" -o -name "*fixture*" -o -name "*test*data*" \) | grep -E "\.(json|jsonld)$|fixture|test_data" | head -20Repository: dbpedia/databus-python-client Length of output: 55 🏁 Script executed: # Check for documentation about URI formatting
find . -type f \( -name "*.md" -o -name "*.rst" -o -name "*.txt" \) | xargs grep -l "URI\|http\|checksum" 2>/dev/null | head -10Repository: dbpedia/databus-python-client Length of output: 119 🏁 Script executed: # Look at _extract_checksums_from_jsonld function in detail
sed -n '780,810p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 1693 🏁 Script executed: # Check test files for actual JSON-LD examples with checksums
rg "checksum|sha256" tests/ --type py -A 3 -B 3 | head -60Repository: dbpedia/databus-python-client Length of output: 1821 🏁 Script executed: # Search for any code that might pass http:// URIs vs https://
rg "http://" databusclient/ --type py -B 2 -A 2Repository: dbpedia/databus-python-client Length of output: 2041 🏁 Script executed: # Look at the entire Part node extraction logic to understand URI handling
sed -n '760,820p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2558 🏁 Script executed: # Check if there are test cases that would catch protocol mismatches
rg "_extract_checksums" tests/ -A 10 -B 5Repository: dbpedia/databus-python-client Length of output: 55 🏁 Script executed: # Look at the complete _extract_checksums_from_jsonld function
sed -n '750,820p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2844 🏁 Script executed: # Check if there are any tests that test URI protocol mismatches
rg "http://" tests/ --type py -B 5 -A 5Repository: dbpedia/databus-python-client Length of output: 657 🏁 Script executed: # Look for test cases that specifically test checksum validation
rg "validate_checksum|expected_checksum" tests/ --type py -B 5 -A 5Repository: dbpedia/databus-python-client Length of output: 55 🏁 Script executed: # Check what the actual URIs passed to download look like in practice/tests
rg "download\(" tests/ --type py -B 2 -A 2 | head -40Repository: dbpedia/databus-python-client Length of output: 775 🏁 Script executed: # Look at line 539 mentioned in the original scratchpad to see if there's a pattern
sed -n '535,545p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 322 🏁 Script executed: # Look at the complete _extract_checksums_from_jsonld function definition
sed -n '680,750p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2371 🏁 Script executed: # Check if there are any comments explaining the protocol handling
grep -n "http\|protocol" databusclient/api/download.py | head -20Repository: dbpedia/databus-python-client Length of output: 985 🏁 Script executed: # Look for version of the code to understand context better
sed -n '1,50p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 1573 🏁 Script executed: # Find the _extract_checksums_from_jsonld function
sed -n '600,680p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 3016 🏁 Script executed: # Check if there's any documentation about URI normalization needs
grep -rn "removeprefix\|protocol" databusclient/ --type pyRepository: dbpedia/databus-python-client Length of output: 190 🏁 Script executed: # Look at what URIs are passed into the checksum extraction
sed -n '690,750p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2115 🏁 Script executed: # Find the complete _extract_checksums_from_jsonld function
rg "_extract_checksums_from_jsonld" databusclient/api/download.py -A 30Repository: dbpedia/databus-python-client Length of output: 5363 🏁 Script executed: # Check utils.py to understand how URIs are handled
cat databusclient/api/utils.py
Repository: dbpedia/databus-python-client
Length of output: 1861

Add inline documentation explaining the protocol normalization workaround.

Lines 831-833 normalize the URL scheme:

expected = checks.get(databusURI) or checks.get(
    # Fallback: normalize to https in case metadata uses different protocol
    "https://" + databusURI.removeprefix("http://").removeprefix("https://")
)

🤖 Prompt for AI Agents |
||
| elif version is not None: | ||
| print(f"Downloading version: {databusURI}") | ||
|
|
@@ -658,6 +848,7 @@ def download( | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif artifact is not None: | ||
| print( | ||
|
|
@@ -671,6 +862,7 @@ def download( | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif group is not None and group != "collections": | ||
| print( | ||
|
|
@@ -684,6 +876,7 @@ def download( | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif account is not None: | ||
| print("accountId not supported yet") # TODO | ||
|
|
@@ -702,11 +895,21 @@ def download( | |
| res = _get_file_download_urls_from_sparql_query( | ||
| uri_endpoint, databusURI, databus_key=databus_key | ||
| ) | ||
|
|
||
| # If checksum validation requested, try to build url->checksum mapping | ||
| checksums: dict = {} | ||
| if validate_checksum: | ||
| checksums = _resolve_checksums_for_urls(res, databus_key) | ||
| if not checksums: | ||
| print("WARNING: Checksum validation enabled but no checksums found for query results.") | ||
|
|
||
| _download_files( | ||
| res, | ||
| localDir, | ||
| vault_token_file=token, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| checksums=checksums if checksums else None, | ||
| ) | ||
Uh oh!
There was an error while loading. Please reload this page.