-
Notifications
You must be signed in to change notification settings - Fork 16
feature added: --validate checksum flag #44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
350617c
743c623
e33ab8c
dc51aa9
5875a82
e253b81
278ee5e
ab28258
18250ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,54 @@ | |
| get_databus_id_parts_from_file_url, | ||
| ) | ||
|
|
||
| from databusclient.extensions.webdav import compute_sha256_and_length | ||
|
|
||
| def _extract_checksum_from_node(node) -> str | None: | ||
|
Integer-Ctrl marked this conversation as resolved.
|
||
| """ | ||
| Try to extract a 64-char hex checksum from a JSON-LD file node. | ||
| Handles these common shapes: | ||
| - checksum or sha256sum fields as plain string | ||
| - checksum fields as dict with '@value' | ||
| - nested values (recursively search strings for a 64-char hex) | ||
| """ | ||
| def find_in_value(v): | ||
| if isinstance(v, str): | ||
| s = v.strip() | ||
| if len(s) == 64 and all(c in "0123456789abcdefABCDEF" for c in s): | ||
| return s | ||
| if isinstance(v, dict): | ||
| # common JSON-LD value object | ||
| if "@value" in v and isinstance(v["@value"], str): | ||
| res = find_in_value(v["@value"]) | ||
| if res: | ||
| return res | ||
| # try all nested dict values | ||
| for vv in v.values(): | ||
| res = find_in_value(vv) | ||
| if res: | ||
| return res | ||
| if isinstance(v, list): | ||
| for item in v: | ||
| res = find_in_value(item) | ||
| if res: | ||
| return res | ||
| return None | ||
|
|
||
| # direct keys to try first | ||
| for key in ("checksum", "sha256sum", "sha256", "databus:checksum"): | ||
| if key in node: | ||
| res = find_in_value(node[key]) | ||
| if res: | ||
| return res | ||
|
|
||
| # fallback: search all values recursively for a 64-char hex string | ||
| for v in node.values(): | ||
| res = find_in_value(v) | ||
| if res: | ||
| return res | ||
| return None | ||
|
|
||
|
|
||
|
|
||
| # Hosts that require Vault token based authentication. Central source of truth. | ||
| VAULT_REQUIRED_HOSTS = { | ||
|
|
@@ -32,6 +80,8 @@ | |
| databus_key=None, | ||
| auth_url=None, | ||
| client_id=None, | ||
| validate_checksum: bool = False, | ||
| expected_checksum: str | None = None, | ||
| ) -> None: | ||
| """ | ||
| Download a file from the internet with a progress bar using tqdm. | ||
|
|
@@ -183,6 +233,27 @@ | |
| if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: | ||
| raise IOError("Downloaded size does not match Content-Length header") | ||
|
|
||
| # --- 6. Optional checksum validation --- | ||
| if validate_checksum: | ||
| # reuse compute_sha256_and_length from webdav extension | ||
| try: | ||
| actual, _ = compute_sha256_and_length(filename) | ||
| except (OSError, IOError) as e: | ||
| print(f"WARNING: error computing checksum for {filename}: {e}") | ||
| actual = None | ||
|
|
||
| if expected_checksum is None: | ||
| print(f"WARNING: no expected checksum available for {filename}; skipping validation") | ||
| elif actual is None: | ||
| print(f"WARNING: could not compute checksum for {filename}; skipping validation") | ||
| else: | ||
| if actual.lower() != expected_checksum.lower(): | ||
| try: os.remove(filename) # delete corrupted file | ||
| except OSError: pass | ||
| raise IOError( | ||
| f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" | ||
| ) | ||
|
|
||
|
|
||
| def _download_files( | ||
| urls: List[str], | ||
|
|
@@ -191,6 +262,8 @@ | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| checksums: dict | None = None, | ||
| ) -> None: | ||
| """ | ||
| Download multiple files from the databus. | ||
|
|
@@ -204,13 +277,18 @@ | |
| - client_id: Client ID for token exchange | ||
| """ | ||
| for url in urls: | ||
| expected = None | ||
| if checksums and isinstance(checksums, dict): | ||
| expected = checksums.get(url) | ||
| _download_file( | ||
| url=url, | ||
| localDir=localDir, | ||
| vault_token_file=vault_token_file, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| expected_checksum=expected, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -358,6 +436,7 @@ | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| ) -> None: | ||
| """ | ||
| Download all files in a databus collection. | ||
|
|
@@ -375,13 +454,53 @@ | |
| file_urls = _get_file_download_urls_from_sparql_query( | ||
| endpoint, query, databus_key=databus_key | ||
| ) | ||
|
|
||
| # If checksum validation requested, attempt to build url->checksum mapping | ||
| # by fetching the Version JSON-LD for each file's version. We group files | ||
| # by their version URI to avoid fetching the same metadata repeatedly. | ||
| checksums: dict = {} | ||
| if validate_checksum: | ||
| # Map version_uri -> list of file urls | ||
| versions_map: dict = {} | ||
| for fu in file_urls: | ||
| try: | ||
| h, acc, grp, art, ver, f = get_databus_id_parts_from_file_url(fu) | ||
| except Exception: | ||
| continue | ||
| if ver is None: | ||
| continue | ||
| if h is None or acc is None or grp is None or art is None: | ||
|
Integer-Ctrl marked this conversation as resolved.
Outdated
|
||
| continue | ||
| version_uri = f"https://{h}/{acc}/{grp}/{art}/{ver}" | ||
| versions_map.setdefault(version_uri, []).append(fu) | ||
|
|
||
| # Fetch each version's JSON-LD once and extract checksums for its files | ||
| for version_uri, urls_in_version in versions_map.items(): | ||
| try: | ||
| json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) | ||
| jd = json.loads(json_str) | ||
| graph = jd.get("@graph", []) | ||
| for node in graph: | ||
| if node.get("@type") == "Part": | ||
| file_uri = node.get("file") | ||
| if not isinstance(file_uri, str): | ||
| continue | ||
| expected = _extract_checksum_from_node(node) | ||
| if expected and file_uri in urls_in_version: | ||
| checksums[file_uri] = expected | ||
| except Exception: | ||
| # Best-effort: if fetching a version fails, skip it | ||
| continue | ||
|
|
||
| _download_files( | ||
| list(file_urls), | ||
| localDir, | ||
| vault_token_file=vault_token, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| checksums=checksums if checksums else None, | ||
| ) | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
|
|
||
|
|
@@ -392,6 +511,7 @@ | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| ) -> None: | ||
| """ | ||
| Download all files in a databus artifact version. | ||
|
|
@@ -406,13 +526,31 @@ | |
| """ | ||
| json_str = fetch_databus_jsonld(uri, databus_key=databus_key) | ||
| file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) | ||
| # build url -> checksum mapping from JSON-LD when available | ||
| checksums: dict = {} | ||
| try: | ||
| json_dict = json.loads(json_str) | ||
| graph = json_dict.get("@graph", []) | ||
| for node in graph: | ||
| if node.get("@type") == "Part": | ||
| file_uri = node.get("file") | ||
| if not isinstance(file_uri, str): | ||
| continue | ||
| expected = _extract_checksum_from_node(node) | ||
| if expected: | ||
| checksums[file_uri] = expected | ||
| except Exception: | ||
| checksums = {} | ||
|
|
||
| _download_files( | ||
| file_urls, | ||
| localDir, | ||
| vault_token_file=vault_token_file, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| checksums=checksums, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -424,6 +562,7 @@ | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| ) -> None: | ||
| """ | ||
| Download files in a databus artifact. | ||
|
|
@@ -445,13 +584,31 @@ | |
| print(f"Downloading version: {version_uri}") | ||
| json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) | ||
| file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) | ||
| # extract checksums for this version | ||
| checksums: dict = {} | ||
| try: | ||
| jd = json.loads(json_str) | ||
| graph = jd.get("@graph", []) | ||
| for node in graph: | ||
| if node.get("@type") == "Part": | ||
| file_uri = node.get("file") | ||
| if not isinstance(file_uri, str): | ||
| continue | ||
| expected = _extract_checksum_from_node(node) | ||
| if expected: | ||
| checksums[file_uri] = expected | ||
| except Exception: | ||
| checksums = {} | ||
|
|
||
| _download_files( | ||
| file_urls, | ||
| localDir, | ||
| vault_token_file=vault_token_file, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| checksums=checksums, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -527,6 +684,7 @@ | |
| databus_key: str = None, | ||
| auth_url: str = None, | ||
| client_id: str = None, | ||
| validate_checksum: bool = False, | ||
| ) -> None: | ||
| """ | ||
| Download files in a databus group. | ||
|
|
@@ -552,6 +710,7 @@ | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
|
|
||
|
|
||
|
|
@@ -598,6 +757,7 @@ | |
| all_versions=None, | ||
| auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", | ||
| client_id="vault-token-exchange", | ||
| validate_checksum: bool = False | ||
| ) -> None: | ||
| """ | ||
| Download datasets from databus. | ||
|
|
@@ -638,16 +798,36 @@ | |
| databus_key, | ||
| auth_url, | ||
| client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif file is not None: | ||
| print(f"Downloading file: {databusURI}") | ||
| # Try to fetch expected checksum from the parent Version metadata | ||
| expected = None | ||
| if validate_checksum: | ||
| try: | ||
| version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}" | ||
| json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) | ||
| json_dict = json.loads(json_str) | ||
| graph = json_dict.get("@graph", []) | ||
| for node in graph: | ||
| if node.get("file") == databusURI or node.get("@id") == databusURI: | ||
| expected = _extract_checksum_from_node(node) | ||
| if expected: | ||
| break | ||
| except Exception as e: | ||
| print(f"WARNING: Could not fetch checksum for single file: {e}") | ||
|
|
||
| # Call the worker to download the single file (passes expected checksum) | ||
| _download_file( | ||
| databusURI, | ||
| localDir, | ||
| vault_token_file=token, | ||
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| expected_checksum=expected, | ||
| ) | ||
|
coderabbitai[bot] marked this conversation as resolved.
Comment on lines
+823
to
847
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
# First, let's explore the repository structure
find . -type f -name "*.py" -o -name "*.json" -o -name "*.jsonld" | head -20Repository: dbpedia/databus-python-client Length of output: 498 🏁 Script executed: # Look at the download.py file around the lines in question
sed -n '820,850p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 1635 🏁 Script executed: # Search for how URIs are used and formatted in the codebase
rg "databusURI|@id|file.*http" --type py -A 2 -B 2 | head -50Repository: dbpedia/databus-python-client Length of output: 2944 🏁 Script executed: # Look for test fixtures with JSON-LD examples
find . -type f \( -name "*.json" -o -name "*.jsonld" -o -name "*fixture*" -o -name "*test*data*" \) | grep -E "\.(json|jsonld)$|fixture|test_data" | head -20Repository: dbpedia/databus-python-client Length of output: 55 🏁 Script executed: # Check for documentation about URI formatting
find . -type f \( -name "*.md" -o -name "*.rst" -o -name "*.txt" \) | xargs grep -l "URI\|http\|checksum" 2>/dev/null | head -10Repository: dbpedia/databus-python-client Length of output: 119 🏁 Script executed: # Look at _extract_checksums_from_jsonld function in detail
sed -n '780,810p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 1693 🏁 Script executed: # Check test files for actual JSON-LD examples with checksums
rg "checksum|sha256" tests/ --type py -A 3 -B 3 | head -60Repository: dbpedia/databus-python-client Length of output: 1821 🏁 Script executed: # Search for any code that might pass http:// URIs vs https://
rg "http://" databusclient/ --type py -B 2 -A 2Repository: dbpedia/databus-python-client Length of output: 2041 🏁 Script executed: # Look at the entire Part node extraction logic to understand URI handling
sed -n '760,820p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2558 🏁 Script executed: # Check if there are test cases that would catch protocol mismatches
rg "_extract_checksums" tests/ -A 10 -B 5Repository: dbpedia/databus-python-client Length of output: 55 🏁 Script executed: # Look at the complete _extract_checksums_from_jsonld function
sed -n '750,820p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2844 🏁 Script executed: # Check if there are any tests that test URI protocol mismatches
rg "http://" tests/ --type py -B 5 -A 5Repository: dbpedia/databus-python-client Length of output: 657 🏁 Script executed: # Look for test cases that specifically test checksum validation
rg "validate_checksum|expected_checksum" tests/ --type py -B 5 -A 5Repository: dbpedia/databus-python-client Length of output: 55 🏁 Script executed: # Check what the actual URIs passed to download look like in practice/tests
rg "download\(" tests/ --type py -B 2 -A 2 | head -40Repository: dbpedia/databus-python-client Length of output: 775 🏁 Script executed: # Look at line 539 mentioned in the original scratchpad to see if there's a pattern
sed -n '535,545p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 322 🏁 Script executed: # Look at the complete _extract_checksums_from_jsonld function definition
sed -n '680,750p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2371 🏁 Script executed: # Check if there are any comments explaining the protocol handling
grep -n "http\|protocol" databusclient/api/download.py | head -20Repository: dbpedia/databus-python-client Length of output: 985 🏁 Script executed: # Look for version of the code to understand context better
sed -n '1,50p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 1573 🏁 Script executed: # Find the _extract_checksums_from_jsonld function
sed -n '600,680p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 3016 🏁 Script executed: # Check if there's any documentation about URI normalization needs
grep -rn "removeprefix\|protocol" databusclient/ --type pyRepository: dbpedia/databus-python-client Length of output: 190 🏁 Script executed: # Look at what URIs are passed into the checksum extraction
sed -n '690,750p' databusclient/api/download.pyRepository: dbpedia/databus-python-client Length of output: 2115 🏁 Script executed: # Find the complete _extract_checksums_from_jsonld function
rg "_extract_checksums_from_jsonld" databusclient/api/download.py -A 30Repository: dbpedia/databus-python-client Length of output: 5363 🏁 Script executed: # Check utils.py to understand how URIs are handled
cat databusclient/api/utils.py
Repository: dbpedia/databus-python-client Length of output: 1861

Add inline documentation explaining the protocol normalization workaround. Lines 831-833 normalize the URI protocol before looking up the checksum:

expected = checks.get(databusURI) or checks.get(
    # Fallback: normalize to https in case metadata uses different protocol
    "https://" + databusURI.removeprefix("http://").removeprefix("https://")
)

🤖 Prompt for AI Agents |
||
| elif version is not None: | ||
| print(f"Downloading version: {databusURI}") | ||
|
|
@@ -658,6 +838,7 @@ | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif artifact is not None: | ||
| print( | ||
|
|
@@ -671,6 +852,7 @@ | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif group is not None and group != "collections": | ||
| print( | ||
|
|
@@ -684,6 +866,7 @@ | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
| elif account is not None: | ||
| print("accountId not supported yet") # TODO | ||
|
|
@@ -697,6 +880,8 @@ | |
| # query as argument | ||
| else: | ||
| print("QUERY {}", databusURI.replace("\n", " ")) | ||
| if validate_checksum: | ||
| print("WARNING: Checksum validation is not supported for user-defined SPARQL queries.") | ||
|
Integer-Ctrl marked this conversation as resolved.
Outdated
|
||
| if uri_endpoint is None: # endpoint is required for queries (--databus) | ||
| raise ValueError("No endpoint given for query") | ||
| res = _get_file_download_urls_from_sparql_query( | ||
|
|
@@ -709,4 +894,5 @@ | |
| databus_key=databus_key, | ||
| auth_url=auth_url, | ||
| client_id=client_id, | ||
| validate_checksum=validate_checksum, | ||
| ) | ||
Uh oh!
There was an error while loading. Please reload this page.