Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD
- Optional filter to specify which source compression format should be converted. Use with `--convert-to` to convert only files with a specific compression format. Example: `--convert-to gz --convert-from bz2` converts only `.bz2` files to `.gz`, leaving other formats unchanged.
- `--validate-checksum`
- Validates the checksums of downloaded files against the checksums provided by the Databus. If a checksum does not match, an error is raised and the file is deleted.
- **Filters (Pipe syntax)**
- You can filter files within a version/artifact/group using a pipe-separated syntax: `$URI|filter1|filter2`.
- Content variants: `key=value` (e.g. `lang=en`) or just `value` (e.g. `en`) to match any variant.
- Format: `.extension` (e.g. `.ttl`).
- Compression: `..compression` (e.g. `..gz`).
- Example: `databusclient download "https://.../version|lang=en|.ttl|..gz"`

**Help and further information on download command:**
```bash
Expand Down Expand Up @@ -337,6 +343,7 @@ Options:
https://cloud.example.com/remote.php/webdav)
--remote TEXT rclone remote name (e.g., 'nextcloud')
--path TEXT Remote path on Nextcloud (e.g., 'datasets/mydataset')
--dry-run Generate and print the JSON-LD without deploying (preview only)
--help Show this message and exit.
```

Expand Down
144 changes: 114 additions & 30 deletions databusclient/api/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,22 +752,24 @@ def _download_version(
convert_to: str = None,
convert_from: str = None,
validate_checksum: bool = False,
filters: List[str] = None,
) -> None:
"""Download all files in a databus artifact version.
"""Download matching files in a databus artifact version.

Args:
uri: The full databus artifact version URI.
localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory.
vault_token_file: Path to Vault refresh token file for protected downloads.
databus_key: Databus API key for protected downloads.
uri: The full databus artifact version URI (base URI without filters).
localDir: Local directory to download files to.
vault_token_file: Path to Vault refresh token file.
databus_key: Databus API key.
auth_url: Keycloak token endpoint URL.
client_id: Client ID for token exchange.
convert_to: Target compression format for on-the-fly conversion.
convert_to: Target compression format.
convert_from: Optional source compression format filter.
validate_checksum: Whether to validate checksums after downloading.
validate_checksum: Whether to validate checksums.
filters: Optional list of filters (content variants, format, compression).
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str, filters=filters)
# build url -> checksum mapping from JSON-LD when available
checksums: dict = {}
try:
Expand Down Expand Up @@ -800,20 +802,22 @@ def _download_artifact(
convert_to: str = None,
convert_from: str = None,
validate_checksum: bool = False,
filters: List[str] = None,
) -> None:
"""Download files in a databus artifact.

Args:
uri: The full databus artifact URI.
localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory.
all_versions: If True, download all versions of the artifact; otherwise, only download the latest version.
vault_token_file: Path to Vault refresh token file for protected downloads.
databus_key: Databus API key for protected downloads.
localDir: Local directory to download files to.
all_versions: If True, download all versions; otherwise, only latest.
vault_token_file: Path to Vault refresh token file.
databus_key: Databus API key.
auth_url: Keycloak token endpoint URL.
client_id: Client ID for token exchange.
convert_to: Target compression format for on-the-fly conversion.
convert_to: Target compression format.
convert_from: Optional source compression format filter.
validate_checksum: Whether to validate checksums after downloading.
validate_checksum: Whether to validate checksums.
filters: Optional list of filters to apply to each version's files.
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
versions = _get_databus_versions_of_artifact(json_str, all_versions=all_versions)
Expand All @@ -822,7 +826,9 @@ def _download_artifact(
for version_uri in versions:
print(f"Downloading version: {version_uri}")
json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key)
file_urls = _get_file_download_urls_from_artifact_jsonld(json_str)
file_urls = _get_file_download_urls_from_artifact_jsonld(
json_str, filters=filters
)
# extract checksums for this version
checksums: dict = {}
try:
Expand Down Expand Up @@ -882,14 +888,73 @@ def _get_databus_versions_of_artifact(
return version_urls[0]


def _get_file_download_urls_from_artifact_jsonld(json_str: str) -> List[str]:
def _matches_filters(node: dict, filters: List[str]) -> bool:
"""Check if a JSON-LD node matches the given filters.

Filters can be:
- .extension (e.g. .ttl)
- ..compression (e.g. ..gz)
- key=value (content variant)
- value (match any content variant value)
"""
if not filters:
return True

for f in filters:
if f.startswith(".."):
# Compression filter
expected = f[2:].lower()
actual = str(node.get("compression", "")).lower()
if actual != expected:
return False
elif f.startswith("."):
# Format extension filter
expected = f[1:].lower()
actual = str(node.get("formatExtension", "")).lower()
if actual != expected:
return False
Comment on lines +903 to +915
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Fall back to the file name when formatExtension or compression is missing.

file_format and compression are optional in published metadata, so .ttl / ..gz currently fail on otherwise valid Part nodes that only expose file. That makes the new filter syntax silently skip matching files on those datasets.

💡 Localized fix
     for f in filters:
         if f.startswith(".."):
             # Compression filter
             expected = f[2:].lower()
-            actual = str(node.get("compression", "")).lower()
+            actual = str(node.get("compression", "")).lower()
+            if not actual:
+                actual = _detect_compression_format(str(node.get("file", ""))) or ""
             if actual != expected:
                 return False
         elif f.startswith("."):
             # Format extension filter
             expected = f[1:].lower()
-            actual = str(node.get("formatExtension", "")).lower()
+            actual = str(node.get("formatExtension", "")).lower()
+            if not actual:
+                path = urlparse(str(node.get("file", ""))).path
+                basename = os.path.basename(path).lower()
+                basename = re.sub(r"\.(bz2|gz|xz)$", "", basename)
+                actual = basename.rsplit(".", 1)[-1] if "." in basename else ""
             if actual != expected:
                 return False
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@databusclient/api/download.py` around lines 903 - 915, The filter logic in
the for-loop that checks f.startswith(".") and f.startswith("..") currently
reads formatExtension and compression from node.get(...) only, causing valid
Part nodes with only a "file" field to fail; update the checks in that loop to
fall back to extracting the extension and compression from node["file"] when
node.get("formatExtension") or node.get("compression") are empty—use the file
name (node.get("file") or node["file"]) to derive the format extension (e.g.,
the suffix after the last '.') and the compression (e.g., a trailing
.gz/.bz2/etc.), then compare those derived values (lowercased) against expected
in the existing f.startswith(".") and f.startswith("..") branches so the filters
match when metadata fields are absent.

elif "=" in f:
# Specific content variant key=value
key, val = f.split("=", 1)
# Try various common prefixes
actual = None
for prefix in ["dcv:", "dataid-cv:", ""]:
potential_val = node.get(f"{prefix}{key}")
if potential_val is not None:
if isinstance(potential_val, dict):
actual = potential_val.get("@value")
else:
actual = potential_val
break
if str(actual) != val:
return False
else:
# Match any content variant value
found = False
for k, v in node.items():
if k.startswith("dcv:") or k.startswith("dataid-cv:"):
actual_val = v
if isinstance(v, dict):
actual_val = v.get("@value")
if str(actual_val) == f:
found = True
break
if not found:
return False
return True


def _get_file_download_urls_from_artifact_jsonld(
json_str: str, filters: List[str] = None
) -> List[str]:
"""Parse the JSON-LD of a databus artifact version to extract download URLs.

Args:
json_str: JSON-LD string of the databus artifact version.
filters: Optional list of filters to apply to the files.

Returns:
List of all file download URLs in the artifact version.
List of matching file download URLs in the artifact version.
"""

databusIdUrl: List[str] = []
Expand All @@ -898,6 +963,9 @@ def _get_file_download_urls_from_artifact_jsonld(json_str: str) -> List[str]:
graph = json_dict.get("@graph", [])
for node in graph:
if node.get("@type") == "Part":
if not _matches_filters(node, filters):
continue

file_uri = node.get("file")
if not isinstance(file_uri, str):
continue
Expand All @@ -916,20 +984,22 @@ def _download_group(
convert_to: str = None,
convert_from: str = None,
validate_checksum: bool = False,
filters: List[str] = None,
) -> None:
"""Download files in a databus group.

Args:
uri: The full databus group URI.
localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory.
all_versions: If True, download all versions of each artifact in the group; otherwise, only download the latest version.
vault_token_file: Path to Vault refresh token file for protected downloads.
databus_key: Databus API key for protected downloads.
localDir: Local directory to download files to.
all_versions: If True, download all versions; otherwise, only latest.
vault_token_file: Path to Vault refresh token file.
databus_key: Databus API key.
auth_url: Keycloak token endpoint URL.
client_id: Client ID for token exchange.
convert_to: Target compression format for on-the-fly conversion.
convert_to: Target compression format.
convert_from: Optional source compression format filter.
validate_checksum: Whether to validate checksums after downloading.
validate_checksum: Whether to validate checksums.
filters: Optional list of filters to apply to each file.
"""
json_str = fetch_databus_jsonld(uri, databus_key=databus_key)
artifacts = _get_databus_artifacts_of_group(json_str)
Expand All @@ -946,6 +1016,7 @@ def _download_group(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)


Expand Down Expand Up @@ -1013,8 +1084,18 @@ def download(
validate_checksum: Whether to validate checksums after downloading.
"""
for databusURI in databusURIs:
# Support pipe-separated filters for version/artifact/group URIs
# Syntax: https://.../version|key1=val1|.format|..compression
filters = []
base_uri = databusURI
if databusURI.startswith("http://") or databusURI.startswith("https://"):
if "|" in databusURI:
parts = databusURI.split("|")
base_uri = parts[0]
filters = parts[1:]

host, account, group, artifact, version, file = (
get_databus_id_parts_from_file_url(databusURI)
get_databus_id_parts_from_file_url(base_uri)
)

# Determine endpoint per-URI if not explicitly provided
Expand Down Expand Up @@ -1064,9 +1145,9 @@ def download(
expected_checksum=expected,
)
elif version is not None:
print(f"Downloading version: {databusURI}")
print(f"Downloading version: {base_uri}")
_download_version(
databusURI,
base_uri,
localDir,
vault_token_file=token,
databus_key=databus_key,
Expand All @@ -1075,13 +1156,14 @@ def download(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)
elif artifact is not None:
print(
f"Downloading {'all' if all_versions else 'latest'} version(s) of artifact: {databusURI}"
f"Downloading {'all' if all_versions else 'latest'} version(s) of artifact: {base_uri}"
)
_download_artifact(
databusURI,
base_uri,
localDir,
all_versions=all_versions,
vault_token_file=token,
Expand All @@ -1091,13 +1173,14 @@ def download(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)
elif group is not None and group != "collections":
print(
f"Downloading group and all its artifacts and versions: {databusURI}"
f"Downloading group and all its artifacts and versions: {base_uri}"
)
_download_group(
databusURI,
base_uri,
localDir,
all_versions=all_versions,
vault_token_file=token,
Expand All @@ -1107,6 +1190,7 @@ def download(
convert_to=convert_to,
convert_from=convert_from,
validate_checksum=validate_checksum,
filters=filters,
)
elif account is not None:
print("accountId not supported yet") # TODO
Expand Down
51 changes: 50 additions & 1 deletion databusclient/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ def app():
)
@click.option("--remote", help="rclone remote name (e.g., 'nextcloud')")
@click.option("--path", help="Remote path on Nextcloud (e.g., 'datasets/mydataset')")
@click.option(
"--dry-run", is_flag=True, help="Generate and print JSON-LD without deploying"
)
@click.argument("distributions", nargs=-1)
def deploy(
version_id,
Expand All @@ -73,6 +76,7 @@ def deploy(
webdav_url,
remote,
path,
dry_run,
distributions: List[str],
):
"""
Expand Down Expand Up @@ -105,6 +109,12 @@ def deploy(
license_url=license_url,
distributions=distributions,
)

if dry_run:
click.echo("[DRY-RUN] Generated DataID JSON-LD:")
click.echo(json.dumps(dataid, indent=2))
return

api_deploy.deploy(dataid=dataid, api_key=apikey)
return

Expand All @@ -113,6 +123,21 @@ def deploy(
click.echo(f"[MODE] Deploy from metadata file: {metadata_file}")
with open(metadata_file, "r") as f:
metadata = json.load(f)

if dry_run:
click.echo("[DRY-RUN] Would deploy from metadata file")
# We could still generate the full DataID here to show it
dataid = api_deploy.create_dataset(
version_id=version_id,
artifact_version_title=title,
artifact_version_abstract=abstract,
artifact_version_description=description,
license_url=license_url,
distributions=api_deploy._create_distributions_from_metadata(metadata),
)
click.echo(json.dumps(dataid, indent=2))
return

api_deploy.deploy_from_metadata(
metadata, version_id, title, abstract, description, license_url, apikey
)
Expand All @@ -134,7 +159,17 @@ def deploy(

click.echo("[MODE] Upload & Deploy to DBpedia Databus via Nextcloud")
click.echo(f"→ Uploading to: {remote}:{path}")
metadata = webdav.upload_to_webdav(distributions, remote, path, webdav_url)
if dry_run:
click.echo("[DRY-RUN] Skipping WebDAV upload")
metadata = []
else:
metadata = webdav.upload_to_webdav(distributions, remote, path, webdav_url)

if dry_run:
click.echo("[DRY-RUN] Generated metadata (partial):")
click.echo(json.dumps(metadata, indent=2))
return
Comment on lines +162 to +171
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

--dry-run in WebDAV mode never builds the DataID preview.

This branch returns after printing [], so it skips the same dataset-construction path that real metadata deploys use. The result is that deploy --dry-run --webdav-url ... does not actually simulate the deploy or surface metadata/DataID errors until a real run.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@databusclient/cli.py` around lines 162 - 171, The current dry-run branch
returns before building the dataset/DataID preview so `--dry-run --webdav-url`
prints an empty list; fix by ensuring metadata is constructed even in dry-run
mode: extract or reuse the same metadata-generation logic used for real deploys
(the code that produces `metadata` from `distributions`, `remote`, `path`, and
`webdav_url`) and call it regardless of `dry_run`, but only skip the actual
upload side-effect when `dry_run` is true (i.e., call `webdav.upload_to_webdav`
or a new `build_metadata` helper in both cases or pass a dry_run flag to
`webdav.upload_to_webdav`), and remove the premature `return` so the
DataID/metadata preview is printed for dry runs.


api_deploy.deploy_from_metadata(
metadata, version_id, title, abstract, description, license_url, apikey
)
Expand Down Expand Up @@ -254,5 +289,19 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool)
)


@app.command()
@click.argument("shell", type=click.Choice(["bash", "zsh", "fish"]))
def completion(shell):
    """Print the shell completion script for SHELL (bash, zsh, or fish).

    Delegates to click's built-in completion support by re-invoking the
    CLI with the _DATABUSCLIENT_COMPLETE variable set in its environment.
    This avoids the POSIX-only "VAR=value cmd" shell prefix and the
    three duplicated per-shell branches.
    """
    import os
    import subprocess

    # click emits the completion script when this env var is "<shell>_source".
    env = dict(os.environ, _DATABUSCLIENT_COMPLETE=f"{shell}_source")
    subprocess.run(["databusclient"], env=env, check=False)


if __name__ == "__main__":
app()
Loading
Loading