Skip to content
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion dandi/cli/cmd_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
instance_option,
map_to_click_exceptions,
)
from ..upload import UploadExisting, UploadValidation
from ..upload import DataladMode, UploadExisting, UploadValidation


@click.command()
Expand Down Expand Up @@ -48,6 +48,18 @@
default="require",
show_default=True,
)
@click.option(
"--datalad",
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems we are not doing anything "datalad" specific and operate directly with git-annex, let's call it --git-annex (and GitAnnexMode) and do 'auto' by default.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in fbb8785. Renamed to --git-annex with GitAnnexMode enum and changed default to auto.

type=click.Choice(tuple(e.value for e in DataladMode)),
default="no",
show_default=True,
help=(
"Enable datalad/git-annex support to link local uploaded files with remote URLs. "
"'yes' - always use datalad; "
"'no' - never use datalad (default); "
"'auto' - auto-detect git-annex repository and use datalad if found."
),
)
@click.argument("paths", nargs=-1) # , type=click.Path(exists=True, dir_okay=False))
# &
# Development options: Set DANDI_DEVEL for them to become available
Expand Down Expand Up @@ -75,6 +87,7 @@ def upload(
dandi_instance: str,
existing: UploadExisting,
validation: UploadValidation,
datalad: DataladMode,
# Development options should come as kwargs
allow_any_path: bool = False,
upload_dandiset_metadata: bool = False,
Expand Down Expand Up @@ -115,4 +128,5 @@ def upload(
jobs=jobs,
jobs_per_file=jobs_per_file,
sync=sync,
datalad=datalad,
)
85 changes: 85 additions & 0 deletions dandi/tests/test_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -707,3 +707,88 @@ def test_upload_rejects_dandidownload_nwb_file(new_dandiset: SampleDandiset) ->
match=f"contains {DOWNLOAD_SUFFIX} path which indicates incomplete download",
):
new_dandiset.upload(allow_any_path=True)


@pytest.mark.ai_generated
def test_is_git_annex_repo(tmp_path: Path) -> None:
"""Test detection of git-annex repositories."""
import subprocess

from ..upload import _is_git_annex_repo
Comment thread
yarikoptic marked this conversation as resolved.
Outdated

# Check if git-annex is available
try:
subprocess.run(
["git", "annex", "version"],
capture_output=True,
check=True,
)
except (FileNotFoundError, subprocess.CalledProcessError):
pytest.skip("git-annex not available")

# Not a git repo
assert not _is_git_annex_repo(tmp_path)

# Create a git repo
subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True)
assert not _is_git_annex_repo(tmp_path)

# Initialize git-annex
subprocess.run(
["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True
)
assert _is_git_annex_repo(tmp_path)


@pytest.mark.ai_generated
def test_get_file_annex_backend(tmp_path: Path) -> None:
"""Test getting the git-annex backend for a file."""
import subprocess

from ..upload import _get_file_annex_backend

# Check if git-annex is available
try:
subprocess.run(
["git", "annex", "version"],
capture_output=True,
check=True,
)
except (FileNotFoundError, subprocess.CalledProcessError):
pytest.skip("git-annex not available")

# Create a git-annex repo
subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True)
subprocess.run(
["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True
)

# Create a test file
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fold the tests following after this line into test_is_git_annex_repo and rename it like test_git_annex_repo_ops

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in fbb8785. Consolidated into single test_git_annex_repo_ops that tests enum, detection, and backend operations.

test_file = tmp_path / "test.txt"
test_file.write_text("test content")

# Not annexed yet
assert _get_file_annex_backend(test_file) is None

# Add to annex
subprocess.run(
["git", "annex", "add", "test.txt"],
cwd=tmp_path,
check=True,
capture_output=True,
)

# Should detect backend (default is SHA256E on most systems)
backend = _get_file_annex_backend(test_file)
assert backend is not None
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is no test which actually tests then registering of URL within git-annex. We have integration tests running on a dockerized instance, so you could upload such a file and ensure that we get URL registed in annex. No need for a dedicated new test -- bundle testing into this one



@pytest.mark.ai_generated
def test_datalad_mode_enum() -> None:
"""Test DataladMode enum values."""
from ..upload import DataladMode

assert DataladMode.YES.value == "yes"
assert DataladMode.NO.value == "no"
assert DataladMode.AUTO.value == "auto"
assert str(DataladMode.YES) == "yes"
184 changes: 184 additions & 0 deletions dandi/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@
from enum import Enum
from functools import reduce
import io
import os
import os.path
from pathlib import Path
import re
import subprocess
import time
from time import sleep
from typing import Any, TypedDict, cast
Expand Down Expand Up @@ -66,6 +68,119 @@ def _check_dandidownload_paths(dfile: DandiFile) -> None:
)


def _is_git_annex_repo(path: Path) -> bool:
"""
Check if the given path is within a git-annex repository.

Returns True if .git/annex directory exists in the repository root.
"""
try:
# Find git repository root
result = subprocess.run(
["git", "rev-parse", "--show-toplevel"],
cwd=path if path.is_dir() else path.parent,
capture_output=True,
text=True,
check=False,
)
if result.returncode == 0:
repo_root = Path(result.stdout.strip())
annex_dir = repo_root / ".git" / "annex"
return annex_dir.exists()
Comment thread
yarikoptic marked this conversation as resolved.
Outdated
except Exception as e:
lgr.debug("Error checking for git-annex repository: %s", e)
return False


def _get_file_annex_backend(filepath: Path) -> str | None:
"""
Get the git-annex backend for a file if it's annexed.

Returns the backend name (e.g., 'SHA256E') or None if file is not annexed.
"""
if not filepath.exists():
return None

try:
# Check if file is a symlink (git-annex files are symlinks)
if not filepath.is_symlink():
return None

# Read the symlink target
target = os.readlink(filepath)

# Parse the backend from the annex key format
# Typical format: ../../.git/annex/objects/XX/YY/SHA256E-sNNNNN--HASH/HASH
# Verify this is actually in the annex objects directory
target_path = Path(target)
if "annex" not in target_path.parts or "objects" not in target_path.parts:
return None

# Find the part that looks like an annex key (contains backend and size)
for part in target_path.parts:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just take the filename

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in fbb8785. Now using Path(target).name to extract just the filename.

if "-s" in part and "--" in part:
# This looks like an annex key: BACKEND-sSIZE--HASH
backend = part.split("-")[0]
return backend
except Exception as e:
lgr.debug("Error checking git-annex backend for %s: %s", filepath, e)

return None


def _register_url_with_annex(
filepath: Path, remote_url: str, expected_size: int | None = None
) -> bool:
"""
Register a remote URL with git-annex for a file.

Args:
filepath: Local file path
remote_url: Remote URL to register
expected_size: Expected file size (for validation)

Returns:
True if successful, False otherwise
"""
try:
# Use git-annex addurl with --relaxed to skip size/hash checks
# and --file to specify the target file
cmd = [
"git",
"annex",
"addurl",
"--relaxed",
"--file",
str(filepath),
remote_url,
]

lgr.debug("Registering URL with git-annex: %s -> %s", filepath, remote_url)

result = subprocess.run(
cmd,
cwd=filepath.parent,
capture_output=True,
text=True,
check=False,
)

if result.returncode == 0:
lgr.info("Successfully registered %s with git-annex", filepath)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
lgr.info("Successfully registered %s with git-annex", filepath)
lgr.info("Successfully registered url %s to %s with git-annex", remote_url, filepath)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in fbb8785. Updated log message to include both URL and filepath.

return True
else:
lgr.warning(
"Failed to register %s with git-annex: %s",
Comment thread
yarikoptic marked this conversation as resolved.
Outdated
filepath,
result.stderr.strip(),
)
return False

except Exception as e:
lgr.warning("Error registering URL with git-annex for %s: %s", filepath, e)
return False


class Uploaded(TypedDict):
size: int
errors: list[str]
Expand All @@ -91,6 +206,17 @@ def __str__(self) -> str:
return self.value


class DataladMode(str, Enum):
"""Mode for datalad/git-annex integration during upload"""

YES = "yes"
NO = "no"
AUTO = "auto"

def __str__(self) -> str:
return self.value


def upload(
paths: Sequence[str | Path] | None = None,
existing: UploadExisting = UploadExisting.REFRESH,
Expand All @@ -102,6 +228,7 @@ def upload(
jobs: int | None = None,
jobs_per_file: int | None = None,
sync: bool = False,
datalad: DataladMode = DataladMode.NO,
) -> None:
if paths:
paths = [Path(p).absolute() for p in paths]
Expand All @@ -114,6 +241,16 @@ def upload(
" paths. Use 'dandi download' or 'organize' first."
)

# Determine if we should use datalad based on mode
use_datalad = False
if datalad == DataladMode.YES:
use_datalad = True
elif datalad == DataladMode.AUTO:
# Auto-detect git-annex repository
use_datalad = _is_git_annex_repo(dandiset.path_obj)
if use_datalad:
lgr.info("Auto-detected git-annex repository; enabling datalad support")

with ExitStack() as stack:
# We need to use the client as a context manager in order to ensure the
# session gets properly closed. Otherwise, pytest sometimes complains
Expand Down Expand Up @@ -384,9 +521,12 @@ def process_path(dfile: DandiFile) -> Iterator[dict]:
#
yield {"status": "uploading"}
validating = False
uploaded_asset: RemoteAsset | None = None
for r in dfile.iter_upload(
remote_dandiset, metadata, jobs=jobs_per_file, replacing=extant
):
if r["status"] == "done":
uploaded_asset = r.get("asset")
r.pop("asset", None) # to keep pyout from choking
if r["status"] == "uploading":
uploaded_paths[strpath]["size"] = r.pop("current")
Expand All @@ -398,6 +538,50 @@ def process_path(dfile: DandiFile) -> Iterator[dict]:
validating = True
else:
yield r

# Handle datalad integration after successful upload
if use_datalad and uploaded_asset is not None:
try:
# Check backend and warn if not SHA256E
current_backend = _get_file_annex_backend(dfile.filepath)
if current_backend and current_backend != "SHA256E":
lgr.warning(
"%s: File backend is %s, but dandiset standard is SHA256E",
strpath,
current_backend,
)

# Get the remote URL from asset metadata
# The contentUrl field contains an array of URLs where the
# asset can be accessed. We use the first URL, which is
# typically the S3 URL. Additional URLs may include API
# endpoints or alternative storage locations.
asset_metadata = uploaded_asset.get_raw_metadata()
content_urls = asset_metadata.get("contentUrl", [])

if content_urls:
# Register the first URL with git-annex
remote_url = content_urls[0]
file_size = asset_metadata.get("contentSize")

if _register_url_with_annex(
dfile.filepath, remote_url, file_size
):
lgr.debug(
"%s: Successfully registered remote URL with git-annex",
strpath,
)
else:
lgr.debug(
"%s: No contentUrl found in asset metadata; "
"skipping datalad registration",
strpath,
)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it might take time to get URL registered in backend. So we would need to loop for e.g. up to 20 seconds, reasking each 2 and then failing if none is provided after 20 seconds, and exiting the loop if we successfully registered the URL.

note that there could be multiple URLs, review code in https://github.com/dandi/backups2datalad on dealing with public vs private / embargoed dandisets. For embargoed we need to ensure enabling datalad special remote and associate api based URLs to facilitate authentication. If public -- then public S3 urls.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in fbb8785. Added polling loop (20s timeout, 2s intervals) for contentUrl. Implemented S3 URL preference for public dandisets. Note: embargoed dandisets with datalad special remote authentication would need additional work - for now this handles the public case which is the primary use case.

except Exception as e:
lgr.warning(
"%s: Error during datalad integration: %s", strpath, e
)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove swallow + warn -- let it raise. If doesn't work, user should disable use of git-annex. We might want to hint to use --git-annex no or whatever that option would be.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in fbb8785. Removed try/except wrapper - errors now raise with helpful hint message to use --git-annex no.


yield {"status": "done"}

except Exception as exc:
Expand Down
2 changes: 2 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ commands = sphinx-build -E -W -b html source build

[pytest]
addopts = --tb=short --durations=10 --timeout=300
markers =
ai_generated: marks tests as AI-generated (deselect with '-m "not ai_generated"')
Comment thread
yarikoptic marked this conversation as resolved.
Outdated
filterwarnings =
error
ignore:No cached namespaces found .*:UserWarning
Expand Down
Loading