-
Notifications
You must be signed in to change notification settings - Fork 33
Add git-annex support for upload with auto-detection #1797
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 6 commits
2cf6132
672e973
475c29c
296fa86
0658ea8
4eb69e5
fbb8785
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -707,3 +707,88 @@ def test_upload_rejects_dandidownload_nwb_file(new_dandiset: SampleDandiset) -> | |
| match=f"contains {DOWNLOAD_SUFFIX} path which indicates incomplete download", | ||
| ): | ||
| new_dandiset.upload(allow_any_path=True) | ||
|
|
||
|
|
||
| @pytest.mark.ai_generated | ||
| def test_is_git_annex_repo(tmp_path: Path) -> None: | ||
| """Test detection of git-annex repositories.""" | ||
| import subprocess | ||
|
|
||
| from ..upload import _is_git_annex_repo | ||
|
yarikoptic marked this conversation as resolved.
Outdated
|
||
|
|
||
| # Check if git-annex is available | ||
| try: | ||
| subprocess.run( | ||
| ["git", "annex", "version"], | ||
| capture_output=True, | ||
| check=True, | ||
| ) | ||
| except (FileNotFoundError, subprocess.CalledProcessError): | ||
| pytest.skip("git-annex not available") | ||
|
|
||
| # Not a git repo | ||
| assert not _is_git_annex_repo(tmp_path) | ||
|
|
||
| # Create a git repo | ||
| subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True) | ||
| assert not _is_git_annex_repo(tmp_path) | ||
|
|
||
| # Initialize git-annex | ||
| subprocess.run( | ||
| ["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True | ||
| ) | ||
| assert _is_git_annex_repo(tmp_path) | ||
|
|
||
|
|
||
| @pytest.mark.ai_generated | ||
| def test_get_file_annex_backend(tmp_path: Path) -> None: | ||
| """Test getting the git-annex backend for a file.""" | ||
| import subprocess | ||
|
|
||
| from ..upload import _get_file_annex_backend | ||
|
|
||
| # Check if git-annex is available | ||
| try: | ||
| subprocess.run( | ||
| ["git", "annex", "version"], | ||
| capture_output=True, | ||
| check=True, | ||
| ) | ||
| except (FileNotFoundError, subprocess.CalledProcessError): | ||
| pytest.skip("git-annex not available") | ||
|
|
||
| # Create a git-annex repo | ||
| subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True) | ||
| subprocess.run( | ||
| ["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True | ||
| ) | ||
|
|
||
| # Create a test file | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fold the tests following after this line into test_is_git_annex_repo and rename it like test_git_annex_repo_ops
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in fbb8785. Consolidated into single |
||
| test_file = tmp_path / "test.txt" | ||
| test_file.write_text("test content") | ||
|
|
||
| # Not annexed yet | ||
| assert _get_file_annex_backend(test_file) is None | ||
|
|
||
| # Add to annex | ||
| subprocess.run( | ||
| ["git", "annex", "add", "test.txt"], | ||
| cwd=tmp_path, | ||
| check=True, | ||
| capture_output=True, | ||
| ) | ||
|
|
||
| # Should detect backend (default is SHA256E on most systems) | ||
| backend = _get_file_annex_backend(test_file) | ||
| assert backend is not None | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is no test which actually tests then registering of URL within git-annex. We have integration tests running on a dockerized instance, so you could upload such a file and ensure that we get URL registed in annex. No need for a dedicated new test -- bundle testing into this one |
||
|
|
||
|
|
||
| @pytest.mark.ai_generated | ||
| def test_datalad_mode_enum() -> None: | ||
| """Test DataladMode enum values.""" | ||
| from ..upload import DataladMode | ||
|
|
||
| assert DataladMode.YES.value == "yes" | ||
| assert DataladMode.NO.value == "no" | ||
| assert DataladMode.AUTO.value == "auto" | ||
| assert str(DataladMode.YES) == "yes" | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -6,9 +6,11 @@ | |||||
| from enum import Enum | ||||||
| from functools import reduce | ||||||
| import io | ||||||
| import os | ||||||
| import os.path | ||||||
| from pathlib import Path | ||||||
| import re | ||||||
| import subprocess | ||||||
| import time | ||||||
| from time import sleep | ||||||
| from typing import Any, TypedDict, cast | ||||||
|
|
@@ -66,6 +68,119 @@ def _check_dandidownload_paths(dfile: DandiFile) -> None: | |||||
| ) | ||||||
|
|
||||||
|
|
||||||
| def _is_git_annex_repo(path: Path) -> bool: | ||||||
| """ | ||||||
| Check if the given path is within a git-annex repository. | ||||||
|
|
||||||
| Returns True if .git/annex directory exists in the repository root. | ||||||
| """ | ||||||
| try: | ||||||
| # Find git repository root | ||||||
| result = subprocess.run( | ||||||
| ["git", "rev-parse", "--show-toplevel"], | ||||||
| cwd=path if path.is_dir() else path.parent, | ||||||
| capture_output=True, | ||||||
| text=True, | ||||||
| check=False, | ||||||
| ) | ||||||
| if result.returncode == 0: | ||||||
| repo_root = Path(result.stdout.strip()) | ||||||
| annex_dir = repo_root / ".git" / "annex" | ||||||
| return annex_dir.exists() | ||||||
|
yarikoptic marked this conversation as resolved.
Outdated
|
||||||
| except Exception as e: | ||||||
| lgr.debug("Error checking for git-annex repository: %s", e) | ||||||
| return False | ||||||
|
|
||||||
|
|
||||||
| def _get_file_annex_backend(filepath: Path) -> str | None: | ||||||
| """ | ||||||
| Get the git-annex backend for a file if it's annexed. | ||||||
|
|
||||||
| Returns the backend name (e.g., 'SHA256E') or None if file is not annexed. | ||||||
| """ | ||||||
| if not filepath.exists(): | ||||||
| return None | ||||||
|
|
||||||
| try: | ||||||
| # Check if file is a symlink (git-annex files are symlinks) | ||||||
| if not filepath.is_symlink(): | ||||||
| return None | ||||||
|
|
||||||
| # Read the symlink target | ||||||
| target = os.readlink(filepath) | ||||||
|
|
||||||
| # Parse the backend from the annex key format | ||||||
| # Typical format: ../../.git/annex/objects/XX/YY/SHA256E-sNNNNN--HASH/HASH | ||||||
| # Verify this is actually in the annex objects directory | ||||||
| target_path = Path(target) | ||||||
| if "annex" not in target_path.parts or "objects" not in target_path.parts: | ||||||
| return None | ||||||
|
|
||||||
| # Find the part that looks like an annex key (contains backend and size) | ||||||
| for part in target_path.parts: | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just take the filename
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in fbb8785. Now using |
||||||
| if "-s" in part and "--" in part: | ||||||
| # This looks like an annex key: BACKEND-sSIZE--HASH | ||||||
| backend = part.split("-")[0] | ||||||
| return backend | ||||||
| except Exception as e: | ||||||
| lgr.debug("Error checking git-annex backend for %s: %s", filepath, e) | ||||||
|
|
||||||
| return None | ||||||
|
|
||||||
|
|
||||||
| def _register_url_with_annex( | ||||||
| filepath: Path, remote_url: str, expected_size: int | None = None | ||||||
| ) -> bool: | ||||||
| """ | ||||||
| Register a remote URL with git-annex for a file. | ||||||
|
|
||||||
| Args: | ||||||
| filepath: Local file path | ||||||
| remote_url: Remote URL to register | ||||||
| expected_size: Expected file size (for validation) | ||||||
|
|
||||||
| Returns: | ||||||
| True if successful, False otherwise | ||||||
| """ | ||||||
| try: | ||||||
| # Use git-annex addurl with --relaxed to skip size/hash checks | ||||||
| # and --file to specify the target file | ||||||
| cmd = [ | ||||||
| "git", | ||||||
| "annex", | ||||||
| "addurl", | ||||||
| "--relaxed", | ||||||
| "--file", | ||||||
| str(filepath), | ||||||
| remote_url, | ||||||
| ] | ||||||
|
|
||||||
| lgr.debug("Registering URL with git-annex: %s -> %s", filepath, remote_url) | ||||||
|
|
||||||
| result = subprocess.run( | ||||||
| cmd, | ||||||
| cwd=filepath.parent, | ||||||
| capture_output=True, | ||||||
| text=True, | ||||||
| check=False, | ||||||
| ) | ||||||
|
|
||||||
| if result.returncode == 0: | ||||||
| lgr.info("Successfully registered %s with git-annex", filepath) | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in fbb8785. Updated log message to include both URL and filepath. |
||||||
| return True | ||||||
| else: | ||||||
| lgr.warning( | ||||||
| "Failed to register %s with git-annex: %s", | ||||||
|
yarikoptic marked this conversation as resolved.
Outdated
|
||||||
| filepath, | ||||||
| result.stderr.strip(), | ||||||
| ) | ||||||
| return False | ||||||
|
|
||||||
| except Exception as e: | ||||||
| lgr.warning("Error registering URL with git-annex for %s: %s", filepath, e) | ||||||
| return False | ||||||
|
|
||||||
|
|
||||||
| class Uploaded(TypedDict): | ||||||
| size: int | ||||||
| errors: list[str] | ||||||
|
|
@@ -91,6 +206,17 @@ def __str__(self) -> str: | |||||
| return self.value | ||||||
|
|
||||||
|
|
||||||
| class DataladMode(str, Enum): | ||||||
| """Mode for datalad/git-annex integration during upload""" | ||||||
|
|
||||||
| YES = "yes" | ||||||
| NO = "no" | ||||||
| AUTO = "auto" | ||||||
|
|
||||||
| def __str__(self) -> str: | ||||||
| return self.value | ||||||
|
|
||||||
|
|
||||||
| def upload( | ||||||
| paths: Sequence[str | Path] | None = None, | ||||||
| existing: UploadExisting = UploadExisting.REFRESH, | ||||||
|
|
@@ -102,6 +228,7 @@ def upload( | |||||
| jobs: int | None = None, | ||||||
| jobs_per_file: int | None = None, | ||||||
| sync: bool = False, | ||||||
| datalad: DataladMode = DataladMode.NO, | ||||||
| ) -> None: | ||||||
| if paths: | ||||||
| paths = [Path(p).absolute() for p in paths] | ||||||
|
|
@@ -114,6 +241,16 @@ def upload( | |||||
| " paths. Use 'dandi download' or 'organize' first." | ||||||
| ) | ||||||
|
|
||||||
| # Determine if we should use datalad based on mode | ||||||
| use_datalad = False | ||||||
| if datalad == DataladMode.YES: | ||||||
| use_datalad = True | ||||||
| elif datalad == DataladMode.AUTO: | ||||||
| # Auto-detect git-annex repository | ||||||
| use_datalad = _is_git_annex_repo(dandiset.path_obj) | ||||||
| if use_datalad: | ||||||
| lgr.info("Auto-detected git-annex repository; enabling datalad support") | ||||||
|
|
||||||
| with ExitStack() as stack: | ||||||
| # We need to use the client as a context manager in order to ensure the | ||||||
| # session gets properly closed. Otherwise, pytest sometimes complains | ||||||
|
|
@@ -384,9 +521,12 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: | |||||
| # | ||||||
| yield {"status": "uploading"} | ||||||
| validating = False | ||||||
| uploaded_asset: RemoteAsset | None = None | ||||||
| for r in dfile.iter_upload( | ||||||
| remote_dandiset, metadata, jobs=jobs_per_file, replacing=extant | ||||||
| ): | ||||||
| if r["status"] == "done": | ||||||
| uploaded_asset = r.get("asset") | ||||||
| r.pop("asset", None) # to keep pyout from choking | ||||||
| if r["status"] == "uploading": | ||||||
| uploaded_paths[strpath]["size"] = r.pop("current") | ||||||
|
|
@@ -398,6 +538,50 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: | |||||
| validating = True | ||||||
| else: | ||||||
| yield r | ||||||
|
|
||||||
| # Handle datalad integration after successful upload | ||||||
| if use_datalad and uploaded_asset is not None: | ||||||
| try: | ||||||
| # Check backend and warn if not SHA256E | ||||||
| current_backend = _get_file_annex_backend(dfile.filepath) | ||||||
| if current_backend and current_backend != "SHA256E": | ||||||
| lgr.warning( | ||||||
| "%s: File backend is %s, but dandiset standard is SHA256E", | ||||||
| strpath, | ||||||
| current_backend, | ||||||
| ) | ||||||
|
|
||||||
| # Get the remote URL from asset metadata | ||||||
| # The contentUrl field contains an array of URLs where the | ||||||
| # asset can be accessed. We use the first URL, which is | ||||||
| # typically the S3 URL. Additional URLs may include API | ||||||
| # endpoints or alternative storage locations. | ||||||
| asset_metadata = uploaded_asset.get_raw_metadata() | ||||||
| content_urls = asset_metadata.get("contentUrl", []) | ||||||
|
|
||||||
| if content_urls: | ||||||
| # Register the first URL with git-annex | ||||||
| remote_url = content_urls[0] | ||||||
| file_size = asset_metadata.get("contentSize") | ||||||
|
|
||||||
| if _register_url_with_annex( | ||||||
| dfile.filepath, remote_url, file_size | ||||||
| ): | ||||||
| lgr.debug( | ||||||
| "%s: Successfully registered remote URL with git-annex", | ||||||
| strpath, | ||||||
| ) | ||||||
| else: | ||||||
| lgr.debug( | ||||||
| "%s: No contentUrl found in asset metadata; " | ||||||
| "skipping datalad registration", | ||||||
| strpath, | ||||||
| ) | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it might take time to get URL registered in backend. So we would need to loop for e.g. up to 20 seconds, reasking each 2 and then failing if none is provided after 20 seconds, and exiting the loop if we successfully registered the URL. note that there could be multiple URLs, review code in https://github.com/dandi/backups2datalad on dealing with public vs private / embargoed dandisets. For embargoed we need to ensure enabling datalad special remote and associate api based URLs to facilitate authentication. If public -- then public S3 urls.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in fbb8785. Added polling loop (20s timeout, 2s intervals) for contentUrl. Implemented S3 URL preference for public dandisets. Note: embargoed dandisets with datalad special remote authentication would need additional work - for now this handles the public case which is the primary use case. |
||||||
| except Exception as e: | ||||||
| lgr.warning( | ||||||
| "%s: Error during datalad integration: %s", strpath, e | ||||||
| ) | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove swallow + warn -- let it raise. If doesn't work, user should disable use of git-annex. We might want to hint to use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in fbb8785. Removed try/except wrapper - errors now raise with helpful hint message to use |
||||||
|
|
||||||
| yield {"status": "done"} | ||||||
|
|
||||||
| except Exception as exc: | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
seems we are not doing anything "datalad" specific and operate directly with git-annex, let's call it
--git-annex(and GitAnnexMode) and do 'auto' by default.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done in fbb8785. Renamed to
--git-annexwithGitAnnexModeenum and changed default toauto.