Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions dandi/dandiapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1240,16 +1240,29 @@ def get_asset_by_path(self, path: str) -> RemoteAsset:
paths (e.g., ``../``) within it.
"""
path = self._normalize_path(path)
try:
# Weed out any assets that happen to have the given path as a
# proper prefix:
(asset,) = (
a for a in self.get_assets_with_path_prefix(path) if a.path == path
)
except ValueError:
raise NotFoundError(f"No asset at path {path!r}")
else:
return asset
# Weed out any assets that happen to have the given path as a
# proper prefix:
assets = [a for a in self.get_assets_with_path_prefix(path) if a.path == path]
if assets:
if len(assets) > 1:
lgr.warning(
"Multiple assets found at path %r; returning the first one",
path,
)
asset = assets[0]
elif not assets:
zarr_suf = ".zarr/"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about .ngff/?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, should also be supported

if (zarr_suf in path) and (zarr_suffix_idx := path.index(zarr_suf)):
# If path ends with .zarr/, we might have a zarr asset without
# a trailing slash in the path. Try again:
zarr_path_len = zarr_suffix_idx + len(zarr_suf)
zarr_path = path[: zarr_path_len - 1] # -1 for trailing /
subpath = path[len(zarr_path) :]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have several issues with this:

  • This code accepts paths containing a component that's just the Zarr suffix, e.g., foo/.zarr/bar.
  • The behavior differs from dandidav, which tests each .zarr/.ngff-suffixed leading portion for Zarr-ness.
  • Instead of futzing around with strings, I think it'd better to convert path to a pathlib.PurePosixPath and check path.parts to see if any part ends with .zarr (or .ngff, and also that the suffix is not the whole of the part).

Copy link
Copy Markdown
Member Author

@yarikoptic yarikoptic Mar 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, I agree that dealing with Path instance might be more robust

asset = self.get_asset_by_path(zarr_path)
asset.subpath = subpath
else:
raise NotFoundError(f"No asset at path {path!r}")
return asset
Comment thread Fixed

def download_directory(
self,
Expand Down Expand Up @@ -1378,6 +1391,8 @@ class BaseRemoteAsset(ABC, APIBase):
identifier: str = Field(alias="asset_id")
#: The asset's (forward-slash-separated) path
path: str
#: The path within the asset, as e.g. path in a zarr/
subpath: str | None = Field(default=None)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like the idea of a BaseRemoteAsset object pointing to a resource inside an asset; the class is supposed to be for assets only. It'd be better to introduce a separate class for Zarr prefixes (and possibly also add a variant of get_asset_by_path() for returning this separate class).

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I also disliked it a little but it is already quite a hierarchy of classes there leading to nearly combinatorial explosion. If you see a sensible way, e.g. through adding a single extra class - please go ahead.

Comment thread
yarikoptic marked this conversation as resolved.
Outdated
#: The size of the asset in bytes
size: int
#: The date at which the asset was created
Expand Down
14 changes: 12 additions & 2 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ class Downloader:
url: ParsedDandiURL
output_dir: InitVar[str | Path]
output_prefix: Path = field(init=False)
#: just a convenience combination of output_dir and output_prefix
output_path: Path = field(init=False)
existing: DownloadExisting
get_metadata: bool
Expand Down Expand Up @@ -333,6 +334,12 @@ def download_generator(self) -> Iterator[dict]:
asset.path,
)
mtime = asset.modified
if asset.subpath:
lgr.warning(
"No downloading of subpaths within blobs yet. Got %s for %s",
asset.subpath,
asset.path,
)
_download_generator = _download_file(
asset.get_download_file_iter(),
download_path,
Expand All @@ -352,7 +359,8 @@ def download_generator(self) -> Iterator[dict]:
), f"Asset {asset.path} is neither blob nor Zarr"
_download_generator = _download_zarr(
asset,
download_path,
prefix=asset.subpath,
download_path=download_path,
toplevel_path=self.output_path,
existing=self.existing,
jobs=self.jobs_per_zarr,
Expand Down Expand Up @@ -812,6 +820,7 @@ def _download_file(
lgr.warning("downloader logic: We should not be here!")

final_digest = None

if downloaded_digest and not resuming:
assert downloaded_digest is not None
final_digest = downloaded_digest.hexdigest() # we care only about hex
Expand Down Expand Up @@ -977,6 +986,7 @@ def _download_zarr(
toplevel_path: str | Path,
existing: DownloadExisting,
lock: Lock,
prefix: str | None = None,
jobs: int | None = None,
) -> Iterator[dict]:
# Avoid heavy import by importing within function:
Expand All @@ -993,7 +1003,7 @@ def digest_callback(path: str, algoname: str, d: str) -> None:
digests[path] = d

def downloads_gen():
for entry in asset.iterfiles():
for entry in asset.iterfiles(prefix=prefix):
entries.append(entry)
etag = entry.digest
assert etag.algorithm is DigestType.md5
Expand Down