Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 49 additions & 12 deletions skyllh/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def __init__(
username=None,
password=None,
post_transfer_func=None,
url=None,
**kwargs,
):
"""Creates a new instance to define the origin of a dataset.
Expand All @@ -95,7 +96,8 @@ def __init__(
connect to the remote host.
filename : str | None
If the origin is not a directory but a file, this specifies the
filename.
filename. When ``url`` is set this becomes the local filename used
when saving the downloaded file.
host : str | None
The name or IP of the remote host.
port : int | None
Expand All @@ -115,6 +117,12 @@ def __init__(

where ``ds`` is an instance of ``Dataset``, and ``dst_path`` is the
destination path.
url : str | None
An optional complete download URL (e.g. an API endpoint with query
parameters). When set, the ``transfer_func`` should use this URL
directly instead of constructing one from ``host``, ``base_path``,
and ``sub_path``. The ``filename`` field then serves as the local
filename for the downloaded file.
"""
super().__init__(**kwargs)

Expand All @@ -127,6 +135,7 @@ def __init__(
self.username = username
self.password = password
self.post_transfer_func = post_transfer_func
self.url = url

@property
def base_path(self):
Expand Down Expand Up @@ -259,6 +268,22 @@ def post_transfer_func(self, obj):
)
self._post_transfer_func = obj

@property
def url(self):
    """Return the complete download URL of this origin, or ``None``.

    When a URL is set, the transfer function is expected to download from
    it as-is rather than assembling an address out of ``host``,
    ``base_path``, and ``sub_path``.
    """
    full_url = self._url
    return full_url

@url.setter
def url(self, obj):
    # Only ``None`` (no explicit URL) or a string is accepted; anything
    # else is rejected before the stored value is touched.
    if obj is None or isinstance(obj, str):
        self._url = obj
        return
    raise TypeError(
        f'The property url must be None, or an instance of str! Its current type is {classname(obj)}!'
    )

def __str__(self):
"""Pretty string representation of this class."""
transfer_cls = get_class_of_func(self.transfer_func)
Expand Down Expand Up @@ -457,7 +482,7 @@ def post_transfer_unzip(

# Unzip the dataset file.
zip_file = os.path.join(dst_path, ds.origin.filename)
cmd = f'unzip "{zip_file}" -d "{dst_path}"'
cmd = f'unzip -q -o "{zip_file}" -d "{dst_path}"'
DatasetTransfer.execute_system_command(cmd, logger)

# Remove the zip file.
Expand Down Expand Up @@ -634,8 +659,7 @@ def transfer(
The password for the user name required to connect to the remote
host.
"""
cls = get_class_of_func(self.transfer)
logger = get_logger(f'{classname(cls)}.transfer')
logger = get_logger(module_class_method_name(self, 'transfer'))

host = origin.host
port = origin.port
Expand All @@ -655,7 +679,7 @@ def transfer(
dst_path = os.path.join(dst_base_path, dst_sub_path)
DatasetTransfer.ensure_dst_path(dst_path)

cmd = 'wget '
cmd = 'wget -q '
if username is None:
# No user name is specified.
pass
Expand All @@ -665,16 +689,25 @@ def transfer(
else:
# Only a user name is specified.
cmd += f'--user={username} '
cmd += f'{self.protocol}://{host}'
if port is not None:
cmd += f':{port}'
if path[0:1] != '/':
cmd += '/'
cmd += f'{path} -P {dst_path}'
if origin.url is not None:
# Use the full URL directly; save with an explicit output path
# so the local filename matches origin.filename regardless of
# what the server sends in the URL path or Content-Disposition.
output_file = os.path.join(dst_path, os.path.basename(file))
cmd += f'"{origin.url}" -O "{output_file}"'
else:
cmd += f'{self.protocol}://{host}'
if port is not None:
cmd += f':{port}'
if path[0:1] != '/':
cmd += '/'
cmd += f'{path} -P {dst_path}'
logger.info('Downloading "%s" into "%s".', file, dst_path)
try:
DatasetTransfer.execute_system_command(cmd, logger)
except SystemCommandError as err:
raise DatasetTransferError(str(err)) from err
logger.info('Download of "%s" completed.', file)


class Dataset(
Expand Down Expand Up @@ -1438,7 +1471,11 @@ def make_data_available(
)

if self.origin.post_transfer_func is not None:
self.origin.post_transfer_func(ds=self, dst_path=base_path)
if not self.origin.is_directory and self.origin.sub_path:
post_dst_path = os.path.join(base_path, self.origin.sub_path)
else:
post_dst_path = base_path
self.origin.post_transfer_func(ds=self, dst_path=post_dst_path)

return True

Expand Down
17 changes: 9 additions & 8 deletions skyllh/datasets/i3/PublicData_10y_ps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from skyllh.core.dataset import (
DatasetCollection,
DatasetOrigin,
DatasetTransfer,
WGETDatasetTransfer,
)
from skyllh.i3.dataset import (
Expand Down Expand Up @@ -256,12 +257,12 @@ def create_dataset_collection(

# Define the origin of the dataset.
origin = DatasetOrigin(
base_path='data-releases',
sub_path='',
filename='20210126_PS-IC40-IC86_VII.zip',
host='icecube.wisc.edu',
transfer_func=WGETDatasetTransfer(protocol='http').transfer,
post_transfer_func=WGETDatasetTransfer.post_transfer_unzip,
url='https://dataverse.harvard.edu/api/access/dataset/:persistentId/versions/1.0?persistentId=doi:10.7910/DVN/VKL316',
base_path='',
sub_path='icecube_10year_ps',
filename='tmp.zip',
transfer_func=WGETDatasetTransfer(protocol='https').transfer,
post_transfer_func=DatasetTransfer.post_transfer_unzip,
)

# Define the common keyword arguments for all data sets.
Expand Down Expand Up @@ -348,7 +349,7 @@ def create_dataset_collection(
# ---------- IC79 ----------------------------------------------------------
IC79 = I3Dataset(
name='IC79',
exp_pathfilenames='events/IC79_exp.csv',
exp_pathfilenames='events/IC79_exp-1.csv',
mc_pathfilenames=None,
grl_pathfilenames='uptime/IC79_exp.csv',
**ds_kwargs,
Expand Down Expand Up @@ -502,7 +503,7 @@ def create_dataset_collection(
# ---------- IC86-VII ------------------------------------------------------
IC86_VII = I3Dataset(
name='IC86_VII',
exp_pathfilenames='events/IC86_VII_exp.csv',
exp_pathfilenames='events/IC86_VII_exp-1.csv',
mc_pathfilenames=None,
grl_pathfilenames='uptime/IC86_VII_exp.csv',
**ds_kwargs,
Expand Down