diff --git a/skyllh/core/dataset.py b/skyllh/core/dataset.py index fe950eef07..9d5add0fac 100644 --- a/skyllh/core/dataset.py +++ b/skyllh/core/dataset.py @@ -71,6 +71,7 @@ def __init__( username=None, password=None, post_transfer_func=None, + url=None, **kwargs, ): """Creates a new instance to define the origin of a dataset. @@ -95,7 +96,8 @@ def __init__( connect to the remote host. filename : str | None If the origin is not a directory but a file, this specifies the - filename. + filename. When ``url`` is set this becomes the local filename used + when saving the downloaded file. host : str | None The name or IP of the remote host. port : int | None @@ -115,6 +117,12 @@ def __init__( where ``ds`` is an instance of ``Dataset``, and ``dst_path`` is the destination path. + url : str | None + An optional complete download URL (e.g. an API endpoint with query + parameters). When set, the ``transfer_func`` should use this URL + directly instead of constructing one from ``host``, ``base_path``, + and ``sub_path``. The ``filename`` field then serves as the local + filename for the downloaded file. """ super().__init__(**kwargs) @@ -127,6 +135,7 @@ def __init__( self.username = username self.password = password self.post_transfer_func = post_transfer_func + self.url = url @property def base_path(self): @@ -259,6 +268,22 @@ def post_transfer_func(self, obj): ) self._post_transfer_func = obj + @property + def url(self): + """The optional complete download URL. When set, the transfer function + should use this URL directly instead of constructing one from ``host``, + ``base_path``, and ``sub_path``. + """ + return self._url + + @url.setter + def url(self, obj): + if obj is not None and not isinstance(obj, str): + raise TypeError( + f'The property url must be None, or an instance of str! Its current type is {classname(obj)}!' + ) + self._url = obj + def __str__(self): """Pretty string representation of this class.""" transfer_cls = get_class_of_func(self.transfer_func) @@ -457,7 +482,7 @@ def post_transfer_unzip( # Unzip the dataset file. zip_file = os.path.join(dst_path, ds.origin.filename) - cmd = f'unzip "{zip_file}" -d "{dst_path}"' + cmd = f'unzip -q -o "{zip_file}" -d "{dst_path}"' DatasetTransfer.execute_system_command(cmd, logger) # Remove the zip file. @@ -634,8 +659,7 @@ def transfer( The password for the user name required to connect to the remote host. """ - cls = get_class_of_func(self.transfer) - logger = get_logger(f'{classname(cls)}.transfer') + logger = get_logger(module_class_method_name(self, 'transfer')) host = origin.host port = origin.port @@ -655,7 +679,7 @@ def transfer( dst_path = os.path.join(dst_base_path, dst_sub_path) DatasetTransfer.ensure_dst_path(dst_path) - cmd = 'wget ' + cmd = 'wget -q ' if username is None: # No user name is specified. pass @@ -665,16 +689,25 @@ def transfer( else: # Only a user name is specified. cmd += f'--user={username} ' - cmd += f'{self.protocol}://{host}' - if port is not None: - cmd += f':{port}' - if path[0:1] != '/': - cmd += '/' - cmd += f'{path} -P {dst_path}' + if origin.url is not None: + # Use the full URL directly; save with an explicit output path + # so the local filename matches origin.filename regardless of + # what the server sends in the URL path or Content-Disposition. + output_file = os.path.join(dst_path, os.path.basename(file)) + cmd += f'"{origin.url}" -O "{output_file}"' + else: + cmd += f'{self.protocol}://{host}' + if port is not None: + cmd += f':{port}' + if path[0:1] != '/': + cmd += '/' + cmd += f'{path} -P {dst_path}' + logger.info('Downloading "%s" into "%s".', file, dst_path) try: DatasetTransfer.execute_system_command(cmd, logger) except SystemCommandError as err: raise DatasetTransferError(str(err)) from err + logger.info('Download of "%s" completed.', file) class Dataset( @@ -1438,7 +1471,11 @@ def make_data_available( ) if self.origin.post_transfer_func is not None: - self.origin.post_transfer_func(ds=self, dst_path=base_path) + if not self.origin.is_directory and self.origin.sub_path: + post_dst_path = os.path.join(base_path, self.origin.sub_path) + else: + post_dst_path = base_path + self.origin.post_transfer_func(ds=self, dst_path=post_dst_path) return True diff --git a/skyllh/datasets/i3/PublicData_10y_ps.py b/skyllh/datasets/i3/PublicData_10y_ps.py index e1b635839f..7c3a479774 100644 --- a/skyllh/datasets/i3/PublicData_10y_ps.py +++ b/skyllh/datasets/i3/PublicData_10y_ps.py @@ -3,6 +3,7 @@ from skyllh.core.dataset import ( DatasetCollection, DatasetOrigin, + DatasetTransfer, WGETDatasetTransfer, ) from skyllh.i3.dataset import ( @@ -256,12 +257,12 @@ def create_dataset_collection( # Define the origin of the dataset. origin = DatasetOrigin( - base_path='data-releases', - sub_path='', - filename='20210126_PS-IC40-IC86_VII.zip', - host='icecube.wisc.edu', - transfer_func=WGETDatasetTransfer(protocol='http').transfer, - post_transfer_func=WGETDatasetTransfer.post_transfer_unzip, + url='https://dataverse.harvard.edu/api/access/dataset/:persistentId/versions/1.0?persistentId=doi:10.7910/DVN/VKL316', + base_path='', + sub_path='icecube_10year_ps', + filename='tmp.zip', + transfer_func=WGETDatasetTransfer(protocol='https').transfer, + post_transfer_func=DatasetTransfer.post_transfer_unzip, ) # Define the common keyword arguments for all data sets. @@ -348,7 +349,7 @@ def create_dataset_collection( # ---------- IC79 ---------------------------------------------------------- IC79 = I3Dataset( name='IC79', - exp_pathfilenames='events/IC79_exp.csv', + exp_pathfilenames='events/IC79_exp-1.csv', mc_pathfilenames=None, grl_pathfilenames='uptime/IC79_exp.csv', **ds_kwargs, @@ -502,7 +503,7 @@ def create_dataset_collection( # ---------- IC86-VII ------------------------------------------------------ IC86_VII = I3Dataset( name='IC86_VII', - exp_pathfilenames='events/IC86_VII_exp.csv', + exp_pathfilenames='events/IC86_VII_exp-1.csv', mc_pathfilenames=None, grl_pathfilenames='uptime/IC86_VII_exp.csv', **ds_kwargs,