Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ New Features
- :func:`selectors.has_nulls` now takes a ``proportion`` parameter, which allows
selecting columns that have a fraction of null values above the given threshold.
:pr:`1881` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`.
- Added a new dataset, :func:`fetch_electricity_usage`, which contains the total
electricity load in France and weather data for several French cities.
:pr:`2013` by :user:`Lisa McBride <lisaleemcb>`.


Changes
Expand Down
1 change: 1 addition & 0 deletions doc/api_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@
"datasets.fetch_country_happiness",
"datasets.fetch_credit_fraud",
"datasets.fetch_drug_directory",
"datasets.fetch_electricity_usage",
Comment thread
lisaleemcb marked this conversation as resolved.
Outdated
Comment thread
lisaleemcb marked this conversation as resolved.
Outdated
"datasets.fetch_employee_salaries",
"datasets.fetch_flight_delays",
"datasets.fetch_medical_charge",
Expand Down
2 changes: 2 additions & 0 deletions skrub/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
fetch_country_happiness,
fetch_credit_fraud,
fetch_drug_directory,
fetch_electricity_usage,
fetch_employee_salaries,
fetch_flight_delays,
fetch_medical_charge,
Expand All @@ -23,6 +24,7 @@
"fetch_country_happiness",
"fetch_credit_fraud",
"fetch_drug_directory",
"fetch_electricity_usage",
"fetch_employee_salaries",
"fetch_flight_delays",
"fetch_medical_charge",
Expand Down
43 changes: 42 additions & 1 deletion skrub/datasets/_fetching.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from pathlib import Path

from ._utils import load_dataset_files, load_simple_dataset
from ._utils import download_dataset, load_dataset_files, load_simple_dataset


def fetch_employee_salaries(data_home=None, split="all"):
Expand Down Expand Up @@ -606,3 +606,44 @@ def fetch_california_housing(data_home=None):
The path to the california housing CSV file.
"""
return load_simple_dataset("california_housing", data_home)


def fetch_electricity_usage(data_home=None):
    """Fetches the electricity usage dataset (forecasting), available at \
    https://github.com/skrub-data/skrub-data-files

    Description of the dataset:
        This dataset was generated from data obtained from the
        ENTSOE Open Data portal under the open source license (CC-BY 4.0):
        https://transparencyplatform.zendesk.com/hc/article_attachments/40921869376401

        and the Open Meteo Historical Weather API:
        https://open-meteo.com/en/docs/historical-forecast-api
        in accordance with the licence described:
        https://open-meteo.com/en/licence

        This is a time-series forecasting use case. This dataset gives the total
        electricity load in MW in France, covering a time range from
        March 23, 2021 to May 31, 2025. In addition, the dataset contains
        weather data for several cities within France.

        It can be downloaded/loaded using the
        skrub.datasets.fetch_electricity_usage function.
        Size on disk: 26MB.

    Unlike the fetchers that return a ``Bunch``, this function only downloads
    and extracts the archive and returns the location of the extracted files;
    no file is loaded into memory.

    Parameters
    ----------
    data_home : str or path, default=None
        The directory where to download and unzip the files.

    Returns
    -------
    pathlib.Path
        The path to the directory that contains the electricity usage CSV
        files. The concrete class (``PosixPath`` or ``WindowsPath``) depends
        on the platform.

    References
    ----------
    .. [1] For more detailed instructions on how to use this dataset, please refer
        to the example here: `EuroSciPy2025 <https://github.com/skrub-data/EuroSciPy2025>`_
    """
    return download_dataset("electricity_usage", data_home=data_home)
13 changes: 12 additions & 1 deletion skrub/datasets/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@
],
"sha256": "0c3885894baf02fc787109801ec2c34cc25cd4a31e0066a16941b74157474887",
},
"electricity_usage": {
"urls": [
"https://github.com/skrub-data/skrub-data-files/raw/refs/heads/main/electricity_usage.zip",
"https://osf.io/download/d8ykq",
],
"sha256": "d929d73ce79e0e07a200941a476bde253ecb8fcd5d08ab276c0b5458dbf3bc7f",
},
"employee_salaries": {
"urls": [
"https://github.com/skrub-data/skrub-data-files/raw/refs/heads/main/employee_salaries.zip",
Expand Down Expand Up @@ -245,7 +252,7 @@ def load_simple_dataset(dataset_name, data_home=None):
return bunch


def load_dataset_files(dataset_name, data_home):
def download_dataset(dataset_name, data_home):
data_home = get_data_home(data_home)
dataset_dir = data_home / dataset_name
datafiles_dir = dataset_dir / dataset_name
Expand All @@ -270,7 +277,11 @@ def load_dataset_files(dataset_name, data_home):

if not datafiles_dir.exists():
_extract_archive(dataset_dir, archive_path)
return datafiles_dir


def load_dataset_files(dataset_name, data_home):
datafiles_dir = download_dataset(dataset_name, data_home)
bunch = Bunch()

# If there is a file named <dataset_name>.csv, we load the path as the main
Expand Down
49 changes: 49 additions & 0 deletions skrub/datasets/tests/test_fetching.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from tempfile import TemporaryDirectory

import pandas as pd
Expand Down Expand Up @@ -174,3 +175,51 @@ def _error_on_get(*args, **kwargs):
with TemporaryDirectory() as temp_dir:
with pytest.raises(OSError, match="Can't download"):
_ = _fetching.fetch_employee_salaries(data_home=temp_dir)


@xfail_with_download_error
@pytest.mark.parametrize(
    "dataset_name, dataset_path",
    [
        ("electricity_usage", ("electricity_usage", "electricity_usage")),
    ],
)
def test_dataset_paths(dataset_name, dataset_path):
    """Check the location returned by fetchers that return a path.

    The last two components of the returned path must match the expected
    ``dataset_path`` tuple.
    """
    # Resolve the fetcher by name so that new datasets only need a new
    # parametrize entry.
    fetcher = getattr(_fetching, f"fetch_{dataset_name}")
    fetched_path = fetcher()
    trailing_parts = fetched_path.parts[-2:]
    assert trailing_parts == dataset_path


@xfail_with_download_error
@pytest.mark.parametrize(
    "dataset_name, files",
    [
        (
            "electricity_usage",
            [
                "weather_bayonne.csv",
                "weather_brest.csv",
                "weather_lille.csv",
                "weather_limoges.csv",
                "weather_lyon.csv",
                "weather_marseille.csv",
                "weather_nantes.csv",
                "weather_paris.csv",
                "weather_strasbourg.csv",
                "weather_toulouse.csv",
                "Total Load - Day Ahead _ Actual_202501010000-202601010000.csv",
                "Total Load - Day Ahead _ Actual_202401010000-202501010000.csv",
                "Total Load - Day Ahead _ Actual_202301010000-202401010000.csv",
                "Total Load - Day Ahead _ Actual_202201010000-202301010000.csv",
                "Total Load - Day Ahead _ Actual_202101010000-202201010000.csv",
            ],
        ),
    ],
)
def test_dataset_files(dataset_name, files):
    """Check that the fetched dataset directory contains every expected file.

    The fetcher is expected to return a directory path; each name in ``files``
    must be present directly inside that directory.
    """
    path = getattr(_fetching, f"fetch_{dataset_name}")()
    # List the directory once instead of once per expected file.
    present = set(os.listdir(path))
    missing = [file for file in files if file not in present]
    # Report the missing names so a failure is actionable without debug prints.
    assert not missing, f"Files missing from {path}: {missing}"
Loading