Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,9 @@ New Features
- :func:`selectors.has_nulls` now takes a ``proportion`` parameter, which allows
selecting columns that have a fraction of null values above the given threshold.
:pr:`1881` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`.
- Added a new dataset, :func:`datasets.fetch_electricity_forecasting`, which contains
the total electricity load in France and corresponding weather data for several French cities.
:pr:`2013` by :user:`Lisa McBride <lisaleemcb>`.


Changes
Expand Down
1 change: 1 addition & 0 deletions doc/api_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@
"datasets.fetch_country_happiness",
"datasets.fetch_credit_fraud",
"datasets.fetch_drug_directory",
"datasets.fetch_electricity_forecasting",
"datasets.fetch_employee_salaries",
"datasets.fetch_flight_delays",
"datasets.fetch_medical_charge",
Expand Down
2 changes: 2 additions & 0 deletions skrub/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
fetch_country_happiness,
fetch_credit_fraud,
fetch_drug_directory,
fetch_electricity_forecasting,
fetch_employee_salaries,
fetch_flight_delays,
fetch_medical_charge,
Expand All @@ -23,6 +24,7 @@
"fetch_country_happiness",
"fetch_credit_fraud",
"fetch_drug_directory",
"fetch_electricity_forecasting",
"fetch_employee_salaries",
"fetch_flight_delays",
"fetch_medical_charge",
Expand Down
43 changes: 42 additions & 1 deletion skrub/datasets/_fetching.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from pathlib import Path

from ._utils import load_dataset_files, load_simple_dataset
from ._utils import download_dataset, load_dataset_files, load_simple_dataset


def fetch_employee_salaries(data_home=None, split="all"):
Expand Down Expand Up @@ -606,3 +606,44 @@ def fetch_california_housing(data_home=None):
The path to the california housing CSV file.
"""
return load_simple_dataset("california_housing", data_home)


def fetch_electricity_forecasting(data_home=None):
    """Fetches the electricity usage dataset (forecasting), available at \
        https://github.com/skrub-data/skrub-data-files

    Description of the dataset:
        This dataset was generated from data obtained from the
        ENTSOE Open Data portal under the open source license (CC-BY 4.0):
        https://transparencyplatform.zendesk.com/hc/article_attachments/40921869376401

        and the Open Meteo Historical Weather API:
        https://open-meteo.com/en/docs/historical-forecast-api
        in accordance with the licence described:
        https://open-meteo.com/en/licence

        This is a time-series forecasting use case. This dataset gives the total
        electricity load in MW in France, covering a time range from
        March 23, 2021 to May 31, 2025. In addition, the dataset contains
        weather data for several cities within France.

        It can be downloaded/loaded using the
        skrub.datasets.fetch_electricity_forecasting function.
        Size on disk: 26MB.

    Unlike most other fetchers in this module, this function does not load
    the data: it only makes sure the archive is downloaded and extracted, and
    returns the location of the extracted files.

    Parameters
    ----------
    data_home : str or path, default=None
        The directory where to download and unzip the files.

    Returns
    -------
    pathlib.Path
        The path to the directory containing the extracted electricity load
        and weather CSV files.

    References
    ----------
    .. [1] For more detailed instructions on how to use this dataset, please refer
           to the example here: `EuroSciPy2025 <https://github.com/skrub-data/EuroSciPy2025>`_
    """
    # Only download/extract; reading the individual CSV files is left to the caller.
    return download_dataset("electricity_forecasting", data_home=data_home)
13 changes: 12 additions & 1 deletion skrub/datasets/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@
],
"sha256": "0c3885894baf02fc787109801ec2c34cc25cd4a31e0066a16941b74157474887",
},
"electricity_forecasting": {
"urls": [
"https://github.com/skrub-data/skrub-data-files/raw/refs/heads/main/electricity_forecasting.zip",
"https://osf.io/download/d8ykq",
],
"sha256": "55d565f02298cb5006eb697775775170370f3b3ca96bfadd3447f1afa57b87b0",
},
"employee_salaries": {
"urls": [
"https://github.com/skrub-data/skrub-data-files/raw/refs/heads/main/employee_salaries.zip",
Expand Down Expand Up @@ -245,7 +252,7 @@ def load_simple_dataset(dataset_name, data_home=None):
return bunch


def load_dataset_files(dataset_name, data_home):
def download_dataset(dataset_name, data_home):
data_home = get_data_home(data_home)
dataset_dir = data_home / dataset_name
datafiles_dir = dataset_dir / dataset_name
Expand All @@ -270,7 +277,11 @@ def load_dataset_files(dataset_name, data_home):

if not datafiles_dir.exists():
_extract_archive(dataset_dir, archive_path)
return datafiles_dir


def load_dataset_files(dataset_name, data_home):
datafiles_dir = download_dataset(dataset_name, data_home)
bunch = Bunch()

# If there is a file named <dataset_name>.csv, we load the path as the main
Expand Down
27 changes: 27 additions & 0 deletions skrub/datasets/tests/test_fetching.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from pathlib import Path
from tempfile import TemporaryDirectory

import pandas as pd
Expand Down Expand Up @@ -174,3 +175,29 @@ def _error_on_get(*args, **kwargs):
with TemporaryDirectory() as temp_dir:
with pytest.raises(OSError, match="Can't download"):
_ = _fetching.fetch_employee_salaries(data_home=temp_dir)


@xfail_with_download_error
def test_electricity_forecasting():
    """The fetched directory must contain exactly the expected CSV files."""
    expected_names = {
        # Yearly national load exports.
        "Total Load - Day Ahead _ Actual_202101010000-202201010000.csv",
        "Total Load - Day Ahead _ Actual_202201010000-202301010000.csv",
        "Total Load - Day Ahead _ Actual_202301010000-202401010000.csv",
        "Total Load - Day Ahead _ Actual_202401010000-202501010000.csv",
        "Total Load - Day Ahead _ Actual_202501010000-202601010000.csv",
        # One weather file per city.
        "weather_bayonne.csv",
        "weather_brest.csv",
        "weather_lille.csv",
        "weather_limoges.csv",
        "weather_lyon.csv",
        "weather_marseille.csv",
        "weather_nantes.csv",
        "weather_paris.csv",
        "weather_strasbourg.csv",
        "weather_toulouse.csv",
    }
    dataset_dir = Path(_fetching.fetch_electricity_forecasting())
    # Only regular files count; any sub-directory in the archive is ignored.
    found_names = {entry.name for entry in dataset_dir.iterdir() if entry.is_file()}
    assert found_names == expected_names
Loading