skrub-data · lisaleemcb · Apr 1, 2026 · Apr 2, 2026 · Apr 26, 2026 · May 6, 2026
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -134,6 +134,9 @@ New Features
 - :func:`selectors.has_nulls` now takes a ``proportion`` parameter, which allows
   selecting columns that have a fraction of null values above the given threshold.
   :pr:`1881` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`.
+- Added a new dataset, :func:`datasets.fetch_electricity_usage`, which contains
+  the total electricity load in France and weather data for several French
+  cities. :pr:`2013` by :user:`Lisa McBride <lisaleemcb>`.
 
 
 Changes

diff --git a/doc/api_reference.py b/doc/api_reference.py
@@ -322,6 +322,7 @@
                     "datasets.fetch_country_happiness",
                     "datasets.fetch_credit_fraud",
                     "datasets.fetch_drug_directory",
+                    "datasets.fetch_electricity_usage",
                     "datasets.fetch_employee_salaries",
                     "datasets.fetch_flight_delays",
                     "datasets.fetch_medical_charge",

diff --git a/skrub/datasets/__init__.py b/skrub/datasets/__init__.py
@@ -4,6 +4,7 @@
     fetch_country_happiness,
     fetch_credit_fraud,
     fetch_drug_directory,
+    fetch_electricity_usage,
     fetch_employee_salaries,
     fetch_flight_delays,
     fetch_medical_charge,
@@ -23,6 +24,7 @@
     "fetch_country_happiness",
     "fetch_credit_fraud",
     "fetch_drug_directory",
+    "fetch_electricity_usage",
     "fetch_employee_salaries",
     "fetch_flight_delays",
     "fetch_medical_charge",

diff --git a/skrub/datasets/_fetching.py b/skrub/datasets/_fetching.py
@@ -4,7 +4,7 @@
 
 from pathlib import Path
 
-from ._utils import load_dataset_files, load_simple_dataset
+from ._utils import download_dataset, load_dataset_files, load_simple_dataset
 
 
 def fetch_employee_salaries(data_home=None, split="all"):
@@ -606,3 +606,44 @@ def fetch_california_housing(data_home=None):
             The path to the california housing CSV file.
     """
     return load_simple_dataset("california_housing", data_home)
+
+
+def fetch_electricity_usage(data_home=None):
+    """Fetches the electricity usage dataset (forecasting), available at \
+        https://github.com/skrub-data/skrub-data-files
+
+    Description of the dataset:
+        This dataset was generated from data obtained from the
+        ENTSOE Open Data portal under the open source license (CC-BY 4.0):
+        https://transparencyplatform.zendesk.com/hc/article_attachments/40921869376401
+
+        and the Open Meteo Historical Weather API:
+        https://open-meteo.com/en/docs/historical-forecast-api
+        in accordance with the licence described:
+        https://open-meteo.com/en/licence
+
+        This is a time-series forecasting use case. This dataset gives the total
+        electricity load in MW in France, covering a time range from
+        March 23, 2021 to May 31, 2025. In addition, the dataset contains
+        weather data for several cities within France.
+
+        It can be downloaded/loaded using the
+        skrub.datasets.fetch_electricity_usage function.
+        Size on disk: 26MB.
+
+    Parameters
+    ----------
+    data_home: str or path, default=None
+        The directory where to download and unzip the files.
+
+    Returns
+    -------
+    path : pathlib.Path
+        The path to the directory containing the electricity usage CSV files.
+
+    References
+    ----------
+    .. [1] For more detailed instructions on how to use this dataset, please refer
+           to the example here: `EuroSciPy2025 <https://github.com/skrub-data/EuroSciPy2025>`_
+    """
+    return download_dataset("electricity_usage", data_home=data_home)
diff --git a/skrub/datasets/_utils.py b/skrub/datasets/_utils.py
@@ -50,6 +50,13 @@
         ],
         "sha256": "0c3885894baf02fc787109801ec2c34cc25cd4a31e0066a16941b74157474887",
     },
+    "electricity_usage": {
+        "urls": [
+            "https://github.com/skrub-data/skrub-data-files/raw/refs/heads/main/electricity_usage.zip",
+            "https://osf.io/download/d8ykq",
+        ],
+        "sha256": "d929d73ce79e0e07a200941a476bde253ecb8fcd5d08ab276c0b5458dbf3bc7f",
+    },
     "employee_salaries": {
         "urls": [
             "https://github.com/skrub-data/skrub-data-files/raw/refs/heads/main/employee_salaries.zip",
@@ -245,7 +252,7 @@ def load_simple_dataset(dataset_name, data_home=None):
     return bunch
 
 
-def load_dataset_files(dataset_name, data_home):
+def download_dataset(dataset_name, data_home):
     data_home = get_data_home(data_home)
     dataset_dir = data_home / dataset_name
     datafiles_dir = dataset_dir / dataset_name
@@ -270,7 +277,11 @@ def load_dataset_files(dataset_name, data_home):
 
     if not datafiles_dir.exists():
         _extract_archive(dataset_dir, archive_path)
+    return datafiles_dir
+
 
+def load_dataset_files(dataset_name, data_home):
+    datafiles_dir = download_dataset(dataset_name, data_home)
     bunch = Bunch()
 
     # If there is a file named <dataset_name>.csv, we load the path as the main

diff --git a/skrub/datasets/tests/test_fetching.py b/skrub/datasets/tests/test_fetching.py
@@ -1,3 +1,4 @@
+import os
 from tempfile import TemporaryDirectory
 
 import pandas as pd
@@ -174,3 +175,51 @@ def _error_on_get(*args, **kwargs):
     with TemporaryDirectory() as temp_dir:
         with pytest.raises(OSError, match="Can't download"):
             _ = _fetching.fetch_employee_salaries(data_home=temp_dir)
+
+
+@xfail_with_download_error
+@pytest.mark.parametrize(
+    "dataset_name, dataset_path",
+    [
+        ("electricity_usage", ("electricity_usage", "electricity_usage")),
+    ],
+)
+def test_dataset_paths(dataset_name, dataset_path):
+    "Check the last two components of the path returned by the fetcher."
+    fetcher = getattr(_fetching, f"fetch_{dataset_name}")
+    assert fetcher().parts[-2:] == dataset_path
+
+
+@xfail_with_download_error
+@pytest.mark.parametrize(
+    "dataset_name, files",
+    [
+        (
+            "electricity_usage",
+            [
+                "weather_bayonne.csv",
+                "weather_brest.csv",
+                "weather_lille.csv",
+                "weather_limoges.csv",
+                "weather_lyon.csv",
+                "weather_marseille.csv",
+                "weather_nantes.csv",
+                "weather_paris.csv",
+                "weather_strasbourg.csv",
+                "weather_toulouse.csv",
+                "Total Load - Day Ahead _ Actual_202501010000-202601010000.csv",
+                "Total Load - Day Ahead _ Actual_202401010000-202501010000.csv",
+                "Total Load - Day Ahead _ Actual_202301010000-202401010000.csv",
+                "Total Load - Day Ahead _ Actual_202201010000-202301010000.csv",
+                "Total Load - Day Ahead _ Actual_202101010000-202201010000.csv",
+            ],
+        ),
+    ],
+)
+def test_dataset_files(dataset_name, files):
+    "Test that the fetched dataset directory contains all the expected files."
+    path = getattr(_fetching, f"fetch_{dataset_name}")()
+    # List the directory once and report what is missing on failure, instead of
+    # printing a membership check for every expected file.
+    missing = sorted(set(files) - set(os.listdir(path)))
+    assert not missing, f"Missing files in {path}: {missing}"