From 3d8d5fa4cadec707676d565f4b1f236bc9348df4 Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Mon, 8 Jun 2026 23:16:21 +0200 Subject: [PATCH 01/13] Add notes --- notes | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 notes diff --git a/notes b/notes new file mode 100644 index 000000000..ec2952bf0 --- /dev/null +++ b/notes @@ -0,0 +1,27 @@ +cd D:\skrub-women-in-tech + +python -m venv env_skrub +.\env_skrub\Scripts\Activate.ps1 or source env_skrub/bin/activatepip install -e ".[dev]" + + +cd skrub (package) + +cd D:\skrub-women-in-tech +python -m pip install --upgrade pip +pip install -e ".[dev]" + + +pip install pre-commit (install) +pre-commit install (run) + + +Configure git blame +git config blame.ignoreRevsFile .git-blame-ignore-revs +(env_skrub) PS D:\skrub-women-in-tech> git config --get blame.ignoreRevsFile +.git-blame-ignore-revs + + +==== +Run the test suite + +pytest --pyargs skrub From edc7dadad9bd76a534738c9f71701ede733358d5 Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 12:05:11 +0200 Subject: [PATCH 02/13] add estimated memory usage --- .../_data/templates/dataframe-sample.html | 7 +++++ skrub/_reporting/_data/templates/report.md | 6 ++++ skrub/_reporting/_summarize.py | 28 +++++++++++++++++++ skrub/_reporting/tests/test_table_report.py | 1 + 4 files changed, 42 insertions(+) diff --git a/skrub/_reporting/_data/templates/dataframe-sample.html b/skrub/_reporting/_data/templates/dataframe-sample.html index 9544f6f5a..6cfe701ad 100644 --- a/skrub/_reporting/_data/templates/dataframe-sample.html +++ b/skrub/_reporting/_data/templates/dataframe-sample.html @@ -86,6 +86,13 @@ {{ summary.n_rows | format_number }} rows ✕ {{ summary.n_columns | format_number }} columns + {%- if summary.get("memory_usage_kb") is not none -%} + (memory usage: {{ "%.1f" | format(summary.get("memory_usage_kb")) }} KB) + {%- if summary.get("memory_estimate_unreliable") -%} + — estimate may be inaccurate for complex objects + {%- endif -%} + + {%- endif -%} {% if 'is_subsampled' in summary %} (subsampled from more rows) {% endif %} diff --git a/skrub/_reporting/_data/templates/report.md b/skrub/_reporting/_data/templates/report.md index 3239b6184..b836c5f56 100644 --- a/skrub/_reporting/_data/templates/report.md +++ b/skrub/_reporting/_data/templates/report.md @@ -2,6 +2,12 @@ The provided dataframe uses the {{ summary.dataframe_module }} library. It has **shape** {{ summary.n_rows }} rows × {{ summary.n_columns }} columns. +{% if summary.get("memory_usage_kb") is not none %} +**memory usage** {{ "%.1f" | format(summary.get("memory_usage_kb")) }} KB. +{% if summary.get("memory_estimate_unreliable") %} +_Note: memory estimate may be inaccurate for complex object columns._ +{% endif %} +{% endif %} Columns are marked as "high cardinality" if they contain more than {{ summary.cardinality_threshold }} unique values. diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index b544986e2..979104ffa 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -18,6 +18,28 @@ _N_TOP_ASSOCIATIONS = 1000 +def _memory_usage_kb(df): + if sbd.dataframe_module_name(df) == "pandas": + memory_usage_bytes = df.memory_usage(deep=False).sum() + else: + estimated_size = getattr(df, "estimated_size", None) + if estimated_size is None: + return None + memory_usage_bytes = estimated_size() + return memory_usage_bytes / 1024 + + +def _has_complex_objects(df): + """Return True when pandas has object-dtype columns. + + The memory estimate is less reliable for object-dtype columns, so we warn + as soon as any are present. + """ + if sbd.dataframe_module_name(df) != "pandas": + return False + return any(dtype == object for dtype in df.dtypes) + + def summarize_dataframe( df, *, @@ -77,6 +99,7 @@ def summarize_dataframe( "dataframe_module": sbd.dataframe_module_name(df), "n_rows": n_rows, "n_columns": n_columns, + "memory_usage_kb": _memory_usage_kb(df), "columns": [], "dataframe_is_empty": not n_rows or not n_columns, "plots_skipped": not with_plots, @@ -90,6 +113,11 @@ def summarize_dataframe( } if title is not None: summary["title"] = title + # detect complex objects that make memory estimates unreliable + try: + summary["memory_estimate_unreliable"] = _has_complex_objects(df) + except Exception: + summary["memory_estimate_unreliable"] = False if order_by is not None: df = sbd.sort(df, by=order_by) summary["order_by"] = order_by diff --git a/skrub/_reporting/tests/test_table_report.py b/skrub/_reporting/tests/test_table_report.py index a5a3eaab2..953de0f76 100644 --- a/skrub/_reporting/tests/test_table_report.py +++ b/skrub/_reporting/tests/test_table_report.py @@ -46,6 +46,7 @@ def test_report(air_quality): assert "With nulls" in html assert "First 10" in html assert "First 2" in html + assert "memory usage:" in html for col_name in sbd.column_names(air_quality): assert col_name in html report_id = get_report_id(html) From e5f11aa3e49ceff664ae55155c7a9088abebba2c Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 13:50:37 +0200 Subject: [PATCH 03/13] add to CHANGES.rst --- CHANGES.rst | 3 +++ skrub/_reporting/_data/templates/dataframe-sample.html | 2 +- skrub/_reporting/tests/test_markdown_template.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index fce0bc259..2c60567c8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -41,6 +41,7 @@ New Features regular numeric format (``(432)`` becomes ``-432``). :pr:`1772` by :user:`Gabriela Gómez Jiménez `. + Changes ------- - :meth:`choose_from` now transparently converts `outcomes` to a list when it is another type of sequence. :pr:`2100` by @@ -54,6 +55,8 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. +- The :class:`TableReport` can now be exported the estimated memory usage in TableReport when display data. + :pr:`` by :user:`Salam AlKaissi Sanae Janati Idrissi `. Bugfixes -------- diff --git a/skrub/_reporting/_data/templates/dataframe-sample.html b/skrub/_reporting/_data/templates/dataframe-sample.html index 6cfe701ad..372606413 100644 --- a/skrub/_reporting/_data/templates/dataframe-sample.html +++ b/skrub/_reporting/_data/templates/dataframe-sample.html @@ -87,7 +87,7 @@ {{ summary.n_columns | format_number }} columns {%- if summary.get("memory_usage_kb") is not none -%} - (memory usage: {{ "%.1f" | format(summary.get("memory_usage_kb")) }} KB) + (estimated memory usage: {{ "%.1f" | format(summary.get("memory_usage_kb")) }} KB) {%- if summary.get("memory_estimate_unreliable") -%} — estimate may be inaccurate for complex objects {%- endif -%} diff --git a/skrub/_reporting/tests/test_markdown_template.py b/skrub/_reporting/tests/test_markdown_template.py index 55d3b81b7..53187b8e0 100644 --- a/skrub/_reporting/tests/test_markdown_template.py +++ b/skrub/_reporting/tests/test_markdown_template.py @@ -37,6 +37,7 @@ def test_markdown_report_structure_and_titles(df_module): assert "# " in markdown_default # Header should exist # Shape info should be present assert "**shape** 3 rows × 3 columns" in markdown + assert "**memory usage**" in markdown # Unique values should be present (default value) assert "40 unique values." in markdown From c727ed66dfe59c89896f9526dea7ca560b26a08f Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 14:08:51 +0200 Subject: [PATCH 04/13] update CHANGES --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 2c60567c8..93842f40b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -56,7 +56,7 @@ Changes - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. - The :class:`TableReport` can now be exported the estimated memory usage in TableReport when display data. - :pr:`` by :user:`Salam AlKaissi Sanae Janati Idrissi `. + :pr:`2153` by :user:`Salam AlKaissi Sanae Janati Idrissi `. Bugfixes -------- From 2afd0f5b561747a11627fbe71083cd94184e0261 Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 14:17:42 +0200 Subject: [PATCH 05/13] remove notes file --- notes | 27 --------------------------- 1 file changed, 27 deletions(-) delete mode 100644 notes diff --git a/notes b/notes deleted file mode 100644 index ec2952bf0..000000000 --- a/notes +++ /dev/null @@ -1,27 +0,0 @@ -cd D:\skrub-women-in-tech - -python -m venv env_skrub -.\env_skrub\Scripts\Activate.ps1 or source env_skrub/bin/activatepip install -e ".[dev]" - - -cd skrub (package) - -cd D:\skrub-women-in-tech -python -m pip install --upgrade pip -pip install -e ".[dev]" - - -pip install pre-commit (install) -pre-commit install (run) - - -Configure git blame -git config blame.ignoreRevsFile .git-blame-ignore-revs -(env_skrub) PS D:\skrub-women-in-tech> git config --get blame.ignoreRevsFile -.git-blame-ignore-revs - - -==== -Run the test suite - -pytest --pyargs skrub From c8b4ce47a6adf1fbd6bfd91f93d3cb44911e60d6 Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI <112949087+salam-alkaissi@users.noreply.github.com> Date: Wed, 10 Jun 2026 14:31:37 +0200 Subject: [PATCH 06/13] change on _table_report.py Updated JSON output to include memory usage information. --- skrub/_reporting/_table_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_reporting/_table_report.py b/skrub/_reporting/_table_report.py index db491170f..dd0525069 100644 --- a/skrub/_reporting/_table_report.py +++ b/skrub/_reporting/_table_report.py @@ -240,7 +240,7 @@ class TableReport: >>> j = TableReport(df, plot_distributions=False).json() >>> print(j) - {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3, "columns": ... + {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3, "memory_usage_kb": 0.1 ... Advanced configuration: you can add custom column filters that will appear From 5b5fd6222b7554b2e2adcff4ebb6ba5cc0c4c04f Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 14:36:03 +0200 Subject: [PATCH 07/13] change on _table_report.py --- skrub/_reporting/_table_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_reporting/_table_report.py b/skrub/_reporting/_table_report.py index db491170f..e6edd8858 100644 --- a/skrub/_reporting/_table_report.py +++ b/skrub/_reporting/_table_report.py @@ -240,7 +240,7 @@ class TableReport: >>> j = TableReport(df, plot_distributions=False).json() >>> print(j) - {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3, "columns": ... + {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3 ... Advanced configuration: you can add custom column filters that will appear From f4b99e90e2605a8a87f8c25c3775ea3ecaf33cd6 Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 15:07:34 +0200 Subject: [PATCH 08/13] change on table_report --- skrub/_reporting/_table_report.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_reporting/_table_report.py b/skrub/_reporting/_table_report.py index e6edd8858..d8c4286f6 100644 --- a/skrub/_reporting/_table_report.py +++ b/skrub/_reporting/_table_report.py @@ -240,7 +240,7 @@ class TableReport: >>> j = TableReport(df, plot_distributions=False).json() >>> print(j) - {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3 ... + {"dataframe_module": "pandas", "n_rows": 2, "n_columns": 3, ...} Advanced configuration: you can add custom column filters that will appear From 339e41d259024e5299f339bb078df6cd7346d0e5 Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 15:24:22 +0200 Subject: [PATCH 09/13] change on summarize.py --- skrub/_reporting/_summarize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index 979104ffa..5efc12a33 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -114,10 +114,10 @@ def summarize_dataframe( if title is not None: summary["title"] = title # detect complex objects that make memory estimates unreliable - try: - summary["memory_estimate_unreliable"] = _has_complex_objects(df) - except Exception: - summary["memory_estimate_unreliable"] = False + # try: + summary["memory_estimate_unreliable"] = _has_complex_objects(df) + # except Exception: + # summary["memory_estimate_unreliable"] = False if order_by is not None: df = sbd.sort(df, by=order_by) summary["order_by"] = order_by From 8e96017e853d9826963045e8828635b98e3d013d Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 15:25:58 +0200 Subject: [PATCH 10/13] change on summarize py --- skrub/_reporting/_summarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index 5efc12a33..93ab54a1a 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -113,9 +113,9 @@ def summarize_dataframe( } if title is not None: summary["title"] = title - # detect complex objects that make memory estimates unreliable - # try: - summary["memory_estimate_unreliable"] = _has_complex_objects(df) + # detect complex objects that make memory estimates unreliable + # try: + summary["memory_estimate_unreliable"] = _has_complex_objects(df) # except Exception: # summary["memory_estimate_unreliable"] = False if order_by is not None: From bbb67da4279ffb8267496b355b8c1e80c5f5df8c Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 15:41:06 +0200 Subject: [PATCH 11/13] update on summarize.py file --- skrub/_reporting/_summarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index 93ab54a1a..5efc12a33 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -113,9 +113,9 @@ def summarize_dataframe( } if title is not None: summary["title"] = title - # detect complex objects that make memory estimates unreliable - # try: - summary["memory_estimate_unreliable"] = _has_complex_objects(df) + # detect complex objects that make memory estimates unreliable + # try: + summary["memory_estimate_unreliable"] = _has_complex_objects(df) # except Exception: # summary["memory_estimate_unreliable"] = False if order_by is not None: From 4a047fa7ccbb7db6e6a0c8add4c183d7264d61ca Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Wed, 10 Jun 2026 16:23:16 +0200 Subject: [PATCH 12/13] update on summarize py --- skrub/_reporting/_summarize.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/skrub/_reporting/_summarize.py b/skrub/_reporting/_summarize.py index 5efc12a33..6f8500bfa 100644 --- a/skrub/_reporting/_summarize.py +++ b/skrub/_reporting/_summarize.py @@ -2,6 +2,9 @@ import sys +from skrub._dataframe._common import raise_dispatch_unregistered_type +from skrub._dispatch import dispatch + from .. import _column_associations, _config from .. import _dataframe as sbd from . import _plotting, _sample_table, _utils @@ -18,14 +21,20 @@ _N_TOP_ASSOCIATIONS = 1000 -def _memory_usage_kb(df): - if sbd.dataframe_module_name(df) == "pandas": - memory_usage_bytes = df.memory_usage(deep=False).sum() - else: - estimated_size = getattr(df, "estimated_size", None) - if estimated_size is None: - return None - memory_usage_bytes = estimated_size() +@dispatch +def _memory_usage_kb(obj): + raise_dispatch_unregistered_type(obj) + + +@_memory_usage_kb.specialize("pandas") +def _memory_usage_pandas(obj): + memory_usage_bytes = obj.memory_usage(deep=False).sum() + return memory_usage_bytes / 1024 + + +@_memory_usage_kb.specialize("polars") +def _memory_usage_polars(obj): + memory_usage_bytes = obj.estimated_size() return memory_usage_bytes / 1024 From 63493cbec0cd4dae8c11fe908e2b57afedec1570 Mon Sep 17 00:00:00 2001 From: Salam AL KAISSI Date: Thu, 11 Jun 2026 12:27:50 +0200 Subject: [PATCH 13/13] update on CHANGES.rst --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 93842f40b..7b1ef93f7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -55,7 +55,7 @@ Changes :pr:`2096` by :user:`Ayesha Siddiqua `. - The :class:`TableReport` can now be exported in markdown format with ``.markdown``. :pr:`2048` by :user:`Riccardo Cappuzzo `. -- The :class:`TableReport` can now be exported the estimated memory usage in TableReport when display data. +- The :class:`TableReport` can now display the estimated memory usage of the data it is applied to. :pr:`2153` by :user:`Salam AlKaissi Sanae Janati Idrissi `. Bugfixes